From 245e1c196dab226675a02a8caca5a83373f5e4d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mart=C3=ADn=20Nieto?= Date: Thu, 16 Apr 2015 16:05:12 +0200 Subject: [PATCH 1/5] dir: allow a BOM at the beginning of exclude files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some text editors like Notepad or LibreOffice write a UTF-8 BOM in order to indicate that the file is Unicode text rather than whatever the current locale would indicate. If someone uses such an editor to edit a gitignore file, we are left with those three bytes at the beginning of the file. If we do not skip them, we will attempt to match a filename with the BOM as prefix, which won't match the files the user is expecting. Signed-off-by: Carlos Martín Nieto Signed-off-by: Junio C Hamano --- dir.c | 8 +++++++- t/t7061-wtstatus-ignore.sh | 9 +++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/dir.c b/dir.c index 3f7a0256b6..10c1f903ef 100644 --- a/dir.c +++ b/dir.c @@ -538,6 +538,7 @@ int add_excludes_from_file_to_list(const char *fname, struct stat st; int fd, i, lineno = 1; size_t size = 0; + static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf"; char *buf, *entry; fd = open(fname, O_RDONLY); @@ -574,7 +575,12 @@ int add_excludes_from_file_to_list(const char *fname, } el->filebuf = buf; - entry = buf; + + if (size >= 3 && !memcmp(buf, utf8_bom, 3)) + entry = buf + 3; + else + entry = buf; + for (i = 0; i < size; i++) { if (buf[i] == '\n') { if (entry != buf + i && entry[0] != '#') { diff --git a/t/t7061-wtstatus-ignore.sh b/t/t7061-wtstatus-ignore.sh index 460789b4d8..cdc0747bf0 100755 --- a/t/t7061-wtstatus-ignore.sh +++ b/t/t7061-wtstatus-ignore.sh @@ -20,6 +20,15 @@ test_expect_success 'status untracked directory with --ignored' ' test_cmp expected actual ' +test_expect_success 'same with gitignore starting with BOM' ' + printf "\357\273\277ignored\n" >.gitignore && + mkdir -p untracked && + : 
>untracked/ignored && + : >untracked/uncommitted && + git status --porcelain --ignored >actual && + test_cmp expected actual +' + cat >expected <<\EOF ?? .gitignore ?? actual From cb0abea87017559e1db3721a7e6d89a336d845e9 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 16 Apr 2015 11:26:29 -0700 Subject: [PATCH 2/5] add_excludes_from_file: clarify the bom skipping logic Even though the previous step shifts where the "entry" begins, we still iterate over the original buf[], which may begin with the UTF-8 BOM we are supposed to be skipping. At the end of the first line, the code grabs the contents of it starting at "entry", so there is nothing wrong per-se, but the logic looks really confused. Instead, move the buf pointer and shrink its size, to truly pretend that UTF-8 BOM did not exist in the input. Signed-off-by: Junio C Hamano --- dir.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/dir.c b/dir.c index 10c1f903ef..b5bb38977b 100644 --- a/dir.c +++ b/dir.c @@ -576,10 +576,11 @@ int add_excludes_from_file_to_list(const char *fname, el->filebuf = buf; - if (size >= 3 && !memcmp(buf, utf8_bom, 3)) - entry = buf + 3; - else - entry = buf; + if (size >= 3 && !memcmp(buf, utf8_bom, 3)) { + buf += 3; + size -= 3; + } + entry = buf; for (i = 0; i < size; i++) { if (buf[i] == '\n') { From dde843e7378f65004415bd108038659de9ce2abd Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 16 Apr 2015 10:45:29 -0700 Subject: [PATCH 3/5] utf8-bom: introduce skip_utf8_bom() helper With the recent change to ignore the UTF8 BOM at the beginning of .gitignore files, we now have two codepaths that do such a skipping (the other one is for reading the configuration files). Introduce utf8_bom[] constant string and skip_utf8_bom() helper and teach .gitignore code how to use it. 
Signed-off-by: Junio C Hamano --- dir.c | 9 ++++----- utf8.c | 11 +++++++++++ utf8.h | 3 +++ 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/dir.c b/dir.c index b5bb38977b..4c4bf910fa 100644 --- a/dir.c +++ b/dir.c @@ -12,6 +12,7 @@ #include "refs.h" #include "wildmatch.h" #include "pathspec.h" +#include "utf8.h" struct path_simplify { int len; @@ -538,7 +539,6 @@ int add_excludes_from_file_to_list(const char *fname, struct stat st; int fd, i, lineno = 1; size_t size = 0; - static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf"; char *buf, *entry; fd = open(fname, O_RDONLY); @@ -576,10 +576,9 @@ int add_excludes_from_file_to_list(const char *fname, el->filebuf = buf; - if (size >= 3 && !memcmp(buf, utf8_bom, 3)) { - buf += 3; - size -= 3; - } + if (skip_utf8_bom(&buf, size)) + size -= buf - el->filebuf; + entry = buf; for (i = 0; i < size; i++) { diff --git a/utf8.c b/utf8.c index 520fbb4994..28e6d76a42 100644 --- a/utf8.c +++ b/utf8.c @@ -633,3 +633,14 @@ int is_hfs_dotgit(const char *path) return 1; } + +const char utf8_bom[] = "\357\273\277"; + +int skip_utf8_bom(char **text, size_t len) +{ + if (len < strlen(utf8_bom) || + memcmp(*text, utf8_bom, strlen(utf8_bom))) + return 0; + *text += strlen(utf8_bom); + return 1; +} diff --git a/utf8.h b/utf8.h index e4d9183c5f..e7b2aa4168 100644 --- a/utf8.h +++ b/utf8.h @@ -13,6 +13,9 @@ int same_encoding(const char *, const char *); __attribute__((format (printf, 2, 3))) int utf8_fprintf(FILE *, const char *, ...); +extern const char utf8_bom[]; +extern int skip_utf8_bom(char **, size_t); + void strbuf_add_wrapped_text(struct strbuf *buf, const char *text, int indent, int indent2, int width); void strbuf_add_wrapped_bytes(struct strbuf *buf, const char *data, int len, From 599446dc323d15bab7f2a54f57ae8c5c4d7d6103 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 16 Apr 2015 10:47:45 -0700 Subject: [PATCH 4/5] config: use utf8_bom[] from utf8.[ch] in git_parse_source() Because the 
function reads one character at a time, unfortunately we cannot use the easier skip_utf8_bom() helper, but at least we do not have to duplicate the constant string this way. Signed-off-by: Junio C Hamano --- config.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config.c b/config.c index 752e2e227f..9618aa443d 100644 --- a/config.c +++ b/config.c @@ -12,6 +12,7 @@ #include "quote.h" #include "hashmap.h" #include "string-list.h" +#include "utf8.h" struct config_source { struct config_source *prev; @@ -412,8 +413,7 @@ static int git_parse_source(config_fn_t fn, void *data) struct strbuf *var = &cf->var; /* U+FEFF Byte Order Mark in UTF8 */ - static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf"; - const unsigned char *bomptr = utf8_bom; + const char *bomptr = utf8_bom; for (;;) { int c = get_next_char(); @@ -421,7 +421,7 @@ static int git_parse_source(config_fn_t fn, void *data) /* We are at the file beginning; skip UTF8-encoded BOM * if present. Sane editors won't put this in on their * own, but e.g. Windows Notepad will do it happily. 
*/ - if ((unsigned char) c == *bomptr) { + if (c == (*bomptr & 0377)) { bomptr++; continue; } else { From 27547e5fccda134560ad0441aa5bfa187387cec0 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Thu, 16 Apr 2015 10:48:58 -0700 Subject: [PATCH 5/5] attr: skip UTF8 BOM at the beginning of the input file Signed-off-by: Junio C Hamano --- attr.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/attr.c b/attr.c index cd5469770a..7c530f4d0c 100644 --- a/attr.c +++ b/attr.c @@ -12,6 +12,7 @@ #include "exec_cmd.h" #include "attr.h" #include "dir.h" +#include "utf8.h" const char git_attr__true[] = "(builtin)true"; const char git_attr__false[] = "\0(builtin)false"; @@ -369,8 +370,12 @@ static struct attr_stack *read_attr_from_file(const char *path, int macro_ok) return NULL; } res = xcalloc(1, sizeof(*res)); - while (fgets(buf, sizeof(buf), fp)) - handle_attr_line(res, buf, path, ++lineno, macro_ok); + while (fgets(buf, sizeof(buf), fp)) { + char *bufp = buf; + if (!lineno) + skip_utf8_bom(&bufp, strlen(bufp)); + handle_attr_line(res, bufp, path, ++lineno, macro_ok); + } fclose(fp); return res; }