convert: add round trip check based on 'core.checkRoundtripEncoding'

UTF supports lossless conversion round tripping and conversions between
UTF and other encodings are mostly round trip safe as Unicode aims to be
a superset of all other character encodings. However, certain encodings
(e.g. SHIFT-JIS) are known to have round trip issues [1].

Add 'core.checkRoundtripEncoding', which contains a comma and/or
whitespace separated list of encodings, to define for which encodings
Git should check the conversion round trip if they are used in the
'working-tree-encoding' attribute.

Set SHIFT-JIS as default value for 'core.checkRoundtripEncoding'.

[1] https://support.microsoft.com/en-us/help/170559/prb-conversion-problem-between-shift-jis-and-unicode

Signed-off-by: Lars Schneider <larsxschneider@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
This commit is contained in:
Lars Schneider 2018-04-15 20:16:10 +02:00 committed by Junio C Hamano
parent 541d059cd9
commit e92d622536
7 changed files with 137 additions and 0 deletions

View file

@ -528,6 +528,12 @@ core.autocrlf::
This variable can be set to 'input', This variable can be set to 'input',
in which case no output conversion is performed. in which case no output conversion is performed.
core.checkRoundtripEncoding::
A comma and/or whitespace separated list of encodings that Git
performs UTF-8 round trip checks on if they are used in a
`working-tree-encoding` attribute (see linkgit:gitattributes[5]).
The default value is `SHIFT-JIS`.
core.symlinks:: core.symlinks::
If false, symbolic links are checked out as small plain files that If false, symbolic links are checked out as small plain files that
contain the link text. linkgit:git-update-index[1] and contain the link text. linkgit:git-update-index[1] and

View file

@ -312,6 +312,14 @@ number of pitfalls:
internal contents as UTF-8 and try to convert it to UTF-16 on checkout. internal contents as UTF-8 and try to convert it to UTF-16 on checkout.
That operation will fail and cause an error. That operation will fail and cause an error.
- Reencoding content to non-UTF encodings can cause errors as the
conversion might not be UTF-8 round trip safe. If you suspect your
encoding to not be round trip safe, then add it to
`core.checkRoundtripEncoding` to make Git check the round trip
encoding (see linkgit:git-config[1]). SHIFT-JIS (Japanese character
set) is known to have round trip issues with UTF-8 and is checked by
default.
- Reencoding content requires resources that might slow down certain - Reencoding content requires resources that might slow down certain
Git operations (e.g 'git checkout' or 'git add'). Git operations (e.g 'git checkout' or 'git add').

View file

@ -1172,6 +1172,11 @@ static int git_default_core_config(const char *var, const char *value)
return 0; return 0;
} }
if (!strcmp(var, "core.checkroundtripencoding")) {
check_roundtrip_encoding = xstrdup(value);
return 0;
}
if (!strcmp(var, "core.notesref")) { if (!strcmp(var, "core.notesref")) {
notes_ref_name = xstrdup(value); notes_ref_name = xstrdup(value);
return 0; return 0;

View file

@ -347,6 +347,42 @@ static void trace_encoding(const char *context, const char *path,
strbuf_release(&trace); strbuf_release(&trace);
} }
/*
 * Tell whether `enc_name` is one of the encodings for which a round
 * trip check was requested.
 *
 * check_roundtrip_encoding contains a string of comma and/or
 * space separated encodings (eg. "UTF-16, ASCII, CP1125").
 * Search for the given encoding in that string. The comparison is
 * case insensitive and only whole list entries count: a hit must
 * start at the beginning of the list or right after a separator, and
 * must end at the end of the list or right before a separator.
 *
 * Returns 1 if the encoding is listed, 0 otherwise. A NULL list and a
 * NULL or empty `enc_name` never match.
 */
static int check_roundtrip(const char *enc_name)
{
	const char *list = check_roundtrip_encoding;
	const char *list_end;
	const char *found;
	size_t enc_len;

	if (!list || !enc_name || !*enc_name)
		return 0;

	enc_len = strlen(enc_name);
	list_end = list + strlen(list);

	/*
	 * The first substring hit may be embedded in a longer entry
	 * (e.g. "UTF-16" inside "UTF-16LE, UTF-16"), so a rejected hit
	 * must not end the search; keep scanning the rest of the list.
	 */
	for (found = strcasestr(list, enc_name);
	     found;
	     found = strcasestr(found + 1, enc_name)) {
		const char *next = found + enc_len;
		int starts_entry;
		int ends_entry;

		/*
		 * check that the found encoding is at the beginning of
		 * the list or that it is prefixed with a space or comma
		 */
		starts_entry = found == list ||
			       isspace((unsigned char)found[-1]) ||
			       found[-1] == ',';

		/*
		 * check that the found encoding is at the end of the
		 * list or that it is suffixed with a space or comma
		 */
		ends_entry = next == list_end ||
			     isspace((unsigned char)next[0]) ||
			     next[0] == ',';

		if (starts_entry && ends_entry)
			return 1;
	}

	return 0;
}
static const char *default_encoding = "UTF-8"; static const char *default_encoding = "UTF-8";
static int encode_to_git(const char *path, const char *src, size_t src_len, static int encode_to_git(const char *path, const char *src, size_t src_len,
@ -395,6 +431,47 @@ static int encode_to_git(const char *path, const char *src, size_t src_len,
} }
trace_encoding("destination", path, default_encoding, dst, dst_len); trace_encoding("destination", path, default_encoding, dst, dst_len);
/*
* UTF supports lossless conversion round tripping [1] and conversions
* between UTF and other encodings are mostly round trip safe as
* Unicode aims to be a superset of all other character encodings.
* However, certain encodings (e.g. SHIFT-JIS) are known to have round
* trip issues [2]. Check the round trip conversion for all encodings
* listed in core.checkRoundtripEncoding.
*
* The round trip check is only performed if content is written to Git.
* This ensures that no information is lost during conversion to/from
* the internal UTF-8 representation.
*
* Please note, the code below is not tested because I was not able to
* generate a faulty round trip without an iconv error. Iconv errors
* are already caught above.
*
* [1] http://unicode.org/faq/utf_bom.html#gen2
* [2] https://support.microsoft.com/en-us/help/170559/prb-conversion-problem-between-shift-jis-and-unicode
*/
if (die_on_error && check_roundtrip(enc)) {
char *re_src;
int re_src_len;
re_src = reencode_string_len(dst, dst_len,
enc, default_encoding,
&re_src_len);
trace_printf("Checking roundtrip encoding for %s...\n", enc);
trace_encoding("reencoded source", path, enc,
re_src, re_src_len);
if (!re_src || src_len != re_src_len ||
memcmp(src, re_src, src_len)) {
const char* msg = _("encoding '%s' from %s to %s and "
"back is not the same");
die(msg, path, enc, default_encoding);
}
free(re_src);
}
strbuf_attach(buf, dst, dst_len, dst_len + 1); strbuf_attach(buf, dst, dst_len, dst_len + 1);
return 1; return 1;
} }

View file

@ -56,6 +56,7 @@ struct delayed_checkout {
}; };
extern enum eol core_eol; extern enum eol core_eol;
extern char *check_roundtrip_encoding;
extern const char *get_cached_convert_stats_ascii(const struct index_state *istate, extern const char *get_cached_convert_stats_ascii(const struct index_state *istate,
const char *path); const char *path);
extern const char *get_wt_convert_stats_ascii(const char *path); extern const char *get_wt_convert_stats_ascii(const char *path);

View file

@ -50,6 +50,7 @@ int check_replace_refs = 1;
char *git_replace_ref_base; char *git_replace_ref_base;
enum eol core_eol = EOL_UNSET; enum eol core_eol = EOL_UNSET;
int global_conv_flags_eol = CONV_EOL_RNDTRP_WARN; int global_conv_flags_eol = CONV_EOL_RNDTRP_WARN;
char *check_roundtrip_encoding = "SHIFT-JIS";
unsigned whitespace_rule_cfg = WS_DEFAULT_RULE; unsigned whitespace_rule_cfg = WS_DEFAULT_RULE;
enum branch_track git_branch_track = BRANCH_TRACK_REMOTE; enum branch_track git_branch_track = BRANCH_TRACK_REMOTE;
enum rebase_setup_type autorebase = AUTOREBASE_NEVER; enum rebase_setup_type autorebase = AUTOREBASE_NEVER;

View file

@ -203,4 +203,43 @@ test_expect_success 'error if encoding garbage is already in Git' '
test_i18ngrep "error: BOM is required" err.out test_i18ngrep "error: BOM is required" err.out
' '
# Verify which encodings trigger the round trip check during "git add":
# the check announces itself with a "Checking roundtrip encoding for ..."
# trace message, so each step greps the GIT_TRACE output for that line.
test_expect_success 'check roundtrip encoding' '
test_when_finished "rm -f roundtrip.shift roundtrip.utf16" &&
test_when_finished "git reset --hard HEAD" &&
text="hallo there!\nroundtrip test here!" &&
printf "$text" | iconv -f UTF-8 -t SHIFT-JIS >roundtrip.shift &&
printf "$text" | iconv -f UTF-8 -t UTF-16 >roundtrip.utf16 &&
echo "*.shift text working-tree-encoding=SHIFT-JIS" >>.gitattributes &&
# SHIFT-JIS encoded files are round-trip checked by default...
GIT_TRACE=1 git add .gitattributes roundtrip.shift 2>&1 |
grep "Checking roundtrip encoding for SHIFT-JIS" &&
git reset &&
# ... unless we overwrite the Git config!
! GIT_TRACE=1 git -c core.checkRoundtripEncoding=garbage \
add .gitattributes roundtrip.shift 2>&1 |
grep "Checking roundtrip encoding for SHIFT-JIS" &&
git reset &&
# UTF-16 encoded files should not be round-trip checked by default...
! GIT_TRACE=1 git add roundtrip.utf16 2>&1 |
grep "Checking roundtrip encoding for UTF-16" &&
git reset &&
# ... unless we tell Git to check it!
# NOTE(review): the grep expects lowercase "utf-16"; presumably the
# attribute for *.utf16 is spelled that way earlier in this script.
GIT_TRACE=1 git -c core.checkRoundtripEncoding="UTF-16, UTF-32" \
add roundtrip.utf16 2>&1 |
grep "Checking roundtrip encoding for utf-16" &&
git reset &&
# ... and it is still checked when it is not the first list entry
# (here we also check that the casing of the encoding is irrelevant)
GIT_TRACE=1 git -c core.checkRoundtripEncoding="UTF-32, utf-16" \
add roundtrip.utf16 2>&1 |
grep "Checking roundtrip encoding for utf-16" &&
git reset
'
test_done test_done