json: fix high surrogate escapes

Surrogate escapes must add, not OR, 0x10000, as the specified bits go up
to 0xfffff.
This commit is contained in:
Pauli Virtanen 2024-05-05 14:42:16 +03:00
parent eea18a8a8a
commit 22ac5f85cc
3 changed files with 205 additions and 5 deletions

View file

@ -594,7 +594,7 @@ static inline int spa_json_parse_stringn(const char *val, int len, char *result,
v < 0xdc00 || v > 0xdfff)
continue;
p += 6;
cp = 0x010000 | ((cp & 0x3ff) << 10) | (v & 0x3ff);
cp = 0x010000 + (((cp & 0x3ff) << 10) | (v & 0x3ff));
} else if (cp >= 0xdc00 && cp <= 0xdfff)
continue;

View file

@ -1497,3 +1497,207 @@ true
[]
>>>
The following tests are generated by Python, and test unicode escape
decoding further:
import json, operator, functools
# Code points to cover: single-bit values 2**6 .. 2**20 ...
vs = [1 << j for j in range(6, 21)]
# ... the same bits with the low 16 bits all set ...
vs += [(1 << j) | 0xffff for j in range(6, 21)]
# ... and all-ones values 8 to 20 bits wide (0xff .. 0xfffff).
vs += [functools.reduce(operator.or_, (1 << k for k in range(j))) for j in range(8, 21)]
# One test block per distinct code point: the input is the ASCII-only \uXXXX
# escaped form, the expected output is the literal character.
for v in sorted(set(vs)):
    print(f"<<< y_string_{hex(v)}.json\n{json.dumps(chr(v))}\n===\n{json.dumps(chr(v), ensure_ascii=False)}\n>>>\n")
<<< y_string_0x40.json
"@"
===
"@"
>>>
<<< y_string_0x80.json
"\u0080"
===
"€"
>>>
<<< y_string_0xff.json
"\u00ff"
===
"ÿ"
>>>
<<< y_string_0x100.json
"\u0100"
===
"Ā"
>>>
<<< y_string_0x1ff.json
"\u01ff"
===
"ǿ"
>>>
<<< y_string_0x200.json
"\u0200"
===
"Ȁ"
>>>
<<< y_string_0x3ff.json
"\u03ff"
===
"Ͽ"
>>>
<<< y_string_0x400.json
"\u0400"
===
"Ѐ"
>>>
<<< y_string_0x7ff.json
"\u07ff"
===
"߿"
>>>
<<< y_string_0x800.json
"\u0800"
===
"ࠀ"
>>>
<<< y_string_0xfff.json
"\u0fff"
===
"࿿"
>>>
<<< y_string_0x1000.json
"\u1000"
===
"က"
>>>
<<< y_string_0x1fff.json
"\u1fff"
===
"῿"
>>>
<<< y_string_0x2000.json
"\u2000"
===
" "
>>>
<<< y_string_0x3fff.json
"\u3fff"
===
"㿿"
>>>
<<< y_string_0x4000.json
"\u4000"
===
"䀀"
>>>
<<< y_string_0x7fff.json
"\u7fff"
===
"翿"
>>>
<<< y_string_0x8000.json
"\u8000"
===
"耀"
>>>
<<< y_string_0xffff.json
"\uffff"
===
"￿"
>>>
<<< y_string_0x10000.json
"\ud800\udc00"
===
"𐀀"
>>>
<<< y_string_0x1ffff.json
"\ud83f\udfff"
===
"🿿"
>>>
<<< y_string_0x20000.json
"\ud840\udc00"
===
"𠀀"
>>>
<<< y_string_0x2ffff.json
"\ud87f\udfff"
===
"𯿿"
>>>
<<< y_string_0x3ffff.json
"\ud8bf\udfff"
===
"𿿿"
>>>
<<< y_string_0x40000.json
"\ud8c0\udc00"
===
"񀀀"
>>>
<<< y_string_0x4ffff.json
"\ud8ff\udfff"
===
"񏿿"
>>>
<<< y_string_0x7ffff.json
"\ud9bf\udfff"
===
"񿿿"
>>>
<<< y_string_0x80000.json
"\ud9c0\udc00"
===
"򀀀"
>>>
<<< y_string_0x8ffff.json
"\ud9ff\udfff"
===
"򏿿"
>>>
<<< y_string_0xfffff.json
"\udbbf\udfff"
===
"󿿿"
>>>
<<< y_string_0x100000.json
"\udbc0\udc00"
===
"􀀀"
>>>
<<< y_string_0x10ffff.json
"\udbff\udfff"
===
"􏿿"
>>>

View file

@ -989,10 +989,6 @@ PWTEST(json_data)
/* spa_json_parse_string API doesn't do \0 */
"y_object_escaped_null_in_key.json",
"y_string_null_escape.json",
/* XXX: something with surrogate handling? */
"y_string_last_surrogates_1_and_2.json",
"y_string_unicode_U+10FFFE_nonchar.json",
};
const char *basedir = getenv("PWTEST_DATA_DIR");