json: fix high surrogate escapes

Surrogate escapes must add, not OR, 0x10000, as the specified bits go up to 0xfffff.
2024-09-30 05:05:27 +00:00 · 2024-05-05 14:42:16 +03:00 · 2024-05-05 14:42:16 +03:00 · 22ac5f85cc
parent eea18a8a8a
commit 22ac5f85cc
3 changed files with 205 additions and 5 deletions
--- a/spa/include/spa/utils/json.h
+++ b/spa/include/spa/utils/json.h
@ -594,7 +594,7 @@ static inline int spa_json_parse_stringn(const char *val, int len, char *result,
 						    v < 0xdc00 || v > 0xdfff)
 							continue;
 						p += 6;
-						cp = 0x010000 | ((cp & 0x3ff) << 10) | (v & 0x3ff);
+						cp = 0x010000 + (((cp & 0x3ff) << 10) | (v & 0x3ff));
 					} else if (cp >= 0xdc00 && cp <= 0xdfff)
 						continue;

--- a/test/data/test-spa-json.txt
+++ b/test/data/test-spa-json.txt
@ -1497,3 +1497,207 @@ true
 []
 >>>

+
+The following tests are generated by Python, and test unicode escape
+decoding further:
+
+import json, operator, functools
+vs = [1 << j for j in range(6, 21)]
+vs += [(1 << j) | 0xffff for j in range(6, 21)]
+vs += [functools.reduce(operator.or_, (1 << k for k in range(j))) for j in range(8, 21)]
+for v in sorted(set(vs)):
+    print(f"<<< y_string_{hex(v)}.json\n{json.dumps(chr(v))}\n===\n{json.dumps(chr(v), ensure_ascii=False)}\n>>>\n")
+
+
+<<< y_string_0x40.json
+"@"
+===
+"@"
+>>>
+
+<<< y_string_0x80.json
+"\u0080"
+===
+""
+>>>
+
+<<< y_string_0xff.json
+"\u00ff"
+===
+"ÿ"
+>>>
+
+<<< y_string_0x100.json
+"\u0100"
+===
+"Ā"
+>>>
+
+<<< y_string_0x1ff.json
+"\u01ff"
+===
+"ǿ"
+>>>
+
+<<< y_string_0x200.json
+"\u0200"
+===
+"Ȁ"
+>>>
+
+<<< y_string_0x3ff.json
+"\u03ff"
+===
+"Ͽ"
+>>>
+
+<<< y_string_0x400.json
+"\u0400"
+===
+"Ѐ"
+>>>
+
+<<< y_string_0x7ff.json
+"\u07ff"
+===
+"߿"
+>>>
+
+<<< y_string_0x800.json
+"\u0800"
+===
+"ࠀ"
+>>>
+
+<<< y_string_0xfff.json
+"\u0fff"
+===
+"࿿"
+>>>
+
+<<< y_string_0x1000.json
+"\u1000"
+===
+"က"
+>>>
+
+<<< y_string_0x1fff.json
+"\u1fff"
+===
+"῿"
+>>>
+
+<<< y_string_0x2000.json
+"\u2000"
+===
+" "
+>>>
+
+<<< y_string_0x3fff.json
+"\u3fff"
+===
+"㿿"
+>>>
+
+<<< y_string_0x4000.json
+"\u4000"
+===
+"䀀"
+>>>
+
+<<< y_string_0x7fff.json
+"\u7fff"
+===
+"翿"
+>>>
+
+<<< y_string_0x8000.json
+"\u8000"
+===
+"耀"
+>>>
+
+<<< y_string_0xffff.json
+"\uffff"
+===
+""
+>>>
+
+<<< y_string_0x10000.json
+"\ud800\udc00"
+===
+"𐀀"
+>>>
+
+<<< y_string_0x1ffff.json
+"\ud83f\udfff"
+===
+"🿿"
+>>>
+
+<<< y_string_0x20000.json
+"\ud840\udc00"
+===
+"𠀀"
+>>>
+
+<<< y_string_0x2ffff.json
+"\ud87f\udfff"
+===
+"𯿿"
+>>>
+
+<<< y_string_0x3ffff.json
+"\ud8bf\udfff"
+===
+"𿿿"
+>>>
+
+<<< y_string_0x40000.json
+"\ud8c0\udc00"
+===
+"񀀀"
+>>>
+
+<<< y_string_0x4ffff.json
+"\ud8ff\udfff"
+===
+"񏿿"
+>>>
+
+<<< y_string_0x7ffff.json
+"\ud9bf\udfff"
+===
+"񿿿"
+>>>
+
+<<< y_string_0x80000.json
+"\ud9c0\udc00"
+===
+"򀀀"
+>>>
+
+<<< y_string_0x8ffff.json
+"\ud9ff\udfff"
+===
+"򏿿"
+>>>
+
+<<< y_string_0xfffff.json
+"\udbbf\udfff"
+===
+"󿿿"
+>>>
+
+<<< y_string_0x100000.json
+"\udbc0\udc00"
+===
+"􀀀"
+>>>
+
+<<< y_string_0x10ffff.json
+"\udbff\udfff"
+===
+"􏿿"
+>>>
+
--- a/test/test-spa-json.c
+++ b/test/test-spa-json.c
@ -989,10 +989,6 @@ PWTEST(json_data)
 		/* spa_json_parse_string API doesn't do \0 */
 		"y_object_escaped_null_in_key.json",
 		"y_string_null_escape.json",
-
-		/* XXX: something with surrogate handling? */
-		"y_string_last_surrogates_1_and_2.json",
-		"y_string_unicode_U+10FFFE_nonchar.json",
 	};

 	const char *basedir = getenv("PWTEST_DATA_DIR");