36415: remap bytes from invalid multibyte characters.

These now go to 0xdc00 + index. If wchar_t is a Unicode code point, this is by construction an invalid character within the Unicode range. If it isn't, we would hope the result was no worse than the current fudge.
2024-10-06 16:09:31 +00:00 · 2015-09-04 10:07:51 +01:00 · 2015-09-04 10:07:51 +01:00 · f52795ea3e
parent 32f5d3d8c1
commit f52795ea3e
3 changed files with 47 additions and 6 deletions
--- a/8
+++ b/8
@ -1,3 +1,11 @@
+2015-09-04  Peter Stephenson  <p.stephenson@samsung.com>
+
+	* 36415: Src/pattern.c, Test/D07multibyte.ztst: remap bytes from
+	invalid multibyte characters to 0xDC00 + index which is invalid
+	in Unicode.  Strictly this only works if wchar_t is
+	ISO-10646-compliant, however it ought to be at least as good as
+	the current fudge in any case.
+
 2015-09-03  Peter Stephenson  <p.stephenson@samsung.com>

 	* 36416: Src/Zle/zle_refresh.c, Src/Zle/zle_utils.c: If
--- a/Src/pattern.c
+++ b/Src/pattern.c
@ -224,6 +224,22 @@ typedef zlong zrange_t;
 typedef unsigned long zrange_t;
 #endif

+#ifdef MULTIBYTE_SUPPORT
+/*
+ * Handle a byte that's not part of a valid character by mapping it
+ * to the wide character 0xDC00 + STOUC(ch).  The argument is expanded
+ * exactly once, so passing an expression with side effects is safe.
+ *
+ * This range in Unicode is recommended for purposes of this kind
+ * as it corresponds to invalid characters.  Note that this strictly
+ * only works if wchar_t represents Unicode code points, which isn't
+ * necessarily true; however, converting an invalid character into
+ * an unknown format is a bit tricky...
+ */
+#define WCHAR_INVALID(ch)			\
+    ((wchar_t) (0xDC00 + STOUC(ch)))
+#endif /* MULTIBYTE_SUPPORT */
+
 /*
 * Array of characters corresponding to zpc_chars enum, which it must match.
 */
@ -353,10 +369,10 @@ metacharinc(char **x)
 	return wc;
    }

-    /* Error.  Treat as single byte. */
+    /* Error. */
    /* Reset the shift state for next time. */
    memset(&shiftstate, 0, sizeof(shiftstate));
-    return (wchar_t) STOUC(*(*x)++);
+    return WCHAR_INVALID(*(*x)++);
 }

 #else
@ -1867,10 +1883,10 @@ charref(char *x, char *y)
    ret = mbrtowc(&wc, x, y-x, &shiftstate);

    if (ret == MB_INVALID || ret == MB_INCOMPLETE) {
-	/* Error.  Treat as single byte. */
+	/* Error. */
 	/* Reset the shift state for next time. */
 	memset(&shiftstate, 0, sizeof(shiftstate));
-	return (wchar_t) STOUC(*x);
+	return WCHAR_INVALID(*x);
    }

    return wc;
@ -1913,7 +1929,7 @@ charrefinc(char **x, char *y, int *z)
    size_t ret;

    if (!(patglobflags & GF_MULTIBYTE) || !(STOUC(**x) & 0x80))
 	return (wchar_t) STOUC(*(*x)++);

    ret = mbrtowc(&wc, *x, y-*x, &shiftstate);

@ -1922,7 +1938,7 @@ charrefinc(char **x, char *y, int *z)
 	*z = 1;
 	/* Reset the shift state for next time. */
 	memset(&shiftstate, 0, sizeof(shiftstate));
-	return (wchar_t) STOUC(*(*x)++);
+	return WCHAR_INVALID(*(*x)++);
    }

    /* Nulls here are normal characters */
--- a/Test/D07multibyte.ztst
+++ b/Test/D07multibyte.ztst
@ -508,3 +508,20 @@
     cd ..
  }
 0:cd with special characters
+
+  test_array=(
+  '[[ \xcc = \xcc ]]'
+  '[[ \xcc != \xcd ]]'
+  '[[ \xcc != \ucc ]]'
+  '[[ \ucc = \ucc ]]'
+  '[[ \ucc = [\ucc] ]]'
+  '[[ \xcc != [\ucc] ]]'
+  # Not clear how useful the following is...
+  '[[ \xcc = [\xcc] ]]'
+  )
+  for test in $test_array; do
+    if ! eval ${(g::)test} ; then
+      print -rl "Test $test failed" >&2
+    fi
+  done
+0:Invalid characters in pattern matching