awk: Merge upstream 2nd Edition Awk Book

Merge in the November 2nd, 2023 version of one true awk. This brings in
Unicode support, CSV support and a number of bug fixes.

Sponsored by: Netflix
Reviewed by: delphij
Differential Revision: https://reviews.freebsd.org/D42447
2024-10-01 14:14:56 +00:00 · 2023-11-02 10:32:01 -06:00 · 2023-11-02 10:32:01 -06:00 · 3fd60a6b73
parent 194df014fe 2e406c584f
commit 3fd60a6b73
38 changed files with 3178 additions and 1615 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -5,3 +5,4 @@
 *.py   diff=python
 . svn-properties=svn:keywords=tools/build/options/WITHOUT_LOADER_ZFS
 .clang-format svn-properties=svn:keywords=FreeBSD=%H
+contrib/one-true-awk/bugs-fixed/unicode-null-match.bad binary
--- a/5
+++ b/5
@ -27,6 +27,11 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 15.x IS SLOW:
 	world, or to merely disable the most expensive debugging functionality
 	at runtime, run "ln -s 'abort:false,junk:false' /etc/malloc.conf".)

+20231114:
+	One True Awk updated to the Second Edition as of 20231102 (254b979f32df).
+	Notable features include UTF-8 support and --csv to support comma
+	separated data.
+
 20231113:
 	The WITHOUT_LLD_IS_LD option has been removed.  When LLD is enabled
 	it is always installed as /usr/bin/ld.
--- a/contrib/one-true-awk/FIXES
+++ b/contrib/one-true-awk/FIXES
--- a/contrib/one-true-awk/FIXES.1e
+++ b/contrib/one-true-awk/FIXES.1e
--- a/contrib/one-true-awk/README.md
+++ b/contrib/one-true-awk/README.md
@ -1,8 +1,38 @@
 # The One True Awk

 This is the version of `awk` described in _The AWK Programming Language_,
-by Al Aho, Brian Kernighan, and Peter Weinberger
-(Addison-Wesley, 1988, ISBN 0-201-07981-X).
+Second Edition, by Al Aho, Brian Kernighan, and Peter Weinberger
+(Addison-Wesley, 2024, ISBN-13 978-0138269722, ISBN-10 0138269726).
+
+## What's New? ##
+
+This version of Awk handles UTF-8 and comma-separated values (CSV) input.
+
+### Strings ###
+
+Functions that process strings now count Unicode code points, not bytes;
+this affects `length`, `substr`, `index`, `match`, `split`,
+`sub`, `gsub`, and others.  Note that code
+points are not necessarily characters.
+
+UTF-8 sequences may appear in literal strings and regular expressions.
+Arbitrary characters may be included with `\u` followed by 1 to 8 hexadecimal digits.
+
+### Regular expressions ###
+
+Regular expressions may include UTF-8 code points, including `\u`.
+Character classes are likely to be limited to about 256 characters
+when expanded.
+
+### CSV ###
+
+The option `--csv` turns on CSV processing of input:
+fields are separated by commas, fields may be quoted with
+double-quote (`"`) characters, quoted fields may contain embedded newlines.
+In CSV mode, `FS` is ignored.
+
+If no explicit separator argument is provided,
+field-splitting in `split` is determined by CSV mode.

 ## Copyright

@ -35,7 +65,7 @@ in `FIXES`.  If you distribute this code further, please please please
 distribute `FIXES` with it.

 If you find errors, please report them
-to bwk@cs.princeton.edu.
+to the current maintainer, ozan.yigit@gmail.com.
 Please _also_ open an issue in the GitHub issue tracker, to make
 it easy to track issues.
 Thanks.
@ -67,22 +97,22 @@ The program itself is created by

 which should produce a sequence of messages roughly like this:

-	yacc -d awkgram.y
-	conflicts: 43 shift/reduce, 85 reduce/reduce
-	mv y.tab.c ytab.c
-	mv y.tab.h ytab.h
-	cc -c ytab.c
-	cc -c b.c
-	cc -c main.c
-	cc -c parse.c
-	cc maketab.c -o maketab
-	./maketab >proctab.c
-	cc -c proctab.c
-	cc -c tran.c
-	cc -c lib.c
-	cc -c run.c
-	cc -c lex.c
-	cc ytab.o b.o main.o parse.o proctab.o tran.o lib.o run.o lex.o -lm
+	bison -d  awkgram.y
+	awkgram.y: warning: 44 shift/reduce conflicts [-Wconflicts-sr]
+	awkgram.y: warning: 85 reduce/reduce conflicts [-Wconflicts-rr]
+	awkgram.y: note: rerun with option '-Wcounterexamples' to generate conflict counterexamples
+	gcc -g -Wall -pedantic -Wcast-qual   -O2   -c -o awkgram.tab.o awkgram.tab.c
+	gcc -g -Wall -pedantic -Wcast-qual   -O2   -c -o b.o b.c
+	gcc -g -Wall -pedantic -Wcast-qual   -O2   -c -o main.o main.c
+	gcc -g -Wall -pedantic -Wcast-qual   -O2   -c -o parse.o parse.c
+	gcc -g -Wall -pedantic -Wcast-qual -O2 maketab.c -o maketab
+	./maketab awkgram.tab.h >proctab.c
+	gcc -g -Wall -pedantic -Wcast-qual   -O2   -c -o proctab.o proctab.c
+	gcc -g -Wall -pedantic -Wcast-qual   -O2   -c -o tran.o tran.c
+	gcc -g -Wall -pedantic -Wcast-qual   -O2   -c -o lib.o lib.c
+	gcc -g -Wall -pedantic -Wcast-qual   -O2   -c -o run.o run.c
+	gcc -g -Wall -pedantic -Wcast-qual   -O2   -c -o lex.o lex.c
+	gcc -g -Wall -pedantic -Wcast-qual   -O2 awkgram.tab.o b.o main.o parse.o proctab.o tran.o lib.o run.o lex.o   -lm

 This produces an executable `a.out`; you will eventually want to
 move this to some place like `/usr/bin/awk`.
@ -90,7 +120,7 @@ move this to some place like `/usr/bin/awk`.
 If your system does not have `yacc` or `bison` (the GNU
 equivalent), you need to install one of them first.

-NOTE: This version uses ANSI C (C 99), as you should also.  We have
+NOTE: This version uses ISO/IEC C99, as you should also.  We have
 compiled this without any changes using `gcc -Wall` and/or local C
 compilers on a variety of systems, but new systems or compilers
 may raise some new complaint; reports of difficulties are
@ -102,14 +132,9 @@ the standard developer tools.
 You can also use `make CC=g++` to build with the GNU C++ compiler,
 should you choose to do so.

-The version of `malloc` that comes with some systems is sometimes
-astonishly slow.  If `awk` seems slow, you might try fixing that.
-More generally, turning on optimization can significantly improve
-`awk`'s speed, perhaps by 1/3 for highest levels.
-
 ## A Note About Releases

-We don't do releases. 
+We don't usually do releases.

 ## A Note About Maintenance

@ -120,4 +145,4 @@ is not at the top of our priority list.

 #### Last Updated

-Sat Jul 25 14:00:07 EDT 2021
+Sun 15 Oct 2023 06:28:36 IDT
--- a/contrib/one-true-awk/awk.1
+++ b/contrib/one-true-awk/awk.1
@ -20,6 +20,8 @@ awk \- pattern-directed scanning and processing language
 [
 .BI \-F
 .I fs
+|
+.B \-\^\-csv
 ]
 [
 .BI \-v
@ -76,6 +78,12 @@ The
 .I fs
 option defines the input field separator to be the regular expression
 .IR fs .
+The
+.B \-\^\-csv
+option causes
+.I awk
+to process records using (more or less) standard comma-separated values
+(CSV) format.
 .PP
 An input line is normally made up of fields separated by white space,
 or by the regular expression
@ -202,9 +210,9 @@ and
 .B sqrt
 are built in.
 Other built-in functions:
-.TF length
+.TF "\fBlength(\fR[\fIv\^\fR]\fB)\fR"
 .TP
-.B length
+\fBlength(\fR[\fIv\^\fR]\fB)\fR
 the length of its argument
 taken as a string,
 number of elements in an array for an array argument,
@ -212,15 +220,15 @@ or length of
 .B $0
 if no argument.
 .TP
-.B rand
+.B rand()
 random number on [0,1).
 .TP
-.B srand
+\fBsrand(\fR[\fIs\^\fR]\fB)\fR
 sets seed for
 .B rand
 and returns the previous seed.
 .TP
-.B int
+.BI int( x\^ )
 truncates to an integer value.
 .TP
 \fBsubstr(\fIs\fB, \fIm\fR [\fB, \fIn\^\fR]\fB)\fR
@ -441,7 +449,7 @@ in a pattern.
 A pattern may consist of two patterns separated by a comma;
 in this case, the action is performed for all lines
 from an occurrence of the first pattern
-though an occurrence of the second.
+through an occurrence of the second, inclusive.
 .PP
 A relational expression is one of the following:
 .IP
@ -451,7 +459,7 @@ A relational expression is one of the following:
 .br
 .IB expression " in " array-name
 .br
-.BI ( expr , expr,... ") in " array-name
+.BI ( expr ,\| expr ,\| ... ") in " array-name
 .PP
 where a
 .I relop
@ -551,7 +559,7 @@ separates multiple subscripts (default 034).
 Functions may be defined (at the position of a pattern-action statement) thus:
 .IP
 .B
-function foo(a, b, c) { ...; return x }
+function foo(a, b, c) { ... }
 .PP
 Parameters are passed by value if scalar and by reference if array name;
 functions may be called recursively.
@ -617,8 +625,8 @@ BEGIN	{	# Simulate echo(1)
 .IR sed (1)
 .br
 A. V. Aho, B. W. Kernighan, P. J. Weinberger,
-.IR "The AWK Programming Language" ,
-Addison-Wesley, 1988.  ISBN 0-201-07981-X.
+.IR "The AWK Programming Language, Second Edition" ,
+Addison-Wesley, 2024.  ISBN 978-0-13-826972-2, 0-13-826972-6.
 .SH BUGS
 There are no explicit conversions between numbers and strings.
 To force an expression to be treated as a number add 0 to it;
@ -628,7 +636,8 @@ to force it to be treated as a string concatenate
 The scope rules for variables in functions are a botch;
 the syntax is worse.
 .PP
-Only eight-bit characters sets are handled correctly.
+Input is expected to be UTF-8 encoded. Other multibyte
+character sets are not handled.
 .SH UNUSUAL FLOATING-POINT VALUES
 .I Awk
 was designed before IEEE 754 arithmetic defined Not-A-Number (NaN)
--- a/contrib/one-true-awk/awk.h
+++ b/contrib/one-true-awk/awk.h
@ -37,7 +37,7 @@ typedef double	Awkfloat;

 typedef	unsigned char uschar;

-#define	xfree(a)	{ if ((a) != NULL) { free((void *)(intptr_t)(a)); (a) = NULL; } }
+#define	xfree(a)	{ free((void *)(intptr_t)(a)); (a) = NULL; }
 /*
 * We sometimes cheat writing read-only pointers to NUL-terminate them
 * and then put back the original value
@ -64,6 +64,8 @@ extern bool	safe;		/* false => unsafe, true => safe */
 #define	RECSIZE	(8 * 1024)	/* sets limit on records, fields, etc., etc. */
 extern int	recsize;	/* size of current record, orig RECSIZE */

+extern size_t	awk_mb_cur_max;	/* max size of a multi-byte character */
+
 extern char	EMPTY[];	/* this avoid -Wwritable-strings issues */
 extern char	**FS;
 extern char	**RS;
@ -78,6 +80,8 @@ extern char	**SUBSEP;
 extern Awkfloat *RSTART;
 extern Awkfloat *RLENGTH;

+extern bool	CSV;		/* true for csv input */
+
 extern char	*record;	/* points to $0 */
 extern int	lineno;		/* line number in awk program */
 extern int	errorflag;	/* 1 if error has occurred */
@ -233,7 +237,8 @@ extern	int	pairstack[], paircnt;

 /* structures used by regular expression matching machinery, mostly b.c: */

-#define NCHARS	(256+3)		/* 256 handles 8-bit chars; 128 does 7-bit */
+#define NCHARS	(1256+3)		/* 256 handles 8-bit chars; 128 does 7-bit */
+				/* BUG: some overflows (caught) if we use 256 */
 				/* watch out in match(), etc. */
 #define	HAT	(NCHARS+2)	/* matches ^ in regular expr */
 #define NSTATES	32
@ -244,12 +249,19 @@ typedef struct rrow {
 		int i;
 		Node *np;
 		uschar *up;
+		int *rp; /* rune representation of char class */
 	} lval;		/* because Al stores a pointer in it! */
 	int	*lfollow;
 } rrow;

+typedef struct gtt { /* gototab entry: one (character, next state) transition */
+	unsigned int ch;	/* character this entry is keyed on; get_gototab()/set_gototab() treat 0 as an unused slot */
+	unsigned int state;	/* state returned by get_gototab() for ch */
+} gtt;
+
 typedef struct fa {
-	unsigned int	**gototab;
+	gtt	**gototab;
+	int	gototab_len;
 	uschar	*out;
 	uschar	*restr;
 	int	**posns;
--- a/contrib/one-true-awk/awkgram.y
+++ b/contrib/one-true-awk/awkgram.y
@ -204,11 +204,12 @@ ppattern:
 		{ $$ = op2(BOR, notnull($1), notnull($3)); }
 	| ppattern and ppattern %prec AND
 		{ $$ = op2(AND, notnull($1), notnull($3)); }
-	| ppattern MATCHOP reg_expr	{ $$ = op3($2, NIL, $1, (Node*)makedfa($3, 0)); }
+	| ppattern MATCHOP reg_expr	{ $$ = op3($2, NIL, $1, (Node*)makedfa($3, 0)); free($3); }
 	| ppattern MATCHOP ppattern
-		{ if (constnode($3))
+		{ if (constnode($3)) {
 			$$ = op3($2, NIL, $1, (Node*)makedfa(strnode($3), 0));
-		  else
+			free($3);
+		  } else
 			$$ = op3($2, (Node *)1, $1, $3); }
 	| ppattern IN varname		{ $$ = op2(INTEST, $1, makearr($3)); }
 	| '(' plist ')' IN varname	{ $$ = op2(INTEST, $2, makearr($5)); }
@ -231,11 +232,12 @@ pattern:
 	| pattern LE pattern		{ $$ = op2($2, $1, $3); }
 	| pattern LT pattern		{ $$ = op2($2, $1, $3); }
 	| pattern NE pattern		{ $$ = op2($2, $1, $3); }
-	| pattern MATCHOP reg_expr	{ $$ = op3($2, NIL, $1, (Node*)makedfa($3, 0)); }
+	| pattern MATCHOP reg_expr	{ $$ = op3($2, NIL, $1, (Node*)makedfa($3, 0)); free($3); }
 	| pattern MATCHOP pattern
-		{ if (constnode($3))
+		{ if (constnode($3)) {
 			$$ = op3($2, NIL, $1, (Node*)makedfa(strnode($3), 0));
-		  else
+			free($3);
+		  } else
 			$$ = op3($2, (Node *)1, $1, $3); }
 	| pattern IN varname		{ $$ = op2(INTEST, $1, makearr($3)); }
 	| '(' plist ')' IN varname	{ $$ = op2(INTEST, $2, makearr($5)); }
@ -280,7 +282,7 @@ rbrace:

 re:
 	   reg_expr
-		{ $$ = op3(MATCH, NIL, rectonode(), (Node*)makedfa($1, 0)); }
+		{ $$ = op3(MATCH, NIL, rectonode(), (Node*)makedfa($1, 0)); free($1); }
 	| NOT re	{ $$ = op1(NOT, notnull($2)); }
 	;

@ -378,17 +380,19 @@ term:
 	| GENSUB '(' reg_expr comma pattern comma pattern ')'
 		{ $$ = op5(GENSUB, NIL, (Node*)makedfa($3, 1), $5, $7, rectonode()); }
 	| GENSUB '(' pattern comma pattern comma pattern ')'
-		{ if (constnode($3))
+		{ if (constnode($3)) {
 			$$ = op5(GENSUB, NIL, (Node *)makedfa(strnode($3), 1), $5, $7, rectonode());
-		  else
+			free($3);
+		  } else
 			$$ = op5(GENSUB, (Node *)1, $3, $5, $7, rectonode());
 		}
 	| GENSUB '(' reg_expr comma pattern comma pattern comma pattern ')'
 		{ $$ = op5(GENSUB, NIL, (Node*)makedfa($3, 1), $5, $7, $9); }
 	| GENSUB '(' pattern comma pattern comma pattern comma pattern ')'
-		{ if (constnode($3))
+		{ if (constnode($3)) {
 			$$ = op5(GENSUB, NIL, (Node *)makedfa(strnode($3),1), $5,$7,$9);
-		  else
+			free($3);
+		  } else
 			$$ = op5(GENSUB, (Node *)1, $3, $5, $7, $9);
 		}
 	| GETLINE var LT term		{ $$ = op3(GETLINE, $2, itonp($3), $4); }
@ -402,34 +406,37 @@ term:
 		  $$ = op2(INDEX, $3, (Node*)$5); }
 	| '(' pattern ')'		{ $$ = $2; }
 	| MATCHFCN '(' pattern comma reg_expr ')'
-		{ $$ = op3(MATCHFCN, NIL, $3, (Node*)makedfa($5, 1)); }
+		{ $$ = op3(MATCHFCN, NIL, $3, (Node*)makedfa($5, 1)); free($5); }
 	| MATCHFCN '(' pattern comma pattern ')'
-		{ if (constnode($5))
+		{ if (constnode($5)) {
 			$$ = op3(MATCHFCN, NIL, $3, (Node*)makedfa(strnode($5), 1));
-		  else
+			free($5);
+		  } else
 			$$ = op3(MATCHFCN, (Node *)1, $3, $5); }
 	| NUMBER			{ $$ = celltonode($1, CCON); }
 	| SPLIT '(' pattern comma varname comma pattern ')'     /* string */
 		{ $$ = op4(SPLIT, $3, makearr($5), $7, (Node*)STRING); }
 	| SPLIT '(' pattern comma varname comma reg_expr ')'    /* const /regexp/ */
-		{ $$ = op4(SPLIT, $3, makearr($5), (Node*)makedfa($7, 1), (Node *)REGEXPR); }
+		{ $$ = op4(SPLIT, $3, makearr($5), (Node*)makedfa($7, 1), (Node *)REGEXPR); free($7); }
 	| SPLIT '(' pattern comma varname ')'
 		{ $$ = op4(SPLIT, $3, makearr($5), NIL, (Node*)STRING); }  /* default */
 	| SPRINTF '(' patlist ')'	{ $$ = op1($1, $3); }
 	| string	 		{ $$ = celltonode($1, CCON); }
 	| subop '(' reg_expr comma pattern ')'
-		{ $$ = op4($1, NIL, (Node*)makedfa($3, 1), $5, rectonode()); }
+		{ $$ = op4($1, NIL, (Node*)makedfa($3, 1), $5, rectonode()); free($3); }
 	| subop '(' pattern comma pattern ')'
-		{ if (constnode($3))
+		{ if (constnode($3)) {
 			$$ = op4($1, NIL, (Node*)makedfa(strnode($3), 1), $5, rectonode());
-		  else
+			free($3);
+		  } else
 			$$ = op4($1, (Node *)1, $3, $5, rectonode()); }
 	| subop '(' reg_expr comma pattern comma var ')'
-		{ $$ = op4($1, NIL, (Node*)makedfa($3, 1), $5, $7); }
+		{ $$ = op4($1, NIL, (Node*)makedfa($3, 1), $5, $7); free($3); }
 	| subop '(' pattern comma pattern comma var ')'
-		{ if (constnode($3))
+		{ if (constnode($3)) {
 			$$ = op4($1, NIL, (Node*)makedfa(strnode($3), 1), $5, $7);
-		  else
+			free($3);
+		  } else
 			$$ = op4($1, (Node *)1, $3, $5, $7); }
 	| SUBSTR '(' pattern comma pattern comma pattern ')'
 		{ $$ = op3(SUBSTR, $3, $5, $7); }
--- a/contrib/one-true-awk/b.c
+++ b/contrib/one-true-awk/b.c
@ -80,6 +80,43 @@ int	patlen;
 fa	*fatab[NFA];
 int	nfatab	= 0;	/* entries in fatab */

+extern int u8_nextlen(const char *s);
+
+
+/* utf-8 mechanism:
+
+   For most of Awk, utf-8 strings just "work", since they look like
+   null-terminated sequences of 8-bit bytes.
+
+   Functions like length(), index(), and substr() have to operate
+   in units of utf-8 characters.  The u8_* functions in run.c
+   handle this.
+
+   Regular expressions are more complicated, since the basic
+   mechanism of the goto table used 8-bit byte indices into the
+   gototab entries to compute the next state.  Unicode is a lot
+   bigger, so the gototab entries are now structs with a character
+   and a next state, and there is a linear search of the characters
+   to find the state.  (Yes, this is slower, by a significant
+   amount.  Tough.)
+
+   Throughout the RE mechanism in b.c, utf-8 characters are
+   converted to their utf-32 value.  This mostly shows up in
+   cclenter, which expands character class ranges like a-z and now
+   alpha-omega.  The size of a gototab array is still about 256.
+   This should be dynamic, but for now things work ok for a single
+   code page of Unicode, which is the most likely case.
+
+   The code changes are localized in run.c and b.c.  I have added a
+   handful of functions to somewhat better hide the implementation,
+   but a lot more could be done.
+
+ */
+
+static int get_gototab(fa*, int, int);
+static int set_gototab(fa*, int, int, int);
+extern int u8_rune(int *, const uschar *);
+
 static int *
 intalloc(size_t n, const char *f)
 {
@ -105,7 +142,7 @@ resizesetvec(const char *f)
 static void
 resize_state(fa *f, int state)
 {
-	unsigned int **p;
+	gtt **p;
 	uschar *p2;
 	int **p3;
 	int i, new_count;
@ -115,7 +152,7 @@ resize_state(fa *f, int state)

 	new_count = state + 10; /* needs to be tuned */

-	p = (unsigned int **) realloc(f->gototab, new_count * sizeof(f->gototab[0]));
+	p = (gtt **) realloc(f->gototab, new_count * sizeof(f->gototab[0]));
 	if (p == NULL)
 		goto out;
 	f->gototab = p;
@ -131,12 +168,13 @@ resize_state(fa *f, int state)
 	f->posns = p3;

 	for (i = f->state_count; i < new_count; ++i) {
-		f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab));
+		f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab));
 		if (f->gototab[i] == NULL)
 			goto out;
 		f->out[i]  = 0;
 		f->posns[i] = NULL;
 	}
+	f->gototab_len = NCHARS; /* should be variable, growable */
 	f->state_count = new_count;
 	return;
 out:
@ -231,7 +269,7 @@ int makeinit(fa *f, bool anchor)
 	if ((f->posns[2])[1] == f->accept)
 		f->out[2] = 1;
 	for (i = 0; i < NCHARS; i++)
-		f->gototab[2][i] = 0;
+		set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */
 	f->curstat = cgoto(f, 2, HAT);
 	if (anchor) {
 		*f->posns[2] = k-1;	/* leave out position 0 */
@ -300,13 +338,13 @@ void freetr(Node *p)	/* free parse tree */
 /* in the parsing of regular expressions, metacharacters like . have */
 /* to be seen literally;  \056 is not a metacharacter. */

-int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */
+int hexstr(const uschar **pp, int max)	/* find and eval hex string at pp, return new p */
 {			/* only pick up one 8-bit byte (2 chars) */
 	const uschar *p;
 	int n = 0;
 	int i;

-	for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
+	for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
 		if (isdigit(*p))
 			n = 16 * n + *p - '0';
 		else if (*p >= 'a' && *p <= 'f')
@ -318,6 +356,8 @@ int hexstr(const uschar **pp)	/* find and eval hex string at pp, return new p */
 	return n;
 }

+
+
 #define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */

 int quoted(const uschar **pp)	/* pick up next thing after a \\ */
@ -326,24 +366,28 @@ int quoted(const uschar **pp)	/* pick up next thing after a \\ */
 	const uschar *p = *pp;
 	int c;

-	if ((c = *p++) == 't')
+/* BUG: should advance by utf-8 char even if makes no sense */
+
+	if ((c = *p++) == 't') {
 		c = '\t';
-	else if (c == 'n')
+	} else if (c == 'n') {
 		c = '\n';
-	else if (c == 'f')
+	} else if (c == 'f') {
 		c = '\f';
-	else if (c == 'r')
+	} else if (c == 'r') {
 		c = '\r';
-	else if (c == 'b')
+	} else if (c == 'b') {
 		c = '\b';
-	else if (c == 'v')
+	} else if (c == 'v') {
 		c = '\v';
-	else if (c == 'a')
+	} else if (c == 'a') {
 		c = '\a';
-	else if (c == '\\')
+	} else if (c == '\\') {
 		c = '\\';
-	else if (c == 'x') {	/* hexadecimal goo follows */
-		c = hexstr(&p);	/* this adds a null if number is invalid */
+	} else if (c == 'x') {	/* 2 hex digits follow */
+		c = hexstr(&p, 2);	/* this adds a null if number is invalid */
+	} else if (c == 'u') {	/* unicode char number up to 8 hex digits */
+		c = hexstr(&p, 8);
 	} else if (isoctdigit(c)) {	/* \d \dd \ddd */
 		int n = c - '0';
 		if (isoctdigit(*p)) {
@ -358,50 +402,67 @@ int quoted(const uschar **pp)	/* pick up next thing after a \\ */
 	return c;
 }

-char *cclenter(const char *argp)	/* add a character class */
+int *cclenter(const char *argp)	/* add a character class: decode argp with
+	 * u8_rune(), enumerate ranges such as a-z, and return the members as a
+	 * calloc()ed array of ints terminated by 0 */
 {
 	int i, c, c2;
-	const uschar *op, *p = (const uschar *) argp;
-	uschar *bp;
-	static uschar *buf = NULL;
+	int n;
+	const uschar *p = (const uschar *) argp;
+	int *bp, *retp;
+	static int *buf = NULL;	/* scratch buffer, kept (and grown) across calls */
 	static int bufsz = 100;

-	op = p;
-	if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
+	if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
 		FATAL("out of space for character class [%.10s...] 1", p);
 	bp = buf;
-	for (i = 0; (c = *p++) != 0; ) {
+	for (i = 0; *p != 0; ) {
+		n = u8_rune(&c, p);
+		p += n;
 		if (c == '\\') {
 			c = quoted(&p);
 		} else if (c == '-' && i > 0 && bp[-1] != 0) {
 			if (*p != 0) {
 				c = bp[-1];
-				c2 = *p++;
+				/* c2 = *p++; */
+				n = u8_rune(&c2, p);
+				p += n;
 				if (c2 == '\\')
-					c2 = quoted(&p);
+					c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
 				if (c > c2) {	/* empty; ignore */
 					bp--;
 					i--;
 					continue;
 				}
 				while (c < c2) {
-					if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
-						FATAL("out of space for character class [%.10s...] 2", p);
+					if (i + 1 >= bufsz) {	/* always keep one slot free for the final 0 */
+						bufsz *= 2;
+						buf = (int *) realloc(buf, bufsz * sizeof(int));
+						if (buf == NULL)
+							FATAL("out of space for character class [%.10s...] 2", p);
+						bp = buf + i;	/* buf may have moved */
+					}
 					*bp++ = ++c;
 					i++;
 				}
 				continue;
 			}
 		}
-		if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
-			FATAL("out of space for character class [%.10s...] 3", p);
+		if (i + 1 >= bufsz) {	/* always keep one slot free for the final 0 */
+			bufsz *= 2;
+			buf = (int *) realloc(buf, bufsz * sizeof(int));
+			if (buf == NULL)
+				FATAL("out of space for character class [%.10s...] 3", p);
+			bp = buf + i;	/* buf may have moved */
+		}
 		*bp++ = c;
 		i++;
 	}
 	*bp = 0;
-	DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
-	xfree(op);
-	return (char *) tostring((char *) buf);
+	/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
+	/* xfree(op);  BUG: what are we freeing here? */
+	retp = (int *) calloc(bp-buf+1, sizeof(int));	/* exact-size copy, including the 0 terminator */
+	if (retp == NULL)
+		FATAL("out of space for character class [%.10s...] 4", argp);
+	for (i = 0; i < bp-buf+1; i++)
+		retp[i] = buf[i];
+	return retp;
 }

 void overflo(const char *s)
@ -468,7 +529,7 @@ int first(Node *p)	/* collects initially active leaves of p into setvec */
 			setvec[lp] = 1;
 			setcnt++;
 		}
-		if (type(p) == CCL && (*(char *) right(p)) == '\0')
+		if (type(p) == CCL && (*(int *) right(p)) == 0)
 			return(0);		/* empty CCL */
 		return(1);
 	case PLUS:
@ -524,9 +585,9 @@ void follow(Node *v)	/* collects leaves that can follow v into setvec */
 	}
 }

-int member(int c, const char *sarg)	/* is c in s? */
+int member(int c, int *sarg)	/* is c in s? */
 {
-	const uschar *s = (const uschar *) sarg;
+	int *s = (int *) sarg;

 	while (*s)
 		if (c == *s++)
@ -534,11 +595,41 @@ int member(int c, const char *sarg)	/* is c in s? */
 	return(0);
 }

+static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation:
+	 * return the next state recorded for character ch in the given state,
+	 * or 0 if no entry for ch is found */
+{
+	int i;
+	for (i = 0; i < f->gototab_len; i++) {	/* linear search of this state's entries */
+		if (f->gototab[state][i].ch == 0)	/* ch == 0 marks the first unused slot: stop */
+			break;
+		if (f->gototab[state][i].ch == ch)
+			return f->gototab[state][i].state;
+	}
+	return 0;	/* not found; the matchers in this file then call cgoto() */
+}
+
+static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation:
+	 * record val as the next state for character ch in the given state
+	 * and return val */
+{
+	int i;
+	for (i = 0; i < f->gototab_len; i++) {
+		if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) {	/* first unused slot, or the existing entry for ch */
+			f->gototab[state][i].ch = ch;
+			f->gototab[state][i].state = val;
+			return val;
+		}
+	}
+	overflo(__func__);	/* every one of the gototab_len slots holds some other character */
+	return val; /* not used anywhere at the moment */
+}
+
 int match(fa *f, const char *p0)	/* shortest match ? */
 {
 	int s, ns;
+	int n;
+	int rune;
 	const uschar *p = (const uschar *) p0;

+	/* return pmatch(f, p0); does it matter whether longest or shortest? */
+
 	s = f->initstat;
 	assert (s < f->state_count);

@ -546,19 +637,25 @@ int match(fa *f, const char *p0)	/* shortest match ? */
 		return(1);
 	do {
 		/* assert(*p < NCHARS); */
-		if ((ns = f->gototab[s][*p]) != 0)
+		n = u8_rune(&rune, p);
+		if ((ns = get_gototab(f, s, rune)) != 0)
 			s = ns;
 		else
-			s = cgoto(f, s, *p);
+			s = cgoto(f, s, rune);
 		if (f->out[s])
 			return(1);
-	} while (*p++ != 0);
+		if (*p == 0)
+			break;
+		p += n;
+	} while (1);  /* was *p++ != 0 */
 	return(0);
 }

 int pmatch(fa *f, const char *p0)	/* longest match, for sub */
 {
 	int s, ns;
+	int n;
+	int rune;
 	const uschar *p = (const uschar *) p0;
 	const uschar *q;

@ -573,10 +670,11 @@ int pmatch(fa *f, const char *p0)	/* longest match, for sub */
 			if (f->out[s])		/* final state */
 				patlen = q-p;
 			/* assert(*q < NCHARS); */
-			if ((ns = f->gototab[s][*q]) != 0)
+			n = u8_rune(&rune, q);
+			if ((ns = get_gototab(f, s, rune)) != 0)
 				s = ns;
 			else
-				s = cgoto(f, s, *q);
+				s = cgoto(f, s, rune);

 			assert(s < f->state_count);

@ -588,7 +686,11 @@ int pmatch(fa *f, const char *p0)	/* longest match, for sub */
 				else
 					goto nextin;	/* no match */
 			}
-		} while (*q++ != 0);
+			if (*q == 0)
+				break;
+			q += n;
+		} while (1);
+		q++;  /* was *q++ */
 		if (f->out[s])
 			patlen = q-p-1;	/* don't count $ */
 		if (patlen >= 0) {
@ -597,13 +699,19 @@ int pmatch(fa *f, const char *p0)	/* longest match, for sub */
 		}
 	nextin:
 		s = 2;
-	} while (*p++);
+		if (*p == 0)
+			break;
+		n = u8_rune(&rune, p);
+		p += n;
+	} while (1); /* was *p++ */
 	return (0);
 }

 int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
 {
 	int s, ns;
+        int n;
+        int rune;
 	const uschar *p = (const uschar *) p0;
 	const uschar *q;

@ -618,10 +726,11 @@ int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
 			if (f->out[s])		/* final state */
 				patlen = q-p;
 			/* assert(*q < NCHARS); */
-			if ((ns = f->gototab[s][*q]) != 0)
+			n = u8_rune(&rune, q);
+			if ((ns = get_gototab(f, s, rune)) != 0)
 				s = ns;
 			else
-				s = cgoto(f, s, *q);
+				s = cgoto(f, s, rune);
 			if (s == 1) {	/* no transition */
 				if (patlen > 0) {
 					patbeg = (const char *) p;
@ -629,7 +738,11 @@ int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
 				} else
 					goto nnextin;	/* no nonempty match */
 			}
-		} while (*q++ != 0);
+			if (*q == 0)
+				break;
+			q += n;
+		} while (1);
+		q++;
 		if (f->out[s])
 			patlen = q-p-1;	/* don't count $ */
 		if (patlen > 0 ) {
@ -644,6 +757,61 @@ int nematch(fa *f, const char *p0)	/* non-empty match, for sub */
 }


+#define MAX_UTF_BYTES	4	// UTF-8 is up to 4 bytes long
+
+// Read one rune at a time from the given FILE*. Return both
+// the bytes and the actual rune.
+
+struct runedata {
+	int rune;	/* decoded character value; left 0 by getrune() at EOF */
+	size_t len;	/* number of input bytes kept in bytes[] */
+	char bytes[6];	/* the raw bytes as read from the stream */
+};
+
+struct runedata getrune(FILE *fp)	/* read one character from fp and return its raw
+	 * bytes, their count, and the decoded rune; every field is 0 at EOF */
+{
+	struct runedata result;
+	int c, next;

+	memset(&result, 0, sizeof(result));

+	c = getc(fp);
+	if (c == EOF)
+		return result;	// result.rune == 0 --> EOF
+	else if (c < 128 || awk_mb_cur_max == 1) {	// byte below 128, or max multibyte size is 1: the byte is the rune
+		result.bytes[0] = c;
+		result.len = 1;
+		result.rune = c;

+		return result;
+	}

+	// need to get bytes and fill things in
+	result.bytes[0] = c;
+	result.len = 1;

+	next = 1;
+	for (int i = 1; i < MAX_UTF_BYTES; i++) {	// read ahead until MAX_UTF_BYTES bytes are held, or EOF
+		c = getc(fp);
+		if (c == EOF)
+			break;
+		result.bytes[next++] = c;
+		result.len++;
+	}

+	// put back any extra input bytes
+	int actual_len = u8_nextlen(result.bytes);	// presumably the byte length of the first character in bytes[] (u8_nextlen is defined elsewhere) -- confirm
+	while (result.len > actual_len) {	// NOTE(review): can push back several bytes; ISO C only guarantees one byte of ungetc() pushback
+		ungetc(result.bytes[--result.len], fp);
+	}

+	result.bytes[result.len] = '\0';
+	(void) u8_rune(& result.rune, (uschar *) result.bytes);	// fills result.rune from bytes[]; the return value is deliberately ignored

+	return result;
+}
+
+
 /*
 * NAME
 *     fnematch
@ -663,7 +831,8 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
 {
 	char *buf = *pbuf;
 	int bufsize = *pbufsize;
-	int c, i, j, k, ns, s;
+	int i, j, k, ns, s;
+	struct runedata r;

 	s = pfa->initstat;
 	patlen = 0;
@ -672,35 +841,38 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
 	 * All indices relative to buf.
 	 * i <= j <= k <= bufsize
 	 *
-	 * i: origin of active substring
-	 * j: current character
+	 * i: origin of active substring (first byte of first character)
+	 * j: current character		(last byte of current character)
 	 * k: destination of next getc()
 	 */
 	i = -1, k = 0;
        do {
 		j = i++;
 		do {
-			if (++j == k) {
-				if (k == bufsize)
+			r = getrune(f);
+			if ((++j + r.len) >= k) {
+				if (k >= bufsize)
 					if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
 						FATAL("stream '%.30s...' too long", buf);
-				buf[k++] = (c = getc(f)) != EOF ? c : 0;
 			}
-			c = (uschar)buf[j];
-			/* assert(c < NCHARS); */
+			memcpy(buf + k, r.bytes, r.len);
+			j += r.len - 1;	// incremented next time around the loop
+			k += r.len;

-			if ((ns = pfa->gototab[s][c]) != 0)
+			if ((ns = get_gototab(pfa, s, r.rune)) != 0)
 				s = ns;
 			else
-				s = cgoto(pfa, s, c);
+				s = cgoto(pfa, s, r.rune);

 			if (pfa->out[s]) {	/* final state */
 				patlen = j - i + 1;
-				if (c == 0)	/* don't count $ */
+				if (r.rune == 0)	/* don't count $ */
 					patlen--;
 			}
 		} while (buf[j] && s != 1);
 		s = 2;
+		if (r.len > 1)
+			i += r.len - 1;	// i incremented around the loop
 	} while (buf[i] && !patlen);

 	/* adjbuf() may have relocated a resized buffer. Inform the world. */
@ -721,8 +893,9 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
 		 * terminate the buffer.
 		 */
 		do
-			if (buf[--k] && ungetc(buf[k], f) == EOF)
-				FATAL("unable to ungetc '%c'", buf[k]);
+			for (int ii = r.len; ii > 0; ii--)
+				if (buf[--k] && ungetc(buf[k], f) == EOF)
+					FATAL("unable to ungetc '%c'", buf[k]);
 		while (k > i + patlen);
 		buf[k] = '\0';
 		return true;
@ -797,7 +970,7 @@ Node *primary(void)
 		rtok = relex();
 		if (rtok == ')') {	/* special pleading for () */
 			rtok = relex();
-			return unary(op2(CCL, NIL, (Node *) tostring("")));
+			return unary(op2(CCL, NIL, (Node *) cclenter("")));
 		}
 		np = regexp();
 		if (rtok == ')') {
@ -820,7 +993,7 @@ Node *concat(Node *np)
 		return (concat(op2(CAT, np, primary())));
 	case EMPTYRE:
 		rtok = relex();
-		return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
+		return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
 				primary())));
 	}
 	return (np);
@ -1019,6 +1192,8 @@ static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
 	return 0;
 }

+extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */
+
 int relex(void)		/* lexical analyzer for reparse */
 {
 	int c, n;
@ -1036,6 +1211,12 @@ int relex(void)		/* lexical analyzer for reparse */
 rescan:
 	starttok = prestr;

+	if ((n = u8_rune(&rlxval, prestr)) > 1) {
+		prestr += n;
+		starttok = prestr;
+		return CHAR;
+	}
+
 	switch (c = *prestr++) {
 	case '|': return OR;
 	case '*': return STAR;
@ -1073,10 +1254,15 @@ int relex(void)		/* lexical analyzer for reparse */
 		}
 		else
 			cflag = 0;
-		n = 2 * strlen((const char *) prestr)+1;
+		n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2.  what value? */
 		if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
 			FATAL("out of space for reg expr %.10s...", lastre);
 		for (; ; ) {
+			if ((n = u8_rune(&rlxval, prestr)) > 1) {
+				for (i = 0; i < n; i++)
+					*bp++ = *prestr++;
+				continue;
+			}
 			if ((c = *prestr++) == '\\') {
 				*bp++ = '\\';
 				if ((c = *prestr++) == '\0')
@ -1101,7 +1287,7 @@ int relex(void)		/* lexical analyzer for reparse */
 					 * program to track each string's length.
 					 */
 					for (i = 1; i <= UCHAR_MAX; i++) {
-						if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
+						if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
 						    FATAL("out of space for reg expr %.10s...", lastre);
 						if (cc->cc_func(i)) {
 							/* escape backslash */
@ -1243,7 +1429,7 @@ int cgoto(fa *f, int s, int c)
 	int *p, *q;
 	int i, j, k;

-	assert(c == HAT || c < NCHARS);
+	/* assert(c == HAT || c < NCHARS);  BUG: seg fault if disable test */
 	while (f->accept >= maxsetvec) {	/* guessing here! */
 		resizesetvec(__func__);
 	}
@ -1259,8 +1445,8 @@ int cgoto(fa *f, int s, int c)
 			 || (k == DOT && c != 0 && c != HAT)
 			 || (k == ALL && c != 0)
 			 || (k == EMPTYRE && c != 0)
-			 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
-			 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
+			 || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
+			 || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
 				q = f->re[p[i]].lfollow;
 				for (j = 1; j <= *q; j++) {
 					if (q[j] >= maxsetvec) {
@ -1292,7 +1478,7 @@ int cgoto(fa *f, int s, int c)
 				goto different;
 		/* setvec is state i */
 		if (c != HAT)
-			f->gototab[s][c] = i;
+			set_gototab(f, s, c, i);
 		return i;
 	  different:;
 	}
@ -1301,13 +1487,13 @@ int cgoto(fa *f, int s, int c)
 	++(f->curstat);
 	resize_state(f, f->curstat);
 	for (i = 0; i < NCHARS; i++)
-		f->gototab[f->curstat][i] = 0;
+		set_gototab(f, f->curstat, 0, 0);
 	xfree(f->posns[f->curstat]);
 	p = intalloc(setcnt + 1, __func__);

 	f->posns[f->curstat] = p;
 	if (c != HAT)
-		f->gototab[s][c] = f->curstat;
+		set_gototab(f, s, c, f->curstat);
 	for (i = 0; i <= setcnt; i++)
 		p[i] = tmpset[i];
 	if (setvec[f->accept])
--- a/contrib/one-true-awk/bugs-fixed/getline-corruption.awk
+++ b/contrib/one-true-awk/bugs-fixed/getline-corruption.awk
@ -0,0 +1,5 @@
+BEGIN { 
+	getline l
+	getline l
+	print (s=substr(l,1,10)) " len=" length(s)
+}
--- a/contrib/one-true-awk/bugs-fixed/getline-corruption.in
+++ b/contrib/one-true-awk/bugs-fixed/getline-corruption.in
@ -0,0 +1 @@
+a
--- a/contrib/one-true-awk/bugs-fixed/getline-corruption.ok
+++ b/contrib/one-true-awk/bugs-fixed/getline-corruption.ok
@ -0,0 +1 @@
+a len=1
--- a/contrib/one-true-awk/bugs-fixed/rstart-rlength.awk
+++ b/contrib/one-true-awk/bugs-fixed/rstart-rlength.awk
@ -0,0 +1,10 @@
+BEGIN {
+	str="\342\200\257"
+	print length(str)
+	match(str,/^/)
+	print RSTART, RLENGTH	
+	match(str,/.+/)
+	print RSTART, RLENGTH
+	match(str,/$/)
+	print RSTART, RLENGTH
+}
--- a/contrib/one-true-awk/bugs-fixed/rstart-rlength.ok
+++ b/contrib/one-true-awk/bugs-fixed/rstart-rlength.ok
@ -0,0 +1,4 @@
+1
+1 0
+1 1
+2 0
--- a/contrib/one-true-awk/bugs-fixed/unicode-fs-rs-1.awk
+++ b/contrib/one-true-awk/bugs-fixed/unicode-fs-rs-1.awk
@ -0,0 +1,6 @@
+BEGIN {
+	FS="␟"
+	RS="␞"
+	OFS=","
+}
+{ print $1, $2, $3 }
--- a/contrib/one-true-awk/bugs-fixed/unicode-fs-rs-1.in
+++ b/contrib/one-true-awk/bugs-fixed/unicode-fs-rs-1.in
@ -0,0 +1,2 @@
+id␟name␟age␞1␟Bob "Billy" Smith␟42␞2␟Jane
+  Brown␟37
--- a/contrib/one-true-awk/bugs-fixed/unicode-fs-rs-1.ok
+++ b/contrib/one-true-awk/bugs-fixed/unicode-fs-rs-1.ok
@ -0,0 +1,5 @@
+id,name,age
+1,Bob "Billy" Smith,42
+2,Jane
+  Brown,37
+
--- a/contrib/one-true-awk/bugs-fixed/unicode-fs-rs-2.awk
+++ b/contrib/one-true-awk/bugs-fixed/unicode-fs-rs-2.awk
@ -0,0 +1,7 @@
+BEGIN {
+	FS = "א"
+	RS = "בב"
+	OFS = ","
+}
+
+{ print $1, $2, $3 }
--- a/contrib/one-true-awk/bugs-fixed/unicode-fs-rs-2.in
+++ b/contrib/one-true-awk/bugs-fixed/unicode-fs-rs-2.in
@ -0,0 +1,2 @@
+idאnameאageא1אBob "Billy" Smithא42א2בבJane
+  Brownא37
--- a/contrib/one-true-awk/bugs-fixed/unicode-fs-rs-2.ok
+++ b/contrib/one-true-awk/bugs-fixed/unicode-fs-rs-2.ok
@ -0,0 +1,4 @@
+id,name,age
+Jane
+  Brown,37
+,
--- a/contrib/one-true-awk/bugs-fixed/unicode-null-match.awk
+++ b/contrib/one-true-awk/bugs-fixed/unicode-null-match.awk
@ -0,0 +1,6 @@
+BEGIN {
+	# str = "\342\200\257"
+	str = "あ"
+	n = gsub(//, "X", str)
+	print n, str
+}
--- a/contrib/one-true-awk/bugs-fixed/unicode-null-match.bad
+++ b/contrib/one-true-awk/bugs-fixed/unicode-null-match.bad
@ -0,0 +1 @@
+4 X綛々９
--- a/contrib/one-true-awk/bugs-fixed/unicode-null-match.ok
+++ b/contrib/one-true-awk/bugs-fixed/unicode-null-match.ok
@ -0,0 +1 @@
+2 XあX
--- a/contrib/one-true-awk/lex.c
+++ b/contrib/one-true-awk/lex.c
@ -377,6 +377,8 @@ int yylex(void)
 	}
 }

+extern int runetochar(char *str, int c);
+
 int string(void)
 {
 	int c, n;
@ -424,20 +426,50 @@ int string(void)
 				*bp++ = n;
 				break;

-			case 'x':	/* hex  \x0-9a-fA-F + */
-			    {	char xbuf[100], *px;
-				for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
-					if (isdigit(c)
-					 || (c >= 'a' && c <= 'f')
-					 || (c >= 'A' && c <= 'F'))
-						*px++ = c;
-					else
+			case 'x':	/* hex  \x0-9a-fA-F (exactly two) */
+			    {
+				int i;
+
+				n = 0;
+				for (i = 1; i <= 2; i++) {
+					c = input();
+					if (c == 0)
+						break;
+					if (isxdigit(c)) {
+						c = tolower(c);
+						n *= 16;
+						if (isdigit(c))
+							n += (c - '0');
+						else
+							n += 10 + (c - 'a');
+					} else
 						break;
 				}
-				*px = 0;
+				if (n)
+					*bp++ = n;
+				else
+					unput(c);
+				break;
+			    }
+
+			case 'u':	/* utf  \u0-9a-fA-F (1..8) */
+			    {
+				int i;
+
+				n = 0;
+				for (i = 0; i < 8; i++) {
+					c = input();
+					if (!isxdigit(c) || c == 0)
+						break;
+					c = tolower(c);
+					n *= 16;
+					if (isdigit(c))
+						n += (c - '0');
+					else
+						n += 10 + (c - 'a');
+				}
 				unput(c);
-	  			sscanf(xbuf, "%x", (unsigned int *) &n);
-				*bp++ = n;
+				bp += runetochar(bp, n);
 				break;
 			    }

@ -534,7 +566,7 @@ int regexpr(void)
 	char *bp;

 	if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
-		FATAL("out of space for rex expr");
+		FATAL("out of space for reg expr");
 	bp = buf;
 	for ( ; (c = input()) != '/' && c != 0; ) {
 		if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
--- a/contrib/one-true-awk/lib.c
+++ b/contrib/one-true-awk/lib.c
@ -34,6 +34,8 @@ THIS SOFTWARE.
 #include <math.h>
 #include "awk.h"

+extern int u8_nextlen(const char *s);
+
 char	EMPTY[] = { '\0' };
 FILE	*infile	= NULL;
 bool	innew;		/* true = infile has not been read by readrec */
@ -150,11 +152,6 @@ int getrec(char **pbuf, int *pbufsize, bool isrecord)	/* get next input record *
 	}
 	DPRINTF("RS=<%s>, FS=<%s>, ARGC=%g, FILENAME=%s\n",
 		*RS, *FS, *ARGC, *FILENAME);
-	if (isrecord) {
-		donefld = false;
-		donerec = true;
-		savefs();
-	}
 	saveb0 = buf[0];
 	buf[0] = 0;
 	while (argno < *ARGC || infile == stdin) {
@ -194,6 +191,9 @@ int getrec(char **pbuf, int *pbufsize, bool isrecord)	/* get next input record *
 					fldtab[0]->fval = result;
 					fldtab[0]->tval |= NUM;
 				}
+				donefld = false;
+				donerec = true;
+				savefs();
 			}
 			setfval(nrloc, nrloc->fval+1);
 			setfval(fnrloc, fnrloc->fval+1);
@ -221,16 +221,22 @@ void nextfile(void)
 	argno++;
 }

+extern int readcsvrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag);
+
 int readrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag)	/* read one record into buf */
 {
-	int sep, c, isrec;
-	char *rr, *buf = *pbuf;
+	int sep, c, isrec; // POTENTIAL BUG? isrec is a macro in awk.h
+	char *rr = *pbuf, *buf = *pbuf;
 	int bufsize = *pbufsize;
 	char *rs = getsval(rsloc);

-	if (*rs && rs[1]) {
+	if (CSV) {
+		c = readcsvrec(pbuf, pbufsize, inf, newflag);
+		isrec = (c == EOF && rr == buf) ? false : true;
+	} else if (*rs && rs[1]) {
 		bool found;

+		memset(buf, 0, bufsize);
 		fa *pfa = makedfa(rs, 1);
 		if (newflag)
 			found = fnematch(pfa, inf, &buf, &bufsize, recsize);
@ -243,6 +249,7 @@ int readrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag)	/* read one rec
 		if (found)
 			setptr(patbeg, '\0');
 		isrec = (found == 0 && *buf == '\0') ? false : true;
+
 	} else {
 		if ((sep = *rs) == 0) {
 			sep = '\n';
@ -280,6 +287,52 @@ int readrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag)	/* read one rec
 	return isrec;
 }

+
+/*******************
+ * loose ends here:
+ *   \r\n should become \n
+ *   what about bare \r?  Excel uses that for embedded newlines
+ *   can't have "" in unquoted fields, according to RFC 4180
+*/
+
+
+/*
+ * Read one CSV record from inf into *pbuf.
+ *
+ * A record ends at the first newline that is not inside a "..." field, so a
+ * single record may span several input lines.  A \r that directly precedes a
+ * newline is dropped, both for newlines embedded in quotes and for the
+ * terminating one.  The buffer is grown through adjbuf() as needed, the
+ * record is NUL-terminated, and *pbuf / *pbufsize are updated on return.
+ *
+ * Returns the last value obtained from getc(): '\n' if a record terminator
+ * was seen, EOF otherwise.  newflag is not used here; the parameter list
+ * matches readrec().
+ */
+int readcsvrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* csv can have \n's */
+{
+	char *buf = *pbuf;	/* start of the record buffer */
+	char *out = buf;	/* next byte to be written */
+	int cap = *pbufsize;
+	bool quoted = false;	/* currently inside "..." ? */
+	int ch;
+
+	for (;;) {
+		ch = getc(inf);
+		if (ch == EOF)
+			break;
+		if (ch == '\n') {
+			/* an unquoted newline is the only record separator */
+			if (!quoted)
+				break;
+			/* embedded \r\n: keep the \n, drop the \r */
+			if (out > buf && out[-1] == '\r')
+				out--;
+		}
+
+		if (out - buf + 1 > cap &&
+		    !adjbuf(&buf, &cap, 1 + out - buf, recsize, &out, "readcsvrec 1"))
+			FATAL("input record `%.30s...' too long", buf);
+		*out++ = ch;
+		if (ch == '"')
+			quoted = !quoted;
+	}
+
+	/* record ended with \r\n: strip the \r as well */
+	if (ch == '\n' && out > buf && out[-1] == '\r')
+		out--;
+
+	/* make sure there is room for the terminating NUL */
+	if (!adjbuf(&buf, &cap, 1 + out - buf, recsize, &out, "readcsvrec 4"))
+		FATAL("input record `%.30s...' too long", buf);
+	*out = 0;
+
+	*pbuf = buf;
+	*pbufsize = cap;
+	DPRINTF("readcsvrec saw <%s>, returns %d\n", buf, ch);
+	return ch;
+}
+
 char *getargv(int n)	/* get ARGV[n] */
 {
 	Cell *x;
@ -297,12 +350,16 @@ char *getargv(int n)	/* get ARGV[n] */

 void setclvar(char *s)	/* set var=value from s */
 {
-	char *p;
+	char *e, *p;
 	Cell *q;
 	double result;

+/* commit f3d9187d4e0f02294fb1b0e31152070506314e67 broke T.argv test */
+/* I don't understand why it was changed. */
+
 	for (p=s; *p != '='; p++)
 		;
+	e = p;
 	*p++ = 0;
 	p = qstring(p, '\0');
 	q = setsymtab(s, p, 0.0, STR, symtab);
@ -312,6 +369,8 @@ void setclvar(char *s)	/* set var=value from s */
 		q->tval |= NUM;
 	}
 	DPRINTF("command line set %s to |%s|\n", s, p);
+	free(p);
+	*e = '=';
 }


@ -342,7 +401,7 @@ void fldbld(void)	/* create fields from current record */
 		savefs();
 	if (strlen(inputFS) > 1) {	/* it's a regular expression */
 		i = refldbld(r, inputFS);
-	} else if ((sep = *inputFS) == ' ') {	/* default whitespace */
+	} else if (!CSV && (sep = *inputFS) == ' ') {	/* default whitespace */
 		for (i = 0; ; ) {
 			while (*r == ' ' || *r == '\t' || *r == '\n')
 				r++;
@ -361,26 +420,58 @@ void fldbld(void)	/* create fields from current record */
 			*fr++ = 0;
 		}
 		*fr = 0;
-	} else if ((sep = *inputFS) == 0) {		/* new: FS="" => 1 char/field */
-		for (i = 0; *r != '\0'; r += n) {
-			char buf[MB_LEN_MAX + 1];
-
+	} else if (CSV) {	/* CSV processing.  no error handling */
+		if (*r != 0) {
+			for (;;) {
+				i++;
+				if (i > nfields)
+					growfldtab(i);
+				if (freeable(fldtab[i]))
+					xfree(fldtab[i]->sval);
+				fldtab[i]->sval = fr;
+				fldtab[i]->tval = FLD | STR | DONTFREE;
+				if (*r == '"' ) { /* start of "..." */
+					for (r++ ; *r != '\0'; ) {
+						if (*r == '"' && r[1] != '\0' && r[1] == '"') {
+							r += 2; /* doubled quote */
+							*fr++ = '"';
+						} else if (*r == '"' && (r[1] == '\0' || r[1] == ',')) {
+							r++; /* skip over closing quote */
+							break;
+						} else {
+							*fr++ = *r++;
+						}
+					}
+					*fr++ = 0;
+				} else {	/* unquoted field */
+					while (*r != ',' && *r != '\0')
+						*fr++ = *r++;
+					*fr++ = 0;
+				}
+				if (*r++ == 0)
+					break;
+	
+			}
+		}
+		*fr = 0;
+	} else if ((sep = *inputFS) == 0) {	/* new: FS="" => 1 char/field */
+		for (i = 0; *r != '\0'; ) {
+			char buf[10];
 			i++;
 			if (i > nfields)
 				growfldtab(i);
 			if (freeable(fldtab[i]))
 				xfree(fldtab[i]->sval);
-			n = mblen(r, MB_LEN_MAX);
-			if (n < 0)
-				n = 1;
-			memcpy(buf, r, n);
-			buf[n] = '\0';
+			n = u8_nextlen(r);
+			for (j = 0; j < n; j++)
+				buf[j] = *r++;
+			buf[j] = '\0';
 			fldtab[i]->sval = tostring(buf);
 			fldtab[i]->tval = FLD | STR;
 		}
 		*fr = 0;
 	} else if (*r != 0) {	/* if 0, it's a null field */
-		/* subtlecase : if length(FS) == 1 && length(RS > 0)
+		/* subtle case: if length(FS) == 1 && length(RS > 0)
 		 * \n is NOT a field separator (cf awk book 61,84).
 		 * this variable is tested in the inner while loop.
 		 */
@ -803,12 +894,12 @@ bool is_valid_number(const char *s, bool trailing_stuff_ok,
 * the prior FreeBSD behavior.
 */
 #if 0
-	// no hex floating point, sorry
+	/* no hex floating point, sorry */
 	if (s[0] == '0' && tolower(s[1]) == 'x')
 		return false;
 #endif

-	// allow +nan, -nan, +inf, -inf, any other letter, no
+	/* allow +nan, -nan, +inf, -inf, any other letter, no */
 	if (s[0] == '+' || s[0] == '-') {
 		is_nan = (strncasecmp(s+1, "nan", 3) == 0);
 		is_inf = (strncasecmp(s+1, "inf", 3) == 0);
@ -842,7 +933,7 @@ bool is_valid_number(const char *s, bool trailing_stuff_ok,
 	if (no_trailing != NULL)
 		*no_trailing = (*ep == '\0');

-        // return true if found the end, or trailing stuff is allowed
+        /* return true if found the end, or trailing stuff is allowed */
 	retval = *ep == '\0' || trailing_stuff_ok;

 	return retval;
--- a/contrib/one-true-awk/main.c
+++ b/contrib/one-true-awk/main.c
@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
 THIS SOFTWARE.
 ****************************************************************/

-const char	*version = "version 20210724";
+const char	*version = "version 20231030";

 #define DEBUG
 #include <stdio.h>
@ -49,8 +49,12 @@ static size_t	maxpfile;	/* max program filename */
 static size_t	npfile;		/* number of filenames */
 static size_t	curpfile;	/* current filename */

+bool	CSV = false;	/* true for csv input */
+
 bool	safe = false;	/* true => "safe" mode */

+size_t	awk_mb_cur_max = 1;
+
 static noreturn void fpecatch(int n
 #ifdef SA_SIGINFO
 	, siginfo_t *si, void *uc
@ -116,10 +120,11 @@ int main(int argc, char *argv[])

 	setlocale(LC_CTYPE, "");
 	setlocale(LC_NUMERIC, "C"); /* for parsing cmdline & prog */
+	awk_mb_cur_max = MB_CUR_MAX;
 	cmdname = argv[0];
 	if (argc == 1) {
 		fprintf(stderr,
-		  "usage: %s [-F fs] [-v var=value] [-f progfile | 'prog'] [file ...]\n",
+		  "usage: %s [-F fs | --csv] [-v var=value] [-f progfile | 'prog'] [file ...]\n",
 		  cmdname);
 		exit(1);
 	}
@ -152,6 +157,12 @@ int main(int argc, char *argv[])
 			argv++;
 			break;
 		}
+		if (strcmp(argv[1], "--csv") == 0) {	/* turn on csv input processing */
+			CSV = true;
+			argc--;
+			argv++;
+			continue;
+		}
 		switch (argv[1][1]) {
 		case 's':
 			if (strcmp(argv[1], "-safe") == 0)
--- a/contrib/one-true-awk/makefile
+++ b/contrib/one-true-awk/makefile
@ -36,6 +36,7 @@ CC = $(HOSTCC)  # change this is cross-compiling.

 # By fiat, to make our lives easier, yacc is now defined to be bison.
 # If you want something else, you're on your own.
+# YACC = yacc -d -b awkgram
 YACC = bison -d

 OFILES = b.o main.o parse.o proctab.o tran.o lib.o run.o lex.o
--- a/contrib/one-true-awk/parse.c
+++ b/contrib/one-true-awk/parse.c
@ -29,7 +29,7 @@ THIS SOFTWARE.
 #include "awk.h"
 #include "awkgram.tab.h"

-Node *nodealloc(int n)
+Node *nodealloc(size_t n)
 {
 	Node *x;

--- a/contrib/one-true-awk/proto.h
+++ b/contrib/one-true-awk/proto.h
@ -43,14 +43,13 @@ extern	fa	*mkdfa(const char *, bool);
 extern	int	makeinit(fa *, bool);
 extern	void	penter(Node *);
 extern	void	freetr(Node *);
-extern	int	hexstr(const uschar **);
 extern	int	quoted(const uschar **);
-extern	char	*cclenter(const char *);
+extern	int	*cclenter(const char *);
 extern	noreturn void	overflo(const char *);
 extern	void	cfoll(fa *, Node *);
 extern	int	first(Node *);
 extern	void	follow(Node *);
-extern	int	member(int, const char *);
+extern	int	member(int, int *);
 extern	int	match(fa *, const char *);
 extern	int	pmatch(fa *, const char *);
 extern	int	nematch(fa *, const char *);
@ -68,7 +67,7 @@ extern	void	freefa(fa *);
 extern	int	pgetc(void);
 extern	char	*cursource(void);

-extern	Node	*nodealloc(int);
+extern	Node	*nodealloc(size_t);
 extern	Node	*exptostat(Node *);
 extern	Node	*node1(int, Node *);
 extern	Node	*node2(int, Node *, Node *);
--- a/contrib/one-true-awk/run.c
+++ b/contrib/one-true-awk/run.c
@ -26,7 +26,6 @@ THIS SOFTWARE.
 #include <stdio.h>
 #include <ctype.h>
 #include <errno.h>
-#include <wchar.h>
 #include <wctype.h>
 #include <fcntl.h>
 #include <setjmp.h>
@ -40,8 +39,10 @@ THIS SOFTWARE.
 #include "awk.h"
 #include "awkgram.tab.h"

+
 static void stdinit(void);
 static void flush_all(void);
+static char *wide_char_to_byte_str(int rune, size_t *outlen);

 #if 1
 #define tempfree(x)	do { if (istemp(x)) tfree(x); } while (/*CONSTCOND*/0)
@ -447,13 +448,15 @@ Cell *awkgetline(Node **a, int n)	/* get next line from specific input */
 			n = getrec(&record, &recsize, true);
 		else {			/* getline var */
 			n = getrec(&buf, &bufsize, false);
-			x = execute(a[0]);
-			setsval(x, buf);
-			if (is_number(x->sval, & result)) {
-				x->fval = result;
-				x->tval |= NUM;
+			if (n > 0) {
+				x = execute(a[0]);
+				setsval(x, buf);
+				if (is_number(x->sval, & result)) {
+					x->fval = result;
+					x->tval |= NUM;
+				}
+				tempfree(x);
 			}
-			tempfree(x);
 		}
 	}
 	setfval(r, (Awkfloat) n);
@ -577,11 +580,225 @@ Cell *intest(Node **a, int n)	/* a[0] is index (list), a[1] is symtab */
 }


+/* ======== utf-8 code ========== */
+
+/*
+ * Awk strings can contain ascii, random 8-bit items (eg Latin-1),
+ * or utf-8.  u8_isutf tests whether a string starts with a valid
+ * utf-8 sequence, and returns 0 if not (e.g., high bit set).
+ * u8_nextlen returns length of next valid sequence, which is
+ * 1 for ascii, 2..4 for utf-8, or 1 for high bit non-utf.
+ * u8_strlen returns length of string in valid utf-8 sequences
+ * and/or high-bit bytes.  Conversion functions go between byte
+ * number and character number.
+ *
+ * In theory, this behaves the same as before for non-utf8 bytes.
+ *
+ * Limited checking! This is a potential security hole.
+ */
+
+/*
+ * Is s the beginning of a utf-8 sequence?
+ *
+ * Returns 1 for an ascii byte (including the terminating NUL) or for any
+ * byte when awk_mb_cur_max is 1; returns 2..4 when the lead byte and the
+ * required number of continuation bytes (10xxxxxx) are present; returns 0
+ * for a high-bit byte that does not start such a sequence.
+ *
+ * Only the bit patterns are checked: overlong forms and out-of-range code
+ * points are not rejected.
+ */
+int u8_isutf(const char *s)
+{
+	unsigned char c = s[0];
+
+	if (c < 128 || awk_mb_cur_max == 1)
+		return 1; /* what if it's 0? */
+
+	/*
+	 * No strlen() is needed: NUL is not a continuation byte, so each &&
+	 * chain stops at the terminator and never reads beyond it.  This
+	 * keeps every call O(1) instead of O(length of s).
+	 */
+	if ((c & 0xE0) == 0xC0 && (s[1] & 0xC0) == 0x80)
+		return 2; /* 110xxxxx 10xxxxxx */
+	if ((c & 0xF0) == 0xE0 && (s[1] & 0xC0) == 0x80
+			 && (s[2] & 0xC0) == 0x80)
+		return 3; /* 1110xxxx 10xxxxxx 10xxxxxx */
+	if ((c & 0xF8) == 0xF0 && (s[1] & 0xC0) == 0x80
+			 && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80)
+		return 4; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+	return 0;
+}
+
+/*
+ * Convert the utf-8 sequence at the start of s to a utf-32 rune.
+ *
+ * Stores the value in *rune and returns the number of bytes consumed:
+ * 2..4 for a sequence whose lead byte and continuation bytes match the
+ * utf-8 bit patterns, otherwise 1 with *rune set to the single byte
+ * (ascii, awk_mb_cur_max == 1, or a high-bit byte that does not look
+ * like utf-8).  No further validation is done: watch out.
+ */
+int u8_rune(int *rune, const char *s)
+{
+	unsigned char c = s[0];
+
+	/*
+	 * No strlen() is needed: NUL is not a continuation byte, so each &&
+	 * chain stops at the terminator and never reads beyond it.  This
+	 * keeps every call O(1) instead of O(length of s).
+	 */
+	if (c >= 128 && awk_mb_cur_max != 1) {
+		if ((c & 0xE0) == 0xC0 && (s[1] & 0xC0) == 0x80) {
+			/* 110xxxxx 10xxxxxx */
+			*rune = ((c & 0x1F) << 6) | (s[1] & 0x3F);
+			return 2;
+		}
+		if ((c & 0xF0) == 0xE0 && (s[1] & 0xC0) == 0x80
+				  && (s[2] & 0xC0) == 0x80) {
+			/* 1110xxxx 10xxxxxx 10xxxxxx */
+			*rune = ((c & 0xF) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
+			return 3;
+		}
+		if ((c & 0xF8) == 0xF0 && (s[1] & 0xC0) == 0x80
+				  && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) {
+			/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+			*rune = ((c & 0x7) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
+			return 4;
+		}
+	}
+
+	/* one byte: ascii, single-byte locale, or not a utf-8 sequence */
+	*rune = c;
+	return 1;
+}
+
+/*
+ * Length in bytes of the next item in s: whatever u8_isutf() reports,
+ * except that its 0 ("not utf-8") is treated as a single byte.
+ */
+int u8_nextlen(const char *s)
+{
+	int n = u8_isutf(s);
+
+	return n == 0 ? 1 : n;
+}
+
+/*
+ * Return the length of s counted in characters: each utf-8 sequence
+ * recognized by u8_nextlen() counts as one, and so does every other
+ * single byte.
+ */
+int u8_strlen(const char *s)
+{
+	int totlen = 0;
+
+	/*
+	 * Walk until the terminator.  The loop only steps from a non-NUL
+	 * byte, and u8_nextlen() reports a multi-byte length only when the
+	 * continuation bytes are present, so s never moves past the NUL.
+	 * That is why no separate strlen() pass or overrun check is needed.
+	 */
+	while (*s != '\0') {
+		s += u8_nextlen(s);
+		totlen++;
+	}
+	return totlen;
+}
+
+/*
+ * Convert a character count in s to a byte offset: the number of bytes
+ * occupied by the first charnum characters, as measured by u8_nextlen().
+ * charnum <= 0 gives 0.  If s holds fewer than charnum characters the
+ * result is the offset of the terminating NUL, so the returned offset is
+ * always inside the string.
+ */
+int u8_char2byte(const char *s, int charnum)
+{
+	int bytenum = 0;
+
+	/* stop at the terminator rather than counting bytes beyond it */
+	while (charnum > 0 && s[bytenum] != '\0') {
+		bytenum += u8_nextlen(s + bytenum);
+		charnum--;
+	}
+	return bytenum;
+}
+
+/*
+ * Convert a byte offset in s to a character number.
+ *
+ * Counts the characters (per u8_nextlen()) that start at byte offsets
+ * 0..bytenum inclusive, so offset 0 yields 1 and a negative offset
+ * yields 0.  An offset equal to strlen(s) is accepted and counts the
+ * position just past the last character.  Returns -1 when bytenum is
+ * beyond the end of the string.
+ */
+int u8_byte2char(const char *s, int bytenum)
+{
+	int limit = strlen(s);
+	int pos = 0;
+	int count = 0;	/* stays 0 only when no offset is visited */
+
+	if (bytenum > limit)
+		return -1; /* ??? */
+
+	while (pos <= bytenum) {
+		pos += u8_nextlen(s + pos);
+		count++;
+	}
+	return count;
+}
+
+/* runetochar() adapted from rune.c in the Plan 9 distribution */
+
+enum
+{
+	Runeerror = 0xFFFD,	/* U+FFFD REPLACEMENT CHARACTER, used for bad input */
+	Runemax = 0x10FFFF,	/* largest code point that is encoded */
+
+	Bit1    = 7,
+	Bitx    = 6,
+	Bit2    = 5,
+	Bit3    = 4,
+	Bit4    = 3,
+	Bit5    = 2,
+
+	T1      = ((1<<(Bit1+1))-1) ^ 0xFF,     /* 0000 0000 */
+	Tx      = ((1<<(Bitx+1))-1) ^ 0xFF,     /* 1000 0000 */
+	T2      = ((1<<(Bit2+1))-1) ^ 0xFF,     /* 1100 0000 */
+	T3      = ((1<<(Bit3+1))-1) ^ 0xFF,     /* 1110 0000 */
+	T4      = ((1<<(Bit4+1))-1) ^ 0xFF,     /* 1111 0000 */
+	T5      = ((1<<(Bit5+1))-1) ^ 0xFF,     /* 1111 1000 */
+
+	Rune1   = (1<<(Bit1+0*Bitx))-1,	 	/* 0000 0000 0000 0000 0111 1111 */
+	Rune2   = (1<<(Bit2+1*Bitx))-1,	 	/* 0000 0000 0000 0111 1111 1111 */
+	Rune3   = (1<<(Bit3+2*Bitx))-1,	 	/* 0000 0000 1111 1111 1111 1111 */
+	Rune4   = (1<<(Bit4+3*Bitx))-1,	 	/* 0011 1111 1111 1111 1111 1111 */
+
+	Maskx   = (1<<Bitx)-1,		  	/* 0011 1111 */
+	Testx   = Maskx ^ 0xFF		 	/* 1100 0000 */
+};
+
+/*
+ * Encode code point c as utf-8 into str and return the number of bytes
+ * written (1..4).  No NUL is appended; str must have room for 4 bytes.
+ *
+ * A value that is negative or greater than Runemax is replaced by
+ * Runeerror (U+FFFD), which encodes as the three bytes EF BF BD.
+ * Substituting 128 here would produce E0 82 80, an overlong sequence
+ * that is not well-formed utf-8.
+ */
+int runetochar(char *str, int c)
+{
+	if (c < 0 || c > Runemax)
+		c = Runeerror;
+
+	/* one character sequence 00000-0007F => 00-7F */
+	if (c <= Rune1) {
+		str[0] = c;
+		return 1;
+	}
+
+	/* two character sequence 00080-007FF => T2 Tx */
+	if (c <= Rune2) {
+		str[0] = T2 | (c >> 1*Bitx);
+		str[1] = Tx | (c & Maskx);
+		return 2;
+	}
+
+	/* three character sequence 00800-0FFFF => T3 Tx Tx */
+	if (c <= Rune3) {
+		str[0] = T3 |  (c >> 2*Bitx);
+		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+		str[2] = Tx |  (c & Maskx);
+		return 3;
+	}
+
+	/* four character sequence 010000-10FFFF => T4 Tx Tx Tx */
+	str[0] = T4 |  (c >> 3*Bitx);
+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+	str[3] = Tx |  (c & Maskx);
+	return 4;
+}
+
+
+/* ========== end of utf8 code =========== */
+
+
+
 Cell *matchop(Node **a, int n)	/* ~ and match() */
 {
 	Cell *x, *y;
 	char *s, *t;
 	int i;
+	int cstart, cpatlen, len;
 	fa *pfa;
 	int (*mf)(fa *, const char *) = match, mode = 0;

@ -602,9 +819,21 @@ Cell *matchop(Node **a, int n)	/* ~ and match() */
 	}
 	tempfree(x);
 	if (n == MATCHFCN) {
-		int start = patbeg - s + 1;
-		if (patlen < 0)
-			start = 0;
+		int start = patbeg - s + 1; /* origin 1 */
+		if (patlen < 0) {
+			start = 0; /* not found */
+		} else {
+			cstart = u8_byte2char(s, start-1);
+			cpatlen = 0;
+			for (i = 0; i < patlen; i += len) {
+				len = u8_nextlen(patbeg+i);
+				cpatlen++;
+			}
+
+			start = cstart;
+			patlen = cpatlen;
+		}
+
 		setfval(rstartloc, (Awkfloat) start);
 		setfval(rlengthloc, (Awkfloat) patlen);
 		x = gettemp();
@ -655,10 +884,15 @@ Cell *relop(Node **a, int n)	/* a[0 < a[1], etc. */
 	int i;
 	Cell *x, *y;
 	Awkfloat j;
+	bool x_is_nan, y_is_nan;

 	x = execute(a[0]);
 	y = execute(a[1]);
+	x_is_nan = isnan(x->fval);
+	y_is_nan = isnan(y->fval);
 	if (x->tval&NUM && y->tval&NUM) {
+		if ((x_is_nan || y_is_nan) && n != NE)
+			return(False);
 		j = x->fval - y->fval;
 		i = j<0? -1: (j>0? 1: 0);
 	} else {
@ -671,7 +905,8 @@ Cell *relop(Node **a, int n)	/* a[0 < a[1], etc. */
 			else return(False);
 	case LE:	if (i<=0) return(True);
 			else return(False);
-	case NE:	if (i!=0) return(True);
+	case NE:	if (x_is_nan && y_is_nan) return(True);
+			else if (i!=0) return(True);
 			else return(False);
 	case EQ:	if (i == 0) return(True);
 			else return(False);
@ -740,6 +975,7 @@ Cell *indirect(Node **a, int n)	/* $( a[0] ) */
 Cell *substr(Node **a, int nnn)		/* substr(a[0], a[1], a[2]) */
 {
 	int k, m, n;
+	int mb, nb;
 	char *s;
 	int temp;
 	Cell *x, *y, *z = NULL;
@ -749,7 +985,7 @@ Cell *substr(Node **a, int nnn)		/* substr(a[0], a[1], a[2]) */
 	if (a[2] != NULL)
 		z = execute(a[2]);
 	s = getsval(x);
-	k = strlen(s) + 1;
+	k = u8_strlen(s) + 1;
 	if (k <= 1) {
 		tempfree(x);
 		tempfree(y);
@ -775,12 +1011,16 @@ Cell *substr(Node **a, int nnn)		/* substr(a[0], a[1], a[2]) */
 		n = 0;
 	else if (n > k - m)
 		n = k - m;
+	/* m is start, n is length from there */
 	DPRINTF("substr: m=%d, n=%d, s=%s\n", m, n, s);
 	y = gettemp();
-	temp = s[n+m-1];	/* with thanks to John Linderman */
-	s[n+m-1] = '\0';
-	setsval(y, s + m - 1);
-	s[n+m-1] = temp;
+	mb = u8_char2byte(s, m-1); /* byte offset of start char in s */
+	nb = u8_char2byte(s, m-1+n);  /* byte offset of end+1 char in s */
+
+	temp = s[nb];	/* with thanks to John Linderman */
+	s[nb] = '\0';
+	setsval(y, s + mb);
+	s[nb] = temp;
 	tempfree(x);
 	return(y);
 }
@ -801,7 +1041,15 @@ Cell *sindex(Node **a, int nnn)		/* index(a[0], a[1]) */
 		for (q = p1, p2 = s2; *p2 != '\0' && *q == *p2; q++, p2++)
 			continue;
 		if (*p2 == '\0') {
-			v = (Awkfloat) (p1 - s1 + 1);	/* origin 1 */
+			/* v = (Awkfloat) (p1 - s1 + 1);	 origin 1 */
+
+		   /* should be a function: used in match() as well */
+			int i, len;
+			v = 0;
+			for (i = 0; i < p1-s1+1; i += len) {
+				len = u8_nextlen(s1+i);
+				v++;
+			}
 			break;
 		}
 	}
@ -811,6 +1059,18 @@ Cell *sindex(Node **a, int nnn)		/* index(a[0], a[1]) */
 	return(z);
 }

+/*
+ * Return 1 if s contains at least one multi-byte (2 bytes or more)
+ * utf-8 character as reported by u8_nextlen(), 0 otherwise.
+ */
+int has_utf8(char *s)
+{
+	while (*s != 0) {
+		int step = u8_nextlen(s);
+
+		if (step > 1)
+			return 1;
+		s += step;
+	}
+	return 0;
+}
+
 #define	MAXNUMSIZE	50

 int format(char **pbuf, int *pbufsize, const char *s, Node *a)	/* printf-like conversions */
@ -853,7 +1113,6 @@ int format(char **pbuf, int *pbufsize, const char *s, Node *a)	/* printf-like co
 			s += 2;
 			continue;
 		}
-		/* have to be real careful in case this is a huge number, eg, %100000d */
 		fmtwd = atoi(s+1);
 		if (fmtwd < 0)
 			fmtwd = -fmtwd;
@ -929,7 +1188,8 @@ int format(char **pbuf, int *pbufsize, const char *s, Node *a)	/* printf-like co
 			n = fmtwd;
 		adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format5");
 		switch (flag) {
-		case '?':	snprintf(p, BUFSZ(p), "%s", fmt);	/* unknown, so dump it too */
+		case '?':
+			snprintf(p, BUFSZ(p), "%s", fmt);	/* unknown, so dump it too */
 			t = getsval(x);
 			n = strlen(t);
 			if (fmtwd > n)
@ -943,37 +1203,186 @@ int format(char **pbuf, int *pbufsize, const char *s, Node *a)	/* printf-like co
 		case 'f':	snprintf(p, BUFSZ(p), fmt, getfval(x)); break;
 		case 'd':	snprintf(p, BUFSZ(p), fmt, (intmax_t) getfval(x)); break;
 		case 'u':	snprintf(p, BUFSZ(p), fmt, (uintmax_t) getfval(x)); break;
-		case 's':
+
+		case 's': {
 			t = getsval(x);
 			n = strlen(t);
-			if (fmtwd > n)
-				n = fmtwd;
-			if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7"))
-				FATAL("huge string/format (%d chars) in printf %.30s... ran format() out of memory", n, t);
-			snprintf(p, BUFSZ(p), fmt, t);
+			/* if simple format or no utf-8 in the string, sprintf works */
+			if (!has_utf8(t) || strcmp(fmt,"%s") == 0) {
+				if (fmtwd > n)
+					n = fmtwd;
+				if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7"))
+					FATAL("huge string/format (%d chars) in printf %.30s..." \
+						" ran format() out of memory", n, t);
+				snprintf(p, BUFSZ(p), fmt, t);
+				break;
+			}
+
+			/* get here if string has utf-8 chars and fmt is not plain %s */
+			/* "%-w.ps", where -, w and .p are all optional */
+			/* '0' before the w is a flag character */
+			/* fmt points at % */
+			int ljust = 0, wid = 0, prec = n, pad = 0;
+			char *f = fmt+1;
+			if (f[0] == '-') {
+				ljust = 1;
+				f++;
+			}
+			// flags '0' and '+' are recognized but skipped
+			if (f[0] == '0') {
+				f++;
+				if (f[0] == '+')
+					f++;
+			}
+			if (f[0] == '+') {
+				f++;
+				if (f[0] == '0')
+					f++;
+			}
+			if (isdigit(f[0])) { /* there is a wid */
+				wid = strtol(f, &f, 10);
+			}
+			if (f[0] == '.') { /* there is a .prec */
+				prec = strtol(++f, &f, 10);
+			}
+			if (prec > u8_strlen(t))
+				prec = u8_strlen(t);
+			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
+			int i, k, n;
+			
+			if (ljust) { // print prec chars from t, then pad blanks
+				n = u8_char2byte(t, prec);
+				for (k = 0; k < n; k++) {
+					//putchar(t[k]);
+					*p++ = t[k];
+				}
+				for (i = 0; i < pad; i++) {
+					//printf(" ");
+					*p++ = ' ';
+				}
+			} else { // print pad blanks, then prec chars from t
+				for (i = 0; i < pad; i++) {
+					//printf(" ");
+					*p++ = ' ';
+				}
+				n = u8_char2byte(t, prec);
+				for (k = 0; k < n; k++) {
+					//putchar(t[k]);
+					*p++ = t[k];
+				}
+			}
+			*p = 0;
 			break;
-		case 'c':
+		}
+
+               case 'c': {
+			/*
+			 * If a numeric value is given, awk should just turn
+			 * it into a character and print it:
+			 *      BEGIN { printf("%c\n", 65) }
+			 * prints "A".
+			 *
+			 * But what if the numeric value is > 128 and
+			 * represents a valid Unicode code point?!? We do
+			 * our best to convert it back into UTF-8. If we
+			 * can't, we output the encoding of the Unicode
+			 * "invalid character", 0xFFFD.
+			 */
 			if (isnum(x)) {
-				if ((int)getfval(x))
-					snprintf(p, BUFSZ(p), fmt, (int) getfval(x));
-				else {
+				int charval = (int) getfval(x);
+
+				if (charval != 0) {
+					if (charval < 128 || awk_mb_cur_max == 1)
+						snprintf(p, BUFSZ(p), fmt, charval);
+					else {
+						// possible unicode character
+						size_t count;
+						char *bs = wide_char_to_byte_str(charval, &count);
+
+						if (bs == NULL)	{ // invalid character
+							// use unicode invalid character, 0xFFFD
+							bs = "\357\277\275";
+							count = 3;
+						}
+						t = bs;
+						n = count;
+						goto format_percent_c;
+					}
+				} else {
 					*p++ = '\0'; /* explicit null byte */
 					*p = '\0';   /* next output will start here */
 				}
-			} else
+				break;
+			}
+			t = getsval(x);
+			n = u8_nextlen(t);
+		format_percent_c:
+			if (n < 2) { /* not utf8 */
 				snprintf(p, BUFSZ(p), fmt, getsval(x)[0]);
+				break;
+			}
+
+			// utf8 character, almost same song and dance as for %s
+			int ljust = 0, wid = 0, prec = n, pad = 0;
+			char *f = fmt+1;
+			if (f[0] == '-') {
+				ljust = 1;
+				f++;
+			}
+			// flags '0' and '+' are recognized but skipped
+			if (f[0] == '0') {
+				f++;
+				if (f[0] == '+')
+					f++;
+			}
+			if (f[0] == '+') {
+				f++;
+				if (f[0] == '0')
+					f++;
+			}
+			if (isdigit(f[0])) { /* there is a wid */
+				wid = strtol(f, &f, 10);
+			}
+			if (f[0] == '.') { /* there is a .prec */
+				prec = strtol(++f, &f, 10);
+			}
+			if (prec > 1)           // %c --> only one character
+				prec = 1;
+			pad = wid>prec ? wid - prec : 0;  // has to be >= 0
+			int i;
+
+			if (ljust) { // print one char from t, then pad blanks
+				for (i = 0; i < n; i++)
+					*p++ = t[i];
+				for (i = 0; i < pad; i++) {
+					//printf(" ");
+					*p++ = ' ';
+				}
+			} else { // print pad blanks, then prec chars from t
+				for (i = 0; i < pad; i++) {
+					//printf(" ");
+					*p++ = ' ';
+				}
+				for (i = 0; i < n; i++)
+					*p++ = t[i];
+			}
+			*p = 0;
 			break;
+		}
 		default:
 			FATAL("can't happen: bad conversion %c in format()", flag);
 		}
+
 		tempfree(x);
 		p += strlen(p);
 		s++;
 	}
 	*p = '\0';
 	free(fmt);
-	for ( ; a; a = a->nnext)		/* evaluate any remaining args */
-		execute(a);
+	for ( ; a; a = a->nnext) {		/* evaluate any remaining args */
+		x = execute(a);
+		tempfree(x);
+	}
 	*pbuf = buf;
 	*pbufsize = bufsize;
 	return p - buf;
@ -1196,16 +1605,17 @@ Cell *cat(Node **a, int q)	/* a[0] cat a[1] */

 	x = execute(a[0]);
 	n1 = strlen(getsval(x));
-	adjbuf(&s, &ssz, n1, recsize, 0, "cat1");
+	adjbuf(&s, &ssz, n1 + 1, recsize, 0, "cat1");
 	memcpy(s, x->sval, n1);

+	tempfree(x);
+
 	y = execute(a[1]);
 	n2 = strlen(getsval(y));
 	adjbuf(&s, &ssz, n1 + n2 + 1, recsize, 0, "cat2");
 	memcpy(s + n1, y->sval, n2);
 	s[n1 + n2] = '\0';

-	tempfree(x);
 	tempfree(y);

 	z = gettemp();
@ -1263,23 +1673,27 @@ Cell *split(Node **a, int nnn)	/* split(a[0], a[1], a[2]); a[3] is type */
 	int sep;
 	char temp, num[50];
 	int n, tempstat, arg3type;
+	int j;
 	double result;

 	y = execute(a[0]);	/* source string */
 	origs = s = strdup(getsval(y));
+	tempfree(y);
 	arg3type = ptoi(a[3]);
-	if (a[2] == NULL)		/* fs string */
+	if (a[2] == NULL) {		/* BUG: CSV should override implicit fs but not explicit */
 		fs = getsval(fsloc);
-	else if (arg3type == STRING) {	/* split(str,arr,"string") */
+	} else if (arg3type == STRING) {	/* split(str,arr,"string") */
 		x = execute(a[2]);
 		fs = origfs = strdup(getsval(x));
 		tempfree(x);
-	} else if (arg3type == REGEXPR)
+	} else if (arg3type == REGEXPR) {
 		fs = "(regexpr)";	/* split(str,arr,/regexpr/) */
-	else
+	} else {
 		FATAL("illegal type of split");
+	}
 	sep = *fs;
 	ap = execute(a[1]);	/* array name */
+/* BUG 7/26/22: this appears not to reset array: see C1/asplit */
 	freesymtab(ap);
 	DPRINTF("split: s=|%s|, a=%s, sep=|%s|\n", s, NN(ap->nval), fs);
 	ap->tval &= ~STR;
@ -1333,7 +1747,41 @@ Cell *split(Node **a, int nnn)	/* split(a[0], a[1], a[2]); a[3] is type */
 			setsymtab(num, s, 0.0, STR, (Array *) ap->sval);
  spdone:
 		pfa = NULL;
-	} else if (sep == ' ') {
+
+	} else if (a[2] == NULL && CSV) {	/* CSV only if no explicit separator */
+		/*
+		 * Scratch buffer for the field being built, reused for each
+		 * field.  No field is longer than the source string, so
+		 * strlen(s) bytes plus one for the terminating NUL suffice.
+		 */
+		char *newt = (char *) malloc(strlen(s) + 1);
+		if (newt == NULL)
+			FATAL("out of memory in split");
+		for (;;) {
+			char *fr = newt;	/* write position in newt */
+			n++;
+			if (*s == '"' ) { /* start of "..." */
+				for (s++ ; *s != '\0'; ) {
+					if (*s == '"' && s[1] != '\0' && s[1] == '"') {
+						s += 2; /* doubled quote */
+						*fr++ = '"';
+					} else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {
+						s++; /* skip over closing quote */
+						break;
+					} else {
+						*fr++ = *s++;
+					}
+				}
+				*fr++ = 0;
+			} else {	/* unquoted field */
+				while (*s != ',' && *s != '\0')
+					*fr++ = *s++;
+				*fr++ = 0;
+			}
+			/* store the field as element n; numeric-looking fields also get a number */
+			snprintf(num, sizeof(num), "%d", n);
+			if (is_number(newt, &result))
+				setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);
+			else
+				setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);
+			/* step over the separating comma; a NUL here ends the record */
+			if (*s++ == '\0')
+				break;
+		}
+		free(newt);
+
+	} else if (!CSV && sep == ' ') { /* usual case: split on white space */
 		for (n = 0; ; ) {
 #define ISWS(c)	((c) == ' ' || (c) == '\t' || (c) == '\n')
 			while (ISWS(*s))
@ -1356,19 +1804,25 @@ Cell *split(Node **a, int nnn)	/* split(a[0], a[1], a[2]); a[3] is type */
 			if (*s != '\0')
 				s++;
 		}
+
 	} else if (sep == 0) {	/* new: split(s, a, "") => 1 char/elem */
-		for (n = 0; *s != '\0'; s++) {
-			char buf[2];
+		for (n = 0; *s != '\0'; s += u8_nextlen(s)) {
+			char buf[10];
 			n++;
 			snprintf(num, sizeof(num), "%d", n);
-			buf[0] = *s;
-			buf[1] = '\0';
+
+			for (j = 0; j < u8_nextlen(s); j++) {
+				buf[j] = s[j];
+			}
+			buf[j] = '\0';
+
 			if (isdigit((uschar)buf[0]))
 				setsymtab(num, buf, atof(buf), STR|NUM, (Array *) ap->sval);
 			else
 				setsymtab(num, buf, 0.0, STR, (Array *) ap->sval);
 		}
-	} else if (*s != '\0') {
+
+	} else if (*s != '\0') {  /* some random single character */
 		for (;;) {
 			n++;
 			t = s;
@ -1387,7 +1841,6 @@ Cell *split(Node **a, int nnn)	/* split(a[0], a[1], a[2]); a[3] is type */
 		}
 	}
 	tempfree(ap);
-	tempfree(y);
 	xfree(origs);
 	xfree(origfs);
 	x = gettemp();
@ -1527,7 +1980,8 @@ static char *nawk_convert(const char *s, int (*fun_c)(int),
 	const char *ps = NULL;
 	size_t n       = 0;
 	wchar_t wc;
-	size_t sz = MB_CUR_MAX;
+	const size_t sz = awk_mb_cur_max;
+	int unused;

 	if (sz == 1) {
 		buf = tostring(s);
@ -1547,7 +2001,7 @@ static char *nawk_convert(const char *s, int (*fun_c)(int),
 		 * doesn't work.)
 		 * Increment said variable to avoid a different warning.
 		 */
-		int unused = wctomb(NULL, L'\0');
+		unused = wctomb(NULL, L'\0');
 		unused++;

 		ps   = s;
@ -1601,6 +2055,8 @@ static char *nawk_tolower(const char *s)
 	return nawk_convert(s, tolower, towlower);
 }

+
+
 Cell *bltin(Node **a, int n)	/* builtin functions. a[0] is type, a[1] is arg list */
 {
 	Cell *x, *y;
@ -1622,7 +2078,7 @@ Cell *bltin(Node **a, int n)	/* builtin functions. a[0] is type, a[1] is arg lis
 		if (isarr(x))
 			u = ((Array *) x->sval)->nelem;	/* GROT.  should be function*/
 		else
-			u = strlen(getsval(x));
+			u = u8_strlen(getsval(x));
 		break;
 	case FLOG:
 		errno = 0;
@ -1808,8 +2264,10 @@ Cell *bltin(Node **a, int n)	/* builtin functions. a[0] is type, a[1] is arg lis
 	setfval(x, u);
 	if (nextarg != NULL) {
 		WARNING("warning: function has too many arguments");
-		for ( ; nextarg; nextarg = nextarg->nnext)
-			execute(nextarg);
+		for ( ; nextarg; nextarg = nextarg->nnext) {
+			y = execute(nextarg);
+			tempfree(y);
+		}
 	}
 	return(x);
 }
@ -1876,13 +2334,13 @@ static void stdinit(void)	/* in case stdin, etc., are not constants */
 	if (files == NULL)
 		FATAL("can't allocate file memory for %zu files", nfiles);
        files[0].fp = stdin;
-	files[0].fname = "/dev/stdin";
+	files[0].fname = tostring("/dev/stdin");
 	files[0].mode = LT;
        files[1].fp = stdout;
-	files[1].fname = "/dev/stdout";
+	files[1].fname = tostring("/dev/stdout");
 	files[1].mode = GT;
        files[2].fp = stderr;
-	files[2].fname = "/dev/stderr";
+	files[2].fname = tostring("/dev/stderr");
 	files[2].mode = GT;
 }

@ -1956,8 +2414,8 @@ const char *filename(FILE *fp)
 	return "???";
 }

- Cell *closefile(Node **a, int n)
- {
+Cell *closefile(Node **a, int n)
+{
 	Cell *x;
 	size_t i;
 	bool stat;
@ -1968,8 +2426,15 @@ const char *filename(FILE *fp)
 	for (i = 0; i < nfiles; i++) {
 		if (!files[i].fname || strcmp(x->sval, files[i].fname) != 0)
 			continue;
-		if (ferror(files[i].fp))
-			FATAL("i/o error occurred on %s", files[i].fname);
+		if (files[i].mode == GT || files[i].mode == '|')
+			fflush(files[i].fp);
+		if (ferror(files[i].fp)) {
+			if ((files[i].mode == GT && files[i].fp != stderr)
+			  || files[i].mode == '|')
+				FATAL("write error on %s", files[i].fname);
+			else
+				WARNING("i/o error occurred on %s", files[i].fname);
+		}
 		if (files[i].fp == stdin || files[i].fp == stdout ||
 		    files[i].fp == stderr)
 			stat = freopen("/dev/null", "r+", files[i].fp) == NULL;
@ -1978,9 +2443,8 @@ const char *filename(FILE *fp)
 		else
 			stat = fclose(files[i].fp) == EOF;
 		if (stat)
-			FATAL("i/o error occurred closing %s", files[i].fname);
-		if (i > 2)	/* don't do /dev/std... */
-			xfree(files[i].fname);
+			WARNING("i/o error occurred closing %s", files[i].fname);
+		xfree(files[i].fname);
 		files[i].fname = NULL;	/* watch out for ref thru this */
 		files[i].fp = NULL;
 		break;
@ -1989,7 +2453,7 @@ const char *filename(FILE *fp)
 	x = gettemp();
 	setfval(x, (Awkfloat) (stat ? -1 : 0));
 	return(x);
- }
+}

 void closeall(void)
 {
@ -1999,18 +2463,24 @@ void closeall(void)
 	for (i = 0; i < nfiles; i++) {
 		if (! files[i].fp)
 			continue;
-		if (ferror(files[i].fp))
-			FATAL( "i/o error occurred on %s", files[i].fname );
-		if (files[i].fp == stdin)
+		if (files[i].mode == GT || files[i].mode == '|')
+			fflush(files[i].fp);
+		if (ferror(files[i].fp)) {
+			if ((files[i].mode == GT && files[i].fp != stderr)
+			  || files[i].mode == '|')
+				FATAL("write error on %s", files[i].fname);
+			else
+				WARNING("i/o error occurred on %s", files[i].fname);
+		}
+		if (files[i].fp == stdin || files[i].fp == stdout ||
+		    files[i].fp == stderr)
 			continue;
 		if (files[i].mode == '|' || files[i].mode == LE)
 			stat = pclose(files[i].fp) == -1;
-		else if (files[i].fp == stdout || files[i].fp == stderr)
-			stat = fflush(files[i].fp) == EOF;
 		else
 			stat = fclose(files[i].fp) == EOF;
 		if (stat)
-			FATAL( "i/o error occurred while closing %s", files[i].fname );
+			WARNING("i/o error occurred while closing %s", files[i].fname);
 	}
 }

@ -2094,6 +2564,7 @@ Cell *gsub(Node **a, int nnn)	/* global substitute */
 	fa *pfa;
 	int mflag, tempstat, num;
 	int bufsz = recsize;
+	int charlen = 0;

 	if ((buf = (char *) malloc(bufsz)) == NULL)
 		FATAL("out of memory in gsub");
@ -2135,7 +2606,9 @@ Cell *gsub(Node **a, int nnn)	/* global substitute */
 				if (*t == '\0')	/* at end */
 					goto done;
 				adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gsub");
-				*pb++ = *t++;
+				charlen = u8_nextlen(t);
+				while (charlen-- > 0)
+					*pb++ = *t++;
 				if (pb > buf + bufsz)	/* BUG: not sure of this test */
 					FATAL("gsub result0 %.30s too big; can't happen", buf);
 				mflag = 0;
@ -2364,3 +2837,41 @@ void backsub(char **pb_ptr, const char **sptr_ptr)	/* handle \\& variations */
 	*pb_ptr = pb;
 	*sptr_ptr = sptr;
 }
+
+/*
+ * wide_char_to_byte_str: encode one Unicode code point as UTF-8.
+ *
+ * rune is the code point to encode.  On success *outlen receives the
+ * number of bytes in the encoding (1 to 4) and the return value points
+ * at a NUL-terminated static buffer that the next call overwrites.
+ * Returns NULL, leaving *outlen untouched, when rune is negative, above
+ * 0x10FFFF, or a UTF-16 surrogate (0xD800-0xDFFF); none of those has a
+ * well-formed UTF-8 encoding.
+ */
+static char *wide_char_to_byte_str(int rune, size_t *outlen)
+{
+	static char buf[5];
+	int len = 0;
+
+	if (rune < 0 || rune > 0x10FFFF)
+		return NULL;
+	if (rune >= 0xD800 && rune <= 0xDFFF)	// surrogates are not encodable
+		return NULL;
+
+	if (rune <= 0x0000007F) {
+		// 0xxxxxxx
+		buf[len++] = (char) rune;
+	} else if (rune <= 0x000007FF) {
+		// 110xxxxx 10xxxxxx
+		buf[len++] = (char) (0xC0 | (rune >> 6));
+		buf[len++] = (char) (0x80 | (rune & 0x3F));
+	} else if (rune <= 0x0000FFFF) {
+		// 1110xxxx 10xxxxxx 10xxxxxx
+		buf[len++] = (char) (0xE0 | (rune >> 12));
+		buf[len++] = (char) (0x80 | ((rune >> 6) & 0x3F));
+		buf[len++] = (char) (0x80 | (rune & 0x3F));
+	} else {
+		// 0x00010000 - 0x10FFFF
+		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+		buf[len++] = (char) (0xF0 | (rune >> 18));
+		buf[len++] = (char) (0x80 | ((rune >> 12) & 0x3F));
+		buf[len++] = (char) (0x80 | ((rune >> 6) & 0x3F));
+		buf[len++] = (char) (0x80 | (rune & 0x3F));
+	}
+
+	buf[len] = '\0';	// len is at most 4, so this stays inside buf
+	*outlen = len;
+
+	return buf;
+}
--- a/contrib/one-true-awk/testdir/T.argv
+++ b/contrib/one-true-awk/testdir/T.argv
@ -97,6 +97,12 @@ echo '111
 $awk '{print L $0}' L=11 foo0 L=22 foo0 >foo2
 diff foo1 foo2 || echo 'BAD: T.argv (L=11 L=22)'

+echo >foo0
+echo 'name=value
+name=value' >foo1
+$awk 'BEGIN { print ARGV[1] } { print ARGV[1] }' name=value foo0 >foo2
+diff foo1 foo2 || echo 'BAD: T.argv assignment operand modified'
+
 echo 3.345 >foo1
 $awk 'BEGIN { print ARGV[1] + ARGV[2]}' 1 2.345 >foo2
 diff foo1 foo2 || echo 'BAD: T.argv (ARGV[1] + ARGV[2])'
--- a/contrib/one-true-awk/testdir/T.csv
+++ b/contrib/one-true-awk/testdir/T.csv
@ -0,0 +1,81 @@
+#!/bin/sh
+
+echo T.csv: tests of csv field splitting, no embedded newlines
+
+awk=${awk-../a.out}
+
+# Driver: the awk program below reads the test table from the here-document
+# that follows it.  Blank lines and lines whose first tab-separated field
+# starts with "#" are skipped.  A line whose first field matches /try/ opens
+# a group; once the first "try " is removed, the rest of that line is the
+# awk program under test.  Each following line, up to a blank line, is one
+# case: every field but the last is joined with tabs and piped through
+# ./echo into the program (output in foo1); the last field is the expected
+# output, written to foo2 with \t and \n expanded, or foo2 is left empty
+# when that field is "".  cmp compares foo1 with foo2 and a mismatch prints
+# "test GROUP.CASE failed".
+# NOTE: the generated commands always run ../a.out --csv -F"\t"; only the
+# driver itself is run by $awk.
+$awk '
+BEGIN {
+	FS = "\t"
+	awk = "../a.out --csv"
+}
+NF == 0 || $1 ~ /^#/ {
+	next
+}
+$1 ~ /try/ {	# new test
+	nt++
+	sub(/try /, "")
+	prog = $0
+	printf("%3d  %s\n", nt, prog)
+	prog = sprintf("%s -F\"\\t\" '"'"'%s'"'"'", awk, prog)
+	# print "prog is", prog
+	nt2 = 0
+	while (getline > 0) {
+		if (NF == 0)	# blank line terminates a sequence
+			break
+		input = $1
+		for (i = 2; i < NF; i++)	# input data
+			input = input "\t" $i
+		test = sprintf("./echo '"'"'%s'"'"' | %s >foo1; ",
+			input, prog)
+		if ($NF == "\"\"")
+			output = ">foo2;"
+		else
+			output = sprintf("./echo '"'"'%s'"'"' >foo2; ", $NF)
+		gsub(/\\t/, "\t", output)
+		gsub(/\\n/, "\n", output)
+		run = sprintf("cmp foo1 foo2 || echo test %d.%d failed",
+			nt, ++nt2)
+		# print  "input is", input
+		# print  "test is", test
+		# print  "output is", output
+		# print  "run is", run
+		system(test output run)
+	}
+	tt += nt2
+}
+END { print tt, "tests" }
+' <<\!!!!
+# General format:
+# try program as rest of line
+# $1	$2	$3	output1  (\t for tab, \n for newline,
+# $1	$2	$3	output2  ("" for null)
+# ... terminated by blank line
+
+
+try  { for (i=1; i<=NF; i++) printf("[%s]", $i); printf("\n") }
+a	[a]
+  a	[  a]
+,a	[][a]
+ , a	[ ][ a]
+a,b	[a][b]
+a,b,c	[a][b][c]
+""	[]
+"abc"	[abc]
+"a""b"	[a"b]
+"a","b"	[a][b]
+a""b	[a""b]
+"a,b"	[a,b]
+""""	["]
+""""""	[""]
+"""x"""	["x"]
+""","""	[","]
+,,""	[][][]
+a""b	[a""b]
+a''b	[a''b]
+,,	[][][]
+a,	[a][]
+"",	[][]
+,	[][]
+a"b	[a"b]	
+!!!!
--- a/contrib/one-true-awk/testdir/T.misc
+++ b/contrib/one-true-awk/testdir/T.misc
@ -504,3 +504,9 @@ cmp -s foo1 foo2 || echo 'BAD: T.misc END must preserve $0'
 echo 'E 2' >foo1
 (trap '' PIPE; "$awk" 'BEGIN { print "hi"; }' 2>/dev/null; echo "E $?" >foo2) | :
 cmp -s foo1 foo2 || echo 'BAD: T.misc exit status on I/O error'
+
+# Check for clobbering of the lexer's regular expression buffer.
+# If the output is "a1" instead of "1b", /b/ clobbered /a/.
+echo 1b >foo1
+echo ab | $awk '{ sub(/a/, "b" ~ /b/); print }' >foo2
+cmp -s foo1 foo2 || echo 'BAD: T.misc lexer regex buffer clobbered'
--- a/contrib/one-true-awk/testdir/T.utf
+++ b/contrib/one-true-awk/testdir/T.utf
@ -0,0 +1,194 @@
+#!/bin/sh
+
+echo T.utf: tests of utf functions
+
+awk=${awk-../a.out}
+
+# Driver: the awk program below reads the test table from the here-document
+# that follows it.  Blank lines and lines whose first tab-separated field
+# starts with "#" are skipped.  A line whose first field matches /try/ opens
+# a group; once the first "try NAME " is removed (NAME being letters, digits
+# or underscores), the rest of that line is the awk program under test.
+# Each following line, up to a blank line, is one case: every field but the
+# last is joined with tabs and piped through ./echo into the program (output
+# in foo1); the last field is the expected output, written to foo2 with \t
+# and \n expanded, or foo2 is left empty when that field is "".  diff
+# compares foo1 with foo2 and a mismatch prints "test GROUP.CASE failed".
+# NOTE: the generated commands always run ../a.out -F"\t"; only the driver
+# itself is run by $awk.
+$awk '
+BEGIN {
+	FS = "\t"
+	awk = "../a.out"
+}
+NF == 0 || $1 ~ /^#/ {
+	next
+}
+$1 ~ /try/ {	# new test
+	nt++
+	sub(/try [a-zA-Z_0-9]+ /, "")
+	prog = $0
+	printf("try %3d %s\n", nt, prog)
+	prog = sprintf("%s -F\"\\t\" '"'"'%s'"'"'", awk, prog)
+	# print "prog is", prog
+	nt2 = 0
+	while (getline > 0) {
+		if (NF == 0)	# blank line terminates a sequence
+			break
+		input = $1
+		for (i = 2; i < NF; i++)	# input data
+			input = input "\t" $i
+		test = sprintf("./echo '"'"'%s'"'"' | %s >foo1; ",
+			input, prog)
+		if ($NF == "\"\"")
+			output = ">foo2;"
+		else
+			output = sprintf("./echo '"'"'%s'"'"' >foo2; ", $NF)
+		gsub(/\\t/, "\t", output)
+		gsub(/\\n/, "\n", output)
+		run = sprintf("diff foo1 foo2 || echo test %d.%d failed",
+			nt, ++nt2)
+		# print  "input is", input
+		# print  "test is", test
+		# print  "output is", output
+		# print  "run is", run
+		system(test output run)
+	}
+	tt += nt2
+}
+END { print tt, "tests" }
+' <<\!!!!
+# General format:
+# try program as rest of line
+# $1	$2	$3	output1  (\t for tab, \n for newline,
+# $1	$2	$3	output2  ("" for null)
+# ... terminated by blank line
+
+# try another program...
+
+try length { print length($1) }
+	0
+a	1
+の今がその時だ	7
+Сейчас	6
+现在是时候了	6
+给所有的好男	6
+来参加聚会。	6
+😀	1
+🖕 finger	8
+Τωρα	4
+για	3
+να	2
+עכשיו	5
+לכל	3
+לבוא	4
+の今がその時だ	7
+지금이	3
+모든	2
+파티에	3
+Сейчас	6
+для	3
+прийти	6
+
+try index { print index($1, $2) }
+abc	a	1
+abc	b	2
+abc	x	0
+现在是时候了	""	0
+现在是时候了	了	6
+现在是时候了	在是	2
+现在是时候了	x	0
+现x在是时候了	x	2
+🖕 fingerすべての善人のためにすべての善人のために	f	3
+🖕 finger🖕	r🖕	8
+
+try substr { print substr($0, 2, 3) }
+abcdef	bcd
+Τωρα ειναι η	ωρα
+Τω	ω
+지금 이절호의	금 이
+xпyрийти	пyр
+
+try rematch { print $1 ~ $2 }
+abc	a	1
+abc	x	0
+すべての善人のために	の	1
+すべての善人のために	の.*の	1
+すべての善人のために	の.*て	0
+Τωρα	ω+	1
+
+# replace first occurrence of $2 by $3 in $1
+try sub { n = sub($2, $3, $1); print n, $1 }
+abcdef	bc	XYZ	1 aXYZdef
+abcdef	xy	XYZ	0 abcdef
+の今がその時だ	の	NO	1 NO今がその時だ
+🖕 finger	🖕.*g	FING	1 FINGer
+Сейчас	.	x	1 xейчас
+
+# replace all occurrences of $2 by $3 in $1
+try gsub { n = gsub($2, $3, $1); print n, $1 }
+abcdef	bc	XYZ	1 aXYZdef
+abcdef	xy	XYZ	0 abcdef
+の今がその時だ	の	NO	2 NO今がそNO時だ
+🖕 finger	🖕.*g	FING	1 FINGer
+Сейчас	.	x	6 xxxxxx
+
+try match { print match($1, $2), RSTART, RLENGTH }
+abc	[^a]	2 2 1
+abc	[^ab]	3 3 1
+すべての善人のために	[^す]	2 2 1
+すべての善人のために	[^ぁ-ゖ]	5 5 1
+abc	a	1 1 1
+abc	x	0 0 -1
+すべての善人のために	の	4 4 1
+すべての善人のために	の.*の	4 4 4
+すべての善人のために	の.*て	0 0 -1
+Τωρα	ω+	2 2 1
+Τωρα	x+	0 0 -1
+Τωρα	ω.	2 2 2
+すべての善人のために	[の]	4 4 1
+すべての善人のために	[ぁ-え]	0 0 -1
+すべての善人のために	[^ぁ-え]	1 1 1
+Τωρα ειναι η	[α-ω]	2 2 1
+Τωρα ειναι η	[α-ω]+	2 2 3
+xxxΤωρα ειναι η	[Α-Ω]	4 4 1
+για όλους τους καλούς ά	α.*α	3 3 15
+να έρθει στο πά	[^ν]	2 2 1
+
+# FS="" should split into unicode chars
+try emptyFS BEGIN {FS=""} {print NF}
+すべての善人のために	10
+の今がその時だ	7
+Сейчас	6
+现在是时候了	6
+给所有的好男	6
+来参加聚会。	6
+😀	1
+🖕 finger	8
+
+# printf(%N.Ns) for utf8 strings
+try printfs1 {printf("[%5.2s][%-5.2s]\n"), $1, $1}
+abcd	[   ab][ab   ]
+现在abc	[   现在][现在   ]
+现ωabc	[   现ω][现ω   ]
+ωabc	[   ωa][ωa   ]
+Сейчас	[   Се][Се   ]
+Сейxyz	[   Се][Се   ]
+😀	[    😀][😀    ]
+
+# printf(%Ns) for utf8 strings: field width only, no precision
+try printfs2 {printf("[%5s][%-5s]\n"), $1, $1}
+abcd	[ abcd][abcd ]
+现在ab	[ 现在ab][现在ab ]
+a现在ab	[a现在ab][a现在ab]
+a现在abc	[a现在abc][a现在abc]
+现ωab	[ 现ωab][现ωab ]
+ωabc	[ ωabc][ωabc ]
+Сейчас	[Сейчас][Сейчас]
+😀	[    😀][😀    ]
+
+# printf(%.Ns) for utf8 strings: precision only, no field width
+try printfs3 {printf("[%.2s][%-.2s]\n"), $1, $1}
+abcd	[ab][ab]
+现在abc	[现在][现在]
+现ωabc	[现ω][现ω]
+ω	[ω][ω]
+😀	[😀][😀]
+
+# printf(%c) for utf
+try printfc {printf("%c %c\n", $1, substr($1,2,1))}
+すべての善人のために	す べ
+の今がその時だ	の 今
+Сейчас	С е
+现在是时候了	现 在
+😀🖕	😀 🖕
+
+!!!!
--- a/contrib/one-true-awk/testdir/T.utfre
+++ b/contrib/one-true-awk/testdir/T.utfre
@ -0,0 +1,234 @@
+echo T.utfre: tests of regular expression code for Unicode/utf-8
+# adapted from T.re
+
+# Driver: each non-blank line of the here-document has up to three
+# tab-separated fields: regular expression, operator (~ or !~) and test
+# string.  An empty first or second field reuses the value from an earlier
+# line, and a third field of "" stands for the empty string.  For every line
+# with more than two fields the driver pipes the string through ./echo into
+# ../a.out running the opposite test, so output ("N fails RE OP STRING",
+# N being the here-document line number) appears only when the expected
+# match result does not hold.
+# NOTE: the driver itself is run by the awk found on PATH, not by ../a.out.
+awk '
+BEGIN {
+	FS = "\t"
+	awk = "../a.out"
+}
+NF == 0 {
+	next
+}
+$1 != "" {	# new test
+	re = $1
+}
+$2 != "" {	# either ~ or !~
+	op = $2
+	if (op == "~")
+		neg = "!"
+	else if (op == "!~")
+		neg = ""
+}
+$3 != "" {	# new test string
+	str = $3
+}
+$3 == "\"\"" {	# explicit empty line
+	$3 = ""
+}
+NF > 2 {	# generate a test
+	input = $3
+	test = sprintf("./echo '"'"'%s'"'"' | %s '"'"'%s/%s/ {print \"%d fails %s %s %s\"}'"'"'",
+		input, awk, neg, re, NR, re, op, input)
+	# printf(" %3d   %s %s %s:\n", NR, re, op, input)
+	# print "test is |" test "|"
+	system(test)
+	# system("bprint -c ../a.out")
+	nt++
+}
+END { print "	" nt, "tests" }
+' <<\!!!!
+	~	🖕
+		🖕🖕
+		🖕🖕🖕
+		""
+🖕	~	🖕
+		b🖕
+		b🖕b
+	!~	""
+		时
+		xxxxx
+.	~	时
+		x时x
+		🙂
+	!~	""			
+.の	~	xの
+		xxの
+		xのx
+	!~	の
+		のx
+		""
+$	~	x
+		🙂
+		""
+.$	~	모
+		xx모
+		x모x
+	!~	""
+д$	~	д
+		bд
+		bbbд
+	!~	дb
+		x
+		""
+^	~	и
+		""
+		^
+^λ$	~	λ
+	!~	xλ
+		λx
+		xλx
+		""
+^λ.$	~	λx
+		λλ
+	!~	xλ
+		λλλ
+		λxy
+		""
+^$	~	""
+	!~	に
+		^
+^.해	~	め해
+		め해해
+	!~	해
+		""
+^.*해	~	해
+		め해
+		めめめめめめ해
+	!~	""
+^.+해	~	め해
+		めめめめめめ해
+	!~	""
+		해
+		해め
+해*	~	""
+		해
+		해해해해
+		め해
+		めめめめ
+해해*	~	해
+		해해해
+		め해
+	!~	めめめめ
+		""
+\$	~	🖕$
+		$
+		$🖕
+		🖕$🖕
+	!~	""
+		🖕
+\.	~	.
+	!~	🖕
+		""
+xθ+y	~	xθy
+		xθθy
+		xθθθθθθy
+	!~	θy
+		xy
+		xθ
+xθ?y	~	xy
+		xθy
+	!~	xθθy
+θ?b?の?	~	""
+		x
+^a?b?め	~	め
+		aめ
+		bめ
+		abめ
+		めa
+	!~	""
+		ab
+		aba			
+[Α-Ω]	~	Α
+		aΔb
+		xyΩ
+	!~	abc
+		β
+		""
+[^Α-Ω]	~	δ
+		aΔb
+		xyΩ
+	!~	Α
+		Δ
+		""
+[Α-ΔΦ-Ω]	~	Α
+		Β
+		Δ
+		Φ
+		Ω
+	!~	Π
+		Σ
+Π[[:lower:]]+	~	Πa
+		Πab
+	!~	Π
+		ΠX
+		Π:
+		Π[
+		Π]
+の[0-9]+に	~	の0に
+		の23に
+		の12345に
+	!~	0に
+		のに
+の[0-9]?に	~	のに
+		の1に
+	!~	の23に
+の[[]に	~	の[に
+	!~	のに
+		の[[]に
+		の]に
+の[[-]に	~	の[に
+		の-に
+	!~	のに
+		の[[]に
+		の]に
+の[[-a]に	~	の[に
+		のaに
+		の]に
+	!~	のに
+		の[[]に
+		の-に
+の[]-a]に	~	の]に
+		のaに
+	!~	のに
+		の[に
+		の-に
+の[]]に	~	の]に
+	!~	のに
+		の[]]に
+		の[に
+の[^[]に	~	のaに
+	!~	の[に
+の[-]に	~	の-に
+	!~	のに
+		の+に
+の[^-]に	~	の+に
+	!~	の-に
+		のに
+の[][]に	~	の[に
+		の]に
+	!~	のに
+		の][に
+		の[]に
+の[z-a]に	~	のに
+	!~	の
+		に
+		のaに
+		のzに
+		の-に
+に|だ	~	だ
+		に
+		だに
+	!~	a
+		""
+^στο|τους$	~	στο
+		στοd
+		aτους
+		τους
+	!~	xστο
+		τουςa
+^(στο|τους)$	~	στο
+		τους
+	!~	στοτους
+		στοx
+		cτους
+!!!!
--- a/contrib/one-true-awk/testdir/tt.15
+++ b/contrib/one-true-awk/testdir/tt.15
@ -15,7 +15,7 @@ BEGIN {
 END  { printline() }

 function addword(w) {
-    ## print "adding [", w, "] ", length(w), length(line), maxlen
+    print "adding [", w, "] ", length(w), length(line), maxlen
    if (length(line) + length(w) > maxlen)
        printline()
    if (length(w) > 2 && ( w ~ /[\.!]["?)]?$/ || w ~ /[?!]"?$/) &&
--- a/contrib/one-true-awk/testdir/xc
+++ b/contrib/one-true-awk/testdir/xc
@ -1,17 +0,0 @@
-for i in $*
-do
-	echo $i >/dev/tty
-	echo $i '<<<'
-	cd ..
-	echo testdir/$i:
-	ind <testdir/$i
-	a.out -f testdir/$i >drek.c
-	cat drek.c
-	make drek || ( echo $i '	' bad compile; echo $i '	' bad compile >/dev/tty; continue )
-	cd testdir
-	time awk -f $i test.countries >foo1
-	time ../drek test.countries >foo2
-	cmp foo1 foo2 || ( echo $i '	' bad; echo $i '	' bad >/dev/tty; diff foo1 foo2 )
-	echo '>>>' $i
-	echo
-done
--- a/contrib/one-true-awk/tran.c
+++ b/contrib/one-true-awk/tran.c
@ -70,18 +70,6 @@ Cell	*literal0;

 extern Cell **fldtab;

-static void
-setfree(Cell *vp)
-{
-	if (&vp->sval == FS || &vp->sval == RS ||
-	    &vp->sval == OFS || &vp->sval == ORS ||
-	    &vp->sval == OFMT || &vp->sval == CONVFMT ||
-	    &vp->sval == FILENAME || &vp->sval == SUBSEP)
-		vp->tval |= DONTFREE;
-	else
-		vp->tval &= ~DONTFREE;
-}
-
 void syminit(void)	/* initialize symbol table with builtin vars */
 {
 	literal0 = setsymtab("0", "0", 0.0, NUM|STR|CON|DONTFREE, symtab);
@ -320,7 +308,7 @@ Awkfloat setfval(Cell *vp, Awkfloat f)	/* set float val of a Cell */
 	} else if (&vp->fval == NF) {
 		donerec = false;	/* mark $0 invalid */
 		setlastfld(f);
-		DPRINTF("setting NF to %g\n", f);
+		DPRINTF("setfval: setting NF to %g\n", f);
 	} else if (isrec(vp)) {
 		donefld = false;	/* mark $1... invalid */
 		donerec = true;
@ -360,6 +348,10 @@ char *setsval(Cell *vp, const char *s)	/* set string val of a Cell */
 		(void*)vp, NN(vp->nval), s, vp->tval, donerec, donefld);
 	if ((vp->tval & (NUM | STR)) == 0)
 		funnyvar(vp, "assign to");
+	if (CSV && (vp == rsloc))
+		WARNING("danger: don't set RS when --csv is in effect");
+	if (CSV && (vp == fsloc))
+		WARNING("danger: don't set FS when --csv is in effect");
 	if (isfld(vp)) {
 		donerec = false;	/* mark $0 invalid */
 		fldno = atoi(vp->nval);
@ -377,10 +369,9 @@ char *setsval(Cell *vp, const char *s)	/* set string val of a Cell */
 	t = s ? tostring(s) : tostring("");	/* in case it's self-assign */
 	if (freeable(vp))
 		xfree(vp->sval);
-	vp->tval &= ~(NUM|CONVC|CONVO);
+	vp->tval &= ~(NUM|DONTFREE|CONVC|CONVO);
 	vp->tval |= STR;
 	vp->fmt = NULL;
-	setfree(vp);
 	DPRINTF("setsval %p: %s = \"%s (%p) \", t=%o r,f=%d,%d\n",
 		(void*)vp, NN(vp->nval), t, (void*)t, vp->tval, donerec, donefld);
 	vp->sval = t;
@ -388,7 +379,7 @@ char *setsval(Cell *vp, const char *s)	/* set string val of a Cell */
 		donerec = false;	/* mark $0 invalid */
 		f = getfval(vp);
 		setlastfld(f);
-		DPRINTF("setting NF to %g\n", f);
+		DPRINTF("setsval: setting NF to %g\n", f);
 	}

 	return(vp->sval);
@ -576,7 +567,6 @@ Cell *catstr(Cell *a, Cell *b) /* concatenate a and b */

 char *qstring(const char *is, int delim)	/* collect string up to next delim */
 {
-	const char *os = is;
 	int c, n;
 	const uschar *s = (const uschar *) is;
 	uschar *buf, *bp;
@ -585,7 +575,7 @@ char *qstring(const char *is, int delim)	/* collect string up to next delim */
 		FATAL( "out of space in qstring(%s)", s);
 	for (bp = buf; (c = *s) != delim; s++) {
 		if (c == '\n')
-			SYNTAX( "newline in string %.20s...", os );
+			SYNTAX( "newline in string %.20s...", is );
 		else if (c != '\\')
 			*bp++ = c;
 		else {	/* \something */