ota: Import One True Awk from 20240122 (6a07a6d3bb63)

Jan 22, 2024:
        Restore the ability to compile with g++. Thanks to
        Arnold Robbins.

Dec 24, 2023:
        Fix a dereference-after-free problem in matchop when the first
        argument is a function call. Thanks to Oguz Ismail Uysal.
        Fix inconsistent handling of --csv and FS set in the
        command line. Thanks to Wilbert van der Poel.
        Casting changes to int for is* functions.

Nov 27, 2023:
        Fix exit status of system on macOS. Update to REGRESS.
        Thanks to Arnold Robbins.
        Fix inconsistent handling of -F and --csv, and loss of csv
        mode when FS is set.

Sponsored by:		Netflix
This commit is contained in:
Warner Losh 2024-02-28 08:05:17 -07:00
parent 18df98168f
commit e8a605e129
10 changed files with 92 additions and 59 deletions

48
FIXES
View file

@ -25,10 +25,27 @@ THIS SOFTWARE.
This file lists all bug fixes, changes, etc., made since the
second edition of the AWK book was published in September 2023.
Jan 22, 2024:
Restore the ability to compile with g++. Thanks to
Arnold Robbins.
Dec 24, 2023:
Fix a dereference-after-free problem in matchop when the first
argument is a function call. Thanks to Oguz Ismail Uysal.
Fix inconsistent handling of --csv and FS set in the
command line. Thanks to Wilbert van der Poel.
Casting changes to int for is* functions.
Nov 27, 2023:
Fix exit status of system on macOS. Update to REGRESS.
Thanks to Arnold Robbins.
Fix inconsistent handling of -F and --csv, and loss of csv
mode when FS is set.
Nov 24, 2023:
Fix issue #199: gototab improvements to dynamically resize the
table, qsort and bsearch to improve the lookup speed as the
table gets larger for multibyte input. thanks to Arnold Robbins.
table gets larger for multibyte input. Thanks to Arnold Robbins.
Nov 23, 2023:
Fix Issue #169, related to escape sequences in strings.
@ -37,29 +54,29 @@ Nov 23, 2023:
by Miguel Pineiro Jr.
Nov 20, 2023:
rewrite of fnematch to fix a number of issues, including
Rewrite of fnematch to fix a number of issues, including
extraneous output, out-of-bounds access, number of bytes
to push back after a failed match etc.
thanks to Miguel Pineiro Jr.
Thanks to Miguel Pineiro Jr.
Nov 15, 2023:
Man page edit, regression test fixes. thanks to Arnold Robbins
consolidation of sub and gsub into dosub, removing duplicate
code. thanks to Miguel Pineiro Jr.
Man page edit, regression test fixes. Thanks to Arnold Robbins.
Consolidation of sub and gsub into dosub, removing duplicate
code. Thanks to Miguel Pineiro Jr.
gcc replaced with cc everywhere.
Oct 30, 2023:
multiple fixes and a minor code cleanup.
disabled utf-8 for non-multibyte locales, such as C or POSIX.
fixed a bad char * cast that causes incorrect results on big-endian
systems. also fixed an out-of-bounds read for empty CCL.
fixed a buffer overflow in substr with utf-8 strings.
many thanks to Todd C Miller.
Multiple fixes and a minor code cleanup.
Disabled utf-8 for non-multibyte locales, such as C or POSIX.
Fixed a bad char * cast that causes incorrect results on big-endian
systems. Also fixed an out-of-bounds read for empty CCL.
Fixed a buffer overflow in substr with utf-8 strings.
Many thanks to Todd C Miller.
Sep 24, 2023:
fnematch and getrune have been overhauled to solve issues around
unicode FS and RS. also fixed gsub null match issue with unicode.
big thanks to Arnold Robbins.
unicode FS and RS. Also fixed gsub null match issue with unicode.
Big thanks to Arnold Robbins.
Sep 12, 2023:
Fixed a length error in u8_byte2char that set RSTART to
@ -84,9 +101,8 @@ Sep 12, 2023:
of a string of 3 emojis is 3, not 12 as it would be if bytes
were counted.
Regular expressions are processes as UTF-8.
Regular expressions are processed as UTF-8.
Unicode literals can be written as \u followed by one
to eight hexadecimal digits. These may appear in strings and
regular expressions.

View file

@ -27,6 +27,7 @@ Regular expressions may include UTF-8 code points, including `\u`.
The option `--csv` turns on CSV processing of input:
fields are separated by commas, fields may be quoted with
double-quote (`"`) characters, quoted fields may contain embedded newlines.
Double-quotes in fields have to be doubled and enclosed in quoted fields.
In CSV mode, `FS` is ignored.
If no explicit separator argument is provided,
@ -117,6 +118,8 @@ move this to some place like `/usr/bin/awk`.
If your system does not have `yacc` or `bison` (the GNU
equivalent), you need to install one of them first.
The default in the `makefile` is `bison`; you will have
to edit the `makefile` to use `yacc`.
NOTE: This version uses ISO/IEC C99, as you should also. We have
compiled this without any changes using `gcc -Wall` and/or local C
@ -143,4 +146,4 @@ is not at the top of our priority list.
#### Last Updated
Mon 16 Oct 2023 11:23:08 IDT
Mon 05 Feb 2024 08:46:55 IST

38
b.c
View file

@ -116,7 +116,7 @@ static int entry_cmp(const void *l, const void *r);
static int get_gototab(fa*, int, int);
static int set_gototab(fa*, int, int, int);
static void clear_gototab(fa*, int);
extern int u8_rune(int *, const uschar *);
extern int u8_rune(int *, const char *);
static int *
intalloc(size_t n, const char *f)
@ -346,7 +346,7 @@ int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return
int i;
for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
if (isdigit(*p))
if (isdigit((int) *p))
n = 16 * n + *p - '0';
else if (*p >= 'a' && *p <= 'f')
n = 16 * n + *p - 'a' + 10;
@ -416,7 +416,7 @@ int *cclenter(const char *argp) /* add a character class */
FATAL("out of space for character class [%.10s...] 1", p);
bp = buf;
for (i = 0; *p != 0; ) {
n = u8_rune(&c, p);
n = u8_rune(&c, (const char *) p);
p += n;
if (c == '\\') {
c = quoted(&p);
@ -424,7 +424,7 @@ int *cclenter(const char *argp) /* add a character class */
if (*p != 0) {
c = bp[-1];
/* c2 = *p++; */
n = u8_rune(&c2, p);
n = u8_rune(&c2, (const char *) p);
p += n;
if (c2 == '\\')
c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
@ -607,18 +607,18 @@ static void resize_gototab(fa *f, int state)
size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size
memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out
f->gototab[state].allocated = new_size; // update gotottab info
f->gototab[state].allocated = new_size; // update gototab info
f->gototab[state].entries = p;
}
static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */
static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
{
gtte key;
gtte *item;
key.ch = ch;
key.state = 0; /* irrelevant */
item = bsearch(& key, f->gototab[state].entries,
item = (gtte *) bsearch(& key, f->gototab[state].entries,
f->gototab[state].inuse, sizeof(gtte),
entry_cmp);
@ -638,7 +638,7 @@ static int entry_cmp(const void *l, const void *r)
return left->ch - right->ch;
}
static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */
static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
{
if (f->gototab[state].inuse == 0) {
f->gototab[state].entries[0].ch = ch;
@ -662,7 +662,7 @@ static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplem
key.ch = ch;
key.state = 0; /* irrelevant */
item = bsearch(& key, f->gototab[state].entries,
item = (gtte *) bsearch(& key, f->gototab[state].entries,
f->gototab[state].inuse, sizeof(gtte),
entry_cmp);
@ -710,7 +710,7 @@ int match(fa *f, const char *p0) /* shortest match ? */
return(1);
do {
/* assert(*p < NCHARS); */
n = u8_rune(&rune, p);
n = u8_rune(&rune, (const char *) p);
if ((ns = get_gototab(f, s, rune)) != 0)
s = ns;
else
@ -743,7 +743,7 @@ int pmatch(fa *f, const char *p0) /* longest match, for sub */
if (f->out[s]) /* final state */
patlen = q-p;
/* assert(*q < NCHARS); */
n = u8_rune(&rune, q);
n = u8_rune(&rune, (const char *) q);
if ((ns = get_gototab(f, s, rune)) != 0)
s = ns;
else
@ -774,7 +774,7 @@ int pmatch(fa *f, const char *p0) /* longest match, for sub */
s = 2;
if (*p == 0)
break;
n = u8_rune(&rune, p);
n = u8_rune(&rune, (const char *) p);
p += n;
} while (1); /* was *p++ */
return (0);
@ -799,7 +799,7 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */
if (f->out[s]) /* final state */
patlen = q-p;
/* assert(*q < NCHARS); */
n = u8_rune(&rune, q);
n = u8_rune(&rune, (const char *) q);
if ((ns = get_gototab(f, s, rune)) != 0)
s = ns;
else
@ -887,7 +887,7 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
}
}
j += u8_rune(&c, (uschar *)j);
j += u8_rune(&c, j);
if ((ns = get_gototab(pfa, s, c)) != 0)
s = ns;
@ -907,7 +907,7 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
break; /* best match found */
/* no match at origin i, next i and start over */
i += u8_rune(&c, (uschar *)i);
i += u8_rune(&c, i);
if (c == 0)
break; /* no match */
j = i;
@ -1229,8 +1229,6 @@ static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
return 0;
}
extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */
int relex(void) /* lexical analyzer for reparse */
{
int c, n;
@ -1248,7 +1246,7 @@ int relex(void) /* lexical analyzer for reparse */
rescan:
starttok = prestr;
if ((n = u8_rune(&rlxval, prestr)) > 1) {
if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
prestr += n;
starttok = prestr;
return CHAR;
@ -1295,7 +1293,7 @@ int relex(void) /* lexical analyzer for reparse */
if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
FATAL("out of space for reg expr %.10s...", lastre);
for (; ; ) {
if ((n = u8_rune(&rlxval, prestr)) > 1) {
if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
for (i = 0; i < n; i++)
*bp++ = *prestr++;
continue;
@ -1389,7 +1387,7 @@ int relex(void) /* lexical analyzer for reparse */
}
break;
case '{':
if (isdigit(*(prestr))) {
if (isdigit((int) *(prestr))) {
num = 0; /* Process as a repetition */
n = -1; m = -1;
commafound = false;

View file

@ -11,6 +11,7 @@ do
echo === $i
OUT=${i%.awk}.OUT
OK=${i%.awk}.ok
OK2=${i%.awk}.ok2
IN=${i%.awk}.in
input=
if [ -f $IN ]
@ -20,9 +21,12 @@ do
../a.out -f $i $input > $OUT 2>&1
if cmp -s $OK $OUT
then
rm -f $OUT
elif [ -f $OK2 ] && cmp -s $OK2 $OUT
then
rm -f $OUT
else
echo ++++ $i failed!
echo '++++ $i failed!'
fi
done

12
lib.c
View file

@ -399,7 +399,7 @@ void fldbld(void) /* create fields from current record */
i = 0; /* number of fields accumulated here */
if (inputFS == NULL) /* make sure we have a copy of FS */
savefs();
if (strlen(inputFS) > 1) { /* it's a regular expression */
if (!CSV && strlen(inputFS) > 1) { /* it's a regular expression */
i = refldbld(r, inputFS);
} else if (!CSV && (sep = *inputFS) == ' ') { /* default whitespace */
for (i = 0; ; ) {
@ -845,10 +845,10 @@ int isclvar(const char *s) /* is s of form var=something ? */
{
const char *os = s;
if (!isalpha((uschar) *s) && *s != '_')
if (!isalpha((int) *s) && *s != '_')
return 0;
for ( ; *s; s++)
if (!(isalnum((uschar) *s) || *s == '_'))
if (!(isalnum((int) *s) || *s == '_'))
break;
return *s == '=' && s > os;
}
@ -883,7 +883,7 @@ bool is_valid_number(const char *s, bool trailing_stuff_ok,
if (no_trailing)
*no_trailing = false;
while (isspace(*s))
while (isspace((int) *s))
s++;
/* no hex floating point, sorry */
@ -895,7 +895,7 @@ bool is_valid_number(const char *s, bool trailing_stuff_ok,
is_nan = (strncasecmp(s+1, "nan", 3) == 0);
is_inf = (strncasecmp(s+1, "inf", 3) == 0);
if ((is_nan || is_inf)
&& (isspace(s[4]) || s[4] == '\0'))
&& (isspace((int) s[4]) || s[4] == '\0'))
goto convert;
else if (! isdigit(s[1]) && s[1] != '.')
return false;
@ -918,7 +918,7 @@ bool is_valid_number(const char *s, bool trailing_stuff_ok,
/*
* check for trailing stuff
*/
while (isspace(*ep))
while (isspace((int) *ep))
ep++;
if (no_trailing != NULL)

6
main.c
View file

@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
****************************************************************/
const char *version = "version 20231124";
const char *version = "version 20240122";
#define DEBUG
#include <stdio.h>
@ -199,6 +199,10 @@ int main(int argc, char *argv[])
argc--;
argv++;
}
if (CSV && (fs != NULL || lookup("FS", symtab) != NULL))
WARNING("danger: don't set FS when --csv is in effect");
/* argv[1] is now the first argument */
if (npfile == 0) { /* no -f; first argument is program */
if (argc <= 1) {

31
run.c
View file

@ -795,7 +795,7 @@ int runetochar(char *str, int c)
Cell *matchop(Node **a, int n) /* ~ and match() */
{
Cell *x, *y;
Cell *x, *y, *z;
char *s, *t;
int i;
int cstart, cpatlen, len;
@ -817,7 +817,7 @@ Cell *matchop(Node **a, int n) /* ~ and match() */
i = (*mf)(pfa, s);
tempfree(y);
}
tempfree(x);
z = x;
if (n == MATCHFCN) {
int start = patbeg - s + 1; /* origin 1 */
if (patlen < 0) {
@ -839,11 +839,13 @@ Cell *matchop(Node **a, int n) /* ~ and match() */
x = gettemp();
x->tval = NUM;
x->fval = start;
return x;
} else if ((n == MATCH && i == 1) || (n == NOTMATCH && i == 0))
return(True);
x = True;
else
return(False);
x = False;
tempfree(z);
return x;
}
@ -1298,7 +1300,8 @@ int format(char **pbuf, int *pbufsize, const char *s, Node *a) /* printf-like co
if (bs == NULL) { // invalid character
// use unicode invalid character, 0xFFFD
bs = "\357\277\275";
static char invalid_char[] = "\357\277\275";
bs = invalid_char;
count = 3;
}
t = bs;
@ -2067,6 +2070,7 @@ Cell *bltin(Node **a, int n) /* builtin functions. a[0] is type, a[1] is arg lis
int status = 0;
time_t tv;
struct tm *tm;
int estatus = 0;
t = ptoi(a[0]);
x = execute(a[1]);
@ -2167,20 +2171,21 @@ Cell *bltin(Node **a, int n) /* builtin functions. a[0] is type, a[1] is arg lis
break;
case FSYSTEM:
fflush(stdout); /* in case something is buffered already */
status = system(getsval(x));
u = status;
estatus = status = system(getsval(x));
if (status != -1) {
if (WIFEXITED(status)) {
u = WEXITSTATUS(status);
estatus = WEXITSTATUS(status);
} else if (WIFSIGNALED(status)) {
u = WTERMSIG(status) + 256;
estatus = WTERMSIG(status) + 256;
#ifdef WCOREDUMP
if (WCOREDUMP(status))
u += 256;
estatus += 256;
#endif
} else /* something else?!? */
u = 0;
estatus = 0;
}
/* else estatus was set to -1 */
u = estatus;
break;
case FRAND:
/* random() returns numbers in [0..2^31-1]
@ -2539,7 +2544,7 @@ Cell *dosub(Node **a, int subop) /* sub and gsub */
start = getsval(x);
while (pmatch(pfa, start)) {
if (buf == NULL) {
if ((pb = buf = malloc(bufsz)) == NULL)
if ((pb = buf = (char *) malloc(bufsz)) == NULL)
FATAL("out of memory in dosub");
tempstat = pfa->initstat;
pfa->initstat = 2;

View file

@ -17,7 +17,7 @@ $1 ~ /try/ { # new test
sub(/try /, "")
prog = $0
printf("%3d %s\n", nt, prog)
prog = sprintf("%s -F\"\\t\" '"'"'%s'"'"'", awk, prog)
prog = sprintf("%s '"'"'%s'"'"'", awk, prog)
# print "prog is", prog
nt2 = 0
while (getline > 0) {

Can't render this file because it contains an unexpected character in line 9 and column 7.

View file

@ -84,3 +84,5 @@ grep "out of range field" foo >/dev/null || echo 1>&2 "BAD: T.overflow \$400000"
rm -rf /tmp/awktestfoo*
$awk 'BEGIN { for (i=1; i <= 1000; i++) print i >("/tmp/awktestfoo" i) }'
ls /tmp/awktestfoo* | grep '1000' >/dev/null || echo 1>&2 "BAD: T.overflow openfiles"
rm -rf /tmp/awktestfoo*
exit 0

View file

@ -220,5 +220,6 @@ $awk 'BEGIN {
echo 'cat dog' > $TEMP2
diff $TEMP1 $TEMP2 || fail 'BAD: T.split(a, b, "[\r\n]+")'
rm -rf $WORKDIR
exit $RESULT