mirror of
https://github.com/freebsd/freebsd-src
synced 2024-07-22 10:48:02 +00:00
Merge one true awk from 2024-01-22 for the Awk Second Edition support
This brings in Unicode support, CSV support and a number of bug fixes. They are described in _The AWK Programming Language_, Second Edition, by Al Aho, Brian Kernighan, and Peter Weinberger (Addison-Wesley, 2024, ISBN-13 978-0138269722, ISBN-10 0138269726). Sponsored by: Netflix
This commit is contained in:
commit
f32a6403d3
File diff suppressed because it is too large
Load diff
1429
contrib/one-true-awk/FIXES.1e
Normal file
1429
contrib/one-true-awk/FIXES.1e
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1,8 +1,37 @@
|
|||
# The One True Awk
|
||||
|
||||
This is the version of `awk` described in _The AWK Programming Language_,
|
||||
by Al Aho, Brian Kernighan, and Peter Weinberger
|
||||
(Addison-Wesley, 1988, ISBN 0-201-07981-X).
|
||||
Second Edition, by Al Aho, Brian Kernighan, and Peter Weinberger
|
||||
(Addison-Wesley, 2024, ISBN-13 978-0138269722, ISBN-10 0138269726).
|
||||
|
||||
## What's New? ##
|
||||
|
||||
This version of Awk handles UTF-8 and comma-separated values (CSV) input.
|
||||
|
||||
### Strings ###
|
||||
|
||||
Functions that process strings now count Unicode code points, not bytes;
|
||||
this affects `length`, `substr`, `index`, `match`, `split`,
|
||||
`sub`, `gsub`, and others. Note that code
|
||||
points are not necessarily characters.
|
||||
|
||||
UTF-8 sequences may appear in literal strings and regular expressions.
|
||||
Arbitrary characters may be included with `\u` followed by 1 to 8 hexadecimal digits.
|
||||
|
||||
### Regular expressions ###
|
||||
|
||||
Regular expressions may include UTF-8 code points, including `\u`.
|
||||
|
||||
### CSV ###
|
||||
|
||||
The option `--csv` turns on CSV processing of input:
|
||||
fields are separated by commas, fields may be quoted with
|
||||
double-quote (`"`) characters, quoted fields may contain embedded newlines.
|
||||
Double-quotes in fields have to be doubled and enclosed in quoted fields.
|
||||
In CSV mode, `FS` is ignored.
|
||||
|
||||
If no explicit separator argument is provided,
|
||||
field-splitting in `split` is determined by CSV mode.
|
||||
|
||||
## Copyright
|
||||
|
||||
|
@ -35,7 +64,7 @@ in `FIXES`. If you distribute this code further, please please please
|
|||
distribute `FIXES` with it.
|
||||
|
||||
If you find errors, please report them
|
||||
to bwk@cs.princeton.edu.
|
||||
to the current maintainer, ozan.yigit@gmail.com.
|
||||
Please _also_ open an issue in the GitHub issue tracker, to make
|
||||
it easy to track issues.
|
||||
Thanks.
|
||||
|
@ -67,30 +96,32 @@ The program itself is created by
|
|||
|
||||
which should produce a sequence of messages roughly like this:
|
||||
|
||||
yacc -d awkgram.y
|
||||
conflicts: 43 shift/reduce, 85 reduce/reduce
|
||||
mv y.tab.c ytab.c
|
||||
mv y.tab.h ytab.h
|
||||
cc -c ytab.c
|
||||
cc -c b.c
|
||||
cc -c main.c
|
||||
cc -c parse.c
|
||||
cc maketab.c -o maketab
|
||||
./maketab >proctab.c
|
||||
cc -c proctab.c
|
||||
cc -c tran.c
|
||||
cc -c lib.c
|
||||
cc -c run.c
|
||||
cc -c lex.c
|
||||
cc ytab.o b.o main.o parse.o proctab.o tran.o lib.o run.o lex.o -lm
|
||||
bison -d awkgram.y
|
||||
awkgram.y: warning: 44 shift/reduce conflicts [-Wconflicts-sr]
|
||||
awkgram.y: warning: 85 reduce/reduce conflicts [-Wconflicts-rr]
|
||||
awkgram.y: note: rerun with option '-Wcounterexamples' to generate conflict counterexamples
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o awkgram.tab.o awkgram.tab.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o b.o b.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o main.o main.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o parse.o parse.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 maketab.c -o maketab
|
||||
./maketab awkgram.tab.h >proctab.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o proctab.o proctab.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o tran.o tran.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o lib.o lib.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o run.o run.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 -c -o lex.o lex.c
|
||||
gcc -g -Wall -pedantic -Wcast-qual -O2 awkgram.tab.o b.o main.o parse.o proctab.o tran.o lib.o run.o lex.o -lm
|
||||
|
||||
This produces an executable `a.out`; you will eventually want to
|
||||
move this to some place like `/usr/bin/awk`.
|
||||
|
||||
If your system does not have `yacc` or `bison` (the GNU
|
||||
equivalent), you need to install one of them first.
|
||||
The default in the `makefile` is `bison`; you will have
|
||||
to edit the `makefile` to use `yacc`.
|
||||
|
||||
NOTE: This version uses ANSI C (C 99), as you should also. We have
|
||||
NOTE: This version uses ISO/IEC C99, as you should also. We have
|
||||
compiled this without any changes using `gcc -Wall` and/or local C
|
||||
compilers on a variety of systems, but new systems or compilers
|
||||
may raise some new complaint; reports of difficulties are
|
||||
|
@ -102,14 +133,9 @@ the standard developer tools.
|
|||
You can also use `make CC=g++` to build with the GNU C++ compiler,
|
||||
should you choose to do so.
|
||||
|
||||
The version of `malloc` that comes with some systems is sometimes
|
||||
astonishingly slow. If `awk` seems slow, you might try fixing that.
|
||||
More generally, turning on optimization can significantly improve
|
||||
`awk`'s speed, perhaps by 1/3 for highest levels.
|
||||
|
||||
## A Note About Releases
|
||||
|
||||
We don't do releases.
|
||||
We don't usually do releases.
|
||||
|
||||
## A Note About Maintenance
|
||||
|
||||
|
@ -120,4 +146,4 @@ is not at the top of our priority list.
|
|||
|
||||
#### Last Updated
|
||||
|
||||
Sat Jul 25 14:00:07 EDT 2021
|
||||
Mon 05 Feb 2024 08:46:55 IST
|
||||
|
|
|
@ -20,6 +20,8 @@ awk \- pattern-directed scanning and processing language
|
|||
[
|
||||
.BI \-F
|
||||
.I fs
|
||||
|
|
||||
.B \-\^\-csv
|
||||
]
|
||||
[
|
||||
.BI \-v
|
||||
|
@ -76,6 +78,12 @@ The
|
|||
.I fs
|
||||
option defines the input field separator to be the regular expression
|
||||
.IR fs .
|
||||
The
|
||||
.B \-\^\-csv
|
||||
option causes
|
||||
.I awk
|
||||
to process records using (more or less) standard comma-separated values
|
||||
(CSV) format.
|
||||
.PP
|
||||
An input line is normally made up of fields separated by white space,
|
||||
or by the regular expression
|
||||
|
@ -202,9 +210,9 @@ and
|
|||
.B sqrt
|
||||
are built in.
|
||||
Other built-in functions:
|
||||
.TF length
|
||||
.TF "\fBlength(\fR[\fIv\^\fR]\fB)\fR"
|
||||
.TP
|
||||
.B length
|
||||
\fBlength(\fR[\fIv\^\fR]\fB)\fR
|
||||
the length of its argument
|
||||
taken as a string,
|
||||
number of elements in an array for an array argument,
|
||||
|
@ -212,15 +220,15 @@ or length of
|
|||
.B $0
|
||||
if no argument.
|
||||
.TP
|
||||
.B rand
|
||||
.B rand()
|
||||
random number on [0,1).
|
||||
.TP
|
||||
.B srand
|
||||
\fBsrand(\fR[\fIs\^\fR]\fB)\fR
|
||||
sets seed for
|
||||
.B rand
|
||||
and returns the previous seed.
|
||||
.TP
|
||||
.B int
|
||||
.BI int( x\^ )
|
||||
truncates to an integer value.
|
||||
.TP
|
||||
\fBsubstr(\fIs\fB, \fIm\fR [\fB, \fIn\^\fR]\fB)\fR
|
||||
|
@ -441,7 +449,7 @@ in a pattern.
|
|||
A pattern may consist of two patterns separated by a comma;
|
||||
in this case, the action is performed for all lines
|
||||
from an occurrence of the first pattern
|
||||
though an occurrence of the second.
|
||||
through an occurrence of the second, inclusive.
|
||||
.PP
|
||||
A relational expression is one of the following:
|
||||
.IP
|
||||
|
@ -451,7 +459,7 @@ A relational expression is one of the following:
|
|||
.br
|
||||
.IB expression " in " array-name
|
||||
.br
|
||||
.BI ( expr , expr,... ") in " array-name
|
||||
.BI ( expr ,\| expr ,\| ... ") in " array-name
|
||||
.PP
|
||||
where a
|
||||
.I relop
|
||||
|
@ -551,7 +559,7 @@ separates multiple subscripts (default 034).
|
|||
Functions may be defined (at the position of a pattern-action statement) thus:
|
||||
.IP
|
||||
.B
|
||||
function foo(a, b, c) { ...; return x }
|
||||
function foo(a, b, c) { ... }
|
||||
.PP
|
||||
Parameters are passed by value if scalar and by reference if array name;
|
||||
functions may be called recursively.
|
||||
|
@ -617,8 +625,8 @@ BEGIN { # Simulate echo(1)
|
|||
.IR sed (1)
|
||||
.br
|
||||
A. V. Aho, B. W. Kernighan, P. J. Weinberger,
|
||||
.IR "The AWK Programming Language" ,
|
||||
Addison-Wesley, 1988. ISBN 0-201-07981-X.
|
||||
.IR "The AWK Programming Language, Second Edition" ,
|
||||
Addison-Wesley, 2024. ISBN 978-0-13-826972-2, 0-13-826972-6.
|
||||
.SH BUGS
|
||||
There are no explicit conversions between numbers and strings.
|
||||
To force an expression to be treated as a number add 0 to it;
|
||||
|
@ -628,7 +636,11 @@ to force it to be treated as a string concatenate
|
|||
The scope rules for variables in functions are a botch;
|
||||
the syntax is worse.
|
||||
.PP
|
||||
Only eight-bit characters sets are handled correctly.
|
||||
Input is expected to be UTF-8 encoded. Other multibyte
|
||||
character sets are not handled.
|
||||
However, in eight-bit locales,
|
||||
.I awk
|
||||
treats each input byte as a separate character.
|
||||
.SH UNUSUAL FLOATING-POINT VALUES
|
||||
.I Awk
|
||||
was designed before IEEE 754 arithmetic defined Not-A-Number (NaN)
|
||||
|
|
|
@ -37,7 +37,7 @@ typedef double Awkfloat;
|
|||
|
||||
typedef unsigned char uschar;
|
||||
|
||||
#define xfree(a) { if ((a) != NULL) { free((void *)(intptr_t)(a)); (a) = NULL; } }
|
||||
#define xfree(a) { free((void *)(intptr_t)(a)); (a) = NULL; }
|
||||
/*
|
||||
* We sometimes cheat writing read-only pointers to NUL-terminate them
|
||||
* and then put back the original value
|
||||
|
@ -64,6 +64,8 @@ extern bool safe; /* false => unsafe, true => safe */
|
|||
#define RECSIZE (8 * 1024) /* sets limit on records, fields, etc., etc. */
|
||||
extern int recsize; /* size of current record, orig RECSIZE */
|
||||
|
||||
extern size_t awk_mb_cur_max; /* max size of a multi-byte character */
|
||||
|
||||
extern char EMPTY[]; /* this avoid -Wwritable-strings issues */
|
||||
extern char **FS;
|
||||
extern char **RS;
|
||||
|
@ -78,6 +80,8 @@ extern char **SUBSEP;
|
|||
extern Awkfloat *RSTART;
|
||||
extern Awkfloat *RLENGTH;
|
||||
|
||||
extern bool CSV; /* true for csv input */
|
||||
|
||||
extern char *record; /* points to $0 */
|
||||
extern int lineno; /* line number in awk program */
|
||||
extern int errorflag; /* 1 if error has occurred */
|
||||
|
@ -233,7 +237,8 @@ extern int pairstack[], paircnt;
|
|||
|
||||
/* structures used by regular expression matching machinery, mostly b.c: */
|
||||
|
||||
#define NCHARS (256+3) /* 256 handles 8-bit chars; 128 does 7-bit */
|
||||
#define NCHARS (1256+3) /* 256 handles 8-bit chars; 128 does 7-bit */
|
||||
/* BUG: some overflows (caught) if we use 256 */
|
||||
/* watch out in match(), etc. */
|
||||
#define HAT (NCHARS+2) /* matches ^ in regular expr */
|
||||
#define NSTATES 32
|
||||
|
@ -244,12 +249,24 @@ typedef struct rrow {
|
|||
int i;
|
||||
Node *np;
|
||||
uschar *up;
|
||||
int *rp; /* rune representation of char class */
|
||||
} lval; /* because Al stores a pointer in it! */
|
||||
int *lfollow;
|
||||
} rrow;
|
||||
|
||||
/* One transition of a DFA state: on input `ch`, go to `state`. */
typedef struct gtte { /* gototab entry */
|
||||
unsigned int ch; /* input character; the key entry_cmp orders entries by */
|
||||
unsigned int state; /* next state; get_gototab returns this, or 0 when no entry matches */
|
||||
} gtte;
|
||||
|
||||
/* Per-state transition table: a growable array of gtte searched with bsearch. */
typedef struct gtt { /* gototab */
|
||||
size_t allocated; /* capacity of entries[], in elements (starts at NCHARS, doubled by resize_gototab) */
|
||||
size_t inuse; /* number of live elements at the front of entries[] */
|
||||
gtte *entries; /* heap array of transitions; searched by ch, so must stay sorted */
|
||||
} gtt;
|
||||
|
||||
typedef struct fa {
|
||||
unsigned int **gototab;
|
||||
gtt *gototab;
|
||||
uschar *out;
|
||||
uschar *restr;
|
||||
int **posns;
|
||||
|
|
|
@ -204,11 +204,12 @@ ppattern:
|
|||
{ $$ = op2(BOR, notnull($1), notnull($3)); }
|
||||
| ppattern and ppattern %prec AND
|
||||
{ $$ = op2(AND, notnull($1), notnull($3)); }
|
||||
| ppattern MATCHOP reg_expr { $$ = op3($2, NIL, $1, (Node*)makedfa($3, 0)); }
|
||||
| ppattern MATCHOP reg_expr { $$ = op3($2, NIL, $1, (Node*)makedfa($3, 0)); free($3); }
|
||||
| ppattern MATCHOP ppattern
|
||||
{ if (constnode($3))
|
||||
{ if (constnode($3)) {
|
||||
$$ = op3($2, NIL, $1, (Node*)makedfa(strnode($3), 0));
|
||||
else
|
||||
free($3);
|
||||
} else
|
||||
$$ = op3($2, (Node *)1, $1, $3); }
|
||||
| ppattern IN varname { $$ = op2(INTEST, $1, makearr($3)); }
|
||||
| '(' plist ')' IN varname { $$ = op2(INTEST, $2, makearr($5)); }
|
||||
|
@ -231,11 +232,12 @@ pattern:
|
|||
| pattern LE pattern { $$ = op2($2, $1, $3); }
|
||||
| pattern LT pattern { $$ = op2($2, $1, $3); }
|
||||
| pattern NE pattern { $$ = op2($2, $1, $3); }
|
||||
| pattern MATCHOP reg_expr { $$ = op3($2, NIL, $1, (Node*)makedfa($3, 0)); }
|
||||
| pattern MATCHOP reg_expr { $$ = op3($2, NIL, $1, (Node*)makedfa($3, 0)); free($3); }
|
||||
| pattern MATCHOP pattern
|
||||
{ if (constnode($3))
|
||||
{ if (constnode($3)) {
|
||||
$$ = op3($2, NIL, $1, (Node*)makedfa(strnode($3), 0));
|
||||
else
|
||||
free($3);
|
||||
} else
|
||||
$$ = op3($2, (Node *)1, $1, $3); }
|
||||
| pattern IN varname { $$ = op2(INTEST, $1, makearr($3)); }
|
||||
| '(' plist ')' IN varname { $$ = op2(INTEST, $2, makearr($5)); }
|
||||
|
@ -280,7 +282,7 @@ rbrace:
|
|||
|
||||
re:
|
||||
reg_expr
|
||||
{ $$ = op3(MATCH, NIL, rectonode(), (Node*)makedfa($1, 0)); }
|
||||
{ $$ = op3(MATCH, NIL, rectonode(), (Node*)makedfa($1, 0)); free($1); }
|
||||
| NOT re { $$ = op1(NOT, notnull($2)); }
|
||||
;
|
||||
|
||||
|
@ -378,17 +380,19 @@ term:
|
|||
| GENSUB '(' reg_expr comma pattern comma pattern ')'
|
||||
{ $$ = op5(GENSUB, NIL, (Node*)makedfa($3, 1), $5, $7, rectonode()); }
|
||||
| GENSUB '(' pattern comma pattern comma pattern ')'
|
||||
{ if (constnode($3))
|
||||
{ if (constnode($3)) {
|
||||
$$ = op5(GENSUB, NIL, (Node *)makedfa(strnode($3), 1), $5, $7, rectonode());
|
||||
else
|
||||
free($3);
|
||||
} else
|
||||
$$ = op5(GENSUB, (Node *)1, $3, $5, $7, rectonode());
|
||||
}
|
||||
| GENSUB '(' reg_expr comma pattern comma pattern comma pattern ')'
|
||||
{ $$ = op5(GENSUB, NIL, (Node*)makedfa($3, 1), $5, $7, $9); }
|
||||
| GENSUB '(' pattern comma pattern comma pattern comma pattern ')'
|
||||
{ if (constnode($3))
|
||||
{ if (constnode($3)) {
|
||||
$$ = op5(GENSUB, NIL, (Node *)makedfa(strnode($3),1), $5,$7,$9);
|
||||
else
|
||||
free($3);
|
||||
} else
|
||||
$$ = op5(GENSUB, (Node *)1, $3, $5, $7, $9);
|
||||
}
|
||||
| GETLINE var LT term { $$ = op3(GETLINE, $2, itonp($3), $4); }
|
||||
|
@ -402,34 +406,37 @@ term:
|
|||
$$ = op2(INDEX, $3, (Node*)$5); }
|
||||
| '(' pattern ')' { $$ = $2; }
|
||||
| MATCHFCN '(' pattern comma reg_expr ')'
|
||||
{ $$ = op3(MATCHFCN, NIL, $3, (Node*)makedfa($5, 1)); }
|
||||
{ $$ = op3(MATCHFCN, NIL, $3, (Node*)makedfa($5, 1)); free($5); }
|
||||
| MATCHFCN '(' pattern comma pattern ')'
|
||||
{ if (constnode($5))
|
||||
{ if (constnode($5)) {
|
||||
$$ = op3(MATCHFCN, NIL, $3, (Node*)makedfa(strnode($5), 1));
|
||||
else
|
||||
free($5);
|
||||
} else
|
||||
$$ = op3(MATCHFCN, (Node *)1, $3, $5); }
|
||||
| NUMBER { $$ = celltonode($1, CCON); }
|
||||
| SPLIT '(' pattern comma varname comma pattern ')' /* string */
|
||||
{ $$ = op4(SPLIT, $3, makearr($5), $7, (Node*)STRING); }
|
||||
| SPLIT '(' pattern comma varname comma reg_expr ')' /* const /regexp/ */
|
||||
{ $$ = op4(SPLIT, $3, makearr($5), (Node*)makedfa($7, 1), (Node *)REGEXPR); }
|
||||
{ $$ = op4(SPLIT, $3, makearr($5), (Node*)makedfa($7, 1), (Node *)REGEXPR); free($7); }
|
||||
| SPLIT '(' pattern comma varname ')'
|
||||
{ $$ = op4(SPLIT, $3, makearr($5), NIL, (Node*)STRING); } /* default */
|
||||
| SPRINTF '(' patlist ')' { $$ = op1($1, $3); }
|
||||
| string { $$ = celltonode($1, CCON); }
|
||||
| subop '(' reg_expr comma pattern ')'
|
||||
{ $$ = op4($1, NIL, (Node*)makedfa($3, 1), $5, rectonode()); }
|
||||
{ $$ = op4($1, NIL, (Node*)makedfa($3, 1), $5, rectonode()); free($3); }
|
||||
| subop '(' pattern comma pattern ')'
|
||||
{ if (constnode($3))
|
||||
{ if (constnode($3)) {
|
||||
$$ = op4($1, NIL, (Node*)makedfa(strnode($3), 1), $5, rectonode());
|
||||
else
|
||||
free($3);
|
||||
} else
|
||||
$$ = op4($1, (Node *)1, $3, $5, rectonode()); }
|
||||
| subop '(' reg_expr comma pattern comma var ')'
|
||||
{ $$ = op4($1, NIL, (Node*)makedfa($3, 1), $5, $7); }
|
||||
{ $$ = op4($1, NIL, (Node*)makedfa($3, 1), $5, $7); free($3); }
|
||||
| subop '(' pattern comma pattern comma var ')'
|
||||
{ if (constnode($3))
|
||||
{ if (constnode($3)) {
|
||||
$$ = op4($1, NIL, (Node*)makedfa(strnode($3), 1), $5, $7);
|
||||
else
|
||||
free($3);
|
||||
} else
|
||||
$$ = op4($1, (Node *)1, $3, $5, $7); }
|
||||
| SUBSTR '(' pattern comma pattern comma pattern ')'
|
||||
{ $$ = op3(SUBSTR, $3, $5, $7); }
|
||||
|
|
|
@ -80,6 +80,44 @@ int patlen;
|
|||
fa *fatab[NFA];
|
||||
int nfatab = 0; /* entries in fatab */
|
||||
|
||||
extern int u8_nextlen(const char *s);
|
||||
|
||||
|
||||
/* utf-8 mechanism:
|
||||
|
||||
For most of Awk, utf-8 strings just "work", since they look like
|
||||
null-terminated sequences of 8-bit bytes.
|
||||
|
||||
Functions like length(), index(), and substr() have to operate
|
||||
in units of utf-8 characters. The u8_* functions in run.c
|
||||
handle this.
|
||||
|
||||
Regular expressions are more complicated, since the basic
|
||||
mechanism of the goto table used 8-bit byte indices into the
|
||||
gototab entries to compute the next state. Unicode is a lot
|
||||
bigger, so the gototab entries are now structs with a character
|
||||
and a next state. These are sorted by code point and binary
|
||||
searched.
|
||||
|
||||
Throughout the RE mechanism in b.c, utf-8 characters are
|
||||
converted to their utf-32 value. This mostly shows up in
|
||||
cclenter, which expands character class ranges like a-z and now
|
||||
alpha-omega. The size of a gototab array is still about 256.
|
||||
This should be dynamic, but for now things work ok for a single
|
||||
code page of Unicode, which is the most likely case.
|
||||
|
||||
The code changes are localized in run.c and b.c. I have added a
|
||||
handful of functions to somewhat better hide the implementation,
|
||||
but a lot more could be done.
|
||||
|
||||
*/
|
||||
|
||||
static int entry_cmp(const void *l, const void *r);
|
||||
static int get_gototab(fa*, int, int);
|
||||
static int set_gototab(fa*, int, int, int);
|
||||
static void clear_gototab(fa*, int);
|
||||
extern int u8_rune(int *, const char *);
|
||||
|
||||
static int *
|
||||
intalloc(size_t n, const char *f)
|
||||
{
|
||||
|
@ -105,7 +143,7 @@ resizesetvec(const char *f)
|
|||
static void
|
||||
resize_state(fa *f, int state)
|
||||
{
|
||||
unsigned int **p;
|
||||
gtt *p;
|
||||
uschar *p2;
|
||||
int **p3;
|
||||
int i, new_count;
|
||||
|
@ -115,7 +153,7 @@ resize_state(fa *f, int state)
|
|||
|
||||
new_count = state + 10; /* needs to be tuned */
|
||||
|
||||
p = (unsigned int **) realloc(f->gototab, new_count * sizeof(f->gototab[0]));
|
||||
p = (gtt *) realloc(f->gototab, new_count * sizeof(gtt));
|
||||
if (p == NULL)
|
||||
goto out;
|
||||
f->gototab = p;
|
||||
|
@ -131,10 +169,12 @@ resize_state(fa *f, int state)
|
|||
f->posns = p3;
|
||||
|
||||
for (i = f->state_count; i < new_count; ++i) {
|
||||
f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab));
|
||||
if (f->gototab[i] == NULL)
|
||||
f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
|
||||
if (f->gototab[i].entries == NULL)
|
||||
goto out;
|
||||
f->out[i] = 0;
|
||||
f->gototab[i].allocated = NCHARS;
|
||||
f->gototab[i].inuse = 0;
|
||||
f->out[i] = 0;
|
||||
f->posns[i] = NULL;
|
||||
}
|
||||
f->state_count = new_count;
|
||||
|
@ -230,8 +270,7 @@ int makeinit(fa *f, bool anchor)
|
|||
}
|
||||
if ((f->posns[2])[1] == f->accept)
|
||||
f->out[2] = 1;
|
||||
for (i = 0; i < NCHARS; i++)
|
||||
f->gototab[2][i] = 0;
|
||||
clear_gototab(f, 2);
|
||||
f->curstat = cgoto(f, 2, HAT);
|
||||
if (anchor) {
|
||||
*f->posns[2] = k-1; /* leave out position 0 */
|
||||
|
@ -300,14 +339,14 @@ void freetr(Node *p) /* free parse tree */
|
|||
/* in the parsing of regular expressions, metacharacters like . have */
|
||||
/* to be seen literally; \056 is not a metacharacter. */
|
||||
|
||||
int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */
|
||||
int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
|
||||
{ /* only pick up one 8-bit byte (2 chars) */
|
||||
const uschar *p;
|
||||
int n = 0;
|
||||
int i;
|
||||
|
||||
for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
|
||||
if (isdigit(*p))
|
||||
for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
|
||||
if (isdigit((int) *p))
|
||||
n = 16 * n + *p - '0';
|
||||
else if (*p >= 'a' && *p <= 'f')
|
||||
n = 16 * n + *p - 'a' + 10;
|
||||
|
@ -318,6 +357,8 @@ int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */
|
|||
return n;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
|
||||
|
||||
int quoted(const uschar **pp) /* pick up next thing after a \\ */
|
||||
|
@ -326,24 +367,28 @@ int quoted(const uschar **pp) /* pick up next thing after a \\ */
|
|||
const uschar *p = *pp;
|
||||
int c;
|
||||
|
||||
if ((c = *p++) == 't')
|
||||
/* BUG: should advance by utf-8 char even if makes no sense */
|
||||
|
||||
if ((c = *p++) == 't') {
|
||||
c = '\t';
|
||||
else if (c == 'n')
|
||||
} else if (c == 'n') {
|
||||
c = '\n';
|
||||
else if (c == 'f')
|
||||
} else if (c == 'f') {
|
||||
c = '\f';
|
||||
else if (c == 'r')
|
||||
} else if (c == 'r') {
|
||||
c = '\r';
|
||||
else if (c == 'b')
|
||||
} else if (c == 'b') {
|
||||
c = '\b';
|
||||
else if (c == 'v')
|
||||
} else if (c == 'v') {
|
||||
c = '\v';
|
||||
else if (c == 'a')
|
||||
} else if (c == 'a') {
|
||||
c = '\a';
|
||||
else if (c == '\\')
|
||||
} else if (c == '\\') {
|
||||
c = '\\';
|
||||
else if (c == 'x') { /* hexadecimal goo follows */
|
||||
c = hexstr(&p); /* this adds a null if number is invalid */
|
||||
} else if (c == 'x') { /* 2 hex digits follow */
|
||||
c = hexstr(&p, 2); /* this adds a null if number is invalid */
|
||||
} else if (c == 'u') { /* unicode char number up to 8 hex digits */
|
||||
c = hexstr(&p, 8);
|
||||
} else if (isoctdigit(c)) { /* \d \dd \ddd */
|
||||
int n = c - '0';
|
||||
if (isoctdigit(*p)) {
|
||||
|
@ -358,50 +403,67 @@ int quoted(const uschar **pp) /* pick up next thing after a \\ */
|
|||
return c;
|
||||
}
|
||||
|
||||
char *cclenter(const char *argp) /* add a character class */
|
||||
int *cclenter(const char *argp) /* add a character class */
|
||||
{
|
||||
int i, c, c2;
|
||||
const uschar *op, *p = (const uschar *) argp;
|
||||
uschar *bp;
|
||||
static uschar *buf = NULL;
|
||||
int n;
|
||||
const uschar *p = (const uschar *) argp;
|
||||
int *bp, *retp;
|
||||
static int *buf = NULL;
|
||||
static int bufsz = 100;
|
||||
|
||||
op = p;
|
||||
if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
|
||||
if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
|
||||
FATAL("out of space for character class [%.10s...] 1", p);
|
||||
bp = buf;
|
||||
for (i = 0; (c = *p++) != 0; ) {
|
||||
for (i = 0; *p != 0; ) {
|
||||
n = u8_rune(&c, (const char *) p);
|
||||
p += n;
|
||||
if (c == '\\') {
|
||||
c = quoted(&p);
|
||||
} else if (c == '-' && i > 0 && bp[-1] != 0) {
|
||||
if (*p != 0) {
|
||||
c = bp[-1];
|
||||
c2 = *p++;
|
||||
/* c2 = *p++; */
|
||||
n = u8_rune(&c2, (const char *) p);
|
||||
p += n;
|
||||
if (c2 == '\\')
|
||||
c2 = quoted(&p);
|
||||
c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
|
||||
if (c > c2) { /* empty; ignore */
|
||||
bp--;
|
||||
i--;
|
||||
continue;
|
||||
}
|
||||
while (c < c2) {
|
||||
if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
|
||||
FATAL("out of space for character class [%.10s...] 2", p);
|
||||
if (i >= bufsz) {
|
||||
bufsz *= 2;
|
||||
buf = (int *) realloc(buf, bufsz * sizeof(int));
|
||||
if (buf == NULL)
|
||||
FATAL("out of space for character class [%.10s...] 2", p);
|
||||
bp = buf + i;
|
||||
}
|
||||
*bp++ = ++c;
|
||||
i++;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
|
||||
FATAL("out of space for character class [%.10s...] 3", p);
|
||||
if (i >= bufsz) {
|
||||
bufsz *= 2;
|
||||
buf = (int *) realloc(buf, bufsz * sizeof(int));
|
||||
if (buf == NULL)
|
||||
FATAL("out of space for character class [%.10s...] 2", p);
|
||||
bp = buf + i;
|
||||
}
|
||||
*bp++ = c;
|
||||
i++;
|
||||
}
|
||||
*bp = 0;
|
||||
DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
|
||||
xfree(op);
|
||||
return (char *) tostring((char *) buf);
|
||||
/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
|
||||
/* xfree(op); BUG: what are we freeing here? */
|
||||
retp = (int *) calloc(bp-buf+1, sizeof(int));
|
||||
for (i = 0; i < bp-buf+1; i++)
|
||||
retp[i] = buf[i];
|
||||
return retp;
|
||||
}
|
||||
|
||||
void overflo(const char *s)
|
||||
|
@ -468,7 +530,7 @@ int first(Node *p) /* collects initially active leaves of p into setvec */
|
|||
setvec[lp] = 1;
|
||||
setcnt++;
|
||||
}
|
||||
if (type(p) == CCL && (*(char *) right(p)) == '\0')
|
||||
if (type(p) == CCL && (*(int *) right(p)) == 0)
|
||||
return(0); /* empty CCL */
|
||||
return(1);
|
||||
case PLUS:
|
||||
|
@ -524,9 +586,9 @@ void follow(Node *v) /* collects leaves that can follow v into setvec */
|
|||
}
|
||||
}
|
||||
|
||||
int member(int c, const char *sarg) /* is c in s? */
|
||||
int member(int c, int *sarg) /* is c in s? */
|
||||
{
|
||||
const uschar *s = (const uschar *) sarg;
|
||||
int *s = (int *) sarg;
|
||||
|
||||
while (*s)
|
||||
if (c == *s++)
|
||||
|
@ -534,11 +596,113 @@ int member(int c, const char *sarg) /* is c in s? */
|
|||
return(0);
|
||||
}
|
||||
|
||||
/*
 * resize_gototab: double the capacity of the transition table that
 * belongs to `state` in automaton `f`.
 *
 * Existing entries are preserved by realloc; the freshly added upper
 * half is zero-filled.  The in-use count is not touched.  On allocation
 * failure overflo() is called with this function's name.
 */
static void resize_gototab(fa *f, int state)
{
	gtt *tab = &f->gototab[state];
	size_t old_count = tab->allocated;
	size_t new_count = old_count * 2;
	gtte *grown;

	grown = (gtte *) realloc(tab->entries, new_count * sizeof(gtte));
	if (grown == NULL)
		overflo(__func__);

	/* realloc does not initialize the added memory: zero the new upper half */
	memset(grown + old_count, 0, old_count * sizeof(gtte));

	/* publish the new array and its capacity */
	tab->entries = grown;
	tab->allocated = new_count;
}
|
||||
|
||||
/*
 * get_gototab: look up the transition out of `state` on input `ch`.
 *
 * Binary-searches the state's entries (ordered by entry_cmp) for `ch`.
 * Returns the recorded next state, or 0 when there is no entry for `ch`.
 */
static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
{
	const gtt *tab = &f->gototab[state];
	const gtte *hit;
	gtte probe;

	probe.ch = ch;
	probe.state = 0;	/* only ch takes part in the comparison */

	hit = (const gtte *) bsearch(&probe, tab->entries, tab->inuse,
	    sizeof(gtte), entry_cmp);

	return (hit != NULL) ? hit->state : 0;
}
|
||||
|
||||
static int entry_cmp(const void *l, const void *r)
|
||||
{
|
||||
const gtte *left, *right;
|
||||
|
||||
left = (const gtte *) l;
|
||||
right = (const gtte *) r;
|
||||
|
||||
return left->ch - right->ch;
|
||||
}
|
||||
|
||||
/*
 * set_gototab: record that `state` moves to state `val` on input `ch`.
 *
 * The entries of a state are kept sorted by ch so that lookups can use
 * bsearch.  If an entry for `ch` already exists its target state is
 * updated in place; otherwise a new entry is appended in the first free
 * slot, growing the table when needed, and the live range is re-sorted
 * only if the new key does not extend the current order.
 *
 * Returns val.
 */
static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
{
	gtt *tab = &f->gototab[state];
	unsigned int uch = (unsigned int) ch;	/* entries store ch as unsigned */
	int keeps_order = 1;			/* appending leaves the table sorted */

	if (tab->inuse > 0 && uch <= tab->entries[tab->inuse - 1].ch) {
		// not beyond the current maximum: maybe we have it, maybe we don't
		gtte key;
		gtte *item;

		key.ch = uch;
		key.state = 0;	/* irrelevant */
		item = (gtte *) bsearch(&key, tab->entries, tab->inuse,
		    sizeof(gtte), entry_cmp);

		if (item != NULL) {
			// we have it, update state and return
			item->state = val;
			return val;
		}
		// new key that belongs somewhere before the end
		keeps_order = 0;
	}

	/* make room; only entries is reallocated, so tab itself stays valid */
	if (tab->inuse + 1 >= tab->allocated)
		resize_gototab(f, state);

	/*
	 * Store into the first unused slot and only then count it, so the
	 * previous last entry is not overwritten and the new entry is
	 * inside the range handed to qsort.
	 */
	tab->entries[tab->inuse].ch = uch;
	tab->entries[tab->inuse].state = val;
	tab->inuse++;

	if (!keeps_order)
		qsort(tab->entries, tab->inuse, sizeof(gtte), entry_cmp);

	return val; /* not used anywhere at the moment */
}
|
||||
|
||||
/*
 * clear_gototab: forget every transition recorded for `state`.
 *
 * Zeroes the whole allocated entry array (not just the live part) and
 * resets the in-use count; the allocation itself is kept for reuse.
 */
static void clear_gototab(fa *f, int state)
{
	gtt *tab = &f->gototab[state];

	memset(tab->entries, 0, tab->allocated * sizeof(gtte));
	tab->inuse = 0;
}
|
||||
|
||||
int match(fa *f, const char *p0) /* shortest match ? */
|
||||
{
|
||||
int s, ns;
|
||||
int n;
|
||||
int rune;
|
||||
const uschar *p = (const uschar *) p0;
|
||||
|
||||
/* return pmatch(f, p0); does it matter whether longest or shortest? */
|
||||
|
||||
s = f->initstat;
|
||||
assert (s < f->state_count);
|
||||
|
||||
|
@ -546,19 +710,25 @@ int match(fa *f, const char *p0) /* shortest match ? */
|
|||
return(1);
|
||||
do {
|
||||
/* assert(*p < NCHARS); */
|
||||
if ((ns = f->gototab[s][*p]) != 0)
|
||||
n = u8_rune(&rune, (const char *) p);
|
||||
if ((ns = get_gototab(f, s, rune)) != 0)
|
||||
s = ns;
|
||||
else
|
||||
s = cgoto(f, s, *p);
|
||||
s = cgoto(f, s, rune);
|
||||
if (f->out[s])
|
||||
return(1);
|
||||
} while (*p++ != 0);
|
||||
if (*p == 0)
|
||||
break;
|
||||
p += n;
|
||||
} while (1); /* was *p++ != 0 */
|
||||
return(0);
|
||||
}
|
||||
|
||||
int pmatch(fa *f, const char *p0) /* longest match, for sub */
|
||||
{
|
||||
int s, ns;
|
||||
int n;
|
||||
int rune;
|
||||
const uschar *p = (const uschar *) p0;
|
||||
const uschar *q;
|
||||
|
||||
|
@ -573,10 +743,11 @@ int pmatch(fa *f, const char *p0) /* longest match, for sub */
|
|||
if (f->out[s]) /* final state */
|
||||
patlen = q-p;
|
||||
/* assert(*q < NCHARS); */
|
||||
if ((ns = f->gototab[s][*q]) != 0)
|
||||
n = u8_rune(&rune, (const char *) q);
|
||||
if ((ns = get_gototab(f, s, rune)) != 0)
|
||||
s = ns;
|
||||
else
|
||||
s = cgoto(f, s, *q);
|
||||
s = cgoto(f, s, rune);
|
||||
|
||||
assert(s < f->state_count);
|
||||
|
||||
|
@ -588,7 +759,11 @@ int pmatch(fa *f, const char *p0) /* longest match, for sub */
|
|||
else
|
||||
goto nextin; /* no match */
|
||||
}
|
||||
} while (*q++ != 0);
|
||||
if (*q == 0)
|
||||
break;
|
||||
q += n;
|
||||
} while (1);
|
||||
q++; /* was *q++ */
|
||||
if (f->out[s])
|
||||
patlen = q-p-1; /* don't count $ */
|
||||
if (patlen >= 0) {
|
||||
|
@ -597,13 +772,19 @@ int pmatch(fa *f, const char *p0) /* longest match, for sub */
|
|||
}
|
||||
nextin:
|
||||
s = 2;
|
||||
} while (*p++);
|
||||
if (*p == 0)
|
||||
break;
|
||||
n = u8_rune(&rune, (const char *) p);
|
||||
p += n;
|
||||
} while (1); /* was *p++ */
|
||||
return (0);
|
||||
}
|
||||
|
||||
int nematch(fa *f, const char *p0) /* non-empty match, for sub */
|
||||
{
|
||||
int s, ns;
|
||||
int n;
|
||||
int rune;
|
||||
const uschar *p = (const uschar *) p0;
|
||||
const uschar *q;
|
||||
|
||||
|
@ -618,10 +799,11 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */
|
|||
if (f->out[s]) /* final state */
|
||||
patlen = q-p;
|
||||
/* assert(*q < NCHARS); */
|
||||
if ((ns = f->gototab[s][*q]) != 0)
|
||||
n = u8_rune(&rune, (const char *) q);
|
||||
if ((ns = get_gototab(f, s, rune)) != 0)
|
||||
s = ns;
|
||||
else
|
||||
s = cgoto(f, s, *q);
|
||||
s = cgoto(f, s, rune);
|
||||
if (s == 1) { /* no transition */
|
||||
if (patlen > 0) {
|
||||
patbeg = (const char *) p;
|
||||
|
@ -629,7 +811,11 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */
|
|||
} else
|
||||
goto nnextin; /* no nonempty match */
|
||||
}
|
||||
} while (*q++ != 0);
|
||||
if (*q == 0)
|
||||
break;
|
||||
q += n;
|
||||
} while (1);
|
||||
q++;
|
||||
if (f->out[s])
|
||||
patlen = q-p-1; /* don't count $ */
|
||||
if (patlen > 0 ) {
|
||||
|
@ -644,6 +830,8 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */
|
|||
}
|
||||
|
||||
|
||||
#define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long
|
||||
|
||||
/*
|
||||
* NAME
|
||||
* fnematch
|
||||
|
@ -661,54 +849,76 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */
|
|||
|
||||
bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
|
||||
{
|
||||
char *buf = *pbuf;
|
||||
char *i, *j, *k, *buf = *pbuf;
|
||||
int bufsize = *pbufsize;
|
||||
int c, i, j, k, ns, s;
|
||||
int c, n, ns, s;
|
||||
|
||||
s = pfa->initstat;
|
||||
patlen = 0;
|
||||
|
||||
/*
|
||||
* All indices relative to buf.
|
||||
* i <= j <= k <= bufsize
|
||||
* buf <= i <= j <= k <= buf+bufsize
|
||||
*
|
||||
* i: origin of active substring
|
||||
* j: current character
|
||||
* k: destination of next getc()
|
||||
* k: destination of the next getc
|
||||
*/
|
||||
i = -1, k = 0;
|
||||
do {
|
||||
j = i++;
|
||||
do {
|
||||
if (++j == k) {
|
||||
if (k == bufsize)
|
||||
if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
|
||||
FATAL("stream '%.30s...' too long", buf);
|
||||
buf[k++] = (c = getc(f)) != EOF ? c : 0;
|
||||
}
|
||||
c = (uschar)buf[j];
|
||||
/* assert(c < NCHARS); */
|
||||
|
||||
if ((ns = pfa->gototab[s][c]) != 0)
|
||||
s = ns;
|
||||
else
|
||||
s = cgoto(pfa, s, c);
|
||||
i = j = k = buf;
|
||||
|
||||
if (pfa->out[s]) { /* final state */
|
||||
patlen = j - i + 1;
|
||||
if (c == 0) /* don't count $ */
|
||||
patlen--;
|
||||
do {
|
||||
/*
|
||||
* Call u8_rune with at least MAX_UTF_BYTES ahead in
|
||||
* the buffer until EOF interferes.
|
||||
*/
|
||||
if (k - j < MAX_UTF_BYTES) {
|
||||
if (k + MAX_UTF_BYTES > buf + bufsize) {
|
||||
adjbuf((char **) &buf, &bufsize,
|
||||
bufsize + MAX_UTF_BYTES,
|
||||
quantum, 0, "fnematch");
|
||||
}
|
||||
} while (buf[j] && s != 1);
|
||||
for (n = MAX_UTF_BYTES ; n > 0; n--) {
|
||||
*k++ = (c = getc(f)) != EOF ? c : 0;
|
||||
if (c == EOF) {
|
||||
if (ferror(f))
|
||||
FATAL("fnematch: getc error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
j += u8_rune(&c, j);
|
||||
|
||||
if ((ns = get_gototab(pfa, s, c)) != 0)
|
||||
s = ns;
|
||||
else
|
||||
s = cgoto(pfa, s, c);
|
||||
|
||||
if (pfa->out[s]) { /* final state */
|
||||
patbeg = i;
|
||||
patlen = j - i;
|
||||
if (c == 0) /* don't count $ */
|
||||
patlen--;
|
||||
}
|
||||
|
||||
if (c && s != 1)
|
||||
continue; /* origin i still viable, next j */
|
||||
if (patlen)
|
||||
break; /* best match found */
|
||||
|
||||
/* no match at origin i, next i and start over */
|
||||
i += u8_rune(&c, i);
|
||||
if (c == 0)
|
||||
break; /* no match */
|
||||
j = i;
|
||||
s = 2;
|
||||
} while (buf[i] && !patlen);
|
||||
} while (1);
|
||||
|
||||
/* adjbuf() may have relocated a resized buffer. Inform the world. */
|
||||
*pbuf = buf;
|
||||
*pbufsize = bufsize;
|
||||
|
||||
if (patlen) {
|
||||
patbeg = (char *) buf + i;
|
||||
/*
|
||||
* Under no circumstances is the last character fed to
|
||||
* the automaton part of the match. It is EOF's nullbyte,
|
||||
|
@ -721,10 +931,10 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
|
|||
* terminate the buffer.
|
||||
*/
|
||||
do
|
||||
if (buf[--k] && ungetc(buf[k], f) == EOF)
|
||||
FATAL("unable to ungetc '%c'", buf[k]);
|
||||
while (k > i + patlen);
|
||||
buf[k] = '\0';
|
||||
if (*--k && ungetc(*k, f) == EOF)
|
||||
FATAL("unable to ungetc '%c'", *k);
|
||||
while (k > patbeg + patlen);
|
||||
*k = '\0';
|
||||
return true;
|
||||
}
|
||||
else
|
||||
|
@ -797,7 +1007,7 @@ Node *primary(void)
|
|||
rtok = relex();
|
||||
if (rtok == ')') { /* special pleading for () */
|
||||
rtok = relex();
|
||||
return unary(op2(CCL, NIL, (Node *) tostring("")));
|
||||
return unary(op2(CCL, NIL, (Node *) cclenter("")));
|
||||
}
|
||||
np = regexp();
|
||||
if (rtok == ')') {
|
||||
|
@ -820,7 +1030,7 @@ Node *concat(Node *np)
|
|||
return (concat(op2(CAT, np, primary())));
|
||||
case EMPTYRE:
|
||||
rtok = relex();
|
||||
return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
|
||||
return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
|
||||
primary())));
|
||||
}
|
||||
return (np);
|
||||
|
@ -1036,6 +1246,12 @@ int relex(void) /* lexical analyzer for reparse */
|
|||
rescan:
|
||||
starttok = prestr;
|
||||
|
||||
if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
|
||||
prestr += n;
|
||||
starttok = prestr;
|
||||
return CHAR;
|
||||
}
|
||||
|
||||
switch (c = *prestr++) {
|
||||
case '|': return OR;
|
||||
case '*': return STAR;
|
||||
|
@ -1073,10 +1289,15 @@ int relex(void) /* lexical analyzer for reparse */
|
|||
}
|
||||
else
|
||||
cflag = 0;
|
||||
n = 2 * strlen((const char *) prestr)+1;
|
||||
n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */
|
||||
if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
|
||||
FATAL("out of space for reg expr %.10s...", lastre);
|
||||
for (; ; ) {
|
||||
if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
|
||||
for (i = 0; i < n; i++)
|
||||
*bp++ = *prestr++;
|
||||
continue;
|
||||
}
|
||||
if ((c = *prestr++) == '\\') {
|
||||
*bp++ = '\\';
|
||||
if ((c = *prestr++) == '\0')
|
||||
|
@ -1101,7 +1322,7 @@ int relex(void) /* lexical analyzer for reparse */
|
|||
* program to track each string's length.
|
||||
*/
|
||||
for (i = 1; i <= UCHAR_MAX; i++) {
|
||||
if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
|
||||
if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
|
||||
FATAL("out of space for reg expr %.10s...", lastre);
|
||||
if (cc->cc_func(i)) {
|
||||
/* escape backslash */
|
||||
|
@ -1166,7 +1387,7 @@ int relex(void) /* lexical analyzer for reparse */
|
|||
}
|
||||
break;
|
||||
case '{':
|
||||
if (isdigit(*(prestr))) {
|
||||
if (isdigit((int) *(prestr))) {
|
||||
num = 0; /* Process as a repetition */
|
||||
n = -1; m = -1;
|
||||
commafound = false;
|
||||
|
@ -1243,7 +1464,7 @@ int cgoto(fa *f, int s, int c)
|
|||
int *p, *q;
|
||||
int i, j, k;
|
||||
|
||||
assert(c == HAT || c < NCHARS);
|
||||
/* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */
|
||||
while (f->accept >= maxsetvec) { /* guessing here! */
|
||||
resizesetvec(__func__);
|
||||
}
|
||||
|
@ -1259,8 +1480,8 @@ int cgoto(fa *f, int s, int c)
|
|||
|| (k == DOT && c != 0 && c != HAT)
|
||||
|| (k == ALL && c != 0)
|
||||
|| (k == EMPTYRE && c != 0)
|
||||
|| (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
|
||||
|| (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
|
||||
|| (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
|
||||
|| (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
|
||||
q = f->re[p[i]].lfollow;
|
||||
for (j = 1; j <= *q; j++) {
|
||||
if (q[j] >= maxsetvec) {
|
||||
|
@ -1292,7 +1513,7 @@ int cgoto(fa *f, int s, int c)
|
|||
goto different;
|
||||
/* setvec is state i */
|
||||
if (c != HAT)
|
||||
f->gototab[s][c] = i;
|
||||
set_gototab(f, s, c, i);
|
||||
return i;
|
||||
different:;
|
||||
}
|
||||
|
@ -1300,14 +1521,13 @@ int cgoto(fa *f, int s, int c)
|
|||
/* add tmpset to current set of states */
|
||||
++(f->curstat);
|
||||
resize_state(f, f->curstat);
|
||||
for (i = 0; i < NCHARS; i++)
|
||||
f->gototab[f->curstat][i] = 0;
|
||||
clear_gototab(f, f->curstat);
|
||||
xfree(f->posns[f->curstat]);
|
||||
p = intalloc(setcnt + 1, __func__);
|
||||
|
||||
f->posns[f->curstat] = p;
|
||||
if (c != HAT)
|
||||
f->gototab[s][c] = f->curstat;
|
||||
set_gototab(f, s, c, f->curstat);
|
||||
for (i = 0; i <= setcnt; i++)
|
||||
p[i] = tmpset[i];
|
||||
if (setvec[f->accept])
|
||||
|
@ -1325,7 +1545,8 @@ void freefa(fa *f) /* free a finite automaton */
|
|||
if (f == NULL)
|
||||
return;
|
||||
for (i = 0; i < f->state_count; i++)
|
||||
xfree(f->gototab[i])
|
||||
xfree(f->gototab[i].entries);
|
||||
xfree(f->gototab);
|
||||
for (i = 0; i <= f->curstat; i++)
|
||||
xfree(f->posns[i]);
|
||||
for (i = 0; i <= f->accept; i++) {
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#! /bin/bash
|
||||
#! /bin/sh
|
||||
|
||||
if [ ! -f ../a.out ]
|
||||
then
|
||||
|
@ -11,6 +11,7 @@ do
|
|||
echo === $i
|
||||
OUT=${i%.awk}.OUT
|
||||
OK=${i%.awk}.ok
|
||||
OK2=${i%.awk}.ok2
|
||||
IN=${i%.awk}.in
|
||||
input=
|
||||
if [ -f $IN ]
|
||||
|
@ -20,9 +21,12 @@ do
|
|||
|
||||
../a.out -f $i $input > $OUT 2>&1
|
||||
if cmp -s $OK $OUT
|
||||
then
|
||||
rm -f $OUT
|
||||
elif [ -f $OK2 ] && cmp -s $OK2 $OUT
|
||||
then
|
||||
rm -f $OUT
|
||||
else
|
||||
echo ++++ $i failed!
|
||||
echo "++++ $i failed!"	# double quotes: $i must expand to the name of the failing test
|
||||
fi
|
||||
done
|
||||
|
|
5
contrib/one-true-awk/bugs-fixed/getline-corruption.awk
Normal file
5
contrib/one-true-awk/bugs-fixed/getline-corruption.awk
Normal file
|
@ -0,0 +1,5 @@
|
|||
# Regression test for "getline var".  The paired .in file holds the single
# line "a", so the second getline below has nothing left to read.  The
# expected output (.ok) is "a len=1": l must still hold the first line,
# unmodified, after that second call.
BEGIN {
|
||||
getline l
|
||||
getline l
|
||||
print (s=substr(l,1,10)) " len=" length(s)
|
||||
}
|
1
contrib/one-true-awk/bugs-fixed/getline-corruption.in
Normal file
1
contrib/one-true-awk/bugs-fixed/getline-corruption.in
Normal file
|
@ -0,0 +1 @@
|
|||
a
|
1
contrib/one-true-awk/bugs-fixed/getline-corruption.ok
Normal file
1
contrib/one-true-awk/bugs-fixed/getline-corruption.ok
Normal file
|
@ -0,0 +1 @@
|
|||
a len=1
|
11
contrib/one-true-awk/bugs-fixed/matchop-deref.awk
Normal file
11
contrib/one-true-awk/bugs-fixed/matchop-deref.awk
Normal file
|
@ -0,0 +1,11 @@
|
|||
# Regression test for match() when its first argument is the result of a
# function call or of substr() rather than a plain variable.  Both the
# string returned by foo() and the single input line are "aaaaaab", so each
# print must output 7 (.ok); the .bad file records the -1 printed by the
# broken version.
function foo() {
|
||||
return "aaaaaab"
|
||||
}
|
||||
|
||||
BEGIN {
|
||||
print match(foo(), "b")
|
||||
}
|
||||
|
||||
{
|
||||
print match(substr($0, 1), "b")
|
||||
}
|
2
contrib/one-true-awk/bugs-fixed/matchop-deref.bad
Normal file
2
contrib/one-true-awk/bugs-fixed/matchop-deref.bad
Normal file
|
@ -0,0 +1,2 @@
|
|||
-1
|
||||
-1
|
1
contrib/one-true-awk/bugs-fixed/matchop-deref.in
Normal file
1
contrib/one-true-awk/bugs-fixed/matchop-deref.in
Normal file
|
@ -0,0 +1 @@
|
|||
aaaaaab
|
2
contrib/one-true-awk/bugs-fixed/matchop-deref.ok
Normal file
2
contrib/one-true-awk/bugs-fixed/matchop-deref.ok
Normal file
|
@ -0,0 +1,2 @@
|
|||
7
|
||||
7
|
2
contrib/one-true-awk/bugs-fixed/missing-precision.ok
Normal file
2
contrib/one-true-awk/bugs-fixed/missing-precision.ok
Normal file
|
@ -0,0 +1,2 @@
|
|||
../a.out: not enough args in printf(%*s)
|
||||
source line number 1
|
2
contrib/one-true-awk/bugs-fixed/negative-nf.ok
Normal file
2
contrib/one-true-awk/bugs-fixed/negative-nf.ok
Normal file
|
@ -0,0 +1,2 @@
|
|||
../a.out: cannot set NF to a negative value
|
||||
source line number 1
|
4
contrib/one-true-awk/bugs-fixed/pfile-overflow.ok
Normal file
4
contrib/one-true-awk/bugs-fixed/pfile-overflow.ok
Normal file
|
@ -0,0 +1,4 @@
|
|||
../a.out: syntax error at source line 1 source file pfile-overflow.awk
|
||||
context is
|
||||
>>> <<<
|
||||
../a.out: bailing out at source line 1 source file pfile-overflow.awk
|
10
contrib/one-true-awk/bugs-fixed/rstart-rlength.awk
Normal file
10
contrib/one-true-awk/bugs-fixed/rstart-rlength.awk
Normal file
|
@ -0,0 +1,10 @@
|
|||
# Regression test: length(), RSTART and RLENGTH must be reported in
# characters, not bytes.  str is one character written as three octal-escaped
# bytes.  Expected output (.ok), one line per print:
#   1      length(str)
#   1 0    /^/  matches the empty string at the start
#   1 1    /.+/ matches the single character
#   2 0    /$/  matches the empty string just past the single character
BEGIN {
|
||||
str="\342\200\257"
|
||||
print length(str)
|
||||
match(str,/^/)
|
||||
print RSTART, RLENGTH
|
||||
match(str,/.+/)
|
||||
print RSTART, RLENGTH
|
||||
match(str,/$/)
|
||||
print RSTART, RLENGTH
|
||||
}
|
4
contrib/one-true-awk/bugs-fixed/rstart-rlength.ok
Normal file
4
contrib/one-true-awk/bugs-fixed/rstart-rlength.ok
Normal file
|
@ -0,0 +1,4 @@
|
|||
1
|
||||
1 0
|
||||
1 1
|
||||
2 0
|
19
contrib/one-true-awk/bugs-fixed/system-status.awk
Normal file
19
contrib/one-true-awk/bugs-fixed/system-status.awk
Normal file
|
@ -0,0 +1,19 @@
|
|||
# Unmodified nawk prints the 16 bit exit status divided by 256, but
|
||||
# does so using floating point arithmetic, yielding strange results.
|
||||
#
|
||||
# The fix is to use the various macros defined for wait(2) and to
|
||||
# use the signal number + 256 for death by signal, or signal number + 512
|
||||
# for death by signal with core dump.
|
||||
|
||||
# Expected output is in system-status.ok; system-status.ok2 is an accepted
# alternative that differs only in the last line (262 instead of 518).
BEGIN {
|
||||
# plain exit: the exit code itself is expected (42)
status = system("exit 42")
|
||||
print "normal status", status
|
||||
|
||||
# the shell kills itself with SIGHUP: 257 expected
status = system("kill -HUP $$")
|
||||
print "death by signal status", status
|
||||
|
||||
# the shell kills itself with SIGABRT: 518 expected (.ok), or 262 (.ok2)
status = system("kill -ABRT $$")
|
||||
print "death by signal with core dump status", status
|
||||
|
||||
# remove any core file left behind by the ABRT above
system("rm -f core*")
|
||||
}
|
3
contrib/one-true-awk/bugs-fixed/system-status.bad
Normal file
3
contrib/one-true-awk/bugs-fixed/system-status.bad
Normal file
|
@ -0,0 +1,3 @@
|
|||
normal status 42
|
||||
death by signal status 0.00390625
|
||||
death by signal with core dump status 0.523438
|
3
contrib/one-true-awk/bugs-fixed/system-status.ok
Normal file
3
contrib/one-true-awk/bugs-fixed/system-status.ok
Normal file
|
@ -0,0 +1,3 @@
|
|||
normal status 42
|
||||
death by signal status 257
|
||||
death by signal with core dump status 518
|
3
contrib/one-true-awk/bugs-fixed/system-status.ok2
Normal file
3
contrib/one-true-awk/bugs-fixed/system-status.ok2
Normal file
|
@ -0,0 +1,3 @@
|
|||
normal status 42
|
||||
death by signal status 257
|
||||
death by signal with core dump status 262
|
6
contrib/one-true-awk/bugs-fixed/unicode-fs-rs-1.awk
Normal file
6
contrib/one-true-awk/bugs-fixed/unicode-fs-rs-1.awk
Normal file
|
@ -0,0 +1,6 @@
|
|||
# Regression test: FS and RS are each set to a single non-ASCII character.
# The first three fields of every record are printed comma-separated; the
# expected result is in unicode-fs-rs-1.ok.
BEGIN {
|
||||
FS="␟"
|
||||
RS="␞"
|
||||
OFS=","
|
||||
}
|
||||
{ print $1, $2, $3 }
|
2
contrib/one-true-awk/bugs-fixed/unicode-fs-rs-1.in
Normal file
2
contrib/one-true-awk/bugs-fixed/unicode-fs-rs-1.in
Normal file
|
@ -0,0 +1,2 @@
|
|||
id␟name␟age␞1␟Bob "Billy" Smith␟42␞2␟Jane
|
||||
Brown␟37
|
5
contrib/one-true-awk/bugs-fixed/unicode-fs-rs-1.ok
Normal file
5
contrib/one-true-awk/bugs-fixed/unicode-fs-rs-1.ok
Normal file
|
@ -0,0 +1,5 @@
|
|||
id,name,age
|
||||
1,Bob "Billy" Smith,42
|
||||
2,Jane
|
||||
Brown,37
|
||||
|
7
contrib/one-true-awk/bugs-fixed/unicode-fs-rs-2.awk
Normal file
7
contrib/one-true-awk/bugs-fixed/unicode-fs-rs-2.awk
Normal file
|
@ -0,0 +1,7 @@
|
|||
# Regression test: FS is a single non-ASCII character and RS is a string of
# two non-ASCII characters.  The first three fields of every record are
# printed comma-separated; the expected result is in unicode-fs-rs-2.ok.
BEGIN {
|
||||
FS = "א"
|
||||
RS = "בב"
|
||||
OFS = ","
|
||||
}
|
||||
|
||||
{ print $1, $2, $3 }
|
2
contrib/one-true-awk/bugs-fixed/unicode-fs-rs-2.in
Normal file
2
contrib/one-true-awk/bugs-fixed/unicode-fs-rs-2.in
Normal file
|
@ -0,0 +1,2 @@
|
|||
idאnameאageא1אBob "Billy" Smithא42א2בבJane
|
||||
Brownא37
|
4
contrib/one-true-awk/bugs-fixed/unicode-fs-rs-2.ok
Normal file
4
contrib/one-true-awk/bugs-fixed/unicode-fs-rs-2.ok
Normal file
|
@ -0,0 +1,4 @@
|
|||
id,name,age
|
||||
Jane
|
||||
Brown,37
|
||||
,
|
6
contrib/one-true-awk/bugs-fixed/unicode-null-match.awk
Normal file
6
contrib/one-true-awk/bugs-fixed/unicode-null-match.awk
Normal file
|
@ -0,0 +1,6 @@
|
|||
# Regression test: gsub() with an empty regular expression on a string that
# holds one multibyte character.  The expected output (.ok) is "2 XあX":
# one substitution before the character and one after it, none inside it.
BEGIN {
|
||||
# str = "\342\200\257"
|
||||
str = "あ"
|
||||
n = gsub(//, "X", str)
|
||||
print n, str
|
||||
}
|
1
contrib/one-true-awk/bugs-fixed/unicode-null-match.bad
Normal file
1
contrib/one-true-awk/bugs-fixed/unicode-null-match.bad
Normal file
|
@ -0,0 +1 @@
|
|||
4 X綛々9
|
1
contrib/one-true-awk/bugs-fixed/unicode-null-match.ok
Normal file
1
contrib/one-true-awk/bugs-fixed/unicode-null-match.ok
Normal file
|
@ -0,0 +1 @@
|
|||
2 XあX
|
|
@ -377,6 +377,8 @@ int yylex(void)
|
|||
}
|
||||
}
|
||||
|
||||
extern int runetochar(char *str, int c);
|
||||
|
||||
int string(void)
|
||||
{
|
||||
int c, n;
|
||||
|
@ -424,20 +426,54 @@ int string(void)
|
|||
*bp++ = n;
|
||||
break;
|
||||
|
||||
case 'x': /* hex \x0-9a-fA-F + */
|
||||
{ char xbuf[100], *px;
|
||||
for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
|
||||
if (isdigit(c)
|
||||
|| (c >= 'a' && c <= 'f')
|
||||
|| (c >= 'A' && c <= 'F'))
|
||||
*px++ = c;
|
||||
else
|
||||
break;
|
||||
case 'x': /* hex \x0-9a-fA-F (exactly two) */
|
||||
{
|
||||
int i;
|
||||
|
||||
if (!isxdigit(peek())) {
|
||||
unput(c);
|
||||
break;
|
||||
}
|
||||
n = 0;
|
||||
for (i = 0; i < 2; i++) {
|
||||
c = input();
|
||||
if (c == 0)
|
||||
break;
|
||||
if (isxdigit(c)) {
|
||||
c = tolower(c);
|
||||
n *= 16;
|
||||
if (isdigit(c))
|
||||
n += (c - '0');
|
||||
else
|
||||
n += 10 + (c - 'a');
|
||||
} else {
|
||||
unput(c);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (i)
|
||||
*bp++ = n;
|
||||
break;
|
||||
}
|
||||
|
||||
case 'u': /* utf \u0-9a-fA-F (1..8) */
|
||||
{
|
||||
int i;
|
||||
|
||||
n = 0;
|
||||
for (i = 0; i < 8; i++) {
|
||||
c = input();
|
||||
if (!isxdigit(c) || c == 0)
|
||||
break;
|
||||
c = tolower(c);
|
||||
n *= 16;
|
||||
if (isdigit(c))
|
||||
n += (c - '0');
|
||||
else
|
||||
n += 10 + (c - 'a');
|
||||
}
|
||||
*px = 0;
|
||||
unput(c);
|
||||
sscanf(xbuf, "%x", (unsigned int *) &n);
|
||||
*bp++ = n;
|
||||
bp += runetochar(bp, n);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -534,7 +570,7 @@ int regexpr(void)
|
|||
char *bp;
|
||||
|
||||
if (buf == NULL && (buf = (char *) malloc(bufsz)) == NULL)
|
||||
FATAL("out of space for rex expr");
|
||||
FATAL("out of space for reg expr");
|
||||
bp = buf;
|
||||
for ( ; (c = input()) != '/' && c != 0; ) {
|
||||
if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
|
||||
|
|
|
@ -34,6 +34,8 @@ THIS SOFTWARE.
|
|||
#include <math.h>
|
||||
#include "awk.h"
|
||||
|
||||
extern int u8_nextlen(const char *s);
|
||||
|
||||
char EMPTY[] = { '\0' };
|
||||
FILE *infile = NULL;
|
||||
bool innew; /* true = infile has not been read by readrec */
|
||||
|
@ -150,11 +152,6 @@ int getrec(char **pbuf, int *pbufsize, bool isrecord) /* get next input record *
|
|||
}
|
||||
DPRINTF("RS=<%s>, FS=<%s>, ARGC=%g, FILENAME=%s\n",
|
||||
*RS, *FS, *ARGC, *FILENAME);
|
||||
if (isrecord) {
|
||||
donefld = false;
|
||||
donerec = true;
|
||||
savefs();
|
||||
}
|
||||
saveb0 = buf[0];
|
||||
buf[0] = 0;
|
||||
while (argno < *ARGC || infile == stdin) {
|
||||
|
@ -194,6 +191,9 @@ int getrec(char **pbuf, int *pbufsize, bool isrecord) /* get next input record *
|
|||
fldtab[0]->fval = result;
|
||||
fldtab[0]->tval |= NUM;
|
||||
}
|
||||
donefld = false;
|
||||
donerec = true;
|
||||
savefs();
|
||||
}
|
||||
setfval(nrloc, nrloc->fval+1);
|
||||
setfval(fnrloc, fnrloc->fval+1);
|
||||
|
@ -221,16 +221,22 @@ void nextfile(void)
|
|||
argno++;
|
||||
}
|
||||
|
||||
extern int readcsvrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag);
|
||||
|
||||
int readrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* read one record into buf */
|
||||
{
|
||||
int sep, c, isrec;
|
||||
char *rr, *buf = *pbuf;
|
||||
int sep, c, isrec; // POTENTIAL BUG? isrec is a macro in awk.h
|
||||
char *rr = *pbuf, *buf = *pbuf;
|
||||
int bufsize = *pbufsize;
|
||||
char *rs = getsval(rsloc);
|
||||
|
||||
if (*rs && rs[1]) {
|
||||
if (CSV) {
|
||||
c = readcsvrec(pbuf, pbufsize, inf, newflag);
|
||||
isrec = (c == EOF && rr == buf) ? false : true;
|
||||
} else if (*rs && rs[1]) {
|
||||
bool found;
|
||||
|
||||
memset(buf, 0, bufsize);
|
||||
fa *pfa = makedfa(rs, 1);
|
||||
if (newflag)
|
||||
found = fnematch(pfa, inf, &buf, &bufsize, recsize);
|
||||
|
@ -243,6 +249,7 @@ int readrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* read one rec
|
|||
if (found)
|
||||
setptr(patbeg, '\0');
|
||||
isrec = (found == 0 && *buf == '\0') ? false : true;
|
||||
|
||||
} else {
|
||||
if ((sep = *rs) == 0) {
|
||||
sep = '\n';
|
||||
|
@ -280,6 +287,52 @@ int readrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* read one rec
|
|||
return isrec;
|
||||
}
|
||||
|
||||
|
||||
/*******************
|
||||
* loose ends here:
|
||||
* \r\n should become \n
|
||||
* what about bare \r? Excel uses that for embedded newlines
|
||||
* can't have "" in unquoted fields, according to RFC 4180
|
||||
*/
|
||||
|
||||
|
||||
/*
 * readcsvrec: read one CSV record from inf into the buffer at *pbuf.
 *
 * Characters are read with getc() until EOF or until a newline is seen
 * while not inside double quotes.  A newline seen inside quotes is stored
 * as part of the record.  A '\r' stored immediately before a newline is
 * dropped, both for embedded newlines and for the terminating one.
 *
 * The buffer is grown through adjbuf() when needed; FATAL() is called if
 * that fails.  On return the record is NUL-terminated and *pbuf and
 * *pbufsize are updated with the local buf and bufsize, which adjbuf()
 * may have changed.
 *
 * Returns the last value obtained from getc(): EOF, or the '\n' that ended
 * the record.  The terminating newline itself is not stored.
 *
 * newflag is not referenced in this function.
 */
int readcsvrec(char **pbuf, int *pbufsize, FILE *inf, bool newflag) /* csv can have \n's */
|
||||
{ /* so read a complete record that might be multiple lines */
|
||||
int sep, c;
|
||||
char *rr = *pbuf, *buf = *pbuf;
|
||||
int bufsize = *pbufsize;
|
||||
bool in_quote = false;
|
||||
|
||||
sep = '\n'; /* the only separator; have to skip over \n embedded in "..." */
|
||||
rr = buf;
|
||||
while ((c = getc(inf)) != EOF) {
|
||||
if (c == sep) {
|
||||
if (! in_quote)
|
||||
break;
|
||||
if (rr > buf && rr[-1] == '\r') // remove \r if was \r\n
|
||||
rr--;
|
||||
}
|
||||
|
||||
/* make room for the character about to be stored */
if (rr-buf+1 > bufsize)
|
||||
if (!adjbuf(&buf, &bufsize, 1+rr-buf,
|
||||
recsize, &rr, "readcsvrec 1"))
|
||||
FATAL("input record `%.30s...' too long", buf);
|
||||
*rr++ = c;
|
||||
/* every double quote flips the state, so a doubled "" leaves it unchanged */
if (c == '"')
|
||||
in_quote = ! in_quote;
|
||||
}
|
||||
if (c == '\n' && rr > buf && rr[-1] == '\r') // remove \r if was \r\n
|
||||
rr--;
|
||||
|
||||
/* make room for the terminating NUL */
if (!adjbuf(&buf, &bufsize, 1+rr-buf, recsize, &rr, "readcsvrec 4"))
|
||||
FATAL("input record `%.30s...' too long", buf);
|
||||
*rr = 0;
|
||||
*pbuf = buf;
|
||||
*pbufsize = bufsize;
|
||||
DPRINTF("readcsvrec saw <%s>, returns %d\n", buf, c);
|
||||
return c;
|
||||
}
|
||||
|
||||
char *getargv(int n) /* get ARGV[n] */
|
||||
{
|
||||
Cell *x;
|
||||
|
@ -297,12 +350,16 @@ char *getargv(int n) /* get ARGV[n] */
|
|||
|
||||
void setclvar(char *s) /* set var=value from s */
|
||||
{
|
||||
char *p;
|
||||
char *e, *p;
|
||||
Cell *q;
|
||||
double result;
|
||||
|
||||
/* commit f3d9187d4e0f02294fb1b0e31152070506314e67 broke T.argv test */
|
||||
/* I don't understand why it was changed. */
|
||||
|
||||
for (p=s; *p != '='; p++)
|
||||
;
|
||||
e = p;
|
||||
*p++ = 0;
|
||||
p = qstring(p, '\0');
|
||||
q = setsymtab(s, p, 0.0, STR, symtab);
|
||||
|
@ -312,6 +369,8 @@ void setclvar(char *s) /* set var=value from s */
|
|||
q->tval |= NUM;
|
||||
}
|
||||
DPRINTF("command line set %s to |%s|\n", s, p);
|
||||
free(p);
|
||||
*e = '=';
|
||||
}
|
||||
|
||||
|
||||
|
@ -340,9 +399,9 @@ void fldbld(void) /* create fields from current record */
|
|||
i = 0; /* number of fields accumulated here */
|
||||
if (inputFS == NULL) /* make sure we have a copy of FS */
|
||||
savefs();
|
||||
if (strlen(inputFS) > 1) { /* it's a regular expression */
|
||||
if (!CSV && strlen(inputFS) > 1) { /* it's a regular expression */
|
||||
i = refldbld(r, inputFS);
|
||||
} else if ((sep = *inputFS) == ' ') { /* default whitespace */
|
||||
} else if (!CSV && (sep = *inputFS) == ' ') { /* default whitespace */
|
||||
for (i = 0; ; ) {
|
||||
while (*r == ' ' || *r == '\t' || *r == '\n')
|
||||
r++;
|
||||
|
@ -361,26 +420,58 @@ void fldbld(void) /* create fields from current record */
|
|||
*fr++ = 0;
|
||||
}
|
||||
*fr = 0;
|
||||
} else if ((sep = *inputFS) == 0) { /* new: FS="" => 1 char/field */
|
||||
for (i = 0; *r != '\0'; r += n) {
|
||||
char buf[MB_LEN_MAX + 1];
|
||||
|
||||
} else if (CSV) { /* CSV processing. no error handling */
|
||||
if (*r != 0) {
|
||||
for (;;) {
|
||||
i++;
|
||||
if (i > nfields)
|
||||
growfldtab(i);
|
||||
if (freeable(fldtab[i]))
|
||||
xfree(fldtab[i]->sval);
|
||||
fldtab[i]->sval = fr;
|
||||
fldtab[i]->tval = FLD | STR | DONTFREE;
|
||||
if (*r == '"' ) { /* start of "..." */
|
||||
for (r++ ; *r != '\0'; ) {
|
||||
if (*r == '"' && r[1] != '\0' && r[1] == '"') {
|
||||
r += 2; /* doubled quote */
|
||||
*fr++ = '"';
|
||||
} else if (*r == '"' && (r[1] == '\0' || r[1] == ',')) {
|
||||
r++; /* skip over closing quote */
|
||||
break;
|
||||
} else {
|
||||
*fr++ = *r++;
|
||||
}
|
||||
}
|
||||
*fr++ = 0;
|
||||
} else { /* unquoted field */
|
||||
while (*r != ',' && *r != '\0')
|
||||
*fr++ = *r++;
|
||||
*fr++ = 0;
|
||||
}
|
||||
if (*r++ == 0)
|
||||
break;
|
||||
|
||||
}
|
||||
}
|
||||
*fr = 0;
|
||||
} else if ((sep = *inputFS) == 0) { /* new: FS="" => 1 char/field */
|
||||
for (i = 0; *r != '\0'; ) {
|
||||
char buf[10];
|
||||
i++;
|
||||
if (i > nfields)
|
||||
growfldtab(i);
|
||||
if (freeable(fldtab[i]))
|
||||
xfree(fldtab[i]->sval);
|
||||
n = mblen(r, MB_LEN_MAX);
|
||||
if (n < 0)
|
||||
n = 1;
|
||||
memcpy(buf, r, n);
|
||||
buf[n] = '\0';
|
||||
n = u8_nextlen(r);
|
||||
for (j = 0; j < n; j++)
|
||||
buf[j] = *r++;
|
||||
buf[j] = '\0';
|
||||
fldtab[i]->sval = tostring(buf);
|
||||
fldtab[i]->tval = FLD | STR;
|
||||
}
|
||||
*fr = 0;
|
||||
} else if (*r != 0) { /* if 0, it's a null field */
|
||||
/* subtlecase : if length(FS) == 1 && length(RS > 0)
|
||||
/* subtle case: if length(FS) == 1 && length(RS) > 0
|
||||
* \n is NOT a field separator (cf awk book 61,84).
|
||||
* this variable is tested in the inner while loop.
|
||||
*/
|
||||
|
@ -754,10 +845,10 @@ int isclvar(const char *s) /* is s of form var=something ? */
|
|||
{
|
||||
const char *os = s;
|
||||
|
||||
if (!isalpha((uschar) *s) && *s != '_')
|
||||
if (!isalpha((int) *s) && *s != '_')
|
||||
return 0;
|
||||
for ( ; *s; s++)
|
||||
if (!(isalnum((uschar) *s) || *s == '_'))
|
||||
if (!(isalnum((int) *s) || *s == '_'))
|
||||
break;
|
||||
return *s == '=' && s > os;
|
||||
}
|
||||
|
@ -792,28 +883,19 @@ bool is_valid_number(const char *s, bool trailing_stuff_ok,
|
|||
if (no_trailing)
|
||||
*no_trailing = false;
|
||||
|
||||
while (isspace(*s))
|
||||
while (isspace((int) *s))
|
||||
s++;
|
||||
|
||||
/*
|
||||
* This test, while allowed by newer POSIX standards, represents a regression
|
||||
* where hex strings were treated as numbers in nawk the whole time it has been
|
||||
* in FreeBSD (since 2001). The POSIX 2001 through 2004 standards mandated this
|
||||
* behavior and the current standard allows it. Deviate from upstream by restoring
|
||||
* the prior FreeBSD behavior.
|
||||
*/
|
||||
#if 0
|
||||
// no hex floating point, sorry
|
||||
/* no hex floating point, sorry */
|
||||
if (s[0] == '0' && tolower(s[1]) == 'x')
|
||||
return false;
|
||||
#endif
|
||||
|
||||
// allow +nan, -nan, +inf, -inf, any other letter, no
|
||||
/* allow +nan, -nan, +inf, -inf, any other letter, no */
|
||||
if (s[0] == '+' || s[0] == '-') {
|
||||
is_nan = (strncasecmp(s+1, "nan", 3) == 0);
|
||||
is_inf = (strncasecmp(s+1, "inf", 3) == 0);
|
||||
if ((is_nan || is_inf)
|
||||
&& (isspace(s[4]) || s[4] == '\0'))
|
||||
&& (isspace((int) s[4]) || s[4] == '\0'))
|
||||
goto convert;
|
||||
else if (! isdigit(s[1]) && s[1] != '.')
|
||||
return false;
|
||||
|
@ -836,13 +918,13 @@ bool is_valid_number(const char *s, bool trailing_stuff_ok,
|
|||
/*
|
||||
* check for trailing stuff
|
||||
*/
|
||||
while (isspace(*ep))
|
||||
while (isspace((int) *ep))
|
||||
ep++;
|
||||
|
||||
if (no_trailing != NULL)
|
||||
*no_trailing = (*ep == '\0');
|
||||
|
||||
// return true if found the end, or trailing stuff is allowed
|
||||
/* return true if found the end, or trailing stuff is allowed */
|
||||
retval = *ep == '\0' || trailing_stuff_ok;
|
||||
|
||||
return retval;
|
||||
|
|
|
@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
|
|||
THIS SOFTWARE.
|
||||
****************************************************************/
|
||||
|
||||
const char *version = "version 20210724";
|
||||
const char *version = "version 20240122";
|
||||
|
||||
#define DEBUG
|
||||
#include <stdio.h>
|
||||
|
@ -49,8 +49,12 @@ static size_t maxpfile; /* max program filename */
|
|||
static size_t npfile; /* number of filenames */
|
||||
static size_t curpfile; /* current filename */
|
||||
|
||||
bool CSV = false; /* true for csv input */
|
||||
|
||||
bool safe = false; /* true => "safe" mode */
|
||||
|
||||
size_t awk_mb_cur_max = 1;
|
||||
|
||||
static noreturn void fpecatch(int n
|
||||
#ifdef SA_SIGINFO
|
||||
, siginfo_t *si, void *uc
|
||||
|
@ -89,10 +93,8 @@ static const char *
|
|||
setfs(char *p)
|
||||
{
|
||||
/* wart: t=>\t */
|
||||
if (p[0] == 't' && p[1] == '\0') {
|
||||
WARNING("-Ft to imply tab separator is deprecated behavior.");
|
||||
if (p[0] == 't' && p[1] == '\0')
|
||||
return "\t";
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
|
@ -116,10 +118,11 @@ int main(int argc, char *argv[])
|
|||
|
||||
setlocale(LC_CTYPE, "");
|
||||
setlocale(LC_NUMERIC, "C"); /* for parsing cmdline & prog */
|
||||
awk_mb_cur_max = MB_CUR_MAX;
|
||||
cmdname = argv[0];
|
||||
if (argc == 1) {
|
||||
fprintf(stderr,
|
||||
"usage: %s [-F fs] [-v var=value] [-f progfile | 'prog'] [file ...]\n",
|
||||
"usage: %s [-F fs | --csv] [-v var=value] [-f progfile | 'prog'] [file ...]\n",
|
||||
cmdname);
|
||||
exit(1);
|
||||
}
|
||||
|
@ -152,6 +155,12 @@ int main(int argc, char *argv[])
|
|||
argv++;
|
||||
break;
|
||||
}
|
||||
if (strcmp(argv[1], "--csv") == 0) { /* turn on csv input processing */
|
||||
CSV = true;
|
||||
argc--;
|
||||
argv++;
|
||||
continue;
|
||||
}
|
||||
switch (argv[1][1]) {
|
||||
case 's':
|
||||
if (strcmp(argv[1], "-safe") == 0)
|
||||
|
@ -190,6 +199,10 @@ int main(int argc, char *argv[])
|
|||
argc--;
|
||||
argv++;
|
||||
}
|
||||
|
||||
if (CSV && (fs != NULL || lookup("FS", symtab) != NULL))
|
||||
WARNING("danger: don't set FS when --csv is in effect");
|
||||
|
||||
/* argv[1] is now the first argument */
|
||||
if (npfile == 0) { /* no -f; first argument is program */
|
||||
if (argc <= 1) {
|
||||
|
|
|
@ -28,10 +28,10 @@ CFLAGS =
|
|||
CFLAGS = -O2
|
||||
|
||||
# compiler options
|
||||
#CC = gcc -Wall -g -Wwrite-strings
|
||||
#CC = gcc -O4 -Wall -pedantic -fno-strict-aliasing
|
||||
#CC = gcc -fprofile-arcs -ftest-coverage # then gcov f1.c; cat f1.c.gcov
|
||||
HOSTCC = gcc -g -Wall -pedantic -Wcast-qual
|
||||
#CC = cc -Wall -g -Wwrite-strings
|
||||
#CC = cc -O4 -Wall -pedantic -fno-strict-aliasing
|
||||
#CC = cc -fprofile-arcs -ftest-coverage # then gcov f1.c; cat f1.c.gcov
|
||||
HOSTCC = cc -g -Wall -pedantic -Wcast-qual
|
||||
CC = $(HOSTCC)  # change this if cross-compiling.
|
||||
|
||||
# By fiat, to make our lives easier, yacc is now defined to be bison.
|
||||
|
|
|
@ -52,8 +52,8 @@ struct xx
|
|||
{ ARRAY, "array", NULL },
|
||||
{ INDIRECT, "indirect", "$(" },
|
||||
{ SUBSTR, "substr", "substr" },
|
||||
{ SUB, "sub", "sub" },
|
||||
{ GSUB, "gsub", "gsub" },
|
||||
{ SUB, "dosub", "sub" },
|
||||
{ GSUB, "dosub", "gsub" },
|
||||
{ INDEX, "sindex", "sindex" },
|
||||
{ SPRINTF, "awksprintf", "sprintf " },
|
||||
{ ADD, "arith", " + " },
|
||||
|
|
|
@ -29,7 +29,7 @@ THIS SOFTWARE.
|
|||
#include "awk.h"
|
||||
#include "awkgram.tab.h"
|
||||
|
||||
Node *nodealloc(int n)
|
||||
Node *nodealloc(size_t n)
|
||||
{
|
||||
Node *x;
|
||||
|
||||
|
|
|
@ -43,14 +43,13 @@ extern fa *mkdfa(const char *, bool);
|
|||
extern int makeinit(fa *, bool);
|
||||
extern void penter(Node *);
|
||||
extern void freetr(Node *);
|
||||
extern int hexstr(const uschar **);
|
||||
extern int quoted(const uschar **);
|
||||
extern char *cclenter(const char *);
|
||||
extern int *cclenter(const char *);
|
||||
extern noreturn void overflo(const char *);
|
||||
extern void cfoll(fa *, Node *);
|
||||
extern int first(Node *);
|
||||
extern void follow(Node *);
|
||||
extern int member(int, const char *);
|
||||
extern int member(int, int *);
|
||||
extern int match(fa *, const char *);
|
||||
extern int pmatch(fa *, const char *);
|
||||
extern int nematch(fa *, const char *);
|
||||
|
@ -68,7 +67,7 @@ extern void freefa(fa *);
|
|||
extern int pgetc(void);
|
||||
extern char *cursource(void);
|
||||
|
||||
extern Node *nodealloc(int);
|
||||
extern Node *nodealloc(size_t);
|
||||
extern Node *exptostat(Node *);
|
||||
extern Node *node1(int, Node *);
|
||||
extern Node *node2(int, Node *, Node *);
|
||||
|
@ -199,8 +198,7 @@ extern FILE *openfile(int, const char *, bool *);
|
|||
extern const char *filename(FILE *);
|
||||
extern Cell *closefile(Node **, int);
|
||||
extern void closeall(void);
|
||||
extern Cell *sub(Node **, int);
|
||||
extern Cell *gsub(Node **, int);
|
||||
extern Cell *dosub(Node **, int);
|
||||
extern Cell *gensub(Node **, int);
|
||||
|
||||
extern FILE *popen(const char *, const char *);
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -4,7 +4,7 @@ oldawk=${oldawk-awk}
|
|||
awk=${awk-../a.out}
|
||||
|
||||
echo compiling time.c
|
||||
gcc time.c -o time
|
||||
cc time.c -o time
|
||||
time=./time
|
||||
|
||||
echo time command = $time
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#!/bin/sh
|
||||
|
||||
uname -a
|
||||
gcc echo.c -o echo && echo echo compiled
|
||||
cc echo.c -o echo && echo echo compiled
|
||||
|
||||
oldawk=${oldawk-awk}
|
||||
awk=${awk-../a.out}
|
||||
|
|
|
@ -97,6 +97,12 @@ echo '111
|
|||
$awk '{print L $0}' L=11 foo0 L=22 foo0 >foo2
|
||||
diff foo1 foo2 || echo 'BAD: T.argv (L=11 L=22)'
|
||||
|
||||
echo >foo0
|
||||
echo 'name=value
|
||||
name=value' >foo1
|
||||
$awk 'BEGIN { print ARGV[1] } { print ARGV[1] }' name=value foo0 >foo2
|
||||
diff foo1 foo2 || echo 'BAD: T.argv assignment operand modified'
|
||||
|
||||
echo 3.345 >foo1
|
||||
$awk 'BEGIN { print ARGV[1] + ARGV[2]}' 1 2.345 >foo2
|
||||
diff foo1 foo2 || echo 'BAD: T.argv (ARGV[1] + ARGV[2])'
|
||||
|
|
80
contrib/one-true-awk/testdir/T.csv
Executable file
80
contrib/one-true-awk/testdir/T.csv
Executable file
|
@ -0,0 +1,80 @@
|
|||
#!/bin/sh
|
||||
|
||||
echo T.csv: tests of csv field splitting, no embedded newlines
|
||||
|
||||
awk=${awk-../a.out}
|
||||
|
||||
$awk '
|
||||
BEGIN {
|
||||
FS = "\t"
|
||||
awk = "../a.out --csv"
|
||||
}
|
||||
NF == 0 || $1 ~ /^#/ {
|
||||
next
|
||||
}
|
||||
$1 ~ /try/ { # new test
|
||||
nt++
|
||||
sub(/try /, "")
|
||||
prog = $0
|
||||
printf("%3d %s\n", nt, prog)
|
||||
prog = sprintf("%s '"'"'%s'"'"'", awk, prog)
|
||||
# print "prog is", prog
|
||||
nt2 = 0
|
||||
while (getline > 0) {
|
||||
if (NF == 0) # blank line terminates a sequence
|
||||
break
|
||||
input = $1
|
||||
for (i = 2; i < NF; i++) # input data
|
||||
input = input "\t" $i
|
||||
test = sprintf("./echo '"'"'%s'"'"' | %s >foo1; ",
|
||||
input, prog)
|
||||
if ($NF == "\"\"")
|
||||
output = ">foo2;"
|
||||
else
|
||||
output = sprintf("./echo '"'"'%s'"'"' >foo2; ", $NF)
|
||||
gsub(/\\t/, "\t", output)
|
||||
gsub(/\\n/, "\n", output)
|
||||
run = sprintf("cmp foo1 foo2 || echo test %d.%d failed",
|
||||
nt, ++nt2)
|
||||
# print "input is", input
|
||||
# print "test is", test
|
||||
# print "output is", output
|
||||
# print "run is", run
|
||||
system(test output run)
|
||||
}
|
||||
tt += nt2
|
||||
}
|
||||
END { print tt, "tests" }
|
||||
' <<\!!!!
|
||||
# General format:
|
||||
# try program as rest of line
|
||||
# $1 $2 $3 output1 (\t for tab, \n for newline,
|
||||
# $1 $2 $3 output2 ("" for null)
|
||||
# ... terminated by blank line
|
||||
|
||||
|
||||
try { for (i=1; i<=NF; i++) printf("[%s]", $i); printf("\n") }
|
||||
a [a]
|
||||
a [ a]
|
||||
,a [][a]
|
||||
, a [ ][ a]
|
||||
a,b [a][b]
|
||||
a,b,c [a][b][c]
|
||||
"" []
|
||||
"abc" [abc]
|
||||
"a""b" [a"b]
|
||||
"a","b" [a][b]
|
||||
a""b [a""b]
|
||||
"a,b" [a,b]
|
||||
"""" ["]
|
||||
"""""" [""]
|
||||
"""x""" ["x"]
|
||||
""",""" [","]
|
||||
,,"" [][][]
|
||||
a""b [a""b]
|
||||
a''b [a''b]
|
||||
,, [][][]
|
||||
a, [a][]
|
||||
"", [][]
|
||||
, [][]
|
||||
!!!!
|
Can't render this file because it contains an unexpected character in line 9 and column 7.
|
|
@ -20,5 +20,6 @@ grep 'unknown option' foo >/dev/null || echo 'T.flags: bad unknown option'
|
|||
$awk -F >foo 2>&1
|
||||
grep 'no field separator' foo >/dev/null || echo 'T.flags: bad missing field separator'
|
||||
|
||||
$awk -F '' >foo 2>&1
|
||||
grep 'field separator FS is empty' foo >/dev/null || echo 'T.flags: bad empty field separator'
|
||||
### Awk is now like gawk and splits into separate characters if FS = ""
|
||||
# $awk -F '' >foo 2>&1
|
||||
# grep 'field separator FS is empty' foo >/dev/null || echo 'T.flags: bad empty field separator'
|
||||
|
|
|
@ -504,3 +504,23 @@ cmp -s foo1 foo2 || echo 'BAD: T.misc END must preserve $0'
|
|||
echo 'E 2' >foo1
|
||||
(trap '' PIPE; "$awk" 'BEGIN { print "hi"; }' 2>/dev/null; echo "E $?" >foo2) | :
|
||||
cmp -s foo1 foo2 || echo 'BAD: T.misc exit status on I/O error'
|
||||
|
||||
# Check for clobbering of the lexer's regular expression buffer.
|
||||
# If the output is "a1" instead of "1b", /b/ clobbered /a/.
|
||||
echo 1b >foo1
|
||||
echo ab | $awk '{ sub(/a/, "b" ~ /b/); print }' >foo2
|
||||
cmp -s foo1 foo2 || echo 'BAD: T.misc lexer regex buffer clobbered'
|
||||
|
||||
# Check handling of octal \OOO and hex \xHH escape sequences in strings.
|
||||
echo 'hello888
|
||||
hello
|
||||
hello
|
||||
helloxGOO
|
||||
hello
|
||||
0A' > foo1
|
||||
$awk 'BEGIN { print "hello\888" }' > foo2
|
||||
$awk 'BEGIN { print "hello\x000A" }' >> foo2
|
||||
$awk 'BEGIN { printf "hello\x0A" }' >> foo2
|
||||
$awk 'BEGIN { print "hello\xGOO" }' >> foo2
|
||||
$awk 'BEGIN { print "hello\x0A0A" }' >> foo2
|
||||
cmp -s foo1 foo2 || echo 'BAD: T.misc escape sequences in strings mishandled'
|
||||
|
|
|
@ -84,3 +84,5 @@ grep "out of range field" foo >/dev/null || echo 1>&2 "BAD: T.overflow \$400000"
|
|||
rm -rf /tmp/awktestfoo*
|
||||
$awk 'BEGIN { for (i=1; i <= 1000; i++) print i >("/tmp/awktestfoo" i) }'
|
||||
ls /tmp/awktestfoo* | grep '1000' >/dev/null || echo 1>&2 "BAD: T.overflow openfiles"
|
||||
rm -rf /tmp/awktestfoo*
|
||||
exit 0
|
||||
|
|
|
@ -220,5 +220,6 @@ $awk 'BEGIN {
|
|||
echo 'cat dog' > $TEMP2
|
||||
diff $TEMP1 $TEMP2 || fail 'BAD: T.split(a, b, "[\r\n]+")'
|
||||
|
||||
rm -rf $WORKDIR
|
||||
|
||||
exit $RESULT
|
||||
|
|
194
contrib/one-true-awk/testdir/T.utf
Executable file
194
contrib/one-true-awk/testdir/T.utf
Executable file
|
@ -0,0 +1,194 @@
|
|||
#!/bin/sh
|
||||
|
||||
echo T.utf: tests of utf functions
|
||||
|
||||
awk=${awk-../a.out}
|
||||
|
||||
$awk '
|
||||
BEGIN {
|
||||
FS = "\t"
|
||||
awk = "../a.out"
|
||||
}
|
||||
NF == 0 || $1 ~ /^#/ {
|
||||
next
|
||||
}
|
||||
$1 ~ /try/ { # new test
|
||||
nt++
|
||||
sub(/try [a-zA-Z_0-9]+ /, "")
|
||||
prog = $0
|
||||
printf("try %3d %s\n", nt, prog)
|
||||
prog = sprintf("%s -F\"\\t\" '"'"'%s'"'"'", awk, prog)
|
||||
# print "prog is", prog
|
||||
nt2 = 0
|
||||
while (getline > 0) {
|
||||
if (NF == 0) # blank line terminates a sequence
|
||||
break
|
||||
input = $1
|
||||
for (i = 2; i < NF; i++) # input data
|
||||
input = input "\t" $i
|
||||
test = sprintf("./echo '"'"'%s'"'"' | %s >foo1; ",
|
||||
input, prog)
|
||||
if ($NF == "\"\"")
|
||||
output = ">foo2;"
|
||||
else
|
||||
output = sprintf("./echo '"'"'%s'"'"' >foo2; ", $NF)
|
||||
gsub(/\\t/, "\t", output)
|
||||
gsub(/\\n/, "\n", output)
|
||||
run = sprintf("diff foo1 foo2 || echo test %d.%d failed",
|
||||
nt, ++nt2)
|
||||
# print "input is", input
|
||||
# print "test is", test
|
||||
# print "output is", output
|
||||
# print "run is", run
|
||||
system(test output run)
|
||||
}
|
||||
tt += nt2
|
||||
}
|
||||
END { print tt, "tests" }
|
||||
' <<\!!!!
|
||||
# General format:
|
||||
# try program as rest of line
|
||||
# $1 $2 $3 output1 (\t for tab, \n for newline,
|
||||
# $1 $2 $3 output2 ("" for null)
|
||||
# ... terminated by blank line
|
||||
|
||||
# try another program...
|
||||
|
||||
try length { print length($1) }
|
||||
0
|
||||
a 1
|
||||
の今がその時だ 7
|
||||
Сейчас 6
|
||||
现在是时候了 6
|
||||
给所有的好男 6
|
||||
来参加聚会。 6
|
||||
😀 1
|
||||
🖕 finger 8
|
||||
Τωρα 4
|
||||
για 3
|
||||
να 2
|
||||
עכשיו 5
|
||||
לכל 3
|
||||
לבוא 4
|
||||
の今がその時だ 7
|
||||
지금이 3
|
||||
모든 2
|
||||
파티에 3
|
||||
Сейчас 6
|
||||
для 3
|
||||
прийти 6
|
||||
|
||||
try index { print index($1, $2) }
|
||||
abc a 1
|
||||
abc b 2
|
||||
abc x 0
|
||||
现在是时候了 "" 0
|
||||
现在是时候了 了 6
|
||||
现在是时候了 在是 2
|
||||
现在是时候了 x 0
|
||||
现x在是时候了 x 2
|
||||
🖕 fingerすべての善人のためにすべての善人のために f 3
|
||||
🖕 finger🖕 r🖕 8
|
||||
|
||||
try substr { print substr($0, 2, 3) }
|
||||
abcdef bcd
|
||||
Τωρα ειναι η ωρα
|
||||
Τω ω
|
||||
지금 이절호의 금 이
|
||||
xпyрийти пyр
|
||||
|
||||
try rematch { print $1 ~ $2 }
|
||||
abc a 1
|
||||
abc x 0
|
||||
すべての善人のために の 1
|
||||
すべての善人のために の.*の 1
|
||||
すべての善人のために の.*て 0
|
||||
Τωρα ω+ 1
|
||||
|
||||
# replace first occurrence of $2 by $3 in $1
|
||||
try sub { n = sub($2, $3, $1); print n, $1 }
|
||||
abcdef bc XYZ 1 aXYZdef
|
||||
abcdef xy XYZ 0 abcdef
|
||||
の今がその時だ の NO 1 NO今がその時だ
|
||||
🖕 finger 🖕.*g FING 1 FINGer
|
||||
Сейчас . x 1 xейчас
|
||||
|
||||
# replace all occurrences of $2 by $3 in $1
|
||||
try gsub { n = gsub($2, $3, $1); print n, $1 }
|
||||
abcdef bc XYZ 1 aXYZdef
|
||||
abcdef xy XYZ 0 abcdef
|
||||
の今がその時だ の NO 2 NO今がそNO時だ
|
||||
🖕 finger 🖕.*g FING 1 FINGer
|
||||
Сейчас . x 6 xxxxxx
|
||||
|
||||
try match { print match($1, $2), RSTART, RLENGTH }
|
||||
abc [^a] 2 2 1
|
||||
abc [^ab] 3 3 1
|
||||
すべての善人のために [^す] 2 2 1
|
||||
すべての善人のために [^ぁ-ゖ] 5 5 1
|
||||
abc a 1 1 1
|
||||
abc x 0 0 -1
|
||||
すべての善人のために の 4 4 1
|
||||
すべての善人のために の.*の 4 4 4
|
||||
すべての善人のために の.*て 0 0 -1
|
||||
Τωρα ω+ 2 2 1
|
||||
Τωρα x+ 0 0 -1
|
||||
Τωρα ω. 2 2 2
|
||||
すべての善人のために [の] 4 4 1
|
||||
すべての善人のために [ぁ-え] 0 0 -1
|
||||
すべての善人のために [^ぁ-え] 1 1 1
|
||||
Τωρα ειναι η [α-ω] 2 2 1
|
||||
Τωρα ειναι η [α-ω]+ 2 2 3
|
||||
xxxΤωρα ειναι η [Α-Ω] 4 4 1
|
||||
για όλους τους καλούς ά α.*α 3 3 15
|
||||
να έρθει στο πά [^ν] 2 2 1
|
||||
|
||||
# FS="" should split into unicode chars
|
||||
try emptyFS BEGIN {FS=""} {print NF}
|
||||
すべての善人のために 10
|
||||
の今がその時だ 7
|
||||
Сейчас 6
|
||||
现在是时候了 6
|
||||
给所有的好男 6
|
||||
来参加聚会。 6
|
||||
😀 1
|
||||
🖕 finger 8
|
||||
|
||||
# printf(%N.Ns) for utf8 strings
|
||||
try printfs1 {printf("[%5.2s][%-5.2s]\n"), $1, $1}
|
||||
abcd [ ab][ab ]
|
||||
现在abc [ 现在][现在 ]
|
||||
现ωabc [ 现ω][现ω ]
|
||||
ωabc [ ωa][ωa ]
|
||||
Сейчас [ Се][Се ]
|
||||
Сейxyz [ Се][Се ]
|
||||
😀 [ 😀][😀 ]
|
||||
|
||||
# printf(%Ns) for utf8 strings
|
||||
try printfs2 {printf("[%5s][%-5s]\n"), $1, $1}
|
||||
abcd [ abcd][abcd ]
|
||||
现在ab [ 现在ab][现在ab ]
|
||||
a现在ab [a现在ab][a现在ab]
|
||||
a现在abc [a现在abc][a现在abc]
|
||||
现ωab [ 现ωab][现ωab ]
|
||||
ωabc [ ωabc][ωabc ]
|
||||
Сейчас [Сейчас][Сейчас]
|
||||
😀 [ 😀][😀 ]
|
||||
|
||||
# printf(%.Ns) for utf8 strings
|
||||
try printfs3 {printf("[%.2s][%-.2s]\n"), $1, $1}
|
||||
abcd [ab][ab]
|
||||
现在abc [现在][现在]
|
||||
现ωabc [现ω][现ω]
|
||||
ω [ω][ω]
|
||||
😀 [😀][😀]
|
||||
|
||||
# printf(%c) for utf
|
||||
try printfc {printf("%c %c\n", $1, substr($1,2,1))}
|
||||
すべての善人のために す べ
|
||||
の今がその時だ の 今
|
||||
Сейчас С е
|
||||
现在是时候了 现 在
|
||||
😀🖕 😀 🖕
|
||||
|
||||
!!!!
|
234
contrib/one-true-awk/testdir/T.utfre
Executable file
234
contrib/one-true-awk/testdir/T.utfre
Executable file
|
@ -0,0 +1,234 @@
|
|||
echo T.utfre: tests of regular expression code for Unicode/utf-8
|
||||
# adapted from T.re
|
||||
|
||||
awk '
|
||||
BEGIN {
|
||||
FS = "\t"
|
||||
awk = "../a.out"
|
||||
}
|
||||
NF == 0 {
|
||||
next
|
||||
}
|
||||
$1 != "" { # new test
|
||||
re = $1
|
||||
}
|
||||
$2 != "" { # either ~ or !~
|
||||
op = $2
|
||||
if (op == "~")
|
||||
neg = "!"
|
||||
else if (op == "!~")
|
||||
neg = ""
|
||||
}
|
||||
$3 != "" { # new test string
|
||||
str = $3
|
||||
}
|
||||
$3 == "\"\"" { # explicit empty line
|
||||
$3 = ""
|
||||
}
|
||||
NF > 2 { # generate a test
|
||||
input = $3
|
||||
test = sprintf("./echo '"'"'%s'"'"' | %s '"'"'%s/%s/ {print \"%d fails %s %s %s\"}'"'"'",
|
||||
input, awk, neg, re, NR, re, op, input)
|
||||
# printf(" %3d %s %s %s:\n", NR, re, op, input)
|
||||
# print "test is |" test "|"
|
||||
system(test)
|
||||
# system("bprint -c ../a.out")
|
||||
nt++
|
||||
}
|
||||
END { print " " nt, "tests" }
|
||||
' <<\!!!!
|
||||
~ 🖕
|
||||
🖕🖕
|
||||
🖕🖕🖕
|
||||
""
|
||||
🖕 ~ 🖕
|
||||
b🖕
|
||||
b🖕b
|
||||
!~ ""
|
||||
时
|
||||
xxxxx
|
||||
. ~ 时
|
||||
x时x
|
||||
🙂
|
||||
!~ ""
|
||||
.の ~ xの
|
||||
xxの
|
||||
xのx
|
||||
!~ の
|
||||
のx
|
||||
""
|
||||
$ ~ x
|
||||
🙂
|
||||
""
|
||||
.$ ~ 모
|
||||
xx모
|
||||
x모x
|
||||
!~ ""
|
||||
д$ ~ д
|
||||
bд
|
||||
bbbд
|
||||
!~ дb
|
||||
x
|
||||
""
|
||||
^ ~ и
|
||||
""
|
||||
^
|
||||
^λ$ ~ λ
|
||||
!~ xλ
|
||||
λx
|
||||
xλx
|
||||
""
|
||||
^λ.$ ~ λx
|
||||
λλ
|
||||
!~ xλ
|
||||
λλλ
|
||||
λxy
|
||||
""
|
||||
^$ ~ ""
|
||||
!~ に
|
||||
^
|
||||
^.해 ~ め해
|
||||
め해해
|
||||
!~ 해
|
||||
""
|
||||
^.*해 ~ 해
|
||||
め해
|
||||
めめめめめめ해
|
||||
!~ ""
|
||||
^.+해 ~ め해
|
||||
めめめめめめ해
|
||||
!~ ""
|
||||
해
|
||||
해め
|
||||
해* ~ ""
|
||||
해
|
||||
해해해해
|
||||
め해
|
||||
めめめめ
|
||||
해해* ~ 해
|
||||
해해해
|
||||
め해
|
||||
!~ めめめめ
|
||||
""
|
||||
\$ ~ 🖕$
|
||||
$
|
||||
$🖕
|
||||
🖕$🖕
|
||||
!~ ""
|
||||
🖕
|
||||
\. ~ .
|
||||
!~ 🖕
|
||||
""
|
||||
xθ+y ~ xθy
|
||||
xθθy
|
||||
xθθθθθθy
|
||||
!~ θy
|
||||
xy
|
||||
xθ
|
||||
xθ?y ~ xy
|
||||
xθy
|
||||
!~ xθθy
|
||||
θ?b?の? ~ ""
|
||||
x
|
||||
^a?b?め ~ め
|
||||
aめ
|
||||
bめ
|
||||
abめ
|
||||
めa
|
||||
!~ ""
|
||||
ab
|
||||
aba
|
||||
[Α-Ω] ~ Α
|
||||
aΔb
|
||||
xyΩ
|
||||
!~ abc
|
||||
β
|
||||
""
|
||||
[^Α-Ω] ~ δ
|
||||
aΔb
|
||||
xyΩ
|
||||
!~ Α
|
||||
Δ
|
||||
""
|
||||
[Α-ΔΦ-Ω] ~ Α
|
||||
Β
|
||||
Δ
|
||||
Φ
|
||||
Ω
|
||||
!~ Π
|
||||
Σ
|
||||
Π[[:lower:]]+ ~ Πa
|
||||
Πab
|
||||
!~ Π
|
||||
ΠX
|
||||
Π:
|
||||
Π[
|
||||
Π]
|
||||
の[0-9]+に ~ の0に
|
||||
の23に
|
||||
の12345に
|
||||
!~ 0に
|
||||
のに
|
||||
の[0-9]?に ~ のに
|
||||
の1に
|
||||
!~ の23に
|
||||
の[[]に ~ の[に
|
||||
!~ のに
|
||||
の[[]に
|
||||
の]に
|
||||
の[[-]に ~ の[に
|
||||
の-に
|
||||
!~ のに
|
||||
の[[]に
|
||||
の]に
|
||||
の[[-a]に ~ の[に
|
||||
のaに
|
||||
の]に
|
||||
!~ のに
|
||||
の[[]に
|
||||
の-に
|
||||
の[]-a]に ~ の]に
|
||||
のaに
|
||||
!~ のに
|
||||
の[に
|
||||
の-に
|
||||
の[]]に ~ の]に
|
||||
!~ のに
|
||||
の[]]に
|
||||
の[に
|
||||
の[^[]に ~ のaに
|
||||
!~ の[に
|
||||
の[-]に ~ の-に
|
||||
!~ のに
|
||||
の+に
|
||||
の[^-]に ~ の+に
|
||||
!~ の-に
|
||||
のに
|
||||
の[][]に ~ の[に
|
||||
の]に
|
||||
!~ のに
|
||||
の][に
|
||||
の[]に
|
||||
の[z-a]に ~ のに
|
||||
!~ の
|
||||
に
|
||||
のaに
|
||||
のzに
|
||||
の-に
|
||||
に|だ ~ だ
|
||||
に
|
||||
だに
|
||||
!~ a
|
||||
""
|
||||
^στο|τους$ ~ στο
|
||||
στοd
|
||||
aτους
|
||||
τους
|
||||
!~ xστο
|
||||
τουςa
|
||||
^(στο|τους)$ ~ στο
|
||||
τους
|
||||
!~ στοτους
|
||||
στοx
|
||||
cτους
|
||||
!!!!
|
|
@ -15,7 +15,7 @@ BEGIN {
|
|||
END { printline() }
|
||||
|
||||
function addword(w) {
|
||||
## print "adding [", w, "] ", length(w), length(line), maxlen
|
||||
print "adding [", w, "] ", length(w), length(line), maxlen
|
||||
if (length(line) + length(w) > maxlen)
|
||||
printline()
|
||||
if (length(w) > 2 && ( w ~ /[\.!]["?)]?$/ || w ~ /[?!]"?$/) &&
|
||||
|
|
|
@ -70,18 +70,6 @@ Cell *literal0;
|
|||
|
||||
extern Cell **fldtab;
|
||||
|
||||
static void
|
||||
setfree(Cell *vp)
|
||||
{
|
||||
if (&vp->sval == FS || &vp->sval == RS ||
|
||||
&vp->sval == OFS || &vp->sval == ORS ||
|
||||
&vp->sval == OFMT || &vp->sval == CONVFMT ||
|
||||
&vp->sval == FILENAME || &vp->sval == SUBSEP)
|
||||
vp->tval |= DONTFREE;
|
||||
else
|
||||
vp->tval &= ~DONTFREE;
|
||||
}
|
||||
|
||||
void syminit(void) /* initialize symbol table with builtin vars */
|
||||
{
|
||||
literal0 = setsymtab("0", "0", 0.0, NUM|STR|CON|DONTFREE, symtab);
|
||||
|
@ -320,7 +308,7 @@ Awkfloat setfval(Cell *vp, Awkfloat f) /* set float val of a Cell */
|
|||
} else if (&vp->fval == NF) {
|
||||
donerec = false; /* mark $0 invalid */
|
||||
setlastfld(f);
|
||||
DPRINTF("setting NF to %g\n", f);
|
||||
DPRINTF("setfval: setting NF to %g\n", f);
|
||||
} else if (isrec(vp)) {
|
||||
donefld = false; /* mark $1... invalid */
|
||||
donerec = true;
|
||||
|
@ -360,6 +348,10 @@ char *setsval(Cell *vp, const char *s) /* set string val of a Cell */
|
|||
(void*)vp, NN(vp->nval), s, vp->tval, donerec, donefld);
|
||||
if ((vp->tval & (NUM | STR)) == 0)
|
||||
funnyvar(vp, "assign to");
|
||||
if (CSV && (vp == rsloc))
|
||||
WARNING("danger: don't set RS when --csv is in effect");
|
||||
if (CSV && (vp == fsloc))
|
||||
WARNING("danger: don't set FS when --csv is in effect");
|
||||
if (isfld(vp)) {
|
||||
donerec = false; /* mark $0 invalid */
|
||||
fldno = atoi(vp->nval);
|
||||
|
@ -377,10 +369,9 @@ char *setsval(Cell *vp, const char *s) /* set string val of a Cell */
|
|||
t = s ? tostring(s) : tostring(""); /* in case it's self-assign */
|
||||
if (freeable(vp))
|
||||
xfree(vp->sval);
|
||||
vp->tval &= ~(NUM|CONVC|CONVO);
|
||||
vp->tval &= ~(NUM|DONTFREE|CONVC|CONVO);
|
||||
vp->tval |= STR;
|
||||
vp->fmt = NULL;
|
||||
setfree(vp);
|
||||
DPRINTF("setsval %p: %s = \"%s (%p) \", t=%o r,f=%d,%d\n",
|
||||
(void*)vp, NN(vp->nval), t, (void*)t, vp->tval, donerec, donefld);
|
||||
vp->sval = t;
|
||||
|
@ -388,7 +379,7 @@ char *setsval(Cell *vp, const char *s) /* set string val of a Cell */
|
|||
donerec = false; /* mark $0 invalid */
|
||||
f = getfval(vp);
|
||||
setlastfld(f);
|
||||
DPRINTF("setting NF to %g\n", f);
|
||||
DPRINTF("setsval: setting NF to %g\n", f);
|
||||
}
|
||||
|
||||
return(vp->sval);
|
||||
|
@ -576,7 +567,6 @@ Cell *catstr(Cell *a, Cell *b) /* concatenate a and b */
|
|||
|
||||
char *qstring(const char *is, int delim) /* collect string up to next delim */
|
||||
{
|
||||
const char *os = is;
|
||||
int c, n;
|
||||
const uschar *s = (const uschar *) is;
|
||||
uschar *buf, *bp;
|
||||
|
@ -585,7 +575,7 @@ char *qstring(const char *is, int delim) /* collect string up to next delim */
|
|||
FATAL( "out of space in qstring(%s)", s);
|
||||
for (bp = buf; (c = *s) != delim; s++) {
|
||||
if (c == '\n')
|
||||
SYNTAX( "newline in string %.20s...", os );
|
||||
SYNTAX( "newline in string %.20s...", is );
|
||||
else if (c != '\\')
|
||||
*bp++ = c;
|
||||
else { /* \something */
|
||||
|
|
Loading…
Reference in a new issue