Fork of http://sourceforge.net/p/pagc/code/HEAD/tree/branches/sew-refactor/postgresql (svn://svn.code.sf.net/p/pagc/code/branches/sew-refactor/postgresql ) at r361 (Stephen Woodbridge, Walter Sinclair contribution) - address_standardizer extension for PostgreSQL forked from PAGC address standardizer to work with PostgreSQL

git-svn-id: http://svn.osgeo.org/postgis/trunk@12716 b70326c6-7e19-0410-871a-916f4a2858ee
This commit is contained in:
Regina Obe 2014-07-03 02:14:31 +00:00
parent 5b79d69994
commit 1f64e92017
43 changed files with 63907 additions and 0 deletions

View file

@ -0,0 +1,12 @@
Copyright 2006-2013 Stephen Woodbridge.
Copyright (c) 2008 Walter Bruce Sinclair
woodbri@swoodbridge.com
woodbr@imaptools.com
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View file

@ -0,0 +1,122 @@
# Object files linked into the address_standardizer shared library.
OBJS = \
	address_parser.o \
	address_standardizer.o \
	std_pg_hash.o \
	analyze.o \
	err_param.o \
	export.o \
	gamma.o \
	hash.o \
	lexicon.o \
	pagc_tools.o \
	parseaddress-api.o \
	standard.o \
	tokenize.o

# Object files for the stand-alone CLI test harness (no PostgreSQL parts).
OBJS_test_main = \
	test_main.o \
	analyze.o \
	err_param.o \
	export.o \
	gamma.o \
	hash.o \
	lexicon.o \
	pagc_tools.o \
	standard.o \
	tokenize.o

# PGXS extension settings.
MODULE_big = address_standardizer
EXTENSION = address_standardizer
DATA_built = address_standardizer--1.0.sql us-lex.sql us-gaz.sql us-rules.sql
DOCS = README.address_standardizer
# NOTE(review): debug flags are enabled here (-g -O0); consider dropping
# them for release builds.
PG_CPPFLAGS = -g -O0
SHLIB_LINK = -lpcre
EXTRA_CLEAN = usps-st-city-name.txt mk-st-regexp mk-city-regex test_main

# Locate the PGXS build infrastructure via pg_config.
PG_CONFIG = pg_config
PGXS := $(shell $(PG_CONFIG) --pgxs)
PGVER := $(shell $(PG_CONFIG) --version)
include $(PGXS)

PERL := $(shell which perl)

# Substitute version-specific bits into the extension SQL script.
address_standardizer--1.0.sql: address_standardizer--1.0.sql.in
	$(PERL) mk-sql.pl '$(PGVER)' address_standardizer--1.0.sql.in > address_standardizer--1.0.sql

# Generate the lexicon / gazeteer / rules load scripts from the PAGC data files.
us-lex.sql: lexicon.csv
	$(PERL) pagc-data-psql lex lexicon.csv > us-lex.sql
us-gaz.sql: gazeteer.csv
	$(PERL) pagc-data-psql gaz gazeteer.csv > us-gaz.sql
us-rules.sql: rules.txt
	$(PERL) pagc-data-psql rules rules.txt > us-rules.sql

# Turn mk-st-regexp.pl into an executable script with a proper shebang.
mk-st-regexp: mk-st-regexp.pl
	$(PERL) -c mk-st-regexp.pl
	rm -f mk-st-regexp
	echo "#! " $(PERL) > mk-st-regexp
	cat mk-st-regexp.pl >> mk-st-regexp
	chmod ugo+x mk-st-regexp

# Turn mk-city-regex.pl into an executable script with a proper shebang.
mk-city-regex: mk-city-regex.pl usps-st-city-name.txt
	$(PERL) -c mk-city-regex.pl
	rm -f mk-city-regex
	echo "#! " $(PERL) > mk-city-regex
	cat mk-city-regex.pl >> mk-city-regex
	chmod ugo+x mk-city-regex

# Merge the original USPS city list with local additions (sorted, unique).
usps-st-city-name.txt: usps-st-city-orig.txt usps-st-city-adds.txt
	cat usps-st-city-orig.txt usps-st-city-adds.txt | sort -u >usps-st-city-name.txt

# Generated headers consumed by parseaddress-api.c.
parseaddress-stcities.h: mk-city-regex
	./mk-city-regex > parseaddress-stcities.h
parseaddress-regex.h: mk-st-regexp
	./mk-st-regexp > parseaddress-regex.h

# Remove all generated files (beyond what 'clean' + EXTRA_CLEAN covers).
dist-clean:
	rm -f mk-st-regexp mk-city-regex usps-st-city-name.txt parseaddress-stcities.h parseaddress-regex.h test_main

# The SQL-level tests must be run by hand against an installed extension.
test:
	@echo "To run the test on parse_address do the follow:"
	@echo "1. (make && sudo make install) to compile and install extension"
	@echo "2. create a database and install the address_standardizer extension"
	@echo "3. psql test_db -f test-parseaddress.sql"
	@echo "it should report '(0 rows)' if all tests passed or"
	@echo "report which ones failed."

# Stand-alone interactive CLI test program (built outside PGXS with gcc).
test_main: $(OBJS_test_main)
	gcc -o test_main $(OBJS_test_main) $(LDFLAGS) $(LIBS)

# Header dependencies for the object files.
test_main.o: test_main.c pagc_api.h pagc_std_api.h
address_parser.o: address_parser.c parseaddress-api.h
address_standardizer.o: address_standardizer.c std_pg_hash.h pagc_api.h pagc_std_api.h
analyze.o: analyze.c pagc_api.h
err_param.o: err_param.c pagc_api.h
export.o: export.c pagc_api.h pagc_tools.h
gamma.o: gamma.c pagc_api.h pagc_std_api.h gamma.h
hash.o: hash.c hash.h khash.h
lexicon.o: lexicon.c pagc_api.h pagc_std_api.h
pagc_tools.o: pagc_tools.c pagc_tools.h pagc_common.h
parseaddress-api.o: parseaddress-api.c parseaddress-api.h parseaddress-stcities.h parseaddress-regex.h
standard.o: standard.c pagc_api.h
tokenize.o: tokenize.c pagc_api.h
std_pg_hash.o: std_pg_hash.c std_pg_hash.h pagc_api.h pagc_std_api.h

View file

@ -0,0 +1,219 @@
This is a fork of the PAGC standardizer and a single line address parser.
The code is built into a single postgresql extension library.
Portions of this code belong to their respective contributors.
This code is released under an MIT-X license.
Copyright (c) 2006-2013 Stephen Woodbridge.
Copyright (c) 2008 Walter Bruce Sinclair
woodbri@swoodbridge.com
woodbr@imaptools.com
Also read files COPYING
-------------------------------------------------------------------------------
Makefile - PGXS makefile
mk-city-regex.pl - Perl script to create parseaddress-stcities.h
mk-st-regexp.pl - Perl script to create parseaddress-regex.h
README.address_standardizer - this file
COPYING - License file
usps-st-city-adds.txt - add local additions of ST<tab>CITY NAME
usps-st-city-orig.txt - Steve's extract of USPS city names
mk-city-regex - created by make
mk-st-regexp - created by make
usps-st-city-name.txt - created by make
from usps-st-city-orig.txt and usps-st-city-adds.txt
parseaddress-regex.h - created by make and mk-st-regexp
parseaddress-stcities.h - created by make and mk-city-regex
from usps-st-city-name.txt
-------------------------------------------------------------------------------
PREREQUISITES:
o Postgresql headers and PGXS tools
o Perl 5 and Perl module Regexp::Assemble which can be installed with:
sudo perl -MCPAN -e "install Regexp::Assemble"
o libpcre and headers
sudo apt-get install libpcre3-dev libpcre3
-------------------------------------------------------------------------------
Build and Install:
make
sudo make install
For postgresql 9.1+ this will install all the files needed for CREATE EXTENSION
createdb testdb
psql -c "create extension address_standardizer"
-------------------------------------------------------------------------------
How the parser works
The parser works from right to left looking first at the macro elements
for postcode, state/province, city, and then looks at micro elements to determine
if we are dealing with a house number street or intersection or landmark.
It currently does not look for a country code or name, but that could be
introduced in the future.
Country code
------------
Assumed to be US or CA based on:
postcode as US or Canada
state/province as US or Canada
else US
Postcode/zipcode
----------------
These are recognized using Perl compatible regular expressions.
These regexs are currently in the parseaddress-api.c and are relatively
simple to make changes to if needed.
State/province
--------------
These are recognized using Perl compatible regular expressions.
These regexs are currently in the parseaddress-api.c but could get moved
into includes in the future for easier maintenance.
City name
---------
This part is rather complicated and there are lots of issues around ambiguities
as to where to split a series of tokens when a token might belong to either
the city or the street name. The current strategy follows something like this:
1. if we have a state, then get the city regex for that state
2. if we can match that to the end of our remaining address string then
extract the city name and continue.
3. if we do not have a state or fail to match it then
cycle through a series of regex patterns that try to separate the city
from the street, stop and extract the city if we match
Number street name
1. check for a leading house number, and extract that
2. if there is an '@' then split the string on the '@' into street and
street2 else put the rest into street
------------------------------------------------------------------------------
Managing the regexes
The regexes are used to recognize US states and Canadian provinces
and USPS city names.
City regexes
------------
usps-st-city-orig.txt - this file contains all the acceptable USPS city
names by state. I periodically extract these from the
USPS and generate this file. I do NOT recommend
editing this file.
usps-st-city-adds.txt - this file you can add new definitions to if you need
them. The format of both these files is:
<StateAbbrev><tab><CityName>
These files are assembled into usps-st-city-name.txt which is compiled by a
perl script mk-city-regex.pl into parseaddress-stcities.h which is used to
lookup the city regex for a specific state or province.
As I mentioned above, if these fail to detect the city, then a secondary
strategy is deployed by cycling through a list of regex patterns. These
patterns and regexes are generated by mk-st-regexp.pl which creates the
parseaddress-regex.h include. This is a perl script so you can view and edit
it if that is needed.
I think that there might be some room for improvement in the area of coordinating
this process with PAGC's lexicon.csv and gazeteer.csv in the future.
----------------------------------------------------------------------------
Author: API: Stephen Woodbridge <woodbri@imaptools.com>
PAGC: Walter Sinclair
This is a first attempt at extracting the PAGC standardizer code into a
separate library. This was done so I could attempt to wrap it into a
postgresql stored procedure. (see the directory psql for that).
This code is a part of PAGC and is released under an MIT-X license.
Assumptions:
Linux
Sudo is installed and user has access to it.
PAGC does compile under Windows so you might get this to compile into a dll.
Build and installation:
Read the Makefile and change as appropriate.
make clean
make
make install
make test_main # build the CLI interactive test program
Author: Stephen Woodbridge <woodbri@imaptools.com>
These are postgresql wrappers for PAGC standardizer and address parser.
These are released under an MIT-X style license.
Assumptions:
Linux
sudo is installed and user has sudo access (see Makefile)
postgresql 8.3 (make changes in the Makefile to change)
Build and Installation:
make
make install
# create a new database using a postgis template
createdb -T template_postgis -E LATIN1 mydb
# add the stored procedures
psql mydb -f /path/to/standardize_address.sql
Install PAGC lexicon.csv, gazeteer.csv or rules.txt using a perl script.
./pagc-data-psql lex lexicon.csv | psql mydb
./pagc-data-psql gaz gazeteer.csv | psql mydb
./pagc-data-psql rules rules.txt | psql mydb
Now you should be able to test some queries like:
psql mydb
select * from parse_address('2099 university ave w, saint paul, mn, 55104-3431');
select * from parse_address('university ave w @ main st, saint paul, mn, 55104-3431');
select * from parse_address('385 Landgrove Rd Landgrove VT 05148');
-- "385";"Landgrove Rd";"";"385 Landgrove Rd";"Landgrove";"VT";"05148";"";"US"
select * from standardize_address(
'select seq, word::text, stdword::text, token from gaz union all select seq, word::text, stdword::text, token from lex ',
'select seq, word::text, stdword::text, token from gaz order by id',
'select * from rules order by id',
'select 0::int4 as id, ''1071 B Ave''::text as micro, ''Loxley, AL 36551''::text as macro');
select * from standardize_address(
'select seq, word::text, stdword::text, token from lex order by id',
'select seq, word::text, stdword::text, token from gaz order by id',
'select * from rules order by id',
'select 0::int4 as id, ''116 commonwealth ave apt a''::text as micro, ''west concord, ma 01742''::text as macro');
\q

View file

@ -0,0 +1,120 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "postgres.h"
#include "funcapi.h"
#include "catalog/pg_type.h"
#include "fmgr.h"
#include "parseaddress-api.h"
#include <pcre.h>
#include <string.h>
#undef DEBUG
//#define DEBUG 1
#ifdef DEBUG
#define DBG(format, arg...) \
elog(NOTICE, format , ## arg)
#else
#define DBG(format, arg...) do { ; } while (0)
#endif
Datum parse_address(PG_FUNCTION_ARGS);
/* Copy a PostgreSQL text datum into a freshly palloc'd, NUL-terminated
   C string.  Memory is released with the surrounding memory context. */
static char *text2char(text *in)
{
	size_t len = VARSIZE(in) - VARHDRSZ;
	char *out = palloc(len + 1);

	memcpy(out, VARDATA(in), len);
	out[len] = '\0';
	return out;
}
PG_FUNCTION_INFO_V1(parse_address);

/*
 * parse_address(text) - SQL-callable wrapper around the single-line
 * address parser.  Takes one free-form address string and returns a
 * record with 9 text columns, in this order:
 *   num, street, street2, address1, city, state, zip, zipplus, country
 * (must match the OUT parameters declared in the extension SQL script).
 */
Datum parse_address(PG_FUNCTION_ARGS)
{
    TupleDesc tupdesc;
    AttInMetadata *attinmeta;
    Datum result;
    ADDRESS *paddr;       /* parsed address pieces from parseaddress() */
    HHash *stH;           /* state/province lookup hash */
    char *str;
    char **values;
    int err;
    HeapTuple tuple;

    DBG("Start standardize_address");

    /* copy the argument into a NUL-terminated C string */
    str = text2char(PG_GETARG_TEXT_P(0));

    DBG("str='%s'", str);

    /* caller must be able to accept a composite (record) result */
    if (get_call_result_type( fcinfo, NULL, &tupdesc ) != TYPEFUNC_COMPOSITE ) {
        elog(ERROR, "function returning record called in context"
            " that cannot accept type record");
        return -1;  /* not reached: elog(ERROR) does not return */
    }
    BlessTupleDesc(tupdesc);
    attinmeta = TupleDescGetAttInMetadata(tupdesc);

    DBG("Got tupdesc, allocating HHash");

    stH = (HHash *) palloc0(sizeof(HHash));
    if (!stH) {
        /* palloc0 elogs on OOM, so this branch is defensive only */
        elog(ERROR, "parse_address: Failed to allocate memory for hash!");
        return -1;  /* not reached */
    }

    DBG("going to load_state_hash");

    /* populate the state/province hash used to split the macro part */
    err = load_state_hash(stH);
    if (err) {
        DBG("got err=%d from load_state_hash().", err);
#ifdef USE_HSEARCH
        DBG("calling hdestroy_r(stH).");
        hdestroy_r(stH);
#endif
        elog(ERROR, "parse_address: load_state_hash() failed(%d)!", err);
        return -1;  /* not reached */
    }

    DBG("calling parseaddress()");
    paddr = parseaddress(stH, str, &err);
    if (!paddr) {
        elog(ERROR, "parse_address: parseaddress() failed!");
        return -1;  /* not reached */
    }

    /* 9 entries: one per OUT column of the SQL function */
    DBG("setup values array for natts=%d", tupdesc->natts);
    values = (char **) palloc(9 * sizeof(char *));
    if (!values) {
        elog(ERROR, "parse_address: out of memory!");
        return -1;  /* not reached */
    }
    values[0] = paddr->num;
    values[1] = paddr->street;
    values[2] = paddr->street2;
    values[3] = paddr->address1;
    values[4] = paddr->city;
    values[5] = paddr->st;
    values[6] = paddr->zip;
    values[7] = paddr->zipplus;
    values[8] = paddr->cc;

    DBG("calling heap_form_tuple");
    tuple = BuildTupleFromCStrings(attinmeta, values);

    /* make the tuple into a datum */
    DBG("calling HeapTupleGetDatum");
    result = HeapTupleGetDatum(tuple);

    /* clean up (this is not really necessary: palloc'd memory is freed
       with the memory context) */
    DBG("freeing values, hash, and paddr");
    free_state_hash(stH);

    DBG("returning parsed address result");
    return result;
}

View file

@ -0,0 +1,61 @@
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION address_standardizer" to load this file. \quit
---------------------------------------------------------------------
-- Core function to access the PAGC address standardizer
-- Author: Stephen Woodbridge <woodbri@imaptools.com>
---------------------------------------------------------------------
-- Drop any pre-existing definition so the type can be (re)created cleanly.
DROP TYPE IF EXISTS stdaddr;

-- Composite result type for standardize_address(): one text column per
-- standardized postal attribute produced by the PAGC standardizer.
CREATE TYPE stdaddr AS (
    building text,
    house_num text,
    predir text,
    qual text,
    pretype text,
    name text,
    suftype text,
    sufdir text,
    ruralroute text,
    extra text,
    city text,
    state text,
    country text,
    postcode text,
    box text,
    unit text
);

-- standardize_address(lextab, gaztab, rultab, micro, macro)
-- lextab/gaztab/rultab name the tables (or views) holding the lexicon,
-- gazeteer and rules; micro is the street part (e.g. '123 main st') and
-- macro the city/state/postcode part (e.g. 'boston ma 01002').
CREATE OR REPLACE FUNCTION standardize_address(
    lextab text,
    gaztab text,
    rultab text,
    micro text,
    macro text )
    RETURNS SETOF stdaddr
    AS '$libdir/address_standardizer', 'standardize_address'
    LANGUAGE 'c' IMMUTABLE STRICT;

-- Single-string variant: the C code (standardize_address1) first splits
-- the address into micro/macro parts, then standardizes.
CREATE OR REPLACE FUNCTION standardize_address(
    lextab text,
    gaztab text,
    rultab text,
    address text )
    RETURNS SETOF stdaddr
    AS '$libdir/address_standardizer', 'standardize_address1'
    LANGUAGE 'c' IMMUTABLE STRICT;

-- parse_address(text): split a single-line address into its components
-- without standardizing the words.
CREATE OR REPLACE FUNCTION parse_address(IN text,
    OUT num text,
    OUT street text,
    OUT street2 text,
    OUT address1 text,
    OUT city text,
    OUT state text,
    OUT zip text,
    OUT zipplus text,
    OUT country text)
    RETURNS record
    AS '$libdir/address_standardizer', 'parse_address'
    LANGUAGE 'c' IMMUTABLE;

View file

@ -0,0 +1,284 @@
#include "postgres.h"
#include "funcapi.h"
#include "catalog/pg_type.h"
#include "fmgr.h"
#undef DEBUG
//#define DEBUG 1
#include "pagc_api.h"
#include "pagc_std_api.h"
#include "std_pg_hash.h"
#include "parseaddress-api.h"
#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif
Datum standardize_address(PG_FUNCTION_ARGS);
Datum standardize_address1(PG_FUNCTION_ARGS);
/* Convert a PostgreSQL text datum to a NUL-terminated C string in
   palloc'd storage (freed automatically with the memory context). */
static char *text2char(text *in)
{
	int nbytes = VARSIZE(in) - VARHDRSZ;
	char *result = palloc(nbytes + 1);

	memcpy(result, VARDATA(in), nbytes);
	result[nbytes] = '\0';
	return result;
}
/*
* The signature for standardize_address follows. The lextab, gaztab and
* rultab should not change once the reference has been standardized and
* the same tables must be used for a geocode request as were used on the
* reference set or the matching will get degraded.
*
* select * from standardize_address(
* lextab text, -- name of table or view
* gaztab text, -- name of table or view
* rultab text, -- name of table or view
* micro text, -- '123 main st'
* macro text); -- 'boston ma 01002'
*
* If you want to standardize a whole table then call it like:
*
* insert into stdaddr (...)
* select (std).* from (
* select standardize_address(
* 'lextab', 'gaztab', 'rultab', micro, macro) as std
* from table_to_standardize) as foo;
*
* The structure of the lextab and gaztab tables of views must be:
*
* seq int4
* word text
* stdword text
* token int4
*
* the rultab table or view must have columns:
*
* rule text
*/
PG_FUNCTION_INFO_V1(standardize_address);

/*
 * standardize_address(lextab, gaztab, rultab, micro, macro)
 *
 * SQL-callable entry point: standardizes the address given as separate
 * micro ('123 main st') and macro ('boston ma 01002') strings, using
 * the named lexicon/gazeteer/rules tables.  Returns one stdaddr row
 * (16 text columns); column order must match the CREATE TYPE stdaddr.
 */
Datum standardize_address(PG_FUNCTION_ARGS)
{
    TupleDesc tuple_desc;
    AttInMetadata *attinmeta;
    STANDARDIZER *std;    /* cached standardizer object, see std_pg_hash */
    char *lextab;
    char *gaztab;
    char *rultab;
    char *micro;
    char *macro;
    Datum result;
    STDADDR *stdaddr;     /* standardized fields returned by PAGC */
    char **values;
    int k;
    HeapTuple tuple;

    DBG("Start standardize_address");

    /* copy the five text arguments into NUL-terminated C strings */
    lextab = text2char(PG_GETARG_TEXT_P(0));
    gaztab = text2char(PG_GETARG_TEXT_P(1));
    rultab = text2char(PG_GETARG_TEXT_P(2));
    micro = text2char(PG_GETARG_TEXT_P(3));
    macro = text2char(PG_GETARG_TEXT_P(4));

    DBG("calling RelationNameGetTupleDesc");
    /* caller must be able to accept a composite (record) result */
    if (get_call_result_type( fcinfo, NULL, &tuple_desc ) != TYPEFUNC_COMPOSITE ) {
        elog(ERROR, "standardize_address() was called in a way that cannot accept record as a result");
    }
    BlessTupleDesc(tuple_desc);
    attinmeta = TupleDescGetAttInMetadata(tuple_desc);

    /* fetch (or build and cache) the standardizer for these tables */
    DBG("calling GetStdUsingFCInfo(fcinfo, '%s', '%s', '%s')", lextab, gaztab, rultab);
    std = GetStdUsingFCInfo(fcinfo, lextab, gaztab, rultab);
    if (!std)
        elog(ERROR, "standardize_address() failed to create the address standardizer object!");

    DBG("calling std_standardize_mm('%s', '%s')", micro, macro);
    stdaddr = std_standardize_mm( std, micro, macro, 0 );

    DBG("back from fetch_stdaddr");

    /* 16 slots: one per column of the stdaddr composite type */
    values = (char **) palloc(16 * sizeof(char *));
    for (k=0; k<16; k++) {
        values[k] = NULL;
    }
    DBG("setup values array for natts=%d", tuple_desc->natts);
    if (stdaddr) {
        /* pstrdup each field so the tuple owns palloc'd copies */
        values[0] = stdaddr->building ? pstrdup(stdaddr->building) : NULL;
        values[1] = stdaddr->house_num ? pstrdup(stdaddr->house_num) : NULL;
        values[2] = stdaddr->predir ? pstrdup(stdaddr->predir) : NULL;
        values[3] = stdaddr->qual ? pstrdup(stdaddr->qual) : NULL;
        values[4] = stdaddr->pretype ? pstrdup(stdaddr->pretype) : NULL;
        values[5] = stdaddr->name ? pstrdup(stdaddr->name) : NULL;
        values[6] = stdaddr->suftype ? pstrdup(stdaddr->suftype) : NULL;
        values[7] = stdaddr->sufdir ? pstrdup(stdaddr->sufdir) : NULL;
        values[8] = stdaddr->ruralroute ? pstrdup(stdaddr->ruralroute) : NULL;
        values[9] = stdaddr->extra ? pstrdup(stdaddr->extra) : NULL;
        values[10] = stdaddr->city ? pstrdup(stdaddr->city) : NULL;
        values[11] = stdaddr->state ? pstrdup(stdaddr->state) : NULL;
        values[12] = stdaddr->country ? pstrdup(stdaddr->country) : NULL;
        values[13] = stdaddr->postcode ? pstrdup(stdaddr->postcode) : NULL;
        values[14] = stdaddr->box ? pstrdup(stdaddr->box) : NULL;
        values[15] = stdaddr->unit ? pstrdup(stdaddr->unit) : NULL;
    }

    DBG("calling heap_form_tuple");
    tuple = BuildTupleFromCStrings(attinmeta, values);

    /* make the tuple into a datum */
    DBG("calling HeapTupleGetDatum");
    result = HeapTupleGetDatum(tuple);

    /* clean up (this is not really necessary: palloc'd memory is freed
       with the memory context) */
    DBG("freeing values, nulls, and stdaddr");
    stdaddr_free(stdaddr);

    DBG("returning standardized result");
    PG_RETURN_DATUM(result);
}
PG_FUNCTION_INFO_V1(standardize_address1);

/*
 * standardize_address1(lextab, gaztab, rultab, address)
 *
 * Single-string variant of standardize_address(): the input address is
 * first split into micro/macro parts with the single-line parser, then
 * handed to the PAGC standardizer.  Returns one stdaddr row (16 text
 * columns).  Raises an error when the address parses to an intersection
 * (street2 present) or cannot be split into components.
 */
Datum standardize_address1(PG_FUNCTION_ARGS)
{
    TupleDesc tuple_desc;
    AttInMetadata *attinmeta;
    STANDARDIZER *std;    /* cached standardizer object, see std_pg_hash */
    char *lextab;
    char *gaztab;
    char *rultab;
    char *addr;
    char *micro;
    char *macro;
    Datum result;
    STDADDR *stdaddr;     /* standardized fields returned by PAGC */
    char **values;
    int k;
    HeapTuple tuple;
    ADDRESS *paddr;       /* parsed address pieces from parseaddress() */
    HHash *stH;           /* state/province lookup hash */
    int err;

    DBG("Start standardize_address");

    /* copy the four text arguments into NUL-terminated C strings */
    lextab = text2char(PG_GETARG_TEXT_P(0));
    gaztab = text2char(PG_GETARG_TEXT_P(1));
    rultab = text2char(PG_GETARG_TEXT_P(2));
    addr = text2char(PG_GETARG_TEXT_P(3));

    DBG("calling RelationNameGetTupleDesc");
    /* caller must be able to accept a composite (record) result */
    if (get_call_result_type( fcinfo, NULL, &tuple_desc ) != TYPEFUNC_COMPOSITE ) {
        elog(ERROR, "standardize_address() was called in a way that cannot accept record as a result");
    }
    BlessTupleDesc(tuple_desc);
    attinmeta = TupleDescGetAttInMetadata(tuple_desc);

    DBG("Got tupdesc, allocating HHash");
    stH = (HHash *) palloc0(sizeof(HHash));
    if (!stH) {
        /* palloc0 elogs on OOM, so this branch is defensive only */
        elog(ERROR, "standardize_address: Failed to allocate memory for hash!");
        return -1;  /* not reached: elog(ERROR) does not return */
    }

    DBG("going to load_state_hash");
    err = load_state_hash(stH);
    if (err) {
        DBG("got err=%d from load_state_hash().", err);
#ifdef USE_HSEARCH
        DBG("calling hdestroy_r(stH).");
        hdestroy_r(stH);
#endif
        elog(ERROR, "standardize_address: load_state_hash() failed(%d)!", err);
        return -1;  /* not reached */
    }

    DBG("calling parseaddress()");
    paddr = parseaddress(stH, addr, &err);
    if (!paddr) {
        /* BUGFIX: this message used to say "parse_address:", which
           misattributed the failure to the wrong SQL function. */
        elog(ERROR, "standardize_address: parseaddress() failed!");
        return -1;  /* not reached */
    }

    /* check for errors and compute length of macro string */
    if (paddr->street2)
        elog(ERROR, "standardize_address() can not be passed an intersection.");
    if (! paddr->address1)
        elog(ERROR, "standardize_address() could not parse the address into components.");

    k = 1;
    if (paddr->city) k += strlen(paddr->city) + 1;
    if (paddr->st) k += strlen(paddr->st) + 1;
    if (paddr->zip) k += strlen(paddr->zip) + 1;
    if (paddr->cc) k += strlen(paddr->cc) + 1;

    /* create micro and macro from paddr; macro becomes "city,st,zip,cc," */
    micro = pstrdup(paddr->address1);
    macro = (char *) palloc(k * sizeof(char));
    *macro = '\0';
    if (paddr->city) { strcat(macro, paddr->city); strcat(macro, ","); }
    if (paddr->st ) { strcat(macro, paddr->st ); strcat(macro, ","); }
    if (paddr->zip ) { strcat(macro, paddr->zip ); strcat(macro, ","); }
    if (paddr->cc ) { strcat(macro, paddr->cc ); strcat(macro, ","); }

    /* fetch (or build and cache) the standardizer for these tables */
    DBG("calling GetStdUsingFCInfo(fcinfo, '%s', '%s', '%s')", lextab, gaztab, rultab);
    std = GetStdUsingFCInfo(fcinfo, lextab, gaztab, rultab);
    if (!std)
        elog(ERROR, "standardize_address() failed to create the address standardizer object!");

    DBG("calling std_standardize_mm('%s', '%s')", micro, macro);
    stdaddr = std_standardize_mm( std, micro, macro, 0 );

    DBG("back from fetch_stdaddr");

    /* 16 slots: one per column of the stdaddr composite type */
    values = (char **) palloc(16 * sizeof(char *));
    for (k=0; k<16; k++) {
        values[k] = NULL;
    }
    DBG("setup values array for natts=%d", tuple_desc->natts);
    if (stdaddr) {
        /* pstrdup each field so the tuple owns palloc'd copies */
        values[0] = stdaddr->building ? pstrdup(stdaddr->building) : NULL;
        values[1] = stdaddr->house_num ? pstrdup(stdaddr->house_num) : NULL;
        values[2] = stdaddr->predir ? pstrdup(stdaddr->predir) : NULL;
        values[3] = stdaddr->qual ? pstrdup(stdaddr->qual) : NULL;
        values[4] = stdaddr->pretype ? pstrdup(stdaddr->pretype) : NULL;
        values[5] = stdaddr->name ? pstrdup(stdaddr->name) : NULL;
        values[6] = stdaddr->suftype ? pstrdup(stdaddr->suftype) : NULL;
        values[7] = stdaddr->sufdir ? pstrdup(stdaddr->sufdir) : NULL;
        values[8] = stdaddr->ruralroute ? pstrdup(stdaddr->ruralroute) : NULL;
        values[9] = stdaddr->extra ? pstrdup(stdaddr->extra) : NULL;
        values[10] = stdaddr->city ? pstrdup(stdaddr->city) : NULL;
        values[11] = stdaddr->state ? pstrdup(stdaddr->state) : NULL;
        values[12] = stdaddr->country ? pstrdup(stdaddr->country) : NULL;
        values[13] = stdaddr->postcode ? pstrdup(stdaddr->postcode) : NULL;
        values[14] = stdaddr->box ? pstrdup(stdaddr->box) : NULL;
        values[15] = stdaddr->unit ? pstrdup(stdaddr->unit) : NULL;
    }

    DBG("calling heap_form_tuple");
    tuple = BuildTupleFromCStrings(attinmeta, values);

    /* make the tuple into a datum */
    DBG("calling HeapTupleGetDatum");
    result = HeapTupleGetDatum(tuple);

    /* clean up (not strictly necessary: palloc'd memory goes with the
       memory context) */
    DBG("freeing values, nulls, and stdaddr");
    stdaddr_free(stdaddr);
    DBG("freeing values, hash, and paddr");
    free_state_hash(stH);

    DBG("returning standardized result");
    PG_RETURN_DATUM(result);
}

View file

@ -0,0 +1,5 @@
# address_standardizer extension
comment = 'PAGC address standardizer and single-line address parser'
default_version = '1.0'
encoding = 'LATIN1'
relocatable = true

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,278 @@
/* -- err_param.c
This file handles the buffering and output of errors
Prototype 7H08 (This file was written by Walter Sinclair).
Copyright (c) 2009 Walter Bruce Sinclair
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/* For pagc-0.4.0 : last revised 2010-11-01 */
#undef DEBUG
//#define DEBUG
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "pagc_api.h"
/* file-local helpers defined later in this file */
static FILE *open_error_log ( const char *, DS_Handle , ERR_PARAM * ) ;
static int turn_off_error_log( ERR_PARAM * ) ;

/* PRINT_ERROR : route a printf-style message through the DBG macro
   (defined in pagc_api.h). */
#define PRINT_ERROR( TEMP , MSG ) \
DBG( TEMP, MSG ) ;

/* RESET_ERR_P : reset the error ring buffer to its empty state.
   NOTE: expands in-place and requires local variables err_p
   (ERR_PARAM *) and err_mem (ERR_REC *) at the expansion site. */
#define RESET_ERR_P \
err_p -> first_err = 0 ; \
err_p -> last_err = 0 ; \
err_p -> next_fatal = TRUE ; \
err_mem = err_p -> err_array ; \
err_p -> error_buf = err_mem -> content_buf ; \
err_mem -> is_fatal = TRUE ; \
BLANK_STRING( err_mem -> content_buf )
/* ------------------------------------------------------------
   err_param.c (init_errors) :
   calls : err_param.c (open_error_log), stdlib.h (malloc, free) ,
   stdio.h (fprintf, fflush) stdlib.h (malloc,free)
   --------------------------------------------------------------*/
/* Allocate and initialize an ERR_PARAM error collector.  If log_name is
   non-NULL errors are written straight to that log file; otherwise they
   are buffered in err_array for later retrieval via empty_errors().
   Returns NULL on allocation or log-open failure. */
ERR_PARAM *init_errors( PAGC_GLOBAL *pagc_glo_p ,
                        const char *log_name ) {
    ERR_PARAM *err_p ;
    ERR_REC *err_mem ;   /* required by the RESET_ERR_P macro expansion */

    err_p = ( ERR_PARAM * ) malloc( sizeof( ERR_PARAM ) ) ;
    if ( err_p == NULL ) {
#ifndef NO_STDERR_OUTPUT
        PRINT_ERROR( "%s\n" ,
                     "FATAL ERROR : Could not allocate memory for pagc_init_errors" ) ;
#endif
        return NULL ;
    }
    /* -- set up first record -- */
    RESET_ERR_P ;
    /* -- a null log_name means we don't log , but collect -- */
    if ( log_name == NULL ) {
        err_p -> stream = NULL ;
    }
    else {
        err_p -> stream = open_error_log( log_name ,
                                          pagc_glo_p -> _file_sys ,
                                          err_p ) ;
        if ( err_p -> stream == NULL ) {
            FREE_AND_NULL( err_p ) ;
#ifndef NO_STDERR_OUTPUT
            PRINT_ERROR( "Could not create error log for pathname: %s\n" ,
                         log_name ) ;
#endif
            return NULL ;
        }
    }
    return err_p ;
}
/* ------------------------------------------------------------
   err_param.c (close_errors)
   uses macros BLANK_STRING, FREE_AND_NULL, and PRINT_ERROR
   --------------------------------------------------------------*/
/* Drain all buffered errors (printing each unless NO_STDERR_OUTPUT is
   defined) and free the collector.  Safe to call with NULL.
   FIX: the original interleaved the #ifndef with the if/else braces,
   leaving an empty-bodied `if` with an orphaned else when
   NO_STDERR_OUTPUT was defined; the conditional now wraps the whole
   if/else.  Behavior is unchanged in both configurations. */
void close_errors( ERR_PARAM *err_p ) {
    int is_fatal_error ;
    char err_out_buf[ MAXSTRLEN ] ;

    if ( err_p == NULL ) {
        return ;
    }
    BLANK_STRING( err_out_buf ) ;
    /* -- read each error into the buffer and then
       output it as a single line -- */
    while ( empty_errors( err_p ,
                          &is_fatal_error ,
                          err_out_buf ) ) {
#ifndef NO_STDERR_OUTPUT
        if ( is_fatal_error ) {
            PRINT_ERROR( "ERROR: %s\n" ,
                         err_out_buf ) ;
        } else {
            PRINT_ERROR( "%s\n" ,
                         err_out_buf ) ;
        }
#endif
        BLANK_STRING( err_out_buf ) ;
    }
    FREE_AND_NULL( err_p ) ;
}
/* ------------------------------------------------------------
   err_param.c (turn_off_error_log)
   called by err_param.c (empty_errors)
   stdio.h (fclose)
   --------------------------------------------------------------*/
/* Close the log stream (if any) and reset the in-memory error ring.
   Returns TRUE if a stream was actually closed, FALSE otherwise. */
static int turn_off_error_log( ERR_PARAM *err_p ) {
    ERR_REC *err_mem ;   /* required by the RESET_ERR_P macro expansion */

    if ( ( err_p == NULL ) ||
         ( err_p -> stream == NULL ) ) {
        return FALSE ;
    }
    fclose( err_p -> stream ) ;
    err_p -> stream = NULL ;
    RESET_ERR_P ;
    return TRUE ;
}
/* ----------------------------------------------------------
   err_param.c (empty_errors)
   calls : err_param.c (turn_off_error_log)
   returns FALSE when all errors have been reported.
   TRUE otherwise
   ------------------------------------------------------------*/
/* Pop the oldest buffered error: append its text to err_dest (bounded
   by MAXSTRLEN) and report via *is_fatal whether it was fatal.  If the
   collector was logging to a file instead of buffering, the log is
   closed and FALSE is returned immediately. */
int empty_errors( ERR_PARAM *err_p ,
                  int *is_fatal ,
                  char *err_dest ) {
    ERR_REC *err_mem ;   /* also required by the RESET_ERR_P macro */

    if ( err_p == NULL ) {
        return FALSE ;
    }
    if ( err_p -> first_err >= err_p -> last_err ) {
        /* -- reset the counters -- */
        RESET_ERR_P ;
        return FALSE ; /* -- indicate empty -- */
    }
    /* -- if logging, turn it off and indicate empty -- */
    if ( turn_off_error_log( err_p ) ) {
        return FALSE ;
    }
    /* -- output the current lowest record -- */
    err_mem = err_p -> err_array + err_p -> first_err ;
    append_string_to_max( err_dest ,
                          err_mem -> content_buf ,
                          MAXSTRLEN ) ;
    *is_fatal = err_mem -> is_fatal ;
    /* -- update the low mark -- */
    err_p -> first_err ++ ;
    return TRUE ; /* indicate error there */
}
/* ------------------------------------------------
   err_param.c (open_error_log) :
   called by init_errors
   calls : stdlib.h (free) stdio.h (fopen)
   uses macros OPEN_ALLOCATED_NAME, FREE_AND_NULL
   --------------------------------------------------- */
/* Open (truncating) the error log named by client_log_name and return
   its stream, or NULL on failure / when built as BUILD_API.
   FIX: alloc_log_name and error_file are now initialized to NULL; the
   original left both uninitialized when client_log_name was NULL, so
   the subsequent free and return read indeterminate pointers
   (undefined behavior). */
static FILE *open_error_log( const char *client_log_name ,
                             DS_Handle _file_sys_p ,
                             ERR_PARAM *err_p ) {
#ifdef BUILD_API
    return NULL;
#else
    char *alloc_log_name = NULL ;
    FILE *error_file = NULL ;

    if ( client_log_name != NULL ) {
        /* -- will overwrite previous log in same location -- */
        OPEN_ALLOCATED_NAME(alloc_log_name,"err",error_file,client_log_name,"wb+",_file_sys_p,err_p,NULL) ;
    }
    FREE_AND_NULL( alloc_log_name ) ;
    return error_file ;
#endif
}
/* -----------------------------------------------------------
   err_param.c (register_error)
   called after the error is written to the error_buf
   stdlib.h (malloc) stdio.h (fprintf,fflush) string.h (strcpy)
   ------------------------------------------------------------ */
/* Commit the message currently sitting in err_p->error_buf: write it
   straight to the log stream when logging, otherwise append it to the
   in-memory ring of error records (discarding the oldest record when
   the ring is full).  Resets error_buf ready for the next message. */
void register_error( ERR_PARAM *err_p ) {
    int i ;
    ERR_REC *err_mem ;

    /* -- check if there is anything in the error_buf -- */
    if ( err_p -> error_buf[ 0 ] == SENTINEL ) {
        return ;
    }
    /* NOTE(review): content_buf is presumably MAXSTRLEN bytes, so a
       message longer than MAXSTRLEN would already have overrun it by
       the time this check runs -- confirm the buffer size in
       pagc_api.h. */
    if ( strlen( err_p -> error_buf ) > MAXSTRLEN ) {
#ifndef NO_STDERR_OUTPUT
        PRINT_ERROR( "Error message %s is too long" ,
                     err_p -> error_buf ) ;
#endif
        return ;
    }
    /* -- print it out immediately, if we're logging -- */
    if ( err_p -> stream != NULL ) {
        fprintf( err_p -> stream ,
                 "%s\n" ,
                 err_p -> error_buf ) ;
        fflush( err_p -> stream ) ;
        /* -- set up for next error -- */
        BLANK_STRING( err_p -> error_buf ) ;
        return ;
    }
    /* -- update the current error record -- */
    err_mem = err_p -> err_array + err_p -> last_err ;
    err_mem -> is_fatal = err_p -> next_fatal ;
    if ( err_p -> last_err == ( MAX_ERRORS - 1 ) ) {
#ifndef NO_STDERR_OUTPUT
        PRINT_ERROR( "%s is too many errors - losing old ones" ,
                     err_p -> error_buf ) ;
#endif
        /* -- move the whole array down a slot to make room for
           the next error. The first in the array disappears -- */
        for ( i = err_p -> first_err ;
              i < err_p -> last_err ;
              i++ ) {
            err_p -> err_array[ i ] . is_fatal = err_p -> err_array[ i + 1 ] . is_fatal ;
            strcpy( err_p -> err_array[ i ] . content_buf ,
                    err_p -> err_array[ i + 1 ] . content_buf ) ;
        }
    } else {
        /* -- last_err points to the next one to fill -- */
        err_p -> last_err ++ ;
        err_mem = err_p -> err_array + err_p -> last_err ;
    }
    /* -- reset error_buf to the new content_buf -- */
    err_p -> error_buf = err_mem -> content_buf ;
    BLANK_STRING( err_mem -> content_buf ) ;
    err_p -> next_fatal = TRUE ;
    return ;
}
/*==========================================
2006-11-02 add new arg
===========================================*/
/* -- Forward the standardized output fields to the error log stream,
      unformatted (SCREEN layout, non-landmark). -- */
void send_fields_to_error( ERR_PARAM *err_p , char **s_fields )
{
    FILE *log_stream = err_p -> stream ;
    send_fields_to_stream( s_fields , log_stream , SCREEN , FALSE ) ;
}

View file

@ -0,0 +1,432 @@
/* -- export.c
This file contains the routines for extracting the sequence of
postal attributes and definitions produced by the standardizer
into strings of text (in __standard_fields__).
Prototype 7H08 (This file was written by Walter Sinclair).
Copyright (c) 2009 Walter Bruce Sinclair
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/* For pagc-0.4.0 : last revised 2009-10-03 */
#include <stdio.h>
#include <string.h>
#include <stddef.h>
#include "pagc_api.h"
#include "pagc_tools.h"
#define ORDER_DISPLACEMENT 2
/* -- local prototypes -- */
static void _copy_standard_( STAND_PARAM * , SYMB , int , int ) ;
static void _scan_target_( STAND_PARAM * , SYMB , int ) ;
static char *_get_standard_( STAND_PARAM * , int , int ) ;
static char *_get_definition_text_( STAND_PARAM * , int ) ;
//#ifndef BUILD_API
/* -- local storage -- */
/* -- Per-field output decorations, indexed [field][opt], where opt is
      the format option passed to send_fields_to_stream: column 0 is
      the XML tag, column 1 the CSV quoting, column 2 the screen label.
      Row order appears to follow the output field constants (BLDNG
      through UNIT) -- confirm against pagc_api.h. -- */
static const char *__field_start_tag__[][3] = {
{ " <Build>", "\"", "Building: " },
{ " <Civic>", "\"", "House Address: " },
{ " <PreDir>", "\"", "Prefix Direction: " },
{ " <Qualif>", "\"", "Qualifier: " },
{ " <PreTyp>", "\"", "Prefix Type: " },
{ " <Street>", "\"", "Street Name: " },
{ " <SufTyp>", "\"", "Suffix Type: " },
{ " <SufDir>", "\"", "Suffix Direction: " },
{ " <Rural>", "\"", "Rural Route: " },
{ " <Extra>", "\"", "Additional Info: " },
{ " <City>", "\"", "Municipal: " },
{ " <Prov>", "\"", "Province/State: " },
{ " <Nation>", "\"", "Country: " },
{ " <Postal>", "\"", "Postal/Zip Code: " },
{ " <Box>", "\"", "Box: " },
{ " <Unit>", "\"", "Unit: " }
} ;
/* -- Landmark variants used for the FEATNAME/FEATTYPE/FEATAREA fields
      (rows 0..2), same column meaning as above -- */
static const char *__land_field_start_tag__[][3] = {
{ "<FeatureName>", "\"", "FeatureName " },
{ "<FeatureType>", "\"", "FeatureType " },
{ "<FeatureArea>", "\"", "FeatureArea " }
} ;
static const char *__land_field_tag_end__[][3] = {
{ "</FeatureName>\n", "\",", "\n" },
{ "</FeatureType>\n", "\",", "\n" },
{ "</FeatureArea>\n", "\",", "\n" }
} ;
/* -- Closing decorations, row/column layout parallel to
      __field_start_tag__ -- */
static const char *__field_tag_end__[][3] = {
{ "</Build>\n", "\",", "\n" },
{ "</Civic>\n", "\",", "\n" },
{ "</PreDir>\n", "\",", "\n" },
{ "</Qualif>\n", "\",", "\n" },
{ "</PreTyp>\n", "\",", "\n" },
{ "</Street>\n", "\",", "\n" },
{ "</SufTyp>\n", "\",", "\n" },
{ "</SufDir>\n", "\",", "\n" },
{ "</Rural>\n", "\",", "\n" },
{ "</Extra>\n", "\",", "\n" },
{ "</City>\n", "\",", "\n" },
{ "</Prov>\n", "\",", "\n" },
{ "</Nation>\n", "\",", "\n" },
{ "</Postal>\n", "\",", "\n" },
{ "</Box>\n", "\",", "\n" },
{ "</Unit>\n", "\",", "\n" }
} ;
/* -- Record wrappers, indexed by the same opt -- */
static const char *__record_start_tag__[ ] = {
" <address>\n" , "\n", "\n"
} ;
static const char *__landmark_record_start_tag__[ ] = {
" <landmark>\n" , "\n", "\n"
} ;
static const char *__record_end_tag__[ ] = {
" </address>\n", "\n", "\n"
} ;
static const char *__landmark_record_end_tag__[ ] = {
" </landmark>\n" , "\n", "\n"
} ;
//#endif
/* -- FAIL-terminated symbol list used by _get_standard_ to test whether
      a definition also carries an ordinal (ORD) reading -- */
static SYMB __ord_list__[] = { ORD, FAIL } ;
/*----------------------------------------------------------------
export.c (init_output_fields)
----------------------------------------------------------------*/
/* -- Blank the requested group of output fields by writing SENTINEL
      into the first byte of each.  BOTH clears everything, RIGHT
      clears only the macro (locality) fields, anything else clears
      only the micro (street-level plus box/unit) fields. -- */
void init_output_fields( STAND_PARAM *__stand_param__ , int which_fields )
{
    int fld ;
    char **out_fields = __stand_param__->standard_fields ;
    if ( which_fields == BOTH )
    {
        for ( fld = 0 ; fld < MAXOUTSYM ; fld++ )
        {
            out_fields[ fld ][ 0 ] = SENTINEL ;
        }
        return ;
    }
    if ( which_fields == RIGHT )
    {
        /* -- macro fields only -- */
        for ( fld = CITY ; fld < NEEDHEAD ; fld++ )
        {
            out_fields[ fld ][ 0 ] = SENTINEL ;
        }
        return ;
    }
    /* -- micro fields only: the street-level range plus box/unit -- */
    for ( fld = BLDNG ; fld < CITY ; fld++ )
    {
        out_fields[ fld ][ 0 ] = SENTINEL ;
    }
    for ( fld = NEEDHEAD ; fld < MAXOUTSYM ; fld++ )
    {
        out_fields[ fld ][ 0 ] = SENTINEL ;
    }
}
/*-----------------------------------------
export.c (sym_to_field)
-------------------------------------------*/
/* -- Map an output symbol to its field index: both box symbols share
      the field at NEEDHEAD, both unit symbols the one at NEEDHEAD+1,
      other in-range symbols map to themselves.  FAIL otherwise. -- */
int sym_to_field( SYMB sym )
{
    if ( ( sym == BOXH ) || ( sym == BOXT ) )
    {
        return ( NEEDHEAD ) ;
    }
    if ( ( sym == UNITH ) || ( sym == UNITT ) )
    {
        return ( NEEDHEAD + 1 ) ;
    }
    if ( ( sym >= BLDNG ) && ( sym < MAXOUTSYM ) )
    {
        return ( sym ) ;
    }
    return ( FAIL ) ;
}
/*--------------------------------------------------
export.c (_get_definition_text_)
-- called by export.c (_get_standard_)
---------------------------------------------------*/
/* -- Return the text for the lexeme at lex_pos: the definition's
      Standard form unless the best definition is protected, in which
      case the raw token text from the lexical vector is kept. -- */
static char *_get_definition_text_( STAND_PARAM *__stand_param__ , int lex_pos )
{
    DEF *chosen_def = __stand_param__->best_defs[ lex_pos ] ;
    return ( chosen_def->Protect
             ? __stand_param__->lex_vector[ lex_pos ].Text
             : chosen_def->Standard ) ;
}
/*-----------------------------------------
export.c (stuff_fields)
--calls export.c (_scan_target_)
-------------------------------------------*/
/* -- Translate the best standardization into the output string fields.
      The first NEEDHEAD fields map one-to-one onto their symbols; the
      box and unit fields each collect two symbols (head and trailing
      part), scanned in that order so the head text lands first. -- */
void stuff_fields( STAND_PARAM *__stand_param__ )
{
    int out_field ;
    for ( out_field = 0 ; out_field < NEEDHEAD ; out_field++ )
    {
        _scan_target_( __stand_param__ , out_field , out_field ) ;
    }
    _scan_target_( __stand_param__ , BOXH , NEEDHEAD ) ;
    _scan_target_( __stand_param__ , BOXT , NEEDHEAD ) ;
    _scan_target_( __stand_param__ , UNITH , NEEDHEAD + 1 ) ;
    _scan_target_( __stand_param__ , UNITT , NEEDHEAD + 1 ) ;
}
//#ifndef BUILD_API
/*---------------------------------------------------------------------
export.c (send_fields_to_stream)
uses BLANK_STRING
2009-09-27 modify to display landmark fields
----------------------------------------------------------------------*/
#define STREAM_BUF_SIZE MAXSTRLEN
void send_fields_to_stream( char **__standard_fields__ , FILE *__dest_file__ , int opt , int is_landmark)
{
int output_order ;
if (opt < NO_FORMAT)
{
if (__dest_file__ != NULL)
{
fprintf(__dest_file__,"%s\n",(is_landmark? __landmark_record_start_tag__[opt] : __record_start_tag__[opt])) ;
}
else
{
printf("%s\n",(is_landmark? __landmark_record_start_tag__[opt] : __record_start_tag__[opt])) ;
}
}
/*-- We want to rearrange so that unit and box come first --*/
for (output_order = 0; output_order < (NEEDHEAD + ORDER_DISPLACEMENT); output_order++)
{
char __line_buf__[STREAM_BUF_SIZE] ;
int loc = ((output_order < ORDER_DISPLACEMENT)? (NEEDHEAD + output_order) : (output_order - ORDER_DISPLACEMENT)) ;
char *__field_string__ = __standard_fields__[loc] ;
BLANK_STRING(__line_buf__) ;
if (*__field_string__ != SENTINEL)
{
if (opt < NO_FORMAT)
{
char * __source_start_tag__ ;
if (is_landmark)
{
switch (loc)
{
case FEATNAME :
__source_start_tag__ = ( char *) __land_field_start_tag__[0][opt] ;
break ;
case FEATTYPE :
__source_start_tag__ = ( char *) __land_field_start_tag__[1][opt] ;
break ;
case FEATAREA :
__source_start_tag__ = ( char *) __land_field_start_tag__[2][opt] ;
break ;
default :
__source_start_tag__ = ( char * ) __field_start_tag__[loc][opt] ;
}
}
else
{
__source_start_tag__ = (char *) __field_start_tag__[loc][opt] ;
}
append_string_to_max(__line_buf__, __source_start_tag__ , STREAM_BUF_SIZE) ;
}
append_string_to_max( __line_buf__, __field_string__ , STREAM_BUF_SIZE ) ;
if (opt < NO_FORMAT)
{
char * __source_end_tag__ ;
if (is_landmark)
{
switch (loc)
{
case FEATNAME :
__source_end_tag__ = ( char *) __land_field_tag_end__[ 0 ][ opt ] ;
break ;
case FEATTYPE :
__source_end_tag__ = ( char *) __land_field_tag_end__[ 1 ][ opt ] ;
break ;
case FEATAREA :
__source_end_tag__ = ( char *) __land_field_tag_end__[ 2 ][ opt ] ;
break ;
default :
__source_end_tag__ = ( char * ) __field_tag_end__[ loc ][ opt ] ;
}
}
else
{
__source_end_tag__ = ( char * ) __field_tag_end__[ loc ][ opt ] ;
}
append_string_to_max( __line_buf__ , __source_end_tag__ , STREAM_BUF_SIZE ) ;
}
if ( __dest_file__ != NULL )
{
fprintf( __dest_file__ , "%s" , __line_buf__ ) ;
}
else
{
printf( "%s" , __line_buf__ ) ;
}
}
}
if ( opt < NO_FORMAT )
{
if ( __dest_file__ != NULL )
{
fprintf( __dest_file__ , "%s\n", ( is_landmark? __landmark_record_end_tag__[ opt ] : __record_end_tag__[ opt ]));
}
else
{
printf( "%s\n" , ( is_landmark? __landmark_record_end_tag__[ opt ] : __record_end_tag__[ opt ] ) );
}
}
if ( __dest_file__ != NULL )
{
fflush( __dest_file__ ) ;
}
else
{
fflush( stdout ) ;
}
}
//#endif
/*-----------------------------------------
export.c (_get_standard_)
-- called by export.c (_copy_standard_)
-- calls _get_definition_text_ , find_def_type
uses MACRO BLANK_STRING
-------------------------------------------*/
/* -- Return the standardized text for the lexeme at lex_pos destined
      for output field output_sym.  Special cases: a street word that
      also has an ordinal reading uses the ordinal standardization, and
      house numbers get their leading zeroes stripped in place. -- */
static char *_get_standard_(STAND_PARAM *__stand_param__ ,int lex_pos, int output_sym)
{
char *__selected_standardization__ ;
DEF *__best_DEF__ = __stand_param__->best_defs[lex_pos] ;
if ((output_sym == STREET) && (find_def_type(__best_DEF__,__ord_list__)) && (__best_DEF__->Type == WORD))
{
/*-- <remarks> If the best definition is a streetname typed as a word, but also
including an ordinal type, then substitute the ordinal
standardization - however, the lexicon should take care of most
cases of this. </remarks> --*/
DEF *__scan_DEF__ ;
/* -- walk the full definition list for this lexeme looking for the
      ORD reading; fall through to the normal path if it has no
      Standard text -- */
for (__scan_DEF__ = __stand_param__->lex_vector[lex_pos].DefList;__scan_DEF__ != NULL;__scan_DEF__ = __scan_DEF__->Next)
{
if (__scan_DEF__->Type == ORD)
{
if ((__selected_standardization__ = __scan_DEF__->Standard) != NULL)
{
return (__selected_standardization__) ;
}
break ;
}
}
}
/*-- If it is in the lexicon, use the standardization there, otherwise
use the form that emerged from tokenization --*/
__selected_standardization__ = _get_definition_text_(__stand_param__,lex_pos) ;
if ((output_sym == HOUSE) && (*__selected_standardization__ == '0'))
{
/*-- Remove leading zeroes to simplify match comparisons
on the house number that use strings rather than integers -
we won't do this on zip codes. There may arise some need to
do it for unit and box numbers in the future. --*/
/* -- NOTE: this edits the selected string in place, shifting the
      tail left over the leading zeroes -- */
char *__zero_pointer__ ;
char *__buffer_pointer__ = __zero_pointer__ = __selected_standardization__ ;
while ( *__zero_pointer__ == '0' ) __zero_pointer__++ ; /*-- Move to first nonzero character --*/
while ( *__zero_pointer__ != SENTINEL ) *__buffer_pointer__++ = *__zero_pointer__++ ; /*-- Move down in buffer --*/
/*-- Trim down all-zeroes to a single zero: if deleting all
the zeros leaves an empty buffer, put a zero back --*/
if ( __buffer_pointer__ == __selected_standardization__ ) *__buffer_pointer__++ = '0' ;
BLANK_STRING( __buffer_pointer__ ) ;
}
return ( __selected_standardization__ ) ;
}
/*-----------------------------------------
export.c (_scan_target_ )
-- calls export.c (_copy_standard_)
-- called by export.c (stuff_fields)
-------------------------------------------*/
/* -- Probe the best output symbols and copy every lexeme whose symbol
      matches sym into output field dest.  Because positions are
      visited left to right, words land in each field in the order
      they appeared in the input. -- */
static void _scan_target_(STAND_PARAM *__stand_param__,SYMB sym , int dest)
{
    int lex_pos ;
    int lex_count = __stand_param__->LexNum ;
    SYMB *best_syms = __stand_param__->best_output ;
    for ( lex_pos = FIRST_LEX_POS ; lex_pos < lex_count ; lex_pos++ )
    {
        if ( best_syms[ lex_pos ] != sym )
        {
            continue ;
        }
        _copy_standard_( __stand_param__ , sym , dest , lex_pos ) ;
    }
}
/*-----------------------------------------
export.c (_copy_standard_)
-- called by export.c (_scan_target_) --
--calls export.c (_get_standard_,
strlen, strcpy
uses macro SPACE_APPEND_WITH_LEN
-------------------------------------------*/
/* -- Append the standardized text for lexeme lex_pos to output field
      fld.  Words that would overflow the field are dropped silently.
      The first word written to the box or unit field is prefixed with
      a generic identifier ("BOX " / "# ") when the trailing symbol
      arrives without its head. -- */
static void _copy_standard_( STAND_PARAM *__stand_param__ , SYMB output_sym , int fld , int lex_pos )
{
    char *std_text = _get_standard_( __stand_param__ , lex_pos , output_sym ) ;
    char *field_buf = __stand_param__->standard_fields[ fld ] ;
    if ( ( strlen( std_text ) + strlen( field_buf ) ) > MAXFLDLEN )
    {
        /* -- truncate without warning -- */
        return ;
    }
    if ( *field_buf != SENTINEL )
    {
        /* -- field already has content: append with a separating space -- */
        SPACE_APPEND_WITH_LEN( field_buf , std_text , MAXFLDLEN ) ;
        return ;
    }
    switch ( output_sym )
    {
    case UNITT :
        /* -- If the unit id type is missing, one needs to be provided.
              This might result in a mismatch, when the type is implicit
              in one of the compared addresses, and explicit in the
              other.  Better a generic identifier than nothing. -- */
        strcpy( field_buf , "# " ) ; /* -- reconsider this -- */
        append_string_to_max( field_buf , std_text , MAXFLDLEN ) ;
        break ;
    case BOXT :
        strcpy( field_buf , "BOX " ) ;
        append_string_to_max( field_buf , std_text , MAXFLDLEN ) ;
        break ;
    default :
        strcpy( field_buf , std_text ) ;
    }
}

View file

@ -0,0 +1,940 @@
/* -- gamma.c
This file reads the rules file into memory and sets up the rule
lookup structures. These are based on the optimized Aho-Corasick
algorithms in Watson (1994).
Copyright (c) 2008 Walter Bruce Sinclair
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/* For pagc-0.4.0 : last revised 2010-11-01 */
#undef DEBUG
//#define DEBUG
#include <stdlib.h>
#include <stdio.h>
#include <stddef.h>
#include "pagc_api.h"
#include "gamma.h"
#ifdef BUILD_API
#include "pagc_std_api.h"
RULES *rules_init( ERR_PARAM *err_p ) ;
#endif
/* -- local prototypes -- */
static int initialize_link( ERR_PARAM *, KW *** , NODE ) ;
static void classify_link( RULE_PARAM * , KW ***, KW *, NODE , SYMB , SYMB ) ;
static void add_failure_linkage( KW ***, NODE , NODE ) ;
static NODE **precompute_gamma_function( ERR_PARAM *, NODE ** , KW ***, NODE ) ;
/* -- Numeric score per rule weight code, indexed 0..NUMBER_OF_WEIGHTS-1.
      NOTE(review): the consumers of this table are outside this chunk;
      presumably rule Weight values index it -- confirm in pagc_api.h. -- */
static double load_value[ NUMBER_OF_WEIGHTS ] = {
0.00, 0.325, 0.35 , 0.375 , 0.4 ,
0.475 , 0.55, 0.6 , 0.65 , 0.675 ,
0.7 , 0.75 , 0.8 , 0.825 , 0.85 ,
0.9 , 0.95 , 1.00 } ;
/*---------------------------------------------------------------------------
gamma.c (refresh_transducer)
called by analyze.c (prepare_target_pattern)
The registry of matching keywords is regenerated with the use of the
precomputed Gamma function, Output Links and the current target.
----------------------------------------------------------------------------*/
/* -- Rebuild the registry r of visited states for the FAIL-terminated
      target symbol string S: r[0] is EPSILON and r[i+1] is the state
      reached after consuming S[i] through the precomputed Gamma
      function. -- */
void refresh_transducer( NODE *r ,
                         SYMB *S ,
                         NODE **gamma_function )
{
    int pos ;
    NODE state = EPSILON ;
    r[ 0 ] = EPSILON ;
    for ( pos = 0 ; S[ pos ] != FAIL ; pos++ )
    {
        state = gamma_function[ state ][ S[ pos ] ] ;
        r[ pos + 1 ] = state ;
    }
}
/*---------------------------------------------------------------------------
gamma.c (is_input_symbol)
called by gamma.c (create_rules)
----------------------------------------------------------------------------*/
/* -- TRUE when sym is a valid input symbol (0..MAXINSYM inclusive) -- */
int is_input_symbol( SYMB sym ) {
    return ( ( sym >= 0 ) && ( sym <= MAXINSYM ) ) ? TRUE : FALSE ;
}
/*---------------------------------------------------------------------------
gamma.c (is_output_symbol)
called by gamma.c (create_rules)
----------------------------------------------------------------------------*/
/* -- TRUE when sym is a valid output symbol (0..MAXOUTSYM inclusive) -- */
int is_output_symbol( SYMB sym ) {
    return ( ( sym >= 0 ) && ( sym <= MAXOUTSYM ) ) ? TRUE : FALSE ;
}
#ifdef BUILD_API
/*
typedef struct RULES_s {
int ready;
int rule_number;
int last_node;
RULE_PARAM *r_p;
ERR_PARAM *err_p;
NODE **Trie;
SYMB *rule_end ;
SYMB *r ;
} RULES;
*/
/*---------------------------------------------------------------------------
gamma.c (rules_init)
api interface to replace (create_rules)
---------------------------------------------------------------------------*/
RULES *rules_init( ERR_PARAM *err_p ) {
/* -- Allocate and initialize an empty RULES object for incremental
      construction via rules_add_rule / rules_ready.  Returns NULL on
      allocation failure (the PAGC_*_STRUC macros return NULL from
      this function on failure).
      NOTE(review): storage allocated before a failing later allocation
      is not released on those early-return paths -- potential leak;
      confirm macro behavior in pagc_api.h. -- */
RULES *rules;
/* -- returns size of Gamma Function Matrix -- */
SYMB a ;
KW *k_s ;
KW ***o_l ;
NODE **Trie ;
SYMB *r_s ;
RULE_PARAM *r_p ;
PAGC_CALLOC_STRUC(rules,RULES,1,err_p,NULL);
rules->err_p = err_p;
rules->ready = 0;
rules->rule_number = 0;
rules->last_node = EPSILON;
PAGC_ALLOC_STRUC(r_p,RULE_PARAM,err_p,NULL) ;
rules->r_p = r_p;
/* -- initialize the statistics record -- */
r_p -> collect_statistics = FALSE ;
r_p -> total_best_keys = 0 ;
r_p -> total_key_hits = 0 ;
/* -- storage for input and output records -- */
PAGC_CALLOC_STRUC(r_s,SYMB,RULESPACESIZE,err_p,NULL);
/* -- storage for temporary trie for rules -- */
PAGC_CALLOC_STRUC(Trie,NODE *,MAXNODES,err_p,NULL);
/* -- initialize the first( EPSILON ) node of the trie -- */
PAGC_CALLOC_STRUC(Trie[EPSILON],NODE,MAXINSYM,err_p,NULL);
for ( a = 0 ;
a < MAXINSYM ;
a++ ) {
Trie[ EPSILON ][ a ] = FAIL ;
}
/* -- storage for global output_link -- */
PAGC_CALLOC_STRUC(o_l,KW **,MAXNODES,err_p,NULL);
PAGC_CALLOC_STRUC(k_s,KW,MAXRULES,err_p,NULL);
if ( !initialize_link( err_p ,
o_l ,
EPSILON ) ) {
return NULL ;
}
/* -- stash everything in the RULES object; r is the write cursor
      into rule_space used by rules_add_rule -- */
rules -> r_p -> rule_space = r_s ;
rules -> r_p -> key_space = k_s ;
rules -> r_p -> output_link = o_l ;
rules -> Trie = Trie ;
rules -> rule_end = r_s + RULESPACESIZE ;
rules -> r = r_s ;
return rules;
}
/*---------------------------------------------------------------------------
gamma.c (rules_add_rule)
Add one rule to a RULES object built by rules_init.  rule[0..num-1] is
the flat record  i i ... i -1 o o ... o -1 t w  (input symbols, FAIL,
output symbols, FAIL, rule type, rule weight) -- the same layout
create_rules reads from the rule file.  Returns 0 on success, a
nonzero error code otherwise (the RET_ERR* macros return their last
argument after registering the message).
----------------------------------------------------------------------------*/
int rules_add_rule(RULES *rules, int num, int *rule) {
int i ,
w ;
SYMB a ,
t ;
SYMB *rule_start ,
*r ,
*r_s ;
NODE u ;
NODE **Trie ;
KW *keyw ,
*k_s ;
KW ***o_l ;
if ( !rules ) return 1; /* error rules obj not initialized */
if ( !rules -> r_p ) return 2; /* RULE_PARAM not allocated */
if ( rules -> ready ) return 3; /* rules have already be readied */
if ( rules -> rule_number >= MAXRULES ) {
RET_ERR( "rules_add_rule: Too many rules are being added.",
rules -> err_p, 4);
}
/* get local copies of stuff saved in RULES */
o_l = rules -> r_p -> output_link ;
k_s = rules -> r_p -> key_space ;
r_s = rules -> r_p -> rule_space ;
Trie = rules -> Trie ;
r = rules -> r ;
keyw = k_s + rules -> rule_number ;
MEM_ERR(keyw, rules -> err_p, 5);
u = EPSILON ;
rule_start = r ; /* save rule start for inclusion in the record */
if ( rule_start > rules -> rule_end ) {
RET_ERR( "rules_add_rule: Too many rules for allocated memory.",
rules -> err_p, 5);
}
/* -- copy the input symbols into rule_space, extending the trie with
      one node per previously-unseen prefix -- */
for (i=0; ; i++, r++ ) {
if (i >= num) {
RET_ERR( "rules_add_rule: invalid rule structure.",
rules -> err_p, 6);
}
*r = rule[i] ;
/* -- a fail at the beginning of a field indicates end of record
unless it's at the beginning of the record, in which case
it's the end of file -- */
if ( *r == FAIL ) {
if ( i == 0 ) return 0;
break;
}
/* -- check the input -- */
if ( !is_input_symbol( *r ) ) {
RET_ERR2( "rules_add_rule: Bad Input Token %d at rule %d",
*r,
rules -> rule_number ,
rules -> err_p,
7 ) ;
}
/* -- build the trie structure -- */
if ( Trie[ u ][ *r ] == FAIL ) {
if ( ++rules -> last_node >= MAXNODES ) {
RET_ERR( "rules_add_rule: Too many nodes in gamma function",
rules -> err_p,
8 ) ;
}
Trie[ u ][ *r ] = rules -> last_node ;
PAGC_CALLOC_STRUC(Trie[rules -> last_node],NODE,MAXINSYM,rules -> err_p,9) ;
for ( a = 0 ;
a < MAXINSYM ;
a++ ) {
Trie[ rules -> last_node ][ a ] = FAIL ;
}
if ( !initialize_link( rules -> err_p ,
o_l ,
rules -> last_node ) ) {
return 10;
}
}
u = Trie[ u ][ *r ] ;
} /* end of for loop */
/* -- u is now the trie node for the complete input sequence;
      i is the count of input symbols -- */
keyw -> Input = rule_start ;
if ( ( keyw -> Length = i ) == 0 ) {
RET_ERR1( "rules_add_rule: Error 0 length rule #%d",
rules -> rule_number,
rules -> err_p,
11 ) ;
}
/* -- read the output tokens into the rule_space -- */
r++ ; /* -- move to beginning of the output tokens -- */
rule_start = r ; /* -- remember the beginning -- */
while ( TRUE ) {
i++;
if ( i >= num ) {
RET_ERR( "rules_add_rule: invalid rule structure.",
rules -> err_p, 6);
}
*r = rule[i] ;
if ( *r == FAIL ) break;
if ( !is_output_symbol( *r ) ) {
RET_ERR2( "rules_add_rule: Rule File: Non-Token %d in Rule #%d\n",
*r ,
rules -> rule_number,
rules -> err_p,
7 ) ;
}
r++ ;
}
keyw -> Output = rule_start ;
/* -- classify the output: t is the rule type, w the weight code -- */
i++ ;
t = rule[i] ;
i++ ;
w = rule[i] ;
classify_link( rules -> r_p ,
o_l ,
keyw ,
u ,
w ,
t ) ;
/* -- advance the write cursor past the trailing FAIL for the next rule -- */
rules -> rule_number++ ;
rules -> r = ++r ; ;
return 0;
}
/*---------------------------------------------------------------------------
gamma.c (rules_ready)
Finalize a RULES object after all rules_add_rule calls: precompute the
Gamma function matrix from the temporary trie, then discard the trie.
Returns 0 on success, a nonzero error code otherwise.
----------------------------------------------------------------------------*/
int rules_ready(RULES *rules) {
SYMB a;
if (!rules) return 1; /* error rules obj not initialized */
if (!rules->r_p) return 2; /* RULE_PARAM not allocated */
if (rules->ready) return 3; /* rules have already be readied */
rules -> r_p -> rules_read = rules->rule_number ;
/* -- reserve one more node slot, matching create_rules -- */
if ( ++rules -> last_node >= MAXNODES ) {
RET_ERR( "rules_ready: Too many nodes in gamma function" ,
rules -> err_p, 4) ;
}
/* -- change the EPSILON node transitions in preparation for Gamma -- */
for ( a = 0 ;
a < MAXINSYM ;
a++ ) {
if ( rules -> Trie[ EPSILON ][ a ] == FAIL ) {
rules -> Trie[ EPSILON ][ a ] = EPSILON ;
}
}
/* -- create the global Gamma function matrix -- */
if ( ( rules -> r_p -> gamma_matrix =
precompute_gamma_function( rules -> err_p,
rules -> Trie ,
rules -> r_p -> output_link ,
rules -> last_node ) ) == NULL ) {
return 5 ;
}
/* -- no longer need the Trie -- */
PAGC_DESTROY_2D_ARRAY(rules -> Trie,NODE,rules -> last_node) ;
rules -> Trie = NULL ;
rules -> r_p -> num_nodes = rules -> last_node ;
/*
if ( glo_p -> log_init ) {
CLIENT_ERR( err_p ) ;
LOG_MESS2( "create_rules: Rules installed with %d nodes and %d rules",
rules -> last_node ,
rules->rule_number ,
err_p ) ;
}
*/
rules -> ready = 1 ;
return 0;
}
/*---------------------------------------------------------------------------
gamma.c (rules_free)
Release a RULES object created by rules_init, including its RULE_PARAM.
Safe to call with NULL.  The former `rules = NULL ;` after free() was
removed: it only cleared the local by-value parameter and had no effect
on the caller's pointer.
NOTE(review): if rules_ready was never called, rules->Trie is not
released here -- confirm whether that path can leak.
----------------------------------------------------------------------------*/
void rules_free(RULES *rules) {
    if (!rules) return;
    if (rules->r_p) destroy_rules(rules->r_p);
    free(rules);
}
#else
/*---------------------------------------------------------------------------
gamma.c (create_rules)
called by standard.l (init_stand_process)
calls util.c (open_aux_file)
calls gamma.c (initialize_link, is_input_symbol, is_output_symbol,
classify_link,precompute_gamma_function)
----------------------------------------------------------------------------*/
RULE_PARAM *create_rules( const char *rule_name ,
PAGC_GLOBAL *glo_p ) {
/* -- Read the rule file and build the complete rule lookup structure:
      rule_space (input/output symbol records), key_space (one KW per
      rule), the output_link chains, and the precomputed Gamma matrix.
      Returns NULL on any error (the RET_ERR*/PAGC_* macros return NULL
      from this function after registering the message). -- */
/* -- returns size of Gamma Function Matrix -- */
SYMB a ,
t ;
NODE u ;
int i ,
w ;
int is_eof = FALSE ;
int rule_number = 0 ;
int last_node = EPSILON ;
FILE *rule_file ;
SYMB *rule_start ,
*rule_end ,
*r ;
KW *keyw , *k_s ;
KW ***o_l ;
NODE **Trie ;
SYMB *r_s ;
RULE_PARAM *r_p ;
ERR_PARAM *err_p ;
err_p = glo_p -> process_errors ;
PAGC_ALLOC_STRUC(r_p,RULE_PARAM,err_p,NULL) ;
/* -- initialize the statistics record -- */
r_p -> collect_statistics = FALSE ;
r_p -> total_best_keys = 0 ;
r_p -> total_key_hits = 0 ;
/* -- open the rule file, if possible -- */
if ( ( rule_file = open_aux_file( glo_p ,
rule_name ) ) == NULL ) {
return NULL ;
}
/* -- rule file has the format of i i ... i -1 o o ... o -1 t f -- */
/* -- storage for input and output records -- */
PAGC_CALLOC_STRUC(r_s,SYMB,RULESPACESIZE,err_p,NULL);
/* -- storage for temporary trie for rules -- */
PAGC_CALLOC_STRUC(Trie,NODE *,MAXNODES,err_p,NULL);
/* -- initialize the first( EPSILON ) node of the trie -- */
PAGC_CALLOC_STRUC(Trie[EPSILON],NODE,MAXINSYM,err_p,NULL);
for ( a = 0 ;
a < MAXINSYM ;
a++ ) {
Trie[ EPSILON ][ a ] = FAIL ;
}
/* -- storage for global output_link -- */
PAGC_CALLOC_STRUC(o_l,KW **,MAXNODES,err_p,NULL);
PAGC_CALLOC_STRUC(k_s,KW,MAXRULES,err_p,NULL);
rule_end = r_s + RULESPACESIZE ;
if ( !initialize_link( err_p ,
o_l ,
EPSILON ) ) {
return NULL ;
}
/* -- one iteration per rule record; r is the write cursor into
      rule_space -- */
for ( r = r_s ;
!feof( rule_file ) ;
r++, rule_number++ ) {
if ( rule_number >= MAXRULES ) {
CLIENT_ERR( err_p ) ;
RET_ERR( "create_rules: Too many rules in file",
err_p,
NULL) ;
}
keyw = k_s + rule_number ;
MEM_ERR(keyw,err_p,NULL);
/* -- get input record -- */
u = EPSILON ;
rule_start = r ; /* -- save rule start for inclusion in record -- */
if ( rule_start > rule_end ) {
RET_ERR( "create_rules: Too many rules for allocated memory",
err_p,
NULL ) ;
}
for ( i = 0 ;
;
i++, r++ ) {
/* -- read the first integer -- */
/* -- NOTE(review): fscanf return value is unchecked here and
      below; a malformed file leaves *r unchanged -- */
fscanf( rule_file,
"%d",
r ) ;
/* -- a fail at the beginning of a field indicates end of record
unless it's at the beginning of the record, in which case
it's the end of file -- */
if ( *r == FAIL ) {
if ( i == 0 ) {
is_eof = TRUE ;
}
break ;
}
/* -- check the input -- */
if ( !is_input_symbol( *r ) ) {
CLIENT_ERR( err_p ) ;
RET_ERR2( "create_rules: Rule file: Bad Input Token %d at rule %d",
*r,
rule_number ,
err_p,
NULL ) ;
}
/* -- build the trie structure -- */
if ( Trie[ u ][ *r ] == FAIL ) {
if ( ++last_node >= MAXNODES ) {
RET_ERR( "create_rules: Too many nodes in gamma function",
err_p,
NULL ) ;
}
Trie[ u ][ *r ] = last_node ;
PAGC_CALLOC_STRUC(Trie[last_node],NODE,MAXINSYM,err_p,NULL) ;
for ( a = 0 ;
a < MAXINSYM ;
a++ ) {
Trie[ last_node ][ a ] = FAIL ;
}
if ( !initialize_link( err_p ,
o_l ,
last_node ) ) {
return NULL ;
}
}
u = Trie[ u ][ *r ] ;
}
if ( is_eof )
break ;
/* -- u is now the trie node for the complete input sequence;
      i is the count of input symbols -- */
keyw -> Input = rule_start ;
if ( ( keyw -> Length = i ) == 0 ) {
CLIENT_ERR( err_p ) ;
RET_ERR1( "create_rules: Error Rule File: 0 length rule #%d",
rule_number,
err_p,
NULL ) ;
}
/* -- read the output tokens into the rule_space -- */
r++ ; /* -- move to beginning of the output tokens -- */
rule_start = r ; /* -- remember the beginning -- */
while ( TRUE ) {
fscanf( rule_file,
"%d",
r ) ;
if ( *r == FAIL )
break ;
if ( !is_output_symbol( *r ) ) {
RET_ERR2( "create_rules: Rule File: Non-Token %d in Rule #%d\n",
*r ,
rule_number,
err_p,
NULL ) ;
}
r++ ;
}
keyw -> Output = rule_start ;
/* -- classify the output: t is the rule type, w the weight code -- */
/* -- NOTE(review): %d into &t assumes SYMB is int-compatible -- */
fscanf( rule_file ,
"%d" ,
&t ) ;
fscanf( rule_file ,
"%d" ,
&w ) ;
classify_link( r_p ,
o_l ,
keyw ,
u ,
w ,
t ) ;
} /* -- end of file read -- */
r_p -> rule_space = r_s ;
r_p -> key_space = k_s ;
r_p -> output_link = o_l ;
r_p -> rules_read = rule_number ;
fclose( rule_file ) ;
/* -- reserve one more node slot before building Gamma -- */
if ( ++last_node >= MAXNODES ) {
RET_ERR( "create_rules: Too many nodes in gamma function" ,
err_p,
NULL) ;
}
/* -- change the EPSILON node transitions in preparation for Gamma -- */
for ( a = 0 ;
a < MAXINSYM ;
a++ ) {
if ( Trie[ EPSILON ][ a ] == FAIL ) {
Trie[ EPSILON ][ a ] = EPSILON ;
}
}
/* -- create the global Gamma function matrix -- */
if ( ( r_p -> gamma_matrix = precompute_gamma_function( err_p,
Trie ,
o_l ,
last_node ) ) == NULL ) {
return NULL ;
}
/* -- no longer need the Trie -- */
PAGC_DESTROY_2D_ARRAY(Trie,NODE,last_node) ;
r_p -> num_nodes = last_node ;
if ( glo_p -> log_init ) {
CLIENT_ERR( err_p ) ;
LOG_MESS2( "create_rules: Rules installed with %d nodes and %d rules",
last_node ,
rule_number ,
err_p ) ;
}
return r_p ;
}
#endif
/*---------------------------------------------------------------------------
gamma.c (destroy_rules)
----------------------------------------------------------------------------*/
/* -- Release everything create_rules / rules_ready built: the symbol
      space, the key space, the output_link chains and the Gamma
      matrix, then the RULE_PARAM itself.  Safe to call with NULL. -- */
void destroy_rules( RULE_PARAM * r_p ) {
if ( r_p != NULL ) {
DBG("destroy_rules 1");
FREE_AND_NULL( r_p -> rule_space ) ;
DBG("destroy_rules 2");
FREE_AND_NULL( r_p -> key_space ) ;
DBG("destroy_rules 3");
PAGC_DESTROY_2D_ARRAY(r_p->output_link,KW*,r_p->num_nodes) ;
DBG("destroy_rules 4");
PAGC_DESTROY_2D_ARRAY(r_p->gamma_matrix,NODE,r_p->num_nodes) ;
DBG(" destroy_rules 5");
FREE_AND_NULL( r_p ) ;
}
}
/* ========================= Output Links ========================= */
/*---------------------------------------------------------------------------
gamma.c (initalize_link)
called by gamma.c (create_rules)
----------------------------------------------------------------------------*/
/* -- Allocate node u's row of keyword chain heads, one per clause
      class, each explicitly set to NULL.  Returns FALSE on allocation
      failure (via the macro), TRUE otherwise. -- */
static int initialize_link( ERR_PARAM *err_p ,
                            KW ***o_l ,
                            NODE u ) {
    int clause_class ;
    PAGC_CALLOC_STRUC(o_l[u],KW *,MAX_CL,err_p,FALSE);
    for ( clause_class = MAX_CL - 1 ; clause_class >= 0 ; clause_class-- ) {
        o_l[ u ][ clause_class ] = NULL ;
    }
    return TRUE ;
}
/*---------------------------------------------------------------------------
gamma.c (classify_link)
called by gamma.c (create_rules)
----------------------------------------------------------------------------*/
/* -- File the keyword k under node u's chain for clause class c,
      appending at the tail so earlier rules stay first, and stamp it
      with its weight and type.  k->OutputNext starts NULL in
      anticipation of failure extensions. -- */
static void classify_link( RULE_PARAM *r_p ,
                           KW ***o_l , /* -- 2006-11-02 : arg -- */
                           KW *k ,
                           NODE u ,
                           SYMB w ,
                           SYMB c ) {
    KW **tail = &( o_l[ u ][ c ] ) ; /* -- 2006-11-02 : arg -- */
    k -> hits = 0 ;
    k -> best = 0 ;
    k -> Type = c ;
    k -> Weight = w ;
    k -> OutputNext = NULL ;
    /* -- walk to the end of the chain and hook k on -- */
    while ( *tail != NULL ) {
        tail = &( ( *tail ) -> OutputNext ) ;
    }
    *tail = k ;
}
/*---------------------------------------------------------------------------
gamma.c (add_failure_linkage)
called by gamma.c (precompute_gamma_function)
----------------------------------------------------------------------------*/
/* -- x is the failure-function node for node u: append x's keyword
      chain to the tail of u's chain, class by class, so that keywords
      reachable through failure transitions are also found at u. -- */
static void add_failure_linkage( KW ***o_l ,
                                 NODE x ,
                                 NODE u ) {
    int cl ;
    for ( cl = 0 ; cl < MAX_CL ; cl++ ) {
        KW *fail_chain = o_l[ x ][ cl ] ;
        KW *own_chain = o_l[ u ][ cl ] ;
        if ( own_chain == NULL ) {
            /* -- u has nothing of its own: adopt the failure chain -- */
            o_l[ u ][ cl ] = fail_chain ;
            continue ;
        }
        if ( fail_chain == NULL ) {
            /* -- u's chain is already null-terminated; nothing to add -- */
            continue ;
        }
        /* -- append to the end of the list and make sure that the longer
              lengths go first - this is probably redundant. -- */
        while ( own_chain -> OutputNext != NULL ) {
            own_chain = own_chain -> OutputNext ;
        }
        own_chain -> OutputNext = fail_chain ;
    }
}
/*---------------------------------------------------------------------------
gamma.c (precompute_gamma_function)
called by gamma.c (create_rules)
calls gamma.c (add_failure_linkage)
----------------------------------------------------------------------------*/
/* -- Build the n x MAXINSYM Gamma transition matrix from the trie
      using the Aho-Corasick construction: a breadth-first walk of the
      trie computes each node's failure-function node, merges that
      node's keyword chains into its own (add_failure_linkage), and
      fills FAIL transitions from the failure node's row.  The
      temporary Failure and Queue arrays are freed before returning.
      Returns NULL on allocation failure (via the macros). -- */
static NODE **precompute_gamma_function( ERR_PARAM *err_p ,
NODE **Trie ,
KW ***o_l ,
NODE n ) {
NODE u ,
ua ,
x ;
SYMB a ;
int i ,
j ;
NODE **Gamma ;
NODE *Failure ,
*Queue ;
/* -- Storage for Failure Function -- */
PAGC_CALLOC_STRUC(Failure,NODE,n,err_p,NULL) ;
/* -- Storage for Breadth First Search Queue -- */
PAGC_CALLOC_STRUC(Queue,NODE,n,err_p,NULL) ;
PAGC_CALLOC_2D_ARRAY(Gamma,NODE,n,MAXINSYM,err_p,NULL) ;
/* -- seed: depth-1 nodes fail to EPSILON and start the queue;
      i is the shared enqueue cursor for the whole search -- */
u = EPSILON ;
i = 0 ;
for ( a = 0 ;
a < MAXINSYM ;
a++ ) {
x = Trie[ EPSILON ][ a ] ;
Gamma[ EPSILON ][ a ] = x ;
Failure[ x ] = EPSILON ;
/* -- add to Queue for breadth-first search -- */
if ( x != EPSILON ) {
Queue[ i++ ] = x ;
}
}
Queue[ i ] = FAIL ; /* -- terminate the list of nodes to process -- */
for ( j = 0 ;
Queue[ j ] != FAIL ;
j++ ) {
u = Queue[ j ] ;
/* -- get non-Fail transitions from Trie onto queue -- */
for ( a = 0 ;
a < MAXINSYM ;
a++ ) {
if ( ( x = Trie[ u ][ a ] ) != FAIL ) {
Queue[ i++ ] = x ;
}
}
Queue[ i ] = FAIL ; /* -- mark end of list -- */
x = Failure[ u ] ;
add_failure_linkage( o_l ,
x ,
u ) ;
/* -- real transitions are copied from the trie; FAIL slots take
      the failure node's (already complete) Gamma row -- */
for ( a = 0 ;
a < MAXINSYM ;
a ++ ) {
ua = Trie[ u ][ a ] ;
if ( ua != FAIL ) {
Gamma[ u ][ a ] = ua ;
Failure[ ua ] = Gamma[ x ][ a ] ;
} else {
Gamma[ u ][ a ] = Gamma[ x ][ a ] ;
}
}
}
FREE_AND_NULL( Failure ) ;
FREE_AND_NULL( Queue ) ;
return Gamma ;
}
/* printable names for the rule Type field, indexed by k -> Type in
output_rule_statistics -- assumes type codes run 0..4 in this order
(TODO confirm against the rule-type enum) */
static const char *rule_type_names[] = {
"MACRO" , "MICRO" , "ARC" , "CIVIC" , "EXTRA"
} ;
/* =========================================
gamma.c (output_rule_statistics)
uses macro OPEN_ALLOCATED_NAME
stdio.h (printf,fprintf,fflush,fclose)
Dump per-rule hit statistics, then reset all counters.
Output goes to the ".sts" file when one can be opened (non-BUILD_API
builds with a non-empty name), otherwise to stdout.  The file variant
additionally reports hit/best frequencies.
Returns FALSE when statistics were not collected, TRUE otherwise.
===========================================*/
#ifdef BUILD_API
int output_rule_statistics( RULE_PARAM *r_p, ERR_PARAM *err_p ) {
#else
int output_rule_statistics( RULE_PARAM *r_p ,
                            ERR_PARAM *err_p ,
                            char *name ,
                            DS_Handle _file_sys_p ) {
#endif
    int i ,
        found_count ,
        n ;
    SYMB *OL ;
    char *sts_name = NULL ;
    FILE *sts_file = NULL ;
    FILE *out ;          /* destination stream: sts_file or stdout */
    KW *k ;
    KW *k_s ;
    double hit_frequency ,
           best_frequency ;

    if ( !r_p -> collect_statistics ) {
        printf( "Statistics were not collected\n" ) ;
        return FALSE ;
    }
#ifndef BUILD_API
    if ( name != NULL && name[ 0 ] != SENTINEL ) {
        OPEN_ALLOCATED_NAME(sts_name,"sts",sts_file,name,"wb+",_file_sys_p,err_p,FALSE) ;
    }
#endif
    /* printf(...) is fprintf(stdout,...), so a single stream variable
       replaces the printf/fprintf duplication of the original */
    out = ( sts_file == NULL ) ? stdout : sts_file ;
    /* -- cycle through the keys -- */
    n = r_p -> rules_read ;
    k_s = r_p -> key_space ;
    for ( i = 0 , found_count = 0 ;
          i < n ;
          i++ ) {
        k = k_s + i ;
        if ( k -> hits == 0 ) {
            continue ;   /* only report rules that matched at least once */
        }
        found_count++ ;
        fprintf( out ,
                 "\nRule %d is of type %d (%s)\n: " ,
                 i ,
                 k -> Type ,
                 rule_type_names[ k -> Type ] ) ;
        fprintf( out ,
                 "Input : " ) ;
        /* -- input symbols, FAIL-terminated -- */
        for ( OL = k -> Input ;
              *OL != FAIL ;
              OL++ ) {
            fprintf( out ,
                     "|%d (%s)|" ,
                     *OL ,
                     in_symb_name( *OL ) ) ;
        }
        fprintf( out ,
                 "\nOutput: " ) ;
        /* -- output the output symbols -- */
        for ( OL = k -> Output ;
              *OL != FAIL ;
              OL++ ) {
            fprintf( out ,
                     "|%d (%s)|" ,
                     *OL ,
                     out_symb_name( *OL ) ) ;
        }
        /* the two sinks intentionally report different detail levels:
           the screen gets raw counts, the file gets frequencies too */
        if ( sts_file == NULL ) {
            printf ( "\nrank %d ( %f): hits %d out of %d\n" ,
                     k -> Weight ,
                     load_value[ k -> Weight ] ,
                     k->hits,
                     r_p -> total_key_hits ) ;
        } else {
            hit_frequency = ( ( double ) k -> hits ) / ( ( double ) r_p -> total_key_hits ) ;
            best_frequency = ( ( double ) k -> best ) / ( ( double ) r_p -> total_best_keys ) ;
            fprintf( sts_file ,
                     "\nrank %d ( %f): hit frequency: %f, best frequency: %f" ,
                     k -> Weight ,
                     load_value[ k -> Weight ] ,
                     hit_frequency ,
                     best_frequency ) ;
            fprintf ( sts_file ,
                      "\n%d hits out of %d, best %d out of %d\n" ,
                      k->hits, r_p -> total_key_hits, k-> best, r_p -> total_best_keys ) ;
        }
        /* reset per-rule counters for the next collection period */
        k -> hits = 0 ;
        k -> best = 0 ;
    }
    fprintf( out ,
             "Found %d rules hit\n" ,
             found_count ) ;
    /* -- start over -- */
    r_p -> total_key_hits = 0 ;
    r_p -> total_best_keys = 0 ;
    if ( sts_file != NULL ) {
        fflush( sts_file ) ;
        fclose( sts_file ) ;
        FREE_AND_NULL( sts_name ) ;
    } else {
        fflush( stdout ) ;
    }
    return TRUE ;
}

View file

@ -0,0 +1,3 @@
/* Compile-time capacity limits for the rule matcher -- presumably the
maximum rule count, automaton node count, and total rule-symbol storage;
confirm against gamma.c usage. */
#define MAXRULES 4500
#define MAXNODES 5000
#define RULESPACESIZE 60000

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,191 @@
//
// hash.c
//
// Copyright (c) 2012 TJ Holowaychuk <tj@vision-media.ca>
//
#include "hash.h"
/*
 * Insert or overwrite the entry for `key`, storing `val` as its value.
 *
 * NOTE(review): khash keeps the key pointer itself (no copy), so `key`
 * must outlive the entry -- confirm callers guarantee this.
 */
inline void
hash_set(hash_t *self, char *key, void *val) {
  int absent;
  khiter_t slot = kh_put(ptr, self, key, &absent);
  kh_value(self, slot) = val;
}
/*
 * Look up `key` and return the stored value, or NULL when absent.
 */
inline void *
hash_get(hash_t *self, char *key) {
  khiter_t slot = kh_get(ptr, self, key);
  if (slot == kh_end(self)) {
    return NULL;
  }
  return kh_value(self, slot);
}
/*
 * Check if hash `key` exists.
 *
 * Returns non-zero when present, 0 otherwise.
 *
 * Fix: kh_get() returns kh_end() when the key is missing; the original
 * passed that end iterator straight to kh_exist(), which inspects the
 * flags array.  On a freshly created hash (no buckets yet, flags still
 * NULL) that dereferenced a null pointer.  Guard the miss case first.
 */
inline int
hash_has(hash_t *self, char *key) {
  khiter_t k = kh_get(ptr, self, key);
  return k != kh_end(self) && kh_exist(self, k);
}
/*
 * Remove the entry for `key` if it is present.  kh_del() ignores the
 * end iterator, so a missing key is a harmless no-op.
 */
void
hash_del(hash_t *self, char *key) {
  khiter_t slot = kh_get(ptr, self, key);
  kh_del(ptr, self, slot);
}
// tests
#ifdef TEST_HASH
#include <stdio.h>
#include <assert.h>
#include <string.h>
// Unit tests for the hash wrapper, compiled only with -DTEST_HASH.
// Each test builds a fresh table and never frees it -- acceptable in a
// short-lived test binary.
// set: two inserts are both retrievable and counted.
void
test_hash_set() {
hash_t *hash = hash_new();
assert(0 == hash_size(hash));
hash_set(hash, "name", "tobi");
hash_set(hash, "species", "ferret");
assert(2 == hash_size(hash));
assert(0 == strcmp("tobi", hash_get(hash, "name")));
assert(0 == strcmp("ferret", hash_get(hash, "species")));
}
// get: present key returns value, absent key returns NULL.
void
test_hash_get() {
hash_t *hash = hash_new();
hash_set(hash, "foo", "bar");
assert(0 == strcmp("bar", hash_get(hash, "foo")));
assert(NULL == hash_get(hash, "bar"));
}
// has: membership check for present and absent keys.
void
test_hash_has() {
hash_t *hash = hash_new();
hash_set(hash, "foo", "bar");
assert(1 == hash_has(hash, "foo"));
assert(0 == hash_has(hash, "bar"));
}
// size: grows by one per distinct key.
void
test_hash_size() {
hash_t *hash = hash_new();
assert(0 == hash_size(hash));
hash_set(hash, "foo", "bar");
assert(1 == hash_size(hash));
hash_set(hash, "bar", "baz");
assert(2 == hash_size(hash));
}
// del: deleting a present key removes it; deleting an absent key
// exercises kh_del's end-iterator guard (no-op).
void
test_hash_del() {
hash_t *hash = hash_new();
hash_set(hash, "foo", "bar");
assert(1 == hash_has(hash, "foo"));
assert(0 == hash_has(hash, "bar"));
hash_del(hash, "foo");
hash_del(hash, "bar");
assert(0 == hash_has(hash, "foo"));
}
// clear: removes all pairs at once.
void
test_hash_clear() {
hash_t *hash = hash_new();
hash_set(hash, "foo", "bar");
hash_set(hash, "bar", "baz");
hash_set(hash, "raz", "jaz");
assert(3 == hash_size(hash));
hash_clear(hash);
assert(0 == hash_size(hash));
}
// each: iteration visits every pair; order is unspecified, hence the
// membership-style assertions.
void
test_hash_each() {
hash_t *hash = hash_new();
hash_set(hash, "name", "tj");
hash_set(hash, "age", "25");
const char *keys[2];
void *vals[2];
int n = 0;
hash_each(hash, {
keys[n] = key;
vals[n] = val;
n++;
});
assert(0 == strcmp("age", keys[0]) || 0 == strcmp("name", keys[0]));
assert(0 == strcmp("age", keys[1]) || 0 == strcmp("name", keys[1]));
assert(0 == strcmp("25", vals[0]) || 0 == strcmp("tj", vals[0]));
assert(0 == strcmp("25", vals[1]) || 0 == strcmp("tj", vals[1]));
}
// each_key: key-only iteration.
void
test_hash_each_key() {
hash_t *hash = hash_new();
hash_set(hash, "name", "tj");
hash_set(hash, "age", "25");
const char *keys[2];
int n = 0;
hash_each_key(hash, {
keys[n++] = key;
});
assert(0 == strcmp("age", keys[0]) || 0 == strcmp("name", keys[0]));
assert(0 == strcmp("age", keys[1]) || 0 == strcmp("name", keys[1]));
}
// each_val: value-only iteration.
void
test_hash_each_val() {
hash_t *hash = hash_new();
hash_set(hash, "name", "tj");
hash_set(hash, "age", "25");
void *vals[2];
int n = 0;
hash_each_val(hash, {
vals[n++] = val;
});
assert(0 == strcmp("25", vals[0]) || 0 == strcmp("tj", vals[0]));
assert(0 == strcmp("25", vals[1]) || 0 == strcmp("tj", vals[1]));
}
int
main(){
test_hash_set();
test_hash_get();
test_hash_has();
test_hash_del();
test_hash_size();
test_hash_clear();
test_hash_each();
test_hash_each_key();
test_hash_each_val();
// NOTE(review): \e is a GNU escape extension, not ISO C.
printf("\n  \e[32m\u2713 \e[90mok\e[0m\n\n");
return 0;
}
#endif

View file

@ -0,0 +1,107 @@
//
// hash.h
//
// Copyright (c) 2012 TJ Holowaychuk <tj@vision-media.ca>
//
#ifndef HASH
#define HASH
//#include <postgres.h>
#include "khash.h"
// pointer hash
// Instantiates khash as a string-keyed map of void* ("ptr"); hash_t is a
// thin convenience wrapper around that instantiation.
KHASH_MAP_INIT_STR(ptr, void *);
/*
 * Hash type.
 */
typedef khash_t(ptr) hash_t;
/*
 * Allocate a new hash.
 */
#define hash_new() kh_init(ptr)
/*
 * Destroy the hash.
 * NOTE(review): kh_destroy frees the internal arrays but (in this fork)
 * not the struct itself -- see the commented-out free(h) in khash.h.
 */
#define hash_free(self) kh_destroy(ptr, self)
/*
 * Hash size.
 */
#define hash_size kh_size
/*
 * Remove all pairs in the hash.
 */
#define hash_clear(self) kh_clear(ptr, self)
/*
 * Iterate hash keys and ptrs, populating
 * `key` and `val`.
 */
#define hash_each(self, block) { \
const char *key; \
void *val; \
for (khiter_t k = kh_begin(self); k < kh_end(self); ++k) { \
if (!kh_exist(self, k)) continue; \
key = kh_key(self, k); \
val = kh_value(self, k); \
block; \
} \
}
/*
 * Iterate hash keys, populating `key`.
 */
#define hash_each_key(self, block) { \
const char *key; \
for (khiter_t k = kh_begin(self); k < kh_end(self); ++k) { \
if (!kh_exist(self, k)) continue; \
key = kh_key(self, k); \
block; \
} \
}
/*
 * Iterate hash ptrs, populating `val`.
 */
#define hash_each_val(self, block) { \
void *val; \
for (khiter_t k = kh_begin(self); k < kh_end(self); ++k) { \
if (!kh_exist(self, k)) continue; \
val = kh_value(self, k); \
block; \
} \
}
// protos
void
hash_set(hash_t *self, char *key, void *val);
void *
hash_get(hash_t *self, char *key);
int
hash_has(hash_t *self, char *key);
void
hash_del(hash_t *self, char *key);
// NOTE(review): hash_clear is also a function-like macro above, so this
// prototype macro-expands into a declaration of kh_clear_ptr rather than
// of a hash_clear function -- confirm this is intentional.
void
hash_clear(hash_t *self);
#endif /* HASH */

View file

@ -0,0 +1,317 @@
/* The MIT License
Copyright (c) 2008, by Attractive Chaos <attractivechaos@aol.co.uk>
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/*
An example:
#include "khash.h"
KHASH_MAP_INIT_INT(32, char)
int main() {
int ret, is_missing;
khiter_t k;
khash_t(32) *h = kh_init(32);
k = kh_put(32, h, 5, &ret);
if (!ret) kh_del(32, h, k);
kh_value(h, k) = 10;
k = kh_get(32, h, 10);
is_missing = (k == kh_end(h));
k = kh_get(32, h, 5);
kh_del(32, h, k);
for (k = kh_begin(h); k != kh_end(h); ++k)
if (kh_exist(h, k)) kh_value(h, k) = 1;
kh_destroy(32, h);
return 0;
}
*/
/*
2008-09-19 (0.2.3):
* Corrected the example
* Improved interfaces
2008-09-11 (0.2.2):
* Improved speed a little in kh_put()
2008-09-10 (0.2.1):
* Added kh_clear()
* Fixed a compiling error
2008-09-02 (0.2.0):
* Changed to token concatenation which increases flexibility.
2008-08-31 (0.1.2):
* Fixed a bug in kh_get(), which has not been tested previously.
2008-08-31 (0.1.1):
* Added destructor
*/
#ifndef __AC_KHASH_H
#define __AC_KHASH_H
/* NOTE(review): vendored third-party header (khash 0.2.x by Attractive
Chaos); the __-prefixed names are reserved identifiers but are kept
as-is for upstream compatibility. */
#define AC_VERSION_KHASH_H "0.2.2"
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
typedef uint32_t khint_t;
typedef khint_t khiter_t;
/* candidate bucket counts: primes, roughly doubling each step */
#define __ac_HASH_PRIME_SIZE 32
static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
{
0ul, 3ul, 11ul, 23ul, 53ul,
97ul, 193ul, 389ul, 769ul, 1543ul,
3079ul, 6151ul, 12289ul, 24593ul, 49157ul,
98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,
3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,
100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,
3221225473ul, 4294967291ul
};
/* Two metadata bits per bucket, 16 buckets packed per uint32_t word:
bit 1 = "empty", bit 0 = "deleted" (a 0xaa fill marks all empty). */
#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
/* grow when occupancy exceeds this load factor */
static const double __ac_HASH_UPPER = 0.77;
/* KHASH_INIT(name, khkey_t, khval_t, kh_is_map, hash, equal) expands to a
complete open-addressing hash table implementation (double hashing, the
2-bit-per-bucket flag scheme above).  NOTE(review): kh_destroy_##name
frees the internal arrays but NOT the struct itself in this fork (see
the commented-out free(h)) -- callers such as hash_free() must free the
struct separately or accept the leak; confirm which is intended. */
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
typedef struct { \
khint_t n_buckets, size, n_occupied, upper_bound; \
uint32_t *flags; \
khkey_t *keys; \
khval_t *vals; \
} kh_##name##_t; \
static inline kh_##name##_t *kh_init_##name() { \
return (kh_##name##_t*)calloc(1,sizeof(kh_##name##_t)); \
} \
static inline void kh_destroy_##name(kh_##name##_t *h) \
{ \
if (h) { \
free(h->keys); \
free(h->flags); \
free(h->vals); \
/*free(h);*/ \
} \
} \
static inline void kh_clear_##name(kh_##name##_t *h) \
{ \
if (h && h->flags) { \
memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \
h->size = h->n_occupied = 0; \
} \
} \
static inline khint_t kh_get_##name(kh_##name##_t *h, khkey_t key) \
{ \
if (h->n_buckets) { \
khint_t inc, k, i, last; \
k = __hash_func(key); i = k % h->n_buckets; \
inc = 1 + k % (h->n_buckets - 1); last = i; \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
else i += inc; \
if (i == last) return h->n_buckets; \
} \
return __ac_iseither(h->flags, i)? h->n_buckets : i; \
} else return 0; \
} \
static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
{ \
uint32_t *new_flags = 0; \
khint_t j = 1; \
{ \
khint_t t = __ac_HASH_PRIME_SIZE - 1; \
while (__ac_prime_list[t] > new_n_buckets) --t; \
new_n_buckets = __ac_prime_list[t+1]; \
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
else { \
new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
if (h->n_buckets < new_n_buckets) { \
h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
if (kh_is_map) \
h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
} \
} \
} \
if (j) { \
for (j = 0; j != h->n_buckets; ++j) { \
if (__ac_iseither(h->flags, j) == 0) { \
khkey_t key = h->keys[j]; \
khval_t val; \
if (kh_is_map) val = h->vals[j]; \
__ac_set_isdel_true(h->flags, j); \
while (1) { \
khint_t inc, k, i; \
k = __hash_func(key); \
i = k % new_n_buckets; \
inc = 1 + k % (new_n_buckets - 1); \
while (!__ac_isempty(new_flags, i)) { \
if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
else i += inc; \
} \
__ac_set_isempty_false(new_flags, i); \
if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
__ac_set_isdel_true(h->flags, i); \
} else { \
h->keys[i] = key; \
if (kh_is_map) h->vals[i] = val; \
break; \
} \
} \
} \
} \
if (h->n_buckets > new_n_buckets) { \
h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
if (kh_is_map) \
h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
} \
free(h->flags); \
h->flags = new_flags; \
h->n_buckets = new_n_buckets; \
h->n_occupied = h->size; \
h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
} \
} \
static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
{ \
khint_t x; \
if (h->n_occupied >= h->upper_bound) { \
if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
else kh_resize_##name(h, h->n_buckets + 1); \
} \
{ \
khint_t inc, k, i, site, last; \
x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
if (__ac_isempty(h->flags, i)) x = i; \
else { \
inc = 1 + k % (h->n_buckets - 1); last = i; \
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
if (__ac_isdel(h->flags, i)) site = i; \
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
else i += inc; \
if (i == last) { x = site; break; } \
} \
if (x == h->n_buckets) { \
if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
else x = i; \
} \
} \
} \
if (__ac_isempty(h->flags, x)) { \
h->keys[x] = key; \
__ac_set_isboth_false(h->flags, x); \
++h->size; ++h->n_occupied; \
*ret = 1; \
} else if (__ac_isdel(h->flags, x)) { \
h->keys[x] = key; \
__ac_set_isboth_false(h->flags, x); \
++h->size; \
*ret = 2; \
} else *ret = 0; \
return x; \
} \
static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \
{ \
if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
__ac_set_isdel_true(h->flags, x); \
--h->size; \
} \
}
/* --- BEGIN OF HASH FUNCTIONS --- */
#define kh_int_hash_func(key) (uint32_t)(key)
#define kh_int_hash_equal(a, b) (a == b)
#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11)
#define kh_int64_hash_equal(a, b) (a == b)
/* X31 string hash: h = h*31 + c over the bytes of s */
static inline khint_t __ac_X31_hash_string(const char *s)
{
khint_t h = *s;
if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
return h;
}
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
/* --- END OF HASH FUNCTIONS --- */
/* Other necessary macros... */
/* Generic dispatch wrappers: `name` selects the KHASH_INIT instantiation. */
#define khash_t(name) kh_##name##_t
#define kh_init(name) kh_init_##name()
#define kh_destroy(name, h) kh_destroy_##name(h)
#define kh_clear(name, h) kh_clear_##name(h)
#define kh_resize(name, h, s) kh_resize_##name(h, s)
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
#define kh_get(name, h, k) kh_get_##name(h, k)
#define kh_del(name, h, k) kh_del_##name(h, k)
/* Iterator helpers: iterate k from kh_begin to kh_end, skipping buckets
where kh_exist is false. */
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
#define kh_key(h, x) ((h)->keys[x])
#define kh_val(h, x) ((h)->vals[x])
#define kh_value(h, x) ((h)->vals[x])
#define kh_begin(h) (khint_t)(0)
#define kh_end(h) ((h)->n_buckets)
#define kh_size(h) ((h)->size)
#define kh_n_buckets(h) ((h)->n_buckets)
/* More convenient interfaces */
#define KHASH_SET_INIT_INT(name) \
KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
#define KHASH_MAP_INIT_INT(name, khval_t) \
KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
#define KHASH_SET_INIT_INT64(name) \
KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
#define KHASH_MAP_INIT_INT64(name, khval_t) \
KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
typedef const char *kh_cstr_t;
#define KHASH_SET_INIT_STR(name) \
KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
#define KHASH_MAP_INIT_STR(name, khval_t) \
KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
#endif /* __AC_KHASH_H */

View file

@ -0,0 +1,529 @@
/* -- lexicon.c
This file reads the lexicon definitions into a chained
hash table and handles the lookups of words in the hash table,
returning definitions in the form of an input symbol and a
standardized text.
Prototype 7H08 (This file was written by Walter Sinclair).
This file is part of pagc.
Copyright (c) 2008 Walter Bruce Sinclair
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/* For pagc-0.4.2 : last revised 2012-05-23 */
#undef DEBUG
//#define DEBUG
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stddef.h>
#include <ctype.h>
#include "pagc_api.h"
/* -- Hash table size should be a prime number -- */
/* 5581, 5953, 6337, 6733, 7561, 7993, 8893, 10333, 10837, 11353, 12421, 12973, 13537, 15913, 18481 */
#define LEXICON_HTABSIZE 7561
#ifdef BUILD_API
#include "pagc_std_api.h"
#endif
/* -- local prototypes -- */
static unsigned calc_hash( char * ) ;
static ENTRY **create_hash_table( ERR_PARAM * ) ;
static int add_dict_entry( ERR_PARAM *, ENTRY ** , char * , int , SYMB , char * ) ;
#ifndef BUILD_API
static char *convert_field( char * , char * ) ;
static int read_lexicon( ERR_PARAM *, ENTRY ** , FILE * ) ;
#endif
LEXICON *lex_init( ERR_PARAM *err_p ) ;
static int append_new_def( ERR_PARAM *, ENTRY * , SYMB , char * , int ) ;
static unsigned elf_hash( char * ) ;
void print_lexicon( ENTRY ** hash_table ) ;
#ifdef BUILD_API
/*
typedef struct LEXICON_s {
ENTRY **hash_table;
ERR_PARAM *err_p;
} LEXICON;
*/
LEXICON *lex_init( ERR_PARAM *err_p )
{
/* Allocate a LEXICON wrapper plus its empty hash table.
err_p : error-report target, stored for later lex_add_entry calls.
Returns NULL on allocation failure (the allocation macro reports
through err_p and returns NULL on our behalf). */
LEXICON *lex;
PAGC_CALLOC_STRUC(lex,LEXICON,1,err_p,NULL);
lex->hash_table = create_hash_table( err_p );
if (lex->hash_table == NULL) {
/* lex_free tolerates the NULL hash_table */
lex_free(lex);
return NULL;
}
lex->err_p = err_p;
return lex;
}
/* Add one definition for `word` to the lexicon.
seq is the caller's 1-based definition order; add_dict_entry expects a
0-based order, hence seq-1.  Returns TRUE on success, FALSE when the
token type is already defined for this word, ERR_FAIL on error. */
int lex_add_entry(LEXICON *lex, int seq, char *word, char *stdword, SYMB token)
{
return add_dict_entry( lex->err_p, lex->hash_table, word, seq-1, token, stdword);
}
/* Free the lexicon wrapper and everything it owns.  Safe on NULL.
Fix: the original ended with `lex = NULL;`, a dead store on the local
parameter that cannot clear the caller's pointer -- removed. */
void lex_free(LEXICON *lex)
{
    if (lex == NULL) return;
    destroy_lexicon(lex->hash_table);
    free(lex);
}
#else
/* ---------------------------------------------------------------------
create_lexicon -
read the lexicon file (and optional gazetteer file) into memory,
chaining off a hash table.
returns a pointer to the hash table, or NULL if error.
called by standard.l (init_stand_process)
calls util.c (open_aux_file) lexicon.c (read_lexicon, create_hash_table)
stdio.h (fclose)
Fix: the original leaked the freshly created hash table (and any
entries read so far) on every error path; each failure now destroys
it before returning NULL.
-----------------------------------------------------------------------*/
ENTRY **create_lexicon( PAGC_GLOBAL *glo_p ,
                        const char *lex_name ,
                        const char *gaz_name ) {
    /* -- called by init_stand_process to read in the Lexicon and set up the
       definitions in memory for hash table access -- */
    FILE *gaz_file ,
         *dict_file ;
    ENTRY **hash_table ;
    if ( ( hash_table = create_hash_table( glo_p -> process_errors ) ) == NULL ) {
        return NULL ;
    }
    /* 2009-08-13 : support multiple lexicons */
    if ( gaz_name != NULL ) {
        if ( ( gaz_file = open_aux_file( glo_p ,
                                         gaz_name ) ) == NULL ) {
            destroy_lexicon( hash_table ) ;
            return NULL ;
        }
        if ( !read_lexicon( glo_p -> process_errors ,
                            hash_table ,
                            gaz_file ) ) {
            fclose( gaz_file ) ;
            destroy_lexicon( hash_table ) ;
            return NULL ;
        }
        fclose( gaz_file ) ;
    }
    if ( ( dict_file = open_aux_file( glo_p ,
                                      lex_name ) ) == NULL ) {
        destroy_lexicon( hash_table ) ;
        return NULL ;
    }
    if ( !read_lexicon( glo_p -> process_errors ,
                        hash_table ,
                        dict_file ) ) {
        fclose( dict_file ) ;
        destroy_lexicon( hash_table ) ;
        return NULL ;
    }
    fclose( dict_file ) ;
    return hash_table ;
}
/* ----------------------------------------------------
read_lexicon -
called by lexicon.c (create_lexicon) for each file
calls convert_field, add_dict_entry
returns FALSE if error encountered; TRUE on normal end of input
stdio.h (fgets,feof,sscanf)
uses macro BLANK_STRING
Each csv record is: order,lookup,token,standard
Fixes: the original ignored the fgets result, did not check sscanf,
and passed unchecked (possibly NULL) convert_field results into the
next convert_field call, dereferencing NULL on truncated records.
Malformed records now terminate the read loop instead.
-------------------------------------------------------*/
static int read_lexicon( ERR_PARAM *err_p ,
                         ENTRY **hash_table ,
                         FILE *CFile ) {
    char record_buffer[ MAXSTRLEN ] ;
    char lookup_str[ MAXTEXT ] ;
    char num_str[ MAXTEXT ] ;
    int cur_token ;
    int num_def ;
    char standard_str[ MAXTEXT ] ;
    char *next_str ;
    while ( !feof( CFile ) ) {
        /* -- read in each line of the csv file and add to hash table -- */
        BLANK_STRING(record_buffer) ;
        if ( fgets( record_buffer ,
                    MAXSTRLEN ,
                    CFile ) == NULL ) {
            break ; /* EOF or read error */
        }
#ifdef SEW_NOT_SURE_IF_WE_NEED_THIS
        /* -- check for and skip over blank lines -- */
        if (strspn(record_buffer, " \t\r\n") == strlen(record_buffer))
            continue;
#endif
        /* -- comma-separated values are handled only as well as necessary
           in the present context -- */
        if ( ( next_str = convert_field( num_str ,
                                         record_buffer ) ) == NULL ) {
            break ;
        }
        if ( sscanf( num_str ,
                     "%d" ,
                     &num_def ) != 1 ) {
            break ; /* non-numeric order field */
        }
        if ( ( next_str = convert_field( lookup_str ,
                                         next_str ) ) == NULL ) {
            break ; /* truncated record: no lookup field */
        }
        if ( ( next_str = convert_field( num_str ,
                                         next_str ) ) == NULL ) {
            break ; /* truncated record: no token field */
        }
        if ( sscanf( num_str ,
                     "%d" ,
                     &cur_token ) != 1 ) {
            break ; /* non-numeric token field */
        }
        /* last field: a NULL return here just means the line ended without
           a trailing delimiter; standard_str has still been filled */
        next_str = convert_field( standard_str ,
                                  next_str ) ;
        if ( add_dict_entry( err_p ,
                             hash_table ,
                             lookup_str ,
                             ( num_def - 1 ) ,
                             cur_token ,
                             standard_str ) == ERR_FAIL ) {
            return FALSE ;
        }
    }
    return TRUE ;
}
/* ----------------------------------------------------
convert_field -
called by lexicon.c (read_lexicon)
Copy the next comma-separated field of `inp` into `buf`, dropping any
double-quote and carriage-return characters along the way.
Returns a pointer just past the field's delimiter (',' or '\n'), or
NULL when the input starts with whitespace or runs out before a
delimiter is seen.
ctype.h (isspace); uses macro BLANK_STRING
-------------------------------------------------------*/
static char *convert_field( char *buf ,
                            char *inp ) {
    char ch ;
    char *out = buf ;
    char *in = inp ;
    BLANK_STRING(out) ;
    /* -- space at the beginning of a line will stop the read -- */
    if ( isspace( *in ) ) {
        return NULL ;
    }
    while ( ( ch = *in++ ) != SENTINEL ) {
        /* -- ignore quotes and carriage returns -- */
        if ( ch == '\"' ||
             ch == '\r' ) {
            continue ;
        }
        /* -- zero terminate at field and record delimiters -- */
        if ( ch == '\n' ||
             ch == ',' ) {
            BLANK_STRING(out) ;
            return in ;
        }
        *out++ = ch ; /* -- copy it -- */
    }
    return NULL ;
}
#endif
/* ----------------------------------------------------
destroy_lexicon -
called by standard.l (close_stand_process)
calls lexicon.c (destroy_def_list)
uses macro FREE_AND_NULL
Free every entry chain, each entry's definitions and lookup string,
then the table itself.  Safe on NULL.
Fix: locals renamed from __i__/__E__/__F__ -- identifiers beginning
with a double underscore are reserved for the implementation.
-------------------------------------------------------*/
void destroy_lexicon(ENTRY ** hash_table)
{
    /* -- called by Clean-Up - */
    unsigned i ;
    ENTRY *e , *next_e ;
    if (hash_table == NULL)
    {
        return ;
    }
    for (i = 0 ; i < LEXICON_HTABSIZE ; i++ )
    {
        /* walk each collision chain, saving Next before freeing */
        for (e = hash_table[i] ; e != NULL ; e = next_e)
        {
            destroy_def_list(e->DefList) ;
            next_e = e->Next ;
            FREE_AND_NULL(e->Lookup) ;
            FREE_AND_NULL(e) ;
        }
    }
    DBG("destroy_lexicon: i=%d", i);
    /* <revision date='2012-05-23'>free hash table</revision> */
    FREE_AND_NULL(hash_table);
    DBG("leaving destroy_lexicon");
}
/* ----------------------------------------------------------
destroy_def_list -
called by destroy_lexicon and tokenize.c (remove_default_defs)
Free a chain of DEF records.  Default definitions (Protect != 0)
have no associated Standard text, so only unprotected entries free
their Standard string.
uses macro FREE_AND_NULL
------------------------------------------------------------*/
void destroy_def_list( DEF *start_def ) {
    DEF *walk = start_def ;
    while ( walk != NULL ) {
        DEF *follow = walk -> Next ;
        /* -- Default definitions have no associated text -- */
        if ( walk -> Protect == 0 ) {
            FREE_AND_NULL( walk -> Standard ) ;
        }
        FREE_AND_NULL( walk ) ;
        walk = follow ;
    }
}
/* ----------------------------------------------------
find_entry -
called by lexicon.c (add_dict_entry)
calls lexicon.c (calc_hash)
string.h (strcmp)
Walk the collision chain of lookup_str's bucket and return the
matching ENTRY, or NULL when the word is not in the lexicon.
Fixes: locals renamed from the reserved __E__/__hash_index__ forms;
the fall-through `return __E__` (always NULL there) is now an
explicit `return NULL`.
-------------------------------------------------------*/
ENTRY *find_entry(ENTRY **hash_table,char *lookup_str)
{
    /* -- called to create a lexeme -- */
    ENTRY *e ;
    unsigned hash_index ; /* -- 2006-11-20 : to return hash table pointer -- */
    hash_index = calc_hash(lookup_str) ;
    for (e = hash_table[hash_index] ; e != NULL ; e = e->Next)
    {
        if (strcmp(lookup_str,e->Lookup) == 0)
        {
            return e ;
        }
    }
    return NULL ;
}
#define US sizeof( unsigned )
/* ----------------------------------------------------
elf_hash -
called by lexicon.c (calc_hash)
Classic ELF-style string hash: shift-accumulate each byte, folding
the top nibble back in so long strings keep mixing.  US is the byte
width of `unsigned` (the shift distances assume the usual 4).
-------------------------------------------------------*/
static unsigned elf_hash( char *key_str ) {
    unsigned hash_val = 0 ;
    unsigned high_bits ;
    char *p ;
    for ( p = key_str ; *p != '\0' ; p++ ) {
        hash_val = ( hash_val << US ) + ( unsigned ) *p ;
        high_bits = hash_val & ( ~ ( ( unsigned )( ~0 ) >> US ) ) ;
        if ( high_bits != 0 ) {
            hash_val ^= high_bits >> ( US * 6 ) ;
        }
        /* no-op when high_bits is zero, so apply unconditionally */
        hash_val &= ~high_bits ;
    }
    return hash_val ;
}
/* ----------------------------------------------------
calc_hash -
called by lexicon.c (find_entry, add_dict_entry)
Map a lookup string onto a bucket index via elf_hash, modulo the
(prime) table size.
-------------------------------------------------------*/
static unsigned calc_hash( char *key_str ) {
    return ( elf_hash( key_str ) % LEXICON_HTABSIZE ) ;
}
/* ----------------------------------------------------
create_hash_table -
allocate and initialize the bucket array in memory.
return NULL if error (the allocation macro reports via err_p).
called by create_lexicon / lex_init
uses macro PAGC_CALLOC_STRUC
Fix: locals renamed from the reserved __i__/__hash_table__ forms.
-------------------------------------------------------*/
static ENTRY **create_hash_table(ERR_PARAM *err_p)
{
    unsigned i ;
    ENTRY **hash_table ;
    PAGC_CALLOC_STRUC(hash_table,ENTRY *,LEXICON_HTABSIZE,err_p,NULL) ;
    /* calloc yields all-bits-zero, which is not formally guaranteed to be
       a null pointer, so set each bucket explicitly */
    for (i = 0 ; i < LEXICON_HTABSIZE ; i++ )
    {
        hash_table[i] = NULL ;
    }
    return hash_table ;
}
/* ----------------------------------------------------
lexicon.c (add_dict_entry)
called by lexicon.c (read_lexicon)
calls lexicon.c (calc_hash, create_def, append_new_def)
uses macro PAGC_ALLOC_STRUC , PAGC_STORE_STR, RET_ERR
Insert (lookup_str -> type t, standard_str, order def_num) into the
lexicon.  A new word is pushed onto the front of its bucket's
collision chain; an existing word gets the definition appended
(append_new_def rejects a duplicate token type with FALSE).
return ERR_FAIL if error, FALSE on duplicate, TRUE on success
-------------------------------------------------------*/
static int add_dict_entry( ERR_PARAM *err_p ,
ENTRY **hash_table ,
char *lookup_str ,
int def_num ,
SYMB t ,
char *standard_str ) {
ENTRY *E ;
E = find_entry( hash_table ,
lookup_str ) ;
if ( E == NULL ) {
unsigned hash_index ;
/* first definition for this word: allocate a fresh ENTRY */
PAGC_ALLOC_STRUC(E,ENTRY,err_p,ERR_FAIL);
/* -- add the Lookup string to the record -- */
PAGC_STORE_STR(E->Lookup,lookup_str,err_p,ERR_FAIL) ;
/* -- add new entry to beginning of table -- */
hash_index = calc_hash( lookup_str ) ;
E -> Next = hash_table[ hash_index ] ; /* -- collision chain -- */
hash_table[ hash_index ] = E ;
if ( ( E -> DefList = create_def( t ,
standard_str ,
def_num ,
FALSE ,
err_p ) ) == NULL ) {
return ERR_FAIL ;
}
} else {
int err_stat ;
/* an ENTRY without any definition indicates a corrupted table */
if ( E -> DefList == NULL ) {
RET_ERR("add_dict_entry: Lexical entry lacks definition" ,
err_p ,
ERR_FAIL ) ;
}
if ( ( err_stat = append_new_def( err_p ,
E ,
t ,
standard_str ,
def_num ) ) != TRUE ) {
return err_stat ;
}
}
return TRUE ;
}
/* ----------------------------------------------------
append_new_def -
called by lexicon.c (add_dict_entry)
calls lexicon.c (create_def)
Append a new definition to an existing entry's list, keeping the
list free of duplicate token types.
returns FALSE if a definition of type t is already present
returns ERR_FAIL on allocation error, TRUE on success
-------------------------------------------------------*/
static int append_new_def( ERR_PARAM *err_p ,
                           ENTRY *E ,
                           SYMB t ,
                           char *text ,
                           int def_num ) {
    DEF *new_def ;
    DEF *tail = NULL ;
    DEF *scan ;
    /* find the list tail, bailing out on a duplicate token type */
    for ( scan = E -> DefList ; scan != NULL ; scan = scan -> Next ) {
        if ( scan -> Type == t ) {
            return FALSE ;
        }
        tail = scan ;
    }
    new_def = create_def( t , text , def_num , FALSE , err_p ) ;
    if ( new_def == NULL ) {
        return ERR_FAIL ;
    }
    if ( tail == NULL ) {
        E -> DefList = new_def ;   /* list was empty */
    } else {
        new_def -> Next = tail -> Next ;
        tail -> Next = new_def ;   /* append at the end */
    }
    return TRUE ;
}
/*--------------------------------------------------------------------
create_def -
called by lexicon.c (append_new_def) tokenize.c (setup_default_defs)
Allocate and fill one DEF record.
PFlag is TRUE for default entries, which carry no Standard text;
lexicon-sourced entries get their own copy of standard_str.
returns NULL for allocation error
uses macro PAGC_ALLOC_STRUC, PAGC_STORE_STR
-------------------------------------------------------------------- */
DEF *create_def ( SYMB s ,
                  char *standard_str ,
                  int def_num ,
                  int PFlag ,
                  ERR_PARAM *err_p ) {
    DEF *new_def ;
    /* -- initialization-time allocation -- */
    PAGC_ALLOC_STRUC(new_def,DEF,err_p,NULL) ;
    new_def -> Type = s ;
    new_def -> Order = def_num ;
    new_def -> Next = NULL ;
    /* -- False for definitions from lexicon, true for defaults -- */
    new_def -> Protect = PFlag ;
    if ( PFlag ) {
        new_def -> Standard = NULL ;
    } else {
        /* -- initialization-time allocation -- */
        PAGC_STORE_STR(new_def->Standard,standard_str,err_p,NULL) ;
    }
    return new_def ;
}
/*--------------------------------------------------------------------
print_lexicon -
not called; useful for debugging. Prints every entry and its
definition list to stdout.
Fix: protected (default) definitions have a NULL Standard pointer
(see create_def); passing NULL to printf's %s is undefined behavior,
so substitute a visible placeholder.
--------------------------------------------------------------------*/
void print_lexicon( ENTRY ** hash_table )
{
    unsigned i;
    ENTRY *E;
    if (!hash_table) return;
    for (i=0; i< LEXICON_HTABSIZE; i++)
    {
        E = hash_table[i];
        while (E)
        {
            DEF *D = E->DefList;
            printf("'%s'\n", E->Lookup);
            while (D)
            {
                printf(" %d, %d, %d, '%s'\n", D->Order, D->Type, D->Protect,
                       D->Standard ? D->Standard : "(null)");
                D = D->Next;
            }
            E = E->Next;
        }
    }
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,110 @@
drop table if exists addresses cascade;
create table addresses (
id serial not null primary key,
micro text,
macro text
);
copy addresses (micro,macro) from stdin;
1017 LINWOOD AVE APT 12 ST PAUL,MN,55105
1029 ATLANTIC ST APT 302 ST PAUL,MN,55106
1032 PORTLAND AVE ST PAUL,MN,55104
1093 EDGERTON ST FL 2 ST PAUL,MN,55130
111 KELLOGG BLVD E APT 3210 ST PAUL,MN,55101
1113 HAWTHORNE AVE E ST PAUL,MN,55106
1120 BARCLAY ST APT 4 ST PAUL,MN,55106
1137 CHURCHILL ST ST PAUL,MN,55103
1140 GALTIER ST ST PAUL,MN,55117
1147 MINNEHAHA AVE W ST PAUL,MN,55104
1167 BURNQUIST ST ST PAUL,MN,55106
1168 SUPORNICK LN APT A ST PAUL,MN,55106
1169 RANDOLPH AVE ST PAUL,MN,55105
1223 BERKELEY AVE ST PAUL,MN,55105
125 MAGNOLIA AVE E ST PAUL,MN,55117
1263 RICE ST ST PAUL,MN,55117
1305 CONWAY ST ST PAUL,MN,55106
1341 CASE AVE ST PAUL,MN,55106
1345 CASE AVE ST PAUL,MN,55106
1351 CONWAY ST APT 10 ST PAUL,MN,55106
1363 OSCEOLA AVE ST PAUL,MN,55105
1377 MAYNARD DR W APT 168 ST PAUL,MN,55116
1379 MAYNARD DR W APT 176 ST PAUL,MN,55116
1379 MAYNARD DR W APT 177 ST PAUL,MN,55116
1388 BARCLAY ST ST PAUL,MN,55106
1390 ARONA ST ST PAUL,MN,55108
1435 GRAND AVE APT 2 ST PAUL,MN,55105
1484 ASHLAND AVE UNIT 101 ST PAUL,MN,55101
1509 SUMMIT AVE ST PAUL,MN,55106
1548 VAN BUREN AVE ST PAUL,MN,55104
1561 WHEELOCK LN UNIT 303 ST PAUL,MN,55117
1621 ASHLAND AVE APT UPPER ST PAUL,MN,55104
1636 WOODBRIDGE ST ST PAUL,MN,55117
1644 DAYTON AVE APT 3 ST PAUL,MN,55104
1709 LEONE AVE ST PAUL,MN,55106
1743 HIGHLAND PKWY ST PAUL,MN,55116
1776 ST CLAIR AVE APT 107 ST PAUL,MN,55105
1776 ST CLAIR AVE APT 205 ST PAUL,MN,55105
1821 UNIVERSITY AVE W RM 336 ST PAUL,MN,55104
1877 GOODRICH AVE APT LOWER ST PAUL,MN,55105
1898 LACROSSE AVE ST PAUL,MN,55119
1899 BEECHWOOD AVE ST PAUL,MN,55116
1915 MARSHALL AVE APT 2 ST PAUL,MN,55104
1940 NEVADA AVE E ST PAUL,MN,55119
1944 NOKOMIS AVE ST PAUL,MN,55119
1962 SAUNDERS AVE ST PAUL,MN,55116
1968 NEBRASKA AVE E ST PAUL,MN,55119
1971 SARGENT AVE ST PAUL,MN,55105
204 CONGRESS ST E APT D ST PAUL,MN,55107
2085 GRAND AVE APT 203 ST PAUL,MN,55101
21 BATTLE CREEK PL ST PAUL,MN,55119
2174 ELEANOR AVE ST PAUL,MN,55116
2224 MAILAND RD ST PAUL,MN,55119
2272 BENSON AVE UNIT C ST PAUL,MN,55116
2285 BENSON AVE ST PAUL,MN,55116
233 CONCORD ST ST PAUL,MN,55107
235 MCKNIGHT RD S APT B4 ST PAUL,MN,55119
2360 BUFORD AVE ST PAUL,MN,55108
256 POINT DOUGLAS RD N ST PAUL,MN,55106
261 UNIVERSITY AVE E APT 303 ST PAUL,MN,55130
2706 GANNON RD ST PAUL,MN,55116
289 5TH ST E UNIT 309 ST PAUL,MN,55101
303 WILDER ST N FL 1 ST PAUL,MN,55104
317 ROBIE ST E ST PAUL,MN,55107
333 SMITH AVE N ST PAUL,MN,55102
377 HAMLINE AVE S ST PAUL,MN,55105
418 MARYLAND AVE W APT 204 ST PAUL,MN,55117
444 CLINTON AVE ST PAUL,MN,55107
444 FRY ST ST PAUL,MN,55104
536 THOMAS AVE ST PAUL,MN,55103
544 SIMPSON ST ST PAUL,MN,55104
56 IRVINE PARK ST PAUL,MN,55102
597 BLAIR AVE APT 5 ST PAUL,MN,55103
602 HUMBOLDT AVE ST PAUL,MN,55107
605 CAPITOL BLVD APT B ST PAUL,MN,55103
617 ROSE AVE E ST PAUL,MN,55130
635 WESTERN AVE N ST PAUL,MN,55103
660 5TH ST E APT DOWNST ST PAUL,MN,55106
672 UNIVERSITY AVE W ST PAUL,MN,55104
675 WHEELOCK PKWY W ST PAUL,MN,55117
70 IOWA AVE W ST PAUL,MN,55117
711 MARSHALL AVE ST PAUL,MN,55104
712 SNELLING AVE N APT 1 ST PAUL,MN,55104
715 CALIFORNIA AVE E ST PAUL,MN,55106
715 MARSHALL AVE ST PAUL,MN,55104
735 LINCOLN AVE ST PAUL,MN,55105
750 BLAIR AVE ST PAUL,MN,55104
754 BLAIR AVE FL 1 ST PAUL,MN,55104
771 BUTTERNUT AVE ST PAUL,MN,55102
812 7TH ST E ST PAUL,MN,55106
83 CALIFORNIA AVE W APT 206 ST PAUL,MN,55117
838 LAFOND AVE ST PAUL,MN,55104
852 HOLLY AVE ST PAUL,MN,55104
859 OSCEOLA AVE APT 1 ST PAUL,MN,55105
885 CASE AVE ST PAUL,MN,55106
927 WAKEFIELD AVE ST PAUL,MN,55106
93 MARIA AVE ST PAUL,MN,55106
935 HYACINTH AVE E ST PAUL,MN,55106
94 FRONT AVE ST PAUL,MN,55117
953 HYACINTH AVE E ST PAUL,MN,55106
\.

View file

@ -0,0 +1,50 @@
#!/usr/bin/perl -w
use strict;
use Regexp::Assemble;

# Generate parseaddress-stcities.h : for each state, an assembled regex
# matching that state's city names. Input is tab-separated "ST<TAB>City".
my @cities = split(/[\r\n]+/, qx(cat usps-st-city-name.txt));
my %st = ();
for my $x (@cities) {
    my ($st, $ct) = split(/\t/, $x);
    push @{$st{$st}}, $ct;
}
my $re;
my %re = ();
for my $x (sort keys %st) {
    # Use a fresh assembler per state: Regexp::Assemble::add()
    # accumulates patterns, so reusing one object (as the original
    # code did) leaked every earlier state's cities into each later
    # state's regex.
    my $ra = Regexp::Assemble->new(flags => "i");
    $ra->add(@{$st{$x}});
    $re = $ra->re;
    $re =~ s/\\/\\\\/g;    # escape backslashes for embedding in C strings
    $re{$x} = $re;
}
print "#define NUM_STATES " . scalar (keys %re) . "\n\n";
print " static const char *states[NUM_STATES] = \n";
print " {\"" . join('","', sort keys %re) . "\"};\n\n";
print " static const char *stcities[NUM_STATES] = {\n";
my $cnt = 0;
my $a = '';
my $b = '';
for my $x (sort keys %re) {
    $re = "(?:\\\\b)($re{$x})\$";
    print " ,\n" if $cnt;
    print " /* -- $x -- $x -- $x -- $x -- $x -- $x -- $x -- $x -- $x -- $x -- */\n";
    # Emit the regex as C string literals in chunks of <= 65 chars,
    # carrying any trailing run of backslashes over to the next chunk
    # so an escape sequence is never split across two literals.
    while ($re =~ s/^(.{1,65})//) {
        $a = $1;
        if ($a =~ s/(\\+)$//) {
            print " \"$b$a\"\n";
            $b = $1;
        }
        else {
            print " \"$b$a\"\n";
            $b = '';
        }
    }
    $cnt++;
}
print " };\n";

View file

@ -0,0 +1,31 @@
#!/usr/bin/perl -w
use strict;

# mk-sql.pl - rewrite a .sql template to suit a specific PostgreSQL
# version string (as produced by "select version()").
sub Usage {
    print "Usage: mk-sql.pl pgver file-in.sql\n";
    exit 1;
}
my $ver = shift @ARGV || Usage();
my $fin = shift @ARGV || Usage();
my $nver;
if ($ver =~ /^PostgreSQL (\d+)\.(\d+)/) {
    # encode major.minor as one comparable integer, e.g. 9.1 -> 901
    $nver = $1 * 100 + $2;
}
else {
    # fixed: the message previously ended with a stray "/\n"
    die "Failed to parse '$ver' as /^PostgreSQL (\\d+)\\.(\\d+)/\n";
}
open(IN, $fin) || die "Failed to open file '$fin' : $!\n";
while (my $x = <IN>) {
    if ($nver >= 804) {
        # 8.4+ : module libraries are no longer prefixed with "lib"
        $x =~ s/\$libdir\/lib/\$libdir\//;
    }
    if ($nver < 901) {
        # \echo is only honored by 9.1+ extension scripts; comment it out
        $x =~ s/^\\echo/--\\echo/;
    }
    print $x;
}
close(IN);

View file

@ -0,0 +1,734 @@
#!/usr/bin/perl -w
use strict;
use Regexp::Assemble;
# TODO
# Add prefix types like:
# Ave(nue)? of( the)? names
# Ave(nue)? (d'|du|de)(la)?\s?names
# Ave(nue|nida)? \w{1,2}
# calle names
# suffix of( the)? names
# route (\d+([a-z]|bus(iness)?)(by(pass))?
# business (\d+([a-z]|bus(iness)?)(by(pass))?
# (interstate|I-) \d+\s*[nsew]?
#
# Add better number recognizer
# dir num dir num dir
# dir num letter
# num? fraction 123 1/2
#
# Add patterns to recognize intersections
# street & street, city, state
#
#
# Probably the following were removed as they are PREFIX TYPES
# RTE, ROUTE
# CALLE
#
# and maybe RUE
# RUE can be either: RUE d'la whatever; Charles Rue
#
# Many of the SUFFIX TYPES can be used in a prefix contexted like:
# AVENUE of the Americas
#
#my @cities = split(/[\r\n]+/, qx(cat usps-city-names.txt));
# ==============================
my @stwords = qw(
ALLEE
ALLEY
ALLY
ALY
ANEX
ANNEX
ANNX
ANX
ARC
ARCADE
AV
AVE
AVEN
AVENU
AVENUE
AVN
AVNUE
BAYOO
BAYOU
BCH
BEACH
BEND
BG
BGS
BLF
BLFS
BLUF
BLUFF
BLUFFS
BLVD
BND
BOT
BOTTM
BOTTOM
BOUL
BOULEVARD
BOULV
BR
BRANCH
BRDGE
BRG
BRIDGE
BRK
BRKS
BRNCH
BROOK
BROOKS
BTM
BURG
BURGS
BYP
BYPA
BYPAS
BYPASS
BYPS
BYU
CAMP
CANYN
CANYON
CAPE
CAUSEWAY
CAUSWAY
CEN
CENT
CENTER
CENTERS
CENTR
CENTRE
CIR
CIRC
CIRCL
CIRCLE
CIRCLES
CIRS
CK
CLB
CLF
CLFS
CLIFF
CLIFFS
CLUB
CMN
CMP
CNTER
CNTR
CNYN
COMMON
COR
CORNER
CORNERS
CORS
COURSE
COURT
COURTS
COVE
COVES
CP
CPE
CR
CRCL
CRCLE
CRECENT
CREEK
CRES
CRESCENT
CRESENT
CREST
CRK
CROSSING
CROSSROAD
CRSCNT
CRSE
CRSENT
CRSNT
CRSSING
CRSSNG
CRST
CRT
CSWY
CT
CTR
CTRS
CTS
CURV
CURVE
CV
CVS
CYN
DALE
DAM
DIV
DIVIDE
DL
DM
DR
DRIV
DRIVE
DRIVES
DRS
DRV
DV
DVD
EST
ESTATE
ESTATES
ESTS
EXP
EXPR
EXPRESS
EXPRESSWAY
EXPW
EXPY
EXT
EXTENSION
EXTENSIONS
EXTN
EXTNSN
EXTS
FALL
FALLS
FERRY
FIELD
FIELDS
FLAT
FLATS
FLD
FLDS
FLS
FLT
FLTS
FORD
FORDS
FOREST
FORESTS
FORG
FORGE
FORGES
FORK
FORKS
FORT
FRD
FRDS
FREEWAY
FREEWY
FRG
FRGS
FRK
FRKS
FRRY
FRST
FRT
FRWAY
FRWY
FRY
FT
FWY
GARDEN
GARDENS
GARDN
GATEWAY
GATEWY
GATWAY
GDN
GDNS
GLEN
GLENS
GLN
GLNS
GRDEN
GRDN
GRDNS
GREEN
GREENS
GRN
GRNS
GROV
GROVE
GROVES
GRV
GRVS
GTWAY
GTWY
HARB
HARBOR
HARBORS
HARBR
HAVEN
HAVN
HBR
HBRS
HEIGHT
HEIGHTS
HGTS
HIGHWAY
HIGHWY
HILL
HILLS
HIWAY
HIWY
HL
HLLW
HLS
HOLLOW
HOLLOWS
HOLW
HOLWS
HRBOR
HT
HTS
HVN
HWAY
HWY
INLET
INLT
IS
ISLAND
ISLANDS
ISLE
ISLES
ISLND
ISLNDS
ISS
JCT
JCTION
JCTN
JCTNS
JCTS
JUNCTION
JUNCTIONS
JUNCTN
JUNCTON
KEY
KEYS
KNL
KNLS
KNOL
KNOLL
KNOLLS
KY
KYS
LA
LAKE
LAKES
LAND
LANDING
LANE
LANES
LCK
LCKS
LDG
LDGE
LF
LGT
LGTS
LIGHT
LIGHTS
LINE
LK
LKS
LN
LNDG
LNDNG
LOAF
LOCK
LOCKS
LODG
LODGE
LOOP
LOOPS
MALL
MANOR
MANORS
MDW
MDWS
MEADOW
MEADOWS
MEDOWS
MEWS
MILL
MILLS
MISSION
MISSN
ML
MLS
MNR
MNRS
MNT
MNTAIN
MNTN
MNTNS
MOTORWAY
MOUNT
MOUNTAIN
MOUNTAINS
MOUNTIN
MSN
MSSN
MT
MTIN
MTN
MTNS
MTWY
NCK
NECK
OPAS
ORCH
ORCHARD
ORCHRD
OVAL
OVERPASS
OVL
PARK
PARKS
PARKWAY
PARKWAYS
PARKWY
PASS
PASSAGE
PATH
PATHS
PIKE
PIKES
PINE
PINES
PK
PKWAY
PKWY
PKWYS
PKY
PL
PLACE
PLAIN
PLAINES
PLAINS
PLAZA
PLN
PLNS
PLZ
PLZA
PNE
PNES
POINT
POINTS
PORT
PORTS
PR
PRAIRIE
PRARIE
PRK
PRR
PRT
PRTS
PSGE
PT
PTS
RAD
RADIAL
RADIEL
RADL
RAMP
RANCH
RANCHES
RAPID
RAPIDS
RD
RDG
RDGE
RDGS
RDS
REST
RIDGE
RIDGES
RIV
RIVER
RIVR
RNCH
RNCHS
ROAD
ROADS
ROW
RPD
RPDS
RST
RUE
RUN
RVR
SHL
SHLS
SHOAL
SHOALS
SHOAR
SHOARS
SHORE
SHORES
SHR
SHRS
SKWY
SKYWAY
SMT
SPG
SPGS
SPNG
SPNGS
SPRING
SPRINGS
SPRNG
SPRNGS
SPUR
SPURS
SQ
SQR
SQRE
SQRS
SQS
SQU
SQUARE
SQUARES
ST
STA
STATION
STATN
STN
STR
STRA
STRAV
STRAVE
STRAVEN
STRAVENUE
STRAVN
STREAM
STREET
STREETS
STREME
STRM
STRT
STRVN
STRVNUE
STS
SUMIT
SUMITT
SUMMIT
TER
TERR
TERRACE
THROUGHWAY
TPK
TPKE
TR
TRACE
TRACES
TRACK
TRACKS
TRAFFICWAY
TRAIL
TRAILS
TRAK
TRCE
TRFY
TRK
TRKS
TRL
TRLS
TRNPK
TRPK
TRWY
TUNEL
TUNL
TUNLS
TUNNEL
TUNNELS
TUNNL
TURNPIKE
TURNPK
UN
UNDERPASS
UNION
UNIONS
UNS
UPAS
VALLEY
VALLEYS
VALLY
VDCT
VIA
VIADCT
VIADUCT
VIEW
VIEWS
VILL
VILLAG
VILLAGE
VILLAGES
VILLE
VILLG
VILLIAGE
VIS
VIST
VISTA
VL
VLG
VLGS
VLLY
VLY
VLYS
VST
VSTA
VW
VWS
WALK
WALKS
WALL
WAY
WAYS
WELL
WELLS
WL
WLS
WY
XING
XRD
);
# ==============================
my @secwords = qw(
APARTMENT
APT
BASEMENT
BLDG
BSMT
BUILDING
DEPARTMENT
DEPT
FL
FLOOR
FRNT
FRONT
HANGAR
HNGR
LBBY
LOBBY
LOT
LOWER
LOWR
OFC
OFFICE
PENTHOUSE
PH
PIER
REAR
RM
ROOM
SIDE
SLIP
SPACE
SPC
STE
STOP
SUITE
TRAILER
TRLR
UNIT
UPPER
UPPR
);
my @dirs = qw(
NORTH N NORD
SOUTH S SUD
EAST E EST
WEST W OEST O
NORTHEAST NE
NORTHWEST NW
SOUTHEAST SE
SOUTHWEST SW
NORTH-EAST N-E
NORTH-WEST N-W
SOUTH-EAST S-E
SOUTH-WEST S-W
);
my @saints = (
"st",
"st.",
"ste",
"ste.",
"saint",
);
my $re;
my $l = Regexp::Assemble->new(flags => "i");
# NOTE(review): a single Regexp::Assemble object is reused for all of the
# add() calls below, and add() accumulates patterns. As written, $unittype
# also matches street types, $dirs matches both of those, and $saint
# matches everything added before it. Confirm this cumulative behavior is
# intended before changing it.
#$re = $l->set(modifiers=>'i')->list2re(@cities);
#$re =~ s/\\/\\\\/g;
#my $cities = $re;
#print " static const char *cities = \n";
#while ($re =~ s/^(.{1,75})//) {
# print " \"$1\"\n";
#}
#print " ;\n";
$l->add(@stwords);
$re = $l->re;
$re =~ s/\\/\\\\/g;
$re =~ s/\?\^/?-xism/g;
my $sttype = $re;
#print " static const char *sttype = \"$re\";\n\n";
$l->add(@secwords);
$re = $l->re;
$re =~ s/\\/\\\\/g;
$re =~ s/\?\^/?-xism/g;
my $unittype = $re;
#print " static const char *unittype = \"$re\";\n\n";
$l->add(@dirs);
$re = $l->re;
$re =~ s/\\/\\\\/g;
$re =~ s/\?\^/?-xism/g;
my $dirs = $re;
#print " static const char *dirtype = \"$re\";\n\n";
$l->add(@saints);
$re = $l->re;
$re =~ s/\\/\\\\/g;
$re =~ s/\?\^/?-xism/g;
my $saint = $re;
#print " static const char *saints = \"$re\";\n\n";
# building blocks for the address-tail patterns below (double-escaped
# because they are emitted into C string literals)
my $word = "\\\\w+";
my $words = "($word(\\\\s$word)*)";
my @reg = ();
#push @reg, "(?:,\\\\s*)([^,]+)\$";
#push @reg, "\\\\b($cities)\$";
push @reg, "(?:\\\\b$sttype\\\\s(?:$dirs\\\\s))($dirs\\\\s$words)\$";
push @reg, "(?:\\\\b$sttype\\\\s(?:$dirs\\\\s))($dirs\\\\s$saint\\\\s$words)\$";
push @reg, "(?:\\\\b$sttype\\\\s)($dirs\\\\s$saint\\\\s$words)\$";
push @reg, "(?:\\\\b$sttype\\\\s)($saint\\\\s$words)\$";
push @reg, "(?:\\\\b$sttype\\\\s)($dirs\\\\s$words)\$";
push @reg, "(?:\\\\b$sttype\\\\s)($words)\$";
push @reg, "(?:\\\\s)($dirs\\\\s$words)\$";
push @reg, "^(?:\\\\d+\\\\s(?:(?:\\\\w+\\\\s)$sttype))()\$";
push @reg, "^(?:\\\\d+\\\\s(?:(?:\\\\w+\\\\s)*\\\\w+\\\\s))($word)\$";
# emit the table of regexes as a C array
my $nn = scalar @reg;
print " const int nreg = $nn;\n";
print " static const char *t_regx[$nn] = {\n \"";
print join("\",\n \"", @reg);
print "\"\n };\n";

View file

@ -0,0 +1,71 @@
#!/usr/bin/perl -w
use strict;

# pagc-data-psql : emit a psql script that recreates and loads one of
# the PAGC data tables (lexicon, gazetteer, or rules) from a CSV file.
sub Usage {
    die "Usage: pagc-data-psql [lex|gaz|rules] file\n";
}
my $mode = shift @ARGV || Usage();
my $file = shift @ARGV || Usage();
open(IN, $file) || die "Failed to open '$file' : $!\n";
if ($mode eq 'lex' or $mode eq 'gaz') {
    # lex and gaz share the same schema; only the table name differs
    print <<EOF;
drop table if exists $mode cascade;
create table $mode (
id serial not null primary key,
seq integer,
word text,
stdword text,
token integer
);
copy $mode (seq, word, token, stdword) from stdin;
EOF
    while (my $x = <IN>) {
        # strip quotes and line endings, turn CSV commas into tabs
        $x =~ s/["\r\n]//g;
        $x =~ s/,/\t/g;
        print "$x\n";
    }
    print "\\.\n";
}
elsif ($mode eq 'rules') {
    print <<EOF;
drop table if exists rules cascade;
create table rules (
id serial not null primary key,
rule text
);
copy rules (rule) from stdin;
EOF
    while (my $x = <IN>) {
        # rules keep their commas; only quotes and line endings go
        $x =~ s/["\r\n]//g;
        print "$x\n";
    }
    print "\\.\n";
}
else {
    Usage();
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,118 @@
/*-- pagc_common.h --
Certain common definitions used both by the pagc library and its clients
Prototype 20H10 (This file was written by Walter Sinclair).
This file is part of PAGC.
Copyright (c) 2010 Walter Bruce Sinclair
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/* For pagc-0.4.0 : last revised 2010-11-08 */
#ifndef PAGC_COM_H
#define PAGC_COM_H
/* maximum pathname buffer size: system limit when available */
#ifdef MAXPATHLEN
#define PATHNAME_LEN MAXPATHLEN
#else
#define PATHNAME_LEN 1024
#endif
/* -- 2006-04-25 : structure added to index arc endpoints -- */
typedef struct pagc_point {
double X ;
double Y ;
} PAGC_POINT ;
/* integer token/symbol code used throughout the standardizer */
typedef int SYMB ;
/* -- common return codes -- */
#define ERR_FAIL -2
#define FAIL -1
#define NULL_READ 0
#define MATCH_READ 2
#define BOTH 2
/*------------------------------------
strategy types
------------------------------------*/
#define ADDRESS_SCORING 0
#define INTERSECTION_SCORING 1
#define LANDMARK_SCORING 3
#define SITE_MATCH 0
#define SITE_INTERPOLATE 1
#define INTERSECTION 2
#define ADDRESS_RANGE_2 3
#define ADDRESS_RANGE_4 4
#define REVERSE_SITE 5
#define REVERSE_INTERSECTION 6
#define INTERSECTION_B 7
#define CONCAT 8
#define LANDMARK_NAME 9
/*----------------------------------
response format types :
------------------------------------*/
#define CSV 0
#define JSON 1
#define XML 2
/* -- build flags -- */
#define STATISTICS 2 /* -- output statistics on rules used. FLSTATS in schema-- */
#define PRINT_PROGRESS 128 /* output 10% completion points */
#define LOG_COMPLETE 2048 /* log certain initializations when complete */
#define ZERO_IS_BLANK 512 /* schema: FLZBLNK */
#define RNF_PRETYPE_REDIRECT 4096 /* schema: FLRNFRE */
/* string terminator used throughout PAGC */
#define SENTINEL '\0'
/* truncate STR to the empty string (STR must be a char pointer) */
#define BLANK_STRING(STR) *STR = SENTINEL
#define MAXSTRLEN 256
/* -- boolean -- */
#ifndef TRUE
#define TRUE 1
#define FALSE 0
#endif
/* -- file access modes -- */
#define READ_ONLY_MODE 0
#define WRITE_CREATE_MODE 1
#define WRITE_APPEND_MODE 2
#define PAGE_SIZE 4096
#define MAX_REF_CANDS 100
/* number of concurrent standardizer contexts (1 unless threaded build) */
#ifdef ENABLE_THREADED
#define MAX_CONTEXTS 20
#else
#define MAX_CONTEXTS 1
#endif
/* -- path-character helpers; several expect a local variable named
global_path_separator to be in scope at the point of use -- */
#define BACK_SLASH 0x5c
#define FORE_SLASH '/'
#define IS_DOT(CH) ( CH == '.' )
#define IS_DIR_SEP(CH) ( CH == global_path_separator )
#define IS_COLON(CH) ( CH == ':' )
#define NOT_PATH_DELIMITOR(CH) \
( CH != global_path_separator ) && \
( !IS_COLON(CH) )
#define IS_PATH_DELIMITOR(CH) \
( IS_DIR_SEP(CH) || \
IS_COLON(CH) )
/* append S to D with a comma divider, bounded by L (see char_append) */
#define COMMA_APPEND_WITH_LEN( D , S , L ) \
char_append( "," , D , S , L )
#endif

View file

@ -0,0 +1,75 @@
#ifndef PAGC_STD_API_H
#define PAGC_STD_API_H
#define BUILD_API
/* A lexicon (or gazetteer): hash table of ENTRY lists plus error sink. */
typedef struct LEXICON_s {
ENTRY **hash_table;
ERR_PARAM *err_p;
} LEXICON;
/* Rule set being assembled; "ready" is set once rules_ready() succeeds. */
typedef struct RULES_s {
int ready;
int rule_number;
int last_node;
RULE_PARAM *r_p;
ERR_PARAM *err_p;
NODE **Trie;
SYMB *rule_end ;
SYMB *r ;
} RULES;
/* An initialized standardizer: global PAGC state plus working context. */
typedef struct STANDARDIZER_s {
PAGC_GLOBAL *pagc_p;
STAND_PARAM *misc_stand;
ERR_PARAM *err_p;
} STANDARDIZER;
/* One standardized address; all fields are heap strings (may be NULL).
   Free with stdaddr_free(). */
typedef struct STDADDR_s {  // define as required
char *building;
char *house_num;
char *predir;
char *qual;
char *pretype;
char *name;
char *suftype;
char *sufdir;
char *ruralroute;
char *extra;
char *city;
char *state;
char *country;
char *postcode;
char *box;
char *unit;
} STDADDR;
/* -- lexicon construction / teardown -- */
LEXICON * lex_init();
int lex_add_entry(LEXICON *lex, int seq, char *word, char
*stdword, SYMB token);
void lex_free(LEXICON *lex);
/* -- rule construction / teardown -- */
RULES *rules_init();
int rules_add_rule(RULES *rules, int num, int *rule);
int rules_add_rule_from_str(RULES *rules, char *rule);
int rules_ready(RULES *rules);
void rules_free(RULES *rules);
/* -- standardizer lifecycle: init, attach lex/gaz/rules, ready, free -- */
STANDARDIZER *std_init();
int std_use_lex(STANDARDIZER *std, LEXICON *lex);
int std_use_gaz(STANDARDIZER *std, LEXICON *gaz);
int std_use_rules(STANDARDIZER *std, RULES *rules);
int std_ready_standardizer(STANDARDIZER *std);
void std_free(STANDARDIZER *std);
/* -- standardization entry points; results must be stdaddr_free()d -- */
STDADDR *std_standardize_one(STANDARDIZER *std, char *address_one_line, int options);
STDADDR *std_standardize_mm(STANDARDIZER *std, char *micro, char *macro, int options);
STDADDR *std_standardize(STANDARDIZER *std, char *address, char *city, char *state, char *postcode, char *country, int options);
void stdaddr_free(STDADDR *stdaddr);
void print_stdaddr(STDADDR *stdaddr);
#endif

View file

@ -0,0 +1,441 @@
/* -- pagc_tools.c
Various and miscellaneous functions.
Prototype 20H10 (This file was written by Walter Sinclair).
This file is part of PAGC.
Copyright (c) 2010 Walter Bruce Sinclair
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/* For pagc-0.4.0 : last revised 2010-11-25 */
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <unistd.h>
#include "pagc_common.h"
#include "pagc_tools.h"
#ifndef MAXOUTSYM
#define MAXOUTSYM 18
#endif
#ifdef MSYS_POSIX
static void conform_directory_separator( char * ) ;
#endif
/* printable names for output (postal-field) symbols; indexed directly
   by symbol value in out_symb_name/out_symb_value */
static const char *OutSymbNames[] = {
"BLDNG",
"HOUSE",
"PREDIR",
"QUALIF",
"PRETYP",
"STREET",
"SUFTYP",
"SUFDIR",
"RR",
"UNKNWN",
"CITY",
"PROV",
"NATION",
"POSTAL",
"BOXH",
"BOXT",
"UNITH",
"UNITT"
} ;
/* printable names for input token symbols; indexed directly by symbol
   value in in_symb_name */
static const char *InSymbNames[] = {
"NUMBER",
"WORD",
"TYPE",
"QUALIF",
"PRETYP",
"STREET",
"ROAD",
"STOPWORD",
"RR",
"DASH",
"CITY",
"PROV",
"NATION",
"AMPERS",
"BOXH",
"ORD",
"UNITH",
"UNITT",
"SINGLE",
"BUILDH",
"MILE",
"DOUBLE",
"DIRECT",
"MIXED",
"BUILDT",
"FRACT",
"PCT",
"PCH",
"QUINT",
"QUAD",
} ;
/* ------------------------------------------------------------
ISO 8859 character set may pop up in some files. After 1998
TigerLine will use them.
------------------------------------------------------------- */
void convert_latin_one ( char *inp ) {
/* Fold ISO-8859-1 (Latin-1) accented letters in inp to their ASCII
   base letters, in place, then append '\n' and a new terminator.
   NOTE(review): this writes two bytes past the original terminator --
   the caller's buffer must have at least two spare bytes. */
unsigned char *str ;
for ( str = ( unsigned char * ) inp ;
*str != SENTINEL ;
str++ ) {
unsigned char ch ;
ch = *str ;
/* -------------------------------------------
if bit 7 is set, reset bit 5 so both upper
and lower case can be done together
--------------------------------------------- */
if ( ch & 0x80 ) {
ch &= 0xDF ;
/* -----------------------------------------
reduce letters with diacritical marks to
their unmarked base letters
------------------------------------------ */
if ( ch >= 0xC0 &&
ch <= 0xC6 )
ch = 'A' ;
else if ( ch == 0xc7 )
ch = 'C' ;
else if ( ch >= 0xc8 && ch <= 0xcb )
ch = 'E' ;
else if ( ch >= 0xcc && ch <= 0xcf )
ch = 'I' ;
else if ( ch == 0xd0 )
ch = 'D' ;
else if ( ch == 0xd1 )
ch = 'N' ;
else if ( ch >= 0xd2 && ch <= 0xd6 )
ch = 'O' ;
else if ( ch >= 0xd9 && ch <= 0xdc )
ch = 'U' ;
else if ( ch >= 0xdd && ch < 0xdf )
ch = 'Y' ;
else
/* -------------------------------
just clear the top bit so it
won't gum up the edit distance
machinery
-------------------------------- */
ch &= 0x7f ;
}
*str = ch ;
}
/* ----------------------------------------------
while we're at it, add a newline to the end
because the lexical scanner likes it like that
----------------------------------------------- */
*str++ = '\n' ;
*str = SENTINEL ;
}
/* Append src to dest (bounded by max_wid), inserting the divider
   string div first unless dest is still empty. Empty src is a no-op. */
void char_append( const char *div ,
char *dest ,
const char *src ,
int max_wid ) {
    if ( *src == SENTINEL ) {
        return ;
    }
    /* a non-empty destination gets the divider before the new text */
    if ( *dest != SENTINEL ) {
        append_string_to_max( dest , ( char * ) div , max_wid ) ;
    }
    append_string_to_max( dest , ( char * ) src , max_wid ) ;
}
/* Map an output-symbol index to its printable name (no bounds check). */
const char *out_symb_name( int i ) {
    return OutSymbNames[ i ] ;
}
/* Map an input-token symbol index to its printable name (no bounds check). */
const char *in_symb_name( int i ) {
    return InSymbNames[ i ] ;
}
/* Reverse lookup: return the index whose OutSymbNames entry equals src,
   or FAIL when no name matches. */
int out_symb_value( const char *src ) {
    int idx ;
    /* -- linear search over the fixed-size name table -- */
    for ( idx = 0 ; idx < MAXOUTSYM ; idx++ ) {
        if ( !strcmp( OutSymbNames[ idx ] , src ) ) {
            return idx ;
        }
    }
    return FAIL ;
}
/*-------------------------------------------
util.c (get_input_line)
called by initial.c (restore_build_state)
--------------------------------------------*/
/* Read one line from fp into buf (at most MAXSTRLEN bytes including the
   terminator) and strip any trailing CR/LF characters.
   Returns FALSE at end-of-file or read error, TRUE otherwise. */
int get_input_line( char *buf ,
FILE *fp ) {
    size_t len ;
    BLANK_STRING(buf) ;
    if ( fgets( buf , MAXSTRLEN , fp ) == NULL ) {
        return FALSE ;
    }
    /* trim the line-ending characters in place */
    len = strlen( buf ) ;
    while ( len > 0 && ( buf[ len - 1 ] == '\n' || buf[ len - 1 ] == '\r' ) ) {
        buf[ --len ] = SENTINEL ;
    }
    return TRUE ;
}
/*-------------------------------------------------------
pagc_tools.c (parse_file_name)
called by open_aux_file, main.c (main)
copies the file name to the output_tail and the path to
the output_head
--------------------------------------------------------*/
void parse_file_name( const char *input_path_name ,
char global_path_separator ,
char *output_tail ,
char *output_head ) {
/* Split input_path_name into directory part (-> output_head) and file
   name part (-> output_tail). Either output pointer may be NULL to
   skip that part. Neither output buffer length is checked here. */
const char *end_ptr , *src ;
char *dest ;
/* -- find the file name part first -- */
/* -- move to end of the pathname -- */
for ( end_ptr = input_path_name ; *end_ptr != SENTINEL ; end_ptr++ ) ;
/* -- find the last directory delimitor -- */
while ( ( end_ptr > input_path_name ) && NOT_PATH_DELIMITOR(*end_ptr) ) {
end_ptr -- ;
}
/* ---------------------------------------------------------------
either end_ptr has the last delimitor or it is at string start.
If the first case, we need to increment to get the filename and
need to copy everything up to and including for the path.
-----------------------------------------------------------------*/
/* -- copy from beg to endptr to output path -- */
dest = output_head ;
src = input_path_name ;
/* if end_ptr points to a path delimitor, copy everything up but not
including it into the output_head (if output_head isn't NULL) */
if ( IS_PATH_DELIMITOR( *end_ptr ) ) {
while ( src < end_ptr ) {
if ( dest != NULL ) {
*dest++ = *src ;
}
src++ ;
}
/* skip the delimitor itself so the tail starts after it */
src++ ;
}
/* -- copy from endptr to end to output file name -- */
if ( dest != NULL ) {
BLANK_STRING(dest) ;
}
/* copy everything after the delimitor up to the sentinel
into the output_tail */
if ( ( dest = output_tail ) != NULL ) {
while ( TRUE ) {
if ( ( *dest++ = *src++ ) == SENTINEL ) {
break ;
}
}
}
}
/*--------------------------------------------------
pagc_tools.c (combine_path_file)
called by util.c (open_aux_file)
calls char_append
--------------------------------------------------*/
/* Join a directory part (input_head) and a file name (input_tail) into
   output_path_name, separated by global_path_separator. A NULL or empty
   head yields just the tail. Appends to the current contents of
   output_path_name, bounded by PATHNAME_LEN. */
void combine_path_file( char global_path_separator ,
char *input_head ,
char *input_tail ,
char *output_path_name ) {
    char sep_str[ 2 ] ;
    sep_str[ 0 ] = global_path_separator ;
    sep_str[ 1 ] = SENTINEL ;
    if ( ( input_head == NULL ) ||
         ( input_head[ 0 ] == SENTINEL ) ) {
        /* no directory part : the result is just the file name */
        append_string_to_max( output_path_name ,
                              input_tail ,
                              PATHNAME_LEN ) ;
        return ;
    }
    append_string_to_max( output_path_name ,
                          input_head ,
                          PATHNAME_LEN ) ;
    char_append( sep_str ,
                 output_path_name ,
                 input_tail ,
                 PATHNAME_LEN ) ;
}
/* Copy s into d, folding lower-case letters to upper case.
   d must be large enough to hold s -- no bound is enforced here. */
void upper_case( char *d ,
const char *s ) {
    while ( *s != SENTINEL ) {
        char c = *s++ ;
        *d++ = ( islower( c ) ? toupper( c ) : c ) ;
    }
    BLANK_STRING(d) ;
}
/* 2010-10-22 : new routine */
/* Case-insensitive strcmp built on upper_case copies (2010-10-22).
   NOTE(review): both inputs are copied into MAXSTRLEN stack buffers
   without a length check -- assumes callers pass short strings. */
int upper_case_compare( char *str1 , char* str2 ) {
    char folded_a[ MAXSTRLEN ] ;
    char folded_b[ MAXSTRLEN ] ;
    upper_case( folded_a , str1 ) ;
    upper_case( folded_b , str2 ) ;
    return strcmp( folded_a , folded_b ) ;
}
/* 2010-10-30 : moved here for use in ds */
/* Reverse bytes_to_reverse bytes in place at location_to_reverse
   (endianness-swap helper, 2010-10-30). */
void fast_reverse_endian( char *location_to_reverse , int bytes_to_reverse ) {
    char *lo = location_to_reverse ;
    char *hi = location_to_reverse + bytes_to_reverse - 1 ;
    while ( lo < hi ) {
        char tmp = *lo ;
        *lo++ = *hi ;
        *hi-- = tmp ;
    }
}
/*=================================================================
pagc_tools.c (append_string_to_max ) = format.c (format_ncat)
=================================================================*/
void append_string_to_max( char *dest_buf_start ,
char *src_str_start ,
int buf_size ) {
/* Bounded strcat: append src_str_start to the string already in
   dest_buf_start, where buf_size is the total capacity of the buffer
   including the terminator. On overflow the API build silently
   truncates; the standalone build prints a message and exits. */
char a ;
char *d_ptr , *s_ptr , *buf_end ;
/* -- move to end of current contents of buffer -- */
d_ptr = dest_buf_start ;
while ( ( a = *d_ptr ) != SENTINEL ) {
d_ptr ++ ;
}
buf_end = dest_buf_start + buf_size - 1 ;
if ( d_ptr >= buf_end ) {
#ifndef BUILD_API
#ifndef NO_STDERR_OUTPUT
fprintf( stderr , "format_strncat: fatal buffer overflow of %s\n" , dest_buf_start ) ;
fprintf( stderr , "No room for %s\n" , src_str_start ) ;
#endif
exit( 1 ) ;
#else
/* TODO if postgresql we can throw and error or notice
but for now we will just truncate the string */
*d_ptr = SENTINEL ;
return;
#endif
}
/* copy until src ends or the buffer is full, then re-terminate */
s_ptr = src_str_start ;
while ( ( ( a = *s_ptr++ ) != SENTINEL ) &&
( d_ptr != buf_end ) ) {
*d_ptr++ = a ;
}
*d_ptr = SENTINEL ;
}
/* ========================================================
pagc_tools.c (establish_directory)
Determine the current working directory and path_separator
========================================================= */
int establish_directory( char * c_w_d ,
char * p_s ) {
/* Fill c_w_d with the current working directory (buffer must hold
   PATHNAME_LEN bytes) and *p_s with the path separator in use.
   Returns FALSE on getcwd failure or an unrecognized drive-letter
   path layout, TRUE otherwise. */
char *c_w_d_ptr ;
c_w_d_ptr = getcwd( c_w_d ,
( PATHNAME_LEN - 1 ) ) ;
if ( c_w_d_ptr == NULL ) {
return FALSE ;
}
/* default to the POSIX separator */
*p_s = FORE_SLASH ;
#ifdef MSYS_POSIX
/* ..... transform cwd's non-POSIX directory separators to conform ..... */
conform_directory_separator( c_w_d ) ;
#endif
/* a cwd beginning with a letter is treated as a Windows-style
   "X:<sep>" drive path; the separator after the colon is adopted */
if ( isalpha( c_w_d[ 0 ] ) ) {
/* ..... drive letter, colon, dir_sep ..... */
if ( IS_COLON( c_w_d[ 1 ] ) ) {
*p_s = c_w_d[ 2 ] ;
if ( ( *p_s != FORE_SLASH ) &&
( *p_s != BACK_SLASH ) ) {
return FALSE ;
}
} else {
/* NOTE(review): a relative cwd starting with a letter but no
   drive colon is rejected here -- confirm this is intended */
return FALSE ;
}
}
return TRUE ;
}
#ifdef MSYS_POSIX
/*------------------------------------------------------------------
pagc_tools.c (conform_directory_separator)
-- called only if compiled with MSYS_POSIX defined .....
-- transform non-POSIX directory separators to conform with POSIX --
called by init_global
string.h (strlen)
-------------------------------------------------------------------*/
/* Replace every backslash in path_name with a forward slash so MSYS
   paths conform to POSIX (compiled only when MSYS_POSIX is defined). */
static void conform_directory_separator( char * path_name ) {
    char *p ;
    for ( p = path_name ; *p != '\0' ; p++ ) {
        if ( *p == BACK_SLASH ) {
            *p = FORE_SLASH ;
        }
    }
}
/* ..... END OF IFDEF MSYS_POSIX ..... */
#endif

View file

@ -0,0 +1,40 @@
/*=================================================================
-- pagc_tools.h --
Certain common tools used both by the pagc library and its clients
Prototype 20H10 (This file was written by Walter Sinclair).
This file is part of PAGC.
Copyright (c) 2010 Walter Bruce Sinclair
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/* For pagc-0.4.0 : last revised 2010-11-25 */
#ifndef PGC_T_H
#define PGC_T_H
/* Latin-1 -> ASCII folding; appends '\n' to the buffer (pagc_tools.c) */
void convert_latin_one ( char * ) ;
/* bounded append with divider string: (div, dest, src, max_wid) */
void char_append( const char * , char * , const char * , int ) ;
/* bounded strcat: (dest, src, total buffer size) */
void append_string_to_max( char * , char * , int ) ;
/* symbol-code <-> printable-name lookups */
const char *out_symb_name( int ) ;
const char *in_symb_name( int ) ;
int out_symb_value( const char * ) ;
/* read one line, CR/LF stripped; FALSE at EOF */
int get_input_line( char * , FILE * ) ;
/* join dir + separator + file into a path buffer */
void combine_path_file( char , char * , char * , char * ) ;
int upper_case_compare( char * , char* ) ; /* 2010-10-22 */
void fast_reverse_endian( char * , int ) ; /* 2010-10-30 */
void upper_case( char * , const char * ) ;
/* split a path into (tail = file name, head = directory) parts */
void parse_file_name( const char * , char , char * , char * ) ;
int establish_directory( char * , char * ) ;
#endif

View file

@ -0,0 +1,524 @@
/*
* parseaddress.c - utility to crack a string into address, city st zip
*
* Copyright 2006 Stephen Woodbridge
*
* This code is released under an MIT-X style license.
*
* Stephen Woodbridge
* woodbri@swoodbridge.com
* woodbr@imaptools.com
*
* $Id: parseaddress.c,v 2.6 2010/07/25 00:47:24 woodbri Exp $
*
* TODO:
* * add recognition of country before or after postalcode
* * have clean trailing punctuation return a code if a comma was removed
* if comma and no state then there is probably no city
*
*/
#include <string.h>
#include <ctype.h>
#include <stdio.h>
#include <pcre.h>
#include "parseaddress-api.h"
#undef DEBUG
//#define DEBUG 1
#ifdef DEBUG
#define DBG(format, arg...) \
elog(NOTICE, format , ## arg)
#else
#define DBG(format, arg...) do { ; } while (0)
#endif
const char *get_state_regex(char *st);
const char *parseaddress_cvsid();
char *clean_leading_punct(char *s);
const char *get_state_regex(char *st)
{
/* Return the generated city-name regex for the two-letter state code st,
   or NULL when st is missing, not two characters, or unknown. */
int i;
int cmp;
/* generated tables: states[] (state codes) and stcities[] (regexes) */
#include "parseaddress-stcities.h"
if (!st || strlen(st) != 2) return NULL;
for (i=0; i<NUM_STATES; i++) {
cmp = strcmp(states[i], st);
if (cmp == 0)
return stcities[i];
else if (cmp > 0)
/* early out: relies on states[] being sorted ascending --
   the generator emits sorted keys; confirm if it changes */
return NULL;
}
return NULL;
}
/*
 * Strip trailing punctuation and whitespace from s in place.
 * Returns 1 if a comma was among the stripped characters, else 0.
 *
 * Fixed: the original scanned backwards with no lower bound, so an
 * empty string, or a string made entirely of punctuation/spaces, read
 * and wrote before the start of the buffer (undefined behavior).
 * Also cast to unsigned char before the <ctype.h> classifiers, which
 * have undefined behavior for negative char values.
 */
int clean_trailing_punct(char *s)
{
    int ret = 0;
    int i = (int) strlen(s) - 1;

    while (i >= 0 && (ispunct((unsigned char) s[i]) || isspace((unsigned char) s[i]))) {
        if (s[i] == ',') ret = 1;
        s[i--] = '\0';
    }
    return ret;
}
/*
 * Return a pointer to the first character of s that is neither
 * punctuation nor whitespace (possibly the terminating NUL).
 * The string is not modified; the caller still owns s.
 *
 * Improvements: stop at the NUL directly instead of re-evaluating
 * strlen() in the loop condition, and cast to unsigned char before
 * the <ctype.h> classifiers (negative char arguments are undefined).
 */
char *clean_leading_punct(char *s)
{
    while (*s != '\0' && (ispunct((unsigned char) *s) || isspace((unsigned char) *s)))
        s++;
    return s;
}
/*
 * Upper-case s in place.
 *
 * Improvements: single pass without calling strlen() in the loop
 * condition (the original re-scanned the string every iteration,
 * making it quadratic), and unsigned-char cast before toupper()
 * to avoid undefined behavior on negative char values.
 */
void strtoupper(char *s)
{
    for (; *s != '\0'; s++)
        *s = (char) toupper((unsigned char) *s);
}
int match(char *pattern, char *s, int *ovect, int options)
{
    /* Compile `pattern` (with the given PCRE options) and run it once
     * against `s`.  `ovect` must hold OVECCOUNT ints (pcre_exec's
     * output vector of capture offsets).
     *
     * Returns:
     *   > 0  number of captured substrings (including the whole match);
     *        clamped to OVECCOUNT/3 when the vector was too small
     *   -99  pattern failed to compile
     *   < 0  pcre_exec error (e.g. PCRE_ERROR_NOMATCH == -1)
     */
    const char *error;
    int erroffset;
    pcre *re;
    int rc;
    re = pcre_compile(pattern, options, &error, &erroffset, NULL);
    if (!re) return -99;
    rc = pcre_exec(re, NULL, s, strlen(s), 0, 0, ovect, OVECCOUNT);
    /* pcre_compile allocates via pcre_malloc; release with pcre_free,
     * not plain free() -- the two allocators need not match. */
    pcre_free(re);
    if (rc < 0) return rc;
    else if (rc == 0) rc = OVECCOUNT/3;  /* pcre reports 0 when ovect was too small */
    return rc;
}
/* RET_ERROR: set *reterr and bail out with NULL when `a` is false.
 * NOTE(review): defined here but not used in the visible code below. */
#define RET_ERROR(a,e) if (!a) {*reterr = e; return NULL;}
/*
 * parseaddress -- split a free-form address string into ADDRESS parts.
 *
 * stH    - state/province name hash built by load_state_hash()
 *          (full name or abbreviation -> two-letter code)
 * s      - input string; MODIFIED in place: dots/extra blanks are
 *          removed and components are truncated off the right end as
 *          they are recognized
 * reterr - out parameter: error code when NULL is returned
 *          (1002 = state abbreviation lookup failed)
 *
 * Returns a palloc0'd ADDRESS, or NULL with *reterr set.  Parsing is
 * right-to-left: postal code first, then state/province, then city,
 * leaving number/street (or an "@"-separated intersection) up front.
 */
ADDRESS *parseaddress(HHash *stH, char *s, int *reterr)
{
#include "parseaddress-regex.h"   /* nreg and t_regx[]: fallback city patterns */
    int ovect[OVECCOUNT];
    char c;
    char *stregx;
    char *caregx;
    char *state = NULL;
    char *regx;
    int mi;                       /* index of which matching strategy fired (debug aid) */
    int i, j;
    int rc;
    int comma = 0;                /* set when a trailing comma was stripped */
    ADDRESS *ret;
#ifdef USE_HSEARCH
    ENTRY e, *ep;
    int err;
#else
    char *key;
    char *val;
#endif
    ret = (ADDRESS *) palloc0(sizeof(ADDRESS));
    /* check if we were passed a lat lon */
    rc = match("^\\s*([-+]?\\d+(\\.\\d*)?)[\\,\\s]+([-+]?\\d+(\\.\\d*)?)\\s*$", s, ovect, 0);
    if (rc >= 3) {
        /* capture 1 = first number, capture 3 = second number; terminate
         * the first so strtod stops there */
        *(s+ovect[3]) = '\0';
        ret->lat = strtod(s+ovect[2], NULL);
        ret->lon = strtod(s+ovect[6], NULL);
        return ret;
    }
    /* clean the string of multiple white spaces and . */
    for (i=0, j=0; i<strlen(s); i++) {
        c = s[i];
        if (c == '.') c = s[i] = ' ';                      /* dots become spaces */
        if (j == 0 && isspace(c)) continue;                /* drop leading whitespace */
        if (i && isspace(c) && isspace(s[i-1])) continue;  /* collapse whitespace runs */
        s[j] = s[i];
        j++;
    }
    /* NOTE(review): if s is empty (j == 0) this reads s[-1] -- looks
     * like callers always pass non-empty input; TODO confirm */
    if (isspace(s[j-1])) j--;
    s[j] = '\0';
    /* clean trailing punctuation */
    comma |= clean_trailing_punct(s);
    /* assume country code is US */
    ret->cc = (char *) palloc0(3 * sizeof(char));
    strcpy(ret->cc, "US");
    /* get US zipcode components */
    rc = match("\\b(\\d{5})[-\\s]?(\\d{4})?$", s, ovect, 0);
    if (rc >= 2) {
        /* ovect[2..3] = ZIP5 capture, ovect[4..5] = optional +4 capture */
        ret->zip = (char *) palloc0((ovect[3]-ovect[2]+1) * sizeof(char));
        strncpy(ret->zip, s+ovect[2], ovect[3]-ovect[2]);
        if (rc >= 3) {
            ret->zipplus = (char *) palloc0((ovect[5]-ovect[4]+1) * sizeof(char));
            strncpy(ret->zipplus, s+ovect[4], ovect[5]-ovect[4]);
        }
        /* truncate the postalcode off the string */
        *(s+ovect[0]) = '\0';
        comma = 0;
    }
    /* get canada zipcode components */
    else {
        rc = match("\\b([a-z]\\d[a-z]\\s?\\d[a-z]\\d)$", s, ovect, PCRE_CASELESS);
        if (rc >= 1) {
            ret->zip = (char *) palloc0((ovect[1]-ovect[0]+1) * sizeof(char));
            strncpy(ret->zip, s+ovect[0], ovect[1]-ovect[0]);
            strcpy(ret->cc, "CA");
            /* truncate the postalcode off the string */
            *(s+ovect[0]) = '\0';
            comma = 0;
        }
    }
    /* clean trailing punctuation */
    comma |= clean_trailing_punct(s);
    /* get state components: caregx recognizes Canadian province codes,
     * stregx is a generated alternation of US/CA state and province
     * names and abbreviations anchored at end of string */
    caregx = "^(?-xism:(?i:(?=[abmnopqsy])(?:n[ltsu]|[am]b|[bq]c|on|pe|sk|yt)))$";
    stregx = "\\b(?-xism:(?i:(?=[abcdfghiklmnopqrstuvwy])(?:a(?:l(?:a(?:bam|sk)a|berta)?|mer(?:ican)?\\ samoa|r(?:k(?:ansas)?|izona)?|[kszb])|s(?:a(?:moa|skatchewan)|outh\\ (?:carolin|dakot)a|\\ (?:carolin|dakot)a|[cdk])|c(?:a(?:lif(?:ornia)?)?|o(?:nn(?:ecticut)?|lorado)?|t)|d(?:e(?:la(?:ware)?)?|istrict\\ of\\ columbia|c)|f(?:l(?:(?:orid)?a)?|ederal\\ states\\ of\\ micronesia|m)|m(?:i(?:c(?:h(?:igan)?|ronesia)|nn(?:esota)?|ss(?:(?:issipp|our)i)?)?|a(?:r(?:shall(?:\\ is(?:l(?:and)?)?)?|yland)|ss(?:achusetts)?|ine|nitoba)?|o(?:nt(?:ana)?)?|[ehdnstpb])|g(?:u(?:am)?|(?:eorgi)?a)|h(?:awai)?i|i(?:d(?:aho)?|l(?:l(?:inois)?)?|n(?:d(?:iana)?)?|(?:ow)?a)|k(?:(?:ansa)?s|(?:entuck)?y)|l(?:a(?:bordor)?|ouisiana)|n(?:e(?:w(?:\\ (?:foundland(?:\\ and\\ labordor)?|hampshire|jersey|mexico|(?:yor|brunswic)k)|foundland)|(?:brask|vad)a)?|o(?:rth(?:\\ (?:mariana(?:\\ is(?:l(?:and)?)?)?|(?:carolin|dakot)a)|west\\ territor(?:ies|y))|va\\ scotia)|\\ (?:carolin|dakot)a|u(?:navut)?|[vhjmycdblsf]|w?t)|o(?:h(?:io)?|k(?:lahoma)?|r(?:egon)?|n(?:t(?:ario)?)?)|p(?:a(?:lau)?|e(?:nn(?:sylvania)?|i)?|r(?:ince\\ edward\\ island)?|w|uerto\\ rico)|r(?:hode\\ island|i)|t(?:e(?:nn(?:essee)?|xas)|[nx])|ut(?:ah)?|v(?:i(?:rgin(?:\\ islands|ia))?|(?:ermon)?t|a)|w(?:a(?:sh(?:ington)?)?|i(?:sc(?:onsin)?)?|y(?:oming)?|(?:est)?\\ virginia|v)|b(?:ritish\\ columbia|c)|q(?:uebe)?c|y(?:ukon|t))))$";
    rc = match(stregx, s, ovect, PCRE_CASELESS);
    if (rc > 0) {
        state = (char *) palloc0((ovect[1]-ovect[0]+1) * sizeof(char));
        strncpy(state, s+ovect[0], ovect[1]-ovect[0]);
        /* truncate the state/province off the string */
        *(s+ovect[0]) = '\0';
        /* lookup state in hash and get abbreviation */
        strtoupper(state);
#ifdef USE_HSEARCH
        e.key = state;
        err = hsearch_r(e, FIND, &ep, stH);
        if (err) {
            ret->st = (char *) palloc0(3 * sizeof(char));
            strcpy(ret->st, ep->data);
        }
#else
        key = state;
        val = (char *)hash_get(stH, key);
        if (val) {
            ret->st = pstrdup(val);
        }
#endif
        /* NOTE(review): this `else` pairs with `if (err)` under
         * USE_HSEARCH and with `if (val)` otherwise -- fragile across
         * the #ifdef, but either way a failed lookup reports 1002. */
        else {
            *reterr = 1002;
            return NULL;
        }
        /* check if it is a Canadian Province */
        rc = match(caregx, ret->st, ovect, PCRE_CASELESS);
        if (rc > 0) {
            strcpy(ret->cc, "CA");
            // if (ret->cc) printf(" CC: %s\n", ret->cc);
        }
        comma = 0;
    }
    /* clean trailing punctuation */
    comma |= clean_trailing_punct(s);
    /* get city components */
    /*
     * This part is ambiguous without punctuation after the street
     * because we can have any of the following forms:
     *
     * num predir? prefixtype? street+ suffixtype? suffdir?,
     * ((north|south|east|west)? city)? state? zip?
     *
     * and technically num can be of the form:
     *
     * pn1? n1 pn2? n2? sn2?
     * where
     * pn1 is a prefix character
     * n1 is a number
     * pn2 is a prefix character
     * n2 is a number
     * sn2 is a suffix character
     *
     * and a trailing letter might be [NSEW] which predir can also be
     *
     * So it is ambiguous whether a directional between street and city
     * belongs to which component. Further, since the street and the city
     * are both just a string of arbitrary words, it is difficult if not
     * impossible to determine if a given word belongs to one side or the
     * other.
     *
     * So for the best results users should include a comma after the street.
     *
     * The approach will be as follows:
     * 1. look for a comma and assume this is the separator
     * 2. if we can find a state specific regex try that
     * 3. else loop through an array of possible regex patterns
     * 4. fail and assume there is no city
     */
    /* look for a comma */
    DBG("parse_address: s=%s", s);
    mi = 0;
    regx = "(?:,\\s*)([^,]+)$";
    rc = match((char *)regx, s, ovect, 0);
    if (rc <= 0) {
        /* look for state specific regex */
        /* NOTE(review): this stanza duplicates the one below but skips
         * the ret->st null/empty check (get_state_regex tolerates NULL)
         * -- looks like a copy-paste remnant; TODO confirm intent */
        mi++;
        regx = (char *) get_state_regex(ret->st);
        if (regx)
            rc = match((char *)regx, s, ovect, 0);
    }
    DBG("Checked for comma: %d", rc);
    if (rc <= 0 && ret->st && strlen(ret->st)) {
        /* look for state specific regex */
        mi++;
        regx = (char *) get_state_regex(ret->st);
        if (regx)
            rc = match((char *)regx, s, ovect, 0);
    }
    DBG("Checked for state-city: %d", rc);
    if (rc <= 0) {
        /* run through the regx's and see if we get a match */
        for (i=0; i<nreg; i++) {
            mi++;
            rc = match((char *)t_regx[i], s, ovect, 0);
            DBG("    rc=%d, i=%d", rc, i);
            if (rc > 0) break;
        }
        DBG("rc=%d, i=%d", rc, i);
    }
    DBG("Checked regexs: %d, %d, %d", rc, ovect[2], ovect[3]);
    if (rc > 0 && ovect[3]>ovect[2]) {
        /* we have a match so process it: capture 1 is the city */
        ret->city = (char *) palloc0((ovect[3]-ovect[2]+1) * sizeof(char));
        strncpy(ret->city, s+ovect[2], ovect[3]-ovect[2]);
        /* truncate the city off the string */
        *(s+ovect[2]) = '\0';
    }
    /* clean trailing punctuation */
    clean_trailing_punct(s);
    /* check for [@] that would indicate a intersection */
    /* -- 2010-12-11 : per Nancy R. we are using @ to indicate an intersection
       ampersand is used in both street names and landmarks so it is highly
       ambiguous -- */
    rc = match("^([^@]+)\\s*[@]\\s*([^@]+)$", s, ovect, 0);
    if (rc > 0) {
        /* split into street (before @) and street2 (after @) */
        s[ovect[3]] = '\0';
        clean_trailing_punct(s+ovect[2]);
        ret->street = pstrdup(s+ovect[2]);
        s[ovect[5]] = '\0';
        clean_leading_punct(s+ovect[4]);
        ret->street2 = pstrdup(s+ovect[4]);
    }
    else {
        /* and the remainder must be the address components */
        ret->address1 = pstrdup(clean_leading_punct(s));
        /* split the number off the street if it exists
         * (allows [NSEW] prefixes/suffixes and hyphenated numbers) */
        rc = match("^((?i)[nsew]?\\d+[-nsew]*\\d*[nsew]?\\b)", s, ovect, 0);
        if (rc > 0) {
            ret->num = (char *) palloc0((ovect[1]-ovect[0]+1) * sizeof(char));
            strncpy(ret->num, s, ovect[1]-ovect[0]);
            ret->street = pstrdup(clean_leading_punct(s+ovect[1]));
        }
    }
    return ret;
}
/*
 * load_state_hash -- populate stH with state/province name lookups.
 *
 * Every full name (and common variant) maps to its two-letter code,
 * and each two-letter code is also inserted mapping to itself, so a
 * lookup succeeds whether the parser found "TEXAS" or "TX".
 *
 * Returns 0 on success, 1001 if the hash could not be created
 * (or stH is NULL), 1003 if an insert fails (hsearch build only).
 */
int load_state_hash(HHash *stH)
{
    /* NULL,NULL terminated table of {name, two-letter code} pairs;
     * keys are uppercase because parseaddress() upcases before lookup */
    char * words[][2] = {
        {"ALABAMA" , "AL"},
        {"ALASKA" , "AK"},
        {"AMERICAN SAMOA" , "AS"},
        {"AMER SAMOA" , "AS"},
        {"SAMOA" , "AS"},
        {"ARIZONA" , "AZ"},
        {"ARKANSAS" , "AR"},
        {"ARK" , "AR"},
        {"CALIFORNIA" , "CA"},
        {"CALIF" , "CA"},
        {"COLORADO" , "CO"},
        {"CONNECTICUT" , "CT"},
        {"CONN" , "CT"},
        {"DELAWARE" , "DE"},
        {"DELA" , "DE"},
        {"DISTRICT OF COLUMBIA" , "DC"},
        {"FEDERAL STATES OF MICRONESIA" , "FM"},
        {"MICRONESIA" , "FM"},
        {"FLORIDA" , "FL"},
        {"FLA" , "FL"},
        {"GEORGIA" , "GA"},
        {"GUAM" , "GU"},
        {"HAWAII" , "HI"},
        {"IDAHO" , "ID"},
        {"ILLINOIS" , "IL"},
        {"ILL" , "IL"},
        {"INDIANA" , "IN"},
        {"IND" , "IN"},
        {"IOWA" , "IA"},
        {"KANSAS" , "KS"},
        {"KENTUCKY" , "KY"},
        {"LOUISIANA" , "LA"},
        {"MAINE" , "ME"},
        {"MARSHALL ISLAND" , "MH"},
        {"MARSHALL ISL" , "MH"},
        {"MARSHALL IS" , "MH"},
        {"MARSHALL" , "MH"},
        {"MARYLAND" , "MD"},
        {"MASSACHUSETTS" , "MA"},
        {"MASS" , "MA"},
        {"MICHIGAN" , "MI"},
        {"MICH" , "MI"},
        {"MINNESOTA" , "MN"},
        {"MINN" , "MN"},
        {"MISSISSIPPI" , "MS"},
        {"MISS" , "MS"},
        {"MISSOURI" , "MO"},
        {"MONTANA" , "MT"},
        {"MONT" , "MT"},
        {"NEBRASKA" , "NE"},
        {"NEVADA" , "NV"},
        {"NEW HAMPSHIRE" , "NH"},
        {"NEW JERSEY" , "NJ"},
        {"NEW MEXICO" , "NM"},
        {"NEW YORK" , "NY"},
        {"NORTH CAROLINA" , "NC"},
        {"N CAROLINA" , "NC"},
        {"NORTH DAKOTA" , "ND"},
        {"N DAKOTA" , "ND"},
        {"NORTH MARIANA ISL" , "MP"},
        {"NORTH MARIANA IS" , "MP"},
        {"NORTH MARIANA" , "MP"},
        {"NORTH MARIANA ISLAND" , "MP"},
        {"OHIO" , "OH"},
        {"OKLAHOMA" , "OK"},
        {"OREGON" , "OR"},
        {"PALAU" , "PW"},
        {"PENNSYLVANIA" , "PA"},
        {"PENN" , "PA"},
        {"PUERTO RICO" , "PR"},
        {"RHODE ISLAND" , "RI"},
        {"SOUTH CAROLINA" , "SC"},
        {"S CAROLINA" , "SC"},
        {"SOUTH DAKOTA" , "SD"},
        {"S DAKOTA" , "SD"},
        {"TENNESSEE" , "TN"},
        {"TENN" , "TN"},
        {"TEXAS" , "TX"},
        {"UTAH" , "UT"},
        {"VERMONT" , "VT"},
        {"VIRGIN ISLANDS" , "VI"},
        {"VIRGINIA" , "VA"},
        {"WASHINGTON" , "WA"},
        {"WASH" , "WA"},
        {"WEST VIRGINIA" , "WV"},
        {"W VIRGINIA" , "WV"},
        {"WISCONSIN" , "WI"},
        {"WISC" , "WI"},
        {"WYOMING" , "WY"},
        {"ALBERTA" , "AB"},
        {"BRITISH COLUMBIA" , "BC"},
        {"MANITOBA" , "MB"},
        {"NEW BRUNSWICK" , "NB"},
        {"NEW FOUNDLAND AND LABORDOR" , "NL"},
        {"NEW FOUNDLAND" , "NL"},
        {"NEWFOUNDLAND" , "NL"},
        {"LABORDOR" , "NL"},
        {"NORTHWEST TERRITORIES" , "NT"},
        {"NORTHWEST TERRITORY" , "NT"},
        {"NWT" , "NT"},
        {"NOVA SCOTIA" , "NS"},
        {"NUNAVUT" , "NU"},
        {"ONTARIO" , "ON"},
        {"ONT" , "ON"},
        {"PRINCE EDWARD ISLAND" , "PE"},
        {"PEI" , "PE"},
        {"QUEBEC" , "QC"},
        {"SASKATCHEWAN" , "SK"},
        {"YUKON" , "YT"},
        {"NF" , "NL"},   /* old Newfoundland code maps to current NL */
        {NULL, NULL}
    };
#ifdef USE_HSEARCH
    ENTRY e, *ep;
    int err;
#else
    char *key;
    char *val;
#endif
    int i, cnt;
    /* count the entries above */
    cnt = 0;
    while (words[cnt][0]) cnt++;
    DBG("Words cnt=%d", cnt);
#ifdef USE_HSEARCH
    /* cnt*2 slots: each row inserts both the name and its abbreviation */
    if (! hcreate_r(cnt*2, stH)) return 1001;
    for (i=0; i<cnt; i++) {
        /* name -> abbreviation */
        e.key = words[i][0];
        e.data = words[i][1];
        err = hsearch_r(e, ENTER, &ep, stH);
        /* there should be no failures */
        if (!err) return 1003;
        /* abbreviation -> itself, so "TX" also resolves */
        e.key = words[i][1];
        e.data = words[i][1];
        err = hsearch_r(e, ENTER, &ep, stH);
        /* there should be no failures */
        if (!err) return 1003;
    }
#else
    if (! stH ) return 1001;
    for (i=0; i<cnt; i++) {
        //DBG("load_hash i=%d", i);
        /* name -> abbreviation */
        key = words[i][0];
        val = words[i][1];
        hash_set(stH, key, (void *)val);
        /* abbreviation -> itself, so "TX" also resolves */
        key = words[i][1];
        val = words[i][1];
        hash_set(stH, key, (void *)val);
    }
#endif
    return 0;
}
/*
 * free_state_hash -- release the state-name hash created by
 * load_state_hash().  Safe to call with NULL.
 * (Removed the stale commented-out "//#if 0" debug toggles.)
 */
void free_state_hash(HHash *stH)
{
#ifdef USE_HSEARCH
    if (stH) hdestroy_r(stH);
#else
    if (stH) hash_free(stH);
#endif
}

View file

@ -0,0 +1,75 @@
/*
parseaddress-api.h - utility to crack a string into address, city st zip
Copyright 2006-2010 Stephen Woodbridge.
woodbri@swoodbridge.com
woodbr@imaptools.com
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
$Id: $
*/
#ifndef PARSEADDRESS_API_H
#define PARSEADDRESS_API_H
#include "postgres.h"
#define OVECCOUNT 30
#ifdef USE_HSEARCH
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <search.h>
typedef struct hsearch_data HHash;
#else
#include "hash.h"
typedef hash_t HHash;
#endif
typedef struct address_struct {
char *num;
char *street;
char *street2;
char *address1;
char *city;
char *st;
char *zip;
char *zipplus;
char *cc;
double lat;
double lon;
} ADDRESS;
int clean_trailing_punct(char *s);
void strtoupper(char *s);
int match(char *pattern, char *s, int *ovect, int options);
ADDRESS *parseaddress(HHash *stH, char *s, int *err);
int load_state_hash(HHash *stH);
void free_state_hash(HHash *stH);
void free_address(ADDRESS *a);
/*
* ERRORS
*
* 1000 general memory allocation error
* 1001 failed to create hash table structure
* 1002 failed to find state abbreviation
* 1003 hash table is full, failled to add new entry
*
*/
#endif

View file

@ -0,0 +1,12 @@
const int nreg = 9;
static const char *t_regx[9] = {
"(?:\\b(?-xism:(?i:(?:C(?:R(?:[KT]|E(?:S(?:(?:C?EN)?T)?|CENT|EK)|S(?:(?:C?N)?T|E(?:NT)?|SI?NG)|OSS(?:ROAD|ING)|CLE?)?|O(?:R(?:NERS?|S)?|UR(?:TS?|SE)|MMON|VES?)|A(?:USE?WAY|NYO?N|MP|PE)|IR(?:C(?:L(?:ES?)?)?|S)?|EN(?:T(?:ERS?|RE?)?)?|L(?:IFFS?|FS?|U?B)|N(?:TE?R|YN)|T(?:RS?|S)?|M[NP]|URVE?|PE?|SWY|VS?|YN|K)|S(?:T(?:[NS]|R(?:[MT]|A(?:V(?:E(?:N(?:UE)?)?|N)?)?|E(?:ETS?|AM|ME)|VN(?:UE)?)?|A(?:T(?:IO)?N)?)?|H(?:O(?:A(?:LS?|RS?)|RES?)|LS?|RS?)|P(?:R(?:INGS?|NGS?)|NGS?|URS?|GS?)|Q(?:U(?:ARES?)?|R[ES]?|S)?|(?:UM(?:IT?|MI)|M)T|K(?:YWA|W)Y)|P(?:A(?:RK(?:W(?:AYS?|Y)|S)?|SS(?:AGE)?|THS?)|L(?:A(?:IN(?:E?S)?|CE|ZA)|NS?|ZA?)?|R(?:[KR]|AI?RIE|TS?)?|K(?:W(?:YS?|AY)|Y)?|O(?:INTS?|RTS?)|I(?:KES?|NES?)|NES?|SGE|TS?)|B(?:O(?:UL(?:EVARD|V)?|T(?:TO?M)?)|R(?:A?NCH|I?DGE|OOKS?|KS?|G)?|Y(?:P(?:A(?:S?S)?|S)?|U)|L(?:UF(?:FS?)?|FS?|VD)|E(?:ACH|ND)|AYO[OU]|URGS?|GS?|CH|ND|TM)|M(?:O(?:UNT(?:AINS?|IN)?|TORWAY)|N(?:T(?:AIN|NS?)?|RS?)|E(?:(?:DO)?WS|ADOWS?)|I(?:SS(?:IO)?N|LLS?)|T(?:NS?|IN|WY)?|A(?:NORS?|LL)|DWS?|S?SN|LS?)|T(?:R(?:A(?:C(?:ES?|KS?)|FFICWAY|ILS?|K)|[FW]Y|N?PK|KS?|LS?|CE)?|U(?:N(?:N(?:ELS?|L)|LS?|EL)|RNP(?:IKE|K))|ER(?:R(?:ACE)?)?|HROUGHWAY|PKE?)|F(?:R(?:(?:(?:EE)?WA?|R)?Y|DS?|GS?|KS?|S?T)|OR(?:G(?:ES?)?|ESTS?|DS?|KS?|T)|L(?:ATS?|DS?|TS?|S)|(?:ERR|W)Y|IELDS?|ALLS?|T)|R(?:A(?:D(?:(?:I[AE])?L)?|NCH(?:ES)?|PIDS?|MP)|I(?:V(?:E?R)?|DGES?)|D(?:G[ES]?|S)?|O(?:ADS?|W)|NCHS?|U[EN]|E?ST|PDS?|VR)|H(?:A(?:RB(?:ORS?|R)?|VE?N)|I(?:(?:GH)?WA?Y|LLS?)|OL(?:LOWS?|WS?)|L(?:LW|S)?|EIGHTS?|BRS?|RBOR|WA?Y|GTS|TS?|VN)|V(?:I(?:LL(?:AG(?:ES?)?|(?:IAG)?E|G)?|A(?:DU?CT)?|S(?:TA?)?|EWS?)|L(?:GS?|YS?|LY)?|ALL(?:EYS?|Y)|STA?|DCT|WS?)|G(?:R(?:D(?:NS?|EN)|OV(?:ES?)?|EENS?|NS?|VS?)|A(?:T(?:EWA?|WA)Y|RD(?:ENS?|N))|L(?:ENS?|NS?)|TWA?Y|DNS?)|L(?:A(?:N(?:D(?:ING)?|ES?)|KES?)?|O(?:CKS?|DGE?|OPS?|AF)|I(?:GHTS?|NE)|N(?:DN?G)?|CKS?|DGE?|GTS?|KS?|F)|E(?:X(?:P(?:[WY]|R(?:ESS(?:WAY)?)?)?|T(?:(?:NS)?N|ENSIONS?|S)?)|ST(?:ATES?|S)?)|A(?:V(?:E(?:N(?:UE?)?)?|N(?:UE)?)?|L(?:L(?:E[EY]|Y)|Y)|RC(?:ADE)?|NN?E?X)|D(?:[LM]|R(?:[SV]|IV(?:ES?)?)?|
IV(?:IDE)?|A(?:LE|M)|VD?)|J(?:UNCT(?:IONS?|O?N)|CT(?:ION|NS?|S)?)|I(?:S(?:L(?:ANDS?|NDS?|ES?)|S)?|NLE?T)|O(?:V(?:ERPASS|A?L)|RCH(?:A?RD)?|PAS)|W(?:A(?:L(?:KS?|L)|YS?)|ELLS?|LS?|Y)|K(?:N(?:OL(?:LS?)?|LS?)|EYS?|YS?)|U(?:N(?:(?:DERPAS)?S|IONS?)?|PAS)|X(?:ING|RD)|NE?CK)))\\s(?:(?-xism:(?i:(?:N(?:[EW]|OR(?:TH(?:-?(?:EA|WE)ST)?|D)|-[EW])?|S(?:[EW]|OUTH(?:-?(?:EA|WE)ST)?|-[EW]|UD)?|E(?:A?ST)?|O(?:EST)?|W(?:EST)?)))\\s))((?-xism:(?i:(?:N(?:[EW]|OR(?:TH(?:-?(?:EA|WE)ST)?|D)|-[EW])?|S(?:[EW]|OUTH(?:-?(?:EA|WE)ST)?|-[EW]|UD)?|E(?:A?ST)?|O(?:EST)?|W(?:EST)?)))\\s(\\w+(\\s\\w+)*))$",
"(?:\\b(?-xism:(?i:(?:C(?:R(?:[KT]|E(?:S(?:(?:C?EN)?T)?|CENT|EK)|S(?:(?:C?N)?T|E(?:NT)?|SI?NG)|OSS(?:ROAD|ING)|CLE?)?|O(?:R(?:NERS?|S)?|UR(?:TS?|SE)|MMON|VES?)|A(?:USE?WAY|NYO?N|MP|PE)|IR(?:C(?:L(?:ES?)?)?|S)?|EN(?:T(?:ERS?|RE?)?)?|L(?:IFFS?|FS?|U?B)|N(?:TE?R|YN)|T(?:RS?|S)?|M[NP]|URVE?|PE?|SWY|VS?|YN|K)|S(?:T(?:[NS]|R(?:[MT]|A(?:V(?:E(?:N(?:UE)?)?|N)?)?|E(?:ETS?|AM|ME)|VN(?:UE)?)?|A(?:T(?:IO)?N)?)?|H(?:O(?:A(?:LS?|RS?)|RES?)|LS?|RS?)|P(?:R(?:INGS?|NGS?)|NGS?|URS?|GS?)|Q(?:U(?:ARES?)?|R[ES]?|S)?|(?:UM(?:IT?|MI)|M)T|K(?:YWA|W)Y)|P(?:A(?:RK(?:W(?:AYS?|Y)|S)?|SS(?:AGE)?|THS?)|L(?:A(?:IN(?:E?S)?|CE|ZA)|NS?|ZA?)?|R(?:[KR]|AI?RIE|TS?)?|K(?:W(?:YS?|AY)|Y)?|O(?:INTS?|RTS?)|I(?:KES?|NES?)|NES?|SGE|TS?)|B(?:O(?:UL(?:EVARD|V)?|T(?:TO?M)?)|R(?:A?NCH|I?DGE|OOKS?|KS?|G)?|Y(?:P(?:A(?:S?S)?|S)?|U)|L(?:UF(?:FS?)?|FS?|VD)|E(?:ACH|ND)|AYO[OU]|URGS?|GS?|CH|ND|TM)|M(?:O(?:UNT(?:AINS?|IN)?|TORWAY)|N(?:T(?:AIN|NS?)?|RS?)|E(?:(?:DO)?WS|ADOWS?)|I(?:SS(?:IO)?N|LLS?)|T(?:NS?|IN|WY)?|A(?:NORS?|LL)|DWS?|S?SN|LS?)|T(?:R(?:A(?:C(?:ES?|KS?)|FFICWAY|ILS?|K)|[FW]Y|N?PK|KS?|LS?|CE)?|U(?:N(?:N(?:ELS?|L)|LS?|EL)|RNP(?:IKE|K))|ER(?:R(?:ACE)?)?|HROUGHWAY|PKE?)|F(?:R(?:(?:(?:EE)?WA?|R)?Y|DS?|GS?|KS?|S?T)|OR(?:G(?:ES?)?|ESTS?|DS?|KS?|T)|L(?:ATS?|DS?|TS?|S)|(?:ERR|W)Y|IELDS?|ALLS?|T)|R(?:A(?:D(?:(?:I[AE])?L)?|NCH(?:ES)?|PIDS?|MP)|I(?:V(?:E?R)?|DGES?)|D(?:G[ES]?|S)?|O(?:ADS?|W)|NCHS?|U[EN]|E?ST|PDS?|VR)|H(?:A(?:RB(?:ORS?|R)?|VE?N)|I(?:(?:GH)?WA?Y|LLS?)|OL(?:LOWS?|WS?)|L(?:LW|S)?|EIGHTS?|BRS?|RBOR|WA?Y|GTS|TS?|VN)|V(?:I(?:LL(?:AG(?:ES?)?|(?:IAG)?E|G)?|A(?:DU?CT)?|S(?:TA?)?|EWS?)|L(?:GS?|YS?|LY)?|ALL(?:EYS?|Y)|STA?|DCT|WS?)|G(?:R(?:D(?:NS?|EN)|OV(?:ES?)?|EENS?|NS?|VS?)|A(?:T(?:EWA?|WA)Y|RD(?:ENS?|N))|L(?:ENS?|NS?)|TWA?Y|DNS?)|L(?:A(?:N(?:D(?:ING)?|ES?)|KES?)?|O(?:CKS?|DGE?|OPS?|AF)|I(?:GHTS?|NE)|N(?:DN?G)?|CKS?|DGE?|GTS?|KS?|F)|E(?:X(?:P(?:[WY]|R(?:ESS(?:WAY)?)?)?|T(?:(?:NS)?N|ENSIONS?|S)?)|ST(?:ATES?|S)?)|A(?:V(?:E(?:N(?:UE?)?)?|N(?:UE)?)?|L(?:L(?:E[EY]|Y)|Y)|RC(?:ADE)?|NN?E?X)|D(?:[LM]|R(?:[SV]|IV(?:ES?)?)?|
IV(?:IDE)?|A(?:LE|M)|VD?)|J(?:UNCT(?:IONS?|O?N)|CT(?:ION|NS?|S)?)|I(?:S(?:L(?:ANDS?|NDS?|ES?)|S)?|NLE?T)|O(?:V(?:ERPASS|A?L)|RCH(?:A?RD)?|PAS)|W(?:A(?:L(?:KS?|L)|YS?)|ELLS?|LS?|Y)|K(?:N(?:OL(?:LS?)?|LS?)|EYS?|YS?)|U(?:N(?:(?:DERPAS)?S|IONS?)?|PAS)|X(?:ING|RD)|NE?CK)))\\s(?:(?-xism:(?i:(?:N(?:[EW]|OR(?:TH(?:-?(?:EA|WE)ST)?|D)|-[EW])?|S(?:[EW]|OUTH(?:-?(?:EA|WE)ST)?|-[EW]|UD)?|E(?:A?ST)?|O(?:EST)?|W(?:EST)?)))\\s))((?-xism:(?i:(?:N(?:[EW]|OR(?:TH(?:-?(?:EA|WE)ST)?|D)|-[EW])?|S(?:[EW]|OUTH(?:-?(?:EA|WE)ST)?|-[EW]|UD)?|E(?:A?ST)?|O(?:EST)?|W(?:EST)?)))\\s(?-xism:(?i:s(?:t(?:e.?|.)?|aint)))\\s(\\w+(\\s\\w+)*))$",
"(?:\\b(?-xism:(?i:(?:C(?:R(?:[KT]|E(?:S(?:(?:C?EN)?T)?|CENT|EK)|S(?:(?:C?N)?T|E(?:NT)?|SI?NG)|OSS(?:ROAD|ING)|CLE?)?|O(?:R(?:NERS?|S)?|UR(?:TS?|SE)|MMON|VES?)|A(?:USE?WAY|NYO?N|MP|PE)|IR(?:C(?:L(?:ES?)?)?|S)?|EN(?:T(?:ERS?|RE?)?)?|L(?:IFFS?|FS?|U?B)|N(?:TE?R|YN)|T(?:RS?|S)?|M[NP]|URVE?|PE?|SWY|VS?|YN|K)|S(?:T(?:[NS]|R(?:[MT]|A(?:V(?:E(?:N(?:UE)?)?|N)?)?|E(?:ETS?|AM|ME)|VN(?:UE)?)?|A(?:T(?:IO)?N)?)?|H(?:O(?:A(?:LS?|RS?)|RES?)|LS?|RS?)|P(?:R(?:INGS?|NGS?)|NGS?|URS?|GS?)|Q(?:U(?:ARES?)?|R[ES]?|S)?|(?:UM(?:IT?|MI)|M)T|K(?:YWA|W)Y)|P(?:A(?:RK(?:W(?:AYS?|Y)|S)?|SS(?:AGE)?|THS?)|L(?:A(?:IN(?:E?S)?|CE|ZA)|NS?|ZA?)?|R(?:[KR]|AI?RIE|TS?)?|K(?:W(?:YS?|AY)|Y)?|O(?:INTS?|RTS?)|I(?:KES?|NES?)|NES?|SGE|TS?)|B(?:O(?:UL(?:EVARD|V)?|T(?:TO?M)?)|R(?:A?NCH|I?DGE|OOKS?|KS?|G)?|Y(?:P(?:A(?:S?S)?|S)?|U)|L(?:UF(?:FS?)?|FS?|VD)|E(?:ACH|ND)|AYO[OU]|URGS?|GS?|CH|ND|TM)|M(?:O(?:UNT(?:AINS?|IN)?|TORWAY)|N(?:T(?:AIN|NS?)?|RS?)|E(?:(?:DO)?WS|ADOWS?)|I(?:SS(?:IO)?N|LLS?)|T(?:NS?|IN|WY)?|A(?:NORS?|LL)|DWS?|S?SN|LS?)|T(?:R(?:A(?:C(?:ES?|KS?)|FFICWAY|ILS?|K)|[FW]Y|N?PK|KS?|LS?|CE)?|U(?:N(?:N(?:ELS?|L)|LS?|EL)|RNP(?:IKE|K))|ER(?:R(?:ACE)?)?|HROUGHWAY|PKE?)|F(?:R(?:(?:(?:EE)?WA?|R)?Y|DS?|GS?|KS?|S?T)|OR(?:G(?:ES?)?|ESTS?|DS?|KS?|T)|L(?:ATS?|DS?|TS?|S)|(?:ERR|W)Y|IELDS?|ALLS?|T)|R(?:A(?:D(?:(?:I[AE])?L)?|NCH(?:ES)?|PIDS?|MP)|I(?:V(?:E?R)?|DGES?)|D(?:G[ES]?|S)?|O(?:ADS?|W)|NCHS?|U[EN]|E?ST|PDS?|VR)|H(?:A(?:RB(?:ORS?|R)?|VE?N)|I(?:(?:GH)?WA?Y|LLS?)|OL(?:LOWS?|WS?)|L(?:LW|S)?|EIGHTS?|BRS?|RBOR|WA?Y|GTS|TS?|VN)|V(?:I(?:LL(?:AG(?:ES?)?|(?:IAG)?E|G)?|A(?:DU?CT)?|S(?:TA?)?|EWS?)|L(?:GS?|YS?|LY)?|ALL(?:EYS?|Y)|STA?|DCT|WS?)|G(?:R(?:D(?:NS?|EN)|OV(?:ES?)?|EENS?|NS?|VS?)|A(?:T(?:EWA?|WA)Y|RD(?:ENS?|N))|L(?:ENS?|NS?)|TWA?Y|DNS?)|L(?:A(?:N(?:D(?:ING)?|ES?)|KES?)?|O(?:CKS?|DGE?|OPS?|AF)|I(?:GHTS?|NE)|N(?:DN?G)?|CKS?|DGE?|GTS?|KS?|F)|E(?:X(?:P(?:[WY]|R(?:ESS(?:WAY)?)?)?|T(?:(?:NS)?N|ENSIONS?|S)?)|ST(?:ATES?|S)?)|A(?:V(?:E(?:N(?:UE?)?)?|N(?:UE)?)?|L(?:L(?:E[EY]|Y)|Y)|RC(?:ADE)?|NN?E?X)|D(?:[LM]|R(?:[SV]|IV(?:ES?)?)?|
IV(?:IDE)?|A(?:LE|M)|VD?)|J(?:UNCT(?:IONS?|O?N)|CT(?:ION|NS?|S)?)|I(?:S(?:L(?:ANDS?|NDS?|ES?)|S)?|NLE?T)|O(?:V(?:ERPASS|A?L)|RCH(?:A?RD)?|PAS)|W(?:A(?:L(?:KS?|L)|YS?)|ELLS?|LS?|Y)|K(?:N(?:OL(?:LS?)?|LS?)|EYS?|YS?)|U(?:N(?:(?:DERPAS)?S|IONS?)?|PAS)|X(?:ING|RD)|NE?CK)))\\s)((?-xism:(?i:(?:N(?:[EW]|OR(?:TH(?:-?(?:EA|WE)ST)?|D)|-[EW])?|S(?:[EW]|OUTH(?:-?(?:EA|WE)ST)?|-[EW]|UD)?|E(?:A?ST)?|O(?:EST)?|W(?:EST)?)))\\s(?-xism:(?i:s(?:t(?:e.?|.)?|aint)))\\s(\\w+(\\s\\w+)*))$",
"(?:\\b(?-xism:(?i:(?:C(?:R(?:[KT]|E(?:S(?:(?:C?EN)?T)?|CENT|EK)|S(?:(?:C?N)?T|E(?:NT)?|SI?NG)|OSS(?:ROAD|ING)|CLE?)?|O(?:R(?:NERS?|S)?|UR(?:TS?|SE)|MMON|VES?)|A(?:USE?WAY|NYO?N|MP|PE)|IR(?:C(?:L(?:ES?)?)?|S)?|EN(?:T(?:ERS?|RE?)?)?|L(?:IFFS?|FS?|U?B)|N(?:TE?R|YN)|T(?:RS?|S)?|M[NP]|URVE?|PE?|SWY|VS?|YN|K)|S(?:T(?:[NS]|R(?:[MT]|A(?:V(?:E(?:N(?:UE)?)?|N)?)?|E(?:ETS?|AM|ME)|VN(?:UE)?)?|A(?:T(?:IO)?N)?)?|H(?:O(?:A(?:LS?|RS?)|RES?)|LS?|RS?)|P(?:R(?:INGS?|NGS?)|NGS?|URS?|GS?)|Q(?:U(?:ARES?)?|R[ES]?|S)?|(?:UM(?:IT?|MI)|M)T|K(?:YWA|W)Y)|P(?:A(?:RK(?:W(?:AYS?|Y)|S)?|SS(?:AGE)?|THS?)|L(?:A(?:IN(?:E?S)?|CE|ZA)|NS?|ZA?)?|R(?:[KR]|AI?RIE|TS?)?|K(?:W(?:YS?|AY)|Y)?|O(?:INTS?|RTS?)|I(?:KES?|NES?)|NES?|SGE|TS?)|B(?:O(?:UL(?:EVARD|V)?|T(?:TO?M)?)|R(?:A?NCH|I?DGE|OOKS?|KS?|G)?|Y(?:P(?:A(?:S?S)?|S)?|U)|L(?:UF(?:FS?)?|FS?|VD)|E(?:ACH|ND)|AYO[OU]|URGS?|GS?|CH|ND|TM)|M(?:O(?:UNT(?:AINS?|IN)?|TORWAY)|N(?:T(?:AIN|NS?)?|RS?)|E(?:(?:DO)?WS|ADOWS?)|I(?:SS(?:IO)?N|LLS?)|T(?:NS?|IN|WY)?|A(?:NORS?|LL)|DWS?|S?SN|LS?)|T(?:R(?:A(?:C(?:ES?|KS?)|FFICWAY|ILS?|K)|[FW]Y|N?PK|KS?|LS?|CE)?|U(?:N(?:N(?:ELS?|L)|LS?|EL)|RNP(?:IKE|K))|ER(?:R(?:ACE)?)?|HROUGHWAY|PKE?)|F(?:R(?:(?:(?:EE)?WA?|R)?Y|DS?|GS?|KS?|S?T)|OR(?:G(?:ES?)?|ESTS?|DS?|KS?|T)|L(?:ATS?|DS?|TS?|S)|(?:ERR|W)Y|IELDS?|ALLS?|T)|R(?:A(?:D(?:(?:I[AE])?L)?|NCH(?:ES)?|PIDS?|MP)|I(?:V(?:E?R)?|DGES?)|D(?:G[ES]?|S)?|O(?:ADS?|W)|NCHS?|U[EN]|E?ST|PDS?|VR)|H(?:A(?:RB(?:ORS?|R)?|VE?N)|I(?:(?:GH)?WA?Y|LLS?)|OL(?:LOWS?|WS?)|L(?:LW|S)?|EIGHTS?|BRS?|RBOR|WA?Y|GTS|TS?|VN)|V(?:I(?:LL(?:AG(?:ES?)?|(?:IAG)?E|G)?|A(?:DU?CT)?|S(?:TA?)?|EWS?)|L(?:GS?|YS?|LY)?|ALL(?:EYS?|Y)|STA?|DCT|WS?)|G(?:R(?:D(?:NS?|EN)|OV(?:ES?)?|EENS?|NS?|VS?)|A(?:T(?:EWA?|WA)Y|RD(?:ENS?|N))|L(?:ENS?|NS?)|TWA?Y|DNS?)|L(?:A(?:N(?:D(?:ING)?|ES?)|KES?)?|O(?:CKS?|DGE?|OPS?|AF)|I(?:GHTS?|NE)|N(?:DN?G)?|CKS?|DGE?|GTS?|KS?|F)|E(?:X(?:P(?:[WY]|R(?:ESS(?:WAY)?)?)?|T(?:(?:NS)?N|ENSIONS?|S)?)|ST(?:ATES?|S)?)|A(?:V(?:E(?:N(?:UE?)?)?|N(?:UE)?)?|L(?:L(?:E[EY]|Y)|Y)|RC(?:ADE)?|NN?E?X)|D(?:[LM]|R(?:[SV]|IV(?:ES?)?)?|
IV(?:IDE)?|A(?:LE|M)|VD?)|J(?:UNCT(?:IONS?|O?N)|CT(?:ION|NS?|S)?)|I(?:S(?:L(?:ANDS?|NDS?|ES?)|S)?|NLE?T)|O(?:V(?:ERPASS|A?L)|RCH(?:A?RD)?|PAS)|W(?:A(?:L(?:KS?|L)|YS?)|ELLS?|LS?|Y)|K(?:N(?:OL(?:LS?)?|LS?)|EYS?|YS?)|U(?:N(?:(?:DERPAS)?S|IONS?)?|PAS)|X(?:ING|RD)|NE?CK)))\\s)((?-xism:(?i:s(?:t(?:e.?|.)?|aint)))\\s(\\w+(\\s\\w+)*))$",
"(?:\\b(?-xism:(?i:(?:C(?:R(?:[KT]|E(?:S(?:(?:C?EN)?T)?|CENT|EK)|S(?:(?:C?N)?T|E(?:NT)?|SI?NG)|OSS(?:ROAD|ING)|CLE?)?|O(?:R(?:NERS?|S)?|UR(?:TS?|SE)|MMON|VES?)|A(?:USE?WAY|NYO?N|MP|PE)|IR(?:C(?:L(?:ES?)?)?|S)?|EN(?:T(?:ERS?|RE?)?)?|L(?:IFFS?|FS?|U?B)|N(?:TE?R|YN)|T(?:RS?|S)?|M[NP]|URVE?|PE?|SWY|VS?|YN|K)|S(?:T(?:[NS]|R(?:[MT]|A(?:V(?:E(?:N(?:UE)?)?|N)?)?|E(?:ETS?|AM|ME)|VN(?:UE)?)?|A(?:T(?:IO)?N)?)?|H(?:O(?:A(?:LS?|RS?)|RES?)|LS?|RS?)|P(?:R(?:INGS?|NGS?)|NGS?|URS?|GS?)|Q(?:U(?:ARES?)?|R[ES]?|S)?|(?:UM(?:IT?|MI)|M)T|K(?:YWA|W)Y)|P(?:A(?:RK(?:W(?:AYS?|Y)|S)?|SS(?:AGE)?|THS?)|L(?:A(?:IN(?:E?S)?|CE|ZA)|NS?|ZA?)?|R(?:[KR]|AI?RIE|TS?)?|K(?:W(?:YS?|AY)|Y)?|O(?:INTS?|RTS?)|I(?:KES?|NES?)|NES?|SGE|TS?)|B(?:O(?:UL(?:EVARD|V)?|T(?:TO?M)?)|R(?:A?NCH|I?DGE|OOKS?|KS?|G)?|Y(?:P(?:A(?:S?S)?|S)?|U)|L(?:UF(?:FS?)?|FS?|VD)|E(?:ACH|ND)|AYO[OU]|URGS?|GS?|CH|ND|TM)|M(?:O(?:UNT(?:AINS?|IN)?|TORWAY)|N(?:T(?:AIN|NS?)?|RS?)|E(?:(?:DO)?WS|ADOWS?)|I(?:SS(?:IO)?N|LLS?)|T(?:NS?|IN|WY)?|A(?:NORS?|LL)|DWS?|S?SN|LS?)|T(?:R(?:A(?:C(?:ES?|KS?)|FFICWAY|ILS?|K)|[FW]Y|N?PK|KS?|LS?|CE)?|U(?:N(?:N(?:ELS?|L)|LS?|EL)|RNP(?:IKE|K))|ER(?:R(?:ACE)?)?|HROUGHWAY|PKE?)|F(?:R(?:(?:(?:EE)?WA?|R)?Y|DS?|GS?|KS?|S?T)|OR(?:G(?:ES?)?|ESTS?|DS?|KS?|T)|L(?:ATS?|DS?|TS?|S)|(?:ERR|W)Y|IELDS?|ALLS?|T)|R(?:A(?:D(?:(?:I[AE])?L)?|NCH(?:ES)?|PIDS?|MP)|I(?:V(?:E?R)?|DGES?)|D(?:G[ES]?|S)?|O(?:ADS?|W)|NCHS?|U[EN]|E?ST|PDS?|VR)|H(?:A(?:RB(?:ORS?|R)?|VE?N)|I(?:(?:GH)?WA?Y|LLS?)|OL(?:LOWS?|WS?)|L(?:LW|S)?|EIGHTS?|BRS?|RBOR|WA?Y|GTS|TS?|VN)|V(?:I(?:LL(?:AG(?:ES?)?|(?:IAG)?E|G)?|A(?:DU?CT)?|S(?:TA?)?|EWS?)|L(?:GS?|YS?|LY)?|ALL(?:EYS?|Y)|STA?|DCT|WS?)|G(?:R(?:D(?:NS?|EN)|OV(?:ES?)?|EENS?|NS?|VS?)|A(?:T(?:EWA?|WA)Y|RD(?:ENS?|N))|L(?:ENS?|NS?)|TWA?Y|DNS?)|L(?:A(?:N(?:D(?:ING)?|ES?)|KES?)?|O(?:CKS?|DGE?|OPS?|AF)|I(?:GHTS?|NE)|N(?:DN?G)?|CKS?|DGE?|GTS?|KS?|F)|E(?:X(?:P(?:[WY]|R(?:ESS(?:WAY)?)?)?|T(?:(?:NS)?N|ENSIONS?|S)?)|ST(?:ATES?|S)?)|A(?:V(?:E(?:N(?:UE?)?)?|N(?:UE)?)?|L(?:L(?:E[EY]|Y)|Y)|RC(?:ADE)?|NN?E?X)|D(?:[LM]|R(?:[SV]|IV(?:ES?)?)?|
IV(?:IDE)?|A(?:LE|M)|VD?)|J(?:UNCT(?:IONS?|O?N)|CT(?:ION|NS?|S)?)|I(?:S(?:L(?:ANDS?|NDS?|ES?)|S)?|NLE?T)|O(?:V(?:ERPASS|A?L)|RCH(?:A?RD)?|PAS)|W(?:A(?:L(?:KS?|L)|YS?)|ELLS?|LS?|Y)|K(?:N(?:OL(?:LS?)?|LS?)|EYS?|YS?)|U(?:N(?:(?:DERPAS)?S|IONS?)?|PAS)|X(?:ING|RD)|NE?CK)))\\s)((?-xism:(?i:(?:N(?:[EW]|OR(?:TH(?:-?(?:EA|WE)ST)?|D)|-[EW])?|S(?:[EW]|OUTH(?:-?(?:EA|WE)ST)?|-[EW]|UD)?|E(?:A?ST)?|O(?:EST)?|W(?:EST)?)))\\s(\\w+(\\s\\w+)*))$",
"(?:\\b(?-xism:(?i:(?:C(?:R(?:[KT]|E(?:S(?:(?:C?EN)?T)?|CENT|EK)|S(?:(?:C?N)?T|E(?:NT)?|SI?NG)|OSS(?:ROAD|ING)|CLE?)?|O(?:R(?:NERS?|S)?|UR(?:TS?|SE)|MMON|VES?)|A(?:USE?WAY|NYO?N|MP|PE)|IR(?:C(?:L(?:ES?)?)?|S)?|EN(?:T(?:ERS?|RE?)?)?|L(?:IFFS?|FS?|U?B)|N(?:TE?R|YN)|T(?:RS?|S)?|M[NP]|URVE?|PE?|SWY|VS?|YN|K)|S(?:T(?:[NS]|R(?:[MT]|A(?:V(?:E(?:N(?:UE)?)?|N)?)?|E(?:ETS?|AM|ME)|VN(?:UE)?)?|A(?:T(?:IO)?N)?)?|H(?:O(?:A(?:LS?|RS?)|RES?)|LS?|RS?)|P(?:R(?:INGS?|NGS?)|NGS?|URS?|GS?)|Q(?:U(?:ARES?)?|R[ES]?|S)?|(?:UM(?:IT?|MI)|M)T|K(?:YWA|W)Y)|P(?:A(?:RK(?:W(?:AYS?|Y)|S)?|SS(?:AGE)?|THS?)|L(?:A(?:IN(?:E?S)?|CE|ZA)|NS?|ZA?)?|R(?:[KR]|AI?RIE|TS?)?|K(?:W(?:YS?|AY)|Y)?|O(?:INTS?|RTS?)|I(?:KES?|NES?)|NES?|SGE|TS?)|B(?:O(?:UL(?:EVARD|V)?|T(?:TO?M)?)|R(?:A?NCH|I?DGE|OOKS?|KS?|G)?|Y(?:P(?:A(?:S?S)?|S)?|U)|L(?:UF(?:FS?)?|FS?|VD)|E(?:ACH|ND)|AYO[OU]|URGS?|GS?|CH|ND|TM)|M(?:O(?:UNT(?:AINS?|IN)?|TORWAY)|N(?:T(?:AIN|NS?)?|RS?)|E(?:(?:DO)?WS|ADOWS?)|I(?:SS(?:IO)?N|LLS?)|T(?:NS?|IN|WY)?|A(?:NORS?|LL)|DWS?|S?SN|LS?)|T(?:R(?:A(?:C(?:ES?|KS?)|FFICWAY|ILS?|K)|[FW]Y|N?PK|KS?|LS?|CE)?|U(?:N(?:N(?:ELS?|L)|LS?|EL)|RNP(?:IKE|K))|ER(?:R(?:ACE)?)?|HROUGHWAY|PKE?)|F(?:R(?:(?:(?:EE)?WA?|R)?Y|DS?|GS?|KS?|S?T)|OR(?:G(?:ES?)?|ESTS?|DS?|KS?|T)|L(?:ATS?|DS?|TS?|S)|(?:ERR|W)Y|IELDS?|ALLS?|T)|R(?:A(?:D(?:(?:I[AE])?L)?|NCH(?:ES)?|PIDS?|MP)|I(?:V(?:E?R)?|DGES?)|D(?:G[ES]?|S)?|O(?:ADS?|W)|NCHS?|U[EN]|E?ST|PDS?|VR)|H(?:A(?:RB(?:ORS?|R)?|VE?N)|I(?:(?:GH)?WA?Y|LLS?)|OL(?:LOWS?|WS?)|L(?:LW|S)?|EIGHTS?|BRS?|RBOR|WA?Y|GTS|TS?|VN)|V(?:I(?:LL(?:AG(?:ES?)?|(?:IAG)?E|G)?|A(?:DU?CT)?|S(?:TA?)?|EWS?)|L(?:GS?|YS?|LY)?|ALL(?:EYS?|Y)|STA?|DCT|WS?)|G(?:R(?:D(?:NS?|EN)|OV(?:ES?)?|EENS?|NS?|VS?)|A(?:T(?:EWA?|WA)Y|RD(?:ENS?|N))|L(?:ENS?|NS?)|TWA?Y|DNS?)|L(?:A(?:N(?:D(?:ING)?|ES?)|KES?)?|O(?:CKS?|DGE?|OPS?|AF)|I(?:GHTS?|NE)|N(?:DN?G)?|CKS?|DGE?|GTS?|KS?|F)|E(?:X(?:P(?:[WY]|R(?:ESS(?:WAY)?)?)?|T(?:(?:NS)?N|ENSIONS?|S)?)|ST(?:ATES?|S)?)|A(?:V(?:E(?:N(?:UE?)?)?|N(?:UE)?)?|L(?:L(?:E[EY]|Y)|Y)|RC(?:ADE)?|NN?E?X)|D(?:[LM]|R(?:[SV]|IV(?:ES?)?)?|
IV(?:IDE)?|A(?:LE|M)|VD?)|J(?:UNCT(?:IONS?|O?N)|CT(?:ION|NS?|S)?)|I(?:S(?:L(?:ANDS?|NDS?|ES?)|S)?|NLE?T)|O(?:V(?:ERPASS|A?L)|RCH(?:A?RD)?|PAS)|W(?:A(?:L(?:KS?|L)|YS?)|ELLS?|LS?|Y)|K(?:N(?:OL(?:LS?)?|LS?)|EYS?|YS?)|U(?:N(?:(?:DERPAS)?S|IONS?)?|PAS)|X(?:ING|RD)|NE?CK)))\\s)((\\w+(\\s\\w+)*))$",
"(?:\\s)((?-xism:(?i:(?:N(?:[EW]|OR(?:TH(?:-?(?:EA|WE)ST)?|D)|-[EW])?|S(?:[EW]|OUTH(?:-?(?:EA|WE)ST)?|-[EW]|UD)?|E(?:A?ST)?|O(?:EST)?|W(?:EST)?)))\\s(\\w+(\\s\\w+)*))$",
"^(?:\\d+\\s(?:(?:\\w+\\s)(?-xism:(?i:(?:C(?:R(?:[KT]|E(?:S(?:(?:C?EN)?T)?|CENT|EK)|S(?:(?:C?N)?T|E(?:NT)?|SI?NG)|OSS(?:ROAD|ING)|CLE?)?|O(?:R(?:NERS?|S)?|UR(?:TS?|SE)|MMON|VES?)|A(?:USE?WAY|NYO?N|MP|PE)|IR(?:C(?:L(?:ES?)?)?|S)?|EN(?:T(?:ERS?|RE?)?)?|L(?:IFFS?|FS?|U?B)|N(?:TE?R|YN)|T(?:RS?|S)?|M[NP]|URVE?|PE?|SWY|VS?|YN|K)|S(?:T(?:[NS]|R(?:[MT]|A(?:V(?:E(?:N(?:UE)?)?|N)?)?|E(?:ETS?|AM|ME)|VN(?:UE)?)?|A(?:T(?:IO)?N)?)?|H(?:O(?:A(?:LS?|RS?)|RES?)|LS?|RS?)|P(?:R(?:INGS?|NGS?)|NGS?|URS?|GS?)|Q(?:U(?:ARES?)?|R[ES]?|S)?|(?:UM(?:IT?|MI)|M)T|K(?:YWA|W)Y)|P(?:A(?:RK(?:W(?:AYS?|Y)|S)?|SS(?:AGE)?|THS?)|L(?:A(?:IN(?:E?S)?|CE|ZA)|NS?|ZA?)?|R(?:[KR]|AI?RIE|TS?)?|K(?:W(?:YS?|AY)|Y)?|O(?:INTS?|RTS?)|I(?:KES?|NES?)|NES?|SGE|TS?)|B(?:O(?:UL(?:EVARD|V)?|T(?:TO?M)?)|R(?:A?NCH|I?DGE|OOKS?|KS?|G)?|Y(?:P(?:A(?:S?S)?|S)?|U)|L(?:UF(?:FS?)?|FS?|VD)|E(?:ACH|ND)|AYO[OU]|URGS?|GS?|CH|ND|TM)|M(?:O(?:UNT(?:AINS?|IN)?|TORWAY)|N(?:T(?:AIN|NS?)?|RS?)|E(?:(?:DO)?WS|ADOWS?)|I(?:SS(?:IO)?N|LLS?)|T(?:NS?|IN|WY)?|A(?:NORS?|LL)|DWS?|S?SN|LS?)|T(?:R(?:A(?:C(?:ES?|KS?)|FFICWAY|ILS?|K)|[FW]Y|N?PK|KS?|LS?|CE)?|U(?:N(?:N(?:ELS?|L)|LS?|EL)|RNP(?:IKE|K))|ER(?:R(?:ACE)?)?|HROUGHWAY|PKE?)|F(?:R(?:(?:(?:EE)?WA?|R)?Y|DS?|GS?|KS?|S?T)|OR(?:G(?:ES?)?|ESTS?|DS?|KS?|T)|L(?:ATS?|DS?|TS?|S)|(?:ERR|W)Y|IELDS?|ALLS?|T)|R(?:A(?:D(?:(?:I[AE])?L)?|NCH(?:ES)?|PIDS?|MP)|I(?:V(?:E?R)?|DGES?)|D(?:G[ES]?|S)?|O(?:ADS?|W)|NCHS?|U[EN]|E?ST|PDS?|VR)|H(?:A(?:RB(?:ORS?|R)?|VE?N)|I(?:(?:GH)?WA?Y|LLS?)|OL(?:LOWS?|WS?)|L(?:LW|S)?|EIGHTS?|BRS?|RBOR|WA?Y|GTS|TS?|VN)|V(?:I(?:LL(?:AG(?:ES?)?|(?:IAG)?E|G)?|A(?:DU?CT)?|S(?:TA?)?|EWS?)|L(?:GS?|YS?|LY)?|ALL(?:EYS?|Y)|STA?|DCT|WS?)|G(?:R(?:D(?:NS?|EN)|OV(?:ES?)?|EENS?|NS?|VS?)|A(?:T(?:EWA?|WA)Y|RD(?:ENS?|N))|L(?:ENS?|NS?)|TWA?Y|DNS?)|L(?:A(?:N(?:D(?:ING)?|ES?)|KES?)?|O(?:CKS?|DGE?|OPS?|AF)|I(?:GHTS?|NE)|N(?:DN?G)?|CKS?|DGE?|GTS?|KS?|F)|E(?:X(?:P(?:[WY]|R(?:ESS(?:WAY)?)?)?|T(?:(?:NS)?N|ENSIONS?|S)?)|ST(?:ATES?|S)?)|A(?:V(?:E(?:N(?:UE?)?)?|N(?:UE)?)?|L(?:L(?:E[EY]|Y)|Y)|RC(?:ADE)?|NN?E?X)|D(?:[LM]|R(?
:[SV]|IV(?:ES?)?)?|IV(?:IDE)?|A(?:LE|M)|VD?)|J(?:UNCT(?:IONS?|O?N)|CT(?:ION|NS?|S)?)|I(?:S(?:L(?:ANDS?|NDS?|ES?)|S)?|NLE?T)|O(?:V(?:ERPASS|A?L)|RCH(?:A?RD)?|PAS)|W(?:A(?:L(?:KS?|L)|YS?)|ELLS?|LS?|Y)|K(?:N(?:OL(?:LS?)?|LS?)|EYS?|YS?)|U(?:N(?:(?:DERPAS)?S|IONS?)?|PAS)|X(?:ING|RD)|NE?CK)))))()$",
"^(?:\\d+\\s(?:(?:\\w+\\s)*\\w+\\s))(\\w+)$"
};

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,692 @@
/* -- standard.c
interface for the standardizer
Prototype 7H08 (This file was written by Walter Sinclair).
This file is part of PAGC.
Copyright (c) 2009 Walter Bruce Sinclair
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/*-- For pagc-0.4.2 : last revised 2012-07-18 --*/
#undef DEBUG
//#define DEBUG 1
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "pagc_api.h"
#ifdef BUILD_API
#include "pagc_std_api.h"
#endif
#define GAZ_LEXICON
/* -- local prototypes -- */
/*-- <revision date='2012-07-22'> Keep track of start_state </revision> --*/
/* Hand the scanned morphemes to the evaluator ; FALSE on failure. */
static int _Close_Stand_Field_(STAND_PARAM *) ;
/* Drive _Scan_Next_ over a whole input string ; FALSE on scanner error. */
static int _Scan_String_(STAND_PARAM *, char *) ;
/* Consume one token's worth of characters ; returns resume position or NULL. */
static char * _Scan_Next_(STAND_PARAM *, char *) ;
/* Characters treated as token spacing (type-2 terminators). */
static char __spacer__[] = " \\-.)}>_" ;
/* Terminate the scan buffer and record its length in the local n.
   Relies on locals __dest__, __scan_buf__ and n in the invoking scope. */
#define TERM_AND_LENGTH \
*__dest__ = SENTINEL ; \
n = strlen(__scan_buf__)
/* Emit the collected buffer as a morph of type TOKEN_ARG ; NULL on failure,
   otherwise return the current source position __src__. */
#define RETURN_NEW_MORPH(TOKEN_ARG) \
if (!new_morph(__stand_param__,TOKEN_ARG,__scan_buf__,n))\
{\
return NULL ; \
} \
return __src__
/* Copy the current char a and the lookahead char b into the buffer. */
#define COLLECT_LOOKAHEAD \
*__dest__++ = a ; __src__++ ; *__dest__++ = b ; __src__++
/* Copy characters into the buffer while COND holds for a. */
#define COLLECT_WHILE(COND) \
do { *__dest__++ = a ; __src__++ ; a = *__src__ ; } while (COND)
/* Advance past characters matching COND without copying them. */
#define NO_COLLECT_WHILE(COND) \
do { __dest__++ ; __src__++ ; a = *__src__ ; } while (COND)
/* Recognize an ordinal suffix (e.g. 1st/3rd/2nd) : the digit N followed by
   the lower/upper-case suffix letter, excluding the 11/12/13 exceptions
   (handled by the '1' lookbehind test). Falls through to break otherwise. */
#define TEST_FOR_ORD_DIGIT(N,NEXT_LOW,NEXT_UP) \
if ((b == NEXT_LOW) || (b == NEXT_UP)) \
{ \
if (last_digit == N)\
{ \
if ((n < 2 ) || (*(__dest__-2) != '1')) \
{ \
COLLECT_LOOKAHEAD ; \
TERM_AND_LENGTH ; \
RETURN_NEW_MORPH(DORD) ; \
} \
} \
} \
break
/*========================================================================
<summary>
<function name='standard.c (standardize_field)'/>
<remarks>This function is called with a pointer to the
str to standardize and a start state indicating
the kind of standardization to perform. It invokes
the scanner to start the creation of the morphemes
<calls><functionref='tokenize.c (initialize_morphs)'/></calls>
<calls><functionref='_Close_Stand_Field_'/></calls>
<calls><functionref='_Scan_String_'/></calls>
</summary>
=========================================================================*/
int standardize_field(STAND_PARAM *__stand_param__ ,char *__in_str__ , int client_start_state )
{
	/* Standardize one input string. The start state selects both the
	   lexicon used for tokenizing and the grammar entry point:
	   - above EXTRA_STATE : POI (feature-word) lexicon
	   - MACRO (when GAZ_LEXICON is compiled in) : gazetteer lexicon
	   - anything else : the general address lexicon.
	   Returns FALSE if scanning or evaluation fails. */
	__stand_param__->lexicon = __stand_param__->address_lexicon ;
	if (client_start_state > EXTRA_STATE)
	{
		__stand_param__->lexicon = __stand_param__->poi_lexicon ;
	}
#ifdef GAZ_LEXICON
	else if (client_start_state == MACRO)
	{
		__stand_param__->lexicon = __stand_param__->gaz_lexicon ;
	}
#endif
	/* Remember the caller's start state for the later analysis phase. */
	__stand_param__->start_state = client_start_state ;
	initialize_morphs(__stand_param__) ;
	/* Tokenize the whole string ; abort on scanner error. */
	if (!_Scan_String_(__stand_param__,__in_str__))
	{
		return FALSE ;
	}
	/* Evaluate the token sequence and populate the output fields. */
	return _Close_Stand_Field_(__stand_param__) ;
}
static int _Scan_String_(STAND_PARAM *__stand_param__ ,char *__in_str__ )
{
	/* Walk the input one token at a time until newline or terminator,
	   then hand the collected tokens to the parser. FALSE on error. */
	char *__cursor__ = __in_str__ ;
	for ( ; ; )
	{
		char next_char = *__cursor__ ;
		if ((next_char == '\n') || (next_char == SENTINEL))
		{
			/* End of input : process what we gathered. */
			return (process_input(__stand_param__)) ;
		}
		/* Gather the next character sequence into a token. */
		__cursor__ = _Scan_Next_(__stand_param__,__cursor__) ;
		if (__cursor__ == NULL)
		{
			/* The scanner reported an error. */
			return FALSE ;
		}
	}
}
/* Collect the next token (morpheme) from the input. Returns the position
   at which to resume scanning, or NULL if new_morph fails. Recognizes
   terminators, numbers, fractions, ordinals, ampersands, words and spacing.
   Fix: isdigit/isalpha are now called through an (unsigned char) cast --
   passing a plain char that may be negative is undefined behavior
   (CERT STR37-C). */
static char * _Scan_Next_( STAND_PARAM *__stand_param__,char * __in_ptr__)
{
	int n ;
	char __scan_buf__[MAXSTRLEN] ;
	/*-- <remarks> Collect a sequence of characters into the scan_buf </remarks> --*/
	char *__src__ = __in_ptr__ ;
	char a = *__src__ ;
	char *__dest__ = __scan_buf__ ;
	*__dest__ = SENTINEL ;
	/*-- <remarks> Type one terminators </remarks> --*/
	if ((a == ',') || (a == '\t') || (a == ';'))
	{
		*__dest__++ = a ;
		*__dest__ = SENTINEL;
		set_term(__stand_param__,1,__scan_buf__) ;
		/*-- <remarks> Point to next input char </remarks> --*/
		return (__src__ + 1) ;
	}
	/*-- <remarks> Numeric sequences : ordinals, fractions and numbers </remarks> --*/
	if (isdigit((unsigned char)a))
	{
		char b ;
		char last_digit ;
		COLLECT_WHILE(isdigit((unsigned char)a)) ;
		/*-- <remarks> Get a character of lookahead and one of lookbehind </remarks> --*/
		b = *(__src__ + 1 ) ;
		last_digit = *(__dest__ - 1 ) ; /*-- last digit collected --*/
		n = __dest__ - __scan_buf__ ;
		switch (a)
		{
		/*-- <remarks> Fractions </remarks> --*/
		case '/' :
			/*-- <remarks> Collect the rest of the fraction </remarks> --*/
			if (isdigit((unsigned char)b))
			{
				switch (b)
				{
				case '2' :
					/* only 1/2 is a valid half */
					if (last_digit == '1')
					{
						COLLECT_LOOKAHEAD ;
						TERM_AND_LENGTH ;
						RETURN_NEW_MORPH(DFRACT) ;
					}
					break ;
				case '3' :
					/* 1/3 and 2/3 */
					if ((last_digit == '1') || (last_digit == '2'))
					{
						COLLECT_LOOKAHEAD ;
						TERM_AND_LENGTH ;
						RETURN_NEW_MORPH(DFRACT) ;
					}
					break ;
				case '4' :
					/* 1/4 and 3/4 */
					if ((last_digit == '1') || (last_digit == '3'))
					{
						COLLECT_LOOKAHEAD ;
						TERM_AND_LENGTH ;
						RETURN_NEW_MORPH(DFRACT) ;
					}
					break ;
				} /*-- <remarks> end of switch on lookahead </remarks> --*/
			}
			break ;
		/*-- <remarks> ordinals </remarks> --*/
		case 's' : case 'S' :
			/*-- <remarks> 1st, 21st, 31st, -- for 1 </remarks> --*/
			/*-- <remarks> Point to next input char </remarks> --*/
			TEST_FOR_ORD_DIGIT('1','t','T') ;
		case 'r' : case 'R' :
			/*-- <remarks> 3rd, 23rd, 33rd, -- for 3 </remarks> --*/
			/*-- <remarks> Point to next input char </remarks> --*/
			TEST_FOR_ORD_DIGIT('3','d','D') ;
		case 'n' : case 'N' :
			/*-- <remarks> 2nd, 22nd, 32nd, -- for 2 </remarks> --*/
			/*-- <remarks> Point to next input char </remarks> --*/
			TEST_FOR_ORD_DIGIT('2','d','D') ;
		case 't' : case 'T' :
			if ((b == 'h') || (b == 'H'))
			{
				switch (last_digit)
				{
				case '1' : case '2' : case '3' :
					/*-- <remarks> 11th, 111th, 211th etc -- for 11-13 </remarks> --*/
					if ((n > 1) && (*(__dest__ - 2) == '1'))
					{
						COLLECT_LOOKAHEAD ;
						TERM_AND_LENGTH ;
						/*-- <remarks> Point to next input char </remarks> --*/
						RETURN_NEW_MORPH(DORD) ;
					}
					break ;
				default :
					/*-- <remarks> 4th, 14th, 24th etc -- for 0, 4-9 </remarks> --*/
					COLLECT_LOOKAHEAD ;
					TERM_AND_LENGTH ;
					/*-- <remarks> Point to next input char </remarks> --*/
					RETURN_NEW_MORPH(DORD) ;
				}
			}
			break ;
		}
		/*-- <remarks> ordinary numeric sequence </remarks> --*/
		TERM_AND_LENGTH ;
		/*-- <remarks> Retain position </remarks> --*/
		RETURN_NEW_MORPH(DNUMBER) ;
	}
	/*-- <revision date='2009-08-15'> Fix ampersand : P&R --> P & R </revision> --*/
	if (a == '&')
	{
		COLLECT_WHILE(a == '&') ;
		TERM_AND_LENGTH ;
		RETURN_NEW_MORPH(DSINGLE) ;
	}
	/*-- <remarks> Alphabetic sequence </remarks> --*/
	if ((isalpha((unsigned char)a)) || (a == '\'') || (a == '#'))
	{
		COLLECT_WHILE((isalpha((unsigned char)a)) || (a == '\'')) ;
		TERM_AND_LENGTH ;
		/*-- <remarks> Retain position </remarks> --*/
		switch (n)
		{
		case 1 :
			RETURN_NEW_MORPH(DSINGLE) ;
		case 2 :
			RETURN_NEW_MORPH(DDOUBLE) ;
		default :
			RETURN_NEW_MORPH( DWORDT ) ;
		}
		/*-- NOTE(review): unreachable -- every case above returns via
		     RETURN_NEW_MORPH ; kept for defensiveness. --*/
		return __src__ ;
	}
	/*-- <remarks> Type 2 terminators ( spacing ) </remarks> --*/
	if (strchr(__spacer__,a) != NULL)
	{
		NO_COLLECT_WHILE(strchr(__spacer__,a) != NULL) ;
		set_term(__stand_param__,2,__scan_buf__) ;
		/*-- <remarks> Retain position </remarks> --*/
		return (__src__) ;
	}
	/*-- <remarks> Ignore everything not specified. Point to next input char. </remarks> --*/
	return (__src__ + 1) ;
}
#ifdef BUILD_API
/*
Illustrative shapes of the API structs (authoritative definitions live in
pagc_std_api.h -- keep this comment in sync with that header):

typedef struct STANDARDIZER_s {
    PAGC_GLOBAL *pagc_p;      // process-level state (lexicons, rules, errors)
    STAND_PARAM *misc_stand;  // standardization context
    ERR_PARAM *err_p;         // shared error list
} STANDARDIZER;

typedef struct STDADDR_s {    // one strdup'd string per standardized field
    char *building;
    char *house_num;
    char *predir;
    char *qual;
    char *pretype;
    char *name;
    char *suftype;
    char *sufdir;
    char *ruralroute;
    char *extra;
    char *city;
    char *state;
    char *country;
    char *postcode;
    char *box;
    char *unit;
} STDADDR;
*/
STANDARDIZER *std_init()
{
	/* Allocate a zeroed standardizer and its process-global state.
	   Returns NULL if either allocation fails (nothing leaked). */
	STANDARDIZER *std = (STANDARDIZER *) calloc(1,sizeof(STANDARDIZER)) ;
	if (std == NULL)
	{
		return NULL ;
	}
	std->pagc_p = (PAGC_GLOBAL *) calloc(1,sizeof(PAGC_GLOBAL)) ;
	if (std->pagc_p == NULL)
	{
		free(std) ;
		return NULL ;
	}
	/* One shared error list for everything hanging off this standardizer. */
	std->pagc_p->process_errors = init_errors(std->pagc_p, NULL) ;
	std->err_p = std->pagc_p->process_errors ;
	return std ;
}
int std_use_lex(STANDARDIZER *std, LEXICON *lex)
{
	/* Adopt the lexicon's hash table, then discard the now-empty wrapper.
	   Afterwards install the default token definitions and the definition
	   block table. Returns FALSE on failure. */
	std->pagc_p->addr_lexicon = lex->hash_table ;
	lex->hash_table = NULL;
	lex_free(lex);
	if (!setup_default_defs(std->pagc_p))
	{
		return FALSE ;
	}
	return install_def_block_table(std->pagc_p->addr_lexicon, std->pagc_p->process_errors) ;
}
int std_use_gaz(STANDARDIZER *std, LEXICON *gaz)
{
	/* Adopt the gazetteer's hash table and free the emptied wrapper. */
	std->pagc_p->gaz_lexicon = gaz->hash_table ;
	gaz->hash_table = NULL;
	lex_free(gaz);
	return 0;
}
int std_use_rules(STANDARDIZER *std, RULES *rules)
{
	/* Rules must have been readied (compiled) before they can be adopted. */
	if ( ! rules -> ready )
	{
		RET_ERR("std_use_rules: Rules have not been readied!", std -> err_p, 1);
	}
	/* Take ownership of the rule set, then free the emptied wrapper. */
	std->pagc_p->rules = rules->r_p ;
	rules->r_p = NULL;
	rules_free(rules);
	return 0;
}
int std_ready_standardizer(STANDARDIZER *std)
{
	/* Build the standardization context (exhaustive analysis enabled).
	   Returns 0 on success, 1 on failure. */
	std->misc_stand = init_stand_context(std->pagc_p, std->err_p, 1);
	return (std->misc_stand == NULL) ? 1 : 0;
}
void std_free(STANDARDIZER *std)
{
	/* Release a standardizer created by std_init(). Safe on NULL.
	   Fixes two defects in the original:
	   - std->pagc_p->process_errors was dereferenced outside the
	     (std->pagc_p != NULL) guard, a NULL dereference when pagc_p
	     was never allocated;
	   - pagc_p itself was only freed inside the process_errors branch,
	     leaking it whenever process_errors was NULL. */
	if ( std == NULL ) return;
	if ( std -> pagc_p != NULL ) {
		DBG("Calling close_stand_process");
		close_stand_process( std -> pagc_p ) ;
		if ( std -> pagc_p -> process_errors != NULL ) {
			DBG("Calling close_errors");
			close_errors( std -> pagc_p -> process_errors );
		}
		DBG("Calling FREE_AND_NULL");
		FREE_AND_NULL( std -> pagc_p ) ;
	}
	DBG("Calling close_stand_context");
	close_stand_context( std -> misc_stand );
	DBG("Calling free");
	free( std );
}
void stdaddr_free(STDADDR *stdaddr)
{
	/* Free a STDADDR and every strdup'd member. Safe on NULL.
	   free(NULL) is a no-op, so the per-field NULL guards the original
	   carried were redundant; the trailing self-assignment of the local
	   parameter (stdaddr = NULL) had no effect and is removed. */
	if (!stdaddr) return;
	free(stdaddr->building);
	free(stdaddr->house_num);
	free(stdaddr->predir);
	free(stdaddr->qual);
	free(stdaddr->pretype);
	free(stdaddr->name);
	free(stdaddr->suftype);
	free(stdaddr->sufdir);
	free(stdaddr->ruralroute);
	free(stdaddr->extra);
	free(stdaddr->city);
	free(stdaddr->state);
	free(stdaddr->country);
	free(stdaddr->postcode);
	free(stdaddr->box);
	free(stdaddr->unit);
	free(stdaddr);
}
static char *coalesce( char *a, char *b )
{
	/* Return the first argument unless it is NULL, else the second. */
	if (a != NULL)
	{
		return a;
	}
	return b;
}
/* Dump every standardized component of result to stdout, printing an
   empty string for fields that were not populated. NULL result prints
   nothing. Debug/CLI helper only. */
void print_stdaddr( STDADDR *result )
{
	if (result) {
		printf(" building: %s\n", coalesce(result -> building, ""));
		printf(" house_num: %s\n", coalesce(result -> house_num, ""));
		printf(" predir: %s\n", coalesce(result -> predir, ""));
		printf(" qual: %s\n", coalesce(result -> qual, ""));
		printf(" pretype: %s\n", coalesce(result -> pretype, ""));
		printf(" name: %s\n", coalesce(result -> name, ""));
		printf(" suftype: %s\n", coalesce(result -> suftype, ""));
		printf(" sufdir: %s\n", coalesce(result -> sufdir, ""));
		printf("ruralroute: %s\n", coalesce(result -> ruralroute, ""));
		printf(" extra: %s\n", coalesce(result -> extra, ""));
		printf(" city: %s\n", coalesce(result -> city, ""));
		printf(" state: %s\n", coalesce(result -> state, ""));
		printf(" country: %s\n", coalesce(result -> country, ""));
		printf(" postcode: %s\n", coalesce(result -> postcode, ""));
		printf(" box: %s\n", coalesce(result -> box, ""));
		printf(" unit: %s\n", coalesce(result -> unit, ""));
	}
}
/*
STDADDR *std_standardize_one(STANDARDIZER *std, char *address_one_line, int options)
{
return NULL;
}
*/
/* Standardize a micro (street) line and an optional macro (city/state/zip)
   line. Returns a freshly allocated STDADDR (free with stdaddr_free) or
   NULL on error (error recorded via RET_ERR). Bit 0 of options turns on
   debug output of the intermediate fields.
   Refactor: the 16 copy-pasted strdup blocks are replaced by one
   table-driven loop mapping standard_fields[i] to its STDADDR member. */
STDADDR *std_standardize_mm(STANDARDIZER *std, char *micro, char *macro, int options)
{
	STAND_PARAM *stand_address;
	STDADDR *stdaddr;
	char **field_dest[16];
	int i;
	int err;
	stand_address = std -> misc_stand ;
	if (stand_address == NULL)
		return NULL;
	/* A micro line is mandatory. */
	if (!micro || ( IS_BLANK( micro ))) {
		RET_ERR("std_standardize_mm: micro attribute to standardize!", std -> err_p, NULL);
	}
	init_output_fields( stand_address, BOTH );
	/* Standardize the macro line first (when provided). */
	if (macro && macro[0] != '\0') {
		err = standardize_field( stand_address, macro, MACRO );
		if (!err) {
			RET_ERR1("std_standardize_mm: No standardization of %s!",
				macro, std -> err_p, NULL);
		}
		if (options & 1) {
			printf("After standardize_field for macro:\n");
			output_raw_elements( stand_address , NULL ) ;
			send_fields_to_stream(stand_address->standard_fields , NULL, 0, 0);
		}
	}
	err = standardize_field( stand_address, micro, MICRO_M );
	if (!err) {
		RET_ERR1("std_standardize_mm: No standardization of %s!",
			micro, std -> err_p, NULL);
	}
	if (options & 1) {
		printf("After standardize_field for micro:\n");
		send_fields_to_stream(stand_address->standard_fields , NULL, 0, 0);
	}
	PAGC_CALLOC_STRUC(stdaddr,STDADDR,1,std -> err_p,NULL);
	/* Field index -> STDADDR member mapping, pinned in one place. */
	field_dest[0]  = &stdaddr->building;
	field_dest[1]  = &stdaddr->house_num;
	field_dest[2]  = &stdaddr->predir;
	field_dest[3]  = &stdaddr->qual;
	field_dest[4]  = &stdaddr->pretype;
	field_dest[5]  = &stdaddr->name;
	field_dest[6]  = &stdaddr->suftype;
	field_dest[7]  = &stdaddr->sufdir;
	field_dest[8]  = &stdaddr->ruralroute;
	field_dest[9]  = &stdaddr->extra;
	field_dest[10] = &stdaddr->city;
	field_dest[11] = &stdaddr->state;
	field_dest[12] = &stdaddr->country;
	field_dest[13] = &stdaddr->postcode;
	field_dest[14] = &stdaddr->box;
	field_dest[15] = &stdaddr->unit;
	/* Copy each non-empty standardized field into its STDADDR slot. */
	for (i = 0; i < 16; i++) {
		if (strlen(stand_address -> standard_fields[i]))
			*field_dest[i] = strdup(stand_address -> standard_fields[i]);
	}
	return stdaddr;
}
/* Not implemented: single-call entry point taking pre-split address
   components. Always returns NULL; use std_standardize_mm() instead. */
STDADDR *std_standardize(STANDARDIZER *std, char *address, char *city, char *state, char *postcode, char *country, int options)
{
	return NULL;
}
#else
/*========================================================================
<summary>
<function name='standard.c (init_stand_process)'/>
<remarks>set up process level, opens the lexicons and rules
and default definitions for the tokenizer</remarks>
<calls><functionref='(gamma.c) create_rules'/>, <functionref='(lexicon.c) create_lexicon'/>,
<functionref='(tokenize.c) setup_default_defs'/> and
<functionref='(analyze.c) install_def_block_table'/></calls>
</summary>
=========================================================================*/
int init_stand_process(PAGC_GLOBAL *__pagc_global__ ,const char *__rule_name__, const char *__lexicon_name__ , const char *__gazetteer_name__ , const char *__featword_name__)
{
	/* Load the rule set first ; nothing else is usable without it. */
	if ((__pagc_global__->rules = create_rules(__rule_name__,__pagc_global__)) == NULL)
	{
		return FALSE ;
	}
	/*-- <revision date='2009-08-13'> Support multiple lexicons </revision> --*/
	/* The address lexicon merges the lexicon file with the gazetteer. */
	if ((__pagc_global__->addr_lexicon = create_lexicon(__pagc_global__ ,__lexicon_name__ , __gazetteer_name__)) == NULL)
	{
		return FALSE ;
	}
	/* The feature-word (POI) lexicon stands alone. */
	if ((__pagc_global__->poi_lexicon = create_lexicon(__pagc_global__ ,__featword_name__ ,NULL)) == NULL)
	{
		return FALSE ;
	}
#ifdef GAZ_LEXICON
	/*-- <revision date='2012-06-01'> Add gaz_lexicon to be triggered on _start_state_ = MACRO </revision> --*/
	/* Gazetteer-only lexicon used when standardizing MACRO lines.
	   NOTE(review): partially constructed state is not torn down on the
	   failure paths above -- presumably close_stand_process() is expected
	   to clean up ; confirm with callers. */
	if ((__pagc_global__->gaz_lexicon = create_lexicon(__pagc_global__,__gazetteer_name__,NULL)) == NULL)
	{
		return FALSE ;
	}
#endif
	/* Default token definitions for input not found in any lexicon. */
	if (!setup_default_defs(__pagc_global__))
	{
		return FALSE ;
	}
	return (install_def_block_table(__pagc_global__->addr_lexicon ,__pagc_global__->process_errors)) ;
}
#endif
/*========================================================================
<summary>
<function name='standard.c (close_stand_process)'/>
<remarks> Called on exit to close down standardizer </remarks>
<calls> <functionref='(tokenize.c) remove_default_defs'/>,
<functionref='(gamma.c) destroy_rules'/> and
<functionref='lexicon.c (destroy_lexicon)'/></calls>
</summary>
=========================================================================*/
void close_stand_process(PAGC_GLOBAL * __pagc_global__)
{
	/* Tear down everything init_stand_process() created ; NULL tolerated. */
	if (__pagc_global__ == NULL)
	{
		return ;
	}
	DBG("remove_default_defs(__pagc_global__)");
	remove_default_defs(__pagc_global__) ;
	DBG("destroy_rules(__pagc_global__->rules) ;");
	destroy_rules(__pagc_global__->rules) ;
	/*-- <revision date='2009-08-13'> Support multiple lexicons </revision> --*/
	DBG("destroy_lexicon(__pagc_global__->addr_lexicon)");
	destroy_lexicon(__pagc_global__->addr_lexicon) ;
	DBG("destroy_lexicon(__pagc_global__->poi_lexicon)");
	destroy_lexicon(__pagc_global__->poi_lexicon) ;
	/*-- <revision date='2012-06-01'> Add gaz_lexicon to be triggered on _start_state_ = MACRO </revision> --*/
#ifdef GAZ_LEXICON
	DBG("destroy_lexicon(__pagc_global__->gaz_lexicon)");
	destroy_lexicon(__pagc_global__->gaz_lexicon) ;
#endif
}
/*========================================================================
<summary>
<function name='standard.c (init_stand_context)'/>
<param name='__err_param__'>belongs to the dataset context.</param>
<calls><functionref='analyze.c (create_segments)'/>
<returns>NULL returned on error - if so, call <functionref='close_stand_context'/></returns>
</summary>
=========================================================================*/
STAND_PARAM *init_stand_context(PAGC_GLOBAL *__pagc_global__,ERR_PARAM *__err_param__,int exhaustive_flag)
{
	STAND_PARAM *__stand_param__ ;
	/*-- <remarks> Initialization-time allocation </remarks> --*/
	PAGC_CALLOC_STRUC(__stand_param__,STAND_PARAM,1,__err_param__,NULL) ;
	if ((__stand_param__->stz_info = create_segments(__err_param__)) == NULL)
	{
		/* NOTE(review): __stand_param__ itself is not released on this
		   path, and the NULL return gives the caller nothing to pass to
		   close_stand_context() -- looks like a small leak ; confirm. */
		return NULL ;
	}
	PAGC_CALLOC_2D_ARRAY(__stand_param__->standard_fields, char, MAXOUTSYM, MAXFLDLEN, __err_param__, NULL) ;
	/* Non-zero exhaustive_flag asks the analyzer for complete analysis. */
	__stand_param__->analyze_complete = exhaustive_flag ;
	__stand_param__->errors = __err_param__ ;
	__stand_param__->have_ref_att = NULL ;
	/*-- <remarks> Transfer from global : rules and lexicons are shared,
	     not owned, by this context </remarks> --*/
	__stand_param__->rules = __pagc_global__->rules ;
	/*-- <revision date='2009-08-13'> Support multiple lexicons </revision> --*/
	__stand_param__->address_lexicon = __pagc_global__->addr_lexicon ;
	__stand_param__->poi_lexicon = __pagc_global__->poi_lexicon ;
	/*-- <revision date='2012-06-01'> Add gaz_lexicon to be triggered on _start_state_ = MACRO </revision> --*/
#ifdef GAZ_LEXICON
	__stand_param__->gaz_lexicon = __pagc_global__->gaz_lexicon ;
#endif
	__stand_param__->default_def = __pagc_global__->default_def ;
	return __stand_param__ ;
}
/*========================================================================
<summary>
<function name='standard.c (close_stand_context)'/>
<remarks> Closes the <code>STAND_PARAM</code> record </remarks>
<calls> <functionref='analyze.c (destroy_segments)'/>,
<macroref='FREE_AND_NULL'/></calls>
<summary>
=========================================================================*/
void close_stand_context( STAND_PARAM *__stand_param__ )
{
	/* Free a context built by init_stand_context() ; NULL tolerated.
	   The shared rules/lexicons are NOT freed here -- they belong to
	   the PAGC_GLOBAL and are released by close_stand_process(). */
	if (__stand_param__ == NULL)
	{
		return ;
	}
	destroy_segments(__stand_param__->stz_info) ;
	if (__stand_param__->standard_fields != NULL)
	{
		PAGC_DESTROY_2D_ARRAY(__stand_param__->standard_fields,char,MAXOUTSYM) ;
	}
	/*-- <remarks> Cleanup time memory release </remarks> --*/
	FREE_AND_NULL(__stand_param__) ;
}
/*========================================================================
<summary>
<function name='standard.c (_Close_Stand_Field_)'/>
<remarks> Sends the scanned and processed input to the evaluator </remarks>
<called-by> <functionref='standard.c (standardize_field)'/></called-by>
<calls> <functionref='analyze.c (evaluator)'/> , <functionref='export.c (stuff_fields)'/></calls>
<returns>FALSE on error</returns>
<revision date='2012-07-22'> Keep track of start_state </revision>
</summary>
=========================================================================*/
static int _Close_Stand_Field_(STAND_PARAM *__stand_param__)
{
	/* Run the evaluator over the scanned morphemes ; if it fails,
	   record the error and return FALSE. On success copy the winning
	   standardization into the output fields and return TRUE. */
	if (!evaluator(__stand_param__))
	{
		RET_ERR("_Close_Stand_Field_: Address failed to standardize",__stand_param__->errors,FALSE) ;
	}
	stuff_fields(__stand_param__) ;
	return TRUE ;
}

View file

@ -0,0 +1,895 @@
/* PostgreSQL headers */
#include "postgres.h"
#include "fmgr.h"
#include "miscadmin.h"
#include "utils/memutils.h"
#include "executor/spi.h"
#include "access/hash.h"
#include "utils/hsearch.h"
#include "funcapi.h"
#include "catalog/pg_type.h"
/* standardizer headers */
#undef DEBUG
//#define DEBUG 1
#include "pagc_api.h"
#include "pagc_std_api.h"
#include "std_pg_hash.h"
/* C headers */
#include <sys/time.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#ifdef DEBUG
/* Wall-clock helpers, compiled in only for debug timing output. */
#define SET_TIME(a) gettimeofday(&(a), NULL)
#define ELAPSED_T(a,b) \
elapsed = (b.tv_sec - a.tv_sec)*1000.0; \
elapsed += (b.tv_usec - a.tv_usec)/1000.0;
#else
#define SET_TIME(a) do { ; } while (0)
#define ELAPSED_T(a,b) do { ; } while (0)
#endif
/* Maximum number of integers a single rule record may contain. */
#define MAX_RULE_LENGTH 128
/* Rows fetched per SPI cursor batch when loading tables. */
#define TUPLIMIT 1000
/* Standardizers kept per portal cache (round-robin eviction). */
#define STD_CACHE_ITEMS 4
/* Initial size of the backend-wide MemoryContext hash table. */
#define STD_BACKEND_HASH_SIZE 16
/* Backend-wide hash mapping a MemoryContext pointer to the standardizer
   that lives in it, so the context delete callback can find and free it.
   Created lazily by CreateStdHash(). */
static HTAB* StdHash = NULL;
/* One slot of the per-portal cache: the three source table names plus
   the standardizer built from them and the MemoryContext that owns it. */
typedef struct
{
	char *lextab;
	char *gaztab;
	char *rultab;
	STANDARDIZER *std;
	MemoryContext std_mcxt;
}
StdCacheItem;
/* Small round-robin cache of standardizers for one function call site. */
typedef struct
{
	StdCacheItem StdCache[STD_CACHE_ITEMS];
	int NextSlot;
	MemoryContext StdCacheContext;
}
StdPortalCache;
/* Entry of the backend-wide StdHash. */
typedef struct
{
	MemoryContext context;
	STANDARDIZER *std;
}
StdHashEntry;
/* Resolved SPI column numbers for a lexicon/gazetteer table. */
typedef struct lex_columns
{
	int seq;
	int word;
	int stdword;
	int token;
} lex_columns_t;
/* Resolved SPI column numbers for a rules table. */
typedef struct rules_columns
{
	int rule;
} rules_columns_t;
/* Memory context hash table function prototypes */
uint32 mcxt_ptr_hash_std(const void *key, Size keysize);
static void CreateStdHash(void);
static void AddStdHashEntry(MemoryContext mcxt, STANDARDIZER *std);
static StdHashEntry *GetStdHashEntry(MemoryContext mcxt);
static void DeleteStdHashEntry(MemoryContext mcxt);
/* Memory context cache function prototypes */
static void StdCacheInit(MemoryContext context);
static void StdCacheReset(MemoryContext context);
static void StdCacheDelete(MemoryContext context);
static bool StdCacheIsEmpty(MemoryContext context);
static void StdCacheStats(MemoryContext context, int level);
#ifdef MEMORY_CONTEXT_CHECKING
static void StdCacheCheck(MemoryContext context);
#endif
/* Portal cache lookup/insertion helpers */
static bool IsInStdPortalCache(StdPortalCache *STDCache, char *lextab, char *gaztab, char *rultab);
static STANDARDIZER *GetStdFromPortalCache(StdPortalCache *STDCache, char *lextab, char *gaztab, char *rultab);
static void AddToStdPortalCache(StdPortalCache *STDCache, char *lextab, char *gaztab, char *rultab);
static StdPortalCache *GetStdPortalCache(FunctionCallInfo fcinfo);
/* standardizer api functions */
static STANDARDIZER *CreateStd(char *lextab, char *gaztab, char *rultab);
static int parse_rule(char *buf, int *rule);
static int fetch_lex_columns(SPITupleTable *tuptable, lex_columns_t *lex_cols);
static int tableNameOk(char *t);
static int load_lex(LEXICON *lex, char *tabname);
static int fetch_rules_columns(SPITupleTable *tuptable, rules_columns_t *rules_cols);
static int load_rules(RULES *rules, char *tabname);
/* Memory context definition must match the current version of PostgreSQL.
   NOTE(review): the initializers below are positional -- the leading NULLs
   and the init/reset/delete/is_empty/stats slots must line up with the
   MemoryContextMethods struct of the PostgreSQL version being built
   against ; confirm when bumping PG versions. */
static MemoryContextMethods StdCacheContextMethods =
{
	NULL,
	NULL,
	NULL,
	StdCacheInit,
	StdCacheReset,
	StdCacheDelete,
	NULL,
	StdCacheIsEmpty,
	StdCacheStats
#ifdef MEMORY_CONTEXT_CHECKING
	, StdCacheCheck
#endif
};
/* MemoryContext init callback -- nothing to do ; state is built lazily. */
static void
StdCacheInit(MemoryContext context)
{
	/* NOP - initialized when first used. */
}
/* MemoryContext reset callback -- intentionally does nothing ; the
   standardizer is only torn down on context delete. */
static void
StdCacheReset(MemoryContext context)
{
	// NOP - Seems to be a required function
}
/* MemoryContext delete callback: look up the standardizer owned by this
 * context in the backend hash, free it, and remove the hash entry.
 * Fix: typo in the error message ("non-existant" -> "non-existent"). */
static void
StdCacheDelete(MemoryContext context)
{
	StdHashEntry *she;
	DBG("Enter: StdCacheDelete");
	/* lookup the hash entry in the global hash table
	   so we can free it */
	she = GetStdHashEntry(context);
	if (!she)
		elog(ERROR, "StdCacheDelete: Trying to delete non-existent hash entry object with MemoryContext key (%p)", (void *)context);
	DBG("deleting std object (%p) with MemoryContext key (%p)", she->std, context);
	if (she->std)
		std_free(she->std);
	DeleteStdHashEntry(context);
}
/* MemoryContext is_empty callback -- this cache never reports empty. */
static bool
StdCacheIsEmpty(MemoryContext context)
{
	return FALSE;
}
/* MemoryContext stats callback -- prints a one-line marker to stderr. */
static void
StdCacheStats(MemoryContext context, int level)
{
	// another required function
	fprintf(stderr, "%s: STANDARDIZER context\n", context->name);
}
#ifdef MEMORY_CONTEXT_CHECKING
/* MemoryContext check callback -- nothing to verify. */
static void
StdCacheCheck(MemoryContext context)
{
	// NOP - another required function
}
#endif
/* Hash the bytes of a MemoryContext pointer with PostgreSQL's hash_any. */
uint32
mcxt_ptr_hash_std(const void *key, Size keysize)
{
	return DatumGetUInt32(hash_any(key, keysize));
}
/* Create the backend-wide MemoryContext -> standardizer hash table.
 * Fix: zero the HASHCTL before filling it in. hash_create() may consult
 * fields other than the ones set here, and reading stack garbage from an
 * uninitialized struct leads to unpredictable behavior (standard PG
 * practice is to memset the HASHCTL first). */
static void
CreateStdHash(void)
{
	HASHCTL ctl;
	memset(&ctl, 0, sizeof(ctl));
	ctl.keysize = sizeof(MemoryContext);
	ctl.entrysize = sizeof(StdHashEntry);
	ctl.hash = mcxt_ptr_hash_std;
	StdHash = hash_create("PAGC Address Standardizer Backend MemoryContext Hash", STD_BACKEND_HASH_SIZE, &ctl, (HASH_ELEM | HASH_FUNCTION));
	DBG("CreateStdHash: created StdHash (%p)", StdHash);
}
/* Register std in the backend hash, keyed by its MemoryContext pointer.
 * Raises an error (does not return) if the context is already registered. */
static void
AddStdHashEntry(MemoryContext mcxt, STANDARDIZER *std)
{
	bool found;
	StdHashEntry *entry;
	DBG("Enter: AddStdHashEntry(mcxt=%p, std=%p)", mcxt, std);
	/* The hash key is the MemoryContext pointer */
	entry = (StdHashEntry *) hash_search(StdHash, (void *)&mcxt, HASH_ENTER, &found);
	DBG("AddStdHashEntry: he=%p, found=%d", entry, found);
	if (found) {
		elog(ERROR, "AddStdHashEntry: This memory context is already in use! (%p)", (void *)mcxt);
	}
	DBG("&he->context=%p", &entry->context);
	entry->context = mcxt;
	DBG("&he->std=%p", &entry->std);
	entry->std = std;
	DBG("Leaving AddStdHashEntry");
}
/* Look up (without creating) the hash entry for this MemoryContext ;
 * NULL when no entry exists. */
static StdHashEntry *
GetStdHashEntry(MemoryContext mcxt)
{
	DBG("Enter: GetStdHashEntry");
	return (StdHashEntry *) hash_search(StdHash, (void *)&mcxt, HASH_FIND, NULL);
}
/* Remove this MemoryContext's entry from the backend hash ; errors out
 * (does not return) if no such entry exists. */
static void
DeleteStdHashEntry(MemoryContext mcxt)
{
	StdHashEntry *entry;
	DBG("Enter: DeleteStdHashEntry");
	entry = (StdHashEntry *) hash_search(StdHash, (void *)&mcxt, HASH_REMOVE, NULL);
	if (entry == NULL)
		elog(ERROR, "DeleteStdHashEntry: There was an error removing the STD object from this MemoryContext (%p)", (void *)mcxt);
	entry->std = NULL;
}
/* public api: report whether a standardizer built from these three
 * tables is already in the cache. */
bool
IsInStdCache(StdCache STDCache, char *lextab, char *gaztab, char *rultab) {
	StdPortalCache *portal_cache = (StdPortalCache *) STDCache;
	return IsInStdPortalCache(portal_cache, lextab, gaztab, rultab);
}
/* Scan the portal cache for a slot whose three table names all match.
 * Fix: the original tested ci->lextab for NULL three times instead of
 * guarding ci->gaztab and ci->rultab, so strcmp() could be handed a NULL
 * pointer from a partially populated slot. */
static bool
IsInStdPortalCache(StdPortalCache *STDCache, char *lextab, char *gaztab, char *rultab)
{
	int i;
	DBG("Enter: IsInStdPortalCache");
	for (i=0; i<STD_CACHE_ITEMS; i++) {
		StdCacheItem *ci = &STDCache->StdCache[i];
		if (ci->lextab && !strcmp(ci->lextab, lextab) &&
		    ci->gaztab && !strcmp(ci->gaztab, gaztab) &&
		    ci->rultab && !strcmp(ci->rultab, rultab))
			return TRUE;
	}
	return FALSE;
}
/* public api: return the cached standardizer for these three tables,
 * or NULL when none is cached. */
STANDARDIZER *
GetStdFromStdCache(StdCache STDCache, char *lextab, char *gaztab, char *rultab) {
	StdPortalCache *portal_cache = (StdPortalCache *) STDCache;
	return GetStdFromPortalCache(portal_cache, lextab, gaztab, rultab);
}
/* Return the cached standardizer whose three table names all match,
 * or NULL. Fix: same copy-paste bug as IsInStdPortalCache -- ci->lextab
 * was NULL-checked three times instead of ci->gaztab and ci->rultab,
 * risking strcmp(NULL, ...) on a partially populated slot. */
static STANDARDIZER *
GetStdFromPortalCache(StdPortalCache *STDCache, char *lextab, char *gaztab, char *rultab)
{
	int i;
	DBG("Enter: GetStdFromPortalCache");
	for (i=0; i<STD_CACHE_ITEMS; i++) {
		StdCacheItem *ci = &STDCache->StdCache[i];
		if (ci->lextab && !strcmp(ci->lextab, lextab) &&
		    ci->gaztab && !strcmp(ci->gaztab, gaztab) &&
		    ci->rultab && !strcmp(ci->rultab, rultab))
			return ci->std;
	}
	return NULL;
}
/* Evict the NextSlot entry of the portal cache (if occupied): deleting the
 * slot's MemoryContext fires StdCacheDelete(), which frees the standardizer
 * itself; the three table-name strings are pfree'd here because they were
 * pstrdup'd in the cache's own context. */
static void
DeleteNextSlotFromStdCache(StdPortalCache *STDCache)
{
	MemoryContext old_context;
	DBG("Enter: DeleteNextSlotFromStdCache");
	if (STDCache->StdCache[STDCache->NextSlot].std != NULL) {
		StdCacheItem *ce = &STDCache->StdCache[STDCache->NextSlot];
		DBG("Removing STD cache entry ('%s', '%s', '%s') index %d", ce->lextab, ce->gaztab, ce->rultab, STDCache->NextSlot);
		/* zero out the entries and free the memory context
		   We will get a callback to free the std object.
		*/
		old_context = MemoryContextSwitchTo(STDCache->StdCacheContext);
		MemoryContextDelete(ce->std_mcxt);
		pfree(ce->lextab);
		ce->lextab = NULL;
		pfree(ce->gaztab);
		ce->gaztab = NULL;
		pfree(ce->rultab);
		ce->rultab = NULL;
		ce->std = NULL;
		MemoryContextSwitchTo(old_context);
	}
}
/* public api: build a standardizer from the three tables and insert it
 * into the cache (evicting the round-robin slot if needed). */
void
AddToStdCache(StdCache cache, char *lextab, char *gaztab, char *rultab) {
	StdPortalCache *portal_cache = (StdPortalCache *) cache;
	AddToStdPortalCache(portal_cache, lextab, gaztab, rultab);
}
/* Build a standardizer from the three tables and store it in the portal
 * cache's NextSlot, evicting the previous occupant. The standardizer gets
 * its own child MemoryContext whose delete callback frees it; the table
 * names are pstrdup'd into the cache's context so they outlive this call. */
static void
AddToStdPortalCache(StdPortalCache *STDCache, char *lextab, char *gaztab, char *rultab)
{
	MemoryContext STDMemoryContext;
	MemoryContext old_context;
	STANDARDIZER *std = NULL;
	DBG("Enter: AddToStdPortalCache");
	std = CreateStd(lextab, gaztab, rultab);
	if (!std)
		elog(ERROR,
			"AddToStdPortalCache: could not create address standardizer for '%s', '%s', '%s'", lextab, gaztab, rultab);
	/* if the NextSlot in the cache is used, then delete it */
	if (STDCache->StdCache[STDCache->NextSlot].std != NULL) {
#ifdef DEBUG
		StdCacheItem *ce = &STDCache->StdCache[STDCache->NextSlot];
		DBG("Removing item from STD cache ('%s', '%s', '%s') index %d", ce->lextab, ce->gaztab, ce->rultab, STDCache->NextSlot);
#endif
		DeleteNextSlotFromStdCache(STDCache);
	}
	DBG("Adding item to STD cache ('%s', '%s', '%s') index %d", lextab, gaztab, rultab, STDCache->NextSlot);
	/* NOTE(review): positional MemoryContextCreate() call matches the
	 * pre-9.6 PostgreSQL API -- confirm against the target PG version. */
	STDMemoryContext = MemoryContextCreate(T_AllocSetContext, 8192,
		&StdCacheContextMethods,
		STDCache->StdCacheContext,
		"PAGC STD Memory Context");
	/* Create the backend hash if it doesn't already exist */
	DBG("Check if StdHash exists (%p)", StdHash);
	if (!StdHash)
		CreateStdHash();
	/*
	 * Add the MemoryContext to the backend hash so we can
	 * clean up upon portal shutdown
	 */
	DBG("Adding standardizer obj (%p) to hash table with MemoryContext key (%p)", std, STDMemoryContext);
	AddStdHashEntry(STDMemoryContext, std);
	/* change memory contexts so the pstrdup are allocated in the
	 * context of this cache item. They will be freed when the
	 * cache item is deleted.
	 */
	DBG("AddToStdPortalCache: changing memory context to %p", STDCache->StdCacheContext);
	old_context = MemoryContextSwitchTo(STDCache->StdCacheContext);
	DBG("  old_context= %p", old_context);
	STDCache->StdCache[STDCache->NextSlot].lextab = pstrdup(lextab);
	DBG("  pstrdup(lextab) completed");
	STDCache->StdCache[STDCache->NextSlot].gaztab = pstrdup(gaztab);
	DBG("  pstrdup(gaztab) completed");
	STDCache->StdCache[STDCache->NextSlot].rultab = pstrdup(rultab);
	DBG("  pstrdup(rultab) completed");
	MemoryContextSwitchTo(old_context);
	DBG(" changed memory context to %p", old_context);
	STDCache->StdCache[STDCache->NextSlot].std = std;
	STDCache->StdCache[STDCache->NextSlot].std_mcxt = STDMemoryContext;
	STDCache->NextSlot = (STDCache->NextSlot + 1) % STD_CACHE_ITEMS;
	DBG("STDCache->NextSlot=%d", STDCache->NextSlot);
}
/* public api: return this call site's standardizer cache, creating it
 * on first use. */
StdCache
GetStdCache(FunctionCallInfo fcinfo) {
	StdPortalCache *portal_cache = GetStdPortalCache(fcinfo);
	return (StdCache) portal_cache;
}
/* Fetch the per-call-site cache stashed in flinfo->fn_extra, creating
 * and zero-initializing it on first use. The cache lives in fn_mcxt so
 * it survives for the lifetime of the portal. */
static StdPortalCache *
GetStdPortalCache(FunctionCallInfo fcinfo)
{
	StdPortalCache *cache;
	MemoryContext old_context;
	int i;
	DBG("Enter: GetStdPortalCache");
	if (fcinfo->flinfo->fn_extra != NULL) {
		/* Reuse the cache created by an earlier call. */
		return (StdPortalCache *) fcinfo->flinfo->fn_extra;
	}
	/* First call: allocate the cache in the function's memory context. */
	old_context = MemoryContextSwitchTo(fcinfo->flinfo->fn_mcxt);
	cache = palloc(sizeof(StdPortalCache));
	MemoryContextSwitchTo(old_context);
	if (cache) {
		DBG("Allocating STDCache for portal with STD MemoryContext (%p)", fcinfo->flinfo->fn_mcxt);
		/* Mark every slot empty. */
		for (i = 0; i < STD_CACHE_ITEMS; i++) {
			cache->StdCache[i].lextab = NULL;
			cache->StdCache[i].gaztab = NULL;
			cache->StdCache[i].rultab = NULL;
			cache->StdCache[i].std = NULL;
			cache->StdCache[i].std_mcxt = NULL;
		}
		cache->NextSlot = 0;
		cache->StdCacheContext = fcinfo->flinfo->fn_mcxt;
		/* Store the pointer in fcinfo->flinfo->fn_extra */
		fcinfo->flinfo->fn_extra = cache;
	}
	return cache;
}
/* public api: one-stop lookup -- get (or create) the portal cache, make
 * sure a standardizer for these three tables is in it, and return it. */
STANDARDIZER *
GetStdUsingFCInfo(FunctionCallInfo fcinfo, char *lextab, char *gaztab, char *rultab)
{
	STANDARDIZER *std;
	/* NOTE(review): declared 'StdCache *' but assigned the 'StdCache'
	 * returned by GetStdCache(); if StdCache is itself a pointer typedef
	 * this is one level of indirection off -- confirm against
	 * std_pg_hash.h. */
	StdCache *std_cache = NULL;
	DBG("GetStdUsingFCInfo: calling GetStdCache(fcinfo)");
	std_cache = GetStdCache(fcinfo);
	if (!std_cache)
		return NULL;
	DBG("GetStdUsingFCInfo: calling IsInStdCache(std_cache, lextab, gaztab, rultab)");
	if (!IsInStdCache(std_cache, lextab, gaztab, rultab)) {
		DBG("GetStdUsingFCInfo: calling AddToStdCache(std_cache, lextab, gaztab, rultab)");
		AddToStdCache(std_cache, lextab, gaztab, rultab);
	}
	DBG("GetStdUsingFCInfo: GetStdFromStdCache(std_cache, lextab, gaztab, rultab)");
	std = GetStdFromStdCache(std_cache, lextab, gaztab, rultab);
	return std;
}
/*
 * Build a ready-to-use STANDARDIZER from the given lexicon, gazetteer and
 * rules tables, loading each via SPI.
 *
 * On any failure the partially built objects are freed and SPI_finish()
 * is called BEFORE elog(ERROR), because elog(ERROR) does not return.
 * On success the lex/gaz/rules objects are owned by the standardizer
 * (bound via std_use_*), so only the returned std needs freeing later.
 */
static STANDARDIZER *
CreateStd(char *lextab, char *gaztab, char *rultab)
{
STANDARDIZER *std;
LEXICON *lex;
LEXICON *gaz;
RULES *rules;
int err;
int SPIcode;
DBG("Enter: CreateStd");
SPIcode = SPI_connect();
if (SPIcode != SPI_OK_CONNECT) {
elog(ERROR, "CreateStd: couldn't open a connection to SPI");
}
std = std_init();
if (!std)
elog(ERROR, "CreateStd: could not allocate memory (std)");
lex = lex_init(std->err_p);
if (!lex) {
/* free what we built so far, close SPI, then abort */
std_free(std);
SPI_finish();
elog(ERROR, "CreateStd: could not allocate memory (lex)");
}
err = load_lex(lex, lextab);
if (err == -1) {
lex_free(lex);
std_free(std);
SPI_finish();
elog(ERROR, "CreateStd: failed to load '%s' for lexicon", lextab);
}
/* the gazetteer reuses the lexicon machinery */
gaz = lex_init(std->err_p);
if (!gaz) {
lex_free(lex);
std_free(std);
SPI_finish();
elog(ERROR, "CreateStd: could not allocate memory (gaz)");
}
err = load_lex(gaz, gaztab);
if (err == -1) {
lex_free(gaz);
lex_free(lex);
std_free(std);
SPI_finish();
elog(ERROR, "CreateStd: failed to load '%s' for gazeteer", gaztab);
}
rules = rules_init(std->err_p);
if (!rules) {
lex_free(gaz);
lex_free(lex);
std_free(std);
SPI_finish();
elog(ERROR, "CreateStd: could not allocate memory (rules)");
}
err = load_rules(rules, rultab);
if (err == -1) {
rules_free(rules);
lex_free(gaz);
lex_free(lex);
std_free(std);
SPI_finish();
elog(ERROR, "CreateStd: failed to load '%s' for rules", rultab);
}
/* bind the pieces; ownership transfers to std */
std_use_lex(std, lex);
std_use_gaz(std, gaz);
std_use_rules(std, rules);
std_ready_standardizer(std);
SPI_finish();
return std;
}
/*
 * Parse a whitespace-separated list of integers from buf into rule[].
 * Returns the number of integers stored, or -1 when the input holds more
 * than MAX_RULE_LENGTH terms.
 *
 * BUGFIX: the original checked the bound AFTER storing and incrementing,
 * so an over-long rule wrote one element past rule[MAX_RULE_LENGTH-1]
 * before being rejected. The bound is now checked before each store.
 */
static int parse_rule(char *buf, int *rule)
{
    int nr = 0;
    char *p = buf;
    char *q;

    for (;;) {
        long v = strtol(p, &q, 10);
        if (p == q)
            break;              /* no more integers */
        if (nr >= MAX_RULE_LENGTH)
            return -1;          /* too many terms; refuse before overflowing */
        rule[nr++] = (int) v;
        p = q;
    }
    return nr;
}
/* Resolve column named NAME2 in the current SPI result and store its
 * column number in TRGT->NAME; increments the caller's local 'err'
 * counter when the column does not exist. */
#define FETCH_COL(TRGT,NAME,NAME2) \
TRGT->NAME = SPI_fnumber(SPI_tuptable->tupdesc,NAME2);\
if (TRGT->NAME == SPI_ERROR_NOATTRIBUTE) err++;
/* Verify that column TRGT->NAME has Oid TYPE; increments the caller's
 * local 'err' counter on mismatch. */
#define CHECK_TYP(TRGT,NAME,TYPE) \
if (SPI_gettypeid(SPI_tuptable->tupdesc, TRGT->NAME) != TYPE) {\
DBG("CHECK_TYP: expecting %d, got: %d", TYPE, SPI_gettypeid(SPI_tuptable->tupdesc, TRGT->NAME));\
err++;\
}
/* Read int4 column WHICH from the caller's 'tuple'/'tupdesc' into TRGT.
 * Relies on locals binval/isnull/tuple/tupdesc being in scope.
 * Emits NULLMSG and makes the enclosing function return -1 on NULL. */
#define GET_INT_FROM_TUPLE(TRGT,WHICH,NULLMSG) \
binval = SPI_getbinval(tuple, tupdesc, WHICH, &isnull);\
if (isnull) { \
elog(NOTICE, NULLMSG); \
return -1; \
} \
TRGT = DatumGetInt32(binval);
/* Read text column WHICH from 'tuple'/'tupdesc' as a C string into TRGT.
 * NOTE(review): SPI_getvalue palloc's the string; it is not explicitly
 * freed here — presumably reclaimed when the SPI context goes away. */
#define GET_TEXT_FROM_TUPLE(TRGT,WHICH) \
TRGT = DatumGetCString(SPI_getvalue(tuple, tupdesc, WHICH));
/*
 * Resolve and type-check the lexicon query's result columns.
 * Fills lex_cols with the column numbers of seq, word, stdword and token.
 * Returns 0 on success; -1 (after an elog NOTICE) when a column is missing
 * or has the wrong type.
 * NOTE(review): the tuptable parameter is unused — the FETCH_COL/CHECK_TYP
 * macros read the global SPI_tuptable instead.
 */
static int fetch_lex_columns(SPITupleTable *tuptable, lex_columns_t *lex_cols)
{
int err = 0;
FETCH_COL(lex_cols,seq,"seq");
FETCH_COL(lex_cols,word,"word");
FETCH_COL(lex_cols,stdword,"stdword");
FETCH_COL(lex_cols,token,"token");
if (err) {
elog(NOTICE, "lexicon queries must return columns 'seq', 'word', 'stdword' and 'token'");
return -1;
}
/* verify the column types match what load_lex expects */
CHECK_TYP(lex_cols,seq,INT4OID);
CHECK_TYP(lex_cols,word,TEXTOID);
CHECK_TYP(lex_cols,stdword,TEXTOID);
CHECK_TYP(lex_cols,token,INT4OID);
if (err) {
elog(NOTICE, "lexicon column types must be: 'seq' int4, 'word' text, 'stdword' text, and 'token' int4");
return -1;
}
return 0;
}
/*
 * Sanitize table names; allow alphanumerics, '_', '"' and '.' (for schema
 * qualification). Returns 1 when every character is acceptable, 0 otherwise.
 * An empty string is (vacuously) accepted; callers reject it separately.
 *
 * BUGFIX: the argument to isalnum() must be an unsigned char value —
 * passing a plain char that happens to be negative (high-bit bytes on
 * signed-char platforms) is undefined behavior (CERT STR37-C).
 */
static int tableNameOk(char *t)
{
    for (; *t != '\0'; t++) {
        unsigned char c = (unsigned char) *t;
        if (!(isalnum(c) || c == '_' || c == '.' || c == '"'))
            return 0;
    }
    return 1;
}
/*
 * Load lexicon entries (seq, word, stdword, token) from table 'tab' into
 * *lex via SPI, fetching TUPLIMIT rows at a time through a cursor.
 * Must be called between SPI_connect() and SPI_finish().
 * Returns 0 on success, -1 on error (an elog NOTICE describes the problem).
 *
 * BUGFIXES vs. original:
 *  - the query was SPI_prepare'd twice, leaking the first plan;
 *  - the "table is not usable" message said "rules table" (copy/paste
 *    from load_rules).
 */
static int load_lex(LEXICON *lex, char *tab)
{
    int ret;
    SPIPlanPtr SPIplan;
    Portal SPIportal;
    bool moredata = TRUE;
#ifdef DEBUG
    struct timeval t1, t2;
    double elapsed;
#endif
    char *sql;
    int ntuples;
    int total_tuples = 0;
    /* -1 marks "columns not resolved yet"; resolved on the first fetch */
    lex_columns_t lex_columns = {seq: -1, word: -1, stdword: -1, token: -1};
    int seq;
    char *word;
    char *stdword;
    int token;

    DBG("start load_lex\n");
    SET_TIME(t1);

    if (!tab || !strlen(tab)) {
        elog(NOTICE, "load_lex: lexicon table is not usable");
        return -1;
    }
    if (!tableNameOk(tab)) {
        elog(NOTICE, "load_lex: lex and gaz table names may only be alphanum and '.\"_' characters (%s)", tab);
        return -1;
    }

    /* build: select seq, word, stdword, token from <tab> order by id */
    sql = SPI_palloc(strlen(tab)+65);
    strcpy(sql, "select seq, word, stdword, token from ");
    strcat(sql, tab);
    strcat(sql, " order by id ");

    /* prepare the lexicon query and open a cursor over it */
    SPIplan = SPI_prepare(sql, 0, NULL);
    if (SPIplan == NULL) {
        elog(NOTICE, "load_lex: couldn't create query plan for the lex data via SPI (%s)", sql);
        return -1;
    }
    if ((SPIportal = SPI_cursor_open(NULL, SPIplan, NULL, NULL, true)) == NULL) {
        elog(NOTICE, "load_lex: SPI_cursor_open('%s') returns NULL", sql);
        return -1;
    }

    while (moredata == TRUE) {
        SPI_cursor_fetch(SPIportal, TRUE, TUPLIMIT);
        if (SPI_tuptable == NULL) {
            elog(NOTICE, "load_lex: SPI_tuptable is NULL");
            return -1;
        }
        /* resolve/verify column numbers once, on the first batch */
        if (lex_columns.seq == -1) {
            ret = fetch_lex_columns(SPI_tuptable, &lex_columns);
            if (ret)
                return ret;
        }
        ntuples = SPI_processed;
        total_tuples += ntuples;
        if (ntuples > 0) {
            int t;
            Datum binval;
            bool isnull;
            SPITupleTable *tuptable = SPI_tuptable;
            TupleDesc tupdesc = SPI_tuptable->tupdesc;
            for (t = 0; t < ntuples; t++) {
                HeapTuple tuple = tuptable->vals[t];
                GET_INT_FROM_TUPLE(seq,lex_columns.seq,"load_lex: seq contains a null value");
                GET_TEXT_FROM_TUPLE(word,lex_columns.word);
                GET_TEXT_FROM_TUPLE(stdword,lex_columns.stdword);
                GET_INT_FROM_TUPLE(token,lex_columns.token,"load_lex: token contains a null value");
                lex_add_entry(lex, seq, word, stdword, token);
            }
            SPI_freetuptable(tuptable);
        }
        else
            moredata = FALSE;   /* cursor exhausted */
    }

    SET_TIME(t2);
    ELAPSED_T(t1, t2);
    DBG("Time to read %i lexicon records: %.1f ms.", total_tuples, elapsed);
    return 0;
}
/*
 * Resolve and type-check the rules query's single 'rule' column.
 * Returns 0 on success; -1 (after an elog NOTICE) when the column is
 * missing or not of type text.
 * NOTE(review): the tuptable parameter is unused — the FETCH_COL/CHECK_TYP
 * macros read the global SPI_tuptable instead.
 */
static int fetch_rules_columns(SPITupleTable *tuptable, rules_columns_t *rules_cols)
{
int err = 0;
FETCH_COL(rules_cols,rule,"rule");
if (err) {
elog(NOTICE, "rules queries must return column 'rule'");
return -1;
}
CHECK_TYP(rules_cols,rule,TEXTOID);
if (err) {
elog(NOTICE, "rules column type must be: 'rule' text");
return -1;
}
return 0;
}
/*
 * Load standardization rules from table 'tab' into *rules via SPI,
 * fetching TUPLIMIT rows at a time through a cursor. Each 'rule' text
 * value is parsed into at most MAX_RULE_LENGTH integers by parse_rule().
 * Must be called between SPI_connect() and SPI_finish().
 * Returns 0 on success, -1 on error (an elog NOTICE describes the problem).
 *
 * BUGFIX: several messages said "load_roles" instead of "load_rules",
 * and the over-length message hard-coded "128" instead of MAX_RULE_LENGTH.
 */
static int load_rules(RULES *rules, char *tab)
{
    int ret;
    SPIPlanPtr SPIplan;
    Portal SPIportal;
    bool moredata = TRUE;
#ifdef DEBUG
    struct timeval t1, t2;
    double elapsed;
#endif
    char *sql;
    int rule_arr[MAX_RULE_LENGTH];
    int ntuples;
    int total_tuples = 0;
    /* -1 marks "column not resolved yet"; resolved on the first fetch */
    rules_columns_t rules_columns = {rule: -1};
    char *rule;

    DBG("start load_rules\n");
    SET_TIME(t1);

    if (!tab || !strlen(tab)) {
        elog(NOTICE, "load_rules: rules table is not usable");
        return -1;
    }
    if (!tableNameOk(tab)) {
        elog(NOTICE, "load_rules: rules table name may only be alphanum and '.\"_' characters (%s)", tab);
        return -1;
    }

    /* build: select rule from <tab> order by id */
    sql = SPI_palloc(strlen(tab)+35);
    strcpy(sql, "select rule from ");
    strcat(sql, tab);
    strcat(sql, " order by id ");

    SPIplan = SPI_prepare(sql, 0, NULL);
    if (SPIplan == NULL) {
        elog(NOTICE, "load_rules: couldn't create query plan for the rule data via SPI (%s)", sql);
        return -1;
    }
    if ((SPIportal = SPI_cursor_open(NULL, SPIplan, NULL, NULL, true)) == NULL) {
        elog(NOTICE, "load_rules: SPI_cursor_open('%s') returns NULL", sql);
        return -1;
    }

    while (moredata == TRUE) {
        SPI_cursor_fetch(SPIportal, TRUE, TUPLIMIT);
        if (SPI_tuptable == NULL) {
            elog(NOTICE, "load_rules: SPI_tuptable is NULL");
            return -1;
        }
        /* resolve/verify the 'rule' column once, on the first batch */
        if (rules_columns.rule == -1) {
            ret = fetch_rules_columns(SPI_tuptable, &rules_columns);
            if (ret)
                return ret;
        }
        ntuples = SPI_processed;
        if (ntuples > 0) {
            int t;
            SPITupleTable *tuptable = SPI_tuptable;
            TupleDesc tupdesc = SPI_tuptable->tupdesc;
            for (t = 0; t < ntuples; t++) {
                int nr;
                HeapTuple tuple = tuptable->vals[t];
                GET_TEXT_FROM_TUPLE(rule,rules_columns.rule);
                nr = parse_rule(rule, rule_arr);
                if (nr == -1) {
                    elog(NOTICE, "load_rules: rule exceeds %d terms", MAX_RULE_LENGTH);
                    return -1;
                }
                ret = rules_add_rule(rules, nr, rule_arr);
                if (ret != 0) {
                    elog(NOTICE,"load_rules: failed to add rule %d (%d): %s",
                        total_tuples+t+1, ret, rule);
                    return -1;
                }
            }
            SPI_freetuptable(tuptable);
        }
        else
            moredata = FALSE;   /* cursor exhausted */
        total_tuples += ntuples;
    }

    /* finalize the rule set */
    ret = rules_ready(rules);
    if (ret != 0) {
        elog(NOTICE, "load_rules: failed to ready the rules: err: %d", ret);
        return -1;
    }

    SET_TIME(t2);
    ELAPSED_T(t1, t2);
    DBG("Time to read %i rule records: %.1f ms.", total_tuples, elapsed);
    return 0;
}

View file

@ -0,0 +1,16 @@
/* Opaque type to use in standardizer cache API */
typedef void *StdCache;
/* Return (creating on first use) the per-portal standardizer cache. */
StdCache GetStdCache(FunctionCallInfo fcinfo);
/* True when a standardizer built from these three tables is cached. */
bool IsInStdCache(StdCache STDCache, char *lextab, char *gaztab, char *rultab);
/* Build a standardizer from the three tables and store it in the cache. */
void AddToStdCache(StdCache cache, char *lextab, char *gaztab, char *rultab);
/* Fetch the cached standardizer for these tables (NULL if absent). */
STANDARDIZER *GetStdFromStdCache(StdCache STDCache, char *lextab, char *gaztab, char *rultab);
/*
 * This is the only interface external code should be calling
 * it will get the standardizer out of the cache, or
 * it will create a new one and save it in the cache
 */
STANDARDIZER *GetStdUsingFCInfo(FunctionCallInfo fcinfo, char *lextab, char *gaztab, char *rultab);

View file

@ -0,0 +1,76 @@
\pset pager off
drop table if exists test_parse_address cascade;
create table test_parse_address (
id serial not null primary key,
instring text not null,
outstring text
);
copy test_parse_address (instring, outstring) from stdin;
@@ sttype dirs dirs words$
123 oak ln e n mycity ny (123,"oak ln e",,"123 oak ln e","n mycity",NY,,,US)
123 oak lane east n mycity ny (123,"oak lane east",,"123 oak lane east","n mycity",NY,,,US)
123 oak ln e north mycity ny (123,"oak ln e",,"123 oak ln e","north mycity",NY,,,US)
@@ sttype dirs dirs saint words$
123 oak ln e n st marie ny (123,"oak ln e",,"123 oak ln e","n st marie",NY,,,US)
123 oak lane east n st marie ny (123,"oak lane east",,"123 oak lane east","n st marie",NY,,,US)
123 oak ln e north st marie ny (123,"oak ln e",,"123 oak ln e","north st marie",NY,,,US)
123 oak ln e n saint marie ny (123,"oak ln e",,"123 oak ln e","n saint marie",NY,,,US)
123 oak lane east n saint marie ny (123,"oak lane east",,"123 oak lane east","n saint marie",NY,,,US)
123 oak ln e north saint marie ny (123,"oak ln e",,"123 oak ln e","north saint marie",NY,,,US)
@@ sttype dirs saint words$
123 oak ln e st marie ny (123,"oak ln",,"123 oak ln","e st marie",NY,,,US)
123 oak lane east st marie ny (123,"oak lane",,"123 oak lane","east st marie",NY,,,US)
123 oak ln e st marie ny (123,"oak ln",,"123 oak ln","e st marie",NY,,,US)
123 oak ln e saint marie ny (123,"oak ln",,"123 oak ln","e saint marie",NY,,,US)
123 oak lane east saint marie ny (123,"oak lane",,"123 oak lane","east saint marie",NY,,,US)
123 oak ln e saint marie ny (123,"oak ln",,"123 oak ln","e saint marie",NY,,,US)
@@ sttype saint words$
123 oak ln st marie ny (123,"oak ln",,"123 oak ln","st marie",NY,,,US)
123 oak lane st marie ny (123,"oak lane",,"123 oak lane","st marie",NY,,,US)
123 oak ln st marie ny (123,"oak ln",,"123 oak ln","st marie",NY,,,US)
123 oak ln saint marie ny (123,"oak ln",,"123 oak ln","saint marie",NY,,,US)
123 oak lane saint marie ny (123,"oak lane",,"123 oak lane","saint marie",NY,,,US)
123 oak ln saint marie ny (123,"oak ln",,"123 oak ln","saint marie",NY,,,US)
@@ sttype words$
123 oak ln marie ny (123,"oak ln",,"123 oak ln",marie,NY,,,US)
123 oak ln new marie ny (123,"oak ln",,"123 oak ln","new marie",NY,,,US)
@@ === same as above but with commas ===
@@ sttype dirs dirs words$
123 oak ln e, n mycity ny (123,"oak ln e",,"123 oak ln e","n mycity",NY,,,US)
123 oak lane east, n mycity ny (123,"oak lane east",,"123 oak lane east","n mycity",NY,,,US)
123 oak ln e, north mycity ny (123,"oak ln e",,"123 oak ln e","north mycity",NY,,,US)
123 oak ln e n, mycity ny (123,"oak ln e n",,"123 oak ln e n",mycity,NY,,,US)
123 oak lane east n, mycity ny (123,"oak lane east n",,"123 oak lane east n",mycity,NY,,,US)
123 oak ln e north, mycity ny (123,"oak ln e north",,"123 oak ln e north",mycity,NY,,,US)
@@ sttype dirs dirs saint words$
123 oak ln e, n st marie ny (123,"oak ln e",,"123 oak ln e","n st marie",NY,,,US)
123 oak lane east, n st marie ny (123,"oak lane east",,"123 oak lane east","n st marie",NY,,,US)
123 oak ln e, north st marie ny (123,"oak ln e",,"123 oak ln e","north st marie",NY,,,US)
123 oak ln e, n saint marie ny (123,"oak ln e",,"123 oak ln e","n saint marie",NY,,,US)
123 oak lane east, n saint marie ny (123,"oak lane east",,"123 oak lane east","n saint marie",NY,,,US)
123 oak ln e, north saint marie ny (123,"oak ln e",,"123 oak ln e","north saint marie",NY,,,US)
@@ sttype dirs saint words$
123 oak ln e, st marie ny (123,"oak ln e",,"123 oak ln e","st marie",NY,,,US)
123 oak lane east, st marie ny (123,"oak lane east",,"123 oak lane east","st marie",NY,,,US)
123 oak ln e, st marie ny (123,"oak ln e",,"123 oak ln e","st marie",NY,,,US)
123 oak ln e, saint marie ny (123,"oak ln e",,"123 oak ln e","saint marie",NY,,,US)
123 oak lane east, saint marie ny (123,"oak lane east",,"123 oak lane east","saint marie",NY,,,US)
123 oak ln e, saint marie ny (123,"oak ln e",,"123 oak ln e","saint marie",NY,,,US)
@@ sttype saint words$
123 oak ln, st marie ny (123,"oak ln",,"123 oak ln","st marie",NY,,,US)
123 oak lane, st marie ny (123,"oak lane",,"123 oak lane","st marie",NY,,,US)
123 oak ln, st marie ny (123,"oak ln",,"123 oak ln","st marie",NY,,,US)
123 oak ln, saint marie ny (123,"oak ln",,"123 oak ln","saint marie",NY,,,US)
123 oak lane, saint marie ny (123,"oak lane",,"123 oak lane","saint marie",NY,,,US)
123 oak ln, saint marie ny (123,"oak ln",,"123 oak ln","saint marie",NY,,,US)
@@ sttype words$
123 oak ln, marie ny (123,"oak ln",,"123 oak ln",marie,NY,,,US)
123 oak ln, new marie ny (123,"oak ln",,"123 oak ln","new marie",NY,,,US)
\.
select id, instring, outstring as expected, parse_address(instring) as got_result
from test_parse_address
where instring not like '@@%' and parse_address(instring)::text != outstring;
\q

View file

@ -0,0 +1,12 @@
\set ECHO queries
\pset pager off
select * from parse_address('123 Main Street, Kansas City, MO 45678');
\i /usr/share/postgresql/9.2/extension/us-lex.sql
\i /usr/share/postgresql/9.2/extension/us-gaz.sql
\i /usr/share/postgresql/9.2/extension/us-rules.sql
select * from standardize_address('lex'::text, 'gaz'::text, 'rules'::text, '123 Main Street'::text, 'Kansas City, MO 45678'::text);
\q

View file

@ -0,0 +1,8 @@
\set ECHO queries
\pset pager off
\i micro-macro.sql
select (std).* from (
select standardize_address('lex', 'gaz', 'rules', micro, macro) as std
from addresses) as foo;

View file

@ -0,0 +1,335 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <assert.h>
#include "pagc_api.h"
#include "pagc_std_api.h"
#define RULESIZE 40
#define LEXIN "lexicon.csv"
#define GAZIN "gazeteer.csv"
#define RULESIN "rules.txt"
static int standardize_command_line( STANDARDIZER *std ,
char *input_str ,
int option ) ;
void print_lexicon( ENTRY ** hash_table ) ;
/*
parse_csv() parses the following file format into fields
"1","#",16,"#"
"2","#",7,"#"
"1","&",13,"AND"
"2","&",1,"AND"
"3","&",7,"AND"
"1","-","9","-"
*/
/* ----------------------------------------------------
   Copy one CSV field from *inp into *buf, stripping quotes and CRs.
   Returns a pointer just past the field's delimiter (',' or '\n'),
   or NULL when the read should stop (leading whitespace, or the
   input ended without a delimiter).

   BUGFIXES vs. original:
    - isspace() is now given an unsigned char value; passing a plain
      char that is negative is undefined behavior (CERT STR37-C);
    - buf is NUL-terminated even on the "input exhausted" NULL return,
      so callers never see an unterminated buffer.
   -------------------------------------------------------*/
static char *convert_field( char *buf ,
char *inp ) {
    char c ;
    char *d = buf;
    char *s = inp ;

    *d = '\0' ;
    /* -- space at the beginning of a line will stop the read -- */
    if ( isspace( (unsigned char) *s ) )
        return NULL ;
    while ( ( c = *s++ ) != '\0' ) {
        if ( c == '\"' || c == '\r' )
            continue ; /* -- ignore quotes and carriage returns -- */
        /* -- zero terminate field and record delimiters -- */
        if ( c == '\n' || c == ',' ) {
            *d = '\0' ;
            return s ;
        }
        *d++ = c ; /* -- copy it -- */
    }
    *d = '\0' ;  /* input ended without a delimiter: terminate anyway */
    return NULL ;
}
/*
 * Parse one lexicon CSV line of the form "seq","word",token,"stdword"
 * into its four fields. Returns 1 on success, 0 when the line cannot
 * be parsed.
 *
 * BUGFIX: the original only checked the FIRST convert_field() call for
 * NULL; a short or malformed line then passed NULL onward and
 * convert_field() dereferenced it. Every step is now checked, and the
 * sscanf conversions are verified.
 */
static int parse_csv(char *buf, int *seq, char *word, char *stdword, int *token)
{
    char *next_str ;
    char num_str[512];

    if ( ( next_str = convert_field( num_str , buf) ) == NULL ) return 0;
    if ( sscanf( num_str, "%d", seq ) != 1 ) return 0;
    if ( ( next_str = convert_field( word, next_str) ) == NULL ) return 0;
    if ( ( next_str = convert_field( num_str, next_str) ) == NULL ) return 0;
    if ( sscanf( num_str, "%d", token ) != 1 ) return 0;
    if ( convert_field( stdword, next_str) == NULL ) return 0;
    return 1;
}
/*
 parse_rule() reads lines like the following, loads the integers into
 rule[] and returns the number of items read (-1 when the line holds
 more terms than the caller's buffer can take).
 1 2 11 28 -1 10 10 11 13 -1 0 16
 1 2 11 28 12 -1 10 10 11 13 12 -1 0 17
 1 2 11 28 29 -1 10 10 11 13 13 -1 0 16
 1 2 11 28 29 12 -1 10 10 11 13 13 12 -1 0 17
 -1
*/
int parse_rule(char *buf, int *rule)
{
    /* Callers pass rule buffers of RULESIZE (40) ints; keep in sync.
       BUGFIX: the original had no bound at all and wrote past the end
       of the caller's array for over-long rule lines. */
    enum { PARSE_RULE_MAX_TERMS = 40 };
    int nr = 0;
    char *p = buf;
    char *q;

    for (;;) {
        long v = strtol( p, &q, 10 );
        if (p == q)
            break;              /* no more integers */
        if (nr >= PARSE_RULE_MAX_TERMS)
            return -1;          /* too many terms; refuse before overflowing */
        rule[nr++] = (int) v;
        p = q;
    }
    return nr;
}
/* Print command-line usage for test_main and exit with status 1. */
void Usage()
{
printf("Usage: test_main [-o n] \n");
printf(" -o n = options bit flag\n");
printf(" 1 = print lexicon\n");
printf(" 2 = print gazeteer\n");
printf(" 4 = print standardized fields\n");
printf(" 8 = print rule statistics\n");
exit(1);
}
/*
 * Interactive driver for the standardizer: loads the lexicon, gazetteer
 * and rules from local files (LEXIN, GAZIN, RULESIN), builds a
 * standardizer, then loops prompting for MICRO/MACRO address input until
 * the user types "exit"/"quit"/"done".
 *
 * Usage: test_main [-o n] where n is a bit flag — 1 print lexicon,
 * 2 print gazetteer, 4 print standardized fields, 8 print rule stats.
 */
int main(int argc, char *argv[])
{
STANDARDIZER *std;
LEXICON *lex;
LEXICON *gaz;
RULES *rules;
char buf[1024];
int seq;
char input_str[ 4096 ] ;
char word[512];
char stdword[512];
int token;
int nr;
int rule[RULESIZE];   /* buffer for one parsed rule (RULESIZE terms) */
int err;
int cnt;
int option = 0;
FILE *in;
/* parse the optional "-o n" flag, then shift it off argv */
if (argc == 3 && !strcmp(argv[1], "-o")) {
option = strtol(argv[2], NULL, 10);
argc -= 2;
argv += 2;
}
else if (argc != 1)
Usage();
std = std_init();
assert(std);
/* ---- load the address lexicon from LEXIN ---- */
lex = lex_init(std->err_p);
assert(lex);
in = fopen(LEXIN, "rb");
assert(in);
cnt = 0;
while (!feof(in) && fgets(buf, 1024, in)) {
cnt++;
/* parse into fields */
if (parse_csv(buf, &seq, word, stdword, &token)) {
/* add the record to the lexicon */
err = lex_add_entry(lex, seq, word, stdword, token);
if (err != 1)
printf("lex: Failed: %d: %s", cnt, buf);
}
else {
printf("lex: Skipping: %d: %s", cnt, buf);
}
}
fclose(in);
if (option & 1) {
printf("------------ address lexicon --------------\n");
print_lexicon(lex->hash_table);
printf("\n");
}
/* ---- load the gazetteer from GAZIN (same CSV format as the lexicon) ---- */
gaz = lex_init(std->err_p);
assert(gaz);
in = fopen(GAZIN, "rb");
assert(in);
cnt = 0;
while (!feof(in) && fgets(buf, 1024, in)) {
cnt++;
/* parse into fields */
if (parse_csv(buf, &seq, word, stdword, &token)) {
/* add the record to the lexicon */
err = lex_add_entry(gaz, seq, word, stdword, token);
if (err != 1)
printf("gaz: Failed: %d: %s", cnt, buf);
}
else {
printf("gaz: Skipping: %d: %s", cnt, buf);
}
}
fclose(in);
if (option & 2) {
printf("------------ gazeteer lexicon --------------\n");
print_lexicon(gaz->hash_table);
printf("\n");
}
rules = rules_init(std->err_p);
assert(rules);
rules -> r_p -> collect_statistics = TRUE ;
/* ************ RULES **************** */
in = fopen(RULESIN, "rb");
assert(in);
cnt = 0;
while (!feof(in) && fgets(buf, 1024, in)) {
cnt++;
/* parse into fields */
nr = parse_rule(buf, rule);
/* add the record to the rules */
err = rules_add_rule(rules, nr, rule);
if (err != 0)
printf("rules: Failed: %d (%d): %s", cnt, err, buf);
}
err = rules_ready(rules);
if (err != 0)
printf("rules: Failed: err=%d\n", err);
fclose(in);
/* bind the pieces; ownership transfers to std (see note below) */
std_use_lex(std, lex);
std_use_gaz(std, gaz);
std_use_rules(std, rules);
std_ready_standardizer(std);
printf( "Standardization test. Type \"exit\" to quit:\n" ) ;
fflush( stdout ) ;
/* interactive loop: standardize_command_line returns FAIL on exit request */
while ( TRUE ) {
err = standardize_command_line( std, input_str, option ) ;
if ( err == FAIL ) {
break ;
}
}
printf( "OK\n" ) ;
fflush( stdout ) ;
std_free(std);
/* these were freed when we bound them with std_use_*()
rules_free(rules);
lex_free(gaz);
lex_free(lex);
*/
return 0;
}
/*
 * Prompt for one MICRO line and one MACRO line on stdin, standardize the
 * pair with std_standardize_mm(), and print the result.
 *
 * Returns FAIL when the user asks to quit ("exit"/"quit"/"done" or EOF),
 * FALSE when no MICRO input was given (nothing standardized), 1 otherwise.
 * input_str is a caller-supplied scratch buffer of at least MAXSTRLEN bytes.
 * Bit 4 of option enables verbose standardization; bit 8 dumps rule stats.
 *
 * (Also removed a stray double semicolon from the original.)
 */
static int standardize_command_line( STANDARDIZER *std ,
                                     char *input_str ,
                                     int option ) {
    STDADDR *result;
    int fld_num ,
        have_user_macros ,
        num_prompts ;
    char unstandard_mic[ MAXSTRLEN ] ;
    char unstandard_mac_left[ MAXSTRLEN ] ;

    num_prompts = 3 ;
    unstandard_mic[ 0 ] = SENTINEL ;
    unstandard_mac_left[ 0 ] = SENTINEL ;
    have_user_macros = FALSE ;

    /* field 1 is the MICRO (street) line, field 2 the MACRO (city/state) */
    for ( fld_num = 1 ; fld_num < num_prompts ; fld_num++ ) {
        /* -- print prompt -- */
        if ( fld_num == 1 )
            printf( "MICRO:" ) ;
        else
            printf( "MACRO:" ) ;
        fflush( stdout ) ; /* -- to ensure prompt goes out --*/
        memset( input_str , 0 , MAXSTRLEN ) ;
        input_str[ 0 ] = SENTINEL ;
        /* -- get user's input; any quit word (or EOF) ends the session -- */
        if ( ( !get_input_line( input_str , stdin ) ) ||
             ( strncmp( input_str , "exit" , 4 ) == 0 ) ||
             ( strncmp( input_str , "quit" , 4 ) == 0 ) ||
             ( strncmp( input_str , "done" , 4 ) == 0 )
           ) {
            return FAIL ; /* -- indicate exit -- */
        }
        /* -- get input first, then standardize -- */
        if ( fld_num == 1 ) {
            strcpy( unstandard_mic , input_str ) ;
            if ( *unstandard_mic == SENTINEL ) {
                printf( "No MICRO input\n" ) ;
                return FALSE ; /* -- indicate no standardization -- */
            }
            convert_latin_one ( unstandard_mic ) ;
        } else {
            strcpy( unstandard_mac_left , input_str ) ;
            if ( *unstandard_mac_left != SENTINEL ) {
                have_user_macros = TRUE ;
                convert_latin_one ( unstandard_mac_left ) ;
            }
        }
    }

    /* bit 4 of option turns on verbose standardization output */
    result = std_standardize_mm( std,
                                 unstandard_mic,
                                 unstandard_mac_left,
                                 (option & 4)?1:0 ) ;
    print_stdaddr( result );
    if (option & 8)
        output_rule_statistics( std->pagc_p->rules, std->err_p ) ;
    stdaddr_free(result);
    return 1;
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,11 @@
NY NY
NY N Y
NY NYC
NY N Y C
NY NEW YORK
NY NY
NY NY
NY NY
NY MANHATTAN
NY QUEENS
NY THE BRONX

File diff suppressed because it is too large Load diff