mirror of
https://git.osgeo.org/gitea/postgis/postgis
synced 2024-10-23 16:42:35 +00:00
Fork of http://sourceforge.net/p/pagc/code/HEAD/tree/branches/sew-refactor/postgresql (svn://svn.code.sf.net/p/pagc/code/branches/sew-refactor/postgresql ) at r361 (Stephen Woodbridge, Walter Sinclair contribution) - address_standardizer extension for PostgreSQL forked from PAGC address standardizer to work with PostgreSQL
git-svn-id: http://svn.osgeo.org/postgis/trunk@12716 b70326c6-7e19-0410-871a-916f4a2858ee
This commit is contained in:
parent
5b79d69994
commit
1f64e92017
12
extras/address_standardizer/COPYING
Normal file
12
extras/address_standardizer/COPYING
Normal file
|
@ -0,0 +1,12 @@
|
|||
Copyright 2006-2013 Stephen Woodbridge.
|
||||
Copyright (c) 2008 Walter Bruce Sinclair
|
||||
|
||||
woodbri@swoodbridge.com
|
||||
woodbr@imaptools.com
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
122
extras/address_standardizer/Makefile
Normal file
122
extras/address_standardizer/Makefile
Normal file
|
@ -0,0 +1,122 @@
|
|||
OBJS = \
|
||||
address_parser.o \
|
||||
address_standardizer.o \
|
||||
std_pg_hash.o \
|
||||
analyze.o \
|
||||
err_param.o \
|
||||
export.o \
|
||||
gamma.o \
|
||||
hash.o \
|
||||
lexicon.o \
|
||||
pagc_tools.o \
|
||||
parseaddress-api.o \
|
||||
standard.o \
|
||||
tokenize.o
|
||||
|
||||
OBJS_test_main = \
|
||||
test_main.o \
|
||||
analyze.o \
|
||||
err_param.o \
|
||||
export.o \
|
||||
gamma.o \
|
||||
hash.o \
|
||||
lexicon.o \
|
||||
pagc_tools.o \
|
||||
standard.o \
|
||||
tokenize.o
|
||||
|
||||
|
||||
MODULE_big = address_standardizer
|
||||
EXTENSION = address_standardizer
|
||||
DATA_built = address_standardizer--1.0.sql us-lex.sql us-gaz.sql us-rules.sql
|
||||
DOCS = README.address_standardizer
|
||||
PG_CPPFLAGS = -g -O0
|
||||
SHLIB_LINK = -lpcre
|
||||
EXTRA_CLEAN = usps-st-city-name.txt mk-st-regexp mk-city-regex test_main
|
||||
|
||||
PG_CONFIG = pg_config
|
||||
PGXS := $(shell $(PG_CONFIG) --pgxs)
|
||||
PGVER := $(shell $(PG_CONFIG) --version)
|
||||
include $(PGXS)
|
||||
|
||||
PERL := $(shell which perl)
|
||||
|
||||
address_standardizer--1.0.sql: address_standardizer--1.0.sql.in
|
||||
$(PERL) mk-sql.pl '$(PGVER)' address_standardizer--1.0.sql.in > address_standardizer--1.0.sql
|
||||
|
||||
us-lex.sql: lexicon.csv
|
||||
$(PERL) pagc-data-psql lex lexicon.csv > us-lex.sql
|
||||
|
||||
us-gaz.sql: gazeteer.csv
|
||||
$(PERL) pagc-data-psql gaz gazeteer.csv > us-gaz.sql
|
||||
|
||||
us-rules.sql: rules.txt
|
||||
$(PERL) pagc-data-psql rules rules.txt > us-rules.sql
|
||||
|
||||
mk-st-regexp: mk-st-regexp.pl
|
||||
$(PERL) -c mk-st-regexp.pl
|
||||
rm -f mk-st-regexp
|
||||
echo "#! " $(PERL) > mk-st-regexp
|
||||
cat mk-st-regexp.pl >> mk-st-regexp
|
||||
chmod ugo+x mk-st-regexp
|
||||
|
||||
mk-city-regex: mk-city-regex.pl usps-st-city-name.txt
|
||||
$(PERL) -c mk-city-regex.pl
|
||||
rm -f mk-city-regex
|
||||
echo "#! " $(PERL) > mk-city-regex
|
||||
cat mk-city-regex.pl >> mk-city-regex
|
||||
chmod ugo+x mk-city-regex
|
||||
|
||||
usps-st-city-name.txt: usps-st-city-orig.txt usps-st-city-adds.txt
|
||||
cat usps-st-city-orig.txt usps-st-city-adds.txt | sort -u >usps-st-city-name.txt
|
||||
|
||||
parseaddress-stcities.h: mk-city-regex
|
||||
./mk-city-regex > parseaddress-stcities.h
|
||||
|
||||
parseaddress-regex.h: mk-st-regexp
|
||||
./mk-st-regexp > parseaddress-regex.h
|
||||
|
||||
dist-clean:
|
||||
rm -f mk-st-regexp mk-city-regex usps-st-city-name.txt parseaddress-stcities.h parseaddress-regex.h test_main
|
||||
|
||||
test:
|
||||
@echo "To run the test on parse_address do the follow:"
|
||||
@echo "1. (make && sudo make install) to compile and install extension"
|
||||
@echo "2. create a database and install the address_standardizer extension"
|
||||
@echo "3. psql test_db -f test-parseaddress.sql"
|
||||
@echo "it should report '(0 rows)' if all tests passed or"
|
||||
@echo "report which ones failed."
|
||||
|
||||
test_main: $(OBJS_test_main)
|
||||
gcc -o test_main $(OBJS_test_main) $(LDFLAGS) $(LIBS)
|
||||
|
||||
test_main.o: test_main.c pagc_api.h pagc_std_api.h
|
||||
|
||||
address_parser.o: address_parser.c parseaddress-api.h
|
||||
|
||||
address_standardizer.o: address_standardizer.c std_pg_hash.h pagc_api.h pagc_std_api.h
|
||||
|
||||
analyze.o: analyze.c pagc_api.h
|
||||
|
||||
err_param.o: err_param.c pagc_api.h
|
||||
|
||||
export.o: export.c pagc_api.h pagc_tools.h
|
||||
|
||||
gamma.o: gamma.c pagc_api.h pagc_std_api.h gamma.h
|
||||
|
||||
hash.o: hash.c hash.h khash.h
|
||||
|
||||
lexicon.o: lexicon.c pagc_api.h pagc_std_api.h
|
||||
|
||||
pagc_tools.o: pagc_tools.c pagc_tools.h pagc_common.h
|
||||
|
||||
parseaddress-api.o: parseaddress-api.c parseaddress-api.h parseaddress-stcities.h parseaddress-regex.h
|
||||
|
||||
standard.o: standard.c pagc_api.h
|
||||
|
||||
tokenize.o: tokenize.c pagc_api.h
|
||||
|
||||
std_pg_hash.o: std_pg_hash.c std_pg_hash.h pagc_api.h pagc_std_api.h
|
||||
|
||||
|
||||
|
219
extras/address_standardizer/README.address_standardizer
Normal file
219
extras/address_standardizer/README.address_standardizer
Normal file
|
@ -0,0 +1,219 @@
|
|||
This is a fork of the PAGC standardizer and a single line address parser.
|
||||
The code is built into a single postgresql extension library.
|
||||
|
||||
Portions of this code belong to their respective contributors.
|
||||
This code is released under an MIT-X license.
|
||||
|
||||
Copyright (c) 2006-2013 Stephen Woodbridge.
|
||||
Copyright (c) 2008 Walter Bruce Sinclair
|
||||
|
||||
woodbri@swoodbridge.com
|
||||
woodbr@imaptools.com
|
||||
|
||||
Also read files COPYING
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
|
||||
Makefile - PGXS makefile
|
||||
mk-city-regex.pl - Perl script to create parseaddress-regex.h
|
||||
mk-st-regexp.pl - Perl script to create parseaddress-stcities.h
|
||||
README.address_standardizer - this file
|
||||
COPYING - License file
|
||||
usps-st-city-adds.txt - add local additions of ST<tab>CITY NAME
|
||||
usps-st-city-orig.txt - Steve's extract of USPS city names
|
||||
|
||||
mk-city-regex - created by make
|
||||
mk-st-regexp - created by make
|
||||
usps-st-city-name.txt - created by make
|
||||
from usps-st-city-orig.txt and usps-st-city-adds.txt
|
||||
parseaddress-regex.h - created by make and mk-st-regexp
|
||||
parseaddress-stcities.h - created by make and mk-city-regex
|
||||
from usps-st-city-name.txt
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
PREREQUISITES:
|
||||
|
||||
o Postgresql headers and PGXS tools
|
||||
|
||||
o Perl 5 and Perl module Regexp::List which can be install with:
|
||||
sudo perl -MCPAN -e "install Regexp::Assemble"
|
||||
|
||||
o libpcre and headers
|
||||
sudo apt-get install libpcre3-dev libpcre3
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
Build and Install:
|
||||
|
||||
make
|
||||
sudo make install
|
||||
|
||||
For postgresql 9.1+ this will install all the files need for CREATE EXTENSION
|
||||
|
||||
createdb testdb
|
||||
psql -c "create extension address_standardizer"
|
||||
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
|
||||
How the parser works
|
||||
|
||||
The parser works from right to left looking first at the macro elements
|
||||
for postcode, state/province, city, and then looks micro elements to determine
|
||||
if we are dealing with a house number street or intersection or landmark.
|
||||
It currently does not look for a country code or name, but that could be
|
||||
introduced in the future.
|
||||
|
||||
Country code
|
||||
------------
|
||||
|
||||
Assumed to be US or CA based on:
|
||||
|
||||
postcode as US or Canada
|
||||
state/province as US or Canada
|
||||
else US
|
||||
|
||||
Postcode/zipcode
|
||||
----------------
|
||||
|
||||
These are recognized using Perl compatible regular expressions.
|
||||
These regexs are currently in the parseaddress-api.c and are relatively
|
||||
simple to make changes to if needed.
|
||||
|
||||
State/province
|
||||
--------------
|
||||
|
||||
These are recognized using Perl compatible regular expressions.
|
||||
These regexs are currently in the parseaddress-api.c but could get moved
|
||||
into includes in the future for easier maintenance.
|
||||
|
||||
City name
|
||||
---------
|
||||
|
||||
This part is rather complicated and there are lots of issues around ambiguities
|
||||
as to where to split a series of tokens when a token might belong to either
|
||||
the city or the street name. The current strategy follows something like this:
|
||||
|
||||
1. if we have a state, then get the city regex for that state
|
||||
2. if we can match that to the end of our remaining address string then
|
||||
extract the city name and continue.
|
||||
3. if we do not have a state or fail to match it then
|
||||
cycle through a series of regex patterns that try to separate the city
|
||||
from the street, stop and extract the city if we match
|
||||
|
||||
Number street name
|
||||
|
||||
1. check for a leading house number, and extract that
|
||||
2. if there is an '@' then split the string on the '@' into street and
|
||||
street2 else put the rest into street
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
Managing the regexes
|
||||
|
||||
The regexes are used to recognize US states and Canadian provinces
|
||||
and USPS city names.
|
||||
|
||||
City regexes
|
||||
------------
|
||||
|
||||
usps-st-city-orig.txt - this file contains all the acceptable USPS city
|
||||
names by state. I periodically extract these from the
|
||||
USPS and generate this file. I do NOT recommend
|
||||
editing this file.
|
||||
usps-st-city-adds.txt - this file you can add new definitions to if you need
|
||||
them. The format of both these files is:
|
||||
<StateAbbrev><tab><CityName>
|
||||
|
||||
These files are assembled into usps-st-city-name.txt which is compiled by a
|
||||
perl script mk-city-regex.pl into parseaddress-stcities.h which is used to
|
||||
lookup the city regex for a specific state or province.
|
||||
|
||||
As I mentioned above is these fail to detect the city, then a secondary
|
||||
strategy is is deployed by cycling through a list of regex patterns. These
|
||||
patterns and regexes are generated by mk-st-regexp.pl which creates the
|
||||
parseaddress-regex.h include. This is a perl script so you can view and edit
|
||||
it if that is needed.
|
||||
|
||||
I think that there might be some room for improved in the area if coodinating
|
||||
this process with PAGC's lexicon.csv and gazeteer.csv in the future.
|
||||
|
||||
----------------------------------------------------------------------------
|
||||
|
||||
Author: API: Stephen Woodbridge <woodbri@imaptools.com>
|
||||
PAGC: Walter Sinclar
|
||||
|
||||
This is a first attempt at extracting the PAGC standardizer code into a
|
||||
separate library. This was done so I could attempt to wrap it into a
|
||||
postgresql stored procedure. (see the directory psql for that).
|
||||
|
||||
This code is a part of PAGC and is release under an MIT-X license.
|
||||
|
||||
Assumptions:
|
||||
|
||||
Linux
|
||||
Sudo is installed and user has access to it.
|
||||
|
||||
PAGC does compile under Windows so you might get this to compile into a dll.
|
||||
|
||||
Build and installation:
|
||||
|
||||
Read the Makefile and change as appropriate.
|
||||
|
||||
make clean
|
||||
make
|
||||
make install
|
||||
make test_main # build the CLI interactive test program
|
||||
|
||||
|
||||
Author: Stephen Woodbridge <woodbri@imaptools.com>
|
||||
|
||||
These are postgresql wrappers for PAGC standardizer and address parser.
|
||||
These are released un an MIT-X style license.
|
||||
|
||||
Assumptions:
|
||||
|
||||
Linux
|
||||
sudo is installed and user has sudo access (see Makefile)
|
||||
postgresql 8.3 (make changes in the Makefile to change)
|
||||
|
||||
Build and Installation:
|
||||
|
||||
make
|
||||
make install
|
||||
|
||||
# create a new database using a postgis template
|
||||
createdb -T template_postgis -E LATIN1 mydb
|
||||
|
||||
# add the stored procedures
|
||||
psql mydb -f /path/to/standardize_address.sql
|
||||
|
||||
Install PAGC lexicon.csv, gazeteer.csv or rules.txt using a perl script.
|
||||
|
||||
./pagc-data-psql lex lexicon.csv | psql mydb
|
||||
./pagc-data-psql gaz gazeteer.csv | psql mydb
|
||||
./pagc-data-psql rules rules.txt | psql mydb
|
||||
|
||||
Now you should be able to test some queries like:
|
||||
|
||||
psql mydb
|
||||
|
||||
select * from parse_address('2099 university ave w, saint paul, mn, 55104-3431');
|
||||
select * from parse_address('university ave w @ main st, saint paul, mn, 55104-3431');
|
||||
|
||||
select * from parse_address('385 Landgrove Rd Landgrove VT 05148');
|
||||
-- "385";"Landgrove Rd";"";"385 Landgrove Rd";"Landgrove";"VT";"05148";"";"US"
|
||||
|
||||
select * from standardize_address(
|
||||
'select seq, word::text, stdword::text, token from gaz union all select seq, word::text, stdword::text, token from lex ',
|
||||
'select seq, word::text, stdword::text, token from gaz order by id',
|
||||
'select * from rules order by id',
|
||||
'select 0::int4 as id, ''1071 B Ave''::text as micro, ''Loxley, AL 36551''::text as macro');
|
||||
|
||||
select * from standardize_address(
|
||||
'select seq, word::text, stdword::text, token from lex order by id',
|
||||
'select seq, word::text, stdword::text, token from gaz order by id',
|
||||
'select * from rules order by id',
|
||||
'select 0::int4 as id, ''116 commonwealth ave apt a''::text as micro, ''west concord, ma 01742''::text as macro');
|
||||
|
||||
\q
|
||||
|
120
extras/address_standardizer/address_parser.c
Normal file
120
extras/address_standardizer/address_parser.c
Normal file
|
@ -0,0 +1,120 @@
|
|||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
#include "postgres.h"
|
||||
#include "funcapi.h"
|
||||
#include "catalog/pg_type.h"
|
||||
#include "fmgr.h"
|
||||
|
||||
#include "parseaddress-api.h"
|
||||
#include <pcre.h>
|
||||
#include <string.h>
|
||||
|
||||
#undef DEBUG
|
||||
//#define DEBUG 1
|
||||
|
||||
#ifdef DEBUG
|
||||
#define DBG(format, arg...) \
|
||||
elog(NOTICE, format , ## arg)
|
||||
#else
|
||||
#define DBG(format, arg...) do { ; } while (0)
|
||||
#endif
|
||||
|
||||
Datum parse_address(PG_FUNCTION_ARGS);
|
||||
|
||||
static char *text2char(text *in)
|
||||
{
|
||||
char *out = palloc(VARSIZE(in));
|
||||
memcpy(out, VARDATA(in), VARSIZE(in) - VARHDRSZ);
|
||||
out[VARSIZE(in) - VARHDRSZ] = '\0';
|
||||
return out;
|
||||
}
|
||||
|
||||
PG_FUNCTION_INFO_V1(parse_address);
|
||||
|
||||
Datum parse_address(PG_FUNCTION_ARGS)
|
||||
{
|
||||
TupleDesc tupdesc;
|
||||
AttInMetadata *attinmeta;
|
||||
Datum result;
|
||||
ADDRESS *paddr;
|
||||
HHash *stH;
|
||||
char *str;
|
||||
char **values;
|
||||
int err;
|
||||
HeapTuple tuple;
|
||||
|
||||
|
||||
DBG("Start standardize_address");
|
||||
|
||||
str = text2char(PG_GETARG_TEXT_P(0));
|
||||
|
||||
DBG("str='%s'", str);
|
||||
|
||||
if (get_call_result_type( fcinfo, NULL, &tupdesc ) != TYPEFUNC_COMPOSITE ) {
|
||||
elog(ERROR, "function returning record called in context"
|
||||
" that cannot accept type record");
|
||||
return -1;
|
||||
}
|
||||
BlessTupleDesc(tupdesc);
|
||||
attinmeta = TupleDescGetAttInMetadata(tupdesc);
|
||||
|
||||
DBG("Got tupdesc, allocating HHash");
|
||||
|
||||
stH = (HHash *) palloc0(sizeof(HHash));
|
||||
if (!stH) {
|
||||
elog(ERROR, "parse_address: Failed to allocate memory for hash!");
|
||||
return -1;
|
||||
}
|
||||
|
||||
DBG("going to load_state_hash");
|
||||
|
||||
err = load_state_hash(stH);
|
||||
if (err) {
|
||||
DBG("got err=%d from load_state_hash().", err);
|
||||
#ifdef USE_HSEARCH
|
||||
DBG("calling hdestroy_r(stH).");
|
||||
hdestroy_r(stH);
|
||||
#endif
|
||||
elog(ERROR, "parse_address: load_state_hash() failed(%d)!", err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
DBG("calling parseaddress()");
|
||||
paddr = parseaddress(stH, str, &err);
|
||||
if (!paddr) {
|
||||
elog(ERROR, "parse_address: parseaddress() failed!");
|
||||
return -1;
|
||||
}
|
||||
|
||||
DBG("setup values array for natts=%d", tupdesc->natts);
|
||||
values = (char **) palloc(9 * sizeof(char *));
|
||||
if (!values) {
|
||||
elog(ERROR, "parse_address: out of memory!");
|
||||
return -1;
|
||||
}
|
||||
values[0] = paddr->num;
|
||||
values[1] = paddr->street;
|
||||
values[2] = paddr->street2;
|
||||
values[3] = paddr->address1;
|
||||
values[4] = paddr->city;
|
||||
values[5] = paddr->st;
|
||||
values[6] = paddr->zip;
|
||||
values[7] = paddr->zipplus;
|
||||
values[8] = paddr->cc;
|
||||
|
||||
DBG("calling heap_form_tuple");
|
||||
tuple = BuildTupleFromCStrings(attinmeta, values);
|
||||
|
||||
/* make the tuple into a datum */
|
||||
DBG("calling HeapTupleGetDatum");
|
||||
result = HeapTupleGetDatum(tuple);
|
||||
|
||||
/* clean up (this is not really necessary */
|
||||
DBG("freeing values, hash, and paddr");
|
||||
free_state_hash(stH);
|
||||
|
||||
DBG("returning parsed address result");
|
||||
return result;
|
||||
}
|
||||
|
61
extras/address_standardizer/address_standardizer--1.0.sql.in
Normal file
61
extras/address_standardizer/address_standardizer--1.0.sql.in
Normal file
|
@ -0,0 +1,61 @@
|
|||
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
|
||||
\echo Use "CREATE EXTENSION address_standardizer" to load this file. \quit
|
||||
|
||||
---------------------------------------------------------------------
|
||||
-- Core function to access the PAGC address standardizer
|
||||
-- Author: Stephen Woodbridge <woodbri@imaptools.com>
|
||||
---------------------------------------------------------------------
|
||||
|
||||
DROP TYPE IF EXISTS stdaddr;
|
||||
CREATE TYPE stdaddr AS (
|
||||
building text,
|
||||
house_num text,
|
||||
predir text,
|
||||
qual text,
|
||||
pretype text,
|
||||
name text,
|
||||
suftype text,
|
||||
sufdir text,
|
||||
ruralroute text,
|
||||
extra text,
|
||||
city text,
|
||||
state text,
|
||||
country text,
|
||||
postcode text,
|
||||
box text,
|
||||
unit text
|
||||
);
|
||||
|
||||
CREATE OR REPLACE FUNCTION standardize_address(
|
||||
lextab text,
|
||||
gaztab text,
|
||||
rultab text,
|
||||
micro text,
|
||||
macro text )
|
||||
RETURNS SETOF stdaddr
|
||||
AS '$libdir/address_standardizer', 'standardize_address'
|
||||
LANGUAGE 'c' IMMUTABLE STRICT;
|
||||
|
||||
CREATE OR REPLACE FUNCTION standardize_address(
|
||||
lextab text,
|
||||
gaztab text,
|
||||
rultab text,
|
||||
address text )
|
||||
RETURNS SETOF stdaddr
|
||||
AS '$libdir/address_standardizer', 'standardize_address1'
|
||||
LANGUAGE 'c' IMMUTABLE STRICT;
|
||||
|
||||
CREATE OR REPLACE FUNCTION parse_address(IN text,
|
||||
OUT num text,
|
||||
OUT street text,
|
||||
OUT street2 text,
|
||||
OUT address1 text,
|
||||
OUT city text,
|
||||
OUT state text,
|
||||
OUT zip text,
|
||||
OUT zipplus text,
|
||||
OUT country text)
|
||||
RETURNS record
|
||||
AS '$libdir/address_standardizer', 'parse_address'
|
||||
LANGUAGE 'c' IMMUTABLE;
|
||||
|
284
extras/address_standardizer/address_standardizer.c
Normal file
284
extras/address_standardizer/address_standardizer.c
Normal file
|
@ -0,0 +1,284 @@
|
|||
#include "postgres.h"
|
||||
#include "funcapi.h"
|
||||
#include "catalog/pg_type.h"
|
||||
#include "fmgr.h"
|
||||
|
||||
#undef DEBUG
|
||||
//#define DEBUG 1
|
||||
|
||||
#include "pagc_api.h"
|
||||
#include "pagc_std_api.h"
|
||||
#include "std_pg_hash.h"
|
||||
#include "parseaddress-api.h"
|
||||
|
||||
#ifdef PG_MODULE_MAGIC
|
||||
PG_MODULE_MAGIC;
|
||||
#endif
|
||||
|
||||
Datum standardize_address(PG_FUNCTION_ARGS);
|
||||
Datum standardize_address1(PG_FUNCTION_ARGS);
|
||||
|
||||
|
||||
static char *text2char(text *in)
|
||||
{
|
||||
char *out = palloc(VARSIZE(in));
|
||||
memcpy(out, VARDATA(in), VARSIZE(in) - VARHDRSZ);
|
||||
out[VARSIZE(in) - VARHDRSZ] = '\0';
|
||||
return out;
|
||||
}
|
||||
|
||||
/*
|
||||
* The signature for standardize_address follows. The lextab, gaztab and
|
||||
* rultab should not change once the reference has been standardized and
|
||||
* the same tables must be used for a geocode request as were used on the
|
||||
* reference set or the matching will get degregated.
|
||||
*
|
||||
* select * from standardize_address(
|
||||
* lextab text, -- name of table of view
|
||||
* gaztab text, -- name of table or view
|
||||
* rultab text, -- name of table of view
|
||||
* micro text, -- '123 main st'
|
||||
* macro text); -- 'boston ma 01002'
|
||||
*
|
||||
* If you want to standardize a whole table then call it like:
|
||||
*
|
||||
* insert into stdaddr (...)
|
||||
* select (std).* from (
|
||||
* select standardize_address(
|
||||
* 'lextab', 'gaztab', 'rultab', micro, marco) as std
|
||||
* from table_to_standardize) as foo;
|
||||
*
|
||||
* The structure of the lextab and gaztab tables of views must be:
|
||||
*
|
||||
* seq int4
|
||||
* word text
|
||||
* stdword text
|
||||
* token int4
|
||||
*
|
||||
* the rultab table or view must have columns:
|
||||
*
|
||||
* rule text
|
||||
*/
|
||||
|
||||
PG_FUNCTION_INFO_V1(standardize_address);
|
||||
|
||||
Datum standardize_address(PG_FUNCTION_ARGS)
|
||||
{
|
||||
TupleDesc tuple_desc;
|
||||
AttInMetadata *attinmeta;
|
||||
STANDARDIZER *std;
|
||||
char *lextab;
|
||||
char *gaztab;
|
||||
char *rultab;
|
||||
char *micro;
|
||||
char *macro;
|
||||
Datum result;
|
||||
STDADDR *stdaddr;
|
||||
char **values;
|
||||
int k;
|
||||
HeapTuple tuple;
|
||||
|
||||
DBG("Start standardize_address");
|
||||
|
||||
lextab = text2char(PG_GETARG_TEXT_P(0));
|
||||
gaztab = text2char(PG_GETARG_TEXT_P(1));
|
||||
rultab = text2char(PG_GETARG_TEXT_P(2));
|
||||
micro = text2char(PG_GETARG_TEXT_P(3));
|
||||
macro = text2char(PG_GETARG_TEXT_P(4));
|
||||
|
||||
DBG("calling RelationNameGetTupleDesc");
|
||||
if (get_call_result_type( fcinfo, NULL, &tuple_desc ) != TYPEFUNC_COMPOSITE ) {
|
||||
elog(ERROR, "standardize_address() was called in a way that cannot accept record as a result");
|
||||
}
|
||||
BlessTupleDesc(tuple_desc);
|
||||
attinmeta = TupleDescGetAttInMetadata(tuple_desc);
|
||||
|
||||
DBG("calling GetStdUsingFCInfo(fcinfo, '%s', '%s', '%s')", lextab, gaztab, rultab);
|
||||
std = GetStdUsingFCInfo(fcinfo, lextab, gaztab, rultab);
|
||||
if (!std)
|
||||
elog(ERROR, "standardize_address() failed to create the address standardizer object!");
|
||||
|
||||
DBG("calling std_standardize_mm('%s', '%s')", micro, macro);
|
||||
stdaddr = std_standardize_mm( std, micro, macro, 0 );
|
||||
|
||||
DBG("back from fetch_stdaddr");
|
||||
|
||||
values = (char **) palloc(16 * sizeof(char *));
|
||||
for (k=0; k<16; k++) {
|
||||
values[k] = NULL;
|
||||
}
|
||||
DBG("setup values array for natts=%d", tuple_desc->natts);
|
||||
if (stdaddr) {
|
||||
values[0] = stdaddr->building ? pstrdup(stdaddr->building) : NULL;
|
||||
values[1] = stdaddr->house_num ? pstrdup(stdaddr->house_num) : NULL;
|
||||
values[2] = stdaddr->predir ? pstrdup(stdaddr->predir) : NULL;
|
||||
values[3] = stdaddr->qual ? pstrdup(stdaddr->qual) : NULL;
|
||||
values[4] = stdaddr->pretype ? pstrdup(stdaddr->pretype) : NULL;
|
||||
values[5] = stdaddr->name ? pstrdup(stdaddr->name) : NULL;
|
||||
values[6] = stdaddr->suftype ? pstrdup(stdaddr->suftype) : NULL;
|
||||
values[7] = stdaddr->sufdir ? pstrdup(stdaddr->sufdir) : NULL;
|
||||
values[8] = stdaddr->ruralroute ? pstrdup(stdaddr->ruralroute) : NULL;
|
||||
values[9] = stdaddr->extra ? pstrdup(stdaddr->extra) : NULL;
|
||||
values[10] = stdaddr->city ? pstrdup(stdaddr->city) : NULL;
|
||||
values[11] = stdaddr->state ? pstrdup(stdaddr->state) : NULL;
|
||||
values[12] = stdaddr->country ? pstrdup(stdaddr->country) : NULL;
|
||||
values[13] = stdaddr->postcode ? pstrdup(stdaddr->postcode) : NULL;
|
||||
values[14] = stdaddr->box ? pstrdup(stdaddr->box) : NULL;
|
||||
values[15] = stdaddr->unit ? pstrdup(stdaddr->unit) : NULL;
|
||||
}
|
||||
|
||||
DBG("calling heap_form_tuple");
|
||||
tuple = BuildTupleFromCStrings(attinmeta, values);
|
||||
|
||||
/* make the tuple into a datum */
|
||||
DBG("calling HeapTupleGetDatum");
|
||||
result = HeapTupleGetDatum(tuple);
|
||||
|
||||
/* clean up (this is not really necessary */
|
||||
DBG("freeing values, nulls, and stdaddr");
|
||||
stdaddr_free(stdaddr);
|
||||
|
||||
DBG("returning standardized result");
|
||||
PG_RETURN_DATUM(result);
|
||||
}
|
||||
|
||||
|
||||
PG_FUNCTION_INFO_V1(standardize_address1);
|
||||
|
||||
Datum standardize_address1(PG_FUNCTION_ARGS)
|
||||
{
|
||||
TupleDesc tuple_desc;
|
||||
AttInMetadata *attinmeta;
|
||||
STANDARDIZER *std;
|
||||
char *lextab;
|
||||
char *gaztab;
|
||||
char *rultab;
|
||||
char *addr;
|
||||
char *micro;
|
||||
char *macro;
|
||||
Datum result;
|
||||
STDADDR *stdaddr;
|
||||
char **values;
|
||||
int k;
|
||||
HeapTuple tuple;
|
||||
ADDRESS *paddr;
|
||||
HHash *stH;
|
||||
int err;
|
||||
|
||||
DBG("Start standardize_address");
|
||||
|
||||
lextab = text2char(PG_GETARG_TEXT_P(0));
|
||||
gaztab = text2char(PG_GETARG_TEXT_P(1));
|
||||
rultab = text2char(PG_GETARG_TEXT_P(2));
|
||||
addr = text2char(PG_GETARG_TEXT_P(3));
|
||||
|
||||
DBG("calling RelationNameGetTupleDesc");
|
||||
if (get_call_result_type( fcinfo, NULL, &tuple_desc ) != TYPEFUNC_COMPOSITE ) {
|
||||
elog(ERROR, "standardize_address() was called in a way that cannot accept record as a result");
|
||||
}
|
||||
BlessTupleDesc(tuple_desc);
|
||||
attinmeta = TupleDescGetAttInMetadata(tuple_desc);
|
||||
|
||||
DBG("Got tupdesc, allocating HHash");
|
||||
|
||||
stH = (HHash *) palloc0(sizeof(HHash));
|
||||
if (!stH) {
|
||||
elog(ERROR, "standardize_address: Failed to allocate memory for hash!");
|
||||
return -1;
|
||||
}
|
||||
|
||||
DBG("going to load_state_hash");
|
||||
|
||||
err = load_state_hash(stH);
|
||||
if (err) {
|
||||
DBG("got err=%d from load_state_hash().", err);
|
||||
#ifdef USE_HSEARCH
|
||||
DBG("calling hdestroy_r(stH).");
|
||||
hdestroy_r(stH);
|
||||
#endif
|
||||
elog(ERROR, "standardize_address: load_state_hash() failed(%d)!", err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
DBG("calling parseaddress()");
|
||||
paddr = parseaddress(stH, addr, &err);
|
||||
if (!paddr) {
|
||||
elog(ERROR, "parse_address: parseaddress() failed!");
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* check for errors and comput length of macro string */
|
||||
if (paddr->street2)
|
||||
elog(ERROR, "standardize_address() can not be passed an intersection.");
|
||||
if (! paddr-> address1)
|
||||
elog(ERROR, "standardize_address() could not parse the address into components.");
|
||||
|
||||
k = 1;
|
||||
if (paddr->city) k += strlen(paddr->city) + 1;
|
||||
if (paddr->st) k += strlen(paddr->st) + 1;
|
||||
if (paddr->zip) k += strlen(paddr->zip) + 1;
|
||||
if (paddr->cc) k += strlen(paddr->cc) + 1;
|
||||
|
||||
/* create micro and macro from paddr */
|
||||
micro = pstrdup(paddr->address1);
|
||||
macro = (char *) palloc(k * sizeof(char));
|
||||
|
||||
*macro = '\0';
|
||||
if (paddr->city) { strcat(macro, paddr->city); strcat(macro, ","); }
|
||||
if (paddr->st ) { strcat(macro, paddr->st ); strcat(macro, ","); }
|
||||
if (paddr->zip ) { strcat(macro, paddr->zip ); strcat(macro, ","); }
|
||||
if (paddr->cc ) { strcat(macro, paddr->cc ); strcat(macro, ","); }
|
||||
|
||||
DBG("calling GetStdUsingFCInfo(fcinfo, '%s', '%s', '%s')", lextab, gaztab, rultab);
|
||||
std = GetStdUsingFCInfo(fcinfo, lextab, gaztab, rultab);
|
||||
if (!std)
|
||||
elog(ERROR, "standardize_address() failed to create the address standardizer object!");
|
||||
|
||||
DBG("calling std_standardize_mm('%s', '%s')", micro, macro);
|
||||
stdaddr = std_standardize_mm( std, micro, macro, 0 );
|
||||
|
||||
DBG("back from fetch_stdaddr");
|
||||
|
||||
values = (char **) palloc(16 * sizeof(char *));
|
||||
for (k=0; k<16; k++) {
|
||||
values[k] = NULL;
|
||||
}
|
||||
DBG("setup values array for natts=%d", tuple_desc->natts);
|
||||
if (stdaddr) {
|
||||
values[0] = stdaddr->building ? pstrdup(stdaddr->building) : NULL;
|
||||
values[1] = stdaddr->house_num ? pstrdup(stdaddr->house_num) : NULL;
|
||||
values[2] = stdaddr->predir ? pstrdup(stdaddr->predir) : NULL;
|
||||
values[3] = stdaddr->qual ? pstrdup(stdaddr->qual) : NULL;
|
||||
values[4] = stdaddr->pretype ? pstrdup(stdaddr->pretype) : NULL;
|
||||
values[5] = stdaddr->name ? pstrdup(stdaddr->name) : NULL;
|
||||
values[6] = stdaddr->suftype ? pstrdup(stdaddr->suftype) : NULL;
|
||||
values[7] = stdaddr->sufdir ? pstrdup(stdaddr->sufdir) : NULL;
|
||||
values[8] = stdaddr->ruralroute ? pstrdup(stdaddr->ruralroute) : NULL;
|
||||
values[9] = stdaddr->extra ? pstrdup(stdaddr->extra) : NULL;
|
||||
values[10] = stdaddr->city ? pstrdup(stdaddr->city) : NULL;
|
||||
values[11] = stdaddr->state ? pstrdup(stdaddr->state) : NULL;
|
||||
values[12] = stdaddr->country ? pstrdup(stdaddr->country) : NULL;
|
||||
values[13] = stdaddr->postcode ? pstrdup(stdaddr->postcode) : NULL;
|
||||
values[14] = stdaddr->box ? pstrdup(stdaddr->box) : NULL;
|
||||
values[15] = stdaddr->unit ? pstrdup(stdaddr->unit) : NULL;
|
||||
}
|
||||
|
||||
DBG("calling heap_form_tuple");
|
||||
tuple = BuildTupleFromCStrings(attinmeta, values);
|
||||
|
||||
/* make the tuple into a datum */
|
||||
DBG("calling HeapTupleGetDatum");
|
||||
result = HeapTupleGetDatum(tuple);
|
||||
|
||||
/* clean up (this is not really necessary */
|
||||
DBG("freeing values, nulls, and stdaddr");
|
||||
stdaddr_free(stdaddr);
|
||||
|
||||
DBG("freeing values, hash, and paddr");
|
||||
free_state_hash(stH);
|
||||
|
||||
DBG("returning standardized result");
|
||||
PG_RETURN_DATUM(result);
|
||||
}
|
||||
|
||||
|
5
extras/address_standardizer/address_standardizer.control
Normal file
5
extras/address_standardizer/address_standardizer.control
Normal file
|
@ -0,0 +1,5 @@
|
|||
# address_standardizer extension
|
||||
comment = ''
|
||||
default_version = '1.0'
|
||||
encoding = 'LATIN1'
|
||||
relocatable = true
|
1501
extras/address_standardizer/analyze.c
Normal file
1501
extras/address_standardizer/analyze.c
Normal file
File diff suppressed because it is too large
Load diff
278
extras/address_standardizer/err_param.c
Normal file
278
extras/address_standardizer/err_param.c
Normal file
|
@ -0,0 +1,278 @@
|
|||
/* -- err_param.c
|
||||
|
||||
This file handles the buffering and output of errors
|
||||
|
||||
Prototype 7H08 (This file was written by Walter Sinclair).
|
||||
|
||||
Copyright (c) 2009 Walter Bruce Sinclair
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
|
||||
*/
|
||||
|
||||
/* For pagc-0.4.0 : last revised 2010-11-01 */
|
||||
|
||||
#undef DEBUG
|
||||
//#define DEBUG
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "pagc_api.h"
|
||||
|
||||
static FILE *open_error_log ( const char *, DS_Handle , ERR_PARAM * ) ;
|
||||
static int turn_off_error_log( ERR_PARAM * ) ;
|
||||
|
||||
#define PRINT_ERROR( TEMP , MSG ) \
|
||||
DBG( TEMP, MSG ) ;
|
||||
|
||||
#define RESET_ERR_P \
|
||||
err_p -> first_err = 0 ; \
|
||||
err_p -> last_err = 0 ; \
|
||||
err_p -> next_fatal = TRUE ; \
|
||||
err_mem = err_p -> err_array ; \
|
||||
err_p -> error_buf = err_mem -> content_buf ; \
|
||||
err_mem -> is_fatal = TRUE ; \
|
||||
BLANK_STRING( err_mem -> content_buf )
|
||||
|
||||
|
||||
|
||||
/* ------------------------------------------------------------
|
||||
err_param.c (init_errors) :
|
||||
calls : err_param.c (open_error_log), stdlib.h (malloc, free) ,
|
||||
stdio.h (fprintf, fflush) stdlib.h (malloc,free)
|
||||
--------------------------------------------------------------*/
|
||||
ERR_PARAM *init_errors( PAGC_GLOBAL *pagc_glo_p ,
|
||||
const char *log_name ) {
|
||||
ERR_PARAM *err_p ;
|
||||
ERR_REC *err_mem ;
|
||||
|
||||
err_p = ( ERR_PARAM * ) malloc( sizeof( ERR_PARAM ) ) ;
|
||||
if ( err_p == NULL ) {
|
||||
#ifndef NO_STDERR_OUTPUT
|
||||
PRINT_ERROR( "%s\n" ,
|
||||
"FATAL ERROR : Could not allocate memory for pagc_init_errors" ) ;
|
||||
#endif
|
||||
return NULL ;
|
||||
}
|
||||
|
||||
/* -- set up first record -- */
|
||||
RESET_ERR_P ;
|
||||
/* -- a null log_name means we don't log , but collect -- */
|
||||
if ( log_name == NULL ) {
|
||||
err_p -> stream = NULL ;
|
||||
}
|
||||
else {
|
||||
err_p -> stream = open_error_log( log_name ,
|
||||
pagc_glo_p -> _file_sys ,
|
||||
err_p ) ;
|
||||
if ( err_p -> stream == NULL ) {
|
||||
FREE_AND_NULL( err_p ) ;
|
||||
#ifndef NO_STDERR_OUTPUT
|
||||
PRINT_ERROR( "Could not create error log for pathname: %s\n" ,
|
||||
log_name ) ;
|
||||
#endif
|
||||
return NULL ;
|
||||
}
|
||||
}
|
||||
return err_p ;
|
||||
}
|
||||
|
||||
|
||||
/* ------------------------------------------------------------
|
||||
err_param.c (close_errors)
|
||||
uses macros BLANK_STRING, FREE_AND_NULL, and PRINT_ERROR
|
||||
--------------------------------------------------------------*/
|
||||
void close_errors( ERR_PARAM *err_p ) {
|
||||
int is_fatal_error ;
|
||||
char err_out_buf[ MAXSTRLEN ] ;
|
||||
|
||||
|
||||
if ( err_p == NULL ) {
|
||||
return ;
|
||||
}
|
||||
|
||||
BLANK_STRING( err_out_buf ) ;
|
||||
|
||||
/* -- read each error into the buffer and then
|
||||
output it as a single line -- */
|
||||
while ( empty_errors( err_p ,
|
||||
&is_fatal_error ,
|
||||
err_out_buf ) ) {
|
||||
if ( is_fatal_error ) {
|
||||
#ifndef NO_STDERR_OUTPUT
|
||||
PRINT_ERROR( "ERROR: %s\n" ,
|
||||
err_out_buf ) ;
|
||||
} else {
|
||||
PRINT_ERROR( "%s\n" ,
|
||||
err_out_buf ) ;
|
||||
#endif
|
||||
}
|
||||
BLANK_STRING( err_out_buf ) ;
|
||||
}
|
||||
FREE_AND_NULL( err_p ) ;
|
||||
}
|
||||
|
||||
/* ------------------------------------------------------------
|
||||
err_param.c (turn_off_error_log)
|
||||
called by err_param.c (empty_errors)
|
||||
stdio.h (fclose)
|
||||
--------------------------------------------------------------*/
|
||||
static int turn_off_error_log( ERR_PARAM *err_p ) {
|
||||
ERR_REC *err_mem ;
|
||||
|
||||
if ( ( err_p == NULL ) ||
|
||||
( err_p -> stream == NULL ) ) {
|
||||
return FALSE ;
|
||||
}
|
||||
fclose( err_p -> stream ) ;
|
||||
err_p -> stream = NULL ;
|
||||
RESET_ERR_P ;
|
||||
return TRUE ;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------------
|
||||
err_param.c (empty_errors)
|
||||
calls : err_param.c (turn_off_error_log)
|
||||
returns FALSE when all errors have been reported.
|
||||
TRUE otherwise
|
||||
------------------------------------------------------------*/
|
||||
int empty_errors( ERR_PARAM *err_p ,
|
||||
int *is_fatal ,
|
||||
char *err_dest ) {
|
||||
|
||||
ERR_REC *err_mem ;
|
||||
|
||||
if ( err_p == NULL ) {
|
||||
return FALSE ;
|
||||
}
|
||||
|
||||
if ( err_p -> first_err >= err_p -> last_err ) {
|
||||
/* -- reset the counters -- */
|
||||
RESET_ERR_P ;
|
||||
return FALSE ; /* -- indicate empty -- */
|
||||
}
|
||||
|
||||
/* -- if logging, turn it off and indicate empty -- */
|
||||
if ( turn_off_error_log( err_p ) ) {
|
||||
return FALSE ;
|
||||
}
|
||||
|
||||
/* -- output the current lowest record -- */
|
||||
err_mem = err_p -> err_array + err_p -> first_err ;
|
||||
append_string_to_max( err_dest ,
|
||||
err_mem -> content_buf ,
|
||||
MAXSTRLEN ) ;
|
||||
*is_fatal = err_mem -> is_fatal ;
|
||||
|
||||
/* -- update the low mark -- */
|
||||
err_p -> first_err ++ ;
|
||||
return TRUE ; /* indicate error there */
|
||||
}
|
||||
|
||||
/* ------------------------------------------------
|
||||
err_param.c (open_error_log) :
|
||||
called by init_errors
|
||||
calls : stdlib.h (free) stdio.h (fopen)
|
||||
uses macros OPEN_ALLOCATED_NAME, FREE_AND_NULL
|
||||
--------------------------------------------------- */
|
||||
static FILE *open_error_log( const char *client_log_name ,
|
||||
DS_Handle _file_sys_p ,
|
||||
ERR_PARAM *err_p ) {
|
||||
#ifdef BUILD_API
|
||||
return NULL;
|
||||
#else
|
||||
char *alloc_log_name ;
|
||||
FILE *error_file ;
|
||||
|
||||
if ( client_log_name != NULL ) {
|
||||
/* -- will overwrite previous log in same location -- */
|
||||
OPEN_ALLOCATED_NAME(alloc_log_name,"err",error_file,client_log_name,"wb+",_file_sys_p,err_p,NULL) ;
|
||||
}
|
||||
FREE_AND_NULL( alloc_log_name ) ;
|
||||
return error_file ;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
/* -----------------------------------------------------------
|
||||
err_param.c (register_error)
|
||||
called after the error is written to the error_buf
|
||||
stdlib.h (malloc) stdio.h (fprintf,fflush) string.h (strcpy)
|
||||
------------------------------------------------------------ */
|
||||
void register_error( ERR_PARAM *err_p ) {
|
||||
int i ;
|
||||
ERR_REC *err_mem ;
|
||||
|
||||
|
||||
/* -- check if there is anything in the error_buf -- */
|
||||
if ( err_p -> error_buf[ 0 ] == SENTINEL ) {
|
||||
return ;
|
||||
}
|
||||
if ( strlen( err_p -> error_buf ) > MAXSTRLEN ) {
|
||||
#ifndef NO_STDERR_OUTPUT
|
||||
PRINT_ERROR( "Error message %s is too long" ,
|
||||
err_p -> error_buf ) ;
|
||||
#endif
|
||||
return ;
|
||||
}
|
||||
/* -- print it out immediately, if we're logging -- */
|
||||
if ( err_p -> stream != NULL ) {
|
||||
fprintf( err_p -> stream ,
|
||||
"%s\n" ,
|
||||
err_p -> error_buf ) ;
|
||||
fflush( err_p -> stream ) ;
|
||||
/* -- set up for next error -- */
|
||||
BLANK_STRING( err_p -> error_buf ) ;
|
||||
return ;
|
||||
}
|
||||
/* -- update the current error record -- */
|
||||
err_mem = err_p -> err_array + err_p -> last_err ;
|
||||
err_mem -> is_fatal = err_p -> next_fatal ;
|
||||
|
||||
if ( err_p -> last_err == ( MAX_ERRORS - 1 ) ) {
|
||||
#ifndef NO_STDERR_OUTPUT
|
||||
PRINT_ERROR( "%s is too many errors - losing old ones" ,
|
||||
err_p -> error_buf ) ;
|
||||
#endif
|
||||
/* -- move the whole array down a slot to make room for
|
||||
the next error. The first in the array disappears -- */
|
||||
for ( i = err_p -> first_err ;
|
||||
i < err_p -> last_err ;
|
||||
i++ ) {
|
||||
err_p -> err_array[ i ] . is_fatal = err_p -> err_array[ i + 1 ] . is_fatal ;
|
||||
strcpy( err_p -> err_array[ i ] . content_buf ,
|
||||
err_p -> err_array[ i + 1 ] . content_buf ) ;
|
||||
}
|
||||
} else {
|
||||
/* -- last_err points to the next one to fill -- */
|
||||
err_p -> last_err ++ ;
|
||||
err_mem = err_p -> err_array + err_p -> last_err ;
|
||||
}
|
||||
|
||||
/* -- reset error_buf to the new content_buf -- */
|
||||
err_p -> error_buf = err_mem -> content_buf ;
|
||||
BLANK_STRING( err_mem -> content_buf ) ;
|
||||
err_p -> next_fatal = TRUE ;
|
||||
return ;
|
||||
}
|
||||
|
||||
|
||||
/*==========================================
|
||||
2006-11-02 add new arg
|
||||
===========================================*/
|
||||
void send_fields_to_error( ERR_PARAM *err_p ,
|
||||
char **s_fields ) {
|
||||
|
||||
send_fields_to_stream( s_fields , /* 2006-11-02 */
|
||||
err_p -> stream ,
|
||||
SCREEN , FALSE ) ;
|
||||
}
|
||||
|
432
extras/address_standardizer/export.c
Normal file
432
extras/address_standardizer/export.c
Normal file
|
@ -0,0 +1,432 @@
|
|||
/* -- export.c
|
||||
|
||||
This file contains the routines for extracting the sequence of
|
||||
postal attributes and definitions produced by the standardizer
|
||||
into strings of text (in __standard_fields__).
|
||||
|
||||
Prototype 7H08 (This file was written by Walter Sinclair).
|
||||
|
||||
Copyright (c) 2009 Walter Bruce Sinclair
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*/
|
||||
|
||||
/* For pagc-0.4.0 : last revised 2009-10-03 */
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stddef.h>
|
||||
#include "pagc_api.h"
|
||||
#include "pagc_tools.h"
|
||||
|
||||
#define ORDER_DISPLACEMENT 2
|
||||
|
||||
/* -- local prototypes -- */
|
||||
static void _copy_standard_( STAND_PARAM * , SYMB , int , int ) ;
|
||||
static void _scan_target_( STAND_PARAM * , SYMB , int ) ;
|
||||
static char *_get_standard_( STAND_PARAM * , int , int ) ;
|
||||
static char *_get_definition_text_( STAND_PARAM * , int ) ;
|
||||
|
||||
//#ifndef BUILD_API
|
||||
|
||||
/* -- local storage -- */
|
||||
static const char *__field_start_tag__[][3] = {
|
||||
{ " <Build>", "\"", "Building: " },
|
||||
{ " <Civic>", "\"", "House Address: " },
|
||||
{ " <PreDir>", "\"", "Prefix Direction: " },
|
||||
{ " <Qualif>", "\"", "Qualifier: " },
|
||||
{ " <PreTyp>", "\"", "Prefix Type: " },
|
||||
{ " <Street>", "\"", "Street Name: " },
|
||||
{ " <SufTyp>", "\"", "Suffix Type: " },
|
||||
{ " <SufDir>", "\"", "Suffix Direction: " },
|
||||
{ " <Rural>", "\"", "Rural Route: " },
|
||||
{ " <Extra>", "\"", "Additional Info: " },
|
||||
{ " <City>", "\"", "Municipal: " },
|
||||
{ " <Prov>", "\"", "Province/State: " },
|
||||
{ " <Nation>", "\"", "Country: " },
|
||||
{ " <Postal>", "\"", "Postal/Zip Code: " },
|
||||
{ " <Box>", "\"", "Box: " },
|
||||
{ " <Unit>", "\"", "Unit: " }
|
||||
} ;
|
||||
static const char *__land_field_start_tag__[][3] = {
|
||||
{ "<FeatureName>", "\"", "FeatureName " },
|
||||
{ "<FeatureType>", "\"", "FeatureType " },
|
||||
{ "<FeatureArea>", "\"", "FeatureArea " }
|
||||
} ;
|
||||
static const char *__land_field_tag_end__[][3] = {
|
||||
{ "</FeatureName>\n", "\",", "\n" },
|
||||
{ "</FeatureType>\n", "\",", "\n" },
|
||||
{ "</FeatureArea>\n", "\",", "\n" }
|
||||
} ;
|
||||
static const char *__field_tag_end__[][3] = {
|
||||
{ "</Build>\n", "\",", "\n" },
|
||||
{ "</Civic>\n", "\",", "\n" },
|
||||
{ "</PreDir>\n", "\",", "\n" },
|
||||
{ "</Qualif>\n", "\",", "\n" },
|
||||
{ "</PreTyp>\n", "\",", "\n" },
|
||||
{ "</Street>\n", "\",", "\n" },
|
||||
{ "</SufTyp>\n", "\",", "\n" },
|
||||
{ "</SufDir>\n", "\",", "\n" },
|
||||
{ "</Rural>\n", "\",", "\n" },
|
||||
{ "</Extra>\n", "\",", "\n" },
|
||||
{ "</City>\n", "\",", "\n" },
|
||||
{ "</Prov>\n", "\",", "\n" },
|
||||
{ "</Nation>\n", "\",", "\n" },
|
||||
{ "</Postal>\n", "\",", "\n" },
|
||||
{ "</Box>\n", "\",", "\n" },
|
||||
{ "</Unit>\n", "\",", "\n" }
|
||||
} ;
|
||||
static const char *__record_start_tag__[ ] = {
|
||||
" <address>\n" , "\n", "\n"
|
||||
} ;
|
||||
static const char *__landmark_record_start_tag__[ ] = {
|
||||
" <landmark>\n" , "\n", "\n"
|
||||
} ;
|
||||
static const char *__record_end_tag__[ ] = {
|
||||
" </address>\n", "\n", "\n"
|
||||
} ;
|
||||
static const char *__landmark_record_end_tag__[ ] = {
|
||||
" </landmark>\n" , "\n", "\n"
|
||||
} ;
|
||||
|
||||
//#endif
|
||||
|
||||
static SYMB __ord_list__[] = { ORD, FAIL } ;
|
||||
|
||||
/*----------------------------------------------------------------
|
||||
export.c (init_output_fields)
|
||||
----------------------------------------------------------------*/
|
||||
void init_output_fields( STAND_PARAM *__stand_param__ , int which_fields )
|
||||
{
|
||||
/* -- called with BOTH to erase both the micro and macro fields
|
||||
called with RIGHT to erase only the macro fields, and
|
||||
LEFT to erase only the micro fields -- */
|
||||
int i ;
|
||||
char **__standard_fields__ = __stand_param__->standard_fields ;
|
||||
/*-- Decide which set of fields to initialize --*/
|
||||
if ( which_fields == BOTH )
|
||||
{
|
||||
for ( i = 0 ; i < MAXOUTSYM ; i++ )
|
||||
{
|
||||
__standard_fields__[i][0] = SENTINEL ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/*-- Clean only one set --*/
|
||||
if ( which_fields == RIGHT )
|
||||
{
|
||||
/*-- Erase the macro fields only --*/
|
||||
for ( i = CITY ; i < NEEDHEAD ; i++ )
|
||||
{
|
||||
__standard_fields__[i][0] = SENTINEL ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/*-- Erase the micro fields only --*/
|
||||
for ( i = BLDNG ; i < CITY ; i++ )
|
||||
{
|
||||
__standard_fields__[i][0] = SENTINEL ;
|
||||
}
|
||||
for ( i = NEEDHEAD ; i < MAXOUTSYM ; i++ )
|
||||
{
|
||||
__standard_fields__[i][0] = SENTINEL ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*-----------------------------------------
|
||||
export.c (sym_to_field)
|
||||
-------------------------------------------*/
|
||||
int sym_to_field( SYMB sym )
|
||||
{
|
||||
int fld = NEEDHEAD ;
|
||||
if ( sym == BOXH || sym == BOXT ) return fld ;
|
||||
fld++ ;
|
||||
if ( sym == UNITH || sym == UNITT ) return fld ;
|
||||
if ( sym >= BLDNG && sym < MAXOUTSYM ) return sym ;
|
||||
return FAIL ;
|
||||
}
|
||||
|
||||
/*--------------------------------------------------
|
||||
export.c (_get_definition_text_)
|
||||
-- called by export.c (_get_standard_)
|
||||
---------------------------------------------------*/
|
||||
static char *_get_definition_text_( STAND_PARAM *__stand_param__ , int lex_pos )
|
||||
{
|
||||
DEF *__best_DEF__ = __stand_param__->best_defs[lex_pos] ;
|
||||
if (!( __best_DEF__->Protect ))
|
||||
{
|
||||
return ( __best_DEF__->Standard ) ;
|
||||
}
|
||||
return ( __stand_param__->lex_vector[lex_pos].Text ) ;
|
||||
}
|
||||
|
||||
/*-----------------------------------------
|
||||
export.c (stuff_fields)
|
||||
--calls export.c (_scan_target_)
|
||||
-------------------------------------------*/
|
||||
void stuff_fields( STAND_PARAM *__stand_param__ )
|
||||
{
|
||||
int fld ;
|
||||
/*-- Translate the symbols and definitions of the standardization into
|
||||
the __standard_fields__ for output --*/
|
||||
for (fld = 0 ;fld < NEEDHEAD ;fld++)
|
||||
{
|
||||
/*-- Fields that correspond one to one with the symbols --*/
|
||||
_scan_target_(__stand_param__ ,fld,fld) ;
|
||||
}
|
||||
/*-- These two fields have two tokens for each field --*/
|
||||
_scan_target_( __stand_param__ , BOXH, NEEDHEAD ) ;
|
||||
_scan_target_( __stand_param__ , BOXT, NEEDHEAD ) ;
|
||||
_scan_target_( __stand_param__ , UNITH, NEEDHEAD+1 ) ;
|
||||
_scan_target_( __stand_param__ , UNITT, NEEDHEAD+1 ) ;
|
||||
}
|
||||
|
||||
//#ifndef BUILD_API
|
||||
|
||||
/*---------------------------------------------------------------------
|
||||
export.c (send_fields_to_stream)
|
||||
uses BLANK_STRING
|
||||
2009-09-27 modify to display landmark fields
|
||||
----------------------------------------------------------------------*/
|
||||
#define STREAM_BUF_SIZE MAXSTRLEN
|
||||
void send_fields_to_stream( char **__standard_fields__ , FILE *__dest_file__ , int opt , int is_landmark)
|
||||
{
|
||||
int output_order ;
|
||||
if (opt < NO_FORMAT)
|
||||
{
|
||||
if (__dest_file__ != NULL)
|
||||
{
|
||||
fprintf(__dest_file__,"%s\n",(is_landmark? __landmark_record_start_tag__[opt] : __record_start_tag__[opt])) ;
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("%s\n",(is_landmark? __landmark_record_start_tag__[opt] : __record_start_tag__[opt])) ;
|
||||
}
|
||||
}
|
||||
/*-- We want to rearrange so that unit and box come first --*/
|
||||
for (output_order = 0; output_order < (NEEDHEAD + ORDER_DISPLACEMENT); output_order++)
|
||||
{
|
||||
char __line_buf__[STREAM_BUF_SIZE] ;
|
||||
int loc = ((output_order < ORDER_DISPLACEMENT)? (NEEDHEAD + output_order) : (output_order - ORDER_DISPLACEMENT)) ;
|
||||
char *__field_string__ = __standard_fields__[loc] ;
|
||||
BLANK_STRING(__line_buf__) ;
|
||||
if (*__field_string__ != SENTINEL)
|
||||
{
|
||||
if (opt < NO_FORMAT)
|
||||
{
|
||||
char * __source_start_tag__ ;
|
||||
if (is_landmark)
|
||||
{
|
||||
switch (loc)
|
||||
{
|
||||
case FEATNAME :
|
||||
__source_start_tag__ = ( char *) __land_field_start_tag__[0][opt] ;
|
||||
break ;
|
||||
case FEATTYPE :
|
||||
__source_start_tag__ = ( char *) __land_field_start_tag__[1][opt] ;
|
||||
break ;
|
||||
case FEATAREA :
|
||||
__source_start_tag__ = ( char *) __land_field_start_tag__[2][opt] ;
|
||||
break ;
|
||||
default :
|
||||
__source_start_tag__ = ( char * ) __field_start_tag__[loc][opt] ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
__source_start_tag__ = (char *) __field_start_tag__[loc][opt] ;
|
||||
}
|
||||
append_string_to_max(__line_buf__, __source_start_tag__ , STREAM_BUF_SIZE) ;
|
||||
}
|
||||
append_string_to_max( __line_buf__, __field_string__ , STREAM_BUF_SIZE ) ;
|
||||
if (opt < NO_FORMAT)
|
||||
{
|
||||
char * __source_end_tag__ ;
|
||||
if (is_landmark)
|
||||
{
|
||||
switch (loc)
|
||||
{
|
||||
case FEATNAME :
|
||||
__source_end_tag__ = ( char *) __land_field_tag_end__[ 0 ][ opt ] ;
|
||||
break ;
|
||||
case FEATTYPE :
|
||||
__source_end_tag__ = ( char *) __land_field_tag_end__[ 1 ][ opt ] ;
|
||||
break ;
|
||||
case FEATAREA :
|
||||
__source_end_tag__ = ( char *) __land_field_tag_end__[ 2 ][ opt ] ;
|
||||
break ;
|
||||
default :
|
||||
__source_end_tag__ = ( char * ) __field_tag_end__[ loc ][ opt ] ;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
__source_end_tag__ = ( char * ) __field_tag_end__[ loc ][ opt ] ;
|
||||
}
|
||||
append_string_to_max( __line_buf__ , __source_end_tag__ , STREAM_BUF_SIZE ) ;
|
||||
}
|
||||
if ( __dest_file__ != NULL )
|
||||
{
|
||||
fprintf( __dest_file__ , "%s" , __line_buf__ ) ;
|
||||
}
|
||||
else
|
||||
{
|
||||
printf( "%s" , __line_buf__ ) ;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ( opt < NO_FORMAT )
|
||||
{
|
||||
if ( __dest_file__ != NULL )
|
||||
{
|
||||
fprintf( __dest_file__ , "%s\n", ( is_landmark? __landmark_record_end_tag__[ opt ] : __record_end_tag__[ opt ]));
|
||||
}
|
||||
else
|
||||
{
|
||||
printf( "%s\n" , ( is_landmark? __landmark_record_end_tag__[ opt ] : __record_end_tag__[ opt ] ) );
|
||||
}
|
||||
}
|
||||
if ( __dest_file__ != NULL )
|
||||
{
|
||||
fflush( __dest_file__ ) ;
|
||||
}
|
||||
else
|
||||
{
|
||||
fflush( stdout ) ;
|
||||
}
|
||||
}
|
||||
|
||||
//#endif
|
||||
|
||||
/*-----------------------------------------
|
||||
export.c (_get_standard_)
|
||||
-- called by export.c (_copy_standard_)
|
||||
-- calls _get_definition_text_ , find_def_type
|
||||
uses MACRO BLANK_STRING
|
||||
-------------------------------------------*/
|
||||
static char *_get_standard_(STAND_PARAM *__stand_param__ ,int lex_pos, int output_sym)
|
||||
{
|
||||
char *__selected_standardization__ ;
|
||||
DEF *__best_DEF__ = __stand_param__->best_defs[lex_pos] ;
|
||||
if ((output_sym == STREET) && (find_def_type(__best_DEF__,__ord_list__)) && (__best_DEF__->Type == WORD))
|
||||
{
|
||||
/*-- <remarks> If the best definition is a streetname typed as a word, but also
|
||||
including an ordinal type, then substitute the ordinal
|
||||
standardization - however, the lexicon should take care of most
|
||||
cases of this. </remarks> --*/
|
||||
|
||||
DEF *__scan_DEF__ ;
|
||||
|
||||
for (__scan_DEF__ = __stand_param__->lex_vector[lex_pos].DefList;__scan_DEF__ != NULL;__scan_DEF__ = __scan_DEF__->Next)
|
||||
{
|
||||
if (__scan_DEF__->Type == ORD)
|
||||
{
|
||||
if ((__selected_standardization__ = __scan_DEF__->Standard) != NULL)
|
||||
{
|
||||
return (__selected_standardization__) ;
|
||||
}
|
||||
break ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*-- If it is in the lexicon, use the standardization there, otherwise
|
||||
use the form that emerged from tokenization --*/
|
||||
|
||||
__selected_standardization__ = _get_definition_text_(__stand_param__,lex_pos) ;
|
||||
if ((output_sym == HOUSE) && (*__selected_standardization__ == '0'))
|
||||
{
|
||||
/*-- Remove leading zeroes to simplify match comparisons
|
||||
on the house number that use strings rather than integers -
|
||||
we won't do this on zip codes. There may arise some need to
|
||||
do it for unit and box numbers in the future. --*/
|
||||
char *__zero_pointer__ ;
|
||||
char *__buffer_pointer__ = __zero_pointer__ = __selected_standardization__ ;
|
||||
while ( *__zero_pointer__ == '0' ) __zero_pointer__++ ; /*-- Move to first nonzero character --*/
|
||||
while ( *__zero_pointer__ != SENTINEL ) *__buffer_pointer__++ = *__zero_pointer__++ ; /*-- Move down in buffer --*/
|
||||
/*-- Trim down all-zeroes to a single zero: if deleting all
|
||||
the zeros leaves an empty buffer, put a zero back --*/
|
||||
if ( __buffer_pointer__ == __selected_standardization__ ) *__buffer_pointer__++ = '0' ;
|
||||
BLANK_STRING( __buffer_pointer__ ) ;
|
||||
}
|
||||
return ( __selected_standardization__ ) ;
|
||||
}
|
||||
|
||||
/*-----------------------------------------
|
||||
export.c (_scan_target_ )
|
||||
-- calls export.c (_copy_standard_)
|
||||
-- called by export.c (stuff_fields)
|
||||
-------------------------------------------*/
|
||||
static void _scan_target_(STAND_PARAM *__stand_param__,SYMB sym , int dest)
|
||||
{
|
||||
int i ;
|
||||
|
||||
int n = __stand_param__->LexNum ;
|
||||
SYMB *__output_syms__ = __stand_param__->best_output ;
|
||||
/*-- <remarks> Probe the array of output symbols in the best output and find
|
||||
the position of a matching symbol and send it to be copied to
|
||||
the output string fields. The order of the words in each field
|
||||
will therefore follow the order that they appear in the input </remarks> --*/
|
||||
for (i = FIRST_LEX_POS;i < n;i++)
|
||||
{
|
||||
if (__output_syms__[i] == sym)
|
||||
{
|
||||
_copy_standard_(__stand_param__,sym,dest,i) ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*-----------------------------------------
|
||||
export.c (_copy_standard_)
|
||||
-- called by export.c (_scan_target_) --
|
||||
--calls export.c (_get_standard_,
|
||||
strlen, strcpy
|
||||
uses macro SPACE_APPEND_WITH_LEN
|
||||
-------------------------------------------*/
|
||||
static void _copy_standard_( STAND_PARAM *__stand_param__ , SYMB output_sym , int fld , int lex_pos )
|
||||
{
|
||||
|
||||
/*-- Retrieve the standardized string --*/
|
||||
char *__stan_str__ = _get_standard_( __stand_param__ , lex_pos , output_sym ) ;
|
||||
char *__dest_buf__ = __stand_param__->standard_fields[fld] ;
|
||||
if (( strlen( __stan_str__ ) + strlen( __dest_buf__ )) > MAXFLDLEN )
|
||||
{
|
||||
/*-- Truncate without warning --*/
|
||||
return ;
|
||||
}
|
||||
if ( *__dest_buf__ != SENTINEL )
|
||||
{
|
||||
SPACE_APPEND_WITH_LEN( __dest_buf__ , __stan_str__ , MAXFLDLEN ) ;
|
||||
}
|
||||
else if ( output_sym == UNITT )
|
||||
{
|
||||
/*-- If the unit id type is missing, one needs to be provided.
|
||||
This might result in a mismatch, when the type is implicit
|
||||
in one of the compared addresses, and explicit in the
|
||||
other. Not much you can do with implicit. Better a generic
|
||||
identifier than nothing at all --*/
|
||||
|
||||
strcpy( __dest_buf__ , "# " ) ; /* -- reconsider this -- */
|
||||
append_string_to_max( __dest_buf__ , __stan_str__ , MAXFLDLEN ) ;
|
||||
}
|
||||
else if ( output_sym == BOXT )
|
||||
{
|
||||
strcpy( __dest_buf__, "BOX " ) ;
|
||||
append_string_to_max( __dest_buf__ , __stan_str__ ,MAXFLDLEN ) ;
|
||||
}
|
||||
else
|
||||
{
|
||||
strcpy( __dest_buf__ , __stan_str__ ) ;
|
||||
}
|
||||
}
|
||||
|
940
extras/address_standardizer/gamma.c
Normal file
940
extras/address_standardizer/gamma.c
Normal file
|
@ -0,0 +1,940 @@
|
|||
/* -- gamma.c
|
||||
|
||||
This file reads the rules file into memory and sets up the rule
|
||||
lookup structures. These are based on the optimized Aho-Corasick
|
||||
algorithms in Watson (1994).
|
||||
|
||||
Copyright (c) 2008 Walter Bruce Sinclair
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*/
|
||||
/* For pagc-0.4.0 : last revised 2010-11-01 */
|
||||
|
||||
#undef DEBUG
|
||||
//#define DEBUG
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <stddef.h>
|
||||
#include "pagc_api.h"
|
||||
#include "gamma.h"
|
||||
|
||||
#ifdef BUILD_API
|
||||
#include "pagc_std_api.h"
|
||||
RULES *rules_init( ERR_PARAM *err_p ) ;
|
||||
#endif
|
||||
|
||||
/* -- local prototypes -- */
|
||||
static int initialize_link( ERR_PARAM *, KW *** , NODE ) ;
|
||||
static void classify_link( RULE_PARAM * , KW ***, KW *, NODE , SYMB , SYMB ) ;
|
||||
static void add_failure_linkage( KW ***, NODE , NODE ) ;
|
||||
static NODE **precompute_gamma_function( ERR_PARAM *, NODE ** , KW ***, NODE ) ;
|
||||
|
||||
static double load_value[ NUMBER_OF_WEIGHTS ] = {
|
||||
0.00, 0.325, 0.35 , 0.375 , 0.4 ,
|
||||
0.475 , 0.55, 0.6 , 0.65 , 0.675 ,
|
||||
0.7 , 0.75 , 0.8 , 0.825 , 0.85 ,
|
||||
0.9 , 0.95 , 1.00 } ;
|
||||
|
||||
/*---------------------------------------------------------------------------
|
||||
gamma.c (refresh_transducer)
|
||||
called by analyze.c (prepare_target_pattern)
|
||||
The registry of matching keywords is regenerated with the use of the
|
||||
precomputed Gamma function, Output Links and the current target.
|
||||
----------------------------------------------------------------------------*/
|
||||
void refresh_transducer( NODE *r ,
|
||||
SYMB *S ,
|
||||
NODE **gamma_function ) {
|
||||
NODE q ;
|
||||
int i ;
|
||||
|
||||
i = 0 ;
|
||||
q = r[ i ] = EPSILON ;
|
||||
while ( S[ i ] != FAIL ) {
|
||||
q = gamma_function[ q ][ S[ i ] ] ;
|
||||
i++ ;
|
||||
r[ i ] = q ;
|
||||
}
|
||||
}
|
||||
|
||||
/*---------------------------------------------------------------------------
|
||||
gamma.c (is_input_symbol)
|
||||
called by gamma.c (create_rules)
|
||||
----------------------------------------------------------------------------*/
|
||||
int is_input_symbol( SYMB sym ) {
|
||||
|
||||
if ( sym > MAXINSYM ||
|
||||
sym < 0 )
|
||||
return FALSE ;
|
||||
return TRUE ;
|
||||
}
|
||||
|
||||
/*---------------------------------------------------------------------------
|
||||
gamma.c (is_output_symbol)
|
||||
called by gamma.c (create_rules)
|
||||
----------------------------------------------------------------------------*/
|
||||
int is_output_symbol( SYMB sym ) {
|
||||
if ( sym > MAXOUTSYM ||
|
||||
sym < 0 )
|
||||
return FALSE ;
|
||||
return TRUE ;
|
||||
}
|
||||
|
||||
#ifdef BUILD_API
|
||||
|
||||
/*
|
||||
typedef struct RULES_s {
|
||||
int ready;
|
||||
int rule_number;
|
||||
int last_node;
|
||||
RULE_PARAM *r_p;
|
||||
ERR_PARAM *err_p;
|
||||
NODE **Trie;
|
||||
SYMB *rule_end ;
|
||||
SYMB *r ;
|
||||
} RULES;
|
||||
*/
|
||||
|
||||
/*---------------------------------------------------------------------------
|
||||
gamma.c (rules_init)
|
||||
api interface to replace (create_rules)
|
||||
---------------------------------------------------------------------------*/
|
||||
RULES *rules_init( ERR_PARAM *err_p ) {
|
||||
RULES *rules;
|
||||
/* -- returns size of Gamma Function Matrix -- */
|
||||
SYMB a ;
|
||||
KW *k_s ;
|
||||
KW ***o_l ;
|
||||
NODE **Trie ;
|
||||
SYMB *r_s ;
|
||||
RULE_PARAM *r_p ;
|
||||
|
||||
|
||||
PAGC_CALLOC_STRUC(rules,RULES,1,err_p,NULL);
|
||||
rules->err_p = err_p;
|
||||
rules->ready = 0;
|
||||
rules->rule_number = 0;
|
||||
rules->last_node = EPSILON;
|
||||
|
||||
PAGC_ALLOC_STRUC(r_p,RULE_PARAM,err_p,NULL) ;
|
||||
rules->r_p = r_p;
|
||||
|
||||
/* -- initialize the statistics record -- */
|
||||
r_p -> collect_statistics = FALSE ;
|
||||
r_p -> total_best_keys = 0 ;
|
||||
r_p -> total_key_hits = 0 ;
|
||||
|
||||
/* -- storage for input and output records -- */
|
||||
PAGC_CALLOC_STRUC(r_s,SYMB,RULESPACESIZE,err_p,NULL);
|
||||
|
||||
/* -- storage for temporary trie for rules -- */
|
||||
PAGC_CALLOC_STRUC(Trie,NODE *,MAXNODES,err_p,NULL);
|
||||
|
||||
/* -- initialize the first( EPSILON ) node of the trie -- */
|
||||
PAGC_CALLOC_STRUC(Trie[EPSILON],NODE,MAXINSYM,err_p,NULL);
|
||||
|
||||
for ( a = 0 ;
|
||||
a < MAXINSYM ;
|
||||
a++ ) {
|
||||
Trie[ EPSILON ][ a ] = FAIL ;
|
||||
}
|
||||
|
||||
/* -- storage for global output_link -- */
|
||||
PAGC_CALLOC_STRUC(o_l,KW **,MAXNODES,err_p,NULL);
|
||||
PAGC_CALLOC_STRUC(k_s,KW,MAXRULES,err_p,NULL);
|
||||
|
||||
if ( !initialize_link( err_p ,
|
||||
o_l ,
|
||||
EPSILON ) ) {
|
||||
return NULL ;
|
||||
}
|
||||
|
||||
rules -> r_p -> rule_space = r_s ;
|
||||
rules -> r_p -> key_space = k_s ;
|
||||
rules -> r_p -> output_link = o_l ;
|
||||
|
||||
rules -> Trie = Trie ;
|
||||
rules -> rule_end = r_s + RULESPACESIZE ;
|
||||
|
||||
rules -> r = r_s ;
|
||||
|
||||
return rules;
|
||||
}
|
||||
|
||||
|
||||
int rules_add_rule(RULES *rules, int num, int *rule) {
|
||||
int i ,
|
||||
w ;
|
||||
SYMB a ,
|
||||
t ;
|
||||
SYMB *rule_start ,
|
||||
*r ,
|
||||
*r_s ;
|
||||
NODE u ;
|
||||
NODE **Trie ;
|
||||
KW *keyw ,
|
||||
*k_s ;
|
||||
KW ***o_l ;
|
||||
|
||||
if ( !rules ) return 1; /* error rules obj not initialized */
|
||||
if ( !rules -> r_p ) return 2; /* RULE_PARAM not allocated */
|
||||
if ( rules -> ready ) return 3; /* rules have already be readied */
|
||||
if ( rules -> rule_number >= MAXRULES ) {
|
||||
RET_ERR( "rules_add_rule: Too many rules are being added.",
|
||||
rules -> err_p, 4);
|
||||
}
|
||||
|
||||
/* get local copies of stuff saved in RULES */
|
||||
o_l = rules -> r_p -> output_link ;
|
||||
k_s = rules -> r_p -> key_space ;
|
||||
r_s = rules -> r_p -> rule_space ;
|
||||
Trie = rules -> Trie ;
|
||||
r = rules -> r ;
|
||||
|
||||
keyw = k_s + rules -> rule_number ;
|
||||
MEM_ERR(keyw, rules -> err_p, 5);
|
||||
|
||||
u = EPSILON ;
|
||||
rule_start = r ; /* save rule start for inclusion in the record */
|
||||
if ( rule_start > rules -> rule_end ) {
|
||||
RET_ERR( "rules_add_rule: Too many rules for allocated memory.",
|
||||
rules -> err_p, 5);
|
||||
}
|
||||
|
||||
for (i=0; ; i++, r++ ) {
|
||||
if (i >= num) {
|
||||
RET_ERR( "rules_add_rule: invalid rule structure.",
|
||||
rules -> err_p, 6);
|
||||
}
|
||||
|
||||
*r = rule[i] ;
|
||||
/* -- a fail at the beginning of a field indicates end of record
|
||||
unless it's at the beginning of the record, in which case
|
||||
it's the end of file -- */
|
||||
if ( *r == FAIL ) {
|
||||
if ( i == 0 ) return 0;
|
||||
break;
|
||||
}
|
||||
|
||||
/* -- check the input -- */
|
||||
if ( !is_input_symbol( *r ) ) {
|
||||
RET_ERR2( "rules_add_rule: Bad Input Token %d at rule %d",
|
||||
*r,
|
||||
rules -> rule_number ,
|
||||
rules -> err_p,
|
||||
7 ) ;
|
||||
}
|
||||
|
||||
/* -- build the trie structure -- */
|
||||
if ( Trie[ u ][ *r ] == FAIL ) {
|
||||
if ( ++rules -> last_node >= MAXNODES ) {
|
||||
RET_ERR( "rules_add_rule: Too many nodes in gamma function",
|
||||
rules -> err_p,
|
||||
8 ) ;
|
||||
}
|
||||
Trie[ u ][ *r ] = rules -> last_node ;
|
||||
PAGC_CALLOC_STRUC(Trie[rules -> last_node],NODE,MAXINSYM,rules -> err_p,9) ;
|
||||
for ( a = 0 ;
|
||||
a < MAXINSYM ;
|
||||
a++ ) {
|
||||
Trie[ rules -> last_node ][ a ] = FAIL ;
|
||||
}
|
||||
if ( !initialize_link( rules -> err_p ,
|
||||
o_l ,
|
||||
rules -> last_node ) ) {
|
||||
return 10;
|
||||
}
|
||||
}
|
||||
u = Trie[ u ][ *r ] ;
|
||||
} /* end of for loop */
|
||||
|
||||
keyw -> Input = rule_start ;
|
||||
if ( ( keyw -> Length = i ) == 0 ) {
|
||||
RET_ERR1( "rules_add_rule: Error 0 length rule #%d",
|
||||
rules -> rule_number,
|
||||
rules -> err_p,
|
||||
11 ) ;
|
||||
}
|
||||
|
||||
/* -- read the output tokens into the rule_space -- */
|
||||
r++ ; /* -- move to beginning of the output tokens -- */
|
||||
rule_start = r ; /* -- remember the beginning -- */
|
||||
while ( TRUE ) {
|
||||
i++;
|
||||
if ( i >= num ) {
|
||||
RET_ERR( "rules_add_rule: invalid rule structure.",
|
||||
rules -> err_p, 6);
|
||||
}
|
||||
*r = rule[i] ;
|
||||
if ( *r == FAIL ) break;
|
||||
if ( !is_output_symbol( *r ) ) {
|
||||
RET_ERR2( "rules_add_rule: Rule File: Non-Token %d in Rule #%d\n",
|
||||
*r ,
|
||||
rules -> rule_number,
|
||||
rules -> err_p,
|
||||
7 ) ;
|
||||
}
|
||||
r++ ;
|
||||
}
|
||||
keyw -> Output = rule_start ;
|
||||
|
||||
/* -- classify the output -- */
|
||||
i++ ;
|
||||
t = rule[i] ;
|
||||
i++ ;
|
||||
w = rule[i] ;
|
||||
|
||||
classify_link( rules -> r_p ,
|
||||
o_l ,
|
||||
keyw ,
|
||||
u ,
|
||||
w ,
|
||||
t ) ;
|
||||
|
||||
rules -> rule_number++ ;
|
||||
rules -> r = ++r ; ;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int rules_ready(RULES *rules) {
|
||||
SYMB a;
|
||||
|
||||
if (!rules) return 1; /* error rules obj not initialized */
|
||||
if (!rules->r_p) return 2; /* RULE_PARAM not allocated */
|
||||
if (rules->ready) return 3; /* rules have already be readied */
|
||||
|
||||
rules -> r_p -> rules_read = rules->rule_number ;
|
||||
|
||||
if ( ++rules -> last_node >= MAXNODES ) {
|
||||
RET_ERR( "rules_ready: Too many nodes in gamma function" ,
|
||||
rules -> err_p, 4) ;
|
||||
}
|
||||
|
||||
/* -- change the EPSILON node transitions in preparation for Gamma -- */
|
||||
for ( a = 0 ;
|
||||
a < MAXINSYM ;
|
||||
a++ ) {
|
||||
if ( rules -> Trie[ EPSILON ][ a ] == FAIL ) {
|
||||
rules -> Trie[ EPSILON ][ a ] = EPSILON ;
|
||||
}
|
||||
}
|
||||
|
||||
/* -- create the global Gamma function matrix -- */
|
||||
if ( ( rules -> r_p -> gamma_matrix =
|
||||
precompute_gamma_function( rules -> err_p,
|
||||
rules -> Trie ,
|
||||
rules -> r_p -> output_link ,
|
||||
rules -> last_node ) ) == NULL ) {
|
||||
return 5 ;
|
||||
}
|
||||
|
||||
/* -- no longer need the Trie -- */
|
||||
PAGC_DESTROY_2D_ARRAY(rules -> Trie,NODE,rules -> last_node) ;
|
||||
rules -> Trie = NULL ;
|
||||
|
||||
rules -> r_p -> num_nodes = rules -> last_node ;
|
||||
|
||||
/*
|
||||
if ( glo_p -> log_init ) {
|
||||
CLIENT_ERR( err_p ) ;
|
||||
LOG_MESS2( "create_rules: Rules installed with %d nodes and %d rules",
|
||||
rules -> last_node ,
|
||||
rules->rule_number ,
|
||||
err_p ) ;
|
||||
}
|
||||
*/
|
||||
|
||||
rules -> ready = 1 ;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void rules_free(RULES *rules) {
|
||||
|
||||
if (!rules) return;
|
||||
if (rules->r_p) destroy_rules(rules->r_p);
|
||||
free(rules);
|
||||
rules = NULL;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/*---------------------------------------------------------------------------
|
||||
gamma.c (create_rules)
|
||||
called by standard.l (init_stand_process)
|
||||
calls util.c (open_aux_file)
|
||||
calls gamma.c (initialize_link, is_input_symbol, is_output_symbol,
|
||||
classify_link,precompute_gamma_function)
|
||||
----------------------------------------------------------------------------*/
|
||||
RULE_PARAM *create_rules( const char *rule_name ,
|
||||
PAGC_GLOBAL *glo_p ) {
|
||||
/* -- returns size of Gamma Function Matrix -- */
|
||||
SYMB a ,
|
||||
t ;
|
||||
NODE u ;
|
||||
int i ,
|
||||
w ;
|
||||
int is_eof = FALSE ;
|
||||
int rule_number = 0 ;
|
||||
int last_node = EPSILON ;
|
||||
FILE *rule_file ;
|
||||
SYMB *rule_start ,
|
||||
*rule_end ,
|
||||
*r ;
|
||||
KW *keyw , *k_s ;
|
||||
KW ***o_l ;
|
||||
NODE **Trie ;
|
||||
SYMB *r_s ;
|
||||
RULE_PARAM *r_p ;
|
||||
ERR_PARAM *err_p ;
|
||||
|
||||
err_p = glo_p -> process_errors ;
|
||||
|
||||
PAGC_ALLOC_STRUC(r_p,RULE_PARAM,err_p,NULL) ;
|
||||
|
||||
/* -- initialize the statistics record -- */
|
||||
r_p -> collect_statistics = FALSE ;
|
||||
r_p -> total_best_keys = 0 ;
|
||||
r_p -> total_key_hits = 0 ;
|
||||
|
||||
|
||||
/* -- open the rule file, if possible -- */
|
||||
if ( ( rule_file = open_aux_file( glo_p ,
|
||||
rule_name ) ) == NULL ) {
|
||||
return NULL ;
|
||||
}
|
||||
/* -- rule file has the format of i i ... i -1 o o ... o -1 t f -- */
|
||||
|
||||
|
||||
/* -- storage for input and output records -- */
|
||||
PAGC_CALLOC_STRUC(r_s,SYMB,RULESPACESIZE,err_p,NULL);
|
||||
|
||||
/* -- storage for temporary trie for rules -- */
|
||||
PAGC_CALLOC_STRUC(Trie,NODE *,MAXNODES,err_p,NULL);
|
||||
|
||||
/* -- initialize the first( EPSILON ) node of the trie -- */
|
||||
PAGC_CALLOC_STRUC(Trie[EPSILON],NODE,MAXINSYM,err_p,NULL);
|
||||
|
||||
for ( a = 0 ;
|
||||
a < MAXINSYM ;
|
||||
a++ ) {
|
||||
Trie[ EPSILON ][ a ] = FAIL ;
|
||||
}
|
||||
|
||||
/* -- storage for global output_link -- */
|
||||
PAGC_CALLOC_STRUC(o_l,KW **,MAXNODES,err_p,NULL);
|
||||
PAGC_CALLOC_STRUC(k_s,KW,MAXRULES,err_p,NULL);
|
||||
|
||||
rule_end = r_s + RULESPACESIZE ;
|
||||
if ( !initialize_link( err_p ,
|
||||
o_l ,
|
||||
EPSILON ) ) {
|
||||
return NULL ;
|
||||
}
|
||||
for ( r = r_s ;
|
||||
!feof( rule_file ) ;
|
||||
r++, rule_number++ ) {
|
||||
if ( rule_number >= MAXRULES ) {
|
||||
CLIENT_ERR( err_p ) ;
|
||||
RET_ERR( "create_rules: Too many rules in file",
|
||||
err_p,
|
||||
NULL) ;
|
||||
}
|
||||
keyw = k_s + rule_number ;
|
||||
MEM_ERR(keyw,err_p,NULL);
|
||||
/* -- get input record -- */
|
||||
|
||||
u = EPSILON ;
|
||||
rule_start = r ; /* -- save rule start for inclusion in record -- */
|
||||
if ( rule_start > rule_end ) {
|
||||
RET_ERR( "create_rules: Too many rules for allocated memory",
|
||||
err_p,
|
||||
NULL ) ;
|
||||
}
|
||||
for ( i = 0 ;
|
||||
;
|
||||
i++, r++ ) {
|
||||
|
||||
/* -- read the first integer -- */
|
||||
fscanf( rule_file,
|
||||
"%d",
|
||||
r ) ;
|
||||
/* -- a fail at the beginning of a field indicates end of record
|
||||
unless it's at the beginning of the record, in which case
|
||||
it's the end of file -- */
|
||||
if ( *r == FAIL ) {
|
||||
if ( i == 0 ) {
|
||||
is_eof = TRUE ;
|
||||
}
|
||||
break ;
|
||||
}
|
||||
/* -- check the input -- */
|
||||
if ( !is_input_symbol( *r ) ) {
|
||||
CLIENT_ERR( err_p ) ;
|
||||
RET_ERR2( "create_rules: Rule file: Bad Input Token %d at rule %d",
|
||||
*r,
|
||||
rule_number ,
|
||||
err_p,
|
||||
NULL ) ;
|
||||
}
|
||||
|
||||
/* -- build the trie structure -- */
|
||||
if ( Trie[ u ][ *r ] == FAIL ) {
|
||||
if ( ++last_node >= MAXNODES ) {
|
||||
RET_ERR( "create_rules: Too many nodes in gamma function",
|
||||
err_p,
|
||||
NULL ) ;
|
||||
}
|
||||
Trie[ u ][ *r ] = last_node ;
|
||||
PAGC_CALLOC_STRUC(Trie[last_node],NODE,MAXINSYM,err_p,NULL) ;
|
||||
for ( a = 0 ;
|
||||
a < MAXINSYM ;
|
||||
a++ ) {
|
||||
Trie[ last_node ][ a ] = FAIL ;
|
||||
}
|
||||
if ( !initialize_link( err_p ,
|
||||
o_l ,
|
||||
last_node ) ) {
|
||||
return NULL ;
|
||||
}
|
||||
}
|
||||
u = Trie[ u ][ *r ] ;
|
||||
}
|
||||
if ( is_eof )
|
||||
break ;
|
||||
keyw -> Input = rule_start ;
|
||||
if ( ( keyw -> Length = i ) == 0 ) {
|
||||
CLIENT_ERR( err_p ) ;
|
||||
RET_ERR1( "create_rules: Error Rule File: 0 length rule #%d",
|
||||
rule_number,
|
||||
err_p,
|
||||
NULL ) ;
|
||||
}
|
||||
|
||||
/* -- read the output tokens into the rule_space -- */
|
||||
r++ ; /* -- move to beginning of the output tokens -- */
|
||||
rule_start = r ; /* -- remember the beginning -- */
|
||||
while ( TRUE ) {
|
||||
fscanf( rule_file,
|
||||
"%d",
|
||||
r ) ;
|
||||
if ( *r == FAIL )
|
||||
break ;
|
||||
if ( !is_output_symbol( *r ) ) {
|
||||
RET_ERR2( "create_rules: Rule File: Non-Token %d in Rule #%d\n",
|
||||
*r ,
|
||||
rule_number,
|
||||
err_p,
|
||||
NULL ) ;
|
||||
}
|
||||
r++ ;
|
||||
}
|
||||
keyw -> Output = rule_start ;
|
||||
|
||||
/* -- classify the output -- */
|
||||
fscanf( rule_file ,
|
||||
"%d" ,
|
||||
&t ) ;
|
||||
fscanf( rule_file ,
|
||||
"%d" ,
|
||||
&w ) ;
|
||||
|
||||
classify_link( r_p ,
|
||||
o_l ,
|
||||
keyw ,
|
||||
u ,
|
||||
w ,
|
||||
t ) ;
|
||||
} /* -- end of file read -- */
|
||||
|
||||
|
||||
r_p -> rule_space = r_s ;
|
||||
r_p -> key_space = k_s ;
|
||||
r_p -> output_link = o_l ;
|
||||
r_p -> rules_read = rule_number ;
|
||||
|
||||
fclose( rule_file ) ;
|
||||
|
||||
|
||||
if ( ++last_node >= MAXNODES ) {
|
||||
RET_ERR( "create_rules: Too many nodes in gamma function" ,
|
||||
err_p,
|
||||
NULL) ;
|
||||
}
|
||||
/* -- change the EPSILON node transitions in preparation for Gamma -- */
|
||||
for ( a = 0 ;
|
||||
a < MAXINSYM ;
|
||||
a++ ) {
|
||||
if ( Trie[ EPSILON ][ a ] == FAIL ) {
|
||||
Trie[ EPSILON ][ a ] = EPSILON ;
|
||||
}
|
||||
}
|
||||
|
||||
/* -- create the global Gamma function matrix -- */
|
||||
if ( ( r_p -> gamma_matrix = precompute_gamma_function( err_p,
|
||||
Trie ,
|
||||
o_l ,
|
||||
last_node ) ) == NULL ) {
|
||||
return NULL ;
|
||||
}
|
||||
|
||||
/* -- no longer need the Trie -- */
|
||||
PAGC_DESTROY_2D_ARRAY(Trie,NODE,last_node) ;
|
||||
|
||||
|
||||
r_p -> num_nodes = last_node ;
|
||||
|
||||
if ( glo_p -> log_init ) {
|
||||
CLIENT_ERR( err_p ) ;
|
||||
LOG_MESS2( "create_rules: Rules installed with %d nodes and %d rules",
|
||||
last_node ,
|
||||
rule_number ,
|
||||
err_p ) ;
|
||||
}
|
||||
|
||||
return r_p ;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*---------------------------------------------------------------------------
|
||||
gamma.c (destroy_rules)
|
||||
----------------------------------------------------------------------------*/
|
||||
void destroy_rules( RULE_PARAM * r_p ) {
|
||||
if ( r_p != NULL ) {
|
||||
DBG("destroy_rules 1");
|
||||
FREE_AND_NULL( r_p -> rule_space ) ;
|
||||
DBG("destroy_rules 2");
|
||||
FREE_AND_NULL( r_p -> key_space ) ;
|
||||
DBG("destroy_rules 3");
|
||||
PAGC_DESTROY_2D_ARRAY(r_p->output_link,KW*,r_p->num_nodes) ;
|
||||
DBG("destroy_rules 4");
|
||||
PAGC_DESTROY_2D_ARRAY(r_p->gamma_matrix,NODE,r_p->num_nodes) ;
|
||||
DBG(" destroy_rules 5");
|
||||
FREE_AND_NULL( r_p ) ;
|
||||
}
|
||||
}
|
||||
|
||||
/* ========================= Output Links ========================= */
|
||||
|
||||
/*---------------------------------------------------------------------------
|
||||
gamma.c (initalize_link)
|
||||
called by gamma.c (create_rules)
|
||||
----------------------------------------------------------------------------*/
|
||||
static int initialize_link( ERR_PARAM *err_p ,
|
||||
KW ***o_l ,
|
||||
NODE u ) {
|
||||
int cl ;
|
||||
|
||||
/* -- classification by clause type -- */
|
||||
|
||||
PAGC_CALLOC_STRUC(o_l[u],KW *,MAX_CL,err_p,FALSE);
|
||||
for ( cl = 0 ;
|
||||
cl < MAX_CL ;
|
||||
cl++ ) {
|
||||
|
||||
o_l[ u ][ cl ] = NULL ;
|
||||
}
|
||||
return TRUE ;
|
||||
}
|
||||
|
||||
/*---------------------------------------------------------------------------
|
||||
gamma.c (classify_link)
|
||||
called by gamma.c (create_rules)
|
||||
----------------------------------------------------------------------------*/
|
||||
static void classify_link( RULE_PARAM *r_p ,
|
||||
KW ***o_l , /* -- 2006-11-02 : arg -- */
|
||||
KW *k ,
|
||||
NODE u ,
|
||||
SYMB w ,
|
||||
SYMB c ) {
|
||||
|
||||
/* -- classification by clause type -- */
|
||||
KW * last_key ,
|
||||
* penult ;
|
||||
|
||||
k -> hits = 0 ;
|
||||
k -> best = 0 ;
|
||||
k -> Type = c ;
|
||||
k -> Weight = w ;
|
||||
last_key = o_l[ u ][ c ] ; /* -- 2006-11-02 : arg -- */
|
||||
if ( last_key == NULL ) {
|
||||
o_l[ u ][ c ] = k ; /* -- 2006-11-02 : arg -- */
|
||||
|
||||
} else {
|
||||
/* -- if the same input symbols are used... -- */
|
||||
while ( ( penult = last_key -> OutputNext ) != NULL )
|
||||
last_key = penult ;
|
||||
last_key -> OutputNext = k ;
|
||||
}
|
||||
/* -- initialize in anticipation of failure extensions -- */
|
||||
k -> OutputNext = NULL ;
|
||||
|
||||
}
|
||||
|
||||
/*---------------------------------------------------------------------------
|
||||
gamma.c (add_failure_linkage)
|
||||
called by gamma.c (precompute_gamma_function)
|
||||
----------------------------------------------------------------------------*/
|
||||
static void add_failure_linkage( KW ***o_l ,
|
||||
NODE x ,
|
||||
NODE u ) {
|
||||
/* -- called by precompute_gamma_function
|
||||
-- x is the node in the failure function of the node u
|
||||
-- classification by clause type -- */
|
||||
KW *k ,
|
||||
*fk ;
|
||||
int cl ;
|
||||
|
||||
for ( cl = 0 ;
|
||||
cl < MAX_CL ;
|
||||
cl++ ) {
|
||||
/* -- append the failure keys for each class to the end of the
|
||||
appropriate chain -- */
|
||||
fk = o_l[ x ][ cl ] ;
|
||||
k = o_l[ u ][ cl ] ;
|
||||
if ( k == NULL ) {
|
||||
o_l[ u ][ cl ] = fk ;
|
||||
} else {
|
||||
/* -- since the chain will be already null-terminated, we only find
|
||||
the end of the chain if fk is non-null -- */
|
||||
if ( fk != NULL ) {
|
||||
/* -- append to the end of the list and make sure that the longer
|
||||
lengths go first - this is probably redundant. -- */
|
||||
while ( k -> OutputNext != NULL ) {
|
||||
k = k -> OutputNext ;
|
||||
}
|
||||
k -> OutputNext = fk ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*---------------------------------------------------------------------------
|
||||
gamma.c (precompute_gamma_function)
|
||||
called by gamma.c (create_rules)
|
||||
calls gamma.c (add_failure_linkage)
|
||||
----------------------------------------------------------------------------*/
|
||||
static NODE **precompute_gamma_function( ERR_PARAM *err_p ,
|
||||
NODE **Trie ,
|
||||
KW ***o_l ,
|
||||
NODE n ) {
|
||||
NODE u ,
|
||||
ua ,
|
||||
x ;
|
||||
SYMB a ;
|
||||
int i ,
|
||||
j ;
|
||||
NODE **Gamma ;
|
||||
NODE *Failure ,
|
||||
*Queue ;
|
||||
|
||||
/* -- Storage for Failure Function -- */
|
||||
PAGC_CALLOC_STRUC(Failure,NODE,n,err_p,NULL) ;
|
||||
/* -- Storage for Breadth First Search Queue -- */
|
||||
PAGC_CALLOC_STRUC(Queue,NODE,n,err_p,NULL) ;
|
||||
|
||||
PAGC_CALLOC_2D_ARRAY(Gamma,NODE,n,MAXINSYM,err_p,NULL) ;
|
||||
|
||||
u = EPSILON ;
|
||||
i = 0 ;
|
||||
for ( a = 0 ;
|
||||
a < MAXINSYM ;
|
||||
a++ ) {
|
||||
x = Trie[ EPSILON ][ a ] ;
|
||||
Gamma[ EPSILON ][ a ] = x ;
|
||||
Failure[ x ] = EPSILON ;
|
||||
/* -- add to Queue for breadth-first search -- */
|
||||
if ( x != EPSILON ) {
|
||||
Queue[ i++ ] = x ;
|
||||
}
|
||||
}
|
||||
Queue[ i ] = FAIL ; /* -- terminate the list of nodes to process -- */
|
||||
|
||||
for ( j = 0 ;
|
||||
Queue[ j ] != FAIL ;
|
||||
j++ ) {
|
||||
u = Queue[ j ] ;
|
||||
/* -- get non-Fail transitions from Trie onto queue -- */
|
||||
for ( a = 0 ;
|
||||
a < MAXINSYM ;
|
||||
a++ ) {
|
||||
if ( ( x = Trie[ u ][ a ] ) != FAIL ) {
|
||||
Queue[ i++ ] = x ;
|
||||
}
|
||||
}
|
||||
Queue[ i ] = FAIL ; /* -- mark end of list -- */
|
||||
x = Failure[ u ] ;
|
||||
add_failure_linkage( o_l ,
|
||||
x ,
|
||||
u ) ;
|
||||
for ( a = 0 ;
|
||||
a < MAXINSYM ;
|
||||
a ++ ) {
|
||||
ua = Trie[ u ][ a ] ;
|
||||
if ( ua != FAIL ) {
|
||||
Gamma[ u ][ a ] = ua ;
|
||||
Failure[ ua ] = Gamma[ x ][ a ] ;
|
||||
} else {
|
||||
Gamma[ u ][ a ] = Gamma[ x ][ a ] ;
|
||||
}
|
||||
}
|
||||
}
|
||||
FREE_AND_NULL( Failure ) ;
|
||||
FREE_AND_NULL( Queue ) ;
|
||||
return Gamma ;
|
||||
}
|
||||
|
||||
|
||||
|
||||
static const char *rule_type_names[] = {
|
||||
"MACRO" , "MICRO" , "ARC" , "CIVIC" , "EXTRA"
|
||||
} ;
|
||||
|
||||
/* =========================================
|
||||
gamma.c (output_rule_statistics)
|
||||
uses macro OPEN_ALLOCATED_NAME
|
||||
stdio.h (printf,fprintf,fflush,fclose)
|
||||
===========================================*/
|
||||
#ifdef BUILD_API
|
||||
int output_rule_statistics( RULE_PARAM *r_p, ERR_PARAM *err_p ) {
|
||||
#else
|
||||
int output_rule_statistics( RULE_PARAM *r_p ,
|
||||
ERR_PARAM *err_p ,
|
||||
char *name ,
|
||||
DS_Handle _file_sys_p ) {
|
||||
#endif
|
||||
int i ,
|
||||
found_count ,
|
||||
n ;
|
||||
SYMB *OL ;
|
||||
char *sts_name = NULL ;
|
||||
FILE *sts_file = NULL ;
|
||||
KW * k ;
|
||||
KW * k_s ;
|
||||
double hit_frequency ,
|
||||
best_frequency ;
|
||||
|
||||
if ( !r_p -> collect_statistics ) {
|
||||
printf( "Statistics were not collected\n" ) ;
|
||||
return FALSE ;
|
||||
}
|
||||
|
||||
#ifndef BUILD_API
|
||||
if ( name != NULL && name[ 0 ] != SENTINEL ) {
|
||||
OPEN_ALLOCATED_NAME(sts_name,"sts",sts_file,name,"wb+",_file_sys_p,err_p,FALSE) ;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* -- cycle through the keys -- */
|
||||
n = r_p -> rules_read ;
|
||||
k_s = r_p -> key_space ;
|
||||
for ( i = 0 , found_count = 0 ;
|
||||
i < n ;
|
||||
i++ ) {
|
||||
k = k_s + i ;
|
||||
if ( k -> hits == 0 ) {
|
||||
continue ;
|
||||
}
|
||||
|
||||
found_count++ ;
|
||||
if ( sts_file == NULL ) {
|
||||
printf( "\nRule %d is of type %d (%s)\n: " ,
|
||||
i ,
|
||||
k -> Type ,
|
||||
rule_type_names[ k -> Type ] ) ;
|
||||
printf( "Input : " ) ;
|
||||
} else {
|
||||
fprintf( sts_file ,
|
||||
"\nRule %d is of type %d (%s)\n: " ,
|
||||
i ,
|
||||
k -> Type ,
|
||||
rule_type_names[ k -> Type ] ) ;
|
||||
fprintf( sts_file ,
|
||||
"Input : " ) ;
|
||||
}
|
||||
for ( OL = k -> Input ;
|
||||
*OL != FAIL ;
|
||||
OL++ ) {
|
||||
if ( sts_file == NULL ) {
|
||||
printf( "|%d (%s)|" ,
|
||||
*OL ,
|
||||
in_symb_name( *OL ) ) ;
|
||||
} else {
|
||||
fprintf( sts_file ,
|
||||
"|%d (%s)|" ,
|
||||
*OL ,
|
||||
in_symb_name( *OL ) ) ;
|
||||
}
|
||||
}
|
||||
if ( sts_file == NULL ) {
|
||||
printf( "\nOutput: " ) ;
|
||||
|
||||
} else {
|
||||
fprintf( sts_file ,
|
||||
"\nOutput: " ) ;
|
||||
}
|
||||
/* -- output the output symbols -- */
|
||||
for ( OL = k -> Output ;
|
||||
*OL != FAIL ;
|
||||
OL++ ) {
|
||||
if ( sts_file == NULL ) {
|
||||
printf( "|%d (%s)|" ,
|
||||
*OL ,
|
||||
out_symb_name( *OL ) ) ;
|
||||
} else {
|
||||
fprintf( sts_file ,
|
||||
"|%d (%s)|" ,
|
||||
*OL ,
|
||||
out_symb_name( *OL ) ) ;
|
||||
}
|
||||
}
|
||||
if ( sts_file == NULL ) {
|
||||
printf ( "\nrank %d ( %f): hits %d out of %d\n" ,
|
||||
k -> Weight ,
|
||||
load_value[ k -> Weight ] ,
|
||||
k->hits,
|
||||
r_p -> total_key_hits ) ;
|
||||
} else {
|
||||
hit_frequency = ( ( double ) k -> hits ) / ( ( double ) r_p -> total_key_hits ) ;
|
||||
best_frequency = ( ( double ) k -> best ) / ( ( double ) r_p -> total_best_keys ) ;
|
||||
fprintf( sts_file ,
|
||||
"\nrank %d ( %f): hit frequency: %f, best frequency: %f" ,
|
||||
k -> Weight ,
|
||||
load_value[ k -> Weight ] ,
|
||||
hit_frequency ,
|
||||
best_frequency ) ;
|
||||
fprintf ( sts_file ,
|
||||
"\n%d hits out of %d, best %d out of %d\n" ,
|
||||
k->hits, r_p -> total_key_hits, k-> best, r_p -> total_best_keys ) ;
|
||||
}
|
||||
k -> hits = 0 ;
|
||||
k -> best = 0 ;
|
||||
}
|
||||
if ( sts_file == NULL ) {
|
||||
printf( "Found %d rules hit\n" , found_count ) ;
|
||||
} else {
|
||||
fprintf( sts_file ,
|
||||
"Found %d rules hit\n" ,
|
||||
found_count ) ;
|
||||
}
|
||||
/* -- start over -- */
|
||||
r_p -> total_key_hits = 0 ;
|
||||
r_p -> total_best_keys = 0 ;
|
||||
if ( sts_file != NULL ) {
|
||||
fflush( sts_file ) ;
|
||||
fclose( sts_file ) ;
|
||||
FREE_AND_NULL( sts_name ) ;
|
||||
} else {
|
||||
fflush( stdout ) ;
|
||||
}
|
||||
return TRUE ;
|
||||
}
|
||||
|
3
extras/address_standardizer/gamma.h
Normal file
3
extras/address_standardizer/gamma.h
Normal file
|
@ -0,0 +1,3 @@
|
|||
#define MAXRULES 4500
|
||||
#define MAXNODES 5000
|
||||
#define RULESPACESIZE 60000
|
1074
extras/address_standardizer/gazeteer.csv
Normal file
1074
extras/address_standardizer/gazeteer.csv
Normal file
File diff suppressed because it is too large
Load diff
191
extras/address_standardizer/hash.c
Normal file
191
extras/address_standardizer/hash.c
Normal file
|
@ -0,0 +1,191 @@
|
|||
|
||||
//
|
||||
// hash.c
|
||||
//
|
||||
// Copyright (c) 2012 TJ Holowaychuk <tj@vision-media.ca>
|
||||
//
|
||||
|
||||
#include "hash.h"
|
||||
|
||||
/*
|
||||
* Set hash `key` to `val`.
|
||||
*/
|
||||
|
||||
inline void
|
||||
hash_set(hash_t *self, char *key, void *val) {
|
||||
int ret;
|
||||
khiter_t k = kh_put(ptr, self, key, &ret);
|
||||
kh_value(self, k) = val;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get hash `key`, or NULL.
|
||||
*/
|
||||
|
||||
inline void *
|
||||
hash_get(hash_t *self, char *key) {
|
||||
khiter_t k = kh_get(ptr, self, key);
|
||||
return k == kh_end(self) ? NULL : kh_value(self, k);
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if hash `key` exists.
|
||||
*/
|
||||
|
||||
inline int
|
||||
hash_has(hash_t *self, char *key) {
|
||||
khiter_t k = kh_get(ptr, self, key);
|
||||
return kh_exist(self, k);
|
||||
}
|
||||
|
||||
/*
|
||||
* Remove hash `key`.
|
||||
*/
|
||||
|
||||
void
|
||||
hash_del(hash_t *self, char *key) {
|
||||
khiter_t k = kh_get(ptr, self, key);
|
||||
kh_del(ptr, self, k);
|
||||
}
|
||||
|
||||
// tests
|
||||
|
||||
#ifdef TEST_HASH
|
||||
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
void
|
||||
test_hash_set() {
|
||||
hash_t *hash = hash_new();
|
||||
assert(0 == hash_size(hash));
|
||||
|
||||
hash_set(hash, "name", "tobi");
|
||||
hash_set(hash, "species", "ferret");
|
||||
assert(2 == hash_size(hash));
|
||||
|
||||
assert(0 == strcmp("tobi", hash_get(hash, "name")));
|
||||
assert(0 == strcmp("ferret", hash_get(hash, "species")));
|
||||
}
|
||||
|
||||
void
|
||||
test_hash_get() {
|
||||
hash_t *hash = hash_new();
|
||||
hash_set(hash, "foo", "bar");
|
||||
assert(0 == strcmp("bar", hash_get(hash, "foo")));
|
||||
assert(NULL == hash_get(hash, "bar"));
|
||||
}
|
||||
|
||||
void
|
||||
test_hash_has() {
|
||||
hash_t *hash = hash_new();
|
||||
hash_set(hash, "foo", "bar");
|
||||
assert(1 == hash_has(hash, "foo"));
|
||||
assert(0 == hash_has(hash, "bar"));
|
||||
}
|
||||
|
||||
void
|
||||
test_hash_size() {
|
||||
hash_t *hash = hash_new();
|
||||
assert(0 == hash_size(hash));
|
||||
hash_set(hash, "foo", "bar");
|
||||
assert(1 == hash_size(hash));
|
||||
hash_set(hash, "bar", "baz");
|
||||
assert(2 == hash_size(hash));
|
||||
}
|
||||
|
||||
void
|
||||
test_hash_del() {
|
||||
hash_t *hash = hash_new();
|
||||
hash_set(hash, "foo", "bar");
|
||||
assert(1 == hash_has(hash, "foo"));
|
||||
assert(0 == hash_has(hash, "bar"));
|
||||
hash_del(hash, "foo");
|
||||
hash_del(hash, "bar");
|
||||
assert(0 == hash_has(hash, "foo"));
|
||||
}
|
||||
|
||||
void
|
||||
test_hash_clear() {
|
||||
hash_t *hash = hash_new();
|
||||
hash_set(hash, "foo", "bar");
|
||||
hash_set(hash, "bar", "baz");
|
||||
hash_set(hash, "raz", "jaz");
|
||||
assert(3 == hash_size(hash));
|
||||
hash_clear(hash);
|
||||
assert(0 == hash_size(hash));
|
||||
}
|
||||
|
||||
void
|
||||
test_hash_each() {
|
||||
hash_t *hash = hash_new();
|
||||
hash_set(hash, "name", "tj");
|
||||
hash_set(hash, "age", "25");
|
||||
|
||||
const char *keys[2];
|
||||
void *vals[2];
|
||||
int n = 0;
|
||||
|
||||
hash_each(hash, {
|
||||
keys[n] = key;
|
||||
vals[n] = val;
|
||||
n++;
|
||||
});
|
||||
|
||||
assert(0 == strcmp("age", keys[0]) || 0 == strcmp("name", keys[0]));
|
||||
assert(0 == strcmp("age", keys[1]) || 0 == strcmp("name", keys[1]));
|
||||
assert(0 == strcmp("25", vals[0]) || 0 == strcmp("tj", vals[0]));
|
||||
assert(0 == strcmp("25", vals[1]) || 0 == strcmp("tj", vals[1]));
|
||||
}
|
||||
|
||||
void
|
||||
test_hash_each_key() {
|
||||
hash_t *hash = hash_new();
|
||||
hash_set(hash, "name", "tj");
|
||||
hash_set(hash, "age", "25");
|
||||
|
||||
const char *keys[2];
|
||||
int n = 0;
|
||||
|
||||
hash_each_key(hash, {
|
||||
keys[n++] = key;
|
||||
});
|
||||
|
||||
assert(0 == strcmp("age", keys[0]) || 0 == strcmp("name", keys[0]));
|
||||
assert(0 == strcmp("age", keys[1]) || 0 == strcmp("name", keys[1]));
|
||||
}
|
||||
|
||||
void
|
||||
test_hash_each_val() {
|
||||
hash_t *hash = hash_new();
|
||||
hash_set(hash, "name", "tj");
|
||||
hash_set(hash, "age", "25");
|
||||
|
||||
void *vals[2];
|
||||
int n = 0;
|
||||
|
||||
hash_each_val(hash, {
|
||||
vals[n++] = val;
|
||||
});
|
||||
|
||||
assert(0 == strcmp("25", vals[0]) || 0 == strcmp("tj", vals[0]));
|
||||
assert(0 == strcmp("25", vals[1]) || 0 == strcmp("tj", vals[1]));
|
||||
}
|
||||
|
||||
int
|
||||
main(){
|
||||
test_hash_set();
|
||||
test_hash_get();
|
||||
test_hash_has();
|
||||
test_hash_del();
|
||||
test_hash_size();
|
||||
test_hash_clear();
|
||||
test_hash_each();
|
||||
test_hash_each_key();
|
||||
test_hash_each_val();
|
||||
printf("\n \e[32m\u2713 \e[90mok\e[0m\n\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif
|
107
extras/address_standardizer/hash.h
Normal file
107
extras/address_standardizer/hash.h
Normal file
|
@ -0,0 +1,107 @@
|
|||
|
||||
//
|
||||
// hash.h
|
||||
//
|
||||
// Copyright (c) 2012 TJ Holowaychuk <tj@vision-media.ca>
|
||||
//
|
||||
|
||||
#ifndef HASH
|
||||
#define HASH
|
||||
|
||||
//#include <postgres.h>
|
||||
#include "khash.h"
|
||||
|
||||
// pointer hash
|
||||
|
||||
KHASH_MAP_INIT_STR(ptr, void *);
|
||||
|
||||
/*
|
||||
* Hash type.
|
||||
*/
|
||||
|
||||
typedef khash_t(ptr) hash_t;
|
||||
|
||||
/*
|
||||
* Allocate a new hash.
|
||||
*/
|
||||
|
||||
#define hash_new() kh_init(ptr)
|
||||
|
||||
/*
|
||||
* Destroy the hash.
|
||||
*/
|
||||
|
||||
#define hash_free(self) kh_destroy(ptr, self)
|
||||
|
||||
/*
|
||||
* Hash size.
|
||||
*/
|
||||
|
||||
#define hash_size kh_size
|
||||
|
||||
/*
|
||||
* Remove all pairs in the hash.
|
||||
*/
|
||||
|
||||
#define hash_clear(self) kh_clear(ptr, self)
|
||||
|
||||
/*
|
||||
* Iterate hash keys and ptrs, populating
|
||||
* `key` and `val`.
|
||||
*/
|
||||
|
||||
#define hash_each(self, block) { \
|
||||
const char *key; \
|
||||
void *val; \
|
||||
for (khiter_t k = kh_begin(self); k < kh_end(self); ++k) { \
|
||||
if (!kh_exist(self, k)) continue; \
|
||||
key = kh_key(self, k); \
|
||||
val = kh_value(self, k); \
|
||||
block; \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterate hash keys, populating `key`.
|
||||
*/
|
||||
|
||||
#define hash_each_key(self, block) { \
|
||||
const char *key; \
|
||||
for (khiter_t k = kh_begin(self); k < kh_end(self); ++k) { \
|
||||
if (!kh_exist(self, k)) continue; \
|
||||
key = kh_key(self, k); \
|
||||
block; \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
|
||||
* Iterate hash ptrs, populating `val`.
|
||||
*/
|
||||
|
||||
#define hash_each_val(self, block) { \
|
||||
void *val; \
|
||||
for (khiter_t k = kh_begin(self); k < kh_end(self); ++k) { \
|
||||
if (!kh_exist(self, k)) continue; \
|
||||
val = kh_value(self, k); \
|
||||
block; \
|
||||
} \
|
||||
}
|
||||
|
||||
// protos
|
||||
|
||||
void
|
||||
hash_set(hash_t *self, char *key, void *val);
|
||||
|
||||
void *
|
||||
hash_get(hash_t *self, char *key);
|
||||
|
||||
int
|
||||
hash_has(hash_t *self, char *key);
|
||||
|
||||
void
|
||||
hash_del(hash_t *self, char *key);
|
||||
|
||||
void
|
||||
hash_clear(hash_t *self);
|
||||
|
||||
#endif /* HASH */
|
317
extras/address_standardizer/khash.h
Normal file
317
extras/address_standardizer/khash.h
Normal file
|
@ -0,0 +1,317 @@
|
|||
/* The MIT License
|
||||
|
||||
Copyright (c) 2008, by Attractive Chaos <attractivechaos@aol.co.uk>
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
An example:
|
||||
|
||||
#include "khash.h"
|
||||
KHASH_MAP_INIT_INT(32, char)
|
||||
int main() {
|
||||
int ret, is_missing;
|
||||
khiter_t k;
|
||||
khash_t(32) *h = kh_init(32);
|
||||
k = kh_put(32, h, 5, &ret);
|
||||
if (!ret) kh_del(32, h, k);
|
||||
kh_value(h, k) = 10;
|
||||
k = kh_get(32, h, 10);
|
||||
is_missing = (k == kh_end(h));
|
||||
k = kh_get(32, h, 5);
|
||||
kh_del(32, h, k);
|
||||
for (k = kh_begin(h); k != kh_end(h); ++k)
|
||||
if (kh_exist(h, k)) kh_value(h, k) = 1;
|
||||
kh_destroy(32, h);
|
||||
return 0;
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
2008-09-19 (0.2.3):
|
||||
|
||||
* Corrected the example
|
||||
* Improved interfaces
|
||||
|
||||
2008-09-11 (0.2.2):
|
||||
|
||||
* Improved speed a little in kh_put()
|
||||
|
||||
2008-09-10 (0.2.1):
|
||||
|
||||
* Added kh_clear()
|
||||
* Fixed a compiling error
|
||||
|
||||
2008-09-02 (0.2.0):
|
||||
|
||||
* Changed to token concatenation which increases flexibility.
|
||||
|
||||
2008-08-31 (0.1.2):
|
||||
|
||||
* Fixed a bug in kh_get(), which has not been tested previously.
|
||||
|
||||
2008-08-31 (0.1.1):
|
||||
|
||||
* Added destructor
|
||||
*/
|
||||
|
||||
|
||||
#ifndef __AC_KHASH_H
|
||||
#define __AC_KHASH_H
|
||||
|
||||
#define AC_VERSION_KHASH_H "0.2.2"
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
typedef uint32_t khint_t;
|
||||
typedef khint_t khiter_t;
|
||||
|
||||
#define __ac_HASH_PRIME_SIZE 32
|
||||
static const uint32_t __ac_prime_list[__ac_HASH_PRIME_SIZE] =
|
||||
{
|
||||
0ul, 3ul, 11ul, 23ul, 53ul,
|
||||
97ul, 193ul, 389ul, 769ul, 1543ul,
|
||||
3079ul, 6151ul, 12289ul, 24593ul, 49157ul,
|
||||
98317ul, 196613ul, 393241ul, 786433ul, 1572869ul,
|
||||
3145739ul, 6291469ul, 12582917ul, 25165843ul, 50331653ul,
|
||||
100663319ul, 201326611ul, 402653189ul, 805306457ul, 1610612741ul,
|
||||
3221225473ul, 4294967291ul
|
||||
};
|
||||
|
||||
#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2)
|
||||
#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1)
|
||||
#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3)
|
||||
#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1)))
|
||||
#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1)))
|
||||
#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1)))
|
||||
#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1))
|
||||
|
||||
static const double __ac_HASH_UPPER = 0.77;
|
||||
|
||||
#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \
|
||||
typedef struct { \
|
||||
khint_t n_buckets, size, n_occupied, upper_bound; \
|
||||
uint32_t *flags; \
|
||||
khkey_t *keys; \
|
||||
khval_t *vals; \
|
||||
} kh_##name##_t; \
|
||||
static inline kh_##name##_t *kh_init_##name() { \
|
||||
return (kh_##name##_t*)calloc(1,sizeof(kh_##name##_t)); \
|
||||
} \
|
||||
static inline void kh_destroy_##name(kh_##name##_t *h) \
|
||||
{ \
|
||||
if (h) { \
|
||||
free(h->keys); \
|
||||
free(h->flags); \
|
||||
free(h->vals); \
|
||||
/*free(h);*/ \
|
||||
} \
|
||||
} \
|
||||
static inline void kh_clear_##name(kh_##name##_t *h) \
|
||||
{ \
|
||||
if (h && h->flags) { \
|
||||
memset(h->flags, 0xaa, ((h->n_buckets>>4) + 1) * sizeof(uint32_t)); \
|
||||
h->size = h->n_occupied = 0; \
|
||||
} \
|
||||
} \
|
||||
static inline khint_t kh_get_##name(kh_##name##_t *h, khkey_t key) \
|
||||
{ \
|
||||
if (h->n_buckets) { \
|
||||
khint_t inc, k, i, last; \
|
||||
k = __hash_func(key); i = k % h->n_buckets; \
|
||||
inc = 1 + k % (h->n_buckets - 1); last = i; \
|
||||
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
|
||||
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
|
||||
else i += inc; \
|
||||
if (i == last) return h->n_buckets; \
|
||||
} \
|
||||
return __ac_iseither(h->flags, i)? h->n_buckets : i; \
|
||||
} else return 0; \
|
||||
} \
|
||||
static inline void kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \
|
||||
{ \
|
||||
uint32_t *new_flags = 0; \
|
||||
khint_t j = 1; \
|
||||
{ \
|
||||
khint_t t = __ac_HASH_PRIME_SIZE - 1; \
|
||||
while (__ac_prime_list[t] > new_n_buckets) --t; \
|
||||
new_n_buckets = __ac_prime_list[t+1]; \
|
||||
if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; \
|
||||
else { \
|
||||
new_flags = (uint32_t*)malloc(((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
|
||||
memset(new_flags, 0xaa, ((new_n_buckets>>4) + 1) * sizeof(uint32_t)); \
|
||||
if (h->n_buckets < new_n_buckets) { \
|
||||
h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
|
||||
if (kh_is_map) \
|
||||
h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
if (j) { \
|
||||
for (j = 0; j != h->n_buckets; ++j) { \
|
||||
if (__ac_iseither(h->flags, j) == 0) { \
|
||||
khkey_t key = h->keys[j]; \
|
||||
khval_t val; \
|
||||
if (kh_is_map) val = h->vals[j]; \
|
||||
__ac_set_isdel_true(h->flags, j); \
|
||||
while (1) { \
|
||||
khint_t inc, k, i; \
|
||||
k = __hash_func(key); \
|
||||
i = k % new_n_buckets; \
|
||||
inc = 1 + k % (new_n_buckets - 1); \
|
||||
while (!__ac_isempty(new_flags, i)) { \
|
||||
if (i + inc >= new_n_buckets) i = i + inc - new_n_buckets; \
|
||||
else i += inc; \
|
||||
} \
|
||||
__ac_set_isempty_false(new_flags, i); \
|
||||
if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { \
|
||||
{ khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \
|
||||
if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \
|
||||
__ac_set_isdel_true(h->flags, i); \
|
||||
} else { \
|
||||
h->keys[i] = key; \
|
||||
if (kh_is_map) h->vals[i] = val; \
|
||||
break; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
if (h->n_buckets > new_n_buckets) { \
|
||||
h->keys = (khkey_t*)realloc(h->keys, new_n_buckets * sizeof(khkey_t)); \
|
||||
if (kh_is_map) \
|
||||
h->vals = (khval_t*)realloc(h->vals, new_n_buckets * sizeof(khval_t)); \
|
||||
} \
|
||||
free(h->flags); \
|
||||
h->flags = new_flags; \
|
||||
h->n_buckets = new_n_buckets; \
|
||||
h->n_occupied = h->size; \
|
||||
h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \
|
||||
} \
|
||||
} \
|
||||
static inline khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \
|
||||
{ \
|
||||
khint_t x; \
|
||||
if (h->n_occupied >= h->upper_bound) { \
|
||||
if (h->n_buckets > (h->size<<1)) kh_resize_##name(h, h->n_buckets - 1); \
|
||||
else kh_resize_##name(h, h->n_buckets + 1); \
|
||||
} \
|
||||
{ \
|
||||
khint_t inc, k, i, site, last; \
|
||||
x = site = h->n_buckets; k = __hash_func(key); i = k % h->n_buckets; \
|
||||
if (__ac_isempty(h->flags, i)) x = i; \
|
||||
else { \
|
||||
inc = 1 + k % (h->n_buckets - 1); last = i; \
|
||||
while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \
|
||||
if (__ac_isdel(h->flags, i)) site = i; \
|
||||
if (i + inc >= h->n_buckets) i = i + inc - h->n_buckets; \
|
||||
else i += inc; \
|
||||
if (i == last) { x = site; break; } \
|
||||
} \
|
||||
if (x == h->n_buckets) { \
|
||||
if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = site; \
|
||||
else x = i; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
if (__ac_isempty(h->flags, x)) { \
|
||||
h->keys[x] = key; \
|
||||
__ac_set_isboth_false(h->flags, x); \
|
||||
++h->size; ++h->n_occupied; \
|
||||
*ret = 1; \
|
||||
} else if (__ac_isdel(h->flags, x)) { \
|
||||
h->keys[x] = key; \
|
||||
__ac_set_isboth_false(h->flags, x); \
|
||||
++h->size; \
|
||||
*ret = 2; \
|
||||
} else *ret = 0; \
|
||||
return x; \
|
||||
} \
|
||||
static inline void kh_del_##name(kh_##name##_t *h, khint_t x) \
|
||||
{ \
|
||||
if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \
|
||||
__ac_set_isdel_true(h->flags, x); \
|
||||
--h->size; \
|
||||
} \
|
||||
}
|
||||
|
||||
/* --- BEGIN OF HASH FUNCTIONS --- */
|
||||
|
||||
#define kh_int_hash_func(key) (uint32_t)(key)
|
||||
#define kh_int_hash_equal(a, b) (a == b)
|
||||
#define kh_int64_hash_func(key) (uint32_t)((key)>>33^(key)^(key)<<11)
|
||||
#define kh_int64_hash_equal(a, b) (a == b)
|
||||
static inline khint_t __ac_X31_hash_string(const char *s)
|
||||
{
|
||||
khint_t h = *s;
|
||||
if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
|
||||
return h;
|
||||
}
|
||||
#define kh_str_hash_func(key) __ac_X31_hash_string(key)
|
||||
#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0)
|
||||
|
||||
/* --- END OF HASH FUNCTIONS --- */
|
||||
|
||||
/* Other necessary macros... */
|
||||
|
||||
#define khash_t(name) kh_##name##_t
|
||||
|
||||
#define kh_init(name) kh_init_##name()
|
||||
#define kh_destroy(name, h) kh_destroy_##name(h)
|
||||
#define kh_clear(name, h) kh_clear_##name(h)
|
||||
#define kh_resize(name, h, s) kh_resize_##name(h, s)
|
||||
#define kh_put(name, h, k, r) kh_put_##name(h, k, r)
|
||||
#define kh_get(name, h, k) kh_get_##name(h, k)
|
||||
#define kh_del(name, h, k) kh_del_##name(h, k)
|
||||
|
||||
#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x)))
|
||||
#define kh_key(h, x) ((h)->keys[x])
|
||||
#define kh_val(h, x) ((h)->vals[x])
|
||||
#define kh_value(h, x) ((h)->vals[x])
|
||||
#define kh_begin(h) (khint_t)(0)
|
||||
#define kh_end(h) ((h)->n_buckets)
|
||||
#define kh_size(h) ((h)->size)
|
||||
#define kh_n_buckets(h) ((h)->n_buckets)
|
||||
|
||||
/* More conenient interfaces */
|
||||
|
||||
#define KHASH_SET_INIT_INT(name) \
|
||||
KHASH_INIT(name, uint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal)
|
||||
|
||||
#define KHASH_MAP_INIT_INT(name, khval_t) \
|
||||
KHASH_INIT(name, uint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal)
|
||||
|
||||
#define KHASH_SET_INIT_INT64(name) \
|
||||
KHASH_INIT(name, uint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal)
|
||||
|
||||
#define KHASH_MAP_INIT_INT64(name, khval_t) \
|
||||
KHASH_INIT(name, uint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal)
|
||||
|
||||
typedef const char *kh_cstr_t;
|
||||
#define KHASH_SET_INIT_STR(name) \
|
||||
KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal)
|
||||
|
||||
#define KHASH_MAP_INIT_STR(name, khval_t) \
|
||||
KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal)
|
||||
|
||||
#endif /* __AC_KHASH_H */
|
529
extras/address_standardizer/lexicon.c
Normal file
529
extras/address_standardizer/lexicon.c
Normal file
|
@ -0,0 +1,529 @@
|
|||
/* -- lexicon.c
|
||||
|
||||
This file reads the lexicon definitions into a chained
|
||||
hash table and handles the lookups of words in the hash table,
|
||||
returning definitions in the form of an input symbol and a
|
||||
standardized text.
|
||||
|
||||
Prototype 7H08 (This file was written by Walter Sinclair).
|
||||
|
||||
This file is part of pagc.
|
||||
|
||||
Copyright (c) 2008 Walter Bruce Sinclair
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*/
|
||||
/* For pagc-0.4.2 : last revised 2012-05-23 */
|
||||
|
||||
#undef DEBUG
|
||||
//#define DEBUG
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stddef.h>
|
||||
#include <ctype.h>
|
||||
#include "pagc_api.h"
|
||||
|
||||
/* -- Hash table size should be a prime number -- */
|
||||
/* 5581, 5953, 6337, 6733, 7561, 7993, 8893, 10333, 10837, 11353, 12421, 12973, 13537, 15913, 18481 */
|
||||
#define LEXICON_HTABSIZE 7561
|
||||
|
||||
#ifdef BUILD_API
|
||||
#include "pagc_std_api.h"
|
||||
#endif
|
||||
|
||||
/* -- local prototypes -- */
|
||||
static unsigned calc_hash( char * ) ;
|
||||
static ENTRY **create_hash_table( ERR_PARAM * ) ;
|
||||
static int add_dict_entry( ERR_PARAM *, ENTRY ** , char * , int , SYMB , char * ) ;
|
||||
|
||||
#ifndef BUILD_API
|
||||
static char *convert_field( char * , char * ) ;
|
||||
static int read_lexicon( ERR_PARAM *, ENTRY ** , FILE * ) ;
|
||||
#endif
|
||||
|
||||
LEXICON *lex_init( ERR_PARAM *err_p ) ;
|
||||
static int append_new_def( ERR_PARAM *, ENTRY * , SYMB , char * , int ) ;
|
||||
static unsigned elf_hash( char * ) ;
|
||||
void print_lexicon( ENTRY ** hash_table ) ;
|
||||
|
||||
#ifdef BUILD_API
|
||||
|
||||
/*
|
||||
typedef struct LEXICON_s {
|
||||
ENTRY **hash_table;
|
||||
ERR_PARAM *err_p;
|
||||
} LEXICON;
|
||||
|
||||
*/
|
||||
|
||||
LEXICON *lex_init( ERR_PARAM *err_p )
|
||||
{
|
||||
LEXICON *lex;
|
||||
|
||||
PAGC_CALLOC_STRUC(lex,LEXICON,1,err_p,NULL);
|
||||
|
||||
lex->hash_table = create_hash_table( err_p );
|
||||
if (lex->hash_table == NULL) {
|
||||
lex_free(lex);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
lex->err_p = err_p;
|
||||
|
||||
return lex;
|
||||
}
|
||||
|
||||
int lex_add_entry(LEXICON *lex, int seq, char *word, char *stdword, SYMB token)
|
||||
{
|
||||
return add_dict_entry( lex->err_p, lex->hash_table, word, seq-1, token, stdword);
|
||||
}
|
||||
|
||||
void lex_free(LEXICON *lex)
|
||||
{
|
||||
if (lex == NULL) return;
|
||||
destroy_lexicon(lex->hash_table);
|
||||
free(lex);
|
||||
lex = NULL;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#else
|
||||
|
||||
/* ---------------------------------------------------------------------
|
||||
lexicon.c (create_lexicon) -
|
||||
read the lexicon file into memory, chaining off a hash table
|
||||
returns a pointer to the hash table, or NULL if error.
|
||||
called by standard.l (init_stand_process)
|
||||
calls util.c (open_aux_file) lexicon.c (read_lexicon, create_hash_table)
|
||||
uses macro LOG_MESS
|
||||
stdio.h (fclose)
|
||||
-----------------------------------------------------------------------*/
|
||||
ENTRY **create_lexicon( PAGC_GLOBAL *glo_p ,
|
||||
const char *lex_name ,
|
||||
const char *gaz_name ) {
|
||||
/* -- called by init_stand_process to read in the Lexicon and set up the
|
||||
definitions in memory for hash table access -- */
|
||||
FILE *gaz_file ,
|
||||
*dict_file ;
|
||||
ENTRY **hash_table ;
|
||||
|
||||
if ( (hash_table = create_hash_table( glo_p -> process_errors ) ) == NULL ) {
|
||||
return NULL ;
|
||||
}
|
||||
/* 2009-08-13 : support multiple lexicons */
|
||||
if ( gaz_name != NULL ) {
|
||||
if ( ( gaz_file = open_aux_file( glo_p ,
|
||||
gaz_name ) ) == NULL )
|
||||
return NULL ;
|
||||
if ( !read_lexicon( glo_p -> process_errors ,
|
||||
hash_table ,
|
||||
gaz_file ) ) {
|
||||
fclose( gaz_file ) ;
|
||||
return NULL ;
|
||||
}
|
||||
fclose( gaz_file ) ;
|
||||
}
|
||||
|
||||
if ( ( dict_file = open_aux_file( glo_p ,
|
||||
lex_name ) ) == NULL )
|
||||
return NULL ;
|
||||
if ( !read_lexicon( glo_p -> process_errors ,
|
||||
hash_table ,
|
||||
dict_file ) ) {
|
||||
fclose( dict_file ) ;
|
||||
return NULL ;
|
||||
}
|
||||
fclose( dict_file ) ;
|
||||
return hash_table ;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------
|
||||
lexicon.c (read_lexicon) -
|
||||
called by lexicon.c (create_lexicon) for each file
|
||||
calls convert_field, add_dict_entry
|
||||
returns FALSE if error encountered
|
||||
stdio.h (fgets,feof,sscanf)
|
||||
uses macro BLANK_STRING
|
||||
-------------------------------------------------------*/
|
||||
static int read_lexicon( ERR_PARAM *err_p ,
|
||||
ENTRY **hash_table ,
|
||||
FILE *CFile ) {
|
||||
char record_buffer[ MAXSTRLEN ] ;
|
||||
char lookup_str[ MAXTEXT ] ;
|
||||
char num_str[ MAXTEXT ] ;
|
||||
int cur_token ;
|
||||
int num_def ;
|
||||
char standard_str[ MAXTEXT ] ;
|
||||
char *next_str ;
|
||||
|
||||
while ( !feof( CFile ) ) {
|
||||
/* -- read in each line of the csv file and add to hash table -- */
|
||||
BLANK_STRING(record_buffer) ;
|
||||
fgets( record_buffer ,
|
||||
MAXSTRLEN ,
|
||||
CFile ) ;
|
||||
|
||||
#ifdef SEW_NOT_SURE_IF_WE_NEED_THIS
|
||||
/* -- check for and skip over blank lines -- */
|
||||
if (strspn(record_buffer, " \t\r\n") == strlen(record_buffer))
|
||||
continue;
|
||||
#endif
|
||||
|
||||
/* -- comma-separated values are handled only as well as necessary
|
||||
in the present context -- */
|
||||
if ( ( next_str =
|
||||
convert_field( num_str ,
|
||||
record_buffer ) ) == NULL ) {
|
||||
break ;
|
||||
}
|
||||
sscanf( num_str ,
|
||||
"%d" ,
|
||||
&num_def ) ;
|
||||
next_str = convert_field( lookup_str ,
|
||||
next_str ) ;
|
||||
next_str = convert_field( num_str ,
|
||||
next_str ) ;
|
||||
sscanf( num_str ,
|
||||
"%d" ,
|
||||
&cur_token ) ;
|
||||
next_str = convert_field( standard_str ,
|
||||
next_str ) ;
|
||||
if ( add_dict_entry( err_p ,
|
||||
hash_table ,
|
||||
lookup_str ,
|
||||
( num_def - 1 ) ,
|
||||
cur_token ,
|
||||
standard_str ) == ERR_FAIL ) {
|
||||
return FALSE ;
|
||||
}
|
||||
}
|
||||
return TRUE ;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------
|
||||
lexicon.c (convert_field)
|
||||
called by lexicon.c (read_lexicon)
|
||||
ctype.h (isspace)
|
||||
uses macro BLANK_STRING
|
||||
-------------------------------------------------------*/
|
||||
static char *convert_field( char *buf ,
|
||||
char *inp ) {
|
||||
char c ;
|
||||
char *d = buf;
|
||||
char *s = inp ;
|
||||
|
||||
BLANK_STRING(d) ;
|
||||
/* -- space at the beginning of a line will stop the read -- */
|
||||
if ( isspace( *s ) )
|
||||
return NULL ;
|
||||
while ( ( c = *s++ ) != SENTINEL ) {
|
||||
if ( c == '\"' ||
|
||||
c == '\r' )
|
||||
continue ; /* -- ignore quotes and carriage returns -- */
|
||||
/* -- zero terminate field and record delimiters -- */
|
||||
if ( c == '\n' ||
|
||||
c == ',' ) {
|
||||
BLANK_STRING(d) ;
|
||||
return s ;
|
||||
}
|
||||
*d++ = c ; /* -- copy it -- */
|
||||
}
|
||||
return NULL ;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* ----------------------------------------------------
|
||||
lexicon.c (destroy_lexicon)
|
||||
called by standard.l (close_stand_process)
|
||||
calls lexicon.c (destroy_def_list)
|
||||
uses macro FREE_AND_NULL
|
||||
-------------------------------------------------------*/
|
||||
void destroy_lexicon(ENTRY ** hash_table)
|
||||
{
|
||||
/* -- called by Clean-Up - */
|
||||
unsigned __i__ ;
|
||||
ENTRY *__E__,*__F__ ;
|
||||
if (hash_table == NULL)
|
||||
{
|
||||
return ;
|
||||
}
|
||||
for (__i__ = 0 ;__i__ < LEXICON_HTABSIZE ;__i__++ )
|
||||
{
|
||||
for (__E__ = hash_table[__i__] ;__E__ != NULL ;__E__ = __F__)
|
||||
{
|
||||
destroy_def_list(__E__->DefList) ;
|
||||
__F__ = __E__->Next ;
|
||||
FREE_AND_NULL(__E__->Lookup) ;
|
||||
FREE_AND_NULL(__E__) ;
|
||||
}
|
||||
}
|
||||
DBG("destroy_lexicon: i=%d", __i__);
|
||||
/* <revision date='2012-05-23'>free hash table</revision> */
|
||||
FREE_AND_NULL(hash_table);
|
||||
DBG("leaving destroy_lexicon");
|
||||
}
|
||||
|
||||
|
||||
/* ----------------------------------------------------------
|
||||
lexicon.c (destroy_def_list)
|
||||
called by destroy_lexicon and tokenize.c (remove_default_defs)
|
||||
uses macro FREE_AND_NULL
|
||||
------------------------------------------------------------*/
|
||||
void destroy_def_list( DEF *start_def ) {
|
||||
DEF *cur_def ;
|
||||
DEF *next_def = NULL ;
|
||||
|
||||
|
||||
|
||||
for ( cur_def = start_def ;
|
||||
cur_def != NULL ;
|
||||
cur_def = next_def ) {
|
||||
next_def = cur_def -> Next ;
|
||||
/* -- Default definitions have no associated text -- */
|
||||
if ( cur_def -> Protect == 0 ) {
|
||||
FREE_AND_NULL( cur_def -> Standard ) ;
|
||||
}
|
||||
FREE_AND_NULL( cur_def ) ;
|
||||
}
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------
|
||||
lexicon.c (find_entry)
|
||||
called by lexicon.c (add_dict_entry)
|
||||
calls lexicon.c (calc_hash)
|
||||
string.h (strcmp)
|
||||
-------------------------------------------------------*/
|
||||
ENTRY *find_entry(ENTRY **hash_table,char *lookup_str)
|
||||
{
|
||||
/* -- called to create a lexeme -- */
|
||||
ENTRY *__E__ ;
|
||||
unsigned __hash_index__ ; /* -- 2006-11-20 : to return hash table pointer -- */
|
||||
|
||||
__hash_index__ = calc_hash(lookup_str) ;
|
||||
for (__E__ = hash_table[__hash_index__] ; __E__ != NULL ; __E__ = __E__->Next)
|
||||
{
|
||||
if (strcmp(lookup_str,__E__->Lookup) == 0)
|
||||
{
|
||||
return __E__ ;
|
||||
}
|
||||
}
|
||||
return __E__ ;
|
||||
}
|
||||
|
||||
#define US sizeof( unsigned )
|
||||
/* ----------------------------------------------------
|
||||
lexicon.c (elf_hash)
|
||||
called by lexicon.c (calc_hash)
|
||||
-------------------------------------------------------*/
|
||||
static unsigned elf_hash( char *key_str ) {
|
||||
unsigned h ,
|
||||
g ,
|
||||
c ;
|
||||
|
||||
h = 0 ;
|
||||
while ( ( c = ( unsigned ) *key_str ) != '\0' ) {
|
||||
h = ( h << US ) + c ;
|
||||
if ( ( g = h & ( ~ ( ( unsigned )( ~0 ) >> US ) ) ) )
|
||||
h ^= g >> ( US * 6 ) ;
|
||||
h &= ~g ;
|
||||
key_str++ ;
|
||||
}
|
||||
return h ;
|
||||
}
|
||||
|
||||
|
||||
/* ----------------------------------------------------
|
||||
lexicon.c (calc_hash)
|
||||
called by lexicon.c (find_entry, add_dict_entry)
|
||||
calls lexicon.c (elf_hash)
|
||||
-------------------------------------------------------*/
|
||||
|
||||
static unsigned calc_hash( char *key_str ) {
|
||||
unsigned h ;
|
||||
|
||||
h = elf_hash( key_str ) ;
|
||||
return ( h % LEXICON_HTABSIZE ) ;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------
|
||||
lexicon.c (create_hash_table)
|
||||
allocate and initialize hash table in memory
|
||||
return NULL if error
|
||||
called by create_lexicon
|
||||
uses macro PAGC_CALLOC_STRUC
|
||||
-------------------------------------------------------*/
|
||||
static ENTRY **create_hash_table(ERR_PARAM *err_p)
|
||||
{
|
||||
unsigned __i__ ;
|
||||
ENTRY **__hash_table__ ;
|
||||
PAGC_CALLOC_STRUC(__hash_table__,ENTRY *,LEXICON_HTABSIZE,err_p,NULL) ;
|
||||
for (__i__ = 0 ;__i__ < LEXICON_HTABSIZE ;__i__++ )
|
||||
{
|
||||
__hash_table__[__i__] = NULL ;
|
||||
}
|
||||
return __hash_table__ ;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------
|
||||
lexicon.c (add_dict_entry)
|
||||
called by lexicon.c (read_lexicon)
|
||||
calls lexicon.c (calc_hash, create_def, append_new_def)
|
||||
uses macro PAGC_ALLOC_STRUC , PAGC_STORE_STR, RET_ERR
|
||||
return ERR_FAIL if error
|
||||
-------------------------------------------------------*/
|
||||
static int add_dict_entry( ERR_PARAM *err_p ,
|
||||
ENTRY **hash_table ,
|
||||
char *lookup_str ,
|
||||
int def_num ,
|
||||
SYMB t ,
|
||||
char *standard_str ) {
|
||||
ENTRY *E ;
|
||||
|
||||
E = find_entry( hash_table ,
|
||||
lookup_str ) ;
|
||||
if ( E == NULL ) {
|
||||
unsigned hash_index ;
|
||||
|
||||
PAGC_ALLOC_STRUC(E,ENTRY,err_p,ERR_FAIL);
|
||||
/* -- add the Lookup string to the record -- */
|
||||
PAGC_STORE_STR(E->Lookup,lookup_str,err_p,ERR_FAIL) ;
|
||||
/* -- add new entry to beginning of table -- */
|
||||
hash_index = calc_hash( lookup_str ) ;
|
||||
|
||||
E -> Next = hash_table[ hash_index ] ; /* -- collision chain -- */
|
||||
hash_table[ hash_index ] = E ;
|
||||
if ( ( E -> DefList = create_def( t ,
|
||||
standard_str ,
|
||||
def_num ,
|
||||
FALSE ,
|
||||
err_p ) ) == NULL ) {
|
||||
return ERR_FAIL ;
|
||||
}
|
||||
} else {
|
||||
int err_stat ;
|
||||
if ( E -> DefList == NULL ) {
|
||||
RET_ERR("add_dict_entry: Lexical entry lacks definition" ,
|
||||
err_p ,
|
||||
ERR_FAIL ) ;
|
||||
}
|
||||
if ( ( err_stat = append_new_def( err_p ,
|
||||
E ,
|
||||
t ,
|
||||
standard_str ,
|
||||
def_num ) ) != TRUE ) {
|
||||
return err_stat ;
|
||||
}
|
||||
}
|
||||
return TRUE ;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------------
|
||||
lexicon.c (append_new_def)
|
||||
called by lexicon.c (add_dict_entry)
|
||||
calls lexicon.c (create_def)
|
||||
returns FALSE if entry is already there
|
||||
returns ERR_FAIL on allocation error
|
||||
-------------------------------------------------------*/
|
||||
static int append_new_def( ERR_PARAM *err_p ,
|
||||
ENTRY *E ,
|
||||
SYMB t ,
|
||||
char *text ,
|
||||
int def_num ) {
|
||||
|
||||
DEF *D,
|
||||
*pd,
|
||||
*cd ;
|
||||
for ( cd = E -> DefList , pd = NULL ;
|
||||
cd != NULL ;
|
||||
cd = cd -> Next ) {
|
||||
pd = cd ;
|
||||
/* -- avoid duplication except for local entries -- */
|
||||
if ( cd -> Type == t ) {
|
||||
return FALSE ;
|
||||
}
|
||||
}
|
||||
if ( ( D = create_def( t ,
|
||||
text ,
|
||||
def_num ,
|
||||
FALSE ,
|
||||
err_p ) ) == NULL ) {
|
||||
return ERR_FAIL ;
|
||||
}
|
||||
if ( pd == NULL ) {
|
||||
E -> DefList = D ;
|
||||
} else {
|
||||
D -> Next = pd -> Next ;
|
||||
pd -> Next = D ;
|
||||
}
|
||||
return TRUE ;
|
||||
}
|
||||
|
||||
/*--------------------------------------------------------------------
|
||||
lexicon.c (create_def)
|
||||
called by lexicon.c (append_new_def) tokenize.c (setup_default_defs)
|
||||
allocate memory for lexicon entry.
|
||||
Pflag is TRUE for default entries
|
||||
returns NULL for allocation error
|
||||
uses macro PAGC_ALLOC_STRUC, PAGC_STORE_STR
|
||||
-------------------------------------------------------------------- */
|
||||
DEF *create_def ( SYMB s ,
|
||||
char *standard_str ,
|
||||
int def_num ,
|
||||
int PFlag ,
|
||||
ERR_PARAM *err_p ) {
|
||||
/* -- allocate the memory and set up the definition structure with the
|
||||
standard form -- */
|
||||
DEF *cur_def ;
|
||||
|
||||
/* -- initialization-time allocation -- */
|
||||
PAGC_ALLOC_STRUC(cur_def,DEF,err_p,NULL) ;
|
||||
cur_def -> Type = s ;
|
||||
cur_def -> Protect = PFlag ; /* -- False for definitions from lexicon
|
||||
true for default definitions -- */
|
||||
if ( !PFlag ) {
|
||||
/* -- initialization-time allocation -- */
|
||||
PAGC_STORE_STR(cur_def->Standard,standard_str,err_p,NULL) ;
|
||||
} else
|
||||
cur_def -> Standard = NULL ;
|
||||
cur_def -> Order = def_num ;
|
||||
cur_def -> Next = NULL ;
|
||||
return cur_def ;
|
||||
}
|
||||
|
||||
/*--------------------------------------------------------------------
|
||||
lexicon.c (print_lexicon)
|
||||
not called by useful for debugging. It will print out the lexicon.
|
||||
--------------------------------------------------------------------*/
|
||||
void print_lexicon( ENTRY ** hash_table )
|
||||
{
|
||||
unsigned i;
|
||||
ENTRY *E;
|
||||
|
||||
if (!hash_table) return;
|
||||
|
||||
for (i=0; i< LEXICON_HTABSIZE; i++)
|
||||
{
|
||||
E = hash_table[i];
|
||||
while (E)
|
||||
{
|
||||
DEF *D = E->DefList;
|
||||
printf("'%s'\n", E->Lookup);
|
||||
while (D)
|
||||
{
|
||||
printf(" %d, %d, %d, '%s'\n", D->Order, D->Type, D->Protect, D->Standard);
|
||||
D = D->Next;
|
||||
}
|
||||
E = E->Next;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
2938
extras/address_standardizer/lexicon.csv
Normal file
2938
extras/address_standardizer/lexicon.csv
Normal file
File diff suppressed because it is too large
Load diff
110
extras/address_standardizer/micro-macro.sql
Normal file
110
extras/address_standardizer/micro-macro.sql
Normal file
|
@ -0,0 +1,110 @@
|
|||
drop table if exists addresses cascade;
|
||||
create table addresses (
|
||||
id serial not null primary key,
|
||||
micro text,
|
||||
macro text
|
||||
);
|
||||
|
||||
copy addresses (micro,macro) from stdin;
|
||||
1017 LINWOOD AVE APT 12 ST PAUL,MN,55105
|
||||
1029 ATLANTIC ST APT 302 ST PAUL,MN,55106
|
||||
1032 PORTLAND AVE ST PAUL,MN,55104
|
||||
1093 EDGERTON ST FL 2 ST PAUL,MN,55130
|
||||
111 KELLOGG BLVD E APT 3210 ST PAUL,MN,55101
|
||||
1113 HAWTHORNE AVE E ST PAUL,MN,55106
|
||||
1120 BARCLAY ST APT 4 ST PAUL,MN,55106
|
||||
1137 CHURCHILL ST ST PAUL,MN,55103
|
||||
1140 GALTIER ST ST PAUL,MN,55117
|
||||
1147 MINNEHAHA AVE W ST PAUL,MN,55104
|
||||
1167 BURNQUIST ST ST PAUL,MN,55106
|
||||
1168 SUPORNICK LN APT A ST PAUL,MN,55106
|
||||
1169 RANDOLPH AVE ST PAUL,MN,55105
|
||||
1223 BERKELEY AVE ST PAUL,MN,55105
|
||||
125 MAGNOLIA AVE E ST PAUL,MN,55117
|
||||
1263 RICE ST ST PAUL,MN,55117
|
||||
1305 CONWAY ST ST PAUL,MN,55106
|
||||
1341 CASE AVE ST PAUL,MN,55106
|
||||
1345 CASE AVE ST PAUL,MN,55106
|
||||
1351 CONWAY ST APT 10 ST PAUL,MN,55106
|
||||
1363 OSCEOLA AVE ST PAUL,MN,55105
|
||||
1377 MAYNARD DR W APT 168 ST PAUL,MN,55116
|
||||
1379 MAYNARD DR W APT 176 ST PAUL,MN,55116
|
||||
1379 MAYNARD DR W APT 177 ST PAUL,MN,55116
|
||||
1388 BARCLAY ST ST PAUL,MN,55106
|
||||
1390 ARONA ST ST PAUL,MN,55108
|
||||
1435 GRAND AVE APT 2 ST PAUL,MN,55105
|
||||
1484 ASHLAND AVE UNIT 101 ST PAUL,MN,55101
|
||||
1509 SUMMIT AVE ST PAUL,MN,55106
|
||||
1548 VAN BUREN AVE ST PAUL,MN,55104
|
||||
1561 WHEELOCK LN UNIT 303 ST PAUL,MN,55117
|
||||
1621 ASHLAND AVE APT UPPER ST PAUL,MN,55104
|
||||
1636 WOODBRIDGE ST ST PAUL,MN,55117
|
||||
1644 DAYTON AVE APT 3 ST PAUL,MN,55104
|
||||
1709 LEONE AVE ST PAUL,MN,55106
|
||||
1743 HIGHLAND PKWY ST PAUL,MN,55116
|
||||
1776 ST CLAIR AVE APT 107 ST PAUL,MN,55105
|
||||
1776 ST CLAIR AVE APT 205 ST PAUL,MN,55105
|
||||
1821 UNIVERSITY AVE W RM 336 ST PAUL,MN,55104
|
||||
1877 GOODRICH AVE APT LOWER ST PAUL,MN,55105
|
||||
1898 LACROSSE AVE ST PAUL,MN,55119
|
||||
1899 BEECHWOOD AVE ST PAUL,MN,55116
|
||||
1915 MARSHALL AVE APT 2 ST PAUL,MN,55104
|
||||
1940 NEVADA AVE E ST PAUL,MN,55119
|
||||
1944 NOKOMIS AVE ST PAUL,MN,55119
|
||||
1962 SAUNDERS AVE ST PAUL,MN,55116
|
||||
1968 NEBRASKA AVE E ST PAUL,MN,55119
|
||||
1971 SARGENT AVE ST PAUL,MN,55105
|
||||
204 CONGRESS ST E APT D ST PAUL,MN,55107
|
||||
2085 GRAND AVE APT 203 ST PAUL,MN,55101
|
||||
21 BATTLE CREEK PL ST PAUL,MN,55119
|
||||
2174 ELEANOR AVE ST PAUL,MN,55116
|
||||
2224 MAILAND RD ST PAUL,MN,55119
|
||||
2272 BENSON AVE UNIT C ST PAUL,MN,55116
|
||||
2285 BENSON AVE ST PAUL,MN,55116
|
||||
233 CONCORD ST ST PAUL,MN,55107
|
||||
235 MCKNIGHT RD S APT B4 ST PAUL,MN,55119
|
||||
2360 BUFORD AVE ST PAUL,MN,55108
|
||||
256 POINT DOUGLAS RD N ST PAUL,MN,55106
|
||||
261 UNIVERSITY AVE E APT 303 ST PAUL,MN,55130
|
||||
2706 GANNON RD ST PAUL,MN,55116
|
||||
289 5TH ST E UNIT 309 ST PAUL,MN,55101
|
||||
303 WILDER ST N FL 1 ST PAUL,MN,55104
|
||||
317 ROBIE ST E ST PAUL,MN,55107
|
||||
333 SMITH AVE N ST PAUL,MN,55102
|
||||
377 HAMLINE AVE S ST PAUL,MN,55105
|
||||
418 MARYLAND AVE W APT 204 ST PAUL,MN,55117
|
||||
444 CLINTON AVE ST PAUL,MN,55107
|
||||
444 FRY ST ST PAUL,MN,55104
|
||||
536 THOMAS AVE ST PAUL,MN,55103
|
||||
544 SIMPSON ST ST PAUL,MN,55104
|
||||
56 IRVINE PARK ST PAUL,MN,55102
|
||||
597 BLAIR AVE APT 5 ST PAUL,MN,55103
|
||||
602 HUMBOLDT AVE ST PAUL,MN,55107
|
||||
605 CAPITOL BLVD APT B ST PAUL,MN,55103
|
||||
617 ROSE AVE E ST PAUL,MN,55130
|
||||
635 WESTERN AVE N ST PAUL,MN,55103
|
||||
660 5TH ST E APT DOWNST ST PAUL,MN,55106
|
||||
672 UNIVERSITY AVE W ST PAUL,MN,55104
|
||||
675 WHEELOCK PKWY W ST PAUL,MN,55117
|
||||
70 IOWA AVE W ST PAUL,MN,55117
|
||||
711 MARSHALL AVE ST PAUL,MN,55104
|
||||
712 SNELLING AVE N APT 1 ST PAUL,MN,55104
|
||||
715 CALIFORNIA AVE E ST PAUL,MN,55106
|
||||
715 MARSHALL AVE ST PAUL,MN,55104
|
||||
735 LINCOLN AVE ST PAUL,MN,55105
|
||||
750 BLAIR AVE ST PAUL,MN,55104
|
||||
754 BLAIR AVE FL 1 ST PAUL,MN,55104
|
||||
771 BUTTERNUT AVE ST PAUL,MN,55102
|
||||
812 7TH ST E ST PAUL,MN,55106
|
||||
83 CALIFORNIA AVE W APT 206 ST PAUL,MN,55117
|
||||
838 LAFOND AVE ST PAUL,MN,55104
|
||||
852 HOLLY AVE ST PAUL,MN,55104
|
||||
859 OSCEOLA AVE APT 1 ST PAUL,MN,55105
|
||||
885 CASE AVE ST PAUL,MN,55106
|
||||
927 WAKEFIELD AVE ST PAUL,MN,55106
|
||||
93 MARIA AVE ST PAUL,MN,55106
|
||||
935 HYACINTH AVE E ST PAUL,MN,55106
|
||||
94 FRONT AVE ST PAUL,MN,55117
|
||||
953 HYACINTH AVE E ST PAUL,MN,55106
|
||||
\.
|
||||
|
50
extras/address_standardizer/mk-city-regex.pl
Normal file
50
extras/address_standardizer/mk-city-regex.pl
Normal file
|
@ -0,0 +1,50 @@
|
|||
#! /usr/bin/perl
|
||||
#!/usr/bin/perl -w
|
||||
use strict;
|
||||
use Regexp::Assemble;
|
||||
|
||||
my @cities = split(/[\r\n]+/, qx(cat usps-st-city-name.txt));
|
||||
|
||||
my %st= ();
|
||||
for my $x (@cities) {
|
||||
my ($st, $ct) = split(/\t/, $x);
|
||||
push @{$st{$st}}, $ct;
|
||||
}
|
||||
|
||||
my $re;
|
||||
my $ra = Regexp::Assemble->new(flags => "i");
|
||||
|
||||
my %re =();
|
||||
for my $x (sort keys %st) {
|
||||
$ra->add(@{$st{$x}});
|
||||
$re = $ra->re;
|
||||
$re =~ s/\\/\\\\/g;
|
||||
$re{$x} = $re;
|
||||
}
|
||||
|
||||
print "#define NUM_STATES " . scalar (keys %re) . "\n\n";
|
||||
print " static const char *states[NUM_STATES] = \n";
|
||||
print " {\"" . join('","', sort keys %re) . "\"};\n\n";
|
||||
print " static const char *stcities[NUM_STATES] = {\n";
|
||||
my $cnt = 0;
|
||||
my $a = '';
|
||||
my $b = '';
|
||||
for my $x (sort keys %re) {
|
||||
$re = "(?:\\\\b)($re{$x})\$";
|
||||
print " ,\n" if $cnt;
|
||||
print " /* -- $x -- $x -- $x -- $x -- $x -- $x -- $x -- $x -- $x -- $x -- */\n";
|
||||
while ($re =~ s/^(.{1,65})//) {
|
||||
$a = $1;
|
||||
if ($a =~ s/(\\+)$//) {
|
||||
print " \"$b$a\"\n";
|
||||
$b = $1;
|
||||
}
|
||||
else {
|
||||
print " \"$b$a\"\n";
|
||||
$b = '';
|
||||
}
|
||||
}
|
||||
$cnt++;
|
||||
}
|
||||
print " };\n";
|
||||
|
31
extras/address_standardizer/mk-sql.pl
Normal file
31
extras/address_standardizer/mk-sql.pl
Normal file
|
@ -0,0 +1,31 @@
|
|||
#!/usr/bin/perl -w
|
||||
use strict;
|
||||
|
||||
sub Usage {
|
||||
print "Usage: mk-sql.pl pgver file-in.sql\n";
|
||||
exit 1;
|
||||
}
|
||||
|
||||
my $ver = shift @ARGV || Usage();
|
||||
my $fin = shift @ARGV || Usage();
|
||||
|
||||
my $nver;
|
||||
|
||||
if ($ver =~ /^PostgreSQL (\d+)\.(\d+)/) {
|
||||
$nver = $1 * 100 + $2;
|
||||
}
|
||||
else {
|
||||
die "Failed to parse '$ver' as /^PostgreSQL (\\d+)\\.(\\d+)/\n/\n";
|
||||
}
|
||||
|
||||
open(IN, $fin) || die "Failed to open file '$fin' : $!\n";
|
||||
while (my $x = <IN>) {
|
||||
if ($nver >= 804) {
|
||||
$x =~ s/\$libdir\/lib/\$libdir\//;
|
||||
}
|
||||
if ($nver < 901) {
|
||||
$x =~ s/^\\echo/--\\echo/;
|
||||
}
|
||||
print $x;
|
||||
}
|
||||
close(IN);
|
734
extras/address_standardizer/mk-st-regexp.pl
Normal file
734
extras/address_standardizer/mk-st-regexp.pl
Normal file
|
@ -0,0 +1,734 @@
|
|||
#!/usr/bin/perl -w
|
||||
use strict;
|
||||
use Regexp::Assemble;
|
||||
|
||||
# TODO
|
||||
# Add prefix types like:
|
||||
# Ave(nue)? of( the)? names
|
||||
# Ave(nue)? (d'|du|de)(la)?\s?names
|
||||
# Ave(nue|nida)? \w{1,2}
|
||||
# calle names
|
||||
# suffix of( the)? names
|
||||
# route (\d+([a-z]|bus(iness)?)(by(pass))?
|
||||
# business (\d+([a-z]|bus(iness)?)(by(pass))?
|
||||
# (interstate|I-) \d+\s*[nsew]?
|
||||
#
|
||||
# Add better number recognizer
|
||||
# dir num dir num dir
|
||||
# dir num letter
|
||||
# num? fraction 123 1/2
|
||||
#
|
||||
# Add patterns to recognize intersections
|
||||
# street & street, city, state
|
||||
#
|
||||
#
|
||||
# Probably the following were removed as they are PREFIX TYPES
|
||||
# RTE, ROUTE
|
||||
# CALLE
|
||||
#
|
||||
# and maybe RUE
|
||||
# RUE can be either: RUE d'la whatever; Charles Rue
|
||||
#
|
||||
# Many of the SUFFIX TYPES can be used in a prefix contexted like:
|
||||
# AVENUE of the Americas
|
||||
#
|
||||
|
||||
|
||||
#my @cities = split(/[\r\n]+/, qx(cat usps-city-names.txt));
|
||||
|
||||
# ==============================
|
||||
|
||||
|
||||
my @stwords = qw(
|
||||
ALLEE
|
||||
ALLEY
|
||||
ALLY
|
||||
ALY
|
||||
ANEX
|
||||
ANNEX
|
||||
ANNX
|
||||
ANX
|
||||
ARC
|
||||
ARCADE
|
||||
AV
|
||||
AVE
|
||||
AVEN
|
||||
AVENU
|
||||
AVENUE
|
||||
AVN
|
||||
AVNUE
|
||||
BAYOO
|
||||
BAYOU
|
||||
BCH
|
||||
BEACH
|
||||
BEND
|
||||
BG
|
||||
BGS
|
||||
BLF
|
||||
BLFS
|
||||
BLUF
|
||||
BLUFF
|
||||
BLUFFS
|
||||
BLVD
|
||||
BND
|
||||
BOT
|
||||
BOTTM
|
||||
BOTTOM
|
||||
BOUL
|
||||
BOULEVARD
|
||||
BOULV
|
||||
BR
|
||||
BRANCH
|
||||
BRDGE
|
||||
BRG
|
||||
BRIDGE
|
||||
BRK
|
||||
BRKS
|
||||
BRNCH
|
||||
BROOK
|
||||
BROOKS
|
||||
BTM
|
||||
BURG
|
||||
BURGS
|
||||
BYP
|
||||
BYPA
|
||||
BYPAS
|
||||
BYPASS
|
||||
BYPS
|
||||
BYU
|
||||
CAMP
|
||||
CANYN
|
||||
CANYON
|
||||
CAPE
|
||||
CAUSEWAY
|
||||
CAUSWAY
|
||||
CEN
|
||||
CENT
|
||||
CENTER
|
||||
CENTERS
|
||||
CENTR
|
||||
CENTRE
|
||||
CIR
|
||||
CIRC
|
||||
CIRCL
|
||||
CIRCLE
|
||||
CIRCLES
|
||||
CIRS
|
||||
CK
|
||||
CLB
|
||||
CLF
|
||||
CLFS
|
||||
CLIFF
|
||||
CLIFFS
|
||||
CLUB
|
||||
CMN
|
||||
CMP
|
||||
CNTER
|
||||
CNTR
|
||||
CNYN
|
||||
COMMON
|
||||
COR
|
||||
CORNER
|
||||
CORNERS
|
||||
CORS
|
||||
COURSE
|
||||
COURT
|
||||
COURTS
|
||||
COVE
|
||||
COVES
|
||||
CP
|
||||
CPE
|
||||
CR
|
||||
CRCL
|
||||
CRCLE
|
||||
CRECENT
|
||||
CREEK
|
||||
CRES
|
||||
CRESCENT
|
||||
CRESENT
|
||||
CREST
|
||||
CRK
|
||||
CROSSING
|
||||
CROSSROAD
|
||||
CRSCNT
|
||||
CRSE
|
||||
CRSENT
|
||||
CRSNT
|
||||
CRSSING
|
||||
CRSSNG
|
||||
CRST
|
||||
CRT
|
||||
CSWY
|
||||
CT
|
||||
CTR
|
||||
CTRS
|
||||
CTS
|
||||
CURV
|
||||
CURVE
|
||||
CV
|
||||
CVS
|
||||
CYN
|
||||
DALE
|
||||
DAM
|
||||
DIV
|
||||
DIVIDE
|
||||
DL
|
||||
DM
|
||||
DR
|
||||
DRIV
|
||||
DRIVE
|
||||
DRIVES
|
||||
DRS
|
||||
DRV
|
||||
DV
|
||||
DVD
|
||||
EST
|
||||
ESTATE
|
||||
ESTATES
|
||||
ESTS
|
||||
EXP
|
||||
EXPR
|
||||
EXPRESS
|
||||
EXPRESSWAY
|
||||
EXPW
|
||||
EXPY
|
||||
EXT
|
||||
EXTENSION
|
||||
EXTENSIONS
|
||||
EXTN
|
||||
EXTNSN
|
||||
EXTS
|
||||
FALL
|
||||
FALLS
|
||||
FERRY
|
||||
FIELD
|
||||
FIELDS
|
||||
FLAT
|
||||
FLATS
|
||||
FLD
|
||||
FLDS
|
||||
FLS
|
||||
FLT
|
||||
FLTS
|
||||
FORD
|
||||
FORDS
|
||||
FOREST
|
||||
FORESTS
|
||||
FORG
|
||||
FORGE
|
||||
FORGES
|
||||
FORK
|
||||
FORKS
|
||||
FORT
|
||||
FRD
|
||||
FRDS
|
||||
FREEWAY
|
||||
FREEWY
|
||||
FRG
|
||||
FRGS
|
||||
FRK
|
||||
FRKS
|
||||
FRRY
|
||||
FRST
|
||||
FRT
|
||||
FRWAY
|
||||
FRWY
|
||||
FRY
|
||||
FT
|
||||
FWY
|
||||
GARDEN
|
||||
GARDENS
|
||||
GARDN
|
||||
GATEWAY
|
||||
GATEWY
|
||||
GATWAY
|
||||
GDN
|
||||
GDNS
|
||||
GLEN
|
||||
GLENS
|
||||
GLN
|
||||
GLNS
|
||||
GRDEN
|
||||
GRDN
|
||||
GRDNS
|
||||
GREEN
|
||||
GREENS
|
||||
GRN
|
||||
GRNS
|
||||
GROV
|
||||
GROVE
|
||||
GROVES
|
||||
GRV
|
||||
GRVS
|
||||
GTWAY
|
||||
GTWY
|
||||
HARB
|
||||
HARBOR
|
||||
HARBORS
|
||||
HARBR
|
||||
HAVEN
|
||||
HAVN
|
||||
HBR
|
||||
HBRS
|
||||
HEIGHT
|
||||
HEIGHTS
|
||||
HGTS
|
||||
HIGHWAY
|
||||
HIGHWY
|
||||
HILL
|
||||
HILLS
|
||||
HIWAY
|
||||
HIWY
|
||||
HL
|
||||
HLLW
|
||||
HLS
|
||||
HOLLOW
|
||||
HOLLOWS
|
||||
HOLW
|
||||
HOLWS
|
||||
HRBOR
|
||||
HT
|
||||
HTS
|
||||
HVN
|
||||
HWAY
|
||||
HWY
|
||||
INLET
|
||||
INLT
|
||||
IS
|
||||
ISLAND
|
||||
ISLANDS
|
||||
ISLE
|
||||
ISLES
|
||||
ISLND
|
||||
ISLNDS
|
||||
ISS
|
||||
JCT
|
||||
JCTION
|
||||
JCTN
|
||||
JCTNS
|
||||
JCTS
|
||||
JUNCTION
|
||||
JUNCTIONS
|
||||
JUNCTN
|
||||
JUNCTON
|
||||
KEY
|
||||
KEYS
|
||||
KNL
|
||||
KNLS
|
||||
KNOL
|
||||
KNOLL
|
||||
KNOLLS
|
||||
KY
|
||||
KYS
|
||||
LA
|
||||
LAKE
|
||||
LAKES
|
||||
LAND
|
||||
LANDING
|
||||
LANE
|
||||
LANES
|
||||
LCK
|
||||
LCKS
|
||||
LDG
|
||||
LDGE
|
||||
LF
|
||||
LGT
|
||||
LGTS
|
||||
LIGHT
|
||||
LIGHTS
|
||||
LINE
|
||||
LK
|
||||
LKS
|
||||
LN
|
||||
LNDG
|
||||
LNDNG
|
||||
LOAF
|
||||
LOCK
|
||||
LOCKS
|
||||
LODG
|
||||
LODGE
|
||||
LOOP
|
||||
LOOPS
|
||||
MALL
|
||||
MANOR
|
||||
MANORS
|
||||
MDW
|
||||
MDWS
|
||||
MEADOW
|
||||
MEADOWS
|
||||
MEDOWS
|
||||
MEWS
|
||||
MILL
|
||||
MILLS
|
||||
MISSION
|
||||
MISSN
|
||||
ML
|
||||
MLS
|
||||
MNR
|
||||
MNRS
|
||||
MNT
|
||||
MNTAIN
|
||||
MNTN
|
||||
MNTNS
|
||||
MOTORWAY
|
||||
MOUNT
|
||||
MOUNTAIN
|
||||
MOUNTAINS
|
||||
MOUNTIN
|
||||
MSN
|
||||
MSSN
|
||||
MT
|
||||
MTIN
|
||||
MTN
|
||||
MTNS
|
||||
MTWY
|
||||
NCK
|
||||
NECK
|
||||
OPAS
|
||||
ORCH
|
||||
ORCHARD
|
||||
ORCHRD
|
||||
OVAL
|
||||
OVERPASS
|
||||
OVL
|
||||
PARK
|
||||
PARKS
|
||||
PARKWAY
|
||||
PARKWAYS
|
||||
PARKWY
|
||||
PASS
|
||||
PASSAGE
|
||||
PATH
|
||||
PATHS
|
||||
PIKE
|
||||
PIKES
|
||||
PINE
|
||||
PINES
|
||||
PK
|
||||
PKWAY
|
||||
PKWY
|
||||
PKWYS
|
||||
PKY
|
||||
PL
|
||||
PLACE
|
||||
PLAIN
|
||||
PLAINES
|
||||
PLAINS
|
||||
PLAZA
|
||||
PLN
|
||||
PLNS
|
||||
PLZ
|
||||
PLZA
|
||||
PNE
|
||||
PNES
|
||||
POINT
|
||||
POINTS
|
||||
PORT
|
||||
PORTS
|
||||
PR
|
||||
PRAIRIE
|
||||
PRARIE
|
||||
PRK
|
||||
PRR
|
||||
PRT
|
||||
PRTS
|
||||
PSGE
|
||||
PT
|
||||
PTS
|
||||
RAD
|
||||
RADIAL
|
||||
RADIEL
|
||||
RADL
|
||||
RAMP
|
||||
RANCH
|
||||
RANCHES
|
||||
RAPID
|
||||
RAPIDS
|
||||
RD
|
||||
RDG
|
||||
RDGE
|
||||
RDGS
|
||||
RDS
|
||||
REST
|
||||
RIDGE
|
||||
RIDGES
|
||||
RIV
|
||||
RIVER
|
||||
RIVR
|
||||
RNCH
|
||||
RNCHS
|
||||
ROAD
|
||||
ROADS
|
||||
ROW
|
||||
RPD
|
||||
RPDS
|
||||
RST
|
||||
RUE
|
||||
RUN
|
||||
RVR
|
||||
SHL
|
||||
SHLS
|
||||
SHOAL
|
||||
SHOALS
|
||||
SHOAR
|
||||
SHOARS
|
||||
SHORE
|
||||
SHORES
|
||||
SHR
|
||||
SHRS
|
||||
SKWY
|
||||
SKYWAY
|
||||
SMT
|
||||
SPG
|
||||
SPGS
|
||||
SPNG
|
||||
SPNGS
|
||||
SPRING
|
||||
SPRINGS
|
||||
SPRNG
|
||||
SPRNGS
|
||||
SPUR
|
||||
SPURS
|
||||
SQ
|
||||
SQR
|
||||
SQRE
|
||||
SQRS
|
||||
SQS
|
||||
SQU
|
||||
SQUARE
|
||||
SQUARES
|
||||
ST
|
||||
STA
|
||||
STATION
|
||||
STATN
|
||||
STN
|
||||
STR
|
||||
STRA
|
||||
STRAV
|
||||
STRAVE
|
||||
STRAVEN
|
||||
STRAVENUE
|
||||
STRAVN
|
||||
STREAM
|
||||
STREET
|
||||
STREETS
|
||||
STREME
|
||||
STRM
|
||||
STRT
|
||||
STRVN
|
||||
STRVNUE
|
||||
STS
|
||||
SUMIT
|
||||
SUMITT
|
||||
SUMMIT
|
||||
TER
|
||||
TERR
|
||||
TERRACE
|
||||
THROUGHWAY
|
||||
TPK
|
||||
TPKE
|
||||
TR
|
||||
TRACE
|
||||
TRACES
|
||||
TRACK
|
||||
TRACKS
|
||||
TRAFFICWAY
|
||||
TRAIL
|
||||
TRAILS
|
||||
TRAK
|
||||
TRCE
|
||||
TRFY
|
||||
TRK
|
||||
TRKS
|
||||
TRL
|
||||
TRLS
|
||||
TRNPK
|
||||
TRPK
|
||||
TRWY
|
||||
TUNEL
|
||||
TUNL
|
||||
TUNLS
|
||||
TUNNEL
|
||||
TUNNELS
|
||||
TUNNL
|
||||
TURNPIKE
|
||||
TURNPK
|
||||
UN
|
||||
UNDERPASS
|
||||
UNION
|
||||
UNIONS
|
||||
UNS
|
||||
UPAS
|
||||
VALLEY
|
||||
VALLEYS
|
||||
VALLY
|
||||
VDCT
|
||||
VIA
|
||||
VIADCT
|
||||
VIADUCT
|
||||
VIEW
|
||||
VIEWS
|
||||
VILL
|
||||
VILLAG
|
||||
VILLAGE
|
||||
VILLAGES
|
||||
VILLE
|
||||
VILLG
|
||||
VILLIAGE
|
||||
VIS
|
||||
VIST
|
||||
VISTA
|
||||
VL
|
||||
VLG
|
||||
VLGS
|
||||
VLLY
|
||||
VLY
|
||||
VLYS
|
||||
VST
|
||||
VSTA
|
||||
VW
|
||||
VWS
|
||||
WALK
|
||||
WALKS
|
||||
WALL
|
||||
WAY
|
||||
WAYS
|
||||
WELL
|
||||
WELLS
|
||||
WL
|
||||
WLS
|
||||
WY
|
||||
XING
|
||||
XRD
|
||||
);
|
||||
# ==============================
|
||||
|
||||
my @secwords = qw(
|
||||
APARTMENT
|
||||
APT
|
||||
BASEMENT
|
||||
BLDG
|
||||
BSMT
|
||||
BUILDING
|
||||
DEPARTMENT
|
||||
DEPT
|
||||
FL
|
||||
FLOOR
|
||||
FRNT
|
||||
FRONT
|
||||
HANGAR
|
||||
HNGR
|
||||
LBBY
|
||||
LOBBY
|
||||
LOT
|
||||
LOWER
|
||||
LOWR
|
||||
OFC
|
||||
OFFICE
|
||||
PENTHOUSE
|
||||
PH
|
||||
PIER
|
||||
REAR
|
||||
RM
|
||||
ROOM
|
||||
SIDE
|
||||
SLIP
|
||||
SPACE
|
||||
SPC
|
||||
STE
|
||||
STOP
|
||||
SUITE
|
||||
TRAILER
|
||||
TRLR
|
||||
UNIT
|
||||
UPPER
|
||||
UPPR
|
||||
);
|
||||
|
||||
my @dirs = qw(
|
||||
NORTH N NORD
|
||||
SOUTH S SUD
|
||||
EAST E EST
|
||||
WEST W OEST O
|
||||
NORTHEAST NE
|
||||
NORTHWEST NW
|
||||
SOUTHEAST SE
|
||||
SOUTHWEST SW
|
||||
NORTH-EAST N-E
|
||||
NORTH-WEST N-W
|
||||
SOUTH-EAST S-E
|
||||
SOUTH-WEST S-W
|
||||
);
|
||||
|
||||
my @saints = (
|
||||
"st",
|
||||
"st.",
|
||||
"ste",
|
||||
"ste.",
|
||||
"saint",
|
||||
);
|
||||
|
||||
my $re;
|
||||
my $l = Regexp::Assemble->new(flags => "i");
|
||||
#$re = $l->set(modifiers=>'i')->list2re(@cities);
|
||||
#$re =~ s/\\/\\\\/g;
|
||||
#my $cities = $re;
|
||||
|
||||
#print " static const char *cities = \n";
|
||||
#while ($re =~ s/^(.{1,75})//) {
|
||||
# print " \"$1\"\n";
|
||||
#}
|
||||
#print " ;\n";
|
||||
|
||||
|
||||
$l->add(@stwords);
|
||||
$re = $l->re;
|
||||
$re =~ s/\\/\\\\/g;
|
||||
$re =~ s/\?\^/?-xism/g;
|
||||
my $sttype = $re;
|
||||
#print " static const char *sttype = \"$re\";\n\n";
|
||||
|
||||
$l->add(@secwords);
|
||||
$re = $l->re;
|
||||
$re =~ s/\\/\\\\/g;
|
||||
$re =~ s/\?\^/?-xism/g;
|
||||
my $unittype = $re;
|
||||
#print " static const char *unittype = \"$re\";\n\n";
|
||||
|
||||
$l->add(@dirs);
|
||||
$re = $l->re;
|
||||
$re =~ s/\\/\\\\/g;
|
||||
$re =~ s/\?\^/?-xism/g;
|
||||
my $dirs = $re;
|
||||
#print " static const char *dirtype = \"$re\";\n\n";
|
||||
|
||||
$l->add(@saints);
|
||||
$re = $l->re;
|
||||
$re =~ s/\\/\\\\/g;
|
||||
$re =~ s/\?\^/?-xism/g;
|
||||
my $saint = $re;
|
||||
#print " static const char *saints = \"$re\";\n\n";
|
||||
|
||||
my $word = "\\\\w+";
|
||||
my $words = "($word(\\\\s$word)*)";
|
||||
|
||||
my @reg = ();
|
||||
#push @reg, "(?:,\\\\s*)([^,]+)\$";
|
||||
#push @reg, "\\\\b($cities)\$";
|
||||
push @reg, "(?:\\\\b$sttype\\\\s(?:$dirs\\\\s))($dirs\\\\s$words)\$";
|
||||
push @reg, "(?:\\\\b$sttype\\\\s(?:$dirs\\\\s))($dirs\\\\s$saint\\\\s$words)\$";
|
||||
push @reg, "(?:\\\\b$sttype\\\\s)($dirs\\\\s$saint\\\\s$words)\$";
|
||||
push @reg, "(?:\\\\b$sttype\\\\s)($saint\\\\s$words)\$";
|
||||
push @reg, "(?:\\\\b$sttype\\\\s)($dirs\\\\s$words)\$";
|
||||
push @reg, "(?:\\\\b$sttype\\\\s)($words)\$";
|
||||
push @reg, "(?:\\\\s)($dirs\\\\s$words)\$";
|
||||
push @reg, "^(?:\\\\d+\\\\s(?:(?:\\\\w+\\\\s)$sttype))()\$";
|
||||
push @reg, "^(?:\\\\d+\\\\s(?:(?:\\\\w+\\\\s)*\\\\w+\\\\s))($word)\$";
|
||||
|
||||
my $nn = scalar @reg;
|
||||
print " const int nreg = $nn;\n";
|
||||
print " static const char *t_regx[$nn] = {\n \"";
|
||||
print join("\",\n \"", @reg);
|
||||
print "\"\n };\n";
|
||||
|
||||
|
71
extras/address_standardizer/pagc-data-psql
Normal file
71
extras/address_standardizer/pagc-data-psql
Normal file
|
@ -0,0 +1,71 @@
|
|||
#!/usr/bin/perl -w
|
||||
use strict;
|
||||
|
||||
sub Usage {
|
||||
die "Usage: pagc-data-psql [lex|gaz|rules] file\n";
|
||||
}
|
||||
|
||||
my $mode = shift @ARGV || Usage();
|
||||
my $file = shift @ARGV || Usage();
|
||||
my $x;
|
||||
|
||||
open(IN, $file) || die "Failed to open '$file' : $!\n";
|
||||
|
||||
if ($mode eq 'lex') {
|
||||
print <<EOF;
|
||||
drop table if exists lex cascade;
|
||||
create table lex (
|
||||
id serial not null primary key,
|
||||
seq integer,
|
||||
word text,
|
||||
stdword text,
|
||||
token integer
|
||||
);
|
||||
copy lex (seq, word, token, stdword) from stdin;
|
||||
EOF
|
||||
|
||||
while ($x = <IN>) {
|
||||
$x =~ s/["\r\n]//g;
|
||||
$x =~ s/,/\t/g;
|
||||
print "$x\n";
|
||||
}
|
||||
print "\\.\n";
|
||||
}
|
||||
elsif ($mode eq 'gaz') {
|
||||
print <<EOF;
|
||||
drop table if exists gaz cascade;
|
||||
create table gaz (
|
||||
id serial not null primary key,
|
||||
seq integer,
|
||||
word text,
|
||||
stdword text,
|
||||
token integer
|
||||
);
|
||||
copy gaz (seq, word, token, stdword) from stdin;
|
||||
EOF
|
||||
|
||||
while ($x = <IN>) {
|
||||
$x =~ s/["\r\n]//g;
|
||||
$x =~ s/,/\t/g;
|
||||
print "$x\n";
|
||||
}
|
||||
print "\\.\n";
|
||||
}
|
||||
elsif ($mode eq 'rules') {
|
||||
print <<EOF;
|
||||
drop table if exists rules cascade;
|
||||
create table rules (
|
||||
id serial not null primary key,
|
||||
rule text
|
||||
);
|
||||
copy rules (rule) from stdin;
|
||||
EOF
|
||||
while ($x = <IN>) {
|
||||
$x =~ s/["\r\n]//g;
|
||||
print "$x\n";
|
||||
}
|
||||
print "\\.\n";
|
||||
}
|
||||
else {
|
||||
Usage();
|
||||
}
|
1260
extras/address_standardizer/pagc_api.h
Normal file
1260
extras/address_standardizer/pagc_api.h
Normal file
File diff suppressed because it is too large
Load diff
118
extras/address_standardizer/pagc_common.h
Normal file
118
extras/address_standardizer/pagc_common.h
Normal file
|
@ -0,0 +1,118 @@
|
|||
/*-- pagc_common.h --
|
||||
|
||||
Certain common definitions used both by the pagc library and its clients
|
||||
|
||||
Prototype 20H10 (This file was written by Walter Sinclair).
|
||||
|
||||
This file is part of PAGC.
|
||||
|
||||
Copyright (c) 2010 Walter Bruce Sinclair
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*/
|
||||
|
||||
/* For pagc-0.4.0 : last revised 2010-11-08 */
|
||||
|
||||
|
||||
#ifndef PAGC_COM_H
|
||||
#define PAGC_COM_H
|
||||
|
||||
#ifdef MAXPATHLEN
|
||||
#define PATHNAME_LEN MAXPATHLEN
|
||||
#else
|
||||
#define PATHNAME_LEN 1024
|
||||
#endif
|
||||
|
||||
/* -- 2006-04-25 : structure added to index arc endpoints -- */
|
||||
typedef struct pagc_point {
|
||||
double X ;
|
||||
double Y ;
|
||||
} PAGC_POINT ;
|
||||
|
||||
|
||||
typedef int SYMB ;
|
||||
|
||||
|
||||
#define ERR_FAIL -2
|
||||
#define FAIL -1
|
||||
#define NULL_READ 0
|
||||
#define MATCH_READ 2
|
||||
#define BOTH 2
|
||||
|
||||
|
||||
/*------------------------------------
|
||||
strategy types
|
||||
------------------------------------*/
|
||||
#define ADDRESS_SCORING 0
|
||||
#define INTERSECTION_SCORING 1
|
||||
#define LANDMARK_SCORING 3
|
||||
|
||||
#define SITE_MATCH 0
|
||||
#define SITE_INTERPOLATE 1
|
||||
#define INTERSECTION 2
|
||||
#define ADDRESS_RANGE_2 3
|
||||
#define ADDRESS_RANGE_4 4
|
||||
#define REVERSE_SITE 5
|
||||
#define REVERSE_INTERSECTION 6
|
||||
#define INTERSECTION_B 7
|
||||
#define CONCAT 8
|
||||
#define LANDMARK_NAME 9
|
||||
|
||||
/*----------------------------------
|
||||
response format types :
|
||||
------------------------------------*/
|
||||
#define CSV 0
|
||||
#define JSON 1
|
||||
#define XML 2
|
||||
|
||||
/* -- build flags -- */
|
||||
#define STATISTICS 2 /* -- output statistics on rules used. FLSTATS in schema-- */
|
||||
#define PRINT_PROGRESS 128 /* output 10% completion points */
|
||||
#define LOG_COMPLETE 2048 /* log certain initializations when complete */
|
||||
#define ZERO_IS_BLANK 512 /* schema: FLZBLNK */
|
||||
#define RNF_PRETYPE_REDIRECT 4096 /* schema: FLRNFRE */
|
||||
|
||||
#define SENTINEL '\0'
|
||||
#define BLANK_STRING(STR) *STR = SENTINEL
|
||||
#define MAXSTRLEN 256
|
||||
|
||||
/* -- boolean -- */
|
||||
#ifndef TRUE
|
||||
#define TRUE 1
|
||||
#define FALSE 0
|
||||
#endif
|
||||
|
||||
#define READ_ONLY_MODE 0
|
||||
#define WRITE_CREATE_MODE 1
|
||||
#define WRITE_APPEND_MODE 2
|
||||
|
||||
#define PAGE_SIZE 4096
|
||||
#define MAX_REF_CANDS 100
|
||||
|
||||
#ifdef ENABLE_THREADED
|
||||
#define MAX_CONTEXTS 20
|
||||
#else
|
||||
#define MAX_CONTEXTS 1
|
||||
#endif
|
||||
|
||||
#define BACK_SLASH 0x5c
|
||||
#define FORE_SLASH '/'
|
||||
#define IS_DOT(CH) ( CH == '.' )
|
||||
#define IS_DIR_SEP(CH) ( CH == global_path_separator )
|
||||
#define IS_COLON(CH) ( CH == ':' )
|
||||
#define NOT_PATH_DELIMITOR(CH) \
|
||||
( CH != global_path_separator ) && \
|
||||
( !IS_COLON(CH) )
|
||||
#define IS_PATH_DELIMITOR(CH) \
|
||||
( IS_DIR_SEP(CH) || \
|
||||
IS_COLON(CH) )
|
||||
#define COMMA_APPEND_WITH_LEN( D , S , L ) \
|
||||
char_append( "," , D , S , L )
|
||||
|
||||
#endif
|
75
extras/address_standardizer/pagc_std_api.h
Normal file
75
extras/address_standardizer/pagc_std_api.h
Normal file
|
@ -0,0 +1,75 @@
|
|||
|
||||
#ifndef PAGC_STD_API_H
|
||||
#define PAGC_STD_API_H
|
||||
|
||||
#define BUILD_API
|
||||
|
||||
typedef struct LEXICON_s {
|
||||
ENTRY **hash_table;
|
||||
ERR_PARAM *err_p;
|
||||
} LEXICON;
|
||||
|
||||
typedef struct RULES_s {
|
||||
int ready;
|
||||
int rule_number;
|
||||
int last_node;
|
||||
RULE_PARAM *r_p;
|
||||
ERR_PARAM *err_p;
|
||||
NODE **Trie;
|
||||
SYMB *rule_end ;
|
||||
SYMB *r ;
|
||||
} RULES;
|
||||
|
||||
typedef struct STANDARDIZER_s {
|
||||
PAGC_GLOBAL *pagc_p;
|
||||
STAND_PARAM *misc_stand;
|
||||
ERR_PARAM *err_p;
|
||||
} STANDARDIZER;
|
||||
|
||||
typedef struct STDADDR_s { // define as required
|
||||
char *building;
|
||||
char *house_num;
|
||||
char *predir;
|
||||
char *qual;
|
||||
char *pretype;
|
||||
char *name;
|
||||
char *suftype;
|
||||
char *sufdir;
|
||||
char *ruralroute;
|
||||
char *extra;
|
||||
char *city;
|
||||
char *state;
|
||||
char *country;
|
||||
char *postcode;
|
||||
char *box;
|
||||
char *unit;
|
||||
} STDADDR;
|
||||
|
||||
LEXICON * lex_init();
|
||||
int lex_add_entry(LEXICON *lex, int seq, char *word, char
|
||||
*stdword, SYMB token);
|
||||
void lex_free(LEXICON *lex);
|
||||
|
||||
RULES *rules_init();
|
||||
int rules_add_rule(RULES *rules, int num, int *rule);
|
||||
int rules_add_rule_from_str(RULES *rules, char *rule);
|
||||
int rules_ready(RULES *rules);
|
||||
void rules_free(RULES *rules);
|
||||
|
||||
STANDARDIZER *std_init();
|
||||
int std_use_lex(STANDARDIZER *std, LEXICON *lex);
|
||||
int std_use_gaz(STANDARDIZER *std, LEXICON *gaz);
|
||||
int std_use_rules(STANDARDIZER *std, RULES *rules);
|
||||
int std_ready_standardizer(STANDARDIZER *std);
|
||||
void std_free(STANDARDIZER *std);
|
||||
|
||||
STDADDR *std_standardize_one(STANDARDIZER *std, char *address_one_line, int options);
|
||||
|
||||
STDADDR *std_standardize_mm(STANDARDIZER *std, char *micro, char *macro, int options);
|
||||
|
||||
STDADDR *std_standardize(STANDARDIZER *std, char *address, char *city, char *state, char *postcode, char *country, int options);
|
||||
|
||||
void stdaddr_free(STDADDR *stdaddr);
|
||||
void print_stdaddr(STDADDR *stdaddr);
|
||||
|
||||
#endif
|
441
extras/address_standardizer/pagc_tools.c
Normal file
441
extras/address_standardizer/pagc_tools.c
Normal file
|
@ -0,0 +1,441 @@
|
|||
/* -- pagc_tools.c
|
||||
|
||||
Various and miscellaneous functions.
|
||||
|
||||
Prototype 20H10 (This file was written by Walter Sinclair).
|
||||
|
||||
This file is part of PAGC.
|
||||
|
||||
Copyright (c) 2010 Walter Bruce Sinclair
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*/
|
||||
|
||||
/* For pagc-0.4.0 : last revised 2010-11-25 */
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#include <unistd.h>
|
||||
#include "pagc_common.h"
|
||||
#include "pagc_tools.h"
|
||||
|
||||
#ifndef MAXOUTSYM
|
||||
#define MAXOUTSYM 18
|
||||
#endif
|
||||
|
||||
#ifdef MSYS_POSIX
|
||||
static void conform_directory_separator( char * ) ;
|
||||
#endif
|
||||
|
||||
|
||||
static const char *OutSymbNames[] = {
|
||||
"BLDNG",
|
||||
"HOUSE",
|
||||
"PREDIR",
|
||||
"QUALIF",
|
||||
"PRETYP",
|
||||
"STREET",
|
||||
"SUFTYP",
|
||||
"SUFDIR",
|
||||
"RR",
|
||||
"UNKNWN",
|
||||
"CITY",
|
||||
"PROV",
|
||||
"NATION",
|
||||
"POSTAL",
|
||||
"BOXH",
|
||||
"BOXT",
|
||||
"UNITH",
|
||||
"UNITT"
|
||||
} ;
|
||||
|
||||
static const char *InSymbNames[] = {
|
||||
"NUMBER",
|
||||
"WORD",
|
||||
"TYPE",
|
||||
"QUALIF",
|
||||
"PRETYP",
|
||||
"STREET",
|
||||
"ROAD",
|
||||
"STOPWORD",
|
||||
"RR",
|
||||
"DASH",
|
||||
"CITY",
|
||||
"PROV",
|
||||
"NATION",
|
||||
"AMPERS",
|
||||
"BOXH",
|
||||
"ORD",
|
||||
"UNITH",
|
||||
"UNITT",
|
||||
"SINGLE",
|
||||
"BUILDH",
|
||||
"MILE",
|
||||
"DOUBLE",
|
||||
"DIRECT",
|
||||
"MIXED",
|
||||
"BUILDT",
|
||||
"FRACT",
|
||||
"PCT",
|
||||
"PCH",
|
||||
"QUINT",
|
||||
"QUAD",
|
||||
} ;
|
||||
|
||||
/* ------------------------------------------------------------
|
||||
ISO 8859 character set may pop up in some files. After 1998
|
||||
TigerLine will use them.
|
||||
------------------------------------------------------------- */
|
||||
void convert_latin_one ( char *inp ) {
|
||||
unsigned char *str ;
|
||||
|
||||
for ( str = ( unsigned char * ) inp ;
|
||||
*str != SENTINEL ;
|
||||
str++ ) {
|
||||
unsigned char ch ;
|
||||
ch = *str ;
|
||||
/* -------------------------------------------
|
||||
if bit 7 is set, reset bit 5 so both upper
|
||||
and lower case can be done together
|
||||
--------------------------------------------- */
|
||||
if ( ch & 0x80 ) {
|
||||
ch &= 0xDF ;
|
||||
/* -----------------------------------------
|
||||
reduce letters with diacritical marks to
|
||||
their unmarked base letters
|
||||
------------------------------------------ */
|
||||
if ( ch >= 0xC0 &&
|
||||
ch <= 0xC6 )
|
||||
ch = 'A' ;
|
||||
else if ( ch == 0xc7 )
|
||||
ch = 'C' ;
|
||||
else if ( ch >= 0xc8 && ch <= 0xcb )
|
||||
ch = 'E' ;
|
||||
else if ( ch >= 0xcc && ch <= 0xcf )
|
||||
ch = 'I' ;
|
||||
else if ( ch == 0xd0 )
|
||||
ch = 'D' ;
|
||||
else if ( ch == 0xd1 )
|
||||
ch = 'N' ;
|
||||
else if ( ch >= 0xd2 && ch <= 0xd6 )
|
||||
ch = 'O' ;
|
||||
else if ( ch >= 0xd9 && ch <= 0xdc )
|
||||
ch = 'U' ;
|
||||
else if ( ch >= 0xdd && ch < 0xdf )
|
||||
ch = 'Y' ;
|
||||
else
|
||||
/* -------------------------------
|
||||
just clear the top bit so it
|
||||
won't gum up the edit distance
|
||||
machinery
|
||||
-------------------------------- */
|
||||
ch &= 0x7f ;
|
||||
}
|
||||
*str = ch ;
|
||||
}
|
||||
|
||||
/* ----------------------------------------------
|
||||
while we're at it, add a newline to the end
|
||||
because the lexical scanner likes it like that
|
||||
----------------------------------------------- */
|
||||
*str++ = '\n' ;
|
||||
*str = SENTINEL ;
|
||||
}
|
||||
|
||||
void char_append( const char *div ,
|
||||
char *dest ,
|
||||
const char *src ,
|
||||
int max_wid ) {
|
||||
if ( *src == SENTINEL )
|
||||
return ;
|
||||
/* -- skip the delimitor if dest is empty -- */
|
||||
if ( *dest == SENTINEL ) {
|
||||
append_string_to_max( dest ,
|
||||
( char * ) src ,
|
||||
max_wid ) ;
|
||||
return ;
|
||||
}
|
||||
append_string_to_max( dest , ( char * ) div , max_wid ) ;
|
||||
append_string_to_max( dest , ( char * ) src , max_wid ) ;
|
||||
}
|
||||
|
||||
const char *out_symb_name( int i ) {
|
||||
return ( OutSymbNames[ i ] ) ;
|
||||
}
|
||||
|
||||
const char *in_symb_name( int i ) {
|
||||
return ( InSymbNames[ i ] ) ;
|
||||
}
|
||||
|
||||
int out_symb_value( const char *src ) {
|
||||
int i ;
|
||||
|
||||
/* -- linear search -- */
|
||||
for ( i = 0 ;
|
||||
i < MAXOUTSYM ;
|
||||
i++ ) {
|
||||
if ( strcmp( src ,
|
||||
OutSymbNames[ i ] ) == 0 )
|
||||
return i ;
|
||||
}
|
||||
return FAIL ;
|
||||
}
|
||||
|
||||
/*-------------------------------------------
|
||||
util.c (get_input_line)
|
||||
called by initial.c (restore_build_state)
|
||||
--------------------------------------------*/
|
||||
int get_input_line( char *buf ,
|
||||
FILE *fp ) {
|
||||
int i ;
|
||||
|
||||
BLANK_STRING(buf) ;
|
||||
if ( ( fgets( buf ,
|
||||
MAXSTRLEN ,
|
||||
fp ) ) == NULL )
|
||||
return FALSE ;
|
||||
for ( i = strlen( buf ) ;
|
||||
i > 0 ;
|
||||
i-- ) {
|
||||
if ( strchr( "\n\r",
|
||||
buf[ i - 1 ] ) ) {
|
||||
buf[ i - 1 ] = SENTINEL ;
|
||||
} else
|
||||
break ;
|
||||
}
|
||||
return TRUE ;
|
||||
}
|
||||
|
||||
|
||||
/*-------------------------------------------------------
|
||||
pagc_tools.c (parse_file_name)
|
||||
called by open_aux_file, main.c (main)
|
||||
copies the file name to the output_tail and the path to
|
||||
the output_head
|
||||
--------------------------------------------------------*/
|
||||
void parse_file_name( const char *input_path_name ,
|
||||
char global_path_separator ,
|
||||
char *output_tail ,
|
||||
char *output_head ) {
|
||||
const char *end_ptr , *src ;
|
||||
char *dest ;
|
||||
/* -- find the file name part first -- */
|
||||
/* -- move to end of the pathname -- */
|
||||
for ( end_ptr = input_path_name ; *end_ptr != SENTINEL ; end_ptr++ ) ;
|
||||
/* -- find the last directory delimitor -- */
|
||||
while ( ( end_ptr > input_path_name ) && NOT_PATH_DELIMITOR(*end_ptr) ) {
|
||||
end_ptr -- ;
|
||||
}
|
||||
/* ---------------------------------------------------------------
|
||||
either end_ptr has the last delimitor or it is at string start.
|
||||
If the first case, we need to increment to get the filename and
|
||||
need to copy everything up to and including for the path.
|
||||
-----------------------------------------------------------------*/
|
||||
/* -- copy from beg to endptr to output path -- */
|
||||
dest = output_head ;
|
||||
src = input_path_name ;
|
||||
/* if end_ptr points to a path delimitor, copy everything up but not
|
||||
including it into the output_head (if output_head isn't NULL) */
|
||||
if ( IS_PATH_DELIMITOR( *end_ptr ) ) {
|
||||
while ( src < end_ptr ) {
|
||||
if ( dest != NULL ) {
|
||||
*dest++ = *src ;
|
||||
}
|
||||
src++ ;
|
||||
}
|
||||
src++ ;
|
||||
}
|
||||
/* -- copy from endptr to end to output file name -- */
|
||||
if ( dest != NULL ) {
|
||||
BLANK_STRING(dest) ;
|
||||
}
|
||||
/* copy everything after the delimitor up to the sentinel
|
||||
into the output_tail */
|
||||
if ( ( dest = output_tail ) != NULL ) {
|
||||
while ( TRUE ) {
|
||||
if ( ( *dest++ = *src++ ) == SENTINEL ) {
|
||||
break ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*--------------------------------------------------
|
||||
pagc_tools.c (combine_path_file)
|
||||
called by util.c (open_aux_file)
|
||||
calls char_append
|
||||
--------------------------------------------------*/
|
||||
void combine_path_file( char global_path_separator ,
|
||||
char *input_head ,
|
||||
char *input_tail ,
|
||||
char *output_path_name ) {
|
||||
char combine_buf[ 2 ] ;
|
||||
|
||||
combine_buf[ 0 ] = global_path_separator ;
|
||||
combine_buf[ 1 ] = SENTINEL ;
|
||||
|
||||
if ( ( input_head != NULL ) &&
|
||||
( input_head[ 0 ] != SENTINEL ) ) {
|
||||
append_string_to_max( output_path_name ,
|
||||
input_head ,
|
||||
PATHNAME_LEN ) ;
|
||||
|
||||
char_append( combine_buf ,
|
||||
output_path_name ,
|
||||
input_tail ,
|
||||
PATHNAME_LEN ) ;
|
||||
return ;
|
||||
}
|
||||
append_string_to_max( output_path_name ,
|
||||
input_tail ,
|
||||
PATHNAME_LEN ) ;
|
||||
}
|
||||
|
||||
|
||||
void upper_case( char *d ,
|
||||
const char *s ) {
|
||||
/* -- make an uppercase copy in d of string in s -- */
|
||||
for ( ;
|
||||
*s != SENTINEL ;
|
||||
s++ ) {
|
||||
*d++ = ( islower( *s )? toupper( *s ) : *s ) ;
|
||||
}
|
||||
BLANK_STRING(d) ;
|
||||
}
|
||||
|
||||
/* 2010-10-22 : new routine */
/* Compare two strings case-insensitively by uppercasing each into a
   local buffer and running strcmp.  Returns <0, 0 or >0 like strcmp.
   NOTE(review): upper_case() does no bounds checking, so an input of
   MAXSTRLEN characters or more overflows these stack buffers -- confirm
   all callers guarantee shorter strings. */
int upper_case_compare( char *str1 , char* str2 ) {
    char upper_buf1[ MAXSTRLEN ] ;
    char upper_buf2[ MAXSTRLEN ] ;
    upper_case( upper_buf1 , str1 ) ;
    upper_case( upper_buf2 , str2 ) ;
    return ( strcmp( upper_buf1 , upper_buf2 ) ) ;
}
|
||||
|
||||
/* 2010-10-30 : moved here for use in ds */
/* Reverse, in place, the order of the bytes_to_reverse bytes starting at
   location_to_reverse (used to flip the endianness of a stored value).
   A zero or one byte span is left unchanged. */
void fast_reverse_endian( char *location_to_reverse , int bytes_to_reverse ) {
    char *lo = location_to_reverse ;
    char *hi = location_to_reverse + bytes_to_reverse - 1 ;

    /* swap outermost pairs, walking the two pointers toward the middle */
    while ( lo < hi ) {
        char tmp = *lo ;
        *lo = *hi ;
        *hi = tmp ;
        lo++ ;
        hi-- ;
    }
}
|
||||
|
||||
/*=================================================================
pagc_tools.c (append_string_to_max ) = format.c (format_ncat)
=================================================================*/
/* Append src_str_start to the SENTINEL-terminated string already in
   dest_buf_start, writing at most buf_size bytes total (including the
   terminator).  If the buffer is already full before anything can be
   appended: in the standalone build this is fatal (exit); in the
   BUILD_API (PostgreSQL) build the string is silently truncated. */
void append_string_to_max( char *dest_buf_start ,
                           char *src_str_start ,
                           int buf_size ) {

    char a ;
    char *d_ptr , *s_ptr , *buf_end ;

    /* -- move to end of current contents of buffer -- */
    d_ptr = dest_buf_start ;
    while ( ( a = *d_ptr ) != SENTINEL ) {
        d_ptr ++ ;
    }
    /* last writable byte is reserved for the terminator */
    buf_end = dest_buf_start + buf_size - 1 ;

    /* buffer already full: nothing can be appended */
    if ( d_ptr >= buf_end ) {
#ifndef BUILD_API
#ifndef NO_STDERR_OUTPUT
        /* NOTE(review): the message names "format_strncat" -- the
           historical name of this routine (see header comment) -- not
           append_string_to_max */
        fprintf( stderr , "format_strncat: fatal buffer overflow of %s\n" , dest_buf_start ) ;
        fprintf( stderr , "No room for %s\n" , src_str_start ) ;
#endif
        exit( 1 ) ;
#else
        /* TODO if postgresql we can throw an error or notice
           but for now we will just truncate the string */
        *d_ptr = SENTINEL ;
        return;
#endif
    }
    /* copy until the source is exhausted or the buffer is full,
       then re-terminate */
    s_ptr = src_str_start ;
    while ( ( ( a = *s_ptr++ ) != SENTINEL ) &&
            ( d_ptr != buf_end ) ) {
        *d_ptr++ = a ;
    }
    *d_ptr = SENTINEL ;
}
|
||||
|
||||
|
||||
|
||||
/* ========================================================
pagc_tools.c (establish_directory)
Determine the current working directory and path_separator
========================================================= */
/* Fill c_w_d with the current working directory (via getcwd) and set
   *p_s to the directory-separator character to use.  Returns TRUE on
   success; FALSE if getcwd fails or a drive-letter-style path does not
   have a recognizable separator after the colon. */
int establish_directory( char * c_w_d ,
                         char * p_s ) {
    char *c_w_d_ptr ;

    c_w_d_ptr = getcwd( c_w_d ,
                        ( PATHNAME_LEN - 1 ) ) ;
    if ( c_w_d_ptr == NULL ) {
        /* cwd unavailable or buffer too small */
        return FALSE ;
    }

    /* default to the POSIX separator (presumably '/') */
    *p_s = FORE_SLASH ;

#ifdef MSYS_POSIX

    /* ..... transform cwd's non-POSIX directory separators to conform ..... */

    conform_directory_separator( c_w_d ) ;

#endif

    /* a cwd beginning with a letter is taken to be a Windows-style
       "X:<sep>..." path; adopt the separator that follows the colon */
    if ( isalpha( c_w_d[ 0 ] ) ) {

        /* ..... drive letter, colon, dir_sep ..... */

        if ( IS_COLON( c_w_d[ 1 ] ) ) {
            *p_s = c_w_d[ 2 ] ;
            if ( ( *p_s != FORE_SLASH ) &&
                 ( *p_s != BACK_SLASH ) ) {
                /* separator after "X:" is neither '/' nor '\' */
                return FALSE ;
            }
        } else {
            /* letter not followed by a colon: unrecognized form */
            return FALSE ;
        }
    }
    return TRUE ;
}
|
||||
|
||||
#ifdef MSYS_POSIX
/*------------------------------------------------------------------
pagc_tools.c (conform_directory_separator)
-- called only if compiled with MSYS_POSIX defined .....
-- transform non-POSIX directory separators to conform with POSIX --
called by init_global
-------------------------------------------------------------------*/
/* Rewrite path_name in place, turning every backslash into a forward
   slash. */
static void conform_directory_separator( char * path_name ) {
    char *p ;

    for ( p = path_name ; *p != '\0' ; p++ ) {
        if ( *p == BACK_SLASH ) {
            *p = FORE_SLASH ;
        }
    }
}
/* ..... END OF IFDEF MSYS_POSIX ..... */
#endif
|
||||
|
||||
|
40
extras/address_standardizer/pagc_tools.h
Normal file
40
extras/address_standardizer/pagc_tools.h
Normal file
|
@ -0,0 +1,40 @@
|
|||
/*=================================================================
|
||||
-- pagc_tools.h --
|
||||
|
||||
Certain common tools used both by the pagc library and its clients
|
||||
|
||||
Prototype 20H10 (This file was written by Walter Sinclair).
|
||||
|
||||
This file is part of PAGC.
|
||||
|
||||
Copyright (c) 2010 Walter Bruce Sinclair
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
*/
|
||||
|
||||
/* For pagc-0.4.0 : last revised 2010-11-25 */
|
||||
|
||||
#ifndef PGC_T_H
#define PGC_T_H

void convert_latin_one ( char * ) ;
void char_append( const char * , char * , const char * , int ) ;
/* append a string to a SENTINEL-terminated buffer, bounded by buf size */
void append_string_to_max( char * , char * , int ) ;
const char *out_symb_name( int ) ;
const char *in_symb_name( int ) ;
int out_symb_value( const char * ) ;
int get_input_line( char * , FILE * ) ;
/* join a directory head and a file tail with the given separator */
void combine_path_file( char , char * , char * , char * ) ;
int upper_case_compare( char * , char* ) ; /* 2010-10-22 */
void fast_reverse_endian( char * , int ) ; /* 2010-10-30 */
/* make an uppercase copy of the second argument into the first */
void upper_case( char * , const char * ) ;
void parse_file_name( const char * , char , char * , char * ) ;
/* get the current working directory and path separator; TRUE on success */
int establish_directory( char * , char * ) ;

#endif
|
524
extras/address_standardizer/parseaddress-api.c
Normal file
524
extras/address_standardizer/parseaddress-api.c
Normal file
|
@ -0,0 +1,524 @@
|
|||
/*
|
||||
* parseaddress.c - utility to crack a string into address, city st zip
|
||||
*
|
||||
* Copyright 2006 Stephen Woodbridge
|
||||
*
|
||||
* This code is released under an MIT-X style license.
|
||||
*
|
||||
* Stephen Woodbridge
|
||||
* woodbri@swoodbridge.com
|
||||
* woodbr@imaptools.com
|
||||
*
|
||||
* $Id: parseaddress.c,v 2.6 2010/07/25 00:47:24 woodbri Exp $
|
||||
*
|
||||
* TODO:
|
||||
* * add recognition of country before or after postalcode
|
||||
* * have clean trailing punctuation return a code if a comma was removed
|
||||
* if comma and no state then there is probably no city
|
||||
*
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#include <stdio.h>
|
||||
#include <pcre.h>
|
||||
#include "parseaddress-api.h"
|
||||
|
||||
#undef DEBUG
|
||||
//#define DEBUG 1
|
||||
|
||||
#ifdef DEBUG
|
||||
#define DBG(format, arg...) \
|
||||
elog(NOTICE, format , ## arg)
|
||||
#else
|
||||
#define DBG(format, arg...) do { ; } while (0)
|
||||
#endif
|
||||
|
||||
const char *get_state_regex(char *st);
|
||||
const char *parseaddress_cvsid();
|
||||
char *clean_leading_punct(char *s);
|
||||
|
||||
/* Return the city-matching regex associated with the given 2-letter
   state abbreviation, or NULL if st is NULL, not 2 characters long, or
   not found.  The parallel arrays states[]/stcities[] and NUM_STATES
   come from the generated header included below.
   NOTE(review): the early "cmp > 0" exit assumes states[] is sorted in
   ascending strcmp order -- confirm the generated header keeps it so. */
const char *get_state_regex(char *st)
{
    int i;
    int cmp;
#include "parseaddress-stcities.h"

    /* only 2-letter abbreviations are valid keys */
    if (!st || strlen(st) != 2) return NULL;

    for (i=0; i<NUM_STATES; i++) {
        cmp = strcmp(states[i], st);
        if (cmp == 0)
            return stcities[i];
        else if (cmp > 0)
            /* past the point where st would appear in a sorted table */
            return NULL;
    }
    return NULL;
}
|
||||
|
||||
/* Strip trailing punctuation and whitespace from s in place.
   Returns 1 if a comma was among the characters removed, else 0. */
int clean_trailing_punct(char *s)
{
    int i;
    int ret = 0;

    i = (int) strlen(s) - 1;
    /* the i >= 0 guard fixes a buffer underrun: on an empty or
       all-punctuation string the original read (and wrote) s[-1];
       the unsigned char casts avoid UB for negative char values */
    while (i >= 0 && (ispunct((unsigned char)s[i]) ||
                      isspace((unsigned char)s[i]))) {
        if (s[i] == ',') ret = 1;
        s[i--] = '\0';
    }
    return ret;
}
|
||||
|
||||
/* Return a pointer into s just past any leading punctuation and
   whitespace.  s itself is not modified. */
char *clean_leading_punct(char *s)
{
    int i;

    /* test s[i] against the terminator directly instead of calling
       strlen() on every iteration (was O(n^2)); the unsigned char
       casts avoid UB for negative char values */
    for (i=0; s[i] != '\0'; i++)
        if (!(ispunct((unsigned char)s[i]) || isspace((unsigned char)s[i])))
            break;

    return s + i;
}
|
||||
|
||||
/* Uppercase s in place. */
void strtoupper(char *s)
{
    int i;

    /* test s[i] against the terminator directly instead of calling
       strlen() on every iteration (was O(n^2)); the unsigned char cast
       avoids UB when char is signed and holds a negative value */
    for (i=0; s[i] != '\0'; i++)
        s[i] = (char) toupper((unsigned char) s[i]);
}
|
||||
|
||||
int match(char *pattern, char *s, int *ovect, int options)
|
||||
{
|
||||
const char *error;
|
||||
int erroffset;
|
||||
pcre *re;
|
||||
int rc;
|
||||
|
||||
re = pcre_compile(pattern, options, &error, &erroffset, NULL);
|
||||
if (!re) return -99;
|
||||
|
||||
rc = pcre_exec(re, NULL, s, strlen(s), 0, 0, ovect, OVECCOUNT);
|
||||
free(re);
|
||||
|
||||
if (rc < 0) return rc;
|
||||
else if (rc == 0) rc = OVECCOUNT/3;
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
#define RET_ERROR(a,e) if (!a) {*reterr = e; return NULL;}
|
||||
|
||||
/* Crack the free-form address string s into components, returning a
   palloc0'd ADDRESS.  s is modified in place: recognized components are
   copied out and truncated off the tail of the string as they are found
   (zip, then state, then city), leaving the street portion at the front.
   stH is the state-name hash built by load_state_hash().  On failure
   *reterr is set (1002 = state matched by regex but missing from the
   hash) and NULL is returned. */
ADDRESS *parseaddress(HHash *stH, char *s, int *reterr)
{

/* brings nreg and t_regx[] (fallback city regexes) into scope */
#include "parseaddress-regex.h"

    int ovect[OVECCOUNT];    /* pcre capture offsets, filled by match() */
    char c;
    char *stregx;            /* state/province name-or-abbrev regex */
    char *caregx;            /* Canadian-province abbreviation regex */
    char *state = NULL;
    char *regx;
    int mi;                  /* index of which city strategy matched */
    int i, j;
    int rc;
    int comma = 0;           /* 1 while a trailing comma was stripped */
    ADDRESS *ret;
#ifdef USE_HSEARCH
    ENTRY e, *ep;
    int err;
#else
    char *key;
    char *val;
#endif

    ret = (ADDRESS *) palloc0(sizeof(ADDRESS));

    /* check if we were passed a lat lon: two decimal numbers separated
       by commas/whitespace -- if so return only lat/lon */
    rc = match("^\\s*([-+]?\\d+(\\.\\d*)?)[\\,\\s]+([-+]?\\d+(\\.\\d*)?)\\s*$", s, ovect, 0);
    if (rc >= 3) {
        /* terminate the first number, then convert both captures */
        *(s+ovect[3]) = '\0';
        ret->lat = strtod(s+ovect[2], NULL);
        ret->lon = strtod(s+ovect[6], NULL);
        return ret;
    }

    /* clean the string of multiple white spaces and . ; periods become
       spaces and runs of whitespace collapse to one, in place */

    for (i=0, j=0; i<strlen(s); i++) {
        c = s[i];
        if (c == '.') c = s[i] = ' ';
        if (j == 0 && isspace(c)) continue;      /* drop leading blanks */
        if (i && isspace(c) && isspace(s[i-1])) continue;
        s[j] = s[i];
        j++;
    }
    /* NOTE(review): if the cleaned string is empty (j == 0) this reads
       s[-1] -- confirm callers never pass an all-whitespace string */
    if (isspace(s[j-1])) j--;
    s[j] = '\0';

    /* clean trailing punctuation */
    comma |= clean_trailing_punct(s);

    /* assume country code is US */

    ret->cc = (char *) palloc0(3 * sizeof(char));
    strcpy(ret->cc, "US");

    /* get US zipcode components: 5 digits plus optional +4 at the end */

    rc = match("\\b(\\d{5})[-\\s]?(\\d{4})?$", s, ovect, 0);
    if (rc >= 2) {
        ret->zip = (char *) palloc0((ovect[3]-ovect[2]+1) * sizeof(char));
        strncpy(ret->zip, s+ovect[2], ovect[3]-ovect[2]);
        if (rc >= 3) {
            ret->zipplus = (char *) palloc0((ovect[5]-ovect[4]+1) * sizeof(char));
            strncpy(ret->zipplus, s+ovect[4], ovect[5]-ovect[4]);
        }
        /* truncate the postalcode off the string */
        *(s+ovect[0]) = '\0';
        comma = 0;
    }
    /* get canada zipcode components (A9A 9A9) and switch cc to CA */
    else {
        rc = match("\\b([a-z]\\d[a-z]\\s?\\d[a-z]\\d)$", s, ovect, PCRE_CASELESS);
        if (rc >= 1) {
            ret->zip = (char *) palloc0((ovect[1]-ovect[0]+1) * sizeof(char));
            strncpy(ret->zip, s+ovect[0], ovect[1]-ovect[0]);
            strcpy(ret->cc, "CA");
            /* truncate the postalcode off the string */
            *(s+ovect[0]) = '\0';
            comma = 0;
        }
    }

    /* clean trailing punctuation */
    comma |= clean_trailing_punct(s);

    /* get state components: stregx matches any US/CA state or province
       name or abbreviation at the end of the string (generated trie
       regexes); caregx later recognizes Canadian 2-letter codes */

    caregx = "^(?-xism:(?i:(?=[abmnopqsy])(?:n[ltsu]|[am]b|[bq]c|on|pe|sk|yt)))$";
    stregx = "\\b(?-xism:(?i:(?=[abcdfghiklmnopqrstuvwy])(?:a(?:l(?:a(?:bam|sk)a|berta)?|mer(?:ican)?\\ samoa|r(?:k(?:ansas)?|izona)?|[kszb])|s(?:a(?:moa|skatchewan)|outh\\ (?:carolin|dakot)a|\\ (?:carolin|dakot)a|[cdk])|c(?:a(?:lif(?:ornia)?)?|o(?:nn(?:ecticut)?|lorado)?|t)|d(?:e(?:la(?:ware)?)?|istrict\\ of\\ columbia|c)|f(?:l(?:(?:orid)?a)?|ederal\\ states\\ of\\ micronesia|m)|m(?:i(?:c(?:h(?:igan)?|ronesia)|nn(?:esota)?|ss(?:(?:issipp|our)i)?)?|a(?:r(?:shall(?:\\ is(?:l(?:and)?)?)?|yland)|ss(?:achusetts)?|ine|nitoba)?|o(?:nt(?:ana)?)?|[ehdnstpb])|g(?:u(?:am)?|(?:eorgi)?a)|h(?:awai)?i|i(?:d(?:aho)?|l(?:l(?:inois)?)?|n(?:d(?:iana)?)?|(?:ow)?a)|k(?:(?:ansa)?s|(?:entuck)?y)|l(?:a(?:bordor)?|ouisiana)|n(?:e(?:w(?:\\ (?:foundland(?:\\ and\\ labordor)?|hampshire|jersey|mexico|(?:yor|brunswic)k)|foundland)|(?:brask|vad)a)?|o(?:rth(?:\\ (?:mariana(?:\\ is(?:l(?:and)?)?)?|(?:carolin|dakot)a)|west\\ territor(?:ies|y))|va\\ scotia)|\\ (?:carolin|dakot)a|u(?:navut)?|[vhjmycdblsf]|w?t)|o(?:h(?:io)?|k(?:lahoma)?|r(?:egon)?|n(?:t(?:ario)?)?)|p(?:a(?:lau)?|e(?:nn(?:sylvania)?|i)?|r(?:ince\\ edward\\ island)?|w|uerto\\ rico)|r(?:hode\\ island|i)|t(?:e(?:nn(?:essee)?|xas)|[nx])|ut(?:ah)?|v(?:i(?:rgin(?:\\ islands|ia))?|(?:ermon)?t|a)|w(?:a(?:sh(?:ington)?)?|i(?:sc(?:onsin)?)?|y(?:oming)?|(?:est)?\\ virginia|v)|b(?:ritish\\ columbia|c)|q(?:uebe)?c|y(?:ukon|t))))$";

    rc = match(stregx, s, ovect, PCRE_CASELESS);
    if (rc > 0) {
        state = (char *) palloc0((ovect[1]-ovect[0]+1) * sizeof(char));
        strncpy(state, s+ovect[0], ovect[1]-ovect[0]);

        /* truncate the state/province off the string */
        *(s+ovect[0]) = '\0';

        /* lookup state in hash and get abbreviation */
        strtoupper(state);
#ifdef USE_HSEARCH
        e.key = state;
        err = hsearch_r(e, FIND, &ep, stH);
        if (err) {
            ret->st = (char *) palloc0(3 * sizeof(char));
            strcpy(ret->st, ep->data);
        }
#else
        key = state;
        val = (char *)hash_get(stH, key);
        if (val) {
            ret->st = pstrdup(val);
        }
#endif
        /* NOTE(review): this else deliberately straddles the #endif --
           it binds to whichever of the two ifs above was compiled.
           1002: regex matched a state the hash does not know. */
        else {
            *reterr = 1002;
            return NULL;
        }

        /* check if it a Canadian Province */
        rc = match(caregx, ret->st, ovect, PCRE_CASELESS);
        if (rc > 0) {
            strcpy(ret->cc, "CA");
            // if (ret->cc) printf(" CC: %s\n", ret->cc);
        }
        comma = 0;
    }

    /* clean trailing punctuation */
    comma |= clean_trailing_punct(s);

    /* get city components */

    /*
     * This part is ambiguous without punctuation after the street
     * because we can have any of the following forms:
     *
     * num predir? prefixtype? street+ suffixtype? suffdir?,
     * ((north|south|east|west)? city)? state? zip?
     *
     * and technically num can be of the form:
     *
     * pn1? n1 pn2? n2? sn2?
     * where
     * pn1 is a prefix character
     * n1 is a number
     * pn2 is a prefix character
     * n2 is a number
     * sn2 is a suffix character
     *
     * and a trailing letter might be [NSEW] which predir can also be
     *
     * So it is ambiguous whether a directional between street and city
     * belongs to which component. Further since the street and the city
     * are both just a string of arbitrary words, it is difficult if not
     * impossible to determine if a given word belongs to one side or the
     * other.
     *
     * So for the best results users should include a comma after the street.
     *
     * The approach will be as follows:
     * 1. look for a comma and assume this is the separator
     * 2. if we can find a state specific regex try that
     * 3. else loop through an array of possible regex patterns
     * 4. fail and assume there is no city
     */

    /* look for a comma */
    DBG("parse_address: s=%s", s);
    mi = 0;

    /* strategy 1: everything after the last comma is the city */
    regx = "(?:,\\s*)([^,]+)$";
    rc = match((char *)regx, s, ovect, 0);
    if (rc <= 0) {
        /* look for state specific regex */
        mi++;
        regx = (char *) get_state_regex(ret->st);
        if (regx)
            rc = match((char *)regx, s, ovect, 0);
    }
    DBG("Checked for comma: %d", rc);
    /* NOTE(review): this block repeats the state-specific attempt just
       made above (same regx, same match) -- possibly redundant */
    if (rc <= 0 && ret->st && strlen(ret->st)) {
        /* look for state specific regex */
        mi++;
        regx = (char *) get_state_regex(ret->st);
        if (regx)
            rc = match((char *)regx, s, ovect, 0);
    }
    DBG("Checked for state-city: %d", rc);
    if (rc <= 0) {
        /* run through the regx's and see if we get a match */
        for (i=0; i<nreg; i++) {
            mi++;
            rc = match((char *)t_regx[i], s, ovect, 0);
            DBG(" rc=%d, i=%d", rc, i);
            if (rc > 0) break;
        }
        DBG("rc=%d, i=%d", rc, i);
    }
    DBG("Checked regexs: %d, %d, %d", rc, ovect[2], ovect[3]);
    if (rc > 0 && ovect[3]>ovect[2]) {
        /* we have a match so process it: capture 1 is the city */
        ret->city = (char *) palloc0((ovect[3]-ovect[2]+1) * sizeof(char));
        strncpy(ret->city, s+ovect[2], ovect[3]-ovect[2]);
        /* truncate the city off the string */
        *(s+ovect[2]) = '\0';
    }

    /* clean trailing punctuation */
    clean_trailing_punct(s);

    /* check for [@] that would indicate a intersection */
    /* -- 2010-12-11 : per Nancy R. we are using @ to indicate an intersection
       ampersand is used in both street names and landmarks so it is highly
       ambiguous -- */
    rc = match("^([^@]+)\\s*[@]\\s*([^@]+)$", s, ovect, 0);
    if (rc > 0) {
        /* first street of the intersection */
        s[ovect[3]] = '\0';
        clean_trailing_punct(s+ovect[2]);
        ret->street = pstrdup(s+ovect[2]);

        /* second street of the intersection */
        s[ovect[5]] = '\0';
        /* NOTE(review): the return value is discarded and
           clean_leading_punct does not modify the string, so street2
           keeps any leading punctuation -- likely a bug; compare the
           ret->street usage in the else branch below */
        clean_leading_punct(s+ovect[4]);
        ret->street2 = pstrdup(s+ovect[4]);
    }
    else {

        /* and the remainder must be the address components */
        ret->address1 = pstrdup(clean_leading_punct(s));

        /* split the number off the street if it exists */
        rc = match("^((?i)[nsew]?\\d+[-nsew]*\\d*[nsew]?\\b)", s, ovect, 0);
        if (rc > 0) {
            ret->num = (char *) palloc0((ovect[1]-ovect[0]+1) * sizeof(char));
            strncpy(ret->num, s, ovect[1]-ovect[0]);
            ret->street = pstrdup(clean_leading_punct(s+ovect[1]));
        }
    }

    return ret;
}
|
||||
|
||||
/* Populate stH with state/province name -> 2-letter abbreviation
   mappings.  Each table row is inserted under TWO keys -- the full
   uppercase name and the abbreviation itself -- so lookups succeed on
   either form.  Returns 0 on success, 1001 if the table could not be
   created (or stH is NULL), 1003 if an insert failed. */
int load_state_hash(HHash *stH)
{
    /* {uppercase name or alias, abbreviation}; terminated by {NULL, NULL} */
    char * words[][2] = {
        {"ALABAMA" , "AL"},
        {"ALASKA" , "AK"},
        {"AMERICAN SAMOA" , "AS"},
        {"AMER SAMOA" , "AS"},
        {"SAMOA" , "AS"},
        {"ARIZONA" , "AZ"},
        {"ARKANSAS" , "AR"},
        {"ARK" , "AR"},
        {"CALIFORNIA" , "CA"},
        {"CALIF" , "CA"},
        {"COLORADO" , "CO"},
        {"CONNECTICUT" , "CT"},
        {"CONN" , "CT"},
        {"DELAWARE" , "DE"},
        {"DELA" , "DE"},
        {"DISTRICT OF COLUMBIA" , "DC"},
        {"FEDERAL STATES OF MICRONESIA" , "FM"},
        {"MICRONESIA" , "FM"},
        {"FLORIDA" , "FL"},
        {"FLA" , "FL"},
        {"GEORGIA" , "GA"},
        {"GUAM" , "GU"},
        {"HAWAII" , "HI"},
        {"IDAHO" , "ID"},
        {"ILLINOIS" , "IL"},
        {"ILL" , "IL"},
        {"INDIANA" , "IN"},
        {"IND" , "IN"},
        {"IOWA" , "IA"},
        {"KANSAS" , "KS"},
        {"KENTUCKY" , "KY"},
        {"LOUISIANA" , "LA"},
        {"MAINE" , "ME"},
        {"MARSHALL ISLAND" , "MH"},
        {"MARSHALL ISL" , "MH"},
        {"MARSHALL IS" , "MH"},
        {"MARSHALL" , "MH"},
        {"MARYLAND" , "MD"},
        {"MASSACHUSETTS" , "MA"},
        {"MASS" , "MA"},
        {"MICHIGAN" , "MI"},
        {"MICH" , "MI"},
        {"MINNESOTA" , "MN"},
        {"MINN" , "MN"},
        {"MISSISSIPPI" , "MS"},
        {"MISS" , "MS"},
        {"MISSOURI" , "MO"},
        {"MONTANA" , "MT"},
        {"MONT" , "MT"},
        {"NEBRASKA" , "NE"},
        {"NEVADA" , "NV"},
        {"NEW HAMPSHIRE" , "NH"},
        {"NEW JERSEY" , "NJ"},
        {"NEW MEXICO" , "NM"},
        {"NEW YORK" , "NY"},
        {"NORTH CAROLINA" , "NC"},
        {"N CAROLINA" , "NC"},
        {"NORTH DAKOTA" , "ND"},
        {"N DAKOTA" , "ND"},
        {"NORTH MARIANA ISL" , "MP"},
        {"NORTH MARIANA IS" , "MP"},
        {"NORTH MARIANA" , "MP"},
        {"NORTH MARIANA ISLAND" , "MP"},
        {"OHIO" , "OH"},
        {"OKLAHOMA" , "OK"},
        {"OREGON" , "OR"},
        {"PALAU" , "PW"},
        {"PENNSYLVANIA" , "PA"},
        {"PENN" , "PA"},
        {"PUERTO RICO" , "PR"},
        {"RHODE ISLAND" , "RI"},
        {"SOUTH CAROLINA" , "SC"},
        {"S CAROLINA" , "SC"},
        {"SOUTH DAKOTA" , "SD"},
        {"S DAKOTA" , "SD"},
        {"TENNESSEE" , "TN"},
        {"TENN" , "TN"},
        {"TEXAS" , "TX"},
        {"UTAH" , "UT"},
        {"VERMONT" , "VT"},
        {"VIRGIN ISLANDS" , "VI"},
        {"VIRGINIA" , "VA"},
        {"WASHINGTON" , "WA"},
        {"WASH" , "WA"},
        {"WEST VIRGINIA" , "WV"},
        {"W VIRGINIA" , "WV"},
        {"WISCONSIN" , "WI"},
        {"WISC" , "WI"},
        {"WYOMING" , "WY"},
        {"ALBERTA" , "AB"},
        {"BRITISH COLUMBIA" , "BC"},
        {"MANITOBA" , "MB"},
        {"NEW BRUNSWICK" , "NB"},
        {"NEW FOUNDLAND AND LABORDOR" , "NL"},
        {"NEW FOUNDLAND" , "NL"},
        {"NEWFOUNDLAND" , "NL"},
        {"LABORDOR" , "NL"},
        {"NORTHWEST TERRITORIES" , "NT"},
        {"NORTHWEST TERRITORY" , "NT"},
        {"NWT" , "NT"},
        {"NOVA SCOTIA" , "NS"},
        {"NUNAVUT" , "NU"},
        {"ONTARIO" , "ON"},
        {"ONT" , "ON"},
        {"PRINCE EDWARD ISLAND" , "PE"},
        {"PEI" , "PE"},
        {"QUEBEC" , "QC"},
        {"SASKATCHEWAN" , "SK"},
        {"YUKON" , "YT"},
        {"NF" , "NL"},
        {NULL, NULL}
    };

#ifdef USE_HSEARCH
    ENTRY e, *ep;
    int err;
#else
    char *key;
    char *val;
#endif
    int i, cnt;

    /* count the entries above */
    cnt = 0;
    while (words[cnt][0]) cnt++;

    DBG("Words cnt=%d", cnt);

#ifdef USE_HSEARCH
    /* cnt*2 capacity: every row goes in under two keys */
    if (! hcreate_r(cnt*2, stH)) return 1001;
    for (i=0; i<cnt; i++) {
        /* key by full name */
        e.key = words[i][0];
        e.data = words[i][1];
        err = hsearch_r(e, ENTER, &ep, stH);
        /* there should be no failures */
        if (!err) return 1003;
        /* key by abbreviation (maps to itself) */
        e.key = words[i][1];
        e.data = words[i][1];
        err = hsearch_r(e, ENTER, &ep, stH);
        /* there should be no failures */
        if (!err) return 1003;
    }
#else
    if (! stH ) return 1001;
    for (i=0; i<cnt; i++) {
        //DBG("load_hash i=%d", i);
        /* key by full name */
        key = words[i][0];
        val = words[i][1];
        hash_set(stH, key, (void *)val);
        /* key by abbreviation (maps to itself) */
        key = words[i][1];
        val = words[i][1];
        hash_set(stH, key, (void *)val);
    }
#endif
    return 0;
}
|
||||
|
||||
/* Release the state-name hash table built by load_state_hash().
   Safe to call with a NULL handle (no-op). */
void free_state_hash(HHash *stH)
{
    if (stH == NULL)
        return;
#ifdef USE_HSEARCH
    hdestroy_r(stH);
#else
    hash_free(stH);
#endif
}
|
75
extras/address_standardizer/parseaddress-api.h
Normal file
75
extras/address_standardizer/parseaddress-api.h
Normal file
|
@ -0,0 +1,75 @@
|
|||
/*
|
||||
parseaddress-api.h - utility to crack a string into address, city st zip
|
||||
|
||||
Copyright 2006-2010 Stephen Woodbridge.
|
||||
|
||||
woodbri@swoodbridge.com
|
||||
woodbr@imaptools.com
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
$Id: $
|
||||
|
||||
*/
|
||||
|
||||
#ifndef PARSEADDRESS_API_H
#define PARSEADDRESS_API_H

#include "postgres.h"

/* size of the pcre_exec() output vector passed to match() */
#define OVECCOUNT 30

#ifdef USE_HSEARCH

/* use the glibc reentrant hsearch_r() family for the state-name table */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include <search.h>

typedef struct hsearch_data HHash;

#else

/* fall back to the project's own hash implementation */
#include "hash.h"

typedef hash_t HHash;

#endif

/* Result of parseaddress().  Pointer fields are NULL (and lat/lon 0.0)
   unless the corresponding component was recognized in the input. */
typedef struct address_struct {
    char *num;        /* house number split off the front of the street */
    char *street;     /* street, or first street of an "@" intersection */
    char *street2;    /* second street of an "@" intersection */
    char *address1;   /* remainder after zip/state/city were stripped */
    char *city;
    char *st;         /* 2-letter state/province abbreviation */
    char *zip;        /* US 5-digit ZIP or Canadian postal code */
    char *zipplus;    /* US ZIP+4 extension, when present */
    char *cc;         /* country code: "US" (default) or "CA" */
    double lat;       /* set only when the input was a "lat lon" pair */
    double lon;
} ADDRESS;

int clean_trailing_punct(char *s);
void strtoupper(char *s);
int match(char *pattern, char *s, int *ovect, int options);
ADDRESS *parseaddress(HHash *stH, char *s, int *err);
int load_state_hash(HHash *stH);
void free_state_hash(HHash *stH);
void free_address(ADDRESS *a);

/*
 * ERRORS
 *
 * 1000 general memory allocation error
 * 1001 failed to create hash table structure
 * 1002 failed to find state abbreviation
 * 1003 hash table is full, failed to add new entry
 *
 */

#endif
|
12
extras/address_standardizer/parseaddress-regex.h
Normal file
12
extras/address_standardizer/parseaddress-regex.h
Normal file
|
@ -0,0 +1,12 @@
|
|||
/* Table of pattern sources used to peel the trailing city (and an optional
 * directional and/or "St/Saint" prefix) off the end of a one-line address.
 * The giant alternations are machine-generated (note the Perl-style
 * "(?-xism:...)" wrappers) -- do not hand-edit them.
 *
 * NOTE(review): a non-static `const int` at file scope in a C header has
 * external linkage; including this header from more than one translation
 * unit would produce a duplicate-symbol error. Presumably it is included
 * exactly once -- confirm before reuse.
 */
const int nreg = 9;
static const char *t_regx[9] = {
/* street-type word, direction, then capture: direction + city words */
"(?:\\b(?-xism:(?i:(?:C(?:R(?:[KT]|E(?:S(?:(?:C?EN)?T)?|CENT|EK)|S(?:(?:C?N)?T|E(?:NT)?|SI?NG)|OSS(?:ROAD|ING)|CLE?)?|O(?:R(?:NERS?|S)?|UR(?:TS?|SE)|MMON|VES?)|A(?:USE?WAY|NYO?N|MP|PE)|IR(?:C(?:L(?:ES?)?)?|S)?|EN(?:T(?:ERS?|RE?)?)?|L(?:IFFS?|FS?|U?B)|N(?:TE?R|YN)|T(?:RS?|S)?|M[NP]|URVE?|PE?|SWY|VS?|YN|K)|S(?:T(?:[NS]|R(?:[MT]|A(?:V(?:E(?:N(?:UE)?)?|N)?)?|E(?:ETS?|AM|ME)|VN(?:UE)?)?|A(?:T(?:IO)?N)?)?|H(?:O(?:A(?:LS?|RS?)|RES?)|LS?|RS?)|P(?:R(?:INGS?|NGS?)|NGS?|URS?|GS?)|Q(?:U(?:ARES?)?|R[ES]?|S)?|(?:UM(?:IT?|MI)|M)T|K(?:YWA|W)Y)|P(?:A(?:RK(?:W(?:AYS?|Y)|S)?|SS(?:AGE)?|THS?)|L(?:A(?:IN(?:E?S)?|CE|ZA)|NS?|ZA?)?|R(?:[KR]|AI?RIE|TS?)?|K(?:W(?:YS?|AY)|Y)?|O(?:INTS?|RTS?)|I(?:KES?|NES?)|NES?|SGE|TS?)|B(?:O(?:UL(?:EVARD|V)?|T(?:TO?M)?)|R(?:A?NCH|I?DGE|OOKS?|KS?|G)?|Y(?:P(?:A(?:S?S)?|S)?|U)|L(?:UF(?:FS?)?|FS?|VD)|E(?:ACH|ND)|AYO[OU]|URGS?|GS?|CH|ND|TM)|M(?:O(?:UNT(?:AINS?|IN)?|TORWAY)|N(?:T(?:AIN|NS?)?|RS?)|E(?:(?:DO)?WS|ADOWS?)|I(?:SS(?:IO)?N|LLS?)|T(?:NS?|IN|WY)?|A(?:NORS?|LL)|DWS?|S?SN|LS?)|T(?:R(?:A(?:C(?:ES?|KS?)|FFICWAY|ILS?|K)|[FW]Y|N?PK|KS?|LS?|CE)?|U(?:N(?:N(?:ELS?|L)|LS?|EL)|RNP(?:IKE|K))|ER(?:R(?:ACE)?)?|HROUGHWAY|PKE?)|F(?:R(?:(?:(?:EE)?WA?|R)?Y|DS?|GS?|KS?|S?T)|OR(?:G(?:ES?)?|ESTS?|DS?|KS?|T)|L(?:ATS?|DS?|TS?|S)|(?:ERR|W)Y|IELDS?|ALLS?|T)|R(?:A(?:D(?:(?:I[AE])?L)?|NCH(?:ES)?|PIDS?|MP)|I(?:V(?:E?R)?|DGES?)|D(?:G[ES]?|S)?|O(?:ADS?|W)|NCHS?|U[EN]|E?ST|PDS?|VR)|H(?:A(?:RB(?:ORS?|R)?|VE?N)|I(?:(?:GH)?WA?Y|LLS?)|OL(?:LOWS?|WS?)|L(?:LW|S)?|EIGHTS?|BRS?|RBOR|WA?Y|GTS|TS?|VN)|V(?:I(?:LL(?:AG(?:ES?)?|(?:IAG)?E|G)?|A(?:DU?CT)?|S(?:TA?)?|EWS?)|L(?:GS?|YS?|LY)?|ALL(?:EYS?|Y)|STA?|DCT|WS?)|G(?:R(?:D(?:NS?|EN)|OV(?:ES?)?|EENS?|NS?|VS?)|A(?:T(?:EWA?|WA)Y|RD(?:ENS?|N))|L(?:ENS?|NS?)|TWA?Y|DNS?)|L(?:A(?:N(?:D(?:ING)?|ES?)|KES?)?|O(?:CKS?|DGE?|OPS?|AF)|I(?:GHTS?|NE)|N(?:DN?G)?|CKS?|DGE?|GTS?|KS?|F)|E(?:X(?:P(?:[WY]|R(?:ESS(?:WAY)?)?)?|T(?:(?:NS)?N|ENSIONS?|S)?)|ST(?:ATES?|S)?)|A(?:V(?:E(?:N(?:UE?)?)?|N(?:UE)?)?|L(?:L(?:E[EY]|Y)|Y)|RC(?:ADE)?|NN?E?X)|D(?:[LM]|R(?:[SV]|IV(?:ES?)?)?|IV(?:IDE)?|A(?:LE|M)|VD?)|J(?:UNCT(?:IONS?|O?N)|CT(?:ION|NS?|S)?)|I(?:S(?:L(?:ANDS?|NDS?|ES?)|S)?|NLE?T)|O(?:V(?:ERPASS|A?L)|RCH(?:A?RD)?|PAS)|W(?:A(?:L(?:KS?|L)|YS?)|ELLS?|LS?|Y)|K(?:N(?:OL(?:LS?)?|LS?)|EYS?|YS?)|U(?:N(?:(?:DERPAS)?S|IONS?)?|PAS)|X(?:ING|RD)|NE?CK)))\\s(?:(?-xism:(?i:(?:N(?:[EW]|OR(?:TH(?:-?(?:EA|WE)ST)?|D)|-[EW])?|S(?:[EW]|OUTH(?:-?(?:EA|WE)ST)?|-[EW]|UD)?|E(?:A?ST)?|O(?:EST)?|W(?:EST)?)))\\s))((?-xism:(?i:(?:N(?:[EW]|OR(?:TH(?:-?(?:EA|WE)ST)?|D)|-[EW])?|S(?:[EW]|OUTH(?:-?(?:EA|WE)ST)?|-[EW]|UD)?|E(?:A?ST)?|O(?:EST)?|W(?:EST)?)))\\s(\\w+(\\s\\w+)*))$",
/* street-type word, direction, then capture: direction + St/Saint + city */
"(?:\\b(?-xism:(?i:(?:C(?:R(?:[KT]|E(?:S(?:(?:C?EN)?T)?|CENT|EK)|S(?:(?:C?N)?T|E(?:NT)?|SI?NG)|OSS(?:ROAD|ING)|CLE?)?|O(?:R(?:NERS?|S)?|UR(?:TS?|SE)|MMON|VES?)|A(?:USE?WAY|NYO?N|MP|PE)|IR(?:C(?:L(?:ES?)?)?|S)?|EN(?:T(?:ERS?|RE?)?)?|L(?:IFFS?|FS?|U?B)|N(?:TE?R|YN)|T(?:RS?|S)?|M[NP]|URVE?|PE?|SWY|VS?|YN|K)|S(?:T(?:[NS]|R(?:[MT]|A(?:V(?:E(?:N(?:UE)?)?|N)?)?|E(?:ETS?|AM|ME)|VN(?:UE)?)?|A(?:T(?:IO)?N)?)?|H(?:O(?:A(?:LS?|RS?)|RES?)|LS?|RS?)|P(?:R(?:INGS?|NGS?)|NGS?|URS?|GS?)|Q(?:U(?:ARES?)?|R[ES]?|S)?|(?:UM(?:IT?|MI)|M)T|K(?:YWA|W)Y)|P(?:A(?:RK(?:W(?:AYS?|Y)|S)?|SS(?:AGE)?|THS?)|L(?:A(?:IN(?:E?S)?|CE|ZA)|NS?|ZA?)?|R(?:[KR]|AI?RIE|TS?)?|K(?:W(?:YS?|AY)|Y)?|O(?:INTS?|RTS?)|I(?:KES?|NES?)|NES?|SGE|TS?)|B(?:O(?:UL(?:EVARD|V)?|T(?:TO?M)?)|R(?:A?NCH|I?DGE|OOKS?|KS?|G)?|Y(?:P(?:A(?:S?S)?|S)?|U)|L(?:UF(?:FS?)?|FS?|VD)|E(?:ACH|ND)|AYO[OU]|URGS?|GS?|CH|ND|TM)|M(?:O(?:UNT(?:AINS?|IN)?|TORWAY)|N(?:T(?:AIN|NS?)?|RS?)|E(?:(?:DO)?WS|ADOWS?)|I(?:SS(?:IO)?N|LLS?)|T(?:NS?|IN|WY)?|A(?:NORS?|LL)|DWS?|S?SN|LS?)|T(?:R(?:A(?:C(?:ES?|KS?)|FFICWAY|ILS?|K)|[FW]Y|N?PK|KS?|LS?|CE)?|U(?:N(?:N(?:ELS?|L)|LS?|EL)|RNP(?:IKE|K))|ER(?:R(?:ACE)?)?|HROUGHWAY|PKE?)|F(?:R(?:(?:(?:EE)?WA?|R)?Y|DS?|GS?|KS?|S?T)|OR(?:G(?:ES?)?|ESTS?|DS?|KS?|T)|L(?:ATS?|DS?|TS?|S)|(?:ERR|W)Y|IELDS?|ALLS?|T)|R(?:A(?:D(?:(?:I[AE])?L)?|NCH(?:ES)?|PIDS?|MP)|I(?:V(?:E?R)?|DGES?)|D(?:G[ES]?|S)?|O(?:ADS?|W)|NCHS?|U[EN]|E?ST|PDS?|VR)|H(?:A(?:RB(?:ORS?|R)?|VE?N)|I(?:(?:GH)?WA?Y|LLS?)|OL(?:LOWS?|WS?)|L(?:LW|S)?|EIGHTS?|BRS?|RBOR|WA?Y|GTS|TS?|VN)|V(?:I(?:LL(?:AG(?:ES?)?|(?:IAG)?E|G)?|A(?:DU?CT)?|S(?:TA?)?|EWS?)|L(?:GS?|YS?|LY)?|ALL(?:EYS?|Y)|STA?|DCT|WS?)|G(?:R(?:D(?:NS?|EN)|OV(?:ES?)?|EENS?|NS?|VS?)|A(?:T(?:EWA?|WA)Y|RD(?:ENS?|N))|L(?:ENS?|NS?)|TWA?Y|DNS?)|L(?:A(?:N(?:D(?:ING)?|ES?)|KES?)?|O(?:CKS?|DGE?|OPS?|AF)|I(?:GHTS?|NE)|N(?:DN?G)?|CKS?|DGE?|GTS?|KS?|F)|E(?:X(?:P(?:[WY]|R(?:ESS(?:WAY)?)?)?|T(?:(?:NS)?N|ENSIONS?|S)?)|ST(?:ATES?|S)?)|A(?:V(?:E(?:N(?:UE?)?)?|N(?:UE)?)?|L(?:L(?:E[EY]|Y)|Y)|RC(?:ADE)?|NN?E?X)|D(?:[LM]|R(?:[SV]|IV(?:ES?)?)?|IV(?:IDE)?|A(?:LE|M)|VD?)|J(?:UNCT(?:IONS?|O?N)|CT(?:ION|NS?|S)?)|I(?:S(?:L(?:ANDS?|NDS?|ES?)|S)?|NLE?T)|O(?:V(?:ERPASS|A?L)|RCH(?:A?RD)?|PAS)|W(?:A(?:L(?:KS?|L)|YS?)|ELLS?|LS?|Y)|K(?:N(?:OL(?:LS?)?|LS?)|EYS?|YS?)|U(?:N(?:(?:DERPAS)?S|IONS?)?|PAS)|X(?:ING|RD)|NE?CK)))\\s(?:(?-xism:(?i:(?:N(?:[EW]|OR(?:TH(?:-?(?:EA|WE)ST)?|D)|-[EW])?|S(?:[EW]|OUTH(?:-?(?:EA|WE)ST)?|-[EW]|UD)?|E(?:A?ST)?|O(?:EST)?|W(?:EST)?)))\\s))((?-xism:(?i:(?:N(?:[EW]|OR(?:TH(?:-?(?:EA|WE)ST)?|D)|-[EW])?|S(?:[EW]|OUTH(?:-?(?:EA|WE)ST)?|-[EW]|UD)?|E(?:A?ST)?|O(?:EST)?|W(?:EST)?)))\\s(?-xism:(?i:s(?:t(?:e.?|.)?|aint)))\\s(\\w+(\\s\\w+)*))$",
/* street-type word, then capture: direction + St/Saint + city */
"(?:\\b(?-xism:(?i:(?:C(?:R(?:[KT]|E(?:S(?:(?:C?EN)?T)?|CENT|EK)|S(?:(?:C?N)?T|E(?:NT)?|SI?NG)|OSS(?:ROAD|ING)|CLE?)?|O(?:R(?:NERS?|S)?|UR(?:TS?|SE)|MMON|VES?)|A(?:USE?WAY|NYO?N|MP|PE)|IR(?:C(?:L(?:ES?)?)?|S)?|EN(?:T(?:ERS?|RE?)?)?|L(?:IFFS?|FS?|U?B)|N(?:TE?R|YN)|T(?:RS?|S)?|M[NP]|URVE?|PE?|SWY|VS?|YN|K)|S(?:T(?:[NS]|R(?:[MT]|A(?:V(?:E(?:N(?:UE)?)?|N)?)?|E(?:ETS?|AM|ME)|VN(?:UE)?)?|A(?:T(?:IO)?N)?)?|H(?:O(?:A(?:LS?|RS?)|RES?)|LS?|RS?)|P(?:R(?:INGS?|NGS?)|NGS?|URS?|GS?)|Q(?:U(?:ARES?)?|R[ES]?|S)?|(?:UM(?:IT?|MI)|M)T|K(?:YWA|W)Y)|P(?:A(?:RK(?:W(?:AYS?|Y)|S)?|SS(?:AGE)?|THS?)|L(?:A(?:IN(?:E?S)?|CE|ZA)|NS?|ZA?)?|R(?:[KR]|AI?RIE|TS?)?|K(?:W(?:YS?|AY)|Y)?|O(?:INTS?|RTS?)|I(?:KES?|NES?)|NES?|SGE|TS?)|B(?:O(?:UL(?:EVARD|V)?|T(?:TO?M)?)|R(?:A?NCH|I?DGE|OOKS?|KS?|G)?|Y(?:P(?:A(?:S?S)?|S)?|U)|L(?:UF(?:FS?)?|FS?|VD)|E(?:ACH|ND)|AYO[OU]|URGS?|GS?|CH|ND|TM)|M(?:O(?:UNT(?:AINS?|IN)?|TORWAY)|N(?:T(?:AIN|NS?)?|RS?)|E(?:(?:DO)?WS|ADOWS?)|I(?:SS(?:IO)?N|LLS?)|T(?:NS?|IN|WY)?|A(?:NORS?|LL)|DWS?|S?SN|LS?)|T(?:R(?:A(?:C(?:ES?|KS?)|FFICWAY|ILS?|K)|[FW]Y|N?PK|KS?|LS?|CE)?|U(?:N(?:N(?:ELS?|L)|LS?|EL)|RNP(?:IKE|K))|ER(?:R(?:ACE)?)?|HROUGHWAY|PKE?)|F(?:R(?:(?:(?:EE)?WA?|R)?Y|DS?|GS?|KS?|S?T)|OR(?:G(?:ES?)?|ESTS?|DS?|KS?|T)|L(?:ATS?|DS?|TS?|S)|(?:ERR|W)Y|IELDS?|ALLS?|T)|R(?:A(?:D(?:(?:I[AE])?L)?|NCH(?:ES)?|PIDS?|MP)|I(?:V(?:E?R)?|DGES?)|D(?:G[ES]?|S)?|O(?:ADS?|W)|NCHS?|U[EN]|E?ST|PDS?|VR)|H(?:A(?:RB(?:ORS?|R)?|VE?N)|I(?:(?:GH)?WA?Y|LLS?)|OL(?:LOWS?|WS?)|L(?:LW|S)?|EIGHTS?|BRS?|RBOR|WA?Y|GTS|TS?|VN)|V(?:I(?:LL(?:AG(?:ES?)?|(?:IAG)?E|G)?|A(?:DU?CT)?|S(?:TA?)?|EWS?)|L(?:GS?|YS?|LY)?|ALL(?:EYS?|Y)|STA?|DCT|WS?)|G(?:R(?:D(?:NS?|EN)|OV(?:ES?)?|EENS?|NS?|VS?)|A(?:T(?:EWA?|WA)Y|RD(?:ENS?|N))|L(?:ENS?|NS?)|TWA?Y|DNS?)|L(?:A(?:N(?:D(?:ING)?|ES?)|KES?)?|O(?:CKS?|DGE?|OPS?|AF)|I(?:GHTS?|NE)|N(?:DN?G)?|CKS?|DGE?|GTS?|KS?|F)|E(?:X(?:P(?:[WY]|R(?:ESS(?:WAY)?)?)?|T(?:(?:NS)?N|ENSIONS?|S)?)|ST(?:ATES?|S)?)|A(?:V(?:E(?:N(?:UE?)?)?|N(?:UE)?)?|L(?:L(?:E[EY]|Y)|Y)|RC(?:ADE)?|NN?E?X)|D(?:[LM]|R(?:[SV]|IV(?:ES?)?)?|IV(?:IDE)?|A(?:LE|M)|VD?)|J(?:UNCT(?:IONS?|O?N)|CT(?:ION|NS?|S)?)|I(?:S(?:L(?:ANDS?|NDS?|ES?)|S)?|NLE?T)|O(?:V(?:ERPASS|A?L)|RCH(?:A?RD)?|PAS)|W(?:A(?:L(?:KS?|L)|YS?)|ELLS?|LS?|Y)|K(?:N(?:OL(?:LS?)?|LS?)|EYS?|YS?)|U(?:N(?:(?:DERPAS)?S|IONS?)?|PAS)|X(?:ING|RD)|NE?CK)))\\s)((?-xism:(?i:(?:N(?:[EW]|OR(?:TH(?:-?(?:EA|WE)ST)?|D)|-[EW])?|S(?:[EW]|OUTH(?:-?(?:EA|WE)ST)?|-[EW]|UD)?|E(?:A?ST)?|O(?:EST)?|W(?:EST)?)))\\s(?-xism:(?i:s(?:t(?:e.?|.)?|aint)))\\s(\\w+(\\s\\w+)*))$",
/* street-type word, then capture: St/Saint + city */
"(?:\\b(?-xism:(?i:(?:C(?:R(?:[KT]|E(?:S(?:(?:C?EN)?T)?|CENT|EK)|S(?:(?:C?N)?T|E(?:NT)?|SI?NG)|OSS(?:ROAD|ING)|CLE?)?|O(?:R(?:NERS?|S)?|UR(?:TS?|SE)|MMON|VES?)|A(?:USE?WAY|NYO?N|MP|PE)|IR(?:C(?:L(?:ES?)?)?|S)?|EN(?:T(?:ERS?|RE?)?)?|L(?:IFFS?|FS?|U?B)|N(?:TE?R|YN)|T(?:RS?|S)?|M[NP]|URVE?|PE?|SWY|VS?|YN|K)|S(?:T(?:[NS]|R(?:[MT]|A(?:V(?:E(?:N(?:UE)?)?|N)?)?|E(?:ETS?|AM|ME)|VN(?:UE)?)?|A(?:T(?:IO)?N)?)?|H(?:O(?:A(?:LS?|RS?)|RES?)|LS?|RS?)|P(?:R(?:INGS?|NGS?)|NGS?|URS?|GS?)|Q(?:U(?:ARES?)?|R[ES]?|S)?|(?:UM(?:IT?|MI)|M)T|K(?:YWA|W)Y)|P(?:A(?:RK(?:W(?:AYS?|Y)|S)?|SS(?:AGE)?|THS?)|L(?:A(?:IN(?:E?S)?|CE|ZA)|NS?|ZA?)?|R(?:[KR]|AI?RIE|TS?)?|K(?:W(?:YS?|AY)|Y)?|O(?:INTS?|RTS?)|I(?:KES?|NES?)|NES?|SGE|TS?)|B(?:O(?:UL(?:EVARD|V)?|T(?:TO?M)?)|R(?:A?NCH|I?DGE|OOKS?|KS?|G)?|Y(?:P(?:A(?:S?S)?|S)?|U)|L(?:UF(?:FS?)?|FS?|VD)|E(?:ACH|ND)|AYO[OU]|URGS?|GS?|CH|ND|TM)|M(?:O(?:UNT(?:AINS?|IN)?|TORWAY)|N(?:T(?:AIN|NS?)?|RS?)|E(?:(?:DO)?WS|ADOWS?)|I(?:SS(?:IO)?N|LLS?)|T(?:NS?|IN|WY)?|A(?:NORS?|LL)|DWS?|S?SN|LS?)|T(?:R(?:A(?:C(?:ES?|KS?)|FFICWAY|ILS?|K)|[FW]Y|N?PK|KS?|LS?|CE)?|U(?:N(?:N(?:ELS?|L)|LS?|EL)|RNP(?:IKE|K))|ER(?:R(?:ACE)?)?|HROUGHWAY|PKE?)|F(?:R(?:(?:(?:EE)?WA?|R)?Y|DS?|GS?|KS?|S?T)|OR(?:G(?:ES?)?|ESTS?|DS?|KS?|T)|L(?:ATS?|DS?|TS?|S)|(?:ERR|W)Y|IELDS?|ALLS?|T)|R(?:A(?:D(?:(?:I[AE])?L)?|NCH(?:ES)?|PIDS?|MP)|I(?:V(?:E?R)?|DGES?)|D(?:G[ES]?|S)?|O(?:ADS?|W)|NCHS?|U[EN]|E?ST|PDS?|VR)|H(?:A(?:RB(?:ORS?|R)?|VE?N)|I(?:(?:GH)?WA?Y|LLS?)|OL(?:LOWS?|WS?)|L(?:LW|S)?|EIGHTS?|BRS?|RBOR|WA?Y|GTS|TS?|VN)|V(?:I(?:LL(?:AG(?:ES?)?|(?:IAG)?E|G)?|A(?:DU?CT)?|S(?:TA?)?|EWS?)|L(?:GS?|YS?|LY)?|ALL(?:EYS?|Y)|STA?|DCT|WS?)|G(?:R(?:D(?:NS?|EN)|OV(?:ES?)?|EENS?|NS?|VS?)|A(?:T(?:EWA?|WA)Y|RD(?:ENS?|N))|L(?:ENS?|NS?)|TWA?Y|DNS?)|L(?:A(?:N(?:D(?:ING)?|ES?)|KES?)?|O(?:CKS?|DGE?|OPS?|AF)|I(?:GHTS?|NE)|N(?:DN?G)?|CKS?|DGE?|GTS?|KS?|F)|E(?:X(?:P(?:[WY]|R(?:ESS(?:WAY)?)?)?|T(?:(?:NS)?N|ENSIONS?|S)?)|ST(?:ATES?|S)?)|A(?:V(?:E(?:N(?:UE?)?)?|N(?:UE)?)?|L(?:L(?:E[EY]|Y)|Y)|RC(?:ADE)?|NN?E?X)|D(?:[LM]|R(?:[SV]|IV(?:ES?)?)?|IV(?:IDE)?|A(?:LE|M)|VD?)|J(?:UNCT(?:IONS?|O?N)|CT(?:ION|NS?|S)?)|I(?:S(?:L(?:ANDS?|NDS?|ES?)|S)?|NLE?T)|O(?:V(?:ERPASS|A?L)|RCH(?:A?RD)?|PAS)|W(?:A(?:L(?:KS?|L)|YS?)|ELLS?|LS?|Y)|K(?:N(?:OL(?:LS?)?|LS?)|EYS?|YS?)|U(?:N(?:(?:DERPAS)?S|IONS?)?|PAS)|X(?:ING|RD)|NE?CK)))\\s)((?-xism:(?i:s(?:t(?:e.?|.)?|aint)))\\s(\\w+(\\s\\w+)*))$",
/* street-type word, then capture: direction + city */
"(?:\\b(?-xism:(?i:(?:C(?:R(?:[KT]|E(?:S(?:(?:C?EN)?T)?|CENT|EK)|S(?:(?:C?N)?T|E(?:NT)?|SI?NG)|OSS(?:ROAD|ING)|CLE?)?|O(?:R(?:NERS?|S)?|UR(?:TS?|SE)|MMON|VES?)|A(?:USE?WAY|NYO?N|MP|PE)|IR(?:C(?:L(?:ES?)?)?|S)?|EN(?:T(?:ERS?|RE?)?)?|L(?:IFFS?|FS?|U?B)|N(?:TE?R|YN)|T(?:RS?|S)?|M[NP]|URVE?|PE?|SWY|VS?|YN|K)|S(?:T(?:[NS]|R(?:[MT]|A(?:V(?:E(?:N(?:UE)?)?|N)?)?|E(?:ETS?|AM|ME)|VN(?:UE)?)?|A(?:T(?:IO)?N)?)?|H(?:O(?:A(?:LS?|RS?)|RES?)|LS?|RS?)|P(?:R(?:INGS?|NGS?)|NGS?|URS?|GS?)|Q(?:U(?:ARES?)?|R[ES]?|S)?|(?:UM(?:IT?|MI)|M)T|K(?:YWA|W)Y)|P(?:A(?:RK(?:W(?:AYS?|Y)|S)?|SS(?:AGE)?|THS?)|L(?:A(?:IN(?:E?S)?|CE|ZA)|NS?|ZA?)?|R(?:[KR]|AI?RIE|TS?)?|K(?:W(?:YS?|AY)|Y)?|O(?:INTS?|RTS?)|I(?:KES?|NES?)|NES?|SGE|TS?)|B(?:O(?:UL(?:EVARD|V)?|T(?:TO?M)?)|R(?:A?NCH|I?DGE|OOKS?|KS?|G)?|Y(?:P(?:A(?:S?S)?|S)?|U)|L(?:UF(?:FS?)?|FS?|VD)|E(?:ACH|ND)|AYO[OU]|URGS?|GS?|CH|ND|TM)|M(?:O(?:UNT(?:AINS?|IN)?|TORWAY)|N(?:T(?:AIN|NS?)?|RS?)|E(?:(?:DO)?WS|ADOWS?)|I(?:SS(?:IO)?N|LLS?)|T(?:NS?|IN|WY)?|A(?:NORS?|LL)|DWS?|S?SN|LS?)|T(?:R(?:A(?:C(?:ES?|KS?)|FFICWAY|ILS?|K)|[FW]Y|N?PK|KS?|LS?|CE)?|U(?:N(?:N(?:ELS?|L)|LS?|EL)|RNP(?:IKE|K))|ER(?:R(?:ACE)?)?|HROUGHWAY|PKE?)|F(?:R(?:(?:(?:EE)?WA?|R)?Y|DS?|GS?|KS?|S?T)|OR(?:G(?:ES?)?|ESTS?|DS?|KS?|T)|L(?:ATS?|DS?|TS?|S)|(?:ERR|W)Y|IELDS?|ALLS?|T)|R(?:A(?:D(?:(?:I[AE])?L)?|NCH(?:ES)?|PIDS?|MP)|I(?:V(?:E?R)?|DGES?)|D(?:G[ES]?|S)?|O(?:ADS?|W)|NCHS?|U[EN]|E?ST|PDS?|VR)|H(?:A(?:RB(?:ORS?|R)?|VE?N)|I(?:(?:GH)?WA?Y|LLS?)|OL(?:LOWS?|WS?)|L(?:LW|S)?|EIGHTS?|BRS?|RBOR|WA?Y|GTS|TS?|VN)|V(?:I(?:LL(?:AG(?:ES?)?|(?:IAG)?E|G)?|A(?:DU?CT)?|S(?:TA?)?|EWS?)|L(?:GS?|YS?|LY)?|ALL(?:EYS?|Y)|STA?|DCT|WS?)|G(?:R(?:D(?:NS?|EN)|OV(?:ES?)?|EENS?|NS?|VS?)|A(?:T(?:EWA?|WA)Y|RD(?:ENS?|N))|L(?:ENS?|NS?)|TWA?Y|DNS?)|L(?:A(?:N(?:D(?:ING)?|ES?)|KES?)?|O(?:CKS?|DGE?|OPS?|AF)|I(?:GHTS?|NE)|N(?:DN?G)?|CKS?|DGE?|GTS?|KS?|F)|E(?:X(?:P(?:[WY]|R(?:ESS(?:WAY)?)?)?|T(?:(?:NS)?N|ENSIONS?|S)?)|ST(?:ATES?|S)?)|A(?:V(?:E(?:N(?:UE?)?)?|N(?:UE)?)?|L(?:L(?:E[EY]|Y)|Y)|RC(?:ADE)?|NN?E?X)|D(?:[LM]|R(?:[SV]|IV(?:ES?)?)?|IV(?:IDE)?|A(?:LE|M)|VD?)|J(?:UNCT(?:IONS?|O?N)|CT(?:ION|NS?|S)?)|I(?:S(?:L(?:ANDS?|NDS?|ES?)|S)?|NLE?T)|O(?:V(?:ERPASS|A?L)|RCH(?:A?RD)?|PAS)|W(?:A(?:L(?:KS?|L)|YS?)|ELLS?|LS?|Y)|K(?:N(?:OL(?:LS?)?|LS?)|EYS?|YS?)|U(?:N(?:(?:DERPAS)?S|IONS?)?|PAS)|X(?:ING|RD)|NE?CK)))\\s)((?-xism:(?i:(?:N(?:[EW]|OR(?:TH(?:-?(?:EA|WE)ST)?|D)|-[EW])?|S(?:[EW]|OUTH(?:-?(?:EA|WE)ST)?|-[EW]|UD)?|E(?:A?ST)?|O(?:EST)?|W(?:EST)?)))\\s(\\w+(\\s\\w+)*))$",
/* street-type word, then capture: city words only */
"(?:\\b(?-xism:(?i:(?:C(?:R(?:[KT]|E(?:S(?:(?:C?EN)?T)?|CENT|EK)|S(?:(?:C?N)?T|E(?:NT)?|SI?NG)|OSS(?:ROAD|ING)|CLE?)?|O(?:R(?:NERS?|S)?|UR(?:TS?|SE)|MMON|VES?)|A(?:USE?WAY|NYO?N|MP|PE)|IR(?:C(?:L(?:ES?)?)?|S)?|EN(?:T(?:ERS?|RE?)?)?|L(?:IFFS?|FS?|U?B)|N(?:TE?R|YN)|T(?:RS?|S)?|M[NP]|URVE?|PE?|SWY|VS?|YN|K)|S(?:T(?:[NS]|R(?:[MT]|A(?:V(?:E(?:N(?:UE)?)?|N)?)?|E(?:ETS?|AM|ME)|VN(?:UE)?)?|A(?:T(?:IO)?N)?)?|H(?:O(?:A(?:LS?|RS?)|RES?)|LS?|RS?)|P(?:R(?:INGS?|NGS?)|NGS?|URS?|GS?)|Q(?:U(?:ARES?)?|R[ES]?|S)?|(?:UM(?:IT?|MI)|M)T|K(?:YWA|W)Y)|P(?:A(?:RK(?:W(?:AYS?|Y)|S)?|SS(?:AGE)?|THS?)|L(?:A(?:IN(?:E?S)?|CE|ZA)|NS?|ZA?)?|R(?:[KR]|AI?RIE|TS?)?|K(?:W(?:YS?|AY)|Y)?|O(?:INTS?|RTS?)|I(?:KES?|NES?)|NES?|SGE|TS?)|B(?:O(?:UL(?:EVARD|V)?|T(?:TO?M)?)|R(?:A?NCH|I?DGE|OOKS?|KS?|G)?|Y(?:P(?:A(?:S?S)?|S)?|U)|L(?:UF(?:FS?)?|FS?|VD)|E(?:ACH|ND)|AYO[OU]|URGS?|GS?|CH|ND|TM)|M(?:O(?:UNT(?:AINS?|IN)?|TORWAY)|N(?:T(?:AIN|NS?)?|RS?)|E(?:(?:DO)?WS|ADOWS?)|I(?:SS(?:IO)?N|LLS?)|T(?:NS?|IN|WY)?|A(?:NORS?|LL)|DWS?|S?SN|LS?)|T(?:R(?:A(?:C(?:ES?|KS?)|FFICWAY|ILS?|K)|[FW]Y|N?PK|KS?|LS?|CE)?|U(?:N(?:N(?:ELS?|L)|LS?|EL)|RNP(?:IKE|K))|ER(?:R(?:ACE)?)?|HROUGHWAY|PKE?)|F(?:R(?:(?:(?:EE)?WA?|R)?Y|DS?|GS?|KS?|S?T)|OR(?:G(?:ES?)?|ESTS?|DS?|KS?|T)|L(?:ATS?|DS?|TS?|S)|(?:ERR|W)Y|IELDS?|ALLS?|T)|R(?:A(?:D(?:(?:I[AE])?L)?|NCH(?:ES)?|PIDS?|MP)|I(?:V(?:E?R)?|DGES?)|D(?:G[ES]?|S)?|O(?:ADS?|W)|NCHS?|U[EN]|E?ST|PDS?|VR)|H(?:A(?:RB(?:ORS?|R)?|VE?N)|I(?:(?:GH)?WA?Y|LLS?)|OL(?:LOWS?|WS?)|L(?:LW|S)?|EIGHTS?|BRS?|RBOR|WA?Y|GTS|TS?|VN)|V(?:I(?:LL(?:AG(?:ES?)?|(?:IAG)?E|G)?|A(?:DU?CT)?|S(?:TA?)?|EWS?)|L(?:GS?|YS?|LY)?|ALL(?:EYS?|Y)|STA?|DCT|WS?)|G(?:R(?:D(?:NS?|EN)|OV(?:ES?)?|EENS?|NS?|VS?)|A(?:T(?:EWA?|WA)Y|RD(?:ENS?|N))|L(?:ENS?|NS?)|TWA?Y|DNS?)|L(?:A(?:N(?:D(?:ING)?|ES?)|KES?)?|O(?:CKS?|DGE?|OPS?|AF)|I(?:GHTS?|NE)|N(?:DN?G)?|CKS?|DGE?|GTS?|KS?|F)|E(?:X(?:P(?:[WY]|R(?:ESS(?:WAY)?)?)?|T(?:(?:NS)?N|ENSIONS?|S)?)|ST(?:ATES?|S)?)|A(?:V(?:E(?:N(?:UE?)?)?|N(?:UE)?)?|L(?:L(?:E[EY]|Y)|Y)|RC(?:ADE)?|NN?E?X)|D(?:[LM]|R(?:[SV]|IV(?:ES?)?)?|IV(?:IDE)?|A(?:LE|M)|VD?)|J(?:UNCT(?:IONS?|O?N)|CT(?:ION|NS?|S)?)|I(?:S(?:L(?:ANDS?|NDS?|ES?)|S)?|NLE?T)|O(?:V(?:ERPASS|A?L)|RCH(?:A?RD)?|PAS)|W(?:A(?:L(?:KS?|L)|YS?)|ELLS?|LS?|Y)|K(?:N(?:OL(?:LS?)?|LS?)|EYS?|YS?)|U(?:N(?:(?:DERPAS)?S|IONS?)?|PAS)|X(?:ING|RD)|NE?CK)))\\s)((\\w+(\\s\\w+)*))$",
/* any space, then capture: direction + city words */
"(?:\\s)((?-xism:(?i:(?:N(?:[EW]|OR(?:TH(?:-?(?:EA|WE)ST)?|D)|-[EW])?|S(?:[EW]|OUTH(?:-?(?:EA|WE)ST)?|-[EW]|UD)?|E(?:A?ST)?|O(?:EST)?|W(?:EST)?)))\\s(\\w+(\\s\\w+)*))$",
/* whole line is "number word street-type": nothing left to capture as city */
"^(?:\\d+\\s(?:(?:\\w+\\s)(?-xism:(?i:(?:C(?:R(?:[KT]|E(?:S(?:(?:C?EN)?T)?|CENT|EK)|S(?:(?:C?N)?T|E(?:NT)?|SI?NG)|OSS(?:ROAD|ING)|CLE?)?|O(?:R(?:NERS?|S)?|UR(?:TS?|SE)|MMON|VES?)|A(?:USE?WAY|NYO?N|MP|PE)|IR(?:C(?:L(?:ES?)?)?|S)?|EN(?:T(?:ERS?|RE?)?)?|L(?:IFFS?|FS?|U?B)|N(?:TE?R|YN)|T(?:RS?|S)?|M[NP]|URVE?|PE?|SWY|VS?|YN|K)|S(?:T(?:[NS]|R(?:[MT]|A(?:V(?:E(?:N(?:UE)?)?|N)?)?|E(?:ETS?|AM|ME)|VN(?:UE)?)?|A(?:T(?:IO)?N)?)?|H(?:O(?:A(?:LS?|RS?)|RES?)|LS?|RS?)|P(?:R(?:INGS?|NGS?)|NGS?|URS?|GS?)|Q(?:U(?:ARES?)?|R[ES]?|S)?|(?:UM(?:IT?|MI)|M)T|K(?:YWA|W)Y)|P(?:A(?:RK(?:W(?:AYS?|Y)|S)?|SS(?:AGE)?|THS?)|L(?:A(?:IN(?:E?S)?|CE|ZA)|NS?|ZA?)?|R(?:[KR]|AI?RIE|TS?)?|K(?:W(?:YS?|AY)|Y)?|O(?:INTS?|RTS?)|I(?:KES?|NES?)|NES?|SGE|TS?)|B(?:O(?:UL(?:EVARD|V)?|T(?:TO?M)?)|R(?:A?NCH|I?DGE|OOKS?|KS?|G)?|Y(?:P(?:A(?:S?S)?|S)?|U)|L(?:UF(?:FS?)?|FS?|VD)|E(?:ACH|ND)|AYO[OU]|URGS?|GS?|CH|ND|TM)|M(?:O(?:UNT(?:AINS?|IN)?|TORWAY)|N(?:T(?:AIN|NS?)?|RS?)|E(?:(?:DO)?WS|ADOWS?)|I(?:SS(?:IO)?N|LLS?)|T(?:NS?|IN|WY)?|A(?:NORS?|LL)|DWS?|S?SN|LS?)|T(?:R(?:A(?:C(?:ES?|KS?)|FFICWAY|ILS?|K)|[FW]Y|N?PK|KS?|LS?|CE)?|U(?:N(?:N(?:ELS?|L)|LS?|EL)|RNP(?:IKE|K))|ER(?:R(?:ACE)?)?|HROUGHWAY|PKE?)|F(?:R(?:(?:(?:EE)?WA?|R)?Y|DS?|GS?|KS?|S?T)|OR(?:G(?:ES?)?|ESTS?|DS?|KS?|T)|L(?:ATS?|DS?|TS?|S)|(?:ERR|W)Y|IELDS?|ALLS?|T)|R(?:A(?:D(?:(?:I[AE])?L)?|NCH(?:ES)?|PIDS?|MP)|I(?:V(?:E?R)?|DGES?)|D(?:G[ES]?|S)?|O(?:ADS?|W)|NCHS?|U[EN]|E?ST|PDS?|VR)|H(?:A(?:RB(?:ORS?|R)?|VE?N)|I(?:(?:GH)?WA?Y|LLS?)|OL(?:LOWS?|WS?)|L(?:LW|S)?|EIGHTS?|BRS?|RBOR|WA?Y|GTS|TS?|VN)|V(?:I(?:LL(?:AG(?:ES?)?|(?:IAG)?E|G)?|A(?:DU?CT)?|S(?:TA?)?|EWS?)|L(?:GS?|YS?|LY)?|ALL(?:EYS?|Y)|STA?|DCT|WS?)|G(?:R(?:D(?:NS?|EN)|OV(?:ES?)?|EENS?|NS?|VS?)|A(?:T(?:EWA?|WA)Y|RD(?:ENS?|N))|L(?:ENS?|NS?)|TWA?Y|DNS?)|L(?:A(?:N(?:D(?:ING)?|ES?)|KES?)?|O(?:CKS?|DGE?|OPS?|AF)|I(?:GHTS?|NE)|N(?:DN?G)?|CKS?|DGE?|GTS?|KS?|F)|E(?:X(?:P(?:[WY]|R(?:ESS(?:WAY)?)?)?|T(?:(?:NS)?N|ENSIONS?|S)?)|ST(?:ATES?|S)?)|A(?:V(?:E(?:N(?:UE?)?)?|N(?:UE)?)?|L(?:L(?:E[EY]|Y)|Y)|RC(?:ADE)?|NN?E?X)|D(?:[LM]|R(?:[SV]|IV(?:ES?)?)?|IV(?:IDE)?|A(?:LE|M)|VD?)|J(?:UNCT(?:IONS?|O?N)|CT(?:ION|NS?|S)?)|I(?:S(?:L(?:ANDS?|NDS?|ES?)|S)?|NLE?T)|O(?:V(?:ERPASS|A?L)|RCH(?:A?RD)?|PAS)|W(?:A(?:L(?:KS?|L)|YS?)|ELLS?|LS?|Y)|K(?:N(?:OL(?:LS?)?|LS?)|EYS?|YS?)|U(?:N(?:(?:DERPAS)?S|IONS?)?|PAS)|X(?:ING|RD)|NE?CK)))))()$",
/* fallback: "number words... lastword" -- capture the last word */
"^(?:\\d+\\s(?:(?:\\w+\\s)*\\w+\\s))(\\w+)$"
};
|
5346
extras/address_standardizer/parseaddress-stcities.h
Normal file
5346
extras/address_standardizer/parseaddress-stcities.h
Normal file
File diff suppressed because it is too large
Load diff
4368
extras/address_standardizer/rules.txt
Normal file
4368
extras/address_standardizer/rules.txt
Normal file
File diff suppressed because it is too large
Load diff
692
extras/address_standardizer/standard.c
Normal file
692
extras/address_standardizer/standard.c
Normal file
|
@ -0,0 +1,692 @@
|
|||
/* -- standard.c

interface for the standardizer

Prototype 7H08 (This file was written by Walter Sinclair).

This file is part of PAGC.

Copyright (c) 2009 Walter Bruce Sinclair

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

*/

/*-- For pagc-0.4.2 : last revised 2012-07-18 --*/

#undef DEBUG
//#define DEBUG 1

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "pagc_api.h"
#ifdef BUILD_API
#include "pagc_std_api.h"
#endif

#define GAZ_LEXICON

/* -- local prototypes -- */
/*-- <revision date='2012-07-22'> Keep track of start_state </revision> --*/
static int _Close_Stand_Field_(STAND_PARAM *) ;
static int _Scan_String_(STAND_PARAM *, char *) ;
static char * _Scan_Next_(STAND_PARAM *, char *) ;

/* Characters treated as type-2 (spacing) terminators by _Scan_Next_. */
static char __spacer__[] = " \\-.)}>_" ;

/* The macros below are used only inside _Scan_Next_ and rely on its locals
   (__dest__, __src__, __scan_buf__, __stand_param__, a, b, n). */

/* Terminate the scan buffer and record the collected length in n. */
#define TERM_AND_LENGTH \
	*__dest__ = SENTINEL ; \
	n = strlen(__scan_buf__)

/* Emit the buffered characters as a morpheme of the given token type.
   NOTE: expands to return statements of the ENCLOSING function --
   NULL on allocation failure, otherwise the updated source pointer. */
#define RETURN_NEW_MORPH(TOKEN_ARG) \
	if (!new_morph(__stand_param__,TOKEN_ARG,__scan_buf__,n))\
	{\
		return NULL ; \
	} \
	return __src__

/* Append the current char and the lookahead char, advancing the input twice. */
#define COLLECT_LOOKAHEAD \
	*__dest__++ = a ; __src__++ ; *__dest__++ = b ; __src__++

/* Copy input chars into the buffer while COND holds. */
#define COLLECT_WHILE(COND) \
	do { *__dest__++ = a ; __src__++ ; a = *__src__ ; } while (COND)

/* Advance past a run of chars while COND holds, without copying them. */
#define NO_COLLECT_WHILE(COND) \
	do { __dest__++ ; __src__++ ; a = *__src__ ; } while (COND)

/* If the ordinal suffix (NEXT_LOW/NEXT_UP) matches digit N -- and the number
   does not end in 11/12/13, which take "th" -- emit an ordinal morpheme.
   Otherwise fall through to the trailing `break` of the enclosing switch. */
#define TEST_FOR_ORD_DIGIT(N,NEXT_LOW,NEXT_UP) \
	if ((b == NEXT_LOW) || (b == NEXT_UP)) \
	{ \
		if (last_digit == N)\
		{ \
			if ((n < 2 ) || (*(__dest__-2) != '1')) \
			{ \
				COLLECT_LOOKAHEAD ; \
				TERM_AND_LENGTH ; \
				RETURN_NEW_MORPH(DORD) ; \
			} \
		} \
	} \
	break
||||
|
||||
|
||||
/*========================================================================
|
||||
<summary>
|
||||
<function name='standard.c (standardize_field)'/>
|
||||
<remarks>This function is called with a pointer to the
|
||||
str to standardize and a start state indicating
|
||||
the kind of standardization to perform. It invokes
|
||||
the scanner to start the creation of the morphemes
|
||||
<calls><functionref='tokenize.c (initialize_morphs)'/></calls>
|
||||
<calls><functionref='_Close_Stand_Field_s'/></calls>
|
||||
<calls><functionref='_Scan_String_'/></calls>
|
||||
</summary>
|
||||
=========================================================================*/
|
||||
int standardize_field(STAND_PARAM *__stand_param__ ,char *__in_str__ , int client_start_state )
|
||||
{
|
||||
/*-- <revision date='2009-08-13'> Support multiple lexicons </revision> --*/
|
||||
/*-- <revision date='2012-06-01'> Add gaz_lexicon to be triggered on start_state= MACRO </revision> --*/
|
||||
__stand_param__->lexicon = __stand_param__->address_lexicon ;
|
||||
if (client_start_state > EXTRA_STATE)
|
||||
{
|
||||
__stand_param__->lexicon = __stand_param__->poi_lexicon ;
|
||||
}
|
||||
#ifdef GAZ_LEXICON
|
||||
else
|
||||
{
|
||||
if (client_start_state == MACRO)
|
||||
{
|
||||
__stand_param__->lexicon = __stand_param__->gaz_lexicon ;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
/*-- <revision date='2012-07-22'> Keep track of start_state </revision> --*/
|
||||
__stand_param__->start_state = client_start_state ;
|
||||
initialize_morphs(__stand_param__) ;
|
||||
if (!_Scan_String_(__stand_param__,__in_str__))
|
||||
{
|
||||
return FALSE ;
|
||||
}
|
||||
/*-- <revision date='2012-07-22'> Keep track of start_state </revision> --*/
|
||||
return (_Close_Stand_Field_(__stand_param__)) ;
|
||||
}
|
||||
|
||||
/* Walk the input string token by token. On end-of-line or sentinel, hand
   the collected tokens to process_input() and return its result; return
   FALSE if _Scan_Next_ reports an error (NULL). */
static int _Scan_String_(STAND_PARAM *__stand_param__ ,char *__in_str__ )
{
	char *__cursor__ ;
	for (__cursor__ = __in_str__ ; __cursor__ != NULL ; __cursor__ = _Scan_Next_(__stand_param__,__cursor__))
	{
		char current = *__cursor__ ;
		/*-- Done scanning: process the accumulated tokens. --*/
		if ((current == '\n') || (current == SENTINEL))
		{
			return (process_input(__stand_param__)) ;
		}
	}
	/*-- _Scan_Next_ returned NULL: propagate the failure. --*/
	return FALSE ;
}
|
||||
|
||||
/* Collect the next token starting at __in_ptr__ into a scan buffer and emit
   it as a morpheme (number, ordinal, fraction, word, single/double letter)
   or as a terminator. Returns a pointer to the next unconsumed input char,
   or NULL on allocation failure inside new_morph() (via RETURN_NEW_MORPH). */
static char * _Scan_Next_( STAND_PARAM *__stand_param__,char * __in_ptr__)
{
	int n ;
	char __scan_buf__[MAXSTRLEN] ;

	/*-- <remarks> Collect a sequence of characters into the scan_buf </remarks> --*/

	char *__src__ = __in_ptr__ ;
	char a = *__src__ ;
	char *__dest__ = __scan_buf__ ;
	*__dest__ = SENTINEL ;

	/*-- <remarks> Type one terminators </remarks> --*/
	if ((a == ',') || (a == '\t') || (a == ';'))
	{
		*__dest__++ = a ;
		*__dest__ = SENTINEL;
		set_term(__stand_param__,1,__scan_buf__) ;
		/*-- <remarks> Point to next input char </remarks> --*/
		return (__src__ + 1) ;
	}
	/*-- <remarks> Numeric sequences : ordinals, fractions and numbers </remarks> --*/
	/* NOTE(review): isdigit/isalpha receive a plain char here; behavior is
	   undefined for negative (non-ASCII) bytes -- confirm input is ASCII. */
	if (isdigit(a))
	{
		char b ;
		char last_digit ;

		COLLECT_WHILE(isdigit(a)) ;
		/*-- <remarks> Get a character of lookahead and one of lookbehind </remarks> --*/
		b = *(__src__ + 1 ) ;
		last_digit = *(__dest__ - 1 ) ; /*-- last digit collected --*/
		n = __dest__ - __scan_buf__ ;
		switch (a)
		{
		/*-- <remarks> Fractions </remarks> --*/
		case '/' :
			/*-- <remarks> Collect the rest of the fraction: only 1/2, 1/3,
			     2/3, 1/4, 3/4 are recognized </remarks> --*/
			if (isdigit(b))
			{
				switch (b)
				{
				case '2' :
					if (last_digit == '1')
					{
						COLLECT_LOOKAHEAD ;
						TERM_AND_LENGTH ;
						RETURN_NEW_MORPH(DFRACT) ;
					}
					break ;
				case '3' :
					if ((last_digit == '1') || (last_digit == '2'))
					{
						COLLECT_LOOKAHEAD ;
						TERM_AND_LENGTH ;
						RETURN_NEW_MORPH(DFRACT) ;
					}
					break ;
				case '4' :
					if ((last_digit == '1') || (last_digit == '3'))
					{
						COLLECT_LOOKAHEAD ;
						TERM_AND_LENGTH ;
						RETURN_NEW_MORPH(DFRACT) ;
					}
					break ;
				} /*-- <remarks> end of switch on lookahead </remarks> --*/
			}
			break ;
		/*-- <remarks> ordinals: each TEST_FOR_ORD_DIGIT either returns a
		     DORD morpheme or breaks out of this switch </remarks> --*/
		case 's' : case 'S' :
			/*-- <remarks> 1st, 21st, 31st, -- for 1 </remarks> --*/
			TEST_FOR_ORD_DIGIT('1','t','T') ;
		case 'r' : case 'R' :
			/*-- <remarks> 3rd, 23rd, 33rd, -- for 3 </remarks> --*/
			TEST_FOR_ORD_DIGIT('3','d','D') ;
		case 'n' : case 'N' :
			/*-- <remarks> 2nd, 22nd, 32nd, -- for 2 </remarks> --*/
			TEST_FOR_ORD_DIGIT('2','d','D') ;
		case 't' : case 'T' :
			if ((b == 'h') || (b == 'H'))
			{
				switch (last_digit)
				{
				case '1' : case '2' : case '3' :
					/*-- <remarks> 11th, 111th, 211th etc -- for 11-13 </remarks> --*/
					if ((n > 1) && (*(__dest__ - 2) == '1'))
					{
						COLLECT_LOOKAHEAD ;
						TERM_AND_LENGTH ;
						RETURN_NEW_MORPH(DORD) ;
					}
					break ;
				default :
					/*-- <remarks> 4th, 14th, 24th etc -- for 0, 4-9 </remarks> --*/
					COLLECT_LOOKAHEAD ;
					TERM_AND_LENGTH ;
					RETURN_NEW_MORPH(DORD) ;
				}
			}
			break ;
		}
		/*-- <remarks> ordinary numeric sequence </remarks> --*/
		TERM_AND_LENGTH ;
		RETURN_NEW_MORPH(DNUMBER) ;
	}
	/*-- <revision date='2009-08-15'> Fix ampersand : P&R --> P & R </revision> --*/
	if (a == '&')
	{
		COLLECT_WHILE(a == '&') ;
		TERM_AND_LENGTH ;
		RETURN_NEW_MORPH(DSINGLE) ;
	}
	/*-- <remarks> Alphabetic sequence (a leading '#' starts a token but is
	     not collected beyond the first char) </remarks> --*/
	if ((isalpha(a)) || (a == '\'') || (a == '#'))
	{
		COLLECT_WHILE((isalpha(a)) || (a == '\'')) ;
		TERM_AND_LENGTH ;
		/*-- <remarks> Token class depends on collected length </remarks> --*/
		switch (n)
		{
		case 1 :
			RETURN_NEW_MORPH(DSINGLE) ;
		case 2 :
			RETURN_NEW_MORPH(DDOUBLE) ;
		default :
			RETURN_NEW_MORPH( DWORDT ) ;
		}
		/* Unreachable: every switch arm returns via RETURN_NEW_MORPH. */
		return __src__ ;
	}
	/*-- <remarks> Type 2 terminators ( spacing ) : skip the run without
	     copying it </remarks> --*/
	if (strchr(__spacer__,a) != NULL)
	{
		NO_COLLECT_WHILE(strchr(__spacer__,a) != NULL) ;
		set_term(__stand_param__,2,__scan_buf__) ;
		/*-- <remarks> Retain position </remarks> --*/
		return (__src__) ;
	}
	/*-- <remarks> Ignore everything not specified. Point to next input char. </remarks> --*/
	return (__src__ + 1) ;
}
|
||||
|
||||
#ifdef BUILD_API
|
||||
|
||||
/*
|
||||
typedef struct STANDARDIZER_s {
|
||||
int data;
|
||||
char *err_msg;
|
||||
} STANDARDIZER;
|
||||
|
||||
typedef struct STDADDR_s { // define as required
|
||||
char *house_num;
|
||||
char *prequal;
|
||||
char *pretype;
|
||||
char *predir;
|
||||
char *name;
|
||||
char *suftype;
|
||||
char *sufdir;
|
||||
char *sufqual;
|
||||
char *extra;
|
||||
char *city;
|
||||
char *state;
|
||||
char *postcode;
|
||||
char *country;
|
||||
} STDADDR;
|
||||
|
||||
*/
|
||||
|
||||
STANDARDIZER *std_init()
|
||||
{
|
||||
STANDARDIZER *std;
|
||||
|
||||
std = (STANDARDIZER *) calloc(1,sizeof(STANDARDIZER)) ;
|
||||
if ( std == NULL ) return NULL ;
|
||||
|
||||
std -> pagc_p = (PAGC_GLOBAL *) calloc(1,sizeof(PAGC_GLOBAL)) ;
|
||||
if ( std -> pagc_p == NULL ) {
|
||||
free( std ) ;
|
||||
return NULL ;
|
||||
}
|
||||
|
||||
std -> pagc_p -> process_errors = init_errors(std -> pagc_p, NULL) ;
|
||||
std -> err_p = std -> pagc_p -> process_errors ;
|
||||
|
||||
return std;
|
||||
}
|
||||
|
||||
|
||||
/* Attach an address lexicon to the standardizer.
   Takes ownership of the lexicon's hash table: the table is stolen from
   the LEXICON wrapper (so lex_free cannot destroy it) and the wrapper is
   freed. Returns FALSE if the default definitions cannot be set up,
   otherwise the result of installing the definition block table. */
int std_use_lex(STANDARDIZER *std, LEXICON *lex)
{
	std -> pagc_p -> addr_lexicon = lex -> hash_table ;
	/* Steal the hash table so lex_free() leaves it alive. */
	lex -> hash_table = NULL;
	lex_free(lex);
	if (!setup_default_defs(std -> pagc_p)) return FALSE ;
	return (install_def_block_table(std -> pagc_p -> addr_lexicon, std -> pagc_p -> process_errors)) ;
}
|
||||
|
||||
|
||||
/* Attach a gazetteer lexicon to the standardizer.
   Takes ownership of the hash table, like std_use_lex(), but performs no
   definition setup. Always returns 0. */
int std_use_gaz(STANDARDIZER *std, LEXICON *gaz)
{
	std -> pagc_p -> gaz_lexicon = gaz -> hash_table ;
	/* Steal the hash table so lex_free() leaves it alive. */
	gaz -> hash_table = NULL;
	lex_free(gaz);
	return 0;
}
|
||||
|
||||
|
||||
/* Attach a compiled rule set to the standardizer.
   The rules must already have been readied by the caller; otherwise an
   error is recorded and 1 is returned (via RET_ERR). Takes ownership of
   the rules' internal pointer and frees the RULES wrapper. Returns 0 on
   success. */
int std_use_rules(STANDARDIZER *std, RULES *rules)
{
	if ( ! rules -> ready ) {
		RET_ERR("std_use_rules: Rules have not been readied!", std -> err_p, 1);
	}
	std -> pagc_p -> rules = rules -> r_p ;
	/* Steal the rule data so rules_free() leaves it alive. */
	rules -> r_p = NULL;
	rules_free(rules);
	return 0;
}
|
||||
|
||||
/* Create the standardization context once lexicons and rules are attached.
   Returns 0 on success, 1 if the context could not be initialized. */
int std_ready_standardizer(STANDARDIZER *std)
{
	std -> misc_stand = init_stand_context(std -> pagc_p, std -> err_p, 1);
	return (std -> misc_stand == NULL) ? 1 : 0;
}
|
||||
|
||||
|
||||
/* Release a STANDARDIZER created by std_init() and everything it owns.
   Safe to call with NULL.
   Fixes two defects in the original:
   - std -> pagc_p -> process_errors was dereferenced even when pagc_p was
     NULL (the NULL check guarded only close_stand_process), a potential
     NULL-pointer dereference;
   - pagc_p was freed only inside the process_errors != NULL branch, leaking
     the PAGC_GLOBAL when error initialization had failed. */
void std_free(STANDARDIZER *std)
{
    if ( std == NULL ) return;
    DBG("Calling close_stand_process");
    if ( std -> pagc_p != NULL ) {
        close_stand_process( std -> pagc_p ) ;
        if ( std -> pagc_p -> process_errors != NULL ) {
            DBG("Calling close_errors");
            close_errors( std -> pagc_p -> process_errors );
        }
        /* Free the global context unconditionally once its members are closed. */
        DBG("Calling FREE_AND_NULL");
        FREE_AND_NULL( std -> pagc_p ) ;
    }
    DBG("Calling close_stand_context");
    close_stand_context( std -> misc_stand );
    DBG("Calling free");
    free( std );
}
|
||||
|
||||
|
||||
/* Release a STDADDR and every non-NULL string member it owns.
   Safe to call with NULL. Note that the member list here is wider than the
   commented-out STDADDR sketch earlier in this file (building, qual,
   ruralroute, box, unit) -- the real definition lives in pagc_std_api.h. */
void stdaddr_free(STDADDR *stdaddr)
{
    if (!stdaddr) return;
    if (stdaddr->building) free(stdaddr->building);
    if (stdaddr->house_num) free(stdaddr->house_num);
    if (stdaddr->predir) free(stdaddr->predir);
    if (stdaddr->qual) free(stdaddr->qual);
    if (stdaddr->pretype) free(stdaddr->pretype);
    if (stdaddr->name) free(stdaddr->name);
    if (stdaddr->suftype) free(stdaddr->suftype);
    if (stdaddr->sufdir) free(stdaddr->sufdir);
    if (stdaddr->ruralroute) free(stdaddr->ruralroute);
    if (stdaddr->extra) free(stdaddr->extra);
    if (stdaddr->city) free(stdaddr->city);
    if (stdaddr->state) free(stdaddr->state);
    if (stdaddr->country) free(stdaddr->country);
    if (stdaddr->postcode) free(stdaddr->postcode);
    if (stdaddr->box) free(stdaddr->box);
    if (stdaddr->unit) free(stdaddr->unit);
    free(stdaddr);
    /* No-op for the caller: only clears the local parameter copy. */
    stdaddr = NULL;
}
|
||||
|
||||
/* Two-argument string COALESCE: yield the first pointer when it is
   non-NULL, otherwise the second (which may itself be NULL). */
static char *coalesce( char *a, char *b )
{
    if (a != NULL)
    {
        return a ;
    }
    return b ;
}
|
||||
|
||||
/* Print every STDADDR field to stdout, one labelled line per field;
 * NULL fields are rendered as empty strings.  A NULL result prints
 * nothing. */
void print_stdaddr( STDADDR *result )
{
    if (!result)
        return;

    printf(" building: %s\n", result->building ? result->building : "");
    printf(" house_num: %s\n", result->house_num ? result->house_num : "");
    printf(" predir: %s\n", result->predir ? result->predir : "");
    printf(" qual: %s\n", result->qual ? result->qual : "");
    printf(" pretype: %s\n", result->pretype ? result->pretype : "");
    printf(" name: %s\n", result->name ? result->name : "");
    printf(" suftype: %s\n", result->suftype ? result->suftype : "");
    printf(" sufdir: %s\n", result->sufdir ? result->sufdir : "");
    printf("ruralroute: %s\n", result->ruralroute ? result->ruralroute : "");
    printf(" extra: %s\n", result->extra ? result->extra : "");
    printf(" city: %s\n", result->city ? result->city : "");
    printf(" state: %s\n", result->state ? result->state : "");
    printf(" country: %s\n", result->country ? result->country : "");
    printf(" postcode: %s\n", result->postcode ? result->postcode : "");
    printf(" box: %s\n", result->box ? result->box : "");
    printf(" unit: %s\n", result->unit ? result->unit : "");
}
|
||||
|
||||
/*
|
||||
STDADDR *std_standardize_one(STANDARDIZER *std, char *address_one_line, int options)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
*/
|
||||
|
||||
/* Standardize a pre-split address.
 *
 * micro   - street-level portion (house number, street, unit); required.
 * macro   - city/state/postcode portion; may be NULL or empty.
 * options - bit 0 set => echo intermediate standardization to stdout.
 *
 * Returns a freshly allocated STDADDR (caller frees with stdaddr_free),
 * or NULL on error.  NOTE: the RET_ERR/RET_ERR1 macros record the error
 * and return the given value (NULL here) — they embed control flow.
 */
STDADDR *std_standardize_mm(STANDARDIZER *std, char *micro, char *macro, int options)
{
    STAND_PARAM *stand_address;
    STDADDR *stdaddr;
    int err;

    stand_address = std -> misc_stand ;
    if (stand_address == NULL)
        return NULL;

    /* micro is mandatory; a missing/blank value is an error. */
    if (!micro || ( IS_BLANK( micro ))) {
        RET_ERR("std_standardize_mm: micro attribute to standardize!", std -> err_p, NULL);
    }

    init_output_fields( stand_address, BOTH );
    /* Standardize the macro (locality) part first, when supplied. */
    if (macro && macro[0] != '\0') {
        err = standardize_field( stand_address, macro, MACRO );
        if (!err) {
            RET_ERR1("std_standardize_mm: No standardization of %s!",
                macro, std -> err_p, NULL);
        }

        if (options & 1) {
            printf("After standardize_field for macro:\n");
            output_raw_elements( stand_address , NULL ) ;
            send_fields_to_stream(stand_address->standard_fields , NULL, 0, 0);
        }
    }

    /* Then the micro (street-level) part. */
    err = standardize_field( stand_address, micro, MICRO_M );
    if (!err) {
        RET_ERR1("std_standardize_mm: No standardization of %s!",
            micro, std -> err_p, NULL);
    }

    if (options & 1) {
        printf("After standardize_field for micro:\n");
        send_fields_to_stream(stand_address->standard_fields , NULL, 0, 0);
    }

    /* Allocate the result; PAGC_CALLOC_STRUC returns NULL from this
       function on allocation failure. */
    PAGC_CALLOC_STRUC(stdaddr,STDADDR,1,std -> err_p,NULL);

    /* Copy each non-empty output field; indexes 0..15 map to the
       STDADDR members in declaration order (building .. unit).
       Empty fields stay NULL in the result. */
    if (strlen(stand_address -> standard_fields[0]))
        stdaddr->building = strdup(stand_address -> standard_fields[0]);
    if (strlen(stand_address -> standard_fields[1]))
        stdaddr->house_num = strdup(stand_address -> standard_fields[1]);
    if (strlen(stand_address -> standard_fields[2]))
        stdaddr->predir = strdup(stand_address -> standard_fields[2]);
    if (strlen(stand_address -> standard_fields[3]))
        stdaddr->qual = strdup(stand_address -> standard_fields[3]);
    if (strlen(stand_address -> standard_fields[4]))
        stdaddr->pretype = strdup(stand_address -> standard_fields[4]);
    if (strlen(stand_address -> standard_fields[5]))
        stdaddr->name = strdup(stand_address -> standard_fields[5]);
    if (strlen(stand_address -> standard_fields[6]))
        stdaddr->suftype = strdup(stand_address -> standard_fields[6]);
    if (strlen(stand_address -> standard_fields[7]))
        stdaddr->sufdir = strdup(stand_address -> standard_fields[7]);
    if (strlen(stand_address -> standard_fields[8]))
        stdaddr->ruralroute = strdup(stand_address -> standard_fields[8]);
    if (strlen(stand_address -> standard_fields[9]))
        stdaddr->extra = strdup(stand_address -> standard_fields[9]);
    if (strlen(stand_address -> standard_fields[10]))
        stdaddr->city = strdup(stand_address -> standard_fields[10]);
    if (strlen(stand_address -> standard_fields[11]))
        stdaddr->state = strdup(stand_address -> standard_fields[11]);
    if (strlen(stand_address -> standard_fields[12]))
        stdaddr->country = strdup(stand_address -> standard_fields[12]);
    if (strlen(stand_address -> standard_fields[13]))
        stdaddr->postcode = strdup(stand_address -> standard_fields[13]);
    if (strlen(stand_address -> standard_fields[14]))
        stdaddr->box = strdup(stand_address -> standard_fields[14]);
    if (strlen(stand_address -> standard_fields[15]))
        stdaddr->unit = strdup(stand_address -> standard_fields[15]);

    return stdaddr;
}
|
||||
|
||||
|
||||
/* Not implemented: single-field standardization entry point.
 * Always returns NULL; callers should use std_standardize_mm() until
 * this is written. */
STDADDR *std_standardize(STANDARDIZER *std, char *address, char *city, char *state, char *postcode, char *country, int options)
{
    return NULL;
}
|
||||
|
||||
#else
|
||||
|
||||
/*========================================================================
|
||||
<summary>
|
||||
<function name='standard.c (init_stand_process)'/>
|
||||
<remarks>set up process level, opens the lexicons and rules
|
||||
and default definitions for the tokenizer</remarks>
|
||||
<calls><functionref='(gamma.c) create_rules'/>, <functionref='(lexicon.c) create_lexicon'/>,
|
||||
<functionref='(tokenize.c) setup_default_defs'/> and
|
||||
<functionref='(analyze.c) install_def_block_table'/></calls>
|
||||
</summary>
|
||||
=========================================================================*/
|
||||
/* Process-level setup: compile the rule set, load the lexicons, and
 * install the tokenizer's default definitions into the global state.
 * Returns FALSE on any failure (partial initialization is left for the
 * caller to tear down), otherwise the result of installing the
 * definition block table.
 */
int init_stand_process(PAGC_GLOBAL *__pagc_global__ ,const char *__rule_name__, const char *__lexicon_name__ , const char *__gazetteer_name__ , const char *__featword_name__)
{
    /* Compile the standardization rules. */
    __pagc_global__->rules = create_rules(__rule_name__, __pagc_global__);
    if (__pagc_global__->rules == NULL)
        return FALSE;

    /* Address lexicon, with gazetteer entries merged in
       (multiple-lexicon support, 2009-08-13 revision). */
    __pagc_global__->addr_lexicon = create_lexicon(__pagc_global__, __lexicon_name__, __gazetteer_name__);
    if (__pagc_global__->addr_lexicon == NULL)
        return FALSE;

    /* Feature-word (point of interest) lexicon. */
    __pagc_global__->poi_lexicon = create_lexicon(__pagc_global__, __featword_name__, NULL);
    if (__pagc_global__->poi_lexicon == NULL)
        return FALSE;

#ifdef GAZ_LEXICON
    /* Stand-alone gazetteer lexicon, triggered when the start state is
       MACRO (2012-06-01 revision). */
    __pagc_global__->gaz_lexicon = create_lexicon(__pagc_global__, __gazetteer_name__, NULL);
    if (__pagc_global__->gaz_lexicon == NULL)
        return FALSE;
#endif

    if (!setup_default_defs(__pagc_global__))
        return FALSE;

    return (install_def_block_table(__pagc_global__->addr_lexicon, __pagc_global__->process_errors));
}
|
||||
|
||||
#endif
|
||||
|
||||
/*========================================================================
|
||||
<summary>
|
||||
<function name='standard.c (close_stand_process)'/>
|
||||
<remarks> Called on exit to close down standardizer </remarks>
|
||||
<calls> <functionref='(tokenize.c) remove_default_defs'/>,
|
||||
<functionref='(gamma.c) destroy_rules'/> and
|
||||
<functionref='lexicon.c (destroy_lexicon)'/></calls>
|
||||
</summary>
|
||||
=========================================================================*/
|
||||
/* Tear down the process-level standardizer state created by
 * init_stand_process(): default definitions, rules, and all lexicons.
 * A NULL argument is ignored. */
void close_stand_process(PAGC_GLOBAL * __pagc_global__)
{
    if (__pagc_global__ == NULL)
        return;

    DBG("remove_default_defs(__pagc_global__)");
    remove_default_defs(__pagc_global__);
    DBG("destroy_rules(__pagc_global__->rules) ;");
    destroy_rules(__pagc_global__->rules);
    /* Multiple-lexicon support (2009-08-13 revision). */
    DBG("destroy_lexicon(__pagc_global__->addr_lexicon)");
    destroy_lexicon(__pagc_global__->addr_lexicon);
    DBG("destroy_lexicon(__pagc_global__->poi_lexicon)");
    destroy_lexicon(__pagc_global__->poi_lexicon);
#ifdef GAZ_LEXICON
    /* Gazetteer lexicon used when the start state is MACRO. */
    DBG("destroy_lexicon(__pagc_global__->gaz_lexicon)");
    destroy_lexicon(__pagc_global__->gaz_lexicon);
#endif
}
|
||||
|
||||
/*========================================================================
|
||||
<summary>
|
||||
<function name='standard.c (init_stand_context)'/>
|
||||
<param name='__err_param__'>belongs to the dataset context.</param>
|
||||
<calls><functionref='analyze.c (create_segments)'/>
|
||||
<returns>NULL returned on error - if so, call <functionref='close_stand_context'/></returns>
|
||||
</summary>
|
||||
=========================================================================*/
|
||||
/* Allocate and initialize a STAND_PARAM dataset context.
 *
 * __err_param__     - error sink owned by the dataset context.
 * exhaustive_flag   - stored in analyze_complete (1 = full analysis).
 *
 * Returns NULL on error; the caller must then invoke
 * close_stand_context() to release any partial allocation.
 * NOTE: PAGC_CALLOC_STRUC / PAGC_CALLOC_2D_ARRAY are macros that, on
 * allocation failure, return the given value (NULL) from this function.
 */
STAND_PARAM *init_stand_context(PAGC_GLOBAL *__pagc_global__,ERR_PARAM *__err_param__,int exhaustive_flag)
{
    STAND_PARAM *__stand_param__ ;
    /*-- <remarks> Initialization-time allocation </remarks> --*/
    PAGC_CALLOC_STRUC(__stand_param__,STAND_PARAM,1,__err_param__,NULL) ;
    if ((__stand_param__->stz_info = create_segments(__err_param__)) == NULL)
    {
        return NULL ;
    }
    /* standard_fields is a MAXOUTSYM x MAXFLDLEN char matrix holding the
       standardized output. */
    PAGC_CALLOC_2D_ARRAY(__stand_param__->standard_fields, char, MAXOUTSYM, MAXFLDLEN, __err_param__, NULL) ;
    __stand_param__->analyze_complete = exhaustive_flag ;
    __stand_param__->errors = __err_param__ ;
    __stand_param__->have_ref_att = NULL ;
    /*-- <remarks> Transfer from global </remarks> --*/
    __stand_param__->rules = __pagc_global__->rules ;
    /*-- <revision date='2009-08-13'> Support multiple lexicons </revision> --*/
    /*-- <remarks> Transfer from global </remarks> --*/
    __stand_param__->address_lexicon = __pagc_global__->addr_lexicon ;
    /*-- <remarks> Transfer from global </remarks> --*/
    __stand_param__->poi_lexicon = __pagc_global__->poi_lexicon ;
    /*-- <revision date='2012-06-01'> Add gaz_lexicon to be triggered on _start_state_ = MACRO </revision> --*/
#ifdef GAZ_LEXICON
    __stand_param__->gaz_lexicon = __pagc_global__->gaz_lexicon ;
#endif
    __stand_param__->default_def = __pagc_global__->default_def ;
    return __stand_param__ ;
}
|
||||
|
||||
|
||||
/*========================================================================
|
||||
<summary>
|
||||
<function name='standard.c (close_stand_context)'/>
|
||||
<remarks> Closes the <code>STAND_PARAM</code> record </remarks>
|
||||
<calls> <functionref='analyze.c (destroy_segments)'/>,
|
||||
<macroref='FREE_AND_NULL'/></calls>
|
||||
<summary>
|
||||
=========================================================================*/
|
||||
/* Release a STAND_PARAM created by init_stand_context(): its segment
 * bookkeeping, the 2D output-field array, and the record itself.
 * A NULL argument is ignored. */
void close_stand_context( STAND_PARAM *__stand_param__ )
{
    if (__stand_param__ == NULL)
        return;

    destroy_segments(__stand_param__->stz_info);
    if (__stand_param__->standard_fields != NULL)
    {
        PAGC_DESTROY_2D_ARRAY(__stand_param__->standard_fields,char,MAXOUTSYM) ;
    }
    /* Cleanup-time memory release. */
    FREE_AND_NULL(__stand_param__) ;
}
|
||||
|
||||
/*========================================================================
|
||||
<summary>
|
||||
<function name='standard.c (_Close_Stand_Field_)'/>
|
||||
<remarks> Sends the scanned and processed input to the evaluator </remarks>
|
||||
<called-by> <functionref='standard.c (standardize_field)'/></called-by>
|
||||
<calls> <functionref='analyze.c (evaluator)'/> , <functionref='export.c (stuff_fields)'/></calls>
|
||||
<returns>FALSE on error</returns>
|
||||
<revision date='2012-07-22'> Keep track of start_state </revision>
|
||||
</summary>
|
||||
=========================================================================*/
|
||||
/* Hand the scanned/tokenized input to the evaluator and, on success,
 * copy the winning standardization into the output fields.
 * Returns TRUE on success; on failure the RET_ERR macro records the
 * error and returns FALSE from this function. */
static int _Close_Stand_Field_(STAND_PARAM *__stand_param__)
{
    /*-- <revision date='2012-07-22'> Keep track of start_state </revision> --*/
    if (evaluator(__stand_param__))
    {
        /*-- <remarks> Write the output into the fields. </remarks> --*/
        stuff_fields(__stand_param__) ;
        return TRUE ;
    }
    RET_ERR("_Close_Stand_Field_: Address failed to standardize",__stand_param__->errors,FALSE) ;
}
|
||||
|
895
extras/address_standardizer/std_pg_hash.c
Normal file
895
extras/address_standardizer/std_pg_hash.c
Normal file
|
@ -0,0 +1,895 @@
|
|||
|
||||
|
||||
/* PostgreSQL headers */
|
||||
#include "postgres.h"
|
||||
#include "fmgr.h"
|
||||
#include "miscadmin.h"
|
||||
#include "utils/memutils.h"
|
||||
#include "executor/spi.h"
|
||||
#include "access/hash.h"
|
||||
#include "utils/hsearch.h"
|
||||
#include "funcapi.h"
|
||||
#include "catalog/pg_type.h"
|
||||
|
||||
/* standardizer headers */
|
||||
#undef DEBUG
|
||||
//#define DEBUG 1
|
||||
|
||||
#include "pagc_api.h"
|
||||
#include "pagc_std_api.h"
|
||||
#include "std_pg_hash.h"
|
||||
|
||||
/* C headers */
|
||||
#include <sys/time.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
#include <errno.h>
|
||||
|
||||
/* Wall-clock timing helpers: real gettimeofday-based timers under
 * DEBUG, no-ops otherwise.  ELAPSED_T stores milliseconds into a local
 * `elapsed` double at the expansion site. */
#ifdef DEBUG
#define SET_TIME(a) gettimeofday(&(a), NULL)
#define ELAPSED_T(a,b) \
elapsed = (b.tv_sec - a.tv_sec)*1000.0; \
elapsed += (b.tv_usec - a.tv_usec)/1000.0;
#else
#define SET_TIME(a) do { ; } while (0)
#define ELAPSED_T(a,b) do { ; } while (0)
#endif

/* Maximum number of integers accepted per rule line by parse_rule(). */
#define MAX_RULE_LENGTH 128
/* Rows fetched per SPI cursor batch when loading lexicon/rule tables. */
#define TUPLIMIT 1000

/* Number of standardizers kept per portal (round-robin eviction). */
#define STD_CACHE_ITEMS 4
/* Initial size hint for the backend-wide MemoryContext hash. */
#define STD_BACKEND_HASH_SIZE 16
|
||||
|
||||
/* Backend-wide hash mapping MemoryContext -> StdHashEntry, used to free
   standardizers when their memory context is deleted.  Created lazily
   by CreateStdHash(). */
static HTAB* StdHash = NULL;


/* One cached standardizer, keyed by the three table names it was built
   from; std_mcxt is the private memory context whose deletion callback
   frees std. */
typedef struct
{
    char *lextab;
    char *gaztab;
    char *rultab;
    STANDARDIZER *std;
    MemoryContext std_mcxt;
}
StdCacheItem;

/* Per-portal cache: a small ring of StdCacheItems with round-robin
   replacement (NextSlot is the next victim/insert position). */
typedef struct
{
    StdCacheItem StdCache[STD_CACHE_ITEMS];
    int NextSlot;
    MemoryContext StdCacheContext;
}
StdPortalCache;

/* Value stored in StdHash: the owning context and its standardizer. */
typedef struct
{
    MemoryContext context;
    STANDARDIZER *std;
}
StdHashEntry;

/* Column numbers of the lexicon query result, resolved by name. */
typedef struct lex_columns
{
    int seq;
    int word;
    int stdword;
    int token;
} lex_columns_t;

/* Column numbers of the rules query result. */
typedef struct rules_columns
{
    int rule;
} rules_columns_t;
|
||||
|
||||
|
||||
|
||||
/* Memory context hash table function prototypes */
uint32 mcxt_ptr_hash_std(const void *key, Size keysize);
static void CreateStdHash(void);
static void AddStdHashEntry(MemoryContext mcxt, STANDARDIZER *std);
static StdHashEntry *GetStdHashEntry(MemoryContext mcxt);
static void DeleteStdHashEntry(MemoryContext mcxt);

/* Memory context cache function prototypes (MemoryContextMethods
   callbacks wired into StdCacheContextMethods below) */
static void StdCacheInit(MemoryContext context);
static void StdCacheReset(MemoryContext context);
static void StdCacheDelete(MemoryContext context);
static bool StdCacheIsEmpty(MemoryContext context);
static void StdCacheStats(MemoryContext context, int level);
#ifdef MEMORY_CONTEXT_CHECKING
static void StdCacheCheck(MemoryContext context);
#endif

/* Portal-level cache lookup/insert helpers */
static bool IsInStdPortalCache(StdPortalCache *STDCache, char *lextab, char *gaztab, char *rultab);
static STANDARDIZER *GetStdFromPortalCache(StdPortalCache *STDCache, char *lextab, char *gaztab, char *rultab);
static void AddToStdPortalCache(StdPortalCache *STDCache, char *lextab, char *gaztab, char *rultab);
static StdPortalCache *GetStdPortalCache(FunctionCallInfo fcinfo);


/* standardizer api functions */

static STANDARDIZER *CreateStd(char *lextab, char *gaztab, char *rultab);
static int parse_rule(char *buf, int *rule);
static int fetch_lex_columns(SPITupleTable *tuptable, lex_columns_t *lex_cols);
static int tableNameOk(char *t);
static int load_lex(LEXICON *lex, char *tabname);
static int fetch_rules_columns(SPITupleTable *tuptable, rules_columns_t *rules_cols);
static int load_rules(RULES *rules, char *tabname);
|
||||
|
||||
/* Memory context definition must match the current version of PostgreSQL */
/* Method table for the per-standardizer memory contexts; the
   StdCacheDelete callback is what actually frees the STANDARDIZER.
   NOTE(review): the NULL slots presumably correspond to the
   alloc/free/realloc-style members of MemoryContextMethods — the exact
   layout varies by PostgreSQL version, so verify against the target
   server's memutils definition. */
static MemoryContextMethods StdCacheContextMethods =
{
    NULL,
    NULL,
    NULL,
    StdCacheInit,
    StdCacheReset,
    StdCacheDelete,
    NULL,
    StdCacheIsEmpty,
    StdCacheStats
#ifdef MEMORY_CONTEXT_CHECKING
    , StdCacheCheck
#endif
};
|
||||
|
||||
|
||||
/* MemoryContextMethods 'init' callback: intentionally empty — the
   standardizer is attached to the context after creation. */
static void
StdCacheInit(MemoryContext context)
{
    /* NOP - initialized when first used. */
}
|
||||
|
||||
|
||||
/* MemoryContextMethods 'reset' callback: nothing to reset, but the
   method table requires an implementation. */
static void
StdCacheReset(MemoryContext context)
{
    // NOP - Seems to be a required function
}
|
||||
|
||||
|
||||
static void
|
||||
StdCacheDelete(MemoryContext context)
|
||||
{
|
||||
StdHashEntry *she;
|
||||
|
||||
DBG("Enter: StdCacheDelete");
|
||||
/* lookup the hash entry in the global hash table
|
||||
so we can free it */
|
||||
she = GetStdHashEntry(context);
|
||||
|
||||
if (!she)
|
||||
elog(ERROR, "StdCacheDelete: Trying to delete non-existant hash entry object with MemoryContext key (%p)", (void *)context);
|
||||
|
||||
DBG("deleting std object (%p) with MemoryContext key (%p)", she->std, context);
|
||||
|
||||
if (she->std)
|
||||
std_free(she->std);
|
||||
|
||||
DeleteStdHashEntry(context);
|
||||
}
|
||||
|
||||
|
||||
/* MemoryContextMethods 'is_empty' callback: always reports non-empty so
   the context is never skipped during deletion. */
static bool
StdCacheIsEmpty(MemoryContext context)
{
    // always return false - another required function
    return FALSE;
}
|
||||
|
||||
|
||||
/* MemoryContextMethods 'stats' callback: prints a one-line tag to
   stderr (the `level` indentation argument is ignored). */
static void
StdCacheStats(MemoryContext context, int level)
{
    // another required function
    fprintf(stderr, "%s: STANDARDIZER context\n", context->name);
}
|
||||
|
||||
|
||||
#ifdef MEMORY_CONTEXT_CHECKING
|
||||
static void
|
||||
StdCacheCheck(MemoryContext context)
|
||||
{
|
||||
// NOP - another reuired function
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
uint32
|
||||
mcxt_ptr_hash_std(const void *key, Size keysize)
|
||||
{
|
||||
uint32 hashval;
|
||||
hashval = DatumGetUInt32(hash_any(key, keysize));
|
||||
return hashval;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
CreateStdHash(void)
|
||||
{
|
||||
HASHCTL ctl;
|
||||
|
||||
ctl.keysize = sizeof(MemoryContext);
|
||||
ctl.entrysize = sizeof(StdHashEntry);
|
||||
ctl.hash = mcxt_ptr_hash_std;
|
||||
|
||||
StdHash = hash_create("PAGC Address Standardizer Backend MemoryContext Hash", STD_BACKEND_HASH_SIZE, &ctl, (HASH_ELEM | HASH_FUNCTION));
|
||||
DBG("CreateStdHash: created StdHash (%p)", StdHash);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
AddStdHashEntry(MemoryContext mcxt, STANDARDIZER *std)
|
||||
{
|
||||
bool found;
|
||||
void **key;
|
||||
StdHashEntry *he;
|
||||
|
||||
DBG("Enter: AddStdHashEntry(mcxt=%p, std=%p)", mcxt, std);
|
||||
/* The hash key is the MemoryContext pointer */
|
||||
key = (void *)&mcxt;
|
||||
|
||||
he = (StdHashEntry *) hash_search(StdHash, key, HASH_ENTER, &found);
|
||||
DBG("AddStdHashEntry: he=%p, found=%d", he, found);
|
||||
if (!found) {
|
||||
DBG("&he->context=%p", &he->context);
|
||||
he->context = mcxt;
|
||||
DBG("&he->std=%p", &he->std);
|
||||
he->std = std;
|
||||
DBG("Leaving AddStdHashEntry");
|
||||
}
|
||||
else {
|
||||
elog(ERROR, "AddStdHashEntry: This memory context is already in use! (%p)", (void *)mcxt);
|
||||
}
|
||||
}
|
||||
|
||||
static StdHashEntry *
|
||||
GetStdHashEntry(MemoryContext mcxt)
|
||||
{
|
||||
void **key;
|
||||
StdHashEntry *he;
|
||||
|
||||
DBG("Enter: GetStdHashEntry");
|
||||
key = (void *)&mcxt;
|
||||
he = (StdHashEntry *) hash_search(StdHash, key, HASH_FIND, NULL);
|
||||
return he;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
DeleteStdHashEntry(MemoryContext mcxt)
|
||||
{
|
||||
void **key;
|
||||
StdHashEntry *he;
|
||||
|
||||
DBG("Enter: DeleteStdHashEntry");
|
||||
key = (void *)&mcxt;
|
||||
he = (StdHashEntry *) hash_search(StdHash, key, HASH_REMOVE, NULL);
|
||||
if (!he)
|
||||
elog(ERROR, "DeleteStdHashEntry: There was an error removing the STD object from this MemoryContext (%p)", (void *)mcxt);
|
||||
|
||||
he->std = NULL;
|
||||
}
|
||||
|
||||
|
||||
/* public api */
|
||||
bool
|
||||
IsInStdCache(StdCache STDCache, char *lextab, char *gaztab, char *rultab) {
|
||||
return IsInStdPortalCache((StdPortalCache *) STDCache, lextab, gaztab, rultab);
|
||||
}
|
||||
|
||||
|
||||
static bool
|
||||
IsInStdPortalCache(StdPortalCache *STDCache, char *lextab, char *gaztab, char *rultab)
|
||||
{
|
||||
int i;
|
||||
|
||||
DBG("Enter: IsInStdPortalCache");
|
||||
for (i=0; i<STD_CACHE_ITEMS; i++) {
|
||||
StdCacheItem *ci = &STDCache->StdCache[i];
|
||||
if (ci->lextab && !strcmp(ci->lextab, lextab) &&
|
||||
ci->lextab && !strcmp(ci->gaztab, gaztab) &&
|
||||
ci->lextab && !strcmp(ci->rultab, rultab))
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
|
||||
/* public api */
|
||||
STANDARDIZER *
|
||||
GetStdFromStdCache(StdCache STDCache, char *lextab, char *gaztab, char *rultab) {
|
||||
return GetStdFromPortalCache((StdPortalCache *) STDCache, lextab, gaztab, rultab);
|
||||
}
|
||||
|
||||
|
||||
static STANDARDIZER *
|
||||
GetStdFromPortalCache(StdPortalCache *STDCache, char *lextab, char *gaztab, char *rultab)
|
||||
{
|
||||
int i;
|
||||
|
||||
DBG("Enter: GetStdFromPortalCache");
|
||||
for (i=0; i<STD_CACHE_ITEMS; i++) {
|
||||
StdCacheItem *ci = &STDCache->StdCache[i];
|
||||
if (ci->lextab && !strcmp(ci->lextab, lextab) &&
|
||||
ci->lextab && !strcmp(ci->gaztab, gaztab) &&
|
||||
ci->lextab && !strcmp(ci->rultab, rultab))
|
||||
return STDCache->StdCache[i].std;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
/* Evict the cache entry at NextSlot (if occupied): delete its private
 * memory context — which triggers StdCacheDelete and frees the
 * STANDARDIZER — then release the pstrdup'd table names and clear the
 * slot. */
static void
DeleteNextSlotFromStdCache(StdPortalCache *STDCache)
{
    MemoryContext old_context;

    DBG("Enter: DeleteNextSlotFromStdCache");
    if (STDCache->StdCache[STDCache->NextSlot].std != NULL) {
        StdCacheItem *ce = &STDCache->StdCache[STDCache->NextSlot];
        DBG("Removing STD cache entry ('%s', '%s', '%s') index %d", ce->lextab, ce->gaztab, ce->rultab, STDCache->NextSlot);

        /* zero out the entries and free the memory context
           We will get a callback to free the std object.
        */
        old_context = MemoryContextSwitchTo(STDCache->StdCacheContext);
        /* Deleting the context fires StdCacheDelete -> std_free(). */
        MemoryContextDelete(ce->std_mcxt);
        /* The name strings were pstrdup'd in StdCacheContext. */
        pfree(ce->lextab);
        ce->lextab = NULL;
        pfree(ce->gaztab);
        ce->gaztab = NULL;
        pfree(ce->rultab);
        ce->rultab = NULL;
        ce->std = NULL;
        MemoryContextSwitchTo(old_context);
    }
}
|
||||
|
||||
|
||||
/* public api */
|
||||
void
|
||||
AddToStdCache(StdCache cache, char *lextab, char *gaztab, char *rultab) {
|
||||
AddToStdPortalCache((StdPortalCache *) cache, lextab, gaztab, rultab);
|
||||
}
|
||||
|
||||
|
||||
/* Build a standardizer from the given tables and store it in the
 * NextSlot of the portal cache (evicting the current occupant if any).
 * The standardizer lives in its own memory context whose delete
 * callback frees it; the table-name strings live in the portal's
 * StdCacheContext.  Raises ERROR if the standardizer cannot be built.
 */
static void
AddToStdPortalCache(StdPortalCache *STDCache, char *lextab, char *gaztab, char *rultab)
{
    MemoryContext STDMemoryContext;
    MemoryContext old_context;
    STANDARDIZER *std = NULL;

    DBG("Enter: AddToStdPortalCache");
    std = CreateStd(lextab, gaztab, rultab);
    if (!std)
        elog(ERROR,
            "AddToStdPortalCache: could not create address standardizer for '%s', '%s', '%s'", lextab, gaztab, rultab);

    /* if the NextSlot in the cache is used, then delete it */
    if (STDCache->StdCache[STDCache->NextSlot].std != NULL) {
#ifdef DEBUG
        StdCacheItem *ce = &STDCache->StdCache[STDCache->NextSlot];
        DBG("Removing item from STD cache ('%s', '%s', '%s') index %d", ce->lextab, ce->gaztab, ce->rultab, STDCache->NextSlot);
#endif
        DeleteNextSlotFromStdCache(STDCache);
    }

    DBG("Adding item to STD cache ('%s', '%s', '%s') index %d", lextab, gaztab, rultab, STDCache->NextSlot);

    /* Private context for this standardizer; its StdCacheDelete
       callback (via StdCacheContextMethods) frees std on deletion. */
    STDMemoryContext = MemoryContextCreate(T_AllocSetContext, 8192,
                                           &StdCacheContextMethods,
                                           STDCache->StdCacheContext,
                                           "PAGC STD Memory Context");

    /* Create the backend hash if it doesn't already exist */
    DBG("Check if StdHash exists (%p)", StdHash);
    if (!StdHash)
        CreateStdHash();

    /*
     * Add the MemoryContext to the backend hash so we can
     * clean up upon portal shutdown
     */
    DBG("Adding standardizer obj (%p) to hash table with MemoryContext key (%p)", std, STDMemoryContext);

    AddStdHashEntry(STDMemoryContext, std);

    /* change memory contexts so the pstrdup are allocated in the
     * context of this cache item. They will be freed when the
     * cache item is deleted.
     */
    DBG("AddToStdPortalCache: changing memory context to %p", STDCache->StdCacheContext);
    old_context = MemoryContextSwitchTo(STDCache->StdCacheContext);
    DBG(" old_context= %p", old_context);
    STDCache->StdCache[STDCache->NextSlot].lextab = pstrdup(lextab);
    DBG(" pstrdup(lextab) completed");
    STDCache->StdCache[STDCache->NextSlot].gaztab = pstrdup(gaztab);
    DBG(" pstrdup(gaztab) completed");
    STDCache->StdCache[STDCache->NextSlot].rultab = pstrdup(rultab);
    DBG(" pstrdup(rultab) completed");
    MemoryContextSwitchTo(old_context);
    DBG(" changed memory context to %p", old_context);

    /* Fill the slot and advance the round-robin pointer. */
    STDCache->StdCache[STDCache->NextSlot].std = std;
    STDCache->StdCache[STDCache->NextSlot].std_mcxt = STDMemoryContext;
    STDCache->NextSlot = (STDCache->NextSlot + 1) % STD_CACHE_ITEMS;
    DBG("STDCache->NextSlot=%d", STDCache->NextSlot);
}
|
||||
|
||||
|
||||
/* public api */
|
||||
StdCache
|
||||
GetStdCache(FunctionCallInfo fcinfo) {
|
||||
return (StdCache) GetStdPortalCache(fcinfo);
|
||||
}
|
||||
|
||||
|
||||
/* Return this portal's StdPortalCache, creating and initializing it in
 * fcinfo->flinfo->fn_mcxt on first use and stashing the pointer in
 * fn_extra so later calls reuse it. */
static StdPortalCache *
GetStdPortalCache(FunctionCallInfo fcinfo)
{
    StdPortalCache *STDCache;

    DBG("Enter: GetStdPortalCache");
    /* create it if we don't already have one for this portal */
    if (fcinfo->flinfo->fn_extra == NULL) {
        MemoryContext old_context;

        /* Allocate in fn_mcxt so the cache survives across calls. */
        old_context = MemoryContextSwitchTo(fcinfo->flinfo->fn_mcxt);
        STDCache = palloc(sizeof(StdPortalCache));
        MemoryContextSwitchTo(old_context);

        if (STDCache) {
            int i;

            DBG("Allocating STDCache for portal with STD MemoryContext (%p)", fcinfo->flinfo->fn_mcxt);
            /* initial the cache items */
            for (i=0; i<STD_CACHE_ITEMS; i++) {
                STDCache->StdCache[i].lextab = NULL;
                STDCache->StdCache[i].gaztab = NULL;
                STDCache->StdCache[i].rultab = NULL;
                STDCache->StdCache[i].std = NULL;
                STDCache->StdCache[i].std_mcxt = NULL;
            }
            STDCache->NextSlot = 0;
            STDCache->StdCacheContext = fcinfo->flinfo->fn_mcxt;

            /* Store the pointer in fcinfo->flinfo->fn_extra */
            fcinfo->flinfo->fn_extra = STDCache;
        }
    }
    else {
        /* Use the existing cache */
        STDCache = fcinfo->flinfo->fn_extra;
    }

    return STDCache;
}
|
||||
|
||||
/* public api */
|
||||
STANDARDIZER *
|
||||
GetStdUsingFCInfo(FunctionCallInfo fcinfo, char *lextab, char *gaztab, char *rultab)
|
||||
{
|
||||
STANDARDIZER *std;
|
||||
StdCache *std_cache = NULL;
|
||||
|
||||
DBG("GetStdUsingFCInfo: calling GetStdCache(fcinfo)");
|
||||
std_cache = GetStdCache(fcinfo);
|
||||
if (!std_cache)
|
||||
return NULL;
|
||||
|
||||
DBG("GetStdUsingFCInfo: calling IsInStdCache(std_cache, lextab, gaztab, rultab)");
|
||||
if (!IsInStdCache(std_cache, lextab, gaztab, rultab)) {
|
||||
DBG("GetStdUsingFCInfo: calling AddToStdCache(std_cache, lextab, gaztab, rultab)");
|
||||
AddToStdCache(std_cache, lextab, gaztab, rultab);
|
||||
}
|
||||
|
||||
DBG("GetStdUsingFCInfo: GetStdFromStdCache(std_cache, lextab, gaztab, rultab)");
|
||||
std = GetStdFromStdCache(std_cache, lextab, gaztab, rultab);
|
||||
|
||||
return std;
|
||||
}
|
||||
|
||||
|
||||
/* Build a STANDARDIZER by loading the lexicon, gazetteer, and rules
 * from the named tables over SPI.  On any failure every object built so
 * far is freed, SPI is closed, and elog(ERROR) aborts the call (elog at
 * ERROR level does not return).  Returns the ready standardizer.
 */
static STANDARDIZER *
CreateStd(char *lextab, char *gaztab, char *rultab)
{
    STANDARDIZER *std;
    LEXICON *lex;
    LEXICON *gaz;
    RULES *rules;
    int err;
    int SPIcode;

    DBG("Enter: CreateStd");
    SPIcode = SPI_connect();
    if (SPIcode != SPI_OK_CONNECT) {
        elog(ERROR, "CreateStd: couldn't open a connection to SPI");
    }

    std = std_init();
    if (!std)
        elog(ERROR, "CreateStd: could not allocate memory (std)");

    /* Each failure path below unwinds everything built so far. */
    lex = lex_init(std->err_p);
    if (!lex) {
        std_free(std);
        SPI_finish();
        elog(ERROR, "CreateStd: could not allocate memory (lex)");
    }

    err = load_lex(lex, lextab);
    if (err == -1) {
        lex_free(lex);
        std_free(std);
        SPI_finish();
        elog(ERROR, "CreateStd: failed to load '%s' for lexicon", lextab);
    }

    /* The gazetteer is loaded as a second lexicon. */
    gaz = lex_init(std->err_p);
    if (!gaz) {
        lex_free(lex);
        std_free(std);
        SPI_finish();
        elog(ERROR, "CreateStd: could not allocate memory (gaz)");
    }

    err = load_lex(gaz, gaztab);
    if (err == -1) {
        lex_free(gaz);
        lex_free(lex);
        std_free(std);
        SPI_finish();
        elog(ERROR, "CreateStd: failed to load '%s' for gazeteer", gaztab);
    }

    rules = rules_init(std->err_p);
    if (!rules) {
        lex_free(gaz);
        lex_free(lex);
        std_free(std);
        SPI_finish();
        elog(ERROR, "CreateStd: could not allocate memory (rules)");
    }

    err = load_rules(rules, rultab);
    if (err == -1) {
        rules_free(rules);
        lex_free(gaz);
        lex_free(lex);
        std_free(std);
        SPI_finish();
        elog(ERROR, "CreateStd: failed to load '%s' for rules", rultab);
    }

    /* Hand ownership of the loaded pieces to the standardizer and
       finish its setup. */
    std_use_lex(std, lex);
    std_use_gaz(std, gaz);
    std_use_rules(std, rules);
    std_ready_standardizer(std);

    SPI_finish();

    return std;
}
|
||||
|
||||
|
||||
static int parse_rule(char *buf, int *rule)
|
||||
{
|
||||
int nr = 0;
|
||||
int *r = rule;
|
||||
char *p = buf;
|
||||
char *q;
|
||||
|
||||
|
||||
while (1) {
|
||||
*r = strtol( p, &q, 10 );
|
||||
if (p == q) break;
|
||||
p = q;
|
||||
nr++;
|
||||
r++;
|
||||
if (nr > MAX_RULE_LENGTH) return -1;
|
||||
}
|
||||
|
||||
return nr;
|
||||
}
|
||||
|
||||
|
||||
/* Resolve result-set column NAME2 into TRGT->NAME via SPI_fnumber;
   increments a local `err` on failure.  Reads the global SPI_tuptable. */
#define FETCH_COL(TRGT,NAME,NAME2) \
    TRGT->NAME = SPI_fnumber(SPI_tuptable->tupdesc,NAME2);\
    if (TRGT->NAME == SPI_ERROR_NOATTRIBUTE) err++;

/* Verify column TRGT->NAME has type OID TYPE; increments local `err`
   on mismatch.  Reads the global SPI_tuptable. */
#define CHECK_TYP(TRGT,NAME,TYPE) \
    if (SPI_gettypeid(SPI_tuptable->tupdesc, TRGT->NAME) != TYPE) {\
        DBG("CHECK_TYP: expecting %d, got: %d", TYPE, SPI_gettypeid(SPI_tuptable->tupdesc, TRGT->NAME));\
        err++;\
    }

/* Fetch int column WHICH of the local `tuple`/`tupdesc` into TRGT;
   on SQL NULL, logs NULLMSG and returns -1 from the enclosing function.
   Requires locals: binval, isnull, tuple, tupdesc. */
#define GET_INT_FROM_TUPLE(TRGT,WHICH,NULLMSG) \
    binval = SPI_getbinval(tuple, tupdesc, WHICH, &isnull);\
    if (isnull) { \
        elog(NOTICE, NULLMSG); \
        return -1; \
    } \
    TRGT = DatumGetInt32(binval);

/* Fetch text column WHICH of the local `tuple`/`tupdesc` as a C string
   into TRGT. */
#define GET_TEXT_FROM_TUPLE(TRGT,WHICH) \
    TRGT = DatumGetCString(SPI_getvalue(tuple, tupdesc, WHICH));
|
||||
|
||||
|
||||
/* Resolve and type-check the seq/word/stdword/token columns of the
 * current lexicon query result into lex_cols.  Returns 0 on success,
 * -1 (with a NOTICE) when a column is missing or has the wrong type.
 * NOTE: the FETCH_COL/CHECK_TYP macros read the global SPI_tuptable;
 * the `tuptable` parameter is unused. */
static int fetch_lex_columns(SPITupleTable *tuptable, lex_columns_t *lex_cols)
{
    int err = 0;
    FETCH_COL(lex_cols,seq,"seq");
    FETCH_COL(lex_cols,word,"word");
    FETCH_COL(lex_cols,stdword,"stdword");
    FETCH_COL(lex_cols,token,"token");
    if (err) {
        elog(NOTICE, "lexicon queries must return columns 'seq', 'word', 'stdword' and 'token'");
        return -1;
    }
    /* Column existence verified; now verify the declared types. */
    CHECK_TYP(lex_cols,seq,INT4OID);
    CHECK_TYP(lex_cols,word,TEXTOID);
    CHECK_TYP(lex_cols,stdword,TEXTOID);
    CHECK_TYP(lex_cols,token,INT4OID);
    if (err) {
        elog(NOTICE, "lexicon column types must be: 'seq' int4, 'word' text, 'stdword' text, and 'token' int4");
        return -1;
    }
    return 0;
}
|
||||
|
||||
/* sanitize table names, leave '.' for schema */
|
||||
|
||||
/*
 * tableNameOk() -- sanity check a table name that will be concatenated
 * into SQL: only alphanumerics, '_', '"' and '.' (for schema
 * qualification) are allowed.  Returns 1 if the name is acceptable,
 * 0 otherwise.  An empty string passes (callers reject it separately
 * with a strlen() check).
 */
static int tableNameOk(char *t)
{
    while (*t != '\0') {
        /* cast to unsigned char: passing a negative plain char (e.g. a
           high UTF-8 byte) to isalnum() is undefined behavior */
        if (!(isalnum((unsigned char)*t) || *t == '_' || *t == '.' || *t == '"'))
            return 0;
        t++;
    }
    return 1;
}
|
||||
|
||||
/*
 * load_lex() -- load lexicon (or gazeteer) records from table 'tab' into
 * *lex via SPI.  The table must provide columns seq, word, stdword and
 * token, plus an id column used for ordering.  Returns 0 on success,
 * -1 (after an elog NOTICE) on any error.
 *
 * Fixes: the original prepared the same query plan twice back to back,
 * leaking the first SPIPlanPtr; and its empty-table-name message said
 * "rules table" although this function loads the lexicon/gazeteer.
 */
static int load_lex(LEXICON *lex, char *tab)
{
    int ret;
    SPIPlanPtr SPIplan;
    Portal SPIportal;
    bool moredata = TRUE;
#ifdef DEBUG
    struct timeval t1, t2;
    double elapsed;
#endif
    char *sql;

    int ntuples;
    int total_tuples = 0;

    /* -1 marks "column numbers not resolved yet" */
    lex_columns_t lex_columns = {seq: -1, word: -1, stdword: -1, token: -1};

    int seq;
    char *word;
    char *stdword;
    int token;

    DBG("start load_lex\n");
    SET_TIME(t1);

    if (!tab || !strlen(tab)) {
        elog(NOTICE, "load_lex: lexicon table is not usable");
        return -1;
    }
    if (!tableNameOk(tab)) {
        elog(NOTICE, "load_lex: lex and gaz table names may only be alphanum and '.\"_' characters (%s)", tab);
        return -1;
    }

    /* build the select; tab was sanity checked above so string
       concatenation is safe here */
    sql = SPI_palloc(strlen(tab)+65);
    strcpy(sql, "select seq, word, stdword, token from ");
    strcat(sql, tab);
    strcat(sql, " order by id ");

    /* prepare the lexicon query (the original prepared it twice;
       the first plan leaked) */
    SPIplan = SPI_prepare(sql, 0, NULL);
    if (SPIplan == NULL) {
        elog(NOTICE, "load_lex: couldn't create query plan for the lex data via SPI (%s)", sql);
        return -1;
    }

    if ((SPIportal = SPI_cursor_open(NULL, SPIplan, NULL, NULL, true)) == NULL) {
        elog(NOTICE, "load_lex: SPI_cursor_open('%s') returns NULL", sql);
        return -1;
    }

    /* fetch TUPLIMIT rows at a time until the cursor is exhausted */
    while (moredata == TRUE) {
        SPI_cursor_fetch(SPIportal, TRUE, TUPLIMIT);

        if (SPI_tuptable == NULL) {
            elog(NOTICE, "load_lex: SPI_tuptable is NULL");
            return -1;
        }

        /* resolve and type-check column numbers once, on the first batch */
        if (lex_columns.seq == -1) {
            ret = fetch_lex_columns(SPI_tuptable, &lex_columns);
            if (ret)
                return ret;
        }

        ntuples = SPI_processed;
        total_tuples += ntuples;

        if (ntuples > 0) {
            int t;
            Datum binval;
            bool isnull;
            SPITupleTable *tuptable = SPI_tuptable;
            TupleDesc tupdesc = SPI_tuptable->tupdesc;

            for (t = 0; t < ntuples; t++) {
                HeapTuple tuple = tuptable->vals[t];
                /* the GET_* macros elog and return -1 on SQL NULL ints */
                GET_INT_FROM_TUPLE(seq,lex_columns.seq,"load_lex: seq contains a null value");
                GET_TEXT_FROM_TUPLE(word,lex_columns.word);
                GET_TEXT_FROM_TUPLE(stdword,lex_columns.stdword);
                GET_INT_FROM_TUPLE(token,lex_columns.token,"load_lex: token contains a null value");
                lex_add_entry(lex, seq, word, stdword, token);
            }
            SPI_freetuptable(tuptable);
        }
        else
            moredata = FALSE;
    }

    SET_TIME(t2);
    ELAPSED_T(t1, t2);
    DBG("Time to read %i lexicon records: %.1f ms.", total_tuples, elapsed);

    return 0;
}
|
||||
|
||||
/*
 * fetch_rules_columns() -- resolve and type-check the 'rule' column of
 * the rules query result currently in SPI_tuptable, storing its number
 * in *rules_cols.  Returns 0 on success, -1 (after an elog NOTICE) on a
 * missing column or wrong type.
 *
 * NOTE(review): the tuptable parameter is unused -- the FETCH_COL and
 * CHECK_TYP macros read the global SPI_tuptable directly.
 */
static int fetch_rules_columns(SPITupleTable *tuptable, rules_columns_t *rules_cols)
{
    int err = 0;   /* incremented by the FETCH_COL / CHECK_TYP macros */
    FETCH_COL(rules_cols,rule,"rule");
    if (err) {
        elog(NOTICE, "rules queries must return column 'rule'");
        return -1;
    }
    CHECK_TYP(rules_cols,rule,TEXTOID);
    if (err) {
        elog(NOTICE, "rules column type must be: 'rule' text");
        return -1;
    }
    return 0;
}
|
||||
|
||||
/*
 * load_rules() -- load standardizer rules from table 'tab' via SPI.
 * The table must provide a text column 'rule' (a space separated list of
 * integers, parsed by parse_rule()) and an id column used for ordering.
 * Returns 0 on success, -1 (after an elog NOTICE) on any error.
 *
 * Fix: three error messages said "load_roles" instead of "load_rules".
 */
static int load_rules(RULES *rules, char *tab)
{
    int ret;
    SPIPlanPtr SPIplan;
    Portal SPIportal;
    bool moredata = TRUE;
#ifdef DEBUG
    struct timeval t1, t2;
    double elapsed;
#endif
    char *sql;

    int rule_arr[MAX_RULE_LENGTH];

    int ntuples;
    int total_tuples = 0;

    /* -1 marks "column number not resolved yet" */
    rules_columns_t rules_columns = {rule: -1};

    char *rule;

    DBG("start load_rules\n");
    SET_TIME(t1);

    if (!tab || !strlen(tab)) {
        elog(NOTICE, "load_rules: rules table is not usable");
        return -1;
    }
    if (!tableNameOk(tab)) {
        elog(NOTICE, "load_rules: rules table name may only be alphanum and '.\"_' characters (%s)", tab);
        return -1;
    }

    /* build the select; tab was sanity checked above */
    sql = SPI_palloc(strlen(tab)+35);
    strcpy(sql, "select rule from ");
    strcat(sql, tab);
    strcat(sql, " order by id ");

    /* prepare the rules query */
    SPIplan = SPI_prepare(sql, 0, NULL);
    if (SPIplan == NULL) {
        elog(NOTICE, "load_rules: couldn't create query plan for the rule data via SPI (%s)", sql);
        return -1;
    }

    if ((SPIportal = SPI_cursor_open(NULL, SPIplan, NULL, NULL, true)) == NULL) {
        elog(NOTICE, "load_rules: SPI_cursor_open('%s') returns NULL", sql);
        return -1;
    }

    /* fetch TUPLIMIT rows at a time until the cursor is exhausted */
    while (moredata == TRUE) {
        SPI_cursor_fetch(SPIportal, TRUE, TUPLIMIT);

        if (SPI_tuptable == NULL) {
            elog(NOTICE, "load_rules: SPI_tuptable is NULL");
            return -1;
        }

        /* resolve and type-check the 'rule' column once, on the first batch */
        if (rules_columns.rule == -1) {
            ret = fetch_rules_columns(SPI_tuptable, &rules_columns);
            if (ret)
                return ret;
        }

        ntuples = SPI_processed;

        if (ntuples > 0) {
            int t;
            SPITupleTable *tuptable = SPI_tuptable;
            TupleDesc tupdesc = SPI_tuptable->tupdesc;

            for (t = 0; t < ntuples; t++) {
                int nr;
                HeapTuple tuple = tuptable->vals[t];
                GET_TEXT_FROM_TUPLE(rule,rules_columns.rule);
                nr = parse_rule(rule, rule_arr);
                if (nr == -1) {
                    elog(NOTICE, "load_rules: rule exceeds 128 terms");
                    return -1;
                }
                ret = rules_add_rule(rules, nr, rule_arr);
                if (ret != 0) {
                    elog(NOTICE,"load_rules: failed to add rule %d (%d): %s",
                        total_tuples+t+1, ret, rule);
                    return -1;
                }
            }
            SPI_freetuptable(tuptable);
        }
        else
            moredata = FALSE;

        total_tuples += ntuples;
    }

    ret = rules_ready(rules);
    if (ret != 0) {
        elog(NOTICE, "load_rules: failed to ready the rules: err: %d", ret);
        return -1;
    }

    SET_TIME(t2);
    ELAPSED_T(t1, t2);
    DBG("Time to read %i rule records: %.1f ms.", total_tuples, elapsed);

    return 0;
}
|
||||
|
||||
|
16
extras/address_standardizer/std_pg_hash.h
Normal file
16
extras/address_standardizer/std_pg_hash.h
Normal file
|
@ -0,0 +1,16 @@
|
|||
|
||||
/* Opaque type to use in standardizer cache API */
typedef void *StdCache;

/* Return the standardizer cache for this call context
   (presumably backend-local; see the implementation -- TODO confirm). */
StdCache GetStdCache(FunctionCallInfo fcinfo);
/* Test whether a standardizer for this lex/gaz/rules table triple is
   already cached (semantics inferred from the name -- verify in the
   implementation). */
bool IsInStdCache(StdCache STDCache, char *lextab, char *gaztab, char *rultab);
/* Build a standardizer from the given tables and store it in the cache
   (inferred from the name -- verify in the implementation). */
void AddToStdCache(StdCache cache, char *lextab, char *gaztab, char *rultab);
/* Retrieve the cached standardizer for this lex/gaz/rules table triple. */
STANDARDIZER *GetStdFromStdCache(StdCache STDCache, char *lextab, char *gaztab, char *rultab);

/*
 * This is the only interface external code should be calling:
 * it will get the standardizer out of the cache, or
 * it will create a new one and save it in the cache.
 */
STANDARDIZER *GetStdUsingFCInfo(FunctionCallInfo fcinfo, char *lextab, char *gaztab, char *rultab);
|
||||
|
76
extras/address_standardizer/test-parseaddress.sql
Normal file
76
extras/address_standardizer/test-parseaddress.sql
Normal file
|
@ -0,0 +1,76 @@
|
|||
\pset pager off
|
||||
|
||||
drop table if exists test_parse_address cascade;
|
||||
create table test_parse_address (
|
||||
id serial not null primary key,
|
||||
instring text not null,
|
||||
outstring text
|
||||
);
|
||||
copy test_parse_address (instring, outstring) from stdin;
|
||||
@@ sttype dirs dirs words$
|
||||
123 oak ln e n mycity ny (123,"oak ln e",,"123 oak ln e","n mycity",NY,,,US)
|
||||
123 oak lane east n mycity ny (123,"oak lane east",,"123 oak lane east","n mycity",NY,,,US)
|
||||
123 oak ln e north mycity ny (123,"oak ln e",,"123 oak ln e","north mycity",NY,,,US)
|
||||
@@ sttype dirs dirs saint words$
|
||||
123 oak ln e n st marie ny (123,"oak ln e",,"123 oak ln e","n st marie",NY,,,US)
|
||||
123 oak lane east n st marie ny (123,"oak lane east",,"123 oak lane east","n st marie",NY,,,US)
|
||||
123 oak ln e north st marie ny (123,"oak ln e",,"123 oak ln e","north st marie",NY,,,US)
|
||||
123 oak ln e n saint marie ny (123,"oak ln e",,"123 oak ln e","n saint marie",NY,,,US)
|
||||
123 oak lane east n saint marie ny (123,"oak lane east",,"123 oak lane east","n saint marie",NY,,,US)
|
||||
123 oak ln e north saint marie ny (123,"oak ln e",,"123 oak ln e","north saint marie",NY,,,US)
|
||||
@@ sttype dirs saint words$
|
||||
123 oak ln e st marie ny (123,"oak ln",,"123 oak ln","e st marie",NY,,,US)
|
||||
123 oak lane east st marie ny (123,"oak lane",,"123 oak lane","east st marie",NY,,,US)
|
||||
123 oak ln e st marie ny (123,"oak ln",,"123 oak ln","e st marie",NY,,,US)
|
||||
123 oak ln e saint marie ny (123,"oak ln",,"123 oak ln","e saint marie",NY,,,US)
|
||||
123 oak lane east saint marie ny (123,"oak lane",,"123 oak lane","east saint marie",NY,,,US)
|
||||
123 oak ln e saint marie ny (123,"oak ln",,"123 oak ln","e saint marie",NY,,,US)
|
||||
@@ sttype saint words$
|
||||
123 oak ln st marie ny (123,"oak ln",,"123 oak ln","st marie",NY,,,US)
|
||||
123 oak lane st marie ny (123,"oak lane",,"123 oak lane","st marie",NY,,,US)
|
||||
123 oak ln st marie ny (123,"oak ln",,"123 oak ln","st marie",NY,,,US)
|
||||
123 oak ln saint marie ny (123,"oak ln",,"123 oak ln","saint marie",NY,,,US)
|
||||
123 oak lane saint marie ny (123,"oak lane",,"123 oak lane","saint marie",NY,,,US)
|
||||
123 oak ln saint marie ny (123,"oak ln",,"123 oak ln","saint marie",NY,,,US)
|
||||
@@ sttype words$
|
||||
123 oak ln marie ny (123,"oak ln",,"123 oak ln",marie,NY,,,US)
|
||||
123 oak ln new marie ny (123,"oak ln",,"123 oak ln","new marie",NY,,,US)
|
||||
@@ === same as above but with commas ===
|
||||
@@ sttype dirs dirs words$
|
||||
123 oak ln e, n mycity ny (123,"oak ln e",,"123 oak ln e","n mycity",NY,,,US)
|
||||
123 oak lane east, n mycity ny (123,"oak lane east",,"123 oak lane east","n mycity",NY,,,US)
|
||||
123 oak ln e, north mycity ny (123,"oak ln e",,"123 oak ln e","north mycity",NY,,,US)
|
||||
123 oak ln e n, mycity ny (123,"oak ln e n",,"123 oak ln e n",mycity,NY,,,US)
|
||||
123 oak lane east n, mycity ny (123,"oak lane east n",,"123 oak lane east n",mycity,NY,,,US)
|
||||
123 oak ln e north, mycity ny (123,"oak ln e north",,"123 oak ln e north",mycity,NY,,,US)
|
||||
@@ sttype dirs dirs saint words$
|
||||
123 oak ln e, n st marie ny (123,"oak ln e",,"123 oak ln e","n st marie",NY,,,US)
|
||||
123 oak lane east, n st marie ny (123,"oak lane east",,"123 oak lane east","n st marie",NY,,,US)
|
||||
123 oak ln e, north st marie ny (123,"oak ln e",,"123 oak ln e","north st marie",NY,,,US)
|
||||
123 oak ln e, n saint marie ny (123,"oak ln e",,"123 oak ln e","n saint marie",NY,,,US)
|
||||
123 oak lane east, n saint marie ny (123,"oak lane east",,"123 oak lane east","n saint marie",NY,,,US)
|
||||
123 oak ln e, north saint marie ny (123,"oak ln e",,"123 oak ln e","north saint marie",NY,,,US)
|
||||
@@ sttype dirs saint words$
|
||||
123 oak ln e, st marie ny (123,"oak ln e",,"123 oak ln e","st marie",NY,,,US)
|
||||
123 oak lane east, st marie ny (123,"oak lane east",,"123 oak lane east","st marie",NY,,,US)
|
||||
123 oak ln e, st marie ny (123,"oak ln e",,"123 oak ln e","st marie",NY,,,US)
|
||||
123 oak ln e, saint marie ny (123,"oak ln e",,"123 oak ln e","saint marie",NY,,,US)
|
||||
123 oak lane east, saint marie ny (123,"oak lane east",,"123 oak lane east","saint marie",NY,,,US)
|
||||
123 oak ln e, saint marie ny (123,"oak ln e",,"123 oak ln e","saint marie",NY,,,US)
|
||||
@@ sttype saint words$
|
||||
123 oak ln, st marie ny (123,"oak ln",,"123 oak ln","st marie",NY,,,US)
|
||||
123 oak lane, st marie ny (123,"oak lane",,"123 oak lane","st marie",NY,,,US)
|
||||
123 oak ln, st marie ny (123,"oak ln",,"123 oak ln","st marie",NY,,,US)
|
||||
123 oak ln, saint marie ny (123,"oak ln",,"123 oak ln","saint marie",NY,,,US)
|
||||
123 oak lane, saint marie ny (123,"oak lane",,"123 oak lane","saint marie",NY,,,US)
|
||||
123 oak ln, saint marie ny (123,"oak ln",,"123 oak ln","saint marie",NY,,,US)
|
||||
@@ sttype words$
|
||||
123 oak ln, marie ny (123,"oak ln",,"123 oak ln",marie,NY,,,US)
|
||||
123 oak ln, new marie ny (123,"oak ln",,"123 oak ln","new marie",NY,,,US)
|
||||
\.
|
||||
|
||||
select id, instring, outstring as expected, parse_address(instring) as got_result
|
||||
from test_parse_address
|
||||
where instring not like '@@%' and parse_address(instring)::text != outstring;
|
||||
|
||||
\q
|
12
extras/address_standardizer/test.sql
Normal file
12
extras/address_standardizer/test.sql
Normal file
|
@ -0,0 +1,12 @@
|
|||
\set ECHO queries
|
||||
\pset pager off
|
||||
|
||||
select * from parse_address('123 Main Street, Kansas City, MO 45678');
|
||||
|
||||
\i /usr/share/postgresql/9.2/extension/us-lex.sql
|
||||
\i /usr/share/postgresql/9.2/extension/us-gaz.sql
|
||||
\i /usr/share/postgresql/9.2/extension/us-rules.sql
|
||||
|
||||
select * from standardize_address('lex'::text, 'gaz'::text, 'rules'::text, '123 Main Street'::text, 'Kansas City, MO 45678'::text);
|
||||
|
||||
\q
|
8
extras/address_standardizer/test2.sql
Normal file
8
extras/address_standardizer/test2.sql
Normal file
|
@ -0,0 +1,8 @@
|
|||
\set ECHO queries
|
||||
\pset pager off
|
||||
|
||||
\i micro-macro.sql
|
||||
|
||||
select (std).* from (
|
||||
select standardize_address('lex', 'gaz', 'rules', micro, macro) as std
|
||||
from addresses) as foo;
|
335
extras/address_standardizer/test_main.c
Normal file
335
extras/address_standardizer/test_main.c
Normal file
|
@ -0,0 +1,335 @@
|
|||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "pagc_api.h"
|
||||
#include "pagc_std_api.h"
|
||||
|
||||
#define RULESIZE 40
|
||||
|
||||
#define LEXIN "lexicon.csv"
|
||||
#define GAZIN "gazeteer.csv"
|
||||
#define RULESIN "rules.txt"
|
||||
|
||||
static int standardize_command_line( STANDARDIZER *std ,
|
||||
char *input_str ,
|
||||
int option ) ;
|
||||
|
||||
void print_lexicon( ENTRY ** hash_table ) ;
|
||||
|
||||
/*
|
||||
parse_csv() parses the following file format into fields
|
||||
|
||||
"1","#",16,"#"
|
||||
"2","#",7,"#"
|
||||
"1","&",13,"AND"
|
||||
"2","&",1,"AND"
|
||||
"3","&",7,"AND"
|
||||
"1","-","9","-"
|
||||
|
||||
*/
|
||||
|
||||
/* ----------------------------------------------------
|
||||
lexicon.c (convert_field)
|
||||
called by lexicon.c (read_lexicon)
|
||||
ctype.h (isspace)
|
||||
uses macro BLANK_STRING
|
||||
-------------------------------------------------------*/
|
||||
/* ----------------------------------------------------
   convert_field() -- copy the next CSV field from inp into buf,
   stripping quotes and carriage returns, stopping at ',' or '\n'.
   Returns a pointer just past the field's delimiter so the caller can
   continue with the next field, or NULL when the input is exhausted
   (or starts with whitespace, which stops the read).  buf is always
   NUL terminated on return.

   Fixes: the original left buf UNterminated when the input ended
   without a delimiter; and isspace() was called on a plain char,
   which is undefined behavior for negative (high-bit) bytes.
   -------------------------------------------------------*/
static char *convert_field( char *buf ,
                            char *inp ) {
    char c ;
    char *d = buf;
    char *s = inp ;

    *d = '\0' ;
    /* -- space at the beginning of a line will stop the read -- */
    if ( isspace( (unsigned char)*s ) )
        return NULL ;
    while ( ( c = *s++ ) != '\0' ) {
        if ( c == '\"' ||
             c == '\r' )
            continue ; /* -- ignore quotes and carriage returns -- */
        /* -- zero terminate field and record delimiters -- */
        if ( c == '\n' ||
             c == ',' ) {
            *d = '\0' ;
            return s ;
        }
        *d++ = c ; /* -- copy it -- */
    }
    /* input ended without a delimiter: still terminate the copy */
    *d = '\0' ;
    return NULL ;
}
|
||||
|
||||
/*
 * parse_csv() -- split one CSV lexicon line of the form
 *     "1","#",16,"#"
 * into its fields, in file order: seq, word, token, stdword.
 * Returns 1 on success, 0 if the line could not be parsed.
 *
 * Fix: the original checked only the FIRST convert_field() call for
 * failure; a short or malformed line passed NULL on to the next call,
 * which dereferences it.  Every intermediate field is now checked.
 */
static int parse_csv(char *buf, int *seq, char *word, char *stdword, int *token)
{
    char *next_str ;
    char num_str[512];

    if ( ( next_str = convert_field( num_str , buf ) ) == NULL ) return 0;
    sscanf( num_str, "%d", seq );
    if ( ( next_str = convert_field( word, next_str ) ) == NULL ) return 0;
    if ( ( next_str = convert_field( num_str, next_str ) ) == NULL ) return 0;
    sscanf( num_str, "%d", token );
    /* last field: convert_field() may legitimately return NULL here when
       the line has no trailing delimiter */
    next_str = convert_field( stdword, next_str );

    return 1;
}
|
||||
|
||||
|
||||
/*
|
||||
parse_rule() reads lines the following and loads them into int[] and
|
||||
returns the number of items read.
|
||||
|
||||
1 2 11 28 -1 10 10 11 13 -1 0 16
|
||||
1 2 11 28 12 -1 10 10 11 13 12 -1 0 17
|
||||
1 2 11 28 29 -1 10 10 11 13 13 -1 0 16
|
||||
1 2 11 28 29 12 -1 10 10 11 13 13 12 -1 0 17
|
||||
-1
|
||||
*/
|
||||
|
||||
/*
 * parse_rule() -- read whitespace separated integers from buf into
 * rule[] and return how many were read.  Parsing stops at the first
 * token strtol() cannot consume.
 * NOTE: the caller must supply a large enough array; no bound is
 * enforced here.
 */
int parse_rule(char *buf, int *rule)
{
    int count;
    char *cursor = buf;

    for (count = 0; ; count++) {
        char *after;
        long val = strtol(cursor, &after, 10);

        if (after == cursor)   /* no digits consumed: end of the rule */
            break;
        rule[count] = (int) val;
        cursor = after;
    }

    return count;
}
|
||||
|
||||
/*
 * Usage() -- print the test_main command line help and exit(1).
 * The -o bit-flag values are interpreted in main() (bits 1 and 2) and
 * standardize_command_line() (bits 4 and 8).
 */
void Usage()
{
    printf("Usage: test_main [-o n] \n");
    printf(" -o n = options bit flag\n");
    printf(" 1 = print lexicon\n");
    printf(" 2 = print gazeteer\n");
    printf(" 4 = print standardized fields\n");
    printf(" 8 = print rule statistics\n");
    exit(1);
}
|
||||
|
||||
/*
 * main() -- stand-alone driver for the PAGC standardizer.
 *
 * Loads LEXIN ("lexicon.csv"), GAZIN ("gazeteer.csv") and RULESIN
 * ("rules.txt") from the current directory, builds a standardizer from
 * them, then loops prompting on stdin for MICRO/MACRO address lines
 * until the user types "exit" (see standardize_command_line()).
 *
 * Usage: test_main [-o n], where n is a bit flag:
 *   1 = print lexicon, 2 = print gazeteer,
 *   4 = print standardized fields, 8 = print rule statistics.
 *
 * All setup failures are handled with assert(), so a missing input
 * file aborts the program.
 */
int main(int argc, char *argv[])
{
    STANDARDIZER *std;
    LEXICON *lex;
    LEXICON *gaz;
    RULES *rules;

    char buf[1024];          /* one input line from the csv/rules files */

    int seq;
    char input_str[ 4096 ] ;  /* interactive input, used by standardize_command_line() */
    char word[512];
    char stdword[512];
    int token;
    int nr;
    int rule[RULESIZE];       /* one parsed rule (list of ints) */
    int err;
    int cnt;                  /* current line number, for error reports */
    int option = 0;           /* -o bit flags, see Usage() */

    FILE *in;

    /* parse the only supported option: -o n */
    if (argc == 3 && !strcmp(argv[1], "-o")) {
        option = strtol(argv[2], NULL, 10);
        argc -= 2;
        argv += 2;
    }
    else if (argc != 1)
        Usage();

    std = std_init();
    assert(std);

    lex = lex_init(std->err_p);
    assert(lex);

    /* ------- load the address lexicon from LEXIN ------- */
    in = fopen(LEXIN, "rb");
    assert(in);

    cnt = 0;
    while (!feof(in) && fgets(buf, 1024, in)) {
        cnt++;
        /* parse into fields */
        if (parse_csv(buf, &seq, word, stdword, &token)) {
            /* add the record to the lexicon */
            err = lex_add_entry(lex, seq, word, stdword, token);
            if (err != 1)
                printf("lex: Failed: %d: %s", cnt, buf);
        }
        else {
            printf("lex: Skipping: %d: %s", cnt, buf);
        }
    }
    fclose(in);

    if (option & 1) {
        printf("------------ address lexicon --------------\n");
        print_lexicon(lex->hash_table);
        printf("\n");
    }

    /* ------- load the gazeteer from GAZIN (same csv format) ------- */
    gaz = lex_init(std->err_p);
    assert(gaz);

    in = fopen(GAZIN, "rb");
    assert(in);

    cnt = 0;
    while (!feof(in) && fgets(buf, 1024, in)) {
        cnt++;
        /* parse into fields */
        if (parse_csv(buf, &seq, word, stdword, &token)) {
            /* add the record to the lexicon */
            err = lex_add_entry(gaz, seq, word, stdword, token);
            if (err != 1)
                printf("gaz: Failed: %d: %s", cnt, buf);
        }
        else {
            printf("gaz: Skipping: %d: %s", cnt, buf);
        }
    }
    fclose(in);

    if (option & 2) {
        printf("------------ gazeteer lexicon --------------\n");
        print_lexicon(gaz->hash_table);
        printf("\n");
    }

    rules = rules_init(std->err_p);
    assert(rules);
    /* enable collection so option bit 8 can print rule statistics */
    rules -> r_p -> collect_statistics = TRUE ;

    /* ************ RULES **************** */

    in = fopen(RULESIN, "rb");
    assert(in);

    cnt = 0;
    while (!feof(in) && fgets(buf, 1024, in)) {
        cnt++;
        /* parse into fields */
        nr = parse_rule(buf, rule);

        /* add the record to the rules */
        err = rules_add_rule(rules, nr, rule);
        if (err != 0)
            printf("rules: Failed: %d (%d): %s", cnt, err, buf);
    }
    err = rules_ready(rules);
    if (err != 0)
        printf("rules: Failed: err=%d\n", err);
    fclose(in);

    /* bind the loaded data to the standardizer and finalize it */
    std_use_lex(std, lex);
    std_use_gaz(std, gaz);
    std_use_rules(std, rules);
    std_ready_standardizer(std);

    /* interactive loop: ends when standardize_command_line() sees
       exit/quit/done (returns FAIL) */
    printf( "Standardization test. Type \"exit\" to quit:\n" ) ;
    fflush( stdout ) ;
    while ( TRUE ) {
        err = standardize_command_line( std, input_str, option ) ;
        if ( err == FAIL ) {
            break ;
        }
    }
    printf( "OK\n" ) ;
    fflush( stdout ) ;

    std_free(std);
    /* these were freed when we bound them with std_use_*()
    rules_free(rules);
    lex_free(gaz);
    lex_free(lex);
    */

    return 0;
}
|
||||
|
||||
|
||||
|
||||
/*
 * standardize_command_line() -- prompt on stdin for a MICRO line (street
 * address) and a MACRO line (city/state/...), standardize the pair with
 * std_standardize_mm() and print the result with print_stdaddr().
 *
 * std       - a ready standardizer (lex/gaz/rules already bound)
 * input_str - caller supplied scratch buffer (memset to MAXSTRLEN here)
 * option    - bit 4: pass 1 to std_standardize_mm() (prints standardized
 *             fields); bit 8: print rule statistics afterwards
 *
 * Returns FAIL when the user typed exit/quit/done (ends the caller's
 * loop), FALSE when no MICRO input was given, 1 after a standardization.
 */
static int standardize_command_line( STANDARDIZER *std ,
                                     char *input_str ,
                                     int option ) {
    STDADDR *result;
    int fld_num ,
        have_user_macros ,   /* NOTE(review): set below but never read */
        num_prompts ;
    char unstandard_mic[ MAXSTRLEN ] ;
    char unstandard_mac_left[ MAXSTRLEN ] ;

    /* fld_num runs 1 (MICRO) and 2 (MACRO): two prompts */
    num_prompts = 3 ;

    unstandard_mic[ 0 ] = SENTINEL ;
    unstandard_mac_left[ 0 ] = SENTINEL ; ;
    have_user_macros = FALSE ;
    for ( fld_num = 1 ;
          fld_num < num_prompts ;
          fld_num++ ) {
        /* -- print prompt -- */
        if ( fld_num == 1 )
            printf( "MICRO:" ) ;
        else
            printf( "MACRO:" ) ;
        fflush( stdout ) ; /* -- to ensure prompt goes out --*/
        memset( input_str ,
                0 ,
                MAXSTRLEN ) ;
        input_str[ 0 ] = SENTINEL ;
        /* -- get user's input -- */
        if ( ( !get_input_line( input_str , stdin ) ) ||
             ( strncmp( input_str , "exit" , 4 ) == 0 ) ||
             ( strncmp( input_str , "quit" , 4 ) == 0 ) ||
             ( strncmp( input_str , "done" , 4 ) == 0 )
           ) {
            return FAIL ; /* -- indicate exit -- */
        }
        /* -- get input first, then standardize -- */
        if ( fld_num == 1 ) {
            strcpy( unstandard_mic ,
                    input_str ) ;
            /* MICRO input is mandatory */
            if ( *unstandard_mic == SENTINEL ) {
                printf( "No MICRO input\n" ) ;
                return FALSE ; /* -- indicate no standardization -- */
            }
            convert_latin_one ( unstandard_mic ) ;
        } else {
            /* MACRO input is optional */
            strcpy( unstandard_mac_left ,
                    input_str ) ;
            if ( *unstandard_mac_left != SENTINEL ) {
                have_user_macros = TRUE ;
                convert_latin_one ( unstandard_mac_left ) ;
            }
        }
    }

    /* standardize the micro/macro pair; bit 4 of option enables the
       standardizer's field output */
    result = std_standardize_mm( std,
                                 unstandard_mic,
                                 unstandard_mac_left,
                                 (option & 4)?1:0 ) ;

    print_stdaddr( result );

    if (option & 8)
        output_rule_statistics( std->pagc_p->rules, std->err_p ) ;

    stdaddr_free(result);

    return 1;
}
|
||||
|
||||
|
1005
extras/address_standardizer/tokenize.c
Normal file
1005
extras/address_standardizer/tokenize.c
Normal file
File diff suppressed because it is too large
Load diff
11
extras/address_standardizer/usps-st-city-adds.txt
Normal file
11
extras/address_standardizer/usps-st-city-adds.txt
Normal file
|
@ -0,0 +1,11 @@
|
|||
NY NY
|
||||
NY N Y
|
||||
NY NYC
|
||||
NY N Y C
|
||||
NY NEW YORK
|
||||
NY NY
|
||||
NY NY
|
||||
NY NY
|
||||
NY MANHATTAN
|
||||
NY QUEENS
|
||||
NY THE BRONX
|
38469
extras/address_standardizer/usps-st-city-orig.txt
Normal file
38469
extras/address_standardizer/usps-st-city-orig.txt
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue