cpython/Modules/unicodedata.c

623 lines
16 KiB
C
Raw Normal View History

/* ------------------------------------------------------------------------
unicodedata -- Provides access to the Unicode 3.2 data base.
Data was extracted from the Unicode 3.2 UnicodeData.txt file.
Written by Marc-Andre Lemburg (mal@lemburg.com).
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Modified by Martin v. Löwis (martin@v.loewis.de)
Copyright (c) Corporation for National Research Initiatives.
------------------------------------------------------------------------ */
#include "Python.h"
#include "ucnhash.h"
/* character properties */
/* One property record per distinct combination of character properties.
   _getrecord() returns a pointer to an element of
   _PyUnicode_Database_Records, which holds records of this type. */
typedef struct {
const unsigned char category; /* index into _PyUnicode_CategoryNames */
const unsigned char combining; /* combining class value 0 - 255 */
const unsigned char bidirectional; /* index into _PyUnicode_BidirectionalNames */
const unsigned char mirrored; /* true if mirrored in bidir mode */
} _PyUnicode_DatabaseRecord;
/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"
/* Return the property record for the first code unit of the unicode
   object v.  The caller is expected to have checked that v holds
   exactly one character.  Values outside 0 .. 0x10FFFF map to record 0. */
static const _PyUnicode_DatabaseRecord*
_getrecord(PyUnicodeObject* v)
{
    int code;
    int index;

    code = (int) *PyUnicode_AS_UNICODE(v);

    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        /* two-stage lookup: index1 selects a block from the high bits of
           the code, index2 selects the record within that block from the
           low SHIFT bits */
        index = index1[(code>>SHIFT)];
        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
    }

    return &_PyUnicode_Database_Records[index];
}
/* --- Module API --------------------------------------------------------- */
/* decimal(unichr[, default])

   Return the decimal value of a single Unicode character as a Python int.
   When the character has no decimal value, return `default` if it was
   supplied, otherwise raise ValueError.  A unicode argument whose length
   is not 1 raises TypeError. */
static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    PyObject *fallback = NULL;
    long value;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &ch, &fallback))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    value = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(ch));
    if (value >= 0)
        return PyInt_FromLong(value);

    /* No decimal value: hand back the caller's default if there is one. */
    if (fallback != NULL) {
        Py_INCREF(fallback);
        return fallback;
    }
    PyErr_SetString(PyExc_ValueError,
                    "not a decimal");
    return NULL;
}
/* digit(unichr[, default])

   Return the digit value of a single Unicode character as a Python int.
   When the character has no digit value, return `default` if it was
   supplied, otherwise raise ValueError.  A unicode argument whose length
   is not 1 raises TypeError. */
static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    PyObject *fallback = NULL;
    long value;

    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &ch, &fallback))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    value = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(ch));
    if (value >= 0)
        return PyInt_FromLong(value);

    /* No digit value: hand back the caller's default if there is one. */
    if (fallback != NULL) {
        Py_INCREF(fallback);
        return fallback;
    }
    PyErr_SetString(PyExc_ValueError, "not a digit");
    return NULL;
}
/* numeric(unichr[, default])

   Return the numeric value of a single Unicode character as a Python
   float.  When the character has no numeric value, return `default` if
   it was supplied, otherwise raise ValueError.  A unicode argument whose
   length is not 1 raises TypeError. */
static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    double rc;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
    /* Py_UNICODE_TONUMERIC is documented to return exactly -1.0 when the
       character has no numeric value.  Testing "rc < 0" would also reject
       characters whose real numeric value is negative (for instance a
       value of -1/2), so compare against the sentinel itself. */
    if (rc == -1.0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        else {
            Py_INCREF(defobj);
            return defobj;
        }
    }
    return PyFloat_FromDouble(rc);
}
/* category(unichr)

   Return the general category name of a single Unicode character as a
   Python string, taken from _PyUnicode_CategoryNames.  A unicode argument
   whose length is not 1 raises TypeError. */
static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    int name_index;

    if (!PyArg_ParseTuple(args, "O!:category", &PyUnicode_Type, &ch))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    /* the record stores an index into the table of category names */
    name_index = (int) _getrecord(ch)->category;
    return PyString_FromString(_PyUnicode_CategoryNames[name_index]);
}
/* bidirectional(unichr)

   Return the bidirectional category name of a single Unicode character
   as a Python string, taken from _PyUnicode_BidirectionalNames.  A
   unicode argument whose length is not 1 raises TypeError. */
static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    int name_index;

    if (!PyArg_ParseTuple(args, "O!:bidirectional", &PyUnicode_Type, &ch))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    /* the record stores an index into the table of bidirectional names */
    name_index = (int) _getrecord(ch)->bidirectional;
    return PyString_FromString(_PyUnicode_BidirectionalNames[name_index]);
}
/* combining(unichr)

   Return the combining class stored in the property record of a single
   Unicode character as a Python int.  A unicode argument whose length is
   not 1 raises TypeError. */
static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    int combining_class;

    if (!PyArg_ParseTuple(args, "O!:combining", &PyUnicode_Type, &ch))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    combining_class = (int) _getrecord(ch)->combining;
    return PyInt_FromLong(combining_class);
}
/* mirrored(unichr)

   Return the mirrored flag stored in the property record of a single
   Unicode character as a Python int.  A unicode argument whose length is
   not 1 raises TypeError. */
static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    int is_mirrored;

    if (!PyArg_ParseTuple(args, "O!:mirrored", &PyUnicode_Type, &ch))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    is_mirrored = (int) _getrecord(ch)->mirrored;
    return PyInt_FromLong(is_mirrored);
}
/* decomposition(unichr)

   Return the decomposition mapping of a single Unicode character as a
   Python string: an optional prefix from decomp_prefix followed by the
   mapped code points as space-separated, at least four digit, upper-case
   hex numbers.  The result is the empty string when the table entry has
   an empty prefix and a zero count.  A unicode argument whose length is
   not 1 raises TypeError. */
static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    char decomp[256];
    int code, index, count, i;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &v))
        return NULL;
    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    code = (int) *PyUnicode_AS_UNICODE(v);

    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
        /* two-stage lookup, same scheme as the property records */
        index = decomp_index1[(code>>DECOMP_SHIFT)];
        index = decomp_index2[(index<<DECOMP_SHIFT)+
                             (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* high byte is the number of code point entries that follow in
       decomp_data (usually one or two), low byte is the prefix code
       (an index into decomp_prefix) */
    count = decomp_data[index] >> 8;

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* copy prefix */
    i = strlen(decomp_prefix[decomp_data[index] & 255]);
    memcpy(decomp, decomp_prefix[decomp_data[index] & 255], i);

    while (count-- > 0) {
        /* separate entries (and the prefix, if any) with a single space */
        if (i)
            decomp[i++] = ' ';
        assert((size_t)i < sizeof(decomp));
        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
                      decomp_data[++index]);
        i += strlen(decomp + i);
    }

    decomp[i] = '\0';

    return PyString_FromString(decomp);
}
/* -------------------------------------------------------------------- */
/* unicode character name tables */
/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"
/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */
/* Case-insensitive hash of the first len bytes of s.

   Each byte is upper-cased and mixed in as h = h * scale + byte.  Whenever
   any of bits 24..31 become set they are folded back into the low byte and
   the value is reduced to 24 bits.  Returns 0 for len <= 0. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;

    for (i = 0; i < len; i++) {
        /* toupper() is only defined for EOF and values representable as
           unsigned char; plain char may be signed, so convert first */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
/* Constants for the arithmetic mapping between Hangul syllable code
   points and their three name components, used by _getucname() and
   _getcode(). */
/* first code point handled as a Hangul syllable */
#define SBase 0xAC00
/* LBase, VBase and TBase are not referenced by the code visible here;
   presumably the first leading / vowel / trailing jamo code points --
   TODO confirm */
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
/* number of entries searched in columns 0, 1 and 2 of hangul_syllables */
#define LCount 19
#define VCount 21
#define TCount 28
/* syllables per leading component, and total number of syllables */
#define NCount (VCount*TCount)
#define SCount (LCount*NCount)
/* Name fragments for Hangul syllables.  Row i holds, per column, the
   fragment for component index i:
     column 0: leading part  (rows 0 .. LCount-1)
     column 1: vowel part    (rows 0 .. VCount-1)
     column 2: trailing part (rows 0 .. TCount-1)
   A syllable name is "HANGUL SYLLABLE " followed by one fragment from
   each column.  The 0 entries only pad the shorter columns; the visible
   callers never index a column beyond its own count.  Empty strings are
   real entries (a component that contributes nothing to the name). */
static char *hangul_syllables[][3] = {
{ "G", "A", "" },
{ "GG", "AE", "G" },
{ "N", "YA", "GG" },
{ "D", "YAE", "GS" },
{ "DD", "EO", "N", },
{ "R", "E", "NJ" },
{ "M", "YEO", "NH" },
{ "B", "YE", "D" },
{ "BB", "O", "L" },
{ "S", "WA", "LG" },
{ "SS", "WAE", "LM" },
{ "", "OE", "LB" },
{ "J", "YO", "LS" },
{ "JJ", "U", "LT" },
{ "C", "WEO", "LP" },
{ "K", "WE", "LH" },
{ "T", "WI", "M" },
{ "P", "YU", "B" },
{ "H", "EU", "BS" },
{ 0, "YI", "S" },
{ 0, "I", "SS" },
{ 0, 0, "NG" },
{ 0, 0, "J" },
{ 0, 0, "C" },
{ 0, 0, "K" },
{ 0, 0, "T" },
{ 0, 0, "P" },
{ 0, 0, "H" }
};
/* Write the NUL-terminated character name of `code` into `buffer`, which
   has room for `buflen` bytes.

   Hangul syllables and CJK unified ideographs get algorithmically built
   names; every other name is expanded from the phrasebook/lexicon tables.

   Returns 1 on success.  Returns 0 when the code has no name, is not below
   0x110000, or the name does not fit into the buffer (the buffer contents
   are then unspecified). */
static int
_getucname(Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if ((0x3400 <= code && code <= 0x4DB5) ||   /* CJK Ideograph Extension A */
        (0x4E00 <= code && code <= 0x9FA5) ||   /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6D6)) { /* CJK Ideograph Extension B */
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        /* bounded write; the cast makes the argument match %X whatever
           integer type Py_UCS4 is */
        PyOS_snprintf(buffer, buflen, "CJK UNIFIED IDEOGRAPH-%X",
                      (unsigned int) code);
        return 1;
    }

    if (code >= 0x110000)
        return 0;

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                               (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            /* ">=": with i == buflen the separator would be written one
               byte past the end of the buffer */
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon.  the last character in the
           word has bit 7 set.  the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        /* for the final 0x80 marker this stores the terminating NUL */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}
/* Check whether `code` is named by the first `namelen` bytes of `name`,
   ignoring the case of `name`.  Returns 1 on a match, 0 otherwise
   (including when `code` has no name). */
static int
_cmpname(int code, const char* name, int namelen)
{
    int i;
    char buffer[NAME_MAXLEN];

    if (!_getucname(code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        /* The generated name ended before the candidate did.  Stopping
           here keeps a candidate that contains a NUL byte from walking
           past the terminator into the unwritten part of the buffer. */
        if (buffer[i] == '\0')
            return 0;
        /* toupper() needs a value representable as unsigned char */
        if (toupper((unsigned char) name[i]) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}
/* Find the longest entry among rows 0 .. count-1 of the given column of
   hangul_syllables that is a prefix of str.

   On return *len is the length of that entry and *pos its row.  Among
   entries of equal length the first row wins.  If nothing matches, *len
   is 0 and *pos is -1. */
static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int row;
    int best_len = -1;
    int best_row = -1;

    for (row = 0; row < count; row++) {
        const char *candidate = hangul_syllables[row][column];
        int candidate_len = strlen(candidate);

        /* only a strictly longer match may replace the current best */
        if (candidate_len > best_len &&
            strncmp(str, candidate, candidate_len) == 0) {
            best_len = candidate_len;
            best_row = row;
        }
    }

    if (best_len == -1)
        best_len = 0;
    *len = best_len;
    *pos = best_row;
}
/* Look up the code point for a character name.

   `name` points to `namelen` bytes.  Hangul syllable names and CJK unified
   ideograph names are decoded arithmetically; all other names go through
   the code_hash table, and each candidate is verified with _cmpname().

   Returns 1 and stores the code point in *code on success, 0 if the name
   is not known. */
static int
_getcode(const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
        int L, V, T, len;
        const char *pos = name + 16;

        find_syllable(pos, &len, &L, LCount, 0);
        pos += len;
        find_syllable(pos, &len, &V, VCount, 1);
        pos += len;
        find_syllable(pos, &len, &T, TCount, 2);
        pos += len;
        /* all three components must have been found (the original tested
           V twice and never L), and the whole name must be consumed */
        if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
            *code = SBase + (L*VCount+V)*TCount + T;
            return 1;
        }
        /* Otherwise, it's an illegal syllable name. */
        return 0;
    }

    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        v = 0;
        name += 22;
        namelen -= 22;
        if (namelen != 4 && namelen != 5)
            return 0;
        while (namelen--) {
            v *= 16;
            if (*name >= '0' && *name <= '9')
                v += *name - '0';
            else if (*name >= 'A' && *name <= 'F')
                v += *name - 'A' + 10;
            else
                return 0;
            name++;
        }
        /* Accept only the ranges for which _getucname() produces such a
           name; otherwise e.g. "CJK UNIFIED IDEOGRAPH-0041" would resolve
           to a character that is not an ideograph. */
        if (!((0x3400 <= v && v <= 0x4DB5) ||
              (0x4E00 <= v && v <= 0x9FA5) ||
              (0x20000 <= v && v <= 0x2A6D6)))
            return 0;
        *code = v;
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes.  see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0; /* empty slot: name not in table */
    if (_cmpname(v, name, namelen)) {
        *code = v;
        return 1;
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}
/* Name lookup C API table: the size of the structure followed by the
   code-to-name and name-to-code functions.  initunicodedata() exports a
   pointer to it as the module attribute "ucnhash_CAPI". */
static const _PyUnicode_Name_CAPI hashAPI =
{
sizeof(_PyUnicode_Name_CAPI),
_getucname,
_getcode
};
/* -------------------------------------------------------------------- */
/* Python bindings */
/* name(unichr[, default])

   Return the name of a single Unicode character as a Python string.  When
   _getucname() cannot produce a name, return `default` if it was supplied,
   otherwise raise ValueError.  A unicode argument whose length is not 1
   raises TypeError. */
static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char namebuf[NAME_MAXLEN];
    PyUnicodeObject* ch;
    PyObject* fallback = NULL;
    int found;

    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &ch, &fallback))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    found = _getucname((Py_UCS4) *PyUnicode_AS_UNICODE(ch),
                       namebuf, sizeof(namebuf));
    if (found)
        return Py_BuildValue("s", namebuf);

    /* No name available: hand back the caller's default if there is one. */
    if (fallback != NULL) {
        Py_INCREF(fallback);
        return fallback;
    }
    PyErr_SetString(PyExc_ValueError, "no such name");
    return NULL;
}
/* lookup(name)

   Return the one-character unicode string for the character with the
   given name.  Raises KeyError with the message
   "undefined character name '<name>'" when _getcode() does not know the
   name. */
static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    Py_UNICODE str[1];

    char* name;
    int namelen;

    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(name, namelen, &code)) {
        /* PyErr_Format builds the message itself, so there is no
           temporary heap buffer whose allocation could fail unnoticed
           before being written to */
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
        return NULL;
    }

    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}
/* XXX Add doc strings. */
/* Method table handed to Py_InitModule3() in initunicodedata().  Every
   function receives its arguments as a tuple (METH_VARARGS); no per-method
   doc strings are supplied. */
static PyMethodDef unicodedata_functions[] = {
{"decimal", unicodedata_decimal, METH_VARARGS},
{"digit", unicodedata_digit, METH_VARARGS},
{"numeric", unicodedata_numeric, METH_VARARGS},
{"category", unicodedata_category, METH_VARARGS},
{"bidirectional", unicodedata_bidirectional, METH_VARARGS},
{"combining", unicodedata_combining, METH_VARARGS},
{"mirrored", unicodedata_mirrored, METH_VARARGS},
{"decomposition",unicodedata_decomposition, METH_VARARGS},
{"name", unicodedata_name, METH_VARARGS},
{"lookup", unicodedata_lookup, METH_VARARGS},
{NULL, NULL} /* sentinel */
};
/* Module docstring. */
PyDoc_STRVAR(unicodedata_docstring, "unicode character database");
/* Module initialization: create the "unicodedata" module with its method
   table and docstring, and export the name lookup C API as the module
   attribute "ucnhash_CAPI".  Returns silently if the module cannot be
   created. */
PyMODINIT_FUNC
initunicodedata(void)
{
    PyObject *m, *v;

    m = Py_InitModule3(
        "unicodedata", unicodedata_functions, unicodedata_docstring);
    if (!m)
        return;

    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    /* PyModule_AddObject takes over our reference only when it succeeds;
       on failure the reference is still ours and must be released */
    if (v != NULL && PyModule_AddObject(m, "ucnhash_CAPI", v) < 0)
        Py_DECREF(v);
}
/*
Local variables:
c-basic-offset: 4
End:
*/