PageRenderTime 167ms CodeModel.GetById 24ms app.highlight 116ms RepoModel.GetById 1ms app.codeStats 1ms

/Modules/unicodedata.c

http://unladen-swallow.googlecode.com/
C | 1221 lines | 992 code | 128 blank | 101 comment | 249 complexity | 1da9125ed0b8ab0b333aa9f934dcac96 MD5 | raw file
   1/* ------------------------------------------------------------------------
   2
   3   unicodedata -- Provides access to the Unicode 5.1 data base.
   4
   5   Data was extracted from the Unicode 5.1 UnicodeData.txt file.
   6
   7   Written by Marc-Andre Lemburg (mal@lemburg.com).
   8   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
   9   Modified by Martin v. Löwis (martin@v.loewis.de)
  10
  11   Copyright (c) Corporation for National Research Initiatives.
  12
  13   ------------------------------------------------------------------------ */
  14
  15#include "Python.h"
  16#include "ucnhash.h"
  17#include "structmember.h"
  18
  19/* character properties */
  20
  21typedef struct {
  22    const unsigned char category;	/* index into
  23					   _PyUnicode_CategoryNames */
  24    const unsigned char	combining; 	/* combining class value 0 - 255 */
  25    const unsigned char	bidirectional; 	/* index into
  26					   _PyUnicode_BidirectionalNames */
  27    const unsigned char mirrored;	/* true if mirrored in bidir mode */
  28    const unsigned char east_asian_width;	/* index into
  29						   _PyUnicode_EastAsianWidth */
  30} _PyUnicode_DatabaseRecord;
  31
  32typedef struct change_record {
  33    /* sequence of fields should be the same as in merge_old_version */
  34    const unsigned char bidir_changed;
  35    const unsigned char category_changed;
  36    const unsigned char decimal_changed;
  37    const unsigned char mirrored_changed;
  38    const int numeric_changed;
  39} change_record;
  40
  41/* data file generated by Tools/unicode/makeunicodedata.py */
  42#include "unicodedata_db.h"
  43
  44static const _PyUnicode_DatabaseRecord*
  45_getrecord_ex(Py_UCS4 code)
  46{
  47    int index;
  48    if (code >= 0x110000)
  49        index = 0;
  50    else {
  51        index = index1[(code>>SHIFT)];
  52        index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
  53    }
  54
  55    return &_PyUnicode_Database_Records[index];
  56}
  57
  58/* ------------- Previous-version API ------------------------------------- */
/* Object describing an older version of the database.  The two callbacks
   provide what differs from the current data. */
typedef struct previous_version {
    PyObject_HEAD
    /* version string, exposed through DB_members as "unidata_version" */
    const char *name;
    /* returns the change record for a code point (see get_old_record) */
    const change_record* (*getrecord)(Py_UCS4);
    /* nfd_nfkd substitutes the returned value for the code point when
       it is non-zero */
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;
  65
  66#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))
  67
  68static PyMemberDef DB_members[] = {
  69	{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
  70        {NULL}
  71};
  72
  73/* forward declaration */
  74static PyTypeObject UCD_Type;
  75
  76static PyObject*
  77new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
  78                     Py_UCS4 (*normalization)(Py_UCS4))
  79{
  80	PreviousDBVersion *self;
  81	self = PyObject_New(PreviousDBVersion, &UCD_Type);
  82	if (self == NULL)
  83		return NULL;
  84	self->name = name;
  85	self->getrecord = getrecord;
  86        self->normalization = normalization;
  87	return (PyObject*)self;
  88}
  89
  90
  91static Py_UCS4 getuchar(PyUnicodeObject *obj)
  92{
  93    Py_UNICODE *v = PyUnicode_AS_UNICODE(obj);
  94
  95    if (PyUnicode_GET_SIZE(obj) == 1)
  96	return *v;
  97#ifndef Py_UNICODE_WIDE
  98    else if ((PyUnicode_GET_SIZE(obj) == 2) &&
  99             (0xD800 <= v[0] && v[0] <= 0xDBFF) &&
 100             (0xDC00 <= v[1] && v[1] <= 0xDFFF))
 101	return (((v[0] & 0x3FF)<<10) | (v[1] & 0x3FF)) + 0x10000;
 102#endif
 103    PyErr_SetString(PyExc_TypeError,
 104                    "need a single Unicode character as parameter");
 105    return (Py_UCS4)-1;
 106}
 107
 108/* --- Module API --------------------------------------------------------- */
 109
 110PyDoc_STRVAR(unicodedata_decimal__doc__,
 111"decimal(unichr[, default])\n\
 112\n\
 113Returns the decimal value assigned to the Unicode character unichr\n\
 114as integer. If no such value is defined, default is returned, or, if\n\
 115not given, ValueError is raised.");
 116
 117static PyObject *
 118unicodedata_decimal(PyObject *self, PyObject *args)
 119{
 120    PyUnicodeObject *v;
 121    PyObject *defobj = NULL;
 122    int have_old = 0;
 123    long rc;
 124    Py_UCS4 c;
 125
 126    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
 127        return NULL;
 128    c = getuchar(v);
 129    if (c == (Py_UCS4)-1)
 130        return NULL;
 131
 132    if (self) {
 133        const change_record *old = get_old_record(self, c);
 134        if (old->category_changed == 0) {
 135            /* unassigned */
 136            have_old = 1;
 137            rc = -1;
 138        } 
 139        else if (old->decimal_changed != 0xFF) {
 140            have_old = 1;
 141            rc = old->decimal_changed;
 142        }
 143    }
 144
 145    if (!have_old)
 146        rc = Py_UNICODE_TODECIMAL(c);
 147    if (rc < 0) {
 148	if (defobj == NULL) {
 149	    PyErr_SetString(PyExc_ValueError,
 150			    "not a decimal");
 151            return NULL;
 152	}
 153	else {
 154	    Py_INCREF(defobj);
 155	    return defobj;
 156	}
 157    }
 158    return PyInt_FromLong(rc);
 159}
 160
 161PyDoc_STRVAR(unicodedata_digit__doc__,
 162"digit(unichr[, default])\n\
 163\n\
 164Returns the digit value assigned to the Unicode character unichr as\n\
 165integer. If no such value is defined, default is returned, or, if\n\
 166not given, ValueError is raised.");
 167
 168static PyObject *
 169unicodedata_digit(PyObject *self, PyObject *args)
 170{
 171    PyUnicodeObject *v;
 172    PyObject *defobj = NULL;
 173    long rc;
 174    Py_UCS4 c;
 175
 176    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
 177        return NULL;
 178    c = getuchar(v);
 179    if (c == (Py_UCS4)-1)
 180        return NULL;
 181    rc = Py_UNICODE_TODIGIT(c);
 182    if (rc < 0) {
 183	if (defobj == NULL) {
 184	    PyErr_SetString(PyExc_ValueError, "not a digit");
 185            return NULL;
 186	}
 187	else {
 188	    Py_INCREF(defobj);
 189	    return defobj;
 190	}
 191    }
 192    return PyInt_FromLong(rc);
 193}
 194
 195PyDoc_STRVAR(unicodedata_numeric__doc__,
 196"numeric(unichr[, default])\n\
 197\n\
 198Returns the numeric value assigned to the Unicode character unichr\n\
 199as float. If no such value is defined, default is returned, or, if\n\
 200not given, ValueError is raised.");
 201
 202static PyObject *
 203unicodedata_numeric(PyObject *self, PyObject *args)
 204{
 205    PyUnicodeObject *v;
 206    PyObject *defobj = NULL;
 207    int have_old = 0;
 208    double rc;
 209    Py_UCS4 c;
 210
 211    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
 212        return NULL;
 213    c = getuchar(v);
 214    if (c == (Py_UCS4)-1)
 215        return NULL;
 216
 217    if (self) {
 218        const change_record *old = get_old_record(self, c);
 219        if (old->category_changed == 0) {
 220            /* unassigned */
 221            have_old = 1;
 222            rc = -1.0;
 223        } 
 224        else if (old->decimal_changed != 0xFF) {
 225            have_old = 1;
 226            rc = old->decimal_changed;
 227        }
 228    }
 229
 230    if (!have_old)
 231        rc = Py_UNICODE_TONUMERIC(c);
 232    if (rc == -1.0) {
 233	if (defobj == NULL) {
 234	    PyErr_SetString(PyExc_ValueError, "not a numeric character");
 235	    return NULL;
 236	}
 237	else {
 238	    Py_INCREF(defobj);
 239	    return defobj;
 240	}
 241    }
 242    return PyFloat_FromDouble(rc);
 243}
 244
 245PyDoc_STRVAR(unicodedata_category__doc__,
 246"category(unichr)\n\
 247\n\
 248Returns the general category assigned to the Unicode character\n\
 249unichr as string.");
 250
 251static PyObject *
 252unicodedata_category(PyObject *self, PyObject *args)
 253{
 254    PyUnicodeObject *v;
 255    int index;
 256    Py_UCS4 c;
 257
 258    if (!PyArg_ParseTuple(args, "O!:category",
 259			  &PyUnicode_Type, &v))
 260	return NULL;
 261    c = getuchar(v);
 262    if (c == (Py_UCS4)-1)
 263        return NULL;
 264    index = (int) _getrecord_ex(c)->category;
 265    if (self) {
 266        const change_record *old = get_old_record(self, c);
 267        if (old->category_changed != 0xFF)
 268            index = old->category_changed;
 269    }
 270    return PyString_FromString(_PyUnicode_CategoryNames[index]);
 271}
 272
 273PyDoc_STRVAR(unicodedata_bidirectional__doc__,
 274"bidirectional(unichr)\n\
 275\n\
 276Returns the bidirectional category assigned to the Unicode character\n\
 277unichr as string. If no such value is defined, an empty string is\n\
 278returned.");
 279
 280static PyObject *
 281unicodedata_bidirectional(PyObject *self, PyObject *args)
 282{
 283    PyUnicodeObject *v;
 284    int index;
 285    Py_UCS4 c;
 286
 287    if (!PyArg_ParseTuple(args, "O!:bidirectional",
 288			  &PyUnicode_Type, &v))
 289	return NULL;
 290    c = getuchar(v);
 291    if (c == (Py_UCS4)-1)
 292        return NULL;
 293    index = (int) _getrecord_ex(c)->bidirectional;
 294    if (self) {
 295        const change_record *old = get_old_record(self, c);
 296        if (old->category_changed == 0)
 297            index = 0; /* unassigned */
 298        else if (old->bidir_changed != 0xFF)
 299            index = old->bidir_changed;
 300    }
 301    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
 302}
 303
 304PyDoc_STRVAR(unicodedata_combining__doc__,
 305"combining(unichr)\n\
 306\n\
 307Returns the canonical combining class assigned to the Unicode\n\
 308character unichr as integer. Returns 0 if no combining class is\n\
 309defined.");
 310
 311static PyObject *
 312unicodedata_combining(PyObject *self, PyObject *args)
 313{
 314    PyUnicodeObject *v;
 315    int index;
 316    Py_UCS4 c;
 317
 318    if (!PyArg_ParseTuple(args, "O!:combining",
 319			  &PyUnicode_Type, &v))
 320	return NULL;
 321    c = getuchar(v);
 322    if (c == (Py_UCS4)-1)
 323        return NULL;
 324    index = (int) _getrecord_ex(c)->combining;
 325    if (self) {
 326        const change_record *old = get_old_record(self, c);
 327        if (old->category_changed == 0)
 328            index = 0; /* unassigned */
 329    }
 330    return PyInt_FromLong(index);
 331}
 332
 333PyDoc_STRVAR(unicodedata_mirrored__doc__,
 334"mirrored(unichr)\n\
 335\n\
 336Returns the mirrored property assigned to the Unicode character\n\
 337unichr as integer. Returns 1 if the character has been identified as\n\
 338a \"mirrored\" character in bidirectional text, 0 otherwise.");
 339
 340static PyObject *
 341unicodedata_mirrored(PyObject *self, PyObject *args)
 342{
 343    PyUnicodeObject *v;
 344    int index;
 345    Py_UCS4 c;
 346
 347    if (!PyArg_ParseTuple(args, "O!:mirrored",
 348			  &PyUnicode_Type, &v))
 349	return NULL;
 350    c = getuchar(v);
 351    if (c == (Py_UCS4)-1)
 352        return NULL;
 353    index = (int) _getrecord_ex(c)->mirrored;
 354    if (self) {
 355        const change_record *old = get_old_record(self, c);
 356        if (old->category_changed == 0)
 357            index = 0; /* unassigned */
 358        else if (old->mirrored_changed != 0xFF)
 359            index = old->mirrored_changed;
 360    }
 361    return PyInt_FromLong(index);
 362}
 363
 364PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
 365"east_asian_width(unichr)\n\
 366\n\
 367Returns the east asian width assigned to the Unicode character\n\
 368unichr as string.");
 369
 370static PyObject *
 371unicodedata_east_asian_width(PyObject *self, PyObject *args)
 372{
 373    PyUnicodeObject *v;
 374    int index;
 375    Py_UCS4 c;
 376
 377    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
 378			  &PyUnicode_Type, &v))
 379	return NULL;
 380    c = getuchar(v);
 381    if (c == (Py_UCS4)-1)
 382        return NULL;
 383    index = (int) _getrecord_ex(c)->east_asian_width;
 384    if (self) {
 385        const change_record *old = get_old_record(self, c);
 386        if (old->category_changed == 0)
 387            index = 0; /* unassigned */
 388    }
 389    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
 390}
 391
 392PyDoc_STRVAR(unicodedata_decomposition__doc__,
 393"decomposition(unichr)\n\
 394\n\
 395Returns the character decomposition mapping assigned to the Unicode\n\
 396character unichr as string. An empty string is returned in case no\n\
 397such mapping is defined.");
 398
 399static PyObject *
 400unicodedata_decomposition(PyObject *self, PyObject *args)
 401{
 402    PyUnicodeObject *v;
 403    char decomp[256];
 404    int code, index, count, i;
 405    unsigned int prefix_index;
 406    Py_UCS4 c;
 407
 408    if (!PyArg_ParseTuple(args, "O!:decomposition",
 409			  &PyUnicode_Type, &v))
 410	return NULL;
 411    c = getuchar(v);
 412    if (c == (Py_UCS4)-1)
 413        return NULL;
 414
 415    code = (int)c;
 416
 417    if (self) {
 418        const change_record *old = get_old_record(self, c);
 419        if (old->category_changed == 0)
 420            return PyString_FromString(""); /* unassigned */
 421    }
 422
 423    if (code < 0 || code >= 0x110000)
 424        index = 0;
 425    else {
 426        index = decomp_index1[(code>>DECOMP_SHIFT)];
 427        index = decomp_index2[(index<<DECOMP_SHIFT)+
 428                             (code&((1<<DECOMP_SHIFT)-1))];
 429    }
 430
 431    /* high byte is number of hex bytes (usually one or two), low byte
 432       is prefix code (from*/
 433    count = decomp_data[index] >> 8;
 434
 435    /* XXX: could allocate the PyString up front instead
 436       (strlen(prefix) + 5 * count + 1 bytes) */
 437
 438    /* Based on how index is calculated above and decomp_data is generated
 439       from Tools/unicode/makeunicodedata.py, it should not be possible
 440       to overflow decomp_prefix. */
 441    prefix_index = decomp_data[index] & 255;
 442    assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix)));
 443
 444    /* copy prefix */
 445    i = strlen(decomp_prefix[prefix_index]);
 446    memcpy(decomp, decomp_prefix[prefix_index], i);
 447
 448    while (count-- > 0) {
 449        if (i)
 450            decomp[i++] = ' ';
 451        assert((size_t)i < sizeof(decomp));
 452        PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
 453                      decomp_data[++index]);
 454        i += strlen(decomp + i);
 455    }
 456    
 457    decomp[i] = '\0';
 458
 459    return PyString_FromString(decomp);
 460}
 461
 462static void
 463get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
 464{
 465    if (code >= 0x110000) {
 466        *index = 0;
 467    } else if (self && get_old_record(self, code)->category_changed==0) {
 468        /* unassigned in old version */
 469        *index = 0;
 470    }
 471    else {
 472        *index = decomp_index1[(code>>DECOMP_SHIFT)];
 473        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
 474                               (code&((1<<DECOMP_SHIFT)-1))];
 475    }
 476	
 477    /* high byte is number of hex bytes (usually one or two), low byte
 478       is prefix code (from*/
 479    *count = decomp_data[*index] >> 8;
 480    *prefix = decomp_data[*index] & 255;
 481
 482    (*index)++;
 483}
 484
/* Constants for the arithmetic handling of Hangul syllables used by the
   normalization and naming code below: the first syllable code point
   (SBase), the base values of the L, V and T components, and the number
   of values each component can take.  NCount and SCount are derived. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
 494
 495static PyObject*
 496nfd_nfkd(PyObject *self, PyObject *input, int k)
 497{
 498    PyObject *result;
 499    Py_UNICODE *i, *end, *o;
 500    /* Longest decomposition in Unicode 3.2: U+FDFA */
 501    Py_UNICODE stack[20]; 
 502    Py_ssize_t space, isize;
 503    int index, prefix, count, stackptr;
 504    unsigned char prev, cur;
 505	
 506    stackptr = 0;
 507    isize = PyUnicode_GET_SIZE(input);
 508    /* Overallocate atmost 10 characters. */
 509    space = (isize > 10 ? 10 : isize) + isize;
 510    result = PyUnicode_FromUnicode(NULL, space);
 511    if (!result)
 512        return NULL;
 513    i = PyUnicode_AS_UNICODE(input);
 514    end = i + isize;
 515    o = PyUnicode_AS_UNICODE(result);
 516
 517    while (i < end) {
 518        stack[stackptr++] = *i++;
 519        while(stackptr) {
 520            Py_UNICODE code = stack[--stackptr];
 521            /* Hangul Decomposition adds three characters in
 522               a single step, so we need atleast that much room. */
 523            if (space < 3) {
 524                Py_ssize_t newsize = PyString_GET_SIZE(result) + 10;
 525                space += 10;
 526                if (PyUnicode_Resize(&result, newsize) == -1)
 527                    return NULL;
 528                o = PyUnicode_AS_UNICODE(result) + newsize - space;
 529            }
 530            /* Hangul Decomposition. */
 531            if (SBase <= code && code < (SBase+SCount)) {
 532                int SIndex = code - SBase;
 533                int L = LBase + SIndex / NCount;
 534                int V = VBase + (SIndex % NCount) / TCount;
 535                int T = TBase + SIndex % TCount;
 536                *o++ = L;
 537                *o++ = V;
 538                space -= 2;
 539                if (T != TBase) {
 540                    *o++ = T;
 541                    space --;
 542                }
 543                continue;
 544            }
 545            /* normalization changes */
 546            if (self) {
 547                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
 548                if (value != 0) {
 549                    stack[stackptr++] = value;
 550                    continue;
 551                }
 552            }
 553
 554            /* Other decompositions. */
 555            get_decomp_record(self, code, &index, &prefix, &count);
 556
 557            /* Copy character if it is not decomposable, or has a
 558               compatibility decomposition, but we do NFD. */
 559            if (!count || (prefix && !k)) {
 560                *o++ = code;
 561                space--;
 562                continue;
 563            }
 564            /* Copy decomposition onto the stack, in reverse
 565               order.  */
 566            while(count) {
 567                code = decomp_data[index + (--count)];
 568                stack[stackptr++] = code;
 569            }
 570        }
 571    }
 572
 573    /* Drop overallocation. Cannot fail. */
 574    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
 575
 576    /* Sort canonically. */
 577    i = PyUnicode_AS_UNICODE(result);
 578    prev = _getrecord_ex(*i)->combining;
 579    end = i + PyUnicode_GET_SIZE(result);
 580    for (i++; i < end; i++) {
 581        cur = _getrecord_ex(*i)->combining;
 582        if (prev == 0 || cur == 0 || prev <= cur) {
 583            prev = cur;
 584            continue;
 585        }
 586        /* Non-canonical order. Need to switch *i with previous. */
 587        o = i - 1;
 588        while (1) {
 589            Py_UNICODE tmp = o[1];
 590            o[1] = o[0];
 591            o[0] = tmp;
 592            o--;
 593            if (o < PyUnicode_AS_UNICODE(result))
 594                break;
 595            prev = _getrecord_ex(*o)->combining;
 596            if (prev == 0 || prev <= cur)
 597                break;
 598        }
 599        prev = _getrecord_ex(*i)->combining;
 600    }
 601    return result;
 602}
 603
 604static int
 605find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
 606{
 607    int index;
 608    for (index = 0; nfc[index].start; index++) {
 609        int start = nfc[index].start;
 610        if (code < start)
 611            return -1;
 612        if (code <= start + nfc[index].count) {
 613            int delta = code - start;
 614            return nfc[index].index + delta;
 615        }
 616    }
 617    return -1;
 618}
 619
 620static PyObject*
 621nfc_nfkc(PyObject *self, PyObject *input, int k)
 622{
 623    PyObject *result;
 624    Py_UNICODE *i, *i1, *o, *end;
 625    int f,l,index,index1,comb;
 626    Py_UNICODE code;
 627    Py_UNICODE *skipped[20];
 628    int cskipped = 0;
 629
 630    result = nfd_nfkd(self, input, k);
 631    if (!result)
 632        return NULL;
 633
 634    /* We are going to modify result in-place.
 635       If nfd_nfkd is changed to sometimes return the input,
 636       this code needs to be reviewed. */
 637    assert(result != input);
 638
 639    i = PyUnicode_AS_UNICODE(result);
 640    end = i + PyUnicode_GET_SIZE(result);
 641    o = PyUnicode_AS_UNICODE(result);
 642	
 643  again:
 644    while (i < end) {
 645      for (index = 0; index < cskipped; index++) {
 646          if (skipped[index] == i) {
 647              /* *i character is skipped. 
 648                 Remove from list. */
 649              skipped[index] = skipped[cskipped-1];
 650              cskipped--;
 651              i++;
 652              goto again; /* continue while */
 653          }
 654      }
 655      /* Hangul Composition. We don't need to check for <LV,T>
 656         pairs, since we always have decomposed data. */
 657      if (LBase <= *i && *i < (LBase+LCount) &&
 658          i + 1 < end && 
 659          VBase <= i[1] && i[1] <= (VBase+VCount)) {
 660          int LIndex, VIndex;
 661          LIndex = i[0] - LBase;
 662          VIndex = i[1] - VBase;
 663          code = SBase + (LIndex*VCount+VIndex)*TCount;
 664          i+=2;
 665          if (i < end &&
 666              TBase <= *i && *i <= (TBase+TCount)) {
 667              code += *i-TBase;
 668              i++;
 669          }
 670          *o++ = code;
 671          continue;
 672      }
 673
 674      f = find_nfc_index(self, nfc_first, *i);
 675      if (f == -1) {
 676          *o++ = *i++;
 677          continue;
 678      }
 679      /* Find next unblocked character. */
 680      i1 = i+1;
 681      comb = 0;
 682      while (i1 < end) {
 683          int comb1 = _getrecord_ex(*i1)->combining;
 684          if (comb1 && comb == comb1) {
 685              /* Character is blocked. */
 686              i1++;
 687              continue;
 688          }
 689          l = find_nfc_index(self, nfc_last, *i1);
 690          /* *i1 cannot be combined with *i. If *i1
 691             is a starter, we don't need to look further.
 692             Otherwise, record the combining class. */
 693          if (l == -1) {
 694            not_combinable:
 695              if (comb1 == 0)
 696                  break;
 697              comb = comb1;
 698              i1++;
 699              continue;
 700          }
 701          index = f*TOTAL_LAST + l;
 702          index1 = comp_index[index >> COMP_SHIFT];
 703          code = comp_data[(index1<<COMP_SHIFT)+
 704                           (index&((1<<COMP_SHIFT)-1))];
 705          if (code == 0)
 706              goto not_combinable;
 707			
 708          /* Replace the original character. */
 709          *i = code;
 710          /* Mark the second character unused. */
 711          skipped[cskipped++] = i1;
 712          i1++;
 713          f = find_nfc_index(self, nfc_first, *i);
 714          if (f == -1)
 715              break;
 716      }
 717      *o++ = *i++;
 718    }
 719    if (o != end)
 720        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
 721    return result;
 722}
 723		
 724PyDoc_STRVAR(unicodedata_normalize__doc__,
 725"normalize(form, unistr)\n\
 726\n\
 727Return the normal form 'form' for the Unicode string unistr.  Valid\n\
 728values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
 729
 730static PyObject*
 731unicodedata_normalize(PyObject *self, PyObject *args)
 732{
 733    char *form;
 734    PyObject *input;
 735
 736    if(!PyArg_ParseTuple(args, "sO!:normalize",
 737                         &form, &PyUnicode_Type, &input))
 738        return NULL;
 739
 740    if (PyUnicode_GetSize(input) == 0) {
 741        /* Special case empty input strings, since resizing
 742           them  later would cause internal errors. */
 743        Py_INCREF(input);
 744        return input;
 745    }
 746
 747    if (strcmp(form, "NFC") == 0)
 748        return nfc_nfkc(self, input, 0);
 749    if (strcmp(form, "NFKC") == 0)
 750        return nfc_nfkc(self, input, 1);
 751    if (strcmp(form, "NFD") == 0)
 752        return nfd_nfkd(self, input, 0);
 753    if (strcmp(form, "NFKD") == 0)
 754        return nfd_nfkd(self, input, 1);
 755    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
 756    return NULL;
 757}
 758
 759/* -------------------------------------------------------------------- */
 760/* unicode character name tables */
 761
 762/* data file generated by Tools/unicode/makeunicodedata.py */
 763#include "unicodename_db.h"
 764
 765/* -------------------------------------------------------------------- */
 766/* database code (cut and pasted from the unidb package) */
 767
 768static unsigned long
 769_gethash(const char *s, int len, int scale)
 770{
 771    int i;
 772    unsigned long h = 0;
 773    unsigned long ix;
 774    for (i = 0; i < len; i++) {
 775        h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i]));
 776        ix = h & 0xff000000;
 777        if (ix)
 778            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
 779    }
 780    return h;
 781}
 782
/* Name fragments for Hangul syllables.  Column 0 is indexed by the L
   component, column 1 by V and column 2 by T (see _getucname and
   find_syllable).  The 0 entries pad the shorter L and V columns beyond
   LCount and VCount rows; empty strings are valid fragments. */
static char *hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N", },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};
 813
 814static int
 815is_unified_ideograph(Py_UCS4 code)
 816{
 817    return (
 818        (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
 819        (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */
 820        (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */
 821}
 822
 823static int
 824_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
 825{
 826    int offset;
 827    int i;
 828    int word;
 829    unsigned char* w;
 830
 831    if (code >= 0x110000)
 832        return 0;
 833
 834    if (self) {
 835        const change_record *old = get_old_record(self, code);
 836        if (old->category_changed == 0) {
 837            /* unassigned */
 838            return 0;
 839        } 
 840    }
 841
 842    if (SBase <= code && code < SBase+SCount) {
 843	/* Hangul syllable. */
 844	int SIndex = code - SBase;
 845	int L = SIndex / NCount;
 846	int V = (SIndex % NCount) / TCount;
 847	int T = SIndex % TCount;
 848
 849	if (buflen < 27)
 850	    /* Worst case: HANGUL SYLLABLE <10chars>. */
 851	    return 0;
 852	strcpy(buffer, "HANGUL SYLLABLE ");
 853	buffer += 16;
 854	strcpy(buffer, hangul_syllables[L][0]);
 855	buffer += strlen(hangul_syllables[L][0]);
 856	strcpy(buffer, hangul_syllables[V][1]);
 857	buffer += strlen(hangul_syllables[V][1]);
 858	strcpy(buffer, hangul_syllables[T][2]);
 859	buffer += strlen(hangul_syllables[T][2]);
 860	*buffer = '\0';
 861	return 1;
 862    }
 863
 864    if (is_unified_ideograph(code)) {
 865        if (buflen < 28)
 866            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
 867            return 0;
 868        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
 869        return 1;
 870    }
 871
 872    /* get offset into phrasebook */
 873    offset = phrasebook_offset1[(code>>phrasebook_shift)];
 874    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
 875                               (code&((1<<phrasebook_shift)-1))];
 876    if (!offset)
 877        return 0;
 878
 879    i = 0;
 880
 881    for (;;) {
 882        /* get word index */
 883        word = phrasebook[offset] - phrasebook_short;
 884        if (word >= 0) {
 885            word = (word << 8) + phrasebook[offset+1];
 886            offset += 2;
 887        } else
 888            word = phrasebook[offset++];
 889        if (i) {
 890            if (i > buflen)
 891                return 0; /* buffer overflow */
 892            buffer[i++] = ' ';
 893        }
 894        /* copy word string from lexicon.  the last character in the
 895           word has bit 7 set.  the last word in a string ends with
 896           0x80 */
 897        w = lexicon + lexicon_offset[word];
 898        while (*w < 128) {
 899            if (i >= buflen)
 900                return 0; /* buffer overflow */
 901            buffer[i++] = *w++;
 902        }
 903        if (i >= buflen)
 904            return 0; /* buffer overflow */
 905        buffer[i++] = *w & 127;
 906        if (*w == 128)
 907            break; /* end of word */
 908    }
 909
 910    return 1;
 911}
 912
 913static int
 914_cmpname(PyObject *self, int code, const char* name, int namelen)
 915{
 916    /* check if code corresponds to the given name */
 917    int i;
 918    char buffer[NAME_MAXLEN];
 919    if (!_getucname(self, code, buffer, sizeof(buffer)))
 920        return 0;
 921    for (i = 0; i < namelen; i++) {
 922        if (toupper(Py_CHARMASK(name[i])) != buffer[i])
 923            return 0;
 924    }
 925    return buffer[namelen] == '\0';
 926}
 927
 928static void 
 929find_syllable(const char *str, int *len, int *pos, int count, int column)
 930{
 931    int i, len1;
 932    *len = -1;
 933    for (i = 0; i < count; i++) {
 934	char *s = hangul_syllables[i][column];
 935	len1 = strlen(s);
 936	if (len1 <= *len)
 937	    continue;
 938	if (strncmp(str, s, len1) == 0) {
 939	    *len = len1;
 940	    *pos = i;
 941	}
 942    }
 943    if (*len == -1) {
 944	*len = 0;
 945    }
 946}
 947
 948static int
 949_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
 950{
 951    unsigned int h, v;
 952    unsigned int mask = code_size-1;
 953    unsigned int i, incr;
 954
 955    /* Check for hangul syllables. */
 956    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
 957	int len, L = -1, V = -1, T = -1;
 958	const char *pos = name + 16;
 959	find_syllable(pos, &len, &L, LCount, 0);
 960	pos += len;
 961	find_syllable(pos, &len, &V, VCount, 1);
 962	pos += len;
 963	find_syllable(pos, &len, &T, TCount, 2);
 964	pos += len;
 965	if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
 966	    *code = SBase + (L*VCount+V)*TCount + T;
 967	    return 1;
 968	}
 969        /* Otherwise, it's an illegal syllable name. */
 970        return 0;
 971    }
 972
 973    /* Check for unified ideographs. */
 974    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
 975        /* Four or five hexdigits must follow. */
 976        v = 0;
 977        name += 22;
 978        namelen -= 22;
 979        if (namelen != 4 && namelen != 5)
 980            return 0;
 981        while (namelen--) {
 982            v *= 16;
 983            if (*name >= '0' && *name <= '9')
 984                v += *name - '0';
 985            else if (*name >= 'A' && *name <= 'F')
 986                v += *name - 'A' + 10;
 987            else
 988                return 0;
 989            name++;
 990        }
 991        if (!is_unified_ideograph(v))
 992            return 0;
 993        *code = v;
 994        return 1;
 995    }
 996
 997    /* the following is the same as python's dictionary lookup, with
 998       only minor changes.  see the makeunicodedata script for more
 999       details */
1000
1001    h = (unsigned int) _gethash(name, namelen, code_magic);
1002    i = (~h) & mask;
1003    v = code_hash[i];
1004    if (!v)
1005        return 0;
1006    if (_cmpname(self, v, name, namelen)) {
1007        *code = v;
1008        return 1;
1009    }
1010    incr = (h ^ (h >> 3)) & mask;
1011    if (!incr)
1012        incr = mask;
1013    for (;;) {
1014        i = (i + incr) & mask;
1015        v = code_hash[i];
1016        if (!v)
1017            return 0;
1018        if (_cmpname(self, v, name, namelen)) {
1019            *code = v;
1020            return 1;
1021        }
1022        incr = incr << 1;
1023        if (incr > mask)
1024            incr = incr ^ code_poly;
1025    }
1026}
1027
/* Name-lookup C API table.  A pointer to this structure is wrapped in a
   PyCObject and published as the module attribute "ucnhash_CAPI" by
   initunicodedata().  Fields, in order: the size of the structure, the
   code point -> name function, and the name -> code point function. */
static const _PyUnicode_Name_CAPI hashAPI = 
{
    sizeof(_PyUnicode_Name_CAPI),
    _getucname,
    _getcode
};
1034
1035/* -------------------------------------------------------------------- */
1036/* Python bindings */
1037
1038PyDoc_STRVAR(unicodedata_name__doc__,
1039"name(unichr[, default])\n\
1040Returns the name assigned to the Unicode character unichr as a\n\
1041string. If no name is defined, default is returned, or, if not\n\
1042given, ValueError is raised.");
1043
1044static PyObject *
1045unicodedata_name(PyObject* self, PyObject* args)
1046{
1047    char name[NAME_MAXLEN];
1048    Py_UCS4 c;
1049
1050    PyUnicodeObject* v;
1051    PyObject* defobj = NULL;
1052    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj))
1053        return NULL;
1054
1055    c = getuchar(v);
1056    if (c == (Py_UCS4)-1)
1057        return NULL;
1058
1059    if (!_getucname(self, c, name, sizeof(name))) {
1060	if (defobj == NULL) {
1061	    PyErr_SetString(PyExc_ValueError, "no such name");
1062            return NULL;
1063	}
1064	else {
1065	    Py_INCREF(defobj);
1066	    return defobj;
1067	}
1068    }
1069
1070    return Py_BuildValue("s", name);
1071}
1072
1073PyDoc_STRVAR(unicodedata_lookup__doc__,
1074"lookup(name)\n\
1075\n\
1076Look up character by name.  If a character with the\n\
1077given name is found, return the corresponding Unicode\n\
1078character.  If not found, KeyError is raised.");
1079
1080static PyObject *
1081unicodedata_lookup(PyObject* self, PyObject* args)
1082{
1083    Py_UCS4 code;
1084    Py_UNICODE str[2];
1085
1086    char* name;
1087    int namelen;
1088    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
1089        return NULL;
1090
1091    if (!_getcode(self, name, namelen, &code)) {
1092        PyErr_Format(PyExc_KeyError, "undefined character name '%s'",
1093                     name);
1094        return NULL;
1095    }
1096
1097#ifndef Py_UNICODE_WIDE
1098    if (code >= 0x10000) {
1099        str[0] = 0xd800 + ((code - 0x10000) >> 10);
1100        str[1] = 0xdc00 + ((code - 0x10000) & 0x3ff);
1101        return PyUnicode_FromUnicode(str, 2);
1102    }
1103#endif
1104    str[0] = (Py_UNICODE) code;
1105    return PyUnicode_FromUnicode(str, 1);    
1106}
1107
1108/* XXX Add doc strings. */
1109
/* Method table for the unicodedata API.  The same table is used twice:
   it is passed to Py_InitModule3() to create the module-level functions,
   and it is installed as tp_methods of UCD_Type.  Every entry uses the
   METH_VARARGS calling convention and carries its own docstring. */
static PyMethodDef unicodedata_functions[] = {
    {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
    {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
    {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
    {"category", unicodedata_category, METH_VARARGS,
                 unicodedata_category__doc__},
    {"bidirectional", unicodedata_bidirectional, METH_VARARGS,
                      unicodedata_bidirectional__doc__},
    {"combining", unicodedata_combining, METH_VARARGS,
                  unicodedata_combining__doc__},
    {"mirrored", unicodedata_mirrored, METH_VARARGS,
                 unicodedata_mirrored__doc__},
    {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
                         unicodedata_east_asian_width__doc__},
    {"decomposition", unicodedata_decomposition, METH_VARARGS,
                      unicodedata_decomposition__doc__},
    {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
    {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
    {"normalize", unicodedata_normalize, METH_VARARGS,
                  unicodedata_normalize__doc__},
    {NULL, NULL}		/* sentinel */
};
1132
/* Type object exposed by initunicodedata() as the module attribute "UCD".
   Instances are sized as PreviousDBVersion and freed with PyObject_Del.
   The type reuses unicodedata_functions as its method table and
   DB_members as its member table; attribute lookup goes through
   PyObject_GenericGetAttr.  All other slots are left empty (0).
   NOTE(review): instances are presumably created by
   new_previous_version() -- confirm against its definition. */
static PyTypeObject UCD_Type = {
	/* The ob_type field must be initialized in the module init function
	 * to be portable to Windows without using C++. */
	PyVarObject_HEAD_INIT(NULL, 0)
	"unicodedata.UCD",		/*tp_name*/
	sizeof(PreviousDBVersion),	/*tp_basicsize*/
	0,			/*tp_itemsize*/
	/* methods */
	(destructor)PyObject_Del, /*tp_dealloc*/
	0,			/*tp_print*/
	0,                      /*tp_getattr*/
	0,			/*tp_setattr*/
	0,			/*tp_compare*/
	0,			/*tp_repr*/
	0,			/*tp_as_number*/
	0,			/*tp_as_sequence*/
	0,			/*tp_as_mapping*/
	0,			/*tp_hash*/
        0,                      /*tp_call*/
        0,                      /*tp_str*/
        PyObject_GenericGetAttr,/*tp_getattro*/
        0,                      /*tp_setattro*/
        0,                      /*tp_as_buffer*/
        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
        0,                      /*tp_doc*/
        0,                      /*tp_traverse*/
        0,                      /*tp_clear*/
        0,                      /*tp_richcompare*/
        0,                      /*tp_weaklistoffset*/
        0,                      /*tp_iter*/
        0,                      /*tp_iternext*/
        unicodedata_functions,  /*tp_methods*/
        DB_members,             /*tp_members*/
        0,                      /*tp_getset*/
        0,                      /*tp_base*/
        0,                      /*tp_dict*/
        0,                      /*tp_descr_get*/
        0,                      /*tp_descr_set*/
        0,                      /*tp_dictoffset*/
        0,                      /*tp_init*/
        0,                      /*tp_alloc*/
        0,                      /*tp_new*/
        0,                      /*tp_free*/
        0,                      /*tp_is_gc*/
};
1178
1179PyDoc_STRVAR(unicodedata_docstring,
1180"This module provides access to the Unicode Character Database which\n\
1181defines character properties for all Unicode characters. The data in\n\
1182this database is based on the UnicodeData.txt file version\n\
11835.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
1184\n\
1185The module uses the same names and symbols as defined by the\n\
1186UnicodeData File Format 5.1.0 (see\n\
1187http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
1188
1189PyMODINIT_FUNC
1190initunicodedata(void)
1191{
1192    PyObject *m, *v;
1193
1194    Py_TYPE(&UCD_Type) = &PyType_Type;
1195
1196    m = Py_InitModule3(
1197        "unicodedata", unicodedata_functions, unicodedata_docstring);
1198    if (!m)
1199        return;
1200
1201    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1202    Py_INCREF(&UCD_Type);
1203    PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1204
1205    /* Previous versions */
1206    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1207    if (v != NULL)
1208        PyModule_AddObject(m, "ucd_3_2_0", v);
1209
1210    /* Export C API */
1211    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
1212    if (v != NULL)
1213        PyModule_AddObject(m, "ucnhash_CAPI", v);
1214}
1215
1216/* 
1217Local variables:
1218c-basic-offset: 4
1219indent-tabs-mode: nil
1220End:
1221*/