author    | Ori Bernstein <ori@eigenstate.org> | 2021-06-14 00:00:37 +0000
committer | Ori Bernstein <ori@eigenstate.org> | 2021-06-14 00:00:37 +0000
commit    | a73a964e51247ed169d322c725a3a18859f109a3 (patch)
tree      | 3f752d117274d444bda44e85609aeac1acf313f3 /sys/src/cmd/python/Modules/unicodedata.c
parent    | e64efe273fcb921a61bf27d33b230c4e64fcd425 (diff)
python, hg: tow outside the environment.
they've served us well, and can ride off into the sunset.
Diffstat (limited to 'sys/src/cmd/python/Modules/unicodedata.c')
-rw-r--r-- | sys/src/cmd/python/Modules/unicodedata.c | 1223
1 file changed, 0 insertions, 1223 deletions
diff --git a/sys/src/cmd/python/Modules/unicodedata.c b/sys/src/cmd/python/Modules/unicodedata.c deleted file mode 100644 index a30d30c8e..000000000 --- a/sys/src/cmd/python/Modules/unicodedata.c +++ /dev/null @@ -1,1223 +0,0 @@ -/* ------------------------------------------------------------------------ - - unicodedata -- Provides access to the Unicode 4.1 data base. - - Data was extracted from the Unicode 4.1 UnicodeData.txt file. - - Written by Marc-Andre Lemburg (mal@lemburg.com). - Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) - Modified by Martin v. Löwis (martin@v.loewis.de) - - Copyright (c) Corporation for National Research Initiatives. - - ------------------------------------------------------------------------ */ - -#include "Python.h" -#include "ucnhash.h" -#include "structmember.h" - -/* character properties */ - -typedef struct { - const unsigned char category; /* index into - _PyUnicode_CategoryNames */ - const unsigned char combining; /* combining class value 0 - 255 */ - const unsigned char bidirectional; /* index into - _PyUnicode_BidirectionalNames */ - const unsigned char mirrored; /* true if mirrored in bidir mode */ - const unsigned char east_asian_width; /* index into - _PyUnicode_EastAsianWidth */ -} _PyUnicode_DatabaseRecord; - -typedef struct change_record { - /* sequence of fields should be the same as in merge_old_version */ - const unsigned char bidir_changed; - const unsigned char category_changed; - const unsigned char decimal_changed; - const int numeric_changed; -} change_record; - -/* data file generated by Tools/unicode/makeunicodedata.py */ -#include "unicodedata_db.h" - -static const _PyUnicode_DatabaseRecord* -_getrecord_ex(Py_UCS4 code) -{ - int index; - if (code >= 0x110000) - index = 0; - else { - index = index1[(code>>SHIFT)]; - index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))]; - } - - return &_PyUnicode_Database_Records[index]; -} - -static const _PyUnicode_DatabaseRecord* -_getrecord(PyUnicodeObject* v) -{ - return _getrecord_ex(*PyUnicode_AS_UNICODE(v)); -} - -/* ------------- Previous-version API ------------------------------------- */ -typedef struct previous_version { - PyObject_HEAD - const char *name; - const change_record* (*getrecord)(Py_UCS4); - Py_UCS4 (*normalization)(Py_UCS4); -} PreviousDBVersion; - -#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v)) - -static PyMemberDef DB_members[] = { - {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY}, - {NULL} -}; - -/* forward declaration */ -static PyTypeObject UCD_Type; - -static PyObject* -new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4), - Py_UCS4 (*normalization)(Py_UCS4)) -{ - PreviousDBVersion *self; - self = PyObject_New(PreviousDBVersion, &UCD_Type); - if (self == NULL) - return NULL; - self->name = name; - self->getrecord = getrecord; - self->normalization = normalization; - return (PyObject*)self; -} - -/* --- Module API --------------------------------------------------------- */ - -PyDoc_STRVAR(unicodedata_decimal__doc__, -"decimal(unichr[, default])\n\ -\n\ -Returns the decimal value assigned to the Unicode character unichr\n\ -as integer. 
If no such value is defined, default is returned, or, if\n\ -not given, ValueError is raised."); - -static PyObject * -unicodedata_decimal(PyObject *self, PyObject *args) -{ - PyUnicodeObject *v; - PyObject *defobj = NULL; - int have_old = 0; - long rc; - - if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj)) - return NULL; - if (PyUnicode_GET_SIZE(v) != 1) { - PyErr_SetString(PyExc_TypeError, - "need a single Unicode character as parameter"); - return NULL; - } - - if (self) { - const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); - if (old->category_changed == 0) { - /* unassigned */ - have_old = 1; - rc = -1; - } - else if (old->decimal_changed != 0xFF) { - have_old = 1; - rc = old->decimal_changed; - } - } - - if (!have_old) - rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v)); - if (rc < 0) { - if (defobj == NULL) { - PyErr_SetString(PyExc_ValueError, - "not a decimal"); - return NULL; - } - else { - Py_INCREF(defobj); - return defobj; - } - } - return PyInt_FromLong(rc); -} - -PyDoc_STRVAR(unicodedata_digit__doc__, -"digit(unichr[, default])\n\ -\n\ -Returns the digit value assigned to the Unicode character unichr as\n\ -integer. If no such value is defined, default is returned, or, if\n\ -not given, ValueError is raised."); - -static PyObject * -unicodedata_digit(PyObject *self, PyObject *args) -{ - PyUnicodeObject *v; - PyObject *defobj = NULL; - long rc; - - if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj)) - return NULL; - if (PyUnicode_GET_SIZE(v) != 1) { - PyErr_SetString(PyExc_TypeError, - "need a single Unicode character as parameter"); - return NULL; - } - rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v)); - if (rc < 0) { - if (defobj == NULL) { - PyErr_SetString(PyExc_ValueError, "not a digit"); - return NULL; - } - else { - Py_INCREF(defobj); - return defobj; - } - } - return PyInt_FromLong(rc); -} - -PyDoc_STRVAR(unicodedata_numeric__doc__, -"numeric(unichr[, default])\n\ -\n\ -Returns the numeric value assigned to the Unicode character unichr\n\ -as float. 
If no such value is defined, default is returned, or, if\n\ -not given, ValueError is raised."); - -static PyObject * -unicodedata_numeric(PyObject *self, PyObject *args) -{ - PyUnicodeObject *v; - PyObject *defobj = NULL; - int have_old = 0; - double rc; - - if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj)) - return NULL; - if (PyUnicode_GET_SIZE(v) != 1) { - PyErr_SetString(PyExc_TypeError, - "need a single Unicode character as parameter"); - return NULL; - } - - if (self) { - const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); - if (old->category_changed == 0) { - /* unassigned */ - have_old = 1; - rc = -1.0; - } - else if (old->decimal_changed != 0xFF) { - have_old = 1; - rc = old->decimal_changed; - } - } - - if (!have_old) - rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v)); - if (rc == -1.0) { - if (defobj == NULL) { - PyErr_SetString(PyExc_ValueError, "not a numeric character"); - return NULL; - } - else { - Py_INCREF(defobj); - return defobj; - } - } - return PyFloat_FromDouble(rc); -} - -PyDoc_STRVAR(unicodedata_category__doc__, -"category(unichr)\n\ -\n\ -Returns the general category assigned to the Unicode character\n\ -unichr as string."); - -static PyObject * -unicodedata_category(PyObject *self, PyObject *args) -{ - PyUnicodeObject *v; - int index; - - if (!PyArg_ParseTuple(args, "O!:category", - &PyUnicode_Type, &v)) - return NULL; - if (PyUnicode_GET_SIZE(v) != 1) { - PyErr_SetString(PyExc_TypeError, - "need a single Unicode character as parameter"); - return NULL; - } - index = (int) _getrecord(v)->category; - if (self) { - const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); - if (old->category_changed != 0xFF) - index = old->category_changed; - } - return PyString_FromString(_PyUnicode_CategoryNames[index]); -} - -PyDoc_STRVAR(unicodedata_bidirectional__doc__, -"bidirectional(unichr)\n\ -\n\ -Returns the bidirectional category assigned to the Unicode character\n\ -unichr as string. If no such value is defined, an empty string is\n\ -returned."); - -static PyObject * -unicodedata_bidirectional(PyObject *self, PyObject *args) -{ - PyUnicodeObject *v; - int index; - - if (!PyArg_ParseTuple(args, "O!:bidirectional", - &PyUnicode_Type, &v)) - return NULL; - if (PyUnicode_GET_SIZE(v) != 1) { - PyErr_SetString(PyExc_TypeError, - "need a single Unicode character as parameter"); - return NULL; - } - index = (int) _getrecord(v)->bidirectional; - if (self) { - const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); - if (old->category_changed == 0) - index = 0; /* unassigned */ - else if (old->bidir_changed != 0xFF) - index = old->bidir_changed; - } - return PyString_FromString(_PyUnicode_BidirectionalNames[index]); -} - -PyDoc_STRVAR(unicodedata_combining__doc__, -"combining(unichr)\n\ -\n\ -Returns the canonical combining class assigned to the Unicode\n\ -character unichr as integer. 
Returns 0 if no combining class is\n\ -defined."); - -static PyObject * -unicodedata_combining(PyObject *self, PyObject *args) -{ - PyUnicodeObject *v; - int index; - - if (!PyArg_ParseTuple(args, "O!:combining", - &PyUnicode_Type, &v)) - return NULL; - if (PyUnicode_GET_SIZE(v) != 1) { - PyErr_SetString(PyExc_TypeError, - "need a single Unicode character as parameter"); - return NULL; - } - index = (int) _getrecord(v)->combining; - if (self) { - const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); - if (old->category_changed == 0) - index = 0; /* unassigned */ - } - return PyInt_FromLong(index); -} - -PyDoc_STRVAR(unicodedata_mirrored__doc__, -"mirrored(unichr)\n\ -\n\ -Returns the mirrored property assigned to the Unicode character\n\ -unichr as integer. Returns 1 if the character has been identified as\n\ -a \"mirrored\" character in bidirectional text, 0 otherwise."); - -static PyObject * -unicodedata_mirrored(PyObject *self, PyObject *args) -{ - PyUnicodeObject *v; - int index; - - if (!PyArg_ParseTuple(args, "O!:mirrored", - &PyUnicode_Type, &v)) - return NULL; - if (PyUnicode_GET_SIZE(v) != 1) { - PyErr_SetString(PyExc_TypeError, - "need a single Unicode character as parameter"); - return NULL; - } - index = (int) _getrecord(v)->mirrored; - if (self) { - const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); - if (old->category_changed == 0) - index = 0; /* unassigned */ - } - return PyInt_FromLong(index); -} - -PyDoc_STRVAR(unicodedata_east_asian_width__doc__, -"east_asian_width(unichr)\n\ -\n\ -Returns the east asian width assigned to the Unicode character\n\ -unichr as string."); - -static PyObject * -unicodedata_east_asian_width(PyObject *self, PyObject *args) -{ - PyUnicodeObject *v; - int index; - - if (!PyArg_ParseTuple(args, "O!:east_asian_width", - &PyUnicode_Type, &v)) - return NULL; - if (PyUnicode_GET_SIZE(v) != 1) { - PyErr_SetString(PyExc_TypeError, - "need a single Unicode character as parameter"); - return NULL; - } - index = (int) _getrecord(v)->east_asian_width; - if (self) { - const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); - if (old->category_changed == 0) - index = 0; /* unassigned */ - } - return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]); -} - -PyDoc_STRVAR(unicodedata_decomposition__doc__, -"decomposition(unichr)\n\ -\n\ -Returns the character decomposition mapping assigned to the Unicode\n\ -character unichr as string. 
An empty string is returned in case no\n\ -such mapping is defined."); - -static PyObject * -unicodedata_decomposition(PyObject *self, PyObject *args) -{ - PyUnicodeObject *v; - char decomp[256]; - int code, index, count, i; - unsigned int prefix_index; - - if (!PyArg_ParseTuple(args, "O!:decomposition", - &PyUnicode_Type, &v)) - return NULL; - if (PyUnicode_GET_SIZE(v) != 1) { - PyErr_SetString(PyExc_TypeError, - "need a single Unicode character as parameter"); - return NULL; - } - - code = (int) *PyUnicode_AS_UNICODE(v); - - if (self) { - const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v)); - if (old->category_changed == 0) - return PyString_FromString(""); /* unassigned */ - } - - if (code < 0 || code >= 0x110000) - index = 0; - else { - index = decomp_index1[(code>>DECOMP_SHIFT)]; - index = decomp_index2[(index<<DECOMP_SHIFT)+ - (code&((1<<DECOMP_SHIFT)-1))]; - } - - /* high byte is number of hex bytes (usually one or two), low byte - is prefix code (from*/ - count = decomp_data[index] >> 8; - - /* XXX: could allocate the PyString up front instead - (strlen(prefix) + 5 * count + 1 bytes) */ - - /* Based on how index is calculated above and decomp_data is generated - from Tools/unicode/makeunicodedata.py, it should not be possible - to overflow decomp_prefix. */ - prefix_index = decomp_data[index] & 255; - assert(prefix_index < (sizeof(decomp_prefix)/sizeof(*decomp_prefix))); - - /* copy prefix */ - i = strlen(decomp_prefix[prefix_index]); - memcpy(decomp, decomp_prefix[prefix_index], i); - - while (count-- > 0) { - if (i) - decomp[i++] = ' '; - assert((size_t)i < sizeof(decomp)); - PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X", - decomp_data[++index]); - i += strlen(decomp + i); - } - - decomp[i] = '\0'; - - return PyString_FromString(decomp); -} - -static void -get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count) -{ - if (code >= 0x110000) { - *index = 0; - } else if (self && get_old_record(self, code)->category_changed==0) { - /* unassigned in old version */ - *index = 0; - } - else { - *index = decomp_index1[(code>>DECOMP_SHIFT)]; - *index = decomp_index2[(*index<<DECOMP_SHIFT)+ - (code&((1<<DECOMP_SHIFT)-1))]; - } - - /* high byte is number of hex bytes (usually one or two), low byte - is prefix code (from*/ - *count = decomp_data[*index] >> 8; - *prefix = decomp_data[*index] & 255; - - (*index)++; -} - -#define SBase 0xAC00 -#define LBase 0x1100 -#define VBase 0x1161 -#define TBase 0x11A7 -#define LCount 19 -#define VCount 21 -#define TCount 28 -#define NCount (VCount*TCount) -#define SCount (LCount*NCount) - -static PyObject* -nfd_nfkd(PyObject *self, PyObject *input, int k) -{ - PyObject *result; - Py_UNICODE *i, *end, *o; - /* Longest decomposition in Unicode 3.2: U+FDFA */ - Py_UNICODE stack[20]; - Py_ssize_t space, isize; - int index, prefix, count, stackptr; - unsigned char prev, cur; - - stackptr = 0; - isize = PyUnicode_GET_SIZE(input); - /* Overallocate atmost 10 characters. */ - space = (isize > 10 ? 10 : isize) + isize; - result = PyUnicode_FromUnicode(NULL, space); - if (!result) - return NULL; - i = PyUnicode_AS_UNICODE(input); - end = i + isize; - o = PyUnicode_AS_UNICODE(result); - - while (i < end) { - stack[stackptr++] = *i++; - while(stackptr) { - Py_UNICODE code = stack[--stackptr]; - /* Hangul Decomposition adds three characters in - a single step, so we need atleast that much room. 
*/ - if (space < 3) { - Py_ssize_t newsize = PyString_GET_SIZE(result) + 10; - space += 10; - if (PyUnicode_Resize(&result, newsize) == -1) - return NULL; - o = PyUnicode_AS_UNICODE(result) + newsize - space; - } - /* Hangul Decomposition. */ - if (SBase <= code && code < (SBase+SCount)) { - int SIndex = code - SBase; - int L = LBase + SIndex / NCount; - int V = VBase + (SIndex % NCount) / TCount; - int T = TBase + SIndex % TCount; - *o++ = L; - *o++ = V; - space -= 2; - if (T != TBase) { - *o++ = T; - space --; - } - continue; - } - /* normalization changes */ - if (self) { - Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code); - if (value != 0) { - stack[stackptr++] = value; - continue; - } - } - - /* Other decompositions. */ - get_decomp_record(self, code, &index, &prefix, &count); - - /* Copy character if it is not decomposable, or has a - compatibility decomposition, but we do NFD. */ - if (!count || (prefix && !k)) { - *o++ = code; - space--; - continue; - } - /* Copy decomposition onto the stack, in reverse - order. */ - while(count) { - code = decomp_data[index + (--count)]; - stack[stackptr++] = code; - } - } - } - - /* Drop overallocation. Cannot fail. */ - PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space); - - /* Sort canonically. */ - i = PyUnicode_AS_UNICODE(result); - prev = _getrecord_ex(*i)->combining; - end = i + PyUnicode_GET_SIZE(result); - for (i++; i < end; i++) { - cur = _getrecord_ex(*i)->combining; - if (prev == 0 || cur == 0 || prev <= cur) { - prev = cur; - continue; - } - /* Non-canonical order. Need to switch *i with previous. */ - o = i - 1; - while (1) { - Py_UNICODE tmp = o[1]; - o[1] = o[0]; - o[0] = tmp; - o--; - if (o < PyUnicode_AS_UNICODE(result)) - break; - prev = _getrecord_ex(*o)->combining; - if (prev == 0 || prev <= cur) - break; - } - prev = _getrecord_ex(*i)->combining; - } - return result; -} - -static int -find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code) -{ - int index; - for (index = 0; nfc[index].start; index++) { - int start = nfc[index].start; - if (code < start) - return -1; - if (code <= start + nfc[index].count) { - int delta = code - start; - return nfc[index].index + delta; - } - } - return -1; -} - -static PyObject* -nfc_nfkc(PyObject *self, PyObject *input, int k) -{ - PyObject *result; - Py_UNICODE *i, *i1, *o, *end; - int f,l,index,index1,comb; - Py_UNICODE code; - Py_UNICODE *skipped[20]; - int cskipped = 0; - - result = nfd_nfkd(self, input, k); - if (!result) - return NULL; - - /* We are going to modify result in-place. - If nfd_nfkd is changed to sometimes return the input, - this code needs to be reviewed. */ - assert(result != input); - - i = PyUnicode_AS_UNICODE(result); - end = i + PyUnicode_GET_SIZE(result); - o = PyUnicode_AS_UNICODE(result); - - again: - while (i < end) { - for (index = 0; index < cskipped; index++) { - if (skipped[index] == i) { - /* *i character is skipped. - Remove from list. */ - skipped[index] = skipped[cskipped-1]; - cskipped--; - i++; - goto again; /* continue while */ - } - } - /* Hangul Composition. We don't need to check for <LV,T> - pairs, since we always have decomposed data. 
*/ - if (LBase <= *i && *i < (LBase+LCount) && - i + 1 < end && - VBase <= i[1] && i[1] <= (VBase+VCount)) { - int LIndex, VIndex; - LIndex = i[0] - LBase; - VIndex = i[1] - VBase; - code = SBase + (LIndex*VCount+VIndex)*TCount; - i+=2; - if (i < end && - TBase <= *i && *i <= (TBase+TCount)) { - code += *i-TBase; - i++; - } - *o++ = code; - continue; - } - - f = find_nfc_index(self, nfc_first, *i); - if (f == -1) { - *o++ = *i++; - continue; - } - /* Find next unblocked character. */ - i1 = i+1; - comb = 0; - while (i1 < end) { - int comb1 = _getrecord_ex(*i1)->combining; - if (comb1 && comb == comb1) { - /* Character is blocked. */ - i1++; - continue; - } - l = find_nfc_index(self, nfc_last, *i1); - /* *i1 cannot be combined with *i. If *i1 - is a starter, we don't need to look further. - Otherwise, record the combining class. */ - if (l == -1) { - not_combinable: - if (comb1 == 0) - break; - comb = comb1; - i1++; - continue; - } - index = f*TOTAL_LAST + l; - index1 = comp_index[index >> COMP_SHIFT]; - code = comp_data[(index1<<COMP_SHIFT)+ - (index&((1<<COMP_SHIFT)-1))]; - if (code == 0) - goto not_combinable; - - /* Replace the original character. */ - *i = code; - /* Mark the second character unused. */ - skipped[cskipped++] = i1; - i1++; - f = find_nfc_index(self, nfc_first, *i); - if (f == -1) - break; - } - *o++ = *i++; - } - if (o != end) - PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result)); - return result; -} - -PyDoc_STRVAR(unicodedata_normalize__doc__, -"normalize(form, unistr)\n\ -\n\ -Return the normal form 'form' for the Unicode string unistr. Valid\n\ -values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'."); - -static PyObject* -unicodedata_normalize(PyObject *self, PyObject *args) -{ - char *form; - PyObject *input; - - if(!PyArg_ParseTuple(args, "sO!:normalize", - &form, &PyUnicode_Type, &input)) - return NULL; - - if (PyUnicode_GetSize(input) == 0) { - /* Special case empty input strings, since resizing - them later would cause internal errors. 
*/ - Py_INCREF(input); - return input; - } - - if (strcmp(form, "NFC") == 0) - return nfc_nfkc(self, input, 0); - if (strcmp(form, "NFKC") == 0) - return nfc_nfkc(self, input, 1); - if (strcmp(form, "NFD") == 0) - return nfd_nfkd(self, input, 0); - if (strcmp(form, "NFKD") == 0) - return nfd_nfkd(self, input, 1); - PyErr_SetString(PyExc_ValueError, "invalid normalization form"); - return NULL; -} - -/* -------------------------------------------------------------------- */ -/* unicode character name tables */ - -/* data file generated by Tools/unicode/makeunicodedata.py */ -#include "unicodename_db.h" - -/* -------------------------------------------------------------------- */ -/* database code (cut and pasted from the unidb package) */ - -static unsigned long -_gethash(const char *s, int len, int scale) -{ - int i; - unsigned long h = 0; - unsigned long ix; - for (i = 0; i < len; i++) { - h = (h * scale) + (unsigned char) toupper(Py_CHARMASK(s[i])); - ix = h & 0xff000000; - if (ix) - h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff; - } - return h; -} - -static char *hangul_syllables[][3] = { - { "G", "A", "" }, - { "GG", "AE", "G" }, - { "N", "YA", "GG" }, - { "D", "YAE", "GS" }, - { "DD", "EO", "N", }, - { "R", "E", "NJ" }, - { "M", "YEO", "NH" }, - { "B", "YE", "D" }, - { "BB", "O", "L" }, - { "S", "WA", "LG" }, - { "SS", "WAE", "LM" }, - { "", "OE", "LB" }, - { "J", "YO", "LS" }, - { "JJ", "U", "LT" }, - { "C", "WEO", "LP" }, - { "K", "WE", "LH" }, - { "T", "WI", "M" }, - { "P", "YU", "B" }, - { "H", "EU", "BS" }, - { 0, "YI", "S" }, - { 0, "I", "SS" }, - { 0, 0, "NG" }, - { 0, 0, "J" }, - { 0, 0, "C" }, - { 0, 0, "K" }, - { 0, 0, "T" }, - { 0, 0, "P" }, - { 0, 0, "H" } -}; - -static int -is_unified_ideograph(Py_UCS4 code) -{ - return ( - (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */ - (0x4E00 <= code && code <= 0x9FBB) || /* CJK Ideograph */ - (0x20000 <= code && code <= 0x2A6D6));/* CJK Ideograph Extension B */ -} - -static int -_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen) -{ - int offset; - int i; - int word; - unsigned char* w; - - if (code >= 0x110000) - return 0; - - if (self) { - const change_record *old = get_old_record(self, code); - if (old->category_changed == 0) { - /* unassigned */ - return 0; - } - } - - if (SBase <= code && code < SBase+SCount) { - /* Hangul syllable. */ - int SIndex = code - SBase; - int L = SIndex / NCount; - int V = (SIndex % NCount) / TCount; - int T = SIndex % TCount; - - if (buflen < 27) - /* Worst case: HANGUL SYLLABLE <10chars>. 
*/ - return 0; - strcpy(buffer, "HANGUL SYLLABLE "); - buffer += 16; - strcpy(buffer, hangul_syllables[L][0]); - buffer += strlen(hangul_syllables[L][0]); - strcpy(buffer, hangul_syllables[V][1]); - buffer += strlen(hangul_syllables[V][1]); - strcpy(buffer, hangul_syllables[T][2]); - buffer += strlen(hangul_syllables[T][2]); - *buffer = '\0'; - return 1; - } - - if (is_unified_ideograph(code)) { - if (buflen < 28) - /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */ - return 0; - sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code); - return 1; - } - - /* get offset into phrasebook */ - offset = phrasebook_offset1[(code>>phrasebook_shift)]; - offset = phrasebook_offset2[(offset<<phrasebook_shift) + - (code&((1<<phrasebook_shift)-1))]; - if (!offset) - return 0; - - i = 0; - - for (;;) { - /* get word index */ - word = phrasebook[offset] - phrasebook_short; - if (word >= 0) { - word = (word << 8) + phrasebook[offset+1]; - offset += 2; - } else - word = phrasebook[offset++]; - if (i) { - if (i > buflen) - return 0; /* buffer overflow */ - buffer[i++] = ' '; - } - /* copy word string from lexicon. the last character in the - word has bit 7 set. the last word in a string ends with - 0x80 */ - w = lexicon + lexicon_offset[word]; - while (*w < 128) { - if (i >= buflen) - return 0; /* buffer overflow */ - buffer[i++] = *w++; - } - if (i >= buflen) - return 0; /* buffer overflow */ - buffer[i++] = *w & 127; - if (*w == 128) - break; /* end of word */ - } - - return 1; -} - -static int -_cmpname(PyObject *self, int code, const char* name, int namelen) -{ - /* check if code corresponds to the given name */ - int i; - char buffer[NAME_MAXLEN]; - if (!_getucname(self, code, buffer, sizeof(buffer))) - return 0; - for (i = 0; i < namelen; i++) { - if (toupper(Py_CHARMASK(name[i])) != buffer[i]) - return 0; - } - return buffer[namelen] == '\0'; -} - -static void -find_syllable(const char *str, int *len, int *pos, int count, int column) -{ - int i, len1; - *len = -1; - for (i = 0; i < count; i++) { - char *s = hangul_syllables[i][column]; - len1 = strlen(s); - if (len1 <= *len) - continue; - if (strncmp(str, s, len1) == 0) { - *len = len1; - *pos = i; - } - } - if (*len == -1) { - *len = 0; - } -} - -static int -_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code) -{ - unsigned int h, v; - unsigned int mask = code_size-1; - unsigned int i, incr; - - /* Check for hangul syllables. */ - if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) { - int len, L = -1, V = -1, T = -1; - const char *pos = name + 16; - find_syllable(pos, &len, &L, LCount, 0); - pos += len; - find_syllable(pos, &len, &V, VCount, 1); - pos += len; - find_syllable(pos, &len, &T, TCount, 2); - pos += len; - if (L != -1 && V != -1 && T != -1 && pos-name == namelen) { - *code = SBase + (L*VCount+V)*TCount + T; - return 1; - } - /* Otherwise, it's an illegal syllable name. */ - return 0; - } - - /* Check for unified ideographs. */ - if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) { - /* Four or five hexdigits must follow. */ - v = 0; - name += 22; - namelen -= 22; - if (namelen != 4 && namelen != 5) - return 0; - while (namelen--) { - v *= 16; - if (*name >= '0' && *name <= '9') - v += *name - '0'; - else if (*name >= 'A' && *name <= 'F') - v += *name - 'A' + 10; - else - return 0; - name++; - } - if (!is_unified_ideograph(v)) - return 0; - *code = v; - return 1; - } - - /* the following is the same as python's dictionary lookup, with - only minor changes. 
see the makeunicodedata script for more - details */ - - h = (unsigned int) _gethash(name, namelen, code_magic); - i = (~h) & mask; - v = code_hash[i]; - if (!v) - return 0; - if (_cmpname(self, v, name, namelen)) { - *code = v; - return 1; - } - incr = (h ^ (h >> 3)) & mask; - if (!incr) - incr = mask; - for (;;) { - i = (i + incr) & mask; - v = code_hash[i]; - if (!v) - return 0; - if (_cmpname(self, v, name, namelen)) { - *code = v; - return 1; - } - incr = incr << 1; - if (incr > mask) - incr = incr ^ code_poly; - } -} - -static const _PyUnicode_Name_CAPI hashAPI = -{ - sizeof(_PyUnicode_Name_CAPI), - _getucname, - _getcode -}; - -/* -------------------------------------------------------------------- */ -/* Python bindings */ - -PyDoc_STRVAR(unicodedata_name__doc__, -"name(unichr[, default])\n\ -Returns the name assigned to the Unicode character unichr as a\n\ -string. If no name is defined, default is returned, or, if not\n\ -given, ValueError is raised."); - -static PyObject * -unicodedata_name(PyObject* self, PyObject* args) -{ - char name[NAME_MAXLEN]; - - PyUnicodeObject* v; - PyObject* defobj = NULL; - if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &v, &defobj)) - return NULL; - - if (PyUnicode_GET_SIZE(v) != 1) { - PyErr_SetString(PyExc_TypeError, - "need a single Unicode character as parameter"); - return NULL; - } - - if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v), - name, sizeof(name))) { - if (defobj == NULL) { - PyErr_SetString(PyExc_ValueError, "no such name"); - return NULL; - } - else { - Py_INCREF(defobj); - return defobj; - } - } - - return Py_BuildValue("s", name); -} - -PyDoc_STRVAR(unicodedata_lookup__doc__, -"lookup(name)\n\ -\n\ -Look up character by name. If a character with the\n\ -given name is found, return the corresponding Unicode\n\ -character. If not found, KeyError is raised."); - -static PyObject * -unicodedata_lookup(PyObject* self, PyObject* args) -{ - Py_UCS4 code; - Py_UNICODE str[1]; - char errbuf[256]; - - char* name; - int namelen; - if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen)) - return NULL; - - if (!_getcode(self, name, namelen, &code)) { - /* XXX(nnorwitz): why are we allocating for the error msg? - Why not always use snprintf? */ - char fmt[] = "undefined character name '%s'"; - char *buf = PyMem_MALLOC(sizeof(fmt) + namelen); - if (buf) - sprintf(buf, fmt, name); - else { - buf = errbuf; - PyOS_snprintf(buf, sizeof(errbuf), fmt, name); - } - PyErr_SetString(PyExc_KeyError, buf); - if (buf != errbuf) - PyMem_FREE(buf); - return NULL; - } - - str[0] = (Py_UNICODE) code; - return PyUnicode_FromUnicode(str, 1); -} - -/* XXX Add doc strings. 
*/ - -static PyMethodDef unicodedata_functions[] = { - {"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__}, - {"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__}, - {"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__}, - {"category", unicodedata_category, METH_VARARGS, - unicodedata_category__doc__}, - {"bidirectional", unicodedata_bidirectional, METH_VARARGS, - unicodedata_bidirectional__doc__}, - {"combining", unicodedata_combining, METH_VARARGS, - unicodedata_combining__doc__}, - {"mirrored", unicodedata_mirrored, METH_VARARGS, - unicodedata_mirrored__doc__}, - {"east_asian_width", unicodedata_east_asian_width, METH_VARARGS, - unicodedata_east_asian_width__doc__}, - {"decomposition", unicodedata_decomposition, METH_VARARGS, - unicodedata_decomposition__doc__}, - {"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__}, - {"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__}, - {"normalize", unicodedata_normalize, METH_VARARGS, - unicodedata_normalize__doc__}, - {NULL, NULL} /* sentinel */ -}; - -static PyTypeObject UCD_Type = { - /* The ob_type field must be initialized in the module init function - * to be portable to Windows without using C++. */ - PyObject_HEAD_INIT(NULL) - 0, /*ob_size*/ - "unicodedata.UCD", /*tp_name*/ - sizeof(PreviousDBVersion), /*tp_basicsize*/ - 0, /*tp_itemsize*/ - /* methods */ - (destructor)PyObject_Del, /*tp_dealloc*/ - 0, /*tp_print*/ - 0, /*tp_getattr*/ - 0, /*tp_setattr*/ - 0, /*tp_compare*/ - 0, /*tp_repr*/ - 0, /*tp_as_number*/ - 0, /*tp_as_sequence*/ - 0, /*tp_as_mapping*/ - 0, /*tp_hash*/ - 0, /*tp_call*/ - 0, /*tp_str*/ - PyObject_GenericGetAttr,/*tp_getattro*/ - 0, /*tp_setattro*/ - 0, /*tp_as_buffer*/ - Py_TPFLAGS_DEFAULT, /*tp_flags*/ - 0, /*tp_doc*/ - 0, /*tp_traverse*/ - 0, /*tp_clear*/ - 0, /*tp_richcompare*/ - 0, /*tp_weaklistoffset*/ - 0, /*tp_iter*/ - 0, /*tp_iternext*/ - unicodedata_functions, /*tp_methods*/ - DB_members, /*tp_members*/ - 0, /*tp_getset*/ - 0, /*tp_base*/ - 0, /*tp_dict*/ - 0, /*tp_descr_get*/ - 0, /*tp_descr_set*/ - 0, /*tp_dictoffset*/ - 0, /*tp_init*/ - 0, /*tp_alloc*/ - 0, /*tp_new*/ - 0, /*tp_free*/ - 0, /*tp_is_gc*/ -}; - -PyDoc_STRVAR(unicodedata_docstring, -"This module provides access to the Unicode Character Database which\n\ -defines character properties for all Unicode characters. The data in\n\ -this database is based on the UnicodeData.txt file version\n\ -4.1.0 which is publically available from ftp://ftp.unicode.org/.\n\ -\n\ -The module uses the same names and symbols as defined by the\n\ -UnicodeData File Format 4.1.0 (see\n\ -http://www.unicode.org/Public/4.1.0/ucd/UCD.html)."); - -PyMODINIT_FUNC -initunicodedata(void) -{ - PyObject *m, *v; - - UCD_Type.ob_type = &PyType_Type; - - m = Py_InitModule3( - "unicodedata", unicodedata_functions, unicodedata_docstring); - if (!m) - return; - - PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION); - Py_INCREF(&UCD_Type); - PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type); - - /* Previous versions */ - v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0); - if (v != NULL) - PyModule_AddObject(m, "ucd_3_2_0", v); - - /* Export C API */ - v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL); - if (v != NULL) - PyModule_AddObject(m, "ucnhash_CAPI", v); -} - -/* -Local variables: -c-basic-offset: 4 -indent-tabs-mode: nil -End: -*/ |