path: root/sys/src/cmd/python/Tools/unicode/makeunicodedata.py
author	Ori Bernstein <ori@eigenstate.org>	2021-06-14 00:00:37 +0000
committer	Ori Bernstein <ori@eigenstate.org>	2021-06-14 00:00:37 +0000
commit	a73a964e51247ed169d322c725a3a18859f109a3 (patch)
tree	3f752d117274d444bda44e85609aeac1acf313f3	/sys/src/cmd/python/Tools/unicode/makeunicodedata.py
parent	e64efe273fcb921a61bf27d33b230c4e64fcd425 (diff)
python, hg: tow outside the environment.
they've served us well, and can ride off into the sunset.
Diffstat (limited to 'sys/src/cmd/python/Tools/unicode/makeunicodedata.py')
-rw-r--r--	sys/src/cmd/python/Tools/unicode/makeunicodedata.py	943
1 files changed, 0 insertions, 943 deletions
diff --git a/sys/src/cmd/python/Tools/unicode/makeunicodedata.py b/sys/src/cmd/python/Tools/unicode/makeunicodedata.py
deleted file mode 100644
index 9de69048e..000000000
--- a/sys/src/cmd/python/Tools/unicode/makeunicodedata.py
+++ /dev/null
@@ -1,943 +0,0 @@
-#
-# (re)generate unicode property and type databases
-#
-# this script converts a unicode 3.2 database file to
-# Modules/unicodedata_db.h, Modules/unicodename_db.h,
-# and Objects/unicodetype_db.h
-#
-# history:
-# 2000-09-24 fl created (based on bits and pieces from unidb)
-# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
-# 2000-09-25 fl added character type table
-# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
-# 2000-11-03 fl expand first/last ranges
-# 2001-01-19 fl added character name tables (2.1)
-# 2001-01-21 fl added decomp compression; dynamic phrasebook threshold
-# 2002-09-11 wd use string methods
-# 2002-10-18 mvl update to Unicode 3.2
-# 2002-10-22 mvl generate NFC tables
-# 2002-11-24 mvl expand all ranges, sort names version-independently
-# 2002-11-25 mvl add UNIDATA_VERSION
-# 2004-05-29 perky add east asian width information
-# 2006-03-10 mvl update to Unicode 4.1; add UCD 3.2 delta
-#
-# written by Fredrik Lundh (fredrik@pythonware.com)
-#
-
-import sys, re
-
-SCRIPT = sys.argv[0]
-VERSION = "2.5"
-
-# The Unicode Database
-UNIDATA_VERSION = "4.1.0"
-UNICODE_DATA = "UnicodeData%s.txt"
-COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
-EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
-
-old_versions = ["3.2.0"]
-
-CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
- "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
- "Lo", "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po", "Sm", "Sc", "Sk",
- "So" ]
-
-BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
- "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
- "ON" ]
-
-EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
-
-# note: should match definitions in Objects/unicodectype.c
-ALPHA_MASK = 0x01
-DECIMAL_MASK = 0x02
-DIGIT_MASK = 0x04
-LOWER_MASK = 0x08
-LINEBREAK_MASK = 0x10
-SPACE_MASK = 0x20
-TITLE_MASK = 0x40
-UPPER_MASK = 0x80
-
-def maketables(trace=0):
-
- print "--- Reading", UNICODE_DATA % "", "..."
-
- version = ""
- unicode = UnicodeData(UNICODE_DATA % version,
- COMPOSITION_EXCLUSIONS % version,
- EASTASIAN_WIDTH % version)
-
- print len(filter(None, unicode.table)), "characters"
-
- for version in old_versions:
- print "--- Reading", UNICODE_DATA % ("-"+version), "..."
- old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
- COMPOSITION_EXCLUSIONS % ("-"+version),
- EASTASIAN_WIDTH % ("-"+version))
- print len(filter(None, old_unicode.table)), "characters"
- merge_old_version(version, unicode, old_unicode)
-
- makeunicodename(unicode, trace)
- makeunicodedata(unicode, trace)
- makeunicodetype(unicode, trace)
-
-# --------------------------------------------------------------------
-# unicode character properties
-
-def makeunicodedata(unicode, trace):
-
- dummy = (0, 0, 0, 0, 0)
- table = [dummy]
- cache = {0: dummy}
- index = [0] * len(unicode.chars)
-
- FILE = "Modules/unicodedata_db.h"
-
- print "--- Preparing", FILE, "..."
-
- # 1) database properties
-
- for char in unicode.chars:
- record = unicode.table[char]
- if record:
- # extract database properties
- category = CATEGORY_NAMES.index(record[2])
- combining = int(record[3])
- bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
- mirrored = record[9] == "Y"
- eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
- item = (
- category, combining, bidirectional, mirrored, eastasianwidth
- )
- # add entry to index and item tables
- i = cache.get(item)
- if i is None:
- cache[item] = i = len(table)
- table.append(item)
- index[char] = i
-
- # 2) decomposition data
-
- decomp_data = [0]
- decomp_prefix = [""]
- decomp_index = [0] * len(unicode.chars)
- decomp_size = 0
-
- comp_pairs = []
- comp_first = [None] * len(unicode.chars)
- comp_last = [None] * len(unicode.chars)
-
- for char in unicode.chars:
- record = unicode.table[char]
- if record:
- if record[5]:
- decomp = record[5].split()
- if len(decomp) > 19:
- raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
- # prefix
- if decomp[0][0] == "<":
- prefix = decomp.pop(0)
- else:
- prefix = ""
- try:
- i = decomp_prefix.index(prefix)
- except ValueError:
- i = len(decomp_prefix)
- decomp_prefix.append(prefix)
- prefix = i
- assert prefix < 256
- # content
- decomp = [prefix + (len(decomp)<<8)] +\
- map(lambda s: int(s, 16), decomp)
- # Collect NFC pairs
- if not prefix and len(decomp) == 3 and \
- char not in unicode.exclusions and \
- unicode.table[decomp[1]][3] == "0":
- p, l, r = decomp
- comp_first[l] = 1
- comp_last[r] = 1
- comp_pairs.append((l,r,char))
- try:
- i = decomp_data.index(decomp)
- except ValueError:
- i = len(decomp_data)
- decomp_data.extend(decomp)
- decomp_size = decomp_size + len(decomp) * 2
- else:
- i = 0
- decomp_index[char] = i
-
- f = l = 0
- comp_first_ranges = []
- comp_last_ranges = []
- prev_f = prev_l = None
- for i in unicode.chars:
- if comp_first[i] is not None:
- comp_first[i] = f
- f += 1
- if prev_f is None:
- prev_f = (i,i)
- elif prev_f[1]+1 == i:
- prev_f = prev_f[0],i
- else:
- comp_first_ranges.append(prev_f)
- prev_f = (i,i)
- if comp_last[i] is not None:
- comp_last[i] = l
- l += 1
- if prev_l is None:
- prev_l = (i,i)
- elif prev_l[1]+1 == i:
- prev_l = prev_l[0],i
- else:
- comp_last_ranges.append(prev_l)
- prev_l = (i,i)
- comp_first_ranges.append(prev_f)
- comp_last_ranges.append(prev_l)
- total_first = f
- total_last = l
-
- comp_data = [0]*(total_first*total_last)
- for f,l,char in comp_pairs:
- f = comp_first[f]
- l = comp_last[l]
- comp_data[f*total_last+l] = char
-
- print len(table), "unique properties"
- print len(decomp_prefix), "unique decomposition prefixes"
- print len(decomp_data), "unique decomposition entries:",
- print decomp_size, "bytes"
- print total_first, "first characters in NFC"
- print total_last, "last characters in NFC"
- print len(comp_pairs), "NFC pairs"
-
- print "--- Writing", FILE, "..."
-
- fp = open(FILE, "w")
- print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
- print >>fp
- print >>fp, '#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION
- print >>fp, "/* a list of unique database records */"
- print >>fp, \
- "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
- for item in table:
- print >>fp, " {%d, %d, %d, %d, %d}," % item
- print >>fp, "};"
- print >>fp
-
- print >>fp, "/* Reindexing of NFC first characters. */"
- print >>fp, "#define TOTAL_FIRST",total_first
- print >>fp, "#define TOTAL_LAST",total_last
- print >>fp, "struct reindex{int start;short count,index;};"
- print >>fp, "struct reindex nfc_first[] = {"
- for start,end in comp_first_ranges:
- print >>fp," { %d, %d, %d}," % (start,end-start,comp_first[start])
- print >>fp," {0,0,0}"
- print >>fp,"};\n"
- print >>fp, "struct reindex nfc_last[] = {"
- for start,end in comp_last_ranges:
- print >>fp," { %d, %d, %d}," % (start,end-start,comp_last[start])
- print >>fp," {0,0,0}"
- print >>fp,"};\n"
-
- # FIXME: <fl> the following tables could be made static, and
- # the support code moved into unicodedatabase.c
-
- print >>fp, "/* string literals */"
- print >>fp, "const char *_PyUnicode_CategoryNames[] = {"
- for name in CATEGORY_NAMES:
- print >>fp, " \"%s\"," % name
- print >>fp, " NULL"
- print >>fp, "};"
-
- print >>fp, "const char *_PyUnicode_BidirectionalNames[] = {"
- for name in BIDIRECTIONAL_NAMES:
- print >>fp, " \"%s\"," % name
- print >>fp, " NULL"
- print >>fp, "};"
-
- print >>fp, "const char *_PyUnicode_EastAsianWidthNames[] = {"
- for name in EASTASIANWIDTH_NAMES:
- print >>fp, " \"%s\"," % name
- print >>fp, " NULL"
- print >>fp, "};"
-
- print >>fp, "static const char *decomp_prefix[] = {"
- for name in decomp_prefix:
- print >>fp, " \"%s\"," % name
- print >>fp, " NULL"
- print >>fp, "};"
-
- # split record index table
- index1, index2, shift = splitbins(index, trace)
-
- print >>fp, "/* index tables for the database records */"
- print >>fp, "#define SHIFT", shift
- Array("index1", index1).dump(fp, trace)
- Array("index2", index2).dump(fp, trace)
-
- # split decomposition index table
- index1, index2, shift = splitbins(decomp_index, trace)
-
- print >>fp, "/* decomposition data */"
- Array("decomp_data", decomp_data).dump(fp, trace)
-
- print >>fp, "/* index tables for the decomposition data */"
- print >>fp, "#define DECOMP_SHIFT", shift
- Array("decomp_index1", index1).dump(fp, trace)
- Array("decomp_index2", index2).dump(fp, trace)
-
- index, index2, shift = splitbins(comp_data, trace)
- print >>fp, "/* NFC pairs */"
- print >>fp, "#define COMP_SHIFT", shift
- Array("comp_index", index).dump(fp, trace)
- Array("comp_data", index2).dump(fp, trace)
-
- # Generate delta tables for old versions
- for version, table, normalization in unicode.changed:
- cversion = version.replace(".","_")
- records = [table[0]]
- cache = {table[0]:0}
- index = [0] * len(table)
- for i, record in enumerate(table):
- try:
- index[i] = cache[record]
- except KeyError:
- index[i] = cache[record] = len(records)
- records.append(record)
- index1, index2, shift = splitbins(index, trace)
- print >>fp, "static const change_record change_records_%s[] = {" % cversion
- for record in records:
- print >>fp, "\t{ %s }," % ", ".join(map(str,record))
- print >>fp, "};"
- Array("changes_%s_index" % cversion, index1).dump(fp, trace)
- Array("changes_%s_data" % cversion, index2).dump(fp, trace)
- print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
- print >>fp, "{"
- print >>fp, "\tint index;"
- print >>fp, "\tif (n >= 0x110000) index = 0;"
- print >>fp, "\telse {"
- print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
- print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
- (cversion, shift, ((1<<shift)-1))
- print >>fp, "\t}"
- print >>fp, "\treturn change_records_%s+index;" % cversion
- print >>fp, "}\n"
- print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
- print >>fp, "{"
- print >>fp, "\tswitch(n) {"
- for k, v in normalization:
- print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
- print >>fp, "\tdefault: return 0;"
- print >>fp, "\t}\n}\n"
-
- fp.close()
-
-# --------------------------------------------------------------------
-# unicode character type tables
-
-def makeunicodetype(unicode, trace):
-
- FILE = "Objects/unicodetype_db.h"
-
- print "--- Preparing", FILE, "..."
-
- # extract unicode types
- dummy = (0, 0, 0, 0, 0, 0)
- table = [dummy]
- cache = {0: dummy}
- index = [0] * len(unicode.chars)
-
- for char in unicode.chars:
- record = unicode.table[char]
- if record:
- # extract database properties
- category = record[2]
- bidirectional = record[4]
- flags = 0
- if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
- flags |= ALPHA_MASK
- if category == "Ll":
- flags |= LOWER_MASK
- if category == "Zl" or bidirectional == "B":
- flags |= LINEBREAK_MASK
- if category == "Zs" or bidirectional in ("WS", "B", "S"):
- flags |= SPACE_MASK
- if category == "Lt":
- flags |= TITLE_MASK
- if category == "Lu":
- flags |= UPPER_MASK
- # use delta predictor for upper/lower/title
- if record[12]:
- upper = int(record[12], 16) - char
- assert -32768 <= upper <= 32767
- upper = upper & 0xffff
- else:
- upper = 0
- if record[13]:
- lower = int(record[13], 16) - char
- assert -32768 <= lower <= 32767
- lower = lower & 0xffff
- else:
- lower = 0
- if record[14]:
- title = int(record[14], 16) - char
- assert -32768 <= title <= 32767
- title = title & 0xffff
- else:
- title = 0
- # decimal digit, integer digit
- decimal = 0
- if record[6]:
- flags |= DECIMAL_MASK
- decimal = int(record[6])
- digit = 0
- if record[7]:
- flags |= DIGIT_MASK
- digit = int(record[7])
- item = (
- upper, lower, title, decimal, digit, flags
- )
- # add entry to index and item tables
- i = cache.get(item)
- if i is None:
- cache[item] = i = len(table)
- table.append(item)
- index[char] = i
-
- print len(table), "unique character type entries"
-
- print "--- Writing", FILE, "..."
-
- fp = open(FILE, "w")
- print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
- print >>fp
- print >>fp, "/* a list of unique character type descriptors */"
- print >>fp, "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
- for item in table:
- print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
- print >>fp, "};"
- print >>fp
-
- # split decomposition index table
- index1, index2, shift = splitbins(index, trace)
-
- print >>fp, "/* type indexes */"
- print >>fp, "#define SHIFT", shift
- Array("index1", index1).dump(fp, trace)
- Array("index2", index2).dump(fp, trace)
-
- fp.close()
-
-# --------------------------------------------------------------------
-# unicode name database
-
-def makeunicodename(unicode, trace):
-
- FILE = "Modules/unicodename_db.h"
-
- print "--- Preparing", FILE, "..."
-
- # collect names
- names = [None] * len(unicode.chars)
-
- for char in unicode.chars:
- record = unicode.table[char]
- if record:
- name = record[1].strip()
- if name and name[0] != "<":
- names[char] = name + chr(0)
-
- print len(filter(lambda n: n is not None, names)), "distinct names"
-
- # collect unique words from names (note that we distinguish between
- # words inside a sentence and words ending a sentence; the
- # latter include the trailing null byte.)
-
- words = {}
- n = b = 0
- for char in unicode.chars:
- name = names[char]
- if name:
- w = name.split()
- b = b + len(name)
- n = n + len(w)
- for w in w:
- l = words.get(w)
- if l:
- l.append(None)
- else:
- words[w] = [len(words)]
-
- print n, "words in text;", b, "bytes"
-
- wordlist = words.items()
-
- # sort on falling frequency, then by name
- def cmpwords((aword, alist),(bword, blist)):
- r = -cmp(len(alist),len(blist))
- if r:
- return r
- return cmp(aword, bword)
- wordlist.sort(cmpwords)
-
- # figure out how many phrasebook escapes we need
- escapes = 0
- while escapes * 256 < len(wordlist):
- escapes = escapes + 1
- print escapes, "escapes"
-
- short = 256 - escapes
-
- assert short > 0
-
- print short, "short indexes in lexicon"
-
- # statistics
- n = 0
- for i in range(short):
- n = n + len(wordlist[i][1])
- print n, "short indexes in phrasebook"
-
- # pick the most commonly used words, and sort the rest on falling
- # length (to maximize overlap)
-
- wordlist, wordtail = wordlist[:short], wordlist[short:]
- wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
- wordlist.extend(wordtail)
-
- # generate lexicon from words
-
- lexicon_offset = [0]
- lexicon = ""
- words = {}
-
- # build a lexicon string
- offset = 0
- for w, x in wordlist:
- # encoding: bit 7 indicates last character in word (chr(128)
- # indicates the last character in an entire string)
- ww = w[:-1] + chr(ord(w[-1])+128)
- # reuse string tails, when possible
- o = lexicon.find(ww)
- if o < 0:
- o = offset
- lexicon = lexicon + ww
- offset = offset + len(w)
- words[w] = len(lexicon_offset)
- lexicon_offset.append(o)
-
- lexicon = map(ord, lexicon)
-
- # generate phrasebook from names and lexicon
- phrasebook = [0]
- phrasebook_offset = [0] * len(unicode.chars)
- for char in unicode.chars:
- name = names[char]
- if name:
- w = name.split()
- phrasebook_offset[char] = len(phrasebook)
- for w in w:
- i = words[w]
- if i < short:
- phrasebook.append(i)
- else:
- # store as two bytes
- phrasebook.append((i>>8) + short)
- phrasebook.append(i&255)
-
- assert getsize(phrasebook) == 1
-
- #
- # unicode name hash table
-
- # extract names
- data = []
- for char in unicode.chars:
- record = unicode.table[char]
- if record:
- name = record[1].strip()
- if name and name[0] != "<":
- data.append((name, char))
-
- # the magic number 47 was chosen to minimize the number of
- # collisions on the current data set. if you like, change it
- # and see what happens...
-
- codehash = Hash("code", data, 47)
-
- print "--- Writing", FILE, "..."
-
- fp = open(FILE, "w")
- print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
- print >>fp
- print >>fp, "#define NAME_MAXLEN", 256
- print >>fp
- print >>fp, "/* lexicon */"
- Array("lexicon", lexicon).dump(fp, trace)
- Array("lexicon_offset", lexicon_offset).dump(fp, trace)
-
- # split decomposition index table
- offset1, offset2, shift = splitbins(phrasebook_offset, trace)
-
- print >>fp, "/* code->name phrasebook */"
- print >>fp, "#define phrasebook_shift", shift
- print >>fp, "#define phrasebook_short", short
-
- Array("phrasebook", phrasebook).dump(fp, trace)
- Array("phrasebook_offset1", offset1).dump(fp, trace)
- Array("phrasebook_offset2", offset2).dump(fp, trace)
-
- print >>fp, "/* name->code dictionary */"
- codehash.dump(fp, trace)
-
- fp.close()
-
-
-def merge_old_version(version, new, old):
- # Changes to exclusion file not implemented yet
- if old.exclusions != new.exclusions:
- raise NotImplementedError, "exclusions differ"
-
- # In these change records, 0xFF means "no change"
- bidir_changes = [0xFF]*0x110000
- category_changes = [0xFF]*0x110000
- decimal_changes = [0xFF]*0x110000
- # In numeric data, 0 means "no change",
- # -1 means "did not have a numeric value"
- numeric_changes = [0] * 0x110000
- # normalization_changes is a list of key-value pairs
- normalization_changes = []
- for i in range(0x110000):
- if new.table[i] is None:
- # Characters unassigned in the new version ought to
- # be unassigned in the old one
- assert old.table[i] is None
- continue
- # check characters unassigned in the old version
- if old.table[i] is None:
- # category 0 is "unassigned"
- category_changes[i] = 0
- continue
- # check characters that differ
- if old.table[i] != new.table[i]:
- for k in range(len(old.table[i])):
- if old.table[i][k] != new.table[i][k]:
- value = old.table[i][k]
- if k == 2:
- #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
- category_changes[i] = CATEGORY_NAMES.index(value)
- elif k == 4:
- #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
- bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
- elif k == 5:
- #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
- # We assume that all normalization changes are in 1:1 mappings
- assert " " not in value
- normalization_changes.append((i, value))
- elif k == 6:
- #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
- # we only support changes where the old value is a single digit
- assert value in "0123456789"
- decimal_changes[i] = int(value)
- elif k == 8:
- # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
- # Since 0 encodes "no change", the old value is better not 0
- assert value != "0" and value != "-1"
- if not value:
- numeric_changes[i] = -1
- else:
- assert re.match("^[0-9]+$", value)
- numeric_changes[i] = int(value)
- elif k == 11:
- # change to ISO comment, ignore
- pass
- elif k == 12:
- # change to simple uppercase mapping; ignore
- pass
- elif k == 13:
- # change to simple lowercase mapping; ignore
- pass
- elif k == 14:
- # change to simple titlecase mapping; ignore
- pass
- else:
- class Difference(Exception):pass
- raise Difference, (hex(i), k, old.table[i], new.table[i])
- new.changed.append((version, zip(bidir_changes, category_changes,
- decimal_changes, numeric_changes),
- normalization_changes))
-
-
-# --------------------------------------------------------------------
-# the following support code is taken from the unidb utilities
-# Copyright (c) 1999-2000 by Secret Labs AB
-
-# load a unicode-data file from disk
-
-import sys
-
-class UnicodeData:
-
- def __init__(self, filename, exclusions, eastasianwidth, expand=1):
- self.changed = []
- file = open(filename)
- table = [None] * 0x110000
- while 1:
- s = file.readline()
- if not s:
- break
- s = s.strip().split(";")
- char = int(s[0], 16)
- table[char] = s
-
- # expand first-last ranges
- if expand:
- field = None
- for i in range(0, 0x110000):
- s = table[i]
- if s:
- if s[1][-6:] == "First>":
- s[1] = ""
- field = s
- elif s[1][-5:] == "Last>":
- s[1] = ""
- field = None
- elif field:
- f2 = field[:]
- f2[0] = "%X" % i
- table[i] = f2
-
- # public attributes
- self.filename = filename
- self.table = table
- self.chars = range(0x110000) # unicode 3.2
-
- file = open(exclusions)
- self.exclusions = {}
- for s in file:
- s = s.strip()
- if not s:
- continue
- if s[0] == '#':
- continue
- char = int(s.split()[0],16)
- self.exclusions[char] = 1
-
- widths = [None] * 0x110000
- for s in open(eastasianwidth):
- s = s.strip()
- if not s:
- continue
- if s[0] == '#':
- continue
- s = s.split()[0].split(';')
- if '..' in s[0]:
- first, last = [int(c, 16) for c in s[0].split('..')]
- chars = range(first, last+1)
- else:
- chars = [int(s[0], 16)]
- for char in chars:
- widths[char] = s[1]
- for i in range(0, 0x110000):
- if table[i] is not None:
- table[i].append(widths[i])
-
- def uselatin1(self):
- # restrict character range to ISO Latin 1
- self.chars = range(256)
-
-# hash table tools
-
-# this is a straight-forward reimplementation of Python's built-in
-# dictionary type, using a static data structure, and a custom string
-# hash algorithm.
-
-def myhash(s, magic):
- h = 0
- for c in map(ord, s.upper()):
- h = (h * magic) + c
- ix = h & 0xff000000L
- if ix:
- h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
- return h
-
-SIZES = [
- (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
- (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
- (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
- (2097152,5), (4194304,3), (8388608,33), (16777216,27)
-]
-
-class Hash:
- def __init__(self, name, data, magic):
- # turn a (key, value) list into a static hash table structure
-
- # determine table size
- for size, poly in SIZES:
- if size > len(data):
- poly = size + poly
- break
- else:
- raise AssertionError, "ran out of polynomials"
-
- print size, "slots in hash table"
-
- table = [None] * size
-
- mask = size-1
-
- n = 0
-
- hash = myhash
-
- # initialize hash table
- for key, value in data:
- h = hash(key, magic)
- i = (~h) & mask
- v = table[i]
- if v is None:
- table[i] = value
- continue
- incr = (h ^ (h >> 3)) & mask;
- if not incr:
- incr = mask
- while 1:
- n = n + 1
- i = (i + incr) & mask
- v = table[i]
- if v is None:
- table[i] = value
- break
- incr = incr << 1
- if incr > mask:
- incr = incr ^ poly
-
- print n, "collisions"
- self.collisions = n
-
- for i in range(len(table)):
- if table[i] is None:
- table[i] = 0
-
- self.data = Array(name + "_hash", table)
- self.magic = magic
- self.name = name
- self.size = size
- self.poly = poly
-
- def dump(self, file, trace):
- # write data to file, as a C array
- self.data.dump(file, trace)
- file.write("#define %s_magic %d\n" % (self.name, self.magic))
- file.write("#define %s_size %d\n" % (self.name, self.size))
- file.write("#define %s_poly %d\n" % (self.name, self.poly))
-
-# stuff to deal with arrays of unsigned integers
-
-class Array:
-
- def __init__(self, name, data):
- self.name = name
- self.data = data
-
- def dump(self, file, trace=0):
- # write data to file, as a C array
- size = getsize(self.data)
- if trace:
- print >>sys.stderr, self.name+":", size*len(self.data), "bytes"
- file.write("static ")
- if size == 1:
- file.write("unsigned char")
- elif size == 2:
- file.write("unsigned short")
- else:
- file.write("unsigned int")
- file.write(" " + self.name + "[] = {\n")
- if self.data:
- s = " "
- for item in self.data:
- i = str(item) + ", "
- if len(s) + len(i) > 78:
- file.write(s + "\n")
- s = " " + i
- else:
- s = s + i
- if s.strip():
- file.write(s + "\n")
- file.write("};\n\n")
-
-def getsize(data):
- # return smallest possible integer size for the given array
- maxdata = max(data)
- if maxdata < 256:
- return 1
- elif maxdata < 65536:
- return 2
- else:
- return 4
-
-def splitbins(t, trace=0):
- """t, trace=0 -> (t1, t2, shift). Split a table to save space.
-
- t is a sequence of ints. This function can be useful to save space if
- many of the ints are the same. t1 and t2 are lists of ints, and shift
- is an int, chosen to minimize the combined size of t1 and t2 (in C
- code), and where for each i in range(len(t)),
- t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
- where mask is a bitmask isolating the last "shift" bits.
-
- If optional arg trace is non-zero (default zero), progress info
- is printed to sys.stderr. The higher the value, the more info
- you'll get.
- """
-
- import sys
- if trace:
- def dump(t1, t2, shift, bytes):
- print >>sys.stderr, "%d+%d bins at shift %d; %d bytes" % (
- len(t1), len(t2), shift, bytes)
- print >>sys.stderr, "Size of original table:", len(t)*getsize(t), \
- "bytes"
- n = len(t)-1 # last valid index
- maxshift = 0 # the most we can shift n and still have something left
- if n > 0:
- while n >> 1:
- n >>= 1
- maxshift += 1
- del n
- bytes = sys.maxint # smallest total size so far
- t = tuple(t) # so slices can be dict keys
- for shift in range(maxshift + 1):
- t1 = []
- t2 = []
- size = 2**shift
- bincache = {}
- for i in range(0, len(t), size):
- bin = t[i:i+size]
- index = bincache.get(bin)
- if index is None:
- index = len(t2)
- bincache[bin] = index
- t2.extend(bin)
- t1.append(index >> shift)
- # determine memory size
- b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
- if trace > 1:
- dump(t1, t2, shift, b)
- if b < bytes:
- best = t1, t2, shift
- bytes = b
- t1, t2, shift = best
- if trace:
- print >>sys.stderr, "Best:",
- dump(t1, t2, shift, bytes)
- if __debug__:
- # exhaustively verify that the decomposition is correct
- mask = ~((~0) << shift) # i.e., low-bit mask of shift bits
- for i in xrange(len(t)):
- assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
- return best
-
-if __name__ == "__main__":
- maketables(1)
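
The space-saving trick at the heart of the deleted script is the two-level compressed lookup produced by splitbins(): a block-index table plus a block-data table, consumed on the C side through the generated SHIFT macros and index1/index2 arrays. The following is a minimal, hypothetical sketch of that scheme; it is not part of the deleted file or of this commit, the names split_table and lookup are illustrative only, and unlike the real splitbins() it takes the shift as a fixed parameter instead of searching all shifts for the smallest combined size.

def split_table(t, shift):
    # Split t into a block-index table (t1) and a block-data table (t2),
    # deduplicating identical blocks of length 2**shift.
    size = 2 ** shift
    t1, t2, seen = [], [], {}
    for i in range(0, len(t), size):
        block = tuple(t[i:i+size])
        if block not in seen:
            seen[block] = len(t2)
            t2.extend(block)
        t1.append(seen[block] >> shift)
    return t1, t2

def lookup(t1, t2, shift, i):
    # Mirrors the access pattern in the generated C headers:
    #   index2[(index1[i >> SHIFT] << SHIFT) + (i & ((1 << SHIFT) - 1))]
    mask = (1 << shift) - 1
    return t2[(t1[i >> shift] << shift) + (i & mask)]

if __name__ == "__main__":
    table = [0, 0, 0, 0, 7, 7, 7, 7, 0, 0, 0, 0, 3, 3, 3, 3]
    t1, t2 = split_table(table, 2)
    # every original entry is recoverable through the two-level lookup
    assert all(lookup(t1, t2, 2, i) == table[i] for i in range(len(table)))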