python, hg: tow outside the environment.

they've served us well, and can ride off into the sunset.
author: Ori Bernstein <ori@eigenstate.org> 2021-06-14 00:00:37 +0000
committer: Ori Bernstein <ori@eigenstate.org> 2021-06-14 00:00:37 +0000
commit: a73a964e51247ed169d322c725a3a18859f109a3 (patch)
tree: 3f752d117274d444bda44e85609aeac1acf313f3 /sys/lib/python/csv.py
parent: e64efe273fcb921a61bf27d33b230c4e64fcd425 (diff)
1 files changed, 0 insertions, 415 deletions
diff --git a/sys/lib/python/csv.py b/sys/lib/python/csv.py
deleted file mode 100644
index f21385478..000000000
--- a/sys/lib/python/csv.py
+++ /dev/null
@@ -1,415 +0,0 @@
-
-"""
-csv.py - read/write/investigate CSV files
-"""
-
-import re
-from _csv import Error, __version__, writer, reader, register_dialect, \
-                 unregister_dialect, get_dialect, list_dialects, \
-                 field_size_limit, \
-                 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
-                 __doc__
-from _csv import Dialect as _Dialect
-
-try:
-    from cStringIO import StringIO
-except ImportError:
-    from StringIO import StringIO
-
-__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
-            "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
-            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
-            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
-
-class Dialect:
-    """Describe an Excel dialect.
-
-    This must be subclassed (see csv.excel).  Valid attributes are:
-    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
-    lineterminator, quoting.
-
-    """
-    _name = ""
-    _valid = False
-    # placeholders
-    delimiter = None
-    quotechar = None
-    escapechar = None
-    doublequote = None
-    skipinitialspace = None
-    lineterminator = None
-    quoting = None
-
-    def __init__(self):
-        if self.__class__ != Dialect:
-            self._valid = True
-        self._validate()
-
-    def _validate(self):
-        try:
-            _Dialect(self)
-        except TypeError, e:
-            # We do this for compatibility with py2.3
-            raise Error(str(e))
-
-class excel(Dialect):
-    """Describe the usual properties of Excel-generated CSV files."""
-    delimiter = ','
-    quotechar = '"'
-    doublequote = True
-    skipinitialspace = False
-    lineterminator = '\r\n'
-    quoting = QUOTE_MINIMAL
-register_dialect("excel", excel)
-
-class excel_tab(excel):
-    """Describe the usual properties of Excel-generated TAB-delimited files."""
-    delimiter = '\t'
-register_dialect("excel-tab", excel_tab)
-
-
-class DictReader:
-    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
-                 dialect="excel", *args, **kwds):
-        self.fieldnames = fieldnames    # list of keys for the dict
-        self.restkey = restkey          # key to catch long rows
-        self.restval = restval          # default value for short rows
-        self.reader = reader(f, dialect, *args, **kwds)
-
-    def __iter__(self):
-        return self
-
-    def next(self):
-        row = self.reader.next()
-        if self.fieldnames is None:
-            self.fieldnames = row
-            row = self.reader.next()
-
-        # unlike the basic reader, we prefer not to return blanks,
-        # because we will typically wind up with a dict full of None
-        # values
-        while row == []:
-            row = self.reader.next()
-        d = dict(zip(self.fieldnames, row))
-        lf = len(self.fieldnames)
-        lr = len(row)
-        if lf < lr:
-            d[self.restkey] = row[lf:]
-        elif lf > lr:
-            for key in self.fieldnames[lr:]:
-                d[key] = self.restval
-        return d
-
-
-class DictWriter:
-    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
-                 dialect="excel", *args, **kwds):
-        self.fieldnames = fieldnames    # list of keys for the dict
-        self.restval = restval          # for writing short dicts
-        if extrasaction.lower() not in ("raise", "ignore"):
-            raise ValueError, \
-                  ("extrasaction (%s) must be 'raise' or 'ignore'" %
-                   extrasaction)
-        self.extrasaction = extrasaction
-        self.writer = writer(f, dialect, *args, **kwds)
-
-    def _dict_to_list(self, rowdict):
-        if self.extrasaction == "raise":
-            for k in rowdict.keys():
-                if k not in self.fieldnames:
-                    raise ValueError, "dict contains fields not in fieldnames"
-        return [rowdict.get(key, self.restval) for key in self.fieldnames]
-
-    def writerow(self, rowdict):
-        return self.writer.writerow(self._dict_to_list(rowdict))
-
-    def writerows(self, rowdicts):
-        rows = []
-        for rowdict in rowdicts:
-            rows.append(self._dict_to_list(rowdict))
-        return self.writer.writerows(rows)
-
-# Guard Sniffer's type checking against builds that exclude complex()
-try:
-    complex
-except NameError:
-    complex = float
-
-class Sniffer:
-    '''
-    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
-    Returns a Dialect object.
-    '''
-    def __init__(self):
-        # in case there is more than one possible delimiter
-        self.preferred = [',', '\t', ';', ' ', ':']
-
-
-    def sniff(self, sample, delimiters=None):
-        """
-        Returns a dialect (or None) corresponding to the sample
-        """
-
-        quotechar, delimiter, skipinitialspace = \
-                   self._guess_quote_and_delimiter(sample, delimiters)
-        if not delimiter:
-            delimiter, skipinitialspace = self._guess_delimiter(sample,
-                                                                delimiters)
-
-        if not delimiter:
-            raise Error, "Could not determine delimiter"
-
-        class dialect(Dialect):
-            _name = "sniffed"
-            lineterminator = '\r\n'
-            quoting = QUOTE_MINIMAL
-            # escapechar = ''
-            doublequote = False
-
-        dialect.delimiter = delimiter
-        # _csv.reader won't accept a quotechar of ''
-        dialect.quotechar = quotechar or '"'
-        dialect.skipinitialspace = skipinitialspace
-
-        return dialect
-
-
-    def _guess_quote_and_delimiter(self, data, delimiters):
-        """
-        Looks for text enclosed between two identical quotes
-        (the probable quotechar) which are preceded and followed
-        by the same character (the probable delimiter).
-        For example:
-                         ,'some text',
-        The quote with the most wins, same with the delimiter.
-        If there is no quotechar the delimiter can't be determined
-        this way.
-        """
-
-        matches = []
-        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
-                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
-                      '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
-                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
-            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
-            matches = regexp.findall(data)
-            if matches:
-                break
-
-        if not matches:
-            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
-
-        quotes = {}
-        delims = {}
-        spaces = 0
-        for m in matches:
-            n = regexp.groupindex['quote'] - 1
-            key = m[n]
-            if key:
-                quotes[key] = quotes.get(key, 0) + 1
-            try:
-                n = regexp.groupindex['delim'] - 1
-                key = m[n]
-            except KeyError:
-                continue
-            if key and (delimiters is None or key in delimiters):
-                delims[key] = delims.get(key, 0) + 1
-            try:
-                n = regexp.groupindex['space'] - 1
-            except KeyError:
-                continue
-            if m[n]:
-                spaces += 1
-
-        quotechar = reduce(lambda a, b, quotes = quotes:
-                           (quotes[a] > quotes[b]) and a or b, quotes.keys())
-
-        if delims:
-            delim = reduce(lambda a, b, delims = delims:
-                           (delims[a] > delims[b]) and a or b, delims.keys())
-            skipinitialspace = delims[delim] == spaces
-            if delim == '\n': # most likely a file with a single column
-                delim = ''
-        else:
-            # there is *no* delimiter, it's a single column of quoted data
-            delim = ''
-            skipinitialspace = 0
-
-        return (quotechar, delim, skipinitialspace)
-
-
-    def _guess_delimiter(self, data, delimiters):
-        """
-        The delimiter /should/ occur the same number of times on
-        each row. However, due to malformed data, it may not. We don't want
-        an all or nothing approach, so we allow for small variations in this
-        number.
-          1) build a table of the frequency of each character on every line.
-          2) build a table of freqencies of this frequency (meta-frequency?),
-             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
-             7 times in 2 rows'
-          3) use the mode of the meta-frequency to determine the /expected/
-             frequency for that character
-          4) find out how often the character actually meets that goal
-          5) the character that best meets its goal is the delimiter
-        For performance reasons, the data is evaluated in chunks, so it can
-        try and evaluate the smallest portion of the data possible, evaluating
-        additional chunks as necessary.
-        """
-
-        data = filter(None, data.split('\n'))
-
-        ascii = [chr(c) for c in range(127)] # 7-bit ASCII
-
-        # build frequency tables
-        chunkLength = min(10, len(data))
-        iteration = 0
-        charFrequency = {}
-        modes = {}
-        delims = {}
-        start, end = 0, min(chunkLength, len(data))
-        while start < len(data):
-            iteration += 1
-            for line in data[start:end]:
-                for char in ascii:
-                    metaFrequency = charFrequency.get(char, {})
-                    # must count even if frequency is 0
-                    freq = line.count(char)
-                    # value is the mode
-                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
-                    charFrequency[char] = metaFrequency
-
-            for char in charFrequency.keys():
-                items = charFrequency[char].items()
-                if len(items) == 1 and items[0][0] == 0:
-                    continue
-                # get the mode of the frequencies
-                if len(items) > 1:
-                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
-                                         items)
-                    # adjust the mode - subtract the sum of all
-                    # other frequencies
-                    items.remove(modes[char])
-                    modes[char] = (modes[char][0], modes[char][1]
-                                   - reduce(lambda a, b: (0, a[1] + b[1]),
-                                            items)[1])
-                else:
-                    modes[char] = items[0]
-
-            # build a list of possible delimiters
-            modeList = modes.items()
-            total = float(chunkLength * iteration)
-            # (rows of consistent data) / (number of rows) = 100%
-            consistency = 1.0
-            # minimum consistency threshold
-            threshold = 0.9
-            while len(delims) == 0 and consistency >= threshold:
-                for k, v in modeList:
-                    if v[0] > 0 and v[1] > 0:
-                        if ((v[1]/total) >= consistency and
-                            (delimiters is None or k in delimiters)):
-                            delims[k] = v
-                consistency -= 0.01
-
-            if len(delims) == 1:
-                delim = delims.keys()[0]
-                skipinitialspace = (data[0].count(delim) ==
-                                    data[0].count("%c " % delim))
-                return (delim, skipinitialspace)
-
-            # analyze another chunkLength lines
-            start = end
-            end += chunkLength
-
-        if not delims:
-            return ('', 0)
-
-        # if there's more than one, fall back to a 'preferred' list
-        if len(delims) > 1:
-            for d in self.preferred:
-                if d in delims.keys():
-                    skipinitialspace = (data[0].count(d) ==
-                                        data[0].count("%c " % d))
-                    return (d, skipinitialspace)
-
-        # nothing else indicates a preference, pick the character that
-        # dominates(?)
-        items = [(v,k) for (k,v) in delims.items()]
-        items.sort()
-        delim = items[-1][1]
-
-        skipinitialspace = (data[0].count(delim) ==
-                            data[0].count("%c " % delim))
-        return (delim, skipinitialspace)
-
-
-    def has_header(self, sample):
-        # Creates a dictionary of types of data in each column. If any
-        # column is of a single type (say, integers), *except* for the first
-        # row, then the first row is presumed to be labels. If the type
-        # can't be determined, it is assumed to be a string in which case
-        # the length of the string is the determining factor: if all of the
-        # rows except for the first are the same length, it's a header.
-        # Finally, a 'vote' is taken at the end for each column, adding or
-        # subtracting from the likelihood of the first row being a header.
-
-        rdr = reader(StringIO(sample), self.sniff(sample))
-
-        header = rdr.next() # assume first row is header
-
-        columns = len(header)
-        columnTypes = {}
-        for i in range(columns): columnTypes[i] = None
-
-        checked = 0
-        for row in rdr:
-            # arbitrary number of rows to check, to keep it sane
-            if checked > 20:
-                break
-            checked += 1
-
-            if len(row) != columns:
-                continue # skip rows that have irregular number of columns
-
-            for col in columnTypes.keys():
-
-                for thisType in [int, long, float, complex]:
-                    try:
-                        thisType(row[col])
-                        break
-                    except (ValueError, OverflowError):
-                        pass
-                else:
-                    # fallback to length of string
-                    thisType = len(row[col])
-
-                # treat longs as ints
-                if thisType == long:
-                    thisType = int
-
-                if thisType != columnTypes[col]:
-                    if columnTypes[col] is None: # add new column type
-                        columnTypes[col] = thisType
-                    else:
-                        # type is inconsistent, remove column from
-                        # consideration
-                        del columnTypes[col]
-
-        # finally, compare results against first row and "vote"
-        # on whether it's a header
-        hasHeader = 0
-        for col, colType in columnTypes.items():
-            if type(colType) == type(0): # it's a length
-                if len(header[col]) != colType:
-                    hasHeader += 1
-                else:
-                    hasHeader -= 1
-            else: # attempt typecast
-                try:
-                    colType(header[col])
-                except (ValueError, TypeError):
-                    hasHeader += 1
-                else:
-                    hasHeader -= 1
-
-        return hasHeader > 0
author	Ori Bernstein <ori@eigenstate.org>	2021-06-14 00:00:37 +0000
committer	Ori Bernstein <ori@eigenstate.org>	2021-06-14 00:00:37 +0000
commit	a73a964e51247ed169d322c725a3a18859f109a3 (patch)
tree	3f752d117274d444bda44e85609aeac1acf313f3 /sys/lib/python/csv.py
parent	e64efe273fcb921a61bf27d33b230c4e64fcd425 (diff)