diff options
author | cinap_lenrek <cinap_lenrek@localhost> | 2011-05-03 11:25:13 +0000 |
---|---|---|
committer | cinap_lenrek <cinap_lenrek@localhost> | 2011-05-03 11:25:13 +0000 |
commit | 458120dd40db6b4df55a4e96b650e16798ef06a0 (patch) | |
tree | 8f82685be24fef97e715c6f5ca4c68d34d5074ee /sys/lib/python/encodings/utf_8_sig.py | |
parent | 3a742c699f6806c1145aea5149bf15de15a0afd7 (diff) |
add hg and python
Diffstat (limited to 'sys/lib/python/encodings/utf_8_sig.py')
-rw-r--r-- | sys/lib/python/encodings/utf_8_sig.py | 100 |
1 files changed, 100 insertions, 0 deletions
diff --git a/sys/lib/python/encodings/utf_8_sig.py b/sys/lib/python/encodings/utf_8_sig.py new file mode 100644 index 000000000..d751da69c --- /dev/null +++ b/sys/lib/python/encodings/utf_8_sig.py @@ -0,0 +1,100 @@ +""" Python 'utf-8-sig' Codec +This work similar to UTF-8 with the following changes: + +* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the + first three bytes. + +* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these + bytes will be skipped. +""" +import codecs + +### Codec APIs + +def encode(input, errors='strict'): + return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input)) + +def decode(input, errors='strict'): + prefix = 0 + if input[:3] == codecs.BOM_UTF8: + input = input[3:] + prefix = 3 + (output, consumed) = codecs.utf_8_decode(input, errors, True) + return (output, consumed+prefix) + +class IncrementalEncoder(codecs.IncrementalEncoder): + def __init__(self, errors='strict'): + codecs.IncrementalEncoder.__init__(self, errors) + self.first = True + + def encode(self, input, final=False): + if self.first: + self.first = False + return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0] + else: + return codecs.utf_8_encode(input, self.errors)[0] + + def reset(self): + codecs.IncrementalEncoder.reset(self) + self.first = True + +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + def __init__(self, errors='strict'): + codecs.BufferedIncrementalDecoder.__init__(self, errors) + self.first = True + + def _buffer_decode(self, input, errors, final): + if self.first and codecs.BOM_UTF8.startswith(input): # might be a BOM + if len(input) < 3: + # not enough data to decide if this really is a BOM + # => try again on the next call + return (u"", 0) + (output, consumed) = codecs.utf_8_decode(input[3:], errors, final) + self.first = False + return (output, consumed+3) + return codecs.utf_8_decode(input, errors, final) + + def reset(self): + codecs.BufferedIncrementalDecoder.reset(self) + self.first = True + +class StreamWriter(codecs.StreamWriter): + def reset(self): + codecs.StreamWriter.reset(self) + try: + del self.encode + except AttributeError: + pass + + def encode(self, input, errors='strict'): + self.encode = codecs.utf_8_encode + return encode(input, errors) + +class StreamReader(codecs.StreamReader): + def reset(self): + codecs.StreamReader.reset(self) + try: + del self.decode + except AttributeError: + pass + + def decode(self, input, errors='strict'): + if len(input) < 3 and codecs.BOM_UTF8.startswith(input): + # not enough data to decide if this is a BOM + # => try again on the next call + return (u"", 0) + self.decode = codecs.utf_8_decode + return decode(input, errors) + +### encodings module API + +def getregentry(): + return codecs.CodecInfo( + name='utf-8-sig', + encode=encode, + decode=decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamreader=StreamReader, + streamwriter=StreamWriter, + ) |