mirror of
https://github.com/python/cpython
synced 2024-07-21 02:05:18 +00:00
SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support
decoding incomplete input (when the input stream is temporarily exhausted). codecs.StreamReader now implements buffering, which enables proper readline support for the UTF-16 decoders. codecs.StreamReader.read() has a new argument chars which specifies the number of characters to return. codecs.StreamReader.readline() and codecs.StreamReader.readlines() have a new argument keepends. Trailing "\n"s will be stripped from the lines if keepends is false. Added C APIs PyUnicode_DecodeUTF8Stateful and PyUnicode_DecodeUTF16Stateful.
This commit is contained in:
parent
a708d6e3b0
commit
69652035bc
|
@ -1076,6 +1076,17 @@ These are the UTF-8 codec APIs:
|
|||
by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeUTF8Stateful}{const char *s,
|
||||
int size,
|
||||
const char *errors,
|
||||
int *consumed}
|
||||
If \var{consumed} is \NULL{}, behaves like \cfunction{PyUnicode_DecodeUTF8()}.
|
||||
If \var{consumed} is not \NULL{}, trailing incomplete UTF-8 byte sequences
|
||||
will not be treated as an error. Those bytes will not be decoded and the
|
||||
number of bytes that have been decoded will be stored in \var{consumed}.
|
||||
\versionadded{2.4}
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeUTF8}{const Py_UNICODE *s,
|
||||
int size,
|
||||
const char *errors}
|
||||
|
@ -1121,6 +1132,20 @@ These are the UTF-16 codec APIs:
|
|||
Returns \NULL{} if an exception was raised by the codec.
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeUTF16Stateful}{const char *s,
|
||||
int size,
|
||||
const char *errors,
|
||||
int *byteorder,
|
||||
int *consumed}
|
||||
If \var{consumed} is \NULL{}, behaves like
|
||||
\cfunction{PyUnicode_DecodeUTF16()}. If \var{consumed} is not \NULL{},
|
||||
\cfunction{PyUnicode_DecodeUTF16Stateful()} will not treat trailing incomplete
|
||||
UTF-16 byte sequences (i.e. an odd number of bytes or a split surrogate pair)
|
||||
as an error. Those bytes will not be decoded and the number of bytes that
|
||||
have been decoded will be stored in \var{consumed}.
|
||||
\versionadded{2.4}
|
||||
\end{cfuncdesc}
|
||||
|
||||
\begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeUTF16}{const Py_UNICODE *s,
|
||||
int size,
|
||||
const char *errors,
|
||||
|
|
|
@ -394,9 +394,14 @@ order to be compatible to the Python codec registry.
|
|||
be extended with \function{register_error()}.
|
||||
\end{classdesc}
|
||||
|
||||
\begin{methoddesc}{read}{\optional{size}}
|
||||
\begin{methoddesc}{read}{\optional{size\optional{, chars}}}
|
||||
Decodes data from the stream and returns the resulting object.
|
||||
|
||||
\var{chars} indicates the number of characters to read from the
|
||||
stream. \function{read()} will never return more than \var{chars}
|
||||
characters, but it might return less, if there are not enough
|
||||
characters available.
|
||||
|
||||
\var{size} indicates the approximate maximum number of bytes to read
|
||||
from the stream for decoding purposes. The decoder can modify this
|
||||
setting as appropriate. The default value -1 indicates to read and
|
||||
|
@ -407,29 +412,29 @@ order to be compatible to the Python codec registry.
|
|||
read as much data as is allowed within the definition of the encoding
|
||||
and the given size, e.g. if optional encoding endings or state
|
||||
markers are available on the stream, these should be read too.
|
||||
|
||||
\versionchanged[\var{chars} argument added]{2.4}
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}{readline}{[size]}
|
||||
\begin{methoddesc}{readline}{\optional{size\optional{, keepends}}}
|
||||
Read one line from the input stream and return the
|
||||
decoded data.
|
||||
|
||||
Unlike the \method{readlines()} method, this method inherits
|
||||
the line breaking knowledge from the underlying stream's
|
||||
\method{readline()} method -- there is currently no support for line
|
||||
breaking using the codec decoder due to lack of line buffering.
|
||||
Subclasses should, however, if possible, try to implement this method
|
||||
using their own knowledge of line breaking.
|
||||
|
||||
\var{size}, if given, is passed as size argument to the stream's
|
||||
\method{readline()} method.
|
||||
|
||||
If \var{keepends} is false, line ends will be stripped from the
|
||||
lines returned.
|
||||
|
||||
\versionchanged[\var{keepends} argument added]{2.4}
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}{readlines}{[sizehint]}
|
||||
\begin{methoddesc}{readlines}{\optional{sizehint\optional{, keepends}}}
|
||||
Read all lines available on the input stream and return them as list
|
||||
of lines.
|
||||
|
||||
Line breaks are implemented using the codec's decoder method and are
|
||||
included in the list entries.
|
||||
included in the list entries if \var{keepends} is true.
|
||||
|
||||
\var{sizehint}, if given, is passed as \var{size} argument to the
|
||||
stream's \method{read()} method.
|
||||
|
|
|
@ -160,7 +160,9 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
|
||||
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
|
||||
# define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
|
||||
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS2_DecodeUTF16Stateful
|
||||
# define PyUnicode_DecodeUTF8 PyUnicodeUCS2_DecodeUTF8
|
||||
# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS2_DecodeUTF8Stateful
|
||||
# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS2_DecodeUnicodeEscape
|
||||
# define PyUnicode_Encode PyUnicodeUCS2_Encode
|
||||
# define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
|
||||
|
@ -233,7 +235,9 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
|
||||
# define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
|
||||
# define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
|
||||
# define PyUnicode_DecodeUTF16Stateful PyUnicodeUCS4_DecodeUTF16Stateful
|
||||
# define PyUnicode_DecodeUTF8 PyUnicodeUCS4_DecodeUTF8
|
||||
# define PyUnicode_DecodeUTF8Stateful PyUnicodeUCS4_DecodeUTF8Stateful
|
||||
# define PyUnicode_DecodeUnicodeEscape PyUnicodeUCS4_DecodeUnicodeEscape
|
||||
# define PyUnicode_Encode PyUnicodeUCS4_Encode
|
||||
# define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
|
||||
|
@ -658,6 +662,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
|
|||
const char *errors /* error handling */
|
||||
);
|
||||
|
||||
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
|
||||
const char *string, /* UTF-8 encoded string */
|
||||
int length, /* size of string */
|
||||
const char *errors, /* error handling */
|
||||
int *consumed /* bytes consumed */
|
||||
);
|
||||
|
||||
PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
|
||||
PyObject *unicode /* Unicode object */
|
||||
);
|
||||
|
@ -702,6 +713,16 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
|
|||
exit */
|
||||
);
|
||||
|
||||
PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
|
||||
const char *string, /* UTF-16 encoded string */
|
||||
int length, /* size of string */
|
||||
const char *errors, /* error handling */
|
||||
int *byteorder, /* pointer to byteorder to use
|
||||
0=native;-1=LE,1=BE; updated on
|
||||
exit */
|
||||
int *consumed /* bytes consumed */
|
||||
);
|
||||
|
||||
/* Returns a Python string using the UTF-16 encoding in native byte
|
||||
order. The string always starts with a BOM mark. */
|
||||
|
||||
|
|
108
Lib/codecs.py
108
Lib/codecs.py
|
@ -228,12 +228,22 @@ def __init__(self, stream, errors='strict'):
|
|||
"""
|
||||
self.stream = stream
|
||||
self.errors = errors
|
||||
self.bytebuffer = ""
|
||||
self.charbuffer = u""
|
||||
|
||||
def read(self, size=-1):
|
||||
def decode(self, input, errors='strict'):
|
||||
raise NotImplementedError
|
||||
|
||||
def read(self, size=-1, chars=-1):
|
||||
|
||||
""" Decodes data from the stream self.stream and returns the
|
||||
resulting object.
|
||||
|
||||
chars indicates the number of characters to read from the
|
||||
stream. read() will never return more than chars
|
||||
characters, but it might return less, if there are not enough
|
||||
characters available.
|
||||
|
||||
size indicates the approximate maximum number of bytes to
|
||||
read from the stream for decoding purposes. The decoder
|
||||
can modify this setting as appropriate. The default value
|
||||
|
@ -248,54 +258,70 @@ def read(self, size=-1):
|
|||
on the stream, these should be read too.
|
||||
|
||||
"""
|
||||
# Unsliced reading:
|
||||
if size < 0:
|
||||
return self.decode(self.stream.read(), self.errors)[0]
|
||||
|
||||
# Sliced reading:
|
||||
read = self.stream.read
|
||||
decode = self.decode
|
||||
data = read(size)
|
||||
i = 0
|
||||
while 1:
|
||||
try:
|
||||
object, decodedbytes = decode(data, self.errors)
|
||||
except ValueError, why:
|
||||
# This method is slow but should work under pretty much
|
||||
# all conditions; at most 10 tries are made
|
||||
i = i + 1
|
||||
newdata = read(1)
|
||||
if not newdata or i > 10:
|
||||
raise
|
||||
data = data + newdata
|
||||
# read until we get the required number of characters (if available)
|
||||
done = False
|
||||
while True:
|
||||
# can the request be satisfied from the character buffer?
|
||||
if chars < 0:
|
||||
if self.charbuffer:
|
||||
done = True
|
||||
else:
|
||||
return object
|
||||
if len(self.charbuffer) >= chars:
|
||||
done = True
|
||||
if done:
|
||||
if chars < 0:
|
||||
result = self.charbuffer
|
||||
self.charbuffer = u""
|
||||
break
|
||||
else:
|
||||
result = self.charbuffer[:chars]
|
||||
self.charbuffer = self.charbuffer[chars:]
|
||||
break
|
||||
# we need more data
|
||||
if size < 0:
|
||||
newdata = self.stream.read()
|
||||
else:
|
||||
newdata = self.stream.read(size)
|
||||
data = self.bytebuffer + newdata
|
||||
object, decodedbytes = self.decode(data, self.errors)
|
||||
# keep undecoded bytes until the next call
|
||||
self.bytebuffer = data[decodedbytes:]
|
||||
# put new characters in the character buffer
|
||||
self.charbuffer += object
|
||||
# there was no data available
|
||||
if not newdata:
|
||||
done = True
|
||||
return result
|
||||
|
||||
def readline(self, size=None):
|
||||
def readline(self, size=None, keepends=True):
|
||||
|
||||
""" Read one line from the input stream and return the
|
||||
decoded data.
|
||||
|
||||
Note: Unlike the .readlines() method, this method inherits
|
||||
the line breaking knowledge from the underlying stream's
|
||||
.readline() method -- there is currently no support for
|
||||
line breaking using the codec decoder due to lack of line
|
||||
buffering. Subclasses should however, if possible, try to
|
||||
implement this method using their own knowledge of line
|
||||
breaking.
|
||||
|
||||
size, if given, is passed as size argument to the stream's
|
||||
.readline() method.
|
||||
size, if given, is passed as size argument to the
|
||||
read() method.
|
||||
|
||||
"""
|
||||
if size is None:
|
||||
line = self.stream.readline()
|
||||
else:
|
||||
line = self.stream.readline(size)
|
||||
return self.decode(line, self.errors)[0]
|
||||
size = 10
|
||||
line = u""
|
||||
while True:
|
||||
data = self.read(size)
|
||||
line += data
|
||||
pos = line.find("\n")
|
||||
if pos>=0:
|
||||
self.charbuffer = line[pos+1:] + self.charbuffer
|
||||
if keepends:
|
||||
line = line[:pos+1]
|
||||
else:
|
||||
line = line[:pos]
|
||||
return line
|
||||
elif not data:
|
||||
return line
|
||||
if size<8000:
|
||||
size *= 2
|
||||
|
||||
|
||||
def readlines(self, sizehint=None):
|
||||
def readlines(self, sizehint=None, keepends=True):
|
||||
|
||||
""" Read all lines available on the input stream
|
||||
and return them as list of lines.
|
||||
|
@ -307,8 +333,8 @@ def readlines(self, sizehint=None):
|
|||
way to finding the true end-of-line.
|
||||
|
||||
"""
|
||||
data = self.stream.read()
|
||||
return self.decode(data, self.errors)[0].splitlines(1)
|
||||
data = self.read()
|
||||
return self.splitlines(keepends)
|
||||
|
||||
def reset(self):
|
||||
|
||||
|
|
|
@ -10,54 +10,40 @@
|
|||
|
||||
### Codec APIs
|
||||
|
||||
class Codec(codecs.Codec):
|
||||
encode = codecs.utf_16_encode
|
||||
|
||||
# Note: Binding these as C functions will result in the class not
|
||||
# converting them to methods. This is intended.
|
||||
encode = codecs.utf_16_encode
|
||||
decode = codecs.utf_16_decode
|
||||
def decode(input, errors='strict'):
|
||||
return codecs.utf_16_decode(input, errors, True)
|
||||
|
||||
class StreamWriter(Codec,codecs.StreamWriter):
|
||||
class StreamWriter(codecs.StreamWriter):
|
||||
def __init__(self, stream, errors='strict'):
|
||||
self.bom_written = 0
|
||||
self.bom_written = False
|
||||
codecs.StreamWriter.__init__(self, stream, errors)
|
||||
|
||||
def write(self, data):
|
||||
result = codecs.StreamWriter.write(self, data)
|
||||
if not self.bom_written:
|
||||
self.bom_written = 1
|
||||
if sys.byteorder == 'little':
|
||||
self.encode = codecs.utf_16_le_encode
|
||||
else:
|
||||
self.encode = codecs.utf_16_be_encode
|
||||
def encode(self, input, errors='strict'):
|
||||
self.bom_written = True
|
||||
result = codecs.utf_16_encode(input, errors)
|
||||
if sys.byteorder == 'little':
|
||||
self.encode = codecs.utf_16_le_encode
|
||||
else:
|
||||
self.encode = codecs.utf_16_be_encode
|
||||
return result
|
||||
|
||||
class StreamReader(Codec,codecs.StreamReader):
|
||||
def __init__(self, stream, errors='strict'):
|
||||
self.bom_read = 0
|
||||
codecs.StreamReader.__init__(self, stream, errors)
|
||||
class StreamReader(codecs.StreamReader):
|
||||
|
||||
def read(self, size=-1):
|
||||
if not self.bom_read:
|
||||
signature = self.stream.read(2)
|
||||
if signature == codecs.BOM_BE:
|
||||
self.decode = codecs.utf_16_be_decode
|
||||
elif signature == codecs.BOM_LE:
|
||||
self.decode = codecs.utf_16_le_decode
|
||||
else:
|
||||
raise UnicodeError,"UTF-16 stream does not start with BOM"
|
||||
if size > 2:
|
||||
size -= 2
|
||||
elif size >= 0:
|
||||
size = 0
|
||||
self.bom_read = 1
|
||||
return codecs.StreamReader.read(self, size)
|
||||
|
||||
def readline(self, size=None):
|
||||
raise NotImplementedError, '.readline() is not implemented for UTF-16'
|
||||
def decode(self, input, errors='strict'):
|
||||
(object, consumed, byteorder) = \
|
||||
codecs.utf_16_ex_decode(input, errors, 0, False)
|
||||
if byteorder == -1:
|
||||
self.decode = codecs.utf_16_le_decode
|
||||
elif byteorder == 1:
|
||||
self.decode = codecs.utf_16_be_decode
|
||||
elif consumed>=2:
|
||||
raise UnicodeError,"UTF-16 stream does not start with BOM"
|
||||
return (object, consumed)
|
||||
|
||||
### encodings module API
|
||||
|
||||
def getregentry():
|
||||
|
||||
return (Codec.encode,Codec.decode,StreamReader,StreamWriter)
|
||||
return (encode,decode,StreamReader,StreamWriter)
|
||||
|
|
|
@ -10,23 +10,19 @@
|
|||
|
||||
### Codec APIs
|
||||
|
||||
class Codec(codecs.Codec):
|
||||
encode = codecs.utf_16_be_encode
|
||||
|
||||
# Note: Binding these as C functions will result in the class not
|
||||
# converting them to methods. This is intended.
|
||||
def decode(input, errors='strict'):
|
||||
return codecs.utf_16_be_decode(input, errors, True)
|
||||
|
||||
class StreamWriter(codecs.StreamWriter):
|
||||
encode = codecs.utf_16_be_encode
|
||||
|
||||
class StreamReader(codecs.StreamReader):
|
||||
decode = codecs.utf_16_be_decode
|
||||
|
||||
class StreamWriter(Codec,codecs.StreamWriter):
|
||||
pass
|
||||
|
||||
class StreamReader(Codec,codecs.StreamReader):
|
||||
|
||||
def readline(self, size=None):
|
||||
raise NotImplementedError, '.readline() is not implemented for UTF-16-BE'
|
||||
|
||||
### encodings module API
|
||||
|
||||
def getregentry():
|
||||
|
||||
return (Codec.encode,Codec.decode,StreamReader,StreamWriter)
|
||||
return (encode,decode,StreamReader,StreamWriter)
|
||||
|
|
|
@ -10,23 +10,20 @@
|
|||
|
||||
### Codec APIs
|
||||
|
||||
class Codec(codecs.Codec):
|
||||
encode = codecs.utf_16_le_encode
|
||||
|
||||
# Note: Binding these as C functions will result in the class not
|
||||
# converting them to methods. This is intended.
|
||||
def decode(input, errors='strict'):
|
||||
return codecs.utf_16_le_decode(input, errors, True)
|
||||
|
||||
class StreamWriter(codecs.StreamWriter):
|
||||
encode = codecs.utf_16_le_encode
|
||||
|
||||
class StreamReader(codecs.StreamReader):
|
||||
decode = codecs.utf_16_le_decode
|
||||
|
||||
class StreamWriter(Codec,codecs.StreamWriter):
|
||||
pass
|
||||
|
||||
class StreamReader(Codec,codecs.StreamReader):
|
||||
|
||||
def readline(self, size=None):
|
||||
raise NotImplementedError, '.readline() is not implemented for UTF-16-LE'
|
||||
|
||||
### encodings module API
|
||||
|
||||
def getregentry():
|
||||
|
||||
return (Codec.encode,Codec.decode,StreamReader,StreamWriter)
|
||||
return (encode,decode,StreamReader,StreamWriter)
|
||||
|
|
|
@ -10,21 +10,19 @@
|
|||
|
||||
### Codec APIs
|
||||
|
||||
class Codec(codecs.Codec):
|
||||
encode = codecs.utf_8_encode
|
||||
|
||||
# Note: Binding these as C functions will result in the class not
|
||||
# converting them to methods. This is intended.
|
||||
def decode(input, errors='strict'):
|
||||
return codecs.utf_8_decode(input, errors, True)
|
||||
|
||||
class StreamWriter(codecs.StreamWriter):
|
||||
encode = codecs.utf_8_encode
|
||||
|
||||
class StreamReader(codecs.StreamReader):
|
||||
decode = codecs.utf_8_decode
|
||||
|
||||
class StreamWriter(Codec,codecs.StreamWriter):
|
||||
pass
|
||||
|
||||
class StreamReader(Codec,codecs.StreamReader):
|
||||
pass
|
||||
|
||||
### encodings module API
|
||||
|
||||
def getregentry():
|
||||
|
||||
return (Codec.encode,Codec.decode,StreamReader,StreamWriter)
|
||||
return (encode,decode,StreamReader,StreamWriter)
|
||||
|
|
|
@ -3,7 +3,45 @@
|
|||
import codecs
|
||||
import StringIO
|
||||
|
||||
class UTF16Test(unittest.TestCase):
|
||||
class Queue(object):
|
||||
"""
|
||||
queue: write bytes at one end, read bytes from the other end
|
||||
"""
|
||||
def __init__(self):
|
||||
self._buffer = ""
|
||||
|
||||
def write(self, chars):
|
||||
self._buffer += chars
|
||||
|
||||
def read(self, size=-1):
|
||||
if size<0:
|
||||
s = self._buffer
|
||||
self._buffer = ""
|
||||
return s
|
||||
else:
|
||||
s = self._buffer[:size]
|
||||
self._buffer = self._buffer[size:]
|
||||
return s
|
||||
|
||||
class PartialReadTest(unittest.TestCase):
|
||||
def check_partial(self, encoding, input, partialresults):
|
||||
# get a StreamReader for the encoding and feed the bytestring version
|
||||
# of input to the reader byte by byte. Read everything available from
|
||||
# the StreamReader and check that the results equal the appropriate
|
||||
# entries from partialresults.
|
||||
q = Queue()
|
||||
r = codecs.getreader(encoding)(q)
|
||||
result = u""
|
||||
for (c, partialresult) in zip(input.encode(encoding), partialresults):
|
||||
q.write(c)
|
||||
result += r.read()
|
||||
self.assertEqual(result, partialresult)
|
||||
# check that there's nothing left in the buffers
|
||||
self.assertEqual(r.read(), u"")
|
||||
self.assertEqual(r.bytebuffer, "")
|
||||
self.assertEqual(r.charbuffer, u"")
|
||||
|
||||
class UTF16Test(PartialReadTest):
|
||||
|
||||
spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
|
||||
spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'
|
||||
|
@ -23,6 +61,81 @@ def test_only_one_bom(self):
|
|||
f = reader(s)
|
||||
self.assertEquals(f.read(), u"spamspam")
|
||||
|
||||
def test_partial(self):
|
||||
self.check_partial(
|
||||
"utf-16",
|
||||
u"\x00\xff\u0100\uffff",
|
||||
[
|
||||
u"", # first byte of BOM read
|
||||
u"", # second byte of BOM read => byteorder known
|
||||
u"",
|
||||
u"\x00",
|
||||
u"\x00",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100\uffff",
|
||||
]
|
||||
)
|
||||
|
||||
class UTF16LETest(PartialReadTest):
|
||||
|
||||
def test_partial(self):
|
||||
self.check_partial(
|
||||
"utf-16-le",
|
||||
u"\x00\xff\u0100\uffff",
|
||||
[
|
||||
u"",
|
||||
u"\x00",
|
||||
u"\x00",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100\uffff",
|
||||
]
|
||||
)
|
||||
|
||||
class UTF16BETest(PartialReadTest):
|
||||
|
||||
def test_partial(self):
|
||||
self.check_partial(
|
||||
"utf-16-be",
|
||||
u"\x00\xff\u0100\uffff",
|
||||
[
|
||||
u"",
|
||||
u"\x00",
|
||||
u"\x00",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100",
|
||||
u"\x00\xff\u0100\uffff",
|
||||
]
|
||||
)
|
||||
|
||||
class UTF8Test(PartialReadTest):
|
||||
|
||||
def test_partial(self):
|
||||
self.check_partial(
|
||||
"utf-8",
|
||||
u"\x00\xff\u07ff\u0800\uffff",
|
||||
[
|
||||
u"\x00",
|
||||
u"\x00",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff",
|
||||
u"\x00\xff\u07ff",
|
||||
u"\x00\xff\u07ff",
|
||||
u"\x00\xff\u07ff",
|
||||
u"\x00\xff\u07ff\u0800",
|
||||
u"\x00\xff\u07ff\u0800",
|
||||
u"\x00\xff\u07ff\u0800",
|
||||
u"\x00\xff\u07ff\u0800\uffff",
|
||||
]
|
||||
)
|
||||
|
||||
class EscapeDecodeTest(unittest.TestCase):
|
||||
def test_empty_escape_decode(self):
|
||||
self.assertEquals(codecs.escape_decode(""), ("", 0))
|
||||
|
@ -348,6 +461,9 @@ def test_encode(self):
|
|||
def test_main():
|
||||
test_support.run_unittest(
|
||||
UTF16Test,
|
||||
UTF16LETest,
|
||||
UTF16BETest,
|
||||
UTF8Test,
|
||||
EscapeDecodeTest,
|
||||
RecodingTest,
|
||||
PunycodeTest,
|
||||
|
|
13
Misc/NEWS
13
Misc/NEWS
|
@ -22,7 +22,14 @@ Extension modules
|
|||
Library
|
||||
-------
|
||||
|
||||
...
|
||||
- SF patch #998993: The UTF-8 and the UTF-16 stateful decoders now support
|
||||
decoding incomplete input (when the input stream is temporarily exhausted).
|
||||
``codecs.StreamReader`` now implements buffering, which enables proper
|
||||
readline support for the UTF-16 decoders. ``codecs.StreamReader.read()``
|
||||
has a new argument ``chars`` which specifies the number of characters to
|
||||
return. ``codecs.StreamReader.readline()`` and
|
||||
``codecs.StreamReader.readlines()`` have a new argument ``keepends``.
|
||||
Trailing "\n"s will be stripped from the lines if ``keepends`` is false.
|
||||
|
||||
Build
|
||||
-----
|
||||
|
@ -32,7 +39,9 @@ Build
|
|||
C API
|
||||
-----
|
||||
|
||||
...
|
||||
- SF patch #998993: ``PyUnicode_DecodeUTF8Stateful`` and
|
||||
``PyUnicode_DecodeUTF16Stateful`` have been added, which implement stateful
|
||||
decoding.
|
||||
|
||||
Documentation
|
||||
-------------
|
||||
|
|
|
@ -269,13 +269,20 @@ utf_8_decode(PyObject *self,
|
|||
const char *data;
|
||||
int size;
|
||||
const char *errors = NULL;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "t#|z:utf_8_decode",
|
||||
&data, &size, &errors))
|
||||
return NULL;
|
||||
int final = 0;
|
||||
int consumed;
|
||||
PyObject *decoded = NULL;
|
||||
|
||||
return codec_tuple(PyUnicode_DecodeUTF8(data, size, errors),
|
||||
size);
|
||||
if (!PyArg_ParseTuple(args, "t#|zi:utf_8_decode",
|
||||
&data, &size, &errors, &final))
|
||||
return NULL;
|
||||
consumed = size;
|
||||
|
||||
decoded = PyUnicode_DecodeUTF8Stateful(data, size, errors,
|
||||
final ? NULL : &consumed);
|
||||
if (decoded == NULL)
|
||||
return NULL;
|
||||
return codec_tuple(decoded, consumed);
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
|
@ -286,12 +293,19 @@ utf_16_decode(PyObject *self,
|
|||
int size;
|
||||
const char *errors = NULL;
|
||||
int byteorder = 0;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "t#|z:utf_16_decode",
|
||||
&data, &size, &errors))
|
||||
int final = 0;
|
||||
int consumed;
|
||||
PyObject *decoded;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "t#|zi:utf_16_decode",
|
||||
&data, &size, &errors, &final))
|
||||
return NULL;
|
||||
return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder),
|
||||
size);
|
||||
consumed = size;
|
||||
decoded = PyUnicode_DecodeUTF16Stateful(data, size, errors, &byteorder,
|
||||
final ? NULL : &consumed);
|
||||
if (decoded == NULL)
|
||||
return NULL;
|
||||
return codec_tuple(decoded, consumed);
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
|
@ -302,12 +316,20 @@ utf_16_le_decode(PyObject *self,
|
|||
int size;
|
||||
const char *errors = NULL;
|
||||
int byteorder = -1;
|
||||
int final = 0;
|
||||
int consumed;
|
||||
PyObject *decoded = NULL;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "t#|z:utf_16_le_decode",
|
||||
&data, &size, &errors))
|
||||
if (!PyArg_ParseTuple(args, "t#|zi:utf_16_le_decode",
|
||||
&data, &size, &errors, &final))
|
||||
return NULL;
|
||||
return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder),
|
||||
size);
|
||||
consumed = size;
|
||||
decoded = PyUnicode_DecodeUTF16Stateful(data, size, errors,
|
||||
&byteorder, final ? NULL : &consumed);
|
||||
if (decoded == NULL)
|
||||
return NULL;
|
||||
return codec_tuple(decoded, consumed);
|
||||
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
|
@ -318,12 +340,19 @@ utf_16_be_decode(PyObject *self,
|
|||
int size;
|
||||
const char *errors = NULL;
|
||||
int byteorder = 1;
|
||||
int final = 0;
|
||||
int consumed;
|
||||
PyObject *decoded = NULL;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "t#|z:utf_16_be_decode",
|
||||
&data, &size, &errors))
|
||||
if (!PyArg_ParseTuple(args, "t#|zi:utf_16_be_decode",
|
||||
&data, &size, &errors, &final))
|
||||
return NULL;
|
||||
return codec_tuple(PyUnicode_DecodeUTF16(data, size, errors, &byteorder),
|
||||
size);
|
||||
consumed = size;
|
||||
decoded = PyUnicode_DecodeUTF16Stateful(data, size, errors,
|
||||
&byteorder, final ? NULL : &consumed);
|
||||
if (decoded == NULL)
|
||||
return NULL;
|
||||
return codec_tuple(decoded, consumed);
|
||||
}
|
||||
|
||||
/* This non-standard version also provides access to the byteorder
|
||||
|
@ -343,15 +372,19 @@ utf_16_ex_decode(PyObject *self,
|
|||
const char *errors = NULL;
|
||||
int byteorder = 0;
|
||||
PyObject *unicode, *tuple;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "t#|zi:utf_16_ex_decode",
|
||||
&data, &size, &errors, &byteorder))
|
||||
int final = 0;
|
||||
int consumed;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "t#|zii:utf_16_ex_decode",
|
||||
&data, &size, &errors, &byteorder, &final))
|
||||
return NULL;
|
||||
|
||||
unicode = PyUnicode_DecodeUTF16(data, size, errors, &byteorder);
|
||||
consumed = size;
|
||||
unicode = PyUnicode_DecodeUTF16Stateful(data, size, errors, &byteorder,
|
||||
final ? NULL : &consumed);
|
||||
if (unicode == NULL)
|
||||
return NULL;
|
||||
tuple = Py_BuildValue("Oii", unicode, size, byteorder);
|
||||
tuple = Py_BuildValue("Oii", unicode, consumed, byteorder);
|
||||
Py_DECREF(unicode);
|
||||
return tuple;
|
||||
}
|
||||
|
|
|
@ -1135,6 +1135,14 @@ char utf8_code_length[256] = {
|
|||
PyObject *PyUnicode_DecodeUTF8(const char *s,
|
||||
int size,
|
||||
const char *errors)
|
||||
{
|
||||
return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
|
||||
}
|
||||
|
||||
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
|
||||
int size,
|
||||
const char *errors,
|
||||
int *consumed)
|
||||
{
|
||||
const char *starts = s;
|
||||
int n;
|
||||
|
@ -1153,8 +1161,11 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
|
|||
unicode = _PyUnicode_New(size);
|
||||
if (!unicode)
|
||||
return NULL;
|
||||
if (size == 0)
|
||||
if (size == 0) {
|
||||
if (consumed)
|
||||
*consumed = 0;
|
||||
return (PyObject *)unicode;
|
||||
}
|
||||
|
||||
/* Unpack UTF-8 encoded data */
|
||||
p = unicode->str;
|
||||
|
@ -1172,10 +1183,14 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
|
|||
n = utf8_code_length[ch];
|
||||
|
||||
if (s + n > e) {
|
||||
errmsg = "unexpected end of data";
|
||||
startinpos = s-starts;
|
||||
endinpos = size;
|
||||
goto utf8Error;
|
||||
if (consumed)
|
||||
break;
|
||||
else {
|
||||
errmsg = "unexpected end of data";
|
||||
startinpos = s-starts;
|
||||
endinpos = size;
|
||||
goto utf8Error;
|
||||
}
|
||||
}
|
||||
|
||||
switch (n) {
|
||||
|
@ -1293,6 +1308,8 @@ PyObject *PyUnicode_DecodeUTF8(const char *s,
|
|||
(PyObject **)&unicode, &outpos, &p))
|
||||
goto onError;
|
||||
}
|
||||
if (consumed)
|
||||
*consumed = s-starts;
|
||||
|
||||
/* Adjust length */
|
||||
if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
|
||||
|
@ -1427,6 +1444,16 @@ PyUnicode_DecodeUTF16(const char *s,
|
|||
int size,
|
||||
const char *errors,
|
||||
int *byteorder)
|
||||
{
|
||||
return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
|
||||
}
|
||||
|
||||
PyObject *
|
||||
PyUnicode_DecodeUTF16Stateful(const char *s,
|
||||
int size,
|
||||
const char *errors,
|
||||
int *byteorder,
|
||||
int *consumed)
|
||||
{
|
||||
const char *starts = s;
|
||||
int startinpos;
|
||||
|
@ -1467,26 +1494,28 @@ PyUnicode_DecodeUTF16(const char *s,
|
|||
mark is skipped, in all other modes, it is copied to the output
|
||||
stream as-is (giving a ZWNBSP character). */
|
||||
if (bo == 0) {
|
||||
const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
|
||||
if (size >= 2) {
|
||||
const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
|
||||
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
|
||||
if (bom == 0xFEFF) {
|
||||
q += 2;
|
||||
bo = -1;
|
||||
}
|
||||
else if (bom == 0xFFFE) {
|
||||
q += 2;
|
||||
bo = 1;
|
||||
}
|
||||
if (bom == 0xFEFF) {
|
||||
q += 2;
|
||||
bo = -1;
|
||||
}
|
||||
else if (bom == 0xFFFE) {
|
||||
q += 2;
|
||||
bo = 1;
|
||||
}
|
||||
#else
|
||||
if (bom == 0xFEFF) {
|
||||
q += 2;
|
||||
bo = 1;
|
||||
}
|
||||
else if (bom == 0xFFFE) {
|
||||
q += 2;
|
||||
bo = -1;
|
||||
}
|
||||
if (bom == 0xFEFF) {
|
||||
q += 2;
|
||||
bo = 1;
|
||||
}
|
||||
else if (bom == 0xFFFE) {
|
||||
q += 2;
|
||||
bo = -1;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
if (bo == -1) {
|
||||
|
@ -1502,8 +1531,10 @@ PyUnicode_DecodeUTF16(const char *s,
|
|||
|
||||
while (q < e) {
|
||||
Py_UNICODE ch;
|
||||
/* remaing bytes at the end? (size should be even) */
|
||||
/* remaining bytes at the end? (size should be even) */
|
||||
if (e-q<2) {
|
||||
if (consumed)
|
||||
break;
|
||||
errmsg = "truncated data";
|
||||
startinpos = ((const char *)q)-starts;
|
||||
endinpos = ((const char *)e)-starts;
|
||||
|
@ -1565,6 +1596,9 @@ PyUnicode_DecodeUTF16(const char *s,
|
|||
if (byteorder)
|
||||
*byteorder = bo;
|
||||
|
||||
if (consumed)
|
||||
*consumed = (const char *)q-starts;
|
||||
|
||||
/* Adjust length */
|
||||
if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
|
||||
goto onError;
|
||||
|
|
Loading…
Reference in a new issue