Skip to content

Commit 8ab7b43

Browse files
gh-62259: Add support of multi-byte encodings in the XML parser (GH-149860)
Supported encodings: "cp932", "cp949", "cp950", "Big5", "EUC-JP", "GB2312", "GBK", "johab", and "Shift_JIS". Partially supported encodings (only BMP characters): "Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213", "Shift_JIS-2004", "Shift_JISX0213", "utf-8-sig" and non-standard aliases like "UTF8" (without hyphen). The parser now raises ValueError for known unsupported multi-byte encodings such as "ISO-2022-JP" or "raw-unicode-escape" instead of failing later, when it encounters non-ASCII data.
1 parent a34edf7 commit 8ab7b43

47 files changed

Lines changed: 401 additions & 29 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Doc/library/pyexpat.rst

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -63,12 +63,26 @@ The :mod:`!xml.parsers.expat` module contains two functions:
6363

6464
.. function:: ParserCreate(encoding=None, namespace_separator=None)
6565

66-
Creates and returns a new :class:`xmlparser` object. *encoding*, if specified,
67-
must be a string naming the encoding used by the XML data. Expat doesn't
68-
support as many encodings as Python does, and its repertoire of encodings can't
69-
be extended; it supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII. If
70-
*encoding* [1]_ is given it will override the implicit or explicit encoding of the
71-
document.
66+
Creates and returns a new :class:`xmlparser` object.
67+
*encoding* [1]_, if specified, must be a string naming the encoding
68+
used by the XML data.
69+
If it is given it will override the implicit or explicit encoding
70+
of the document.
71+
72+
.. impl-detail::
73+
74+
Expat natively understands and processes UTF-8, UTF-16, UTF-16BE,
75+
UTF-16LE, ISO-8859-1, and US-ASCII.
76+
For other encodings (including aliases like Latin1 and ASCII) it
77+
falls back to Python.
78+
It supports most 8-bit encodings and many multi-byte encodings
79+
like Shift_JIS, although only BMP characters (``U+0000-U+FFFF``)
80+
are supported with non-native encodings (this restriction is also
81+
applied to aliases like UTF8).
82+
These restrictions only apply if *encoding* is not given.
83+
84+
.. versionchanged:: next
85+
Added support for multi-byte encodings.
7286

7387
.. _xmlparser-non-root:
7488

@@ -113,7 +127,6 @@ The :mod:`!xml.parsers.expat` module contains two functions:
113127
XML document. Call ``ParserCreate`` for each document to provide unique
114128
parser instances.
115129

116-
117130
.. seealso::
118131

119132
`The Expat XML Parser <http://www.libexpat.org/>`_
@@ -1083,9 +1096,11 @@ The ``errors`` module has the following attributes:
10831096

10841097
.. rubric:: Footnotes
10851098

1086-
.. [1] The encoding string included in XML output should conform to the
1087-
appropriate standards. For example, "UTF-8" is valid, but "UTF8" is
1088-
not. See https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
1099+
.. [1] The encoding string included in XML output should conform to
1100+
the appropriate standards. For example, "UTF-8" is valid, but
1101+
"UTF8" is not valid in an XML document's declaration, even though
1102+
Python accepts it as an encoding name.
1103+
See https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
10891104
and https://www.iana.org/assignments/character-sets/character-sets.xhtml.
10901105
10911106

Doc/whatsnew/3.16.rst

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ New modules
8686
Improved modules
8787
================
8888

89-
9089
gzip
9190
----
9291

@@ -101,6 +100,21 @@ os
101100
process via a pidfd. Available on Linux 5.6+.
102101
(Contributed by Maurycy Pawłowski-Wieroński in :gh:`149464`.)
103102

103+
xml
104+
---
105+
106+
* Add support for multiple multi-byte encodings in the :mod:`XML parser
107+
<xml.parsers.expat>`: "cp932", "cp949", "cp950", "Big5", "EUC-JP",
108+
"GB2312", "GBK", "johab", and "Shift_JIS".
109+
Add partial support (only BMP characters) for multi-byte encodings
110+
"Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213", "Shift_JIS-2004",
111+
"Shift_JISX0213", "utf-8-sig" and non-standard aliases like "UTF8"
112+
(without hyphen).
113+
The parser now raises :exc:`ValueError` for known unsupported
114+
multi-byte encodings such as "ISO-2022-JP" or "raw-unicode-escape"
115+
instead of failing later, when it encounters non-ASCII data.
116+
(Contributed by Serhiy Storchaka in :gh:`62259`.)
117+
104118
.. Add improved modules above alphabetically, not here at the end.
105119
106120
Optimizations

Include/internal/pycore_codecs.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ extern int _PyCodec_UnregisterError(const char *name);
4545
in Python 3.5+?
4646
4747
*/
48-
extern PyObject* _PyCodec_LookupTextEncoding(
48+
PyAPI_FUNC(PyObject*) _PyCodec_LookupTextEncoding(
4949
const char *encoding,
5050
const char *alternate_command);
5151

Lib/codecs.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ class CodecInfo(tuple):
9393

9494
def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
9595
incrementalencoder=None, incrementaldecoder=None, name=None,
96-
*, _is_text_encoding=None):
96+
*, _is_text_encoding=None, _expat_decoding_table=None):
9797
self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
9898
self.name = name
9999
self.encode = encode
@@ -104,6 +104,8 @@ def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
104104
self.streamreader = streamreader
105105
if _is_text_encoding is not None:
106106
self._is_text_encoding = _is_text_encoding
107+
if _expat_decoding_table is not None:
108+
self._expat_decoding_table = _expat_decoding_table
107109
return self
108110

109111
def __repr__(self):

Lib/encodings/big5.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,13 @@ def getregentry():
3636
incrementaldecoder=IncrementalDecoder,
3737
streamreader=StreamReader,
3838
streamwriter=StreamWriter,
39+
_expat_decoding_table=(*range(128),
40+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
41+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
42+
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
43+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
44+
-2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2,
45+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
46+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
47+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1),
3948
)

Lib/encodings/big5hkscs.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,13 @@ def getregentry():
3636
incrementaldecoder=IncrementalDecoder,
3737
streamreader=StreamReader,
3838
streamwriter=StreamWriter,
39+
_expat_decoding_table=(*range(128),
40+
-1, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2,
41+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
42+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
43+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
44+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
45+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
46+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
47+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
3948
)

Lib/encodings/cp932.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,18 @@ def getregentry():
3636
incrementaldecoder=IncrementalDecoder,
3737
streamreader=StreamReader,
3838
streamwriter=StreamWriter,
39+
_expat_decoding_table=(*range(128),
40+
0x80, -2, -2, -2, -2, -1, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2,
41+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
42+
0xf8f0, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff66, 0xff67,
43+
0xff68, 0xff69, 0xff6a, 0xff6b, 0xff6c, 0xff6d, 0xff6e, 0xff6f,
44+
0xff70, 0xff71, 0xff72, 0xff73, 0xff74, 0xff75, 0xff76, 0xff77,
45+
0xff78, 0xff79, 0xff7a, 0xff7b, 0xff7c, 0xff7d, 0xff7e, 0xff7f,
46+
0xff80, 0xff81, 0xff82, 0xff83, 0xff84, 0xff85, 0xff86, 0xff87,
47+
0xff88, 0xff89, 0xff8a, 0xff8b, 0xff8c, 0xff8d, 0xff8e, 0xff8f,
48+
0xff90, 0xff91, 0xff92, 0xff93, 0xff94, 0xff95, 0xff96, 0xff97,
49+
0xff98, 0xff99, 0xff9a, 0xff9b, 0xff9c, 0xff9d, 0xff9e, 0xff9f,
50+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1,
51+
-2, -2, -2, -2, -2, -2, -2, -2,
52+
-2, -2, -1, -1, -1, 0xf8f1, 0xf8f2, 0xf8f3),
3953
)

Lib/encodings/cp949.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,13 @@ def getregentry():
3636
incrementaldecoder=IncrementalDecoder,
3737
streamreader=StreamReader,
3838
streamwriter=StreamWriter,
39+
_expat_decoding_table=(*range(128),
40+
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
41+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
42+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
43+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
44+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2,
45+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
46+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
47+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1),
3948
)

Lib/encodings/cp950.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,13 @@ def getregentry():
3636
incrementaldecoder=IncrementalDecoder,
3737
streamreader=StreamReader,
3838
streamwriter=StreamWriter,
39+
_expat_decoding_table=(*range(128),
40+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
41+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
42+
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
43+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
44+
-2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2,
45+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
46+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
47+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1),
3948
)

Lib/encodings/euc_jis_2004.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,13 @@ def getregentry():
3636
incrementaldecoder=IncrementalDecoder,
3737
streamreader=StreamReader,
3838
streamwriter=StreamWriter,
39+
_expat_decoding_table=(*range(128),
40+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -3,
41+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
42+
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
43+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
44+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
45+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
46+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
47+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1),
3948
)

0 commit comments

Comments
 (0)