Skip to content

Commit c88a021

Browse files
Merge branch 'main' into patch-13
2 parents a09ca13 + 776573c commit c88a021

62 files changed

Lines changed: 562 additions & 88 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

Doc/library/pyexpat.rst

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -63,12 +63,26 @@ The :mod:`!xml.parsers.expat` module contains two functions:
6363

6464
.. function:: ParserCreate(encoding=None, namespace_separator=None)
6565

66-
Creates and returns a new :class:`xmlparser` object. *encoding*, if specified,
67-
must be a string naming the encoding used by the XML data. Expat doesn't
68-
support as many encodings as Python does, and its repertoire of encodings can't
69-
be extended; it supports UTF-8, UTF-16, ISO-8859-1 (Latin1), and ASCII. If
70-
*encoding* [1]_ is given it will override the implicit or explicit encoding of the
71-
document.
66+
Creates and returns a new :class:`xmlparser` object.
67+
*encoding* [1]_, if specified, must be a string naming the encoding
68+
used by the XML data.
69+
If it is given it will override the implicit or explicit encoding
70+
of the document.
71+
72+
.. impl-detail::
73+
74+
Expat natively understands and processes UTF-8, UTF-16, UTF-16BE,
75+
UTF-16LE, ISO-8859-1, and US-ASCII.
76+
For other encodings (including aliases like Latin1 and ASCII) it
77+
falls back to Python.
78+
It supports most 8-bit encodings and many multi-byte encodings
79+
like Shift_JIS, although only BMP characters (``U+0000-U+FFFF``)
80+
are supported with non-native encodings (this restriction is also
81+
applied to aliases like UTF8).
82+
These restrictions only apply if *encoding* is not given.
83+
84+
.. versionchanged:: next
85+
Added support for multi-byte encodings.
7286

7387
.. _xmlparser-non-root:
7488

@@ -113,7 +127,6 @@ The :mod:`!xml.parsers.expat` module contains two functions:
113127
XML document. Call ``ParserCreate`` for each document to provide unique
114128
parser instances.
115129

116-
117130
.. seealso::
118131

119132
`The Expat XML Parser <http://www.libexpat.org/>`_
@@ -1083,9 +1096,11 @@ The ``errors`` module has the following attributes:
10831096

10841097
.. rubric:: Footnotes
10851098

1086-
.. [1] The encoding string included in XML output should conform to the
1087-
appropriate standards. For example, "UTF-8" is valid, but "UTF8" is
1088-
not. See https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
1099+
.. [1] The encoding string included in XML output should conform to
1100+
the appropriate standards. For example, "UTF-8" is valid, but
1101+
"UTF8" is not valid in an XML document's declaration, even though
1102+
Python accepts it as an encoding name.
1103+
See https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
10891104
and https://www.iana.org/assignments/character-sets/character-sets.xhtml.
10901105
10911106

Doc/whatsnew/3.16.rst

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ New modules
8686
Improved modules
8787
================
8888

89-
9089
gzip
9190
----
9291

@@ -101,6 +100,21 @@ os
101100
process via a pidfd. Available on Linux 5.6+.
102101
(Contributed by Maurycy Pawłowski-Wieroński in :gh:`149464`.)
103102

103+
xml
104+
---
105+
106+
* Add support for multiple multi-byte encodings in the :mod:`XML parser
107+
<xml.parsers.expat>`: "cp932", "cp949", "cp950", "Big5", "EUC-JP",
108+
"GB2312", "GBK", "johab", and "Shift_JIS".
109+
Add partial support (only BMP characters) for multi-byte encodings
110+
"Big5-HKSCS", "EUC_JIS-2004", "EUC_JISX0213", "Shift_JIS-2004",
111+
"Shift_JISX0213", "utf-8-sig" and non-standard aliases like "UTF8"
112+
(without hyphen).
113+
The parser now raises :exc:`ValueError` for known unsupported
114+
multi-byte encodings such as "ISO-2022-JP" or "raw-unicode-escape"
115+
instead of failing later, when encountering non-ASCII data.
116+
(Contributed by Serhiy Storchaka in :gh:`62259`.)
117+
104118
.. Add improved modules above alphabetically, not here at the end.
105119
106120
Optimizations

Include/internal/pycore_ceval.h

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -208,16 +208,16 @@ extern void _PyEval_DeactivateOpCache(void);
208208

209209
/* --- _Py_EnterRecursiveCall() ----------------------------------------- */
210210

211-
static inline int _Py_MakeRecCheck(PyThreadState *tstate) {
211+
static inline int _Py_ReachedRecursionLimit(PyThreadState *tstate) {
212212
uintptr_t here_addr = _Py_get_machine_stack_pointer();
213213
_PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
214-
// Overflow if stack pointer is between soft limit and the base of the hardware stack.
215-
// If it is below the hardware stack base, assume that we have the wrong stack limits, and do nothing.
216-
// We could have the wrong stack limits because of limited platform support, or user-space threads.
214+
// Possible overflow if stack pointer is beyond the soft limit.
215+
// _Py_CheckRecursiveCall will check for corner cases and
216+
// report an error if there is an overflow.
217217
#if _Py_STACK_GROWS_DOWN
218-
return here_addr < _tstate->c_stack_soft_limit && here_addr >= _tstate->c_stack_soft_limit - 2 * _PyOS_STACK_MARGIN_BYTES;
218+
return here_addr < _tstate->c_stack_soft_limit;
219219
#else
220-
return here_addr > _tstate->c_stack_soft_limit && here_addr <= _tstate->c_stack_soft_limit + 2 * _PyOS_STACK_MARGIN_BYTES;
220+
return here_addr > _tstate->c_stack_soft_limit;
221221
#endif
222222
}
223223

@@ -232,7 +232,7 @@ PyAPI_FUNC(int) _Py_CheckRecursiveCallPy(
232232

233233
static inline int _Py_EnterRecursiveCallTstate(PyThreadState *tstate,
234234
const char *where) {
235-
return (_Py_MakeRecCheck(tstate) && _Py_CheckRecursiveCall(tstate, where));
235+
return (_Py_ReachedRecursionLimit(tstate) && _Py_CheckRecursiveCall(tstate, where));
236236
}
237237

238238
static inline int _Py_EnterRecursiveCall(const char *where) {
@@ -246,8 +246,6 @@ static inline void _Py_LeaveRecursiveCallTstate(PyThreadState *tstate) {
246246

247247
PyAPI_FUNC(void) _Py_InitializeRecursionLimits(PyThreadState *tstate);
248248

249-
PyAPI_FUNC(int) _Py_ReachedRecursionLimit(PyThreadState *tstate);
250-
251249
// Export for test_peg_generator
252250
PyAPI_FUNC(int) _Py_ReachedRecursionLimitWithMargin(
253251
PyThreadState *tstate,

Include/internal/pycore_codecs.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ extern int _PyCodec_UnregisterError(const char *name);
4545
in Python 3.5+?
4646
4747
*/
48-
extern PyObject* _PyCodec_LookupTextEncoding(
48+
PyAPI_FUNC(PyObject*) _PyCodec_LookupTextEncoding(
4949
const char *encoding,
5050
const char *alternate_command);
5151

Include/internal/pycore_pystate.h

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -316,15 +316,20 @@ static uintptr_t return_pointer_as_int(char* p) {
316316

317317
static inline uintptr_t
318318
_Py_get_machine_stack_pointer(void) {
319-
#if _Py__has_builtin(__builtin_frame_address) || defined(__GNUC__)
320-
return (uintptr_t)__builtin_frame_address(0);
321-
#elif defined(_MSC_VER)
322-
return (uintptr_t)_AddressOfReturnAddress();
319+
uintptr_t result;
320+
#if defined(_M_ARM64)
321+
result = __getReg(31);
322+
#elif defined(_M_X64) || defined(_M_IX86)
323+
result = (uintptr_t)_AddressOfReturnAddress();
324+
#elif defined(__aarch64__)
325+
__asm__ ("mov %0, sp" : "=r" (result));
326+
#elif defined(__x86_64__)
327+
__asm__("{movq %%rsp, %0" : "=r" (result));
323328
#else
324329
char here;
325-
/* Avoid compiler warning about returning stack address */
326-
return return_pointer_as_int(&here);
330+
result = (uintptr_t)&here;
327331
#endif
332+
return result;
328333
}
329334

330335
static inline intptr_t

Include/internal/pycore_pythonrun.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ extern PyObject * _Py_CompileStringObjectWithModule(
4646
* stack consumption of PyEval_EvalDefault */
4747
#if (defined(Py_DEBUG) \
4848
|| defined(_Py_ADDRESS_SANITIZER) \
49-
|| defined(_Py_THREAD_SANITIZER))
49+
|| defined(_Py_THREAD_SANITIZER)) \
50+
|| defined(_Py_UNDEFINED_BEHAVIOR_SANITIZER)
5051
# define _PyOS_LOG2_STACK_MARGIN 12
5152
#else
5253
# define _PyOS_LOG2_STACK_MARGIN 11

Lib/asyncio/selector_events.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1233,7 +1233,6 @@ def close(self):
12331233

12341234
class _SelectorDatagramTransport(_SelectorTransport, transports.DatagramTransport):
12351235

1236-
_buffer_factory = collections.deque
12371236
_header_size = 8
12381237

12391238
def __init__(self, loop, sock, protocol, address=None,

Lib/codecs.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ class CodecInfo(tuple):
9393

9494
def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
9595
incrementalencoder=None, incrementaldecoder=None, name=None,
96-
*, _is_text_encoding=None):
96+
*, _is_text_encoding=None, _expat_decoding_table=None):
9797
self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
9898
self.name = name
9999
self.encode = encode
@@ -104,6 +104,8 @@ def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
104104
self.streamreader = streamreader
105105
if _is_text_encoding is not None:
106106
self._is_text_encoding = _is_text_encoding
107+
if _expat_decoding_table is not None:
108+
self._expat_decoding_table = _expat_decoding_table
107109
return self
108110

109111
def __repr__(self):

Lib/email/charset.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -93,8 +93,6 @@
9393

9494
# Map charsets to their Unicode codec strings.
9595
CODEC_MAP = {
96-
'gb2312': 'eucgb2312_cn',
97-
'big5': 'big5_tw',
9896
# Hack: We don't want *any* conversion for stuff marked us-ascii, as all
9997
# sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
10098
# Let that stuff pass through without conversion to/from Unicode.

Lib/encodings/big5.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,13 @@ def getregentry():
3636
incrementaldecoder=IncrementalDecoder,
3737
streamreader=StreamReader,
3838
streamwriter=StreamWriter,
39+
_expat_decoding_table=(*range(128),
40+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
41+
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
42+
-1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
43+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
44+
-2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2,
45+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
46+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2,
47+
-2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1),
3948
)

0 commit comments

Comments
 (0)