#2927: Added the unescape() function to the html module.

This commit is contained in:
Ezio Melotti 2013-11-19 20:28:45 +02:00
parent 5160da1afc
commit 4a9ee26750
7 changed files with 215 additions and 49 deletions

View file

@ -20,6 +20,7 @@ This module defines four dictionaries, :data:`html5`,
Note that the trailing semicolon is included in the name (e.g. ``'gt;'``),
however some of the names are accepted by the standard even without the
semicolon: in this case the name is present with and without the ``';'``.
See also :func:`html.unescape`.
.. versionadded:: 3.3

View file

@ -20,6 +20,17 @@ This module defines utilities to manipulate HTML.
.. versionadded:: 3.2
.. function:: unescape(s)
Convert all named and numeric character references (e.g. ``&gt;``,
``&#62;``, ``&#x3e;``) in the string *s* to the corresponding unicode
characters. This function uses the rules defined by the HTML 5 standard
for both valid and invalid character references, and the :data:`list of
HTML 5 named character references <html.entities.html5>`.
.. versionadded:: 3.4
--------------
Submodules in the ``html`` package are:

View file

@ -2,7 +2,12 @@
General functions for HTML manipulation.
"""
# NB: this is a candidate for a bytes/string polymorphic interface
import re as _re
from html.entities import html5 as _html5
__all__ = ['escape', 'unescape']
def escape(s, quote=True):
"""
@ -18,3 +23,110 @@ def escape(s, quote=True):
s = s.replace('"', "&quot;")
s = s.replace('\'', "&#x27;")
return s
# Replacement table for numeric character references that must not be
# converted to the code point they literally name.
# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references
#
# Keys are the numbers written in the reference; values are the strings that
# _replace_charref() returns for them.  This table is consulted before any
# other validity check.  0x00 maps to U+FFFD; the 0x80-0x9f entries appear to
# follow the Windows-1252 layout, with the unassigned positions mapping to the
# C1 control of the same number -- confirm against the table in the spec.
_invalid_charrefs = {
0x00: '\ufffd', # REPLACEMENT CHARACTER
0x0d: '\r', # CARRIAGE RETURN
0x80: '\u20ac', # EURO SIGN
0x81: '\x81', # <control>
0x82: '\u201a', # SINGLE LOW-9 QUOTATION MARK
0x83: '\u0192', # LATIN SMALL LETTER F WITH HOOK
0x84: '\u201e', # DOUBLE LOW-9 QUOTATION MARK
0x85: '\u2026', # HORIZONTAL ELLIPSIS
0x86: '\u2020', # DAGGER
0x87: '\u2021', # DOUBLE DAGGER
0x88: '\u02c6', # MODIFIER LETTER CIRCUMFLEX ACCENT
0x89: '\u2030', # PER MILLE SIGN
0x8a: '\u0160', # LATIN CAPITAL LETTER S WITH CARON
0x8b: '\u2039', # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
0x8c: '\u0152', # LATIN CAPITAL LIGATURE OE
0x8d: '\x8d', # <control>
0x8e: '\u017d', # LATIN CAPITAL LETTER Z WITH CARON
0x8f: '\x8f', # <control>
0x90: '\x90', # <control>
0x91: '\u2018', # LEFT SINGLE QUOTATION MARK
0x92: '\u2019', # RIGHT SINGLE QUOTATION MARK
0x93: '\u201c', # LEFT DOUBLE QUOTATION MARK
0x94: '\u201d', # RIGHT DOUBLE QUOTATION MARK
0x95: '\u2022', # BULLET
0x96: '\u2013', # EN DASH
0x97: '\u2014', # EM DASH
0x98: '\u02dc', # SMALL TILDE
0x99: '\u2122', # TRADE MARK SIGN
0x9a: '\u0161', # LATIN SMALL LETTER S WITH CARON
0x9b: '\u203a', # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
0x9c: '\u0153', # LATIN SMALL LIGATURE OE
0x9d: '\x9d', # <control>
0x9e: '\u017e', # LATIN SMALL LETTER Z WITH CARON
0x9f: '\u0178', # LATIN CAPITAL LETTER Y WITH DIAERESIS
}
# Code points for which a numeric character reference is replaced by the
# empty string in _replace_charref().  Numbers that also appear in
# _invalid_charrefs (0x80-0x9f) never reach this set, because that table is
# checked first.
_invalid_codepoints = set(range(0x0001, 0x0009))    # 0x0001 to 0x0008
_invalid_codepoints.update(range(0x000e, 0x0020))   # 0x000E to 0x001F
_invalid_codepoints.update(range(0x007f, 0x00a0))   # 0x007F to 0x009F
_invalid_codepoints.update(range(0xfdd0, 0xfdf0))   # 0xFDD0 to 0xFDEF
_invalid_codepoints.add(0x000b)                     # vertical tab
# others: the last two code points of every plane, 0xFFFE/0xFFFF up to
# 0x10FFFE/0x10FFFF (a generator expression keeps the loop names out of the
# module namespace)
_invalid_codepoints.update(
    (plane << 16) | low
    for plane in range(0x11)
    for low in (0xfffe, 0xffff)
)
def _replace_charref(s):
    """Return the replacement text for a single character-reference match.

    *s* is a match object whose group 1 holds the reference without the
    leading ``'&'`` (this is what the module-level ``_charref`` pattern
    captures).

    Numeric references are resolved in this order: numbers listed in
    ``_invalid_charrefs`` yield the mapped string; surrogates and numbers
    above 0x10FFFF yield U+FFFD; numbers in ``_invalid_codepoints`` yield
    ``''``; anything else yields ``chr(num)``.

    Named references are looked up in ``_html5``.  If the full name is
    unknown, the longest known prefix is replaced and the remaining text is
    kept; if no prefix is known, the reference is returned unchanged.
    """
    s = s.group(1)
    if s[0] == '#':
        # numeric charref
        if s[1] in 'xX':
            digits, base, max_digits = s[2:].rstrip(';'), 16, 6
        else:
            digits, base, max_digits = s[1:].rstrip(';'), 10, 7
        # 0x10FFFF is 6 hex / 7 decimal digits long, so a number with more
        # significant digits is out of range whatever its value.  Answering
        # here avoids int() on an arbitrarily long digit run, which is slow
        # and raises ValueError for decimal strings on Python 3.11+.
        if len(digits.lstrip('0')) > max_digits:
            return '\uFFFD'
        num = int(digits, base)
        if num in _invalid_charrefs:
            return _invalid_charrefs[num]
        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
            return '\uFFFD'
        if num in _invalid_codepoints:
            return ''
        return chr(num)
    # named charref
    if s in _html5:
        return _html5[s]
    # find the longest matching name (as defined by the standard)
    for x in range(len(s)-1, 1, -1):
        if s[:x] in _html5:
            return _html5[s[:x]] + s[x:]
    return '&' + s
_charref = _re.compile(r'&(#[0-9]+;?'
r'|#[xX][0-9a-fA-F]+;?'
r'|[^\t\n\f <&#;]{1,32};?)')
def unescape(s):
"""
Convert all named and numeric character references (e.g. &gt;, &#62;,
&x3e;) in the string s to the corresponding unicode characters.
This function uses the rules defined by the HTML 5 standard
for both valid and invalid character references, and the list of
HTML 5 named character references defined in html.entities.html5.
"""
if '&' not in s:
return s
return _charref.sub(_replace_charref, s)

View file

@ -8,9 +8,12 @@
# and CDATA (character data -- only end tags are special).
import _markupbase
import re
import warnings
import _markupbase
from html import unescape
__all__ = ['HTMLParser']
@ -357,7 +360,7 @@ def parse_starttag(self, i):
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = self.unescape(attrvalue)
attrvalue = unescape(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
@ -510,34 +513,3 @@ def handle_pi(self, data):
def unknown_decl(self, data):
    """Report an unrecognized declaration, but only when ``self.strict`` is set.

    *data* is interpolated with ``%r`` into the message passed to
    ``self.error()``.  When ``self.strict`` is false nothing happens.
    """
    if not self.strict:
        return
    # wrap in a 1-tuple so that a tuple-valued *data* is formatted as a whole
    self.error("unknown declaration: %r" % (data,))
# Internal -- helper to remove special character quoting
def unescape(self, s):
    """Return *s* with character references replaced by their characters.

    Numeric references (``&#38;``, ``&#x26;``) are converted with ``chr()``;
    a reference whose number cannot be converted is left in the text
    unchanged.  Named references are looked up in ``html.entities.html5``.
    An unknown name that ends in ``';'`` is left unchanged; otherwise the
    shortest known prefix (at least two characters) is replaced and the rest
    of the name is kept.
    """
    if '&' not in s:
        return s
    def replaceEntities(s):
        s = s.groups()[0]
        if s[0] == "#":
            # numeric reference: drop the '#' and parse as hex or decimal
            s = s[1:]
            try:
                if s[0] in ['x','X']:
                    c = int(s[1:].rstrip(';'), 16)
                else:
                    c = int(s.rstrip(';'))
                return chr(c)
            except (ValueError, OverflowError):
                # ValueError: not a number, or outside the unicode range.
                # OverflowError: chr() got a value too large for a C int
                # (e.g. '&#1000000000000000000;'); keep the text as written.
                return '&#' + s
        from html.entities import html5
        if s in html5:
            return html5[s]
        elif s.endswith(';'):
            return '&' + s
        for x in range(2, len(s)):
            if s[:x] in html5:
                return html5[s[:x]] + s[x:]
        return '&' + s
    return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
                  replaceEntities, s, flags=re.ASCII)

View file

@ -16,9 +16,89 @@ def test_escape(self):
html.escape('\'<script>"&foo;"</script>\'', False),
'\'&lt;script&gt;"&amp;foo;"&lt;/script&gt;\'')
def test_unescape(self):
    """Check html.unescape() on valid, invalid and edge-case references."""
    numeric_formats = ['&#%d', '&#%d;', '&#x%x', '&#x%x;']
    errmsg = 'unescape(%r) should have returned %r'

    def check(text, expected):
        self.assertEqual(html.unescape(text), expected,
                         msg=errmsg % (text, expected))

    def check_num(num, expected):
        # decimal and hex spellings, with and without ';', must all agree
        for fmt in numeric_formats:
            check(fmt % num, expected)

    # check text with no character references
    check('no character references', 'no character references')
    # check & followed by invalid chars
    check('&\n&\t& &&', '&\n&\t& &&')
    # check & followed by numbers and letters
    check('&0 &9 &a &0; &9; &a;', '&0 &9 &a &0; &9; &a;')
    # check incomplete entities at the end of the string
    for x in ['&', '&#', '&#x', '&#X', '&#y', '&#xy', '&#Xy']:
        check(x, x)
        check(x+';', x+';')
    # check several combinations of numeric character references,
    # possibly followed by different characters
    formats = ['&#%d', '&#%07d', '&#%d;', '&#%07d;',
               '&#x%x', '&#x%06x', '&#x%x;', '&#x%06x;',
               '&#x%X', '&#x%06X', '&#X%x;', '&#X%06x;']
    for num, char in zip([65, 97, 34, 38, 0x2603, 0x101234],
                         ['A', 'a', '"', '&', '\u2603', '\U00101234']):
        for s in formats:
            check(s % num, char)
            for end in [' ', 'X']:
                check((s+end) % num, char+end)
    # check invalid codepoints
    for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000]:
        check_num(cp, '\uFFFD')
    # check more invalid codepoints
    for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff]:
        check_num(cp, '')
    # check invalid numbers
    for num, ch in zip([0x0d, 0x80, 0x95, 0x9d], '\r\u20ac\u2022\x9d'):
        check_num(num, ch)
    # check small numbers
    check_num(0, '\uFFFD')
    check_num(9, '\t')
    # check a big number
    check_num(1000000000000000000, '\uFFFD')
    # check that multiple trailing semicolons are handled correctly
    for e in ['&quot;;', '&#34;;', '&#x22;;', '&#X22;;']:
        check(e, '";')
    # check that semicolons in the middle don't create problems
    for e in ['&quot;quot;', '&#34;quot;', '&#x22;quot;', '&#X22;quot;']:
        check(e, '"quot;')
    # check triple adjacent charrefs
    for e in ['&quot', '&#34', '&#x22', '&#X22']:
        check(e*3, '"""')
        check((e+';')*3, '"""')
    # check that the case is respected
    for e in ['&amp', '&amp;', '&AMP', '&AMP;']:
        check(e, '&')
    for e in ['&Amp', '&Amp;']:
        check(e, e)
    # check that non-existent named entities are returned unchanged
    check('&svadilfari;', '&svadilfari;')
    # the following examples are in the html5 specs
    check('&notit', '¬it')
    check('&notit;', '¬it;')
    check('&notin', '¬in')
    check('&notin;', '\u2209')  # NOT AN ELEMENT OF
    # a similar example with a long name
    check('&notReallyAnExistingNamedCharacterReference;',
          '¬ReallyAnExistingNamedCharacterReference;')
    # longest valid name
    check('&CounterClockwiseContourIntegral;',
          '\u2233')  # ANTICLOCKWISE CONTOUR INTEGRAL
    # check a charref that maps to two unicode chars
    check('&acE;', '\u223E\u0333')
    check('&acE', '&acE')
    # see #12888
    check('&#123; ' * 1050, '{ ' * 1050)
    # see #15156
    check('&Eacuteric&Eacute;ric&alphacentauri&alpha;centauri',
          'ÉricÉric&alphacentauriαcentauri')
    check('&co;', '&co;')
# Entry point that hands the HtmlTests case to run_unittest(); both names are
# defined outside this excerpt.
def test_main():
run_unittest(HtmlTests)
if __name__ == '__main__':
test_main()
unittest.main()

View file

@ -569,18 +569,6 @@ def test_EOF_in_charref(self):
for html, expected in data:
self._run_check(html, expected)
def test_unescape_function(self):
    """Check the parser's unescape() helper against known input/output pairs."""
    parser = self.get_collector()
    cases = [
        ('&#bad;', '&#bad;'),
        ('&#0038;', '&'),
        # see #12888
        ('&#123; ' * 1050, '{ ' * 1050),
        # see #15156
        ('&Eacuteric&Eacute;ric&alphacentauri&alpha;centauri',
         'ÉricÉric&alphacentauriαcentauri'),
        ('&co;', '&co;'),
    ]
    for text, expected in cases:
        self.assertEqual(parser.unescape(text), expected)
def test_broken_comments(self):
html = ('<! not really a comment >'
'<! not a comment either -->'

View file

@ -59,6 +59,8 @@ Library
- Issue #19449: in csv's writerow, handle non-string keys when generating the
error message that certain keys are not in the 'fieldnames' list.
- Issue #2927: Added the unescape() function to the html module.
- Issue #8402: Added the escape() function to the glob module.
- Issue #17618: Add Base85 and Ascii85 encoding/decoding to the base64 module.