#2830: add html.escape() helper and move cgi.escape() uses in the standard library to it. It defaults to quote=True and also escapes single quotes, which makes casual use safer. The cgi.escape() interface is not touched, but emits a (silent) PendingDeprecationWarning.

2024-10-14 18:49:33 +00:00 · 2010-10-15 15:57:45 +00:00 · 2010-10-15 15:57:45 +00:00 · 1f7fffb308
parent 70543acfa1
commit 1f7fffb308
11 changed files with 94 additions and 28 deletions
--- a/Doc/howto/webservers.rst
+++ b/Doc/howto/webservers.rst
@ -293,7 +293,7 @@ following WSGI-application::
    # -*- coding: UTF-8 -*-

    import sys, os
-    from cgi import escape
+    from html import escape
    from flup.server.fcgi import WSGIServer

    def app(environ, start_response):
--- a/Doc/library/cgi.rst
+++ b/Doc/library/cgi.rst
@ -328,9 +328,9 @@ algorithms implemented in this module in other circumstances.
   attribute value delimited by double quotes, as in ``<a href="...">``.  Note
   that single quotes are never translated.

-   If the value to be quoted might include single- or double-quote characters,
-   or both, consider using the :func:`~xml.sax.saxutils.quoteattr` function in the
-   :mod:`xml.sax.saxutils` module instead.
+   .. deprecated:: 3.2
+      This function is unsafe because *quote* is false by default, and therefore
+      deprecated.  Use :func:`html.escape` instead.


 .. _cgi-security:
@ -508,8 +508,8 @@ Common problems and solutions

 .. rubric:: Footnotes

-.. [#] Note that some recent versions of the HTML specification do state what order the
-   field values should be supplied in, but knowing whether a request was
-   received from a conforming browser, or even from a browser at all, is tedious
-   and error-prone.
+.. [#] Note that some recent versions of the HTML specification do state what
+   order the field values should be supplied in, but knowing whether a request
+   was received from a conforming browser, or even from a browser at all, is
+   tedious and error-prone.

--- a/Doc/library/html.rst
+++ b/Doc/library/html.rst
@ -0,0 +1,18 @@
+:mod:`html` --- HyperText Markup Language support
+=================================================
+
+.. module:: html
+   :synopsis: Helpers for manipulating HTML.
+
+.. versionadded:: 3.2
+
+
+This module defines utilities to manipulate HTML.
+
+.. function:: escape(s, quote=True)
+
+   Convert the characters ``&``, ``<`` and ``>`` in string *s* to HTML-safe
+   sequences.  Use this if you need to display text that might contain such
+   characters in HTML.  If the optional flag *quote* is true, the characters
+   (``"``) and (``'``) are also translated; this helps for inclusion in an HTML
+   attribute value delimited by quotes, as in ``<a href="...">``.
--- a/Doc/library/markup.rst
+++ b/Doc/library/markup.rst
@ -20,6 +20,7 @@ definition of the Python bindings for the DOM and SAX interfaces.

 .. toctree::

+   html.rst
   html.parser.rst
   html.entities.rst
   pyexpat.rst
--- a/Lib/cgi.py
+++ b/Lib/cgi.py
@ -31,13 +31,13 @@
 # Imports
 # =======

-from operator import attrgetter
 from io import StringIO
 import sys
 import os
 import urllib.parse
 import email.parser
 from warnings import warn
+import html

 __all__ = ["MiniFieldStorage", "FieldStorage",
           "parse", "parse_qs", "parse_qsl", "parse_multipart",
@ -800,8 +800,8 @@ def print_exception(type=None, value=None, tb=None, limit=None):
    list = traceback.format_tb(tb, limit) + \
           traceback.format_exception_only(type, value)
    print("<PRE>%s<B>%s</B></PRE>" % (
-        escape("".join(list[:-1])),
-        escape(list[-1]),
+        html.escape("".join(list[:-1])),
+        html.escape(list[-1]),
        ))
    del tb

@ -812,7 +812,7 @@ def print_environ(environ=os.environ):
    print("<H3>Shell Environment:</H3>")
    print("<DL>")
    for key in keys:
-        print("<DT>", escape(key), "<DD>", escape(environ[key]))
+        print("<DT>", html.escape(key), "<DD>", html.escape(environ[key]))
    print("</DL>")
    print()

@ -825,10 +825,10 @@ def print_form(form):
        print("<P>No form fields.")
    print("<DL>")
    for key in keys:
-        print("<DT>" + escape(key) + ":", end=' ')
+        print("<DT>" + html.escape(key) + ":", end=' ')
        value = form[key]
-        print("<i>" + escape(repr(type(value))) + "</i>")
-        print("<DD>" + escape(repr(value)))
+        print("<i>" + html.escape(repr(type(value))) + "</i>")
+        print("<DD>" + html.escape(repr(value)))
    print("</DL>")
    print()

@ -839,9 +839,9 @@ def print_directory():
    try:
        pwd = os.getcwd()
    except os.error as msg:
-        print("os.error:", escape(str(msg)))
+        print("os.error:", html.escape(str(msg)))
    else:
-        print(escape(pwd))
+        print(html.escape(pwd))
    print()

 def print_arguments():
@ -899,9 +899,9 @@ def print_environ_usage():
 # =========

 def escape(s, quote=None):
-    '''Replace special characters "&", "<" and ">" to HTML-safe sequences.
-    If the optional flag quote is true, the quotation mark character (")
-    is also translated.'''
+    """Deprecated API."""
+    warn("cgi.escape is deprecated, use html.escape instead",
+         PendingDeprecationWarning, stacklevel=2)
    s = s.replace("&", "&amp;") # Must be done first!
    s = s.replace("<", "&lt;")
    s = s.replace(">", "&gt;")
@ -909,6 +909,7 @@ def escape(s, quote=None):
        s = s.replace('"', "&quot;")
    return s

+
 def valid_boundary(s, _vb_pattern="^[ -~]{0,200}[!-~]$"):
    import re
    return re.match(_vb_pattern, s)
--- a/Lib/html/__init__.py
+++ b/Lib/html/__init__.py
@ -1 +1,20 @@
-# This directory is a Python package.
+"""
+General functions for HTML manipulation.
+"""
+
+
+_escape_map = {ord('&'): '&amp;', ord('<'): '&lt;', ord('>'): '&gt;'}
+_escape_map_full = {ord('&'): '&amp;', ord('<'): '&lt;', ord('>'): '&gt;',
+                    ord('"'): '&quot;', ord('\''): '&#x27;'}
+
+# NB: this is a candidate for a bytes/string polymorphic interface
+
+def escape(s, quote=True):
+    """
+    Replace special characters "&", "<" and ">" with HTML-safe sequences.
+    If the optional flag quote is true (the default), the quotation mark
+    characters (") and (') are also translated.
+    """
+    if quote:
+        return s.translate(_escape_map_full)
+    return s.translate(_escape_map)
--- a/Lib/http/server.py
+++ b/Lib/http/server.py
@ -84,7 +84,7 @@

 __all__ = ["HTTPServer", "BaseHTTPRequestHandler"]

-import cgi
+import html
 import email.message
 import email.parser
 import http.client
@ -705,7 +705,7 @@ def list_directory(self, path):
            return None
        list.sort(key=lambda a: a.lower())
        r = []
-        displaypath = cgi.escape(urllib.parse.unquote(self.path))
+        displaypath = html.escape(urllib.parse.unquote(self.path))
        r.append('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">')
        r.append("<html>\n<title>Directory listing for %s</title>\n" % displaypath)
        r.append("<body>\n<h2>Directory listing for %s</h2>\n" % displaypath)
@ -721,7 +721,7 @@ def list_directory(self, path):
                displayname = name + "@"
                # Note: a link to a directory displays with @ and links with /
            r.append('<li><a href="%s">%s</a>\n'
-                    % (urllib.parse.quote(linkname), cgi.escape(displayname)))
+                    % (urllib.parse.quote(linkname), html.escape(displayname)))
        r.append("</ul>\n<hr>\n</body>\n</html>\n")
        enc = sys.getfilesystemencoding()
        encoded = ''.join(r).encode(enc)
--- a/Lib/lib2to3/tests/test_util.py
+++ b/Lib/lib2to3/tests/test_util.py
@ -568,8 +568,8 @@ def test_beginning(self):

    def test_from_import(self):
        node = parse('bar()')
-        fixer_util.touch_import("cgi", "escape", node)
-        self.assertEqual(str(node), 'from cgi import escape\nbar()\n\n')
+        fixer_util.touch_import("html", "escape", node)
+        self.assertEqual(str(node), 'from html import escape\nbar()\n\n')

    def test_name_import(self):
        node = parse('bar()')
--- a/Lib/test/test_html.py
+++ b/Lib/test/test_html.py
@ -0,0 +1,24 @@
+"""
+Tests for the html module functions.
+"""
+
+import html
+import unittest
+from test.support import run_unittest
+
+
+class HtmlTests(unittest.TestCase):
+    def test_escape(self):
+        self.assertEqual(
+            html.escape('\'<script>"&foo;"</script>\''),
+            '&#x27;&lt;script&gt;&quot;&amp;foo;&quot;&lt;/script&gt;&#x27;')
+        self.assertEqual(
+            html.escape('\'<script>"&foo;"</script>\'', False),
+            '\'&lt;script&gt;"&amp;foo;"&lt;/script&gt;\'')
+
+
+def test_main():
+    run_unittest(HtmlTests)
+
+if __name__ == '__main__':
+    test_main()
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@ -12,7 +12,7 @@
 # except if the test is specific to the Python implementation.

 import sys
-import cgi
+import html
 import unittest

 from test import support
@ -1328,7 +1328,7 @@ def processinginstruction():
  <p>Example.</p>
  <xi:include href="{}"/>
 </document>
-""".format(cgi.escape(SIMPLE_XMLFILE, True))
+""".format(html.escape(SIMPLE_XMLFILE, True))

 def xinclude_loader(href, parse="xml", encoding=None):
    try:
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -24,6 +24,9 @@ Core and Builtins
 Library
 -------

+- Issue #2830: Add the ``html.escape()`` function, which quotes all problematic
+  characters by default.  Deprecate ``cgi.escape()``.
+
 - Issue 9409: Fix the regex to match all kind of filenames, for interactive
  debugging in doctests.