[cookies] Parse cookies leniently (#4780)

Closes #4776, #3778
Authored by: Grub4K
This commit is contained in:
Simon Sawicki 2022-09-16 19:02:00 +02:00 committed by GitHub
parent 5736d79172
commit 8817a80d3a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 244 additions and 1 deletions

View file

@ -3,6 +3,7 @@
from yt_dlp import cookies
from yt_dlp.cookies import (
LenientSimpleCookie,
LinuxChromeCookieDecryptor,
MacChromeCookieDecryptor,
WindowsChromeCookieDecryptor,
@ -137,3 +138,148 @@ def test_safari_cookie_parsing(self):
def test_pbkdf2_sha1(self):
    """Check pbkdf2_sha1 against a fixed 16-byte reference key."""
    expected_key = b'g\xe1\x8e\x0fQ\x1c\x9b\xf3\xc9`!\xaa\x90\xd9\xd34'
    derived_key = pbkdf2_sha1(b'peanuts', b' ' * 16, 1, 16)
    self.assertEqual(derived_key, expected_key)
class TestLenientSimpleCookie(unittest.TestCase):
    """Table-driven tests for LenientSimpleCookie parsing."""

    def _run_tests(self, *cases):
        """Parse each raw cookie string and compare it with its expectation.

        Every case is a ``(message, raw_cookie, expected)`` triple. ``expected``
        maps cookie names either to the expected value, or to a
        ``(value, attributes)`` tuple when morsel attributes are checked too.
        Attributes whose value is the empty string are treated as unset.
        """
        for message, raw_cookie, expected in cases:
            cookie = LenientSimpleCookie(raw_cookie)

            with self.subTest(message, expected=expected):
                self.assertEqual(cookie.keys(), expected.keys(), message)

                for name, wanted in expected.items():
                    morsel = cookie[name]
                    if isinstance(wanted, tuple):
                        wanted_value, wanted_attributes = wanted
                    else:
                        wanted_value, wanted_attributes = wanted, {}

                    # Only attributes that were actually set are compared
                    set_attributes = {
                        attr: attr_value
                        for attr, attr_value in dict(morsel).items()
                        if attr_value != ""
                    }
                    self.assertEqual(set_attributes, wanted_attributes, message)
                    self.assertEqual(morsel.value, wanted_value, message)

    def test_parsing(self):
        """Inputs that the stock CPython parser already handles."""
        self._run_tests(
            # Copied from https://github.com/python/cpython/blob/v3.10.7/Lib/test/test_http_cookies.py
            (
                "Test basic cookie",
                "chips=ahoy; vienna=finger",
                {"chips": "ahoy", "vienna": "finger"},
            ),
            (
                "Test quoted cookie",
                'keebler="E=mc2; L=\\"Loves\\"; fudge=\\012;"',
                {"keebler": 'E=mc2; L="Loves"; fudge=\012;'},
            ),
            (
                "Allow '=' in an unquoted value",
                "keebler=E=mc2",
                {"keebler": "E=mc2"},
            ),
            (
                "Allow cookies with ':' in their name",
                "key:term=value:term",
                {"key:term": "value:term"},
            ),
            (
                "Allow '[' and ']' in cookie values",
                "a=b; c=[; d=r; f=h",
                {"a": "b", "c": "[", "d": "r", "f": "h"},
            ),
            (
                "Test basic cookie attributes",
                'Customer="WILE_E_COYOTE"; Version=1; Path=/acme',
                {"Customer": ("WILE_E_COYOTE", {"version": "1", "path": "/acme"})},
            ),
            (
                "Test flag only cookie attributes",
                'Customer="WILE_E_COYOTE"; HttpOnly; Secure',
                {"Customer": ("WILE_E_COYOTE", {"httponly": True, "secure": True})},
            ),
            (
                "Test flag only attribute with values",
                "eggs=scrambled; httponly=foo; secure=bar; Path=/bacon",
                {"eggs": ("scrambled", {"httponly": "foo", "secure": "bar", "path": "/bacon"})},
            ),
            (
                "Test special case for 'expires' attribute, 4 digit year",
                'Customer="W"; expires=Wed, 01 Jan 2010 00:00:00 GMT',
                {"Customer": ("W", {"expires": "Wed, 01 Jan 2010 00:00:00 GMT"})},
            ),
            (
                "Test special case for 'expires' attribute, 2 digit year",
                'Customer="W"; expires=Wed, 01 Jan 98 00:00:00 GMT',
                {"Customer": ("W", {"expires": "Wed, 01 Jan 98 00:00:00 GMT"})},
            ),
            (
                "Test extra spaces in keys and values",
                "eggs = scrambled ; secure ; path = bar ; foo=foo ",
                {"eggs": ("scrambled", {"secure": True, "path": "bar"}), "foo": "foo"},
            ),
            (
                "Test quoted attributes",
                'Customer="WILE_E_COYOTE"; Version="1"; Path="/acme"',
                {"Customer": ("WILE_E_COYOTE", {"version": "1", "path": "/acme"})},
            ),
            # Our own tests that CPython passes
            (
                "Allow ';' in quoted value",
                'chips="a;hoy"; vienna=finger',
                {"chips": "a;hoy", "vienna": "finger"},
            ),
            (
                "Keep only the last set value",
                "a=c; a=b",
                {"a": "b"},
            ),
        )

    def test_lenient_parsing(self):
        """Malformed inputs where the invalid parts are expected to be skipped."""
        self._run_tests(
            (
                "Ignore and try to skip invalid cookies",
                'chips={"ahoy;": 1}; vienna="finger;"',
                {"vienna": "finger;"},
            ),
            (
                "Ignore cookies without a name",
                "a=b; unnamed; c=d",
                {"a": "b", "c": "d"},
            ),
            (
                "Ignore '\"' cookie without name",
                'a=b; "; c=d',
                {"a": "b", "c": "d"},
            ),
            (
                "Skip all space separated values",
                "x a=b c=d x; e=f",
                {"a": "b", "c": "d", "e": "f"},
            ),
            (
                "Skip all space separated values",
                'x a=b; data={"complex": "json", "with": "key=value"}; x c=d x',
                {"a": "b", "c": "d"},
            ),
            (
                "Expect quote mending",
                'a=b; invalid="; c=d',
                {"a": "b", "c": "d"},
            ),
            (
                "Reset morsel after invalid to not capture attributes",
                "a=b; invalid; Version=1; c=d",
                {"a": "b", "c": "d"},
            ),
            (
                "Continue after non-flag attribute without value",
                "a=b; path; Version=1; c=d",
                {"a": "b", "c": "d"},
            ),
        )

View file

@ -1,6 +1,7 @@
import base64
import contextlib
import http.cookiejar
import http.cookies
import json
import os
import re
@ -990,3 +991,98 @@ def _parse_browser_specification(browser_name, profile=None, keyring=None, conta
if profile is not None and _is_path(profile):
profile = os.path.expanduser(profile)
return browser_name, profile, keyring, container
class LenientSimpleCookie(http.cookies.SimpleCookie):
    """More lenient version of http.cookies.SimpleCookie

    String input is parsed with a pattern that carries an additional 'bad'
    group, so invalid cookies are skipped instead of aborting the whole parse.
    Any other input type is handed to ``SimpleCookie.load`` unchanged.
    """
    # From https://github.com/python/cpython/blob/v3.10.7/Lib/http/cookies.py
    _LEGAL_KEY_CHARS = r"\w\d!#%&'~_`><@,:/\$\*\+\-\.\^\|\)\(\?\}\{\="
    _LEGAL_VALUE_CHARS = _LEGAL_KEY_CHARS + r"\[\]"

    # Attribute names that belong to a cookie rather than naming a new one
    _RESERVED = {
        "expires",
        "path",
        "comment",
        "domain",
        "max-age",
        "secure",
        "httponly",
        "version",
        "samesite",
    }

    # Reserved attributes that may appear without a value
    _FLAGS = {"secure", "httponly"}

    # Added 'bad' group to catch the remaining value
    _COOKIE_PATTERN = re.compile(r"""
        \s* # Optional whitespace at start of cookie
        (?P<key> # Start of group 'key'
        [""" + _LEGAL_KEY_CHARS + r"""]+?# Any word of at least one letter
        ) # End of group 'key'
        ( # Optional group: there may not be a value.
        \s*=\s* # Equal Sign
        ( # Start of potential value
        (?P<val> # Start of group 'val'
        "(?:[^\\"]|\\.)*" # Any doublequoted string
        | # or
        \w{3},\s[\w\d\s-]{9,11}\s[\d:]{8}\sGMT # Special case for "expires" attr
        | # or
        [""" + _LEGAL_VALUE_CHARS + r"""]* # Any word or empty string
        ) # End of group 'val'
        | # or
        (?P<bad>(?:\\;|[^;])*?) # 'bad' group fallback for invalid values
        ) # End of potential value
        )? # End of optional value group
        \s* # Any number of spaces.
        (\s+|;|$) # Ending either at space, semicolon, or EOS.
        """, re.ASCII | re.VERBOSE)

    def load(self, data):
        """Load cookies from ``data``, skipping anything that cannot be parsed.

        ``data`` that is not a ``str`` is passed to ``SimpleCookie.load``.
        For strings, each pattern match is handled as follows:

        - a match with a non-empty 'bad' value is dropped, and following
          attributes are no longer attached to the previous cookie;
        - a reserved name (optionally prefixed with ``$``) is stored as an
          attribute of the most recent valid cookie; without a value it is
          only accepted for the flag attributes;
        - an unknown ``$``-prefixed name or a name without a value is dropped;
        - anything else starts a new cookie, unless ``Morsel.set`` rejects
          the name, in which case it is dropped as well.
        """
        # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4776
        if not isinstance(data, str):
            return super().load(data)

        morsel = None
        # The pattern requires at least one key character, so no match is empty
        for match in self._COOKIE_PATTERN.finditer(data):
            if match.group("bad"):
                morsel = None
                continue

            key, value = match.group("key", "val")

            # '$'-prefixed names (RFC 2965 style) are attributes, never cookies
            is_attribute = key.startswith("$")
            if is_attribute:
                key = key[1:]

            lower_key = key.lower()
            if lower_key in self._RESERVED:
                if morsel is None:
                    # Attribute without a cookie to attach it to
                    continue

                if value is None:
                    if lower_key not in self._FLAGS:
                        morsel = None
                        continue
                    value = True
                else:
                    value, _ = self.value_decode(value)

                morsel[key] = value

            elif is_attribute or value is None:
                # Unknown '$' attribute or a bare word: Morsel would reject it
                morsel = None

            else:
                candidate = self.get(key, http.cookies.Morsel())
                real_value, coded_value = self.value_decode(value)
                try:
                    # The pattern accepts more key characters than Morsel does
                    candidate.set(key, real_value, coded_value)
                except http.cookies.CookieError:
                    morsel = None
                    continue
                morsel = candidate
                self[key] = morsel

View file

@ -22,6 +22,7 @@
from ..compat import functools # isort: split
from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
from ..cookies import LenientSimpleCookie
from ..downloader import FileDownloader
from ..downloader.f4m import get_base_url, remove_encrypted_media
from ..utils import (
@ -3632,7 +3633,7 @@ def _set_cookie(self, domain, name, value, expire_time=None, port=None,
def _get_cookies(self, url):
    """ Return a LenientSimpleCookie with the cookies for the url """
    # Parsed leniently so that a malformed cookie does not abort the lookup
    return LenientSimpleCookie(self._downloader._calc_cookies(url))
def _apply_first_set_cookie_header(self, url_handle, cookie):
"""