merged unescapeHTML branch; removed lxml dependency

This commit is contained in:
Filippo Valsorda 2012-04-11 00:22:51 +02:00
commit 9e6dd23876
5 changed files with 92 additions and 57 deletions

View file

@ -18,7 +18,6 @@ if [ ! -d wine-py2exe ]; then
axel -a "http://www.python.org/ftp/python/2.7/python-2.7.msi"
axel -a "http://downloads.sourceforge.net/project/py2exe/py2exe/0.6.9/py2exe-0.6.9.win32-py2.7.exe"
axel -a "http://pypi.python.org/packages/2.7/l/lxml/lxml-2.3.win32-py2.7.exe"
#axel -a "http://winetricks.org/winetricks"
# http://appdb.winehq.org/objectManager.php?sClass=version&iId=21957
@ -28,13 +27,9 @@ if [ ! -d wine-py2exe ]; then
echo "Follow py2exe setup on screen"
wine py2exe-0.6.9.win32-py2.7.exe
echo "Follow lxml setup on screen"
wine lxml-2.3.win32-py2.7.exe
#echo "Follow Microsoft Visual C++ 2008 Redistributable Package setup on screen"
#bash winetricks vcrun2008
rm lxml-2.3.win32-py2.7.exe
rm py2exe-0.6.9.win32-py2.7.exe
rm python-2.7.msi
#rm winetricks

Binary file not shown.

Binary file not shown.

View file

@ -24,11 +24,6 @@
except ImportError:
from cgi import parse_qs
try:
import lxml.etree
except ImportError:
pass # Handled below
try:
import xml.etree.ElementTree
except ImportError: # Python<2.5: Not officially supported, but let it slip
@ -193,8 +188,8 @@ def _closed_captions_xml_to_srt(self, xml_string):
end = start + float(dur)
start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, intentional
caption = unescapeHTML(caption)
caption = unescapeHTML(caption) # double cycle, intentional
srt += str(n) + '\n'
srt += start + ' --> ' + end + '\n'
srt += caption + '\n\n'
@ -364,18 +359,9 @@ def _real_extract(self, url):
pass
# description
try:
lxml.etree
except NameError:
video_description = u'No description available.'
mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
if mobj is not None:
video_description = mobj.group(1).decode('utf-8')
else:
html_parser = lxml.etree.HTMLParser(encoding='utf-8')
vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
# TODO use another parser
video_description = get_element_by_id("eow-description", video_webpage)
if video_description: video_description = clean_html(video_description.decode('utf8'))
else: video_description = ''
# closed captions
video_subtitles = None
@ -992,7 +978,7 @@ def _real_extract(self, url, new_video=True):
self._downloader.trouble(u'ERROR: Unable to extract media URL')
return
video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
video_url = unescapeHTML(video_url)
return [{
'id': video_id.decode('utf-8'),
@ -1069,18 +1055,9 @@ def _real_extract(self, url, new_video=True):
video_thumbnail = config["video"]["thumbnail"]
# Extract video description
try:
lxml.etree
except NameError:
video_description = u'No description available.'
mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
if mobj is not None:
video_description = mobj.group(1)
else:
html_parser = lxml.etree.HTMLParser()
vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
# TODO use another parser
video_description = get_element_by_id("description", webpage)
if video_description: video_description = clean_html(video_description.decode('utf8'))
else: video_description = ''
# Extract upload date
video_upload_date = u'NA'
@ -2248,8 +2225,6 @@ def report_config_download(self, showName):
self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@ -2265,11 +2240,11 @@ def _real_extract(self, url):
return
descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
description = htmlParser.unescape(descMatch.group(1))
description = unescapeHTML(descMatch.group(1))
imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
imgUrl = htmlParser.unescape(imgMatch.group(1))
imgUrl = unescapeHTML(imgMatch.group(1))
playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
playerUrl = unescapeHTML(playerUrlMatch.group(1))
configUrlMatch = re.search('config=(.*)$', playerUrl)
configUrl = urllib2.unquote(configUrlMatch.group(1))
@ -2324,8 +2299,6 @@ def report_extraction(self, video_id):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@ -2391,8 +2364,6 @@ def report_extraction(self, video_id):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@ -2475,8 +2446,6 @@ def report_extraction(self, video_id):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@ -2561,8 +2530,6 @@ def report_extraction(self, video_id):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()
mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@ -2782,8 +2749,6 @@ def _real_extract(self, url):
info['format'] = info['ext']
return [info]
elif mobj.group('course'): # A course page
unescapeHTML = HTMLParser.HTMLParser().unescape
course = mobj.group('course')
info = {
'id': simplify_title(course),
@ -2822,8 +2787,6 @@ def _real_extract(self, url):
return results
else: # Root page
unescapeHTML = HTMLParser.HTMLParser().unescape
info = {
'id': 'Stanford OpenClassroom',
'type': 'playlist',

View file

@ -74,9 +74,86 @@ def htmlentity_transform(matchobj):
return (u'&%s;' % entity)
class IDParser(HTMLParser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id.

    Usage: build the parser with the wanted id, pass the document to
    loads(), then call get_result() to obtain the raw markup found between
    the opening and the closing tag of the matching element.

    While parsing, self.result grows into [tag, start_pos, end_pos], where
    both positions are (line, column) pairs as returned by getpos().
    """

    def __init__(self, id):
        self.id = id
        self.result = None           # [tag, start_pos, end_pos] once complete
        self.started = False         # True while inside the matching element
        self.depth = {}              # open-tag counters, keyed by tag name
        self.html = None             # the document handed to loads()
        self.watch_startpos = False  # True until the content start is recorded
        HTMLParser.HTMLParser.__init__(self)

    def loads(self, html):
        """Parse the whole document in one go."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested tag is the first thing after the opening tag.
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        # An end tag can be the very first event after the opening tag (empty
        # element). Record the start position here as well; otherwise it is
        # never stored, watch_startpos stays armed, and a later unrelated
        # event would append a bogus position to the result.
        self.find_startpos(None)
        if tag in self.depth:
            self.depth[tag] -= 1
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())

    # Every other parser event can be the first one after the opening tag.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped inner markup of the matched element.

        Returns None when no element matched or when its start/end positions
        were not both recorded.
        """
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1:]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Same line: the end column must be shifted by the part cut above.
            lines[0] = lines[0][:end_col - start_col]
        else:
            lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    id_parser = IDParser(id)
    try:
        id_parser.loads(html)
    except HTMLParser.HTMLParseError:
        # Best effort: a parse error is ignored and whatever the parser
        # recorded up to that point is still handed to get_result().
        pass
    return id_parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string.

    Literal newlines become spaces, <br> tags (with any surrounding
    whitespace) become newlines, every remaining tag is dropped and the
    text is finally passed through unescapeHTML().
    """
    # Newline vs <br />
    html = html.replace('\n', ' ')
    # Raw strings keep the regex escapes (\s) out of the string-literal parser.
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
def sanitize_title(utitle):
"""Sanitizes a video title so it could be used as part of a filename."""
# NOTE(review): the next two lines both decode HTML entities; in this diff view they
# look like the removed (re.sub) and the added (unescapeHTML) variant of one step - confirm.
utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
utitle = unescapeHTML(utitle)
# Swap the OS path separator for '%' in the returned title.
return utitle.replace(unicode(os.sep), u'%')
@ -133,8 +210,8 @@ def unescapeHTML(s):
"""
assert type(s) == type(u'')
htmlParser = HTMLParser.HTMLParser()
return htmlParser.unescape(s)
result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
return result
def encodeFilename(s):
"""