Merge 'ytdl-org/youtube-dl/master' release 2020.11.19

Old Extractors left behind:
	VLivePlaylistIE
	YoutubeSearchURLIE
	YoutubeShowIE
	YoutubeFavouritesIE

If removing old extractors, make corresponding changes in
	docs/supportedsites.md
	youtube_dlc/extractor/extractors.py

Not merged:
	.github/ISSUE_TEMPLATE/1_broken_site.md
	.github/ISSUE_TEMPLATE/2_site_support_request.md
	.github/ISSUE_TEMPLATE/3_site_feature_request.md
	.github/ISSUE_TEMPLATE/4_bug_report.md
	.github/ISSUE_TEMPLATE/5_feature_request.md
	test/test_all_urls.py
	youtube_dlc/version.py
	Changelog
This commit is contained in:
pukkandan 2020-11-20 00:52:59 +05:30
parent 228385340e
commit 8bdd16b499
34 changed files with 1828 additions and 1695 deletions

View file

@ -61,7 +61,7 @@ def build_lazy_ie(ie, name):
return s
# find the correct sorting and add the required base classes so that sublcasses
# find the correct sorting and add the required base classes so that subclasses
# can be correctly created
classes = _ALL_CLASSES[:-1]
ordered_cls = []

View file

@ -59,9 +59,9 @@ # Supported sites
- **ARD:mediathek**
- **ARDBetaMediathek**
- **Arkena**
- **arte.tv:+7**
- **arte.tv:embed**
- **arte.tv:playlist**
- **ArteTV**
- **ArteTVEmbed**
- **ArteTVPlaylist**
- **AsianCrush**
- **AsianCrushPlaylist**
- **AtresPlayer**
@ -424,6 +424,7 @@ # Supported sites
- **la7.it**
- **laola1tv**
- **laola1tv:embed**
- **lbry.tv**
- **LCI**
- **Lcp**
- **LcpPlay**
@ -835,8 +836,6 @@ # Supported sites
- **SpankBangPlaylist**
- **Spankwire**
- **Spiegel**
- **Spiegel:Article**: Articles on spiegel.de
- **Spiegeltv**
- **sport.francetvinfo.fr**
- **Sport5**
- **SportBox**
@ -1147,19 +1146,18 @@ # Supported sites
- **YourPorn**
- **YourUpload**
- **youtube**: YouTube.com
- **youtube:channel**: YouTube.com channels
- **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication)
- **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication)
- **youtube:live**: YouTube.com live streams
- **youtube:playlist**: YouTube.com playlists
- **youtube:playlists**: YouTube.com user/channel playlists
- **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication)
- **youtube:search**: YouTube.com searches
- **youtube:search:date**: YouTube.com searches, newest videos first
- **youtube:search_url**: YouTube.com search URLs
- **youtube:show**: YouTube.com (multi-season) shows
- **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)
- **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
- **YoutubeYtUser**: YouTube.com user videos (URL or "ytuser" keyword)
- **youtube:tab**: YouTube.com tab
- **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
- **Zapiks**
- **Zaq1**

View file

@ -31,15 +31,17 @@ def assertMatch(self, url, ie_list):
def test_youtube_playlist_matching(self):
assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist'])
assertTab = lambda url: self.assertMatch(url, ['youtube:tab'])
assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585
assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist('PL63F0C78739B09958')
assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))
# Top tracks
assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')
assertTab('https://www.youtube.com/playlist?list=MCUS.20142101')
def test_youtube_matching(self):
self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M'))
@ -50,26 +52,22 @@ def test_youtube_matching(self):
self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube'])
def test_youtube_channel_matching(self):
assertChannel = lambda url: self.assertMatch(url, ['youtube:channel'])
assertChannel = lambda url: self.assertMatch(url, ['youtube:tab'])
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM')
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec')
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')
def test_youtube_user_matching(self):
self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:user'])
# def test_youtube_user_matching(self):
# self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])
def test_youtube_feeds(self):
self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater'])
self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions'])
self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended'])
self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites'])
def test_youtube_show_matching(self):
self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show'])
def test_youtube_search_matching(self):
self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
# def test_youtube_search_matching(self):
# self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
# self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
def test_youtube_extract(self):
assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id)

View file

@ -937,6 +937,28 @@ def test_js_to_json_edgecases(self):
self.assertEqual(d['x'], 1)
self.assertEqual(d['y'], 'a')
# Just drop ! prefix for now though this results in a wrong value
on = js_to_json('''{
a: !0,
b: !1,
c: !!0,
d: !!42.42,
e: !!![],
f: !"abc",
g: !"",
!42: 42
}''')
self.assertEqual(json.loads(on), {
'a': 0,
'b': 1,
'c': 0,
'd': 42.42,
'e': [],
'f': "abc",
'g': "",
'42': 42
})
on = js_to_json('["abc", "def",]')
self.assertEqual(json.loads(on), ['abc', 'def'])
@ -994,6 +1016,12 @@ def test_js_to_json_edgecases(self):
on = js_to_json('{42:4.2e1}')
self.assertEqual(json.loads(on), {'42': 42.0})
on = js_to_json('{ "0x40": "0x40" }')
self.assertEqual(json.loads(on), {'0x40': '0x40'})
on = js_to_json('{ "040": "040" }')
self.assertEqual(json.loads(on), {'040': '040'})
def test_js_to_json_malformed(self):
self.assertEqual(js_to_json('42a1'), '42"a1"')
self.assertEqual(js_to_json('42a-1'), '42"a"-1')

View file

@ -275,7 +275,7 @@ def _real_extract(self, url):
video_element = video_xml.findall(compat_xpath('./track/video'))[-1]
if video_element is None or video_element.text is None:
raise ExtractorError(
'Video %s video does not exist' % video_id, expected=True)
'Video %s does not exist' % video_id, expected=True)
video_url = video_element.text.strip()

View file

@ -4,23 +4,57 @@
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..compat import (
compat_str,
compat_urlparse,
)
from ..utils import (
ExtractorError,
int_or_none,
qualities,
try_get,
unified_strdate,
url_or_none,
)
# There are different sources of video in arte.tv, the extraction process
# is different for each one. The videos usually expire in 7 days, so we can't
# add tests.
class ArteTVBaseIE(InfoExtractor):
def _extract_from_json_url(self, json_url, video_id, lang, title=None):
info = self._download_json(json_url, video_id)
_ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
_API_BASE = 'https://api.arte.tv/api/player/v1'
class ArteTVIE(ArteTVBaseIE):
_VALID_URL = r'''(?x)
https?://
(?:
(?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
)
/(?P<id>\d{6}-\d{3}-[AF])
''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
'info_dict': {
'id': '088501-000-A',
'ext': 'mp4',
'title': 'Mexico: Stealing Petrol to Survive',
'upload_date': '20190628',
},
}, {
'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
'only_matching': True,
}, {
'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
lang = mobj.group('lang') or mobj.group('lang_2')
info = self._download_json(
'%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id)
player_info = info['videoJsonPlayer']
vsr = try_get(player_info, lambda x: x['VSR'], dict)
@ -37,18 +71,11 @@ def _extract_from_json_url(self, json_url, video_id, lang, title=None):
if not upload_date_str:
upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
title = (player_info.get('VTI') or title or player_info['VID']).strip()
title = (player_info.get('VTI') or player_info['VID']).strip()
subtitle = player_info.get('VSU', '').strip()
if subtitle:
title += ' - %s' % subtitle
info_dict = {
'id': player_info['VID'],
'title': title,
'description': player_info.get('VDE'),
'upload_date': unified_strdate(upload_date_str),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
}
qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])
LANGS = {
@ -65,6 +92,10 @@ def _extract_from_json_url(self, json_url, video_id, lang, title=None):
formats = []
for format_id, format_dict in vsr.items():
f = dict(format_dict)
format_url = url_or_none(f.get('url'))
streamer = f.get('streamer')
if not format_url and not streamer:
continue
versionCode = f.get('versionCode')
l = re.escape(langcode)
@ -107,6 +138,16 @@ def _extract_from_json_url(self, json_url, video_id, lang, title=None):
else:
lang_pref = -1
media_type = f.get('mediaType')
if media_type == 'hls':
m3u8_formats = self._extract_m3u8_formats(
format_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id=format_id, fatal=False)
for m3u8_format in m3u8_formats:
m3u8_format['language_preference'] = lang_pref
formats.extend(m3u8_formats)
continue
format = {
'format_id': format_id,
'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
@ -118,7 +159,7 @@ def _extract_from_json_url(self, json_url, video_id, lang, title=None):
'quality': qfunc(f.get('quality')),
}
if f.get('mediaType') == 'rtmp':
if media_type == 'rtmp':
format['url'] = f['streamer']
format['play_path'] = 'mp4:' + f['url']
format['ext'] = 'flv'
@ -127,56 +168,50 @@ def _extract_from_json_url(self, json_url, video_id, lang, title=None):
formats.append(format)
self._check_formats(formats, video_id)
self._sort_formats(formats)
info_dict['formats'] = formats
return info_dict
return {
'id': player_info.get('VID') or video_id,
'title': title,
'description': player_info.get('VDE'),
'upload_date': unified_strdate(upload_date_str),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
'formats': formats,
}
class ArteTVPlus7IE(ArteTVBaseIE):
IE_NAME = 'arte.tv:+7'
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])'
class ArteTVEmbedIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
'info_dict': {
'id': '088501-000-A',
'id': '100605-013-A',
'ext': 'mp4',
'title': 'Mexico: Stealing Petrol to Survive',
'upload_date': '20190628',
'title': 'United we Stream November Lockdown Edition #13',
'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
'upload_date': '20201116',
},
}, {
'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
'only_matching': True,
}]
def _real_extract(self, url):
lang, video_id = re.match(self._VALID_URL, url).groups()
return self._extract_from_json_url(
'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id),
video_id, lang)
class ArteTVEmbedIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:embed'
_VALID_URL = r'''(?x)
https://www\.arte\.tv
/player/v3/index\.php\?json_url=
(?P<json_url>
https?://api\.arte\.tv/api/player/v1/config/
(?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF])
)
'''
_TESTS = []
@staticmethod
def _extract_urls(webpage):
return [url for _, url in re.findall(
r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
webpage)]
def _real_extract(self, url):
json_url, lang, video_id = re.match(self._VALID_URL, url).groups()
return self._extract_from_json_url(json_url, video_id, lang)
qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
json_url = qs['json_url'][0]
video_id = ArteTVIE._match_id(json_url)
return self.url_result(
json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
class ArteTVPlaylistIE(ArteTVBaseIE):
IE_NAME = 'arte.tv:playlist'
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})'
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
'info_dict': {
@ -185,17 +220,35 @@ class ArteTVPlaylistIE(ArteTVBaseIE):
'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
},
'playlist_mincount': 6,
}, {
'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
'only_matching': True,
}]
def _real_extract(self, url):
lang, playlist_id = re.match(self._VALID_URL, url).groups()
collection = self._download_json(
'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
% (lang, playlist_id), playlist_id)
'%s/collectionData/%s/%s?source=videos'
% (self._API_BASE, lang, playlist_id), playlist_id)
entries = []
for video in collection['videos']:
if not isinstance(video, dict):
continue
video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl'))
if not video_url:
continue
video_id = video.get('programId')
entries.append({
'_type': 'url_transparent',
'url': video_url,
'id': video_id,
'title': video.get('title'),
'alt_title': video.get('subtitle'),
'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)),
'duration': int_or_none(video.get('durationSeconds')),
'view_count': int_or_none(video.get('views')),
'ie_key': ArteTVIE.ie_key(),
})
title = collection.get('title')
description = collection.get('shortDescription') or collection.get('teaserText')
entries = [
self._extract_from_json_url(
video['jsonUrl'], video.get('programId') or playlist_id, lang)
for video in collection['videos'] if video.get('jsonUrl')]
return self.playlist_result(entries, playlist_id, title, description)

View file

@ -1,3 +1,4 @@
# coding: utf-8
from __future__ import unicode_literals
import random
@ -5,10 +6,7 @@
import time
from .common import InfoExtractor
from ..compat import (
compat_str,
compat_urlparse,
)
from ..compat import compat_str
from ..utils import (
ExtractorError,
float_or_none,
@ -17,71 +15,32 @@
parse_filesize,
str_or_none,
try_get,
unescapeHTML,
update_url_query,
unified_strdate,
unified_timestamp,
url_or_none,
urljoin,
)
class BandcampBaseIE(InfoExtractor):
"""Provide base functions for Bandcamp extractors"""
def _extract_json_from_html_data_attribute(self, webpage, suffix, video_id):
json_string = self._html_search_regex(
r' data-%s="([^"]*)' % suffix,
webpage, '%s json' % suffix, default='{}')
return self._parse_json(json_string, video_id)
def _parse_json_track(self, json):
formats = []
file_ = json.get('file')
if isinstance(file_, dict):
for format_id, format_url in file_.items():
if not url_or_none(format_url):
continue
ext, abr_str = format_id.split('-', 1)
formats.append({
'format_id': format_id,
'url': self._proto_relative_url(format_url, 'http:'),
'ext': ext,
'vcodec': 'none',
'acodec': ext,
'abr': int_or_none(abr_str),
})
return {
'duration': float_or_none(json.get('duration')),
'id': str_or_none(json.get('track_id') or json.get('id')),
'title': json.get('title'),
'title_link': json.get('title_link'),
'number': int_or_none(json.get('track_num')),
'formats': formats
}
class BandcampIE(BandcampBaseIE):
IE_NAME = "Bandcamp:track"
_VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
class BandcampIE(InfoExtractor):
_VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
'md5': 'c557841d5e50261777a6585648adf439',
'info_dict': {
'id': '1812978515',
'ext': 'mp3',
'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭",
'duration': 9.8485,
'uploader': "youtube-dl \"'/\\\u00e4\u21ad",
'timestamp': 1354224127,
'uploader': 'youtube-dl "\'/\\ä↭',
'upload_date': '20121129',
'timestamp': 1354224127,
},
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
# free download
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
'md5': '5d92af55811e47f38962a54c30b07ef0',
'info_dict': {
'id': '2650410135',
'ext': 'aiff',
@ -120,52 +79,59 @@ class BandcampIE(BandcampBaseIE):
},
}]
def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True):
return self._parse_json(self._html_search_regex(
r'data-%s=(["\'])({.+?})\1' % attr, webpage,
attr + ' data', group=2), video_id, fatal=fatal)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
title = mobj.group('title')
url_track_title = title
title = self._match_id(url)
webpage = self._download_webpage(url, title)
thumbnail = self._html_search_meta('og:image', webpage, default=None)
tralbum = self._extract_data_attr(webpage, title)
thumbnail = self._og_search_thumbnail(webpage)
json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", url_track_title)
json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", url_track_title)
track_id = None
track = None
track_number = None
duration = None
json_tracks = json_tralbum.get('trackinfo')
if not json_tracks:
raise ExtractorError('Could not extract track')
formats = []
track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict)
if track_info:
file_ = track_info.get('file')
if isinstance(file_, dict):
for format_id, format_url in file_.items():
if not url_or_none(format_url):
continue
ext, abr_str = format_id.split('-', 1)
formats.append({
'format_id': format_id,
'url': self._proto_relative_url(format_url, 'http:'),
'ext': ext,
'vcodec': 'none',
'acodec': ext,
'abr': int_or_none(abr_str),
})
track = track_info.get('title')
track_id = str_or_none(
track_info.get('track_id') or track_info.get('id'))
track_number = int_or_none(track_info.get('track_num'))
duration = float_or_none(track_info.get('duration'))
track = self._parse_json_track(json_tracks[0])
artist = json_tralbum.get('artist')
album_title = json_embed.get('album_title')
embed = self._extract_data_attr(webpage, title, 'embed', False)
current = tralbum.get('current') or {}
artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
timestamp = unified_timestamp(
current.get('publish_date') or tralbum.get('album_publish_date'))
json_album = json_tralbum.get('packages')
if json_album:
json_album = json_album[0]
album_publish_date = json_album.get('album_publish_date')
album_release_date = json_album.get('album_release_date')
else:
album_publish_date = None
album_release_date = json_tralbum.get('album_release_date')
timestamp = unified_timestamp(json_tralbum.get('current', {}).get('publish_date') or album_publish_date)
release_date = unified_strdate(album_release_date)
download_link = self._search_regex(
r'freeDownloadPage(?:["\']|&quot;):\s*(["\']|&quot;)(?P<url>(?:(?!\1).)+)\1', webpage,
'download link', default=None, group='url')
download_link = tralbum.get('freeDownloadPage')
if download_link:
track_id = self._search_regex(
r'\?id=(?P<id>\d+)&',
download_link, 'track id')
track_id = compat_str(tralbum['id'])
download_webpage = self._download_webpage(
download_link, track_id, 'Downloading free downloads page')
blob = self._parse_json(
self._search_regex(
r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
'blob', group='blob'),
track_id, transform_source=unescapeHTML)
blob = self._extract_data_attr(download_webpage, track_id, 'blob')
info = try_get(
blob, (lambda x: x['digital_items'][0],
@ -173,6 +139,8 @@ def _real_extract(self, url):
if info:
downloads = info.get('downloads')
if isinstance(downloads, dict):
if not track:
track = info.get('title')
if not artist:
artist = info.get('artist')
if not thumbnail:
@ -206,7 +174,7 @@ def _real_extract(self, url):
retry_url = url_or_none(stat.get('retry_url'))
if not retry_url:
continue
track['formats'].append({
formats.append({
'url': self._proto_relative_url(retry_url, 'http:'),
'ext': download_formats.get(format_id),
'format_id': format_id,
@ -215,30 +183,34 @@ def _real_extract(self, url):
'vcodec': 'none',
})
self._sort_formats(track['formats'])
self._sort_formats(formats)
title = '%s - %s' % (artist, track.get('title')) if artist else track.get('title')
title = '%s - %s' % (artist, track) if artist else track
if not duration:
duration = float_or_none(self._html_search_meta(
'duration', webpage, default=None))
return {
'album': album_title,
'artist': artist,
'duration': track['duration'],
'formats': track['formats'],
'id': track['id'],
'release_date': release_date,
'thumbnail': thumbnail,
'timestamp': timestamp,
'id': track_id,
'title': title,
'track': track['title'],
'track_id': track['id'],
'track_number': track['number'],
'uploader': artist
'thumbnail': thumbnail,
'uploader': artist,
'timestamp': timestamp,
'release_date': unified_strdate(tralbum.get('album_release_date')),
'duration': duration,
'track': track,
'track_number': track_number,
'track_id': track_id,
'artist': artist,
'album': embed.get('album_title'),
'formats': formats,
}
class BandcampAlbumIE(BandcampBaseIE):
class BandcampAlbumIE(BandcampIE):
IE_NAME = 'Bandcamp:album'
_VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
_VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?'
_TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@ -248,7 +220,10 @@ class BandcampAlbumIE(BandcampBaseIE):
'info_dict': {
'id': '1353101989',
'ext': 'mp3',
'title': 'Intro',
'title': 'Blazo - Intro',
'timestamp': 1311756226,
'upload_date': '20110727',
'uploader': 'Blazo',
}
},
{
@ -256,7 +231,10 @@ class BandcampAlbumIE(BandcampBaseIE):
'info_dict': {
'id': '38097443',
'ext': 'mp3',
'title': 'Kero One - Keep It Alive (Blazo remix)',
'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)',
'timestamp': 1311757238,
'upload_date': '20110727',
'uploader': 'Blazo',
}
},
],
@ -292,6 +270,7 @@ class BandcampAlbumIE(BandcampBaseIE):
'title': '"Entropy" EP',
'uploader_id': 'jstrecords',
'id': 'entropy-ep',
'description': 'md5:0ff22959c943622972596062f2f366a5',
},
'playlist_mincount': 3,
}, {
@ -301,6 +280,7 @@ class BandcampAlbumIE(BandcampBaseIE):
'id': 'we-are-the-plague',
'title': 'WE ARE THE PLAGUE',
'uploader_id': 'insulters',
'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f',
},
'playlist_count': 2,
}]
@ -312,41 +292,34 @@ def suitable(cls, url):
else super(BandcampAlbumIE, cls).suitable(url))
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
uploader_id = mobj.group('subdomain')
album_id = mobj.group('album_id')
uploader_id, album_id = re.match(self._VALID_URL, url).groups()
playlist_id = album_id or uploader_id
webpage = self._download_webpage(url, playlist_id)
json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", playlist_id)
json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", playlist_id)
json_tracks = json_tralbum.get('trackinfo')
if not json_tracks:
raise ExtractorError('Could not extract album tracks')
album_title = json_embed.get('album_title')
tralbum = self._extract_data_attr(webpage, playlist_id)
track_info = tralbum.get('trackinfo')
if not track_info:
raise ExtractorError('The page doesn\'t contain any tracks')
# Only tracks with duration info have songs
tracks = [self._parse_json_track(track) for track in json_tracks]
entries = [
self.url_result(
compat_urlparse.urljoin(url, track['title_link']),
ie=BandcampIE.ie_key(), video_id=track['id'],
video_title=track['title'])
for track in tracks
if track.get('duration')]
urljoin(url, t['title_link']), BandcampIE.ie_key(),
str_or_none(t.get('track_id') or t.get('id')), t.get('title'))
for t in track_info
if t.get('duration')]
current = tralbum.get('current') or {}
return {
'_type': 'playlist',
'uploader_id': uploader_id,
'id': playlist_id,
'title': album_title,
'entries': entries
'title': current.get('title'),
'description': current.get('about'),
'entries': entries,
}
class BandcampWeeklyIE(InfoExtractor):
class BandcampWeeklyIE(BandcampIE):
IE_NAME = 'Bandcamp:weekly'
_VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
_TESTS = [{
@ -361,29 +334,23 @@ class BandcampWeeklyIE(InfoExtractor):
'release_date': '20170404',
'series': 'Bandcamp Weekly',
'episode': 'Magic Moments',
'episode_number': 208,
'episode_id': '224',
}
},
'params': {
'format': 'opus-lo',
},
}, {
'url': 'https://bandcamp.com/?blah/blah@&show=228',
'only_matching': True
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
show_id = self._match_id(url)
webpage = self._download_webpage(url, show_id)
blob = self._parse_json(
self._search_regex(
r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage,
'blob', group='blob'),
video_id, transform_source=unescapeHTML)
blob = self._extract_data_attr(webpage, show_id, 'blob')
show = blob['bcw_show']
# This is desired because any invalid show id redirects to `bandcamp.com`
# which happens to expose the latest Bandcamp Weekly episode.
show_id = int_or_none(show.get('show_id')) or int_or_none(video_id)
show = blob['bcw_data'][show_id]
formats = []
for format_id, format_url in show['audio_stream'].items():
@ -408,20 +375,8 @@ def _real_extract(self, url):
if subtitle:
title += ' - %s' % subtitle
episode_number = None
seq = blob.get('bcw_seq')
if seq and isinstance(seq, list):
try:
episode_number = next(
int_or_none(e.get('episode_number'))
for e in seq
if isinstance(e, dict) and int_or_none(e.get('id')) == show_id)
except StopIteration:
pass
return {
'id': video_id,
'id': show_id,
'title': title,
'description': show.get('desc') or show.get('short_desc'),
'duration': float_or_none(show.get('audio_duration')),
@ -429,7 +384,6 @@ def _real_extract(self, url):
'release_date': unified_strdate(show.get('published_date')),
'series': 'Bandcamp Weekly',
'episode': show.get('subtitle'),
'episode_number': episode_number,
'episode_id': compat_str(video_id),
'episode_id': show_id,
'formats': formats
}

View file

@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import smuggle_url
@ -38,7 +39,7 @@ def _real_extract(self, url):
class CNBCVideoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)'
_VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)'
_TEST = {
'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html',
'info_dict': {
@ -56,11 +57,15 @@ class CNBCVideoIE(InfoExtractor):
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id,
'video id')
path, display_id = re.match(self._VALID_URL, url).groups()
video_id = self._download_json(
'https://webql-redesign.cnbcfm.com/graphql', display_id, query={
'query': '''{
page(path: "%s") {
vcpsId
}
}''' % path,
})['data']['page']['vcpsId']
return self.url_result(
'http://video.cnbc.com/gallery/?video=%s' % video_id,
'http://video.cnbc.com/gallery/?video=%d' % video_id,
CNBCIE.ie_key())

View file

@ -1456,9 +1456,10 @@ def _is_valid_url(self, url, video_id, item='video', headers={}):
try:
self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
return True
except ExtractorError:
except ExtractorError as e:
self.to_screen(
'%s: %s URL is invalid, skipping' % (video_id, item))
'%s: %s URL is invalid, skipping: %s'
% (video_id, item, error_to_compat_str(e.cause)))
return False
def http_scheme(self):

View file

@ -16,6 +16,8 @@
mimetype2ext,
orderedSet,
parse_iso8601,
strip_or_none,
try_get,
)
@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor):
'uploader': 'gq',
'upload_date': '20170321',
'timestamp': 1490126427,
'description': 'How much grimmer would things be if these people were competent?',
},
}, {
# JS embed
@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor):
'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
'uploader': 'arstechnica',
'upload_date': '20150916',
'timestamp': 1442434955,
'timestamp': 1442434920,
}
}, {
'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player',
@ -196,6 +199,13 @@ def _extract_video(self, params):
})
self._sort_formats(formats)
subtitles = {}
for t, caption in video_info.get('captions', {}).items():
caption_url = caption.get('src')
if not (t in ('vtt', 'srt', 'tml') and caption_url):
continue
subtitles.setdefault('en', []).append({'url': caption_url})
return {
'id': video_id,
'formats': formats,
@ -208,6 +218,7 @@ def _extract_video(self, params):
'season': video_info.get('season_title'),
'timestamp': parse_iso8601(video_info.get('premiere_date')),
'categories': video_info.get('categories'),
'subtitles': subtitles,
}
def _real_extract(self, url):
@ -225,8 +236,16 @@ def _real_extract(self, url):
if url_type == 'series':
return self._extract_series(url, webpage)
else:
params = self._extract_video_params(webpage, display_id)
info = self._search_json_ld(
webpage, display_id, fatal=False)
video = try_get(self._parse_json(self._search_regex(
r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
'preload state', '{}'), display_id),
lambda x: x['transformed']['video'])
if video:
params = {'videoId': video['id']}
info = {'description': strip_or_none(video.get('description'))}
else:
params = self._extract_video_params(webpage, display_id)
info = self._search_json_ld(
webpage, display_id, fatal=False)
info.update(self._extract_video(params))
return info

View file

@ -62,7 +62,7 @@
ARDMediathekIE,
)
from .arte import (
ArteTVPlus7IE,
ArteTVIE,
ArteTVEmbedIE,
ArteTVPlaylistIE,
)
@ -542,6 +542,7 @@
EHFTVIE,
ITTFIE,
)
from .lbry import LBRYIE
from .lci import LCIIE
from .lcp import (
LcpPlayIE,
@ -1079,8 +1080,7 @@
SpankBangPlaylistIE,
)
from .spankwire import SpankwireIE
from .spiegel import SpiegelIE, SpiegelArticleIE
from .spiegeltv import SpiegeltvIE
from .spiegel import SpiegelIE
from .spike import (
BellatorIE,
ParamountNetworkIE,
@ -1505,12 +1505,11 @@
from .yourupload import YourUploadIE
from .youtube import (
YoutubeIE,
YoutubeChannelIE,
YoutubeFavouritesIE,
YoutubeHistoryIE,
YoutubeLiveIE,
YoutubeTabIE,
YoutubePlaylistIE,
YoutubePlaylistsIE,
YoutubeRecommendedIE,
YoutubeSearchDateIE,
YoutubeSearchIE,
@ -1519,7 +1518,7 @@
YoutubeSubscriptionsIE,
YoutubeTruncatedIDIE,
YoutubeTruncatedURLIE,
YoutubeUserIE,
YoutubeYtUserIE,
YoutubeWatchLaterIE,
)
from .zapiks import ZapiksIE

View file

@ -17,6 +17,7 @@
parse_duration,
try_get,
url_or_none,
urljoin,
)
from .dailymotion import DailymotionIE
@ -128,18 +129,38 @@ def sign(manifest_url, manifest_id):
is_live = None
formats = []
for video in info['videos']:
if video['statut'] != 'ONLINE':
videos = []
for video in (info.get('videos') or []):
if video.get('statut') != 'ONLINE':
continue
video_url = video['url']
if not video.get('url'):
continue
videos.append(video)
if not videos:
for device_type in ['desktop', 'mobile']:
fallback_info = self._download_json(
'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id,
video_id, 'Downloading fallback %s video JSON' % device_type, query={
'device_type': device_type,
'browser': 'chrome',
}, fatal=False)
if fallback_info and fallback_info.get('video'):
videos.append(fallback_info['video'])
formats = []
for video in videos:
video_url = video.get('url')
if not video_url:
continue
if is_live is None:
is_live = (try_get(
video, lambda x: x['plages_ouverture'][0]['direct'],
bool) is True) or '/live.francetv.fr/' in video_url
format_id = video['format']
video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True
or video.get('is_live') is True
or '/live.francetv.fr/' in video_url)
format_id = video.get('format')
ext = determine_ext(video_url)
if ext == 'f4m':
if georestricted:
@ -154,6 +175,9 @@ def sign(manifest_url, manifest_id):
sign(video_url, format_id), video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id=format_id,
fatal=False))
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False))
elif video_url.startswith('rtmp'):
formats.append({
'url': video_url,
@ -166,6 +190,7 @@ def sign(manifest_url, manifest_id):
'url': video_url,
'format_id': format_id,
})
self._sort_formats(formats)
title = info['titre']
@ -185,10 +210,10 @@ def sign(manifest_url, manifest_id):
return {
'id': video_id,
'title': self._live_title(title) if is_live else title,
'description': clean_html(info['synopsis']),
'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']),
'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']),
'timestamp': int_or_none(info['diffusion']['timestamp']),
'description': clean_html(info.get('synopsis')),
'thumbnail': urljoin('http://pluzz.francetv.fr', info.get('image')),
'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')),
'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])),
'is_live': is_live,
'formats': formats,
'subtitles': subtitles,

View file

@ -91,6 +91,7 @@
from .videa import VideaIE
from .twentymin import TwentyMinutenIE
from .ustream import UstreamIE
from .arte import ArteTVEmbedIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
from .limelight import LimelightBaseIE
@ -2760,11 +2761,9 @@ def _real_extract(self, url):
return self.url_result(ustream_url, UstreamIE.ie_key())
# Look for embedded arte.tv player
mobj = re.search(
r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"',
webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'ArteTVEmbed')
arte_urls = ArteTVEmbedIE._extract_urls(webpage)
if arte_urls:
return self.playlist_from_matches(arte_urls, video_id, video_title)
# Look for embedded francetv player
mobj = re.search(

View file

@ -150,7 +150,7 @@ def run(self, target, ip, timestamp):
elif function in other_functions:
other_functions[function]()
else:
raise ExtractorError('Unknown funcion %s' % function)
raise ExtractorError('Unknown function %s' % function)
return sdk.target

View file

@ -0,0 +1,88 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
mimetype2ext,
try_get,
)
class LBRYIE(InfoExtractor):
IE_NAME = 'lbry.tv'
_VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[0-9a-zA-Z-]+:[0-9a-z]+/[0-9a-zA-Z().-]+:[0-9a-z])'
_TESTS = [{
# Video
'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1',
'md5': '65bd7ec1f6744ada55da8e4c48a2edf9',
'info_dict': {
'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d',
'ext': 'mp4',
'title': 'First day in LBRY? Start HERE!',
'description': 'md5:f6cb5c704b332d37f5119313c2c98f51',
'timestamp': 1595694354,
'upload_date': '20200725',
}
}, {
# Audio
'url': 'https://lbry.tv/@LBRYFoundation:0/Episode-1:e',
'md5': 'c94017d3eba9b49ce085a8fad6b98d00',
'info_dict': {
'id': 'e7d93d772bd87e2b62d5ab993c1c3ced86ebb396',
'ext': 'mp3',
'title': 'The LBRY Foundation Community Podcast Episode 1 - Introduction, Streaming on LBRY, Transcoding',
'description': 'md5:661ac4f1db09f31728931d7b88807a61',
'timestamp': 1591312601,
'upload_date': '20200604',
}
}, {
'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e',
'only_matching': True,
}]
def _call_api_proxy(self, method, display_id, params):
return self._download_json(
'https://api.lbry.tv/api/v1/proxy', display_id,
headers={'Content-Type': 'application/json-rpc'},
data=json.dumps({
'method': method,
'params': params,
}).encode())['result']
def _real_extract(self, url):
display_id = self._match_id(url).replace(':', '#')
uri = 'lbry://' + display_id
result = self._call_api_proxy(
'resolve', display_id, {'urls': [uri]})[uri]
result_value = result['value']
if result_value.get('stream_type') not in ('video', 'audio'):
raise ExtractorError('Unsupported URL', expected=True)
streaming_url = self._call_api_proxy(
'get', display_id, {'uri': uri})['streaming_url']
source = result_value.get('source') or {}
media = result_value.get('video') or result_value.get('audio') or {}
signing_channel = result_value.get('signing_channel') or {}
return {
'id': result['claim_id'],
'title': result_value['title'],
'thumbnail': try_get(result_value, lambda x: x['thumbnail']['url'], compat_str),
'description': result_value.get('description'),
'license': result_value.get('license'),
'timestamp': int_or_none(result.get('timestamp')),
'tags': result_value.get('tags'),
'width': int_or_none(media.get('width')),
'height': int_or_none(media.get('height')),
'duration': int_or_none(media.get('duration')),
'channel': signing_channel.get('name'),
'channel_id': signing_channel.get('claim_id'),
'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')),
'filesize': int_or_none(source.get('size')),
'url': streaming_url,
}

View file

@ -5,28 +5,26 @@
from .common import InfoExtractor
from ..utils import (
determine_ext,
int_or_none,
parse_duration,
remove_end,
clean_html,
merge_dicts,
)
class LRTIE(InfoExtractor):
IE_NAME = 'lrt.lt'
_VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)'
_VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P<path>/mediateka/irasas/(?P<id>[0-9]+))'
_TESTS = [{
# m3u8 download
'url': 'http://www.lrt.lt/mediateka/irasas/54391/',
'md5': 'fe44cf7e4ab3198055f2c598fc175cb0',
'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene',
'md5': '85cb2bb530f31d91a9c65b479516ade4',
'info_dict': {
'id': '54391',
'id': '2000127261',
'ext': 'mp4',
'title': 'Septynios Kauno dienos',
'description': 'md5:24d84534c7dc76581e59f5689462411a',
'duration': 1783,
'view_count': int,
'like_count': int,
'title': 'Greita ir gardu: Sicilijos įkvėpta klasikinių makaronų su baklažanais vakarienė',
'description': 'md5:ad7d985f51b0dc1489ba2d76d7ed47fa',
'duration': 3035,
'timestamp': 1604079000,
'upload_date': '20201030',
},
}, {
# direct mp3 download
@ -43,52 +41,35 @@ class LRTIE(InfoExtractor):
},
}]
def _extract_js_var(self, webpage, var_name, default):
return self._search_regex(
r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name,
webpage, var_name.replace('_', ' '), default, group=2)
def _real_extract(self, url):
video_id = self._match_id(url)
path, video_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, video_id)
title = remove_end(self._og_search_title(webpage), ' - LRT')
media_url = self._extract_js_var(webpage, 'main_url', path)
media = self._download_json(self._extract_js_var(
webpage, 'media_info_url',
'https://www.lrt.lt/servisai/stream_url/vod/media_info/'),
video_id, query={'url': media_url})
jw_data = self._parse_jwplayer_data(
media['playlist_item'], video_id, base_url=url)
formats = []
for _, file_url in re.findall(
r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage):
ext = determine_ext(file_url)
if ext not in ('m3u8', 'mp3'):
json_ld_data = self._search_json_ld(webpage, video_id)
tags = []
for tag in (media.get('tags') or []):
tag_name = tag.get('name')
if not tag_name:
continue
# mp3 served as m3u8 produces stuttered media file
if ext == 'm3u8' and '.mp3' in file_url:
continue
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
file_url, video_id, 'mp4', entry_protocol='m3u8_native',
fatal=False))
elif ext == 'mp3':
formats.append({
'url': file_url,
'vcodec': 'none',
})
self._sort_formats(formats)
tags.append(tag_name)
thumbnail = self._og_search_thumbnail(webpage)
description = self._og_search_description(webpage)
duration = parse_duration(self._search_regex(
r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1',
webpage, 'duration', default=None, group='duration'))
view_count = int_or_none(self._html_search_regex(
r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>',
webpage, 'view count', fatal=False, group='count'))
like_count = int_or_none(self._search_regex(
r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<',
webpage, 'like count', fatal=False, group='count'))
return {
'id': video_id,
'title': title,
'formats': formats,
'thumbnail': thumbnail,
'description': description,
'duration': duration,
'view_count': view_count,
'like_count': like_count,
clean_info = {
'description': clean_html(media.get('content')),
'tags': tags,
}
return merge_dicts(clean_info, jw_data, json_ld_data)

View file

@ -1,10 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import merge_dicts
from ..utils import (
clean_html,
dict_get,
float_or_none,
int_or_none,
merge_dicts,
parse_duration,
try_get,
)
class MallTVIE(InfoExtractor):
@ -17,7 +23,7 @@ class MallTVIE(InfoExtractor):
'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
'ext': 'mp4',
'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?',
'description': 'md5:25fc0ec42a72ba602b602c683fa29deb',
'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35',
'duration': 216,
'timestamp': 1538870400,
'upload_date': '20181007',
@ -37,20 +43,46 @@ def _real_extract(self, url):
webpage = self._download_webpage(
url, display_id, headers=self.geo_verification_headers())
SOURCE_RE = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b'
video = self._parse_json(self._search_regex(
r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);',
webpage, 'video object'), display_id)
video_source = video['VideoSource']
video_id = self._search_regex(
SOURCE_RE, webpage, 'video id', group='id')
r'/([\da-z]+)/index\b', video_source, 'video id')
media = self._parse_html5_media_entries(
url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id,
m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0]
formats = self._extract_m3u8_formats(
video_source + '.m3u8', video_id, 'mp4', 'm3u8_native')
self._sort_formats(formats)
subtitles = {}
for s in (video.get('Subtitles') or {}):
s_url = s.get('Url')
if not s_url:
continue
subtitles.setdefault(s.get('Language') or 'cz', []).append({
'url': s_url,
})
entity_counts = video.get('EntityCounts') or {}
def get_count(k):
v = entity_counts.get(k + 's') or {}
return int_or_none(dict_get(v, ('Count', 'StrCount')))
info = self._search_json_ld(webpage, video_id, default={})
return merge_dicts(media, info, {
return merge_dicts({
'id': video_id,
'display_id': display_id,
'title': self._og_search_title(webpage, default=None) or display_id,
'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
})
'title': video.get('Title'),
'description': clean_html(video.get('Description')),
'thumbnail': video.get('ThumbnailUrl'),
'formats': formats,
'subtitles': subtitles,
'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')),
'view_count': get_count('View'),
'like_count': get_count('Like'),
'dislike_count': get_count('Dislike'),
'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])),
'comment_count': get_count('Comment'),
}, info)

View file

@ -17,9 +17,8 @@
class MGTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html'
_VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html'
IE_DESC = '芒果TV'
_GEO_COUNTRIES = ['CN']
_TESTS = [{
'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html',
@ -34,14 +33,18 @@ class MGTVIE(InfoExtractor):
}, {
'url': 'http://www.mgtv.com/b/301817/3826653.html',
'only_matching': True,
}, {
'url': 'https://w.mgtv.com/b/301817/3826653.html',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
tk2 = base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1]
try:
api_data = self._download_json(
'https://pcweb.api.mgtv.com/player/video', video_id, query={
'tk2': base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1],
'tk2': tk2,
'video_id': video_id,
}, headers=self.geo_verification_headers())['data']
except ExtractorError as e:
@ -56,6 +59,7 @@ def _real_extract(self, url):
stream_data = self._download_json(
'https://pcweb.api.mgtv.com/player/getSource', video_id, query={
'pm2': api_data['atc']['pm2'],
'tk2': tk2,
'video_id': video_id,
}, headers=self.geo_verification_headers())['data']
stream_domain = stream_data['stream_domain'][0]

View file

@ -403,6 +403,18 @@ class MTVIE(MTVServicesInfoExtractor):
'only_matching': True,
}]
@staticmethod
def extract_child_with_type(parent, t):
children = parent['children']
return next(c for c in children if c.get('type') == t)
def _extract_mgid(self, webpage):
data = self._parse_json(self._search_regex(
r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None)
main_container = self.extract_child_with_type(data, 'MainContainer')
video_player = self.extract_child_with_type(main_container, 'VideoPlayer')
return video_player['props']['media']['video']['config']['uri']
class MTVJapanIE(MTVServicesInfoExtractor):
IE_NAME = 'mtvjapan'

View file

@ -10,7 +10,6 @@
from ..compat import compat_urllib_parse_unquote
from ..utils import (
int_or_none,
js_to_json,
parse_duration,
smuggle_url,
try_get,
@ -394,8 +393,8 @@ def _real_extract(self, url):
webpage = self._download_webpage(url, video_id)
data = self._parse_json(self._search_regex(
r'window\.__data\s*=\s*({.+});', webpage,
'bootstrap json'), video_id, js_to_json)
r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
webpage, 'bootstrap json'), video_id)['props']['initialState']
video_data = try_get(data, lambda x: x['video']['current'], dict)
if not video_data:
video_data = data['article']['content'][0]['primaryMedia']['video']

View file

@ -82,6 +82,29 @@ class NDRIE(NDRBaseIE):
'params': {
'skip_download': True,
},
}, {
# with subtitles
'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html',
'info_dict': {
'id': 'extra18674',
'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring',
'ext': 'mp4',
'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring',
'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6',
'uploader': 'ndrtv',
'upload_date': '20201113',
'duration': 1749,
'subtitles': {
'de': [{
'ext': 'ttml',
'url': r're:^https://www\.ndr\.de.+',
}],
},
},
'params': {
'skip_download': True,
},
'expected_warnings': ['Unable to download f4m manifest'],
}, {
'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html',
'only_matching': True,
@ -242,6 +265,20 @@ def _real_extract(self, url):
'preference': quality_key(thumbnail.get('quality')),
})
subtitles = {}
tracks = config.get('tracks')
if tracks and isinstance(tracks, list):
for track in tracks:
if not isinstance(track, dict):
continue
track_url = urljoin(url, track.get('src'))
if not track_url:
continue
subtitles.setdefault(track.get('srclang') or 'de', []).append({
'url': track_url,
'ext': 'ttml',
})
return {
'id': video_id,
'title': title,
@ -251,6 +288,7 @@ def _real_extract(self, url):
'duration': duration,
'thumbnails': thumbnails,
'formats': formats,
'subtitles': subtitles,
}

View file

@ -17,7 +17,7 @@
int_or_none,
parse_duration,
strip_or_none,
try_get,
unescapeHTML,
unified_strdate,
unified_timestamp,
update_url_query,
@ -30,7 +30,6 @@ class RaiBaseIE(InfoExtractor):
_UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
_GEO_COUNTRIES = ['IT']
_GEO_BYPASS = False
_BASE_URL = 'https://www.raiplay.it'
def _extract_relinker_info(self, relinker_url, video_id):
if not re.match(r'https?://', relinker_url):
@ -123,19 +122,40 @@ def _extract_subtitles(url, subtitle_url):
class RaiPlayIE(RaiBaseIE):
_VALID_URL = r'(?P<url>(?P<base>https?://(?:www\.)?raiplay\.it/.+?-)(?P<id>%s)(?P<ext>\.(?:html|json)))' % RaiBaseIE._UUID_RE
_VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.(?:html|json))' % RaiBaseIE._UUID_RE
_TESTS = [{
'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter',
'md5': '340aa3b7afb54bfd14a8c11786450d76',
'info_dict': {
'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66',
'ext': 'mp4',
'title': 'La Casa Bianca',
'alt_title': 'S2016 - Puntata del 23/10/2016',
'description': 'md5:a09d45890850458077d1f68bb036e0a5',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Rai 3',
'creator': 'Rai 3',
'duration': 3278,
'timestamp': 1477764300,
'upload_date': '20161029',
'series': 'La Casa Bianca',
'season': '2016',
},
'skip': 'This content is not available',
}, {
'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
'md5': '8970abf8caf8aef4696e7b1f2adfc696',
'info_dict': {
'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
'ext': 'mp4',
'title': 'Report del 07/04/2014',
'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014 ',
'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014',
'description': 'md5:d730c168a58f4bb35600fc2f881ec04e',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Rai Gulp',
'duration': 6160,
'series': 'Report',
'season': '2013/14',
},
'params': {
'skip_download': True,
@ -146,11 +166,10 @@ class RaiPlayIE(RaiBaseIE):
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
url, base, video_id, ext = mobj.group('url', 'base', 'id', 'ext')
url, video_id = re.match(self._VALID_URL, url).groups()
media = self._download_json(
'%s%s.json' % (base, video_id), video_id, 'Downloading video JSON')
url.replace('.html', '.json'), video_id, 'Downloading video JSON')
title = media['name']
video = media['video']
@ -159,34 +178,38 @@ def _real_extract(self, url):
self._sort_formats(relinker_info['formats'])
thumbnails = []
if 'images' in media:
for _, value in media.get('images').items():
if value:
thumbnails.append({
'url': urljoin(RaiBaseIE._BASE_URL, value.replace('[RESOLUTION]', '600x400'))
})
for _, value in media.get('images', {}).items():
if value:
thumbnails.append({
'url': urljoin(url, value),
})
timestamp = unified_timestamp(try_get(
media, lambda x: x['availabilities'][0]['start'], compat_str))
date_published = media.get('date_published')
time_published = media.get('time_published')
if date_published and time_published:
date_published += ' ' + time_published
subtitles = self._extract_subtitles(url, video.get('subtitles'))
program_info = media.get('program_info') or {}
season = media.get('season')
info = {
'id': video_id,
'title': self._live_title(title) if relinker_info.get(
'is_live') else title,
'alt_title': media.get('subtitle'),
'alt_title': strip_or_none(media.get('subtitle')),
'description': media.get('description'),
'uploader': strip_or_none(media.get('channel')),
'creator': strip_or_none(media.get('editor')),
'creator': strip_or_none(media.get('editor') or None),
'duration': parse_duration(video.get('duration')),
'timestamp': timestamp,
'timestamp': unified_timestamp(date_published),
'thumbnails': thumbnails,
'series': try_get(
media, lambda x: x['isPartOf']['name'], compat_str),
'season_number': int_or_none(try_get(
media, lambda x: x['isPartOf']['numeroStagioni'])),
'season': media.get('stagione') or None,
'series': program_info.get('name'),
'season_number': int_or_none(season),
'season': season if (season and not season.isdigit()) else None,
'episode': media.get('episode_title'),
'episode_number': int_or_none(media.get('episode')),
'subtitles': subtitles,
}
@ -203,7 +226,7 @@ class RaiPlayLiveIE(RaiBaseIE):
'display_id': 'rainews24',
'ext': 'mp4',
'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497',
'description': 'md5:6eca31500550f9376819f174e5644754',
'uploader': 'Rai News 24',
'creator': 'Rai News 24',
'is_live': True,
@ -216,32 +239,20 @@ class RaiPlayLiveIE(RaiBaseIE):
def _real_extract(self, url):
display_id = self._match_id(url)
media = self._download_json(
'%s.json' % urljoin(RaiBaseIE._BASE_URL, 'dirette/' + display_id),
display_id, 'Downloading channel JSON')
webpage = self._download_webpage(url, display_id)
title = media['name']
video = media['video']
video_id = media['id'].replace('ContentItem-', '')
video_id = self._search_regex(
r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE,
webpage, 'content id')
relinker_info = self._extract_relinker_info(video['content_url'], video_id)
self._sort_formats(relinker_info['formats'])
info = {
return {
'_type': 'url_transparent',
'ie_key': RaiPlayIE.ie_key(),
'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id,
'id': video_id,
'display_id': display_id,
'title': self._live_title(title) if relinker_info.get(
'is_live') else title,
'alt_title': media.get('subtitle'),
'description': media.get('description'),
'uploader': strip_or_none(media.get('channel')),
'creator': strip_or_none(media.get('editor')),
'duration': parse_duration(video.get('duration')),
}
info.update(relinker_info)
return info
class RaiPlayPlaylistIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)'
@ -250,7 +261,7 @@ class RaiPlayPlaylistIE(InfoExtractor):
'info_dict': {
'id': 'nondirloalmiocapo',
'title': 'Non dirlo al mio capo',
'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b',
'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86',
},
'playlist_mincount': 12,
}]
@ -258,25 +269,21 @@ class RaiPlayPlaylistIE(InfoExtractor):
def _real_extract(self, url):
playlist_id = self._match_id(url)
media = self._download_json(
'%s.json' % urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id),
playlist_id, 'Downloading program JSON')
webpage = self._download_webpage(url, playlist_id)
title = media['name']
description = media['program_info']['description']
content_sets = [s['id'] for b in media['blocks'] for s in b['sets']]
title = self._html_search_meta(
('programma', 'nomeProgramma'), webpage, 'title')
description = unescapeHTML(self._html_search_meta(
('description', 'og:description'), webpage, 'description'))
entries = []
for cs in content_sets:
medias = self._download_json(
'%s/%s.json' % (urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id), cs),
cs, 'Downloading content set JSON')
for m in medias['items']:
video_url = urljoin(url, m['path_id'])
entries.append(self.url_result(
video_url, ie=RaiPlayIE.ie_key(),
video_id=RaiPlayIE._match_id(video_url)))
for mobj in re.finditer(
r'<a\b[^>]+\bhref=(["\'])(?P<path>/raiplay/video/.+?)\1',
webpage):
video_url = urljoin(url, mobj.group('path'))
entries.append(self.url_result(
video_url, ie=RaiPlayIE.ie_key(),
video_id=RaiPlayIE._match_id(video_url)))
return self.playlist_result(entries, playlist_id, title, description)
@ -294,7 +301,8 @@ class RaiIE(RaiBaseIE):
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 1758,
'upload_date': '20140612',
}
},
'skip': 'This content is available only in Italy',
}, {
# with ContentItem in many metas
'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html',
@ -320,6 +328,19 @@ class RaiIE(RaiBaseIE):
'duration': 2214,
'upload_date': '20161103',
}
}, {
# drawMediaRaiTV(...)
'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
'md5': '2dd727e61114e1ee9c47f0da6914e178',
'info_dict': {
'id': '59d69d28-6bb6-409d-a4b5-ed44096560af',
'ext': 'mp4',
'title': 'Il pacco',
'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20141221',
},
'skip': 'This content is not available',
}, {
# initEdizione('ContentItem-...'
'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',
@ -331,6 +352,18 @@ class RaiIE(RaiBaseIE):
'upload_date': '20170401',
},
'skip': 'Changes daily',
}, {
# HDS live stream with only relinker URL
'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews',
'info_dict': {
'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc',
'ext': 'flv',
'title': 'EuroNews',
},
'params': {
'skip_download': True,
},
'skip': 'This content is available only in Italy',
}, {
# HLS live stream with ContentItem in og:url
'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',

View file

@ -1,9 +1,15 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
float_or_none,
int_or_none,
unified_timestamp,
urlencode_postdata,
url_or_none,
)
class ServusIE(InfoExtractor):
@ -12,20 +18,29 @@ class ServusIE(InfoExtractor):
(?:www\.)?
(?:
servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)|
servustv\.com/videos
(?:servustv|pm-wissen)\.com/videos
)
/(?P<id>[aA]{2}-\w+|\d+-\d+)
'''
_TESTS = [{
# new URL schema
'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/',
'md5': '3e1dd16775aa8d5cbef23628cfffc1f4',
'md5': '60474d4c21f3eb148838f215c37f02b9',
'info_dict': {
'id': 'AA-1T6VBU5PW1W12',
'ext': 'mp4',
'title': 'Die Grünen aus Sicht des Volkes',
'alt_title': 'Talk im Hangar-7 Voxpops Gruene',
'description': 'md5:1247204d85783afe3682644398ff2ec4',
'thumbnail': r're:^https?://.*\.jpg',
'duration': 62.442,
'timestamp': 1605193976,
'upload_date': '20201112',
'series': 'Talk im Hangar-7',
'season': 'Season 9',
'season_number': 9,
'episode': 'Episode 31 - September 14',
'episode_number': 31,
}
}, {
# old URL schema
@ -40,30 +55,94 @@ class ServusIE(InfoExtractor):
}, {
'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/',
'only_matching': True,
}, {
'url': 'https://www.pm-wissen.com/videos/aa-24mus4g2w2112/',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url).upper()
webpage = self._download_webpage(url, video_id)
title = self._search_regex(
(r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'),
webpage, 'title', default=None,
group='title') or self._og_search_title(webpage)
title = re.sub(r'\s*-\s*Servus TV\s*$', '', title)
description = self._og_search_description(webpage)
thumbnail = self._og_search_thumbnail(webpage)
token = self._download_json(
'https://auth.redbullmediahouse.com/token', video_id,
'Downloading token', data=urlencode_postdata({
'grant_type': 'client_credentials',
}), headers={
'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==',
})
access_token = token['access_token']
token_type = token.get('token_type', 'Bearer')
formats = self._extract_m3u8_formats(
'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id,
video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
video = self._download_json(
'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id,
video_id, 'Downloading video JSON', headers={
'Authorization': '%s %s' % (token_type, access_token),
})
formats = []
thumbnail = None
for resource in video['resources']:
if not isinstance(resource, dict):
continue
format_url = url_or_none(resource.get('url'))
if not format_url:
continue
extension = resource.get('extension')
type_ = resource.get('type')
if extension == 'jpg' or type_ == 'reference_keyframe':
thumbnail = format_url
continue
ext = determine_ext(format_url)
if type_ == 'dash' or ext == 'mpd':
formats.extend(self._extract_mpd_formats(
format_url, video_id, mpd_id='dash', fatal=False))
elif type_ == 'hls' or ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
format_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id='hls', fatal=False))
elif extension == 'mp4' or ext == 'mp4':
formats.append({
'url': format_url,
'format_id': type_,
'width': int_or_none(resource.get('width')),
'height': int_or_none(resource.get('height')),
})
self._sort_formats(formats)
attrs = {}
for attribute in video['attributes']:
if not isinstance(attribute, dict):
continue
key = attribute.get('fieldKey')
value = attribute.get('fieldValue')
if not key or not value:
continue
attrs[key] = value
title = attrs.get('title_stv') or video_id
alt_title = attrs.get('title')
description = attrs.get('long_description') or attrs.get('short_description')
series = attrs.get('label')
season = attrs.get('season')
episode = attrs.get('chapter')
duration = float_or_none(attrs.get('duration'), scale=1000)
season_number = int_or_none(self._search_regex(
r'Season (\d+)', season or '', 'season number', default=None))
episode_number = int_or_none(self._search_regex(
r'Episode (\d+)', episode or '', 'episode number', default=None))
return {
'id': video_id,
'title': title,
'alt_title': alt_title,
'description': description,
'thumbnail': thumbnail,
'duration': duration,
'timestamp': unified_timestamp(video.get('lastPublished')),
'series': series,
'season': season,
'season_number': season_number,
'episode': episode,
'episode_number': episode_number,
'formats': formats,
}

View file

@ -1,159 +1,54 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from .nexx import (
NexxIE,
NexxEmbedIE,
)
from .spiegeltv import SpiegeltvIE
from ..compat import compat_urlparse
from ..utils import (
parse_duration,
strip_or_none,
unified_timestamp,
)
from .jwplatform import JWPlatformIE
class SpiegelIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$'
_UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
_VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE
_TESTS = [{
'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
'md5': 'b57399839d055fccfeb9a0455c439868',
'md5': '50c7948883ec85a3e431a0a44b7ad1d6',
'info_dict': {
'id': '563747',
'id': 'II0BUyxY',
'display_id': '1259285',
'ext': 'mp4',
'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv',
'title': 'Vulkan Tungurahua in Ecuador ist wieder aktiv - DER SPIEGEL - Wissenschaft',
'description': 'md5:8029d8310232196eb235d27575a8b9f4',
'duration': 49,
'duration': 48.0,
'upload_date': '20130311',
'timestamp': 1362994320,
'timestamp': 1362997920,
},
}, {
'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
'md5': '5b6c2f4add9d62912ed5fc78a1faed80',
'info_dict': {
'id': '580988',
'ext': 'mp4',
'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers',
'description': 'md5:c2322b65e58f385a820c10fa03b2d088',
'duration': 983,
'upload_date': '20131115',
'timestamp': 1384546642,
},
}, {
'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html',
'md5': '97b91083a672d72976faa8433430afb9',
'info_dict': {
'id': '601883',
'ext': 'mp4',
'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.',
'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"',
'upload_date': '20140904',
'timestamp': 1409834160,
}
}, {
'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html',
'only_matching': True,
}, {
# nexx video
'url': 'https://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html',
'only_matching': True,
}, {
'url': 'https://www.spiegel.de/panorama/urteile-im-goldmuenzenprozess-haftstrafen-fuer-clanmitglieder-a-aae8df48-43c1-4c61-867d-23f0a2d254b7',
'only_matching': True,
}, {
'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html',
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id
handle = self._request_webpage(metadata_url, video_id)
# 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html
if SpiegeltvIE.suitable(handle.geturl()):
return self.url_result(handle.geturl(), 'Spiegeltv')
video_data = self._parse_json(self._webpage_read_content(
handle, metadata_url, video_id), video_id)
title = video_data['title']
nexx_id = video_data['nexxOmniaId']
domain_id = video_data.get('nexxOmniaDomain') or '748'
return {
'_type': 'url_transparent',
'id': video_id,
'url': 'nexx:%s:%s' % (domain_id, nexx_id),
'title': title,
'description': strip_or_none(video_data.get('teaser')),
'duration': parse_duration(video_data.get('duration')),
'timestamp': unified_timestamp(video_data.get('datum')),
'ie_key': NexxIE.ie_key(),
}
class SpiegelArticleIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html'
IE_NAME = 'Spiegel:Article'
IE_DESC = 'Articles on spiegel.de'
_TESTS = [{
}, {
'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html',
'info_dict': {
'id': '1516455',
'ext': 'mp4',
'title': 'Faszination Badminton: Nennt es bloß nicht Federball',
'description': 're:^Patrick Kämnitz gehört.{100,}',
'upload_date': '20140825',
},
}, {
'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html',
'info_dict': {
},
'playlist_count': 6,
}, {
# Nexx iFrame embed
'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html',
'info_dict': {
'id': '161464',
'ext': 'mp4',
'title': 'Nervenkitzel Achterbahn',
'alt_title': 'Karussellbauer in Deutschland',
'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
'release_year': 2005,
'creator': 'SPIEGEL TV',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 2761,
'timestamp': 1394021479,
'upload_date': '20140305',
},
'params': {
'format': 'bestvideo',
'skip_download': True,
},
'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
# Single video on top of the page
video_link = self._search_regex(
r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage,
'video page URL', default=None)
if video_link:
video_url = compat_urlparse.urljoin(
self.http_scheme() + '//spiegel.de/', video_link)
return self.url_result(video_url)
# Multiple embedded videos
embeds = re.findall(
r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"',
webpage)
entries = [
self.url_result(compat_urlparse.urljoin(
self.http_scheme() + '//spiegel.de/', embed_path))
for embed_path in embeds]
if embeds:
return self.playlist_result(entries)
return self.playlist_from_matches(
NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key())
media_id = self._html_search_regex(
r'(&#34;|["\'])mediaId\1\s*:\s*(&#34;|["\'])(?P<id>(?:(?!\2).)+)\2',
webpage, 'media id', group='id')
return {
'_type': 'url_transparent',
'id': video_id,
'display_id': video_id,
'url': 'jwplatform:%s' % media_id,
'title': self._og_search_title(webpage, default=None),
'ie_key': JWPlatformIE.ie_key(),
}

View file

@ -8,8 +8,8 @@
class TwentyThreeVideoIE(InfoExtractor):
IE_NAME = '23video'
_VALID_URL = r'https?://video\.(?P<domain>twentythree\.net|23video\.com|filmweb\.no)/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)'
_TEST = {
_VALID_URL = r'https?://(?P<domain>[^.]+\.(?:twentythree\.net|23video\.com|filmweb\.no))/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)'
_TESTS = [{
'url': 'https://video.twentythree.net/v.ihtml/player.html?showDescriptions=0&source=site&photo%5fid=20448876&autoPlay=1',
'md5': '75fcf216303eb1dae9920d651f85ced4',
'info_dict': {
@ -21,11 +21,14 @@ class TwentyThreeVideoIE(InfoExtractor):
'uploader_id': '12258964',
'uploader': 'Rasmus Bysted',
}
}
}, {
'url': 'https://bonnier-publications-danmark.23video.com/v.ihtml/player.html?token=f0dc46476e06e13afd5a1f84a29e31e8&source=embed&photo%5fid=36137620',
'only_matching': True,
}]
def _real_extract(self, url):
domain, query, photo_id = re.match(self._VALID_URL, url).groups()
base_url = 'https://video.%s' % domain
base_url = 'https://%s' % domain
photo_data = self._download_json(
base_url + '/api/photo/list?' + query, photo_id, query={
'format': 'json',

View file

@ -2,8 +2,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import unified_timestamp
import re
from ..utils import (
dict_get,
int_or_none,
unified_timestamp,
)
class URPlayIE(InfoExtractor):
@ -14,7 +17,7 @@ class URPlayIE(InfoExtractor):
'info_dict': {
'id': '203704',
'ext': 'mp4',
'title': 'Om vetenskap, kritiskt tänkande och motstånd',
'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd',
'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a',
'timestamp': 1513292400,
'upload_date': '20171214',
@ -26,7 +29,7 @@ class URPlayIE(InfoExtractor):
'ext': 'mp4',
'title': 'Tripp, Trapp, Träd : Sovkudde',
'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1',
'timestamp': 1440093600,
'timestamp': 1440086400,
'upload_date': '20150820',
},
}, {
@ -36,28 +39,27 @@ class URPlayIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
url = url.replace('skola.se/Produkter', 'play.se/program')
webpage = self._download_webpage(url, video_id)
urplayer_data = re.sub("&quot;", "\"", self._search_regex(
r'components\/Player\/Player\" data-react-props=\"({.+?})\"',
webpage, 'urplayer data'))
urplayer_data = self._parse_json(urplayer_data, video_id)
for i in range(len(urplayer_data['accessibleEpisodes'])):
if urplayer_data.get('accessibleEpisodes', {})[i].get('id') == int(video_id):
urplayer_data = urplayer_data['accessibleEpisodes'][i]
break
urplayer_data = self._parse_json(self._html_search_regex(
r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"',
webpage, 'urplayer data'), video_id)['currentProduct']
episode = urplayer_data['title']
host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect']
formats = []
urplayer_streams = urplayer_data.get("streamingInfo")
for quality in ('sd'), ('hd'):
location = (urplayer_streams.get("raw", {}).get(quality, {}).get("location")
or urplayer_streams.get("sweComplete", {}).get(quality, {}).get("location"))
if location:
urplayer_streams = urplayer_data.get('streamingInfo', {})
for k, v in urplayer_streams.get('raw', {}).items():
if not (k in ('sd', 'hd') and isinstance(v, dict)):
continue
file_http = v.get('location')
if file_http:
formats.extend(self._extract_wowza_formats(
'http://%s/%s/playlist.m3u8' % (host, location), video_id,
skip_protocols=['f4m', 'rtmp', 'rtsp']))
'http://%s/%splaylist.m3u8' % (host, file_http),
video_id, skip_protocols=['f4m', 'rtmp', 'rtsp']))
self._sort_formats(formats)
subtitles = {}
subs = urplayer_streams.get("sweComplete", {}).get("tt", {}).get("location")
if subs:
@ -65,14 +67,37 @@ def _real_extract(self, url):
'url': subs,
})
image = urplayer_data.get('image') or {}
thumbnails = []
for k, v in image.items():
t = {
'id': k,
'url': v,
}
wh = k.split('x')
if len(wh) == 2:
t.update({
'width': int_or_none(wh[0]),
'height': int_or_none(wh[1]),
})
thumbnails.append(t)
series = urplayer_data.get('series') or {}
series_title = dict_get(series, ('seriesTitle', 'title')) or dict_get(urplayer_data, ('seriesTitle', 'mainTitle'))
return {
'id': video_id,
'title': urplayer_data['title'],
'description': self._og_search_description(webpage),
'thumbnail': urplayer_data.get('image', {}).get('1280x720'),
'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'),
webpage, 'timestamp')),
'series': urplayer_data.get('seriesTitle'),
'subtitles': subtitles,
'title': '%s : %s' % (series_title, episode) if series_title else episode,
'description': urplayer_data.get('description'),
'thumbnails': thumbnails,
'timestamp': unified_timestamp(urplayer_data.get('publishedAt')),
'series': series_title,
'formats': formats,
'duration': int_or_none(urplayer_data.get('duration')),
'categories': urplayer_data.get('categories'),
'tags': urplayer_data.get('keywords'),
'season': series.get('label'),
'episode': episode,
'episode_number': int_or_none(urplayer_data.get('episodeNumber')),
}

View file

@ -1,74 +1,24 @@
# coding: utf-8
from __future__ import unicode_literals
from .adobepass import AdobePassIE
from ..utils import (
NO_DEFAULT,
smuggle_url,
update_url_query,
)
from .nbc import NBCIE
class USANetworkIE(AdobePassIE):
_VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P<id>[^/?#]+)'
_TEST = {
'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity',
'md5': '33c0d2ba381571b414024440d08d57fd',
class USANetworkIE(NBCIE):
_VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/(?:[^/]+/videos?|movies?)/(?:[^/]+/)?(?P<id>\d+))'
_TESTS = [{
'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302',
'info_dict': {
'id': '3086229',
'id': '4185302',
'ext': 'mp4',
'title': 'HPE Cybersecurity',
'description': 'The more we digitize our world, the more vulnerable we are.',
'upload_date': '20160818',
'timestamp': 1471535460,
'uploader': 'NBCU-USA',
'title': 'Intelligence (Trailer)',
'description': 'A maverick NSA agent enlists the help of a junior systems analyst in a workplace power grab.',
'upload_date': '20200715',
'timestamp': 1594785600,
'uploader': 'NBCU-MPAT',
},
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
def _x(name, default=NO_DEFAULT):
return self._search_regex(
r'data-%s\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % name,
webpage, name, default=default, group='value')
video_id = _x('mpx-guid')
title = _x('episode-title')
mpx_account_id = _x('mpx-account-id', '2304992029')
query = {
'mbr': 'true',
}
if _x('is-full-episode', None) == '1':
query['manifest'] = 'm3u'
if _x('is-entitlement', None) == '1':
adobe_pass = {}
drupal_settings = self._search_regex(
r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
webpage, 'drupal settings', fatal=False)
if drupal_settings:
drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False)
if drupal_settings:
adobe_pass = drupal_settings.get('adobePass', {})
resource = self._get_mvpd_resource(
adobe_pass.get('adobePassResourceId', 'usa'),
title, video_id, _x('episode-rating', 'TV-14'))
query['auth'] = self._extract_mvpd_auth(
url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource)
info = self._search_json_ld(webpage, video_id, default={})
info.update({
'_type': 'url_transparent',
'url': smuggle_url(update_url_query(
'http://link.theplatform.com/s/HNK2IC/media/guid/%s/%s' % (mpx_account_id, video_id),
query), {'force_smil_url': True}),
'id': video_id,
'title': title,
'series': _x('show-title', None),
'episode': title,
'ie_key': 'ThePlatform',
})
return info
'params': {
# m3u8 download
'skip_download': True,
},
}]

View file

@ -19,7 +19,7 @@
class UstreamIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)'
IE_NAME = 'ustream'
_TESTS = [{
'url': 'http://www.ustream.tv/recorded/20274954',
@ -67,12 +67,15 @@ class UstreamIE(InfoExtractor):
'params': {
'skip_download': True, # m3u8 download
},
}, {
'url': 'https://video.ibm.com/embed/recorded/128240221?&autoplay=true&controls=true&volume=100',
'only_matching': True,
}]
@staticmethod
def _extract_url(webpage):
mobj = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage)
if mobj is not None:
return mobj.group('url')

View file

@ -946,10 +946,13 @@ def _fetch_page(self, album_id, authorizaion, hashed_pass, page):
def _real_extract(self, url):
album_id = self._match_id(url)
webpage = self._download_webpage(url, album_id)
viewer = self._parse_json(self._search_regex(
r'bootstrap_data\s*=\s*({.+?})</script>',
webpage, 'bootstrap data'), album_id)['viewer']
viewer = self._download_json(
'https://vimeo.com/_rv/viewer', album_id, fatal=False)
if not viewer:
webpage = self._download_webpage(url, album_id)
viewer = self._parse_json(self._search_regex(
r'bootstrap_data\s*=\s*({.+?})</script>',
webpage, 'bootstrap data'), album_id)['viewer']
jwt = viewer['jwt']
album = self._download_json(
'https://api.vimeo.com/albums/' + album_id,

View file

@ -4,52 +4,48 @@
import re
import time
import itertools
import json
from .common import InfoExtractor
from .naver import NaverBaseIE
from ..compat import compat_str
from ..compat import (
compat_HTTPError,
compat_str,
)
from ..utils import (
ExtractorError,
int_or_none,
merge_dicts,
try_get,
urlencode_postdata,
)
class VLiveIE(NaverBaseIE):
class VLiveBaseIE(NaverBaseIE):
_APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
class VLiveIE(VLiveBaseIE):
IE_NAME = 'vlive'
_VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|post)/(?P<id>(?:\d-)?[0-9]+)'
_VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)'
_NETRC_MACHINE = 'vlive'
_TESTS = [{
'url': 'https://www.vlive.tv/video/1326',
'url': 'http://www.vlive.tv/video/1326',
'md5': 'cc7314812855ce56de70a06a27314983',
'info_dict': {
'id': '1326',
'ext': 'mp4',
'title': "[V LIVE] Girl's Day's Broadcast",
'title': "Girl's Day's Broadcast",
'creator': "Girl's Day",
'view_count': int,
'uploader_id': 'muploader_a',
},
},
{
'url': 'https://vlive.tv/post/1-18244258',
'md5': 'cc7314812855ce56de70a06a27314983',
'info_dict': {
'id': '1326',
'ext': 'mp4',
'title': "[V LIVE] Girl's Day's Broadcast",
'creator': "Girl's Day",
'view_count': int,
'uploader_id': 'muploader_a',
},
},
{
'url': 'https://www.vlive.tv/video/16937',
}, {
'url': 'http://www.vlive.tv/video/16937',
'info_dict': {
'id': '16937',
'ext': 'mp4',
'title': '[V LIVE] 첸백시 걍방',
'title': '첸백시 걍방',
'creator': 'EXO',
'view_count': int,
'subtitles': 'mincount:12',
@ -70,12 +66,11 @@ class VLiveIE(NaverBaseIE):
'subtitles': 'mincount:10',
},
'skip': 'This video is only available for CH+ subscribers',
}, {
'url': 'https://www.vlive.tv/embed/1326',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url)
def _real_initialize(self):
self._login()
@ -107,118 +102,82 @@ def is_logged_in():
if not is_logged_in():
raise ExtractorError('Unable to log in', expected=True)
def _call_api(self, path_template, video_id, fields=None):
query = {'appId': self._APP_ID}
if fields:
query['fields'] = fields
return self._download_json(
'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id,
'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0],
headers={'Referer': 'https://www.vlive.tv/'}, query=query)
def _real_extract(self, url):
# url may match on a post or a video url with a post_id potentially matching a video_id
working_id = self._match_id(url)
webpage = self._download_webpage(url, working_id)
video_id = self._match_id(url)
PARAMS_RE = r'window\.__PRELOADED_STATE__\s*=\s*({.*});?\s*</script>'
PARAMS_FIELD = 'params'
try:
post = self._call_api(
'post/v1.0/officialVideoPost-%s', video_id,
'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId}')
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
self.raise_login_required(json.loads(e.cause.read().decode())['message'])
raise
params = self._search_regex(
PARAMS_RE, webpage, PARAMS_FIELD, default='', flags=re.DOTALL)
params = self._parse_json(params, working_id, fatal=False)
video = post['officialVideo']
video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"], dict)
def get_common_fields():
channel = post.get('channel') or {}
return {
'title': video.get('title'),
'creator': post.get('author', {}).get('nickname'),
'channel': channel.get('channelName'),
'channel_id': channel.get('channelCode'),
'duration': int_or_none(video.get('playTime')),
'view_count': int_or_none(video.get('playCount')),
'like_count': int_or_none(video.get('likeCount')),
'comment_count': int_or_none(video.get('commentCount')),
}
if video_params is None:
error = try_get(params, lambda x: x["postDetail"]["error"], dict)
error_data = try_get(error, lambda x: x["data"], dict)
error_video = try_get(error_data, lambda x: x["officialVideo"], dict)
error_msg = try_get(error, lambda x: x["message"], compat_str)
product_type = try_get(error_data,
[lambda x: x["officialVideo"]["productType"],
lambda x: x["board"]["boardType"]],
compat_str)
if error_video is not None:
if product_type in ('VLIVE_PLUS', 'VLIVE+'):
self.raise_login_required('This video is only available with V LIVE+.')
elif error_msg is not None:
raise ExtractorError('V LIVE reported the following error: %s' % error_msg)
else:
raise ExtractorError('Failed to extract video parameters.')
elif 'post' in url:
raise ExtractorError('Url does not appear to be a video post.', expected=True)
else:
raise ExtractorError('Failed to extract video parameters.')
video_id = working_id if 'video' in url else str(video_params["videoSeq"])
video_type = video_params["type"]
if video_type in ('VOD'):
encoding_status = video_params["encodingStatus"]
if encoding_status == 'COMPLETE':
return self._replay(video_id, webpage, params, video_params)
else:
raise ExtractorError('VOD encoding not yet complete. Please try again later.',
expected=True)
elif video_type in ('LIVE'):
video_status = video_params["status"]
if video_status in ('RESERVED'):
video_type = video.get('type')
if video_type == 'VOD':
inkey = self._call_api('video/v1.0/vod/%s/inkey', video_id)['inkey']
vod_id = video['vodId']
return merge_dicts(
get_common_fields(),
self._extract_video_info(video_id, vod_id, inkey))
elif video_type == 'LIVE':
status = video.get('status')
if status == 'ON_AIR':
stream_url = self._call_api(
'old/v3/live/%s/playInfo',
video_id)['result']['adaptiveStreamUrl']
formats = self._extract_m3u8_formats(stream_url, video_id, 'mp4')
info = get_common_fields()
info.update({
'title': self._live_title(video['title']),
'id': video_id,
'formats': formats,
'is_live': True,
})
return info
elif status == 'ENDED':
raise ExtractorError(
'Uploading for replay. Please wait...', expected=True)
elif status == 'RESERVED':
raise ExtractorError('Coming soon!', expected=True)
elif video_status in ('ENDED', 'END'):
raise ExtractorError('Uploading for replay. Please wait...', expected=True)
elif video.get('exposeStatus') == 'CANCEL':
raise ExtractorError(
'We are sorry, but the live broadcast has been canceled.',
expected=True)
else:
return self._live(video_id, webpage, params)
else:
raise ExtractorError('Unknown video type %s' % video_type)
def _get_common_fields(self, webpage, params):
title = self._og_search_title(webpage)
description = self._html_search_meta(
['og:description', 'description', 'twitter:description'],
webpage, 'description', default=None)
creator = (try_get(params, lambda x: x["channel"]["channel"]["channelName"], compat_str)
or self._search_regex(r'on (.*) channel', description or '', 'creator', fatal=False))
thumbnail = self._og_search_thumbnail(webpage)
return {
'title': title,
'creator': creator,
'thumbnail': thumbnail,
}
def _live(self, video_id, webpage, params):
LIVE_INFO_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/old/v3/live/%s/playInfo' % video_id
play_info = self._download_json(LIVE_INFO_ENDPOINT, video_id,
headers={"referer": "https://www.vlive.tv"})
streams = try_get(play_info, lambda x: x["result"]["streamList"], list) or []
formats = []
for stream in streams:
formats.extend(self._extract_m3u8_formats(
stream['serviceUrl'], video_id, 'mp4',
fatal=False, live=True))
self._sort_formats(formats)
info = self._get_common_fields(webpage, params)
info.update({
'title': self._live_title(info['title']),
'id': video_id,
'formats': formats,
'is_live': True,
})
return info
def _replay(self, video_id, webpage, params, video_params):
long_video_id = video_params["vodId"]
VOD_KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id
key_json = self._download_json(VOD_KEY_ENDPOINT, video_id,
headers={"referer": "https://www.vlive.tv"})
key = key_json["inkey"]
return merge_dicts(
self._get_common_fields(webpage, params),
self._extract_video_info(video_id, long_video_id, key))
raise ExtractorError('Unknown status ' + status)
class VLiveChannelIE(InfoExtractor):
class VLiveChannelIE(VLiveBaseIE):
IE_NAME = 'vlive:channel'
_VALID_URL = r'https?://(?:(?:www|m)\.)?(?:channels\.vlive\.tv/|vlive\.tv/channels?/)(?P<id>[0-9A-Z]+)'
_VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)'
_TESTS = [{
'url': 'https://channels.vlive.tv/FCD4B',
'url': 'http://channels.vlive.tv/FCD4B',
'info_dict': {
'id': 'FCD4B',
'title': 'MAMAMOO',
@ -226,63 +185,39 @@ class VLiveChannelIE(InfoExtractor):
'playlist_mincount': 110
}, {
'url': 'https://www.vlive.tv/channel/FCD4B',
'info_dict': {
'id': 'FCD4B',
'title': 'MAMAMOO',
},
'playlist_mincount': 110
'only_matching': True,
}]
_APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
def _call_api(self, path, channel_key_suffix, channel_value, note, query):
q = {
'app_id': self._APP_ID,
'channel' + channel_key_suffix: channel_value,
}
q.update(query)
return self._download_json(
'http://api.vfan.vlive.tv/vproxy/channelplus/' + path,
channel_value, note='Downloading ' + note, query=q)['result']
def _real_extract(self, url):
channel_code = self._match_id(url)
webpage = self._download_webpage(
'http://channels.vlive.tv/%s/video' % channel_code, channel_code)
channel_seq = self._call_api(
'decodeChannelCode', 'Code', channel_code,
'decode channel code', {})['channelSeq']
app_id = None
app_js_url = self._search_regex(
r'<script[^>]+src=(["\'])(?P<url>http.+?/app\.js.*?)\1',
webpage, 'app js', default=None, group='url')
if app_js_url:
app_js = self._download_webpage(
app_js_url, channel_code, 'Downloading app JS', fatal=False)
if app_js:
app_id = self._search_regex(
r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]',
app_js, 'app id', default=None)
app_id = app_id or self._APP_ID
channel_info = self._download_json(
'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode',
channel_code, note='Downloading decode channel code',
query={
'app_id': app_id,
'channelCode': channel_code,
'_': int(time.time())
})
channel_seq = channel_info['result']['channelSeq']
channel_name = None
entries = []
for page_num in itertools.count(1):
video_list = self._download_json(
'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList',
channel_code, note='Downloading channel list page #%d' % page_num,
query={
'app_id': app_id,
'channelSeq': channel_seq,
video_list = self._call_api(
'getChannelVideoList', 'Seq', channel_seq,
'channel list page #%d' % page_num, {
# Large values of maxNumOfRows (~300 or above) may cause
# empty responses (see [1]), e.g. this happens for [2] that
# has more than 300 videos.
# 1. https://github.com/ytdl-org/youtube-dl/issues/13830
# 2. http://channels.vlive.tv/EDBF.
'maxNumOfRows': 100,
'_': int(time.time()),
'pageNo': page_num
}
)
@ -290,11 +225,11 @@ def _real_extract(self, url):
if not channel_name:
channel_name = try_get(
video_list,
lambda x: x['result']['channelInfo']['channelName'],
lambda x: x['channelInfo']['channelName'],
compat_str)
videos = try_get(
video_list, lambda x: x['result']['videoList'], list)
video_list, lambda x: x['videoList'], list)
if not videos:
break
@ -312,7 +247,9 @@ def _real_extract(self, url):
entries, channel_code, channel_name)
class VLivePlaylistIE(InfoExtractor):
# old extractor. Rewrite?
class VLivePlaylistIE(VLiveBaseIE):
IE_NAME = 'vlive:playlist'
_VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)'
_VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s'

View file

@ -5,7 +5,6 @@
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
js_to_json,
orderedSet,
@ -34,7 +33,7 @@ class XTubeIE(InfoExtractor):
'title': 'strange erotica',
'description': 'contains:an ET kind of thing',
'uploader': 'greenshowers',
'duration': 449,
'duration': 450,
'view_count': int,
'comment_count': int,
'age_limit': 18,
@ -74,24 +73,16 @@ def _real_extract(self, url):
title, thumbnail, duration = [None] * 3
json_config_string = self._search_regex(
r'playerConf=({.+?}),loaderConf',
webpage, 'config', default=None)
if not json_config_string:
raise ExtractorError("Could not extract video player data")
json_config_string = json_config_string.replace("!0", "true").replace("!1", "false")
config = self._parse_json(json_config_string, video_id, transform_source=js_to_json, fatal=False)
if not config:
raise ExtractorError("Could not extract video player data")
config = config.get('mainRoll')
if isinstance(config, dict):
title = config.get('title')
thumbnail = config.get('poster')
duration = int_or_none(config.get('duration'))
sources = config.get('sources') or config.get('format')
config = self._parse_json(self._search_regex(
r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config',
default='{}'), video_id, transform_source=js_to_json, fatal=False)
if config:
config = config.get('mainRoll')
if isinstance(config, dict):
title = config.get('title')
thumbnail = config.get('poster')
duration = int_or_none(config.get('duration'))
sources = config.get('sources') or config.get('format')
if not isinstance(sources, dict):
sources = self._parse_json(self._search_regex(

View file

@ -29,7 +29,6 @@ class YouPornIE(InfoExtractor):
'upload_date': '20101217',
'average_rating': int,
'view_count': int,
'comment_count': int,
'categories': list,
'tags': list,
'age_limit': 18,
@ -48,7 +47,6 @@ class YouPornIE(InfoExtractor):
'upload_date': '20110418',
'average_rating': int,
'view_count': int,
'comment_count': int,
'categories': list,
'tags': list,
'age_limit': 18,
@ -156,7 +154,8 @@ def _real_extract(self, url):
r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>',
webpage, 'uploader', fatal=False)
upload_date = unified_strdate(self._html_search_regex(
[r'Date\s+[Aa]dded:\s*<span>([^<]+)',
[r'UPLOADED:\s*<span>([^<]+)',
r'Date\s+[Aa]dded:\s*<span>([^<]+)',
r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'],
webpage, 'upload date', fatal=False))
@ -171,7 +170,7 @@ def _real_extract(self, url):
webpage, 'view count', fatal=False, group='count'))
comment_count = str_to_int(self._search_regex(
r'>All [Cc]omments? \(([\d,.]+)\)',
webpage, 'comment count', fatal=False))
webpage, 'comment count', default=None))
def extract_tag_box(regex, title):
tag_box = self._search_regex(regex, webpage, title, default=None)

File diff suppressed because it is too large Load diff

View file

@ -4085,7 +4085,7 @@ def fix_kv(m):
v = m.group(0)
if v in ('true', 'false', 'null'):
return v
elif v.startswith('/*') or v.startswith('//') or v == ',':
elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
return ""
if v[0] in ("'", '"'):
@ -4095,12 +4095,12 @@ def fix_kv(m):
'\\\n': '',
'\\x': '\\u00',
}.get(m.group(0), m.group(0)), v[1:-1])
for regex, base in INTEGER_TABLE:
im = re.match(regex, v)
if im:
i = int(im.group(1), base)
return '"%d":' % i if v.endswith(':') else '%d' % i
else:
for regex, base in INTEGER_TABLE:
im = re.match(regex, v)
if im:
i = int(im.group(1), base)
return '"%d":' % i if v.endswith(':') else '%d' % i
return '"%s"' % v
@ -4110,7 +4110,8 @@ def fix_kv(m):
{comment}|,(?={skip}[\]}}])|
(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
\b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
[0-9]+(?={skip}:)
[0-9]+(?={skip}:)|
!+
'''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)