Merge e7079c3a5a into f3411af12e

[ie/matchtv] Fix extractor (#10190)
Authored by: megumintyan
2024-06-29 05:54:33 +00:00 · 2024-06-26 14:01:56 +03:00 · 2024-06-25 00:49:09 +02:00 · 2024-05-15 22:08:43 +02:00 · 2024-05-15 22:08:43 +02:00 · 2024-05-15 22:08:43 +02:00
4 changed files with 107 additions and 76 deletions
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -947,6 +947,7 @@
 )
 from .kicker import KickerIE
 from .kickstarter import KickStarterIE
+from .kika import KikaIE
 from .kinja import KinjaEmbedIE
 from .kinopoisk import KinoPoiskIE
 from .kommunetv import KommunetvIE
--- a/yt_dlp/extractor/kika.py
+++ b/yt_dlp/extractor/kika.py
@@ -0,0 +1,91 @@
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    parse_duration,
+    parse_iso8601,
+    traverse_obj
+)
+
+
+class KikaIE(InfoExtractor):  # extractor for individual video pages on kika.de
+    IE_DESC = 'KiKA.de'
+    _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:.*)/(?P<id>[a-z-]+-?\d+)'  # id: lowercase/hyphen slug ending in digits, taken after a "/"
+    _GEO_COUNTRIES = ['DE']  # NOTE(review): presumably marks the content as Germany-only for geo handling - confirm
+
+    _TESTS = [{
+        'url': 'https://www.kika.de/beutolomaeus-und-der-wahre-weihnachtsmann/videos/eins-der-neue-weihnachtsmann-102',  # slug-style id
+        'md5': '25ceea8790417f3c6dcf1d4342f8a97a',
+        'info_dict': {
+            'id': 'eins-der-neue-weihnachtsmann-102',
+            'ext': 'mp4',
+            'title': '1. Der neue Weihnachtsmann',
+            'description': 'md5:61b1e6f32882e8ca2a0ddfd135d03c6b',
+            'duration': 787,
+            'timestamp': 1700584500,
+            'upload_date': '20231121'
+        }
+    }, {
+        'url': 'https://www.kika.de/kaltstart/videos/video92498',  # "video<digits>"-style id
+        'md5': '710ece827e5055094afeb474beacb7aa',
+        'info_dict': {
+            'id': 'video92498',
+            'ext': 'mp4',
+            'title': '7. Wo ist Leo?',
+            'description': 'md5:fb48396a5b75068bcac1df74f1524920',
+            'duration': 436,
+            'timestamp': 1702926876,
+            'upload_date': '20231218'
+        }
+    }]
+
+    def _real_extract(self, url):  # build the info dict (metadata, formats, subtitles) for the video at `url`
+        video_id = self._match_id(url)
+
+        doc = self._download_json(f'https://www.kika.de/_next-api/proxy/v1/videos/{video_id}', video_id)  # per-video metadata JSON
+        video_assets = self._download_json(doc['assets']['url'], video_id)  # second JSON document; its URL is read with plain subscripts, unlike the .get() lookups below
+
+        subtitles = {}  # only the 'de' key is ever filled, with up to two tracks
+        ttml_resource = video_assets.get('videoSubtitle')
+        if ttml_resource:
+            subtitles['de'] = [{
+                'url': ttml_resource,
+                'ext': 'ttml',
+            }]
+        webvtt_resource = video_assets.get('webvttUrl')
+        if webvtt_resource:
+            subtitles.setdefault('de', []).append({  # setdefault: 'de' exists only if a TTML track was added above
+                'url': webvtt_resource,
+                'ext': 'vtt'
+            })
+
+        return {
+            'id': video_id,
+            'title': doc.get('title'),
+            'description': doc.get('description'),
+            'timestamp': parse_iso8601(doc.get('date')),
+            'duration': parse_duration(doc.get('duration')),
+            'formats': list(self._extract_formats(video_assets, video_id)),  # materialize the generator into a list
+            'subtitles': subtitles
+        }
+
+    def _extract_formats(self, media_info, video_id):  # generator: yields format dicts built from media_info['assets']
+        for media in media_info['assets']:
+            stream_url = media.get('url')
+            if not stream_url:
+                continue  # an entry without a URL yields nothing
+            ext = determine_ext(stream_url)
+            if ext == 'm3u8':
+                yield from self._extract_m3u8_formats(  # manifest entry: re-yield every format the helper returns
+                    stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)  # NOTE(review): fatal=False presumably makes a failed manifest fetch non-fatal - confirm
+            else:
+                yield {  # any other extension: the URL itself is the format
+                    'url': stream_url,
+                    'format_id': ext,  # entries sharing an extension get the same format_id here
+                    **traverse_obj(media, {  # NOTE(review): presumably maps asset keys (right) onto format fields (left) - confirm
+                        'width': 'frameWidth',
+                        'height': 'frameHeight',
+                        'filesize': 'fileSize',
+                        'abr': 'bitrateAudio',
+                        'vbr': 'bitrateVideo'
+                    })
+                }
--- a/yt_dlp/extractor/matchtv.py
+++ b/yt_dlp/extractor/matchtv.py
@@ -1,51 +1,35 @@
-import random
-
 from .common import InfoExtractor
-from ..utils import xpath_text


 class MatchTVIE(InfoExtractor):
-    _VALID_URL = r'https?://matchtv\.ru(?:/on-air|/?#live-player)'
+    _VALID_URL = [  # two accepted URL shapes; _real_extract does not use which one matched
+        r'https?://matchtv\.ru/on-air/?(?:$|[?#])',  # on-air page; only a query string or fragment may follow
+        r'https?://video\.matchtv\.ru/iframe/channel/106/?(?:$|[?#])',  # player iframe for channel 106
+    ]
    _TESTS = [{
-        'url': 'http://matchtv.ru/#live-player',
+        'url': 'http://matchtv.ru/on-air/',
        'info_dict': {
            'id': 'matchtv-live',
-            'ext': 'flv',
+            'ext': 'mp4',
            'title': r're:^Матч ТВ - Прямой эфир \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
-            'is_live': True,
+            'live_status': 'is_live',
        },
        'params': {
            'skip_download': True,
        },
    }, {
-        'url': 'http://matchtv.ru/on-air/',
+        'url': 'https://video.matchtv.ru/iframe/channel/106',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        video_id = 'matchtv-live'
-        video_url = self._download_json(
-            'http://player.matchtv.ntvplus.tv/player/smil', video_id,
-            query={
-                'ts': '',
-                'quality': 'SD',
-                'contentId': '561d2c0df7159b37178b4567',
-                'sign': '',
-                'includeHighlights': '0',
-                'userId': '',
-                'sessionId': random.randint(1, 1000000000),
-                'contentType': 'channel',
-                'timeShift': '0',
-                'platform': 'portal',
-            },
-            headers={
-                'Referer': 'http://player.matchtv.ntvplus.tv/embed-player/NTVEmbedPlayer.swf',
-            })['data']['videoUrl']
-        f4m_url = xpath_text(self._download_xml(video_url, video_id), './to')
-        formats = self._extract_f4m_formats(f4m_url, video_id)
+        webpage = self._download_webpage('https://video.matchtv.ru/iframe/channel/106', video_id)  # always the channel iframe, whatever `url` was
+        video_url = self._html_search_regex(  # capture the config URL, without its query string, from the data-config attribute
+            r'data-config="config=(https?://[^?"]+)[?"]', webpage, 'video URL').replace('/feed/', '/media/') + '.m3u8'  # then rewrite /feed/ to /media/ and append .m3u8
        return {
            'id': video_id,
            'title': 'Матч ТВ - Прямой эфир',
            'is_live': True,
-            'formats': formats,
+            'formats': self._extract_m3u8_formats(video_url, video_id, 'mp4', live=True),  # the rewritten URL is handed to the m3u8 helper with live=True
        }
--- a/yt_dlp/extractor/mdr.py
+++ b/yt_dlp/extractor/mdr.py
@@ -13,8 +13,8 @@


 class MDRIE(InfoExtractor):
-    IE_DESC = 'MDR.DE and KiKA'
-    _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z-]+-?(?P<id>\d+)(?:_.+?)?\.html'
+    IE_DESC = 'MDR.DE'
+    _VALID_URL = r'https?://(?:www\.)?mdr\.de/(?:.*)/[a-z-]+-?(?P<id>\d+)(?:_.+?)?\.html'

    _GEO_COUNTRIES = ['DE']

@@ -34,30 +34,6 @@ class MDRIE(InfoExtractor):
            'uploader': 'MITTELDEUTSCHER RUNDFUNK',
        },
        'skip': '404 not found',
-    }, {
-        'url': 'http://www.kika.de/baumhaus/videos/video19636.html',
-        'md5': '4930515e36b06c111213e80d1e4aad0e',
-        'info_dict': {
-            'id': '19636',
-            'ext': 'mp4',
-            'title': 'Baumhaus vom 30. Oktober 2015',
-            'duration': 134,
-            'uploader': 'KIKA',
-        },
-        'skip': '404 not found',
-    }, {
-        'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html',
-        'md5': '5fe9c4dd7d71e3b238f04b8fdd588357',
-        'info_dict': {
-            'id': '8182',
-            'ext': 'mp4',
-            'title': 'Beutolomäus und der geheime Weihnachtswunsch',
-            'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd',
-            'timestamp': 1482541200,
-            'upload_date': '20161224',
-            'duration': 4628,
-            'uploader': 'KIKA',
-        },
    }, {
        # audio with alternative playerURL pattern
        'url': 'http://www.mdr.de/kultur/videos-und-audios/audio-radio/operation-mindfuck-robert-wilson100.html',
@@ -68,28 +44,7 @@ class MDRIE(InfoExtractor):
            'duration': 3239,
            'uploader': 'MITTELDEUTSCHER RUNDFUNK',
        },
-    }, {
-        # empty bitrateVideo and bitrateAudio
-        'url': 'https://www.kika.de/filme/sendung128372_zc-572e3f45_zs-1d9fb70e.html',
-        'info_dict': {
-            'id': '128372',
-            'ext': 'mp4',
-            'title': 'Der kleine Wichtel kehrt zurück',
-            'description': 'md5:f77fafdff90f7aa1e9dca14f662c052a',
-            'duration': 4876,
-            'timestamp': 1607823300,
-            'upload_date': '20201213',
-            'uploader': 'ZDF',
-        },
-        'params': {
-            'skip_download': True,
-        },
-    }, {
-        'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html',
-        'only_matching': True,
-    }, {
-        'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html',
-        'only_matching': True,
+        'skip': '404 not found',
    }, {
        'url': 'http://www.mdr.de/mediathek/mdr-videos/a/video-1334.html',
        'only_matching': True,
Author	SHA1	Message	Date
Frank Aurich	d875a0dab0	Merge `e7079c3a5a` into `f3411af12e`	2024-06-26 14:01:56 +03:00
megumin	f3411af12e	[ie/matchtv] Fix extractor (#10190) Authored by: megumintyan	2024-06-25 00:49:09 +02:00
Frank Aurich	e7079c3a5a	Update KiKA extractor: - Use regular expression suggested by dirkf - Update existing test case, and add a second one with different URL schema - Remove static 'uploader' element	2024-05-15 22:08:43 +02:00
pukkandan	b96401f7c4	Update yt_dlp/extractor/kika.py	2024-05-15 22:08:43 +02:00
Frank Aurich	5d9a4a223c	Incorporate changes suggested during code review	2024-05-15 22:08:43 +02:00
Frank Aurich	0408452f4b	Remove Kika support from MDR extractor	2024-05-15 22:08:42 +02:00
Frank Aurich	164fbc7fc3	[KiKa] Add extractor	2024-05-15 22:08:42 +02:00