From 833fe8c9afe565332f1c8388ec95705ac3d3ce3b Mon Sep 17 00:00:00 2001 From: deepspy <pavel.deepspy@gmail.com> Date: Mon, 24 Jun 2024 00:10:22 +0300 Subject: [PATCH 1/8] Added a basic shell for a kankids extractor. --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/kankids.py | 44 ++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 youtube_dl/extractor/kankids.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 3da5f802093..c9def8240b4 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -560,6 +560,7 @@ from .kakao import KakaoIE from .kaltura import KalturaIE from .kankan import KankanIE +from .kankids import KanKidsIE from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE diff --git a/youtube_dl/extractor/kankids.py b/youtube_dl/extractor/kankids.py new file mode 100644 index 00000000000..f6f5ae38cfd --- /dev/null +++ b/youtube_dl/extractor/kankids.py @@ -0,0 +1,44 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + +class KanKidsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kankids\.org\.il/content/kids/(?P<category>[a-z]+)-main/p-(?P<id>[0-9]+)/(?P<season>\w+/)?$' + _TEST = { + 'url': 'https://www.kankids.org.il/content/kids/hinuchit-main/p-12050/', + 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + 'info_dict': { + 'id': '42', + 'ext': 'mp4', + 'title': 'Video title goes here', + 'thumbnail': r're:^https?://.*\.jpg$', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + } + + def _real_extract(self, url): + a = super()._match_valid_url(url) + print(a.groupdict()) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # TODO more code goes here, for 
example ... + # title = self._html_search_regex(r'<h1>(.+?)</h1>', webpage, 'title') + title = 'hi' + + return { + 'id': video_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'url': 'https://www.kankids.org.il/content/kids/hinuchit-main/p-12050/s1/89707/', + 'ie_key': 'Generic', + '_type': 'url', + # 'uploader': self._search_regex(r'<div[^>]+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), + # TODO more properties (see youtube_dl/extractor/common.py) + } + From d335e0beecfbff7a55ba3a35ef3688104821525a Mon Sep 17 00:00:00 2001 From: deepspy <pavel.deepspy@gmail.com> Date: Mon, 24 Jun 2024 14:22:56 +0300 Subject: [PATCH 2/8] Extracted episode list. --- youtube_dl/extractor/kankids.py | 34 ++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/youtube_dl/extractor/kankids.py b/youtube_dl/extractor/kankids.py index f6f5ae38cfd..38866dc6f8a 100644 --- a/youtube_dl/extractor/kankids.py +++ b/youtube_dl/extractor/kankids.py @@ -2,9 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor +import re class KanKidsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kankids\.org\.il/content/kids/(?P<category>[a-z]+)-main/p-(?P<id>[0-9]+)/(?P<season>\w+/)?$' + _VALID_URL = r'https?://(?:www\.)?kankids\.org\.il/content/kids/(?P<category>[a-z]+)-main/p-(?P<id>[0-9]+)/(?P<season>\w+)?/?$' _TEST = { 'url': 'https://www.kankids.org.il/content/kids/hinuchit-main/p-12050/', 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', @@ -22,17 +23,32 @@ class KanKidsIE(InfoExtractor): } def _real_extract(self, url): - a = super()._match_valid_url(url) - print(a.groupdict()) - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + m = super()._match_valid_url(url) + series_id = m.group('id') + category = m.group('category') + playlist_season = m.group('season') + + webpage = self._download_webpage(url, series_id) - # TODO more code goes here, for example ... - # title = self._html_search_regex(r'<h1>(.+?)</h1>', webpage, 'title') - title = 'hi' + series_title = self._html_search_regex(r'<title>(?P<title>.+) \|', webpage, 'title') + + season = playlist_season if playlist_season else '(?P<season>\w+)' + playlist = set(re.findall( + r'href="/content/kids/' + # Content dir + category + r'-main/' + # Category + 'p-' + series_id + '/' + # Series + season + '/' + # Season + '(?P<id>[0-9]+)/"' + # Episode + '.+title="(?P<title>.+)"' # Title + , webpage)) + # , 'Episode list') + print('playlist:', playlist) + + for season, video_id, title in playlist if not playlist_season else map(lambda episode: (playlist_season,) + episode, playlist): + pass return { - 'id': video_id, + 'id': series_id, 'title': title, 'description': self._og_search_description(webpage), 'url': 'https://www.kankids.org.il/content/kids/hinuchit-main/p-12050/s1/89707/', From 3fb423c0bfd87ea8ce7cabecf8bc99e180eef437 Mon Sep 17 00:00:00 2001 From: deepspy <pavel.deepspy@gmail.com> Date: Mon, 24 Jun 2024 16:25:34 +0300 Subject: [PATCH 3/8] First working version of the kan kids extractor. --- youtube_dl/extractor/kankids.py | 40 ++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/youtube_dl/extractor/kankids.py b/youtube_dl/extractor/kankids.py index 38866dc6f8a..19c9baf378f 100644 --- a/youtube_dl/extractor/kankids.py +++ b/youtube_dl/extractor/kankids.py @@ -4,8 +4,13 @@ from .common import InfoExtractor import re +CONTENT_DIR = r'/content/kids/' +DOMAIN = r'kankids.org.il' + class KanKidsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?kankids\.org\.il/content/kids/(?P<category>[a-z]+)-main/p-(?P<id>[0-9]+)/(?P<season>\w+)?/?$' + _VALID_URL = r'https?://(?:www\.)?' 
+\ + DOMAIN.replace('.', '\.') + CONTENT_DIR +\ + r'(?P<category>[a-z]+)-main/p-(?P<id>[0-9]+)/(?P<season>\w+)?/?$' _TEST = { 'url': 'https://www.kankids.org.il/content/kids/hinuchit-main/p-12050/', 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', @@ -32,29 +37,32 @@ def _real_extract(self, url): series_title = self._html_search_regex(r'<title>(?P<title>.+) \|', webpage, 'title') - season = playlist_season if playlist_season else '(?P<season>\w+)' + season = playlist_season if playlist_season else r'(?P<season>\w+)' + content_dir = CONTENT_DIR + category + r'-main/' playlist = set(re.findall( - r'href="/content/kids/' + # Content dir - category + r'-main/' + # Category - 'p-' + series_id + '/' + # Series - season + '/' + # Season - '(?P<id>[0-9]+)/"' + # Episode - '.+title="(?P<title>.+)"' # Title + r'href="' + content_dir + # Content dir + r'p-' + series_id + r'/' + # Series + season + r'/' + # Season + r'(?P<id>[0-9]+)/"' + # Episode + r'.+title="(?P<title>.+)"' # Title , webpage)) # , 'Episode list') print('playlist:', playlist) + entries = [] + content_dir = r'https://www.' 
+ DOMAIN + content_dir for season, video_id, title in playlist if not playlist_season else map(lambda episode: (playlist_season,) + episode, playlist): - pass + entries.append(self.url_result( + content_dir + season + r'/' + video_id + r'/', + ie='Generic', + video_id=video_id, + video_title=title, + )) return { + '_type': 'playlist', 'id': series_id, - 'title': title, - 'description': self._og_search_description(webpage), - 'url': 'https://www.kankids.org.il/content/kids/hinuchit-main/p-12050/s1/89707/', - 'ie_key': 'Generic', - '_type': 'url', - # 'uploader': self._search_regex(r'<div[^>]+id="uploader"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False), - # TODO more properties (see youtube_dl/extractor/common.py) + 'title': series_title, + 'entries': entries, } From c9265f6d609d3805e5c67ac615637a203ece80fc Mon Sep 17 00:00:00 2001 From: deepspy <pavel.deepspy@gmail.com> Date: Mon, 24 Jun 2024 16:46:29 +0300 Subject: [PATCH 4/8] Fixed a series title location bug. --- youtube_dl/extractor/kankids.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/kankids.py b/youtube_dl/extractor/kankids.py index 19c9baf378f..03051817c08 100644 --- a/youtube_dl/extractor/kankids.py +++ b/youtube_dl/extractor/kankids.py @@ -35,7 +35,12 @@ def _real_extract(self, url): webpage = self._download_webpage(url, series_id) - series_title = self._html_search_regex(r'<title>(?P<title>.+) \|', webpage, 'title') + title_pattern = r'<title>(?P<title>.+) \|' + series_title = re.search(title_pattern, webpage) + if not series_title: + series_title = re.search(title_pattern[:-1] + r'-', webpage) + if series_title: + series_title = series_title.group('title') season = playlist_season if playlist_season else r'(?P<season>\w+)' content_dir = CONTENT_DIR + category + r'-main/' @@ -47,7 +52,6 @@ def _real_extract(self, url): r'.+title="(?P<title>.+)"' # Title , webpage)) # , 'Episode list') - print('playlist:', playlist) entries = [] content_dir = 
r'https://www.' + DOMAIN + content_dir From 162eb5632b70a2b84a07517bea7ae50c2b21b63b Mon Sep 17 00:00:00 2001 From: deepspy <pavel.deepspy@gmail.com> Date: Mon, 24 Jun 2024 17:40:43 +0300 Subject: [PATCH 5/8] Added unittests for kankids extractor. Fixed a non p- playlist id matching bug. --- youtube_dl/extractor/kankids.py | 40 +++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/youtube_dl/extractor/kankids.py b/youtube_dl/extractor/kankids.py index 03051817c08..906cf2a2d22 100644 --- a/youtube_dl/extractor/kankids.py +++ b/youtube_dl/extractor/kankids.py @@ -10,28 +10,34 @@ class KanKidsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?' +\ DOMAIN.replace('.', '\.') + CONTENT_DIR +\ - r'(?P<category>[a-z]+)-main/p-(?P<id>[0-9]+)/(?P<season>\w+)?/?$' - _TEST = { - 'url': 'https://www.kankids.org.il/content/kids/hinuchit-main/p-12050/', - 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', - 'info_dict': { - 'id': '42', - 'ext': 'mp4', - 'title': 'Video title goes here', - 'thumbnail': r're:^https?://.*\.jpg$', - # TODO more properties, either as: - # * A value - # * MD5 checksum; start the string with md5: - # * A regular expression; start the string with re: - # * Any Python type (for example int or float) - } - } + r'(?P<category>[a-z]+)-main/(?P<id>[\w\-0-9]+)/(?P<season>\w+)?/?$' + _TESTS = [ + { + 'url': 'https://www.kankids.org.il/content/kids/ktantanim-main/p-11732/', + 'info_dict': { + '_type': 'playlist', + 'id': 'p-11732', + 'title': 'בית ספר לקוסמים', + }, + 'playlist_count': 60, + }, + { + 'url': 'https://www.kankids.org.il/content/kids/hinuchit-main/cramel_main/s1/', + 'info_dict': { + '_type': 'playlist', + 'id': 'cramel_main', + 'title': 'כראמל - עונה 1', + }, + 'playlist_count': 21, + }, + ] def _real_extract(self, url): m = super()._match_valid_url(url) series_id = m.group('id') category = m.group('category') playlist_season = m.group('season') + print(m.groupdict()) 
webpage = self._download_webpage(url, series_id) @@ -46,7 +52,7 @@ def _real_extract(self, url): content_dir = CONTENT_DIR + category + r'-main/' playlist = set(re.findall( r'href="' + content_dir + # Content dir - r'p-' + series_id + r'/' + # Series + series_id + r'/' + # Series season + r'/' + # Season r'(?P<id>[0-9]+)/"' + # Episode r'.+title="(?P<title>.+)"' # Title From d3e980eaa526f2e7524c344cdef90abf85d1d55a Mon Sep 17 00:00:00 2001 From: deepspy <pavel.deepspy@gmail.com> Date: Mon, 24 Jun 2024 22:11:06 +0300 Subject: [PATCH 6/8] Some cleanup of the kan kids extractor. --- youtube_dl/extractor/kankids.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/youtube_dl/extractor/kankids.py b/youtube_dl/extractor/kankids.py index 906cf2a2d22..ca6d36fa895 100644 --- a/youtube_dl/extractor/kankids.py +++ b/youtube_dl/extractor/kankids.py @@ -37,7 +37,6 @@ def _real_extract(self, url): series_id = m.group('id') category = m.group('category') playlist_season = m.group('season') - print(m.groupdict()) webpage = self._download_webpage(url, series_id) @@ -57,7 +56,6 @@ def _real_extract(self, url): r'(?P<id>[0-9]+)/"' + # Episode r'.+title="(?P<title>.+)"' # Title , webpage)) - # , 'Episode list') entries = [] content_dir = r'https://www.' + DOMAIN + content_dir From a4737bb7555d8f34a20168ab0a99e0a860727cca Mon Sep 17 00:00:00 2001 From: deepspy <pavel.deepspy@gmail.com> Date: Mon, 24 Jun 2024 23:12:03 +0300 Subject: [PATCH 7/8] Fixed flake8 errors. --- youtube_dl/extractor/kankids.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/kankids.py b/youtube_dl/extractor/kankids.py index ca6d36fa895..1bcc1d025c6 100644 --- a/youtube_dl/extractor/kankids.py +++ b/youtube_dl/extractor/kankids.py @@ -7,6 +7,7 @@ CONTENT_DIR = r'/content/kids/' DOMAIN = r'kankids.org.il' + class KanKidsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?' 
+\ DOMAIN.replace('.', '\.') + CONTENT_DIR +\ @@ -18,7 +19,7 @@ class KanKidsIE(InfoExtractor): '_type': 'playlist', 'id': 'p-11732', 'title': 'בית ספר לקוסמים', - }, + }, 'playlist_count': 60, }, { @@ -27,17 +28,17 @@ class KanKidsIE(InfoExtractor): '_type': 'playlist', 'id': 'cramel_main', 'title': 'כראמל - עונה 1', - }, + }, 'playlist_count': 21, }, - ] + ] def _real_extract(self, url): m = super()._match_valid_url(url) series_id = m.group('id') category = m.group('category') playlist_season = m.group('season') - + webpage = self._download_webpage(url, series_id) title_pattern = r'<title>(?P<title>.+) \|' @@ -54,8 +55,8 @@ def _real_extract(self, url): series_id + r'/' + # Series season + r'/' + # Season r'(?P<id>[0-9]+)/"' + # Episode - r'.+title="(?P<title>.+)"' # Title - , webpage)) + r'.+title="(?P<title>.+)"', # Title + webpage)) entries = [] content_dir = r'https://www.' + DOMAIN + content_dir @@ -65,12 +66,12 @@ def _real_extract(self, url): ie='Generic', video_id=video_id, video_title=title, - )) + )) return { '_type': 'playlist', 'id': series_id, 'title': series_title, 'entries': entries, - } + } From 5b088ccdd9ebb83ccd32a0a0a2cacd537fd2e03e Mon Sep 17 00:00:00 2001 From: deepspy <pavel.deepspy@gmail.com> Date: Wed, 3 Jul 2024 14:31:36 +0300 Subject: [PATCH 8/8] Fixed linter warnings. --- youtube_dl/extractor/kankids.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/youtube_dl/extractor/kankids.py b/youtube_dl/extractor/kankids.py index 1bcc1d025c6..397da5517b3 100644 --- a/youtube_dl/extractor/kankids.py +++ b/youtube_dl/extractor/kankids.py @@ -10,7 +10,7 @@ class KanKidsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?' 
+\ - DOMAIN.replace('.', '\.') + CONTENT_DIR +\ + DOMAIN.replace('.', '\\.') + CONTENT_DIR +\ r'(?P<category>[a-z]+)-main/(?P<id>[\w\-0-9]+)/(?P<season>\w+)?/?$' _TESTS = [ { @@ -51,11 +51,11 @@ def _real_extract(self, url): season = playlist_season if playlist_season else r'(?P<season>\w+)' content_dir = CONTENT_DIR + category + r'-main/' playlist = set(re.findall( - r'href="' + content_dir + # Content dir - series_id + r'/' + # Series - season + r'/' + # Season - r'(?P<id>[0-9]+)/"' + # Episode - r'.+title="(?P<title>.+)"', # Title + r'href="' + content_dir # Content dir + + series_id + r'/' # Series + + season + r'/' # Season + + r'(?P<id>[0-9]+)/"' # Episode + + r'.+title="(?P<title>.+)"', # Title webpage)) entries = [] @@ -74,4 +74,3 @@ def _real_extract(self, url): 'title': series_title, 'entries': entries, } -