From 6aadce819b42e2c0daabc7625464994010e40e51 Mon Sep 17 00:00:00 2001 From: Alyssa Ross Date: Fri, 9 Oct 2020 14:41:45 +0000 Subject: patches/youtube-dl: fix bandcamp --- overlays/patches/default.nix | 2 + ...-fix-regexp-for-JSON-matching-on-bandcamp.patch | 42 +++++++++++ ...e-unescapeHTML-instead-of-a-simple-replac.patch | 41 +++++++++++ ...tch-album-titles-inside-the-new-JSON-data.patch | 31 ++++++++ ...x-the-freeDownloadPage-JSON-lookup-and-us.patch | 34 +++++++++ ...date-youtuble-dl-test-song-information-to.patch | 31 ++++++++ ...x-test-song-uploader-name-cleanup-remanin.patch | 84 ++++++++++++++++++++++ ...vert-test-song-title-and-extract-title-ge.patch | 53 ++++++++++++++ .../0008-bandcamp-fix-album-downloading.patch | 62 ++++++++++++++++ overlays/patches/youtube-dl/default.nix | 21 ++++++ 10 files changed, 401 insertions(+) create mode 100644 overlays/patches/youtube-dl/0001-bandcamp-fix-regexp-for-JSON-matching-on-bandcamp.patch create mode 100644 overlays/patches/youtube-dl/0002-bandcamp-use-unescapeHTML-instead-of-a-simple-replac.patch create mode 100644 overlays/patches/youtube-dl/0003-bandcamp-match-album-titles-inside-the-new-JSON-data.patch create mode 100644 overlays/patches/youtube-dl/0004-bandcamp-fix-the-freeDownloadPage-JSON-lookup-and-us.patch create mode 100644 overlays/patches/youtube-dl/0005-bandcamp-update-youtuble-dl-test-song-information-to.patch create mode 100644 overlays/patches/youtube-dl/0006-bandcamp-fix-test-song-uploader-name-cleanup-remanin.patch create mode 100644 overlays/patches/youtube-dl/0007-bandcamp-Revert-test-song-title-and-extract-title-ge.patch create mode 100644 overlays/patches/youtube-dl/0008-bandcamp-fix-album-downloading.patch create mode 100644 overlays/patches/youtube-dl/default.nix diff --git a/overlays/patches/default.nix b/overlays/patches/default.nix index 88aa2ccf251c..40bd5b39f179 100644 --- a/overlays/patches/default.nix +++ b/overlays/patches/default.nix @@ -22,4 +22,6 @@ self: super: { tmux = self.callPackage ./tmux { inherit (super) tmux; }; waylandPkgs = self.extend (import ./nixpkgs-wayland); + + youtube-dl = self.callPackage ./youtube-dl { inherit (super) youtube-dl; }; } diff --git a/overlays/patches/youtube-dl/0001-bandcamp-fix-regexp-for-JSON-matching-on-bandcamp.patch b/overlays/patches/youtube-dl/0001-bandcamp-fix-regexp-for-JSON-matching-on-bandcamp.patch new file mode 100644 index 000000000000..f442d5bb03b3 --- /dev/null +++ b/overlays/patches/youtube-dl/0001-bandcamp-fix-regexp-for-JSON-matching-on-bandcamp.patch @@ -0,0 +1,42 @@ +From 435d62abe9c4b4de7634d6028812ebd28b31c882 Mon Sep 17 00:00:00 2001 +From: Gilles Pietri +Date: Wed, 23 Sep 2020 23:09:00 +0200 +Subject: [PATCH 1/8] [bandcamp] fix regexp for JSON matching on bandcamp + +From https://github.com/ytdl-org/youtube-dl/pull/26684. +--- + youtube_dl/extractor/bandcamp.py | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py +index f14b407dc..ad1812320 100644 +--- a/youtube_dl/extractor/bandcamp.py ++++ b/youtube_dl/extractor/bandcamp.py +@@ -91,10 +91,11 @@ class BandcampIE(InfoExtractor): + duration = None + + formats = [] +- track_info = self._parse_json( +- self._search_regex( +- r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n', +- webpage, 'track info', default='{}'), title) ++ trackinfo_block = self._search_regex( ++ r'trackinfo":\[\s*({.+?})\s*\],"', ++ webpage, 'track info', default='{}') ++ quoted_json = trackinfo_block.replace('"', '"') ++ track_info = self._parse_json(quoted_json, title) + if track_info: + file_ = track_info.get('file') + if isinstance(file_, dict): +@@ -117,7 +118,7 @@ class BandcampIE(InfoExtractor): + + def extract(key): + return self._search_regex( +- r'\b%s\s*["\']?\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' % key, ++ r',"%s":(")(?P(?:(?!").)+)"' % key, + webpage, key, default=None, group='value') + + artist = extract('artist') +-- +2.27.0 + diff --git a/overlays/patches/youtube-dl/0002-bandcamp-use-unescapeHTML-instead-of-a-simple-replac.patch b/overlays/patches/youtube-dl/0002-bandcamp-use-unescapeHTML-instead-of-a-simple-replac.patch new file mode 100644 index 000000000000..649ca6477951 --- /dev/null +++ b/overlays/patches/youtube-dl/0002-bandcamp-use-unescapeHTML-instead-of-a-simple-replac.patch @@ -0,0 +1,41 @@ +From 61b0809be0f6acb7b88ebb0146cb2a85bc41b805 Mon Sep 17 00:00:00 2001 +From: Gilles Pietri +Date: Sat, 26 Sep 2020 17:34:35 +0200 +Subject: [PATCH 2/8] [bandcamp] use unescapeHTML instead of a simple replace + of quotes + +From https://github.com/ytdl-org/youtube-dl/pull/26684. +--- + youtube_dl/extractor/bandcamp.py | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py +index ad1812320..55d110e28 100644 +--- a/youtube_dl/extractor/bandcamp.py ++++ b/youtube_dl/extractor/bandcamp.py +@@ -92,10 +92,10 @@ class BandcampIE(InfoExtractor): + + formats = [] + trackinfo_block = self._search_regex( +- r'trackinfo":\[\s*({.+?})\s*\],"', ++ r'trackinfo(?:["\']|"):\[\s*({.+?})\s*\],(?:["\']|")', + webpage, 'track info', default='{}') +- quoted_json = trackinfo_block.replace('"', '"') +- track_info = self._parse_json(quoted_json, title) ++ unescaped_json = unescapeHTML(trackinfo_block) ++ track_info = self._parse_json(unescaped_json, title) + if track_info: + file_ = track_info.get('file') + if isinstance(file_, dict): +@@ -118,7 +118,7 @@ class BandcampIE(InfoExtractor): + + def extract(key): + return self._search_regex( +- r',"%s":(")(?P(?:(?!").)+)"' % key, ++ r',(["\']|")%s\1:\1(?P(?:(?!\1).)+)\1' % key, + webpage, key, default=None, group='value') + + artist = extract('artist') +-- +2.27.0 + diff --git a/overlays/patches/youtube-dl/0003-bandcamp-match-album-titles-inside-the-new-JSON-data.patch b/overlays/patches/youtube-dl/0003-bandcamp-match-album-titles-inside-the-new-JSON-data.patch new file mode 100644 index 000000000000..3c88c19954fb --- /dev/null +++ b/overlays/patches/youtube-dl/0003-bandcamp-match-album-titles-inside-the-new-JSON-data.patch @@ -0,0 +1,31 @@ +From c8f146c5f3cd6600cab8f570b29510ee0d092523 Mon Sep 17 00:00:00 2001 +From: Gilles Pietri +Date: Sun, 27 Sep 2020 14:51:42 +0200 +Subject: [PATCH 3/8] [bandcamp] match album titles inside the new JSON data + block, and unescape the title properly + +From https://github.com/ytdl-org/youtube-dl/pull/26684. +--- + youtube_dl/extractor/bandcamp.py | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py +index 55d110e28..f036a89eb 100644 +--- a/youtube_dl/extractor/bandcamp.py ++++ b/youtube_dl/extractor/bandcamp.py +@@ -316,10 +316,10 @@ class BandcampAlbumIE(InfoExtractor): + if self._html_search_meta('duration', elem_content, default=None)] + + title = self._html_search_regex( +- r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', ++ r'album_title\s*(?:"|["\']):\s*(?:"|["\'])((?:\\.|[^"\\])+?)(?:"|["\'])', + webpage, 'title', fatal=False) + if title: +- title = title.replace(r'\"', '"') ++ title = unescapeHTML(title) + return { + '_type': 'playlist', + 'uploader_id': uploader_id, +-- +2.27.0 + diff --git a/overlays/patches/youtube-dl/0004-bandcamp-fix-the-freeDownloadPage-JSON-lookup-and-us.patch b/overlays/patches/youtube-dl/0004-bandcamp-fix-the-freeDownloadPage-JSON-lookup-and-us.patch new file mode 100644 index 000000000000..b5876a477009 --- /dev/null +++ b/overlays/patches/youtube-dl/0004-bandcamp-fix-the-freeDownloadPage-JSON-lookup-and-us.patch @@ -0,0 +1,34 @@ +From 848e97dc19218ed59e401fb34e7e2f1d2e40f9da Mon Sep 17 00:00:00 2001 +From: Gilles Pietri +Date: Sun, 27 Sep 2020 15:11:08 +0200 +Subject: [PATCH 4/8] [bandcamp] fix the freeDownloadPage JSON lookup, and use + the id from the URL to match the tracks + +From https://github.com/ytdl-org/youtube-dl/pull/26684. +--- + youtube_dl/extractor/bandcamp.py | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py +index f036a89eb..eccb867a0 100644 +--- a/youtube_dl/extractor/bandcamp.py ++++ b/youtube_dl/extractor/bandcamp.py +@@ -128,12 +128,12 @@ class BandcampIE(InfoExtractor): + release_date = unified_strdate(extract('album_release_date')) + + download_link = self._search_regex( +- r'freeDownloadPage\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, ++ r'freeDownloadPage(?:["\']|"):\s*(["\']|")(?P(?:(?!\1).)+)\1', webpage, + 'download link', default=None, group='url') + if download_link: + track_id = self._search_regex( +- r'(?ms)var TralbumData = .*?[{,]\s*id: (?P\d+),?$', +- webpage, 'track id') ++ r'\?id=(?P\d+)&', ++ download_link, 'track id') + + download_webpage = self._download_webpage( + download_link, track_id, 'Downloading free downloads page') +-- +2.27.0 + diff --git a/overlays/patches/youtube-dl/0005-bandcamp-update-youtuble-dl-test-song-information-to.patch b/overlays/patches/youtube-dl/0005-bandcamp-update-youtuble-dl-test-song-information-to.patch new file mode 100644 index 000000000000..fd391f119dd2 --- /dev/null +++ b/overlays/patches/youtube-dl/0005-bandcamp-update-youtuble-dl-test-song-information-to.patch @@ -0,0 +1,31 @@ +From 3dd523b0cadf149b3a21b36f6ffbe770e302921a Mon Sep 17 00:00:00 2001 +From: Gilles Pietri +Date: Sun, 27 Sep 2020 15:52:55 +0200 +Subject: [PATCH 5/8] [bandcamp] update youtuble dl test song information to + match title as artist - track, and add missing keys from info_dict + +From https://github.com/ytdl-org/youtube-dl/pull/26684. +--- + youtube_dl/extractor/bandcamp.py | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py +index eccb867a0..3d32b1e0f 100644 +--- a/youtube_dl/extractor/bandcamp.py ++++ b/youtube_dl/extractor/bandcamp.py +@@ -33,8 +33,11 @@ class BandcampIE(InfoExtractor): + 'info_dict': { + 'id': '1812978515', + 'ext': 'mp3', +- 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", ++ 'title': "youtube-dl \\ - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'duration': 9.8485, ++ 'uploader': 'youtube-dl \\', ++ 'timestamp': 1354224127, ++ 'upload_date': '20121129', + }, + '_skip': 'There is a limit of 200 free downloads / month for the test song' + }, { +-- +2.27.0 + diff --git a/overlays/patches/youtube-dl/0006-bandcamp-fix-test-song-uploader-name-cleanup-remanin.patch b/overlays/patches/youtube-dl/0006-bandcamp-fix-test-song-uploader-name-cleanup-remanin.patch new file mode 100644 index 000000000000..7f27661b065d --- /dev/null +++ b/overlays/patches/youtube-dl/0006-bandcamp-fix-test-song-uploader-name-cleanup-remanin.patch @@ -0,0 +1,84 @@ +From d6b4c19c0557c0486fd1d34ee2891c5fd3c583ee Mon Sep 17 00:00:00 2001 +From: Gilles Pietri +Date: Mon, 28 Sep 2020 19:42:56 +0200 +Subject: [PATCH 6/8] [bandcamp] fix test song uploader name, cleanup remanings + " and \ in data, including album titles + +From https://github.com/ytdl-org/youtube-dl/pull/26684. +--- + youtube_dl/extractor/bandcamp.py | 26 +++++++++++++++----------- + 1 file changed, 15 insertions(+), 11 deletions(-) + +diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py +index 3d32b1e0f..3405b570a 100644 +--- a/youtube_dl/extractor/bandcamp.py ++++ b/youtube_dl/extractor/bandcamp.py +@@ -33,9 +33,9 @@ class BandcampIE(InfoExtractor): + 'info_dict': { + 'id': '1812978515', + 'ext': 'mp3', +- 'title': "youtube-dl \\ - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", ++ 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'duration': 9.8485, +- 'uploader': 'youtube-dl \\', ++ 'uploader': "youtube-dl \"'/\\\u00e4\u21ad", + 'timestamp': 1354224127, + 'upload_date': '20121129', + }, +@@ -43,7 +43,7 @@ class BandcampIE(InfoExtractor): + }, { + # free download + 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', +- 'md5': '853e35bf34aa1d6fe2615ae612564b36', ++ 'md5': '5d92af55811e47f38962a54c30b07ef0', + 'info_dict': { + 'id': '2650410135', + 'ext': 'aiff', +@@ -94,11 +94,12 @@ class BandcampIE(InfoExtractor): + duration = None + + formats = [] +- trackinfo_block = self._search_regex( ++ trackinfo_block = self._html_search_regex( + r'trackinfo(?:["\']|"):\[\s*({.+?})\s*\],(?:["\']|")', + webpage, 'track info', default='{}') +- unescaped_json = unescapeHTML(trackinfo_block) +- track_info = self._parse_json(unescaped_json, title) ++ ++ track_info = self._parse_json(trackinfo_block, title) ++ + if track_info: + file_ = track_info.get('file') + if isinstance(file_, dict): +@@ -120,9 +121,10 @@ class BandcampIE(InfoExtractor): + duration = float_or_none(track_info.get('duration')) + + def extract(key): +- return self._search_regex( +- r',(["\']|")%s\1:\1(?P(?:(?!\1).)+)\1' % key, ++ data = self._html_search_regex( ++ r',(["\']|")%s\1:\1(?P(?:\\\1|((?!\1).))+)\1' % key, + webpage, key, default=None, group='value') ++ return data.replace(r'\"', '"').replace('\\\\', '\\') if data else data + + artist = extract('artist') + album = extract('album_title') +@@ -319,10 +321,12 @@ class BandcampAlbumIE(InfoExtractor): + if self._html_search_meta('duration', elem_content, default=None)] + + title = self._html_search_regex( +- r'album_title\s*(?:"|["\']):\s*(?:"|["\'])((?:\\.|[^"\\])+?)(?:"|["\'])', +- webpage, 'title', fatal=False) ++ r'album_title\s*(?:"|["\']):\s*("|["\'])(?P(?:\\\1|((?!\1).))+)\1', ++ webpage, 'title', fatal=False, group='album') ++ + if title: +- title = unescapeHTML(title) ++ title = title.replace(r'\"', '"') ++ + return { + '_type': 'playlist', + 'uploader_id': uploader_id, +-- +2.27.0 + diff --git a/overlays/patches/youtube-dl/0007-bandcamp-Revert-test-song-title-and-extract-title-ge.patch b/overlays/patches/youtube-dl/0007-bandcamp-Revert-test-song-title-and-extract-title-ge.patch new file mode 100644 index 000000000000..b30a7c0d24d5 --- /dev/null +++ b/overlays/patches/youtube-dl/0007-bandcamp-Revert-test-song-title-and-extract-title-ge.patch @@ -0,0 +1,53 @@ +From ab8820258115866f1c5f8955131a7e88ee178662 Mon Sep 17 00:00:00 2001 +From: Gilles Pietri +Date: Tue, 29 Sep 2020 12:09:55 +0200 +Subject: [PATCH 7/8] [bandcamp] Revert test song title, and extract title + generally (which may fail, as the other title json values might come up), + instead of out of trackinfo, as bandcamp prefixes it with artist - + +From https://github.com/ytdl-org/youtube-dl/pull/26684. +--- + youtube_dl/extractor/bandcamp.py | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py +index 3405b570a..04b8aa80f 100644 +--- a/youtube_dl/extractor/bandcamp.py ++++ b/youtube_dl/extractor/bandcamp.py +@@ -33,7 +33,7 @@ class BandcampIE(InfoExtractor): + 'info_dict': { + 'id': '1812978515', + 'ext': 'mp3', +- 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", ++ 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'duration': 9.8485, + 'uploader': "youtube-dl \"'/\\\u00e4\u21ad", + 'timestamp': 1354224127, +@@ -99,7 +99,6 @@ class BandcampIE(InfoExtractor): + webpage, 'track info', default='{}') + + track_info = self._parse_json(trackinfo_block, title) +- + if track_info: + file_ = track_info.get('file') + if isinstance(file_, dict): +@@ -115,7 +114,7 @@ class BandcampIE(InfoExtractor): + 'acodec': ext, + 'abr': int_or_none(abr_str), + }) +- track = track_info.get('title') ++ + track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) + track_number = int_or_none(track_info.get('track_num')) + duration = float_or_none(track_info.get('duration')) +@@ -126,6 +125,7 @@ class BandcampIE(InfoExtractor): + webpage, key, default=None, group='value') + return data.replace(r'\"', '"').replace('\\\\', '\\') if data else data + ++ track = extract('title') + artist = extract('artist') + album = extract('album_title') + timestamp = unified_timestamp( +-- +2.27.0 + diff --git a/overlays/patches/youtube-dl/0008-bandcamp-fix-album-downloading.patch b/overlays/patches/youtube-dl/0008-bandcamp-fix-album-downloading.patch new file mode 100644 index 000000000000..750e4b1ae0a3 --- /dev/null +++ b/overlays/patches/youtube-dl/0008-bandcamp-fix-album-downloading.patch @@ -0,0 +1,62 @@ +From 4b2caf8924f404d15d8cdc475912eac3a6e24339 Mon Sep 17 00:00:00 2001 +From: Valentin Berlier +Date: Fri, 9 Oct 2020 14:23:21 +0000 +Subject: [PATCH 8/8] [bandcamp] fix album downloading + +From +, +with missing json import added. +--- + youtube_dl/extractor/bandcamp.py | 27 ++++++++------------------- + 1 file changed, 8 insertions(+), 19 deletions(-) + +diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py +index 04b8aa80f..3fba35b83 100644 +--- a/youtube_dl/extractor/bandcamp.py ++++ b/youtube_dl/extractor/bandcamp.py +@@ -1,5 +1,6 @@ + from __future__ import unicode_literals + ++import json + import random + import re + import time +@@ -305,28 +306,16 @@ class BandcampAlbumIE(InfoExtractor): + album_id = mobj.group('album_id') + playlist_id = album_id or uploader_id + webpage = self._download_webpage(url, playlist_id) +- track_elements = re.findall( +- r'(?s)]*>(.*?]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)', webpage) +- if not track_elements: +- raise ExtractorError('The page doesn\'t contain any tracks') +- # Only tracks with duration info have songs ++ ld = re.findall(r']+type="application/ld\+json"[^>]*>(.*?)', webpage, re.DOTALL) ++ album = json.loads(ld[0]) + entries = [ + self.url_result( +- compat_urlparse.urljoin(url, t_path), ++ record['item']['@id'], + ie=BandcampIE.ie_key(), +- video_title=self._search_regex( +- r']+\bitemprop=["\']name["\'][^>]*>([^<]+)', +- elem_content, 'track title', fatal=False)) +- for elem_content, t_path in track_elements +- if self._html_search_meta('duration', elem_content, default=None)] +- +- title = self._html_search_regex( +- r'album_title\s*(?:"|["\']):\s*("|["\'])(?P(?:\\\1|((?!\1).))+)\1', +- webpage, 'title', fatal=False, group='album') +- +- if title: +- title = title.replace(r'\"', '"') +- ++ video_title=record['item']['name']) ++ for record in album['track']['itemListElement'] ++ ] ++ title = album['name'] + return { + '_type': 'playlist', + 'uploader_id': uploader_id, +-- +2.27.0 + diff --git a/overlays/patches/youtube-dl/default.nix b/overlays/patches/youtube-dl/default.nix new file mode 100644 index 000000000000..b9f20e7592c8 --- /dev/null +++ b/overlays/patches/youtube-dl/default.nix @@ -0,0 +1,21 @@ +{ youtube-dl, ... } @ args: + +let + inherit (builtins) removeAttrs; +in + +(youtube-dl.override (removeAttrs args [ "youtube-dl" ])).overrideAttrs ( + { patches ? [], ... }: + { + patches = patches ++ [ + ./0001-bandcamp-fix-regexp-for-JSON-matching-on-bandcamp.patch + ./0002-bandcamp-use-unescapeHTML-instead-of-a-simple-replac.patch + ./0003-bandcamp-match-album-titles-inside-the-new-JSON-data.patch + ./0004-bandcamp-fix-the-freeDownloadPage-JSON-lookup-and-us.patch + ./0005-bandcamp-update-youtuble-dl-test-song-information-to.patch + ./0006-bandcamp-fix-test-song-uploader-name-cleanup-remanin.patch + ./0007-bandcamp-Revert-test-song-title-and-extract-title-ge.patch + ./0008-bandcamp-fix-album-downloading.patch + ]; + } +) -- cgit 1.4.1