overlays/patches/youtube-dl/0008-bandcamp-fix-album-downloading.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62

From 4b2caf8924f404d15d8cdc475912eac3a6e24339 Mon Sep 17 00:00:00 2001
From: Valentin Berlier <berlier.v@gmail.com>
Date: Fri, 9 Oct 2020 14:23:21 +0000
Subject: [PATCH 8/8] [bandcamp] fix album downloading

Taken from
<https://github.com/ytdl-org/youtube-dl/pull/26684#issuecomment-705253522>,
with the missing `json` import added.
---
 youtube_dl/extractor/bandcamp.py | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 04b8aa80f..3fba35b83 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -1,5 +1,6 @@
 from __future__ import unicode_literals
 
+import json
 import random
 import re
 import time
@@ -305,28 +306,16 @@ class BandcampAlbumIE(InfoExtractor):
         album_id = mobj.group('album_id')
         playlist_id = album_id or uploader_id
         webpage = self._download_webpage(url, playlist_id)
-        track_elements = re.findall(
-            r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage)
-        if not track_elements:
-            raise ExtractorError('The page doesn\'t contain any tracks')
-        # Only tracks with duration info have songs
+        ld = re.findall(r'<script[^>]+type="application/ld\+json"[^>]*>(.*?)</script>', webpage, re.DOTALL)
+        album = json.loads(ld[0])
         entries = [
             self.url_result(
-                compat_urlparse.urljoin(url, t_path),
+                record['item']['@id'],
                 ie=BandcampIE.ie_key(),
-                video_title=self._search_regex(
-                    r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
-                    elem_content, 'track title', fatal=False))
-            for elem_content, t_path in track_elements
-            if self._html_search_meta('duration', elem_content, default=None)]
-
-        title = self._html_search_regex(
-            r'album_title\s*(?:&quot;|["\']):\s*(&quot;|["\'])(?P<album>(?:\\\1|((?!\1).))+)\1',
-            webpage, 'title', fatal=False, group='album')
-
-        if title:
-            title = title.replace(r'\"', '"')
-
+                video_title=record['item']['name'])
+            for record in album['track']['itemListElement']
+        ]
+        title = album['name']
         return {
             '_type': 'playlist',
             'uploader_id': uploader_id,
-- 
2.27.0