[soundgasm] Improve extraction (closes #14588)
[ytdl] / youtube_dl / extractor / soundgasm.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7
8
class SoundgasmIE(InfoExtractor):
    """Extractor for a single soundgasm.net audio page (``/u/<user>/<slug>``)."""

    IE_NAME = 'soundgasm'
    _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_-]+)/(?P<display_id>[0-9a-zA-Z_-]+)'
    _TEST = {
        'url': 'http://soundgasm.net/u/ytdl/Piano-sample',
        'md5': '010082a2c802c5275bb00030743e75ad',
        'info_dict': {
            'id': '88abd86ea000cafe98f96321b23cc1206cbcbcc9',
            'ext': 'm4a',
            'title': 'Piano sample',
            'description': 'Royalty Free Sample Music',
            'uploader': 'ytdl',
        }
    }

    def _real_extract(self, url):
        """Download the audio page at *url* and build its info dict.

        The audio URL is mandatory (its search uses the helper's default
        fatal behaviour); the title and audio id fall back to the URL slug,
        and the description is optional (``fatal=False``).
        """
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('display_id')

        webpage = self._download_webpage(url, display_id)

        # The stream location appears in the page as `m4a: "<url>"`; the
        # back-reference makes the closing quote match the opening one.
        audio_url = self._html_search_regex(
            r'(?s)m4a\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
            'audio URL', group='url')

        # Use the HTML-aware helper, as for the audio URL and description,
        # so entities such as `&amp;` in the title are decoded and
        # surrounding whitespace is stripped instead of leaking raw markup.
        title = self._html_search_regex(
            r'<div[^>]+\bclass=["\']jp-title[^>]+>([^<]+)',
            webpage, 'title', default=display_id)

        # Two page layouts are tried in order: the player's description div
        # first, then a plain `<li>Description: ...</li>` entry.
        description = self._html_search_regex(
            (r'(?s)<div[^>]+\bclass=["\']jp-description[^>]+>(.+?)</div>',
             r'(?s)<li>Description:\s(.*?)<\/li>'),
            webpage, 'description', fatal=False)

        # The file name (without extension) of the m4a serves as the id.
        audio_id = self._search_regex(
            r'/([^/]+)\.m4a', audio_url, 'audio id', default=display_id)

        return {
            'id': audio_id,
            'display_id': display_id,
            'url': audio_url,
            'vcodec': 'none',  # audio-only media
            'title': title,
            'description': description,
            'uploader': mobj.group('user'),
        }
55
56
class SoundgasmProfileIE(InfoExtractor):
    """Extractor for a soundgasm.net user profile, returned as a playlist."""

    IE_NAME = 'soundgasm:profile'
    _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<id>[^/]+)/?(?:\#.*)?$'
    _TEST = {
        'url': 'http://soundgasm.net/u/ytdl',
        'info_dict': {
            'id': 'ytdl',
        },
        'playlist_count': 1,
    }

    def _real_extract(self, url):
        """Collect every audio link on the profile page into a playlist.

        Each link whose href contains ``/u/<profile id>/`` becomes a URL
        result delegated to the 'Soundgasm' extractor; the playlist id is
        the profile id taken from *url*.
        """
        profile_id = self._match_id(url)

        webpage = self._download_webpage(url, profile_id)

        # _VALID_URL accepts any non-slash characters for the profile id, so
        # it must be escaped before being embedded in a pattern; otherwise
        # regex metacharacters in the id could match the wrong links or make
        # the pattern fail to compile.
        audio_link_re = r'href="([^"]+/u/%s/[^"]+)' % re.escape(profile_id)

        entries = [
            self.url_result(audio_url, 'Soundgasm')
            for audio_url in re.findall(audio_link_re, webpage)]

        return self.playlist_result(entries, profile_id)