Merge branch 'master' of github.com:rg3/youtube-dl into weibo
[ytdl] / youtube_dl / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18     compat_cookiejar,
19     compat_cookies,
20     compat_etree_fromstring,
21     compat_getpass,
22     compat_http_client,
23     compat_os_name,
24     compat_str,
25     compat_urllib_error,
26     compat_urllib_parse_unquote,
27     compat_urllib_parse_urlencode,
28     compat_urllib_request,
29     compat_urlparse,
30     compat_xml_parse_error,
31 )
32 from ..downloader.f4m import (
33     get_base_url,
34     remove_encrypted_media,
35 )
36 from ..utils import (
37     NO_DEFAULT,
38     age_restricted,
39     base_url,
40     bug_reports_message,
41     clean_html,
42     compiled_regex_type,
43     determine_ext,
44     determine_protocol,
45     error_to_compat_str,
46     ExtractorError,
47     extract_attributes,
48     fix_xml_ampersands,
49     float_or_none,
50     GeoRestrictedError,
51     GeoUtils,
52     int_or_none,
53     js_to_json,
54     mimetype2ext,
55     orderedSet,
56     parse_codecs,
57     parse_duration,
58     parse_iso8601,
59     parse_m3u8_attributes,
60     RegexNotFoundError,
61     sanitized_Request,
62     sanitize_filename,
63     unescapeHTML,
64     unified_strdate,
65     unified_timestamp,
66     update_Request,
67     update_url_query,
68     urljoin,
69     url_basename,
70     xpath_element,
71     xpath_text,
72     xpath_with_ns,
73 )
74
75
76 class InfoExtractor(object):
77     """Information Extractor class.
78
79     Information extractors are the classes that, given a URL, extract
80     information about the video (or videos) the URL refers to. This
81     information includes the real video URL, the video title, author and
82     others. The information is stored in a dictionary which is then
83     passed to the YoutubeDL. The YoutubeDL processes this
84     information possibly downloading the video to the file system, among
85     other possible outcomes.
86
87     The type field determines the type of the result.
88     By far the most common value (and the default if _type is missing) is
89     "video", which indicates a single video.
90
91     For a video, the dictionaries must include the following fields:
92
93     id:             Video identifier.
94     title:          Video title, unescaped.
95
96     Additionally, it must contain either a formats entry or a url one:
97
98     formats:        A list of dictionaries for each format available, ordered
99                     from worst to best quality.
100
101                     Potential fields:
102                     * url        Mandatory. The URL of the video file
103                     * manifest_url
104                                  The URL of the manifest file in case of
105                                  fragmented media (DASH, hls, hds)
106                     * ext        Will be calculated from URL if missing
107                     * format     A human-readable description of the format
108                                  ("mp4 container with h264/opus").
109                                  Calculated from the format_id, width, height
110                                  and format_note fields if missing.
111                     * format_id  A short description of the format
112                                  ("mp4_h264_opus" or "19").
113                                 Technically optional, but strongly recommended.
114                     * format_note Additional info about the format
115                                  ("3D" or "DASH video")
116                     * width      Width of the video, if known
117                     * height     Height of the video, if known
118                     * resolution Textual description of width and height
119                     * tbr        Average bitrate of audio and video in KBit/s
120                     * abr        Average audio bitrate in KBit/s
121                     * acodec     Name of the audio codec in use
122                     * asr        Audio sampling rate in Hertz
123                     * vbr        Average video bitrate in KBit/s
124                     * fps        Frame rate
125                     * vcodec     Name of the video codec in use
126                     * container  Name of the container format
127                     * filesize   The number of bytes, if known in advance
128                     * filesize_approx  An estimate for the number of bytes
129                     * player_url SWF Player URL (used for rtmpdump).
130                     * protocol   The protocol that will be used for the actual
131                                  download, lower-case.
132                                  "http", "https", "rtsp", "rtmp", "rtmpe",
133                                  "m3u8", "m3u8_native" or "http_dash_segments".
134                     * fragment_base_url
135                                  Base URL for fragments. Each fragment's path
136                                  value (if present) will be relative to
137                                  this URL.
138                     * fragments  A list of fragments of a fragmented media.
139                                  Each fragment entry must contain either an url
140                                  or a path. If an url is present it should be
141                                  considered by a client. Otherwise both path and
142                                  fragment_base_url must be present. Here is
143                                  the list of all potential fields:
144                                  * "url" - fragment's URL
145                                  * "path" - fragment's path relative to
146                                             fragment_base_url
147                                  * "duration" (optional, int or float)
148                                  * "filesize" (optional, int)
149                     * preference Order number of this format. If this field is
150                                  present and not None, the formats get sorted
151                                  by this field, regardless of all other values.
152                                  -1 for default (order by other properties),
153                                  -2 or smaller for less than default.
154                                  < -1000 to hide the format (if there is
155                                     another one which is strictly better)
156                     * language   Language code, e.g. "de" or "en-US".
157                     * language_preference  Is this in the language mentioned in
158                                  the URL?
159                                  10 if it's what the URL is about,
160                                  -1 for default (don't know),
161                                  -10 otherwise, other values reserved for now.
162                     * quality    Order number of the video quality of this
163                                  format, irrespective of the file format.
164                                  -1 for default (order by other properties),
165                                  -2 or smaller for less than default.
166                     * source_preference  Order number for this video source
167                                   (quality takes higher priority)
168                                  -1 for default (order by other properties),
169                                  -2 or smaller for less than default.
170                     * http_headers  A dictionary of additional HTTP headers
171                                  to add to the request.
172                     * stretched_ratio  If given and not 1, indicates that the
173                                  video's pixels are not square.
174                                  width : height ratio as float.
175                     * no_resume  The server does not support resuming the
176                                  (HTTP or RTMP) download. Boolean.
177
178     url:            Final video URL.
179     ext:            Video filename extension.
180     format:         The video format, defaults to ext (used for --get-format)
181     player_url:     SWF Player URL (used for rtmpdump).
182
183     The following fields are optional:
184
185     alt_title:      A secondary title of the video.
186     display_id      An alternative identifier for the video, not necessarily
187                     unique, but available before title. Typically, id is
188                     something like "4234987", title "Dancing naked mole rats",
189                     and display_id "dancing-naked-mole-rats"
190     thumbnails:     A list of dictionaries, with the following entries:
191                         * "id" (optional, string) - Thumbnail format ID
192                         * "url"
193                         * "preference" (optional, int) - quality of the image
194                         * "width" (optional, int)
195                         * "height" (optional, int)
196                         * "resolution" (optional, string "{width}x{height}",
197                                         deprecated)
198                         * "filesize" (optional, int)
199     thumbnail:      Full URL to a video thumbnail image.
200     description:    Full video description.
201     uploader:       Full name of the video uploader.
202     license:        License name the video is licensed under.
203     creator:        The creator of the video.
204     release_date:   The date (YYYYMMDD) when the video was released.
205     timestamp:      UNIX timestamp of the moment the video became available.
206     upload_date:    Video upload date (YYYYMMDD).
207                     If not explicitly set, calculated from timestamp.
208     uploader_id:    Nickname or id of the video uploader.
209     uploader_url:   Full URL to a personal webpage of the video uploader.
210     location:       Physical location where the video was filmed.
211     subtitles:      The available subtitles as a dictionary in the format
212                     {tag: subformats}. "tag" is usually a language code, and
213                     "subformats" is a list sorted from lower to higher
214                     preference, each element is a dictionary with the "ext"
215                     entry and one of:
216                         * "data": The subtitles file contents
217                         * "url": A URL pointing to the subtitles file
218                     "ext" will be calculated from URL if missing
219     automatic_captions: Like 'subtitles', used by the YoutubeIE for
220                     automatically generated captions
221     duration:       Length of the video in seconds, as an integer or float.
222     view_count:     How many users have watched the video on the platform.
223     like_count:     Number of positive ratings of the video
224     dislike_count:  Number of negative ratings of the video
225     repost_count:   Number of reposts of the video
226     average_rating: Average rating given by users, the scale used depends on the webpage
227     comment_count:  Number of comments on the video
228     comments:       A list of comments, each with one or more of the following
229                     properties (all but one of text or html optional):
230                         * "author" - human-readable name of the comment author
231                         * "author_id" - user ID of the comment author
232                         * "id" - Comment ID
233                         * "html" - Comment as HTML
234                         * "text" - Plain text of the comment
235                         * "timestamp" - UNIX timestamp of comment
236                         * "parent" - ID of the comment this one is replying to.
237                                      Set to "root" to indicate that this is a
238                                      comment to the original video.
239     age_limit:      Age restriction for the video, as an integer (years)
240     webpage_url:    The URL to the video webpage, if given to youtube-dl it
241                     should allow to get the same result again. (It will be set
242                     by YoutubeDL if it's missing)
243     categories:     A list of categories that the video falls in, for example
244                     ["Sports", "Berlin"]
245     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
246     is_live:        True, False, or None (=unknown). Whether this video is a
247                     live stream that goes on instead of a fixed-length video.
248     start_time:     Time in seconds where the reproduction should start, as
249                     specified in the URL.
250     end_time:       Time in seconds where the reproduction should end, as
251                     specified in the URL.
252     chapters:       A list of dictionaries, with the following entries:
253                         * "start_time" - The start time of the chapter in seconds
254                         * "end_time" - The end time of the chapter in seconds
255                         * "title" (optional, string)
256
257     The following fields should only be used when the video belongs to some logical
258     chapter or section:
259
260     chapter:        Name or title of the chapter the video belongs to.
261     chapter_number: Number of the chapter the video belongs to, as an integer.
262     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
263
264     The following fields should only be used when the video is an episode of some
265     series, programme or podcast:
266
267     series:         Title of the series or programme the video episode belongs to.
268     season:         Title of the season the video episode belongs to.
269     season_number:  Number of the season the video episode belongs to, as an integer.
270     season_id:      Id of the season the video episode belongs to, as a unicode string.
271     episode:        Title of the video episode. Unlike mandatory video title field,
272                     this field should denote the exact title of the video episode
273                     without any kind of decoration.
274     episode_number: Number of the video episode within a season, as an integer.
275     episode_id:     Id of the video episode, as a unicode string.
276
277     The following fields should only be used when the media is a track or a part of
278     a music album:
279
280     track:          Title of the track.
281     track_number:   Number of the track within an album or a disc, as an integer.
282     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
283                     as a unicode string.
284     artist:         Artist(s) of the track.
285     genre:          Genre(s) of the track.
286     album:          Title of the album the track belongs to.
287     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
288     album_artist:   List of all artists appeared on the album (e.g.
289                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
290                     and compilations).
291     disc_number:    Number of the disc or other physical medium the track belongs to,
292                     as an integer.
293     release_year:   Year (YYYY) when the album was released.
294
295     Unless mentioned otherwise, the fields should be Unicode strings.
296
297     Unless mentioned otherwise, None is equivalent to absence of information.
298
299
300     _type "playlist" indicates multiple videos.
301     There must be a key "entries", which is a list, an iterable, or a PagedList
302     object, each element of which is a valid dictionary by this specification.
303
304     Additionally, playlists can have "id", "title", "description", "uploader",
305     "uploader_id", "uploader_url" attributes with the same semantics as videos
306     (see above).
307
308
309     _type "multi_video" indicates that there are multiple videos that
310     form a single show, for example multiple acts of an opera or TV episode.
311     It must have an entries key like a playlist and contain all the keys
312     required for a video at the same time.
313
314
315     _type "url" indicates that the video must be extracted from another
316     location, possibly by a different extractor. Its only required key is:
317     "url" - the next URL to extract.
318     The key "ie_key" can be set to the class name (minus the trailing "IE",
319     e.g. "Youtube") if the extractor class is known in advance.
320     Additionally, the dictionary may have any properties of the resolved entity
321     known in advance, for example "title" if the title of the referred video is
322     known ahead of time.
323
324
325     _type "url_transparent" entities have the same specification as "url", but
326     indicate that the given additional information is more precise than the one
327     associated with the resolved URL.
328     This is useful when a site employs a video service that hosts the video and
329     its technical metadata, but that video service does not embed a useful
330     title, description etc.
331
332
333     Subclasses of this one should re-define the _real_initialize() and
334     _real_extract() methods and define a _VALID_URL regexp.
335     Probably, they should also be added to the list of extractors.
336
337     _GEO_BYPASS attribute may be set to False in order to disable
338     geo restriction bypass mechanisms for a particular extractor.
339     Though it won't disable explicit geo restriction bypass based on
340     country code provided with geo_bypass_country. (experimental)
341
342     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
343     countries for this extractor. One of these countries will be used by
344     geo restriction bypass mechanism right away in order to bypass
345     geo restriction, of course, if the mechanism is not disabled. (experimental)
346
347     NB: both these geo attributes are experimental and may change in future
348     or be completely removed.
349
350     Finally, the _WORKING attribute should be set to False for broken IEs
351     in order to warn the users and skip the tests.
352     """
353
354     _ready = False
355     _downloader = None
356     _x_forwarded_for_ip = None
357     _GEO_BYPASS = True
358     _GEO_COUNTRIES = None
359     _WORKING = True
360
361     def __init__(self, downloader=None):
362         """Constructor. Receives an optional downloader."""
363         self._ready = False
364         self._x_forwarded_for_ip = None
365         self.set_downloader(downloader)
366
367     @classmethod
368     def suitable(cls, url):
369         """Receives a URL and returns True if suitable for this IE."""
370
371         # This does not use has/getattr intentionally - we want to know whether
372         # we have cached the regexp for *this* class, whereas getattr would also
373         # match the superclass
374         if '_VALID_URL_RE' not in cls.__dict__:
375             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
376         return cls._VALID_URL_RE.match(url) is not None
377
378     @classmethod
379     def _match_id(cls, url):
380         if '_VALID_URL_RE' not in cls.__dict__:
381             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
382         m = cls._VALID_URL_RE.match(url)
383         assert m
384         return compat_str(m.group('id'))
385
386     @classmethod
387     def working(cls):
388         """Getter method for _WORKING."""
389         return cls._WORKING
390
391     def initialize(self):
392         """Initializes an instance (authentication, etc)."""
393         self._initialize_geo_bypass(self._GEO_COUNTRIES)
394         if not self._ready:
395             self._real_initialize()
396             self._ready = True
397
398     def _initialize_geo_bypass(self, countries):
399         """
400         Initialize geo restriction bypass mechanism.
401
402         This method is used to initialize geo bypass mechanism based on faking
403         X-Forwarded-For HTTP header. A random country from provided country list
404         is selected and a random IP belonging to this country is generated. This
405         IP will be passed as X-Forwarded-For HTTP header in all subsequent
406         HTTP requests.
407
408         This method will be used for initial geo bypass mechanism initialization
409         during the instance initialization with _GEO_COUNTRIES.
410
411         You may also manually call it from extractor's code if geo countries
412         information is not available beforehand (e.g. obtained during
413         extraction) or due to some another reason.
414         """
415         if not self._x_forwarded_for_ip:
416             country_code = self._downloader.params.get('geo_bypass_country', None)
417             # If there is no explicit country for geo bypass specified and
418             # the extractor is known to be geo restricted let's fake IP
419             # as X-Forwarded-For right away.
420             if (not country_code and
421                     self._GEO_BYPASS and
422                     self._downloader.params.get('geo_bypass', True) and
423                     countries):
424                 country_code = random.choice(countries)
425             if country_code:
426                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
427                 if self._downloader.params.get('verbose', False):
428                     self._downloader.to_screen(
429                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
430                         % (self._x_forwarded_for_ip, country_code.upper()))
431
432     def extract(self, url):
433         """Extracts URL information and returns it in list of dicts."""
434         try:
435             for _ in range(2):
436                 try:
437                     self.initialize()
438                     ie_result = self._real_extract(url)
439                     if self._x_forwarded_for_ip:
440                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
441                     return ie_result
442                 except GeoRestrictedError as e:
443                     if self.__maybe_fake_ip_and_retry(e.countries):
444                         continue
445                     raise
446         except ExtractorError:
447             raise
448         except compat_http_client.IncompleteRead as e:
449             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
450         except (KeyError, StopIteration) as e:
451             raise ExtractorError('An extractor error has occurred.', cause=e)
452
453     def __maybe_fake_ip_and_retry(self, countries):
454         if (not self._downloader.params.get('geo_bypass_country', None) and
455                 self._GEO_BYPASS and
456                 self._downloader.params.get('geo_bypass', True) and
457                 not self._x_forwarded_for_ip and
458                 countries):
459             country_code = random.choice(countries)
460             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
461             if self._x_forwarded_for_ip:
462                 self.report_warning(
463                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
464                     % (self._x_forwarded_for_ip, country_code.upper()))
465                 return True
466         return False
467
468     def set_downloader(self, downloader):
469         """Sets the downloader for this IE."""
470         self._downloader = downloader
471
472     def _real_initialize(self):
473         """Real initialization process. Redefine in subclasses."""
474         pass
475
476     def _real_extract(self, url):
477         """Real extraction process. Redefine in subclasses."""
478         pass
479
480     @classmethod
481     def ie_key(cls):
482         """A string for getting the InfoExtractor with get_info_extractor"""
483         return compat_str(cls.__name__[:-2])
484
485     @property
486     def IE_NAME(self):
487         return compat_str(type(self).__name__[:-2])
488
489     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
490         """ Returns the response handle """
491         if note is None:
492             self.report_download_webpage(video_id)
493         elif note is not False:
494             if video_id is None:
495                 self.to_screen('%s' % (note,))
496             else:
497                 self.to_screen('%s: %s' % (video_id, note))
498
499         # Some sites check X-Forwarded-For HTTP header in order to figure out
500         # the origin of the client behind proxy. This allows bypassing geo
501         # restriction by faking this header's value to IP that belongs to some
502         # geo unrestricted country. We will do so once we encounter any
503         # geo restriction error.
504         if self._x_forwarded_for_ip:
505             if 'X-Forwarded-For' not in headers:
506                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
507
508         if isinstance(url_or_request, compat_urllib_request.Request):
509             url_or_request = update_Request(
510                 url_or_request, data=data, headers=headers, query=query)
511         else:
512             if query:
513                 url_or_request = update_url_query(url_or_request, query)
514             if data is not None or headers:
515                 url_or_request = sanitized_Request(url_or_request, data, headers)
516         try:
517             return self._downloader.urlopen(url_or_request)
518         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
519             if errnote is False:
520                 return False
521             if errnote is None:
522                 errnote = 'Unable to download webpage'
523
524             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
525             if fatal:
526                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
527             else:
528                 self._downloader.report_warning(errmsg)
529                 return False
530
531     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
532         """ Returns a tuple (page content as string, URL handle) """
533         # Strip hashes from the URL (#1038)
534         if isinstance(url_or_request, (compat_str, str)):
535             url_or_request = url_or_request.partition('#')[0]
536
537         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
538         if urlh is False:
539             assert not fatal
540             return False
541         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
542         return (content, urlh)
543
544     @staticmethod
545     def _guess_encoding_from_content(content_type, webpage_bytes):
546         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
547         if m:
548             encoding = m.group(1)
549         else:
550             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
551                           webpage_bytes[:1024])
552             if m:
553                 encoding = m.group(1).decode('ascii')
554             elif webpage_bytes.startswith(b'\xff\xfe'):
555                 encoding = 'utf-16'
556             else:
557                 encoding = 'utf-8'
558
559         return encoding
560
561     def __check_blocked(self, content):
562         first_block = content[:512]
563         if ('<title>Access to this site is blocked</title>' in content and
564                 'Websense' in first_block):
565             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
566             blocked_iframe = self._html_search_regex(
567                 r'<iframe src="([^"]+)"', content,
568                 'Websense information URL', default=None)
569             if blocked_iframe:
570                 msg += ' Visit %s for more details' % blocked_iframe
571             raise ExtractorError(msg, expected=True)
572         if '<title>The URL you requested has been blocked</title>' in first_block:
573             msg = (
574                 'Access to this webpage has been blocked by Indian censorship. '
575                 'Use a VPN or proxy server (with --proxy) to route around it.')
576             block_msg = self._html_search_regex(
577                 r'</h1><p>(.*?)</p>',
578                 content, 'block message', default=None)
579             if block_msg:
580                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
581             raise ExtractorError(msg, expected=True)
582         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
583                 'blocklist.rkn.gov.ru' in content):
584             raise ExtractorError(
585                 'Access to this webpage has been blocked by decision of the Russian government. '
586                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
587                 expected=True)
588
589     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
590         content_type = urlh.headers.get('Content-Type', '')
591         webpage_bytes = urlh.read()
592         if prefix is not None:
593             webpage_bytes = prefix + webpage_bytes
594         if not encoding:
595             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
596         if self._downloader.params.get('dump_intermediate_pages', False):
597             self.to_screen('Dumping request to ' + urlh.geturl())
598             dump = base64.b64encode(webpage_bytes).decode('ascii')
599             self._downloader.to_screen(dump)
600         if self._downloader.params.get('write_pages', False):
601             basen = '%s_%s' % (video_id, urlh.geturl())
602             if len(basen) > 240:
603                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
604                 basen = basen[:240 - len(h)] + h
605             raw_filename = basen + '.dump'
606             filename = sanitize_filename(raw_filename, restricted=True)
607             self.to_screen('Saving request to ' + filename)
608             # Working around MAX_PATH limitation on Windows (see
609             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
610             if compat_os_name == 'nt':
611                 absfilepath = os.path.abspath(filename)
612                 if len(absfilepath) > 259:
613                     filename = '\\\\?\\' + absfilepath
614             with open(filename, 'wb') as outf:
615                 outf.write(webpage_bytes)
616
617         try:
618             content = webpage_bytes.decode(encoding, 'replace')
619         except LookupError:
620             content = webpage_bytes.decode('utf-8', 'replace')
621
622         self.__check_blocked(content)
623
624         return content
625
626     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
627         """ Returns the data of the page as a string """
628         success = False
629         try_count = 0
630         while success is False:
631             try:
632                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
633                 success = True
634             except compat_http_client.IncompleteRead as e:
635                 try_count += 1
636                 if try_count >= tries:
637                     raise e
638                 self._sleep(timeout, video_id)
639         if res is False:
640             return res
641         else:
642             content, _ = res
643             return content
644
645     def _download_xml(self, url_or_request, video_id,
646                       note='Downloading XML', errnote='Unable to download XML',
647                       transform_source=None, fatal=True, encoding=None,
648                       data=None, headers={}, query={}):
649         """Return the xml as an xml.etree.ElementTree.Element"""
650         xml_string = self._download_webpage(
651             url_or_request, video_id, note, errnote, fatal=fatal,
652             encoding=encoding, data=data, headers=headers, query=query)
653         if xml_string is False:
654             return xml_string
655         return self._parse_xml(
656             xml_string, video_id, transform_source=transform_source,
657             fatal=fatal)
658
659     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
660         if transform_source:
661             xml_string = transform_source(xml_string)
662         try:
663             return compat_etree_fromstring(xml_string.encode('utf-8'))
664         except compat_xml_parse_error as ve:
665             errmsg = '%s: Failed to parse XML ' % video_id
666             if fatal:
667                 raise ExtractorError(errmsg, cause=ve)
668             else:
669                 self.report_warning(errmsg + str(ve))
670
671     def _download_json(self, url_or_request, video_id,
672                        note='Downloading JSON metadata',
673                        errnote='Unable to download JSON metadata',
674                        transform_source=None,
675                        fatal=True, encoding=None, data=None, headers={}, query={}):
676         json_string = self._download_webpage(
677             url_or_request, video_id, note, errnote, fatal=fatal,
678             encoding=encoding, data=data, headers=headers, query=query)
679         if (not fatal) and json_string is False:
680             return None
681         return self._parse_json(
682             json_string, video_id, transform_source=transform_source, fatal=fatal)
683
684     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
685         if transform_source:
686             json_string = transform_source(json_string)
687         try:
688             return json.loads(json_string)
689         except ValueError as ve:
690             errmsg = '%s: Failed to parse JSON ' % video_id
691             if fatal:
692                 raise ExtractorError(errmsg, cause=ve)
693             else:
694                 self.report_warning(errmsg + str(ve))
695
696     def report_warning(self, msg, video_id=None):
697         idstr = '' if video_id is None else '%s: ' % video_id
698         self._downloader.report_warning(
699             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
700
701     def to_screen(self, msg):
702         """Print msg to screen, prefixing it with '[ie_name]'"""
703         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
704
705     def report_extraction(self, id_or_name):
706         """Report information extraction."""
707         self.to_screen('%s: Extracting information' % id_or_name)
708
709     def report_download_webpage(self, video_id):
710         """Report webpage download."""
711         self.to_screen('%s: Downloading webpage' % video_id)
712
713     def report_age_confirmation(self):
714         """Report attempt to confirm age."""
715         self.to_screen('Confirming age')
716
717     def report_login(self):
718         """Report attempt to log in."""
719         self.to_screen('Logging in')
720
721     @staticmethod
722     def raise_login_required(msg='This video is only available for registered users'):
723         raise ExtractorError(
724             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
725             expected=True)
726
727     @staticmethod
728     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
729         raise GeoRestrictedError(msg, countries=countries)
730
731     # Methods for following #608
732     @staticmethod
733     def url_result(url, ie=None, video_id=None, video_title=None):
734         """Returns a URL that points to a page that should be processed"""
735         # TODO: ie should be the class used for getting the info
736         video_info = {'_type': 'url',
737                       'url': url,
738                       'ie_key': ie}
739         if video_id is not None:
740             video_info['id'] = video_id
741         if video_title is not None:
742             video_info['title'] = video_title
743         return video_info
744
745     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
746         urls = orderedSet(
747             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
748             for m in matches)
749         return self.playlist_result(
750             urls, playlist_id=playlist_id, playlist_title=playlist_title)
751
752     @staticmethod
753     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
754         """Returns a playlist"""
755         video_info = {'_type': 'playlist',
756                       'entries': entries}
757         if playlist_id:
758             video_info['id'] = playlist_id
759         if playlist_title:
760             video_info['title'] = playlist_title
761         if playlist_description:
762             video_info['description'] = playlist_description
763         return video_info
764
765     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
766         """
767         Perform a regex search on the given string, using a single or a list of
768         patterns returning the first matching group.
769         In case of failure return a default value or raise a WARNING or a
770         RegexNotFoundError, depending on fatal, specifying the field name.
771         """
772         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
773             mobj = re.search(pattern, string, flags)
774         else:
775             for p in pattern:
776                 mobj = re.search(p, string, flags)
777                 if mobj:
778                     break
779
780         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
781             _name = '\033[0;34m%s\033[0m' % name
782         else:
783             _name = name
784
785         if mobj:
786             if group is None:
787                 # return the first matching group
788                 return next(g for g in mobj.groups() if g is not None)
789             else:
790                 return mobj.group(group)
791         elif default is not NO_DEFAULT:
792             return default
793         elif fatal:
794             raise RegexNotFoundError('Unable to extract %s' % _name)
795         else:
796             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
797             return None
798
799     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
800         """
801         Like _search_regex, but strips HTML tags and unescapes entities.
802         """
803         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
804         if res:
805             return clean_html(res).strip()
806         else:
807             return res
808
809     def _get_netrc_login_info(self, netrc_machine=None):
810         username = None
811         password = None
812         netrc_machine = netrc_machine or self._NETRC_MACHINE
813
814         if self._downloader.params.get('usenetrc', False):
815             try:
816                 info = netrc.netrc().authenticators(netrc_machine)
817                 if info is not None:
818                     username = info[0]
819                     password = info[2]
820                 else:
821                     raise netrc.NetrcParseError(
822                         'No authenticators for %s' % netrc_machine)
823             except (IOError, netrc.NetrcParseError) as err:
824                 self._downloader.report_warning(
825                     'parsing .netrc: %s' % error_to_compat_str(err))
826
827         return username, password
828
829     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
830         """
831         Get the login info as (username, password)
832         First look for the manually specified credentials using username_option
833         and password_option as keys in params dictionary. If no such credentials
834         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
835         value.
836         If there's no info available, return (None, None)
837         """
838         if self._downloader is None:
839             return (None, None)
840
841         downloader_params = self._downloader.params
842
843         # Attempt to use provided username and password or .netrc data
844         if downloader_params.get(username_option) is not None:
845             username = downloader_params[username_option]
846             password = downloader_params[password_option]
847         else:
848             username, password = self._get_netrc_login_info(netrc_machine)
849
850         return username, password
851
852     def _get_tfa_info(self, note='two-factor verification code'):
853         """
854         Get the two-factor authentication info
855         TODO - asking the user will be required for sms/phone verify
856         currently just uses the command line option
857         If there's no info available, return None
858         """
859         if self._downloader is None:
860             return None
861         downloader_params = self._downloader.params
862
863         if downloader_params.get('twofactor') is not None:
864             return downloader_params['twofactor']
865
866         return compat_getpass('Type %s and press [Return]: ' % note)
867
868     # Helper functions for extracting OpenGraph info
869     @staticmethod
870     def _og_regexes(prop):
871         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
872         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
873                        % {'prop': re.escape(prop)})
874         template = r'<meta[^>]+?%s[^>]+?%s'
875         return [
876             template % (property_re, content_re),
877             template % (content_re, property_re),
878         ]
879
880     @staticmethod
881     def _meta_regex(prop):
882         return r'''(?isx)<meta
883                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
884                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
885
886     def _og_search_property(self, prop, html, name=None, **kargs):
887         if not isinstance(prop, (list, tuple)):
888             prop = [prop]
889         if name is None:
890             name = 'OpenGraph %s' % prop[0]
891         og_regexes = []
892         for p in prop:
893             og_regexes.extend(self._og_regexes(p))
894         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
895         if escaped is None:
896             return None
897         return unescapeHTML(escaped)
898
899     def _og_search_thumbnail(self, html, **kargs):
900         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
901
902     def _og_search_description(self, html, **kargs):
903         return self._og_search_property('description', html, fatal=False, **kargs)
904
905     def _og_search_title(self, html, **kargs):
906         return self._og_search_property('title', html, **kargs)
907
908     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
909         regexes = self._og_regexes('video') + self._og_regexes('video:url')
910         if secure:
911             regexes = self._og_regexes('video:secure_url') + regexes
912         return self._html_search_regex(regexes, html, name, **kargs)
913
914     def _og_search_url(self, html, **kargs):
915         return self._og_search_property('url', html, **kargs)
916
917     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
918         if not isinstance(name, (list, tuple)):
919             name = [name]
920         if display_name is None:
921             display_name = name[0]
922         return self._html_search_regex(
923             [self._meta_regex(n) for n in name],
924             html, display_name, fatal=fatal, group='content', **kwargs)
925
926     def _dc_search_uploader(self, html):
927         return self._html_search_meta('dc.creator', html, 'uploader')
928
929     def _rta_search(self, html):
930         # See http://www.rtalabel.org/index.php?content=howtofaq#single
931         if re.search(r'(?ix)<meta\s+name="rating"\s+'
932                      r'     content="RTA-5042-1996-1400-1577-RTA"',
933                      html):
934             return 18
935         return 0
936
937     def _media_rating_search(self, html):
938         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
939         rating = self._html_search_meta('rating', html)
940
941         if not rating:
942             return None
943
944         RATING_TABLE = {
945             'safe for kids': 0,
946             'general': 8,
947             '14 years': 14,
948             'mature': 17,
949             'restricted': 19,
950         }
951         return RATING_TABLE.get(rating.lower())
952
953     def _family_friendly_search(self, html):
954         # See http://schema.org/VideoObject
955         family_friendly = self._html_search_meta(
956             'isFamilyFriendly', html, default=None)
957
958         if not family_friendly:
959             return None
960
961         RATING_TABLE = {
962             '1': 0,
963             'true': 0,
964             '0': 18,
965             'false': 18,
966         }
967         return RATING_TABLE.get(family_friendly.lower())
968
969     def _twitter_search_player(self, html):
970         return self._html_search_meta('twitter:player', html,
971                                       'twitter card player')
972
973     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
974         json_ld = self._search_regex(
975             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
976             html, 'JSON-LD', group='json_ld', **kwargs)
977         default = kwargs.get('default', NO_DEFAULT)
978         if not json_ld:
979             return default if default is not NO_DEFAULT else {}
980         # JSON-LD may be malformed and thus `fatal` should be respected.
981         # At the same time `default` may be passed that assumes `fatal=False`
982         # for _search_regex. Let's simulate the same behavior here as well.
983         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
984         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
985
986     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
987         if isinstance(json_ld, compat_str):
988             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
989         if not json_ld:
990             return {}
991         info = {}
992         if not isinstance(json_ld, (list, tuple, dict)):
993             return info
994         if isinstance(json_ld, dict):
995             json_ld = [json_ld]
996
997         def extract_video_object(e):
998             assert e['@type'] == 'VideoObject'
999             info.update({
1000                 'url': e.get('contentUrl'),
1001                 'title': unescapeHTML(e.get('name')),
1002                 'description': unescapeHTML(e.get('description')),
1003                 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
1004                 'duration': parse_duration(e.get('duration')),
1005                 'timestamp': unified_timestamp(e.get('uploadDate')),
1006                 'filesize': float_or_none(e.get('contentSize')),
1007                 'tbr': int_or_none(e.get('bitrate')),
1008                 'width': int_or_none(e.get('width')),
1009                 'height': int_or_none(e.get('height')),
1010                 'view_count': int_or_none(e.get('interactionCount')),
1011             })
1012
1013         for e in json_ld:
1014             if e.get('@context') == 'http://schema.org':
1015                 item_type = e.get('@type')
1016                 if expected_type is not None and expected_type != item_type:
1017                     return info
1018                 if item_type in ('TVEpisode', 'Episode'):
1019                     info.update({
1020                         'episode': unescapeHTML(e.get('name')),
1021                         'episode_number': int_or_none(e.get('episodeNumber')),
1022                         'description': unescapeHTML(e.get('description')),
1023                     })
1024                     part_of_season = e.get('partOfSeason')
1025                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1026                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1027                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1028                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1029                         info['series'] = unescapeHTML(part_of_series.get('name'))
1030                 elif item_type == 'Article':
1031                     info.update({
1032                         'timestamp': parse_iso8601(e.get('datePublished')),
1033                         'title': unescapeHTML(e.get('headline')),
1034                         'description': unescapeHTML(e.get('articleBody')),
1035                     })
1036                 elif item_type == 'VideoObject':
1037                     extract_video_object(e)
1038                     continue
1039                 video = e.get('video')
1040                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1041                     extract_video_object(video)
1042                 break
1043         return dict((k, v) for k, v in info.items() if v is not None)
1044
1045     @staticmethod
1046     def _hidden_inputs(html):
1047         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1048         hidden_inputs = {}
1049         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1050             attrs = extract_attributes(input)
1051             if not input:
1052                 continue
1053             if attrs.get('type') not in ('hidden', 'submit'):
1054                 continue
1055             name = attrs.get('name') or attrs.get('id')
1056             value = attrs.get('value')
1057             if name and value is not None:
1058                 hidden_inputs[name] = value
1059         return hidden_inputs
1060
1061     def _form_hidden_inputs(self, form_id, html):
1062         form = self._search_regex(
1063             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1064             html, '%s form' % form_id, group='form')
1065         return self._hidden_inputs(form)
1066
    def _sort_formats(self, formats, field_preference=None):
        """
        Sort the formats list in place, ascending by a computed key, so that
        entries with higher preference/quality/bitrate values end up last.

        As a side effect, a missing 'tbr' is filled in from 'abr' + 'vbr' and
        a missing 'ext' is derived from the format URL.

        When field_preference is a list or tuple of field names, the key is
        just those fields in the given order (missing values count as -1, or
        '' for 'format_id'); otherwise the built-in ordering below is used.

        Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Caller-supplied ordering takes over completely
            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # Plain HTTP(S) ranks above every other protocol, rtsp lowest
            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            # In the ORDER lists a later position means a higher sort key;
            # extensions not listed get -1
            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuple comparison: earlier elements dominate later ones.
            # Missing numeric fields count as -1, a missing format_id as ''.
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1142
1143     def _check_formats(self, formats, video_id):
1144         if formats:
1145             formats[:] = filter(
1146                 lambda f: self._is_valid_url(
1147                     f['url'], video_id,
1148                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1149                 formats)
1150
1151     @staticmethod
1152     def _remove_duplicate_formats(formats):
1153         format_urls = set()
1154         unique_formats = []
1155         for f in formats:
1156             if f['url'] not in format_urls:
1157                 format_urls.add(f['url'])
1158                 unique_formats.append(f)
1159         formats[:] = unique_formats
1160
1161     def _is_valid_url(self, url, video_id, item='video', headers={}):
1162         url = self._proto_relative_url(url, scheme='http:')
1163         # For now assume non HTTP(S) URLs always valid
1164         if not (url.startswith('http://') or url.startswith('https://')):
1165             return True
1166         try:
1167             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1168             return True
1169         except ExtractorError as e:
1170             if isinstance(e.cause, compat_urllib_error.URLError):
1171                 self.to_screen(
1172                     '%s: %s URL is invalid, skipping' % (video_id, item))
1173                 return False
1174             raise
1175
1176     def http_scheme(self):
1177         """ Either "http:" or "https:", depending on the user's preferences """
1178         return (
1179             'http:'
1180             if self._downloader.params.get('prefer_insecure', False)
1181             else 'https:')
1182
1183     def _proto_relative_url(self, url, scheme=None):
1184         if url is None:
1185             return url
1186         if url.startswith('//'):
1187             if scheme is None:
1188                 scheme = self.http_scheme()
1189             return scheme + url
1190         else:
1191             return url
1192
1193     def _sleep(self, timeout, video_id, msg_template=None):
1194         if msg_template is None:
1195             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1196         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1197         self.to_screen(msg)
1198         time.sleep(timeout)
1199
1200     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1201                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1202                              fatal=True, m3u8_id=None):
1203         manifest = self._download_xml(
1204             manifest_url, video_id, 'Downloading f4m manifest',
1205             'Unable to download f4m manifest',
1206             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1207             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1208             transform_source=transform_source,
1209             fatal=fatal)
1210
1211         if manifest is False:
1212             return []
1213
1214         return self._parse_f4m_formats(
1215             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1216             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1217
    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        """
        Build a list of format dicts from a parsed f4m manifest element.

        manifest is the parsed manifest element and manifest_url the URL it
        was fetched from. Media entries pointing at further f4m or m3u8
        manifests are expanded through _extract_f4m_formats and
        _extract_m3u8_formats; every other entry becomes one 'f4m' protocol
        format. Format ids are f4m_id joined with the bitrate, or with the
        entry index when no bitrate is given.

        Returns an empty list when the manifest carries a non-empty Akamai
        player verification challenge or has no usable media nodes.
        """
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        # NOTE(review): an empty <pv-2.0/> element would have text None and
        # fail on the `in` test below - confirm whether that can occur
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        # Look for 1.0 namespaced media nodes first, then fall back to 2.0
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats

        manifest_base_url = get_base_url(manifest)

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        # An audio/* manifest mime type marks every format as having no video
        vcodec = None
        # NOTE(review): the 'base URL' label passed here names a mimeType
        # lookup; it looks copied from elsewhere - confirm before relying on it
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'base URL', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources.  See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                # NOTE(review): this rebinds the manifest_url parameter, so
                # when manifest_base_url is empty the next relative media URL
                # is resolved against this entry's URL rather than the
                # original manifest URL - confirm this is intended
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            # Plain media entry: both 'url' and 'manifest_url' carry the
            # current value of manifest_url
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'protocol': 'f4m',
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats
1315
1316     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1317         return {
1318             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1319             'url': m3u8_url,
1320             'ext': ext,
1321             'protocol': 'm3u8',
1322             'preference': preference - 100 if preference else -100,
1323             'resolution': 'multiple',
1324             'format_note': 'Quality selection URL',
1325         }
1326
1327     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1328                               entry_protocol='m3u8', preference=None,
1329                               m3u8_id=None, note=None, errnote=None,
1330                               fatal=True, live=False):
1331         res = self._download_webpage_handle(
1332             m3u8_url, video_id,
1333             note=note or 'Downloading m3u8 information',
1334             errnote=errnote or 'Failed to download m3u8 information',
1335             fatal=fatal)
1336
1337         if res is False:
1338             return []
1339
1340         m3u8_doc, urlh = res
1341         m3u8_url = urlh.geturl()
1342
1343         return self._parse_m3u8_formats(
1344             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1345             preference=preference, m3u8_id=m3u8_id, live=live)
1346
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse formats from the text of an m3u8 playlist.

        m3u8_doc       -- playlist contents as a string
        m3u8_url       -- URL of the playlist; relative URIs are resolved
                          against it and it is stored as 'manifest_url'
        ext            -- value stored as 'ext' in every format
        entry_protocol -- value stored as 'protocol' in every format
        preference     -- value stored as 'preference' in every format
        m3u8_id        -- optional prefix for generated format ids
        live           -- when true, neither the stream name nor the
                          bitrate is appended to the format id

        Returns a list of format dicts. The list is empty for playlists
        carrying an Adobe Flash Access or Apple FairPlay marker, and holds
        a single entry pointing at m3u8_url for media playlists.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Absolute http(s) URLs are kept as is, anything else is resolved
        # against the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without quality renditions.
        # Fortunately, a master playlist can be easily distinguished from a media
        # playlist based on the availability of particular tags. As of
        # [1, 4.3.3, 4.3.4] master playlist tags MUST NOT appear in a media
        # playlist and vice versa.
        # As of [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in a master playlist, thus we can
        # clearly detect a media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # Parsed EXT-X-MEDIA attribute dicts, grouped by their GROUP-ID
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; reset to an
        # empty dict once the URI line that follows it has been consumed
        last_stream_inf = {}

        def extract_media(x_media_line):
            """Record an EXT-X-MEDIA tag in groups and, for VIDEO/AUDIO
            renditions that have a URI, append a format for it."""
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            """Return a name for the current EXT-X-STREAM-INF entry, or
            None when neither NAME nor a VIDEO group reference is present."""
            # Although the specification does not mention a NAME attribute for
            # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from the corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            # Only the first rendition recorded for the group is consulted
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Any other non-blank line is treated as a variant URI and
                # described by the attributes held in last_stream_inf
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform: take the audio and (optional)
                # video bitrate out of the variant URL itself
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains an EXT-X-STREAM-INF tag which references an AUDIO
                # rendition group but does not have CODECS and, despite
                # referencing an audio group, it represents
                # a complete (with audio and video) format. So, for such cases
                # we will ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats
1505
1506     @staticmethod
1507     def _xpath_ns(path, namespace=None):
1508         if not namespace:
1509             return path
1510         out = []
1511         for c in path.split('/'):
1512             if not c or c == '.':
1513                 out.append(c)
1514             else:
1515                 out.append('{%s}%s' % (namespace, c))
1516         return '/'.join(out)
1517
1518     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1519         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1520
1521         if smil is False:
1522             assert not fatal
1523             return []
1524
1525         namespace = self._parse_smil_namespace(smil)
1526
1527         return self._parse_smil_formats(
1528             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1529
1530     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1531         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1532         if smil is False:
1533             return {}
1534         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1535
1536     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1537         return self._download_xml(
1538             smil_url, video_id, 'Downloading SMIL file',
1539             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1540
1541     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1542         namespace = self._parse_smil_namespace(smil)
1543
1544         formats = self._parse_smil_formats(
1545             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1546         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1547
1548         video_id = os.path.splitext(url_basename(smil_url))[0]
1549         title = None
1550         description = None
1551         upload_date = None
1552         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1553             name = meta.attrib.get('name')
1554             content = meta.attrib.get('content')
1555             if not name or not content:
1556                 continue
1557             if not title and name == 'title':
1558                 title = content
1559             elif not description and name in ('description', 'abstract'):
1560                 description = content
1561             elif not upload_date and name == 'date':
1562                 upload_date = unified_strdate(content)
1563
1564         thumbnails = [{
1565             'id': image.get('type'),
1566             'url': image.get('src'),
1567             'width': int_or_none(image.get('width')),
1568             'height': int_or_none(image.get('height')),
1569         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1570
1571         return {
1572             'id': video_id,
1573             'title': title or video_id,
1574             'description': description,
1575             'upload_date': upload_date,
1576             'thumbnails': thumbnails,
1577             'formats': formats,
1578             'subtitles': subtitles,
1579         }
1580
1581     def _parse_smil_namespace(self, smil):
1582         return self._search_regex(
1583             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1584
1585     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1586         base = smil_url
1587         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1588             b = meta.get('base') or meta.get('httpBase')
1589             if b:
1590                 base = b
1591                 break
1592
1593         formats = []
1594         rtmp_count = 0
1595         http_count = 0
1596         m3u8_count = 0
1597
1598         srcs = []
1599         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1600         for medium in media:
1601             src = medium.get('src')
1602             if not src or src in srcs:
1603                 continue
1604             srcs.append(src)
1605
1606             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1607             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1608             width = int_or_none(medium.get('width'))
1609             height = int_or_none(medium.get('height'))
1610             proto = medium.get('proto')
1611             ext = medium.get('ext')
1612             src_ext = determine_ext(src)
1613             streamer = medium.get('streamer') or base
1614
1615             if proto == 'rtmp' or streamer.startswith('rtmp'):
1616                 rtmp_count += 1
1617                 formats.append({
1618                     'url': streamer,
1619                     'play_path': src,
1620                     'ext': 'flv',
1621                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1622                     'tbr': bitrate,
1623                     'filesize': filesize,
1624                     'width': width,
1625                     'height': height,
1626                 })
1627                 if transform_rtmp_url:
1628                     streamer, src = transform_rtmp_url(streamer, src)
1629                     formats[-1].update({
1630                         'url': streamer,
1631                         'play_path': src,
1632                     })
1633                 continue
1634
1635             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1636             src_url = src_url.strip()
1637
1638             if proto == 'm3u8' or src_ext == 'm3u8':
1639                 m3u8_formats = self._extract_m3u8_formats(
1640                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1641                 if len(m3u8_formats) == 1:
1642                     m3u8_count += 1
1643                     m3u8_formats[0].update({
1644                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1645                         'tbr': bitrate,
1646                         'width': width,
1647                         'height': height,
1648                     })
1649                 formats.extend(m3u8_formats)
1650                 continue
1651
1652             if src_ext == 'f4m':
1653                 f4m_url = src_url
1654                 if not f4m_params:
1655                     f4m_params = {
1656                         'hdcore': '3.2.0',
1657                         'plugin': 'flowplayer-3.2.0.1',
1658                     }
1659                 f4m_url += '&' if '?' in f4m_url else '?'
1660                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1661                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1662                 continue
1663
1664             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1665                 http_count += 1
1666                 formats.append({
1667                     'url': src_url,
1668                     'ext': ext or src_ext or 'flv',
1669                     'format_id': 'http-%d' % (bitrate or http_count),
1670                     'tbr': bitrate,
1671                     'filesize': filesize,
1672                     'width': width,
1673                     'height': height,
1674                 })
1675                 continue
1676
1677         return formats
1678
1679     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1680         urls = []
1681         subtitles = {}
1682         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1683             src = textstream.get('src')
1684             if not src or src in urls:
1685                 continue
1686             urls.append(src)
1687             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1688             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1689             subtitles.setdefault(lang, []).append({
1690                 'url': src,
1691                 'ext': ext,
1692             })
1693         return subtitles
1694
1695     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1696         xspf = self._download_xml(
1697             playlist_url, playlist_id, 'Downloading xpsf playlist',
1698             'Unable to download xspf manifest', fatal=fatal)
1699         if xspf is False:
1700             return []
1701         return self._parse_xspf(xspf, playlist_id)
1702
1703     def _parse_xspf(self, playlist, playlist_id):
1704         NS_MAP = {
1705             'xspf': 'http://xspf.org/ns/0/',
1706             's1': 'http://static.streamone.nl/player/ns/0',
1707         }
1708
1709         entries = []
1710         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1711             title = xpath_text(
1712                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1713             description = xpath_text(
1714                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1715             thumbnail = xpath_text(
1716                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1717             duration = float_or_none(
1718                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1719
1720             formats = [{
1721                 'url': location.text,
1722                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1723                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1724                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1725             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1726             self._sort_formats(formats)
1727
1728             entries.append({
1729                 'id': playlist_id,
1730                 'title': title,
1731                 'description': description,
1732                 'thumbnail': thumbnail,
1733                 'duration': duration,
1734                 'formats': formats,
1735             })
1736         return entries
1737
1738     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1739         res = self._download_webpage_handle(
1740             mpd_url, video_id,
1741             note=note or 'Downloading MPD manifest',
1742             errnote=errnote or 'Failed to download MPD manifest',
1743             fatal=fatal)
1744         if res is False:
1745             return []
1746         mpd, urlh = res
1747         mpd_base_url = base_url(urlh.geturl())
1748
1749         return self._parse_mpd_formats(
1750             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1751             formats_dict=formats_dict, mpd_url=mpd_url)
1752
1753     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1754         """
1755         Parse formats from MPD manifest.
1756         References:
1757          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1758             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1759          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1760         """
1761         if mpd_doc.get('type') == 'dynamic':
1762             return []
1763
1764         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1765
1766         def _add_ns(path):
1767             return self._xpath_ns(path, namespace)
1768
1769         def is_drm_protected(element):
1770             return element.find(_add_ns('ContentProtection')) is not None
1771
1772         def extract_multisegment_info(element, ms_parent_info):
1773             ms_info = ms_parent_info.copy()
1774
1775             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1776             # common attributes and elements.  We will only extract relevant
1777             # for us.
1778             def extract_common(source):
1779                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1780                 if segment_timeline is not None:
1781                     s_e = segment_timeline.findall(_add_ns('S'))
1782                     if s_e:
1783                         ms_info['total_number'] = 0
1784                         ms_info['s'] = []
1785                         for s in s_e:
1786                             r = int(s.get('r', 0))
1787                             ms_info['total_number'] += 1 + r
1788                             ms_info['s'].append({
1789                                 't': int(s.get('t', 0)),
1790                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1791                                 'd': int(s.attrib['d']),
1792                                 'r': r,
1793                             })
1794                 start_number = source.get('startNumber')
1795                 if start_number:
1796                     ms_info['start_number'] = int(start_number)
1797                 timescale = source.get('timescale')
1798                 if timescale:
1799                     ms_info['timescale'] = int(timescale)
1800                 segment_duration = source.get('duration')
1801                 if segment_duration:
1802                     ms_info['segment_duration'] = float(segment_duration)
1803
1804             def extract_Initialization(source):
1805                 initialization = source.find(_add_ns('Initialization'))
1806                 if initialization is not None:
1807                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1808
1809             segment_list = element.find(_add_ns('SegmentList'))
1810             if segment_list is not None:
1811                 extract_common(segment_list)
1812                 extract_Initialization(segment_list)
1813                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1814                 if segment_urls_e:
1815                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1816             else:
1817                 segment_template = element.find(_add_ns('SegmentTemplate'))
1818                 if segment_template is not None:
1819                     extract_common(segment_template)
1820                     media = segment_template.get('media')
1821                     if media:
1822                         ms_info['media'] = media
1823                     initialization = segment_template.get('initialization')
1824                     if initialization:
1825                         ms_info['initialization'] = initialization
1826                     else:
1827                         extract_Initialization(segment_template)
1828             return ms_info
1829
1830         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1831         formats = []
1832         for period in mpd_doc.findall(_add_ns('Period')):
1833             period_duration = parse_duration(period.get('duration')) or mpd_duration
1834             period_ms_info = extract_multisegment_info(period, {
1835                 'start_number': 1,
1836                 'timescale': 1,
1837             })
1838             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1839                 if is_drm_protected(adaptation_set):
1840                     continue
1841                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1842                 for representation in adaptation_set.findall(_add_ns('Representation')):
1843                     if is_drm_protected(representation):
1844                         continue
1845                     representation_attrib = adaptation_set.attrib.copy()
1846                     representation_attrib.update(representation.attrib)
1847                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1848                     mime_type = representation_attrib['mimeType']
1849                     content_type = mime_type.split('/')[0]
1850                     if content_type == 'text':
1851                         # TODO implement WebVTT downloading
1852                         pass
1853                     elif content_type in ('video', 'audio'):
1854                         base_url = ''
1855                         for element in (representation, adaptation_set, period, mpd_doc):
1856                             base_url_e = element.find(_add_ns('BaseURL'))
1857                             if base_url_e is not None:
1858                                 base_url = base_url_e.text + base_url
1859                                 if re.match(r'^https?://', base_url):
1860                                     break
1861                         if mpd_base_url and not re.match(r'^https?://', base_url):
1862                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1863                                 mpd_base_url += '/'
1864                             base_url = mpd_base_url + base_url
1865                         representation_id = representation_attrib.get('id')
1866                         lang = representation_attrib.get('lang')
1867                         url_el = representation.find(_add_ns('BaseURL'))
1868                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1869                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1870                         f = {
1871                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1872                             'url': base_url,
1873                             'manifest_url': mpd_url,
1874                             'ext': mimetype2ext(mime_type),
1875                             'width': int_or_none(representation_attrib.get('width')),
1876                             'height': int_or_none(representation_attrib.get('height')),
1877                             'tbr': float_or_none(bandwidth, 1000),
1878                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1879                             'fps': int_or_none(representation_attrib.get('frameRate')),
1880                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1881                             'format_note': 'DASH %s' % content_type,
1882                             'filesize': filesize,
1883                         }
1884                         f.update(parse_codecs(representation_attrib.get('codecs')))
1885                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1886
1887                         def prepare_template(template_name, identifiers):
1888                             t = representation_ms_info[template_name]
1889                             t = t.replace('$RepresentationID$', representation_id)
1890                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1891                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1892                             t.replace('$$', '$')
1893                             return t
1894
1895                         # @initialization is a regular template like @media one
1896                         # so it should be handled just the same way (see
1897                         # https://github.com/rg3/youtube-dl/issues/11605)
1898                         if 'initialization' in representation_ms_info:
1899                             initialization_template = prepare_template(
1900                                 'initialization',
1901                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1902                                 # $Time$ shall not be included for @initialization thus
1903                                 # only $Bandwidth$ remains
1904                                 ('Bandwidth', ))
1905                             representation_ms_info['initialization_url'] = initialization_template % {
1906                                 'Bandwidth': bandwidth,
1907                             }
1908
1909                         def location_key(location):
1910                             return 'url' if re.match(r'^https?://', location) else 'path'
1911
1912                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1913
1914                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1915                             media_location_key = location_key(media_template)
1916
1917                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1918                             # can't be used at the same time
1919                             if '%(Number' in media_template and 's' not in representation_ms_info:
1920                                 segment_duration = None
1921                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
1922                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1923                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1924                                 representation_ms_info['fragments'] = [{
1925                                     media_location_key: media_template % {
1926                                         'Number': segment_number,
1927                                         'Bandwidth': bandwidth,
1928                                     },
1929                                     'duration': segment_duration,
1930                                 } for segment_number in range(
1931                                     representation_ms_info['start_number'],
1932                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1933                             else:
1934                                 # $Number*$ or $Time$ in media template with S list available
1935                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1936                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1937                                 representation_ms_info['fragments'] = []
1938                                 segment_time = 0
1939                                 segment_d = None
1940                                 segment_number = representation_ms_info['start_number']
1941
1942                                 def add_segment_url():
1943                                     segment_url = media_template % {
1944                                         'Time': segment_time,
1945                                         'Bandwidth': bandwidth,
1946                                         'Number': segment_number,
1947                                     }
1948                                     representation_ms_info['fragments'].append({
1949                                         media_location_key: segment_url,
1950                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1951                                     })
1952
1953                                 for num, s in enumerate(representation_ms_info['s']):
1954                                     segment_time = s.get('t') or segment_time
1955                                     segment_d = s['d']
1956                                     add_segment_url()
1957                                     segment_number += 1
1958                                     for r in range(s.get('r', 0)):
1959                                         segment_time += segment_d
1960                                         add_segment_url()
1961                                         segment_number += 1
1962                                     segment_time += segment_d
1963                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1964                             # No media template
1965                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1966                             # or any YouTube dashsegments video
1967                             fragments = []
1968                             segment_index = 0
1969                             timescale = representation_ms_info['timescale']
1970                             for s in representation_ms_info['s']:
1971                                 duration = float_or_none(s['d'], timescale)
1972                                 for r in range(s.get('r', 0) + 1):
1973                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
1974                                     fragments.append({
1975                                         location_key(segment_uri): segment_uri,
1976                                         'duration': duration,
1977                                     })
1978                                     segment_index += 1
1979                             representation_ms_info['fragments'] = fragments
1980                         elif 'segment_urls' in representation_ms_info:
1981                             # Segment URLs with no SegmentTimeline
1982                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
1983                             # https://github.com/rg3/youtube-dl/pull/14844
1984                             fragments = []
1985                             segment_duration = float_or_none(
1986                                 representation_ms_info['segment_duration'],
1987                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
1988                             for segment_url in representation_ms_info['segment_urls']:
1989                                 fragment = {
1990                                     location_key(segment_url): segment_url,
1991                                 }
1992                                 if segment_duration:
1993                                     fragment['duration'] = segment_duration
1994                                 fragments.append(fragment)
1995                             representation_ms_info['fragments'] = fragments
1996                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1997                         # No fragments key is present in this case.
1998                         if 'fragments' in representation_ms_info:
1999                             f.update({
2000                                 'fragment_base_url': base_url,
2001                                 'fragments': [],
2002                                 'protocol': 'http_dash_segments',
2003                             })
2004                             if 'initialization_url' in representation_ms_info:
2005                                 initialization_url = representation_ms_info['initialization_url']
2006                                 if not f.get('url'):
2007                                     f['url'] = initialization_url
2008                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2009                             f['fragments'].extend(representation_ms_info['fragments'])
2010                         try:
2011                             existing_format = next(
2012                                 fo for fo in formats
2013                                 if fo['format_id'] == representation_id)
2014                         except StopIteration:
2015                             full_info = formats_dict.get(representation_id, {}).copy()
2016                             full_info.update(f)
2017                             formats.append(full_info)
2018                         else:
2019                             existing_format.update(f)
2020                     else:
2021                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2022         return formats
2023
2024     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2025         res = self._download_webpage_handle(
2026             ism_url, video_id,
2027             note=note or 'Downloading ISM manifest',
2028             errnote=errnote or 'Failed to download ISM manifest',
2029             fatal=fatal)
2030         if res is False:
2031             return []
2032         ism, urlh = res
2033
2034         return self._parse_ism_formats(
2035             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
2036
2037     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2038         """
2039         Parse formats from ISM manifest.
2040         References:
2041          1. [MS-SSTR]: Smooth Streaming Protocol,
2042             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2043         """
2044         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2045             return []
2046
2047         duration = int(ism_doc.attrib['Duration'])
2048         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2049
2050         formats = []
2051         for stream in ism_doc.findall('StreamIndex'):
2052             stream_type = stream.get('Type')
2053             if stream_type not in ('video', 'audio'):
2054                 continue
2055             url_pattern = stream.attrib['Url']
2056             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2057             stream_name = stream.get('Name')
2058             for track in stream.findall('QualityLevel'):
2059                 fourcc = track.get('FourCC')
2060                 # TODO: add support for WVC1 and WMAP
2061                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2062                     self.report_warning('%s is not a supported codec' % fourcc)
2063                     continue
2064                 tbr = int(track.attrib['Bitrate']) // 1000
2065                 # [1] does not mention Width and Height attributes. However,
2066                 # they're often present while MaxWidth and MaxHeight are
2067                 # missing, so should be used as fallbacks
2068                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2069                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2070                 sampling_rate = int_or_none(track.get('SamplingRate'))
2071
2072                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2073                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2074
2075                 fragments = []
2076                 fragment_ctx = {
2077                     'time': 0,
2078                 }
2079                 stream_fragments = stream.findall('c')
2080                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2081                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2082                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2083                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2084                     if not fragment_ctx['duration']:
2085                         try:
2086                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2087                         except IndexError:
2088                             next_fragment_time = duration
2089                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2090                     for _ in range(fragment_repeat):
2091                         fragments.append({
2092                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2093                             'duration': fragment_ctx['duration'] / stream_timescale,
2094                         })
2095                         fragment_ctx['time'] += fragment_ctx['duration']
2096
2097                 format_id = []
2098                 if ism_id:
2099                     format_id.append(ism_id)
2100                 if stream_name:
2101                     format_id.append(stream_name)
2102                 format_id.append(compat_str(tbr))
2103
2104                 formats.append({
2105                     'format_id': '-'.join(format_id),
2106                     'url': ism_url,
2107                     'manifest_url': ism_url,
2108                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2109                     'width': width,
2110                     'height': height,
2111                     'tbr': tbr,
2112                     'asr': sampling_rate,
2113                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2114                     'acodec': 'none' if stream_type == 'video' else fourcc,
2115                     'protocol': 'ism',
2116                     'fragments': fragments,
2117                     '_download_params': {
2118                         'duration': duration,
2119                         'timescale': stream_timescale,
2120                         'width': width or 0,
2121                         'height': height or 0,
2122                         'fourcc': fourcc,
2123                         'codec_private_data': track.get('CodecPrivateData'),
2124                         'sampling_rate': sampling_rate,
2125                         'channels': int_or_none(track.get('Channels', 2)),
2126                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2127                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2128                     },
2129                 })
2130         return formats
2131
2132     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2133         def absolute_url(video_url):
2134             return compat_urlparse.urljoin(base_url, video_url)
2135
2136         def parse_content_type(content_type):
2137             if not content_type:
2138                 return {}
2139             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2140             if ctr:
2141                 mimetype, codecs = ctr.groups()
2142                 f = parse_codecs(codecs)
2143                 f['ext'] = mimetype2ext(mimetype)
2144                 return f
2145             return {}
2146
2147         def _media_formats(src, cur_media_type, type_info={}):
2148             full_url = absolute_url(src)
2149             ext = type_info.get('ext') or determine_ext(full_url)
2150             if ext == 'm3u8':
2151                 is_plain_url = False
2152                 formats = self._extract_m3u8_formats(
2153                     full_url, video_id, ext='mp4',
2154                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2155                     preference=preference, fatal=False)
2156             elif ext == 'mpd':
2157                 is_plain_url = False
2158                 formats = self._extract_mpd_formats(
2159                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2160             else:
2161                 is_plain_url = True
2162                 formats = [{
2163                     'url': full_url,
2164                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2165                 }]
2166             return is_plain_url, formats
2167
2168         entries = []
2169         # amp-video and amp-audio are very similar to their HTML5 counterparts
2170         # so we wll include them right here (see
2171         # https://www.ampproject.org/docs/reference/components/amp-video)
2172         media_tags = [(media_tag, media_type, '')
2173                       for media_tag, media_type
2174                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2175         media_tags.extend(re.findall(
2176             # We only allow video|audio followed by a whitespace or '>'.
2177             # Allowing more characters may end up in significant slow down (see
2178             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2179             # http://www.porntrex.com/maps/videositemap.xml).
2180             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2181         for media_tag, media_type, media_content in media_tags:
2182             media_info = {
2183                 'formats': [],
2184                 'subtitles': {},
2185             }
2186             media_attributes = extract_attributes(media_tag)
2187             src = media_attributes.get('src')
2188             if src:
2189                 _, formats = _media_formats(src, media_type)
2190                 media_info['formats'].extend(formats)
2191             media_info['thumbnail'] = media_attributes.get('poster')
2192             if media_content:
2193                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2194                     source_attributes = extract_attributes(source_tag)
2195                     src = source_attributes.get('src')
2196                     if not src:
2197                         continue
2198                     f = parse_content_type(source_attributes.get('type'))
2199                     is_plain_url, formats = _media_formats(src, media_type, f)
2200                     if is_plain_url:
2201                         # res attribute is not standard but seen several times
2202                         # in the wild
2203                         f.update({
2204                             'height': int_or_none(source_attributes.get('res')),
2205                             'format_id': source_attributes.get('label'),
2206                         })
2207                         f.update(formats[0])
2208                         media_info['formats'].append(f)
2209                     else:
2210                         media_info['formats'].extend(formats)
2211                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2212                     track_attributes = extract_attributes(track_tag)
2213                     kind = track_attributes.get('kind')
2214                     if not kind or kind in ('subtitles', 'captions'):
2215                         src = track_attributes.get('src')
2216                         if not src:
2217                             continue
2218                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2219                         media_info['subtitles'].setdefault(lang, []).append({
2220                             'url': absolute_url(src),
2221                         })
2222             if media_info['formats'] or media_info['subtitles']:
2223                 entries.append(media_info)
2224         return entries
2225
2226     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2227         formats = []
2228         hdcore_sign = 'hdcore=3.7.0'
2229         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2230         hds_host = hosts.get('hds')
2231         if hds_host:
2232             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2233         if 'hdcore=' not in f4m_url:
2234             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2235         f4m_formats = self._extract_f4m_formats(
2236             f4m_url, video_id, f4m_id='hds', fatal=False)
2237         for entry in f4m_formats:
2238             entry.update({'extra_param_to_segment_url': hdcore_sign})
2239         formats.extend(f4m_formats)
2240         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2241         hls_host = hosts.get('hls')
2242         if hls_host:
2243             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2244         formats.extend(self._extract_m3u8_formats(
2245             m3u8_url, video_id, 'mp4', 'm3u8_native',
2246             m3u8_id='hls', fatal=False))
2247         return formats
2248
2249     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2250         query = compat_urlparse.urlparse(url).query
2251         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2252         url_base = self._search_regex(
2253             r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
2254         http_base_url = '%s:%s' % ('http', url_base)
2255         formats = []
2256
2257         def manifest_url(manifest):
2258             m_url = '%s/%s' % (http_base_url, manifest)
2259             if query:
2260                 m_url += '?%s' % query
2261             return m_url
2262
2263         if 'm3u8' not in skip_protocols:
2264             formats.extend(self._extract_m3u8_formats(
2265                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2266                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2267         if 'f4m' not in skip_protocols:
2268             formats.extend(self._extract_f4m_formats(
2269                 manifest_url('manifest.f4m'),
2270                 video_id, f4m_id='hds', fatal=False))
2271         if 'dash' not in skip_protocols:
2272             formats.extend(self._extract_mpd_formats(
2273                 manifest_url('manifest.mpd'),
2274                 video_id, mpd_id='dash', fatal=False))
2275         if re.search(r'(?:/smil:|\.smil)', url_base):
2276             if 'smil' not in skip_protocols:
2277                 rtmp_formats = self._extract_smil_formats(
2278                     manifest_url('jwplayer.smil'),
2279                     video_id, fatal=False)
2280                 for rtmp_format in rtmp_formats:
2281                     rtsp_format = rtmp_format.copy()
2282                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2283                     del rtsp_format['play_path']
2284                     del rtsp_format['ext']
2285                     rtsp_format.update({
2286                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2287                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2288                         'protocol': 'rtsp',
2289                     })
2290                     formats.extend([rtmp_format, rtsp_format])
2291         else:
2292             for protocol in ('rtmp', 'rtsp'):
2293                 if protocol not in skip_protocols:
2294                     formats.append({
2295                         'url': '%s:%s' % (protocol, url_base),
2296                         'format_id': protocol,
2297                         'protocol': protocol,
2298                     })
2299         return formats
2300
2301     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2302         mobj = re.search(
2303             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2304             webpage)
2305         if mobj:
2306             try:
2307                 jwplayer_data = self._parse_json(mobj.group('options'),
2308                                                  video_id=video_id,
2309                                                  transform_source=transform_source)
2310             except ExtractorError:
2311                 pass
2312             else:
2313                 if isinstance(jwplayer_data, dict):
2314                     return jwplayer_data
2315
2316     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2317         jwplayer_data = self._find_jwplayer_data(
2318             webpage, video_id, transform_source=js_to_json)
2319         return self._parse_jwplayer_data(
2320             jwplayer_data, video_id, *args, **kwargs)
2321
2322     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2323                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2324         # JWPlayer backward compatibility: flattened playlists
2325         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2326         if 'playlist' not in jwplayer_data:
2327             jwplayer_data = {'playlist': [jwplayer_data]}
2328
2329         entries = []
2330
2331         # JWPlayer backward compatibility: single playlist item
2332         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2333         if not isinstance(jwplayer_data['playlist'], list):
2334             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2335
2336         for video_data in jwplayer_data['playlist']:
2337             # JWPlayer backward compatibility: flattened sources
2338             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2339             if 'sources' not in video_data:
2340                 video_data['sources'] = [video_data]
2341
2342             this_video_id = video_id or video_data['mediaid']
2343
2344             formats = self._parse_jwplayer_formats(
2345                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2346                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2347
2348             subtitles = {}
2349             tracks = video_data.get('tracks')
2350             if tracks and isinstance(tracks, list):
2351                 for track in tracks:
2352                     if not isinstance(track, dict):
2353                         continue
2354                     if track.get('kind') != 'captions':
2355                         continue
2356                     track_url = urljoin(base_url, track.get('file'))
2357                     if not track_url:
2358                         continue
2359                     subtitles.setdefault(track.get('label') or 'en', []).append({
2360                         'url': self._proto_relative_url(track_url)
2361                     })
2362
2363             entry = {
2364                 'id': this_video_id,
2365                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2366                 'description': video_data.get('description'),
2367                 'thumbnail': self._proto_relative_url(video_data.get('image')),
2368                 'timestamp': int_or_none(video_data.get('pubdate')),
2369                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2370                 'subtitles': subtitles,
2371             }
2372             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2373             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2374                 entry.update({
2375                     '_type': 'url_transparent',
2376                     'url': formats[0]['url'],
2377                 })
2378             else:
2379                 self._sort_formats(formats)
2380                 entry['formats'] = formats
2381             entries.append(entry)
2382         if len(entries) == 1:
2383             return entries[0]
2384         else:
2385             return self.playlist_result(entries)
2386
2387     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2388                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2389         urls = []
2390         formats = []
2391         for source in jwplayer_sources_data:
2392             if not isinstance(source, dict):
2393                 continue
2394             source_url = self._proto_relative_url(source.get('file'))
2395             if not source_url:
2396                 continue
2397             if base_url:
2398                 source_url = compat_urlparse.urljoin(base_url, source_url)
2399             if source_url in urls:
2400                 continue
2401             urls.append(source_url)
2402             source_type = source.get('type') or ''
2403             ext = mimetype2ext(source_type) or determine_ext(source_url)
2404             if source_type == 'hls' or ext == 'm3u8':
2405                 formats.extend(self._extract_m3u8_formats(
2406                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2407                     m3u8_id=m3u8_id, fatal=False))
2408             elif ext == 'mpd':
2409                 formats.extend(self._extract_mpd_formats(
2410                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2411             elif ext == 'smil':
2412                 formats.extend(self._extract_smil_formats(
2413                     source_url, video_id, fatal=False))
2414             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2415             elif source_type.startswith('audio') or ext in (
2416                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2417                 formats.append({
2418                     'url': source_url,
2419                     'vcodec': 'none',
2420                     'ext': ext,
2421                 })
2422             else:
2423                 height = int_or_none(source.get('height'))
2424                 if height is None:
2425                     # Often no height is provided but there is a label in
2426                     # format like "1080p", "720p SD", or 1080.
2427                     height = int_or_none(self._search_regex(
2428                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2429                         'height', default=None))
2430                 a_format = {
2431                     'url': source_url,
2432                     'width': int_or_none(source.get('width')),
2433                     'height': height,
2434                     'tbr': int_or_none(source.get('bitrate')),
2435                     'ext': ext,
2436                 }
2437                 if source_url.startswith('rtmp'):
2438                     a_format['ext'] = 'flv'
2439                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2440                     # of jwplayer.flash.swf
2441                     rtmp_url_parts = re.split(
2442                         r'((?:mp4|mp3|flv):)', source_url, 1)
2443                     if len(rtmp_url_parts) == 3:
2444                         rtmp_url, prefix, play_path = rtmp_url_parts
2445                         a_format.update({
2446                             'url': rtmp_url,
2447                             'play_path': prefix + play_path,
2448                         })
2449                     if rtmp_params:
2450                         a_format.update(rtmp_params)
2451                 formats.append(a_format)
2452         return formats
2453
2454     def _live_title(self, name):
2455         """ Generate the title for a live video """
2456         now = datetime.datetime.now()
2457         now_str = now.strftime('%Y-%m-%d %H:%M')
2458         return name + ' ' + now_str
2459
2460     def _int(self, v, name, fatal=False, **kwargs):
2461         res = int_or_none(v, **kwargs)
2462         if 'get_attr' in kwargs:
2463             print(getattr(v, kwargs['get_attr']))
2464         if res is None:
2465             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2466             if fatal:
2467                 raise ExtractorError(msg)
2468             else:
2469                 self._downloader.report_warning(msg)
2470         return res
2471
2472     def _float(self, v, name, fatal=False, **kwargs):
2473         res = float_or_none(v, **kwargs)
2474         if res is None:
2475             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2476             if fatal:
2477                 raise ExtractorError(msg)
2478             else:
2479                 self._downloader.report_warning(msg)
2480         return res
2481
2482     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2483                     path='/', secure=False, discard=False, rest={}, **kwargs):
2484         cookie = compat_cookiejar.Cookie(
2485             0, name, value, port, port is not None, domain, True,
2486             domain.startswith('.'), path, True, secure, expire_time,
2487             discard, None, None, rest)
2488         self._downloader.cookiejar.set_cookie(cookie)
2489
2490     def _get_cookies(self, url):
2491         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2492         req = sanitized_Request(url)
2493         self._downloader.cookiejar.add_cookie_header(req)
2494         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2495
2496     def get_testcases(self, include_onlymatching=False):
2497         t = getattr(self, '_TEST', None)
2498         if t:
2499             assert not hasattr(self, '_TESTS'), \
2500                 '%s has _TEST and _TESTS' % type(self).__name__
2501             tests = [t]
2502         else:
2503             tests = getattr(self, '_TESTS', [])
2504         for t in tests:
2505             if not include_onlymatching and t.get('only_matching', False):
2506                 continue
2507             t['name'] = type(self).__name__[:-len('IE')]
2508             yield t
2509
2510     def is_suitable(self, age_limit):
2511         """ Test whether the extractor is generally suitable for the given
2512         age limit (i.e. pornographic sites are not, all others usually are) """
2513
2514         any_restricted = False
2515         for tc in self.get_testcases(include_onlymatching=False):
2516             if tc.get('playlist', []):
2517                 tc = tc['playlist'][0]
2518             is_restricted = age_restricted(
2519                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2520             if not is_restricted:
2521                 return True
2522             any_restricted = any_restricted or is_restricted
2523         return not any_restricted
2524
2525     def extract_subtitles(self, *args, **kwargs):
2526         if (self._downloader.params.get('writesubtitles', False) or
2527                 self._downloader.params.get('listsubtitles')):
2528             return self._get_subtitles(*args, **kwargs)
2529         return {}
2530
2531     def _get_subtitles(self, *args, **kwargs):
2532         raise NotImplementedError('This method must be implemented by subclasses')
2533
2534     @staticmethod
2535     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2536         """ Merge subtitle items for one language. Items with duplicated URLs
2537         will be dropped. """
2538         list1_urls = set([item['url'] for item in subtitle_list1])
2539         ret = list(subtitle_list1)
2540         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2541         return ret
2542
2543     @classmethod
2544     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2545         """ Merge two subtitle dictionaries, language by language. """
2546         ret = dict(subtitle_dict1)
2547         for lang in subtitle_dict2:
2548             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2549         return ret
2550
2551     def extract_automatic_captions(self, *args, **kwargs):
2552         if (self._downloader.params.get('writeautomaticsub', False) or
2553                 self._downloader.params.get('listsubtitles')):
2554             return self._get_automatic_captions(*args, **kwargs)
2555         return {}
2556
2557     def _get_automatic_captions(self, *args, **kwargs):
2558         raise NotImplementedError('This method must be implemented by subclasses')
2559
2560     def mark_watched(self, *args, **kwargs):
2561         if (self._downloader.params.get('mark_watched', False) and
2562                 (self._get_login_info()[0] is not None or
2563                     self._downloader.params.get('cookiefile') is not None)):
2564             self._mark_watched(*args, **kwargs)
2565
2566     def _mark_watched(self, *args, **kwargs):
2567         raise NotImplementedError('This method must be implemented by subclasses')
2568
2569     def geo_verification_headers(self):
2570         headers = {}
2571         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2572         if geo_verification_proxy:
2573             headers['Ytdl-request-proxy'] = geo_verification_proxy
2574         return headers
2575
2576     def _generic_id(self, url):
2577         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2578
2579     def _generic_title(self, url):
2580         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2581
2582
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.

    They accept "URLs" of the form _SEARCH_KEY<prefix>:<query>, where
    <prefix> is either empty (a single result is requested), a positive
    integer without leading zeros (that many results) or "all"
    (_MAX_RESULTS results).
    Instances should define _SEARCH_KEY and _MAX_RESULTS, and implement
    _get_n_results.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex pattern matching this extractor's search queries"""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url matches the search pattern of this extractor"""
        pattern = cls._make_valid_url()
        return re.match(pattern, url) is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results

        Raises ExtractorError when the query does not match the search
        pattern or the requested count is not positive. A count above
        _MAX_RESULTS is clamped to _MAX_RESULTS after reporting a warning.
        """
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, search_query = match.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(search_query, 1)
        if prefix == 'all':
            return self._get_n_results(search_query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, search_query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning(
                '%s returns max %i results (you requested %i)'
                % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(search_query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class' _SEARCH_KEY"""
        return self._SEARCH_KEY