[YoutubeDL] Improve _default_format_spec (closes #14461)
[ytdl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import absolute_import, unicode_literals
5
6 import collections
7 import contextlib
8 import copy
9 import datetime
10 import errno
11 import fileinput
12 import io
13 import itertools
14 import json
15 import locale
16 import operator
17 import os
18 import platform
19 import re
20 import shutil
21 import subprocess
22 import socket
23 import sys
24 import time
25 import tokenize
26 import traceback
27 import random
28
29 from string import ascii_letters
30
31 from .compat import (
32     compat_basestring,
33     compat_cookiejar,
34     compat_get_terminal_size,
35     compat_http_client,
36     compat_kwargs,
37     compat_numeric_types,
38     compat_os_name,
39     compat_str,
40     compat_tokenize_tokenize,
41     compat_urllib_error,
42     compat_urllib_request,
43     compat_urllib_request_DataHandler,
44 )
45 from .utils import (
46     age_restricted,
47     args_to_str,
48     ContentTooShortError,
49     date_from_str,
50     DateRange,
51     DEFAULT_OUTTMPL,
52     determine_ext,
53     determine_protocol,
54     DownloadError,
55     encode_compat_str,
56     encodeFilename,
57     error_to_compat_str,
58     expand_path,
59     ExtractorError,
60     format_bytes,
61     formatSeconds,
62     GeoRestrictedError,
63     int_or_none,
64     ISO3166Utils,
65     locked_file,
66     make_HTTPS_handler,
67     MaxDownloadsReached,
68     orderedSet,
69     PagedList,
70     parse_filesize,
71     PerRequestProxyHandler,
72     platform_name,
73     PostProcessingError,
74     preferredencoding,
75     prepend_extension,
76     register_socks_protocols,
77     render_table,
78     replace_extension,
79     SameFileError,
80     sanitize_filename,
81     sanitize_path,
82     sanitize_url,
83     sanitized_Request,
84     std_headers,
85     subtitles_filename,
86     UnavailableVideoError,
87     url_basename,
88     version_tuple,
89     write_json_file,
90     write_string,
91     YoutubeDLCookieProcessor,
92     YoutubeDLHandler,
93 )
94 from .cache import Cache
95 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
96 from .extractor.openload import PhantomJSwrapper
97 from .downloader import get_suitable_downloader
98 from .downloader.rtmp import rtmpdump_version
99 from .postprocessor import (
100     FFmpegFixupM3u8PP,
101     FFmpegFixupM4aPP,
102     FFmpegFixupStretchedPP,
103     FFmpegMergerPP,
104     FFmpegPostProcessor,
105     get_postprocessor,
106 )
107 from .version import __version__
108
109 if compat_os_name == 'nt':
110     import ctypes
111
112
113 class YoutubeDL(object):
114     """YoutubeDL class.
115
116     YoutubeDL objects are the ones responsible for downloading the
117     actual video file and writing it to disk if the user has requested
118     it, among some other tasks. In most cases there should be one per
119     program. As, given a video URL, the downloader doesn't know how to
120     extract all the needed information, task that InfoExtractors do, it
121     has to pass the URL to one of them.
122
123     For this, YoutubeDL objects have a method that allows
124     InfoExtractors to be registered in a given order. When it is passed
125     a URL, the YoutubeDL object hands it to the first InfoExtractor it
126     finds that reports being able to handle it. The InfoExtractor extracts
127     all the information about the video or videos the URL refers to, and
128     YoutubeDL processes the extracted information, possibly using a File
129     Downloader to download the video.
130
131     YoutubeDL objects accept a lot of parameters. In order not to saturate
132     the object constructor with arguments, it receives a dictionary of
133     options instead. These options are available through the params
134     attribute for the InfoExtractors to use. The YoutubeDL also
135     registers itself as the downloader in charge for the InfoExtractors
136     that are added to it, so this is a "mutual registration".
137
138     Available options:
139
140     username:          Username for authentication purposes.
141     password:          Password for authentication purposes.
142     videopassword:     Password for accessing a video.
143     ap_mso:            Adobe Pass multiple-system operator identifier.
144     ap_username:       Multiple-system operator account username.
145     ap_password:       Multiple-system operator account password.
146     usenetrc:          Use netrc for authentication instead.
147     verbose:           Print additional info to stdout.
148     quiet:             Do not print messages to stdout.
149     no_warnings:       Do not print out anything for warnings.
150     forceurl:          Force printing final URL.
151     forcetitle:        Force printing title.
152     forceid:           Force printing ID.
153     forcethumbnail:    Force printing thumbnail URL.
154     forcedescription:  Force printing description.
155     forcefilename:     Force printing final filename.
156     forceduration:     Force printing duration.
157     forcejson:         Force printing info_dict as JSON.
158     dump_single_json:  Force printing the info_dict of the whole playlist
159                        (or video) as a single JSON line.
160     simulate:          Do not download the video files.
161     format:            Video format code. See options.py for more information.
162     outtmpl:           Template for output names.
163     restrictfilenames: Do not allow "&" and spaces in file names
164     ignoreerrors:      Do not stop on download errors.
165     force_generic_extractor: Force downloader to use the generic extractor
166     nooverwrites:      Prevent overwriting files.
167     playliststart:     Playlist item to start at.
168     playlistend:       Playlist item to end at.
169     playlist_items:    Specific indices of playlist to download.
170     playlistreverse:   Download playlist items in reverse order.
171     playlistrandom:    Download playlist items in random order.
172     matchtitle:        Download only matching titles.
173     rejecttitle:       Reject downloads for matching titles.
174     logger:            Log messages to a logging.Logger instance.
175     logtostderr:       Log messages to stderr instead of stdout.
176     writedescription:  Write the video description to a .description file
177     writeinfojson:     Write the video metadata to a .info.json file
178     writeannotations:  Write the video annotations to a .annotations.xml file
179     writethumbnail:    Write the thumbnail image to a file
180     write_all_thumbnails:  Write all thumbnail formats to files
181     writesubtitles:    Write the video subtitles to a file
182     writeautomaticsub: Write the automatically generated subtitles to a file
183     allsubtitles:      Downloads all the subtitles of the video
184                        (requires writesubtitles or writeautomaticsub)
185     listsubtitles:     Lists all available subtitles for the video
186     subtitlesformat:   The format code for subtitles
187     subtitleslangs:    List of languages of the subtitles to download
188     keepvideo:         Keep the video file after post-processing
189     daterange:         A DateRange object, download only if the upload_date is in the range.
190     skip_download:     Skip the actual download of the video file
191     cachedir:          Location of the cache files in the filesystem.
192                        False to disable filesystem cache.
193     noplaylist:        Download single video instead of a playlist if in doubt.
194     age_limit:         An integer representing the user's age in years.
195                        Unsuitable videos for the given age are skipped.
196     min_views:         An integer representing the minimum view count the video
197                        must have in order to not be skipped.
198                        Videos without view count information are always
199                        downloaded. None for no limit.
200     max_views:         An integer representing the maximum view count.
201                        Videos that are more popular than that are not
202                        downloaded.
203                        Videos without view count information are always
204                        downloaded. None for no limit.
205     download_archive:  File name of a file where all downloads are recorded.
206                        Videos already present in the file are not downloaded
207                        again.
208     cookiefile:        File name where cookies should be read from and dumped to.
209     nocheckcertificate:Do not verify SSL certificates
210     prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
211                        At the moment, this is only supported by YouTube.
212     proxy:             URL of the proxy server to use
213     geo_verification_proxy:  URL of the proxy to use for IP address verification
214                        on geo-restricted sites. (Experimental)
215     socket_timeout:    Time to wait for unresponsive hosts, in seconds
216     bidi_workaround:   Work around buggy terminals without bidirectional text
217                        support, using fribidi
218     debug_printtraffic:Print out sent and received HTTP traffic
219     include_ads:       Download ads as well
220     default_search:    Prepend this string if an input url is not valid.
221                        'auto' for elaborate guessing
222     encoding:          Use this encoding instead of the system-specified.
223     extract_flat:      Do not resolve URLs, return the immediate result.
224                        Pass in 'in_playlist' to only show this behavior for
225                        playlist items.
226     postprocessors:    A list of dictionaries, each with an entry
227                        * key:  The name of the postprocessor. See
228                                youtube_dl/postprocessor/__init__.py for a list.
229                        as well as any further keyword arguments for the
230                        postprocessor.
231     progress_hooks:    A list of functions that get called on download
232                        progress, with a dictionary with the entries
233                        * status: One of "downloading", "error", or "finished".
234                                  Check this first and ignore unknown values.
235
236                        If status is one of "downloading", or "finished", the
237                        following properties may also be present:
238                        * filename: The final filename (always present)
239                        * tmpfilename: The filename we're currently writing to
240                        * downloaded_bytes: Bytes on disk
241                        * total_bytes: Size of the whole file, None if unknown
242                        * total_bytes_estimate: Guess of the eventual file size,
243                                                None if unavailable.
244                        * elapsed: The number of seconds since download started.
245                        * eta: The estimated time in seconds, None if unknown
246                        * speed: The download speed in bytes/second, None if
247                                 unknown
248                        * fragment_index: The counter of the currently
249                                          downloaded video fragment.
250                        * fragment_count: The number of fragments (= individual
251                                          files that will be merged)
252
253                        Progress hooks are guaranteed to be called at least once
254                        (with status "finished") if the download is successful.
255     merge_output_format: Extension to use when merging formats.
256     fixup:             Automatically correct known faults of the file.
257                        One of:
258                        - "never": do nothing
259                        - "warn": only emit a warning
260                        - "detect_or_warn": check whether we can do anything
261                                            about it, warn otherwise (default)
262     source_address:    (Experimental) Client-side IP address to bind to.
263     call_home:         Boolean, true iff we are allowed to contact the
264                        youtube-dl servers for debugging.
265     sleep_interval:    Number of seconds to sleep before each download when
266                        used alone or a lower bound of a range for randomized
267                        sleep before each download (minimum possible number
268                        of seconds to sleep) when used along with
269                        max_sleep_interval.
270     max_sleep_interval:Upper bound of a range for randomized sleep before each
271                        download (maximum possible number of seconds to sleep).
272                        Must only be used along with sleep_interval.
273                        Actual sleep time will be a random float from range
274                        [sleep_interval; max_sleep_interval].
275     listformats:       Print an overview of available video formats and exit.
276     list_thumbnails:   Print a table of all thumbnails and exit.
277     match_filter:      A function that gets called with the info_dict of
278                        every video.
279                        If it returns a message, the video is ignored.
280                        If it returns None, the video is downloaded.
281                        match_filter_func in utils.py is one example for this.
282     no_color:          Do not emit color codes in output.
283     geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
284                        HTTP header (experimental)
285     geo_bypass_country:
286                        Two-letter ISO 3166-2 country code that will be used for
287                        explicit geographic restriction bypassing via faking
288                        X-Forwarded-For HTTP header (experimental)
289
290     The following options determine which downloader is picked:
291     external_downloader: Executable of the external downloader to call.
292                        None or unset for standard (built-in) downloader.
293     hls_prefer_native: Use the native HLS downloader instead of ffmpeg/avconv
294                        if True, otherwise use ffmpeg/avconv if False, otherwise
295                        use downloader suggested by extractor if None.
296
297     The following parameters are not used by YoutubeDL itself, they are used by
298     the downloader (see youtube_dl/downloader/common.py):
299     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
300     noresizebuffer, retries, continuedl, noprogress, consoletitle,
301     xattr_set_filesize, external_downloader_args, hls_use_mpegts.
302
303     The following options are used by the post processors:
304     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
305                        otherwise prefer avconv.
306     postprocessor_args: A list of additional command-line arguments for the
307                         postprocessor.
308
309     The following options are used by the Youtube extractor:
310     youtube_include_dash_manifest: If True (default), DASH manifests and related
311                         data will be downloaded and processed by extractor.
312                         You can reduce network I/O by disabling it if you don't
313                         care about DASH.
314     """
315
    # Names of info-dict fields that hold numeric values. prepare_filename
    # walks this set and rewrites the output template for any of these fields
    # that is missing from the template dict.
    _NUMERIC_FIELDS = set((
        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
        'timestamp', 'upload_year', 'upload_month', 'upload_day',
        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
        'average_rating', 'comment_count', 'age_limit',
        'start_time', 'end_time',
        'chapter_number', 'season_number', 'episode_number',
        'track_number', 'disc_number', 'release_year',
        'playlist_index',
    ))

    # Class-level placeholders; __init__ rebinds every one of them on the
    # instance, so these values are only seen before initialisation.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None
333
334     def __init__(self, params=None, auto_init=True):
335         """Create a FileDownloader object with the given options."""
336         if params is None:
337             params = {}
338         self._ies = []
339         self._ies_instances = {}
340         self._pps = []
341         self._progress_hooks = []
342         self._download_retcode = 0
343         self._num_downloads = 0
344         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
345         self._err_file = sys.stderr
346         self.params = {
347             # Default parameters
348             'nocheckcertificate': False,
349         }
350         self.params.update(params)
351         self.cache = Cache(self)
352
353         def check_deprecated(param, option, suggestion):
354             if self.params.get(param) is not None:
355                 self.report_warning(
356                     '%s is deprecated. Use %s instead.' % (option, suggestion))
357                 return True
358             return False
359
360         if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
361             if self.params.get('geo_verification_proxy') is None:
362                 self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
363
364         check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
365         check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
366         check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
367
368         if params.get('bidi_workaround', False):
369             try:
370                 import pty
371                 master, slave = pty.openpty()
372                 width = compat_get_terminal_size().columns
373                 if width is None:
374                     width_args = []
375                 else:
376                     width_args = ['-w', str(width)]
377                 sp_kwargs = dict(
378                     stdin=subprocess.PIPE,
379                     stdout=slave,
380                     stderr=self._err_file)
381                 try:
382                     self._output_process = subprocess.Popen(
383                         ['bidiv'] + width_args, **sp_kwargs
384                     )
385                 except OSError:
386                     self._output_process = subprocess.Popen(
387                         ['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs)
388                 self._output_channel = os.fdopen(master, 'rb')
389             except OSError as ose:
390                 if ose.errno == errno.ENOENT:
391                     self.report_warning('Could not find fribidi executable, ignoring --bidi-workaround . Make sure that  fribidi  is an executable file in one of the directories in your $PATH.')
392                 else:
393                     raise
394
395         if (sys.platform != 'win32' and
396                 sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
397                 not params.get('restrictfilenames', False)):
398             # Unicode filesystem API will throw errors (#1474, #13027)
399             self.report_warning(
400                 'Assuming --restrict-filenames since file system encoding '
401                 'cannot encode all characters. '
402                 'Set the LC_ALL environment variable to fix this.')
403             self.params['restrictfilenames'] = True
404
405         if isinstance(params.get('outtmpl'), bytes):
406             self.report_warning(
407                 'Parameter outtmpl is bytes, but should be a unicode string. '
408                 'Put  from __future__ import unicode_literals  at the top of your code file or consider switching to Python 3.x.')
409
410         self._setup_opener()
411
412         if auto_init:
413             self.print_debug_header()
414             self.add_default_info_extractors()
415
416         for pp_def_raw in self.params.get('postprocessors', []):
417             pp_class = get_postprocessor(pp_def_raw['key'])
418             pp_def = dict(pp_def_raw)
419             del pp_def['key']
420             pp = pp_class(self, **compat_kwargs(pp_def))
421             self.add_post_processor(pp)
422
423         for ph in self.params.get('progress_hooks', []):
424             self.add_progress_hook(ph)
425
426         register_socks_protocols()
427
428     def warn_if_short_id(self, argv):
429         # short YouTube ID starting with dash?
430         idxs = [
431             i for i, a in enumerate(argv)
432             if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
433         if idxs:
434             correct_argv = (
435                 ['youtube-dl'] +
436                 [a for i, a in enumerate(argv) if i not in idxs] +
437                 ['--'] + [argv[i] for i in idxs]
438             )
439             self.report_warning(
440                 'Long argument string detected. '
441                 'Use -- to separate parameters and URLs, like this:\n%s\n' %
442                 args_to_str(correct_argv))
443
444     def add_info_extractor(self, ie):
445         """Add an InfoExtractor object to the end of the list."""
446         self._ies.append(ie)
447         if not isinstance(ie, type):
448             self._ies_instances[ie.ie_key()] = ie
449             ie.set_downloader(self)
450
451     def get_info_extractor(self, ie_key):
452         """
453         Get an instance of an IE with name ie_key, it will try to get one from
454         the _ies list, if there's no instance it will create a new one and add
455         it to the extractor list.
456         """
457         ie = self._ies_instances.get(ie_key)
458         if ie is None:
459             ie = get_info_extractor(ie_key)()
460             self.add_info_extractor(ie)
461         return ie
462
463     def add_default_info_extractors(self):
464         """
465         Add the InfoExtractors returned by gen_extractors to the end of the list
466         """
467         for ie in gen_extractor_classes():
468             self.add_info_extractor(ie)
469
470     def add_post_processor(self, pp):
471         """Add a PostProcessor object to the end of the chain."""
472         self._pps.append(pp)
473         pp.set_downloader(self)
474
475     def add_progress_hook(self, ph):
476         """Add the progress hook (currently only for the file downloader)"""
477         self._progress_hooks.append(ph)
478
479     def _bidi_workaround(self, message):
480         if not hasattr(self, '_output_channel'):
481             return message
482
483         assert hasattr(self, '_output_process')
484         assert isinstance(message, compat_str)
485         line_count = message.count('\n') + 1
486         self._output_process.stdin.write((message + '\n').encode('utf-8'))
487         self._output_process.stdin.flush()
488         res = ''.join(self._output_channel.readline().decode('utf-8')
489                       for _ in range(line_count))
490         return res[:-len('\n')]
491
492     def to_screen(self, message, skip_eol=False):
493         """Print message to stdout if not in quiet mode."""
494         return self.to_stdout(message, skip_eol, check_quiet=True)
495
496     def _write_string(self, s, out=None):
497         write_string(s, out=out, encoding=self.params.get('encoding'))
498
499     def to_stdout(self, message, skip_eol=False, check_quiet=False):
500         """Print message to stdout if not in quiet mode."""
501         if self.params.get('logger'):
502             self.params['logger'].debug(message)
503         elif not check_quiet or not self.params.get('quiet', False):
504             message = self._bidi_workaround(message)
505             terminator = ['\n', ''][skip_eol]
506             output = message + terminator
507
508             self._write_string(output, self._screen_file)
509
510     def to_stderr(self, message):
511         """Print message to stderr."""
512         assert isinstance(message, compat_str)
513         if self.params.get('logger'):
514             self.params['logger'].error(message)
515         else:
516             message = self._bidi_workaround(message)
517             output = message + '\n'
518             self._write_string(output, self._err_file)
519
520     def to_console_title(self, message):
521         if not self.params.get('consoletitle', False):
522             return
523         if compat_os_name == 'nt':
524             if ctypes.windll.kernel32.GetConsoleWindow():
525                 # c_wchar_p() might not be necessary if `message` is
526                 # already of type unicode()
527                 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
528         elif 'TERM' in os.environ:
529             self._write_string('\033]0;%s\007' % message, self._screen_file)
530
531     def save_console_title(self):
532         if not self.params.get('consoletitle', False):
533             return
534         if compat_os_name != 'nt' and 'TERM' in os.environ:
535             # Save the title on stack
536             self._write_string('\033[22;0t', self._screen_file)
537
538     def restore_console_title(self):
539         if not self.params.get('consoletitle', False):
540             return
541         if compat_os_name != 'nt' and 'TERM' in os.environ:
542             # Restore the title from stack
543             self._write_string('\033[23;0t', self._screen_file)
544
545     def __enter__(self):
546         self.save_console_title()
547         return self
548
549     def __exit__(self, *args):
550         self.restore_console_title()
551
552         if self.params.get('cookiefile') is not None:
553             self.cookiejar.save()
554
555     def trouble(self, message=None, tb=None):
556         """Determine action to take when a download problem appears.
557
558         Depending on if the downloader has been configured to ignore
559         download errors or not, this method may throw an exception or
560         not when errors are found, after printing the message.
561
562         tb, if given, is additional traceback information.
563         """
564         if message is not None:
565             self.to_stderr(message)
566         if self.params.get('verbose'):
567             if tb is None:
568                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
569                     tb = ''
570                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
571                         tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
572                     tb += encode_compat_str(traceback.format_exc())
573                 else:
574                     tb_data = traceback.format_list(traceback.extract_stack())
575                     tb = ''.join(tb_data)
576             self.to_stderr(tb)
577         if not self.params.get('ignoreerrors', False):
578             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
579                 exc_info = sys.exc_info()[1].exc_info
580             else:
581                 exc_info = sys.exc_info()
582             raise DownloadError(message, exc_info)
583         self._download_retcode = 1
584
585     def report_warning(self, message):
586         '''
587         Print the message to stderr, it will be prefixed with 'WARNING:'
588         If stderr is a tty file the 'WARNING:' will be colored
589         '''
590         if self.params.get('logger') is not None:
591             self.params['logger'].warning(message)
592         else:
593             if self.params.get('no_warnings'):
594                 return
595             if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
596                 _msg_header = '\033[0;33mWARNING:\033[0m'
597             else:
598                 _msg_header = 'WARNING:'
599             warning_message = '%s %s' % (_msg_header, message)
600             self.to_stderr(warning_message)
601
602     def report_error(self, message, tb=None):
603         '''
604         Do the same as trouble, but prefixes the message with 'ERROR:', colored
605         in red if stderr is a tty file.
606         '''
607         if not self.params.get('no_color') and self._err_file.isatty() and compat_os_name != 'nt':
608             _msg_header = '\033[0;31mERROR:\033[0m'
609         else:
610             _msg_header = 'ERROR:'
611         error_message = '%s %s' % (_msg_header, message)
612         self.trouble(error_message, tb)
613
614     def report_file_already_downloaded(self, file_name):
615         """Report file has already been fully downloaded."""
616         try:
617             self.to_screen('[download] %s has already been downloaded' % file_name)
618         except UnicodeEncodeError:
619             self.to_screen('[download] The file has already been downloaded')
620
621     def prepare_filename(self, info_dict):
622         """Generate the output filename."""
623         try:
624             template_dict = dict(info_dict)
625
626             template_dict['epoch'] = int(time.time())
627             autonumber_size = self.params.get('autonumber_size')
628             if autonumber_size is None:
629                 autonumber_size = 5
630             template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
631             if template_dict.get('resolution') is None:
632                 if template_dict.get('width') and template_dict.get('height'):
633                     template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
634                 elif template_dict.get('height'):
635                     template_dict['resolution'] = '%sp' % template_dict['height']
636                 elif template_dict.get('width'):
637                     template_dict['resolution'] = '%dx?' % template_dict['width']
638
639             sanitize = lambda k, v: sanitize_filename(
640                 compat_str(v),
641                 restricted=self.params.get('restrictfilenames'),
642                 is_id=(k == 'id' or k.endswith('_id')))
643             template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
644                                  for k, v in template_dict.items()
645                                  if v is not None and not isinstance(v, (list, tuple, dict)))
646             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
647
648             outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
649
650             # For fields playlist_index and autonumber convert all occurrences
651             # of %(field)s to %(field)0Nd for backward compatibility
652             field_size_compat_map = {
653                 'playlist_index': len(str(template_dict['n_entries'])),
654                 'autonumber': autonumber_size,
655             }
656             FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
657             mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
658             if mobj:
659                 outtmpl = re.sub(
660                     FIELD_SIZE_COMPAT_RE,
661                     r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
662                     outtmpl)
663
664             # Missing numeric fields used together with integer presentation types
665             # in format specification will break the argument substitution since
666             # string 'NA' is returned for missing fields. We will patch output
667             # template for missing fields to meet string presentation type.
668             for numeric_field in self._NUMERIC_FIELDS:
669                 if numeric_field not in template_dict:
670                     # As of [1] format syntax is:
671                     #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
672                     # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
673                     FORMAT_RE = r'''(?x)
674                         (?<!%)
675                         %
676                         \({0}\)  # mapping key
677                         (?:[#0\-+ ]+)?  # conversion flags (optional)
678                         (?:\d+)?  # minimum field width (optional)
679                         (?:\.\d+)?  # precision (optional)
680                         [hlL]?  # length modifier (optional)
681                         [diouxXeEfFgGcrs%]  # conversion type
682                     '''
683                     outtmpl = re.sub(
684                         FORMAT_RE.format(numeric_field),
685                         r'%({0})s'.format(numeric_field), outtmpl)
686
687             # expand_path translates '%%' into '%' and '$$' into '$'
688             # correspondingly that is not what we want since we need to keep
689             # '%%' intact for template dict substitution step. Working around
690             # with boundary-alike separator hack.
691             sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
692             outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
693
694             # outtmpl should be expand_path'ed before template dict substitution
695             # because meta fields may contain env variables we don't want to
696             # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
697             # title "Hello $PATH", we don't want `$PATH` to be expanded.
698             filename = expand_path(outtmpl).replace(sep, '') % template_dict
699
700             # Temporary fix for #4787
701             # 'Treat' all problem characters by passing filename through preferredencoding
702             # to workaround encoding issues with subprocess on python2 @ Windows
703             if sys.version_info < (3, 0) and sys.platform == 'win32':
704                 filename = encodeFilename(filename, True).decode(preferredencoding())
705             return sanitize_path(filename)
706         except ValueError as err:
707             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
708             return None
709
710     def _match_entry(self, info_dict, incomplete):
711         """ Returns None iff the file should be downloaded """
712
713         video_title = info_dict.get('title', info_dict.get('id', 'video'))
714         if 'title' in info_dict:
715             # This can happen when we're just evaluating the playlist
716             title = info_dict['title']
717             matchtitle = self.params.get('matchtitle', False)
718             if matchtitle:
719                 if not re.search(matchtitle, title, re.IGNORECASE):
720                     return '"' + title + '" title did not match pattern "' + matchtitle + '"'
721             rejecttitle = self.params.get('rejecttitle', False)
722             if rejecttitle:
723                 if re.search(rejecttitle, title, re.IGNORECASE):
724                     return '"' + title + '" title matched reject pattern "' + rejecttitle + '"'
725         date = info_dict.get('upload_date')
726         if date is not None:
727             dateRange = self.params.get('daterange', DateRange())
728             if date not in dateRange:
729                 return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
730         view_count = info_dict.get('view_count')
731         if view_count is not None:
732             min_views = self.params.get('min_views')
733             if min_views is not None and view_count < min_views:
734                 return 'Skipping %s, because it has not reached minimum view count (%d/%d)' % (video_title, view_count, min_views)
735             max_views = self.params.get('max_views')
736             if max_views is not None and view_count > max_views:
737                 return 'Skipping %s, because it has exceeded the maximum view count (%d/%d)' % (video_title, view_count, max_views)
738         if age_restricted(info_dict.get('age_limit'), self.params.get('age_limit')):
739             return 'Skipping "%s" because it is age restricted' % video_title
740         if self.in_download_archive(info_dict):
741             return '%s has already been recorded in archive' % video_title
742
743         if not incomplete:
744             match_filter = self.params.get('match_filter')
745             if match_filter is not None:
746                 ret = match_filter(info_dict)
747                 if ret is not None:
748                     return ret
749
750         return None
751
752     @staticmethod
753     def add_extra_info(info_dict, extra_info):
754         '''Set the keys from extra_info in info dict if they are missing'''
755         for key, value in extra_info.items():
756             info_dict.setdefault(key, value)
757
758     def extract_info(self, url, download=True, ie_key=None, extra_info={},
759                      process=True, force_generic_extractor=False):
760         '''
761         Returns a list with a dictionary for each video we find.
762         If 'download', also downloads the videos.
763         extra_info is a dict containing the extra values to add to each result
764         '''
765
766         if not ie_key and force_generic_extractor:
767             ie_key = 'Generic'
768
769         if ie_key:
770             ies = [self.get_info_extractor(ie_key)]
771         else:
772             ies = self._ies
773
774         for ie in ies:
775             if not ie.suitable(url):
776                 continue
777
778             ie = self.get_info_extractor(ie.ie_key())
779             if not ie.working():
780                 self.report_warning('The program functionality for this site has been marked as broken, '
781                                     'and will probably not work.')
782
783             try:
784                 ie_result = ie.extract(url)
785                 if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
786                     break
787                 if isinstance(ie_result, list):
788                     # Backwards compatibility: old IE result format
789                     ie_result = {
790                         '_type': 'compat_list',
791                         'entries': ie_result,
792                     }
793                 self.add_default_extra_info(ie_result, ie, url)
794                 if process:
795                     return self.process_ie_result(ie_result, download, extra_info)
796                 else:
797                     return ie_result
798             except GeoRestrictedError as e:
799                 msg = e.msg
800                 if e.countries:
801                     msg += '\nThis video is available in %s.' % ', '.join(
802                         map(ISO3166Utils.short2full, e.countries))
803                 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
804                 self.report_error(msg)
805                 break
806             except ExtractorError as e:  # An error we somewhat expected
807                 self.report_error(compat_str(e), e.format_traceback())
808                 break
809             except MaxDownloadsReached:
810                 raise
811             except Exception as e:
812                 if self.params.get('ignoreerrors', False):
813                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
814                     break
815                 else:
816                     raise
817         else:
818             self.report_error('no suitable InfoExtractor for URL %s' % url)
819
820     def add_default_extra_info(self, ie_result, ie, url):
821         self.add_extra_info(ie_result, {
822             'extractor': ie.IE_NAME,
823             'webpage_url': url,
824             'webpage_url_basename': url_basename(url),
825             'extractor_key': ie.ie_key(),
826         })
827
    def process_ie_result(self, ie_result, download=True, extra_info={}):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        The handling depends on ie_result['_type'] ('video' when absent):
          * 'video': extra_info is merged in and the result is passed to
            process_video_result
          * 'url': the referenced URL is extracted with extract_info
          * 'url_transparent': the referenced URL is extracted without
            processing, then the non-None fields of ie_result (except the
            identity fields) override the extracted ones
          * 'playlist', 'multi_video': entries are selected according to
            the playlist* params and each one is processed recursively
          * 'compat_list': legacy list result, every entry is processed

        ie_result -- info dict to resolve, it is modified in place
        download -- it will also download the videos if 'download'
        extra_info -- dict with extra values for the results; it is only
                      read and forwarded by this method

        Returns the resolved ie_result.
        Raises Exception if the '_type' is not one of the above.
        """
        result_type = ie_result.get('_type', 'video')

        if result_type in ('url', 'url_transparent'):
            ie_result['url'] = sanitize_url(ie_result['url'])
            # With flat extraction the reference is returned unresolved
            extract_flat = self.params.get('extract_flat', False)
            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
                    extract_flat is True):
                if self.params.get('forcejson', False):
                    self.to_stdout(json.dumps(ie_result))
                return ie_result

        if result_type == 'video':
            self.add_extra_info(ie_result, extra_info)
            return self.process_video_result(ie_result, download=download)
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'url_transparent':
            # Use the information from the embedding page
            info = self.extract_info(
                ie_result['url'], ie_key=ie_result.get('ie_key'),
                extra_info=extra_info, download=False, process=False)

            # extract_info may return None when ignoreerrors is enabled and
            # extraction failed with an error, don't crash and return early
            # in this case
            if not info:
                return info

            # Everything set on the outer result wins over the inner one,
            # except for the fields identifying the result itself
            force_properties = dict(
                (k, v) for k, v in ie_result.items() if v is not None)
            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
                if f in force_properties:
                    del force_properties[f]
            new_result = info.copy()
            new_result.update(force_properties)

            # Extracted info may not be a video result (i.e.
            # info.get('_type', 'video') != video) but rather an url or
            # url_transparent. In such cases outer metadata (from ie_result)
            # should be propagated to inner one (info). For this to happen
            # _type of info should be overridden with url_transparent. This
            # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
            if new_result.get('_type') == 'url':
                new_result['_type'] = 'url_transparent'

            return self.process_ie_result(
                new_result, download=download, extra_info=extra_info)
        elif result_type in ('playlist', 'multi_video'):
            # We process each entry in the playlist
            playlist = ie_result.get('title') or ie_result.get('id')
            self.to_screen('[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            # playliststart is 1-based in params, convert to a 0-based offset
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend')
            # For backwards compatibility, interpret -1 as whole list
            if playlistend == -1:
                playlistend = None

            playlistitems_str = self.params.get('playlist_items')
            playlistitems = None
            if playlistitems_str is not None:
                # Expands a spec like "1,3-5" into 1, 3, 4, 5 (ranges are
                # inclusive on both ends)
                def iter_playlistitems(format):
                    for string_segment in format.split(','):
                        if '-' in string_segment:
                            start, end = string_segment.split('-')
                            for item in range(int(start), int(end) + 1):
                                yield int(item)
                        else:
                            yield int(string_segment)
                playlistitems = orderedSet(iter_playlistitems(playlistitems_str))

            ie_entries = ie_result['entries']

            # Picks the requested (1-based) items out of a list, silently
            # dropping the ones that are out of range.
            # NOTE(review): the lower bound also admits i - 1 < 0, so item 0
            # selects the last entry - confirm whether that is intended
            def make_playlistitems_entries(list_ie_entries):
                num_entries = len(list_ie_entries)
                return [
                    list_ie_entries[i - 1] for i in playlistitems
                    if -num_entries <= i - 1 < num_entries]

            def report_download(num_entries):
                self.to_screen(
                    '[%s] playlist %s: Downloading %d videos' %
                    (ie_result['extractor'], playlist, num_entries))

            # Entries may come as a plain list, a PagedList or any other
            # iterable; playlist_items takes precedence over the
            # playliststart/playlistend window in all three cases
            if isinstance(ie_entries, list):
                n_all_entries = len(ie_entries)
                if playlistitems:
                    entries = make_playlistitems_entries(ie_entries)
                else:
                    entries = ie_entries[playliststart:playlistend]
                n_entries = len(entries)
                self.to_screen(
                    '[%s] playlist %s: Collected %d video ids (downloading %d of them)' %
                    (ie_result['extractor'], playlist, n_all_entries, n_entries))
            elif isinstance(ie_entries, PagedList):
                if playlistitems:
                    entries = []
                    for item in playlistitems:
                        entries.extend(ie_entries.getslice(
                            item - 1, item
                        ))
                else:
                    entries = ie_entries.getslice(
                        playliststart, playlistend)
                n_entries = len(entries)
                report_download(n_entries)
            else:  # iterable
                if playlistitems:
                    entries = make_playlistitems_entries(list(ie_entries))
                else:
                    entries = list(itertools.islice(
                        ie_entries, playliststart, playlistend))
                n_entries = len(entries)
                report_download(n_entries)

            if self.params.get('playlistreverse', False):
                entries = entries[::-1]

            if self.params.get('playlistrandom', False):
                random.shuffle(entries)

            x_forwarded_for = ie_result.get('__x_forwarded_for_ip')

            for i, entry in enumerate(entries, 1):
                self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
                # This __x_forwarded_for_ip thing is a bit ugly but requires
                # minimal changes
                if x_forwarded_for:
                    entry['__x_forwarded_for_ip'] = x_forwarded_for
                # NOTE(review): playlist_index is the position within the
                # selected (possibly reversed/shuffled) entries shifted by
                # playliststart, so with playlist_items it may differ from
                # the position in the original playlist
                extra = {
                    'n_entries': n_entries,
                    'playlist': playlist,
                    'playlist_id': ie_result.get('id'),
                    'playlist_title': ie_result.get('title'),
                    'playlist_index': i + playliststart,
                    'extractor': ie_result['extractor'],
                    'webpage_url': ie_result['webpage_url'],
                    'webpage_url_basename': url_basename(ie_result['webpage_url']),
                    'extractor_key': ie_result['extractor_key'],
                }

                # Rejected entries are reported and left out of the results
                reason = self._match_entry(entry, incomplete=True)
                if reason is not None:
                    self.to_screen('[download] ' + reason)
                    continue

                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
            return ie_result
        elif result_type == 'compat_list':
            self.report_warning(
                'Extractor %s returned a compat_list result. '
                'It needs to be updated.' % ie_result.get('extractor'))

            # Adds the extractor/webpage fields of the list result to an
            # entry unless the entry already has them
            def _fixup(r):
                self.add_extra_info(
                    r,
                    {
                        'extractor': ie_result['extractor'],
                        'webpage_url': ie_result['webpage_url'],
                        'webpage_url_basename': url_basename(ie_result['webpage_url']),
                        'extractor_key': ie_result['extractor_key'],
                    }
                )
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download, extra_info)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)
1019
1020     def _build_format_filter(self, filter_spec):
1021         " Returns a function to filter the formats according to the filter_spec "
1022
1023         OPERATORS = {
1024             '<': operator.lt,
1025             '<=': operator.le,
1026             '>': operator.gt,
1027             '>=': operator.ge,
1028             '=': operator.eq,
1029             '!=': operator.ne,
1030         }
1031         operator_rex = re.compile(r'''(?x)\s*
1032             (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
1033             \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1034             (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
1035             $
1036             ''' % '|'.join(map(re.escape, OPERATORS.keys())))
1037         m = operator_rex.search(filter_spec)
1038         if m:
1039             try:
1040                 comparison_value = int(m.group('value'))
1041             except ValueError:
1042                 comparison_value = parse_filesize(m.group('value'))
1043                 if comparison_value is None:
1044                     comparison_value = parse_filesize(m.group('value') + 'B')
1045                 if comparison_value is None:
1046                     raise ValueError(
1047                         'Invalid value %r in format specification %r' % (
1048                             m.group('value'), filter_spec))
1049             op = OPERATORS[m.group('op')]
1050
1051         if not m:
1052             STR_OPERATORS = {
1053                 '=': operator.eq,
1054                 '!=': operator.ne,
1055                 '^=': lambda attr, value: attr.startswith(value),
1056                 '$=': lambda attr, value: attr.endswith(value),
1057                 '*=': lambda attr, value: value in attr,
1058             }
1059             str_operator_rex = re.compile(r'''(?x)
1060                 \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
1061                 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
1062                 \s*(?P<value>[a-zA-Z0-9._-]+)
1063                 \s*$
1064                 ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
1065             m = str_operator_rex.search(filter_spec)
1066             if m:
1067                 comparison_value = m.group('value')
1068                 op = STR_OPERATORS[m.group('op')]
1069
1070         if not m:
1071             raise ValueError('Invalid filter specification %r' % filter_spec)
1072
1073         def _filter(f):
1074             actual_value = f.get(m.group('key'))
1075             if actual_value is None:
1076                 return m.group('none_inclusive')
1077             return op(actual_value, comparison_value)
1078         return _filter
1079
1080     def _default_format_spec(self, info_dict, download=True):
1081
1082         def can_merge():
1083             merger = FFmpegMergerPP(self)
1084             return merger.available and merger.can_merge()
1085
1086         def prefer_best():
1087             if self.params.get('simulate', False):
1088                 return False
1089             if not download:
1090                 return False
1091             if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
1092                 return True
1093             if info_dict.get('is_live'):
1094                 return True
1095             if not can_merge():
1096                 return True
1097             return False
1098
1099         req_format_list = ['bestvideo+bestaudio', 'best']
1100         if prefer_best():
1101             req_format_list.reverse()
1102         return '/'.join(req_format_list)
1103
1104     def build_format_selector(self, format_spec):
1105         def syntax_error(note, start):
1106             message = (
1107                 'Invalid format specification: '
1108                 '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
1109             return SyntaxError(message)
1110
1111         PICKFIRST = 'PICKFIRST'
1112         MERGE = 'MERGE'
1113         SINGLE = 'SINGLE'
1114         GROUP = 'GROUP'
1115         FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
1116
1117         def _parse_filter(tokens):
1118             filter_parts = []
1119             for type, string, start, _, _ in tokens:
1120                 if type == tokenize.OP and string == ']':
1121                     return ''.join(filter_parts)
1122                 else:
1123                     filter_parts.append(string)
1124
1125         def _remove_unused_ops(tokens):
1126             # Remove operators that we don't use and join them with the surrounding strings
1127             # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
1128             ALLOWED_OPS = ('/', '+', ',', '(', ')')
1129             last_string, last_start, last_end, last_line = None, None, None, None
1130             for type, string, start, end, line in tokens:
1131                 if type == tokenize.OP and string == '[':
1132                     if last_string:
1133                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1134                         last_string = None
1135                     yield type, string, start, end, line
1136                     # everything inside brackets will be handled by _parse_filter
1137                     for type, string, start, end, line in tokens:
1138                         yield type, string, start, end, line
1139                         if type == tokenize.OP and string == ']':
1140                             break
1141                 elif type == tokenize.OP and string in ALLOWED_OPS:
1142                     if last_string:
1143                         yield tokenize.NAME, last_string, last_start, last_end, last_line
1144                         last_string = None
1145                     yield type, string, start, end, line
1146                 elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
1147                     if not last_string:
1148                         last_string = string
1149                         last_start = start
1150                         last_end = end
1151                     else:
1152                         last_string += string
1153             if last_string:
1154                 yield tokenize.NAME, last_string, last_start, last_end, last_line
1155
1156         def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
1157             selectors = []
1158             current_selector = None
1159             for type, string, start, _, _ in tokens:
1160                 # ENCODING is only defined in python 3.x
1161                 if type == getattr(tokenize, 'ENCODING', None):
1162                     continue
1163                 elif type in [tokenize.NAME, tokenize.NUMBER]:
1164                     current_selector = FormatSelector(SINGLE, string, [])
1165                 elif type == tokenize.OP:
1166                     if string == ')':
1167                         if not inside_group:
1168                             # ')' will be handled by the parentheses group
1169                             tokens.restore_last_token()
1170                         break
1171                     elif inside_merge and string in ['/', ',']:
1172                         tokens.restore_last_token()
1173                         break
1174                     elif inside_choice and string == ',':
1175                         tokens.restore_last_token()
1176                         break
1177                     elif string == ',':
1178                         if not current_selector:
1179                             raise syntax_error('"," must follow a format selector', start)
1180                         selectors.append(current_selector)
1181                         current_selector = None
1182                     elif string == '/':
1183                         if not current_selector:
1184                             raise syntax_error('"/" must follow a format selector', start)
1185                         first_choice = current_selector
1186                         second_choice = _parse_format_selection(tokens, inside_choice=True)
1187                         current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
1188                     elif string == '[':
1189                         if not current_selector:
1190                             current_selector = FormatSelector(SINGLE, 'best', [])
1191                         format_filter = _parse_filter(tokens)
1192                         current_selector.filters.append(format_filter)
1193                     elif string == '(':
1194                         if current_selector:
1195                             raise syntax_error('Unexpected "("', start)
1196                         group = _parse_format_selection(tokens, inside_group=True)
1197                         current_selector = FormatSelector(GROUP, group, [])
1198                     elif string == '+':
1199                         video_selector = current_selector
1200                         audio_selector = _parse_format_selection(tokens, inside_merge=True)
1201                         if not video_selector or not audio_selector:
1202                             raise syntax_error('"+" must be between two format selectors', start)
1203                         current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
1204                     else:
1205                         raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
1206                 elif type == tokenize.ENDMARKER:
1207                     break
1208             if current_selector:
1209                 selectors.append(current_selector)
1210             return selectors
1211
1212         def _build_selector_function(selector):
1213             if isinstance(selector, list):
1214                 fs = [_build_selector_function(s) for s in selector]
1215
1216                 def selector_function(ctx):
1217                     for f in fs:
1218                         for format in f(ctx):
1219                             yield format
1220                 return selector_function
1221             elif selector.type == GROUP:
1222                 selector_function = _build_selector_function(selector.selector)
1223             elif selector.type == PICKFIRST:
1224                 fs = [_build_selector_function(s) for s in selector.selector]
1225
1226                 def selector_function(ctx):
1227                     for f in fs:
1228                         picked_formats = list(f(ctx))
1229                         if picked_formats:
1230                             return picked_formats
1231                     return []
1232             elif selector.type == SINGLE:
1233                 format_spec = selector.selector
1234
1235                 def selector_function(ctx):
1236                     formats = list(ctx['formats'])
1237                     if not formats:
1238                         return
1239                     if format_spec == 'all':
1240                         for f in formats:
1241                             yield f
1242                     elif format_spec in ['best', 'worst', None]:
1243                         format_idx = 0 if format_spec == 'worst' else -1
1244                         audiovideo_formats = [
1245                             f for f in formats
1246                             if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
1247                         if audiovideo_formats:
1248                             yield audiovideo_formats[format_idx]
1249                         # for extractors with incomplete formats (audio only (soundcloud)
1250                         # or video only (imgur)) we will fallback to best/worst
1251                         # {video,audio}-only format
1252                         elif ctx['incomplete_formats']:
1253                             yield formats[format_idx]
1254                     elif format_spec == 'bestaudio':
1255                         audio_formats = [
1256                             f for f in formats
1257                             if f.get('vcodec') == 'none']
1258                         if audio_formats:
1259                             yield audio_formats[-1]
1260                     elif format_spec == 'worstaudio':
1261                         audio_formats = [
1262                             f for f in formats
1263                             if f.get('vcodec') == 'none']
1264                         if audio_formats:
1265                             yield audio_formats[0]
1266                     elif format_spec == 'bestvideo':
1267                         video_formats = [
1268                             f for f in formats
1269                             if f.get('acodec') == 'none']
1270                         if video_formats:
1271                             yield video_formats[-1]
1272                     elif format_spec == 'worstvideo':
1273                         video_formats = [
1274                             f for f in formats
1275                             if f.get('acodec') == 'none']
1276                         if video_formats:
1277                             yield video_formats[0]
1278                     else:
1279                         extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
1280                         if format_spec in extensions:
1281                             filter_f = lambda f: f['ext'] == format_spec
1282                         else:
1283                             filter_f = lambda f: f['format_id'] == format_spec
1284                         matches = list(filter(filter_f, formats))
1285                         if matches:
1286                             yield matches[-1]
1287             elif selector.type == MERGE:
1288                 def _merge(formats_info):
1289                     format_1, format_2 = [f['format_id'] for f in formats_info]
1290                     # The first format must contain the video and the
1291                     # second the audio
1292                     if formats_info[0].get('vcodec') == 'none':
1293                         self.report_error('The first format must '
1294                                           'contain the video, try using '
1295                                           '"-f %s+%s"' % (format_2, format_1))
1296                         return
1297                     # Formats must be opposite (video+audio)
1298                     if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
1299                         self.report_error(
1300                             'Both formats %s and %s are video-only, you must specify "-f video+audio"'
1301                             % (format_1, format_2))
1302                         return
1303                     output_ext = (
1304                         formats_info[0]['ext']
1305                         if self.params.get('merge_output_format') is None
1306                         else self.params['merge_output_format'])
1307                     return {
1308                         'requested_formats': formats_info,
1309                         'format': '%s+%s' % (formats_info[0].get('format'),
1310                                              formats_info[1].get('format')),
1311                         'format_id': '%s+%s' % (formats_info[0].get('format_id'),
1312                                                 formats_info[1].get('format_id')),
1313                         'width': formats_info[0].get('width'),
1314                         'height': formats_info[0].get('height'),
1315                         'resolution': formats_info[0].get('resolution'),
1316                         'fps': formats_info[0].get('fps'),
1317                         'vcodec': formats_info[0].get('vcodec'),
1318                         'vbr': formats_info[0].get('vbr'),
1319                         'stretched_ratio': formats_info[0].get('stretched_ratio'),
1320                         'acodec': formats_info[1].get('acodec'),
1321                         'abr': formats_info[1].get('abr'),
1322                         'ext': output_ext,
1323                     }
1324                 video_selector, audio_selector = map(_build_selector_function, selector.selector)
1325
1326                 def selector_function(ctx):
1327                     for pair in itertools.product(
1328                             video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
1329                         yield _merge(pair)
1330
1331             filters = [self._build_format_filter(f) for f in selector.filters]
1332
1333             def final_selector(ctx):
1334                 ctx_copy = copy.deepcopy(ctx)
1335                 for _filter in filters:
1336                     ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
1337                 return selector_function(ctx_copy)
1338             return final_selector
1339
1340         stream = io.BytesIO(format_spec.encode('utf-8'))
1341         try:
1342             tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
1343         except tokenize.TokenError:
1344             raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
1345
1346         class TokenIterator(object):
1347             def __init__(self, tokens):
1348                 self.tokens = tokens
1349                 self.counter = 0
1350
1351             def __iter__(self):
1352                 return self
1353
1354             def __next__(self):
1355                 if self.counter >= len(self.tokens):
1356                     raise StopIteration()
1357                 value = self.tokens[self.counter]
1358                 self.counter += 1
1359                 return value
1360
1361             next = __next__
1362
1363             def restore_last_token(self):
1364                 self.counter -= 1
1365
1366         parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
1367         return _build_selector_function(parsed_selector)
1368
1369     def _calc_headers(self, info_dict):
1370         res = std_headers.copy()
1371
1372         add_headers = info_dict.get('http_headers')
1373         if add_headers:
1374             res.update(add_headers)
1375
1376         cookies = self._calc_cookies(info_dict)
1377         if cookies:
1378             res['Cookie'] = cookies
1379
1380         if 'X-Forwarded-For' not in res:
1381             x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
1382             if x_forwarded_for_ip:
1383                 res['X-Forwarded-For'] = x_forwarded_for_ip
1384
1385         return res
1386
1387     def _calc_cookies(self, info_dict):
1388         pr = sanitized_Request(info_dict['url'])
1389         self.cookiejar.add_cookie_header(pr)
1390         return pr.get_header('Cookie')
1391
1392     def process_video_result(self, info_dict, download=True):
1393         assert info_dict.get('_type', 'video') == 'video'
1394
1395         if 'id' not in info_dict:
1396             raise ExtractorError('Missing "id" field in extractor result')
1397         if 'title' not in info_dict:
1398             raise ExtractorError('Missing "title" field in extractor result')
1399
1400         def report_force_conversion(field, field_not, conversion):
1401             self.report_warning(
1402                 '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
1403                 % (field, field_not, conversion))
1404
1405         def sanitize_string_field(info, string_field):
1406             field = info.get(string_field)
1407             if field is None or isinstance(field, compat_str):
1408                 return
1409             report_force_conversion(string_field, 'a string', 'string')
1410             info[string_field] = compat_str(field)
1411
1412         def sanitize_numeric_fields(info):
1413             for numeric_field in self._NUMERIC_FIELDS:
1414                 field = info.get(numeric_field)
1415                 if field is None or isinstance(field, compat_numeric_types):
1416                     continue
1417                 report_force_conversion(numeric_field, 'numeric', 'int')
1418                 info[numeric_field] = int_or_none(field)
1419
1420         sanitize_string_field(info_dict, 'id')
1421         sanitize_numeric_fields(info_dict)
1422
1423         if 'playlist' not in info_dict:
1424             # It isn't part of a playlist
1425             info_dict['playlist'] = None
1426             info_dict['playlist_index'] = None
1427
1428         thumbnails = info_dict.get('thumbnails')
1429         if thumbnails is None:
1430             thumbnail = info_dict.get('thumbnail')
1431             if thumbnail:
1432                 info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
1433         if thumbnails:
1434             thumbnails.sort(key=lambda t: (
1435                 t.get('preference') if t.get('preference') is not None else -1,
1436                 t.get('width') if t.get('width') is not None else -1,
1437                 t.get('height') if t.get('height') is not None else -1,
1438                 t.get('id') if t.get('id') is not None else '', t.get('url')))
1439             for i, t in enumerate(thumbnails):
1440                 t['url'] = sanitize_url(t['url'])
1441                 if t.get('width') and t.get('height'):
1442                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
1443                 if t.get('id') is None:
1444                     t['id'] = '%d' % i
1445
1446         if self.params.get('list_thumbnails'):
1447             self.list_thumbnails(info_dict)
1448             return
1449
1450         thumbnail = info_dict.get('thumbnail')
1451         if thumbnail:
1452             info_dict['thumbnail'] = sanitize_url(thumbnail)
1453         elif thumbnails:
1454             info_dict['thumbnail'] = thumbnails[-1]['url']
1455
1456         if 'display_id' not in info_dict and 'id' in info_dict:
1457             info_dict['display_id'] = info_dict['id']
1458
1459         if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
1460             # Working around out-of-range timestamp values (e.g. negative ones on Windows,
1461             # see http://bugs.python.org/issue1646728)
1462             try:
1463                 upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
1464                 info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
1465             except (ValueError, OverflowError, OSError):
1466                 pass
1467
1468         # Auto generate title fields corresponding to the *_number fields when missing
1469         # in order to always have clean titles. This is very common for TV series.
1470         for field in ('chapter', 'season', 'episode'):
1471             if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
1472                 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
1473
1474         subtitles = info_dict.get('subtitles')
1475         if subtitles:
1476             for _, subtitle in subtitles.items():
1477                 for subtitle_format in subtitle:
1478                     if subtitle_format.get('url'):
1479                         subtitle_format['url'] = sanitize_url(subtitle_format['url'])
1480                     if subtitle_format.get('ext') is None:
1481                         subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
1482
1483         if self.params.get('listsubtitles', False):
1484             if 'automatic_captions' in info_dict:
1485                 self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
1486             self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
1487             return
1488         info_dict['requested_subtitles'] = self.process_subtitles(
1489             info_dict['id'], subtitles,
1490             info_dict.get('automatic_captions'))
1491
1492         # We now pick which formats have to be downloaded
1493         if info_dict.get('formats') is None:
1494             # There's only one format available
1495             formats = [info_dict]
1496         else:
1497             formats = info_dict['formats']
1498
1499         if not formats:
1500             raise ExtractorError('No video formats found!')
1501
1502         def is_wellformed(f):
1503             url = f.get('url')
1504             if not url:
1505                 self.report_warning(
1506                     '"url" field is missing or empty - skipping format, '
1507                     'there is an error in extractor')
1508                 return False
1509             if isinstance(url, bytes):
1510                 sanitize_string_field(f, 'url')
1511             return True
1512
1513         # Filter out malformed formats for better extraction robustness
1514         formats = list(filter(is_wellformed, formats))
1515
1516         formats_dict = {}
1517
1518         # We check that all the formats have the format and format_id fields
1519         for i, format in enumerate(formats):
1520             sanitize_string_field(format, 'format_id')
1521             sanitize_numeric_fields(format)
1522             format['url'] = sanitize_url(format['url'])
1523             if not format.get('format_id'):
1524                 format['format_id'] = compat_str(i)
1525             else:
1526                 # Sanitize format_id from characters used in format selector expression
1527                 format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
1528             format_id = format['format_id']
1529             if format_id not in formats_dict:
1530                 formats_dict[format_id] = []
1531             formats_dict[format_id].append(format)
1532
1533         # Make sure all formats have unique format_id
1534         for format_id, ambiguous_formats in formats_dict.items():
1535             if len(ambiguous_formats) > 1:
1536                 for i, format in enumerate(ambiguous_formats):
1537                     format['format_id'] = '%s-%d' % (format_id, i)
1538
1539         for i, format in enumerate(formats):
1540             if format.get('format') is None:
1541                 format['format'] = '{id} - {res}{note}'.format(
1542                     id=format['format_id'],
1543                     res=self.format_resolution(format),
1544                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
1545                 )
1546             # Automatically determine file extension if missing
1547             if format.get('ext') is None:
1548                 format['ext'] = determine_ext(format['url']).lower()
1549             # Automatically determine protocol if missing (useful for format
1550             # selection purposes)
1551             if format.get('protocol') is None:
1552                 format['protocol'] = determine_protocol(format)
1553             # Add HTTP headers, so that external programs can use them from the
1554             # json output
1555             full_format_info = info_dict.copy()
1556             full_format_info.update(format)
1557             format['http_headers'] = self._calc_headers(full_format_info)
1558         # Remove private housekeeping stuff
1559         if '__x_forwarded_for_ip' in info_dict:
1560             del info_dict['__x_forwarded_for_ip']
1561
1562         # TODO Central sorting goes here
1563
1564         if formats[0] is not info_dict:
1565             # only set the 'formats' fields if the original info_dict list them
1566             # otherwise we end up with a circular reference, the first (and unique)
1567             # element in the 'formats' field in info_dict is info_dict itself,
1568             # which can't be exported to json
1569             info_dict['formats'] = formats
1570         if self.params.get('listformats'):
1571             self.list_formats(info_dict)
1572             return
1573
1574         req_format = self.params.get('format')
1575         if req_format is None:
1576             req_format = self._default_format_spec(info_dict, download=download)
1577             if self.params.get('verbose'):
1578                 self.to_stdout('[debug] Default format spec: %s' % req_format)
1579
1580         format_selector = self.build_format_selector(req_format)
1581
1582         # While in format selection we may need to have an access to the original
1583         # format set in order to calculate some metrics or do some processing.
1584         # For now we need to be able to guess whether original formats provided
1585         # by extractor are incomplete or not (i.e. whether extractor provides only
1586         # video-only or audio-only formats) for proper formats selection for
1587         # extractors with such incomplete formats (see
1588         # https://github.com/rg3/youtube-dl/pull/5556).
1589         # Since formats may be filtered during format selection and may not match
1590         # the original formats the results may be incorrect. Thus original formats
1591         # or pre-calculated metrics should be passed to format selection routines
1592         # as well.
1593         # We will pass a context object containing all necessary additional data
1594         # instead of just formats.
1595         # This fixes incorrect format selection issue (see
1596         # https://github.com/rg3/youtube-dl/issues/10083).
1597         incomplete_formats = (
1598             # All formats are video-only or
1599             all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
1600             # all formats are audio-only
1601             all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
1602
1603         ctx = {
1604             'formats': formats,
1605             'incomplete_formats': incomplete_formats,
1606         }
1607
1608         formats_to_download = list(format_selector(ctx))
1609         if not formats_to_download:
1610             raise ExtractorError('requested format not available',
1611                                  expected=True)
1612
1613         if download:
1614             if len(formats_to_download) > 1:
1615                 self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
1616             for format in formats_to_download:
1617                 new_info = dict(info_dict)
1618                 new_info.update(format)
1619                 self.process_info(new_info)
1620         # We update the info dict with the best quality format (backwards compatibility)
1621         info_dict.update(formats_to_download[-1])
1622         return info_dict
1623
1624     def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
1625         """Select the requested subtitles and their format"""
1626         available_subs = {}
1627         if normal_subtitles and self.params.get('writesubtitles'):
1628             available_subs.update(normal_subtitles)
1629         if automatic_captions and self.params.get('writeautomaticsub'):
1630             for lang, cap_info in automatic_captions.items():
1631                 if lang not in available_subs:
1632                     available_subs[lang] = cap_info
1633
1634         if (not self.params.get('writesubtitles') and not
1635                 self.params.get('writeautomaticsub') or not
1636                 available_subs):
1637             return None
1638
1639         if self.params.get('allsubtitles', False):
1640             requested_langs = available_subs.keys()
1641         else:
1642             if self.params.get('subtitleslangs', False):
1643                 requested_langs = self.params.get('subtitleslangs')
1644             elif 'en' in available_subs:
1645                 requested_langs = ['en']
1646             else:
1647                 requested_langs = [list(available_subs.keys())[0]]
1648
1649         formats_query = self.params.get('subtitlesformat', 'best')
1650         formats_preference = formats_query.split('/') if formats_query else []
1651         subs = {}
1652         for lang in requested_langs:
1653             formats = available_subs.get(lang)
1654             if formats is None:
1655                 self.report_warning('%s subtitles not available for %s' % (lang, video_id))
1656                 continue
1657             for ext in formats_preference:
1658                 if ext == 'best':
1659                     f = formats[-1]
1660                     break
1661                 matches = list(filter(lambda f: f['ext'] == ext, formats))
1662                 if matches:
1663                     f = matches[-1]
1664                     break
1665             else:
1666                 f = formats[-1]
1667                 self.report_warning(
1668                     'No subtitle format found matching "%s" for language %s, '
1669                     'using %s' % (formats_query, lang, f['ext']))
1670             subs[lang] = f
1671         return subs
1672
1673     def process_info(self, info_dict):
1674         """Process a single resolved IE result."""
1675
1676         assert info_dict.get('_type', 'video') == 'video'
1677
1678         max_downloads = self.params.get('max_downloads')
1679         if max_downloads is not None:
1680             if self._num_downloads >= int(max_downloads):
1681                 raise MaxDownloadsReached()
1682
1683         info_dict['fulltitle'] = info_dict['title']
1684         if len(info_dict['title']) > 200:
1685             info_dict['title'] = info_dict['title'][:197] + '...'
1686
1687         if 'format' not in info_dict:
1688             info_dict['format'] = info_dict['ext']
1689
1690         reason = self._match_entry(info_dict, incomplete=False)
1691         if reason is not None:
1692             self.to_screen('[download] ' + reason)
1693             return
1694
1695         self._num_downloads += 1
1696
1697         info_dict['_filename'] = filename = self.prepare_filename(info_dict)
1698
1699         # Forced printings
1700         if self.params.get('forcetitle', False):
1701             self.to_stdout(info_dict['fulltitle'])
1702         if self.params.get('forceid', False):
1703             self.to_stdout(info_dict['id'])
1704         if self.params.get('forceurl', False):
1705             if info_dict.get('requested_formats') is not None:
1706                 for f in info_dict['requested_formats']:
1707                     self.to_stdout(f['url'] + f.get('play_path', ''))
1708             else:
1709                 # For RTMP URLs, also include the playpath
1710                 self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
1711         if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
1712             self.to_stdout(info_dict['thumbnail'])
1713         if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
1714             self.to_stdout(info_dict['description'])
1715         if self.params.get('forcefilename', False) and filename is not None:
1716             self.to_stdout(filename)
1717         if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
1718             self.to_stdout(formatSeconds(info_dict['duration']))
1719         if self.params.get('forceformat', False):
1720             self.to_stdout(info_dict['format'])
1721         if self.params.get('forcejson', False):
1722             self.to_stdout(json.dumps(info_dict))
1723
1724         # Do nothing else if in simulate mode
1725         if self.params.get('simulate', False):
1726             return
1727
1728         if filename is None:
1729             return
1730
1731         def ensure_dir_exists(path):
1732             try:
1733                 dn = os.path.dirname(path)
1734                 if dn and not os.path.exists(dn):
1735                     os.makedirs(dn)
1736                 return True
1737             except (OSError, IOError) as err:
1738                 self.report_error('unable to create directory ' + error_to_compat_str(err))
1739                 return False
1740
1741         if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
1742             return
1743
1744         if self.params.get('writedescription', False):
1745             descfn = replace_extension(filename, 'description', info_dict.get('ext'))
1746             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(descfn)):
1747                 self.to_screen('[info] Video description is already present')
1748             elif info_dict.get('description') is None:
1749                 self.report_warning('There\'s no description to write.')
1750             else:
1751                 try:
1752                     self.to_screen('[info] Writing video description to: ' + descfn)
1753                     with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
1754                         descfile.write(info_dict['description'])
1755                 except (OSError, IOError):
1756                     self.report_error('Cannot write description file ' + descfn)
1757                     return
1758
1759         if self.params.get('writeannotations', False):
1760             annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
1761             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
1762                 self.to_screen('[info] Video annotations are already present')
1763             else:
1764                 try:
1765                     self.to_screen('[info] Writing video annotations to: ' + annofn)
1766                     with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
1767                         annofile.write(info_dict['annotations'])
1768                 except (KeyError, TypeError):
1769                     self.report_warning('There are no annotations to write.')
1770                 except (OSError, IOError):
1771                     self.report_error('Cannot write annotations file: ' + annofn)
1772                     return
1773
1774         subtitles_are_requested = any([self.params.get('writesubtitles', False),
1775                                        self.params.get('writeautomaticsub')])
1776
1777         if subtitles_are_requested and info_dict.get('requested_subtitles'):
1778             # subtitles download errors are already managed as troubles in relevant IE
1779             # that way it will silently go on when used with unsupporting IE
1780             subtitles = info_dict['requested_subtitles']
1781             ie = self.get_info_extractor(info_dict['extractor_key'])
1782             for sub_lang, sub_info in subtitles.items():
1783                 sub_format = sub_info['ext']
1784                 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
1785                 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
1786                     self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
1787                 else:
1788                     self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
1789                     if sub_info.get('data') is not None:
1790                         try:
1791                             # Use newline='' to prevent conversion of newline characters
1792                             # See https://github.com/rg3/youtube-dl/issues/10268
1793                             with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
1794                                 subfile.write(sub_info['data'])
1795                         except (OSError, IOError):
1796                             self.report_error('Cannot write subtitles file ' + sub_filename)
1797                             return
1798                     else:
1799                         try:
1800                             sub_data = ie._request_webpage(
1801                                 sub_info['url'], info_dict['id'], note=False).read()
1802                             with io.open(encodeFilename(sub_filename), 'wb') as subfile:
1803                                 subfile.write(sub_data)
1804                         except (ExtractorError, IOError, OSError, ValueError) as err:
1805                             self.report_warning('Unable to download subtitle for "%s": %s' %
1806                                                 (sub_lang, error_to_compat_str(err)))
1807                             continue
1808
1809         if self.params.get('writeinfojson', False):
1810             infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
1811             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)):
1812                 self.to_screen('[info] Video description metadata is already present')
1813             else:
1814                 self.to_screen('[info] Writing video description metadata as JSON to: ' + infofn)
1815                 try:
1816                     write_json_file(self.filter_requested_info(info_dict), infofn)
1817                 except (OSError, IOError):
1818                     self.report_error('Cannot write metadata to JSON file ' + infofn)
1819                     return
1820
1821         self._write_thumbnails(info_dict, filename)
1822
1823         if not self.params.get('skip_download', False):
1824             try:
1825                 def dl(name, info):
1826                     fd = get_suitable_downloader(info, self.params)(self, self.params)
1827                     for ph in self._progress_hooks:
1828                         fd.add_progress_hook(ph)
1829                     if self.params.get('verbose'):
1830                         self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
1831                     return fd.download(name, info)
1832
1833                 if info_dict.get('requested_formats') is not None:
1834                     downloaded = []
1835                     success = True
1836                     merger = FFmpegMergerPP(self)
1837                     if not merger.available:
1838                         postprocessors = []
1839                         self.report_warning('You have requested multiple '
1840                                             'formats but ffmpeg or avconv are not installed.'
1841                                             ' The formats won\'t be merged.')
1842                     else:
1843                         postprocessors = [merger]
1844
1845                     def compatible_formats(formats):
1846                         video, audio = formats
1847                         # Check extension
1848                         video_ext, audio_ext = audio.get('ext'), video.get('ext')
1849                         if video_ext and audio_ext:
1850                             COMPATIBLE_EXTS = (
1851                                 ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
1852                                 ('webm')
1853                             )
1854                             for exts in COMPATIBLE_EXTS:
1855                                 if video_ext in exts and audio_ext in exts:
1856                                     return True
1857                         # TODO: Check acodec/vcodec
1858                         return False
1859
1860                     filename_real_ext = os.path.splitext(filename)[1][1:]
1861                     filename_wo_ext = (
1862                         os.path.splitext(filename)[0]
1863                         if filename_real_ext == info_dict['ext']
1864                         else filename)
1865                     requested_formats = info_dict['requested_formats']
1866                     if self.params.get('merge_output_format') is None and not compatible_formats(requested_formats):
1867                         info_dict['ext'] = 'mkv'
1868                         self.report_warning(
1869                             'Requested formats are incompatible for merge and will be merged into mkv.')
1870                     # Ensure filename always has a correct extension for successful merge
1871                     filename = '%s.%s' % (filename_wo_ext, info_dict['ext'])
1872                     if os.path.exists(encodeFilename(filename)):
1873                         self.to_screen(
1874                             '[download] %s has already been downloaded and '
1875                             'merged' % filename)
1876                     else:
1877                         for f in requested_formats:
1878                             new_info = dict(info_dict)
1879                             new_info.update(f)
1880                             fname = prepend_extension(
1881                                 self.prepare_filename(new_info),
1882                                 'f%s' % f['format_id'], new_info['ext'])
1883                             if not ensure_dir_exists(fname):
1884                                 return
1885                             downloaded.append(fname)
1886                             partial_success = dl(fname, new_info)
1887                             success = success and partial_success
1888                         info_dict['__postprocessors'] = postprocessors
1889                         info_dict['__files_to_merge'] = downloaded
1890                 else:
1891                     # Just a single file
1892                     success = dl(filename, info_dict)
1893             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1894                 self.report_error('unable to download video data: %s' % error_to_compat_str(err))
1895                 return
1896             except (OSError, IOError) as err:
1897                 raise UnavailableVideoError(err)
1898             except (ContentTooShortError, ) as err:
1899                 self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
1900                 return
1901
1902             if success and filename != '-':
1903                 # Fixup content
1904                 fixup_policy = self.params.get('fixup')
1905                 if fixup_policy is None:
1906                     fixup_policy = 'detect_or_warn'
1907
1908                 INSTALL_FFMPEG_MESSAGE = 'Install ffmpeg or avconv to fix this automatically.'
1909
1910                 stretched_ratio = info_dict.get('stretched_ratio')
1911                 if stretched_ratio is not None and stretched_ratio != 1:
1912                     if fixup_policy == 'warn':
1913                         self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
1914                             info_dict['id'], stretched_ratio))
1915                     elif fixup_policy == 'detect_or_warn':
1916                         stretched_pp = FFmpegFixupStretchedPP(self)
1917                         if stretched_pp.available:
1918                             info_dict.setdefault('__postprocessors', [])
1919                             info_dict['__postprocessors'].append(stretched_pp)
1920                         else:
1921                             self.report_warning(
1922                                 '%s: Non-uniform pixel ratio (%s). %s'
1923                                 % (info_dict['id'], stretched_ratio, INSTALL_FFMPEG_MESSAGE))
1924                     else:
1925                         assert fixup_policy in ('ignore', 'never')
1926
1927                 if (info_dict.get('requested_formats') is None and
1928                         info_dict.get('container') == 'm4a_dash'):
1929                     if fixup_policy == 'warn':
1930                         self.report_warning(
1931                             '%s: writing DASH m4a. '
1932                             'Only some players support this container.'
1933                             % info_dict['id'])
1934                     elif fixup_policy == 'detect_or_warn':
1935                         fixup_pp = FFmpegFixupM4aPP(self)
1936                         if fixup_pp.available:
1937                             info_dict.setdefault('__postprocessors', [])
1938                             info_dict['__postprocessors'].append(fixup_pp)
1939                         else:
1940                             self.report_warning(
1941                                 '%s: writing DASH m4a. '
1942                                 'Only some players support this container. %s'
1943                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1944                     else:
1945                         assert fixup_policy in ('ignore', 'never')
1946
1947                 if (info_dict.get('protocol') == 'm3u8_native' or
1948                         info_dict.get('protocol') == 'm3u8' and
1949                         self.params.get('hls_prefer_native')):
1950                     if fixup_policy == 'warn':
1951                         self.report_warning('%s: malformed AAC bitstream detected.' % (
1952                             info_dict['id']))
1953                     elif fixup_policy == 'detect_or_warn':
1954                         fixup_pp = FFmpegFixupM3u8PP(self)
1955                         if fixup_pp.available:
1956                             info_dict.setdefault('__postprocessors', [])
1957                             info_dict['__postprocessors'].append(fixup_pp)
1958                         else:
1959                             self.report_warning(
1960                                 '%s: malformed AAC bitstream detected. %s'
1961                                 % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
1962                     else:
1963                         assert fixup_policy in ('ignore', 'never')
1964
1965                 try:
1966                     self.post_process(filename, info_dict)
1967                 except (PostProcessingError) as err:
1968                     self.report_error('postprocessing: %s' % str(err))
1969                     return
1970                 self.record_download_archive(info_dict)
1971
1972     def download(self, url_list):
1973         """Download a given list of URLs."""
1974         outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
1975         if (len(url_list) > 1 and
1976                 outtmpl != '-' and
1977                 '%' not in outtmpl and
1978                 self.params.get('max_downloads') != 1):
1979             raise SameFileError(outtmpl)
1980
1981         for url in url_list:
1982             try:
1983                 # It also downloads the videos
1984                 res = self.extract_info(
1985                     url, force_generic_extractor=self.params.get('force_generic_extractor', False))
1986             except UnavailableVideoError:
1987                 self.report_error('unable to download video')
1988             except MaxDownloadsReached:
1989                 self.to_screen('[info] Maximum number of downloaded files reached.')
1990                 raise
1991             else:
1992                 if self.params.get('dump_single_json', False):
1993                     self.to_stdout(json.dumps(res))
1994
1995         return self._download_retcode
1996
1997     def download_with_info_file(self, info_filename):
1998         with contextlib.closing(fileinput.FileInput(
1999                 [info_filename], mode='r',
2000                 openhook=fileinput.hook_encoded('utf-8'))) as f:
2001             # FileInput doesn't have a read method, we can't call json.load
2002             info = self.filter_requested_info(json.loads('\n'.join(f)))
2003         try:
2004             self.process_ie_result(info, download=True)
2005         except DownloadError:
2006             webpage_url = info.get('webpage_url')
2007             if webpage_url is not None:
2008                 self.report_warning('The info failed to download, trying with "%s"' % webpage_url)
2009                 return self.download([webpage_url])
2010             else:
2011                 raise
2012         return self._download_retcode
2013
2014     @staticmethod
2015     def filter_requested_info(info_dict):
2016         return dict(
2017             (k, v) for k, v in info_dict.items()
2018             if k not in ['requested_formats', 'requested_subtitles'])
2019
2020     def post_process(self, filename, ie_info):
2021         """Run all the postprocessors on the given file."""
2022         info = dict(ie_info)
2023         info['filepath'] = filename
2024         pps_chain = []
2025         if ie_info.get('__postprocessors') is not None:
2026             pps_chain.extend(ie_info['__postprocessors'])
2027         pps_chain.extend(self._pps)
2028         for pp in pps_chain:
2029             files_to_delete = []
2030             try:
2031                 files_to_delete, info = pp.run(info)
2032             except PostProcessingError as e:
2033                 self.report_error(e.msg)
2034             if files_to_delete and not self.params.get('keepvideo', False):
2035                 for old_filename in files_to_delete:
2036                     self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename)
2037                     try:
2038                         os.remove(encodeFilename(old_filename))
2039                     except (IOError, OSError):
2040                         self.report_warning('Unable to remove downloaded original file')
2041
2042     def _make_archive_id(self, info_dict):
2043         # Future-proof against any change in case
2044         # and backwards compatibility with prior versions
2045         extractor = info_dict.get('extractor_key')
2046         if extractor is None:
2047             if 'id' in info_dict:
2048                 extractor = info_dict.get('ie_key')  # key in a playlist
2049         if extractor is None:
2050             return None  # Incomplete video information
2051         return extractor.lower() + ' ' + info_dict['id']
2052
2053     def in_download_archive(self, info_dict):
2054         fn = self.params.get('download_archive')
2055         if fn is None:
2056             return False
2057
2058         vid_id = self._make_archive_id(info_dict)
2059         if vid_id is None:
2060             return False  # Incomplete video information
2061
2062         try:
2063             with locked_file(fn, 'r', encoding='utf-8') as archive_file:
2064                 for line in archive_file:
2065                     if line.strip() == vid_id:
2066                         return True
2067         except IOError as ioe:
2068             if ioe.errno != errno.ENOENT:
2069                 raise
2070         return False
2071
2072     def record_download_archive(self, info_dict):
2073         fn = self.params.get('download_archive')
2074         if fn is None:
2075             return
2076         vid_id = self._make_archive_id(info_dict)
2077         assert vid_id
2078         with locked_file(fn, 'a', encoding='utf-8') as archive_file:
2079             archive_file.write(vid_id + '\n')
2080
2081     @staticmethod
2082     def format_resolution(format, default='unknown'):
2083         if format.get('vcodec') == 'none':
2084             return 'audio only'
2085         if format.get('resolution') is not None:
2086             return format['resolution']
2087         if format.get('height') is not None:
2088             if format.get('width') is not None:
2089                 res = '%sx%s' % (format['width'], format['height'])
2090             else:
2091                 res = '%sp' % format['height']
2092         elif format.get('width') is not None:
2093             res = '%dx?' % format['width']
2094         else:
2095             res = default
2096         return res
2097
2098     def _format_note(self, fdict):
2099         res = ''
2100         if fdict.get('ext') in ['f4f', 'f4m']:
2101             res += '(unsupported) '
2102         if fdict.get('language'):
2103             if res:
2104                 res += ' '
2105             res += '[%s] ' % fdict['language']
2106         if fdict.get('format_note') is not None:
2107             res += fdict['format_note'] + ' '
2108         if fdict.get('tbr') is not None:
2109             res += '%4dk ' % fdict['tbr']
2110         if fdict.get('container') is not None:
2111             if res:
2112                 res += ', '
2113             res += '%s container' % fdict['container']
2114         if (fdict.get('vcodec') is not None and
2115                 fdict.get('vcodec') != 'none'):
2116             if res:
2117                 res += ', '
2118             res += fdict['vcodec']
2119             if fdict.get('vbr') is not None:
2120                 res += '@'
2121         elif fdict.get('vbr') is not None and fdict.get('abr') is not None:
2122             res += 'video@'
2123         if fdict.get('vbr') is not None:
2124             res += '%4dk' % fdict['vbr']
2125         if fdict.get('fps') is not None:
2126             if res:
2127                 res += ', '
2128             res += '%sfps' % fdict['fps']
2129         if fdict.get('acodec') is not None:
2130             if res:
2131                 res += ', '
2132             if fdict['acodec'] == 'none':
2133                 res += 'video only'
2134             else:
2135                 res += '%-5s' % fdict['acodec']
2136         elif fdict.get('abr') is not None:
2137             if res:
2138                 res += ', '
2139             res += 'audio'
2140         if fdict.get('abr') is not None:
2141             res += '@%3dk' % fdict['abr']
2142         if fdict.get('asr') is not None:
2143             res += ' (%5dHz)' % fdict['asr']
2144         if fdict.get('filesize') is not None:
2145             if res:
2146                 res += ', '
2147             res += format_bytes(fdict['filesize'])
2148         elif fdict.get('filesize_approx') is not None:
2149             if res:
2150                 res += ', '
2151             res += '~' + format_bytes(fdict['filesize_approx'])
2152         return res
2153
2154     def list_formats(self, info_dict):
2155         formats = info_dict.get('formats', [info_dict])
2156         table = [
2157             [f['format_id'], f['ext'], self.format_resolution(f), self._format_note(f)]
2158             for f in formats
2159             if f.get('preference') is None or f['preference'] >= -1000]
2160         if len(formats) > 1:
2161             table[-1][-1] += (' ' if table[-1][-1] else '') + '(best)'
2162
2163         header_line = ['format code', 'extension', 'resolution', 'note']
2164         self.to_screen(
2165             '[info] Available formats for %s:\n%s' %
2166             (info_dict['id'], render_table(header_line, table)))
2167
2168     def list_thumbnails(self, info_dict):
2169         thumbnails = info_dict.get('thumbnails')
2170         if not thumbnails:
2171             self.to_screen('[info] No thumbnails present for %s' % info_dict['id'])
2172             return
2173
2174         self.to_screen(
2175             '[info] Thumbnails for %s:' % info_dict['id'])
2176         self.to_screen(render_table(
2177             ['ID', 'width', 'height', 'URL'],
2178             [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
2179
2180     def list_subtitles(self, video_id, subtitles, name='subtitles'):
2181         if not subtitles:
2182             self.to_screen('%s has no %s' % (video_id, name))
2183             return
2184         self.to_screen(
2185             'Available %s for %s:' % (name, video_id))
2186         self.to_screen(render_table(
2187             ['Language', 'formats'],
2188             [[lang, ', '.join(f['ext'] for f in reversed(formats))]
2189                 for lang, formats in subtitles.items()]))
2190
2191     def urlopen(self, req):
2192         """ Start an HTTP download """
2193         if isinstance(req, compat_basestring):
2194             req = sanitized_Request(req)
2195         return self._opener.open(req, timeout=self._socket_timeout)
2196
    def print_debug_header(self):
        """Write the '[debug]' diagnostic header when the 'verbose' param is set.

        Reports encodings, the youtube-dl version, the git HEAD (if it can be
        determined), the Python version, versions of external executables and
        the proxy map. With the 'call_home' param it additionally fetches the
        public IP address and the latest released version over the network.
        """
        if not self.params.get('verbose'):
            return

        if type('') is not compat_str:
            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
            self.report_warning(
                'Your Python is broken! Update to a newer and supported version')

        # sys.stdout may lack an 'encoding' attribute; report its type instead
        stdout_encoding = getattr(
            sys.stdout, 'encoding', 'missing (%s)' % type(sys.stdout).__name__)
        encoding_str = (
            '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % (
                locale.getpreferredencoding(),
                sys.getfilesystemencoding(),
                stdout_encoding,
                self.get_encoding()))
        write_string(encoding_str, encoding=None)

        self._write_string('[debug] youtube-dl version ' + __version__ + '\n')
        if _LAZY_LOADER:
            self._write_string('[debug] Lazy loading extractors enabled' + '\n')
        # Best effort: ask git for the HEAD revision of the directory holding
        # this file; any failure is silently ignored
        try:
            sp = subprocess.Popen(
                ['git', 'rev-parse', '--short', 'HEAD'],
                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                cwd=os.path.dirname(os.path.abspath(__file__)))
            out, err = sp.communicate()
            out = out.decode().strip()
            # Only print output that starts with hexadecimal digits
            if re.match('[0-9a-f]+', out):
                self._write_string('[debug] Git HEAD: ' + out + '\n')
        except Exception:
            # The call to sys.exc_clear is guarded as well, so its own
            # failure is ignored too
            try:
                sys.exc_clear()
            except Exception:
                pass
        self._write_string('[debug] Python version %s - %s\n' % (
            platform.python_version(), platform_name()))

        # Collect versions of external programs; entries with a false-y
        # version are left out of the report
        exe_versions = FFmpegPostProcessor.get_versions(self)
        exe_versions['rtmpdump'] = rtmpdump_version()
        exe_versions['phantomjs'] = PhantomJSwrapper._version()
        exe_str = ', '.join(
            '%s %s' % (exe, v)
            for exe, v in sorted(exe_versions.items())
            if v
        )
        if not exe_str:
            exe_str = 'none'
        self._write_string('[debug] exe versions: %s\n' % exe_str)

        # Merge the proxies of every opener handler that exposes them
        proxy_map = {}
        for handler in self._opener.handlers:
            if hasattr(handler, 'proxies'):
                proxy_map.update(handler.proxies)
        self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n')

        if self.params.get('call_home', False):
            # These requests go out over the network via self.urlopen
            ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8')
            self._write_string('[debug] Public IP address: %s\n' % ipaddr)
            latest_version = self.urlopen(
                'https://yt-dl.org/latest/version').read().decode('utf-8')
            if version_tuple(latest_version) > version_tuple(__version__):
                self.report_warning(
                    'You are using an outdated version (newest version: %s)! '
                    'See https://yt-dl.org/update if you need help updating.' %
                    latest_version)
2264
2265     def _setup_opener(self):
2266         timeout_val = self.params.get('socket_timeout')
2267         self._socket_timeout = 600 if timeout_val is None else float(timeout_val)
2268
2269         opts_cookiefile = self.params.get('cookiefile')
2270         opts_proxy = self.params.get('proxy')
2271
2272         if opts_cookiefile is None:
2273             self.cookiejar = compat_cookiejar.CookieJar()
2274         else:
2275             opts_cookiefile = expand_path(opts_cookiefile)
2276             self.cookiejar = compat_cookiejar.MozillaCookieJar(
2277                 opts_cookiefile)
2278             if os.access(opts_cookiefile, os.R_OK):
2279                 self.cookiejar.load()
2280
2281         cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
2282         if opts_proxy is not None:
2283             if opts_proxy == '':
2284                 proxies = {}
2285             else:
2286                 proxies = {'http': opts_proxy, 'https': opts_proxy}
2287         else:
2288             proxies = compat_urllib_request.getproxies()
2289             # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
2290             if 'http' in proxies and 'https' not in proxies:
2291                 proxies['https'] = proxies['http']
2292         proxy_handler = PerRequestProxyHandler(proxies)
2293
2294         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
2295         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
2296         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
2297         data_handler = compat_urllib_request_DataHandler()
2298
2299         # When passing our own FileHandler instance, build_opener won't add the
2300         # default FileHandler and allows us to disable the file protocol, which
2301         # can be used for malicious purposes (see
2302         # https://github.com/rg3/youtube-dl/issues/8227)
2303         file_handler = compat_urllib_request.FileHandler()
2304
2305         def file_open(*args, **kwargs):
2306             raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in youtube-dl for security reasons')
2307         file_handler.file_open = file_open
2308
2309         opener = compat_urllib_request.build_opener(
2310             proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)
2311
2312         # Delete the default user-agent header, which would otherwise apply in
2313         # cases where our custom HTTP handler doesn't come into play
2314         # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
2315         opener.addheaders = []
2316         self._opener = opener
2317
2318     def encode(self, s):
2319         if isinstance(s, bytes):
2320             return s  # Already encoded
2321
2322         try:
2323             return s.encode(self.get_encoding())
2324         except UnicodeEncodeError as err:
2325             err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.'
2326             raise
2327
2328     def get_encoding(self):
2329         encoding = self.params.get('encoding')
2330         if encoding is None:
2331             encoding = preferredencoding()
2332         return encoding
2333
2334     def _write_thumbnails(self, info_dict, filename):
2335         if self.params.get('writethumbnail', False):
2336             thumbnails = info_dict.get('thumbnails')
2337             if thumbnails:
2338                 thumbnails = [thumbnails[-1]]
2339         elif self.params.get('write_all_thumbnails', False):
2340             thumbnails = info_dict.get('thumbnails')
2341         else:
2342             return
2343
2344         if not thumbnails:
2345             # No thumbnails present, so return immediately
2346             return
2347
2348         for t in thumbnails:
2349             thumb_ext = determine_ext(t['url'], 'jpg')
2350             suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
2351             thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
2352             t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
2353
2354             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
2355                 self.to_screen('[%s] %s: Thumbnail %sis already present' %
2356                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2357             else:
2358                 self.to_screen('[%s] %s: Downloading thumbnail %s...' %
2359                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
2360                 try:
2361                     uf = self.urlopen(t['url'])
2362                     with open(encodeFilename(thumb_filename), 'wb') as thumbf:
2363                         shutil.copyfileobj(uf, thumbf)
2364                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
2365                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
2366                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2367                     self.report_warning('Unable to download thumbnail "%s": %s' %
2368                                         (t['url'], error_to_compat_str(err)))