Taylor Fox Dahlin committed on
Commit
39acdce
·
unverified ·
1 Parent(s): 1776b2f

Fix/404 error on adaptive (#799)

Browse files

* Fixes slight mistake in implementation of splice, and added unit test.

* Added sequential filestream request support for adaptive downloads, with unit testing.

* Removed some type hints

* Extracted default values to variables

* Made lru_caches limited to default (128)

pytube/cipher.py CHANGED
@@ -247,7 +247,7 @@ def splice(arr: List, b: int):
247
  >>> splice([1, 2, 3, 4], 2)
248
  [1, 2]
249
  """
250
- return arr[:b] + arr[b * 2 :]
251
 
252
 
253
  def swap(arr: List, b: int):
 
247
  >>> splice([1, 2, 3, 4], 2)
248
  [1, 2]
249
  """
250
+ return arr[:b]
251
 
252
 
253
  def swap(arr: List, b: int):
pytube/request.py CHANGED
@@ -2,21 +2,20 @@
2
  """Implements a simple wrapper around urlopen."""
3
  import logging
4
  from functools import lru_cache
5
- from http.client import HTTPResponse
6
- from typing import Dict
7
- from typing import Iterable
8
- from typing import Optional
9
  from urllib.request import Request
10
  from urllib.request import urlopen
11
 
 
 
 
12
  logger = logging.getLogger(__name__)
 
 
13
 
14
 
15
- def _execute_request(
16
- url: str,
17
- method: Optional[str] = None,
18
- headers: Optional[Dict[str, str]] = None,
19
- ) -> HTTPResponse:
20
  base_headers = {"User-Agent": "Mozilla/5.0"}
21
  if headers:
22
  base_headers.update(headers)
@@ -27,7 +26,7 @@ def _execute_request(
27
  return urlopen(request) # nosec
28
 
29
 
30
- def get(url, extra_headers=None) -> str:
31
  """Send an http GET request.
32
 
33
  :param str url:
@@ -43,9 +42,51 @@ def get(url, extra_headers=None) -> str:
43
  return _execute_request(url, headers=extra_headers).read().decode("utf-8")
44
 
45
 
46
- def stream(
47
- url: str, chunk_size: int = 4096, range_size: int = 9437184
48
- ) -> Iterable[bytes]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  """Read the response in chunks.
50
  :param str url: The URL to perform the GET request for.
51
  :param int chunk_size: The size in bytes of each chunk. Defaults to 4KB
@@ -76,8 +117,8 @@ def stream(
76
  return # pylint: disable=R1711
77
 
78
 
79
- @lru_cache(maxsize=None)
80
- def filesize(url: str) -> int:
81
  """Fetch size in bytes of file at given URL
82
 
83
  :param str url: The URL to get the size of
@@ -86,7 +127,59 @@ def filesize(url: str) -> int:
86
  return int(head(url)["content-length"])
87
 
88
 
89
- def head(url: str) -> Dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  """Fetch headers returned http GET request.
91
 
92
  :param str url:
 
2
  """Implements a simple wrapper around urlopen."""
3
  import logging
4
  from functools import lru_cache
5
+ import re
6
+ from urllib import parse
 
 
7
  from urllib.request import Request
8
  from urllib.request import urlopen
9
 
10
+ from pytube.exceptions import RegexMatchError
11
+ from pytube.helpers import regex_search
12
+
13
  logger = logging.getLogger(__name__)
14
+ default_chunk_size = 4096 # 4kb
15
+ default_range_size = 9437184 # 9MB
16
 
17
 
18
+ def _execute_request(url, method=None, headers=None):
 
 
 
 
19
  base_headers = {"User-Agent": "Mozilla/5.0"}
20
  if headers:
21
  base_headers.update(headers)
 
26
  return urlopen(request) # nosec
27
 
28
 
29
+ def get(url, extra_headers=None):
30
  """Send an http GET request.
31
 
32
  :param str url:
 
42
  return _execute_request(url, headers=extra_headers).read().decode("utf-8")
43
 
44
 
45
def seq_stream(url, chunk_size=default_chunk_size, range_size=default_range_size):
    """Read the response in sequence.

    Some adaptive streams 404 on plain range requests; YouTube instead
    expects a request sequence number (``sq``) as a query parameter.
    Segment 0 returns stream headers describing how many further
    segments exist, which are then fetched one by one.

    :param str url: The URL to perform the GET request for.
    :param int chunk_size: The size in bytes of each chunk. Defaults to 4KB
    :param int range_size: The size in bytes of each range request. Defaults
        to 9MB
    :rtype: Iterable[bytes]
    :raises RegexMatchError: If the 0th segment carries no Segment-Count
        header.
    """
    # YouTube expects a request sequence number as part of the parameters.
    split_url = parse.urlsplit(url)
    # split_url.path already starts with '/', so join without inserting one
    # (the previous '%s://%s/%s?' form produced a '//' in the path).
    base_url = '%s://%s%s?' % (split_url.scheme, split_url.netloc, split_url.path)

    querys = dict(parse.parse_qsl(split_url.query))

    # The 0th sequential request provides the file headers, which tell us
    # information about how the file is segmented.
    querys['sq'] = 0
    url = base_url + parse.urlencode(querys)

    segment_data = b''
    for chunk in stream(url):
        yield chunk
        segment_data += chunk

    # We can then parse the header to find the number of segments.
    # Initialize to 0 so a missing header is detected below instead of
    # triggering a NameError on an unbound local.
    segment_count = 0
    stream_info = segment_data.split(b'\r\n')
    segment_count_pattern = re.compile(b'Segment-Count: (\\d+)')
    for line in stream_info:
        match = segment_count_pattern.search(line)
        if match:
            segment_count = int(match.group(1).decode('utf-8'))

    # Fail loudly, mirroring seq_filesize's behavior for the same condition.
    if segment_count == 0:
        raise RegexMatchError('seq_stream', segment_count_pattern.pattern)

    # We request these segments sequentially to build the file.
    seq_num = 1
    while seq_num <= segment_count:
        # Create sequential request URL
        querys['sq'] = seq_num
        url = base_url + parse.urlencode(querys)

        yield from stream(url)
        seq_num += 1
    return  # pylint: disable=R1711
+ return # pylint: disable=R1711
87
+
88
+
89
+ def stream(url, chunk_size=default_chunk_size, range_size=default_range_size):
90
  """Read the response in chunks.
91
  :param str url: The URL to perform the GET request for.
92
  :param int chunk_size: The size in bytes of each chunk. Defaults to 4KB
 
117
  return # pylint: disable=R1711
118
 
119
 
120
+ @lru_cache()
121
+ def filesize(url):
122
  """Fetch size in bytes of file at given URL
123
 
124
  :param str url: The URL to get the size of
 
127
  return int(head(url)["content-length"])
128
 
129
 
130
@lru_cache()
def seq_filesize(url):
    """Fetch size in bytes of file at given URL from sequential requests

    Sums the length of the 0th (header) segment plus the content-length
    of each subsequent segment, as reported by HEAD requests.

    :param str url: The URL to get the size of
    :returns: int: size in bytes of remote file
    :raises RegexMatchError: If the 0th segment carries no Segment-Count
        header.
    """
    total_filesize = 0
    # YouTube expects a request sequence number as part of the parameters.
    split_url = parse.urlsplit(url)
    # split_url.path already starts with '/', so join without inserting one
    # (the previous '%s://%s/%s?' form produced a '//' in the path).
    base_url = '%s://%s%s?' % (split_url.scheme, split_url.netloc, split_url.path)
    querys = dict(parse.parse_qsl(split_url.query))

    # The 0th sequential request provides the file headers, which tell us
    # information about how the file is segmented.
    querys['sq'] = 0
    url = base_url + parse.urlencode(querys)
    response = _execute_request(url, method="GET")

    response_value = response.read()
    # The file header must be added to the total filesize
    total_filesize += len(response_value)

    # We can then parse the header to find the number of segments
    segment_count = 0
    stream_info = response_value.split(b'\r\n')
    segment_regex = b'Segment-Count: (\\d+)'
    for line in stream_info:
        # One of the lines should contain the segment count, but we don't know
        # which, so we need to iterate through the lines to find it
        try:
            segment_count = int(regex_search(segment_regex, line, 1))
            # The header carries a single Segment-Count line, so stop once
            # found rather than paying a raised exception per remaining line.
            break
        except RegexMatchError:
            pass

    if segment_count == 0:
        raise RegexMatchError('seq_filesize', segment_regex)

    # We make HEAD requests to the segments sequentially to find the total filesize.
    seq_num = 1
    while seq_num <= segment_count:
        # Create sequential request URL
        querys['sq'] = seq_num
        url = base_url + parse.urlencode(querys)

        total_filesize += int(head(url)['content-length'])
        seq_num += 1
    return total_filesize
180
+
181
+
182
+ def head(url):
183
  """Fetch headers returned http GET request.
184
 
185
  :param str url:
pytube/streams.py CHANGED
@@ -14,6 +14,7 @@ from typing import BinaryIO
14
  from typing import Dict
15
  from typing import Optional
16
  from typing import Tuple
 
17
  from urllib.parse import parse_qs
18
 
19
  from pytube import extract
@@ -153,7 +154,12 @@ class Stream:
153
  Filesize (in bytes) of the stream.
154
  """
155
  if self._filesize is None:
156
- self._filesize = request.filesize(self.url)
 
 
 
 
 
157
  return self._filesize
158
 
159
  @property
@@ -250,11 +256,21 @@ class Stream:
250
  )
251
 
252
  with open(file_path, "wb") as fh:
253
- for chunk in request.stream(self.url):
254
- # reduce the (bytes) remainder by the length of the chunk.
255
- bytes_remaining -= len(chunk)
256
- # send to the on_progress callback.
257
- self.on_progress(chunk, fh, bytes_remaining)
 
 
 
 
 
 
 
 
 
 
258
  self.on_complete(file_path)
259
  return file_path
260
 
 
14
  from typing import Dict
15
  from typing import Optional
16
  from typing import Tuple
17
+ from urllib.error import HTTPError
18
  from urllib.parse import parse_qs
19
 
20
  from pytube import extract
 
154
  Filesize (in bytes) of the stream.
155
  """
156
  if self._filesize is None:
157
+ try:
158
+ self._filesize = request.filesize(self.url)
159
+ except HTTPError as e:
160
+ if e.code != 404:
161
+ raise
162
+ self._filesize = request.seq_filesize(self.url)
163
  return self._filesize
164
 
165
  @property
 
256
  )
257
 
258
  with open(file_path, "wb") as fh:
259
+ try:
260
+ for chunk in request.stream(self.url):
261
+ # reduce the (bytes) remainder by the length of the chunk.
262
+ bytes_remaining -= len(chunk)
263
+ # send to the on_progress callback.
264
+ self.on_progress(chunk, fh, bytes_remaining)
265
+ except HTTPError as e:
266
+ if e.code != 404:
267
+ raise
268
+ # Some adaptive streams need to be requested with sequence numbers
269
+ for chunk in request.seq_stream(self.url):
270
+ # reduce the (bytes) remainder by the length of the chunk.
271
+ bytes_remaining -= len(chunk)
272
+ # send to the on_progress callback.
273
+ self.on_progress(chunk, fh, bytes_remaining)
274
  self.on_complete(file_path)
275
  return file_path
276
 
tests/test_cipher.py CHANGED
@@ -23,3 +23,8 @@ def test_get_transform_object_with_no_match_should_error():
23
  def test_reverse():
24
  reversed_array = cipher.reverse([1, 2, 3, 4], None)
25
  assert reversed_array == [4, 3, 2, 1]
 
 
 
 
 
 
23
  def test_reverse():
24
  reversed_array = cipher.reverse([1, 2, 3, 4], None)
25
  assert reversed_array == [4, 3, 2, 1]
26
+
27
+
28
def test_splice():
    """splice() keeps only the first ``b`` elements of the array."""
    cases = [
        (2, [1, 2]),
        (1, [1]),
    ]
    for position, expected in cases:
        assert cipher.splice([1, 2, 3, 4], position) == expected
tests/test_streams.py CHANGED
@@ -2,8 +2,10 @@
2
  import os
3
  import random
4
  from datetime import datetime
 
5
  from unittest import mock
6
  from unittest.mock import MagicMock, Mock
 
7
 
8
  from pytube import request
9
  from pytube import Stream
@@ -306,3 +308,84 @@ def test_repr_for_adaptive_streams(cipher_signature):
306
  'vcodec="avc1.640028" progressive="False" type="video">'
307
  )
308
  assert stream == expected
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import os
3
  import random
4
  from datetime import datetime
5
+ import pytest
6
  from unittest import mock
7
  from unittest.mock import MagicMock, Mock
8
+ from urllib.error import HTTPError
9
 
10
  from pytube import request
11
  from pytube import Stream
 
308
  'vcodec="avc1.640028" progressive="False" type="video">'
309
  )
310
  assert stream == expected
311
+
312
+
313
def test_segmented_stream_on_404(cipher_signature):
    """End-to-end check that Stream.download() falls back to the
    sequence-numbered (segmented) request path when the regular
    HEAD/range requests answer 404.

    The mock side_effect lists below are ORDER-SENSITIVE: each entry is
    consumed by one call, so they must line up exactly with the request
    sequence spelled out in the comment block further down.
    """
    stream = cipher_signature.streams.filter(adaptive=True)[0]
    with mock.patch('pytube.request.head') as mock_head:
        with mock.patch('pytube.request.urlopen') as mock_url_open:
            # Mock the responses to YouTube
            mock_url_open_object = mock.Mock()

            # These are our 4 "segments" of a dash stream
            # The first explains how many pieces there are, and
            # the rest are those pieces
            responses = [
                b'Raw_data\r\nSegment-Count: 3',
                b'a',
                b'b',
                b'c',
            ]
            joined_responses = b''.join(responses)

            # We create response headers to match the segments
            response_headers = [
                {
                    'content-length': len(r),
                    'Content-Range': '0-%s/%s' % (str(len(r)), str(len(r)))
                }
                for r in responses
            ]

            # Request order for stream:
            # Filesize:
            # 1. head(url) -> 404
            # 2. get(url&sn=0)
            # 3. head(url&sn=[1,2,3])
            # Download:
            # 4. info(url) -> 404
            # 5. get(url&sn=0)
            # 6. get(url&sn=[1,2,3])

            # Handle filesize requests
            # First head() call 404s (triggering seq_filesize); the rest
            # serve content-length for segments 1..3 (header segment's size
            # comes from reading its body, not from head()).
            mock_head.side_effect = [
                HTTPError('', 404, 'Not Found', '', ''),
                *response_headers[1:],
            ]

            # Each response must be followed by None, to break iteration
            # in the stream() function
            # responses[0] appears twice: once read by seq_filesize and
            # once by the download's seq_stream pass.
            mock_url_open_object.read.side_effect = [
                responses[0], None,
                responses[0], None,
                responses[1], None,
                responses[2], None,
                responses[3], None,
            ]

            # This handles the HEAD requests to get content-length
            # First info() 404s so download() switches to seq_stream.
            mock_url_open_object.info.side_effect = [
                HTTPError('', 404, 'Not Found', '', ''),
                *response_headers
            ]

            mock_url_open.return_value = mock_url_open_object

            with mock.patch('builtins.open', new_callable=mock.mock_open) as mock_open:
                file_handle = mock_open.return_value.__enter__.return_value
                fp = stream.download()
                # Reassemble everything written to the mocked file handle
                # and verify it equals the concatenation of all segments.
                full_content = b''
                for call in file_handle.write.call_args_list:
                    args, kwargs = call
                    full_content += b''.join(args)

                assert full_content == joined_responses
                mock_open.assert_called_once_with(fp, 'wb')
384
+
385
+
386
def test_segmented_only_catches_404(cipher_signature):
    """A non-404 HTTPError (here a 403) must propagate to the caller
    instead of triggering the segmented-download fallback."""
    adaptive_stream = cipher_signature.streams.filter(adaptive=True)[0]
    forbidden = HTTPError('', 403, 'Forbidden', '', '')
    with mock.patch('pytube.request.head', side_effect=forbidden):
        with pytest.raises(HTTPError):
            adaptive_stream.download()