Taylor Fox Dahlin
committed on
Fix/404 error on adaptive (#799)
* Fixes slight mistake in implementation of splice, and added unit test.
* Added sequential filestream request support for adaptive downloads, with unit testing.
* Removed some type hints
* Extracted default values to variables
* Made lru_caches limited to default (128)
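
For callers nothing changes in the API: an adaptive (DASH) stream whose direct range requests return 404 is now downloaded via sequential `sq` requests automatically. A minimal sketch of the user-facing flow (the watch URL below is a placeholder, not part of this commit):

    from pytube import YouTube

    yt = YouTube("https://www.youtube.com/watch?v=XXXXXXXXXXX")  # placeholder URL
    stream = yt.streams.filter(adaptive=True).first()

    # Both of these now fall back to sequential requests if YouTube answers 404.
    print(stream.filesize)
    stream.download()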
- pytube/cipher.py +1 -1
- pytube/request.py +109 -16
- pytube/streams.py +22 -6
- tests/test_cipher.py +5 -0
- tests/test_streams.py +83 -0
pytube/cipher.py
CHANGED
@@ -247,7 +247,7 @@ def splice(arr: List, b: int):
     >>> splice([1, 2, 3, 4], 2)
     [1, 2]
     """
-    return arr[:b] + arr[b * 2:]
+    return arr[:b]


 def swap(arr: List, b: int):
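
A quick interactive check of the corrected splice behaviour, mirroring the unit test added below in tests/test_cipher.py:

    >>> from pytube import cipher
    >>> cipher.splice([1, 2, 3, 4], 2)
    [1, 2]
    >>> cipher.splice([1, 2, 3, 4], 1)
    [1]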
pytube/request.py
CHANGED
@@ -2,21 +2,20 @@
 """Implements a simple wrapper around urlopen."""
 import logging
 from functools import lru_cache
-from http.client import HTTPResponse
-from typing import Dict
-from typing import Iterable
-from typing import Optional
+import re
+from urllib import parse
 from urllib.request import Request
 from urllib.request import urlopen

+from pytube.exceptions import RegexMatchError
+from pytube.helpers import regex_search
+
 logger = logging.getLogger(__name__)
+default_chunk_size = 4096  # 4kb
+default_range_size = 9437184  # 9MB


-def _execute_request(
-    url: str,
-    method: Optional[str] = None,
-    headers: Optional[Dict[str, str]] = None,
-) -> HTTPResponse:
+def _execute_request(url, method=None, headers=None):
     base_headers = {"User-Agent": "Mozilla/5.0"}
     if headers:
         base_headers.update(headers)
@@ -27,7 +26,7 @@ def _execute_request(
     return urlopen(request)  # nosec


-def get(url, extra_headers=None) -> str:
+def get(url, extra_headers=None):
     """Send an http GET request.

     :param str url:
@@ -43,9 +42,51 @@ def get(url, extra_headers=None) -> str:
     return _execute_request(url, headers=extra_headers).read().decode("utf-8")


-def stream(
-    url: str, chunk_size: int = 4096, range_size: int = 9437184
-) -> Iterable[bytes]:
+def seq_stream(url, chunk_size=default_chunk_size, range_size=default_range_size):
+    """Read the response in sequence.
+    :param str url: The URL to perform the GET request for.
+    :param int chunk_size: The size in bytes of each chunk. Defaults to 4KB
+    :param int range_size: The size in bytes of each range request. Defaults
+        to 9MB
+    :rtype: Iterable[bytes]
+    """
+    # YouTube expects a request sequence number as part of the parameters.
+    split_url = parse.urlsplit(url)
+    base_url = '%s://%s/%s?' % (split_url.scheme, split_url.netloc, split_url.path)
+
+    querys = dict(parse.parse_qsl(split_url.query))
+
+    # The 0th sequential request provides the file headers, which tell us
+    # information about how the file is segmented.
+    querys['sq'] = 0
+    url = base_url + parse.urlencode(querys)
+
+    segment_data = b''
+    for chunk in stream(url):
+        yield chunk
+        segment_data += chunk
+
+    # We can then parse the header to find the number of segments
+    stream_info = segment_data.split(b'\r\n')
+    segment_count_pattern = re.compile(b'Segment-Count: (\\d+)')
+    for line in stream_info:
+        match = segment_count_pattern.search(line)
+        if match:
+            segment_count = int(match.group(1).decode('utf-8'))
+
+    # We request these segments sequentially to build the file.
+    seq_num = 1
+    while seq_num <= segment_count:
+        # Create sequential request URL
+        querys['sq'] = seq_num
+        url = base_url + parse.urlencode(querys)
+
+        yield from stream(url)
+        seq_num += 1
+    return  # pylint: disable=R1711
+
+
+def stream(url, chunk_size=default_chunk_size, range_size=default_range_size):
     """Read the response in chunks.
     :param str url: The URL to perform the GET request for.
     :param int chunk_size: The size in bytes of each chunk. Defaults to 4KB
@@ -76,8 +117,8 @@ def stream(
     return  # pylint: disable=R1711


-@lru_cache(maxsize=None)
-def filesize(url: str) -> int:
+@lru_cache()
+def filesize(url):
     """Fetch size in bytes of file at given URL

     :param str url: The URL to get the size of
@@ -86,7 +127,59 @@ def filesize(url: str) -> int:
     return int(head(url)["content-length"])


-def head(url: str) -> Dict:
+@lru_cache()
+def seq_filesize(url):
+    """Fetch size in bytes of file at given URL from sequential requests
+
+    :param str url: The URL to get the size of
+    :returns: int: size in bytes of remote file
+    """
+    total_filesize = 0
+    # YouTube expects a request sequence number as part of the parameters.
+    split_url = parse.urlsplit(url)
+    base_url = '%s://%s/%s?' % (split_url.scheme, split_url.netloc, split_url.path)
+    querys = dict(parse.parse_qsl(split_url.query))
+
+    # The 0th sequential request provides the file headers, which tell us
+    # information about how the file is segmented.
+    querys['sq'] = 0
+    url = base_url + parse.urlencode(querys)
+    response = _execute_request(
+        url, method="GET"
+    )
+
+    response_value = response.read()
+    # The file header must be added to the total filesize
+    total_filesize += len(response_value)
+
+    # We can then parse the header to find the number of segments
+    segment_count = 0
+    stream_info = response_value.split(b'\r\n')
+    segment_regex = b'Segment-Count: (\\d+)'
+    for line in stream_info:
+        # One of the lines should contain the segment count, but we don't know
+        # which, so we need to iterate through the lines to find it
+        try:
+            segment_count = int(regex_search(segment_regex, line, 1))
+        except RegexMatchError:
+            pass
+
+    if segment_count == 0:
+        raise RegexMatchError('seq_filesize', segment_regex)
+
+    # We make HEAD requests to the segments sequentially to find the total filesize.
+    seq_num = 1
+    while seq_num <= segment_count:
+        # Create sequential request URL
+        querys['sq'] = seq_num
+        url = base_url + parse.urlencode(querys)
+
+        total_filesize += int(head(url)['content-length'])
+        seq_num += 1
+    return total_filesize
+
+
+def head(url):
     """Fetch headers returned http GET request.

     :param str url:
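
The two new helpers can also be driven directly. A minimal sketch, assuming `url` is the Stream.url of an adaptive stream that only answers sequential requests (the helper name and output path below are illustrative, not part of this commit):

    from pytube import request

    def save_segmented(url, path="video.mp4"):
        # seq_filesize HEADs each numbered segment and sums the content-length headers.
        total = request.seq_filesize(url)
        print("expecting %d bytes" % total)
        with open(path, "wb") as fh:
            # seq_stream yields the header segment first, then segments 1..Segment-Count in order.
            for chunk in request.seq_stream(url):
                fh.write(chunk)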
pytube/streams.py
CHANGED
@@ -14,6 +14,7 @@ from typing import BinaryIO
 from typing import Dict
 from typing import Optional
 from typing import Tuple
+from urllib.error import HTTPError
 from urllib.parse import parse_qs

 from pytube import extract
@@ -153,7 +154,12 @@ class Stream:
         Filesize (in bytes) of the stream.
         """
         if self._filesize is None:
-            self._filesize = request.filesize(self.url)
+            try:
+                self._filesize = request.filesize(self.url)
+            except HTTPError as e:
+                if e.code != 404:
+                    raise
+                self._filesize = request.seq_filesize(self.url)
         return self._filesize

     @property
@@ -250,11 +256,21 @@ class Stream:
         )

         with open(file_path, "wb") as fh:
-            for chunk in request.stream(self.url):
-                # reduce the (bytes) remainder by the length of the chunk.
-                bytes_remaining -= len(chunk)
-                # send to the on_progress callback.
-                self.on_progress(chunk, fh, bytes_remaining)
+            try:
+                for chunk in request.stream(self.url):
+                    # reduce the (bytes) remainder by the length of the chunk.
+                    bytes_remaining -= len(chunk)
+                    # send to the on_progress callback.
+                    self.on_progress(chunk, fh, bytes_remaining)
+            except HTTPError as e:
+                if e.code != 404:
+                    raise
+                # Some adaptive streams need to be requested with sequence numbers
+                for chunk in request.seq_stream(self.url):
+                    # reduce the (bytes) remainder by the length of the chunk.
+                    bytes_remaining -= len(chunk)
+                    # send to the on_progress callback.
+                    self.on_progress(chunk, fh, bytes_remaining)
         self.on_complete(file_path)
         return file_path
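
Both call sites use the same guard: only a 404 triggers the sequential fallback, every other HTTPError is re-raised. Reduced to its shape (the helper name here is illustrative, not part of pytube):

    from urllib.error import HTTPError

    def _with_seq_fallback(primary, fallback):
        """Run primary(); on an HTTP 404 retry with fallback(); otherwise re-raise."""
        try:
            return primary()
        except HTTPError as e:
            if e.code != 404:
                raise
            return fallback()

    # e.g. _with_seq_fallback(lambda: request.filesize(url), lambda: request.seq_filesize(url))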
tests/test_cipher.py
CHANGED
@@ -23,3 +23,8 @@ def test_get_transform_object_with_no_match_should_error():
 def test_reverse():
     reversed_array = cipher.reverse([1, 2, 3, 4], None)
     assert reversed_array == [4, 3, 2, 1]
+
+
+def test_splice():
+    assert cipher.splice([1, 2, 3, 4], 2) == [1, 2]
+    assert cipher.splice([1, 2, 3, 4], 1) == [1]
tests/test_streams.py
CHANGED
@@ -2,8 +2,10 @@
 import os
 import random
 from datetime import datetime
+import pytest
 from unittest import mock
 from unittest.mock import MagicMock, Mock
+from urllib.error import HTTPError

 from pytube import request
 from pytube import Stream
@@ -306,3 +308,84 @@ def test_repr_for_adaptive_streams(cipher_signature):
         'vcodec="avc1.640028" progressive="False" type="video">'
     )
     assert stream == expected
+
+
+def test_segmented_stream_on_404(cipher_signature):
+    stream = cipher_signature.streams.filter(adaptive=True)[0]
+    with mock.patch('pytube.request.head') as mock_head:
+        with mock.patch('pytube.request.urlopen') as mock_url_open:
+            # Mock the responses to YouTube
+            mock_url_open_object = mock.Mock()
+
+            # These are our 4 "segments" of a dash stream
+            # The first explains how many pieces there are, and
+            # the rest are those pieces
+            responses = [
+                b'Raw_data\r\nSegment-Count: 3',
+                b'a',
+                b'b',
+                b'c',
+            ]
+            joined_responses = b''.join(responses)
+
+            # We create response headers to match the segments
+            response_headers = [
+                {
+                    'content-length': len(r),
+                    'Content-Range': '0-%s/%s' % (str(len(r)), str(len(r)))
+                }
+                for r in responses
+            ]
+
+            # Request order for stream:
+            # Filesize:
+            #   1. head(url) -> 404
+            #   2. get(url&sn=0)
+            #   3. head(url&sn=[1,2,3])
+            # Download:
+            #   4. info(url) -> 404
+            #   5. get(url&sn=0)
+            #   6. get(url&sn=[1,2,3])
+
+            # Handle filesize requests
+            mock_head.side_effect = [
+                HTTPError('', 404, 'Not Found', '', ''),
+                *response_headers[1:],
+            ]
+
+            # Each response must be followed by None, to break iteration
+            # in the stream() function
+            mock_url_open_object.read.side_effect = [
+                responses[0], None,
+                responses[0], None,
+                responses[1], None,
+                responses[2], None,
+                responses[3], None,
+            ]
+
+            # This handles the HEAD requests to get content-length
+            mock_url_open_object.info.side_effect = [
+                HTTPError('', 404, 'Not Found', '', ''),
+                *response_headers
+            ]
+
+            mock_url_open.return_value = mock_url_open_object
+
+            with mock.patch('builtins.open', new_callable=mock.mock_open) as mock_open:
+                file_handle = mock_open.return_value.__enter__.return_value
+                fp = stream.download()
+                full_content = b''
+                for call in file_handle.write.call_args_list:
+                    args, kwargs = call
+                    full_content += b''.join(args)
+
+                assert full_content == joined_responses
+                mock_open.assert_called_once_with(fp, 'wb')
+
+
+def test_segmented_only_catches_404(cipher_signature):
+    stream = cipher_signature.streams.filter(adaptive=True)[0]
+    with mock.patch('pytube.request.head') as mock_head:
+        mock_head.side_effect = HTTPError('', 403, 'Forbidden', '', '')
+        with pytest.raises(HTTPError):
+            stream.download()