Taylor Fox Dahlin committed on
Commit
39acdce
·
unverified ·
1 Parent(s): 1776b2f

Fix/404 error on adaptive (#799)

Browse files

* Fixes slight mistake in implementation of splice, and added unit test.

* Added sequential filestream request support for adaptive downloads, with unit testing.

* Removed some type hints

* Extracted default values to variables

* Made lru_caches limited to default (128)

pytube/cipher.py CHANGED
@@ -247,7 +247,7 @@ def splice(arr: List, b: int):
247
  >>> splice([1, 2, 3, 4], 2)
248
  [1, 2]
249
  """
250
- return arr[:b] + arr[b * 2 :]
251
 
252
 
253
  def swap(arr: List, b: int):
 
247
  >>> splice([1, 2, 3, 4], 2)
248
  [1, 2]
249
  """
250
+ return arr[:b]
251
 
252
 
253
  def swap(arr: List, b: int):
pytube/request.py CHANGED
@@ -2,21 +2,20 @@
2
  """Implements a simple wrapper around urlopen."""
3
  import logging
4
  from functools import lru_cache
5
- from http.client import HTTPResponse
6
- from typing import Dict
7
- from typing import Iterable
8
- from typing import Optional
9
  from urllib.request import Request
10
  from urllib.request import urlopen
11
 
 
 
 
12
  logger = logging.getLogger(__name__)
 
 
13
 
14
 
15
- def _execute_request(
16
- url: str,
17
- method: Optional[str] = None,
18
- headers: Optional[Dict[str, str]] = None,
19
- ) -> HTTPResponse:
20
  base_headers = {"User-Agent": "Mozilla/5.0"}
21
  if headers:
22
  base_headers.update(headers)
@@ -27,7 +26,7 @@ def _execute_request(
27
  return urlopen(request) # nosec
28
 
29
 
30
- def get(url, extra_headers=None) -> str:
31
  """Send an http GET request.
32
 
33
  :param str url:
@@ -43,9 +42,51 @@ def get(url, extra_headers=None) -> str:
43
  return _execute_request(url, headers=extra_headers).read().decode("utf-8")
44
 
45
 
46
- def stream(
47
- url: str, chunk_size: int = 4096, range_size: int = 9437184
48
- ) -> Iterable[bytes]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  """Read the response in chunks.
50
  :param str url: The URL to perform the GET request for.
51
  :param int chunk_size: The size in bytes of each chunk. Defaults to 4KB
@@ -76,8 +117,8 @@ def stream(
76
  return # pylint: disable=R1711
77
 
78
 
79
- @lru_cache(maxsize=None)
80
- def filesize(url: str) -> int:
81
  """Fetch size in bytes of file at given URL
82
 
83
  :param str url: The URL to get the size of
@@ -86,7 +127,59 @@ def filesize(url: str) -> int:
86
  return int(head(url)["content-length"])
87
 
88
 
89
- def head(url: str) -> Dict:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  """Fetch headers returned http GET request.
91
 
92
  :param str url:
 
2
  """Implements a simple wrapper around urlopen."""
3
  import logging
4
  from functools import lru_cache
5
+ import re
6
+ from urllib import parse
 
 
7
  from urllib.request import Request
8
  from urllib.request import urlopen
9
 
10
+ from pytube.exceptions import RegexMatchError
11
+ from pytube.helpers import regex_search
12
+
13
  logger = logging.getLogger(__name__)
14
+ default_chunk_size = 4096 # 4kb
15
+ default_range_size = 9437184 # 9MB
16
 
17
 
18
+ def _execute_request(url, method=None, headers=None):
 
 
 
 
19
  base_headers = {"User-Agent": "Mozilla/5.0"}
20
  if headers:
21
  base_headers.update(headers)
 
26
  return urlopen(request) # nosec
27
 
28
 
29
+ def get(url, extra_headers=None):
30
  """Send an http GET request.
31
 
32
  :param str url:
 
42
  return _execute_request(url, headers=extra_headers).read().decode("utf-8")
43
 
44
 
45
def seq_stream(url, chunk_size=default_chunk_size, range_size=default_range_size):
    """Read the response in sequence.

    Some adaptive streams 404 on plain range requests; YouTube instead
    expects a request sequence number (``sq``) as a query parameter.
    Segment 0 returns stream headers describing how many further
    segments exist, which are then fetched one by one.

    :param str url: The URL to perform the GET request for.
    :param int chunk_size: The size in bytes of each chunk. Defaults to 4KB
    :param int range_size: The size in bytes of each range request. Defaults
        to 9MB
    :rtype: Iterable[bytes]
    :raises RegexMatchError: If the 0th segment carries no Segment-Count
        header.
    """
    # YouTube expects a request sequence number as part of the parameters.
    split_url = parse.urlsplit(url)
    # split_url.path already starts with '/', so join without inserting one
    # (the previous '%s://%s/%s?' form produced a '//' in the path).
    base_url = '%s://%s%s?' % (split_url.scheme, split_url.netloc, split_url.path)

    querys = dict(parse.parse_qsl(split_url.query))

    # The 0th sequential request provides the file headers, which tell us
    # information about how the file is segmented.
    querys['sq'] = 0
    url = base_url + parse.urlencode(querys)

    segment_data = b''
    for chunk in stream(url):
        yield chunk
        segment_data += chunk

    # We can then parse the header to find the number of segments.
    # Initialize to 0 so a missing header is detected below instead of
    # triggering a NameError on an unbound local.
    segment_count = 0
    stream_info = segment_data.split(b'\r\n')
    segment_count_pattern = re.compile(b'Segment-Count: (\\d+)')
    for line in stream_info:
        match = segment_count_pattern.search(line)
        if match:
            segment_count = int(match.group(1).decode('utf-8'))

    # Fail loudly, mirroring seq_filesize's behavior for the same condition.
    if segment_count == 0:
        raise RegexMatchError('seq_stream', segment_count_pattern.pattern)

    # We request these segments sequentially to build the file.
    seq_num = 1
    while seq_num <= segment_count:
        # Create sequential request URL
        querys['sq'] = seq_num
        url = base_url + parse.urlencode(querys)

        yield from stream(url)
        seq_num += 1
    return  # pylint: disable=R1711
+ return # pylint: disable=R1711
87
+
88
+
89
+ def stream(url, chunk_size=default_chunk_size, range_size=default_range_size):
90
  """Read the response in chunks.
91
  :param str url: The URL to perform the GET request for.
92
  :param int chunk_size: The size in bytes of each chunk. Defaults to 4KB
 
117
  return # pylint: disable=R1711
118
 
119
 
120
+ @lru_cache()
121
+ def filesize(url):
122
  """Fetch size in bytes of file at given URL
123
 
124
  :param str url: The URL to get the size of
 
127
  return int(head(url)["content-length"])
128
 
129
 
130
@lru_cache()
def seq_filesize(url):
    """Fetch size in bytes of file at given URL from sequential requests

    Sums the length of the 0th (header) segment plus the content-length
    of each subsequent segment, as reported by HEAD requests.

    :param str url: The URL to get the size of
    :returns: int: size in bytes of remote file
    :raises RegexMatchError: If the 0th segment carries no Segment-Count
        header.
    """
    total_filesize = 0
    # YouTube expects a request sequence number as part of the parameters.
    split_url = parse.urlsplit(url)
    # split_url.path already starts with '/', so join without inserting one
    # (the previous '%s://%s/%s?' form produced a '//' in the path).
    base_url = '%s://%s%s?' % (split_url.scheme, split_url.netloc, split_url.path)
    querys = dict(parse.parse_qsl(split_url.query))

    # The 0th sequential request provides the file headers, which tell us
    # information about how the file is segmented.
    querys['sq'] = 0
    url = base_url + parse.urlencode(querys)
    response = _execute_request(url, method="GET")

    response_value = response.read()
    # The file header must be added to the total filesize
    total_filesize += len(response_value)

    # We can then parse the header to find the number of segments
    segment_count = 0
    stream_info = response_value.split(b'\r\n')
    segment_regex = b'Segment-Count: (\\d+)'
    for line in stream_info:
        # One of the lines should contain the segment count, but we don't know
        # which, so we need to iterate through the lines to find it
        try:
            segment_count = int(regex_search(segment_regex, line, 1))
            # The header carries a single Segment-Count line, so stop once
            # found rather than paying a raised exception per remaining line.
            break
        except RegexMatchError:
            pass

    if segment_count == 0:
        raise RegexMatchError('seq_filesize', segment_regex)

    # We make HEAD requests to the segments sequentially to find the total filesize.
    seq_num = 1
    while seq_num <= segment_count:
        # Create sequential request URL
        querys['sq'] = seq_num
        url = base_url + parse.urlencode(querys)

        total_filesize += int(head(url)['content-length'])
        seq_num += 1
    return total_filesize
180
+
181
+
182
+ def head(url):
183
  """Fetch headers returned http GET request.
184
 
185
  :param str url:
pytube/streams.py CHANGED
@@ -14,6 +14,7 @@ from typing import BinaryIO
14
  from typing import Dict
15
  from typing import Optional
16
  from typing import Tuple
 
17
  from urllib.parse import parse_qs
18
 
19
  from pytube import extract
@@ -153,7 +154,12 @@ class Stream:
153
  Filesize (in bytes) of the stream.
154
  """
155
  if self._filesize is None:
156
- self._filesize = request.filesize(self.url)
 
 
 
 
 
157
  return self._filesize
158
 
159
  @property
@@ -250,11 +256,21 @@ class Stream:
250
  )
251
 
252
  with open(file_path, "wb") as fh:
253
- for chunk in request.stream(self.url):
254
- # reduce the (bytes) remainder by the length of the chunk.
255
- bytes_remaining -= len(chunk)
256
- # send to the on_progress callback.
257
- self.on_progress(chunk, fh, bytes_remaining)
 
 
 
 
 
 
 
 
 
 
258
  self.on_complete(file_path)
259
  return file_path
260
 
 
14
  from typing import Dict
15
  from typing import Optional
16
  from typing import Tuple
17
+ from urllib.error import HTTPError
18
  from urllib.parse import parse_qs
19
 
20
  from pytube import extract
 
154
  Filesize (in bytes) of the stream.
155
  """
156
  if self._filesize is None:
157
+ try:
158
+ self._filesize = request.filesize(self.url)
159
+ except HTTPError as e:
160
+ if e.code != 404:
161
+ raise
162
+ self._filesize = request.seq_filesize(self.url)
163
  return self._filesize
164
 
165
  @property
 
256
  )
257
 
258
  with open(file_path, "wb") as fh:
259
+ try:
260
+ for chunk in request.stream(self.url):
261
+ # reduce the (bytes) remainder by the length of the chunk.
262
+ bytes_remaining -= len(chunk)
263
+ # send to the on_progress callback.
264
+ self.on_progress(chunk, fh, bytes_remaining)
265
+ except HTTPError as e:
266
+ if e.code != 404:
267
+ raise
268
+ # Some adaptive streams need to be requested with sequence numbers
269
+ for chunk in request.seq_stream(self.url):
270
+ # reduce the (bytes) remainder by the length of the chunk.
271
+ bytes_remaining -= len(chunk)
272
+ # send to the on_progress callback.
273
+ self.on_progress(chunk, fh, bytes_remaining)
274
  self.on_complete(file_path)
275
  return file_path
276
 
tests/test_cipher.py CHANGED
@@ -23,3 +23,8 @@ def test_get_transform_object_with_no_match_should_error():
23
  def test_reverse():
24
  reversed_array = cipher.reverse([1, 2, 3, 4], None)
25
  assert reversed_array == [4, 3, 2, 1]
 
 
 
 
 
 
23
  def test_reverse():
24
  reversed_array = cipher.reverse([1, 2, 3, 4], None)
25
  assert reversed_array == [4, 3, 2, 1]
26
+
27
+
28
def test_splice():
    """splice() keeps only the first ``b`` elements of the array."""
    cases = [
        (2, [1, 2]),
        (1, [1]),
    ]
    for position, expected in cases:
        assert cipher.splice([1, 2, 3, 4], position) == expected
tests/test_streams.py CHANGED
@@ -2,8 +2,10 @@
2
  import os
3
  import random
4
  from datetime import datetime
 
5
  from unittest import mock
6
  from unittest.mock import MagicMock, Mock
 
7
 
8
  from pytube import request
9
  from pytube import Stream
@@ -306,3 +308,84 @@ def test_repr_for_adaptive_streams(cipher_signature):
306
  'vcodec="avc1.640028" progressive="False" type="video">'
307
  )
308
  assert stream == expected
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import os
3
  import random
4
  from datetime import datetime
5
+ import pytest
6
  from unittest import mock
7
  from unittest.mock import MagicMock, Mock
8
+ from urllib.error import HTTPError
9
 
10
  from pytube import request
11
  from pytube import Stream
 
308
  'vcodec="avc1.640028" progressive="False" type="video">'
309
  )
310
  assert stream == expected
311
+
312
+
313
def test_segmented_stream_on_404(cipher_signature):
    """End-to-end check that Stream.download() falls back to the
    sequence-numbered (segmented) request path when the regular
    HEAD/range requests answer 404.

    The mock side_effect lists below are ORDER-SENSITIVE: each entry is
    consumed by one call, so they must line up exactly with the request
    sequence spelled out in the comment block further down.
    """
    stream = cipher_signature.streams.filter(adaptive=True)[0]
    with mock.patch('pytube.request.head') as mock_head:
        with mock.patch('pytube.request.urlopen') as mock_url_open:
            # Mock the responses to YouTube
            mock_url_open_object = mock.Mock()

            # These are our 4 "segments" of a dash stream
            # The first explains how many pieces there are, and
            # the rest are those pieces
            responses = [
                b'Raw_data\r\nSegment-Count: 3',
                b'a',
                b'b',
                b'c',
            ]
            joined_responses = b''.join(responses)

            # We create response headers to match the segments
            response_headers = [
                {
                    'content-length': len(r),
                    'Content-Range': '0-%s/%s' % (str(len(r)), str(len(r)))
                }
                for r in responses
            ]

            # Request order for stream:
            # Filesize:
            # 1. head(url) -> 404
            # 2. get(url&sn=0)
            # 3. head(url&sn=[1,2,3])
            # Download:
            # 4. info(url) -> 404
            # 5. get(url&sn=0)
            # 6. get(url&sn=[1,2,3])

            # Handle filesize requests
            # First head() call 404s (triggering seq_filesize); the rest
            # serve content-length for segments 1..3 (header segment's size
            # comes from reading its body, not from head()).
            mock_head.side_effect = [
                HTTPError('', 404, 'Not Found', '', ''),
                *response_headers[1:],
            ]

            # Each response must be followed by None, to break iteration
            # in the stream() function
            # responses[0] appears twice: once read by seq_filesize and
            # once by the download's seq_stream pass.
            mock_url_open_object.read.side_effect = [
                responses[0], None,
                responses[0], None,
                responses[1], None,
                responses[2], None,
                responses[3], None,
            ]

            # This handles the HEAD requests to get content-length
            # First info() 404s so download() switches to seq_stream.
            mock_url_open_object.info.side_effect = [
                HTTPError('', 404, 'Not Found', '', ''),
                *response_headers
            ]

            mock_url_open.return_value = mock_url_open_object

            with mock.patch('builtins.open', new_callable=mock.mock_open) as mock_open:
                file_handle = mock_open.return_value.__enter__.return_value
                fp = stream.download()
                # Reassemble everything written to the mocked file handle
                # and verify it equals the concatenation of all segments.
                full_content = b''
                for call in file_handle.write.call_args_list:
                    args, kwargs = call
                    full_content += b''.join(args)

                assert full_content == joined_responses
                mock_open.assert_called_once_with(fp, 'wb')
384
+
385
+
386
def test_segmented_only_catches_404(cipher_signature):
    """A non-404 HTTPError (here a 403) must propagate to the caller
    instead of triggering the segmented-download fallback."""
    adaptive_stream = cipher_signature.streams.filter(adaptive=True)[0]
    forbidden = HTTPError('', 403, 'Forbidden', '', '')
    with mock.patch('pytube.request.head', side_effect=forbidden):
        with pytest.raises(HTTPError):
            adaptive_stream.download()