Taylor Fox Dahlin commited on
Commit
46ba6e2
·
unverified ·
1 Parent(s): a238c48

Enhancement/timeout (#966)

Browse files

* Adds optional timeout and max_retries arguments to stream.download()

pytube/exceptions.py CHANGED
@@ -13,6 +13,10 @@ class PytubeError(Exception):
13
  """
14
 
15
 
 
 
 
 
16
  class HTMLParseError(PytubeError):
17
  """HTML could not be parsed"""
18
 
 
13
  """
14
 
15
 
16
+ class MaxRetriesExceeded(PytubeError):
17
+ """Maximum number of retries exceeded."""
18
+
19
+
20
  class HTMLParseError(PytubeError):
21
  """HTML could not be parsed"""
22
 
pytube/request.py CHANGED
@@ -1,14 +1,16 @@
1
  # -*- coding: utf-8 -*-
2
  """Implements a simple wrapper around urlopen."""
 
3
  import logging
4
  from functools import lru_cache
5
  import re
6
- import json
7
  from urllib import parse
 
8
  from urllib.request import Request
9
  from urllib.request import urlopen
10
 
11
- from pytube.exceptions import RegexMatchError
12
  from pytube.helpers import regex_search
13
 
14
  logger = logging.getLogger(__name__)
@@ -16,7 +18,13 @@ default_chunk_size = 4096 # 4kb
16
  default_range_size = 9437184 # 9MB
17
 
18
 
19
- def _execute_request(url, method=None, headers=None, data=None):
 
 
 
 
 
 
20
  base_headers = {"User-Agent": "Mozilla/5.0", "accept-language": "en-US,en"}
21
  if headers:
22
  base_headers.update(headers)
@@ -28,10 +36,10 @@ def _execute_request(url, method=None, headers=None, data=None):
28
  request = Request(url, headers=base_headers, method=method, data=data)
29
  else:
30
  raise ValueError("Invalid URL")
31
- return urlopen(request) # nosec
32
 
33
 
34
- def get(url, extra_headers=None):
35
  """Send an http GET request.
36
 
37
  :param str url:
@@ -44,10 +52,11 @@ def get(url, extra_headers=None):
44
  """
45
  if extra_headers is None:
46
  extra_headers = {}
47
- return _execute_request(url, headers=extra_headers).read().decode("utf-8")
 
48
 
49
 
50
- def post(url, extra_headers=None, data=None):
51
  """Send an http POST request.
52
 
53
  :param str url:
@@ -69,15 +78,22 @@ def post(url, extra_headers=None, data=None):
69
  # required because the youtube servers are strict on content type
70
  # raises HTTPError [400]: Bad Request otherwise
71
  extra_headers.update({"Content-Type": "application/json"})
72
- return _execute_request(url, headers=extra_headers, data=data).read().decode("utf-8")
 
 
 
 
 
 
73
 
74
 
75
- def seq_stream(url, chunk_size=default_chunk_size, range_size=default_range_size):
 
 
 
 
76
  """Read the response in sequence.
77
  :param str url: The URL to perform the GET request for.
78
- :param int chunk_size: The size in bytes of each chunk. Defaults to 4KB
79
- :param int range_size: The size in bytes of each range request. Defaults
80
- to 9MB
81
  :rtype: Iterable[bytes]
82
  """
83
  # YouTube expects a request sequence number as part of the parameters.
@@ -92,7 +108,7 @@ def seq_stream(url, chunk_size=default_chunk_size, range_size=default_range_size
92
  url = base_url + parse.urlencode(querys)
93
 
94
  segment_data = b''
95
- for chunk in stream(url):
96
  yield chunk
97
  segment_data += chunk
98
 
@@ -111,35 +127,57 @@ def seq_stream(url, chunk_size=default_chunk_size, range_size=default_range_size
111
  querys['sq'] = seq_num
112
  url = base_url + parse.urlencode(querys)
113
 
114
- yield from stream(url)
115
  seq_num += 1
116
  return # pylint: disable=R1711
117
 
118
 
119
- def stream(url, chunk_size=default_chunk_size, range_size=default_range_size):
 
 
 
 
120
  """Read the response in chunks.
121
  :param str url: The URL to perform the GET request for.
122
- :param int chunk_size: The size in bytes of each chunk. Defaults to 4KB
123
- :param int range_size: The size in bytes of each range request. Defaults
124
- to 9MB
125
  :rtype: Iterable[bytes]
126
  """
127
- file_size: int = range_size # fake filesize to start
128
  downloaded = 0
129
  while downloaded < file_size:
130
- stop_pos = min(downloaded + range_size, file_size) - 1
131
  range_header = f"bytes={downloaded}-{stop_pos}"
132
- response = _execute_request(
133
- url, method="GET", headers={"Range": range_header}
134
- )
135
- if file_size == range_size:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  try:
137
  content_range = response.info()["Content-Range"]
138
  file_size = int(content_range.split("/")[1])
139
  except (KeyError, IndexError, ValueError) as e:
140
  logger.error(e)
141
  while True:
142
- chunk = response.read(chunk_size)
143
  if not chunk:
144
  break
145
  downloaded += len(chunk)
 
1
  # -*- coding: utf-8 -*-
2
  """Implements a simple wrapper around urlopen."""
3
+ import json
4
  import logging
5
  from functools import lru_cache
6
  import re
7
+ import socket
8
  from urllib import parse
9
+ from urllib.error import URLError
10
  from urllib.request import Request
11
  from urllib.request import urlopen
12
 
13
+ from pytube.exceptions import RegexMatchError, MaxRetriesExceeded
14
  from pytube.helpers import regex_search
15
 
16
  logger = logging.getLogger(__name__)
 
18
  default_range_size = 9437184 # 9MB
19
 
20
 
21
+ def _execute_request(
22
+ url,
23
+ method=None,
24
+ headers=None,
25
+ data=None,
26
+ timeout=socket._GLOBAL_DEFAULT_TIMEOUT
27
+ ):
28
  base_headers = {"User-Agent": "Mozilla/5.0", "accept-language": "en-US,en"}
29
  if headers:
30
  base_headers.update(headers)
 
36
  request = Request(url, headers=base_headers, method=method, data=data)
37
  else:
38
  raise ValueError("Invalid URL")
39
+ return urlopen(request, timeout=timeout) # nosec
40
 
41
 
42
+ def get(url, extra_headers=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
43
  """Send an http GET request.
44
 
45
  :param str url:
 
52
  """
53
  if extra_headers is None:
54
  extra_headers = {}
55
+ response = _execute_request(url, headers=extra_headers, timeout=timeout)
56
+ return response.read().decode("utf-8")
57
 
58
 
59
+ def post(url, extra_headers=None, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT):
60
  """Send an http POST request.
61
 
62
  :param str url:
 
78
  # required because the youtube servers are strict on content type
79
  # raises HTTPError [400]: Bad Request otherwise
80
  extra_headers.update({"Content-Type": "application/json"})
81
+ response = _execute_request(
82
+ url,
83
+ headers=extra_headers,
84
+ data=data,
85
+ timeout=timeout
86
+ )
87
+ return response.read().decode("utf-8")
88
 
89
 
90
+ def seq_stream(
91
+ url,
92
+ timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
93
+ max_retries=0
94
+ ):
95
  """Read the response in sequence.
96
  :param str url: The URL to perform the GET request for.
 
 
 
97
  :rtype: Iterable[bytes]
98
  """
99
  # YouTube expects a request sequence number as part of the parameters.
 
108
  url = base_url + parse.urlencode(querys)
109
 
110
  segment_data = b''
111
+ for chunk in stream(url, timeout=timeout, max_retries=max_retries):
112
  yield chunk
113
  segment_data += chunk
114
 
 
127
  querys['sq'] = seq_num
128
  url = base_url + parse.urlencode(querys)
129
 
130
+ yield from stream(url, timeout=timeout, max_retries=max_retries)
131
  seq_num += 1
132
  return # pylint: disable=R1711
133
 
134
 
135
+ def stream(
136
+ url,
137
+ timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
138
+ max_retries=0
139
+ ):
140
  """Read the response in chunks.
141
  :param str url: The URL to perform the GET request for.
 
 
 
142
  :rtype: Iterable[bytes]
143
  """
144
+ file_size: int = default_range_size # fake filesize to start
145
  downloaded = 0
146
  while downloaded < file_size:
147
+ stop_pos = min(downloaded + default_range_size, file_size) - 1
148
  range_header = f"bytes={downloaded}-{stop_pos}"
149
+ tries = 0
150
+
151
+ # Attempt to make the request multiple times as necessary.
152
+ while True:
153
+ # If the max retries is exceeded, raise an exception
154
+ if tries >= 1 + max_retries:
155
+ raise MaxRetriesExceeded()
156
+
157
+ # Try to execute the request, ignoring socket timeouts
158
+ try:
159
+ response = _execute_request(
160
+ url,
161
+ method="GET",
162
+ headers={"Range": range_header},
163
+ timeout=timeout
164
+ )
165
+ except URLError as e:
166
+ if isinstance(e.reason, socket.timeout):
167
+ pass
168
+ else:
169
+ # On a successful request, break from loop
170
+ break
171
+ tries += 1
172
+
173
+ if file_size == default_range_size:
174
  try:
175
  content_range = response.info()["Content-Range"]
176
  file_size = int(content_range.split("/")[1])
177
  except (KeyError, IndexError, ValueError) as e:
178
  logger.error(e)
179
  while True:
180
+ chunk = response.read(default_chunk_size)
181
  if not chunk:
182
  break
183
  downloaded += len(chunk)
pytube/streams.py CHANGED
@@ -211,6 +211,8 @@ class Stream:
211
  filename: Optional[str] = None,
212
  filename_prefix: Optional[str] = None,
213
  skip_existing: bool = True,
 
 
214
  ) -> str:
215
  """Write the media stream to disk.
216
 
@@ -230,8 +232,11 @@ class Stream:
230
  filename but still add a prefix.
231
  :type filename_prefix: str or None
232
  :param skip_existing:
233
- (optional) skip existing files, defaults to True
234
  :type skip_existing: bool
 
 
 
235
  :returns:
236
  Path to the saved video
237
  :rtype: str
@@ -244,20 +249,20 @@ class Stream:
244
  )
245
 
246
  if skip_existing and self.exists_at_path(file_path):
247
- logger.debug("file %s already exists, skipping", file_path)
248
  self.on_complete(file_path)
249
  return file_path
250
 
251
  bytes_remaining = self.filesize
252
- logger.debug(
253
- "downloading (%s total bytes) file to %s",
254
- self.filesize,
255
- file_path,
256
- )
257
 
258
  with open(file_path, "wb") as fh:
259
  try:
260
- for chunk in request.stream(self.url):
 
 
 
 
261
  # reduce the (bytes) remainder by the length of the chunk.
262
  bytes_remaining -= len(chunk)
263
  # send to the on_progress callback.
@@ -266,7 +271,11 @@ class Stream:
266
  if e.code != 404:
267
  raise
268
  # Some adaptive streams need to be requested with sequence numbers
269
- for chunk in request.seq_stream(self.url):
 
 
 
 
270
  # reduce the (bytes) remainder by the length of the chunk.
271
  bytes_remaining -= len(chunk)
272
  # send to the on_progress callback.
 
211
  filename: Optional[str] = None,
212
  filename_prefix: Optional[str] = None,
213
  skip_existing: bool = True,
214
+ timeout: Optional[int] = None,
215
+ max_retries: Optional[int] = 0
216
  ) -> str:
217
  """Write the media stream to disk.
218
 
 
232
  filename but still add a prefix.
233
  :type filename_prefix: str or None
234
  :param skip_existing:
235
+ (optional) Skip existing files, defaults to True
236
  :type skip_existing: bool
237
+ :param timeout:
238
+ (optional) Request timeout length in seconds
239
+ :type timeout: int
240
  :returns:
241
  Path to the saved video
242
  :rtype: str
 
249
  )
250
 
251
  if skip_existing and self.exists_at_path(file_path):
252
+ logger.debug(f'file {file_path} already exists, skipping')
253
  self.on_complete(file_path)
254
  return file_path
255
 
256
  bytes_remaining = self.filesize
257
+ logger.debug(f'downloading ({self.filesize} total bytes) file to {file_path}')
 
 
 
 
258
 
259
  with open(file_path, "wb") as fh:
260
  try:
261
+ for chunk in request.stream(
262
+ self.url,
263
+ timeout=timeout,
264
+ max_retries=max_retries
265
+ ):
266
  # reduce the (bytes) remainder by the length of the chunk.
267
  bytes_remaining -= len(chunk)
268
  # send to the on_progress callback.
 
271
  if e.code != 404:
272
  raise
273
  # Some adaptive streams need to be requested with sequence numbers
274
+ for chunk in request.seq_stream(
275
+ self.url,
276
+ timeout=timeout,
277
+ max_retries=max_retries
278
+ ):
279
  # reduce the (bytes) remainder by the length of the chunk.
280
  bytes_remaining -= len(chunk)
281
  # send to the on_progress callback.
tests/test_request.py CHANGED
@@ -1,10 +1,13 @@
1
  # -*- coding: utf-8 -*-
 
2
  import os
3
  from unittest import mock
 
4
 
5
  import pytest
6
 
7
  from pytube import request
 
8
 
9
 
10
  @mock.patch("pytube.request.urlopen")
@@ -16,15 +19,24 @@ def test_streaming(mock_urlopen):
16
  os.urandom(8 * 1024),
17
  None,
18
  ]
19
- response = mock.Mock()
20
- response.read.side_effect = fake_stream_binary
21
- response.info.return_value = {"Content-Range": "bytes 200-1000/24576"}
22
- mock_urlopen.return_value = response
23
  # When
24
- response = request.stream("http://fakeassurl.gov")
25
  # Then
26
- call_count = len(list(response))
27
- assert call_count == 3
 
 
 
 
 
 
 
 
 
28
 
29
 
30
  @mock.patch("pytube.request.urlopen")
 
1
  # -*- coding: utf-8 -*-
2
+ import socket
3
  import os
4
  from unittest import mock
5
+ from urllib.error import URLError
6
 
7
  import pytest
8
 
9
  from pytube import request
10
+ from pytube.exceptions import MaxRetriesExceeded
11
 
12
 
13
  @mock.patch("pytube.request.urlopen")
 
19
  os.urandom(8 * 1024),
20
  None,
21
  ]
22
+ mock_response = mock.Mock()
23
+ mock_response.read.side_effect = fake_stream_binary
24
+ mock_response.info.return_value = {"Content-Range": "bytes 200-1000/24576"}
25
+ mock_urlopen.return_value = mock_response
26
  # When
27
+ response = request.stream("http://fakeassurl.gov/streaming_test")
28
  # Then
29
+ assert len(b''.join(response)) == 3 * 8 * 1024
30
+ assert mock_response.read.call_count == 4
31
+
32
+
33
+ @mock.patch('pytube.request.urlopen')
34
+ def test_timeout(mock_urlopen):
35
+ exc = URLError(reason=socket.timeout('timed_out'))
36
+ mock_urlopen.side_effect = exc
37
+ generator = request.stream('http://fakeassurl.gov/timeout_test', timeout=1)
38
+ with pytest.raises(MaxRetriesExceeded):
39
+ next(generator)
40
 
41
 
42
  @mock.patch("pytube.request.urlopen")