hbmartin commited on
Commit
f3408d8
·
1 Parent(s): f521184

multi range file streaming

Browse files
README.md CHANGED
@@ -36,7 +36,7 @@ $ pip install pytube3 --upgrade
36
  ## Quick start
37
  ```python
38
  >>> from pytube import YouTube
39
- >>> YouTube('https://youtu.be/9bZkp7q19f0').streams[0].download()
40
  >>>
41
  >>> yt = YouTube('http://youtube.com/watch?v=9bZkp7q19f0')
42
  >>> yt.streams
 
36
  ## Quick start
37
  ```python
38
  >>> from pytube import YouTube
39
+ >>> YouTube('https://youtu.be/9bZkp7q19f0').streams.get_highest_resolution().download()
40
  >>>
41
  >>> yt = YouTube('http://youtube.com/watch?v=9bZkp7q19f0')
42
  >>> yt.streams
pytube/cli.py CHANGED
@@ -11,7 +11,6 @@ import os
11
  import shutil
12
  import sys
13
  import subprocess # nosec
14
- from io import BufferedWriter
15
  from typing import Any, Optional, List
16
 
17
  from pytube import __version__, CaptionQuery, Stream, Playlist
@@ -210,9 +209,7 @@ def display_progress_bar(
210
 
211
 
212
  # noinspection PyUnusedLocal
213
- def on_progress(
214
- stream: Any, chunk: Any, file_handler: BufferedWriter, bytes_remaining: int
215
- ) -> None:
216
  filesize = stream.filesize
217
  bytes_received = filesize - bytes_remaining
218
  display_progress_bar(bytes_received, filesize)
 
11
  import shutil
12
  import sys
13
  import subprocess # nosec
 
14
  from typing import Any, Optional, List
15
 
16
  from pytube import __version__, CaptionQuery, Stream, Playlist
 
209
 
210
 
211
  # noinspection PyUnusedLocal
212
+ def on_progress(stream: Any, chunk: bytes, bytes_remaining: int) -> None:
 
 
213
  filesize = stream.filesize
214
  bytes_received = filesize - bytes_remaining
215
  display_progress_bar(bytes_received, filesize)
pytube/helpers.py CHANGED
@@ -4,7 +4,6 @@
4
  import functools
5
  import logging
6
  import os
7
- import pprint
8
  import re
9
  import warnings
10
  from typing import TypeVar, Callable, Optional, Dict, List, Any
@@ -34,10 +33,7 @@ def regex_search(pattern: str, string: str, group: int) -> str:
34
  if not results:
35
  raise RegexMatchError(caller="regex_search", pattern=pattern)
36
 
37
- logger.debug(
38
- "finished regex search: %s",
39
- pprint.pformat({"pattern": pattern, "results": results.group(0),}, indent=2,),
40
- )
41
 
42
  return results.group(group)
43
 
 
4
  import functools
5
  import logging
6
  import os
 
7
  import re
8
  import warnings
9
  from typing import TypeVar, Callable, Optional, Dict, List, Any
 
33
  if not results:
34
  raise RegexMatchError(caller="regex_search", pattern=pattern)
35
 
36
+ logger.debug("matched regex search: %s", pattern)
 
 
 
37
 
38
  return results.group(group)
39
 
pytube/monostate.py CHANGED
@@ -1,18 +1,11 @@
1
  # -*- coding: utf-8 -*-
2
 
3
- import io
4
  from typing import Any, Optional
5
  from typing_extensions import Protocol
6
 
7
 
8
  class OnProgress(Protocol):
9
- def __call__(
10
- self,
11
- stream: bytes,
12
- chunk: Any,
13
- file_handler: io.BufferedWriter,
14
- bytes_remaining: int,
15
- ) -> None:
16
  """On download progress callback function.
17
 
18
  :param stream:
@@ -21,10 +14,6 @@ class OnProgress(Protocol):
21
  :py:class:`pytube.Stream`
22
  :param str chunk:
23
  Segment of media file binary data, not yet written to disk.
24
- :param file_handler:
25
- The file handle where the media is being written to.
26
- :type file_handler:
27
- :py:class:`io.BufferedWriter`
28
  :param int bytes_remaining:
29
  How many bytes have been downloaded.
30
 
 
1
  # -*- coding: utf-8 -*-
2
 
 
3
  from typing import Any, Optional
4
  from typing_extensions import Protocol
5
 
6
 
7
  class OnProgress(Protocol):
8
+ def __call__(self, stream: Any, chunk: bytes, bytes_remaining: int) -> None:
 
 
 
 
 
 
9
  """On download progress callback function.
10
 
11
  :param stream:
 
14
  :py:class:`pytube.Stream`
15
  :param str chunk:
16
  Segment of media file binary data, not yet written to disk.
 
 
 
 
17
  :param int bytes_remaining:
18
  How many bytes have been downloaded.
19
 
pytube/request.py CHANGED
@@ -1,6 +1,7 @@
1
  # -*- coding: utf-8 -*-
2
 
3
  """Implements a simple wrapper around urlopen."""
 
4
  from typing import Any, Iterable, Dict, Optional
5
  from urllib.request import Request
6
  from urllib.request import urlopen
@@ -31,18 +32,33 @@ def get(url) -> str:
31
  return _execute_request(url).read().decode("utf-8")
32
 
33
 
34
- def stream(url: str, chunk_size: int = 8192) -> Iterable[bytes]:
 
 
35
  """Read the response in chunks.
36
  :param str url: The URL to perform the GET request for.
37
- :param int chunk_size: The size in bytes of each chunk. Defaults to 8*1024
 
38
  :rtype: Iterable[bytes]
39
  """
40
- response = _execute_request(url, headers={"Range": "bytes=0-"})
41
- while True:
42
- buf = response.read(chunk_size)
43
- if not buf:
44
- break
45
- yield buf
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
 
48
  def head(url: str) -> Dict:
 
1
  # -*- coding: utf-8 -*-
2
 
3
  """Implements a simple wrapper around urlopen."""
4
+ from functools import lru_cache
5
  from typing import Any, Iterable, Dict, Optional
6
  from urllib.request import Request
7
  from urllib.request import urlopen
 
32
  return _execute_request(url).read().decode("utf-8")
33
 
34
 
35
+ def stream(
36
+ url: str, chunk_size: int = 1024, range_size: int = 10485760
37
+ ) -> Iterable[bytes]:
38
  """Read the response in chunks.
39
  :param str url: The URL to perform the GET request for.
40
+ :param int chunk_size: The size in bytes of each chunk. Defaults to 1KB
41
+ :param int range_size: The size in bytes of each range request. Defaults to 10MB
42
  :rtype: Iterable[bytes]
43
  """
44
+ file_size: int = filesize(url)
45
+ downloaded = 0
46
+ while downloaded < file_size:
47
+ stop_pos = min(downloaded + range_size, file_size) - 1
48
+ range_header = f"bytes={downloaded}-{stop_pos}"
49
+ r = _execute_request(url, method="GET", headers={"Range": range_header})
50
+ while True:
51
+ chunk = r.read(chunk_size)
52
+ if not chunk:
53
+ break
54
+ downloaded += len(chunk)
55
+ yield chunk
56
+ return
57
+
58
+
59
+ @lru_cache(maxsize=None)
60
+ def filesize(url: str) -> int:
61
+ return int(head(url)["content-length"])
62
 
63
 
64
  def head(url: str) -> Dict:
pytube/streams.py CHANGED
@@ -9,11 +9,9 @@ has been renamed to accommodate DASH (which serves the audio and video
9
  separately).
10
  """
11
 
12
- import io
13
  import logging
14
  import os
15
- import pprint
16
- from typing import Dict, Tuple, Optional
17
 
18
  from pytube import extract
19
  from pytube import request
@@ -143,8 +141,7 @@ class Stream:
143
  Filesize (in bytes) of the stream.
144
  """
145
  if self._filesize is None:
146
- headers = request.head(self.url)
147
- self._filesize = int(headers["content-length"])
148
  return self._filesize
149
 
150
  @property
@@ -255,15 +252,14 @@ class Stream:
255
  def exists_at_path(self, file_path: str) -> bool:
256
  return os.path.isfile(file_path) and os.path.getsize(file_path) == self.filesize
257
 
258
- def stream_to_buffer(self) -> io.BytesIO:
259
  """Write the media stream to buffer
260
 
261
  :rtype: io.BytesIO buffer
262
  """
263
- buffer = io.BytesIO()
264
  bytes_remaining = self.filesize
265
- logger.debug(
266
- "downloading (%s total bytes) file to BytesIO buffer", self.filesize,
267
  )
268
 
269
  for chunk in request.stream(self.url):
@@ -272,9 +268,8 @@ class Stream:
272
  # send to the on_progress callback.
273
  self.on_progress(chunk, buffer, bytes_remaining)
274
  self.on_complete(None)
275
- return buffer
276
 
277
- def on_progress(self, chunk, file_handler, bytes_remaining):
278
  """On progress callback function.
279
 
280
  This function writes the binary data to the file, then checks if an
@@ -295,24 +290,16 @@ class Stream:
295
 
296
  """
297
  file_handler.write(chunk)
298
- logger.debug(
299
- "download progress\n%s",
300
- pprint.pformat(
301
- {"chunk_size": len(chunk), "bytes_remaining": bytes_remaining,},
302
- indent=2,
303
- ),
304
- )
305
- on_progress = self._monostate.on_progress
306
- if on_progress:
307
- logger.debug("calling on_progress callback %s", on_progress)
308
- on_progress(self, chunk, file_handler, bytes_remaining)
309
 
310
  def on_complete(self, file_path: Optional[str]):
311
  """On download complete handler function.
312
 
313
  :param file_path:
314
  The file handle where the media is being written to.
315
- :type file_handle: str
316
 
317
  :rtype: None
318
 
 
9
  separately).
10
  """
11
 
 
12
  import logging
13
  import os
14
+ from typing import Dict, Tuple, Optional, BinaryIO
 
15
 
16
  from pytube import extract
17
  from pytube import request
 
141
  Filesize (in bytes) of the stream.
142
  """
143
  if self._filesize is None:
144
+ self._filesize = request.filesize(self.url)
 
145
  return self._filesize
146
 
147
  @property
 
252
  def exists_at_path(self, file_path: str) -> bool:
253
  return os.path.isfile(file_path) and os.path.getsize(file_path) == self.filesize
254
 
255
+ def stream_to_buffer(self, buffer: BinaryIO) -> None:
256
  """Write the media stream to buffer
257
 
258
  :rtype: io.BytesIO buffer
259
  """
 
260
  bytes_remaining = self.filesize
261
+ logger.info(
262
+ "downloading (%s total bytes) file to buffer", self.filesize,
263
  )
264
 
265
  for chunk in request.stream(self.url):
 
268
  # send to the on_progress callback.
269
  self.on_progress(chunk, buffer, bytes_remaining)
270
  self.on_complete(None)
 
271
 
272
+ def on_progress(self, chunk: bytes, file_handler: BinaryIO, bytes_remaining: int):
273
  """On progress callback function.
274
 
275
  This function writes the binary data to the file, then checks if an
 
290
 
291
  """
292
  file_handler.write(chunk)
293
+ logger.debug("download remaining: %s", bytes_remaining)
294
+ if self._monostate.on_progress:
295
+ self._monostate.on_progress(self, chunk, bytes_remaining)
 
 
 
 
 
 
 
 
296
 
297
  def on_complete(self, file_path: Optional[str]):
298
  """On download complete handler function.
299
 
300
  :param file_path:
301
  The file handle where the media is being written to.
302
+ :type file_path: str
303
 
304
  :rtype: None
305
 
tests/test_cli.py CHANGED
@@ -125,11 +125,10 @@ def test_display_progress_bar(capsys):
125
 
126
 
127
  @mock.patch("pytube.Stream")
128
- @mock.patch("io.BufferedWriter")
129
- def test_on_progress(stream, writer):
130
  stream.filesize = 10
131
  cli.display_progress_bar = MagicMock()
132
- cli.on_progress(stream, "", writer, 7)
133
  cli.display_progress_bar.assert_called_once_with(3, 10)
134
 
135
 
 
125
 
126
 
127
  @mock.patch("pytube.Stream")
128
+ def test_on_progress(stream):
 
129
  stream.filesize = 10
130
  cli.display_progress_bar = MagicMock()
131
+ cli.on_progress(stream, "", 7)
132
  cli.display_progress_bar.assert_called_once_with(3, 10)
133
 
134
 
tests/test_request.py CHANGED
@@ -7,12 +7,13 @@ import pytest
7
  from pytube import request
8
 
9
 
 
10
  @mock.patch("pytube.request.urlopen")
11
- def test_streaming(mock_urlopen):
12
  fake_stream_binary = [
13
- iter(os.urandom(8 * 1024)),
14
- iter(os.urandom(8 * 1024)),
15
- iter(os.urandom(8 * 1024)),
16
  None,
17
  ]
18
  response = mock.Mock()
 
7
  from pytube import request
8
 
9
 
10
+ @mock.patch("pytube.request.filesize", return_value=3 * 8 * 1024)
11
  @mock.patch("pytube.request.urlopen")
12
+ def test_streaming(mock_urlopen, filesize):
13
  fake_stream_binary = [
14
+ os.urandom(8 * 1024),
15
+ os.urandom(8 * 1024),
16
+ os.urandom(8 * 1024),
17
  None,
18
  ]
19
  response = mock.Mock()
tests/test_streams.py CHANGED
@@ -15,12 +15,11 @@ def test_filesize(cipher_signature, mocker):
15
 
16
 
17
  def test_filesize_approx(cipher_signature, mocker):
18
- mocker.patch.object(request, "head")
19
- request.head.return_value = {"content-length": "123"}
20
  stream = cipher_signature.streams[0]
 
21
  assert stream.filesize_approx == 22350604
22
  stream.bitrate = None
23
- assert stream.filesize_approx == 123
24
 
25
 
26
  def test_default_filename(cipher_signature):
@@ -188,8 +187,8 @@ def test_on_progress_hook(cipher_signature, mocker):
188
  stream.download()
189
  assert callback_fn.called
190
  args, _ = callback_fn.call_args
191
- assert len(args) == 4
192
- stream, _, _, _ = args
193
  assert isinstance(stream, Stream)
194
 
195
 
 
15
 
16
 
17
  def test_filesize_approx(cipher_signature, mocker):
 
 
18
  stream = cipher_signature.streams[0]
19
+
20
  assert stream.filesize_approx == 22350604
21
  stream.bitrate = None
22
+ assert stream.filesize_approx == 6796391
23
 
24
 
25
  def test_default_filename(cipher_signature):
 
187
  stream.download()
188
  assert callback_fn.called
189
  args, _ = callback_fn.call_args
190
+ assert len(args) == 3
191
+ stream, _, _ = args
192
  assert isinstance(stream, Stream)
193
 
194