hbmartin committed on
Commit
30b06f6
·
1 Parent(s): 5a6acf1

prevent unnecessary load more calls when trimming

Browse files
pytube/contrib/playlist.py CHANGED
@@ -9,7 +9,7 @@ from datetime import date, datetime
9
  from typing import List, Optional, Iterable, Dict
10
  from urllib.parse import parse_qs
11
 
12
- from pytube import request, YouTube, extract
13
  from pytube.helpers import cache, deprecated
14
  from pytube.mixins import install_proxy
15
 
@@ -37,10 +37,14 @@ class Playlist:
37
 
38
  # Needs testing with non-English
39
  self.last_update: Optional[date] = None
40
- results = re.search(r"<li>Last updated on (\w{3}) (\d{1,2}), (\d{4})<\/li>", self.html)
 
 
41
  if results:
42
  month, day, year = results.groups()
43
- self.last_update = datetime.strptime(f"{month} {day:0>2} {year}", "%b %d %Y").date()
 
 
44
 
45
  @staticmethod
46
  def _find_load_more_url(req: str) -> Optional[str]:
@@ -56,11 +60,10 @@ class Playlist:
56
 
57
  return None
58
 
59
- def parse_links(self) -> List[str]:
60
  """Parse the video links from the page source, extracts and
61
  returns the /watch?v= part from video link href
62
  """
63
-
64
  req = self.html
65
 
66
  # split the page source by line and process each line
@@ -71,6 +74,12 @@ class Playlist:
71
  # Simulating a browser request for the load more link
72
  load_more_url = self._find_load_more_url(req)
73
  while load_more_url: # there is an url found
 
 
 
 
 
 
74
  logger.debug("load more url: %s", load_more_url)
75
  req = request.get(load_more_url)
76
  load_more = json.loads(req)
@@ -94,12 +103,8 @@ class Playlist:
94
  :returns:
95
  List of video URLs from the playlist trimmed at the given ID
96
  """
97
- trimmed_urls = []
98
- for url in self.video_urls:
99
- if extract.video_id(url) == video_id:
100
- break
101
- trimmed_urls.append(url)
102
- return trimmed_urls
103
 
104
  @property # type: ignore
105
  @cache
@@ -109,9 +114,7 @@ class Playlist:
109
  :returns:
110
  List of video URLs
111
  """
112
- return [
113
- "https://www.youtube.com" + watch_path for watch_path in self.parse_links()
114
- ]
115
 
116
  @property
117
  def videos(self) -> Iterable[YouTube]:
@@ -221,3 +224,7 @@ class Playlist:
221
  .replace("- YouTube", "")
222
  .strip()
223
  )
 
 
 
 
 
9
  from typing import List, Optional, Iterable, Dict
10
  from urllib.parse import parse_qs
11
 
12
+ from pytube import request, YouTube
13
  from pytube.helpers import cache, deprecated
14
  from pytube.mixins import install_proxy
15
 
 
37
 
38
  # Needs testing with non-English
39
  self.last_update: Optional[date] = None
40
+ results = re.search(
41
+ r"<li>Last updated on (\w{3}) (\d{1,2}), (\d{4})<\/li>", self.html
42
+ )
43
  if results:
44
  month, day, year = results.groups()
45
+ self.last_update = datetime.strptime(
46
+ f"{month} {day:0>2} {year}", "%b %d %Y"
47
+ ).date()
48
 
49
  @staticmethod
50
  def _find_load_more_url(req: str) -> Optional[str]:
 
60
 
61
  return None
62
 
63
+ def parse_links(self, until_watch_id: Optional[str] = None) -> List[str]:
64
  """Parse the video links from the page source, extracts and
65
  returns the /watch?v= part from video link href
66
  """
 
67
  req = self.html
68
 
69
  # split the page source by line and process each line
 
74
  # Simulating a browser request for the load more link
75
  load_more_url = self._find_load_more_url(req)
76
  while load_more_url: # there is an url found
77
+ if until_watch_id:
78
+ try:
79
+ trim_index = link_list.index(f"/watch?v={until_watch_id}")
80
+ return link_list[:trim_index]
81
+ except ValueError:
82
+ pass
83
  logger.debug("load more url: %s", load_more_url)
84
  req = request.get(load_more_url)
85
  load_more = json.loads(req)
 
103
  :returns:
104
  List of video URLs from the playlist trimmed at the given ID
105
  """
106
+ trimmed_watch = self.parse_links(until_watch_id=video_id)
107
+ return [self._video_url(watch_path) for watch_path in trimmed_watch]
 
 
 
 
108
 
109
  @property # type: ignore
110
  @cache
 
114
  :returns:
115
  List of video URLs
116
  """
117
+ return [self._video_url(watch_path) for watch_path in self.parse_links()]
 
 
118
 
119
  @property
120
  def videos(self) -> Iterable[YouTube]:
 
224
  .replace("- YouTube", "")
225
  .strip()
226
  )
227
+
228
+ @staticmethod
229
+ def _video_url(watch_path: str):
230
+ return f"https://www.youtube.com{watch_path}"
tests/contrib/test_playlist.py CHANGED
@@ -130,7 +130,8 @@ def test_trimmed(request_get, playlist_html):
130
  url = "https://www.fakeurl.com/playlist?list=whatever"
131
  request_get.return_value = playlist_html
132
  playlist = Playlist(url)
133
- playlist._find_load_more_url = MagicMock(return_value=None)
 
134
  assert playlist.trimmed("1BYu65vLKdA") == [
135
  "https://www.youtube.com/watch?v=ujTCoH21GlA",
136
  "https://www.youtube.com/watch?v=45ryDIPHdGg",
 
130
  url = "https://www.fakeurl.com/playlist?list=whatever"
131
  request_get.return_value = playlist_html
132
  playlist = Playlist(url)
133
+ playlist._find_load_more_url = MagicMock(return_value="dummy")
134
+ assert request_get.call_count == 1
135
  assert playlist.trimmed("1BYu65vLKdA") == [
136
  "https://www.youtube.com/watch?v=ujTCoH21GlA",
137
  "https://www.youtube.com/watch?v=45ryDIPHdGg",