Prevent unnecessary load-more calls when trimming
Browse files
- pytube/contrib/playlist.py +21 -14
- tests/contrib/test_playlist.py +2 -1
pytube/contrib/playlist.py
CHANGED
@@ -9,7 +9,7 @@ from datetime import date, datetime
|
|
9 |
from typing import List, Optional, Iterable, Dict
|
10 |
from urllib.parse import parse_qs
|
11 |
|
12 |
-
from pytube import request, YouTube
|
13 |
from pytube.helpers import cache, deprecated
|
14 |
from pytube.mixins import install_proxy
|
15 |
|
@@ -37,10 +37,14 @@ class Playlist:
|
|
37 |
|
38 |
# Needs testing with non-English
|
39 |
self.last_update: Optional[date] = None
|
40 |
-
results = re.search(
|
|
|
|
|
41 |
if results:
|
42 |
month, day, year = results.groups()
|
43 |
-
self.last_update = datetime.strptime(
|
|
|
|
|
44 |
|
45 |
@staticmethod
|
46 |
def _find_load_more_url(req: str) -> Optional[str]:
|
@@ -56,11 +60,10 @@ class Playlist:
|
|
56 |
|
57 |
return None
|
58 |
|
59 |
-
def parse_links(self) -> List[str]:
|
60 |
"""Parse the video links from the page source, extracts and
|
61 |
returns the /watch?v= part from video link href
|
62 |
"""
|
63 |
-
|
64 |
req = self.html
|
65 |
|
66 |
# split the page source by line and process each line
|
@@ -71,6 +74,12 @@ class Playlist:
|
|
71 |
# Simulating a browser request for the load more link
|
72 |
load_more_url = self._find_load_more_url(req)
|
73 |
while load_more_url: # there is an url found
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
logger.debug("load more url: %s", load_more_url)
|
75 |
req = request.get(load_more_url)
|
76 |
load_more = json.loads(req)
|
@@ -94,12 +103,8 @@ class Playlist:
|
|
94 |
:returns:
|
95 |
List of video URLs from the playlist trimmed at the given ID
|
96 |
"""
|
97 |
-
|
98 |
-
for
|
99 |
-
if extract.video_id(url) == video_id:
|
100 |
-
break
|
101 |
-
trimmed_urls.append(url)
|
102 |
-
return trimmed_urls
|
103 |
|
104 |
@property # type: ignore
|
105 |
@cache
|
@@ -109,9 +114,7 @@ class Playlist:
|
|
109 |
:returns:
|
110 |
List of video URLs
|
111 |
"""
|
112 |
-
return [
|
113 |
-
"https://www.youtube.com" + watch_path for watch_path in self.parse_links()
|
114 |
-
]
|
115 |
|
116 |
@property
|
117 |
def videos(self) -> Iterable[YouTube]:
|
@@ -221,3 +224,7 @@ class Playlist:
|
|
221 |
.replace("- YouTube", "")
|
222 |
.strip()
|
223 |
)
|
|
|
|
|
|
|
|
|
|
9 |
from typing import List, Optional, Iterable, Dict
|
10 |
from urllib.parse import parse_qs
|
11 |
|
12 |
+
from pytube import request, YouTube
|
13 |
from pytube.helpers import cache, deprecated
|
14 |
from pytube.mixins import install_proxy
|
15 |
|
|
|
37 |
|
38 |
# Needs testing with non-English
|
39 |
self.last_update: Optional[date] = None
|
40 |
+
results = re.search(
|
41 |
+
r"<li>Last updated on (\w{3}) (\d{1,2}), (\d{4})<\/li>", self.html
|
42 |
+
)
|
43 |
if results:
|
44 |
month, day, year = results.groups()
|
45 |
+
self.last_update = datetime.strptime(
|
46 |
+
f"{month} {day:0>2} {year}", "%b %d %Y"
|
47 |
+
).date()
|
48 |
|
49 |
@staticmethod
|
50 |
def _find_load_more_url(req: str) -> Optional[str]:
|
|
|
60 |
|
61 |
return None
|
62 |
|
63 |
+
def parse_links(self, until_watch_id: Optional[str] = None) -> List[str]:
|
64 |
"""Parse the video links from the page source, extracts and
|
65 |
returns the /watch?v= part from video link href
|
66 |
"""
|
|
|
67 |
req = self.html
|
68 |
|
69 |
# split the page source by line and process each line
|
|
|
74 |
# Simulating a browser request for the load more link
|
75 |
load_more_url = self._find_load_more_url(req)
|
76 |
while load_more_url: # there is an url found
|
77 |
+
if until_watch_id:
|
78 |
+
try:
|
79 |
+
trim_index = link_list.index(f"/watch?v={until_watch_id}")
|
80 |
+
return link_list[:trim_index]
|
81 |
+
except ValueError:
|
82 |
+
pass
|
83 |
logger.debug("load more url: %s", load_more_url)
|
84 |
req = request.get(load_more_url)
|
85 |
load_more = json.loads(req)
|
|
|
103 |
:returns:
|
104 |
List of video URLs from the playlist trimmed at the given ID
|
105 |
"""
|
106 |
+
trimmed_watch = self.parse_links(until_watch_id=video_id)
|
107 |
+
return [self._video_url(watch_path) for watch_path in trimmed_watch]
|
|
|
|
|
|
|
|
|
108 |
|
109 |
@property # type: ignore
|
110 |
@cache
|
|
|
114 |
:returns:
|
115 |
List of video URLs
|
116 |
"""
|
117 |
+
return [self._video_url(watch_path) for watch_path in self.parse_links()]
|
|
|
|
|
118 |
|
119 |
@property
|
120 |
def videos(self) -> Iterable[YouTube]:
|
|
|
224 |
.replace("- YouTube", "")
|
225 |
.strip()
|
226 |
)
|
227 |
+
|
228 |
+
@staticmethod
|
229 |
+
def _video_url(watch_path: str):
|
230 |
+
return f"https://www.youtube.com{watch_path}"
|
tests/contrib/test_playlist.py
CHANGED
@@ -130,7 +130,8 @@ def test_trimmed(request_get, playlist_html):
|
|
130 |
url = "https://www.fakeurl.com/playlist?list=whatever"
|
131 |
request_get.return_value = playlist_html
|
132 |
playlist = Playlist(url)
|
133 |
-
playlist._find_load_more_url = MagicMock(return_value=
|
|
|
134 |
assert playlist.trimmed("1BYu65vLKdA") == [
|
135 |
"https://www.youtube.com/watch?v=ujTCoH21GlA",
|
136 |
"https://www.youtube.com/watch?v=45ryDIPHdGg",
|
|
|
130 |
url = "https://www.fakeurl.com/playlist?list=whatever"
|
131 |
request_get.return_value = playlist_html
|
132 |
playlist = Playlist(url)
|
133 |
+
playlist._find_load_more_url = MagicMock(return_value="dummy")
|
134 |
+
assert request_get.call_count == 1
|
135 |
assert playlist.trimmed("1BYu65vLKdA") == [
|
136 |
"https://www.youtube.com/watch?v=ujTCoH21GlA",
|
137 |
"https://www.youtube.com/watch?v=45ryDIPHdGg",
|