Merge pull request #45 from hbmartin/decrypt-performance
Browse files
- .gitignore +4 -0
- pytube/__main__.py +3 -6
- pytube/cipher.py +65 -68
- pytube/extract.py +9 -13
- tests/test_cipher.py +0 -11
- tests/test_streams.py +9 -9
.gitignore
CHANGED
@@ -54,7 +54,11 @@ coverage.xml
|
|
54 |
*.cover
|
55 |
.hypothesis/
|
56 |
.pytest_cache/
|
|
|
|
|
|
|
57 |
prof/
|
|
|
58 |
|
59 |
# Debian Files
|
60 |
debian/files
|
|
|
54 |
*.cover
|
55 |
.hypothesis/
|
56 |
.pytest_cache/
|
57 |
+
*.mp4
|
58 |
+
|
59 |
+
# Performance profiling
|
60 |
prof/
|
61 |
+
*.cprof
|
62 |
|
63 |
# Debian Files
|
64 |
debian/files
|
pytube/__main__.py
CHANGED
@@ -70,7 +70,6 @@ class YouTube:
|
|
70 |
self.player_response: Dict = {}
|
71 |
# streams
|
72 |
self.age_restricted: Optional[bool] = None
|
73 |
-
self.vid_descr: Optional[str] = None
|
74 |
|
75 |
self.fmt_streams: List[Stream] = []
|
76 |
|
@@ -125,8 +124,6 @@ class YouTube:
|
|
125 |
title = title[:index] if index > 0 else title
|
126 |
self.player_config_args["title"] = unescape(title)
|
127 |
|
128 |
-
if self.watch_html:
|
129 |
-
self.vid_descr = extract.get_vid_descr(self.watch_html)
|
130 |
# https://github.com/nficano/pytube/issues/165
|
131 |
stream_maps = ["url_encoded_fmt_stream_map"]
|
132 |
if "adaptive_fmts" in self.player_config_args:
|
@@ -276,9 +273,9 @@ class YouTube:
|
|
276 |
:rtype: str
|
277 |
|
278 |
"""
|
279 |
-
return self.
|
280 |
-
|
281 |
-
)
|
282 |
|
283 |
@property
|
284 |
def rating(self) -> float:
|
|
|
70 |
self.player_response: Dict = {}
|
71 |
# streams
|
72 |
self.age_restricted: Optional[bool] = None
|
|
|
73 |
|
74 |
self.fmt_streams: List[Stream] = []
|
75 |
|
|
|
124 |
title = title[:index] if index > 0 else title
|
125 |
self.player_config_args["title"] = unescape(title)
|
126 |
|
|
|
|
|
127 |
# https://github.com/nficano/pytube/issues/165
|
128 |
stream_maps = ["url_encoded_fmt_stream_map"]
|
129 |
if "adaptive_fmts" in self.player_config_args:
|
|
|
273 |
:rtype: str
|
274 |
|
275 |
"""
|
276 |
+
return self.player_response.get("videoDetails", {}).get(
|
277 |
+
"shortDescription"
|
278 |
+
) or extract.get_vid_descr(self.watch_html)
|
279 |
|
280 |
@property
|
281 |
def rating(self) -> float:
|
pytube/cipher.py
CHANGED
@@ -20,11 +20,75 @@ from itertools import chain
|
|
20 |
from typing import List, Tuple, Dict, Callable, Any, Optional
|
21 |
|
22 |
from pytube.exceptions import RegexMatchError
|
23 |
-
from pytube.helpers import regex_search, create_logger
|
24 |
|
25 |
logger = create_logger()
|
26 |
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
def get_initial_function_name(js: str) -> str:
|
29 |
"""Extract the name of the function responsible for computing the signature.
|
30 |
:param str js:
|
@@ -48,7 +112,6 @@ def get_initial_function_name(js: str) -> str:
|
|
48 |
r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(", # noqa: E501
|
49 |
r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(", # noqa: E501
|
50 |
]
|
51 |
-
|
52 |
logger.debug("finding initial function name")
|
53 |
for pattern in function_patterns:
|
54 |
regex = re.compile(pattern)
|
@@ -71,7 +134,6 @@ def get_transform_plan(js: str) -> List[str]:
|
|
71 |
|
72 |
**Example**:
|
73 |
|
74 |
-
>>> get_transform_plan(js)
|
75 |
['DE.AJ(a,15)',
|
76 |
'DE.VR(a,3)',
|
77 |
'DE.AJ(a,51)',
|
@@ -222,68 +284,3 @@ def map_functions(js_func: str) -> Callable:
|
|
222 |
if re.search(pattern, js_func):
|
223 |
return fn
|
224 |
raise RegexMatchError(caller="map_functions", pattern="multiple")
|
225 |
-
|
226 |
-
|
227 |
-
def parse_function(js_func: str) -> Tuple[str, int]:
|
228 |
-
"""Parse the Javascript transform function.
|
229 |
-
|
230 |
-
Break a JavaScript transform function down into a two element ``tuple``
|
231 |
-
containing the function name and some integer-based argument.
|
232 |
-
|
233 |
-
:param str js_func:
|
234 |
-
The JavaScript version of the transform function.
|
235 |
-
:rtype: tuple
|
236 |
-
:returns:
|
237 |
-
two element tuple containing the function name and an argument.
|
238 |
-
|
239 |
-
**Example**:
|
240 |
-
|
241 |
-
>>> parse_function('DE.AJ(a,15)')
|
242 |
-
('AJ', 15)
|
243 |
-
|
244 |
-
"""
|
245 |
-
logger.debug("parsing transform function")
|
246 |
-
pattern = r"\w+\.(\w+)\(\w,(\d+)\)"
|
247 |
-
regex = re.compile(pattern)
|
248 |
-
parse_match = regex.search(js_func)
|
249 |
-
if not parse_match:
|
250 |
-
raise RegexMatchError(caller="parse_function", pattern=pattern)
|
251 |
-
fn_name, fn_arg = parse_match.groups()
|
252 |
-
return fn_name, int(fn_arg)
|
253 |
-
|
254 |
-
|
255 |
-
def get_signature(js: str, ciphered_signature: str) -> str:
|
256 |
-
"""Decipher the signature.
|
257 |
-
|
258 |
-
Taking the ciphered signature, applies the transform functions.
|
259 |
-
|
260 |
-
:param str js:
|
261 |
-
The contents of the base.js asset file.
|
262 |
-
:param str ciphered_signature:
|
263 |
-
The ciphered signature sent in the ``player_config``.
|
264 |
-
:rtype: str
|
265 |
-
:returns:
|
266 |
-
Decrypted signature required to download the media content.
|
267 |
-
|
268 |
-
"""
|
269 |
-
transform_plan = get_transform_plan(js)
|
270 |
-
var, _ = transform_plan[0].split(".")
|
271 |
-
transform_map = get_transform_map(js, var)
|
272 |
-
signature = list(ciphered_signature)
|
273 |
-
|
274 |
-
for js_func in transform_plan:
|
275 |
-
name, argument = parse_function(js_func)
|
276 |
-
signature = transform_map[name](signature, argument)
|
277 |
-
logger.debug(
|
278 |
-
"applied transform function\n"
|
279 |
-
"output: %s\n"
|
280 |
-
"js_function: %s\n"
|
281 |
-
"argument: %d\n"
|
282 |
-
"function: %s",
|
283 |
-
"".join(signature),
|
284 |
-
name,
|
285 |
-
argument,
|
286 |
-
transform_map[name],
|
287 |
-
)
|
288 |
-
|
289 |
-
return "".join(signature)
|
|
|
20 |
from typing import List, Tuple, Dict, Callable, Any, Optional
|
21 |
|
22 |
from pytube.exceptions import RegexMatchError
|
23 |
+
from pytube.helpers import regex_search, create_logger, cache
|
24 |
|
25 |
logger = create_logger()
|
26 |
|
27 |
|
28 |
+
class Cipher:
|
29 |
+
def __init__(self, js: str):
|
30 |
+
self.transform_plan: List[str] = get_transform_plan(js)
|
31 |
+
var, _ = self.transform_plan[0].split(".")
|
32 |
+
self.transform_map = get_transform_map(js, var)
|
33 |
+
self.js_func_regex = re.compile(r"\w+\.(\w+)\(\w,(\d+)\)")
|
34 |
+
|
35 |
+
def get_signature(self, ciphered_signature: str) -> str:
|
36 |
+
"""Decipher the signature.
|
37 |
+
|
38 |
+
Taking the ciphered signature, applies the transform functions.
|
39 |
+
|
40 |
+
:param str ciphered_signature:
|
41 |
+
The ciphered signature sent in the ``player_config``.
|
42 |
+
:rtype: str
|
43 |
+
:returns:
|
44 |
+
Decrypted signature required to download the media content.
|
45 |
+
"""
|
46 |
+
signature = list(ciphered_signature)
|
47 |
+
|
48 |
+
for js_func in self.transform_plan:
|
49 |
+
name, argument = self.parse_function(js_func) # type: ignore
|
50 |
+
signature = self.transform_map[name](signature, argument)
|
51 |
+
logger.debug(
|
52 |
+
"applied transform function\n"
|
53 |
+
"output: %s\n"
|
54 |
+
"js_function: %s\n"
|
55 |
+
"argument: %d\n"
|
56 |
+
"function: %s",
|
57 |
+
"".join(signature),
|
58 |
+
name,
|
59 |
+
argument,
|
60 |
+
self.transform_map[name],
|
61 |
+
)
|
62 |
+
|
63 |
+
return "".join(signature)
|
64 |
+
|
65 |
+
@cache
|
66 |
+
def parse_function(self, js_func: str) -> Tuple[str, int]:
|
67 |
+
"""Parse the Javascript transform function.
|
68 |
+
|
69 |
+
Break a JavaScript transform function down into a two element ``tuple``
|
70 |
+
containing the function name and some integer-based argument.
|
71 |
+
|
72 |
+
:param str js_func:
|
73 |
+
The JavaScript version of the transform function.
|
74 |
+
:rtype: tuple
|
75 |
+
:returns:
|
76 |
+
two element tuple containing the function name and an argument.
|
77 |
+
|
78 |
+
**Example**:
|
79 |
+
|
80 |
+
>>> parse_function('DE.AJ(a,15)')
|
81 |
+
('AJ', 15)
|
82 |
+
|
83 |
+
"""
|
84 |
+
logger.debug("parsing transform function")
|
85 |
+
parse_match = self.js_func_regex.search(js_func)
|
86 |
+
if not parse_match:
|
87 |
+
raise RegexMatchError(caller="parse_function", pattern="js_func_regex")
|
88 |
+
fn_name, fn_arg = parse_match.groups()
|
89 |
+
return fn_name, int(fn_arg)
|
90 |
+
|
91 |
+
|
92 |
def get_initial_function_name(js: str) -> str:
|
93 |
"""Extract the name of the function responsible for computing the signature.
|
94 |
:param str js:
|
|
|
112 |
r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(", # noqa: E501
|
113 |
r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(", # noqa: E501
|
114 |
]
|
|
|
115 |
logger.debug("finding initial function name")
|
116 |
for pattern in function_patterns:
|
117 |
regex = re.compile(pattern)
|
|
|
134 |
|
135 |
**Example**:
|
136 |
|
|
|
137 |
['DE.AJ(a,15)',
|
138 |
'DE.VR(a,3)',
|
139 |
'DE.AJ(a,51)',
|
|
|
284 |
if re.search(pattern, js_func):
|
285 |
return fn
|
286 |
raise RegexMatchError(caller="map_functions", pattern="multiple")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pytube/extract.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
"""This module contains all non-cipher related data extraction logic."""
|
3 |
import json
|
4 |
-
import pprint
|
5 |
import re
|
6 |
from collections import OrderedDict
|
7 |
|
@@ -10,7 +9,7 @@ from typing import Any, Optional, Tuple, List, Dict
|
|
10 |
from urllib.parse import quote, parse_qs, unquote, parse_qsl
|
11 |
from urllib.parse import urlencode
|
12 |
|
13 |
-
from pytube import cipher
|
14 |
from pytube.exceptions import RegexMatchError, HTMLParseError, LiveStreamError
|
15 |
from pytube.helpers import regex_search, logger
|
16 |
|
@@ -205,9 +204,10 @@ def get_ytplayer_config(html: str, age_restricted: bool = False) -> Any:
|
|
205 |
return json.loads(yt_player_config)
|
206 |
|
207 |
|
208 |
-
def get_vid_descr(html: str) -> str:
|
209 |
html_parser = PytubeHTMLParser()
|
210 |
-
|
|
|
211 |
return html_parser.vid_descr
|
212 |
|
213 |
|
@@ -224,6 +224,7 @@ def apply_signature(config_args: Dict, fmt: str, js: str) -> None:
|
|
224 |
The contents of the base.js asset file.
|
225 |
|
226 |
"""
|
|
|
227 |
stream_manifest = config_args[fmt]
|
228 |
live_stream = (
|
229 |
json.loads(config_args["player_response"])
|
@@ -247,17 +248,13 @@ def apply_signature(config_args: Dict, fmt: str, js: str) -> None:
|
|
247 |
continue
|
248 |
|
249 |
if js is not None:
|
250 |
-
signature = cipher.get_signature(js, stream["s"])
|
251 |
else:
|
252 |
# signature not present in url (line 33), need js to descramble
|
253 |
# TypeError caught in __main__
|
254 |
raise TypeError("JS is None")
|
255 |
|
256 |
-
logger.debug(
|
257 |
-
"finished descrambling signature for itag=%s\n%s",
|
258 |
-
stream["itag"],
|
259 |
-
pprint.pformat({"s": stream["s"], "signature": signature,}, indent=2,),
|
260 |
-
)
|
261 |
# 403 forbidden fix
|
262 |
stream_manifest[i]["url"] = url + "&sig=" + signature
|
263 |
|
@@ -320,6 +317,5 @@ def apply_descrambler(stream_data: Dict, key: str) -> None:
|
|
320 |
{k: unquote(v) for k, v in parse_qsl(i)}
|
321 |
for i in stream_data[key].split(",")
|
322 |
]
|
323 |
-
|
324 |
-
|
325 |
-
)
|
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
"""This module contains all non-cipher related data extraction logic."""
|
3 |
import json
|
|
|
4 |
import re
|
5 |
from collections import OrderedDict
|
6 |
|
|
|
9 |
from urllib.parse import quote, parse_qs, unquote, parse_qsl
|
10 |
from urllib.parse import urlencode
|
11 |
|
12 |
+
from pytube.cipher import Cipher
|
13 |
from pytube.exceptions import RegexMatchError, HTMLParseError, LiveStreamError
|
14 |
from pytube.helpers import regex_search, logger
|
15 |
|
|
|
204 |
return json.loads(yt_player_config)
|
205 |
|
206 |
|
207 |
+
def get_vid_descr(html: Optional[str]) -> str:
|
208 |
html_parser = PytubeHTMLParser()
|
209 |
+
if html:
|
210 |
+
html_parser.feed(html)
|
211 |
return html_parser.vid_descr
|
212 |
|
213 |
|
|
|
224 |
The contents of the base.js asset file.
|
225 |
|
226 |
"""
|
227 |
+
cipher = Cipher(js=js)
|
228 |
stream_manifest = config_args[fmt]
|
229 |
live_stream = (
|
230 |
json.loads(config_args["player_response"])
|
|
|
248 |
continue
|
249 |
|
250 |
if js is not None:
|
251 |
+
signature = cipher.get_signature(ciphered_signature=stream["s"])
|
252 |
else:
|
253 |
# signature not present in url (line 33), need js to descramble
|
254 |
# TypeError caught in __main__
|
255 |
raise TypeError("JS is None")
|
256 |
|
257 |
+
logger.debug("finished descrambling signature for itag=%s", stream["itag"])
|
|
|
|
|
|
|
|
|
258 |
# 403 forbidden fix
|
259 |
stream_manifest[i]["url"] = url + "&sig=" + signature
|
260 |
|
|
|
317 |
{k: unquote(v) for k, v in parse_qsl(i)}
|
318 |
for i in stream_data[key].split(",")
|
319 |
]
|
320 |
+
|
321 |
+
logger.debug("applying descrambler")
|
|
tests/test_cipher.py
CHANGED
@@ -20,17 +20,6 @@ def test_get_transform_object_with_no_match_should_error():
|
|
20 |
cipher.get_transform_object("asdf", var="lt")
|
21 |
|
22 |
|
23 |
-
def test_parse_function_with_match():
|
24 |
-
fn_name, fn_arg = cipher.parse_function("DE.AJ(a,15)")
|
25 |
-
assert fn_name == "AJ"
|
26 |
-
assert fn_arg == 15
|
27 |
-
|
28 |
-
|
29 |
-
def test_parse_function_with_no_match_should_error():
|
30 |
-
with pytest.raises(RegexMatchError):
|
31 |
-
cipher.parse_function("asdf")
|
32 |
-
|
33 |
-
|
34 |
def test_reverse():
|
35 |
reversed_array = cipher.reverse([1, 2, 3, 4], None)
|
36 |
assert reversed_array == [4, 3, 2, 1]
|
|
|
20 |
cipher.get_transform_object("asdf", var="lt")
|
21 |
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
def test_reverse():
|
24 |
reversed_array = cipher.reverse([1, 2, 3, 4], None)
|
25 |
assert reversed_array == [4, 3, 2, 1]
|
tests/test_streams.py
CHANGED
@@ -40,11 +40,11 @@ def test_title(cipher_signature):
|
|
40 |
def test_description(cipher_signature):
|
41 |
expected = (
|
42 |
"PSY - ‘I LUV IT’ M/V @ https://youtu.be/Xvjnoagk6GU\n"
|
43 |
-
"PSY - ‘New Face’ M/V @https://youtu.be/OwJPPaEyqhI\n"
|
44 |
"PSY - 8TH ALBUM '4X2=8' on iTunes @\n"
|
45 |
-
"https://smarturl.it/PSY_8thAlbum\n"
|
46 |
-
"PSY - GANGNAM STYLE(강남스타일) on iTunes @ http://smarturl.it/PsyGangnam\n"
|
47 |
-
"#PSY #싸이 #GANGNAMSTYLE #강남스타일\n"
|
48 |
"More about PSY@\nhttp://www.youtube.com/officialpsy\n"
|
49 |
"http://www.facebook.com/officialpsy\n"
|
50 |
"http://twitter.com/psy_oppa\n"
|
@@ -55,14 +55,14 @@ def test_description(cipher_signature):
|
|
55 |
)
|
56 |
assert cipher_signature.description == expected
|
57 |
|
58 |
-
cipher_signature.vid_descr = None
|
59 |
expected = (
|
60 |
"PSY - ‘I LUV IT’ M/V @ https://youtu.be/Xvjnoagk6GU\n"
|
61 |
-
"PSY - ‘New Face’ M/V @https://youtu.be/OwJPPaEyqhI\n\n"
|
62 |
"PSY - 8TH ALBUM '4X2=8' on iTunes @\n"
|
63 |
-
"https://smarturl.it/PSY_8thAlbum\n\n"
|
64 |
-
"PSY - GANGNAM STYLE(강남스타일) on iTunes @ http://smarturl.it/PsyGangnam\n\n"
|
65 |
-
"#PSY #싸이 #GANGNAMSTYLE #강남스타일\n\n"
|
66 |
"More about PSY@\nhttp://www.youtube.com/officialpsy\n"
|
67 |
"http://www.facebook.com/officialpsy\n"
|
68 |
"http://twitter.com/psy_oppa\n"
|
|
|
40 |
def test_description(cipher_signature):
|
41 |
expected = (
|
42 |
"PSY - ‘I LUV IT’ M/V @ https://youtu.be/Xvjnoagk6GU\n"
|
43 |
+
"PSY - ‘New Face’ M/V @https://youtu.be/OwJPPaEyqhI\n\n"
|
44 |
"PSY - 8TH ALBUM '4X2=8' on iTunes @\n"
|
45 |
+
"https://smarturl.it/PSY_8thAlbum\n\n"
|
46 |
+
"PSY - GANGNAM STYLE(강남스타일) on iTunes @ http://smarturl.it/PsyGangnam\n\n"
|
47 |
+
"#PSY #싸이 #GANGNAMSTYLE #강남스타일\n\n"
|
48 |
"More about PSY@\nhttp://www.youtube.com/officialpsy\n"
|
49 |
"http://www.facebook.com/officialpsy\n"
|
50 |
"http://twitter.com/psy_oppa\n"
|
|
|
55 |
)
|
56 |
assert cipher_signature.description == expected
|
57 |
|
58 |
+
cipher_signature.player_response = {}
|
59 |
expected = (
|
60 |
"PSY - ‘I LUV IT’ M/V @ https://youtu.be/Xvjnoagk6GU\n"
|
61 |
+
"PSY - ‘New Face’ M/V @https://youtu.be/OwJPPaEyqhI\n"
|
62 |
"PSY - 8TH ALBUM '4X2=8' on iTunes @\n"
|
63 |
+
"https://smarturl.it/PSY_8thAlbum\n"
|
64 |
+
"PSY - GANGNAM STYLE(강남스타일) on iTunes @ http://smarturl.it/PsyGangnam\n"
|
65 |
+
"#PSY #싸이 #GANGNAMSTYLE #강남스타일\n"
|
66 |
"More about PSY@\nhttp://www.youtube.com/officialpsy\n"
|
67 |
"http://www.facebook.com/officialpsy\n"
|
68 |
"http://twitter.com/psy_oppa\n"
|