hbmartin committed on
Commit
b16e7b4
·
unverified ·
2 Parent(s): 8a5273e 01450bd

Merge pull request #45 from hbmartin/decrypt-performance

Browse files
.gitignore CHANGED
@@ -54,7 +54,11 @@ coverage.xml
54
  *.cover
55
  .hypothesis/
56
  .pytest_cache/
 
 
 
57
  prof/
 
58
 
59
  # Debian Files
60
  debian/files
 
54
  *.cover
55
  .hypothesis/
56
  .pytest_cache/
57
+ *.mp4
58
+
59
+ # Performance profiling
60
  prof/
61
+ *.cprof
62
 
63
  # Debian Files
64
  debian/files
pytube/__main__.py CHANGED
@@ -70,7 +70,6 @@ class YouTube:
70
  self.player_response: Dict = {}
71
  # streams
72
  self.age_restricted: Optional[bool] = None
73
- self.vid_descr: Optional[str] = None
74
 
75
  self.fmt_streams: List[Stream] = []
76
 
@@ -125,8 +124,6 @@ class YouTube:
125
  title = title[:index] if index > 0 else title
126
  self.player_config_args["title"] = unescape(title)
127
 
128
- if self.watch_html:
129
- self.vid_descr = extract.get_vid_descr(self.watch_html)
130
  # https://github.com/nficano/pytube/issues/165
131
  stream_maps = ["url_encoded_fmt_stream_map"]
132
  if "adaptive_fmts" in self.player_config_args:
@@ -276,9 +273,9 @@ class YouTube:
276
  :rtype: str
277
 
278
  """
279
- return self.vid_descr or (
280
- self.player_response.get("videoDetails", {}).get("shortDescription")
281
- )
282
 
283
  @property
284
  def rating(self) -> float:
 
70
  self.player_response: Dict = {}
71
  # streams
72
  self.age_restricted: Optional[bool] = None
 
73
 
74
  self.fmt_streams: List[Stream] = []
75
 
 
124
  title = title[:index] if index > 0 else title
125
  self.player_config_args["title"] = unescape(title)
126
 
 
 
127
  # https://github.com/nficano/pytube/issues/165
128
  stream_maps = ["url_encoded_fmt_stream_map"]
129
  if "adaptive_fmts" in self.player_config_args:
 
273
  :rtype: str
274
 
275
  """
276
+ return self.player_response.get("videoDetails", {}).get(
277
+ "shortDescription"
278
+ ) or extract.get_vid_descr(self.watch_html)
279
 
280
  @property
281
  def rating(self) -> float:
pytube/cipher.py CHANGED
@@ -20,11 +20,75 @@ from itertools import chain
20
  from typing import List, Tuple, Dict, Callable, Any, Optional
21
 
22
  from pytube.exceptions import RegexMatchError
23
- from pytube.helpers import regex_search, create_logger
24
 
25
  logger = create_logger()
26
 
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def get_initial_function_name(js: str) -> str:
29
  """Extract the name of the function responsible for computing the signature.
30
  :param str js:
@@ -48,7 +112,6 @@ def get_initial_function_name(js: str) -> str:
48
  r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(", # noqa: E501
49
  r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(", # noqa: E501
50
  ]
51
-
52
  logger.debug("finding initial function name")
53
  for pattern in function_patterns:
54
  regex = re.compile(pattern)
@@ -71,7 +134,6 @@ def get_transform_plan(js: str) -> List[str]:
71
 
72
  **Example**:
73
 
74
- >>> get_transform_plan(js)
75
  ['DE.AJ(a,15)',
76
  'DE.VR(a,3)',
77
  'DE.AJ(a,51)',
@@ -222,68 +284,3 @@ def map_functions(js_func: str) -> Callable:
222
  if re.search(pattern, js_func):
223
  return fn
224
  raise RegexMatchError(caller="map_functions", pattern="multiple")
225
-
226
-
227
- def parse_function(js_func: str) -> Tuple[str, int]:
228
- """Parse the Javascript transform function.
229
-
230
- Break a JavaScript transform function down into a two element ``tuple``
231
- containing the function name and some integer-based argument.
232
-
233
- :param str js_func:
234
- The JavaScript version of the transform function.
235
- :rtype: tuple
236
- :returns:
237
- two element tuple containing the function name and an argument.
238
-
239
- **Example**:
240
-
241
- >>> parse_function('DE.AJ(a,15)')
242
- ('AJ', 15)
243
-
244
- """
245
- logger.debug("parsing transform function")
246
- pattern = r"\w+\.(\w+)\(\w,(\d+)\)"
247
- regex = re.compile(pattern)
248
- parse_match = regex.search(js_func)
249
- if not parse_match:
250
- raise RegexMatchError(caller="parse_function", pattern=pattern)
251
- fn_name, fn_arg = parse_match.groups()
252
- return fn_name, int(fn_arg)
253
-
254
-
255
- def get_signature(js: str, ciphered_signature: str) -> str:
256
- """Decipher the signature.
257
-
258
- Taking the ciphered signature, applies the transform functions.
259
-
260
- :param str js:
261
- The contents of the base.js asset file.
262
- :param str ciphered_signature:
263
- The ciphered signature sent in the ``player_config``.
264
- :rtype: str
265
- :returns:
266
- Decrypted signature required to download the media content.
267
-
268
- """
269
- transform_plan = get_transform_plan(js)
270
- var, _ = transform_plan[0].split(".")
271
- transform_map = get_transform_map(js, var)
272
- signature = list(ciphered_signature)
273
-
274
- for js_func in transform_plan:
275
- name, argument = parse_function(js_func)
276
- signature = transform_map[name](signature, argument)
277
- logger.debug(
278
- "applied transform function\n"
279
- "output: %s\n"
280
- "js_function: %s\n"
281
- "argument: %d\n"
282
- "function: %s",
283
- "".join(signature),
284
- name,
285
- argument,
286
- transform_map[name],
287
- )
288
-
289
- return "".join(signature)
 
20
  from typing import List, Tuple, Dict, Callable, Any, Optional
21
 
22
  from pytube.exceptions import RegexMatchError
23
+ from pytube.helpers import regex_search, create_logger, cache
24
 
25
  logger = create_logger()
26
 
27
 
28
class Cipher:
    """Decipher stream signatures using the transforms found in ``base.js``.

    The transform plan and the transform map are extracted from the
    JavaScript once, at construction time, so a single instance can decipher
    several signatures without re-parsing the player source.
    """

    def __init__(self, js: str):
        """Extract the transform plan and transform map from ``js``.

        :param str js:
            The contents of the base.js asset file.
        """
        self.transform_plan: List[str] = get_transform_plan(js)
        # Every step of the plan is called on the same JS object; its name
        # is the part of the first step that precedes the dot.
        var, _ = self.transform_plan[0].split(".")
        self.transform_map = get_transform_map(js, var)
        self.js_func_regex = re.compile(r"\w+\.(\w+)\(\w,(\d+)\)")
        # Per-instance memo of already parsed plan steps. A decorator-level
        # cache on the method would key on ``self`` and keep every instance
        # (and its transform map) alive for as long as the cache lives.
        self._parsed_functions: Dict[str, Tuple[str, int]] = {}

    def get_signature(self, ciphered_signature: str) -> str:
        """Decipher the signature.

        Taking the ciphered signature, applies the transform functions.

        :param str ciphered_signature:
            The ciphered signature sent in the ``player_config``.
        :rtype: str
        :returns:
            Decrypted signature required to download the media content.
        """
        signature = list(ciphered_signature)

        for js_func in self.transform_plan:
            name, argument = self.parse_function(js_func)
            transform = self.transform_map[name]
            signature = transform(signature, argument)
            logger.debug(
                "applied transform function\n"
                "output: %s\n"
                "js_function: %s\n"
                "argument: %d\n"
                "function: %s",
                "".join(signature),
                name,
                argument,
                transform,
            )

        return "".join(signature)

    def parse_function(self, js_func: str) -> Tuple[str, int]:
        """Parse the Javascript transform function.

        Break a JavaScript transform function down into a two element ``tuple``
        containing the function name and some integer-based argument. Results
        are memoized on the instance, so each distinct ``js_func`` is parsed
        only once per ``Cipher``.

        :param str js_func:
            The JavaScript version of the transform function.
        :rtype: tuple
        :returns:
            two element tuple containing the function name and an argument.
        :raises RegexMatchError:
            If ``js_func`` does not match the transform call pattern.

        **Example**:

        >>> parse_function('DE.AJ(a,15)')
        ('AJ', 15)

        """
        try:
            return self._parsed_functions[js_func]
        except KeyError:
            pass

        logger.debug("parsing transform function")
        parse_match = self.js_func_regex.search(js_func)
        if not parse_match:
            # Report the real pattern so the error is actionable.
            raise RegexMatchError(
                caller="parse_function", pattern=self.js_func_regex.pattern
            )
        fn_name, fn_arg = parse_match.groups()
        parsed = (fn_name, int(fn_arg))
        # Only successful parses are stored; failures are re-evaluated.
        self._parsed_functions[js_func] = parsed
        return parsed
90
+
91
+
92
  def get_initial_function_name(js: str) -> str:
93
  """Extract the name of the function responsible for computing the signature.
94
  :param str js:
 
112
  r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(", # noqa: E501
113
  r"\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(", # noqa: E501
114
  ]
 
115
  logger.debug("finding initial function name")
116
  for pattern in function_patterns:
117
  regex = re.compile(pattern)
 
134
 
135
  **Example**:
136
 
 
137
  ['DE.AJ(a,15)',
138
  'DE.VR(a,3)',
139
  'DE.AJ(a,51)',
 
284
  if re.search(pattern, js_func):
285
  return fn
286
  raise RegexMatchError(caller="map_functions", pattern="multiple")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pytube/extract.py CHANGED
@@ -1,7 +1,6 @@
1
  # -*- coding: utf-8 -*-
2
  """This module contains all non-cipher related data extraction logic."""
3
  import json
4
- import pprint
5
  import re
6
  from collections import OrderedDict
7
 
@@ -10,7 +9,7 @@ from typing import Any, Optional, Tuple, List, Dict
10
  from urllib.parse import quote, parse_qs, unquote, parse_qsl
11
  from urllib.parse import urlencode
12
 
13
- from pytube import cipher
14
  from pytube.exceptions import RegexMatchError, HTMLParseError, LiveStreamError
15
  from pytube.helpers import regex_search, logger
16
 
@@ -205,9 +204,10 @@ def get_ytplayer_config(html: str, age_restricted: bool = False) -> Any:
205
  return json.loads(yt_player_config)
206
 
207
 
208
- def get_vid_descr(html: str) -> str:
209
  html_parser = PytubeHTMLParser()
210
- html_parser.feed(html)
 
211
  return html_parser.vid_descr
212
 
213
 
@@ -224,6 +224,7 @@ def apply_signature(config_args: Dict, fmt: str, js: str) -> None:
224
  The contents of the base.js asset file.
225
 
226
  """
 
227
  stream_manifest = config_args[fmt]
228
  live_stream = (
229
  json.loads(config_args["player_response"])
@@ -247,17 +248,13 @@ def apply_signature(config_args: Dict, fmt: str, js: str) -> None:
247
  continue
248
 
249
  if js is not None:
250
- signature = cipher.get_signature(js, stream["s"])
251
  else:
252
  # signature not present in url (line 33), need js to descramble
253
  # TypeError caught in __main__
254
  raise TypeError("JS is None")
255
 
256
- logger.debug(
257
- "finished descrambling signature for itag=%s\n%s",
258
- stream["itag"],
259
- pprint.pformat({"s": stream["s"], "signature": signature,}, indent=2,),
260
- )
261
  # 403 forbidden fix
262
  stream_manifest[i]["url"] = url + "&sig=" + signature
263
 
@@ -320,6 +317,5 @@ def apply_descrambler(stream_data: Dict, key: str) -> None:
320
  {k: unquote(v) for k, v in parse_qsl(i)}
321
  for i in stream_data[key].split(",")
322
  ]
323
- logger.debug(
324
- "applying descrambler\n%s", pprint.pformat(stream_data[key], indent=2),
325
- )
 
1
  # -*- coding: utf-8 -*-
2
  """This module contains all non-cipher related data extraction logic."""
3
  import json
 
4
  import re
5
  from collections import OrderedDict
6
 
 
9
  from urllib.parse import quote, parse_qs, unquote, parse_qsl
10
  from urllib.parse import urlencode
11
 
12
+ from pytube.cipher import Cipher
13
  from pytube.exceptions import RegexMatchError, HTMLParseError, LiveStreamError
14
  from pytube.helpers import regex_search, logger
15
 
 
204
  return json.loads(yt_player_config)
205
 
206
 
207
def get_vid_descr(html: Optional[str]) -> str:
    """Return the video description parsed out of the watch page HTML.

    :param html:
        The watch page markup. When it is ``None`` or empty, nothing is fed
        to the parser.
    :returns:
        The ``vid_descr`` attribute of a ``PytubeHTMLParser`` instance.
    """
    parser = PytubeHTMLParser()
    if not html:
        # Nothing to parse: hand back whatever the fresh parser holds.
        return parser.vid_descr
    parser.feed(html)
    return parser.vid_descr
212
 
213
 
 
224
  The contents of the base.js asset file.
225
 
226
  """
227
+ cipher = Cipher(js=js)
228
  stream_manifest = config_args[fmt]
229
  live_stream = (
230
  json.loads(config_args["player_response"])
 
248
  continue
249
 
250
  if js is not None:
251
+ signature = cipher.get_signature(ciphered_signature=stream["s"])
252
  else:
253
  # signature not present in url (line 33), need js to descramble
254
  # TypeError caught in __main__
255
  raise TypeError("JS is None")
256
 
257
+ logger.debug("finished descrambling signature for itag=%s", stream["itag"])
 
 
 
 
258
  # 403 forbidden fix
259
  stream_manifest[i]["url"] = url + "&sig=" + signature
260
 
 
317
  {k: unquote(v) for k, v in parse_qsl(i)}
318
  for i in stream_data[key].split(",")
319
  ]
320
+
321
+ logger.debug("applying descrambler")
 
tests/test_cipher.py CHANGED
@@ -20,17 +20,6 @@ def test_get_transform_object_with_no_match_should_error():
20
  cipher.get_transform_object("asdf", var="lt")
21
 
22
 
23
- def test_parse_function_with_match():
24
- fn_name, fn_arg = cipher.parse_function("DE.AJ(a,15)")
25
- assert fn_name == "AJ"
26
- assert fn_arg == 15
27
-
28
-
29
- def test_parse_function_with_no_match_should_error():
30
- with pytest.raises(RegexMatchError):
31
- cipher.parse_function("asdf")
32
-
33
-
34
  def test_reverse():
35
  reversed_array = cipher.reverse([1, 2, 3, 4], None)
36
  assert reversed_array == [4, 3, 2, 1]
 
20
  cipher.get_transform_object("asdf", var="lt")
21
 
22
 
 
 
 
 
 
 
 
 
 
 
 
23
  def test_reverse():
24
  reversed_array = cipher.reverse([1, 2, 3, 4], None)
25
  assert reversed_array == [4, 3, 2, 1]
tests/test_streams.py CHANGED
@@ -40,11 +40,11 @@ def test_title(cipher_signature):
40
  def test_description(cipher_signature):
41
  expected = (
42
  "PSY - ‘I LUV IT’ M/V @ https://youtu.be/Xvjnoagk6GU\n"
43
- "PSY - ‘New Face’ M/V @https://youtu.be/OwJPPaEyqhI\n"
44
  "PSY - 8TH ALBUM '4X2=8' on iTunes @\n"
45
- "https://smarturl.it/PSY_8thAlbum\n"
46
- "PSY - GANGNAM STYLE(강남스타일) on iTunes @ http://smarturl.it/PsyGangnam\n"
47
- "#PSY #싸이 #GANGNAMSTYLE #강남스타일\n"
48
  "More about PSY@\nhttp://www.youtube.com/officialpsy\n"
49
  "http://www.facebook.com/officialpsy\n"
50
  "http://twitter.com/psy_oppa\n"
@@ -55,14 +55,14 @@ def test_description(cipher_signature):
55
  )
56
  assert cipher_signature.description == expected
57
 
58
- cipher_signature.vid_descr = None
59
  expected = (
60
  "PSY - ‘I LUV IT’ M/V @ https://youtu.be/Xvjnoagk6GU\n"
61
- "PSY - ‘New Face’ M/V @https://youtu.be/OwJPPaEyqhI\n\n"
62
  "PSY - 8TH ALBUM '4X2=8' on iTunes @\n"
63
- "https://smarturl.it/PSY_8thAlbum\n\n"
64
- "PSY - GANGNAM STYLE(강남스타일) on iTunes @ http://smarturl.it/PsyGangnam\n\n"
65
- "#PSY #싸이 #GANGNAMSTYLE #강남스타일\n\n"
66
  "More about PSY@\nhttp://www.youtube.com/officialpsy\n"
67
  "http://www.facebook.com/officialpsy\n"
68
  "http://twitter.com/psy_oppa\n"
 
40
  def test_description(cipher_signature):
41
  expected = (
42
  "PSY - ‘I LUV IT’ M/V @ https://youtu.be/Xvjnoagk6GU\n"
43
+ "PSY - ‘New Face’ M/V @https://youtu.be/OwJPPaEyqhI\n\n"
44
  "PSY - 8TH ALBUM '4X2=8' on iTunes @\n"
45
+ "https://smarturl.it/PSY_8thAlbum\n\n"
46
+ "PSY - GANGNAM STYLE(강남스타일) on iTunes @ http://smarturl.it/PsyGangnam\n\n"
47
+ "#PSY #싸이 #GANGNAMSTYLE #강남스타일\n\n"
48
  "More about PSY@\nhttp://www.youtube.com/officialpsy\n"
49
  "http://www.facebook.com/officialpsy\n"
50
  "http://twitter.com/psy_oppa\n"
 
55
  )
56
  assert cipher_signature.description == expected
57
 
58
+ cipher_signature.player_response = {}
59
  expected = (
60
  "PSY - ‘I LUV IT’ M/V @ https://youtu.be/Xvjnoagk6GU\n"
61
+ "PSY - ‘New Face’ M/V @https://youtu.be/OwJPPaEyqhI\n"
62
  "PSY - 8TH ALBUM '4X2=8' on iTunes @\n"
63
+ "https://smarturl.it/PSY_8thAlbum\n"
64
+ "PSY - GANGNAM STYLE(강남스타일) on iTunes @ http://smarturl.it/PsyGangnam\n"
65
+ "#PSY #싸이 #GANGNAMSTYLE #강남스타일\n"
66
  "More about PSY@\nhttp://www.youtube.com/officialpsy\n"
67
  "http://www.facebook.com/officialpsy\n"
68
  "http://twitter.com/psy_oppa\n"