Fuan committed on
Commit
642bb0a
·
unverified ·
1 Parent(s): e8f4391

Fix regex to find throttle function name (#1222)

Browse files

* Fix regex to find throttle function name

The JavaScript now stores the throttling function name in an array.
Fix https://github.com/pytube/pytube/issues/1218

* Fix array parsing

Strip whitespace around symbol names for future-proofing.

The variable name might be "b" right now, but it could change in the
future.

pytube/cipher.py CHANGED
@@ -263,9 +263,14 @@ def get_throttling_function_name(js: str) -> str:
263
  """
264
  function_patterns = [
265
  # https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-865985377
266
- # a.C&&(b=a.get("n"))&&(b=Dea(b),a.set("n",b))}};
267
- # In above case, `Dea` is the relevant function name
268
- r'a\.[A-Z]&&\(b=a\.get\("n"\)\)&&\(b=([^(]+)\(b\)',
 
 
 
 
 
269
  ]
270
  logger.debug('Finding throttling function name')
271
  for pattern in function_patterns:
@@ -273,7 +278,20 @@ def get_throttling_function_name(js: str) -> str:
273
  function_match = regex.search(js)
274
  if function_match:
275
  logger.debug("finished regex search, matched: %s", pattern)
276
- return function_match.group(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
277
 
278
  raise RegexMatchError(
279
  caller="get_throttling_function_name", pattern="multiple"
 
263
  """
264
  function_patterns = [
265
  # https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-865985377
266
+ # https://github.com/yt-dlp/yt-dlp/commit/48416bc4a8f1d5ff07d5977659cb8ece7640dcd8
267
+ # var Bpa = [iha];
268
+ # ...
269
+ # a.C && (b = a.get("n")) && (b = Bpa[0](b), a.set("n", b),
270
+ # Bpa.length || iha("")) }};
271
+ # In the above case, `iha` is the relevant function name
272
+ r'a\.[a-zA-Z]\s*&&\s*\([a-z]\s*=\s*a\.get\("n"\)\)\s*&&\s*'
273
+ r'\([a-z]\s*=\s*([a-zA-Z0-9$]{3})(\[\d+\])?\([a-z]\)',
274
  ]
275
  logger.debug('Finding throttling function name')
276
  for pattern in function_patterns:
 
278
  function_match = regex.search(js)
279
  if function_match:
280
  logger.debug("finished regex search, matched: %s", pattern)
281
+ if len(function_match.groups()) == 1:
282
+ return function_match.group(1)
283
+ idx = function_match.group(2)
284
+ if idx:
285
+ idx = idx.strip("[]")
286
+ array = re.search(
287
+ r'var {nfunc}\s*=\s*(\[.+?\]);'.format(
288
+ nfunc=function_match.group(1)),
289
+ js
290
+ )
291
+ if array:
292
+ array = array.group(1).strip("[]").split(",")
293
+ array = [x.strip() for x in array]
294
+ return array[int(idx)]
295
 
296
  raise RegexMatchError(
297
  caller="get_throttling_function_name", pattern="multiple"
tests/conftest.py CHANGED
@@ -146,3 +146,17 @@ def channel_videos_html():
146
  )
147
  with gzip.open(file_path, 'rb') as f:
148
  return f.read().decode('utf-8')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  )
147
  with gzip.open(file_path, 'rb') as f:
148
  return f.read().decode('utf-8')
149
+
150
+
151
+ @pytest.fixture
152
+ def base_js():
153
+ """Youtube base.js retrieved on 2022-02-04 from
154
+ https://www.youtube.com/watch?v=vmzxpUsN0uA
155
+ """
156
+ file_path = os.path.join(
157
+ os.path.dirname(os.path.realpath(__file__)),
158
+ "mocks",
159
+ "base.js.gz",
160
+ )
161
+ with gzip.open(file_path, 'rb') as f:
162
+ return f.read().decode('utf-8')
tests/mocks/base.js.gz ADDED
Binary file (611 kB). View file
 
tests/test_cipher.py CHANGED
@@ -77,3 +77,14 @@ def test_js_splice():
77
  for args, result in mapping.items():
78
  a = [1, 2, 3, 4]
79
  assert cipher.js_splice(a, *args) == result
 
 
 
 
 
 
 
 
 
 
 
 
77
  for args, result in mapping.items():
78
  a = [1, 2, 3, 4]
79
  assert cipher.js_splice(a, *args) == result
80
+
81
+
82
+ def test_get_throttling_function_name(base_js):
83
+ # Values expected as of 2022/02/04:
84
+ raw_var = r'var Apa=[hha]'
85
+ assert raw_var in base_js
86
+ raw_code = r'a.url="";a.C&&(b=a.get("n"))&&(b=Apa[0](b),a.set("n",b),'\
87
+ r'Apa.length||hha(""))}};'
88
+ assert raw_code in base_js
89
+ func_name = cipher.get_throttling_function_name(base_js)
90
+ assert func_name == "hha"