Fuan
committed on
Fix regex to find throttle function name (#1222)
Browse files
* Fix regex to find throttle function name
The JavaScript now stores the throttling function name in an array.
Fix https://github.com/pytube/pytube/issues/1218
* Fix array parsing
Strip whitespace around symbol names for future-proofing.
The variable name might be "b" right now, but it could change in the
future.
- pytube/cipher.py +22 -4
- tests/conftest.py +14 -0
- tests/mocks/base.js.gz +0 -0
- tests/test_cipher.py +11 -0
pytube/cipher.py
CHANGED
@@ -263,9 +263,14 @@ def get_throttling_function_name(js: str) -> str:
|
|
263 |
"""
|
264 |
function_patterns = [
|
265 |
# https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-865985377
|
266 |
-
#
|
267 |
-
#
|
268 |
-
|
|
|
|
|
|
|
|
|
|
|
269 |
]
|
270 |
logger.debug('Finding throttling function name')
|
271 |
for pattern in function_patterns:
|
@@ -273,7 +278,20 @@ def get_throttling_function_name(js: str) -> str:
|
|
273 |
function_match = regex.search(js)
|
274 |
if function_match:
|
275 |
logger.debug("finished regex search, matched: %s", pattern)
|
276 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
277 |
|
278 |
raise RegexMatchError(
|
279 |
caller="get_throttling_function_name", pattern="multiple"
|
|
|
263 |
"""
|
264 |
function_patterns = [
|
265 |
# https://github.com/ytdl-org/youtube-dl/issues/29326#issuecomment-865985377
|
266 |
+
# https://github.com/yt-dlp/yt-dlp/commit/48416bc4a8f1d5ff07d5977659cb8ece7640dcd8
|
267 |
+
# var Bpa = [iha];
|
268 |
+
# ...
|
269 |
+
# a.C && (b = a.get("n")) && (b = Bpa[0](b), a.set("n", b),
|
270 |
+
# Bpa.length || iha("")) }};
|
271 |
+
# In the above case, `iha` is the relevant function name
|
272 |
+
r'a\.[a-zA-Z]\s*&&\s*\([a-z]\s*=\s*a\.get\("n"\)\)\s*&&\s*'
|
273 |
+
r'\([a-z]\s*=\s*([a-zA-Z0-9$]{3})(\[\d+\])?\([a-z]\)',
|
274 |
]
|
275 |
logger.debug('Finding throttling function name')
|
276 |
for pattern in function_patterns:
|
|
|
278 |
function_match = regex.search(js)
|
279 |
if function_match:
|
280 |
logger.debug("finished regex search, matched: %s", pattern)
|
281 |
+
if len(function_match.groups()) == 1:
|
282 |
+
return function_match.group(1)
|
283 |
+
idx = function_match.group(2)
|
284 |
+
if idx:
|
285 |
+
idx = idx.strip("[]")
|
286 |
+
array = re.search(
|
287 |
+
r'var {nfunc}\s*=\s*(\[.+?\]);'.format(
|
288 |
+
nfunc=function_match.group(1)),
|
289 |
+
js
|
290 |
+
)
|
291 |
+
if array:
|
292 |
+
array = array.group(1).strip("[]").split(",")
|
293 |
+
array = [x.strip() for x in array]
|
294 |
+
return array[int(idx)]
|
295 |
|
296 |
raise RegexMatchError(
|
297 |
caller="get_throttling_function_name", pattern="multiple"
|
tests/conftest.py
CHANGED
@@ -146,3 +146,17 @@ def channel_videos_html():
|
|
146 |
)
|
147 |
with gzip.open(file_path, 'rb') as f:
|
148 |
return f.read().decode('utf-8')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
)
|
147 |
with gzip.open(file_path, 'rb') as f:
|
148 |
return f.read().decode('utf-8')
|
149 |
+
|
150 |
+
|
151 |
+
@pytest.fixture
|
152 |
+
def base_js():
|
153 |
+
"""Youtube base.js retrieved on 2022-02-04 from
|
154 |
+
https://www.youtube.com/watch?v=vmzxpUsN0uA
|
155 |
+
"""
|
156 |
+
file_path = os.path.join(
|
157 |
+
os.path.dirname(os.path.realpath(__file__)),
|
158 |
+
"mocks",
|
159 |
+
"base.js.gz",
|
160 |
+
)
|
161 |
+
with gzip.open(file_path, 'rb') as f:
|
162 |
+
return f.read().decode('utf-8')
|
tests/mocks/base.js.gz
ADDED
Binary file (611 kB). View file
|
|
tests/test_cipher.py
CHANGED
@@ -77,3 +77,14 @@ def test_js_splice():
|
|
77 |
for args, result in mapping.items():
|
78 |
a = [1, 2, 3, 4]
|
79 |
assert cipher.js_splice(a, *args) == result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
for args, result in mapping.items():
|
78 |
a = [1, 2, 3, 4]
|
79 |
assert cipher.js_splice(a, *args) == result
|
80 |
+
|
81 |
+
|
82 |
+
def test_get_throttling_function_name(base_js):
|
83 |
+
# Values expected as of 2022/02/04:
|
84 |
+
raw_var = r'var Apa=[hha]'
|
85 |
+
assert raw_var in base_js
|
86 |
+
raw_code = r'a.url="";a.C&&(b=a.get("n"))&&(b=Apa[0](b),a.set("n",b),'\
|
87 |
+
r'Apa.length||hha(""))}};'
|
88 |
+
assert raw_code in base_js
|
89 |
+
func_name = cipher.get_throttling_function_name(base_js)
|
90 |
+
assert func_name == "hha"
|