Taylor Fox Dahlin committed on
Commit
e0d67e2
·
unverified ·
1 Parent(s): b10f64e

[Feature] Video metadata (#809)

Browse files

* Added accept-language to request headers to coerce certain strings sent by YouTube into English (e.g. 'This video is private.')
* Implemented metadata class.

.github/workflows/ci.yml CHANGED
@@ -13,7 +13,7 @@ jobs:
13
 
14
  strategy:
15
  matrix:
16
- python: [3.5, 3.6, 3.7, 3.8]
17
 
18
  steps:
19
  - name: Checkout repo
 
13
 
14
  strategy:
15
  matrix:
16
+ python: [3.6, 3.7, 3.8, 3.9]
17
 
18
  steps:
19
  - name: Checkout repo
pytube/__main__.py CHANGED
@@ -27,6 +27,7 @@ from pytube.extract import apply_descrambler
27
  from pytube.extract import apply_signature
28
  from pytube.extract import get_ytplayer_config
29
  from pytube.helpers import install_proxy
 
30
  from pytube.monostate import Monostate
31
  from pytube.monostate import OnComplete
32
  from pytube.monostate import OnProgress
@@ -60,23 +61,17 @@ class YouTube:
60
 
61
  """
62
  self.js: Optional[str] = None # js fetched by js_url
63
- self.js_url: Optional[
64
- str
65
- ] = None # the url to the js, parsed from watch html
66
 
67
  # note: vid_info may eventually be removed. It sounds like it once had
68
  # additional formats, but that doesn't appear to still be the case.
69
 
70
  # the url to vid info, parsed from watch html
71
  self.vid_info_url: Optional[str] = None
72
- self.vid_info_raw: Optional[
73
- str
74
- ] = None # content fetched by vid_info_url
75
  self.vid_info: Optional[Dict] = None # parsed content of vid_info_raw
76
 
77
- self.watch_html: Optional[
78
- str
79
- ] = None # the html of /watch?v=<video_id>
80
  self.embed_html: Optional[str] = None
81
  self.player_config_args: Dict = {} # inline js in the html containing
82
  self.player_response: Dict = {}
@@ -85,6 +80,10 @@ class YouTube:
85
 
86
  self.fmt_streams: List[Stream] = []
87
 
 
 
 
 
88
  # video_id part of /watch?v=<video_id>
89
  self.video_id = extract.video_id(url)
90
 
@@ -187,6 +186,9 @@ class YouTube:
187
  video_id=self.video_id, watch_url=self.watch_url
188
  )
189
 
 
 
 
190
  self.vid_info_raw = request.get(self.vid_info_url)
191
  if not self.age_restricted:
192
  self.js_url = extract.js_url(self.watch_html)
@@ -287,9 +289,7 @@ class YouTube:
287
  :rtype: str
288
 
289
  """
290
- return self.player_response.get("videoDetails", {}).get(
291
- "shortDescription"
292
- )
293
 
294
  @property
295
  def rating(self) -> float:
@@ -298,9 +298,7 @@ class YouTube:
298
  :rtype: float
299
 
300
  """
301
- return self.player_response.get("videoDetails", {}).get(
302
- "averageRating"
303
- )
304
 
305
  @property
306
  def length(self) -> int:
@@ -338,6 +336,18 @@ class YouTube:
338
  "author", "unknown"
339
  )
340
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  def register_on_progress_callback(self, func: OnProgress):
342
  """Register a download progress callback function post initialization.
343
 
 
27
  from pytube.extract import apply_signature
28
  from pytube.extract import get_ytplayer_config
29
  from pytube.helpers import install_proxy
30
+ from pytube.metadata import YouTubeMetadata
31
  from pytube.monostate import Monostate
32
  from pytube.monostate import OnComplete
33
  from pytube.monostate import OnProgress
 
61
 
62
  """
63
  self.js: Optional[str] = None # js fetched by js_url
64
+ self.js_url: Optional[str] = None # the url to the js, parsed from watch html
 
 
65
 
66
  # note: vid_info may eventually be removed. It sounds like it once had
67
  # additional formats, but that doesn't appear to still be the case.
68
 
69
  # the url to vid info, parsed from watch html
70
  self.vid_info_url: Optional[str] = None
71
+ self.vid_info_raw: Optional[str] = None # content fetched by vid_info_url
 
 
72
  self.vid_info: Optional[Dict] = None # parsed content of vid_info_raw
73
 
74
+ self.watch_html: Optional[str] = None # the html of /watch?v=<video_id>
 
 
75
  self.embed_html: Optional[str] = None
76
  self.player_config_args: Dict = {} # inline js in the html containing
77
  self.player_response: Dict = {}
 
80
 
81
  self.fmt_streams: List[Stream] = []
82
 
83
+ self.initial_data_raw = None
84
+ self.initial_data = {}
85
+ self._metadata: Optional[YouTubeMetadata] = None
86
+
87
  # video_id part of /watch?v=<video_id>
88
  self.video_id = extract.video_id(url)
89
 
 
186
  video_id=self.video_id, watch_url=self.watch_url
187
  )
188
 
189
+ self.initial_data_raw = extract.initial_data(self.watch_html)
190
+ self.initial_data = json.loads(self.initial_data_raw)
191
+
192
  self.vid_info_raw = request.get(self.vid_info_url)
193
  if not self.age_restricted:
194
  self.js_url = extract.js_url(self.watch_html)
 
289
  :rtype: str
290
 
291
  """
292
+ return self.player_response.get("videoDetails", {}).get("shortDescription")
 
 
293
 
294
  @property
295
  def rating(self) -> float:
 
298
  :rtype: float
299
 
300
  """
301
+ return self.player_response.get("videoDetails", {}).get("averageRating")
 
 
302
 
303
  @property
304
  def length(self) -> int:
 
336
  "author", "unknown"
337
  )
338
 
339
@property
def metadata(self) -> Optional[YouTubeMetadata]:
    """Get the metadata for the video.

    When ``self._metadata`` does not yet hold a truthy value, it is built
    from ``self.initial_data`` via ``extract.metadata`` and stored on the
    instance; the stored object is then returned.

    :rtype: YouTubeMetadata
    """
    if not self._metadata:
        self._metadata = extract.metadata(self.initial_data)
    return self._metadata
350
+
351
  def register_on_progress_callback(self, func: OnProgress):
352
  """Register a download progress callback function post initialization.
353
 
pytube/extract.py CHANGED
@@ -8,6 +8,7 @@ from datetime import datetime
8
  from typing import Any
9
  from typing import Dict
10
  from typing import List
 
11
  from typing import Tuple
12
  from urllib.parse import parse_qs
13
  from urllib.parse import parse_qsl
@@ -19,6 +20,7 @@ from pytube.cipher import Cipher
19
  from pytube.exceptions import LiveStreamError
20
  from pytube.exceptions import RegexMatchError
21
  from pytube.helpers import regex_search
 
22
 
23
  logger = logging.getLogger(__name__)
24
 
@@ -396,3 +398,58 @@ def apply_descrambler(stream_data: Dict, key: str) -> None:
396
  ]
397
 
398
  logger.debug("applying descrambler")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  from typing import Any
9
  from typing import Dict
10
  from typing import List
11
+ from typing import Optional
12
  from typing import Tuple
13
  from urllib.parse import parse_qs
14
  from urllib.parse import parse_qsl
 
20
  from pytube.exceptions import LiveStreamError
21
  from pytube.exceptions import RegexMatchError
22
  from pytube.helpers import regex_search
23
+ from pytube.metadata import YouTubeMetadata
24
 
25
  logger = logging.getLogger(__name__)
26
 
 
398
  ]
399
 
400
  logger.debug("applying descrambler")
401
+
402
+
403
def initial_data(watch_html: str) -> str:
    """Extract the ytInitialData json from the watch_html page.

    This mostly contains metadata necessary for rendering the page on-load,
    such as video information, copyright notices, etc.

    :param str watch_html:
        Html of the watch page.
    :rtype: str
    :returns:
        The raw JSON text assigned to ``window["ytInitialData"]``, or the
        string ``"{}"`` when no such assignment is found in the page.
    """
    initial_data_pattern = r"window\[['\"]ytInitialData['\"]]\s*=\s*([^\n]+)"
    try:
        match = regex_search(initial_data_pattern, watch_html, 1)
    except RegexMatchError:
        return "{}"

    # The capture group runs to the end of the line, so it normally ends with
    # the statement's ";". Strip trailing whitespace first (e.g. the "\r" of a
    # CRLF page) and then the semicolon, rather than blindly dropping the last
    # character, which would corrupt the JSON whenever the line does not end
    # in exactly ";".
    return match.rstrip().rstrip(";")
419
+
420
+
421
def metadata(initial_data) -> Optional[YouTubeMetadata]:
    """Get the informational metadata for the video.

    e.g.:
    [
        {
            'Song': '강남스타일(Gangnam Style)',
            'Artist': 'PSY',
            'Album': 'PSY SIX RULES Pt.1',
            'Licensed to YouTube by': 'YG Entertainment Inc. [...]'
        }
    ]

    :param dict initial_data:
        Parsed ytInitialData of the watch page.
    :rtype: YouTubeMetadata
    :returns:
        A YouTubeMetadata built from the page's ``metadataRowRenderer``
        entries; it wraps an empty list when the rows cannot be located.
    """
    try:
        metadata_rows: List = initial_data["contents"]["twoColumnWatchNextResults"][
            "results"]["results"]["contents"][1]["videoSecondaryInfoRenderer"][
            "metadataRowContainer"]["metadataRowContainerRenderer"]["rows"]
    except (KeyError, IndexError, TypeError):
        # If there's an exception accessing this data, it probably doesn't exist.
        # TypeError covers a level that has an unexpected type (e.g. None or a
        # list where a dict is expected) instead of merely being absent.
        return YouTubeMetadata([])

    # Rows appear to only have "metadataRowRenderer" or "metadataRowHeaderRenderer"
    # and we only care about the former, so keep just those renderers and
    # build a metadata object from the resulting list.
    metadata_rows = [
        row["metadataRowRenderer"]
        for row in metadata_rows
        if "metadataRowRenderer" in row
    ]

    return YouTubeMetadata(metadata_rows)
pytube/metadata.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """This module contains the YouTubeMetadata class."""
3
+ import json
4
+ from typing import Dict
5
+ from typing import List
6
+ from typing import Optional
7
+
8
+
9
class YouTubeMetadata:
    """Grouped, title-to-text view of a list of metadata row renderers.

    The constructor condenses the raw rows into a list of ``{title: text}``
    dicts. A row flagged with ``hasDividerLine`` closes the current group, so
    every dict in :attr:`metadata` is one divider-separated section.

    The class deliberately defines neither ``__len__`` nor ``__bool__``: an
    instance is always truthy, even when it holds no rows.
    """

    def __init__(self, metadata: List):
        """Build the grouped metadata from raw row dicts.

        :param list metadata:
            Row dicts. A usable row has a ``title`` dict containing
            ``simpleText`` and a non-empty ``contents`` list whose first entry
            carries either ``simpleText`` or ``runs``. Rows that do not fit
            this shape contribute no value instead of raising.
        """
        self._raw_metadata: List = metadata
        self._metadata: List[Dict] = [{}]

        for el in metadata:
            # We only add metadata to the dict if it has a simpleText title.
            title = el.get('title') if isinstance(el, dict) else None
            if not isinstance(title, dict) or 'simpleText' not in title:
                continue

            text = self._first_text(el.get('contents'))
            if text is not None:
                self._metadata[-1][title['simpleText']] = text

            # Upon reaching a dividing line, create a new grouping
            if el.get('hasDividerLine', False):
                self._metadata.append({})

        # If we happen to create an empty dict at the end, drop it
        if not self._metadata[-1]:
            self._metadata.pop()

    @staticmethod
    def _first_text(contents) -> Optional[str]:
        """Return the text of the first entry of a row's ``contents`` list.

        ``simpleText`` wins over ``runs``; for ``runs`` only the first run's
        ``text`` is used. Returns ``None`` when no text can be found.
        """
        if not isinstance(contents, list) or not contents:
            return None
        first = contents[0]
        if not isinstance(first, dict):
            return None
        if 'simpleText' in first:
            return first['simpleText']
        runs = first.get('runs')
        if isinstance(runs, list) and runs and isinstance(runs[0], dict):
            return runs[0].get('text')
        return None

    def __iter__(self):
        """Iterate over the grouped ``{title: text}`` dicts."""
        yield from self._metadata

    def __str__(self):
        """Return the grouped metadata serialized as a JSON string."""
        return json.dumps(self._metadata)

    @property
    def raw_metadata(self) -> List:
        """The row list exactly as it was passed to the constructor."""
        return self._raw_metadata

    @property
    def metadata(self) -> List[Dict]:
        """The grouped ``{title: text}`` dicts, one per section."""
        return self._metadata
pytube/request.py CHANGED
@@ -16,7 +16,7 @@ default_range_size = 9437184 # 9MB
16
 
17
 
18
  def _execute_request(url, method=None, headers=None):
19
- base_headers = {"User-Agent": "Mozilla/5.0"}
20
  if headers:
21
  base_headers.update(headers)
22
  if url.lower().startswith("http"):
 
16
 
17
 
18
  def _execute_request(url, method=None, headers=None):
19
+ base_headers = {"User-Agent": "Mozilla/5.0", "accept-language": "en-US,en"}
20
  if headers:
21
  base_headers.update(headers)
22
  if url.lower().startswith("http"):
tests/test_extract.py CHANGED
@@ -102,3 +102,13 @@ def test_signature_cipher_does_not_error(stream_dict):
102
  config_args = extract.get_ytplayer_config(stream_dict)['args']
103
  extract.apply_descrambler(config_args, "url_encoded_fmt_stream_map")
104
  assert "s" in config_args["url_encoded_fmt_stream_map"][0].keys()
 
 
 
 
 
 
 
 
 
 
 
102
  config_args = extract.get_ytplayer_config(stream_dict)['args']
103
  extract.apply_descrambler(config_args, "url_encoded_fmt_stream_map")
104
  assert "s" in config_args["url_encoded_fmt_stream_map"][0].keys()
105
+
106
+
107
def test_initial_data_missing():
    """An empty page yields the empty-JSON-object placeholder string."""
    assert extract.initial_data('') == "{}"
110
+
111
+
112
def test_initial_data(stream_dict):
    """The data extracted from the fixture page mentions 'contents'."""
    extracted = extract.initial_data(stream_dict)
    assert 'contents' in extracted
tests/test_metadata.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Unit tests for the :module:`metadata <metadata>` module."""
3
+ import json
4
+ from pytube import extract
5
+
6
+
7
def test_extract_metadata_empty():
    """Empty initial data produces a metadata object with no raw rows."""
    assert extract.metadata({})._raw_metadata == []
10
+
11
+
12
def test_metadata_from_initial_data(stream_dict):
    """Metadata built from the fixture page has raw rows and a 'Song' entry."""
    parsed = json.loads(extract.initial_data(stream_dict))
    ytmd = extract.metadata(parsed)

    # Raw rows are preserved as given by the page.
    raw_rows = ytmd.raw_metadata
    assert len(raw_rows) > 0
    assert 'contents' in raw_rows[0]

    # Grouped view exposes the human-readable titles.
    grouped = ytmd.metadata
    assert len(grouped) > 0
    assert 'Song' in grouped[0]