nficano committed on
Commit
3a0daa7
·
2 Parent(s): 64a4e3a 114d355

Merge pull request #161 from nficano/feature-caption-support

Browse files
docs/api.rst CHANGED
@@ -27,6 +27,20 @@ StreamQuery Object
27
  :members:
28
  :inherited-members:
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  Extract
31
  -------
32
 
 
27
  :members:
28
  :inherited-members:
29
 
30
+ Caption Object
31
+ --------------
32
+
33
+ .. autoclass:: pytube.Caption
34
+ :members:
35
+ :inherited-members:
36
+
37
+ CaptionQuery Object
38
+ -------------------
39
+
40
+ .. autoclass:: pytube.query.CaptionQuery
41
+ :members:
42
+ :inherited-members:
43
+
44
  Extract
45
  -------
46
 
pytube/__init__.py CHANGED
@@ -15,8 +15,10 @@ __license__ = 'MIT License'
15
  __copyright__ = 'Copyright 2017 Nick Ficano'
16
 
17
  from pytube.logging import create_logger
 
18
  from pytube.query import StreamQuery
19
  from pytube.streams import Stream
 
20
  from pytube.__main__ import YouTube
21
 
22
  logger = create_logger()
 
15
  __copyright__ = 'Copyright 2017 Nick Ficano'
16
 
17
  from pytube.logging import create_logger
18
+ from pytube.query import CaptionQuery
19
  from pytube.query import StreamQuery
20
  from pytube.streams import Stream
21
+ from pytube.captions import Caption
22
  from pytube.__main__ import YouTube
23
 
24
  logger = create_logger()
pytube/__main__.py CHANGED
@@ -12,6 +12,8 @@ from __future__ import absolute_import
12
  import json
13
  import logging
14
 
 
 
15
  from pytube import extract
16
  from pytube import mixins
17
  from pytube import request
@@ -59,6 +61,7 @@ class YouTube(object):
59
  self.player_config = None # inline js in the html containing streams
60
 
61
  self.fmt_streams = [] # list of :class:`Stream <Stream>` instances
 
62
 
63
  # video_id part of /watch?v=<video_id>
64
  self.video_id = extract.video_id(url)
@@ -115,6 +118,8 @@ class YouTube(object):
115
  # build instances of :class:`Stream <Stream>`
116
  self.initialize_stream_objects(progressive_fmts)
117
  self.initialize_stream_objects(adaptive_fmts)
 
 
118
  logger.info('init finished successfully')
119
 
120
  def prefetch(self):
@@ -158,6 +163,29 @@ class YouTube(object):
158
  )
159
  self.fmt_streams.append(video)
160
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  @property
162
  def streams(self):
163
  """Interface to query both adaptive (DASH) and progressive streams."""
 
12
  import json
13
  import logging
14
 
15
+ from pytube import Caption
16
+ from pytube import CaptionQuery
17
  from pytube import extract
18
  from pytube import mixins
19
  from pytube import request
 
61
  self.player_config = None # inline js in the html containing streams
62
 
63
  self.fmt_streams = [] # list of :class:`Stream <Stream>` instances
64
+ self.caption_tracks = []
65
 
66
  # video_id part of /watch?v=<video_id>
67
  self.video_id = extract.video_id(url)
 
118
  # build instances of :class:`Stream <Stream>`
119
  self.initialize_stream_objects(progressive_fmts)
120
  self.initialize_stream_objects(adaptive_fmts)
121
+
122
+ self.initialize_caption_objects()
123
  logger.info('init finished successfully')
124
 
125
  def prefetch(self):
 
163
  )
164
  self.fmt_streams.append(video)
165
 
166
def initialize_caption_objects(self):
    """Populate instances of :class:`Caption <Caption>`.

    Take the unscrambled player response data, and use it to initialize
    instances of :class:`Caption <Caption>`, appending each one to
    ``self.caption_tracks``.

    Nothing is appended when the player response carries no caption
    information at any level of the nested structure.
    """
    player_response = self.player_config['args']['player_response']
    # Every level below ``player_response`` is optional: a video may have
    # a ``captions`` entry without a track list renderer, or a renderer
    # without any ``captionTracks``. Fall back to "no tracks" instead of
    # raising ``KeyError`` in those cases.
    caption_tracks = (
        player_response
        .get('captions', {})
        .get('playerCaptionsTracklistRenderer', {})
        .get('captionTracks', [])
    )
    for caption_track in caption_tracks:
        self.caption_tracks.append(Caption(caption_track))
183
+
184
@property
def captions(self):
    """Return a :class:`CaptionQuery` over the known caption tracks.

    The query receives a fresh list holding the same
    :class:`Caption <Caption>` instances as ``self.caption_tracks``.
    """
    tracks_snapshot = list(self.caption_tracks)
    return CaptionQuery(tracks_snapshot)
188
+
189
  @property
190
  def streams(self):
191
  """Interface to query both adaptive (DASH) and progressive streams."""
pytube/captions.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """This module contains a container for caption tracks."""
3
+ from pytube import request
4
+ from pytube.helpers import xml_caption_to_srt
5
+
6
+
7
class Caption:
    """Holds the metadata of one caption track and fetches its content."""

    def __init__(self, caption_track):
        """Construct a :class:`Caption <Caption>`.

        :param dict caption_track:
            Caption track data extracted from ``watch_html``. The keys
            ``name`` (with a nested ``simpleText``) and ``languageCode``
            are required; ``baseUrl`` is optional.
        """
        base_url = caption_track.get('baseUrl')
        display_name = caption_track['name']['simpleText']
        language_code = caption_track['languageCode']

        self.url = base_url
        self.name = display_name
        self.code = language_code

    @property
    def xml_captions(self):
        """Fetch the xml caption track from ``self.url``.

        A request is issued on every access; the result is not stored.
        """
        raw_xml = request.get(self.url)
        return raw_xml

    def generate_srt_captions(self):
        """Generate "SubRip Subtitle" captions.

        Reads :attr:`xml_captions` and passes the result through
        ``xml_caption_to_srt`` to obtain the "SubRip Subtitle" text.
        """
        raw_xml = self.xml_captions
        return xml_caption_to_srt(raw_xml)

    def __repr__(self):
        """Printable object representation."""
        template = '<Caption lang="{s.name}" code="{s.code}">'
        return template.format(s=self)
pytube/compat.py CHANGED
@@ -13,6 +13,12 @@ if python_version == 2:
13
  from urllib2 import unquote
14
  from urllib2 import urlopen
15
  from urlparse import parse_qsl
 
 
 
 
 
 
16
 
17
  def unicode(s):
18
  """Encode a string to utf-8."""
@@ -25,6 +31,7 @@ elif python_version == 3:
25
  from urllib.parse import unquote
26
  from urllib.parse import urlencode
27
  from urllib.request import urlopen
 
28
 
29
  def unicode(s):
30
  """No-op."""
 
13
  from urllib2 import unquote
14
  from urllib2 import urlopen
15
  from urlparse import parse_qsl
16
+ from HTMLParser import HTMLParser
17
+
18
def unescape(s):
    """Replace HTML entity references in a string with their characters.

    Python 2 counterpart of ``html.unescape``; delegates to
    ``HTMLParser().unescape``.
    """
    return HTMLParser().unescape(s)
22
 
23
  def unicode(s):
24
  """Encode a string to utf-8."""
 
31
  from urllib.parse import unquote
32
  from urllib.parse import urlencode
33
  from urllib.request import urlopen
34
+ from html import unescape
35
 
36
  def unicode(s):
37
  """No-op."""
pytube/helpers.py CHANGED
@@ -3,9 +3,13 @@
3
  from __future__ import absolute_import
4
 
5
  import logging
 
6
  import pprint
7
  import re
 
 
8
 
 
9
  from pytube.compat import unicode
10
  from pytube.exceptions import RegexMatchError
11
 
@@ -88,3 +92,50 @@ def safe_filename(s, max_length=255):
88
  regex = re.compile(pattern, re.UNICODE)
89
  filename = regex.sub('', s)
90
  return unicode(filename[:max_length].rsplit(' ', 0)[0])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from __future__ import absolute_import
4
 
5
  import logging
6
+ import math
7
  import pprint
8
  import re
9
+ import time
10
+ import xml.etree.ElementTree as ElementTree
11
 
12
+ from pytube.compat import unescape
13
  from pytube.compat import unicode
14
  from pytube.exceptions import RegexMatchError
15
 
 
92
  regex = re.compile(pattern, re.UNICODE)
93
  filename = regex.sub('', s)
94
  return unicode(filename[:max_length].rsplit(' ', 0)[0])
95
+
96
+
97
def float_to_srt_time_format(d):
    """Convert decimal durations into proper srt format.

    :param float d:
        Duration (or offset) in seconds.
    :rtype: str
    :returns:
        SubRip Subtitle (str) formatted time duration, ``HH:MM:SS,mmm``.

    >>> float_to_srt_time_format(3.89)
    '00:00:03,890'
    """
    # Work in whole milliseconds so that rounding carries correctly into
    # the seconds field (e.g. 59.9996 -> 00:01:00,000) and so that values
    # of 24 hours or more do not wrap around as they would with gmtime.
    total_ms = int(round(d * 1000))
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    seconds, milliseconds = divmod(remainder, 1000)
    return '{:02d}:{:02d}:{:02d},{:03d}'.format(
        hours, minutes, seconds, milliseconds,
    )
111
+
112
+
113
def xml_caption_to_srt(xml_captions):
    """Convert xml caption tracks to "SubRip Subtitle (srt)".

    Each child element of the document root becomes one numbered srt
    segment whose timing comes from its ``start`` and ``dur`` attributes.

    :param str xml_captions:
        XML formatted caption tracks.
    :rtype: str
    :returns:
        The srt segments separated by blank lines, with surrounding
        whitespace stripped.
    """
    segments = []
    root = ElementTree.fromstring(xml_captions)
    # Iterate the element directly; ``Element.getchildren()`` no longer
    # exists on Python 3.9+. Sequence numbers in srt are 1-based.
    for sequence_number, child in enumerate(root, 1):
        text = child.text or ''
        caption = unescape(
            text
            .replace('\n', ' ')
            .replace('  ', ' '),  # collapse doubled spaces
        )
        start = float(child.attrib['start'])
        # NOTE(review): a cue without ``dur`` is treated as zero-length
        # instead of aborting the whole conversion -- confirm this is the
        # desired fallback.
        duration = float(child.attrib.get('dur', 0))
        end = start + duration
        line = (
            '{seq}\n{start} --> {end}\n{text}\n'.format(
                seq=sequence_number,
                start=float_to_srt_time_format(start),
                end=float_to_srt_time_format(end),
                text=caption,
            )
        )
        segments.append(line)
    return '\n'.join(segments).strip()
pytube/query.py CHANGED
@@ -1,13 +1,16 @@
1
  # -*- coding: utf-8 -*-
2
- """This module provides a query interface for media streams."""
3
 
4
 
5
  class StreamQuery:
6
  """Interface for querying the available media streams."""
7
 
8
  def __init__(self, fmt_streams):
9
- """Construct a :class:`StreamQuery <StreamQuery>`."""
10
- # list of :class:`Stream <Stream>` instances.
 
 
 
11
  self.fmt_streams = fmt_streams
12
  self.itag_index = {int(s.itag): s for s in fmt_streams}
13
 
@@ -224,3 +227,33 @@ class StreamQuery:
224
  def all(self):
225
  """Get all the results represented by this query as a list."""
226
  return self.fmt_streams
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # -*- coding: utf-8 -*-
2
+ """This module provides a query interface for media streams and captions."""
3
 
4
 
5
  class StreamQuery:
6
  """Interface for querying the available media streams."""
7
 
8
  def __init__(self, fmt_streams):
9
+ """Construct a :class:`StreamQuery <StreamQuery>`.
10
+
11
+ :param list fmt_streams:
12
+ list of :class:`Stream <Stream>` instances.
13
+ """
14
  self.fmt_streams = fmt_streams
15
  self.itag_index = {int(s.itag): s for s in fmt_streams}
16
 
 
227
  def all(self):
228
  """Get all the results represented by this query as a list."""
229
  return self.fmt_streams
230
+
231
+
232
class CaptionQuery:
    """Lookup interface over the caption tracks available for a video."""

    def __init__(self, captions):
        """Construct a :class:`CaptionQuery <CaptionQuery>`.

        :param list captions:
            list of :class:`Caption <Caption>` instances.
        """
        self.captions = captions
        # Index by language code; when two captions share a code the one
        # appearing later in ``captions`` is kept.
        self.lang_code_index = dict(
            (caption.code, caption) for caption in captions
        )

    def get_by_language_code(self, lang_code):
        """Get the :class:`Caption <Caption>` for a given ``lang_code``.

        :param str lang_code:
            The code that identifies the caption language.
        :rtype: :class:`Caption <Caption>` or ``None``
        :returns:
            The :class:`Caption <Caption>` matching the given ``lang_code``
            or ``None`` if it does not exist.
        """
        match = self.lang_code_index.get(lang_code)
        return match

    def all(self):
        """Get all the results represented by this query as a list."""
        return self.captions
setup.cfg CHANGED
@@ -3,7 +3,7 @@ commit = True
3
  tag = True
4
  current_version = 7.0.9
5
  parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
6
- serialize =
7
  {major}.{minor}.{patch}
8
 
9
  [metadata]
@@ -15,6 +15,5 @@ description-file = README.md
15
 
16
  [coverage:run]
17
  source = pytube
18
- omit =
19
  pytube/compat.py
20
-
 
3
  tag = True
4
  current_version = 7.0.9
5
  parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
6
+ serialize =
7
  {major}.{minor}.{patch}
8
 
9
  [metadata]
 
15
 
16
  [coverage:run]
17
  source = pytube
18
+ omit =
19
  pytube/compat.py