Merge pull request #161 from nficano/feature-caption-support
Browse files- docs/api.rst +14 -0
- pytube/__init__.py +2 -0
- pytube/__main__.py +28 -0
- pytube/captions.py +35 -0
- pytube/compat.py +7 -0
- pytube/helpers.py +51 -0
- pytube/query.py +36 -3
- setup.cfg +2 -3
docs/api.rst
CHANGED
@@ -27,6 +27,20 @@ StreamQuery Object
|
|
27 |
:members:
|
28 |
:inherited-members:
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
Extract
|
31 |
-------
|
32 |
|
|
|
27 |
:members:
|
28 |
:inherited-members:
|
29 |
|
30 |
+
Caption Object
|
31 |
+
--------------
|
32 |
+
|
33 |
+
.. autoclass:: pytube.Caption
|
34 |
+
:members:
|
35 |
+
:inherited-members:
|
36 |
+
|
37 |
+
CaptionQuery Object
|
38 |
+
-------------------
|
39 |
+
|
40 |
+
.. autoclass:: pytube.query.CaptionQuery
|
41 |
+
:members:
|
42 |
+
:inherited-members:
|
43 |
+
|
44 |
Extract
|
45 |
-------
|
46 |
|
pytube/__init__.py
CHANGED
@@ -15,8 +15,10 @@ __license__ = 'MIT License'
|
|
15 |
__copyright__ = 'Copyright 2017 Nick Ficano'
|
16 |
|
17 |
from pytube.logging import create_logger
|
|
|
18 |
from pytube.query import StreamQuery
|
19 |
from pytube.streams import Stream
|
|
|
20 |
from pytube.__main__ import YouTube
|
21 |
|
22 |
logger = create_logger()
|
|
|
15 |
__copyright__ = 'Copyright 2017 Nick Ficano'
|
16 |
|
17 |
from pytube.logging import create_logger
|
18 |
+
from pytube.query import CaptionQuery
|
19 |
from pytube.query import StreamQuery
|
20 |
from pytube.streams import Stream
|
21 |
+
from pytube.captions import Caption
|
22 |
from pytube.__main__ import YouTube
|
23 |
|
24 |
logger = create_logger()
|
pytube/__main__.py
CHANGED
@@ -12,6 +12,8 @@ from __future__ import absolute_import
|
|
12 |
import json
|
13 |
import logging
|
14 |
|
|
|
|
|
15 |
from pytube import extract
|
16 |
from pytube import mixins
|
17 |
from pytube import request
|
@@ -59,6 +61,7 @@ class YouTube(object):
|
|
59 |
self.player_config = None # inline js in the html containing streams
|
60 |
|
61 |
self.fmt_streams = [] # list of :class:`Stream <Stream>` instances
|
|
|
62 |
|
63 |
# video_id part of /watch?v=<video_id>
|
64 |
self.video_id = extract.video_id(url)
|
@@ -115,6 +118,8 @@ class YouTube(object):
|
|
115 |
# build instances of :class:`Stream <Stream>`
|
116 |
self.initialize_stream_objects(progressive_fmts)
|
117 |
self.initialize_stream_objects(adaptive_fmts)
|
|
|
|
|
118 |
logger.info('init finished successfully')
|
119 |
|
120 |
def prefetch(self):
|
@@ -158,6 +163,29 @@ class YouTube(object):
|
|
158 |
)
|
159 |
self.fmt_streams.append(video)
|
160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
161 |
@property
|
162 |
def streams(self):
|
163 |
"""Interface to query both adaptive (DASH) and progressive streams."""
|
|
|
12 |
import json
|
13 |
import logging
|
14 |
|
15 |
+
from pytube import Caption
|
16 |
+
from pytube import CaptionQuery
|
17 |
from pytube import extract
|
18 |
from pytube import mixins
|
19 |
from pytube import request
|
|
|
61 |
self.player_config = None # inline js in the html containing streams
|
62 |
|
63 |
self.fmt_streams = [] # list of :class:`Stream <Stream>` instances
|
64 |
+
self.caption_tracks = []
|
65 |
|
66 |
# video_id part of /watch?v=<video_id>
|
67 |
self.video_id = extract.video_id(url)
|
|
|
118 |
# build instances of :class:`Stream <Stream>`
|
119 |
self.initialize_stream_objects(progressive_fmts)
|
120 |
self.initialize_stream_objects(adaptive_fmts)
|
121 |
+
|
122 |
+
self.initialize_caption_objects()
|
123 |
logger.info('init finished successfully')
|
124 |
|
125 |
def prefetch(self):
|
|
|
163 |
)
|
164 |
self.fmt_streams.append(video)
|
165 |
|
166 |
+
def initialize_caption_objects(self):
|
167 |
+
"""Populate instances of :class:`Caption <Caption>`.
|
168 |
+
|
169 |
+
Take the unscrambled player response data, and use it to initialize
|
170 |
+
instances of :class:`Caption <Caption>`.
|
171 |
+
"""
|
172 |
+
if 'captions' not in self.player_config['args']['player_response']:
|
173 |
+
return
|
174 |
+
caption_tracks = (
|
175 |
+
self.player_config['args']
|
176 |
+
['player_response']
|
177 |
+
['captions']
|
178 |
+
['playerCaptionsTracklistRenderer']
|
179 |
+
['captionTracks']
|
180 |
+
)
|
181 |
+
for caption_track in caption_tracks:
|
182 |
+
self.caption_tracks.append(Caption(caption_track))
|
183 |
+
|
184 |
+
@property
|
185 |
+
def captions(self):
|
186 |
+
"""Interface to query caption tracks."""
|
187 |
+
return CaptionQuery([c for c in self.caption_tracks])
|
188 |
+
|
189 |
@property
|
190 |
def streams(self):
|
191 |
"""Interface to query both adaptive (DASH) and progressive streams."""
|
pytube/captions.py
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""This module contains a container for caption tracks."""
|
3 |
+
from pytube import request
|
4 |
+
from pytube.helpers import xml_caption_to_srt
|
5 |
+
|
6 |
+
|
7 |
+
class Caption:
|
8 |
+
"""Container for caption tracks."""
|
9 |
+
|
10 |
+
def __init__(self, caption_track):
|
11 |
+
"""Construct a :class:`Caption <Caption>`.
|
12 |
+
|
13 |
+
:param dict caption_track:
|
14 |
+
Caption track data extracted from ``watch_html``.
|
15 |
+
"""
|
16 |
+
self.url = caption_track.get('baseUrl')
|
17 |
+
self.name = caption_track['name']['simpleText']
|
18 |
+
self.code = caption_track['languageCode']
|
19 |
+
|
20 |
+
@property
|
21 |
+
def xml_captions(self):
|
22 |
+
"""Download the xml caption tracks."""
|
23 |
+
return request.get(self.url)
|
24 |
+
|
25 |
+
def generate_srt_captions(self):
|
26 |
+
"""Generate "SubRip Subtitle" captions.
|
27 |
+
|
28 |
+
Takes the xml captions from :meth:`~pytube.Caption.xml_captions` and
|
29 |
+
recompiles them into the "SubRip Subtitle" format.
|
30 |
+
"""
|
31 |
+
return xml_caption_to_srt(self.xml_captions)
|
32 |
+
|
33 |
+
def __repr__(self):
|
34 |
+
"""Printable object representation."""
|
35 |
+
return'<Caption lang="{s.name}" code="{s.code}">'.format(s=self)
|
pytube/compat.py
CHANGED
@@ -13,6 +13,12 @@ if python_version == 2:
|
|
13 |
from urllib2 import unquote
|
14 |
from urllib2 import urlopen
|
15 |
from urlparse import parse_qsl
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
def unicode(s):
|
18 |
"""Encode a string to utf-8."""
|
@@ -25,6 +31,7 @@ elif python_version == 3:
|
|
25 |
from urllib.parse import unquote
|
26 |
from urllib.parse import urlencode
|
27 |
from urllib.request import urlopen
|
|
|
28 |
|
29 |
def unicode(s):
|
30 |
"""No-op."""
|
|
|
13 |
from urllib2 import unquote
|
14 |
from urllib2 import urlopen
|
15 |
from urlparse import parse_qsl
|
16 |
+
from HTMLParser import HTMLParser
|
17 |
+
|
18 |
+
def unescape(s):
|
19 |
+
"""Convert HTML entities in a string to their characters."""
|
20 |
+
html_parser = HTMLParser()
|
21 |
+
return html_parser.unescape(s)
|
22 |
|
23 |
def unicode(s):
|
24 |
"""Encode a string to utf-8."""
|
|
|
31 |
from urllib.parse import unquote
|
32 |
from urllib.parse import urlencode
|
33 |
from urllib.request import urlopen
|
34 |
+
from html import unescape
|
35 |
|
36 |
def unicode(s):
|
37 |
"""No-op."""
|
pytube/helpers.py
CHANGED
@@ -3,9 +3,13 @@
|
|
3 |
from __future__ import absolute_import
|
4 |
|
5 |
import logging
|
|
|
6 |
import pprint
|
7 |
import re
|
|
|
|
|
8 |
|
|
|
9 |
from pytube.compat import unicode
|
10 |
from pytube.exceptions import RegexMatchError
|
11 |
|
@@ -88,3 +92,50 @@ def safe_filename(s, max_length=255):
|
|
88 |
regex = re.compile(pattern, re.UNICODE)
|
89 |
filename = regex.sub('', s)
|
90 |
return unicode(filename[:max_length].rsplit(' ', 0)[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
from __future__ import absolute_import
|
4 |
|
5 |
import logging
|
6 |
+
import math
|
7 |
import pprint
|
8 |
import re
|
9 |
+
import time
|
10 |
+
import xml.etree.ElementTree as ElementTree
|
11 |
|
12 |
+
from pytube.compat import unescape
|
13 |
from pytube.compat import unicode
|
14 |
from pytube.exceptions import RegexMatchError
|
15 |
|
|
|
92 |
regex = re.compile(pattern, re.UNICODE)
|
93 |
filename = regex.sub('', s)
|
94 |
return unicode(filename[:max_length].rsplit(' ', 0)[0])
|
95 |
+
|
96 |
+
|
97 |
+
def float_to_srt_time_format(d):
|
98 |
+
"""Convert decimal durations into proper srt format.
|
99 |
+
|
100 |
+
:rtype: str
|
101 |
+
:returns:
|
102 |
+
SubRip Subtitle (str) formatted time duration.
|
103 |
+
|
104 |
+
>>> float_to_srt_time_format(3.89)
|
105 |
+
'00:00:03,890'
|
106 |
+
"""
|
107 |
+
frac, whole = math.modf(d)
|
108 |
+
time_fmt = time.strftime('0%H:0%M:%S,', time.gmtime(whole))
|
109 |
+
ms = '{:.3f}'.format(frac).replace('0.', '')
|
110 |
+
return time_fmt + ms
|
111 |
+
|
112 |
+
|
113 |
+
def xml_caption_to_srt(xml_captions):
|
114 |
+
"""Convert xml caption tracks to "SubRip Subtitle (srt)".
|
115 |
+
|
116 |
+
:param str xml_captions:
|
117 |
+
XML formatted caption tracks.
|
118 |
+
"""
|
119 |
+
segments = []
|
120 |
+
root = ElementTree.fromstring(xml_captions)
|
121 |
+
for i, child in enumerate(root.getchildren()):
|
122 |
+
text = child.text or ''
|
123 |
+
caption = unescape(
|
124 |
+
text
|
125 |
+
.replace('\n', ' ')
|
126 |
+
.replace('  ', ' '),
|
127 |
+
)
|
128 |
+
duration = float(child.attrib['dur'])
|
129 |
+
start = float(child.attrib['start'])
|
130 |
+
end = start + duration
|
131 |
+
sequence_number = i + 1 # convert from 0-indexed to 1.
|
132 |
+
line = (
|
133 |
+
'{seq}\n{start} --> {end}\n{text}\n'.format(
|
134 |
+
seq=sequence_number,
|
135 |
+
start=float_to_srt_time_format(start),
|
136 |
+
end=float_to_srt_time_format(end),
|
137 |
+
text=caption,
|
138 |
+
)
|
139 |
+
)
|
140 |
+
segments.append(line)
|
141 |
+
return '\n'.join(segments).strip()
|
pytube/query.py
CHANGED
@@ -1,13 +1,16 @@
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
-
"""This module provides a query interface for media streams."""
|
3 |
|
4 |
|
5 |
class StreamQuery:
|
6 |
"""Interface for querying the available media streams."""
|
7 |
|
8 |
def __init__(self, fmt_streams):
|
9 |
-
"""Construct a :class:`StreamQuery <StreamQuery>`.
|
10 |
-
|
|
|
|
|
|
|
11 |
self.fmt_streams = fmt_streams
|
12 |
self.itag_index = {int(s.itag): s for s in fmt_streams}
|
13 |
|
@@ -224,3 +227,33 @@ class StreamQuery:
|
|
224 |
def all(self):
|
225 |
"""Get all the results represented by this query as a list."""
|
226 |
return self.fmt_streams
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
# -*- coding: utf-8 -*-
|
2 |
+
"""This module provides a query interface for media streams and captions."""
|
3 |
|
4 |
|
5 |
class StreamQuery:
|
6 |
"""Interface for querying the available media streams."""
|
7 |
|
8 |
def __init__(self, fmt_streams):
|
9 |
+
"""Construct a :class:`StreamQuery <StreamQuery>`.
|
10 |
+
|
11 |
+
:param list fmt_streams:
|
12 |
+
list of :class:`Stream <Stream>` instances.
|
13 |
+
"""
|
14 |
self.fmt_streams = fmt_streams
|
15 |
self.itag_index = {int(s.itag): s for s in fmt_streams}
|
16 |
|
|
|
227 |
def all(self):
|
228 |
"""Get all the results represented by this query as a list."""
|
229 |
return self.fmt_streams
|
230 |
+
|
231 |
+
|
232 |
+
class CaptionQuery:
|
233 |
+
"""Interface for querying the available captions."""
|
234 |
+
|
235 |
+
def __init__(self, captions):
|
236 |
+
"""Construct a :class:`CaptionQuery <CaptionQuery>`.
|
237 |
+
|
238 |
+
:param list captions:
|
239 |
+
list of :class:`Caption <Caption>` instances.
|
240 |
+
|
241 |
+
"""
|
242 |
+
self.captions = captions
|
243 |
+
self.lang_code_index = {c.code: c for c in captions}
|
244 |
+
|
245 |
+
def get_by_language_code(self, lang_code):
|
246 |
+
"""Get the :class:`Caption <Caption>` for a given ``lang_code``.
|
247 |
+
|
248 |
+
:param str lang_code:
|
249 |
+
The code that identifies the caption language.
|
250 |
+
:rtype: :class:`Caption <Caption>` or ``None``
|
251 |
+
:returns:
|
252 |
+
The :class:`Caption <Caption>` matching the given ``lang_code`` or
|
253 |
+
``None`` if it does not exist.
|
254 |
+
"""
|
255 |
+
return self.lang_code_index.get(lang_code)
|
256 |
+
|
257 |
+
def all(self):
|
258 |
+
"""Get all the results represented by this query as a list."""
|
259 |
+
return self.captions
|
setup.cfg
CHANGED
@@ -3,7 +3,7 @@ commit = True
|
|
3 |
tag = True
|
4 |
current_version = 7.0.9
|
5 |
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
|
6 |
-
serialize =
|
7 |
{major}.{minor}.{patch}
|
8 |
|
9 |
[metadata]
|
@@ -15,6 +15,5 @@ description-file = README.md
|
|
15 |
|
16 |
[coverage:run]
|
17 |
source = pytube
|
18 |
-
omit =
|
19 |
pytube/compat.py
|
20 |
-
|
|
|
3 |
tag = True
|
4 |
current_version = 7.0.9
|
5 |
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+))?
|
6 |
+
serialize =
|
7 |
{major}.{minor}.{patch}
|
8 |
|
9 |
[metadata]
|
|
|
15 |
|
16 |
[coverage:run]
|
17 |
source = pytube
|
18 |
+
omit =
|
19 |
pytube/compat.py
|
|