Cleanup, PEP 8, finished docstrings, 100% standard library.

* A bit of tidying of some odds and ends.
* PEP 8 fixes.
* Finished the docstrings.
* Removed the ``requests`` dependency, so the module works out of the box with the standard library alone (a sketch of the new request flow follows below).
* Rewrote the filename sanitization method, also fixing a Unicode error.
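With ``requests`` gone, fetching the video info goes through ``urllib``/``urllib2`` only. A minimal sketch of that flow, mirroring the new ``_get_video_info`` (the ``video_id`` value below is just an illustrative placeholder):

    from urllib import urlencode
    from urllib2 import urlopen

    YT_BASE_URL = 'http://www.youtube.com/get_video_info'

    # Placeholder id, for illustration only.
    querystring = urlencode({
        'asv': 3,
        'el': 'detailpage',
        'hl': 'en_US',
        'video_id': 'some_video_id',
    })
    response = urlopen(YT_BASE_URL + '?' + querystring)
    content = response.read()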
youtube.py  CHANGED  (+107, -42)
@@ -21,15 +21,15 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 # OTHER DEALINGS IN THE SOFTWARE.

+from urllib import urlencode
+from urllib2 import urlopen
 from urlparse import urlparse, parse_qs
-from unicodedata import normalize

 import re
-import requests
-import urllib2

 YT_BASE_URL = 'http://www.youtube.com/get_video_info'
-
+
+# YouTube media encoding options.
 YT_ENCODING = {
     5: (5, "flv", "224p"),
     6: (6, "flv", "270p"),
@@ -56,7 +56,7 @@ class Video(object):
         Keyword arguments:
         extension -- The file extension the video should be saved as.
         resolution -- The broadcasting standard of the video.
-        url -- The url of the video. (e.g.:
+        url -- The url of the video. (e.g.: youtube.com/watch?v=..)
         filename -- The filename (minus the extension) to save the video.
         """
         self.extension = extension
@@ -65,8 +65,11 @@ class Video(object):
         self.filename = filename

     def download(self):
-        """
-
+        """
+        Downloads the file at the URL defined within the class
+        instance.
+        """
+        response = urlopen(self.url)
         #TODO: Allow a destination path to be specified.
         dst_file = open(self.filename, 'wb')
         meta_data = response.info()
@@ -99,8 +102,8 @@ class YouTube(object):
     _video_url = None
     title = None
     videos = []
-    # fmt was an undocumented URL parameter that allowed selecting
-    # quality mode without using player user interface.
+    # fmt was an undocumented URL parameter that allowed selecting
+    # YouTube quality mode without using the player user interface.

     @property
     def url(self):
@@ -109,22 +112,31 @@ class YouTube(object):

     @url.setter
     def url(self, url):
+        """Defines the URL of the YouTube video."""
         self._video_url = url
+        #Reset the filename.
+        self._filename = None
+        #Get the video details.
         self._get_video_info()

     @property
     def filename(self):
-        """
+        """
+        Exposes the title of the video. If this is not set, one is
+        generated based on the name of the video.
+        """
         if not self._filename:
             self._filename = slugify(self.title)
         return self._filename

     @filename.setter
     def filename(self, filename):
+        """Defines the filename."""
         self._filename = filename

     @property
     def video_id(self):
+        """Gets the video ID extracted from the URL."""
         parts = urlparse(self._video_url)
         qs = getattr(parts, 'query', None)
         if qs:
@@ -133,6 +145,13 @@ class YouTube(object):
                 return video_id.pop()

     def get(self, extension=None, res=None):
+        """
+        Return a single video given an extension and resolution.
+
+        Keyword arguments:
+        extension -- The desired file extension (e.g.: mp4).
+        res -- The desired broadcasting standard of the video (e.g.: 1080p).
+        """
         result = []
         for v in self.videos:
             if extension and v.extension != extension:
@@ -142,12 +161,19 @@ class YouTube(object):
             else:
                 result.append(v)
         if len(result) is 1:
-
-            return result.pop()
+            return result[0]
         else:
-            raise
+            raise Exception("Multiple videos returned")

     def filter(self, extension=None, res=None):
+        """
+        Return a filtered list of videos given extension and
+        resolution criteria.
+
+        Keyword arguments:
+        extension -- The desired file extension (e.g.: mp4).
+        res -- The desired broadcasting standard of the video (e.g.: 1080p).
+        """
         results = []
         for v in self.videos:
             if extension and v.extension != extension:
@@ -159,57 +185,79 @@ class YouTube(object):
         return results

     def _fetch(self, path, data):
+        """
+        Given a path, traverse the response for the desired data. (A
+        modified ver. of my dictionary traverse method:
+        https://gist.github.com/2009119)
+
+        Keyword arguments:
+        path -- A tuple representing a path to a node within a tree.
+        data -- The data containing the tree.
+        """
         elem = path[0]
+        #Get first element in the tuple, and check if it contains a list.
         if type(data) is list:
+            # Pop it, and let's continue..
            return self._fetch(path, data.pop())
+        #Parse the url encoded data
         data = parse_qs(data)
+        #Get the element in our path
         data = data.get(elem, None)
+        #Offset the tuple by 1.
         path = path[1::1]
+        #Check if the path has reached the end OR the element returned
+        #nothing.
         if len(path) is 0 or data is None:
             if type(data) is list and len(data) is 1:
                 data = data.pop()
             return data
         else:
+            # Nope, let's keep diggin'
             return self._fetch(path, data)

     def _get_video_info(self):
-
+        """
+        This is responsible for executing the request, extracting the
+        necessary details, and populating the different video
+        resolutions and formats into a list.
+        """
+        querystring = urlencode({
             'asv': 3,
             'el': 'detailpage',
             'hl': 'en_US',
             'video_id': self.video_id
-        }
-
-        response =
-
-
+        })
+
+        response = urlopen(YT_BASE_URL + '?' + querystring)
+        #TODO: evaluate the status code.
+        if response:
+            content = response.read()
+            #Use my cool traversing method to extract the specific
+            #attribute from the response body.
             path = ('url_encoded_fmt_stream_map', 'itag')
+            #Using the ``itag`` (otherwise referred to as ``fmt``), set the
+            #available encoding options.
             encoding_options = self._fetch(path, content)
             self.title = self._fetch(('title',), content)

             for video in encoding_options:
                 url = self._extract_url(video)
                 if not url:
+                    #Sometimes the regex for matching the video returns
+                    #a single empty element, so we'll skip those here.
                     continue

                 fmt, extension, resolution = self._extract_fmt(video)
-
-                if fmt in self._fmt_values:
-                    continue
-
                 filename = "%s.%s" % (self.filename, extension)

-                self.videos.append(
-                    Video(extension, resolution, url, filename)
-                )
-
+                self.videos.append(Video(extension, resolution, url, filename))
                 self._fmt_values.append(fmt)

     def _extract_fmt(self, text):
         """
-        YouTube does not pass you a completely valid URLencoded form,
-        this is suppose to act as a deterrent.. Nothing some
-        expressions couldn't handle.
+        YouTube does not pass you a completely valid URLencoded form,
+        I suspect this is supposed to act as a deterrent. Nothing some
+        regular expressions couldn't handle.

         Keyword arguments:
         text -- The malformed data contained within each url node.
@@ -221,9 +269,10 @@ class YouTube(object):

     def _extract_url(self, text):
         """
-        (I hate to be redundant here, but whatever) YouTube does not
-        completely valid URLencoded form, I suspect this is
-        deterrent.. Nothing some regulular
+        (I hate to be redundant here, but whatever) YouTube does not
+        pass you a completely valid URLencoded form, I suspect this is
+        supposed to act as a deterrent. Nothing some regular
+        expressions couldn't handle.

         Keyword arguments:
         text -- The malformed data contained in the itag node.
@@ -233,6 +282,28 @@ class YouTube(object):
         return url[0]


+def sanitize_filename(text):
+    """
+    Sanitizes filenames for many operating systems.
+
+    Keyword arguments:
+    text -- The unsanitized pending filename.
+    """
+    # quick truncate for handling long filenames.
+    truncate = lambda text: text[:155].rsplit(' ', 1)[0]
+
+    #NTFS forbids characters in range 0-31 (0x00-0x1F)
+    ntfs = [chr(i) for i in range(0, 31)]
+
+    # This *should* cover a wide range of legacy operating systems.
+    paranoid = ['\"', '\#', '\$', '\%', '\'', '\*', '\,', '\.', '\/', '\:',
+                '\;', '\<', '\>', '\?', '\\', '\^', '\|', '\~', '\\\\']
+
+    blacklist = re.compile('|'.join(ntfs + paranoid), re.UNICODE)
+    filename = blacklist.sub('', text)
+    return truncate(filename)
+
+
 def slugify(text):
     """
     Sanitizes the video text, generating a valid filename.
@@ -240,12 +311,6 @@ def slugify(text):
     Keyword arguments:
     text -- The text corpus to make filename safe.
     """
-
-
-
-    text = unicode(text)
-    #BUG: Fails trying to interpet non-ascii characters.
-    #UTF-8.. we get it.
-    text = normalize('NFKD', text).encode('ascii', 'ignore')
-    text = unicode(strip.sub('', text).strip())
-    return unicode(text)
+    text = sanitize_filename(text)
+    text = text.replace('_', ' ')
+    return text
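For reference, a quick usage sketch of the rewritten sanitization helpers (assuming the module is importable as ``youtube``; the title string is made up):

    from youtube import sanitize_filename, slugify

    title = 'How to: download a "video", part 1/2'  # made-up title
    print sanitize_filename(title)  # blacklisted punctuation and control characters stripped
    print slugify(title)            # same, with underscores replaced by spaces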