nficano committed on
Commit
76e5d62
·
1 Parent(s): 570af13

Cleanup, Pep8, finished docstring, 100% std lib.

Browse files

* A bit of tidying some odds and ends.
* Pep8
* Finished docstrings
* Removed ``requests`` dependency making it compatible out of the box
* Rewrote filename sanitization method, also fixing unicode error.

Files changed (1) hide show
  1. youtube.py +107 -42
youtube.py CHANGED
@@ -21,15 +21,15 @@
21
  # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
  # OTHER DEALINGS IN THE SOFTWARE.
23
 
 
 
24
  from urlparse import urlparse, parse_qs
25
- from unicodedata import normalize
26
 
27
  import re
28
- import requests
29
- import urllib2
30
 
31
  YT_BASE_URL = 'http://www.youtube.com/get_video_info'
32
- # YouTube media encoding options
 
33
  YT_ENCODING = {
34
  5: (5, "flv", "224p"),
35
  6: (6, "flv", "270p"),
@@ -56,7 +56,7 @@ class Video(object):
56
  Keyword arguments:
57
  extension -- The file extension the video should be saved as.
58
  resolution -- The broadcasting standard of the video.
59
- url -- The url of the video. (e.g.: http://www.youtube.com/watch?v=..)
60
  filename -- The filename (minus the extension) to save the video.
61
  """
62
  self.extension = extension
@@ -65,8 +65,11 @@ class Video(object):
65
  self.filename = filename
66
 
67
  def download(self):
68
- """Downloads the file of the URL defined within the class instance."""
69
- response = urllib2.urlopen(self.url)
 
 
 
70
  #TODO: Allow a destination path to be specified.
71
  dst_file = open(self.filename, 'wb')
72
  meta_data = response.info()
@@ -99,8 +102,8 @@ class YouTube(object):
99
  _video_url = None
100
  title = None
101
  videos = []
102
- # fmt was an undocumented URL parameter that allowed selecting YouTube
103
- # quality mode without using player user interface.
104
 
105
  @property
106
  def url(self):
@@ -109,22 +112,31 @@ class YouTube(object):
109
 
110
  @url.setter
111
  def url(self, url):
 
112
  self._video_url = url
 
 
 
113
  self._get_video_info()
114
 
115
  @property
116
  def filename(self):
117
- """Exposes the title of the video."""
 
 
 
118
  if not self._filename:
119
  self._filename = slugify(self.title)
120
  return self._filename
121
 
122
  @filename.setter
123
  def filename(self, filename):
 
124
  self._filename = filename
125
 
126
  @property
127
  def video_id(self):
 
128
  parts = urlparse(self._video_url)
129
  qs = getattr(parts, 'query', None)
130
  if qs:
@@ -133,6 +145,13 @@ class YouTube(object):
133
  return video_id.pop()
134
 
135
  def get(self, extension=None, res=None):
 
 
 
 
 
 
 
136
  result = []
137
  for v in self.videos:
138
  if extension and v.extension != extension:
@@ -142,12 +161,19 @@ class YouTube(object):
142
  else:
143
  result.append(v)
144
  if len(result) is 1:
145
- #BUG: This fucks up occasionally..
146
- return result.pop()
147
  else:
148
- raise
149
 
150
  def filter(self, extension=None, res=None):
 
 
 
 
 
 
 
 
151
  results = []
152
  for v in self.videos:
153
  if extension and v.extension != extension:
@@ -159,57 +185,79 @@ class YouTube(object):
159
  return results
160
 
161
  def _fetch(self, path, data):
 
 
 
 
 
 
 
 
 
162
  elem = path[0]
 
163
  if type(data) is list:
 
164
  return self._fetch(path, data.pop())
 
165
  data = parse_qs(data)
 
166
  data = data.get(elem, None)
 
167
  path = path[1::1]
 
 
168
  if len(path) is 0 or data is None:
169
  if type(data) is list and len(data) is 1:
170
  data = data.pop()
171
  return data
172
  else:
 
173
  return self._fetch(path, data)
174
 
175
  def _get_video_info(self):
176
- querystring = {
 
 
 
 
 
177
  'asv': 3,
178
  'el': 'detailpage',
179
  'hl': 'en_US',
180
  'video_id': self.video_id
181
- }
182
-
183
- response = requests.get(YT_BASE_URL, params=querystring)
184
- if response.ok:
185
- content = response.content
 
 
 
186
  path = ('url_encoded_fmt_stream_map', 'itag')
 
 
187
  encoding_options = self._fetch(path, content)
188
  self.title = self._fetch(('title',), content)
189
 
190
  for video in encoding_options:
191
  url = self._extract_url(video)
192
  if not url:
 
 
193
  continue
194
 
195
  fmt, extension, resolution = self._extract_fmt(video)
196
-
197
- if fmt in self._fmt_values:
198
- continue
199
-
200
  filename = "%s.%s" % (self.filename, extension)
201
 
202
- self.videos.append(
203
- Video(extension, resolution, url, filename)
204
- )
205
-
206
  self._fmt_values.append(fmt)
207
 
208
  def _extract_fmt(self, text):
209
  """
210
- YouTube does not pass you a completely valid URLencoded form, I suspect
211
- this is suppose to act as a deterrent.. Nothing some regulular
212
- expressions couldn't handle.
213
 
214
  Keyword arguments:
215
  text -- The malformed data contained within each url node.
@@ -221,9 +269,10 @@ class YouTube(object):
221
 
222
  def _extract_url(self, text):
223
  """
224
- (I hate to be redundant here, but whatever) YouTube does not pass you a
225
- completely valid URLencoded form, I suspect this is suppose to act as a
226
- deterrent.. Nothing some regulular expressions couldn't handle.
 
227
 
228
  Keyword arguments:
229
  text -- The malformed data contained in the itag node.
@@ -233,6 +282,28 @@ class YouTube(object):
233
  return url[0]
234
 
235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
  def slugify(text):
237
  """
238
  Sanitizes the video text, generating a valid filename.
@@ -240,12 +311,6 @@ def slugify(text):
240
  Keyword arguments:
241
  text -- The text corpus to make file name safe.
242
  """
243
- strip = re.compile(r'[^\w\s-]')
244
-
245
- if not isinstance(text, unicode):
246
- text = unicode(text)
247
- #BUG: Fails trying to interpet non-ascii characters.
248
- #UTF-8.. we get it.
249
- text = normalize('NFKD', text).encode('ascii', 'ignore')
250
- text = unicode(strip.sub('', text).strip())
251
- return unicode(text)
 
21
  # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22
  # OTHER DEALINGS IN THE SOFTWARE.
23
 
24
+ from urllib import urlencode
25
+ from urllib2 import urlopen
26
  from urlparse import urlparse, parse_qs
 
27
 
28
  import re
 
 
29
 
30
  YT_BASE_URL = 'http://www.youtube.com/get_video_info'
31
+
32
+ # YouTube media encoding options.
33
  YT_ENCODING = {
34
  5: (5, "flv", "224p"),
35
  6: (6, "flv", "270p"),
 
56
  Keyword arguments:
57
  extension -- The file extension the video should be saved as.
58
  resolution -- The broadcasting standard of the video.
59
+ url -- The url of the video. (e.g.: youtube.com/watch?v=..)
60
  filename -- The filename (minus the extension) to save the video.
61
  """
62
  self.extension = extension
 
65
  self.filename = filename
66
 
67
  def download(self):
68
+ """
69
+ Downloads the file of the URL defined within the class
70
+ instance.
71
+ """
72
+ response = urlopen(self.url)
73
  #TODO: Allow a destination path to be specified.
74
  dst_file = open(self.filename, 'wb')
75
  meta_data = response.info()
 
102
  _video_url = None
103
  title = None
104
  videos = []
105
+ # fmt was an undocumented URL parameter that allowed selecting
106
+ # YouTube quality mode without using player user interface.
107
 
108
  @property
109
  def url(self):
 
112
 
113
  @url.setter
114
  def url(self, url):
115
+ """ Defines the URL of the YouTube video."""
116
  self._video_url = url
117
+ #Reset the filename.
118
+ self._filename = None
119
+ #Get the video details.
120
  self._get_video_info()
121
 
122
  @property
123
  def filename(self):
124
+ """
125
+ Exposes the title of the video. If this is not set, one is
126
+ generated based on the name of the video.
127
+ """
128
  if not self._filename:
129
  self._filename = slugify(self.title)
130
  return self._filename
131
 
132
  @filename.setter
133
  def filename(self, filename):
134
+ """ Defines the filename."""
135
  self._filename = filename
136
 
137
  @property
138
  def video_id(self):
139
+ """Gets the video ID extracted from the URL."""
140
  parts = urlparse(self._video_url)
141
  qs = getattr(parts, 'query', None)
142
  if qs:
 
145
  return video_id.pop()
146
 
147
  def get(self, extension=None, res=None):
148
+ """
149
+ Return a single video given an extension and resolution.
150
+
151
+ Keyword arguments:
152
+ extension -- The desired file extension (e.g.: mp4).
153
+ res -- The desired broadcasting standard of the video (e.g.: 1080p).
154
+ """
155
  result = []
156
  for v in self.videos:
157
  if extension and v.extension != extension:
 
161
  else:
162
  result.append(v)
163
  if len(result) is 1:
164
+ return result[0]
 
165
  else:
166
+ raise Exception("Multiple videos returned")
167
 
168
  def filter(self, extension=None, res=None):
169
+ """
170
+ Return a filtered list of videos given an extension and
171
+ resolution criteria.
172
+
173
+ Keyword arguments:
174
+ extension -- The desired file extension (e.g.: mp4).
175
+ res -- The desired broadcasting standard of the video (e.g.: 1080p).
176
+ """
177
  results = []
178
  for v in self.videos:
179
  if extension and v.extension != extension:
 
185
  return results
186
 
187
  def _fetch(self, path, data):
188
+ """
189
+ Given a path, traverse the response for the desired data. (A
190
+ modified ver. of my dictionary traverse method:
191
+ https://gist.github.com/2009119)
192
+
193
+ Keyword arguments:
194
+ path -- A tuple representing a path to a node within a tree.
195
+ data -- The data containing the tree.
196
+ """
197
  elem = path[0]
198
+ #Get first element in tuple, and check if it contains a list.
199
  if type(data) is list:
200
+ # Pop it, and let's continue..
201
  return self._fetch(path, data.pop())
202
+ #Parse the url encoded data
203
  data = parse_qs(data)
204
+ #Get the element in our path
205
  data = data.get(elem, None)
206
+ #Offset the tuple by 1.
207
  path = path[1::1]
208
+ #Check if the path has reached the end OR the element returned
209
+ #nothing.
210
  if len(path) is 0 or data is None:
211
  if type(data) is list and len(data) is 1:
212
  data = data.pop()
213
  return data
214
  else:
215
+ # Nope, let's keep diggin'
216
  return self._fetch(path, data)
217
 
218
  def _get_video_info(self):
219
+ """
220
+ This is responsible for executing the request, extracting the
221
+ necessary details, and populating the different video
222
+ resolutions and formats into a list.
223
+ """
224
+ querystring = urlencode({
225
  'asv': 3,
226
  'el': 'detailpage',
227
  'hl': 'en_US',
228
  'video_id': self.video_id
229
+ })
230
+
231
+ response = urlopen(YT_BASE_URL + '?' + querystring)
232
+ #TODO: evaluate the status code.
233
+ if response:
234
+ content = response.read()
235
+ #Use my cool traversing method to extract the specific
236
+ #attribute from the response body.
237
  path = ('url_encoded_fmt_stream_map', 'itag')
238
+ #Using the ``itag`` (otherwise referred to as ``fmt``), set the
239
+ #available encoding options.
240
  encoding_options = self._fetch(path, content)
241
  self.title = self._fetch(('title',), content)
242
 
243
  for video in encoding_options:
244
  url = self._extract_url(video)
245
  if not url:
246
+ #Sometimes the regex for matching the video returns
247
+ #a single empty element, so we'll skip those here.
248
  continue
249
 
250
  fmt, extension, resolution = self._extract_fmt(video)
 
 
 
 
251
  filename = "%s.%s" % (self.filename, extension)
252
 
253
+ self.videos.append(Video(extension, resolution, url, filename))
 
 
 
254
  self._fmt_values.append(fmt)
255
 
256
  def _extract_fmt(self, text):
257
  """
258
+ YouTube does not pass you a completely valid URLencoded form,
259
+ I suspect this is supposed to act as a deterrent.. Nothing some
260
+ regular expressions couldn't handle.
261
 
262
  Keyword arguments:
263
  text -- The malformed data contained within each url node.
 
269
 
270
  def _extract_url(self, text):
271
  """
272
+ (I hate to be redundant here, but whatever) YouTube does not
273
+ pass you a completely valid URLencoded form, I suspect this is
274
+ supposed to act as a deterrent.. Nothing some regular
275
+ expressions couldn't handle.
276
 
277
  Keyword arguments:
278
  text -- The malformed data contained in the itag node.
 
282
  return url[0]
283
 
284
 
285
+ def sanitize_filename(text):
286
+ """
287
+ Sanitizes filenames for many operating systems.
288
+
289
+ Keyword arguments:
290
+ text -- The unsanitized pending filename.
291
+ """
292
+ # quick truncate for handling long filenames.
293
+ truncate = lambda text: text[:155].rsplit(' ', 1)[0]
294
+
295
+ #NTFS forbids characters in range 0-31 (0x00-0x1F)
296
+ ntfs = [chr(i) for i in range(0, 31)]
297
+
298
+ # This *should* cover a wide range of legacy operating systems.
299
+ paranoid = ['\"', '\#', '\$', '\%', '\'', '\*', '\,', '\.', '\/', '\:',
300
+ '\;', '\<', '\>', '\?', '\\', '\^', '\|', '\~', '\\\\']
301
+
302
+ blacklist = re.compile('|'.join(ntfs + paranoid), re.UNICODE)
303
+ filename = blacklist.sub('', text)
304
+ return truncate(filename)
305
+
306
+
307
  def slugify(text):
308
  """
309
  Sanitizes the video text, generating a valid filename.
 
311
  Keyword arguments:
312
  text -- The text corpus to make file name safe.
313
  """
314
+ text = sanitize_filename(text)
315
+ text = text.replace('_', ' ')
316
+ return text