Cleanup, PEP 8, finished docstrings, 100% standard library.

* A bit of tidying of some odds and ends.
* PEP 8 fixes.
* Finished the docstrings.
* Removed the ``requests`` dependency, so the module works out of the box with the standard library alone (a sketch of the new request flow follows below).
* Rewrote the filename sanitization method, also fixing a Unicode error.
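With ``requests`` gone, fetching the video info goes through ``urllib``/``urllib2`` only. A minimal sketch of that flow, mirroring the new ``_get_video_info`` (the ``video_id`` value below is just an illustrative placeholder):

    from urllib import urlencode
    from urllib2 import urlopen

    YT_BASE_URL = 'http://www.youtube.com/get_video_info'

    # Placeholder id, for illustration only.
    querystring = urlencode({
        'asv': 3,
        'el': 'detailpage',
        'hl': 'en_US',
        'video_id': 'some_video_id',
    })
    response = urlopen(YT_BASE_URL + '?' + querystring)
    content = response.read()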
youtube.py  CHANGED  (+107, -42)
@@ -21,15 +21,15 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 # OTHER DEALINGS IN THE SOFTWARE.

+from urllib import urlencode
+from urllib2 import urlopen
 from urlparse import urlparse, parse_qs
-from unicodedata import normalize

 import re
-import requests
-import urllib2

 YT_BASE_URL = 'http://www.youtube.com/get_video_info'
-
+
+# YouTube media encoding options.
 YT_ENCODING = {
     5: (5, "flv", "224p"),
     6: (6, "flv", "270p"),
@@ -56,7 +56,7 @@ class Video(object):
         Keyword arguments:
         extension -- The file extension the video should be saved as.
         resolution -- The broadcasting standard of the video.
-        url -- The url of the video. (e.g.:
+        url -- The url of the video. (e.g.: youtube.com/watch?v=..)
         filename -- The filename (minus the extension) to save the video.
         """
         self.extension = extension
@@ -65,8 +65,11 @@ class Video(object):
         self.filename = filename

     def download(self):
-        """
-
+        """
+        Downloads the file at the URL defined within the class
+        instance.
+        """
+        response = urlopen(self.url)
         #TODO: Allow a destination path to be specified.
         dst_file = open(self.filename, 'wb')
         meta_data = response.info()
@@ -99,8 +102,8 @@ class YouTube(object):
     _video_url = None
     title = None
     videos = []
-    # fmt was an undocumented URL parameter that allowed selecting
-    # quality mode without using player user interface.
+    # fmt was an undocumented URL parameter that allowed selecting
+    # YouTube quality mode without using the player user interface.

     @property
     def url(self):
@@ -109,22 +112,31 @@ class YouTube(object):

     @url.setter
     def url(self, url):
+        """Defines the URL of the YouTube video."""
         self._video_url = url
+        #Reset the filename.
+        self._filename = None
+        #Get the video details.
         self._get_video_info()

     @property
     def filename(self):
-        """
+        """
+        Exposes the title of the video. If this is not set, one is
+        generated based on the name of the video.
+        """
         if not self._filename:
             self._filename = slugify(self.title)
         return self._filename

     @filename.setter
     def filename(self, filename):
+        """Defines the filename."""
         self._filename = filename

     @property
     def video_id(self):
+        """Gets the video ID extracted from the URL."""
         parts = urlparse(self._video_url)
         qs = getattr(parts, 'query', None)
         if qs:
@@ -133,6 +145,13 @@ class YouTube(object):
                 return video_id.pop()

     def get(self, extension=None, res=None):
+        """
+        Return a single video given an extension and resolution.
+
+        Keyword arguments:
+        extension -- The desired file extension (e.g.: mp4).
+        res -- The desired broadcasting standard of the video (e.g.: 1080p).
+        """
         result = []
         for v in self.videos:
             if extension and v.extension != extension:
@@ -142,12 +161,19 @@ class YouTube(object):
             else:
                 result.append(v)
         if len(result) is 1:
-
-            return result.pop()
+            return result[0]
         else:
-            raise
+            raise Exception("Multiple videos returned")

     def filter(self, extension=None, res=None):
+        """
+        Return a filtered list of videos given extension and
+        resolution criteria.
+
+        Keyword arguments:
+        extension -- The desired file extension (e.g.: mp4).
+        res -- The desired broadcasting standard of the video (e.g.: 1080p).
+        """
         results = []
         for v in self.videos:
             if extension and v.extension != extension:
@@ -159,57 +185,79 @@ class YouTube(object):
         return results

     def _fetch(self, path, data):
+        """
+        Given a path, traverse the response for the desired data. (A
+        modified ver. of my dictionary traverse method:
+        https://gist.github.com/2009119)
+
+        Keyword arguments:
+        path -- A tuple representing a path to a node within a tree.
+        data -- The data containing the tree.
+        """
         elem = path[0]
+        #Get first element in the tuple, and check if it contains a list.
         if type(data) is list:
+            # Pop it, and let's continue..
            return self._fetch(path, data.pop())
+        #Parse the url encoded data
         data = parse_qs(data)
+        #Get the element in our path
         data = data.get(elem, None)
+        #Offset the tuple by 1.
         path = path[1::1]
+        #Check if the path has reached the end OR the element returned
+        #nothing.
         if len(path) is 0 or data is None:
             if type(data) is list and len(data) is 1:
                 data = data.pop()
             return data
         else:
+            # Nope, let's keep diggin'
             return self._fetch(path, data)

     def _get_video_info(self):
-
+        """
+        This is responsible for executing the request, extracting the
+        necessary details, and populating the different video
+        resolutions and formats into a list.
+        """
+        querystring = urlencode({
             'asv': 3,
             'el': 'detailpage',
             'hl': 'en_US',
             'video_id': self.video_id
-        }
-
-        response =
-
-
+        })
+
+        response = urlopen(YT_BASE_URL + '?' + querystring)
+        #TODO: evaluate the status code.
+        if response:
+            content = response.read()
+            #Use my cool traversing method to extract the specific
+            #attribute from the response body.
             path = ('url_encoded_fmt_stream_map', 'itag')
+            #Using the ``itag`` (otherwise referred to as ``fmt``), set the
+            #available encoding options.
             encoding_options = self._fetch(path, content)
             self.title = self._fetch(('title',), content)

             for video in encoding_options:
                 url = self._extract_url(video)
                 if not url:
+                    #Sometimes the regex for matching the video returns
+                    #a single empty element, so we'll skip those here.
                     continue

                 fmt, extension, resolution = self._extract_fmt(video)
-
-                if fmt in self._fmt_values:
-                    continue
-
                 filename = "%s.%s" % (self.filename, extension)

-                self.videos.append(
-                    Video(extension, resolution, url, filename)
-                )
-
+                self.videos.append(Video(extension, resolution, url, filename))
                 self._fmt_values.append(fmt)

     def _extract_fmt(self, text):
         """
-        YouTube does not pass you a completely valid URLencoded form,
-        this is suppose to act as a deterrent.. Nothing some
-        expressions couldn't handle.
+        YouTube does not pass you a completely valid URLencoded form,
+        I suspect this is supposed to act as a deterrent. Nothing some
+        regular expressions couldn't handle.

         Keyword arguments:
         text -- The malformed data contained within each url node.
@@ -221,9 +269,10 @@ class YouTube(object):

     def _extract_url(self, text):
         """
-        (I hate to be redundant here, but whatever) YouTube does not
-        completely valid URLencoded form, I suspect this is
-        deterrent.. Nothing some regulular
+        (I hate to be redundant here, but whatever) YouTube does not
+        pass you a completely valid URLencoded form, I suspect this is
+        supposed to act as a deterrent. Nothing some regular
+        expressions couldn't handle.

         Keyword arguments:
         text -- The malformed data contained in the itag node.
@@ -233,6 +282,28 @@ class YouTube(object):
         return url[0]


+def sanitize_filename(text):
+    """
+    Sanitizes filenames for many operating systems.
+
+    Keyword arguments:
+    text -- The unsanitized pending filename.
+    """
+    # quick truncate for handling long filenames.
+    truncate = lambda text: text[:155].rsplit(' ', 1)[0]
+
+    #NTFS forbids characters in range 0-31 (0x00-0x1F)
+    ntfs = [chr(i) for i in range(0, 31)]
+
+    # This *should* cover a wide range of legacy operating systems.
+    paranoid = ['\"', '\#', '\$', '\%', '\'', '\*', '\,', '\.', '\/', '\:',
+                '\;', '\<', '\>', '\?', '\\', '\^', '\|', '\~', '\\\\']
+
+    blacklist = re.compile('|'.join(ntfs + paranoid), re.UNICODE)
+    filename = blacklist.sub('', text)
+    return truncate(filename)
+
+
 def slugify(text):
     """
     Sanitizes the video text, generating a valid filename.
@@ -240,12 +311,6 @@ def slugify(text):
     Keyword arguments:
     text -- The text corpus to make filename safe.
     """
-
-
-
-    text = unicode(text)
-    #BUG: Fails trying to interpet non-ascii characters.
-    #UTF-8.. we get it.
-    text = normalize('NFKD', text).encode('ascii', 'ignore')
-    text = unicode(strip.sub('', text).strip())
-    return unicode(text)
+    text = sanitize_filename(text)
+    text = text.replace('_', ' ')
+    return text
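For reference, a quick usage sketch of the rewritten sanitization helpers (assuming the module is importable as ``youtube``; the title string is made up):

    from youtube import sanitize_filename, slugify

    title = 'How to: download a "video", part 1/2'  # made-up title
    print sanitize_filename(title)  # blacklisted punctuation and control characters stripped
    print slugify(title)            # same, with underscores replaced by spaces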