# pytube / youtube /
# nficano's picture
# General housekeeping, no code modified.
# history blame
# 9.45 kB
from urllib import urlencode
from urllib2 import urlopen
from urlparse import urlparse, parse_qs
import re
# Endpoint queried by YouTube._get_video_info for a video's metadata.
# NOTE(review): the definition of YT_BASE_URL was missing from this copy of
# the file although the name is used below; the value here is assumed to be
# the get_video_info endpoint that the request parameters target -- confirm.
YT_BASE_URL = 'http://www.youtube.com/get_video_info'

# YouTube media encoding options, keyed by the numeric ``itag`` found in the
# stream data. Each value is an (itag, file extension, resolution) tuple.
YT_ENCODING = {
    5: (5, "flv", "224p"),
    6: (6, "flv", "270p"),
    34: (34, "flv", "360p"),
    35: (35, "flv", "480p"),
    18: (18, "mp4", "360p"),
    22: (22, "mp4", "720p"),
    37: (37, "mp4", "1080p"),
    43: (43, "webm", "360p"),
    44: (44, "webm", "480p"),
    45: (45, "webm", "720p"),
    46: (46, "webm", "1080p"),
}
class Video(object):
    """Class representation of a single instance of a YouTube video."""

    def __init__(self, extension, resolution, url, filename):
        """Define the variables required to declare a new video.

        Keyword arguments:
        extension -- The file extension the video should be saved as.
        resolution -- The broadcasting standard of the video.
        url -- The url the video is downloaded from.
        filename -- The filename the video is saved to.
        """
        self.extension = extension
        self.resolution = resolution
        self.url = url
        self.filename = filename

    def download(self):
        """Download ``self.url`` to ``self.filename``, printing progress.

        The body is streamed in fixed-size chunks so the whole file is
        never held in memory. Both the response and the destination file
        are closed even if the transfer fails part-way.
        """
        # Imported locally so the method does not depend on a module-level
        # import being added; writing to sys.stdout also works unchanged on
        # Python 2 and Python 3, unlike the print statement.
        import sys

        response = urlopen(self.url)
        try:
            # A missing Content-Length header is treated as "size unknown"
            # (0) instead of raising.
            length = response.info().get("Content-Length")
            file_size = int(length) if length else 0
            sys.stdout.write("Downloading: %s Bytes: %s\n"
                             % (self.filename, file_size))

            bytes_received = 0
            chunk_size = 8192
            #TODO: Allow a destination path to be specified.
            with open(self.filename, 'wb') as dst_file:
                while True:
                    chunk = response.read(chunk_size)
                    if not chunk:
                        # An empty read marks the end of the stream.
                        break
                    dst_file.write(chunk)
                    bytes_received += len(chunk)
                    # Guard the division: the size may be unknown.
                    if file_size:
                        percent = bytes_received * 100. / file_size
                    else:
                        percent = 0.
                    status = r"%10d [%3.2f%%]" % (bytes_received, percent)
                    # Trailing backspaces move the cursor back so the next
                    # status overwrites this one on the same line.
                    status = status + chr(8) * (len(status) + 1)
                    sys.stdout.write(status)
                    sys.stdout.flush()
        finally:
            response.close()

    def __repr__(self):
        """A cleaner representation of the class instance."""
        return "<Video: %s - %s>" % (self.extension, self.resolution)
class YouTube(object):
    """Look up a YouTube video and collect its available encodings.

    Assigning ``url`` requests the video's details and fills ``videos``
    with one ``Video`` object per recognised encoding.
    """
    _filename = None
    _fmt_values = []
    _video_url = None
    title = None
    # Class-level default only; _get_video_info rebinds a fresh list on the
    # instance so results are never shared between YouTube objects.
    videos = []
    # fmt was an undocumented URL parameter that allowed selecting
    # YouTube quality mode without using player user interface.

    @property
    def url(self):
        """Exposes the video url."""
        return self._video_url

    @url.setter
    def url(self, url):
        """Defines the URL of the YouTube video and loads its details."""
        self._video_url = url
        #Reset the filename.
        self._filename = None
        #Get the video details.
        self._get_video_info()

    @property
    def filename(self):
        """Exposes the title of the video. If this is not set, one is
        generated based on the name of the video.
        """
        if not self._filename:
            self._filename = safe_filename(self.title)
        return self._filename

    @filename.setter
    def filename(self, filename):
        """Defines the filename."""
        self._filename = filename

    @property
    def video_id(self):
        """Gets the video ID extracted from the URL.

        Returns the value of the ``v`` query parameter, or None when the
        URL has no query string or no ``v`` parameter.
        """
        parts = urlparse(self._video_url)
        qs = getattr(parts, 'query', None)
        if qs:
            video_id = parse_qs(qs).get('v', None)
            if video_id:
                return video_id.pop()
        return None

    def get(self, extension=None, res=None):
        """Return a single video given an extension and resolution.

        Keyword arguments:
        extension -- The desired file extension (e.g.: mp4).
        res -- The desired broadcasting standard of the video (e.g.: 1080p).

        Raises Exception when the criteria match no video or more than one.
        """
        result = self.filter(extension, res)
        if len(result) == 1:
            return result[0]
        if not result:
            # Previously reported as "Multiple videos returned", which was
            # misleading when nothing matched at all.
            raise Exception("No videos matched the given criteria")
        raise Exception("Multiple videos returned")

    def filter(self, extension=None, res=None):
        """Return a filtered list of videos given an extension and
        resolution criteria.

        Keyword arguments:
        extension -- The desired file extension (e.g.: mp4).
        res -- The desired broadcasting standard of the video (e.g.: 1080p).
        """
        results = []
        for v in self.videos:
            if extension and v.extension != extension:
                continue
            if res and v.resolution != res:
                continue
            results.append(v)
        return results

    def _fetch(self, path, data):
        """Given a path, traverse the response for the desired data.

        Each step parses ``data`` as a url encoded string and looks up the
        next key of ``path`` in it.

        Keyword arguments:
        path -- A tuple representing a path to a node within a tree.
        data -- The data containing the tree.

        Returns None when a key is missing, the bare value when the final
        key holds exactly one value, otherwise the list of values.
        """
        elem = path[0]
        #Check if we were handed a list rather than a string.
        if type(data) is list:
            # Continue with its last element (read without mutating it).
            return self._fetch(path, data[-1])
        #Parse the url encoded data
        data = parse_qs(data)
        #Get the element in our path
        data = data.get(elem, None)
        #Offset the tuple by 1.
        path = path[1:]
        #Check if the path has reached the end OR the element returned
        #nothing.
        if len(path) == 0 or data is None:
            if type(data) is list and len(data) == 1:
                data = data[0]
            return data
        # Nope, let's keep diggin'
        return self._fetch(path, data)

    def _get_video_info(self):
        """This is responsible for executing the request, extracting the
        necessary details, and populating the different video
        resolutions and formats into a list.
        """
        # Start from a fresh per-instance list so videos from an earlier
        # url (or from another instance) are not carried over.
        self.videos = []

        querystring = urlencode({
            'asv': 3,
            'el': 'detailpage',
            'hl': 'en_US',
            'video_id': self.video_id
        })
        response = urlopen(YT_BASE_URL + '?' + querystring)
        #TODO: evaluate the status code.
        try:
            content = response.read()
        finally:
            response.close()
        # parse_qs needs text to match the str keys used below; where
        # read() returns a separate bytes type, decode it first.
        if not isinstance(content, str):
            content = content.decode('utf-8')

        #Use the traversing method to extract the specific
        #attribute from the response body.
        path = ('url_encoded_fmt_stream_map', 'itag')
        #Using the ``itag`` (otherwise referred to as ``fmt``), set the
        #available encoding options.
        encoding_options = self._fetch(path, content)
        self.title = self._fetch(('title',), content)

        # _fetch yields None for a missing key and a bare string for a
        # single value; normalise both so the loop sees whole entries
        # instead of failing or iterating over characters.
        if encoding_options is None:
            encoding_options = []
        elif not isinstance(encoding_options, list):
            encoding_options = [encoding_options]

        for video in encoding_options:
            url = self._extract_url(video)
            if not url:
                #Sometimes the regex for matching the video returns
                #a single empty element, so we'll skip those here.
                continue
            encoding = self._extract_fmt(video)
            if encoding is None:
                # Unknown or missing itag: nothing to unpack, skip it.
                continue
            fmt, extension, resolution = encoding
            filename = "%s.%s" % (self.filename, extension)
            self.videos.append(Video(extension, resolution, url, filename))

    def _extract_fmt(self, text):
        """YouTube does not pass you a completely valid URLencoded form,
        I suspect this is supposed to act as a deterrent.. Nothing some
        regular expressions couldn't handle.

        Keyword arguments:
        text -- The malformed data contained within each url node.

        Returns the YT_ENCODING entry for the single itag found in
        ``text``, or None when there is not exactly one itag or it is
        not a known encoding.
        """
        itag = re.findall(r'itag=(\d+)', text)
        if itag and len(itag) == 1:
            itag = int(itag[0])
            return YT_ENCODING.get(itag, None)
        return None

    def _extract_url(self, text):
        """(I hate to be redundant here, but whatever) YouTube does not
        pass you a completely valid URLencoded form, I suspect this is
        supposed to act as a deterrent.. Nothing some regular
        expressions couldn't handle.

        Keyword arguments:
        text -- The malformed data contained in the itag node.

        Returns everything following ``url=`` when exactly one match is
        found, otherwise None.
        """
        url = re.findall('url=(.*)', text)
        if url and len(url) == 1:
            return url[0]
        return None
def safe_filename(text, max_length=200):
    """Sanitizes filenames for many operating systems.

    Keyword arguments:
    text -- The unsanitized pending filename.
    max_length -- The maximum number of characters kept in the result.

    Underscores become spaces and colons become " -"; control characters
    and a set of punctuation characters are then removed, and the result
    is cut to ``max_length`` characters.
    """
    #Tidy up ugly formatted filenames.
    text = text.replace('_', ' ')
    text = text.replace(':', ' -')

    #NTFS forbids filenames containing characters in range 0-31 (0x00-0x1F)
    ntfs = [chr(i) for i in range(0, 32)]

    #Removing these SHOULD make most filename safe for a wide range
    #of operating systems.
    paranoid = ['"', '#', '$', '%', "'", '*', ',', '.', '/', ':',
                ';', '<', '>', '?', '\\', '^', '|', '~']

    # One escaped character class instead of a '|'-joined alternation:
    # joining raw characters let a lone backslash escape the following
    # '|', which silently stopped '^' from being removed.
    blacklist = re.compile('[%s]' % re.escape(''.join(ntfs + paranoid)),
                           re.UNICODE)
    filename = blacklist.sub('', text)

    #Quickly truncates long filenames.
    return filename[:max_length]