Add subQuery.py: methods to query HTML pages and retrieve specific URLs.

This commit is contained in:
Fred Pauchet 2011-11-21 21:21:31 +01:00
parent 638654066f
commit 26cd5e67c3
1 changed files with 109 additions and 0 deletions

109
pigeonhole/subQuery.py Normal file
View File

@ -0,0 +1,109 @@
import urllib2
import re
import os
from BeautifulSoup import BeautifulSoup
"""
Query sites that expose no web-service interface by fetching their pages
over HTTP and extracting the wanted results with regular expressions.
"""
# Two-letter language codes. Nothing in the visible code reads this tuple yet;
# presumably the subtitle languages of interest -- confirm against callers.
languages = (
    'en',
    'es',
    'fr',
    'de',
)
def queryUrl(baseurl, baseregex):
    """Announce on stdout which URL would be queried and with which regex.

    Only prints the message; no request is made and nothing is returned.
    """
    message = 'Querying %s w/ %s' % (baseurl, baseregex)
    print(message)
def query(showname):
    """Search tvsubtitles.net for a show and return its unique result.

    The search page is downloaded, parsed with BeautifulSoup, and every
    element whose ``href`` ends in ``/tvshow-<id>.html`` is kept as a
    candidate.

    Args:
        showname: Title to search for, e.g. ``'the big bang theory'``.

    Returns:
        The single candidate when exactly one is found. ``None`` when there
        is no candidate, or when there are several (they are printed so the
        caller can refine the search).
    """
    # Local imports: keeps this block self-contained without touching the
    # file-level import list.
    from contextlib import closing
    from urllib import quote

    print("Trying " + showname)

    # quote() encodes spaces as %20 exactly like the former replace() call,
    # and additionally escapes characters such as '&', '#' or '+' that would
    # otherwise corrupt the query string.
    search_url = 'http://www.tvsubtitles.net/search.php?q=' + quote(showname)

    # closing() releases the connection even when read() raises.
    with closing(urllib2.urlopen(search_url)) as response:
        html = response.read()

    soup = BeautifulSoup(html)
    results = soup.findAll(href=re.compile(r"/tvshow-([A-Za-z0-9]*)\.html$"))

    # TODO: yielding the candidates instead of printing them would let the
    # caller choose among several matches.
    if len(results) == 1:
        print(str(results[0]))
        return results[0]

    if len(results) == 0:
        print("No results found for " + showname)
        return None

    print("Here are the possible results for " + showname)
    for res in results:
        print("\t" + str(res))
    return None
def getSeason(showname, seasonNumber):
    """Get a specific season, based on the show name and the season number.

    Examples:
        getSeason('suits', 1)
        getSeason('dexter', 3)

    Args:
        showname: Title handed to ``query`` to locate the show.
        seasonNumber: Season number; ``-<seasonNumber>`` is inserted in
            front of every ``.html`` in the textual form of the result.

    Returns:
        The rewritten string, which is also printed, or ``None`` when
        ``query`` did not return a unique show (``"no season found"`` is
        printed in that case).
    """
    show = query(showname)
    if show is None:
        print("no season found")
        return None

    # The replacement works on str(show), i.e. on the whole textual form of
    # what query() returned, not on an isolated URL.
    season_link = str(show).replace('.html', '-' + str(seasonNumber) + '.html')
    print(season_link)
    return season_link
def getEpisode(showname, seasonNumber, episodeNumber):
    """Get a specific episode from the show name, season and episode number.

    Not implemented yet: every call raises.

    Examples (intended usage):
        getEpisode('being erica', 2, 12)
        getEpisode('the big bang theory', 3, 15)

    Raises:
        NotImplementedError: Always. It is a subclass of ``Exception``, so
            callers that catch ``Exception`` are unaffected.
    """
    # The draft statements that used to follow the raise could never run and
    # were broken anyway (two-argument call to query(), read of an undefined
    # name `episode`), so they were removed rather than kept as dead code.
    raise NotImplementedError('not implemented yet')
def getUrl(showname, seasonNumber, episodeNumber, language):
    """Placeholder meant to lead to the page of the requested episode.

    Does nothing for now: all four arguments are ignored and ``None`` is
    returned.
    """
    return None
def writeUrlShortcut(folderpath, filename, url, shortcutname):
    """Write a shortcut file that points at a web page.

    The file ``filename`` is created (or truncated) inside ``folderpath``.
    Its content is the shortcut name in square brackets, a newline, then
    ``URL=<url>``.

    Example:
        writeUrlShortcut('/opt/tmp', 'google.url', 'http://www.google.com', 'Google')
        writes /opt/tmp/google.url containing:
            [Google]
            URL=http://www.google.com

    Raises:
        Exception: If ``folderpath`` does not exist.
    """
    folder_missing = not os.path.exists(folderpath)
    if folder_missing:
        raise Exception('Writing Url : Path does not exists')

    target_path = os.path.join(folderpath, filename)
    template = "[%s]\nURL=%s"
    with open(target_path, 'w+') as shortcut_file:
        shortcut_file.write(template % (shortcutname, url))
if __name__ == "__main__":
    # Manual smoke run: look up one season for a handful of shows, in order.
    for title, season_number in (
        ('the big bang theory', 2),
        ('white collar', 1),
        ('suits', 1),
        ('being erica', 2),
    ):
        getSeason(title, season_number)