-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmklist-openflix
executable file
·87 lines (80 loc) · 2.88 KB
/
mklist-openflix
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python
"""
Fetch the complete list of movies listed on openflix.com.
Optionally map each movie to IMDB by searching for its title and year,
assuming the match is correct if IMDB finds only one matching entry.
"""
import argparse
import lxml.html
import movielib
import urllib2
import urlparse
def _parse_details(entry):
    """Parse the "Key: value" details of one movie row into a dict.

    *entry* is the table row element enclosing the movie image, or None.
    The text of the first ``td.lab`` cell and of its next sibling element
    is concatenated and split into chunks; each chunk containing a colon
    becomes one item, with the key lower-cased.

    Returns an empty dict when there is no row or no ``td.lab`` cell.
    """
    if entry is None:
        return {}
    labels = entry.cssselect("table tr td.lab")
    if not labels:
        return {}
    text = labels[0].text_content()
    sibling = labels[0].getnext()
    if sibling is not None:
        text += sibling.text_content()
    # Give an empty language field a placeholder value so it still forms
    # a "key:value" chunk after splitting.
    text = text.replace('Language: ', 'Language:n/a ')
    text = text.replace(': ', ':')
    details = {}
    for chunk in text.split("   "):
        # partition() keeps any further colons inside the value and lets
        # us skip chunks without a colon instead of raising ValueError.
        key, sep, value = chunk.partition(':')
        if not sep:
            continue
        details[key.lower()] = value
    return details


def extract_movies(args, l, genreurl):
    """Add the movies linked from one genre page to the dict *l*.

    args     -- parsed command line options; only ``args.imdblookup`` is read.
    l        -- dict to fill in; it is modified in place and also returned.
    genreurl -- URL of the genre page to fetch and scan.

    Each movie is stored under its movie page URL, or under the value
    returned by movielib.imdb_find_one() when ``args.imdblookup`` is set
    and that lookup returns something true.

    Returns *l*.  If fetching the page raises urllib2.HTTPError, *l* is
    returned unchanged so that entries collected from other genre pages
    are not lost.
    """
    try:
        root = lxml.html.fromstring(movielib.http_get_read(genreurl))
    except urllib2.HTTPError:
        return l
    # FIXME this ends up ignoring entries without images
    for img in root.cssselect("td a[href] img"):
        # The selector matches any <img> below an <a href>, so the direct
        # parent is not guaranteed to carry the href attribute.
        href = img.getparent().get('href', '')
        if '/movie/' not in href:
            continue
        movieurl = urlparse.urljoin(genreurl, href)
        info = {
            'status': 'free',
            'title': img.attrib['alt'],
            'freenessurl': genreurl,
        }
        # Nearest enclosing table row, or None when there is none; never
        # reuse the row of a previously processed image.
        entry = next(img.iterancestors('tr'), None)
        info.update(_parse_details(entry))
        print(info)
        ref = movieurl
        if args.imdblookup:
            imdb = None
            try:
                year = int(info['year'])
                imdb = movielib.imdb_find_one(info['title'], year)
            except (KeyError, ValueError):
                # KeyError was hit with mojibake and UTF-8 in 'Haxan';
                # ValueError covers a year that is not a plain integer.
                pass
            if imdb:
                ref = imdb
                info['imdblookup'] = '%s %d' % (info['title'], year)
        l[ref] = info
    return l
def fetch_genre_list(genrelisturl):
    """Return the absolute URLs of the genre pages linked from a page.

    Every ``li a[href]`` link on the page at *genrelisturl* whose href
    contains '/genre/' is resolved against *genrelisturl* and collected,
    in document order.

    Returns None if fetching the page raises urllib2.HTTPError.
    """
    try:
        page = movielib.http_get_read(genrelisturl)
        doc = lxml.html.fromstring(page)
    except urllib2.HTTPError:
        return None
    hrefs = (link.attrib['href'] for link in doc.cssselect("li a[href]"))
    return [urlparse.urljoin(genrelisturl, href)
            for href in hrefs if '/genre/' in href]
def main():
    """Collect the movies of every openflix.com genre page and save them.

    Parses the command line (``--imdblookup`` enables the IMDB title/year
    search), fetches the genre list, scans each genre page with
    extract_movies() and hands the resulting dict to movielib.savelist().

    Exits with an error message, without saving anything, when the genre
    list cannot be fetched.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--imdblookup', action='store_true', default=False,
                        help='also find title IDs by searching for title/year in IMDB')
    args = parser.parse_args()

    genrelisturl = "http://www.openflix.com/genre/"
    genres = fetch_genre_list(genrelisturl)
    if genres is None:
        # fetch_genre_list() returns None on HTTP errors; iterating over
        # None would only produce an unhelpful TypeError.
        raise SystemExit('unable to fetch genre list from %s' % genrelisturl)

    l = {}
    for genreurl in genres:
        #print genreurl
        found = extract_movies(args, l, genreurl)
        # A None result signals a failed genre page; keep what has been
        # collected so far instead of replacing it.
        if found is not None:
            l = found
    movielib.savelist(l, name='free-movies-openflix.json')


if __name__ == '__main__':
    main()