-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmklist-imdb-c-expired-year
executable file
·92 lines (82 loc) · 3.13 KB
/
mklist-imdb-c-expired-year
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
According to
https://en.wikipedia.org/wiki/List_of_films_in_the_public_domain_in_the_United_States,
every movie published in the USA before 1924 is now in the public domain.
All works published in the United Kingdom before 1912-07-01 are no longer
protected, according to
https://en.wikipedia.org/wiki/Copyright_law_of_the_United_Kingdom#Extension_of_copyright_term
Extract lists of such movies from imdb.com.
"""
import argparse
import json
import lxml.html
import movielib
import re
import time
import urlparse
# A parenthesised number at the very end of the year text, e.g. "(1914)".
_YEAR_RE = re.compile(r"\((\d+)\)$")


def fetch_search_result(entries, urlbase, start):
    """Fetch one IMDB search result page and record its titles in entries.

    The page URL is built as ``urlbase % start``, downloaded with
    movielib.http_get_read() and parsed with lxml.  Every list item header
    that contains a link is stored in ``entries`` keyed by the absolute
    title URL (query string removed), with the fields 'status' (always
    'free'), 'freenessurl' (the search page URL), 'title' and, when a
    year could be extracted, 'year' as an int.  An existing entry with the
    same URL is overwritten.

    Parameters:
      entries -- dict updated in place with the titles found.
      urlbase -- URL template with one %-placeholder for the offset.
      start   -- result offset substituted into urlbase.

    Returns True if the page has an "a.lister-page-next" element, i.e.
    more result pages are expected, and False otherwise.
    """
    url = urlbase % start
    print(url)
    html = movielib.http_get_read(url)
    root = lxml.html.fromstring(html)
    headers = root.cssselect("div.lister-item-content h3.lister-item-header ")
    for header in headers:
        links = header.cssselect("a[href]")
        if not links:
            continue
        link = links[0]
        # Drop the query string so the same title always maps to one key.
        imdburl = urlparse.urljoin(url, link.attrib['href'].split('?')[0])
        title = link.text_content().strip()

        # A header without a year element must not abort the whole run;
        # such a title is simply recorded without a 'year' field.
        year = None
        yearspans = header.cssselect("span.lister-item-year")
        if yearspans:
            # Strip first: surrounding whitespace would defeat the '$' anchor.
            m = _YEAR_RE.search(yearspans[0].text_content().strip())
            if m:
                year = m.group(1)

        # One formatted string prints the same on Python 2 and Python 3.
        print("%s %s %s" % (imdburl, year, title))
        entry = {
            'status': 'free',
            'freenessurl': url,
            'title': title,
        }
        if year is not None:
            entry['year'] = int(year)
        entries[imdburl] = entry
    return bool(root.cssselect("a.lister-page-next"))
def imdb_c_expired(entries, country='us', start=1874, end=1924,
                   all=False, path=None, delay=10):
    """Collect IMDB titles by country of origin for a range of years.

    For every year from ``start`` up to but not including ``end``, all
    search result pages are fetched with fetch_search_result(), which adds
    the titles it finds to ``entries``.

    Parameters:
      entries -- dict updated in place with the titles found.
      country -- country of origin code placed in the search URL.
      start   -- first year to search.
      end     -- the search stops before this year.
      all     -- when false, the search is limited to feature movies.
                 (The name shadows the builtin but is kept for callers.)
      path    -- if set, entries are saved there with movielib.savelist()
                 after every page, so an interrupted run keeps its work.
      delay   -- seconds to sleep after every request.
    """
    pagesize = 50
    # The oldest known movie was published in 1874
    for year in range(start, end):
        urlbase = "http://www.imdb.com/search/title?sort=release_date&country_of_origin=%s&year=%d&start=%%s" % (country, year)
        if not all:
            urlbase = urlbase + '&title_type=feature'
        # Use a separate offset instead of overwriting the start parameter.
        offset = 1
        while True:
            more = fetch_search_result(entries, urlbase, offset)
            if path:
                movielib.savelist(entries, name=path)
            # Sleep after every request, including the last page of a year,
            # so consecutive requests are always spaced out.
            time.sleep(delay)  # avoid overloading IMDB
            if not more:
                break
            offset = offset + pagesize
def main():
    """Command line entry point.

    Reads --output, --all, --country, --start and --end from the command
    line, runs imdb_c_expired() with them and finally stores the collected
    entries in the output file using movielib.savelist().
    """
    # Flag name and add_argument() keywords, in the order they are
    # registered with the parser.
    options = (
        ('--output', {'default': 'free-movies-imdb-c-expired.json'}),
        ('--all', {'action': 'store_true', 'default': False,
                   'help': 'list all movies, not only feature movies'}),
        ('--country', {'default': 'us',
                       'help': 'the code for country of origin'}),
        ('--start', {'type': int, 'default': 1874,
                     'help': 'the first year to search for in IMDB'}),
        ('--end', {'type': int, 'default': 1924,
                   'help': 'stop searching before this year'}),
    )
    cli = argparse.ArgumentParser()
    for flag, keywords in options:
        cli.add_argument(flag, **keywords)
    opts = cli.parse_args()

    movies = {}
    imdb_c_expired(
        movies,
        country=opts.country,
        start=opts.start,
        end=opts.end,
        all=opts.all,
        path=opts.output,
    )
    movielib.savelist(movies, name=opts.output)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()