# mklist-imdb-c-expired-year

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
According to
https://en.wikipedia.org/wiki/List_of_films_in_the_public_domain_in_the_United_States,
every movie published in the USA before 1924 is now in the public domain.

All works published before 1912-07-01 in the United Kingdom are no longer
protected by copyright, according to
https://en.wikipedia.org/wiki/Copyright_law_of_the_United_Kingdom#Extension_of_copyright_term

Extract such lists of movies from imdb.com.
"""

import argparse
import json
import lxml.html
import movielib
import re
import time
import urlparse

def fetch_search_result(entries, urlbase, start):
    """Fetch one IMDB search result page and record its titles in entries.

    urlbase is a URL template containing a single %-placeholder, which is
    filled in with start (the value of the search's 'start' parameter).

    Every title found on the page is stored in entries, keyed by the
    title's IMDB URL with any query string removed.  Each value is a dict
    with 'status' set to 'free', 'freenessurl' set to the search page URL
    and 'title' set to the link text, plus an integer 'year' when the
    year span ends in a parenthesised number.

    Return True if the page has a link to a following result page, and
    False otherwise.
    """
    url = urlbase % start
    print(url)
    html = movielib.http_get_read(url)
    root = lxml.html.fromstring(html)
    for header in root.cssselect("div.lister-item-content h3.lister-item-header "):
        links = header.cssselect("a[href]")
        if not links:
            continue
        link = links[0]
        # Drop the query string so the same title always gets the same key.
        imdburl = urlparse.urljoin(url, link.attrib['href'].split('?')[0])
        title = link.text_content().strip()

        # A header without a year span is recorded without a year instead
        # of aborting the whole run with an IndexError.
        year = None
        yearspans = header.cssselect("span.lister-item-year")
        if yearspans:
            # Strip first: trailing whitespace would defeat the '$' anchor.
            yeartext = yearspans[0].text_content().strip()
            m = re.search(r"\((\d+)\)$", yeartext)
            if m:
                year = m.group(1)

        print("%s %s %s" % (imdburl, year, title))
        entries[imdburl] = {
            'status' : 'free',
            'freenessurl' : url,
            'title' : title,
        }
        if year:
            entries[imdburl]['year'] = int(year)

    # The presence of a "next page" link tells the caller to keep paging.
    return bool(root.cssselect("a.lister-page-next"))

def imdb_c_expired(entries, country='us', start=1874, end=1924,
                   all=False, path=None):
    """Collect IMDB titles for one country, year by year, into entries.

    For every year from start up to but not including end, the IMDB title
    search is walked page by page with fetch_search_result(), which adds
    the titles it finds to entries.

    country is inserted as the country_of_origin search parameter.
    Unless all is true, the search is restricted to title_type=feature.
    If path is set, entries is saved with movielib.savelist() after each
    year has been processed.

    Nothing is fetched or saved when the year range is empty.
    """
    # Step between result pages; presumably IMDB's page size -- confirm.
    pagesize = 50

    # The oldest known movie was published in 1874
    for year in range(start, end):

        urlbase = "http://www.imdb.com/search/title?sort=release_date&country_of_origin=%s&year=%d&start=%%s" % (country, year)
        if not all:
            urlbase = urlbase + '&title_type=feature'
        # Keep the page offset in its own variable so the 'start' (first
        # year) parameter is not overwritten while looping.
        offset = 1
        while fetch_search_result(entries, urlbase, offset):
            offset = offset + pagesize
        if path:
            # Save after every year so partial results survive a failure.
            movielib.savelist(entries, name=path)
        time.sleep(10) # avoid overloading IMDB

def main():
    """Command line entry point.

    Parses the options, collects the matching IMDB titles with
    imdb_c_expired() and hands the result to movielib.savelist() under
    the name given by --output.
    """
    # Option table: flag name followed by its add_argument() settings,
    # listed in the order they should appear in the help output.
    options = (
        ('--output', dict(default='free-movies-imdb-c-expired.json')),
        ('--all', dict(action='store_true', default=False,
                       help='list all movies, not only feature movies')),
        ('--country', dict(default='us',
                           help='the code for country of origin')),
        ('--start', dict(type=int, default=1874,
                         help='the first year to search for in IMDB')),
        ('--end', dict(type=int, default=1924,
                       help='stop searching before this year')),
    )
    cli = argparse.ArgumentParser()
    for flag, settings in options:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args()

    movies = {}
    imdb_c_expired(
        movies,
        country=opts.country,
        start=opts.start,
        end=opts.end,
        all=opts.all,
        path=opts.output,
    )
    # Final save, also covering the case where no year was processed.
    movielib.savelist(movies, name=opts.output)
# Run the list generation only when executed as a script, not on import.
if __name__ == '__main__':
    main()