-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmklist-imdb-pd
executable file
·83 lines (74 loc) · 2.79 KB
/
mklist-imdb-pd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Extract list of claimed public domain movies listed on imdb.com.
"""
import argparse
import json
import lxml.html
import re
import time
import urllib2
import urlparse
import movielib
def fetch(entries, urlbase, page=1):
    """Scrape one page of an IMDb list and record its titles in entries.

    entries -- dict updated in place.  Keys are IMDb title URLs with the
               query string removed; values are dicts holding 'status',
               'freenessurl', 'title' and, when a year could be parsed,
               'year' (as int).
    urlbase -- list URL with a single %s placeholder for the page number.
    page    -- page number substituted into urlbase.

    Returns the number of titles recorded from this page.
    """
    count = 0
    # Build the URL from the page argument.  This used to read a global
    # named 'start', which only worked when the caller happened to keep
    # its page counter in a module-level variable of that name.
    url = urlbase % page
    print(url)
    html = movielib.http_get_read(url)
    root = lxml.html.fromstring(html)
    for entry in root.cssselect("h3.lister-item-header"):
        links = entry.cssselect("a[href]")
        if not links:
            continue
        link = links[0]
        imdburl = urlparse.urljoin(url, link.attrib['href'].split('?')[0])
        title = link.text_content().strip()

        # Reset for every entry so that a title without a parsable year
        # neither raises NameError nor inherits the previous title's year.
        year = None
        year_spans = entry.cssselect("span.lister-item-year")
        if year_spans:
            # Picking the last year seen to pick episode info for TV
            # series to avoid non-ASCII character in TV series year
            m = re.search(r"\((\d+).*\)", year_spans[-1].text_content())
            if m:
                year = m.group(1)

        print("%s %s" % (imdburl, title))
        info = {
            'status': 'free',
            'freenessurl': url,
            'title': title,
        }
        if year:
            info['year'] = int(year)
        entries[imdburl] = info
        count += 1
    return count
# All lists are requested in the same compact, list-order view.  The %s
# left in the result is the page-number placeholder.
_IMDB_LIST_URLBASE = (
    "http://www.imdb.com/list/%s/?page=%%s&view=compact&sort=listorian:asc"
)


def _list_urlbase(list_id):
    """Return the paged URL template for the IMDb list with this id."""
    return _IMDB_LIST_URLBASE % list_id


# Lists used by default.
urlbases = [
    # 250 Public Domain Movies (disabled)
    #"http://www.imdb.com/list/title/ls066150214/?page=%s&view=compact&sort=listorian:asc",
    # 1,089 Titles on Public Domain:
    _list_urlbase("ls060673364"),
    # Public domain science fiction films in the Internet Archive
    _list_urlbase("ls002037861"),
    # Public domain scifi / horror films (available in the Internet Archive)
    _list_urlbase("ls068570432"),
    # Public domain martial arts films in the Internet Archive
    _list_urlbase("ls002024467"),
    # Public domain horror films in the Internet Archive
    _list_urlbase("ls002037865"),
]

# Lists whose public domain claims look questionable.
urlbases_dubious = [
    # 2088 titles on Public domain
    # This list contains Psycho II from 1983.  Is its content really in
    # the public domain?
    _list_urlbase("ls063670826"),
]
# Command line: --dubious-list selects the questionable lists instead of
# the default ones; --output names the file the entries are saved to.
parser = argparse.ArgumentParser()
parser.add_argument('--dubious-list', action='store_true', default=False)
parser.add_argument('--output', default='free-movies-imdb-pd.json')
args = parser.parse_args()
print(args)

urlbases = urlbases_dubious if args.dubious_list else urlbases

# Walk every list page by page until a page yields no titles, collecting
# everything into one dict, then save the result.
entries = {}
for urlbase in urlbases:
    # 'start' is the page number handed to fetch(), beginning at 1.
    start = 1
    while True:
        if fetch(entries, urlbase, start) <= 0:
            break
        start += 1
movielib.savelist(entries, name=args.output)