-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathtumblrscraper.py
executable file
·66 lines (61 loc) · 2.43 KB
/
tumblrscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python
import tumblpy
import sqlite3
import ts_model
import sys
def tumblr_scraper(base_url, db_name, num_images, start_offset=0, limit=20, url_type='blog'):
    """Scrape tagged single-photo posts from a Tumblr blog into a database.

    Posts are fetched page by page through the Tumblr API.  A post is kept
    only if it carries exactly one photo and at least one tag; for each kept
    post the tags, the photo URL and the note count are handed to
    ``ts_model`` for storage.

    Args:
        base_url: Blog URL passed to the API as ``blog_url``.
        db_name: Database name/path passed to ``ts_model.touch_db``.
        num_images: Maximum number of photos to store.
        start_offset: Offset of the first post to request.
        limit: Number of posts requested per API call.
        url_type: Kind of source to scrape; only ``'blog'`` is supported.

    Returns:
        The number of photos that were stored.  This can be lower than
        ``num_images`` when the blog runs out of posts.

    Raises:
        ValueError: If ``url_type`` is not ``'blog'``.
    """
    # Validate first so an unsupported type fails clearly, before any
    # network or database work (previously this surfaced as a NameError).
    if url_type != 'blog':
        raise ValueError("unsupported url_type: %r" % (url_type,))

    # NOTE(review): API credentials are hard-coded in the source; consider
    # loading them from configuration or the environment instead.
    # The OAuth request-token handshake that used to follow was dropped:
    # its results were never used by the read-only 'posts' request below.
    t = tumblpy.Tumblpy(app_key='V55FKUe1lMSdx0UyGSFknmO8DoSaeNzT9oByUwOE1Hvp7diQJ7',
                        app_secret='TD9eTgRhoo8ceu0cjcF0nROWAAMkst1uAkSx5XuSOjnYxrGq50',
                        callback_url='whatever.com/notimportant_now')

    # get db connection
    print("Connecting to %s" % db_name)
    conn = ts_model.touch_db(db_name)
    n = 0
    try:
        c = conn.cursor()
        print("Scraping %s" % base_url)
        offset = start_offset
        while n < num_images:
            print("Get posts %i to %i" % (offset, offset + limit))
            posts = t.get('posts', blog_url=base_url,
                          params={'limit': limit, 'offset': offset})
            page = posts['posts']
            if not page:
                # The blog has no more posts; without this check the loop
                # would keep requesting empty pages forever.
                break
            offset += limit

            for p in page:
                # some posts don't have a photo, and posts with more than
                # one image are ignored for now
                photos = p.get('photos') or []
                if len(photos) != 1:
                    continue
                # some posts don't have tags
                raw_tags = p.get('tags') or []
                if not raw_tags:
                    continue

                # If we made it through that, we have a new photo
                n += 1
                note_count = p['note_count']
                # A single tag entry may contain several newline-separated
                # tags; split them and normalise whitespace and case.
                tags = [y.strip().lower() for x in raw_tags
                        for y in x.split('\n')]
                url = photos[0]['original_size']['url']

                # Log with base_url rather than sys.argv[1] so the function
                # also works when imported and called from other code.
                print("Found %s %i: %s %s" % (base_url, n, url, "#" + " #".join(tags)))
                # if this is slow, switch to batch execute instead
                ts_model.add_tags(c, tags)
                ts_model.add_photo(c, url, note_count)
                ts_model.link_tags_photo(c, tags, url)

                if n >= num_images:
                    # Do not store more photos than were asked for.
                    break

            # Commit once per page so a later failure does not discard
            # everything scraped so far.
            conn.commit()
    finally:
        # Always release the connection, even when a request or insert fails.
        conn.close()
    return n
if __name__ == "__main__":
    # Command line: tumblrscraper.py BLOG_NAME NUM_IMAGES [START_OFFSET]
    # Exit with a usage message instead of an IndexError traceback when
    # required arguments are missing.
    if len(sys.argv) < 3:
        sys.exit("usage: %s BLOG_NAME NUM_IMAGES [START_OFFSET]" % sys.argv[0])

    blog_name = sys.argv[1]
    try:
        num_images = int(sys.argv[2])
        # The start offset is optional and defaults to the first post.
        start_offset = int(sys.argv[3]) if len(sys.argv) >= 4 else 0
    except ValueError:
        sys.exit("NUM_IMAGES and START_OFFSET must be integers")

    # Both the blog URL and the database file name derive from the blog name.
    tumblr_scraper('http://%s.tumblr.com' % blog_name,
                   '%s.db' % blog_name,
                   num_images,
                   start_offset=start_offset,
                   limit=20)