Skip to content

Commit

Permalink
# This is a combination of 2 commits.
Browse files Browse the repository at this point in the history
# The first commit's message is:

Add pyvideo spider

# This is the 2nd commit message:

add conference to spiders
  • Loading branch information
shaneaevans committed Apr 14, 2014
1 parent a0818b5 commit 840b73c
Show file tree
Hide file tree
Showing 6 changed files with 41 additions and 0 deletions.
2 changes: 2 additions & 0 deletions pycon_speakers/spiders/europython_eu.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def parse(self, response):
il.add_css('name', ".name > a::text")
il.add_css('image_urls', "img::attr(src)")
il.add_value('year', str(response.meta['cookiejar']))
il.add_value('conference', 'EuroPython')
yield il.load_item()
# pagination
pages = sel.css('.pagination a::attr(href)').extract()
Expand All @@ -40,6 +41,7 @@ def parse_new(self, response):
speakers = sel.css('.archive .talk .speakers > .speaker')
for speaker in speakers:
il = SpeakerLoader(selector=speaker)
il.add_value('conference', 'EuroPython')
il.add_css('name', "span::text")
il.add_css('image_urls', "a > img::attr(src)", lambda x:
[urljoin(response.url, y) for y in x])
Expand Down
1 change: 1 addition & 0 deletions pycon_speakers/spiders/euroscipy.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def parse(self, response):
sl = SpeakerLoader(selector=sel, response=response)
# TODO: handle/remove affiliation value and possibly multiple
# authors.
sl.add_value('conference', 'EuroSciPy')
sl.add_value('name', author)
sl.add_value('year', response.meta['year'])
yield sl.load_item()
1 change: 1 addition & 0 deletions pycon_speakers/spiders/oscon_com.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,5 @@ def parse(self, response):
il = SpeakerLoader(response=response)
il.add_value('name', speaker)
il.add_value('year', str(response.meta['year']))
il.add_value('conference', 'OSCON')
yield il.load_item()
3 changes: 3 additions & 0 deletions pycon_speakers/spiders/pycon_org.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,15 @@ def _follow_speakers(self, response):
il = SpeakerLoader(response=response)
il.add_xpath('name', "//a[contains(@href, '/speaker/profile/')]")
il.add_value('year', str(response.meta['year']))
il.add_value('conference', 'PyCon US')
yield il.load_item()

def _parse_2010(self, response):
for section in Selector(response).xpath('//div[@class="proposal_list_summary"]'):
il = SpeakerLoader(selector=section)
il.add_xpath('name', './span[1]')
il.add_value('year', str(response.meta['year']))
il.add_value('conference', 'PyCon US')
yield il.load_item()

def _parse_2006(self, response):
Expand All @@ -65,4 +67,5 @@ def _parse_2006(self, response):
il = SpeakerLoader(selector=name)
il.add_xpath('name', '.')
il.add_value('year', str(response.meta['year']))
il.add_value('conference', 'PyCon US')
yield il.load_item()
28 changes: 28 additions & 0 deletions pycon_speakers/spiders/pyvideo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import re

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from pycon_speakers.items import Speaker


class PyVideoSpider(CrawlSpider):
    """Crawl pyvideo.org speaker pages and yield one Speaker item per video.

    The crawl starts at the speaker index and follows every link whose URL
    matches ``/speaker/<digits>/``; each such page is handed to
    ``parse_speaker``.
    """

    name = 'pyvideo.org'
    allowed_domains = ['pyvideo.org']
    start_urls = ['http://www.pyvideo.org/speaker/']

    rules = (
        # Extract links matching speakers
        Rule(SgmlLinkExtractor(allow=(r'/speaker/\d+/', )), callback='parse_speaker'),
    )

    def parse_speaker(self, response):
        """Yield a ``Speaker`` for each video summary found on a speaker page.

        The speaker name is the stripped text of the first ``<h1>``. For each
        ``div.video-summary-data`` the text of the second link is used as the
        conference (with a trailing `` 20xx`` year removed) and the first
        ``20xx`` match anywhere in the div is used as the year.
        NOTE(review): this presumes the second link in each summary is the
        conference link -- confirm against the live page layout.

        Pages without a heading, and summaries lacking a second link or a
        year, are skipped (and logged) instead of raising ``IndexError``,
        so one malformed entry does not discard the rest of the page.
        """
        sel = Selector(response)
        headings = sel.xpath('//h1/text()').extract()
        if not headings:
            self.log('pyvideo: no speaker heading found on %s' % response.url)
            return
        name = headings[0].strip()

        for conf in sel.xpath('//div[@class="video-summary-data"]'):
            link_texts = conf.xpath('.//a/text()').extract()
            years = conf.re(r'20\d\d')
            if len(link_texts) < 2 or not years:
                self.log('pyvideo: skipping incomplete video summary on %s'
                         % response.url)
                continue

            speaker = Speaker()
            speaker['name'] = name
            # Drop a trailing year so e.g. "PyCon US 2013" and "PyCon US 2012"
            # share the same conference value.
            speaker['conference'] = re.sub(r'\s20\d\d$', '', link_texts[1])
            speaker['year'] = years[0]
            yield speaker
6 changes: 6 additions & 0 deletions pycon_speakers/spiders/scipy_org.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ def parse_2008(self, response):
sl = SpeakerLoader(selector=sel, response=response)
sl.add_value('name', author)
sl.add_value('year', response.meta['year'])
sl.add_value('conference', 'SciPy')
yield sl.load_item()

def parse_2009(self, response):
Expand All @@ -69,6 +70,7 @@ def parse_2009(self, response):
sl = SpeakerLoader(selector=sel, response=response)
sl.add_value('name', author)
sl.add_value('year', response.meta['year'])
sl.add_value('conference', 'SciPy')
yield sl.load_item()

def parse_2010(self, response):
Expand All @@ -78,6 +80,7 @@ def parse_2010(self, response):
sl = SpeakerLoader(selector=sel, response=response)
sl.add_value('name', author)
sl.add_value('year', response.meta['year'])
sl.add_value('conference', 'SciPy')
yield sl.load_item()

def parse_2011(self, response):
Expand All @@ -94,6 +97,7 @@ def parse_2012(self, response):
sl = SpeakerLoader(selector=sel, response=response)
sl.add_value('name', author)
sl.add_value('year', response.meta['year'])
sl.add_value('conference', 'SciPy')
yield sl.load_item()

def parse_2013(self, response):
Expand All @@ -106,4 +110,6 @@ def parse_2013(self, response):
# FIXME: most author entries have the institution at the end.
sl.add_value('name', author)
sl.add_value('year', response.meta['year'])
sl.add_value('conference', 'SciPy')

yield sl.load_item()

0 comments on commit 840b73c

Please sign in to comment.