Skip to content

Commit

Permalink
Merge pull request #21 from rocioar/master
Browse files Browse the repository at this point in the history
Add more years to oscon spider
  • Loading branch information
shaneaevans committed Apr 15, 2014
2 parents ce5a865 + 33427ab commit 46eea45
Showing 1 changed file with 43 additions and 3 deletions.
46 changes: 43 additions & 3 deletions pycon_speakers/spiders/oscon_com.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,39 @@

from pycon_speakers.loaders import SpeakerLoader

# Lookup table for the pre-2008 conference archive, keyed by year.
#
# The 2001 entry is a complete speakers-page URL and is requested as-is.
# The 2002-2007 entries are short codes that OsConSpider substitutes into
# its ``old_base_url`` template.
ARCHIVE = {
    # Full URL.
    2001: 'http://conferences.oreillynet.com/cs/os2001/pub/w/os2001/speakers.html',
    # Codes for the ``old_base_url`` template.
    2002: '15',
    2003: '23',
    2004: '29',
    2005: '38',
    2006: '46',
    2007: '58',
}


class OsConSpider(Spider):
    """Spider that collects OSCON speaker names, one request per year."""

    name = 'oscon.com'
    # Comma-separated years to crawl; start_requests splits this string and
    # converts each entry to int. (The earlier 2007-2013 value was dead code:
    # it was overwritten immediately by this assignment.)
    years = (
        '2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001'
    )
    # Speakers page template for years from 2008 on; filled with ``year``.
    base_url = 'http://www.oscon.com/oscon{year}/public/schedule/speakers'
    # Speakers page template for 2002-2007; filled with a code from ARCHIVE.
    old_base_url = (
        'http://conferences.oreillynet.com/pub/w/{code}/speakers.html')
def start_requests(self):
    """Yield one speakers-page request for each year in ``self.years``.

    ``self.years`` is a comma-separated string of years. The URL and the
    callback depend on the year:

    * 2008 and later: ``base_url`` formatted with the year, handled by the
      default callback.
    * 2002-2007: ``old_base_url`` formatted with the year's code from
      ``ARCHIVE``, handled by ``parse_old_format``.
    * 2001 and earlier: the full URL stored in ``ARCHIVE``, handled by
      ``parse_old_format``.

    Every request carries ``{'year': year}`` in ``meta`` so the callback
    can tag the items it produces.

    Raises (while the generator is iterated):
        ValueError: if an entry of ``self.years`` is not an integer.
        KeyError: if a year before 2008 has no entry in ``ARCHIVE``.
    """
    years = [int(x) for x in self.years.split(',')]
    for year in years:
        meta = {'year': year}
        # ``year`` is already an int, so no further conversion is needed.
        if 2001 < year < 2008:
            url = self.old_base_url.format(code=ARCHIVE[year])
            yield Request(url, callback=self.parse_old_format, meta=meta)
        elif year < 2002:
            # The archive stores a complete URL for these years.
            url = ARCHIVE[year]
            yield Request(url, callback=self.parse_old_format, meta=meta)
        else:
            url = self.base_url.format(year=year)
            yield Request(url, meta=meta)

def parse(self, response):
sel = Selector(response)
Expand All @@ -24,3 +46,21 @@ def parse(self, response):
il.add_value('year', str(response.meta['year']))
il.add_value('conference', 'OSCON')
yield il.load_item()

def parse_old_format(self, response):
    """Yield speaker items from an archive-format speakers page.

    Names come from two places, in this order: every ``h3`` element
    inside a ``div`` with class ``speaker-blurb`` (the extracted element
    is passed to the loader unchanged), then the text nodes of links
    under a ``span`` whose ``href`` contains ``e_spkr`` (with any
    ``'N/A'`` substring removed). Each item is tagged with the year from
    ``response.meta`` and the conference name ``'OSCON'``.
    """
    def build_item(name):
        # Same three fields for both kinds of speaker entry.
        loader = SpeakerLoader(response=response)
        loader.add_value('name', name)
        loader.add_value('year', str(response.meta['year']))
        loader.add_value('conference', 'OSCON')
        return loader.load_item()

    sel = Selector(response)

    blurb_headings = sel.xpath(
        '//div[@class="speaker-blurb"]//h3').extract()
    for heading in blurb_headings:
        yield build_item(heading)

    link_texts = sel.xpath(
        '//span/a[contains(@href, "e_spkr")]//text()').extract()
    for text in link_texts:
        yield build_item(text.replace('N/A', ''))

0 comments on commit 46eea45

Please sign in to comment.