From 33427abd9a2bf7153ce8e2d18fe2e6857db87212 Mon Sep 17 00:00:00 2001
From: Rocio Aramberri
Date: Tue, 15 Apr 2014 13:51:02 -0400
Subject: [PATCH] Add more years to oscon spider

---
 pycon_speakers/spiders/oscon_com.py | 53 +++++++++++++++++++++++++++--
 1 file changed, 50 insertions(+), 3 deletions(-)

diff --git a/pycon_speakers/spiders/oscon_com.py b/pycon_speakers/spiders/oscon_com.py
index 3a5c059..b107da0 100644
--- a/pycon_speakers/spiders/oscon_com.py
+++ b/pycon_speakers/spiders/oscon_com.py
@@ -4,17 +4,41 @@
 from pycon_speakers.loaders import SpeakerLoader
 
 
+# Speaker pages for years before 2008 are served from
+# conferences.oreillynet.com.  2002-2007 map to the page code substituted
+# into ``OsConSpider.old_base_url``; 2001 maps to the complete page URL.
+ARCHIVE = {
+    2001: 'http://conferences.oreillynet.com/cs/os2001/pub/w/os2001/speakers.html',
+    2002: '15',
+    2003: '23',
+    2004: '29',
+    2005: '38',
+    2006: '46',
+    2007: '58',
+}
+
+
 class OsConSpider(Spider):
     name = 'oscon.com'
-    years = '2013,2012,2011,2010,2009,2008,2007'
+    years = (
+        '2014,2013,2012,2011,2010,2009,2008,2007,2006,2005,2004,2003,2002,2001'
+    )
     base_url = 'http://www.oscon.com/oscon{year}/public/schedule/speakers'
+    old_base_url = (
+        'http://conferences.oreillynet.com/pub/w/{code}/speakers.html')
 
     def start_requests(self):
         years = [int(x) for x in self.years.split(',')]
         for year in years:
             meta = {'year': year}
-            url = self.base_url.format(year=year)
-            yield Request(url, meta=meta)
+            if year >= 2008:
+                yield Request(self.base_url.format(year=year), meta=meta)
+                continue
+            # Archived years: 2001 is a full URL, 2002-2007 are page codes.
+            url = ARCHIVE[year]
+            if year > 2001:
+                url = self.old_base_url.format(code=url)
+            yield Request(url, callback=self.parse_old_format, meta=meta)
 
     def parse(self, response):
         sel = Selector(response)
@@ -24,3 +48,26 @@ def parse(self, response):
             il.add_value('year', str(response.meta['year']))
             il.add_value('conference', 'OSCON')
             yield il.load_item()
+
+    def parse_old_format(self, response):
+        """Yield speaker items scraped from a pre-2008 archive page.
+
+        Names come from ``<h3>`` elements inside ``speaker-blurb`` divs,
+        followed by the text of ``<span>``-wrapped links whose href
+        contains ``e_spkr``.
+        """
+        sel = Selector(response)
+        names = sel.xpath('//div[@class="speaker-blurb"]//h3').extract()
+        linked = sel.xpath(
+            '//span/a[contains(@href, "e_spkr")]//text()').extract()
+        # Link text can contain a literal 'N/A'; remove it from the name.
+        names.extend(text.replace('N/A', '') for text in linked)
+        for name in names:
+            if not name.strip():
+                # Skip entries left blank once 'N/A' has been removed.
+                continue
+            il = SpeakerLoader(response=response)
+            il.add_value('name', name)
+            il.add_value('year', str(response.meta['year']))
+            il.add_value('conference', 'OSCON')
+            yield il.load_item()