diff --git a/pycon_speakers/loaders.py b/pycon_speakers/loaders.py
index 1833618..bdb2085 100644
--- a/pycon_speakers/loaders.py
+++ b/pycon_speakers/loaders.py
@@ -19,6 +19,7 @@ def _cleanup_name(name):
     >>> _cleanup_name(u'Ivan Krstic / Harvard University (presently..)')
     u'Ivan Krstic'
     """
+    name = name.replace('\t', ' ')
     name = _STRIPRE1.sub(u'', name, re.I)
     return _STRIPRE2.sub(u'', name, re.I)
 
diff --git a/pycon_speakers/spiders/developerweek_com.py b/pycon_speakers/spiders/developerweek_com.py
new file mode 100644
--- /dev/null
+++ b/pycon_speakers/spiders/developerweek_com.py
@@ -0,0 +1,65 @@
+"""Spider collecting speaker names from DeveloperWeek conference pages."""
+
+from urlparse import urljoin
+
+from scrapy.spider import Spider
+from scrapy.selector import Selector
+from scrapy.http import Request
+
+from pycon_speakers.loaders import SpeakerLoader
+
+
+class PyConSpider(Spider):
+    """Collect speaker names from the DeveloperWeek 2013 and 2014 listings.
+
+    ``start_requests`` issues one request per year and tags it with the
+    year and conference name; the year-specific callbacks then yield one
+    loaded item per matched node on the page.
+    """
+
+    name = 'developerweek.com'
+    # NOTE(review): never read in this spider and points at another site;
+    # it looks carried over from a different spider. Kept so the class
+    # attributes stay unchanged -- confirm before removing.
+    base_url = "http://confreaks.com/"
+
+    def start_requests(self):
+        """Yield one request per conference year, each with its callback."""
+        pages = (
+            ('2014',
+             "http://developerweek2014conferenceexpo.sched.org/directory/speakers",
+             self._parse_2014),
+            ('2013',
+             "http://www.developerweek.com/2013-sf/index/allspeakers",
+             self._parse_2013),
+        )
+        for year, url, callback in pages:
+            # The callbacks read both values back from response.meta.
+            meta = {'year': year, 'conference': self.name}
+            yield Request(url, meta=meta, callback=callback)
+
+    def _parse_2013(self, response):
+        """Return 2013 items, one per div.data-mid2 heading link."""
+        return self._load_speakers(
+            response, "//div[@class='data-mid2']/h2[1]/a[1]", ".")
+
+    def _parse_2014(self, response):
+        """Return 2014 items, one per div.sched-person block."""
+        return self._load_speakers(
+            response, "//div[@class='sched-person']", "./h2/a")
+
+    def _load_speakers(self, response, node_xpath, name_xpath):
+        """Yield one loaded item per node matching ``node_xpath``.
+
+        ``name_xpath`` is evaluated relative to each matched node to fill
+        the ``name`` field; ``conference`` and ``year`` are copied from the
+        request meta set in ``start_requests``.
+        """
+        conference = str(response.meta['conference'])
+        year = str(response.meta['year'])
+        for node in Selector(response).xpath(node_xpath):
+            il = SpeakerLoader(selector=node)
+            il.add_xpath('name', name_xpath)
+            il.add_value('conference', conference)
+            il.add_value('year', year)
+            yield il.load_item()