From 837e3c0f9a660fad49181c77ef8898a2f30b414c Mon Sep 17 00:00:00 2001 From: elrull Date: Sun, 20 Apr 2014 19:29:09 -0500 Subject: [PATCH] only speakers: added 2011 arg pycon --- pycon_speakers/spiders/ar_pycon.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pycon_speakers/spiders/ar_pycon.py b/pycon_speakers/spiders/ar_pycon.py index 3f9a1e9..0237b17 100644 --- a/pycon_speakers/spiders/ar_pycon.py +++ b/pycon_speakers/spiders/ar_pycon.py @@ -13,10 +13,10 @@ class ArPyconSpider(Spider): """ name = 'ar.pycon.org' from_year = 2011 - base_url = 'http://ar.pycon.org/{year}/stats/attendees' + base_url = 'http://ar.pycon.org/{year}/schedule/index' def start_requests(self): - current_year = date.today().year + current_year = date.today().year - 2 for year in range(self.from_year, current_year): url = self.base_url.format(year=year) yield Request(url) @@ -24,8 +24,15 @@ def start_requests(self): def parse(self, response): selector = Selector(response) year = re.search(r'/(\d+)/', response.url).group(1) + + speakers = [] + for i, bad_name in enumerate(selector.xpath('//div[@style]//span[position()=1]/text()').extract()): + if i % 2 != 0: + name = reversed([a.strip() for a in bad_name.split(",")]) + speaker = " ".join(name) + speakers.append(speaker) + return [Speaker(name=speaker, conference=self.name, year=year) - for speaker in selector.xpath('//table[position()>1]' - '//tr[position()>1]//td[position()=1]//text()').extract()] + for speaker in speakers]