# -*- coding: utf-8 -*-
# Reconstructed from a `git format-patch` e-mail whose line breaks were lost.
#   commit:  964aad889db2c1c04294d2ee2e7afd9a876de312
#   author:  Andrés Moreira
#   date:    Mon, 14 Apr 2014 17:29:31 -0300
#   subject: [PATCH] add spider for StrangeLoop conference
#   file:    pycon_speakers/spiders/strangeloop.py (new file)
"""Scrapy spider that collects speaker names from the Strange Loop archive.

The spider starts at the conference home page, follows every link whose
``href`` contains ``/archive/`` and emits one item per speaker block found on
each archive page, tagged with the year taken from the archive URL.
"""
try:
    from urlparse import urljoin
except ImportError:
    # ``urlparse`` was renamed to ``urllib.parse`` in Python 3.
    from urllib.parse import urljoin

from scrapy.spider import Spider
from scrapy.selector import Selector
from scrapy.http import Request

from pycon_speakers.loaders import SpeakerLoader


class StrangeLoopSpider(Spider):
    """Crawl the per-year archive pages linked from the Strange Loop site."""

    name = 'strangeloop.com'
    start_urls = ['https://thestrangeloop.com/']

    def parse(self, response):
        """Yield one request per distinct archive link on the start page.

        The year is the last path segment of the link (``/archive/2013`` and
        ``/archive/2013/`` both give ``'2013'``) and is handed to
        :meth:`parse_speakers` through ``meta['year']``.

        Links whose last segment is not numeric (for example a bare
        ``/archive/`` index link) are logged and skipped, so that no speaker
        is ever emitted with an empty or bogus year.
        """
        sel = Selector(response)
        xp_links = sel.xpath("//ul/li/a[contains(@href, '/archive/')]/@href")
        # ``set`` drops repeated hrefs; ``sorted`` makes the request order
        # reproducible between runs.
        for link in sorted(set(xp_links.extract())):
            # Strip trailing slashes first: rpartition('/') on
            # '/archive/2013/' would otherwise return an empty year.
            year = link.rstrip('/').rpartition('/')[2]
            if not year.isdigit():
                self.log("Skipping archive link without a year: %s" % link)
                continue
            yield Request(urljoin(response.url, link),
                          callback=self.parse_speakers, meta={'year': year})

    def parse_speakers(self, response):
        """Yield one loaded item per speaker block on an archive page.

        A speaker block is any ``div`` whose ``class`` attribute contains the
        substring ``speaker``. The name is read from the text of the
        ``h5/a[@target='_blank']`` element inside it, and the year comes from
        ``response.meta['year']`` as set by :meth:`parse`.
        """
        sel = Selector(response)
        for speaker_div in sel.xpath("//div[contains(@class, 'speaker')]"):
            # NOTE(review): SpeakerLoader is defined in pycon_speakers.loaders;
            # any cleaning it applies to 'name'/'year' is not visible here.
            loader = SpeakerLoader(selector=speaker_div)
            loader.add_xpath('name', ".//h5/a[@target='_blank']/text()")
            loader.add_value('year', str(response.meta['year']))
            yield loader.load_item()