From 06cf42442be13ff6cb12a9bfbfb1eb92cc6d2750 Mon Sep 17 00:00:00 2001
From: Shane Evans
Date: Mon, 14 Apr 2014 18:34:41 -0400
Subject: [PATCH] Add spider for next day video

---
 pycon_speakers/spiders/nextdayvideo.py | 49 ++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 pycon_speakers/spiders/nextdayvideo.py

diff --git a/pycon_speakers/spiders/nextdayvideo.py b/pycon_speakers/spiders/nextdayvideo.py
new file mode 100644
--- /dev/null
+++ b/pycon_speakers/spiders/nextdayvideo.py
@@ -0,0 +1,49 @@
+import json
+import re
+
+from scrapy.spider import Spider
+
+from pycon_speakers.items import Speaker
+
+
+class NextDayVideoSpider(Spider):
+    """Collect speaker names from the Next Day Video (veyepar) JSON API.
+
+    The start URL returns a JSON list of conferences.  Each conference has a
+    ``show_set`` list and each show an ``episode_set`` list; one ``Speaker``
+    item is yielded for every episode that has a non-empty ``authors`` value.
+    """
+
+    name = 'nextdayvideo.com'
+    start_urls = ['http://veyepar.nextdayvideo.com/api/csp/?format=json']
+
+    # The year is taken from the first "20xx" run of digits in the show name.
+    _year_re = re.compile(r'20\d\d')
+
+    def parse(self, response):
+        """Yield one ``Speaker`` per episode that names its authors.
+
+        Shows whose name contains no "20xx" year are logged and skipped.
+        """
+        for conference in json.loads(response.body_as_unicode()):
+            conference_name = conference['name']
+            for show_set in conference['show_set']:
+                set_name = show_set['name']
+                year_match = self._year_re.search(set_name)
+                if not year_match:
+                    self.log("skipping %s, set %s: missing year" %
+                             (conference_name, set_name))
+                    continue
+                year = year_match.group()
+                for episode in show_set['episode_set']:
+                    authors = episode.get('authors')
+                    if not authors:
+                        # lightning talks, panels, etc.
+                        continue
+                    # NOTE(review): authors is passed through verbatim --
+                    # confirm whether one value can list several people.
+                    yield Speaker(
+                        name=authors,
+                        conference=conference_name,
+                        year=year
+                    )