Skip to content

Commit

Permalink
speakers: added conferences from 2009 to 2013, 2013 can be improved
Browse files (browse the repository at this point in the history)
  • Loading branch information
eLRuLL committed Apr 21, 2014
1 parent 837e3c0 commit f4e8670
Showing 1 changed file with 8 additions and 12 deletions.
20 changes: 8 additions & 12 deletions pycon_speakers/spiders/ar_pycon.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -12,27 +12,23 @@ class ArPyconSpider(Spider):
"""A spider to crawl Argentinian Pycon conference speakers.
"""
name = 'ar.pycon.org'
from_year = 2011
base_url = 'http://ar.pycon.org/{year}/schedule/index'
from_year = 2009
base_url = 'http://ar.pycon.org/{year}/activity/speakers'

def start_requests(self):
current_year = date.today().year - 2
current_year = date.today().year
for year in range(self.from_year, current_year):
url = self.base_url.format(year=year)
yield Request(url)

def parse(self, response):
selector = Selector(response)
year = re.search(r'/(\d+)/', response.url).group(1)

speakers = []
for i, bad_name in enumerate(selector.xpath('//div[@style]//span[position()=1]/text()').extract()):
if i % 2 != 0:
name = reversed([a.strip() for a in bad_name.split(",")])
speaker = " ".join(name)
speakers.append(speaker)
year = re.search('/(\d+)/', response.url).group(1)

return [Speaker(name=speaker,
conference=self.name,
year=year)
for speaker in speakers]
for speaker in selector.xpath(
'//div[@id="content"]/h2/text() | '
'(//div[@id="content"]/div[@class="frame"] | '
'//div[@class="frame"])/h3/text()').extract()]

0 comments on commit f4e8670

Please sign in to comment.