diff --git a/README.md b/README.md index 0dd8a59..fd78709 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,8 @@ To get started with the sprint: Other tasks: -1. Gender identification - Improve the gender identification in pycon_speakers/pipelines.py +1. Improve the gender identification in pycon_speakers/pipelines.py +2. Chart results Running the Scrapy Code diff --git a/pycon_speakers/pipelines.py b/pycon_speakers/pipelines.py index 8528636..969108c 100644 --- a/pycon_speakers/pipelines.py +++ b/pycon_speakers/pipelines.py @@ -1,19 +1,16 @@ -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html +import sexmachine.detector as gender class GenderPipeline(object): + def __init__(self): + self.detector = gender.Detector() + def process_item(self, item, spider): - item['gender'] = self._infer_gender(item) + firstname = item['name'].split()[0] + item['gender'] = self.detector.get_gender(firstname) return item - def _infer_gender(self, item): - return 'unknown' - - class DefaultsPipeline(object): """ Set default values. diff --git a/requirements.txt b/requirements.txt index df898ce..d59056f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ Scrapy +SexMachine