scrapy爬虫 (Scrapy crawler)
HungYann committed Feb 22, 2020
1 parent 9011f9c commit be9c61e
Showing 21 changed files with 679 additions and 0 deletions.
4 changes: 4 additions & 0 deletions README.md
@@ -3,6 +3,10 @@ I will upload my python project into the repository. Hopefully, this way will hel



tutorial:

Scrapy crawler techniques (scrapy爬虫技术): [video tutorial](https://www.bilibili.com/video/av57909837?p=5)



## Website:
4 changes: 4 additions & 0 deletions tutorial/.idea/misc.xml


8 changes: 8 additions & 0 deletions tutorial/.idea/modules.xml


11 changes: 11 additions & 0 deletions tutorial/.idea/tutorial.iml


368 changes: 368 additions & 0 deletions tutorial/.idea/workspace.xml


11 changes: 11 additions & 0 deletions tutorial/scrapy.cfg
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = tutorial.settings

[deploy]
#url = http://localhost:6800/
project = tutorial
3 changes: 3 additions & 0 deletions tutorial/start.py
@@ -0,0 +1,3 @@
from scrapy import cmdline

# Launch the spider from a script; equivalent to running
# `scrapy crawl tutorial_spider` from the project directory.
cmdline.execute("scrapy crawl tutorial_spider".split())
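A side note on start.py: scrapy.cmdline.execute hands the argument list to Scrapy's command-line entry point and exits the interpreter when the crawl finishes, so nothing after that call runs. A minimal alternative sketch, assuming the script is run from the project root so get_project_settings() can locate scrapy.cfg:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load settings.py and run the spider in-process; start() blocks until done.
process = CrawlerProcess(get_project_settings())
process.crawl('tutorial_spider')
process.start()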
Empty file added tutorial/tutorial/__init__.py
Binary file added tutorial/tutorial/__pycache__/items.cpython-37.pyc
(Other compiled .pyc binaries added; contents not rendered.)
15 changes: 15 additions & 0 deletions tutorial/tutorial/items.py
@@ -0,0 +1,15 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class TutorialItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    bookname = scrapy.Field()
    content = scrapy.Field()
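For context, scrapy.Item subclasses act like dicts restricted to their declared fields, which is what lets the pipeline below call dict(item). A quick sketch with placeholder values:

from tutorial.items import TutorialItem

item = TutorialItem(bookname='Chapter 1', content='...')
print(item['bookname'])  # dict-style field access
print(dict(item))        # plain dict, ready for json.dumps
# item['author'] = 'x' would raise KeyError: only declared fields are allowed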
103 changes: 103 additions & 0 deletions tutorial/tutorial/middlewares.py
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class TutorialSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class TutorialDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
26 changes: 26 additions & 0 deletions tutorial/tutorial/pipelines.py
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import json


class TutorialPipeline(object):

    def __init__(self):
        # One output file per crawl, opened when the pipeline is instantiated.
        self.fp = open("xiaoshuo.json", 'w', encoding='UTF-8')

    def open_spider(self, spider):
        print('Spider started!')  # 爬虫开始!

    def process_item(self, item, spider):
        # Serialize each item as one JSON line; ensure_ascii=False keeps
        # the Chinese text readable in the output file.
        item_json = json.dumps(dict(item), ensure_ascii=False)
        self.fp.write(item_json + '\n')
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider(spider), so the spider argument is required.
        print('Spider finished')  # 爬虫结束
        self.fp.close()
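Side note: Scrapy's built-in feed exports can produce the same JSON-lines output without a custom pipeline. A sketch of the equivalent, assuming stock behavior where the .jl extension selects the jsonlines exporter:

# settings.py — keep Chinese text unescaped, mirroring ensure_ascii=False
FEED_EXPORT_ENCODING = 'utf-8'

# then, instead of the pipeline:
#   scrapy crawl tutorial_spider -o xiaoshuo.jl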
92 changes: 92 additions & 0 deletions tutorial/tutorial/settings.py
@@ -0,0 +1,92 @@
# -*- coding: utf-8 -*-

# Scrapy settings for tutorial project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'tutorial'

SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tutorial (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'tutorial.middlewares.TutorialSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'tutorial.middlewares.TutorialDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'tutorial.pipelines.TutorialPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
4 changes: 4 additions & 0 deletions tutorial/tutorial/spiders/__init__.py
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
(Two more binary files added; contents not rendered.)
26 changes: 26 additions & 0 deletions tutorial/tutorial/spiders/tutorial_spider.py
@@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
import scrapy
# Example chapter page: https://www.zwdu.com/book/25435/7774888.html
from tutorial.items import TutorialItem


class TutorialSpiderSpider(scrapy.Spider):
    name = 'tutorial_spider'
    # allowed_domains expects bare domain names, not URLs; Scrapy warns about
    # URL entries and follow-up requests may be filtered as offsite.
    allowed_domains = ['www.zwdu.com']
    start_urls = ['https://www.zwdu.com/book/25435/7774879.html']

    def parse(self, response):
        wrappers = response.xpath("//div[@id='wrapper']/div")
        for wrapper in wrappers:
            bookname = wrapper.xpath(".//div[@class='bookname']//text()").getall()
            bookname = "".join(bookname).strip()

            content = wrapper.xpath(".//div[@id='content']//text()").getall()
            content = "".join(content).strip()

            # Most wrapper divs contain neither element; skip them so the
            # output file is not padded with empty records.
            if not bookname and not content:
                continue

            yield TutorialItem(bookname=bookname, content=content)
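As committed, the spider scrapes only the single start page. A sketch of how parse() could follow chapter links; the XPath for the “下一章” (next chapter) anchor is an assumption about the site's markup, not something this commit confirms:

        # hypothetical continuation at the end of parse():
        next_href = response.xpath("//a[contains(text(), '下一章')]/@href").get()
        if next_href:
            # response.follow resolves relative URLs and keeps parse() as the callback
            yield response.follow(next_href, callback=self.parse)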
4 changes: 4 additions & 0 deletions tutorial/xiaoshuo.json
@@ -0,0 +1,4 @@
{"bookname": "", "content": ""}
{"bookname": "", "content": ""}
{"bookname": "第一章 战神重生\r\n\t\t\t\t\r\n\t\t\t\t\t上一章 ← 章节列表 → 下一章 加入书签\r\n\t\t\t\t\r\n\t\t\t\t推荐阅读:铁血女兵气动乾坤至尊兵王魔君蛇妻:爱妃,别闹至尊少年王无敌剑域极品古医传人超级医生贫道有礼仙家有田", "content": "“没想到,我聂天居然重生了!”房间之中,聂天身躯剧烈颤抖,眼神之中充斥着压抑的愤怒。八一中文 W≤W≈W=.≈8≠1≥Z≥W≈.≤C≥OM    他的心中,惊涛骇浪,过往种种在脑海之中飞驰而过。    聂天本是天界第一战神,晨昏神域大半疆域都是他一手打下。    赫赫威名,震慑神域!    为了封赏他的绝世战功,晨昏大帝将掌上明珠紫烟公主许配于他。    洛紫烟,晨昏神域第一美女,风采绝世,倾国倾城。    配上聂天这天界第一战神,堪称天造地设。    但聂天怎么也想不到,洛紫烟竟会在洞房之夜对他出手。    堂堂天界第一战神,竟死在未婚妻的手上,还死在了洞房之夜,真是天大的笑话!    “她为何杀我?难道传言是真的?晨昏大帝将洛紫烟许配于我,本来就是一个阴谋,就是为了要杀我。”聂天眼神凌冽,心中惊涛骇浪。    功高震主,历来都是臣子大忌。    聂天声望在晨昏神域,远胜晨昏大帝,后者想杀他,亦在情理之中。    “好一个晨昏大帝,好一个洛紫烟,你们父女好狠的心!我聂天为晨昏神域打下大片疆土,更视洛紫烟为毕生挚爱,没想到最后竟死在你们父女手上。”聂天双目赤红,全身颤抖。    良久,聂天稍稍镇定,眼中闪现一抹精芒,突然狂笑一声:“也罢!既然上苍让我聂天重生一回,我聂天再不做别人的殿下之臣。”    “这一世,我要创造我的世界!”    “这一世,我要成为万古天帝!”    “这一世,我要主宰天界神域!”    豪言壮语,振聋聩,聂天整个人锋芒毕露,好似一把出鞘利剑!    重生一次,聂天信心满满,但当他看到自己的这副身躯,却是苦笑一声,自嘲道:“现在的这副身体,实在弱了一些。”    死在洛紫烟手中,聂天再次醒来,已是百年之后。    他的灵魂重生在已经病死的少年身上。巧合的是,这个少年也叫聂天。    此时的聂天,乃是墨阳城三大家族之一聂家的家主。    但是他这个家主,在家族之中却连一个体面的下人都不如。    就连他死在房间,都没人知道。    究其原因,就是因为他是一个元脉尽毁的废人。    三年前,聂天还是墨阳城第一天才,年仅十三岁,实力达到元脉九重,堪称妖孽。    但是三年前的一天,聂天和父亲及多位族人进入裂云山脉,进行历练,却遭遇一群黑衣人的伏击,结果父亲和族人全部被杀,只有聂天一人拼死逃出,但却元脉尽毁,成了废人。    父亲死后,他继任家主。但是在所有人眼中,他这个家主,屁都不是。    元脉尽毁,聂天开始自暴自弃,自甘堕落,每天借酒消愁,流连风月之地。    就在昨天,他被墨阳城三大家族之一巴家的大少爷巴子阳,打得重伤昏死。    抬回聂府之后,今天早上就咽气了。这也就给了战神聂天附身的机会。    “元脉尽毁吗?”聂天稍稍镇定,开始检查自己的新身体。    “毒!”聂天内视元脉,惊愕现,他的元脉除了损伤严重之外,竟然还呈现污黑之色。    “我是被毒死的!”聂天脑海之中出现一张面孔,聂家大执事,聂三通。    在聂天受伤期间,只有聂三通看过他,给他服下了一枚“恢复伤势”的固元丹。    “好一个聂三通,定是觊觎家主之位,谋害于我。”聂天马上明白了,双瞳之中浮现一抹森然寒光。    “嗯?”聂天继续内视身体,脸色唰地一变,惊骇道:“星辰原石!居然跟着我一起重生了!”    “家主,大事不好了!”就在这个时候,一道身影夺门而入,惊慌大叫。    “阿牛,生什么事了?”聂天看着来人,淡淡问道。    阿牛,聂天的仆从,也是整个聂家唯一一个把他当家主的人。    “家主,巴,巴家的人来逼婚了!”喘着粗气,阿牛着急说道。    “巴家!”聂天微微皱眉,想起自己就是被巴家大少爷巴子阳打伤,脸色顿时变得阴沉起来。    巴家,和聂家一样,墨阳城三大家族之一。    不过自从三年前聂天父亲死后,聂家的声望一天不如一天,到了今日,已经是大厦将倾。    正因为这样,巴家大少爷巴子阳才敢把聂天这个巴家家主打得重伤昏死。    “阿牛,你不要着急,逼婚到底是怎么回事?”聂天并不慌张,反倒玩味一笑。    阿牛愣了一下,一脸古怪地看着聂天。    这还是家主吗?怎么这么镇定?    阿牛隐隐感觉聂天变了,和以前不一样了,却又说不出哪里不一样。    “快说啊。”聂天见阿牛愣,催促一声。    “哎!是!”阿牛反应过来,赶紧说道:“巴家的管家带着巴家大少年和三少爷来我们府上提亲了,而且是向最有天赋的九小姐提亲。”    “九妹!”聂天脑海中浮现一张粉雕玉琢,乖巧可爱的脸蛋。    聂家是大家族,人口多,同辈之间,直接按年龄排序。    九妹,就是聂家年轻一代年龄第九的女孩。    “九妹好像叫聂雨柔吧。”聂天记得,上次见九妹,还是在三年之前,那时的聂雨柔还是一个六岁的小姑娘。    现在想来,也该有九岁了。    “九岁?!”聂天惊叫一声。    谁会向一个九岁的小女孩提亲?    “巴家给谁提亲?”聂天脸色一沉,眼神闪过一抹狠辣。    向一个九岁的小孩提亲,巴家的人简直丧心病狂。    先是打伤聂天,然后又上门逼婚,巴家的人真是嚣张到姥姥家了。    “巴家三少爷巴子星。”阿牛回答。    “巴子星!”聂天脸色更加阴沉,沉声道:“如果我没记错,巴子星是个傻子吧。”    “嗯。”阿牛看着聂天,咽了一下口水,重重点头。    聂天确实没有记错,巴子星的确是一个傻子,而且还是他亲手打傻的。    三年前的聂天,风头正劲,墨阳城武会之上,巴子星不服气,向他挑战,结果被打成了傻子。    为此事,聂家和巴家差一点血拼。    现在,巴家居然替巴子星向聂雨柔提亲,明显是欺负聂家势弱,想要报以前的耻辱。    聂雨柔是聂家新一代天才,刚刚九岁,已经是元脉四重,天赋直追当年的聂天。    若是聂雨柔嫁给了巴子星,聂家绝对会沦为墨阳城的笑柄,而且还将失去一位少年天才。    “不行!绝对不能让这种事情生!”聂天一脸肃杀,低吼道:“带路,我要去议事大堂!”    “在我的头上拉屎,还管我要纸。巴家,今天我要让你们把自己拉的屎,吃回去!”聂天心中,霸道怒吼。"}
{"bookname": "", "content": ""}
