diff --git a/tutorial/scrapy.cfg b/tutorial/scrapy.cfg
new file mode 100644
index 0000000..d79f6f1
--- /dev/null
+++ b/tutorial/scrapy.cfg
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = tutorial.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = tutorial
diff --git a/tutorial/start.py b/tutorial/start.py
new file mode 100644
index 0000000..9ff995f
--- /dev/null
+++ b/tutorial/start.py
@@ -0,0 +1,5 @@
+from scrapy import cmdline
+
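+# Convenience launcher for starting the spider from an IDE; equivalent to
+# executing "scrapy crawl tutorial_spider" in the project root.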
+cmdline.execute("scrapy crawl tutorial_spider".split())
\ No newline at end of file
diff --git a/tutorial/tutorial/__init__.py b/tutorial/tutorial/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tutorial/tutorial/__pycache__/__init__.cpython-37.pyc b/tutorial/tutorial/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..74a4f23
Binary files /dev/null and b/tutorial/tutorial/__pycache__/__init__.cpython-37.pyc differ
diff --git a/tutorial/tutorial/__pycache__/items.cpython-37.pyc b/tutorial/tutorial/__pycache__/items.cpython-37.pyc
new file mode 100644
index 0000000..2f020db
Binary files /dev/null and b/tutorial/tutorial/__pycache__/items.cpython-37.pyc differ
diff --git a/tutorial/tutorial/__pycache__/pipelines.cpython-37.pyc b/tutorial/tutorial/__pycache__/pipelines.cpython-37.pyc
new file mode 100644
index 0000000..32c76bb
Binary files /dev/null and b/tutorial/tutorial/__pycache__/pipelines.cpython-37.pyc differ
diff --git a/tutorial/tutorial/__pycache__/settings.cpython-37.pyc b/tutorial/tutorial/__pycache__/settings.cpython-37.pyc
new file mode 100644
index 0000000..11d55a8
Binary files /dev/null and b/tutorial/tutorial/__pycache__/settings.cpython-37.pyc differ
diff --git a/tutorial/tutorial/items.py b/tutorial/tutorial/items.py
new file mode 100644
index 0000000..641b595
--- /dev/null
+++ b/tutorial/tutorial/items.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class TutorialItem(scrapy.Item):
+ # define the fields for your item here like:
+ # name = scrapy.Field()
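+    # Fields populated by the spider: the chapter title (from the page's
+    # "bookname" block) and the chapter body (from its "content" block).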
+ bookname = scrapy.Field()
+ content = scrapy.Field()
diff --git a/tutorial/tutorial/middlewares.py b/tutorial/tutorial/middlewares.py
new file mode 100644
index 0000000..3366f46
--- /dev/null
+++ b/tutorial/tutorial/middlewares.py
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
+class TutorialSpiderMiddleware(object):
+ # Not all methods need to be defined. If a method is not defined,
+ # scrapy acts as if the spider middleware does not modify the
+ # passed objects.
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ # This method is used by Scrapy to create your spiders.
+ s = cls()
+ crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+ return s
+
+ def process_spider_input(self, response, spider):
+ # Called for each response that goes through the spider
+ # middleware and into the spider.
+
+ # Should return None or raise an exception.
+ return None
+
+ def process_spider_output(self, response, result, spider):
+ # Called with the results returned from the Spider, after
+ # it has processed the response.
+
+ # Must return an iterable of Request, dict or Item objects.
+ for i in result:
+ yield i
+
+ def process_spider_exception(self, response, exception, spider):
+ # Called when a spider or process_spider_input() method
+ # (from other spider middleware) raises an exception.
+
+ # Should return either None or an iterable of Request, dict
+ # or Item objects.
+ pass
+
+ def process_start_requests(self, start_requests, spider):
+ # Called with the start requests of the spider, and works
+ # similarly to the process_spider_output() method, except
+ # that it doesn’t have a response associated.
+
+ # Must return only requests (not items).
+ for r in start_requests:
+ yield r
+
+ def spider_opened(self, spider):
+ spider.logger.info('Spider opened: %s' % spider.name)
+
+
+class TutorialDownloaderMiddleware(object):
+ # Not all methods need to be defined. If a method is not defined,
+ # scrapy acts as if the downloader middleware does not modify the
+ # passed objects.
+
+ @classmethod
+ def from_crawler(cls, crawler):
+ # This method is used by Scrapy to create your spiders.
+ s = cls()
+ crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+ return s
+
+ def process_request(self, request, spider):
+ # Called for each request that goes through the downloader
+ # middleware.
+
+ # Must either:
+ # - return None: continue processing this request
+ # - or return a Response object
+ # - or return a Request object
+ # - or raise IgnoreRequest: process_exception() methods of
+ # installed downloader middleware will be called
+ return None
+
+ def process_response(self, request, response, spider):
+ # Called with the response returned from the downloader.
+
+    # Must either:
+ # - return a Response object
+ # - return a Request object
+ # - or raise IgnoreRequest
+ return response
+
+ def process_exception(self, request, exception, spider):
+ # Called when a download handler or a process_request()
+ # (from other downloader middleware) raises an exception.
+
+ # Must either:
+ # - return None: continue processing this exception
+ # - return a Response object: stops process_exception() chain
+ # - return a Request object: stops process_exception() chain
+ pass
+
+ def spider_opened(self, spider):
+ spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/tutorial/tutorial/pipelines.py b/tutorial/tutorial/pipelines.py
new file mode 100644
index 0000000..c1ad279
--- /dev/null
+++ b/tutorial/tutorial/pipelines.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+import json
+
+class TutorialPipeline(object):
+
+    def __init__(self):
+        self.fp = open("xiaoshuo.json", 'w', encoding='UTF-8')
+
+    def open_spider(self, spider):
+        print('Spider opened!')
+
+    def process_item(self, item, spider):
+        # Write each item as one JSON object per line (JSON Lines format).
+        item_json = json.dumps(dict(item), ensure_ascii=False)
+        self.fp.write(item_json + '\n')
+        return item
+
+    def close_spider(self, spider):
+        # Scrapy passes the spider to this hook; without the extra argument
+        # the call raises a TypeError. Closing the file flushes the output.
+        print('Spider closed')
+        self.fp.close()
\ No newline at end of file
diff --git a/tutorial/tutorial/settings.py b/tutorial/tutorial/settings.py
new file mode 100644
index 0000000..74ab99b
--- /dev/null
+++ b/tutorial/tutorial/settings.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for tutorial project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used. You can find more settings consulting the documentation:
+#
+# https://docs.scrapy.org/en/latest/topics/settings.html
+# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'tutorial'
+
+SPIDER_MODULES = ['tutorial.spiders']
+NEWSPIDER_MODULE = 'tutorial.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'tutorial (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = False
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
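+# A browser-style User-Agent is sent with every request so the target site is
+# less likely to reject the default Scrapy client string.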
+DEFAULT_REQUEST_HEADERS = {
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+ 'Accept-Language': 'en',
+    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
+}
+
+# Enable or disable spider middlewares
+# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+# 'tutorial.middlewares.TutorialSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+# 'tutorial.middlewares.TutorialDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See https://docs.scrapy.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+# 'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ITEM_PIPELINES = {
+ 'tutorial.pipelines.TutorialPipeline': 300,
+}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/tutorial/tutorial/spiders/__init__.py b/tutorial/tutorial/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/tutorial/tutorial/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/tutorial/tutorial/spiders/__pycache__/__init__.cpython-37.pyc b/tutorial/tutorial/spiders/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000..79354ca
Binary files /dev/null and b/tutorial/tutorial/spiders/__pycache__/__init__.cpython-37.pyc differ
diff --git a/tutorial/tutorial/spiders/__pycache__/tutorial_spider.cpython-37.pyc b/tutorial/tutorial/spiders/__pycache__/tutorial_spider.cpython-37.pyc
new file mode 100644
index 0000000..0e2456f
Binary files /dev/null and b/tutorial/tutorial/spiders/__pycache__/tutorial_spider.cpython-37.pyc differ
diff --git a/tutorial/tutorial/spiders/tutorial_spider.py b/tutorial/tutorial/spiders/tutorial_spider.py
new file mode 100644
index 0000000..0910f07
--- /dev/null
+++ b/tutorial/tutorial/spiders/tutorial_spider.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+import scrapy
+# Example chapter page: https://www.zwdu.com/book/25435/7774888.html
+from tutorial.items import TutorialItem
+
+
+class TutorialSpiderSpider(scrapy.Spider):
+    name = 'tutorial_spider'
+    # allowed_domains expects bare domain names, not full URLs.
+    allowed_domains = ['www.zwdu.com']
+    start_urls = ['https://www.zwdu.com/book/25435/7774879.html']
+
+    def parse(self, response):
+        wrappers = response.xpath("//div[@id='wrapper']/div")
+        for wrapper in wrappers:
+            # Chapter title: join every text node under the "bookname" block.
+            bookname = wrapper.xpath(".//div[@class='bookname']//text()").getall()
+            bookname = "".join(bookname).strip()
+
+            # Chapter body: join every text node under the "content" block.
+            content = wrapper.xpath(".//div[@id='content']//text()").getall()
+            content = "".join(content).strip()
+
+            item = TutorialItem(bookname=bookname, content=content)
+            yield item
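+
+        # A possible extension (sketch only, not part of this commit): follow
+        # the "next chapter" (下一章) link so the spider walks the whole book.
+        # The link-text XPath below is an assumption about the page markup;
+        # response.follow resolves the relative href against the current URL.
+        #
+        # next_href = response.xpath("//a[contains(text(), '下一章')]/@href").get()
+        # if next_href:
+        #     yield response.follow(next_href, callback=self.parse)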
diff --git a/tutorial/xiaoshuo.json b/tutorial/xiaoshuo.json
new file mode 100644
index 0000000..70cb46d
--- /dev/null
+++ b/tutorial/xiaoshuo.json
@@ -0,0 +1,4 @@
+{"bookname": "", "content": ""}
+{"bookname": "", "content": ""}
+{"bookname": "第一章 战神重生\r\n\t\t\t\t\r\n\t\t\t\t\t上一章 ← 章节列表 → 下一章 加入书签\r\n\t\t\t\t\r\n\t\t\t\t推荐阅读:铁血女兵气动乾坤至尊兵王魔君蛇妻:爱妃,别闹至尊少年王无敌剑域极品古医传人超级医生贫道有礼仙家有田", "content": "“没想到,我聂天居然重生了!”房间之中,聂天身躯剧烈颤抖,眼神之中充斥着压抑的愤怒。八一中文 W≤W≈W=.≈8≠1≥Z≥W≈.≤C≥OM 他的心中,惊涛骇浪,过往种种在脑海之中飞驰而过。 聂天本是天界第一战神,晨昏神域大半疆域都是他一手打下。 赫赫威名,震慑神域! 为了封赏他的绝世战功,晨昏大帝将掌上明珠紫烟公主许配于他。 洛紫烟,晨昏神域第一美女,风采绝世,倾国倾城。 配上聂天这天界第一战神,堪称天造地设。 但聂天怎么也想不到,洛紫烟竟会在洞房之夜对他出手。 堂堂天界第一战神,竟死在未婚妻的手上,还死在了洞房之夜,真是天大的笑话! “她为何杀我?难道传言是真的?晨昏大帝将洛紫烟许配于我,本来就是一个阴谋,就是为了要杀我。”聂天眼神凌冽,心中惊涛骇浪。 功高震主,历来都是臣子大忌。 聂天声望在晨昏神域,远胜晨昏大帝,后者想杀他,亦在情理之中。 “好一个晨昏大帝,好一个洛紫烟,你们父女好狠的心!我聂天为晨昏神域打下大片疆土,更视洛紫烟为毕生挚爱,没想到最后竟死在你们父女手上。”聂天双目赤红,全身颤抖。 良久,聂天稍稍镇定,眼中闪现一抹精芒,突然狂笑一声:“也罢!既然上苍让我聂天重生一回,我聂天再不做别人的殿下之臣。” “这一世,我要创造我的世界!” “这一世,我要成为万古天帝!” “这一世,我要主宰天界神域!” 豪言壮语,振聋聩,聂天整个人锋芒毕露,好似一把出鞘利剑! 重生一次,聂天信心满满,但当他看到自己的这副身躯,却是苦笑一声,自嘲道:“现在的这副身体,实在弱了一些。” 死在洛紫烟手中,聂天再次醒来,已是百年之后。 他的灵魂重生在已经病死的少年身上。巧合的是,这个少年也叫聂天。 此时的聂天,乃是墨阳城三大家族之一聂家的家主。 但是他这个家主,在家族之中却连一个体面的下人都不如。 就连他死在房间,都没人知道。 究其原因,就是因为他是一个元脉尽毁的废人。 三年前,聂天还是墨阳城第一天才,年仅十三岁,实力达到元脉九重,堪称妖孽。 但是三年前的一天,聂天和父亲及多位族人进入裂云山脉,进行历练,却遭遇一群黑衣人的伏击,结果父亲和族人全部被杀,只有聂天一人拼死逃出,但却元脉尽毁,成了废人。 父亲死后,他继任家主。但是在所有人眼中,他这个家主,屁都不是。 元脉尽毁,聂天开始自暴自弃,自甘堕落,每天借酒消愁,流连风月之地。 就在昨天,他被墨阳城三大家族之一巴家的大少爷巴子阳,打得重伤昏死。 抬回聂府之后,今天早上就咽气了。这也就给了战神聂天附身的机会。 “元脉尽毁吗?”聂天稍稍镇定,开始检查自己的新身体。 “毒!”聂天内视元脉,惊愕现,他的元脉除了损伤严重之外,竟然还呈现污黑之色。 “我是被毒死的!”聂天脑海之中出现一张面孔,聂家大执事,聂三通。 在聂天受伤期间,只有聂三通看过他,给他服下了一枚“恢复伤势”的固元丹。 “好一个聂三通,定是觊觎家主之位,谋害于我。”聂天马上明白了,双瞳之中浮现一抹森然寒光。 “嗯?”聂天继续内视身体,脸色唰地一变,惊骇道:“星辰原石!居然跟着我一起重生了!” “家主,大事不好了!”就在这个时候,一道身影夺门而入,惊慌大叫。 “阿牛,生什么事了?”聂天看着来人,淡淡问道。 阿牛,聂天的仆从,也是整个聂家唯一一个把他当家主的人。 “家主,巴,巴家的人来逼婚了!”喘着粗气,阿牛着急说道。 “巴家!”聂天微微皱眉,想起自己就是被巴家大少爷巴子阳打伤,脸色顿时变得阴沉起来。 巴家,和聂家一样,墨阳城三大家族之一。 不过自从三年前聂天父亲死后,聂家的声望一天不如一天,到了今日,已经是大厦将倾。 正因为这样,巴家大少爷巴子阳才敢把聂天这个巴家家主打得重伤昏死。 “阿牛,你不要着急,逼婚到底是怎么回事?”聂天并不慌张,反倒玩味一笑。 阿牛愣了一下,一脸古怪地看着聂天。 这还是家主吗?怎么这么镇定? 阿牛隐隐感觉聂天变了,和以前不一样了,却又说不出哪里不一样。 “快说啊。”聂天见阿牛愣,催促一声。 “哎!是!”阿牛反应过来,赶紧说道:“巴家的管家带着巴家大少年和三少爷来我们府上提亲了,而且是向最有天赋的九小姐提亲。” “九妹!”聂天脑海中浮现一张粉雕玉琢,乖巧可爱的脸蛋。 聂家是大家族,人口多,同辈之间,直接按年龄排序。 九妹,就是聂家年轻一代年龄第九的女孩。 “九妹好像叫聂雨柔吧。”聂天记得,上次见九妹,还是在三年之前,那时的聂雨柔还是一个六岁的小姑娘。 现在想来,也该有九岁了。 “九岁?!”聂天惊叫一声。 谁会向一个九岁的小女孩提亲? “巴家给谁提亲?”聂天脸色一沉,眼神闪过一抹狠辣。 向一个九岁的小孩提亲,巴家的人简直丧心病狂。 先是打伤聂天,然后又上门逼婚,巴家的人真是嚣张到姥姥家了。 “巴家三少爷巴子星。”阿牛回答。 “巴子星!”聂天脸色更加阴沉,沉声道:“如果我没记错,巴子星是个傻子吧。” “嗯。”阿牛看着聂天,咽了一下口水,重重点头。 聂天确实没有记错,巴子星的确是一个傻子,而且还是他亲手打傻的。 三年前的聂天,风头正劲,墨阳城武会之上,巴子星不服气,向他挑战,结果被打成了傻子。 为此事,聂家和巴家差一点血拼。 现在,巴家居然替巴子星向聂雨柔提亲,明显是欺负聂家势弱,想要报以前的耻辱。 聂雨柔是聂家新一代天才,刚刚九岁,已经是元脉四重,天赋直追当年的聂天。 若是聂雨柔嫁给了巴子星,聂家绝对会沦为墨阳城的笑柄,而且还将失去一位少年天才。 “不行!绝对不能让这种事情生!”聂天一脸肃杀,低吼道:“带路,我要去议事大堂!” “在我的头上拉屎,还管我要纸。巴家,今天我要让你们把自己拉的屎,吃回去!”聂天心中,霸道怒吼。"}
+{"bookname": "", "content": ""}