
Commit

Update project code
JoeanAmier committed Dec 6, 2023
1 parent 395d81c commit 413168f
Showing 7 changed files with 92 additions and 84 deletions.
41 changes: 25 additions & 16 deletions README.md
@@ -11,10 +11,11 @@
<h1>📑 Feature List</h1>
<ul>
<li>✅ Collect Xiaohongshu image/video post information</li>
<li>✅ Get download links for Xiaohongshu image/video post files</li>
<li>✅ Extract download links for Xiaohongshu image/video post files</li>
<li>✅ Download watermark-free Xiaohongshu image/video post files</li>
<li>✅ Automatically skip post files that have already been downloaded</li>
<li>✅ Integrity handling mechanism for post files</li>
<li>☑️ Save collected post information to a file</li>
</ul>
<h1>📸 Program Screenshots</h1>
<br>
@@ -25,7 +26,7 @@
<li><code>https://www.xiaohongshu.com/discovery/item/post ID</code></li>
<li><code>https://xhslink.com/share code</code></li>
<br/>
<p><b>You can enter multiple post links at once, separated by spaces.</b></p>
<p><b>Entering multiple post links at once is supported; separate the links with spaces.</b></p>
</ul>
<h1>🪟 About the Terminal</h1>
<p>⭐ It is recommended to run the program in <a href="https://learn.microsoft.com/zh-cn/windows/terminal/install">Windows Terminal</a> (the default terminal bundled with Windows 11) for the best display experience!</p>
@@ -41,32 +42,34 @@
<li>Run <code>main.py</code> to use the program</li>
</ol>
<h2>💻 Secondary Development</h2>
<p>If you want to get Xiaohongshu image/video post information, you can call the code by following the comment hints in <code>main.py</code>.</p>
<p>If you need to get Xiaohongshu image/video post information, you can call the code by following the comment hints in <code>main.py</code>.</p>
<pre>
# Test links
error_demo = "https://github.com/JoeanAmier/XHS_Downloader"
image_demo = "https://www.xiaohongshu.com/explore/63b275a30000000019020185"
video_demo = "https://www.xiaohongshu.com/explore/64edb460000000001f03cadc"
multiple_demo = f"{image_demo} {video_demo}"
# Create the instance
path = "D:\\"  # Root path for saving downloaded posts; default: current directory
path = ""  # Root path for saving downloaded posts; default: current directory
folder = "Download"  # Name of the download folder (created automatically); default: Download
proxies = None  # Network proxy
user_agent = ""  # Request header User-Agent
proxy = None  # Network proxy
timeout = 5  # Network request timeout limit; default: 10
chunk = 1024 * 1024  # Size in bytes of each data block fetched from the server when downloading files
# with XHS() as xhs:
# async with XHS() as xhs:
# pass  # Use default parameters
with XHS(path=path,
folder=folder,
proxies=proxies,
timeout=timeout,
chunk=chunk) as xhs:  # Use custom parameters
async with XHS(path=path,
folder=folder,
user_agent=user_agent,
proxy=proxy,
timeout=timeout,
chunk=chunk) as xhs:  # Use custom parameters
download = True  # Whether to download post files; default: False
# Returns detailed post information, including download links
print(xhs.extract(error_demo))  # Returns an empty dict when data retrieval fails
print(xhs.extract(image_demo, download=download))
print(xhs.extract(video_demo, download=download))
print(xhs.extract(multiple_demo, download=download))
print(await xhs.extract(error_demo, download=download))  # Returns an empty dict when data retrieval fails
print(await xhs.extract(image_demo, download=download))
print(await xhs.extract(video_demo, download=download))
print(await xhs.extract(multiple_demo, download=download))  # Passing multiple post links is supported
</pre>
<h1>⚙️ Configuration File</h1>
<p>The <code>settings.json</code> file in the project root directory is generated automatically on the first run and allows some runtime parameters to be customized.</p>
@@ -83,7 +86,7 @@ with XHS(path=path,
<tr>
<td align="center">path</td>
<td align="center">str</td>
<td align="center">作品文件储存根路径</td>
<td align="center">作品数据 / 文件保存根路径</td>
<td align="center">项目根路径</td>
</tr>
<tr>
@@ -93,6 +96,12 @@
<td align="center">Download</td>
</tr>
<tr>
<td align="center">user_agent</td>
<td align="center">str</td>
<td align="center">请求头 User-Agent</td>
<td align="center">内置 UA</td>
</tr>
<tr>
<td align="center">proxy</td>
<td align="center">str</td>
<td align="center">设置代理</td>
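For reference, based on the defaults defined in source/Settings.py further down in this commit, a freshly generated settings.json would presumably look roughly like the following (an illustrative sketch only; the actual file may order or format the keys differently):

{
    "path": "",
    "folder": "Download",
    "user_agent": "",
    "proxy": "",
    "timeout": 10,
    "chunk": 1048576
}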
18 changes: 10 additions & 8 deletions main.py
@@ -11,24 +11,26 @@ async def example():
video_demo = "https://www.xiaohongshu.com/explore/64edb460000000001f03cadc"
multiple_demo = f"{image_demo} {video_demo}"
# Create the instance
path = "D:\\"  # Root path for saving downloaded posts; default: current directory
path = ""  # Root path for saving downloaded posts; default: current directory
folder = "Download"  # Name of the download folder (created automatically); default: Download
proxies = None  # Network proxy
user_agent = ""  # Request header User-Agent
proxy = None  # Network proxy
timeout = 5  # Network request timeout limit; default: 10
chunk = 1024 * 1024  # Size in bytes of each data block fetched from the server when downloading files
async with XHS() as xhs:
pass  # Use default parameters
# async with XHS() as xhs:
# pass  # Use default parameters
async with XHS(path=path,
folder=folder,
proxy=proxies,
user_agent=user_agent,
proxy=proxy,
timeout=timeout,
chunk=chunk) as xhs:  # Use custom parameters
download = False  # Whether to download post files; default: False
download = True  # Whether to download post files; default: False
# Returns detailed post information, including download links
print(await xhs.extract(error_demo))  # Returns an empty dict when data retrieval fails
print(await xhs.extract(error_demo, download=download))  # Returns an empty dict when data retrieval fails
print(await xhs.extract(image_demo, download=download))
print(await xhs.extract(video_demo, download=download))
print(await xhs.extract(multiple_demo, download=download))
print(await xhs.extract(multiple_demo, download=download))  # Passing multiple post links is supported


if __name__ == '__main__':
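The remainder of main.py is collapsed in this view. For context, a coroutine such as example() above would typically be driven from the entry point roughly as follows (a hedged sketch; the statement actually hidden by the fold may differ):

from asyncio import run

# Hypothetical driver for the example coroutine shown above
if __name__ == "__main__":
    run(example())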
58 changes: 26 additions & 32 deletions source/Downloader.py
@@ -1,11 +1,8 @@
from pathlib import Path

from aiohttp import ClientConnectionError
from aiohttp import ClientProxyConnectionError
from aiohttp import ClientSSLError
from aiohttp import ClientSession

# from aiohttp import ClientTimeout
from aiohttp import ClientTimeout
from aiohttp import ServerTimeoutError

__all__ = ['Download']

@@ -26,8 +23,9 @@ def __init__(
self.root = self.__init_root(root, path, folder)
self.proxy = proxy
self.chunk = chunk
# self.timeout = ClientTimeout(total=timeout)
self.session = ClientSession(headers=manager.headers)
self.session = ClientSession(
headers=manager.headers,
timeout=ClientTimeout(connect=timeout))

def __init_root(self, root: Path, path: str, folder: str) -> Path:
if path and (r := Path(path)).is_dir():
@@ -42,8 +40,10 @@ async def run(self, urls: list, name: str, type_: int, log, bar):
if type_ == 0:
await self.__download(urls[0], f"{name}.mp4", log, bar)
elif type_ == 1:
for index, url in enumerate(urls):
await self.__download(url, f"{name}_{index + 1}.png", log, bar)
for index, url in enumerate(urls, start=1):
await self.__download(url, f"{name}_{index}.png", log, bar)
else:
raise ValueError

async def __download(self, url: str, name: str, log, bar):
temp = self.temp.joinpath(name)
@@ -52,32 +52,26 @@ async def __download(self, url: str, name: str, log, bar):
return
try:
async with self.session.get(url, proxy=self.proxy) as response:
# self.__create_progress(bar, int(response.headers.get('content-length', 0)))
self.__create_progress(
bar, int(
response.headers.get(
'content-length', 0)) or None)
with temp.open("wb") as f:
async for chunk in response.content.iter_chunked(self.chunk):
f.write(chunk)
# self.__update_progress(bar, len(chunk))
# self.__remove_progress(bar)
self.__update_progress(bar, len(chunk))
self.manager.move(temp, file)
except (
ClientProxyConnectionError,
ClientSSLError,
ClientConnectionError,
TimeoutError,
):
self.__create_progress(bar, None)
except ServerTimeoutError:
self.manager.delete(temp)
# self.__remove_progress(bar)
self.__create_progress(bar, None)

@staticmethod
def __create_progress(bar, total: int | None):
if bar:
bar.update(total=total)

# @staticmethod
# def __create_progress(bar, total: int | None):
# if bar:
# bar.update(total=total)
#
# @staticmethod
# def __update_progress(bar, advance: int):
# if bar:
# bar.advance(advance)
#
# @staticmethod
# def __remove_progress(bar):
# pass
@staticmethod
def __update_progress(bar, advance: int):
if bar:
bar.advance(advance)
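The pattern this file now relies on (a session-level connect timeout plus streaming the response body to a temporary file in fixed-size chunks, with partial data discarded on timeout) can be summarized in a small standalone sketch; the function name, URL handling, and return value here are illustrative, not project code:

from pathlib import Path

from aiohttp import ClientSession, ClientTimeout, ServerTimeoutError


async def fetch_to_file(url: str, target: Path, chunk: int = 1024 * 1024) -> bool:
    # The connect timeout bounds connection establishment, not the whole transfer
    async with ClientSession(timeout=ClientTimeout(connect=10)) as session:
        try:
            async with session.get(url) as response:
                with target.open("wb") as f:
                    # Stream the body in fixed-size chunks instead of reading it all at once
                    async for part in response.content.iter_chunked(chunk):
                        f.write(part)
            return True
        except ServerTimeoutError:
            # Drop partial data on timeout, mirroring the delete-on-failure logic above
            target.unlink(missing_ok=True)
            return False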
17 changes: 6 additions & 11 deletions source/Html.py
@@ -1,9 +1,6 @@
from aiohttp import ClientConnectionError
from aiohttp import ClientProxyConnectionError
from aiohttp import ClientSSLError
from aiohttp import ClientSession

# from aiohttp import ClientTimeout
from aiohttp import ClientTimeout
from aiohttp import ServerTimeoutError

__all__ = ['Html']

@@ -18,7 +15,9 @@ def __init__(
self.proxy = proxy
self.session = ClientSession(
headers=headers | {
"Referer": "https://www.xiaohongshu.com/", })
"Referer": "https://www.xiaohongshu.com/", },
timeout=ClientTimeout(connect=timeout),
)

async def request_url(
self,
@@ -30,11 +29,7 @@ async def request_url(
proxy=self.proxy,
) as response:
return await response.text() if text else response.url
except (
ClientProxyConnectionError,
ClientSSLError,
ClientConnectionError,
):
except ServerTimeoutError:
return ""

@staticmethod
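request_url serves two purposes in this project: fetching page text, and, with text=False, resolving xhslink.com short links by returning the final URL after redirects. A stripped-down sketch of the second use (illustrative names, not project code):

from aiohttp import ClientSession, ClientTimeout, ServerTimeoutError


async def resolve_short_link(url: str) -> str:
    # GET requests follow redirects by default, so response.url is the final address
    async with ClientSession(timeout=ClientTimeout(connect=10)) as session:
        try:
            async with session.get(url) as response:
                return str(response.url)
        except ServerTimeoutError:
            return ""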
9 changes: 4 additions & 5 deletions source/Manager.py
@@ -6,12 +6,11 @@


class Manager:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/119.0.0.0 Safari/537.36", }

def __init__(self, root: Path):
def __init__(self, root: Path, ua: str):
self.temp = root.joinpath("./temp")
self.headers = {
"User-Agent": ua or "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0", }

@staticmethod
def is_exists(path: Path) -> bool:
3 changes: 2 additions & 1 deletion source/Settings.py
@@ -9,7 +9,8 @@ class Settings:
default = {
"path": "",
"folder": "Download",
"proxies": None,
"user_agent": "",
"proxy": "",
"timeout": 10,
"chunk": 1024 * 1024,
}
30 changes: 19 additions & 11 deletions source/__init__.py
@@ -30,20 +30,27 @@

class XHS:
ROOT = Path(__file__).resolve().parent.parent
link = compile(r"https://www\.xiaohongshu\.com/explore/[a-z0-9]+")
share = compile(r"https://www\.xiaohongshu\.com/discovery/item/[a-z0-9]+")
short = compile(r"https://xhslink\.com/[A-Za-z0-9]+")
LINK = compile(r"https://www\.xiaohongshu\.com/explore/[a-z0-9]+")
SHARE = compile(r"https://www\.xiaohongshu\.com/discovery/item/[a-z0-9]+")
SHORT = compile(r"https://xhslink\.com/[A-Za-z0-9]+")
__INSTANCE = None

def __new__(cls, *args, **kwargs):
if not cls.__INSTANCE:
cls.__INSTANCE = super().__new__(cls)
return cls.__INSTANCE

def __init__(
self,
path="",
folder="Download",
proxy=None,
user_agent: str = None,
proxy: str = None,
timeout=10,
chunk=1024 * 1024,
**kwargs,
):
self.manager = Manager(self.ROOT)
self.manager = Manager(self.ROOT, user_agent)
self.html = Html(self.manager.headers, proxy, timeout)
self.image = Image()
self.video = Video()
@@ -81,12 +88,12 @@ async def extract(self, url: str, download=False, log=None, bar=None) -> list[di
async def __deal_links(self, url: str) -> list:
urls = []
for i in url.split():
if u := self.short.search(i):
if u := self.SHORT.search(i):
i = await self.html.request_url(
u.group(), False)
if u := self.share.search(i):
if u := self.SHARE.search(i):
urls.append(u.group())
elif u := self.link.search(i):
elif u := self.LINK.search(i):
urls.append(u.group())
return urls

@@ -118,18 +125,19 @@ async def __aexit__(self, exc_type, exc_value, traceback):
await self.html.session.close()
await self.download.session.close()

def rich_log(self, log, text, style="b bright_green"):
@staticmethod
def rich_log(log, text, style="b bright_green"):
if log:
log.write(Text(text, style=style))
else:
self.console.print(text, style=style)
print(text)


class XHSDownloader(App):
VERSION = 1.6
BETA = True
ROOT = Path(__file__).resolve().parent.parent
APP = XHS(**Settings(ROOT).run())
# APP = XHS(**Settings(ROOT).run())
CSS_PATH = ROOT.joinpath(
"static/XHS-Downloader.tcss")
BINDINGS = [
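This commit also turns XHS into a singleton via __new__, so constructing it more than once returns the same instance. The pattern in isolation looks roughly like this (a generic sketch, not project code):

class Singleton:
    __INSTANCE = None

    def __new__(cls, *args, **kwargs):
        # Reuse the existing instance instead of allocating a new one
        if not cls.__INSTANCE:
            cls.__INSTANCE = super().__new__(cls)
        return cls.__INSTANCE


first = Singleton()
second = Singleton()
assert first is second  # both names refer to the same object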
