From 233fbd30dfe79a4416669b13f68a93716381c533 Mon Sep 17 00:00:00 2001
From: tmr <32825326+ttttmr@users.noreply.github.com>
Date: Fri, 4 Aug 2023 00:43:39 +0800
Subject: [PATCH 1/2] =?UTF-8?q?feat:=20=E4=B8=BAhtml/json=E8=BD=AC?=
=?UTF-8?q?=E5=8C=96=E4=B8=BArss=E6=8F=90=E4=BE=9B=E9=80=9A=E7=94=A8?=
=?UTF-8?q?=E6=94=AF=E6=8C=81=20(#12882)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* feat: add rss proxy
* docs: 增加文档
* feat: 增加description和自动链接提取
* feat: 增加一些有用的radar
* fix: 链接补全
* fix: lint
* fix: request config
* docs: example encode
* fix: rename proxy to transform
* refactor: move it under rsshub
fix: split radar rules
* fix: maintainer build
* style: camelCase
* docs: fix example
---------
---
docs/en/other.md | 74 ++++++++++++++++++++++++++++++++
docs/other.md | 74 ++++++++++++++++++++++++++++++++
lib/maintainer.js | 2 +-
lib/v2/altervista/radar.js | 13 ++++++
lib/v2/rsshub/maintainer.js | 2 +
lib/v2/rsshub/router.js | 4 +-
lib/v2/rsshub/transform/html.js | 75 +++++++++++++++++++++++++++++++++
lib/v2/rsshub/transform/json.js | 57 +++++++++++++++++++++++++
lib/v2/sec/radar.js | 13 ++++++
9 files changed, 311 insertions(+), 3 deletions(-)
create mode 100644 lib/v2/altervista/radar.js
create mode 100644 lib/v2/rsshub/transform/html.js
create mode 100644 lib/v2/rsshub/transform/json.js
create mode 100644 lib/v2/sec/radar.js
diff --git a/docs/en/other.md b/docs/en/other.md
index cdc21f2b6285f4..9da71e5f1caf5e 100644
--- a/docs/en/other.md
+++ b/docs/en/other.md
@@ -316,6 +316,80 @@ please refer to the [Notion API documentation](https://developers.notion.com/ref
+## Transformation
+
+Pass URL and transformation rules to convert HTML/JSON into RSS.
+
+### HTML
+
+Specify options (in query-string format) in the `routeParams` parameter to extract data from HTML.
+
+| Key | Meaning | Accepted Values | Default |
+| -------------- | -------------------------------------------------- | --------------- | ----------------------- |
+| `title` | The title of the RSS | `string` | Extract from `<title>` |
+| `item` | The HTML elements as `item` using CSS selector | `string` | html |
+| `itemTitle` | The HTML elements as `title` in `item` using CSS selector | `string` | `item` element |
+| `itemTitleAttr` | The attributes of `title` element as title | `string` | Element text |
+| `itemLink` | The HTML elements as `link` in `item` using CSS selector | `string` | `item` element |
+| `itemLinkAttr` | The attributes of `link` element as link | `string` | `href` |
+| `itemDesc` | The HTML elements as `description` in `item` using CSS selector | `string` | `item` element |
+| `itemDescAttr` | The attributes of `description` element as description | `string` | Element html |
+
+
+
+Parameters parsing in the above example:
+
+| Parameter | Value |
+| ------------ | ----------------------------------------- |
+| `url` | `https://wechat2rss.xlab.app/posts/list/` |
+| `routeParams`| `item=div[class='post-content'] p a` |
+
+Parsing of `routeParams` parameter:
+
+| Parameter | Value |
+| --------- | ------------------------------- |
+| `item` | `div[class='post-content'] p a` |
+
+
+
+### JSON
+
+Specify options (in the format of query string) in parameter `routeParams` parameter to extract data from JSON.
+
+| Key | Meaning | Accepted Values | Default |
+| ---------- | ----------------------------- ---------- | --------------- | ---------- ------------------------------ |
+| `title` | The title of the RSS | `string` | Extracted from home page of current domain |
+| `item` | The JSON Path as `item` element | `string` | Entire JSON response |
+| `itemTitle` | The JSON Path as `title` in `item` | `string` | None |
+| `itemLink` | The JSON Path as `link` in `item` | `string` | None |
+| `itemDesc` | The JSON Path as `description` in `item` | `string` | None |
+
+::: tip Note
+
+JSON Path only supports formats like `a.b.c`. If you need to access an array element, such as `a[0].b`, write it as `a.0.b`.
+
+:::
+
+
+
+Parameters parsing in the above example:
+
+| Parameter | Value |
+| ------------- | ----------------------------------------------- |
+| `url` | `https://api.github.com/repos/ginuerzh/gost/releases` |
+| `routeParams` | `title=Gost releases&itemTitle=tag_name&itemLink=html_url&itemDesc=body` |
+
+Parsing of `routeParams` parameter:
+
+| Parameter | Value |
+| ------------ | ---------------- |
+| `title` | `Gost releases` |
+| `itemTitle` | `tag_name` |
+| `itemLink` | `html_url` |
+| `itemDesc` | `body` |
+
+
+
## Trending Search Keyword Aggregator
### Aggregated Keyword Tracker
diff --git a/docs/other.md b/docs/other.md
index 46b89daf4dea45..b009aad06a521d 100644
--- a/docs/other.md
+++ b/docs/other.md
@@ -1121,6 +1121,80 @@ type 为 all 时,category 参数不支持 cost 和 free
+## 转换
+
+传递 URL 和转化规则,将 HTML/JSON 转换为 RSS
+
+### HTML
+
+在 `routeParams` 参数中以 query string 格式指定选项,可以控制提取数据
+
+| 键 | 含义 | 接受的值 | 默认值 |
+| --------------- | --------------------------------------------------------------- | -------- | ------------------------ |
+| `title` | 指定 RSS 的标题 | `string` | 从当前网页中取 `<title>` |
+| `item` | 通过 CSS 选择器查找 HTML 元素作为 `item` 元素 | `string` | html |
+| `itemTitle` | 在 `item` 中通过 CSS 选择器查找 HTML 元素作为 `title` 元素 | `string` | `item` 元素 |
+| `itemTitleAttr` | 获取 `title` 元素属性作为标题 | `string` | 元素 text |
+| `itemLink` | 在 `item` 中通过 CSS 选择器查找 HTML 元素作为 `link` 元素 | `string` | `item` 元素 |
+| `itemLinkAttr` | 获取 `link` 元素属性作为链接 | `string` | `href` |
+| `itemDesc` | 在 `item` 中通过 CSS 选择器查找 HTML 元素作为 `description` 元素 | `string` | `item` 元素 |
+| `itemDescAttr` | 获取 `description` 元素属性作为描述 | `string` | 元素 html |
+
+
+
+上述例子中参数解析如下
+
+| 参数 | 值 |
+| -------------- | ----------------------------------------- |
+| `:url` | `https://wechat2rss.xlab.app/posts/list/` |
+| `:routeParams` | `item=div[class='post-content'] p a` |
+
+`routeParams`参数解析如下
+
+| 参数 | 值 |
+| ------ | ------------------------------- |
+| `item` | `div[class='post-content'] p a` |
+
+
+
+### JSON
+
+在 `routeParams` 参数中以 query string 格式指定选项,可以控制提取数据
+
+| 键 | 含义 | 接受的值 | 默认值 |
+| ----------- | --------------------------------------- | -------- | ------------------------------------ |
+| `title` | 指定 RSS 的标题 | `string` | 从当前域名的根路径网页中取 `<title>` |
+| `item` | 通过 JSON Path 查找作为 `item` 元素 | `string` | 整个响应 JSON |
+| `itemTitle` | 在 `item` 中通过 JSON Path 查找作为标题 | `string` | 无 |
+| `itemLink` | 在 `item` 中通过 JSON Path 查找作为链接 | `string` | 无 |
+| `itemDesc` | 在 `item` 中通过 JSON Path 查找作为描述 | `string` | 无 |
+
+::: tip 注意
+
+JSON Path 目前只支持例如 `a.b.c` 的形式,如果需要从数组中读取,例如 `a[0].b`,可以写成 `a.0.b`
+
+:::
+
+
+
+上述例子中参数解析如下
+
+| 参数 | 值 |
+| -------------- | ------------------------------------------------------------------------ |
+| `:url` | `https://api.github.com/repos/ginuerzh/gost/releases` |
+| `:routeParams` | `title=Gost releases&itemTitle=tag_name&itemLink=html_url&itemDesc=body` |
+
+`routeParams` 参数解析如下
+
+| 参数 | 值 |
+| ----------- | --------------- |
+| `title` | `Gost releases` |
+| `itemTitle` | `tag_name` |
+| `itemLink` | `html_url` |
+| `itemDesc` | `body` |
+
+
+
## 自如
### 房源
diff --git a/lib/maintainer.js b/lib/maintainer.js
index 610215d5316828..9efb9f448b62d8 100644
--- a/lib/maintainer.js
+++ b/lib/maintainer.js
@@ -5,7 +5,7 @@ const { join } = require('path');
// Presence Check
for (const dir of fs.readdirSync(dirname)) {
const dirPath = join(dirname, dir);
- if (!fs.existsSync(join(dirPath, 'maintainer.js'))) {
+ // A maintainer.js is only mandatory for directories that register routes through a router.js;
+ // directories without a router.js (for example ones that ship only a radar.js) are skipped.
+ if (fs.existsSync(join(dirPath, 'router.js')) && !fs.existsSync(join(dirPath, 'maintainer.js'))) {
throw Error(`No maintainer.js in "${dirPath}".`);
}
}
diff --git a/lib/v2/altervista/radar.js b/lib/v2/altervista/radar.js
new file mode 100644
index 00000000000000..756cc0c7dfa9e9
--- /dev/null
+++ b/lib/v2/altervista/radar.js
@@ -0,0 +1,13 @@
+// Radar rule for the hyp3rlinx.altervista.org blog; it points at the generic HTML transform route.
+// NOTE(review): presumably consumed by the RSSHub Radar rule loader - confirm.
+// URL-decoded target parameters:
+//   url         = http://hyp3rlinx.altervista.org/
+//   routeParams = item=table[border="1"] tr td a
+module.exports = {
+ 'altervista.org': {
+ _name: 'Altervista',
+ hyp3rlinx: [
+ {
+ title: 'hyp3rlinx blog',
+ docs: 'https://docs.rsshub.app/',
+ source: ['/'],
+ target: '/rsshub/transform/html/http%3A%2F%2Fhyp3rlinx.altervista.org%2F/item=table[border=%221%22]%20tr%20td%20a',
+ },
+ ],
+ },
+};
diff --git a/lib/v2/rsshub/maintainer.js b/lib/v2/rsshub/maintainer.js
index b3bc264d4f755a..b0ffddb838814b 100644
--- a/lib/v2/rsshub/maintainer.js
+++ b/lib/v2/rsshub/maintainer.js
@@ -1,4 +1,6 @@
module.exports = {
'/routes/:lang?': ['DIYgod'],
'/rsshub/sponsors': ['DIYgod'],
+ // Generic HTML/JSON-to-RSS transform routes; the paths mirror the ones registered in router.js.
+ '/transform/html/:url/:routeParams': ['ttttmr'],
+ '/transform/json/:url/:routeParams': ['ttttmr'],
};
diff --git a/lib/v2/rsshub/router.js b/lib/v2/rsshub/router.js
index daddbdf8fe8361..960796dbf4660e 100644
--- a/lib/v2/rsshub/router.js
+++ b/lib/v2/rsshub/router.js
@@ -1,6 +1,6 @@
module.exports = (router) => {
- router.get('/rss', require('./routes')); // 弃用
-
router.get('/routes/:lang?', require('./routes'));
router.get('/sponsors', require('./sponsors'));
+ // Generic transformers: `:url` is the page/API to fetch, `:routeParams` is a query-string-style
+ // set of extraction rules parsed by each handler with URLSearchParams.
+ router.get('/transform/html/:url/:routeParams', require('./transform/html'));
+ router.get('/transform/json/:url/:routeParams', require('./transform/json'));
};
diff --git a/lib/v2/rsshub/transform/html.js b/lib/v2/rsshub/transform/html.js
new file mode 100644
index 00000000000000..c97e62d4a87faf
--- /dev/null
+++ b/lib/v2/rsshub/transform/html.js
@@ -0,0 +1,75 @@
+const got = require('@/utils/got');
+const cheerio = require('cheerio');
+const config = require('@/config').value;
+
+/**
+ * Turn a link found in the fetched page into an absolute URL.
+ *
+ * Absolute http(s) links are returned untouched. Anything else (root-relative,
+ * document-relative, protocol-relative or query-only) is resolved against the
+ * page URL the way a browser would, instead of being glued onto the origin.
+ *
+ * @param {string} link - Trimmed link text taken from the document.
+ * @param {string} base - URL of the page the link was found on.
+ * @returns {string} The absolute link; `link` itself when it is empty or cannot be resolved.
+ */
+const toAbsoluteLink = (link, base) => {
+    if (!link || /^https?:\/\//i.test(link)) {
+        return link;
+    }
+    try {
+        return new URL(link, base).href;
+    } catch (e) {
+        // Unresolvable value: keep what the page provided rather than dropping it.
+        return link;
+    }
+};
+
+/**
+ * Route handler for `/transform/html/:url/:routeParams`.
+ *
+ * Fetches `ctx.params.url`, parses it as HTML and builds feed items from the
+ * CSS-selector rules passed (query-string encoded) in `ctx.params.routeParams`:
+ * `title`, `item`, `itemTitle`, `itemTitleAttr`, `itemLink`, `itemLinkAttr`,
+ * `itemDesc`, `itemDescAttr`.
+ *
+ * The result is stored in `ctx.state.data`. The route responds with 403 unless
+ * `config.feature.allow_user_supply_unsafe_domain` is enabled.
+ *
+ * @param {object} ctx - Request context providing `params`, `throw` and `state`.
+ */
+module.exports = async (ctx) => {
+    if (!config.feature.allow_user_supply_unsafe_domain) {
+        ctx.throw(403, `This RSS is disabled unless 'ALLOW_USER_SUPPLY_UNSAFE_DOMAIN' is set to 'true'.`);
+    }
+    const { url } = ctx.params;
+    const response = await got({
+        method: 'get',
+        url,
+    });
+
+    const routeParams = new URLSearchParams(ctx.params.routeParams);
+    const $ = cheerio.load(response.data);
+    const rssTitle = routeParams.get('title') || $('title').text();
+    const itemSelector = routeParams.get('item') || 'html';
+
+    // Read every rule once instead of once per item.
+    const titleSelector = routeParams.get('itemTitle');
+    const titleAttr = routeParams.get('itemTitleAttr');
+    const linkSelector = routeParams.get('itemLink');
+    const linkAttr = routeParams.get('itemLinkAttr');
+    const descSelector = routeParams.get('itemDesc');
+    const descAttr = routeParams.get('itemDescAttr');
+
+    // Without a selector the item element itself is used.
+    const select = (root, selector) => (selector ? root.find(selector) : root);
+
+    const items = $(itemSelector)
+        .toArray()
+        .map((element) => {
+            // Best effort: an item whose rules fail (e.g. an invalid selector) is skipped
+            // instead of failing the whole feed.
+            try {
+                const item = $(element);
+
+                const titleEle = select(item, titleSelector);
+                const title = titleAttr ? titleEle.attr(titleAttr) : titleEle.text();
+
+                const linkEle = select(item, linkSelector);
+                let rawLink;
+                if (linkAttr) {
+                    rawLink = linkEle.attr(linkAttr);
+                } else if (linkEle.is('a')) {
+                    rawLink = linkEle.attr('href');
+                } else {
+                    rawLink = linkEle.find('a').attr('href');
+                }
+                // Items without any link are dropped (previously this happened implicitly
+                // through a swallowed TypeError on `undefined.trim()`).
+                if (typeof rawLink !== 'string') {
+                    return null;
+                }
+                const link = toAbsoluteLink(rawLink.trim(), url);
+
+                const descEle = select(item, descSelector);
+                const description = descAttr ? descEle.attr(descAttr) : descEle.html();
+
+                return {
+                    title,
+                    link,
+                    description,
+                };
+            } catch (e) {
+                return null;
+            }
+        })
+        .filter(Boolean);
+
+    ctx.state.data = {
+        title: rssTitle,
+        link: url,
+        description: `Proxy ${url}`,
+        item: items,
+    };
+};
diff --git a/lib/v2/rsshub/transform/json.js b/lib/v2/rsshub/transform/json.js
new file mode 100644
index 00000000000000..9ca8d409505898
--- /dev/null
+++ b/lib/v2/rsshub/transform/json.js
@@ -0,0 +1,57 @@
+const got = require('@/utils/got');
+const cheerio = require('cheerio');
+const config = require('@/config').value;
+
+/**
+ * Read a nested value from parsed JSON using a dot-separated path.
+ *
+ * Only the `a.b.c` form is understood; array elements are addressed by index
+ * segment, e.g. `a.b.0.c` for `a.b[0].c`.
+ *
+ * @param {*} obj - Parsed JSON value to read from.
+ * @param {?string} attr - Dot-separated path. Any non-string value (such as the
+ *     `null` returned by `URLSearchParams#get` for a missing key) means "no path".
+ * @returns {*} `obj` itself when `attr` is not a string; otherwise the value at the
+ *     path, or `undefined` as soon as a segment is missing (instead of throwing a
+ *     TypeError on a `null`/`undefined` intermediate value).
+ */
+function jsonGet(obj, attr) {
+    if (typeof attr !== 'string') {
+        return obj;
+    }
+    return attr.split('.').reduce((current, key) => (current === null || current === undefined ? undefined : current[key]), obj);
+}
+
+/**
+ * Turn a link taken from the JSON payload into an absolute URL.
+ *
+ * Absolute http(s) links are returned untouched. Anything else is resolved
+ * against the requested URL instead of being glued onto its origin, so
+ * document-relative and protocol-relative links come out right.
+ *
+ * @param {string} link - Trimmed link value.
+ * @param {string} base - URL the JSON document was fetched from.
+ * @returns {string} The absolute link; `link` itself when it is empty or cannot be resolved.
+ */
+const toAbsoluteLink = (link, base) => {
+    if (!link || /^https?:\/\//i.test(link)) {
+        return link;
+    }
+    try {
+        return new URL(link, base).href;
+    } catch (e) {
+        // Unresolvable value: keep what the API provided rather than failing the feed.
+        return link;
+    }
+};
+
+/**
+ * Route handler for `/transform/json/:url/:routeParams`.
+ *
+ * Fetches `ctx.params.url` and builds feed items from the dot-separated paths
+ * passed (query-string encoded) in `ctx.params.routeParams`: `title`, `item`,
+ * `itemTitle`, `itemLink`, `itemDesc`. When `title` is absent, the `<title>` of
+ * the page served at the URL's origin is used.
+ *
+ * The result is stored in `ctx.state.data`. The route responds with 403 unless
+ * `config.feature.allow_user_supply_unsafe_domain` is enabled.
+ *
+ * @param {object} ctx - Request context providing `params`, `throw` and `state`.
+ * @throws {TypeError} When the value selected by `item` is not an array.
+ */
+module.exports = async (ctx) => {
+    if (!config.feature.allow_user_supply_unsafe_domain) {
+        ctx.throw(403, `This RSS is disabled unless 'ALLOW_USER_SUPPLY_UNSAFE_DOMAIN' is set to 'true'.`);
+    }
+    const { url } = ctx.params;
+    const response = await got({
+        method: 'get',
+        url,
+    });
+
+    const routeParams = new URLSearchParams(ctx.params.routeParams);
+    let rssTitle = routeParams.get('title');
+    if (!rssTitle) {
+        const resp = await got({
+            method: 'get',
+            url: new URL(url).origin,
+        });
+        const $ = cheerio.load(resp.data);
+        rssTitle = $('title').text();
+    }
+
+    const itemPath = routeParams.get('item');
+    const titlePath = routeParams.get('itemTitle');
+    const linkPath = routeParams.get('itemLink');
+    const descPath = routeParams.get('itemDesc');
+
+    const list = jsonGet(response.data, itemPath);
+    if (!Array.isArray(list)) {
+        throw new TypeError(`Expected a JSON array at ${itemPath ? `"${itemPath}"` : 'the root of the response'}.`);
+    }
+
+    const items = list.map((item) => {
+        // A rule that was not supplied yields no value. Calling jsonGet with a missing
+        // path would hand back the whole item object, which used to crash on `.trim()`
+        // for links and leak an object into the title.
+        let link = linkPath ? jsonGet(item, linkPath) : undefined;
+        if (typeof link === 'string') {
+            link = toAbsoluteLink(link.trim(), url);
+        }
+        return {
+            title: titlePath ? jsonGet(item, titlePath) : undefined,
+            link,
+            description: descPath ? jsonGet(item, descPath) : '',
+        };
+    });
+
+    ctx.state.data = {
+        title: rssTitle,
+        link: url,
+        description: `Proxy ${url}`,
+        item: items,
+    };
+};
diff --git a/lib/v2/sec/radar.js b/lib/v2/sec/radar.js
new file mode 100644
index 00000000000000..c187630e0efcb2
--- /dev/null
+++ b/lib/v2/sec/radar.js
@@ -0,0 +1,13 @@
+// Radar rule for sec.today; it points at the generic HTML transform route.
+// NOTE(review): presumably consumed by the RSSHub Radar rule loader - confirm.
+// URL-decoded target parameters:
+//   url         = https://sec.today/pulses/
+//   routeParams = item=div[class="card-body"]
+module.exports = {
+ 'sec.today': {
+ _name: '每日安全',
+ '.': [
+ {
+ title: '动态',
+ docs: 'https://docs.rsshub.app/',
+ source: ['/pulses', '/'],
+ target: '/rsshub/transform/html/https%3A%2F%2Fsec.today%2Fpulses%2F/item=div[class="card-body"]',
+ },
+ ],
+ },
+};
From a198d61c8f284cbc5db0b803f9ffa2b01c5643db Mon Sep 17 00:00:00 2001
From: Tony
Date: Fri, 4 Aug 2023 06:46:44 +0800
Subject: [PATCH 2/2] docs: fix table in other.md
---
docs/en/other.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/en/other.md b/docs/en/other.md
index 9da71e5f1caf5e..f0fcc4a8b76b19 100644
--- a/docs/en/other.md
+++ b/docs/en/other.md
@@ -357,7 +357,7 @@ Parsing of `routeParams` parameter:
Specify options (in the format of query string) in parameter `routeParams` parameter to extract data from JSON.
| Key | Meaning | Accepted Values | Default |
-| ---------- | ----------------------------- ---------- | --------------- | ---------- ------------------------------ |
+| ---------- | ---------------------------------------- | --------------- | ------------------------------------------ |
| `title` | The title of the RSS | `string` | Extracted from home page of current domain |
| `item` | The JSON Path as `item` element | `string` | Entire JSON response |
| `itemTitle` | The JSON Path as `title` in `item` | `string` | None |