From 233fbd30dfe79a4416669b13f68a93716381c533 Mon Sep 17 00:00:00 2001 From: tmr <32825326+ttttmr@users.noreply.github.com> Date: Fri, 4 Aug 2023 00:43:39 +0800 Subject: [PATCH 1/2] =?UTF-8?q?feat:=20=E4=B8=BAhtml/json=E8=BD=AC?= =?UTF-8?q?=E5=8C=96=E4=B8=BArss=E6=8F=90=E4=BE=9B=E9=80=9A=E7=94=A8?= =?UTF-8?q?=E6=94=AF=E6=8C=81=20(#12882)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: add rss proxy * docs: 增加文档 * feat: 增加description和自动链接提取 * feat: 增加一些有用的radar * fix: 链接补全 * fix: lint * fix: request config * docs: example encode * fix: rename proxy to transform * refactor: move it under rsshub fix: split radar rules * fix: maintainer build * style: camelCase * docs: fix example --------- --- docs/en/other.md | 74 ++++++++++++++++++++++++++++++++ docs/other.md | 74 ++++++++++++++++++++++++++++++++ lib/maintainer.js | 2 +- lib/v2/altervista/radar.js | 13 ++++++ lib/v2/rsshub/maintainer.js | 2 + lib/v2/rsshub/router.js | 4 +- lib/v2/rsshub/transform/html.js | 75 +++++++++++++++++++++++++++++++++ lib/v2/rsshub/transform/json.js | 57 +++++++++++++++++++++++++ lib/v2/sec/radar.js | 13 ++++++ 9 files changed, 311 insertions(+), 3 deletions(-) create mode 100644 lib/v2/altervista/radar.js create mode 100644 lib/v2/rsshub/transform/html.js create mode 100644 lib/v2/rsshub/transform/json.js create mode 100644 lib/v2/sec/radar.js diff --git a/docs/en/other.md b/docs/en/other.md index cdc21f2b6285f4..9da71e5f1caf5e 100644 --- a/docs/en/other.md +++ b/docs/en/other.md @@ -316,6 +316,80 @@ please refer to the [Notion API documentation](https://developers.notion.com/ref +## Transformation + +Pass URL and transformation rules to convert HTML/JSON into RSS. + +### HTML + +Specify options (in the format of query string) in parameter `routeParams` parameter to extract data from HTML. 
+ +| Key | Meaning | Accepted Values | Default | +| -------------- | -------------------------------------------------- | --------------- | ----------------------- | +| `title` | The title of the RSS | `string` | Extract from `<title>` | +| `item` | The HTML elements as `item` using CSS selector | `string` | html | +| `itemTitle` | The HTML elements as `title` in `item` using CSS selector | `string` | `item` element | +| `itemTitleAttr` | The attributes of `title` element as title | `string` | Element text | +| `itemLink` | The HTML elements as `link` in `item` using CSS selector | `string` | `item` element | +| `itemLinkAttr` | The attributes of `link` element as link | `string` | `href` | +| `itemDesc` | The HTML elements as `description` in `item` using CSS selector | `string` | `item` element | +| `itemDescAttr` | The attributes of `description` element as description | `string` | Element html | + +<RouteEn author="ttttmr" example="/rsshub/transform/html/https%3A%2F%2Fwechat2rss.xlab.app%2Fposts%2Flist%2F/item=div%5Bclass%3D%27post%2Dcontent%27%5D%20p%20a" path="/rsshub/transform/html/:url/:routeParams" :paramsDesc="['`encodeURIComponent`ed URL address', 'Transformation rules, requires URL encode']" selfhost="1"> + +Parameters parsing in the above example: + +| Parameter | Value | +| ------------ | ----------------------------------------- | +| `url` | `https://wechat2rss.xlab.app/posts/list/` | +| `routeParams`| `item=div[class='post-content'] p a` | + +Parsing of `routeParams` parameter: + +| Parameter | Value | +| --------- | ------------------------------- | +| `item` | `div[class='post-content'] p a` | + +</RouteEn> + +### JSON + +Specify options (in the format of query string) in parameter `routeParams` parameter to extract data from JSON. 
+ +| Key | Meaning | Accepted Values | Default | +| ---------- | ----------------------------- ---------- | --------------- | ---------- ------------------------------ | +| `title` | The title of the RSS | `string` | Extracted from home page of current domain | +| `item` | The JSON Path as `item` element | `string` | Entire JSON response | +| `itemTitle` | The JSON Path as `title` in `item` | `string` | None | +| `itemLink` | The JSON Path as `link` in `item` | `string` | None | +| `itemDesc` | The JSON Path as `description` in `item` | `string` | None | + +::: tip Note + +JSON Path only supports format like `a.b.c`. If you need to access arrays, like `a[0].b`, you can write it as `a.0.b`. + +::: + +<RouteEn author="ttttmr" example="/rsshub/transform/json/https%3A%2F%2Fapi.github.com%2Frepos%2Fginuerzh%2Fgost%2Freleases/title=Gost%20releases&itemTitle=tag_name&itemLink=html_url&itemDesc=body" path="/rsshub/transform/json/:url/:routeParams" :paramsDesc="['`encodeURIComponent`ed URL address', 'Transformation rules, requires URL encode']" selfhost="1"> + +Parameters parsing in the above example: + +| Parameter | Value | +| ------------- | ----------------------------------------------- | +| `url` | `https://api.github.com/repos/ginuerzh/gost/releases` | +| `routeParams` | `title=Gost releases&itemTitle=tag_name&itemLink=html_url&itemDesc=body` | + +Parsing of `routeParams` parameter: + +| Parameter | Value | +| ------------ | ---------------- | +| `title` | `Gost releases` | +| `itemTitle` | `tag_name` | +| `itemLink` | `html_url` | +| `itemDesc` | `body` | + +</RouteEn> + ## Trending Search Keyword Aggregator ### Aggregated Keyword Tracker diff --git a/docs/other.md b/docs/other.md index 46b89daf4dea45..b009aad06a521d 100644 --- a/docs/other.md +++ b/docs/other.md @@ -1121,6 +1121,80 @@ type 为 all 时,category 参数不支持 cost 和 free <Route author="Fatpandac" example="/ems/apple/EZ319397281CN" path="/ems/apple/:id" :paramsDesc="['苹果邮件编号']"/> +## 转换 + +传递 URL 和转化规则,将 
HTML/JSON 转换为 RSS + +### HTML + +在 `routeParams` 参数中以 query string 格式指定选项,可以控制提取数据 + +| 键 | 含义 | 接受的值 | 默认值 | +| --------------- | --------------------------------------------------------------- | -------- | ------------------------ | +| `title` | 指定 RSS 的标题 | `string` | 从当前网页中取 `<title>` | +| `item` | 通过 CSS 选择器查找 HTML 元素作为 `item` 元素 | `string` | html | +| `itemTitle` | 在 `item` 中通过 CSS 选择器查找 HTML 元素作为 `title` 元素 | `string` | `item` 元素 | +| `itemTitleAttr` | 获取 `title` 元素属性作为标题 | `string` | 元素 text | +| `itemLink` | 在 `item` 中通过 CSS 选择器查找 HTML 元素作为 `link` 元素 | `string` | `item` 元素 | +| `itemLinkAttr` | 获取 `link` 元素属性作为链接 | `string` | `href` | +| `itemDesc` | 在 `item` 中通过 CSS 选择器查找 HTML 元素作为 `description` 元素 | `string` | `item` 元素 | +| `itemDescAttr` | 获取 `description` 元素属性作为描述 | `string` | 元素 html | + +<Route author="ttttmr" example="/rsshub/transform/html/https%3A%2F%2Fwechat2rss.xlab.app%2Fposts%2Flist%2F/item=div%5Bclass%3D%27post%2Dcontent%27%5D%20p%20a" path="/rsshub/transform/html/:url/:routeParams" :paramsDesc="['URL地址,需经 URL 编码', '转换规则,需经 URL 编码']" selfhost="1"> + +上述例子中参数解析如下 + +| 参数 | 值 | +| -------------- | ----------------------------------------- | +| `:url` | `https://wechat2rss.xlab.app/posts/list/` | +| `:routeParams` | `item=div[class='post-content'] p a` | + +`routeParams`参数解析如下 + +| 参数 | 值 | +| ------ | ------------------------------- | +| `item` | `div[class='post-content'] p a` | + +</Route> + +### JSON + +在 `routeParams` 参数中以 query string 格式指定选项,可以控制提取数据 + +| 键 | 含义 | 接受的值 | 默认值 | +| ----------- | --------------------------------------- | -------- | ------------------------------------ | +| `title` | 指定 RSS 的标题 | `string` | 从当前域名的根路径网页中取 `<title>` | +| `item` | 通过 JSON Path 查找作为 `item` 元素 | `string` | 整个响应 JSON | +| `itemTitle` | 在 `item` 中通过 JSON Path 查找作为标题 | `string` | 无 | +| `itemLink` | 在 `item` 中通过 JSON Path 查找作为链接 | `string` | 无 | +| `itemDesc` | 在 `item` 中通过 JSON Path 查找作为描述 | `string` | 无 | + +::: tip 注意 + +JSON Path 目前只支持例如 `a.b.c` 
的形式,如果需要从数组中读取,例如 `a[0].b`,可以写成 `a.0.b` + +::: + +<Route author="ttttmr" example="/rsshub/transform/json/https%3A%2F%2Fapi.github.com%2Frepos%2Fginuerzh%2Fgost%2Freleases/title=Gost%20releases&itemTitle=tag_name&itemLink=html_url&itemDesc=body" path="/rsshub/transform/json/:url/:routeParams" :paramsDesc="['URL地址,需经 URL 编码', '转换规则,需经 URL 编码']" selfhost="1"> + +上述例子中参数解析如下 + +| 参数 | 值 | +| -------------- | ------------------------------------------------------------------------ | +| `:url` | `https://api.github.com/repos/ginuerzh/gost/releases` | +| `:routeParams` | `title=Gost releases&itemTitle=tag_name&itemLink=html_url&itemDesc=body` | + +`routeParams` 参数解析如下 + +| 参数 | 值 | +| ----------- | --------------- | +| `title` | `Gost releases` | +| `itemTitle` | `tag_name` | +| `itemLink` | `html_url` | +| `itemDesc` | `body` | + +</Route> + ## 自如 ### 房源 diff --git a/lib/maintainer.js b/lib/maintainer.js index 610215d5316828..9efb9f448b62d8 100644 --- a/lib/maintainer.js +++ b/lib/maintainer.js @@ -5,7 +5,7 @@ const { join } = require('path'); // Presence Check for (const dir of fs.readdirSync(dirname)) { const dirPath = join(dirname, dir); - if (!fs.existsSync(join(dirPath, 'maintainer.js'))) { + if (fs.existsSync(join(dirPath, 'router.js')) && !fs.existsSync(join(dirPath, 'maintainer.js'))) { throw Error(`No maintainer.js in "${dirPath}".`); } } diff --git a/lib/v2/altervista/radar.js b/lib/v2/altervista/radar.js new file mode 100644 index 00000000000000..756cc0c7dfa9e9 --- /dev/null +++ b/lib/v2/altervista/radar.js @@ -0,0 +1,13 @@ +module.exports = { + 'altervista.org': { + _name: 'Altervista', + hyp3rlinx: [ + { + title: 'hyp3rlinx blog', + docs: 'https://docs.rsshub.app/', + source: ['/'], + target: '/rsshub/transform/html/http%3A%2F%2Fhyp3rlinx.altervista.org%2F/item=table[border=%221%22]%20tr%20td%20a', + }, + ], + }, +}; diff --git a/lib/v2/rsshub/maintainer.js b/lib/v2/rsshub/maintainer.js index b3bc264d4f755a..b0ffddb838814b 100644 --- 
a/lib/v2/rsshub/maintainer.js +++ b/lib/v2/rsshub/maintainer.js @@ -1,4 +1,6 @@ module.exports = { '/routes/:lang?': ['DIYgod'], '/rsshub/sponsors': ['DIYgod'], + '/transform/html/:url/:routeParams': ['ttttmr'], + '/transform/json/:url/:routeParams': ['ttttmr'], }; diff --git a/lib/v2/rsshub/router.js b/lib/v2/rsshub/router.js index daddbdf8fe8361..960796dbf4660e 100644 --- a/lib/v2/rsshub/router.js +++ b/lib/v2/rsshub/router.js @@ -1,6 +1,6 @@ module.exports = (router) => { - router.get('/rss', require('./routes')); // 弃用 - router.get('/routes/:lang?', require('./routes')); router.get('/sponsors', require('./sponsors')); + router.get('/transform/html/:url/:routeParams', require('./transform/html')); + router.get('/transform/json/:url/:routeParams', require('./transform/json')); }; diff --git a/lib/v2/rsshub/transform/html.js b/lib/v2/rsshub/transform/html.js new file mode 100644 index 00000000000000..c97e62d4a87faf --- /dev/null +++ b/lib/v2/rsshub/transform/html.js @@ -0,0 +1,75 @@ +const got = require('@/utils/got'); +const cheerio = require('cheerio'); +const config = require('@/config').value; + +module.exports = async (ctx) => { + if (!config.feature.allow_user_supply_unsafe_domain) { + ctx.throw(403, `This RSS is disabled unless 'ALLOW_USER_SUPPLY_UNSAFE_DOMAIN' is set to 'true'.`); + } + const { url } = ctx.params; + const response = await got({ + method: 'get', + url, + }); + + const routeParams = new URLSearchParams(ctx.params.routeParams); + const $ = cheerio.load(response.data); + const rssTitle = routeParams.get('title') ? routeParams.get('title') : $('title').text(); + const item = routeParams.get('item') ? routeParams.get('item') : 'html'; + const items = $(item) + .toArray() + .map((item) => { + try { + item = $(item); + + let title; + const titleEle = routeParams.get('itemTitle') ? 
item.find(routeParams.get('itemTitle')) : item; + if (routeParams.get('itemTitleAttr')) { + title = titleEle.attr(routeParams.get('itemTitleAttr')); + } else { + title = titleEle.text(); + } + + let link; + const linkEle = routeParams.get('itemLink') ? item.find(routeParams.get('itemLink')) : item; + if (routeParams.get('itemLinkAttr')) { + link = linkEle.attr(routeParams.get('itemLinkAttr')); + } else { + if (linkEle.is('a')) { + link = linkEle.attr('href'); + } else { + link = linkEle.find('a').attr('href'); + } + } + // 补全绝对链接 + link = link.trim(); + if (link && !link.startsWith('http')) { + link = `${new URL(url).origin}${link}`; + } + + let desc; + const descEle = routeParams.get('itemDesc') ? item.find(routeParams.get('itemDesc')) : item; + if (routeParams.get('itemDescAttr')) { + desc = descEle.attr(routeParams.get('itemDescAttr')); + } else { + desc = descEle.html(); + } + + return { + title, + link, + description: desc, + }; + } catch (e) { + return null; + } + }) + .filter(Boolean); + + ctx.state.data = { + title: rssTitle, + link: url, + description: `Proxy ${url}`, + item: items, + }; +}; diff --git a/lib/v2/rsshub/transform/json.js b/lib/v2/rsshub/transform/json.js new file mode 100644 index 00000000000000..9ca8d409505898 --- /dev/null +++ b/lib/v2/rsshub/transform/json.js @@ -0,0 +1,57 @@ +const got = require('@/utils/got'); +const cheerio = require('cheerio'); +const config = require('@/config').value; + +function jsonGet(obj, attr) { + if (typeof attr !== 'string') { + return obj; + } + // a.b.c + // a.b[0].c => a.b.0.c + attr.split('.').forEach((key) => { + obj = obj[key]; + }); + return obj; +} + +module.exports = async (ctx) => { + if (!config.feature.allow_user_supply_unsafe_domain) { + ctx.throw(403, `This RSS is disabled unless 'ALLOW_USER_SUPPLY_UNSAFE_DOMAIN' is set to 'true'.`); + } + const { url } = ctx.params; + const response = await got({ + method: 'get', + url, + }); + + const routeParams = new URLSearchParams(ctx.params.routeParams); 
+ let rssTitle = routeParams.get('title'); + if (!rssTitle) { + const resp = await got({ + method: 'get', + url: new URL(url).origin, + }); + const $ = cheerio.load(resp.data); + rssTitle = $('title').text(); + } + + const items = jsonGet(response.data, routeParams.get('item')).map((item) => { + let link = jsonGet(item, routeParams.get('itemLink')).trim(); + // 补全绝对链接 + if (link && !link.startsWith('http')) { + link = `${new URL(url).origin}${link}`; + } + return { + title: jsonGet(item, routeParams.get('itemTitle')), + link, + description: routeParams.get('itemDesc') ? jsonGet(item, routeParams.get('itemDesc')) : '', + }; + }); + + ctx.state.data = { + title: rssTitle, + link: url, + description: `Proxy ${url}`, + item: items, + }; +}; diff --git a/lib/v2/sec/radar.js b/lib/v2/sec/radar.js new file mode 100644 index 00000000000000..c187630e0efcb2 --- /dev/null +++ b/lib/v2/sec/radar.js @@ -0,0 +1,13 @@ +module.exports = { + 'sec.today': { + _name: '每日安全', + '.': [ + { + title: '动态', + docs: 'https://docs.rsshub.app/', + source: ['/pulses', '/'], + target: '/rsshub/transform/html/https%3A%2F%2Fsec.today%2Fpulses%2F/item=div[class="card-body"]', + }, + ], + }, +}; From a198d61c8f284cbc5db0b803f9ffa2b01c5643db Mon Sep 17 00:00:00 2001 From: Tony <TonyRL@users.noreply.github.com> Date: Fri, 4 Aug 2023 06:46:44 +0800 Subject: [PATCH 2/2] docs: fix table in other.md --- docs/en/other.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/other.md b/docs/en/other.md index 9da71e5f1caf5e..f0fcc4a8b76b19 100644 --- a/docs/en/other.md +++ b/docs/en/other.md @@ -357,7 +357,7 @@ Parsing of `routeParams` parameter: Specify options (in the format of query string) in parameter `routeParams` parameter to extract data from JSON. 
| Key | Meaning | Accepted Values | Default | -| ---------- | ----------------------------- ---------- | --------------- | ---------- ------------------------------ | +| ---------- | ---------------------------------------- | --------------- | ------------------------------------------ | | `title` | The title of the RSS | `string` | Extracted from home page of current domain | | `item` | The JSON Path as `item` element | `string` | Entire JSON response | | `itemTitle` | The JSON Path as `title` in `item` | `string` | None |