From 560e6d24ba4de19d81cee2660c0d4cfb724c2308 Mon Sep 17 00:00:00 2001
From: Markus Neteler
Date: Wed, 5 Feb 2025 18:23:27 +0100
Subject: [PATCH] docs: script to convert HTML manual pages to markdown (#4620)

---
Review revision of utils/grass_html2md.sh: quoted find pattern, NUL-safe
file loop, pipefail, mktemp scratch file removed on every exit path,
Lua filter located relative to the script, dependency checks.
The blob id on the "index" line of the script predates this revision.

 utils/grass_html2md.sh     | 95 ++++++++++++++++++++++++++++++++++++++
 utils/pandoc_codeblock.lua |  8 +++
 2 files changed, 103 insertions(+)
 create mode 100755 utils/grass_html2md.sh
 create mode 100644 utils/pandoc_codeblock.lua

diff --git a/utils/grass_html2md.sh b/utils/grass_html2md.sh
new file mode 100755
index 00000000000..9ac2e548ae1
--- /dev/null
+++ b/utils/grass_html2md.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+set -euo pipefail
+
+###############################################################################
+# Convert recursively all .html files to .md (GitHub flavoured Markdown)
+#
+# Dependencies:
+#    pandoc
+#
+# Author(s):
+#    Martin Landa, Markus Neteler
+#
+# Usage:
+#    If you have "pandoc" in PATH, execute for HTML file conversion in
+#    current directory and subdirectories:
+#      ./utils/grass_html2md.sh
+#
+# COPYRIGHT: (C) 2024 by the GRASS Development Team
+#
+#            This program is free software under the GNU General Public
+#            License (>=v2). Read the file COPYING that comes with GRASS
+#            for details.
+#
+###############################################################################
+
+# path to LUA file (pandoc_codeblock.lua lives next to this script); it is
+# resolved from the script location so that the conversion also works when
+# the script is started from a directory other than the source tree root
+UTILSPATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+LUAFILTER="${UTILSPATH}/pandoc_codeblock.lua"
+
+# scratch file holding the link-rewritten HTML; left empty until mktemp has
+# run so that cleanup() is safe under "set -u" at any point in time
+tmpfile=""
+
+# remove the scratch file (runs on user break and on every other exit path)
+cleanup()
+{
+   if [ -n "${tmpfile}" ]; then
+      rm -f -- "${tmpfile}"
+   fi
+}
+
+# what to do in case of user break:
+exitprocedure()
+{
+   echo "User break!"
+   cleanup
+   exit 1
+}
+# shell check for user break (signal list: trap -l)
+trap "exitprocedure" 2 3 15
+# also clean up when the script ends normally or is aborted by "set -e"
+trap "cleanup" EXIT
+
+# fail early with a clear message instead of breaking inside the pipeline
+if ! command -v pandoc > /dev/null 2>&1; then
+    echo "ERROR: pandoc not found in PATH" >&2
+    exit 1
+fi
+if [ ! -f "${LUAFILTER}" ]; then
+    echo "ERROR: Lua filter not found: ${LUAFILTER}" >&2
+    exit 1
+fi
+
+tmpfile="$(mktemp "${TMPDIR:-/tmp}/grass_html2md.XXXXXX")"
+
+# run recursively: HTML to MD
+# The name pattern is quoted so that find (not the shell) expands it, and the
+# result is read NUL-delimited so that paths containing blanks stay intact.
+while IFS= read -r -d '' f; do
+    echo "${f}"
+    md="${f%.html}.md"
+
+    # HTML: selectively replace .html with .md only in relative URLs
+    # NOTE(review): the HTML tag literals in the sed expressions of this loop
+    # were restored from context - confirm them against the upstream script.
+    sed -E '
+  # Step 1: Preserve http/https links with .html (and optional anchors)
+  s|(<a href="https?://[^"]+\.html)(#[^"]*)?">|\1_KEEPHTML\2">|g;
+  # Step 2: Replace .html with .md for local links (with or without anchors)
+  s|(<a href=")([^"]+)\.html(#[^"]*)?">|\1\2.md\3">|g;
+  # Step 3: Restore preserved http/https links with .html
+  s|_KEEPHTML||g;
+' "${f}" > "${tmpfile}"
+
+    # turn the "code" div wrapper into pre/code so that pandoc emits code
+    # blocks (handled by the Lua filter), then convert to GitHub flavoured
+    # Markdown and post-process: unescape " \$" and replace %20 by "-"
+    sed -e 's#<div class="code"><pre>#<pre><code>#g' \
+        -e 's#</pre></div>#</code></pre>#g' "${tmpfile}" \
+        | pandoc --from=html --to=gfm --lua-filter "${LUAFILTER}" \
+        | sed -e 's+ \\\$+ \$+g' -e 's+%20+-+g' > "${md}"
+
+done < <(find . -type f -name '*.html' -print0)
diff --git a/utils/pandoc_codeblock.lua b/utils/pandoc_codeblock.lua
new file mode 100644
index 00000000000..e2a0a54910f
--- /dev/null
+++ b/utils/pandoc_codeblock.lua
@@ -0,0 +1,8 @@
+-- Pandoc Lua filter to handle code blocks
+-- Test cases
+-- raster/r.sun/r.sun.html
+
+-- Function to convert code blocks to markdown
+function CodeBlock (cb)
+    return pandoc.RawBlock('markdown', '```shell\n' .. cb.text .. '\n```\n')
+end