-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathsitemap-urls.sh
executable file
·87 lines (72 loc) · 1.9 KB
/
sitemap-urls.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/bin/bash
# Preflight: abort early when a required external tool is missing or no
# sitemap URL was given. Diagnostics go to stderr so they never mix with the
# URL list this script prints on stdout.
for required_cmd in xpath curl; do
  if ! command -v "$required_cmd" > /dev/null 2>&1; then
    echo "Error: $required_cmd command not found. Please install it and try again." >&2
    exit 1
  fi
done
unset required_cmd

if [[ $# -eq 0 ]]; then
  echo "Error: no argument provided." >&2
  # Report the script's own name rather than a command that does not exist.
  echo "Usage: ${0##*/} <sitemap_url>" >&2
  exit 1
fi
# fetch_xml - Fetches an XML file from a URL, decompressing it when the URL
#             path ends in ".gz"
#
# Usage:
#   fetch_xml "url"
#
# Arguments:
#   $1 - The URL of the XML file to fetch
#
# Output:
#   The (decompressed) response body, printed to standard output
#
# Returns:
#   0 on success; non-zero when curl (or gunzip, for ".gz" URLs) fails
fetch_xml() {
  local url=$1
  # Ignore any query string or fragment when inspecting the extension, so
  # that "sitemap.xml.gz?v=2" is still recognised as gzipped.
  local url_path=${url%%[?#]*}

  if [[ $url_path == *.gz ]]; then
    # Run the pipeline in a subshell with pipefail: a failed download is then
    # reported even if gunzip exits cleanly, and the caller's shell options
    # stay untouched.
    (
      set -o pipefail
      curl -fsSL -- "$url" | gunzip -c
    )
  else
    # -f turns HTTP errors (4xx/5xx) into a non-zero exit instead of emitting
    # the error page as if it were XML; -S keeps the error message visible.
    # "--" stops option parsing, since URLs may come from remote sitemaps.
    curl -fsSL -- "$url"
  fi
}
# decode_xml_entities - Decodes the predefined XML entities in a string
#
# Usage:
#   decode_xml_entities "input_string"
#   cat file.xml | decode_xml_entities
#
# Arguments:
#   $1 - (optional) The input string to decode; when omitted, input is read
#        from standard input
#
# Output:
#   The decoded string, printed to standard output
decode_xml_entities() {
  # Argument form: feed the string through the stdin form below.
  if (( $# > 0 )); then
    printf '%s\n' "$1" | decode_xml_entities
    return
  fi

  # &amp; is decoded last so that an escaped entity such as "&amp;lt;"
  # becomes the literal text "&lt;" rather than being decoded twice.
  sed -e 's/&lt;/</g' \
    -e 's/&gt;/>/g' \
    -e 's/&quot;/"/g' \
    -e "s/&#39;/'/g" \
    -e "s/&apos;/'/g" \
    -e 's/&amp;/\&/g'
}
# parse_sitemap - Recursively parses a sitemap XML file and outputs the URLs
#
# Usage:
#   parse_sitemap "sitemap_url"
#
# Arguments:
#   $1 - The URL of the sitemap XML file to parse
#
# Output:
#   The URLs of all pages listed in the sitemap and, recursively, in every
#   sub-sitemap it references, printed one per line to standard output
#
# Returns:
#   0 when the sitemap cannot be fetched: it is skipped with a warning on
#   stderr so one bad sitemap does not abort the whole crawl
#
# NOTE: there is no cycle detection; a sitemap index that references itself
# recurses without bound.
parse_sitemap() {
  local sitemap_url=$1
  local xml sitemap_locs page_locs xml_url
  local -a sub_sitemaps=() pages=()

  # Declare and assign separately: "local xml=$(...)" reports the status of
  # `local` itself (always 0) and would hide a failed fetch.
  if ! xml=$(fetch_xml "$sitemap_url"); then
    printf 'Warning: failed to fetch %s, skipping\n' "$sitemap_url" >&2
    return 0
  fi

  # xpath errors (e.g. malformed XML) are silenced on purpose; such a
  # document simply yields no URLs.
  sitemap_locs=$(printf '%s\n' "$xml" \
    | xpath -q -e "//sitemap/loc/text()" 2> /dev/null \
    | decode_xml_entities)
  page_locs=$(printf '%s\n' "$xml" \
    | xpath -q -e "//urlset/url/loc/text()" 2> /dev/null \
    | decode_xml_entities)

  # Split on whitespace WITHOUT pathname expansion, so URLs containing
  # "*", "?" or "[" are kept verbatim. With -d '' read consumes all input and
  # returns non-zero at EOF, hence the "|| true".
  IFS=$' \t\n' read -r -d '' -a sub_sitemaps <<< "$sitemap_locs" || true
  IFS=$' \t\n' read -r -d '' -a pages <<< "$page_locs" || true

  if (( ${#pages[@]} > 0 )); then
    printf '%s\n' "${pages[@]}"
  fi

  for xml_url in "${sub_sitemaps[@]}"; do
    parse_sitemap "$xml_url"
  done
}
# Entry point: crawl the sitemap given as the first argument. Quoted so the
# URL is passed as one word, with no word splitting or pathname expansion.
parse_sitemap "$1"