-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathplot.sh
executable file
·68 lines (51 loc) · 2.34 KB
/
plot.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/bin/bash
# Abort on the first failing command or pipeline stage (-e, pipefail)
# and trace every executed command (-x).
set -exo pipefail

# Determine the name of the latest crawl from the statistics files.
# The shell expands the glob in sorted order, so the last element is the
# alphabetically last file name. Using the glob directly (instead of parsing
# the output of `ls`) keeps file names intact and allows detecting the
# "no statistics files" case, which would otherwise silently yield ".gz".
stats_files=(stats/CC-MAIN-20[12]*.gz)
if [[ ! -e "${stats_files[0]}" ]]; then
  echo "No crawl statistics found matching stats/CC-MAIN-20[12]*.gz" >&2
  exit 1
fi
LATEST_CRAWL=$(basename "${stats_files[${#stats_files[@]}-1]}" .gz)

# Record the latest crawl in the "latest_crawl:" line of _config.yml.
sed -i 's@^latest_crawl:.*@latest_crawl: '"$LATEST_CRAWL"'@' _config.yml
#######################################
# Create or incrementally update a gzip-compressed excerpt of the crawl
# statistics containing only the lines which match a regular expression.
#
# If the excerpt file already exists, it is only extended: unless the
# excerpt already contains the name of the latest crawl, the matching lines
# of stats/$LATEST_CRAWL.gz are compressed and appended to it.
# If the excerpt does not exist, it is built from all stats/CC-MAIN-*.gz.
#
# Globals:
#   LATEST_CRAWL (read) - name of the latest crawl
# Arguments:
#   $1 - extended regular expression (grep -E) selecting the lines to keep
#   $2 - path of the gzipped excerpt file to create or update
# Returns:
#   Exit status of the last executed command. Note that grep/zgrep exit
#   non-zero if no line matches.
#######################################
function update_excerpt() {
  # keep the working variables out of the global scope
  local regex="$1"
  local excerpt="$2"
  if [ -e "$excerpt" ]; then
    # shortcut for monthly update plots: only add data from latest crawl
    if ! zgrep -qF "$LATEST_CRAWL" "$excerpt"; then
      # appending a second gzip stream is fine: concatenated gzip streams
      # are decompressed as if they were a single one
      zgrep -Eh "$regex" "stats/$LATEST_CRAWL.gz" | gzip >>"$excerpt"
    fi
  else
    zcat stats/CC-MAIN-*.gz | grep -Eh "$regex" | gzip >"$excerpt"
  fi
}
# filter data to speed-up reading while plotting:
# one gzipped excerpt per plot, holding only the lines selected by the regex
mkdir -p stats/excerpt
update_excerpt '^\["size'                                      stats/excerpt/size.json.gz
update_excerpt '^\["histogram"'                                stats/excerpt/histogram.json.gz
update_excerpt '^\["tld"'                                      stats/excerpt/tld.json.gz
update_excerpt '^\["(size|domain)"'                            stats/excerpt/domain.json.gz
update_excerpt '^\["(size", *"page|mimetype)"'                 stats/excerpt/mimetype.json.gz
update_excerpt '^\["(size", *"page|mimetype_detected)"'        stats/excerpt/mimetype_detected.json.gz
update_excerpt '^\["(size", *"page|charset)"'                  stats/excerpt/charset.json.gz
update_excerpt '^\["(size", *"page|primary_language|languages)"' stats/excerpt/language.json.gz
update_excerpt '^\["scheme"'                                   stats/excerpt/url_protocol.json.gz

# NOTE(review): presumably the plot scripts write their output below data/
# - confirm against the scripts in plot/
mkdir -p data

# Every plot script reads the decompressed excerpt from standard input.

zcat stats/excerpt/size.json.gz \
  | python3 plot/crawl_size.py

zcat stats/excerpt/size.json.gz \
  | python3 plot/overlap.py

# histogram plot is currently disabled
# zcat stats/excerpt/histogram.json.gz \
#   | python3 plot/histogram.py "$LATEST_CRAWL"

# crawler metrics combine three inputs on a single stream: the crawler JSON
# files, the '["size"' lines of the size excerpt and the URL scheme excerpt
(cat stats/crawler/CC-MAIN-*.json;
 zcat stats/excerpt/size.json.gz | grep '^\["size"';
 zcat stats/excerpt/url_protocol.json.gz) \
  | python3 plot/crawler_metrics.py

# tld.py additionally takes a list of crawl names, ending with the latest
# crawl (quoted to pass it as exactly one argument)
zcat stats/excerpt/tld.json.gz \
  | python3 plot/tld.py CC-MAIN-2008-2009 CC-MAIN-2012 CC-MAIN-2014-10 \
      CC-MAIN-2016-30 CC-MAIN-2019-09 CC-MAIN-2022-49 "$LATEST_CRAWL"

zcat stats/excerpt/mimetype.json.gz \
  | python3 plot/mimetype.py

zcat stats/excerpt/mimetype_detected.json.gz \
  | python3 plot/mimetype_detected.py

zcat stats/excerpt/charset.json.gz \
  | python3 plot/charset.py

zcat stats/excerpt/language.json.gz \
  | python3 plot/language.py

zcat stats/excerpt/domain.json.gz \
  | python3 plot/domain.py