-
Notifications
You must be signed in to change notification settings - Fork 4
/
multi_site_crawler.sh
88 lines (77 loc) · 3.43 KB
/
multi_site_crawler.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/bin/bash
# multi_site_crawler.sh — collect corpus data for a fixed date range from
# several sources (GitHub, ArXiv, Wikipedia, BBC News, optionally AO3).
# A failing crawler is reported but does not stop the remaining crawlers.
set -u  # error on unset variables; deliberately NOT -e (continue on crawler failure)

# Set the start and end date for the data collection.
# Recommend a 2-week range.
START_DATE="2024-05-01"
END_DATE="2024-05-14"

# Set your GitHub access token (needed for GitHub API rate limits).
GITHUB_ACCESS_TOKEN=""
if [ -z "$GITHUB_ACCESS_TOKEN" ]; then
  echo "Warning: GITHUB_ACCESS_TOKEN is empty; GitHub collection will likely fail." >&2
fi

echo "Starting data collection from $START_DATE to $END_DATE"
# e.g. "20240501to20240514" — embedded in every output file name.
DATE_RANGE="${START_DATE//-/}to${END_DATE//-/}"

#######################################
# Run one crawler with the shared date range and report the outcome.
# Globals:   START_DATE, END_DATE (read)
# Arguments: $1 - human-readable description used in log messages
#            $2 - output file name
#            $3 - crawler script to invoke with python3
#            $@ - any extra crawler-specific arguments
# Outputs:   progress to stdout, failures to stderr
#######################################
run_crawler() {
  local description=$1 file_name=$2 script=$3
  shift 3
  echo "Collecting $description from $START_DATE to $END_DATE"
  if python3 "$script" --start_date "$START_DATE" --end_date "$END_DATE" \
      --file_name "$file_name" "$@"; then
    echo "Data saved to $file_name"
  else
    echo "Failed to collect $description" >&2
  fi
}

# GitHub (C++)
run_crawler "GitHub data for language cpp" "github_cpp_${DATE_RANGE}.json" \
  github_crawler.py --language cpp --access_token "$GITHUB_ACCESS_TOKEN"

# ArXiv
run_crawler "ArXiv data for classification computer_science" \
  "arxiv_computer_science_${DATE_RANGE}.json" \
  arxiv_crawler.py --classification computer_science

run_crawler "ArXiv data for classification physics" \
  "arxiv_physics_${DATE_RANGE}.json" \
  arxiv_crawler.py --classification physics

# Wikipedia
run_crawler "Wikipedia data" "wikipedia_english_${DATE_RANGE}.json" \
  wikipedia_crawler.py

# BBC News
run_crawler "BBC News data" "bbc_news_${DATE_RANGE}.json" \
  bbc_crawler.py

# AO3 — disabled: AO3 has a strict rate limit (20 requests per minute).
# Implement your own proxy strategy in proxy.py, then set max_workers to a
# larger value, or wait for a longer period, before enabling this.
#run_crawler "AO3 data" "ao3_english_${DATE_RANGE}.json" \
#  ao3_crawler.py --language english --max_workers 1

# GitHub (Python)
run_crawler "GitHub data for language python" "github_python_${DATE_RANGE}.json" \
  github_crawler.py --language python --access_token "$GITHUB_ACCESS_TOKEN"

echo "Data collection completed for all sources."