-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path.env-example
69 lines (51 loc) · 2.64 KB
/
.env-example
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# ------------
# ANNOTATIONS
# ------------
## Some ID for the annotator of the dataset. This ID will be used as a prefix for the comment and
## labels fields of the output.csv
ANNOTATOR = 'annotator_name'
## The randomization seed to shuffle the tasks for the annotator ('None' means not randomized)
RANDOM_SEED = 'None'
## A comma-separated list of labels that the ANNOTATOR may use for the tasks of the TASKS_FILE
LABELS = 'Children,Energy,Cannabis'
# -----------
# TASKS FILE
# -----------
## The column of the TASKS_FILE that uniquely identifies each annotation task.
TASKS_ID_COLUMN = '_id'
## The column of the TASKS_FILE that holds the url to be annotated.
TASKS_URL_COLUMN = 'url'
# ----------
# STRUCTURE
# ----------
## The directory that will hold the TASKS_FILE, the annotations database (ANNOTATIONS_DB), the raw
## text directory, the cleaned text directory and the html directory
WORKING_DIR = 'example_workdir'
## The CSV file that holds the annotation tasks (including the TASKS_ID_COLUMN and TASKS_URL_COLUMN)
TASKS_FILE = 'tasks.csv'
## The sqlite file in the WORKING_DIR that will hold all annotations.
ANNOTATIONS_DB = 'annotations.sqlite'
## The directory in the WORKING_DIR that will hold the extracted (raw) text of the html content of
## each task.
RAW_TEXT_DIR = 'raw_text'
## The directory in the WORKING_DIR that will hold the extracted and cleaned text of the html content
## of each task.
CLEANED_TEXT_DIR = 'cleaned_text'
## The directory in the WORKING_DIR that holds the html content of each task as a result of scraping
## done beforehand. Each html file is expected to follow the naming scheme: TASK_ID.html, where
## TASK_ID is the value in the TASKS_ID_COLUMN of the corresponding row in the TASKS_FILE.
HTML_DIR = 'html'
####################
# URL PARSING AIDS #
####################
# parameters used to find search terms in URLs
URL_QUERY_PARAMS = 'q,p,query,text,search_query,search,psg'
# comma-separated path steps commonly found in URLs that might be confused with SEO titles
# (aka slugs) and should therefore be excluded.
# NOTE: keep this value on a single line. A quoted value wrapped across several lines is either
# truncated or picks up embedded newlines, depending on the dotenv parser in use.
NOT_SEO_TITLES = 'cgi-bin,fast-cgi,auth-ui,consent-management,openid-connect,de-de,en-us,wba-ui,websso-prod,login-ui,login-callback,login-actions,sign-up,sign-in,sign-out,search,watch,results,web,result,scholar,url,dsearch,zustimmung,index,live,suche,suchergebnisse,home,top,default,welcome,homepage,main,start,surveys,viwweb,login,tv-sender,live-tv,tv-programm,streamen-tv,tv-programm-live-stream,alle-kategorien'
# mapping of special characters to their ASCII replacements, e.g., the German umlauts and ß below
SPECIAL_CHARACTER_MAP = '{"ß": "ss", "ä": "ae", "ö": "oe", "ü": "ue"}'
# comma-separated list of common file extensions found at the end of the slug
# (each extension is listed once)
COMMON_EXTENSIONS = 'html,htm,pdf,php,aspx,asp'