.env-example

# ------------
# ANNOTATIONS
# ------------

## An ID for the annotator of the dataset. This ID is used as a prefix for the comment and
## labels fields of the output.csv.
ANNOTATOR = 'annotator_name'

## The randomization seed used to shuffle the tasks for the annotator. The literal string 'None'
## means the tasks are not randomized.
RANDOM_SEED = 'None'

## A comma-separated list of labels that the ANNOTATOR may use for the tasks of the TASKS_FILE.
LABELS = 'Children,Energy,Cannabis'

# -----------
# TASKS FILE
# -----------

## The column of the TASKS_FILE that uniquely identifies each annotation task.
TASKS_ID_COLUMN = '_id'

## The column of the TASKS_FILE that holds the URL to be annotated.
TASKS_URL_COLUMN = 'url'

# ----------
# STRUCTURE
# ----------

## The directory that holds the TASKS_FILE, the annotations database, the raw text directory,
## the cleaned text directory and the html directory.
WORKING_DIR = 'example_workdir'

## The CSV file that holds the annotation tasks (including the TASKS_ID_COLUMN and TASKS_URL_COLUMN).
TASKS_FILE = 'tasks.csv'

## The sqlite file in the WORKING_DIR that will hold all annotations.
ANNOTATIONS_DB = 'annotations.sqlite'

## The directory in the WORKING_DIR that will hold the extracted (raw) text of the html content of
## each task.
RAW_TEXT_DIR = 'raw_text'

## The directory in the WORKING_DIR that will hold the extracted and cleaned text of the html content
## of each task.
CLEANED_TEXT_DIR = 'cleaned_text'

## The directory in the WORKING_DIR that holds the html content of each task, produced by scraping
## done beforehand. Each html file is expected to follow the naming scheme TASK_ID.html, where
## TASK_ID is the value in the TASKS_ID_COLUMN of the corresponding row in the TASKS_FILE.
HTML_DIR = 'html'

####################
# URL PARSING AIDS #
####################

# comma-separated list of URL query parameter names used to find search terms in URLs
URL_QUERY_PARAMS = 'q,p,query,text,search_query,search,psg'

# exclude commonly found steps in the paths that might be confused with SEO titles (a.k.a. slugs).
# Comma-separated; keep the whole value on ONE line: a line break inside the quotes becomes part of
# the neighbouring entry (e.g. "\nlogin-ui") and is not supported by every .env loader.
NOT_SEO_TITLES = 'cgi-bin,fast-cgi,auth-ui,consent-management,openid-connect,de-de,en-us,wba-ui,websso-prod,login-ui,login-callback,login-actions,sign-up,sign-in,sign-out,search,watch,results,web,result,scholar,url,dsearch,zustimmung,index,live,suche,suchergebnisse,home,top,default,welcome,homepage,main,start,surveys,viwweb,login,tv-sender,live-tv,tv-programm,streamen-tv,tv-programm-live-stream,alle-kategorien'

# mapping from special characters to their plain-ASCII replacements, written as a JSON-style
# object; the example below covers the German umlauts and ß
SPECIAL_CHARACTER_MAP = '{"ß": "ss", "ä": "ae", "ö": "oe", "ü": "ue"}'

# comma-separated list of common file extensions found at the end of the slug
COMMON_EXTENSIONS = 'html,htm,pdf,php,aspx,asp'