From b86bece102928fdf3a0be71364059cff200d70ba Mon Sep 17 00:00:00 2001 From: Jiahang Li Date: Sun, 16 Apr 2023 20:26:53 -0700 Subject: [PATCH 1/4] add custom dataset dwmw17 --- datasets/download_text_classification.sh | 77 ++++++++++--------- .../data_utils/text_classification_dataset.py | 33 ++++++++ 2 files changed, 74 insertions(+), 36 deletions(-) diff --git a/datasets/download_text_classification.sh b/datasets/download_text_classification.sh index 3f654d1..1d3e6f3 100755 --- a/datasets/download_text_classification.sh +++ b/datasets/download_text_classification.sh @@ -3,41 +3,46 @@ DIR="./TextClassification" mkdir $DIR cd $DIR -rm -rf mnli -wget --content-disposition https://cloud.tsinghua.edu.cn/f/33182c22cb594e88b49b/?dl=1 -tar -zxvf mnli.tar.gz -rm -rf mnli.tar.gz - -rm -rf agnews -wget --content-disposition https://cloud.tsinghua.edu.cn/f/0fb6af2a1e6647b79098/?dl=1 -tar -zxvf agnews.tar.gz -rm -rf agnews.tar.gz - -rm -rf dbpedia -wget --content-disposition https://cloud.tsinghua.edu.cn/f/362d3cdaa63b4692bafb/?dl=1 -tar -zxvf dbpedia.tar.gz -rm -rf dbpedia.tar.gz - -rm -rf imdb -wget --content-disposition https://cloud.tsinghua.edu.cn/f/37bd6cb978d342db87ed/?dl=1 -tar -zxvf imdb.tar.gz -rm -rf imdb.tar.gz - -rm -rf SST-2 -wget --content-disposition https://cloud.tsinghua.edu.cn/f/bccfdb243eca404f8bf3/?dl=1 -tar -zxvf SST-2.tar.gz -rm -rf SST-2.tar.gz - -rm -rf amazon -wget --content-disposition https://cloud.tsinghua.edu.cn/f/e00a4c44aaf844cdb6c9/?dl=1 -tar -zxvf amazon.tar.gz -mv datasets/amazon/ amazon -rm -rf ./datasets -rm -rf amazon.tar.gz - -rm -rf yahoo_answers_topics -wget --content-disposition https://cloud.tsinghua.edu.cn/f/79257038afaa4730a03f/?dl=1 -tar -zxvf yahoo_answers_topics.tar.gz -rm -rf yahoo_answers_topics.tar.gz +# rm -rf mnli +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/33182c22cb594e88b49b/?dl=1 +# tar -zxvf mnli.tar.gz +# rm -rf mnli.tar.gz + +# rm -rf agnews +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/0fb6af2a1e6647b79098/?dl=1 +# tar -zxvf agnews.tar.gz +# rm -rf agnews.tar.gz + +# rm -rf dbpedia +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/362d3cdaa63b4692bafb/?dl=1 +# tar -zxvf dbpedia.tar.gz +# rm -rf dbpedia.tar.gz + +# rm -rf imdb +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/37bd6cb978d342db87ed/?dl=1 +# tar -zxvf imdb.tar.gz +# rm -rf imdb.tar.gz + +# rm -rf SST-2 +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/bccfdb243eca404f8bf3/?dl=1 +# tar -zxvf SST-2.tar.gz +# rm -rf SST-2.tar.gz + +# rm -rf amazon +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/e00a4c44aaf844cdb6c9/?dl=1 +# tar -zxvf amazon.tar.gz +# mv datasets/amazon/ amazon +# rm -rf ./datasets +# rm -rf amazon.tar.gz + +# rm -rf yahoo_answers_topics +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/79257038afaa4730a03f/?dl=1 +# tar -zxvf yahoo_answers_topics.tar.gz +# rm -rf yahoo_answers_topics.tar.gz + +rm -rf dwmw17 +wget --content-disposition https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv +mkdir -p dwmw17 +mv labeled_data.csv dwmw17 cd .. 
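A quick way to sanity-check the download above is to recompute the class
distribution that the new processor below hard-codes. This is a minimal
sketch, assuming pandas is installed and the script was run from the
datasets/ directory, and that the CSV keeps the t-davidson column layout
(a row index, annotator-count columns, the majority 'class' label, and
'tweet'):

    import os
    import pandas as pd

    # Load the raw DWMW17 CSV fetched by the script above.
    df = pd.read_csv(os.path.join("TextClassification", "dwmw17", "labeled_data.csv"))
    print(len(df))                     # 24783 rows in total
    print(df["class"].value_counts())  # 1: 19190, 2: 4163, 0: 1430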
diff --git a/openprompt/data_utils/text_classification_dataset.py b/openprompt/data_utils/text_classification_dataset.py
index 1dde85c..f9ca7ce 100644
--- a/openprompt/data_utils/text_classification_dataset.py
+++ b/openprompt/data_utils/text_classification_dataset.py
@@ -17,6 +17,7 @@ import os
 import json, csv
+import pandas as pd
 from abc import ABC, abstractmethod
 from collections import defaultdict, Counter
 from typing import List, Dict, Callable
@@ -27,6 +28,38 @@ from openprompt.data_utils.data_processor import DataProcessor
 
 
+class Dwmw17Processor(DataProcessor):
+    def __init__(self):
+        super().__init__()
+        self.labels = [ "hate speech", "offensive language", "neither" ]
+
+    def get_examples(self, data_dir, split):
+        df = pd.read_csv(os.path.join(data_dir, 'labeled_data.csv'))
+        # The CSV has 24783 rows in total; the per-class counts are
+        # 0 (hate speech): 1430, 1 (offensive language): 19190,
+        # 2 (neither): 4163. Take the first 50% of each class as the
+        # training set and the remaining 50% as the test set.
+        train_splits = [ 715, 9595, 2081 ]
+        examples = []
+        for label_idx in range(len(self.labels)):
+            df_label = df[df['class'] == label_idx]
+            train_split = train_splits[label_idx]
+
+            tweets = df_label['tweet'].tolist()
+            indices = df_label.iloc[:, 0].tolist()
+            if split == 'train':
+                tweets_split = tweets[:train_split]
+                indices_split = indices[:train_split]
+            else:
+                tweets_split = tweets[train_split:]
+                indices_split = indices[train_split:]
+            for tweet, index in zip(tweets_split, indices_split):
+                examples.append(InputExample(
+                    guid=str(index), text_a=tweet, text_b="", label=label_idx
+                ))
+        return examples
+
+
 class MnliProcessor(DataProcessor):
     # TODO Test needed
     def __init__(self):

From f1085d1429eaae9f74c0a28260d071c02c701a88 Mon Sep 17 00:00:00 2001
From: Harsh Karia
Date: Wed, 19 Apr 2023 14:42:26 -0700
Subject: [PATCH 2/4] Adding split dataset + standard dataloader

---
 datasets/download_text_classification.sh       | 10 ++--
 .../data_utils/text_classification_dataset.py  | 51 +++++++++++--------
 2 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/datasets/download_text_classification.sh b/datasets/download_text_classification.sh
index 1d3e6f3..6b3672a 100755
--- a/datasets/download_text_classification.sh
+++ b/datasets/download_text_classification.sh
@@ -1,5 +1,5 @@
 #!/bin/sh
-DIR="./TextClassification"
+DIR="./datasets/TextClassification"
 mkdir $DIR
 cd $DIR
 
@@ -41,8 +41,10 @@ cd $DIR
 # rm -rf yahoo_answers_topics.tar.gz
 
 rm -rf dwmw17
-wget --content-disposition https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv
-mkdir -p dwmw17
-mv labeled_data.csv dwmw17
+FILEID="1FW_qQX8aubnuFy--y8cY8HW26CFixFei"
+FILENAME="dwmw17.zip"
+wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=${FILEID}" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=${FILEID}" -O ${FILENAME} && rm -rf /tmp/cookies.txt
+unzip dwmw17.zip
+rm dwmw17.zip
 
 cd ..
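The archive above replaces the single labeled_data.csv with pre-split
files, which the reworked processor below reads as {split}.csv. A minimal
layout check, assuming the zip unpacks to train.csv and test.csv inside
dwmw17/ (file names inferred from the processor, since the archive
contents are not shown here):

    import csv
    import os

    base = os.path.join("datasets", "TextClassification", "dwmw17")
    for split in ("train", "test"):
        with open(os.path.join(base, split + ".csv"), encoding="utf8") as f:
            reader = csv.reader(f)
            next(reader)  # header row, which the processor skips too
            # Each record should carry seven fields: index, four unused
            # count columns, the class label, and the tweet text.
            assert len(next(reader)) == 7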
diff --git a/openprompt/data_utils/text_classification_dataset.py b/openprompt/data_utils/text_classification_dataset.py
index f9ca7ce..4cfe0fa 100644
--- a/openprompt/data_utils/text_classification_dataset.py
+++ b/openprompt/data_utils/text_classification_dataset.py
@@ -29,34 +29,40 @@
 class Dwmw17Processor(DataProcessor):
+    """
+    from openprompt.data_utils.text_classification_dataset import PROCESSORS
+    import os
+    # Get the absolute path of the parent of the current working directory
+    root_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
+
+    # Set the base path to the 'datasets' directory located in the parent directory
+    base_path = os.path.join(root_dir, 'datasets/TextClassification')
+
+
+    dataset_name = "dwmw17"
+    dataset_path = os.path.join(base_path, dataset_name)
+    processor = PROCESSORS[dataset_name.lower()]()
+    trainvalid_dataset = processor.get_train_examples(dataset_path)
+    print(trainvalid_dataset)
+    """
     def __init__(self):
         super().__init__()
         self.labels = [ "hate speech", "offensive language", "neither" ]
 
     def get_examples(self, data_dir, split):
-        df = pd.read_csv(os.path.join(data_dir, 'labeled_data.csv'))
-        # The CSV has 24783 rows in total; the per-class counts are
-        # 0 (hate speech): 1430, 1 (offensive language): 19190,
-        # 2 (neither): 4163. Take the first 50% of each class as the
-        # training set and the remaining 50% as the test set.
-        train_splits = [ 715, 9595, 2081 ]
+        path = os.path.join(data_dir, "{}.csv".format(split))
         examples = []
-        for label_idx in range(len(self.labels)):
-            df_label = df[df['class'] == label_idx]
-            train_split = train_splits[label_idx]
-
-            tweets = df_label['tweet'].tolist()
-            indices = df_label.iloc[:, 0].tolist()
-            if split == 'train':
-                tweets_split = tweets[:train_split]
-                indices_split = indices[:train_split]
-            else:
-                tweets_split = tweets[train_split:]
-                indices_split = indices[train_split:]
-            for tweet, index in zip(tweets_split, indices_split):
-                examples.append(InputExample(
-                    guid=str(index), text_a=tweet, text_b="", label=label_idx
-                ))
+        with open(path, encoding='utf8') as f:
+            reader = csv.reader(f, delimiter=',')
+            # Skip the header row
+            next(reader)
+            for row in reader:
+                idx, _, _, _, _, label, tweet = row
+                text_a = tweet
+                example = InputExample(
+                    guid=str(idx), text_a=text_a, label=int(label))
+                examples.append(example)
+
         return examples
@@ -391,4 +397,5 @@ def get_examples(self, data_dir, split):
     "sst-2": SST2Processor,
     "mnli": MnliProcessor,
     "yahoo": YahooProcessor,
+    "dwmw17": Dwmw17Processor
 }

From e01dc0fc9a9ff12838c196b978b7c2bf014420a1 Mon Sep 17 00:00:00 2001
From: Harsh Karia
Date: Wed, 19 Apr 2023 15:18:34 -0700
Subject: [PATCH 3/4] Adding ICL learned verbalizer and template

---
 scripts/TextClassification/dwmw17/icl_verbalizer.json | 5 +++++
 scripts/TextClassification/dwmw17/manual_template.txt | 3 +++
 2 files changed, 8 insertions(+)
 create mode 100644 scripts/TextClassification/dwmw17/icl_verbalizer.json
 create mode 100644 scripts/TextClassification/dwmw17/manual_template.txt

diff --git a/scripts/TextClassification/dwmw17/icl_verbalizer.json b/scripts/TextClassification/dwmw17/icl_verbalizer.json
new file mode 100644
index 0000000..c4631d0
--- /dev/null
+++ b/scripts/TextClassification/dwmw17/icl_verbalizer.json
@@ -0,0 +1,5 @@
+{
+    "hate speech": ["Hateful", "Malicious", "Malevolent", "Vicious", "Nefarious", "Sinister", "Discriminatory", "Harmful", "Abusive", "Prejudice"],
+    "offensive language": ["Offensive", "Insulting", "Rude", "Inappropriate", "Insensitive", "Controversial", "Obscenity", "Profanity"],
+    "neither": ["Harmless", "Innocent", "Benign", "Nonthreatening", "Inoffensive", "Amicable", "Acceptable",
"Respectful", "Neutral"] +} \ No newline at end of file diff --git a/scripts/TextClassification/dwmw17/manual_template.txt b/scripts/TextClassification/dwmw17/manual_template.txt new file mode 100644 index 0000000..4934197 --- /dev/null +++ b/scripts/TextClassification/dwmw17/manual_template.txt @@ -0,0 +1,3 @@ +This tweet contains {"mask"} . {"placeholder": "text_a"} +This tweet is {"mask"} . {"placeholder": "text_a"} +A {"mask"} tweet : {"placeholder": "text_a"} \ No newline at end of file From ceca3d0032bf3e95c95bcfe872abd3d1ddc886be Mon Sep 17 00:00:00 2001 From: Harsh Karia Date: Wed, 19 Apr 2023 15:28:30 -0700 Subject: [PATCH 4/4] Adding experiment for dwmw17 --- .../classification_protoverb_dwmw17.yaml | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 experiments/classification_protoverb_dwmw17.yaml diff --git a/experiments/classification_protoverb_dwmw17.yaml b/experiments/classification_protoverb_dwmw17.yaml new file mode 100644 index 0000000..2b0deb3 --- /dev/null +++ b/experiments/classification_protoverb_dwmw17.yaml @@ -0,0 +1,68 @@ +dataset: + name: dwmw17 + path: datasets/TextClassification/dwmw17 + +plm: + model_name: roberta + model_path: roberta-large + optimize: + freeze_para: False + lr: 0.00003 + weight_decay: 0.01 + scheduler: + type: + num_warmup_steps: 500 + +checkpoint: + save_latest: False + save_best: False + +train: + batch_size: 2 + num_epochs: 5 + train_verblizer: post + clean: True + +test: + batch_size: 2 + +template: manual_template +verbalizer: proto_verbalizer + +manual_template: + choice: 0 + file_path: scripts/TextClassification/dwmw17/manual_template.txt + +proto_verbalizer: + parent_config: dwmw17 + choice: 0 + file_path: scripts/TextClassification/dwmw17/icl_verbalizer.json + lr: 0.01 + mid_dim: 128 + epochs: 30 + multi_verb: multi + + + +environment: + num_gpus: 1 + cuda_visible_devices: + - 0 + local_rank: 0 + +learning_setting: few_shot + +few_shot: + parent_config: learning_setting + few_shot_sampling: sampling_from_train + +sampling_from_train: + parent_config: few_shot_sampling + num_examples_per_label: 1 + also_sample_dev: True + num_examples_per_label_dev: 1 + seed: + - 123 + +reproduce: # seed for reproduction + seed: 123 # a seed for all random part \ No newline at end of file