From b86bece102928fdf3a0be71364059cff200d70ba Mon Sep 17 00:00:00 2001 From: Jiahang Li Date: Sun, 16 Apr 2023 20:26:53 -0700 Subject: [PATCH 1/4] add custom dataset dwmw17 --- datasets/download_text_classification.sh | 77 ++++++++++--------- .../data_utils/text_classification_dataset.py | 33 ++++++++ 2 files changed, 74 insertions(+), 36 deletions(-) diff --git a/datasets/download_text_classification.sh b/datasets/download_text_classification.sh index 3f654d1..1d3e6f3 100755 --- a/datasets/download_text_classification.sh +++ b/datasets/download_text_classification.sh @@ -3,41 +3,46 @@ DIR="./TextClassification" mkdir $DIR cd $DIR -rm -rf mnli -wget --content-disposition https://cloud.tsinghua.edu.cn/f/33182c22cb594e88b49b/?dl=1 -tar -zxvf mnli.tar.gz -rm -rf mnli.tar.gz - -rm -rf agnews -wget --content-disposition https://cloud.tsinghua.edu.cn/f/0fb6af2a1e6647b79098/?dl=1 -tar -zxvf agnews.tar.gz -rm -rf agnews.tar.gz - -rm -rf dbpedia -wget --content-disposition https://cloud.tsinghua.edu.cn/f/362d3cdaa63b4692bafb/?dl=1 -tar -zxvf dbpedia.tar.gz -rm -rf dbpedia.tar.gz - -rm -rf imdb -wget --content-disposition https://cloud.tsinghua.edu.cn/f/37bd6cb978d342db87ed/?dl=1 -tar -zxvf imdb.tar.gz -rm -rf imdb.tar.gz - -rm -rf SST-2 -wget --content-disposition https://cloud.tsinghua.edu.cn/f/bccfdb243eca404f8bf3/?dl=1 -tar -zxvf SST-2.tar.gz -rm -rf SST-2.tar.gz - -rm -rf amazon -wget --content-disposition https://cloud.tsinghua.edu.cn/f/e00a4c44aaf844cdb6c9/?dl=1 -tar -zxvf amazon.tar.gz -mv datasets/amazon/ amazon -rm -rf ./datasets -rm -rf amazon.tar.gz - -rm -rf yahoo_answers_topics -wget --content-disposition https://cloud.tsinghua.edu.cn/f/79257038afaa4730a03f/?dl=1 -tar -zxvf yahoo_answers_topics.tar.gz -rm -rf yahoo_answers_topics.tar.gz +# rm -rf mnli +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/33182c22cb594e88b49b/?dl=1 +# tar -zxvf mnli.tar.gz +# rm -rf mnli.tar.gz + +# rm -rf agnews +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/0fb6af2a1e6647b79098/?dl=1 +# tar -zxvf agnews.tar.gz +# rm -rf agnews.tar.gz + +# rm -rf dbpedia +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/362d3cdaa63b4692bafb/?dl=1 +# tar -zxvf dbpedia.tar.gz +# rm -rf dbpedia.tar.gz + +# rm -rf imdb +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/37bd6cb978d342db87ed/?dl=1 +# tar -zxvf imdb.tar.gz +# rm -rf imdb.tar.gz + +# rm -rf SST-2 +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/bccfdb243eca404f8bf3/?dl=1 +# tar -zxvf SST-2.tar.gz +# rm -rf SST-2.tar.gz + +# rm -rf amazon +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/e00a4c44aaf844cdb6c9/?dl=1 +# tar -zxvf amazon.tar.gz +# mv datasets/amazon/ amazon +# rm -rf ./datasets +# rm -rf amazon.tar.gz + +# rm -rf yahoo_answers_topics +# wget --content-disposition https://cloud.tsinghua.edu.cn/f/79257038afaa4730a03f/?dl=1 +# tar -zxvf yahoo_answers_topics.tar.gz +# rm -rf yahoo_answers_topics.tar.gz + +rm -rf dwmw17 +wget --content-disposition https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv +mkdir -p dwmw17 +mv labeled_data.csv dwmw17 cd .. 
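A quick way to sanity-check the download above is to recompute the class
distribution that the new processor below hard-codes. This is a minimal
sketch, assuming pandas is installed and the script was run from the
datasets/ directory, and that the CSV keeps the t-davidson column layout
(a row index, annotator-count columns, the majority 'class' label, and
'tweet'):

    import os
    import pandas as pd

    # Load the raw DWMW17 CSV fetched by the script above.
    df = pd.read_csv(os.path.join("TextClassification", "dwmw17", "labeled_data.csv"))
    print(len(df))                     # 24783 rows in total
    print(df["class"].value_counts())  # 1: 19190, 2: 4163, 0: 1430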
diff --git a/openprompt/data_utils/text_classification_dataset.py b/openprompt/data_utils/text_classification_dataset.py
index 1dde85c..f9ca7ce 100644
--- a/openprompt/data_utils/text_classification_dataset.py
+++ b/openprompt/data_utils/text_classification_dataset.py
@@ -17,6 +17,7 @@ import os
 import json, csv
+import pandas as pd
 from abc import ABC, abstractmethod
 from collections import defaultdict, Counter
 from typing import List, Dict, Callable
@@ -27,6 +28,38 @@ from openprompt.data_utils.data_processor import DataProcessor
 
 
+class Dwmw17Processor(DataProcessor):
+    def __init__(self):
+        super().__init__()
+        self.labels = [ "hate speech", "offensive language", "neither" ]
+
+    def get_examples(self, data_dir, split):
+        df = pd.read_csv(os.path.join(data_dir, 'labeled_data.csv'))
+        # The CSV has 24783 rows in total; the per-class counts are
+        # 0 (hate speech): 1430, 1 (offensive language): 19190,
+        # 2 (neither): 4163. Take the first 50% of each class as the
+        # training set and the remaining 50% as the test set.
+        train_splits = [ 715, 9595, 2081 ]
+        examples = []
+        for label_idx in range(len(self.labels)):
+            df_label = df[df['class'] == label_idx]
+            train_split = train_splits[label_idx]
+
+            tweets = df_label['tweet'].tolist()
+            indices = df_label.iloc[:, 0].tolist()
+            if split == 'train':
+                tweets_split = tweets[:train_split]
+                indices_split = indices[:train_split]
+            else:
+                tweets_split = tweets[train_split:]
+                indices_split = indices[train_split:]
+            for tweet, index in zip(tweets_split, indices_split):
+                examples.append(InputExample(
+                    guid=str(index), text_a=tweet, text_b="", label=label_idx
+                ))
+        return examples
+
+
 class MnliProcessor(DataProcessor):
     # TODO Test needed
     def __init__(self):

From f1085d1429eaae9f74c0a28260d071c02c701a88 Mon Sep 17 00:00:00 2001
From: Harsh Karia
Date: Wed, 19 Apr 2023 14:42:26 -0700
Subject: [PATCH 2/4] Adding split dataset + standard dataloader

---
 datasets/download_text_classification.sh       | 10 ++--
 .../data_utils/text_classification_dataset.py  | 51 +++++++++++--------
 2 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/datasets/download_text_classification.sh b/datasets/download_text_classification.sh
index 1d3e6f3..6b3672a 100755
--- a/datasets/download_text_classification.sh
+++ b/datasets/download_text_classification.sh
@@ -1,5 +1,5 @@
 #!/bin/sh
-DIR="./TextClassification"
+DIR="./datasets/TextClassification"
 mkdir $DIR
 cd $DIR
 
@@ -41,8 +41,10 @@ cd $DIR
 # rm -rf yahoo_answers_topics.tar.gz
 
 rm -rf dwmw17
-wget --content-disposition https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv
-mkdir -p dwmw17
-mv labeled_data.csv dwmw17
+FILEID="1FW_qQX8aubnuFy--y8cY8HW26CFixFei"
+FILENAME="dwmw17.zip"
+wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate "https://docs.google.com/uc?export=download&id=${FILEID}" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=${FILEID}" -O ${FILENAME} && rm -rf /tmp/cookies.txt
+unzip dwmw17.zip
+rm dwmw17.zip
 
 cd ..
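The archive above replaces the single labeled_data.csv with pre-split
files, which the reworked processor below reads as {split}.csv. A minimal
layout check, assuming the zip unpacks to train.csv and test.csv inside
dwmw17/ (file names inferred from the processor, since the archive
contents are not shown here):

    import csv
    import os

    base = os.path.join("datasets", "TextClassification", "dwmw17")
    for split in ("train", "test"):
        with open(os.path.join(base, split + ".csv"), encoding="utf8") as f:
            reader = csv.reader(f)
            next(reader)  # header row, which the processor skips too
            # Each record should carry seven fields: index, four unused
            # count columns, the class label, and the tweet text.
            assert len(next(reader)) == 7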
diff --git a/openprompt/data_utils/text_classification_dataset.py b/openprompt/data_utils/text_classification_dataset.py
index f9ca7ce..4cfe0fa 100644
--- a/openprompt/data_utils/text_classification_dataset.py
+++ b/openprompt/data_utils/text_classification_dataset.py
@@ -29,34 +29,40 @@
 class Dwmw17Processor(DataProcessor):
+    """
+    from openprompt.data_utils.text_classification_dataset import PROCESSORS
+    import os
+    # Get the absolute path of the parent of the current working directory
+    root_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
+
+    # Set the base path to the 'datasets' directory located in the parent directory
+    base_path = os.path.join(root_dir, 'datasets/TextClassification')
+
+
+    dataset_name = "dwmw17"
+    dataset_path = os.path.join(base_path, dataset_name)
+    processor = PROCESSORS[dataset_name.lower()]()
+    trainvalid_dataset = processor.get_train_examples(dataset_path)
+    print(trainvalid_dataset)
+    """
     def __init__(self):
         super().__init__()
         self.labels = [ "hate speech", "offensive language", "neither" ]
 
     def get_examples(self, data_dir, split):
-        df = pd.read_csv(os.path.join(data_dir, 'labeled_data.csv'))
-        # The CSV has 24783 rows in total; the per-class counts are
-        # 0 (hate speech): 1430, 1 (offensive language): 19190,
-        # 2 (neither): 4163. Take the first 50% of each class as the
-        # training set and the remaining 50% as the test set.
-        train_splits = [ 715, 9595, 2081 ]
+        path = os.path.join(data_dir, "{}.csv".format(split))
         examples = []
-        for label_idx in range(len(self.labels)):
-            df_label = df[df['class'] == label_idx]
-            train_split = train_splits[label_idx]
-
-            tweets = df_label['tweet'].tolist()
-            indices = df_label.iloc[:, 0].tolist()
-            if split == 'train':
-                tweets_split = tweets[:train_split]
-                indices_split = indices[:train_split]
-            else:
-                tweets_split = tweets[train_split:]
-                indices_split = indices[train_split:]
-            for tweet, index in zip(tweets_split, indices_split):
-                examples.append(InputExample(
-                    guid=str(index), text_a=tweet, text_b="", label=label_idx
-                ))
+        with open(path, encoding='utf8') as f:
+            reader = csv.reader(f, delimiter=',')
+            # Skip the header row
+            next(reader)
+            for row in reader:
+                idx, _, _, _, _, label, tweet = row
+                text_a = tweet
+                example = InputExample(
+                    guid=str(idx), text_a=text_a, label=int(label))
+                examples.append(example)
+
         return examples
@@ -391,4 +397,5 @@ def get_examples(self, data_dir, split):
     "sst-2": SST2Processor,
     "mnli": MnliProcessor,
     "yahoo": YahooProcessor,
+    "dwmw17": Dwmw17Processor
 }

From e01dc0fc9a9ff12838c196b978b7c2bf014420a1 Mon Sep 17 00:00:00 2001
From: Harsh Karia
Date: Wed, 19 Apr 2023 15:18:34 -0700
Subject: [PATCH 3/4] Adding ICL learned verbalizer and template

---
 scripts/TextClassification/dwmw17/icl_verbalizer.json | 5 +++++
 scripts/TextClassification/dwmw17/manual_template.txt | 3 +++
 2 files changed, 8 insertions(+)
 create mode 100644 scripts/TextClassification/dwmw17/icl_verbalizer.json
 create mode 100644 scripts/TextClassification/dwmw17/manual_template.txt

diff --git a/scripts/TextClassification/dwmw17/icl_verbalizer.json b/scripts/TextClassification/dwmw17/icl_verbalizer.json
new file mode 100644
index 0000000..c4631d0
--- /dev/null
+++ b/scripts/TextClassification/dwmw17/icl_verbalizer.json
@@ -0,0 +1,5 @@
+{
+    "hate speech": ["Hateful", "Malicious", "Malevolent", "Vicious", "Nefarious", "Sinister", "Discriminatory", "Harmful", "Abusive", "Prejudice"],
+    "offensive language": ["Offensive", "Insulting", "Rude", "Inappropriate", "Insensitive", "Controversial", "Obscenity", "Profanity"],
+    "neither": ["Harmless", "Innocent", "Benign", "Nonthreatening", "Inoffensive", "Amicable", "Acceptable",
"Respectful", "Neutral"] +} \ No newline at end of file diff --git a/scripts/TextClassification/dwmw17/manual_template.txt b/scripts/TextClassification/dwmw17/manual_template.txt new file mode 100644 index 0000000..4934197 --- /dev/null +++ b/scripts/TextClassification/dwmw17/manual_template.txt @@ -0,0 +1,3 @@ +This tweet contains {"mask"} . {"placeholder": "text_a"} +This tweet is {"mask"} . {"placeholder": "text_a"} +A {"mask"} tweet : {"placeholder": "text_a"} \ No newline at end of file From ceca3d0032bf3e95c95bcfe872abd3d1ddc886be Mon Sep 17 00:00:00 2001 From: Harsh Karia Date: Wed, 19 Apr 2023 15:28:30 -0700 Subject: [PATCH 4/4] Adding experiment for dwmw17 --- .../classification_protoverb_dwmw17.yaml | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 experiments/classification_protoverb_dwmw17.yaml diff --git a/experiments/classification_protoverb_dwmw17.yaml b/experiments/classification_protoverb_dwmw17.yaml new file mode 100644 index 0000000..2b0deb3 --- /dev/null +++ b/experiments/classification_protoverb_dwmw17.yaml @@ -0,0 +1,68 @@ +dataset: + name: dwmw17 + path: datasets/TextClassification/dwmw17 + +plm: + model_name: roberta + model_path: roberta-large + optimize: + freeze_para: False + lr: 0.00003 + weight_decay: 0.01 + scheduler: + type: + num_warmup_steps: 500 + +checkpoint: + save_latest: False + save_best: False + +train: + batch_size: 2 + num_epochs: 5 + train_verblizer: post + clean: True + +test: + batch_size: 2 + +template: manual_template +verbalizer: proto_verbalizer + +manual_template: + choice: 0 + file_path: scripts/TextClassification/dwmw17/manual_template.txt + +proto_verbalizer: + parent_config: dwmw17 + choice: 0 + file_path: scripts/TextClassification/dwmw17/icl_verbalizer.json + lr: 0.01 + mid_dim: 128 + epochs: 30 + multi_verb: multi + + + +environment: + num_gpus: 1 + cuda_visible_devices: + - 0 + local_rank: 0 + +learning_setting: few_shot + +few_shot: + parent_config: learning_setting + few_shot_sampling: sampling_from_train + +sampling_from_train: + parent_config: few_shot_sampling + num_examples_per_label: 1 + also_sample_dev: True + num_examples_per_label_dev: 1 + seed: + - 123 + +reproduce: # seed for reproduction + seed: 123 # a seed for all random part \ No newline at end of file