Showing 32 changed files with 5,105 additions and 2 deletions.
# llm-science-exam

6th Position Solution Code for Kaggle - LLM Science Exam Competition

This repo contains our code and configurations for the Kaggle - LLM Science Exam competition. A detailed summary of the solution is posted [here](https://www.kaggle.com/competitions/kaggle-llm-science-exam/discussion/447647). Please refer to the following sections for details on training and dependencies.

## Section 1: Setup

### 1.1 Hardware
Computing resources from Jarvislabs.ai were used. Specifically, models were trained on the following instance:

* Ubuntu 20.04.5 LTS (128 GB boot disk)
* Intel(R) Xeon(R) Silver 4216 CPU @ 2.10GHz (7 vCPUs)
* 1 x NVIDIA A100 40GB GPU or 1 x NVIDIA A6000 48GB GPU

### 1.2 Software
We used the PyTorch-2.0 image from Jarvislabs.ai, which comes with:

* Python 3.10.11
* CUDA 11.8

Python packages can be installed with `pip install -r requirements.txt`.

### 1.3 Datasets
Please make sure the Kaggle API is installed. Then run the following script to download the required datasets:

```
chmod +x ./setup.sh
./setup.sh
```

Please note that the above script creates a `datasets` folder in the directory one level above the current directory; the external datasets are downloaded into that folder.

## Section 2: Training

### 2.1 Retriever Training
```
python ./code/train_e_topic.py \
    --config-name conf_e_topic_bge \
    use_wandb=false \
    all_data=false
```

### 2.2 Ranker Training
```
python ./code/train_e_ranker.py \
    --config-name conf_e_ranker \
    use_wandb=false \
    all_data=false
```

### 2.3 Reader: Spanwise model

#### Step 1: training with a large number of MCQs
```
python ./code/train_r_delta.py \
    --config-name conf_r_delta_k1 \
    use_wandb=false \
    all_data=false
```

#### Step 2: specialization with difficult MCQs
```
python ./code/train_r_delta.py \
    --config-name conf_r_delta_k2_resumed \
    use_wandb=false \
    all_data=false
```
Binary file not shown.
```python
from copy import deepcopy

from datasets import Dataset
from transformers import AutoTokenizer

# --------------- Dataset ----------------------------------------------#


class RankerDataset:
    """
    Dataset class for llm-science-exam task for re-ranking
    """

    def __init__(self, cfg):
        self.cfg = cfg
        self.tokenizer = AutoTokenizer.from_pretrained(cfg.model.backbone_path)

    def pre_process(self, df):
        columns = ["prompt", "A", "B", "C", "D", "E"]
        df["query"] = df[columns].apply(lambda x: " | ".join(x), axis=1)
        return df

    def tokenize_function(self, examples):
        tz = self.tokenizer(
            examples["query"],
            examples["context"],
            padding=False,
            truncation="longest_first",
            max_length=self.cfg.model.max_length,
            add_special_tokens=True,
            return_token_type_ids=False,
        )
        return tz

    def compute_input_length(self, examples):
        return {"input_length": [len(x) for x in examples["input_ids"]]}

    def get_dataset(self, df):
        """
        Main api for creating the Science Exam dataset

        :param df: input dataframe
        :type df: pd.DataFrame
        :return: the created dataset
        :rtype: Dataset
        """
        df = deepcopy(df)
        df = self.pre_process(df)
        task_dataset = Dataset.from_pandas(df)

        task_dataset = task_dataset.map(self.tokenize_function, batched=True)
        task_dataset = task_dataset.map(self.compute_input_length, batched=True)

        try:
            task_dataset = task_dataset.remove_columns(column_names=["__index_level_0__"])
        except Exception as e:
            print(e)

        return task_dataset
```
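For reference, here is a minimal usage sketch of the dataset class above. The config fields mirror the `cfg.model.backbone_path` and `cfg.model.max_length` attributes the class reads, but the `OmegaConf` config, the backbone name, and the toy dataframe are illustrative assumptions rather than the repository's actual training setup.

```python
# Hypothetical usage sketch (not part of the repo): build a tiny re-ranking dataset.
import pandas as pd
from omegaconf import OmegaConf

# Assumed config layout, mirroring cfg.model.backbone_path / cfg.model.max_length above.
cfg = OmegaConf.create({"model": {"backbone_path": "BAAI/bge-base-en-v1.5", "max_length": 512}})

# Toy dataframe: the question and answer options form the "query"; "context" is a retrieved passage.
df = pd.DataFrame(
    {
        "prompt": ["Which particle carries a negative charge?"],
        "A": ["Proton"], "B": ["Electron"], "C": ["Neutron"], "D": ["Photon"], "E": ["Positron"],
        "context": ["The electron is a subatomic particle with a negative elementary charge."],
    }
)

dataset = RankerDataset(cfg).get_dataset(df)
print(dataset[0]["input_length"])  # token count of the tokenized (query, context) pair
```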
```python
import pdb
import random
from copy import deepcopy
from dataclasses import dataclass, field

import torch
from transformers import DataCollatorWithPadding


@dataclass
class RankerCollator(DataCollatorWithPadding):
    """
    data collator for re-ranker task
    """

    tokenizer = None
    padding = True
    max_length = None
    pad_to_multiple_of = None
    return_tensors = "pt"

    def __call__(self, features):
        buffer_dict = dict()
        buffer_keys = ["query_id", "content_id"]

        for key in buffer_keys:
            if key in features[0].keys():
                value = [feature[key] for feature in features]
                buffer_dict[key] = value

        labels = None
        if "label" in features[0].keys():
            labels = [feature["label"] for feature in features]

        features = [
            {
                "input_ids": feature["input_ids"],
                "attention_mask": feature["attention_mask"],
            }
            for feature in features
        ]

        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=None,
        )

        for key, value in buffer_dict.items():
            batch[key] = value

        if labels is not None:
            batch["labels"] = labels

        tensor_keys = [
            "input_ids",
            "attention_mask",
        ]

        for key in tensor_keys:
            batch[key] = torch.tensor(batch[key], dtype=torch.int64)

        if labels is not None:
            batch["labels"] = torch.tensor(batch["labels"], dtype=torch.float32)

        return batch


# ---
def show_batch(batch, tokenizer, n_examples=16, task='training'):
    bs = batch['input_ids'].size(0)
    print(f"batch size: {bs}")

    print(f"shape of input_ids: {batch['input_ids'].shape}")

    n_examples = min(n_examples, bs)
    print(f"Showing {n_examples} from a {task} batch...")

    print("\n\n")
    for idx in range(n_examples):
        print(f"Input:\n\n{tokenizer.decode(batch['input_ids'][idx], skip_special_tokens=False)}")
        # print("\n\n")

        if "infer" not in task.lower():
            print("--"*20)
            labels = batch['labels'][idx]
            print(f"Label: {labels}")
        print('=='*40)
```
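A minimal sketch of how this collator might be wired to a PyTorch `DataLoader`, continuing the hypothetical `cfg` and `dataset` objects from the dataset sketch above. The batch size and the use of `show_batch` for inspection are assumptions for illustration, not the repository's training loop.

```python
# Hypothetical wiring sketch (not part of the repo): batch the tokenized dataset with RankerCollator.
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(cfg.model.backbone_path)
collator = RankerCollator(tokenizer=tokenizer, max_length=cfg.model.max_length)

# The collator keeps only input_ids / attention_mask (plus optional ids and labels),
# so extra string columns in the dataset are simply ignored.
loader = DataLoader(dataset, batch_size=8, shuffle=False, collate_fn=collator)

for batch in loader:
    # task="inference" skips the label printout, since the toy data has no "label" column.
    show_batch(batch, tokenizer, n_examples=2, task="inference")
    break
```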
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from transformers import AutoConfig, AutoModel


class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Rank Model
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

class RankerModel(nn.Module):
    """
    The Sci-LLM re-ranker
    """

    def __init__(self, cfg):
        print("initializing the Rank Model...")

        super(RankerModel, self).__init__()
        self.cfg = cfg

        # ----------------------------- Backbone -----------------------------------------#
        backbone_config = AutoConfig.from_pretrained(self.cfg.model.backbone_path)
        backbone_config.update(
            {
                "use_cache": False,
            }
        )

        self.backbone = AutoModel.from_pretrained(self.cfg.model.backbone_path, config=backbone_config)
        self.backbone.gradient_checkpointing_enable()

        # Mean pooling
        self.pool = MeanPooling()

        # classifier
        num_features = self.backbone.config.hidden_size
        self.classifier = nn.Linear(num_features, 1)

        # loss function
        self.loss_fn = nn.BCEWithLogitsLoss(reduction="mean")

    def encode(
        self,
        input_ids,
        attention_mask,
    ):
        outputs = self.backbone(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
        )

        encoder_layer = outputs.last_hidden_state
        embeddings = self.pool(encoder_layer, attention_mask)  # mean pooling

        return embeddings

    def forward(self, input_ids, attention_mask, labels=None, **kwargs):
        # features
        features = self.encode(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )  # (bs, num_features)

        # logits
        logits = self.classifier(features).reshape(-1)
        loss_dict = dict()

        # loss
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
            loss_dict = {"loss": loss}

        return logits, loss, loss_dict
```
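Finally, a minimal inference sketch showing how the model's `(logits, loss, loss_dict)` output might be turned into relevance scores, continuing the hypothetical `cfg` and `loader` objects from the sketches above. The no-grad loop and the sigmoid scoring are usage assumptions, not code from the repository.

```python
# Hypothetical scoring sketch (not part of the repo): score (query, context) pairs with RankerModel.
import torch

model = RankerModel(cfg)
model.eval()

scores = []
with torch.no_grad():
    for batch in loader:
        logits, _, _ = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )
        # Training uses BCEWithLogitsLoss, so a sigmoid maps logits to [0, 1] relevance scores.
        scores.extend(torch.sigmoid(logits).tolist())

print(scores)  # one relevance score per (query, context) pair
```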