Showing 32 changed files with 5,105 additions and 2 deletions.
# llm-science-exam

6th Position Solution Code for Kaggle - LLM Science Exam Competition

This repo contains our code and configurations for the Kaggle - LLM Science Exam competition. A detailed summary of the solution is posted [here](https://www.kaggle.com/competitions/kaggle-llm-science-exam/discussion/447647). Please refer to the following sections for details on training and dependencies.

## Section 1: Setup

### 1.1 Hardware
Computing resources from Jarvislabs.ai were used. Specifically, models were trained on the following instance:

* Ubuntu 20.04.5 LTS (128 GB boot disk)
* Intel(R) Xeon(R) Silver 4216 CPU @ 2.10GHz (7 vCPUs)
* 1 x NVIDIA A100 40GB GPU or 1 x NVIDIA A6000 48GB GPU

### 1.2 Software
We used the PyTorch-2.0 image from Jarvislabs.ai, which comes with:

* Python 3.10.11
* CUDA 11.8

Python packages can be installed with `pip install -r requirements.txt`.

### 1.3 Datasets
Please make sure the Kaggle API is installed. Then run the following script to download the required datasets:

```
chmod +x ./setup.sh
./setup.sh
```

Please note that the above script creates a `datasets` folder in the directory one level above the current directory; the external datasets are downloaded into that folder.

## Section 2: Training

### 2.1 Retriever Training
```
python ./code/train_e_topic.py \
    --config-name conf_e_topic_bge \
    use_wandb=false \
    all_data=false
```

### 2.2 Ranker Training
```
python ./code/train_e_ranker.py \
    --config-name conf_e_ranker \
    use_wandb=false \
    all_data=false
```

### 2.3 Reader: Spanwise model

#### Step 1: training with a large number of MCQs
```
python ./code/train_r_delta.py \
    --config-name conf_r_delta_k1 \
    use_wandb=false \
    all_data=false
```

#### Step 2: specialization with difficult MCQs
```
python ./code/train_r_delta.py \
    --config-name conf_r_delta_k2_resumed \
    use_wandb=false \
    all_data=false
```
Binary file not shown.
```python
from copy import deepcopy

from datasets import Dataset
from transformers import AutoTokenizer

# --------------- Dataset ----------------------------------------------#


class RankerDataset:
    """
    Dataset class for llm-science-exam task for re-ranking
    """

    def __init__(self, cfg):
        self.cfg = cfg
        self.tokenizer = AutoTokenizer.from_pretrained(cfg.model.backbone_path)

    def pre_process(self, df):
        columns = ["prompt", "A", "B", "C", "D", "E"]
        df["query"] = df[columns].apply(lambda x: " | ".join(x), axis=1)
        return df

    def tokenize_function(self, examples):
        tz = self.tokenizer(
            examples["query"],
            examples["context"],
            padding=False,
            truncation="longest_first",
            max_length=self.cfg.model.max_length,
            add_special_tokens=True,
            return_token_type_ids=False,
        )
        return tz

    def compute_input_length(self, examples):
        return {"input_length": [len(x) for x in examples["input_ids"]]}

    def get_dataset(self, df):
        """
        Main api for creating the Science Exam dataset

        :param df: input dataframe
        :type df: pd.DataFrame
        :return: the created dataset
        :rtype: Dataset
        """
        df = deepcopy(df)
        df = self.pre_process(df)
        task_dataset = Dataset.from_pandas(df)

        task_dataset = task_dataset.map(self.tokenize_function, batched=True)
        task_dataset = task_dataset.map(self.compute_input_length, batched=True)

        try:
            task_dataset = task_dataset.remove_columns(column_names=["__index_level_0__"])
        except Exception as e:
            print(e)

        return task_dataset
```
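For reference, here is a minimal usage sketch of the dataset class above. The config fields mirror the `cfg.model.backbone_path` and `cfg.model.max_length` attributes the class reads, but the `OmegaConf` config, the backbone name, and the toy dataframe are illustrative assumptions rather than the repository's actual training setup.

```python
# Hypothetical usage sketch (not part of the repo): build a tiny re-ranking dataset.
import pandas as pd
from omegaconf import OmegaConf

# Assumed config layout, mirroring cfg.model.backbone_path / cfg.model.max_length above.
cfg = OmegaConf.create({"model": {"backbone_path": "BAAI/bge-base-en-v1.5", "max_length": 512}})

# Toy dataframe: the question and answer options form the "query"; "context" is a retrieved passage.
df = pd.DataFrame(
    {
        "prompt": ["Which particle carries a negative charge?"],
        "A": ["Proton"], "B": ["Electron"], "C": ["Neutron"], "D": ["Photon"], "E": ["Positron"],
        "context": ["The electron is a subatomic particle with a negative elementary charge."],
    }
)

dataset = RankerDataset(cfg).get_dataset(df)
print(dataset[0]["input_length"])  # token count of the tokenized (query, context) pair
```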
```python
import pdb
import random
from copy import deepcopy
from dataclasses import dataclass, field

import torch
from transformers import DataCollatorWithPadding


@dataclass
class RankerCollator(DataCollatorWithPadding):
    """
    data collator for re-ranker task
    """

    tokenizer = None
    padding = True
    max_length = None
    pad_to_multiple_of = None
    return_tensors = "pt"

    def __call__(self, features):
        buffer_dict = dict()
        buffer_keys = ["query_id", "content_id"]

        for key in buffer_keys:
            if key in features[0].keys():
                value = [feature[key] for feature in features]
                buffer_dict[key] = value

        labels = None
        if "label" in features[0].keys():
            labels = [feature["label"] for feature in features]

        features = [
            {
                "input_ids": feature["input_ids"],
                "attention_mask": feature["attention_mask"],
            }
            for feature in features
        ]

        batch = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=None,
        )

        for key, value in buffer_dict.items():
            batch[key] = value

        if labels is not None:
            batch["labels"] = labels

        tensor_keys = [
            "input_ids",
            "attention_mask",
        ]

        for key in tensor_keys:
            batch[key] = torch.tensor(batch[key], dtype=torch.int64)

        if labels is not None:
            batch["labels"] = torch.tensor(batch["labels"], dtype=torch.float32)

        return batch


# ---
def show_batch(batch, tokenizer, n_examples=16, task='training'):
    bs = batch['input_ids'].size(0)
    print(f"batch size: {bs}")

    print(f"shape of input_ids: {batch['input_ids'].shape}")

    n_examples = min(n_examples, bs)
    print(f"Showing {n_examples} from a {task} batch...")

    print("\n\n")
    for idx in range(n_examples):
        print(f"Input:\n\n{tokenizer.decode(batch['input_ids'][idx], skip_special_tokens=False)}")
        # print("\n\n")

        if "infer" not in task.lower():
            print("--"*20)
            labels = batch['labels'][idx]
            print(f"Label: {labels}")
        print('=='*40)
```
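A minimal sketch of how this collator might be wired to a PyTorch `DataLoader`, continuing the hypothetical `cfg` and `dataset` objects from the dataset sketch above. The batch size and the use of `show_batch` for inspection are assumptions for illustration, not the repository's training loop.

```python
# Hypothetical wiring sketch (not part of the repo): batch the tokenized dataset with RankerCollator.
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(cfg.model.backbone_path)
collator = RankerCollator(tokenizer=tokenizer, max_length=cfg.model.max_length)

# The collator keeps only input_ids / attention_mask (plus optional ids and labels),
# so extra string columns in the dataset are simply ignored.
loader = DataLoader(dataset, batch_size=8, shuffle=False, collate_fn=collator)

for batch in loader:
    # task="inference" skips the label printout, since the toy data has no "label" column.
    show_batch(batch, tokenizer, n_examples=2, task="inference")
    break
```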
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from transformers import AutoConfig, AutoModel


class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Rank Model
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

class RankerModel(nn.Module):
    """
    The Sci-LLM re-ranker
    """

    def __init__(self, cfg):
        print("initializing the Rank Model...")

        super(RankerModel, self).__init__()
        self.cfg = cfg

        # ----------------------------- Backbone -----------------------------------------#
        backbone_config = AutoConfig.from_pretrained(self.cfg.model.backbone_path)
        backbone_config.update(
            {
                "use_cache": False,
            }
        )

        self.backbone = AutoModel.from_pretrained(self.cfg.model.backbone_path, config=backbone_config)
        self.backbone.gradient_checkpointing_enable()

        # Mean pooling
        self.pool = MeanPooling()

        # classifier
        num_features = self.backbone.config.hidden_size
        self.classifier = nn.Linear(num_features, 1)

        # loss function
        self.loss_fn = nn.BCEWithLogitsLoss(reduction="mean")

    def encode(
        self,
        input_ids,
        attention_mask,
    ):
        outputs = self.backbone(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
        )

        encoder_layer = outputs.last_hidden_state
        embeddings = self.pool(encoder_layer, attention_mask)  # mean pooling

        return embeddings

    def forward(self, input_ids, attention_mask, labels=None, **kwargs):
        # features
        features = self.encode(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )  # (bs, num_features)

        # logits
        logits = self.classifier(features).reshape(-1)
        loss_dict = dict()

        # loss
        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels)
            loss_dict = {"loss": loss}

        return logits, loss, loss_dict
```
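Finally, a minimal inference sketch showing how the model's `(logits, loss, loss_dict)` output might be turned into relevance scores, continuing the hypothetical `cfg` and `loader` objects from the sketches above. The no-grad loop and the sigmoid scoring are usage assumptions, not code from the repository.

```python
# Hypothetical scoring sketch (not part of the repo): score (query, context) pairs with RankerModel.
import torch

model = RankerModel(cfg)
model.eval()

scores = []
with torch.no_grad():
    for batch in loader:
        logits, _, _ = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )
        # Training uses BCEWithLogitsLoss, so a sigmoid maps logits to [0, 1] relevance scores.
        scores.extend(torch.sigmoid(logits).tolist())

print(scores)  # one relevance score per (query, context) pair
```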