code and conf
rbiswasfc committed Oct 25, 2023
1 parent 04641d6 commit 9417775
Showing 32 changed files with 5,105 additions and 2 deletions.
64 changes: 62 additions & 2 deletions README.md
@@ -1,2 +1,62 @@
# llm-science-exam
6th Position Solution Code for Kaggle - LLM Science Exam Competition
This repo contains our code and configurations for the Kaggle - LLM Science Exam competition. A detailed summary of the solution is posted [here](https://www.kaggle.com/competitions/kaggle-llm-science-exam/discussion/447647). Please refer to the following sections for details on training and dependencies.

## Section 1: Setup
### 1.1 Hardware
Computing resources from Jarvislabs.ai were used. Specifically, models were trained on the following instance:

* Ubuntu 20.04.5 LTS (128 GB boot disk)
* Intel(R) Xeon(R) Silver 4216 CPU @ 2.10GHz (7 vCPUs)
* 1 x NVIDIA A100 40GB GPU or 1 x NVIDIA A6000 48GB GPU

### 1.2 Software
We used the PyTorch 2.0 image from Jarvislabs.ai, which comes with:

* Python 3.10.11
* CUDA 11.8
* Python packages: install with `pip install -r requirements.txt`

### 1.3 Datasets
Please make sure the Kaggle API is installed and authenticated. Then run the following script to download the required datasets:

```
chmod +x ./setup.sh
./setup.sh
```

Please note that the above script creates a `datasets` folder one level above the current directory and downloads the external datasets into it.
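
For reference, each download step can also be reproduced with the official `kaggle` Python package. The sketch below is illustrative only: the actual dataset list lives in `setup.sh`, and the external dataset slug shown here is a placeholder.

```
# Illustrative sketch only -- the real downloads are driven by setup.sh.
from pathlib import Path

from kaggle.api.kaggle_api_extended import KaggleApi

# setup.sh places data in a `datasets` folder one level above the current directory
datasets_dir = Path("..") / "datasets"
datasets_dir.mkdir(parents=True, exist_ok=True)

api = KaggleApi()
api.authenticate()  # reads credentials from ~/.kaggle/kaggle.json

# competition data
api.competition_download_files("kaggle-llm-science-exam", path=str(datasets_dir))

# an external dataset (placeholder slug, not one of the actual datasets)
api.dataset_download_files("user/some-external-dataset", path=str(datasets_dir), unzip=True)
```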

## Section 2: Training

### 2.1 Retriever Training
```
python ./code/train_e_topic.py \
--config-name conf_e_topic_bge \
use_wandb=false \
all_data=false
```

### 2.2 Ranker Training
```
python ./code/train_e_ranker.py \
--config-name conf_e_ranker \
use_wandb=false \
all_data=false
```

### 2.3 Reader: Spanwise model

#### Step 1: training with a large number of MCQs
```
python ./code/train_r_delta.py \
--config-name conf_r_delta_k1 \
use_wandb=false \
all_data=false
```

#### Step 2: specialization with difficult MCQs
```
python ./code/train_r_delta.py \
--config-name conf_r_delta_k2_resumed \
use_wandb=false \
all_data=false
```
Binary file added code/.DS_Store
Binary file not shown.
59 changes: 59 additions & 0 deletions code/e_ranker/rank_dataset.py
@@ -0,0 +1,59 @@
from copy import deepcopy

from datasets import Dataset
from transformers import AutoTokenizer

# --------------- Dataset ----------------------------------------------#


class RankerDataset:
"""
    Dataset class for the llm-science-exam re-ranking task
"""

def __init__(self, cfg):
self.cfg = cfg
self.tokenizer = AutoTokenizer.from_pretrained(cfg.model.backbone_path)

def pre_process(self, df):
columns = ["prompt", "A", "B", "C", "D", "E"]
df["query"] = df[columns].apply(lambda x: " | ".join(x), axis=1)
return df

def tokenize_function(self, examples):
tz = self.tokenizer(
examples["query"],
examples["context"],
padding=False,
truncation="longest_first",
max_length=self.cfg.model.max_length,
add_special_tokens=True,
return_token_type_ids=False,
)

return tz

def compute_input_length(self, examples):
return {"input_length": [len(x) for x in examples["input_ids"]]}

def get_dataset(self, df):
"""
        Main API for creating the Science Exam dataset
:param df: input dataframe
:type df: pd.DataFrame
:return: the created dataset
:rtype: Dataset
"""
df = deepcopy(df)
df = self.pre_process(df)
task_dataset = Dataset.from_pandas(df)

task_dataset = task_dataset.map(self.tokenize_function, batched=True)
task_dataset = task_dataset.map(self.compute_input_length, batched=True)

try:
task_dataset = task_dataset.remove_columns(column_names=["__index_level_0__"])
except Exception as e:
print(e)

return task_dataset
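
A minimal usage sketch of `RankerDataset` follows. The config object and backbone name are hypothetical stand-ins for the project's Hydra config (`conf_e_ranker`), and the single-row dataframe is toy data.

```
import pandas as pd
from omegaconf import OmegaConf

from rank_dataset import RankerDataset  # code/e_ranker/rank_dataset.py

# hypothetical stand-in for the Hydra config; real values come from conf_e_ranker
cfg = OmegaConf.create({"model": {"backbone_path": "BAAI/bge-base-en-v1.5", "max_length": 512}})

# toy dataframe with the columns pre_process and tokenize_function expect
df = pd.DataFrame(
    {
        "prompt": ["Which particle mediates the electromagnetic force?"],
        "A": ["Photon"],
        "B": ["Gluon"],
        "C": ["W boson"],
        "D": ["Z boson"],
        "E": ["Graviton"],
        "context": ["The photon is the force carrier of the electromagnetic interaction."],
        "label": [1.0],
    }
)

task_dataset = RankerDataset(cfg).get_dataset(df)
print(task_dataset[0]["input_length"])  # token count of the (query, context) pair
```
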
92 changes: 92 additions & 0 deletions code/e_ranker/rank_loader.py
@@ -0,0 +1,92 @@

from dataclasses import dataclass

import torch
from transformers import DataCollatorWithPadding


@dataclass
class RankerCollator(DataCollatorWithPadding):
"""
    Data collator for the re-ranking task
"""

tokenizer = None
padding = True
max_length = None
pad_to_multiple_of = None
return_tensors = "pt"

def __call__(self, features):

buffer_dict = dict()
buffer_keys = ["query_id", "content_id"]

for key in buffer_keys:
if key in features[0].keys():
value = [feature[key] for feature in features]
buffer_dict[key] = value

labels = None
if "label" in features[0].keys():
labels = [feature["label"] for feature in features]

features = [
{
"input_ids": feature["input_ids"],
"attention_mask": feature["attention_mask"],
} for feature in features
]

batch = self.tokenizer.pad(
features,
padding=self.padding,
max_length=self.max_length,
pad_to_multiple_of=self.pad_to_multiple_of,
return_tensors=None,
)

for key, value in buffer_dict.items():
batch[key] = value

if labels is not None:
batch["labels"] = labels

tensor_keys = [
"input_ids",
"attention_mask",
]

for key in tensor_keys:
batch[key] = torch.tensor(batch[key], dtype=torch.int64)

if labels is not None:
batch["labels"] = torch.tensor(batch["labels"], dtype=torch.float32)

return batch


# ---
def show_batch(batch, tokenizer, n_examples=16, task='training'):

bs = batch['input_ids'].size(0)
print(f"batch size: {bs}")

print(f"shape of input_ids: {batch['input_ids'].shape}")

n_examples = min(n_examples, bs)
print(f"Showing {n_examples} from a {task} batch...")

print("\n\n")
for idx in range(n_examples):
print(f"Input:\n\n{tokenizer.decode(batch['input_ids'][idx], skip_special_tokens=False)}")
# print("\n\n")

if "infer" not in task.lower():
print("--"*20)
labels = batch['labels'][idx]
print(f"Label: {labels}")
print('=='*40)
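
To see how the pieces fit together, here is a hedged sketch that wires `RankerDataset`, `RankerCollator`, and `show_batch` into a PyTorch `DataLoader`, reusing the hypothetical `cfg` and toy `df` from the dataset sketch above.

```
from torch.utils.data import DataLoader

from rank_dataset import RankerDataset
from rank_loader import RankerCollator, show_batch

dataset_creator = RankerDataset(cfg)           # cfg / df as in the dataset sketch
task_dataset = dataset_creator.get_dataset(df)

# keep only the fields the collator consumes
keep = ["input_ids", "attention_mask", "label"]
task_dataset = task_dataset.remove_columns(
    [c for c in task_dataset.column_names if c not in keep]
)

collator = RankerCollator(tokenizer=dataset_creator.tokenizer)
loader = DataLoader(task_dataset, batch_size=8, shuffle=True, collate_fn=collator)

batch = next(iter(loader))
show_batch(batch, dataset_creator.tokenizer, n_examples=2, task="training")
```
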
90 changes: 90 additions & 0 deletions code/e_ranker/rank_model.py
@@ -0,0 +1,90 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from transformers import AutoConfig, AutoModel


class MeanPooling(nn.Module):
def __init__(self):
super(MeanPooling, self).__init__()

def forward(self, last_hidden_state, attention_mask):
input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
sum_mask = input_mask_expanded.sum(1)
sum_mask = torch.clamp(sum_mask, min=1e-9)
mean_embeddings = sum_embeddings / sum_mask
return mean_embeddings


# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Rank Model
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

class RankerModel(nn.Module):
"""
The Sci-LLM re-ranker
"""

def __init__(self, cfg):
print("initializing the Rank Model...")

super(RankerModel, self).__init__()
self.cfg = cfg

# ----------------------------- Backbone -----------------------------------------#
backbone_config = AutoConfig.from_pretrained(self.cfg.model.backbone_path)
backbone_config.update(
{
"use_cache": False,
}
)

self.backbone = AutoModel.from_pretrained(self.cfg.model.backbone_path, config=backbone_config)
self.backbone.gradient_checkpointing_enable()

# Mean pooling
self.pool = MeanPooling()

# classifier
num_features = self.backbone.config.hidden_size
self.classifier = nn.Linear(num_features, 1)

# loss function
self.loss_fn = nn.BCEWithLogitsLoss(reduction="mean")

def encode(
self,
input_ids,
attention_mask,
):
outputs = self.backbone(
input_ids,
attention_mask=attention_mask,
output_hidden_states=False,
)

encoder_layer = outputs.last_hidden_state
embeddings = self.pool(encoder_layer, attention_mask) # mean pooling

return embeddings

def forward(self, input_ids, attention_mask, labels=None, **kwargs):
# features
features = self.encode(
input_ids=input_ids,
attention_mask=attention_mask,
) # (bs, num_features)

# logits
logits = self.classifier(features).reshape(-1)
loss_dict = dict()

# loss
loss = None
if labels is not None:
loss = self.loss_fn(logits, labels)
loss_dict = {"loss": loss}

return logits, loss, loss_dict
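
Finally, a quick smoke test of `RankerModel` on a collated batch, reusing the hypothetical `cfg` and `batch` from the sketches above; the model returns one relevance logit per (query, context) pair and a BCE loss when labels are provided.

```
import torch

from rank_model import RankerModel

model = RankerModel(cfg)  # expects cfg.model.backbone_path
model.eval()

with torch.no_grad():
    logits, loss, loss_dict = model(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        labels=batch["labels"],
    )

print(logits.shape, loss_dict)  # (batch_size,) logits and the BCE loss
```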