Skip to content

Commit

Permalink
update code
Browse files Browse the repository at this point in the history
  • Loading branch information
rosen1998 committed Apr 21, 2023
0 parents commit 7add144
Show file tree
Hide file tree
Showing 17 changed files with 739 additions and 0 deletions.
158 changes: 158 additions & 0 deletions PSTNet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import torch.nn as nn
import torch
import sys
from torch.autograd import Variable
import warnings

sys.path.append('../')
warnings.filterwarnings("ignore")

class LearningProcessModule(nn.Module):
    """Learning-process encoder of PST.

    Rolls two learner states -- coding ability and programming knowledge --
    across a submission sequence and, at every step, predicts the next
    step's correctness probability and rate.

    NOTE(review): sequence inputs are assumed to be shaped
    (batch, seq_len, embedding_dim) and exercise_id (batch, seq_len) --
    confirm against the caller (PST.forward).
    """

    def __init__(self, seq_len, embedding_dim, device, max_position, dropout=0.2):
        super().__init__()
        self.seq_len = seq_len
        self.embedding_dim = embedding_dim
        self.device = device
        self.max_position = max_position
        # W_1: encode one submission (CIG, exercise, feedback).
        self.W_1 = nn.Linear(3*embedding_dim, embedding_dim)
        # W_2: encode a code change (previous CIG, CTG, exercise, previous feedback).
        self.W_2 = nn.Linear(4*embedding_dim, embedding_dim)
        # W_3-W_5: candidate / forget / input transforms for coding ability.
        self.W_3 = nn.Linear(3*embedding_dim, embedding_dim)
        self.W_4 = nn.Linear(3*embedding_dim, embedding_dim)
        self.W_5 = nn.Linear(3*embedding_dim, embedding_dim)
        # W_6: final-solution encoding; W_7/W_8: forget / input gates;
        # W_9: learning gain for programming knowledge.
        self.W_6 = nn.Linear(3*embedding_dim, embedding_dim)
        self.W_7 = nn.Linear(2*embedding_dim, embedding_dim)
        self.W_8 = nn.Linear(3*embedding_dim, embedding_dim)
        self.W_9 = nn.Linear(2*embedding_dim, embedding_dim)
        # W_10/W_11: next-exercise solution and joint representation.
        self.W_10 = nn.Linear(2*embedding_dim, embedding_dim)
        self.W_11 = nn.Linear(2*embedding_dim, embedding_dim)
        # W_12/W_13: scalar output heads (correctness, rate).
        self.W_12 = nn.Linear(embedding_dim, 1)
        self.W_13 = nn.Linear(embedding_dim, 1)
        self.position_encoder = nn.Embedding(self.max_position, self.embedding_dim)
        self.dropout = nn.Dropout(dropout)

        # Xavier-initialise every learnable weight. The iteration order
        # matches the original declaration order, so global-RNG consumption
        # (and therefore reproducibility under a fixed seed) is unchanged.
        for layer in (self.W_1, self.W_2, self.W_3, self.W_4, self.W_5,
                      self.W_6, self.W_7, self.W_8, self.W_9, self.W_10,
                      self.W_11, self.W_12, self.W_13, self.position_encoder):
            torch.nn.init.xavier_uniform_(layer.weight)

        self.sig = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

    def forward(self, coding_ability, programming_knowledge, exercises, feedbacks, detail_is_ac, CIGs, CTGs, exercise_id):
        """Run the learning process over the whole sequence.

        `detail_is_ac` is accepted for interface compatibility but is not
        used inside this module.

        Returns:
            pred:   (batch, seq_len) next-step correctness predictions;
                    column 0 is never written and stays 0.
            pred_r: (batch, seq_len) next-step rate predictions, same layout.
        """
        batch = exercises.shape[0]
        e_cig_last = torch.zeros((batch, self.embedding_dim)).to(self.device)
        e_f_last = torch.zeros((batch, self.embedding_dim)).to(self.device)
        # -1 sentinel: the first step can never match a "previous" exercise.
        e_id_last = torch.zeros((batch)).to(self.device)
        e_id_last -= 1

        # BUG FIX: allocate the output buffers on the module's device; the
        # original left them on CPU, which fails with a device-mismatch error
        # when next_pred lives on GPU. No-op when device == 'cpu'.
        pred = torch.zeros((batch, self.seq_len)).to(self.device)
        pred_r = torch.zeros((batch, self.seq_len)).to(self.device)

        self.coding_ability = coding_ability
        self.programming_knowledge = programming_knowledge

        position = torch.zeros((batch, 1)).to(self.device)
        for i in range(0, self.seq_len-1):
            # Within-exercise position counter, clamped to max_position.
            position += 1
            zero_p = torch.zeros(batch, 1).to(self.device)
            max_p = torch.zeros(batch, 1).to(self.device) + self.max_position
            max_gate = (position>=self.max_position).float().view(-1, 1)
            position = max_gate*max_p + (1-max_gate)*position
            e_p = self.position_encoder(position.long()-1).view(-1, self.embedding_dim)

            e_e = exercises[:, i]
            e_f = feedbacks[:, i]
            e_cig = CIGs[:, i]
            e_ctg = CTGs[:, i]

            # h == 1 when this submission targets the same exercise as the
            # previous one (i.e. it is a re-submission / code change).
            e_id = exercise_id[:, i]
            e_similarity = e_id_last == e_id
            h = (e_similarity==1).float().view(-1, 1)

            e_s = torch.cat((e_cig, e_e, e_f), 1)
            e_c = torch.cat((e_cig_last, e_ctg, e_e, e_f_last), 1)

            # Coding-ability update: the change encoding feeds the first half
            # of coding_info for re-submissions, the submission encoding
            # feeds the second half for new exercises.
            submission = self.dropout(self.tanh(self.W_1(e_s)))
            change = self.dropout(self.tanh(self.W_2(e_c)))
            zero_embedding = torch.zeros((batch, self.embedding_dim)).to(self.device)
            coding_info = torch.cat((h*change+(1-h)*zero_embedding, (h*zero_embedding+(1-h)*submission)), 1)
            coding_ability_hat = self.tanh(self.W_3(torch.cat((self.coding_ability, coding_info), 1)))
            forget_gate_ca = self.sig(self.W_4(torch.cat((self.coding_ability, coding_info), dim=1)))
            input_gate_ca = self.sig(self.W_5(torch.cat((self.coding_ability, coding_info), dim=1)))
            self.coding_ability = forget_gate_ca*self.coding_ability+input_gate_ca*coding_ability_hat

            # Programming knowledge only updates when the learner moves on to
            # a different exercise at the next step (h_next == 0).
            e_id_next = exercise_id[:, i+1]
            e_similarity_next = e_id_next == e_id
            h_next = (e_similarity_next==1).float().view(-1, 1)
            final_solution = self.tanh(self.W_6(e_s))
            forget_gate_pk = self.sig(self.W_7(torch.cat((self.programming_knowledge, final_solution), dim=1)))
            input_gate_pk = self.sig(self.W_8(torch.cat((self.programming_knowledge, final_solution, e_p), dim=1)))
            LG = self.tanh(self.W_9(torch.cat((self.programming_knowledge, final_solution), 1)))
            self.programming_knowledge = h_next*self.programming_knowledge + (1-h_next)*(forget_gate_pk*self.programming_knowledge+input_gate_pk*LG)

            # Predict performance on the next exercise from both states.
            e_e_next = exercises[:, i+1]
            solution = self.relu(self.W_10(torch.cat((self.programming_knowledge, e_e_next), dim=1)))
            y = self.relu(self.W_11(torch.cat((self.coding_ability, solution), dim=1)))
            next_pred = self.sig(self.W_12(y))
            r = self.sig(self.W_13(y))
            pred[:, i+1] = torch.squeeze(next_pred)
            pred_r[:, i+1] = torch.squeeze(r)

            # Carry state forward; CIG and position reset on exercise switch.
            e_id_last = e_id
            e_f_last = e_f
            e_cig_last = h_next*e_cig + (1-h_next)*zero_embedding
            position = h_next*position + (1-h_next)*zero_p

        return pred, pred_r

class PST(nn.Module):
    """Programming Skill Tracing model.

    Embeds exercises and feedback, projects the pre-trained CIG/CTG code
    embeddings into the model space, and delegates sequence modelling to
    LearningProcessModule.
    """

    def __init__(self, seq_len, num_exercises, embedding_dim, device, max_position, dropout=0.2):
        super().__init__()
        self.seq_len = seq_len
        self.num_exercises = num_exercises
        self.embedding_dim = embedding_dim
        self.device = device
        self.max_position = max_position
        self.dropout = dropout
        # Project the pre-trained CIG / CTG embeddings (width 2*embedding_dim)
        # down to embedding_dim.
        self.W_1 = nn.Linear(2*self.embedding_dim, self.embedding_dim)
        self.W_2 = nn.Linear(2*self.embedding_dim, self.embedding_dim)

        torch.nn.init.xavier_uniform_(self.W_1.weight)
        torch.nn.init.xavier_uniform_(self.W_2.weight)

        self.learning_fitting_encoder = LearningProcessModule(self.seq_len, self.embedding_dim, self.device, self.max_position, self.dropout)
        self.exercise_encoder = nn.Embedding(self.num_exercises, self.embedding_dim)
        self.tanh = nn.Tanh()

        torch.nn.init.xavier_uniform_(self.exercise_encoder.weight)

    def encode_feedback(self, answer):
        """Expand a 0/1 correctness column of shape (N, 1) into an
        (N, embedding_dim) feedback embedding: all-ones rows for accepted
        submissions, all-zeros rows otherwise."""
        one = torch.ones((answer.shape[0], self.embedding_dim)).to(self.device)
        zero = torch.zeros((answer.shape[0], self.embedding_dim)).to(self.device)
        e_a = answer.long()*one + (1-answer.long())*zero
        return e_a

    def forward(self, detail_is_ac, exercises, e_cig, e_ctg):
        """Encode inputs and run the learning-process encoder.

        Returns the (pred, pred_r) pair produced by LearningProcessModule.
        """
        # FIX: torch.autograd.Variable has been a deprecated no-op since
        # PyTorch 0.4 -- plain tensors participate in autograd directly.
        self.programming_knowledge = torch.zeros(detail_is_ac.shape[0], self.embedding_dim).to(self.device)
        self.coding_ability = torch.zeros(detail_is_ac.shape[0], self.embedding_dim).to(self.device)

        e_cig = self.W_1(e_cig)
        e_ctg = self.W_2(e_ctg)
        detail_is_ac = detail_is_ac.view(-1, 1)
        e_f = self.encode_feedback(detail_is_ac.long())
        e_f = e_f.view(-1, self.seq_len, self.embedding_dim)
        e_e = self.exercise_encoder(exercises)

        pred, pred_r = self.learning_fitting_encoder(self.coding_ability, self.programming_knowledge, e_e, e_f, detail_is_ac, e_cig, e_ctg, exercises)

        return pred, pred_r
48 changes: 48 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# PST
Source code for the paper **PST: Measuring Skill Proficiency in Programming Exercise Process via Programming Skill Tracing**.

## Usage
### Download pre-trained CIG, CTG embedding
- [Baidu Netdisk](https://pan.baidu.com/s/16JV_8elbMLQ2_OurZk1PTA?pwd=78n3)
- Put the downloaded embedding folder into the root directory of the PST project.

### Train and test
```bash
python train.py --dataset_name atcoder_c --num_exercises 1671 --do_test True
python train.py --dataset_name aizu_cpp --num_exercises 2207 --do_test True
```
### Train only
```bash
python train.py --dataset_name atcoder_c --num_exercises 1671
python train.py --dataset_name aizu_cpp --num_exercises 2207
```
### Test only
```bash
python test.py --dataset_name atcoder_c
python test.py --dataset_name aizu_cpp
```
- We removed some data that could not be processed by some baselines, such as submissions without corresponding exercise text. So the actual number of submissions of Atcoder_C for the experiment was 423841, and the actual number of submissions of AIZU_Cpp for the experiment was 264839.
## Corrections
1. In our paper, the equation for the cross-entropy loss function was written incorrectly, so the correct loss function for the PST model is as follows:
![PST_equation](/equation/PST.png)
2. We made a mistake in calculating the AUC of task 1 for all the baselines and PST. The good thing is that this mistake occurred in the final testing phase and did not affect model training, model selection, and the calculation of other metrics for task 1 and all metrics for other tasks. The correct experimental results of the PST model are as follows:
- Atcoder_C
- Task1
- AUC 0.8383
- ACC 0.8107
- Task2
- RMSE 0.2875
- Task3
- RMSE 0.3453
- Task4
- RMSE 0.2862
- AIZU_Cpp
- Task1
- AUC 0.8849
- ACC 0.9596
- Task2
- RMSE 0.2239
- Task3
- RMSE 0.3073
- Task4
- RMSE 0.1731
62 changes: 62 additions & 0 deletions data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import numpy as np
from tqdm import tqdm
import math

class PST_DATA():
    """Flattens raw learning sequences and packs them into fixed-length
    numpy arrays for PST training."""

    def __init__(self, seq_len):
        # Maximum number of submissions per training sequence (chunk size).
        self.seq_len = seq_len

    def load_data(self, data, target):
        """Flatten nested learning sequences and cut them into chunks of at
        most seq_len submissions.

        Args:
            data: iterable of learning sequences; each sequence is a list of
                dicts with keys 'is_ac_arr' (per-submission 0/1 list),
                'pro_id', 'rate' (per-submission list) and the `target` key.
            target: name of the per-exercise label to predict.

        Returns:
            Four (num_seqs, seq_len) numpy arrays: targets (padded with -1),
            problem ids (padded with 0), per-submission correctness (padded
            with 0) and rates (padded with -1).
        """
        # Progress bars are cosmetic; degrade gracefully if tqdm is absent.
        try:
            from tqdm import tqdm
        except ImportError:
            def tqdm(iterable, **kwargs):
                return iterable

        pred_target = []
        pro_ids = []
        detail_is_ac = []
        rates = []
        for learning_seq in tqdm(data, desc='loading data...'):
            pred_target_arr = []
            pro_ids_arr = []
            detail_is_ac_arr = []
            rates_arr = []
            for learning_item in learning_seq:
                item_len = len(learning_item['is_ac_arr'])
                # Only the first submission of an exercise carries the target
                # value; subsequent submissions are masked with -1.
                pred_target_arr += [learning_item[target]] + [-1]*(item_len-1)
                pro_ids_arr += [learning_item['pro_id']]*item_len
                detail_is_ac_arr += learning_item['is_ac_arr']
                rates_arr += learning_item['rate']

            total = len(pred_target_arr)
            n_split = 1
            if total > self.seq_len:
                n_split = total // self.seq_len
                # BUG FIX: the original tested `total / self.seq_len`, which
                # is truthy for any non-empty list, so an empty phantom chunk
                # (an all-padding row) was appended whenever total was an
                # exact multiple of seq_len. The remainder check below is
                # what was intended.
                if total % self.seq_len:
                    n_split += 1
            for k in range(n_split):
                start = k*self.seq_len
                end = total if k == n_split-1 else (k+1)*self.seq_len
                pred_target.append(pred_target_arr[start:end])
                pro_ids.append(pro_ids_arr[start:end])
                detail_is_ac.append(detail_is_ac_arr[start:end])
                rates.append(rates_arr[start:end])

        num_seqs = len(pred_target)
        pred_target_np = np.full((num_seqs, self.seq_len), -1.0)
        pro_ids_np = np.zeros((num_seqs, self.seq_len))
        detail_is_ac_np = np.zeros((num_seqs, self.seq_len))
        rates_np = np.full((num_seqs, self.seq_len), -1.0)
        for i in tqdm(range(num_seqs), desc='get numpy...'):
            filled = len(pred_target[i])
            pred_target_np[i, :filled] = pred_target[i]
            pro_ids_np[i, :filled] = pro_ids[i]
            detail_is_ac_np[i, :filled] = detail_is_ac[i]
            rates_np[i, :filled] = rates[i]
        return pred_target_np, pro_ids_np, detail_is_ac_np, rates_np
Binary file added data/aizu_cpp/test.pkl
Binary file not shown.
Binary file added data/aizu_cpp/train.pkl
Binary file not shown.
Binary file added data/aizu_cpp/val.pkl
Binary file not shown.
Binary file added data/atcoder_c/test.pkl
Binary file not shown.
Binary file added data/atcoder_c/train.pkl
Binary file not shown.
Binary file added data/atcoder_c/val.pkl
Binary file not shown.
Binary file added equation/PST.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file added log/aizu_cpp/.gitignore
Empty file.
Empty file added log/atcoder_c/.gitignore
Empty file.
Empty file added model/aizu_cpp/.gitignore
Empty file.
Empty file added model/atcoder_c/.gitignore
Empty file.
Loading

0 comments on commit 7add144

Please sign in to comment.