Author: Anil Gurbuz
Last Updated: 20 Oct 2020
This architecture is designed by tito here. I only re-implemented it in Pytorch also utilising some different Pytorch implementation approaches.
I have been tracking the experiments using Weights&Biases. It is easy to keep records of training statistics for different runs. Most of the times I only train a couple of epochs to see the initial behavior of training and compare it with previous runs. Therefore, some of the runs in the weights&biases projects will only have data for small number of iterations.
Here is public project to check out the convergence of different training runs.
Also records of some benchmark runs are in the table below;
Version | Model | Validation MCRMSE | Epoch | LR | Batch Size | Folds | Date | Notes | LB |
---|---|---|---|---|---|---|---|---|---|
7 | GRU | 0.319 | 100 | 0.01 | 64 | 4 | @Sep 23, 2020 | May try longer training | |
7 | LSTM | 0.249 | 100 | 0.01 | 64 | 4 | @Sep 23, 2020 | ||
7 | Ensemble | 0.262 | 100 | 0.01 | 64 | 4 | @Sep 23, 2020 | ||
10 | GRU | 0.305 | 100 | 0.01 | 64 | 4 | @Sep 24, 2020 | BBP Features added and Weighted loss by ln(signal to Noise ratio) Still Longer training time may help | |
10 | LSTM | 0.243 | 100 | 0.01 | 64 | 4 | @Sep 24, 2020 | ||
10 | Ensemble | 0.255 | 100 | 0.01 | 64 | 4 | @Sep 24, 2020 | ||
12 | GRU | 0.285 | 170 | 0.01 | 64 | 4 | @Sep 24, 2020 | BBP Features added and Weighted loss by ln(signal to Noise ratio) | 0.26493 |
12 | LSTM | 0.236 | 170 | 0.01 | 64 | 4 | @Sep 24, 2020 | ||
12 | Ensemble | 0.241 | 170 | 0.01 | 64 | 4 | @Sep 24, 2020 | ||
13 | GRU | 0.312 | 300 | 0.01 | 64 | 4 | @Sep 24, 2020 | 0.27225 | |
13 | LSTM | 0.238 | 300 | 0.01 | 64 | 4 | @Sep 24, 2020 | ||
13 | Ensemble | 0.247 | 300 | 0.01 | 64 | 4 | @Sep 24, 2020 | ||
14 | GRU | 0.335 | 120 | 0.01 | 64 | 4 | @Sep 24, 2020 | # of Hidden layers increased to 5 from 3 | |
14 | LSTM | 0.245 | 120 | 0.01 | 64 | 4 | @Sep 24, 2020 | ||
14 | Ensemble | 0.261 | 120 | 0.01 | 64 | 4 | @Sep 24, 2020 | ||
17 | GRU | 0.377 | 200 | 0.01 | 64 | 4 | @Sep 25, 2020 | Still 5 layers, SNL>1 | 0.28038 |
17 | LSTM | 0.243 | 150 | 0.01 | 64 | 4 | @Sep 25, 2020 | ||
17 | Ensemble | 0.274 | 0 | 0.01 | 64 | 4 | @Sep 25, 2020 | ||
18 | GRU | 0.341 | 200 | 0.01 | 64 | 4 | @Sep 26, 2020 | Still 5 layers, SN==1 | 0.27665 |
18 | LSTM | 0.238 | 150 | 0.01 | 64 | 4 | @Sep 26, 2020 | ||
18 | Ensemble | 0.261 | 0 | 0.01 | 64 | 4 | @Sep 26, 2020 | ||
19 | GRU | 0.323 | 200 | 0.01 | 64 | 4 | @Sep 26, 2020 | Still 5 layers, SN==1, Augmentation without validation adjustment | 0.27994 |
19 | LSTM | 0.245 | 150 | 0.01 | 64 | 4 | @Sep 26, 2020 | ||
19 | Ensemble | 0.259 | 0 | 0.01 | 64 | 4 | @Sep 26, 2020 | ||
21 | GRU | 0.32 | 200 | 0.01 | 64 | 4 | @Sep 26, 2020 | 3 layers, SN==1, Augmentation done with adjustment for validation | |
21 | LSTM | 0.231 | 150 | 0.01 | 64 | 4 | @Sep 26, 2020 | ||
21 | Ensemble | 0.246 | 0 | 0.01 | 64 | 4 | @Sep 26, 2020 | ||
23 | LSTM | 0.234 | 150 | 0.01 | 64 | 4 | @Sep 28, 2020 | Augmentation data included in validation as well. | |
26 | LSTM | 0.234 | 65 | 0.01 | 64 | 4 | @Sep 28, 2020 | Dropout=0.4 | |
27 | LSTM | 0.242 | 65 | 0.01 | 64 | 4 | @Sep 28, 2020 | Dropout=0.5 , hidden_layers=2 | |
29 | LSTM | 0.236 | 65 | 0.01 | 64 | 4 | @Sep 28, 2020 | Hidden layers=4, fixed MCRMSE |
import warnings
warnings.filterwarnings('ignore')
import os
import sys
import pickle
import pandas as pd, numpy as np
from matplotlib import pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split, KFold
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#!pip install wandb -q
import logging
logging.propagate = False
logging.getLogger().setLevel(logging.ERROR)
import wandb
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("wandb_key")
wandb.login(key=secret_value_0)
wandb.init(project="my-project",)
# Define global variables and load the datasets
# Mapper for base molecules, bond structure and loop type
TOKEN_TO_INT = {x:i for i, x in enumerate('().ACGUBEHIMSX')}
SEQ = ["sequence","structure","predicted_loop_type"]
TARGET = ['reactivity', 'deg_Mg_pH10','deg_pH10', 'deg_Mg_50C', 'deg_50C']
SCORED = ["reactivity", "deg_Mg_pH10","deg_Mg_50C"]
ERROR = ['reactivity_error', 'deg_error_Mg_pH10', 'deg_error_pH10','deg_error_Mg_50C', 'deg_error_50C']
# Some of training parameters
FOLDS = 4
EPOCHS = 65
BATCH_SIZE = 64
VERBOSE = 2
LR = 0.01
train = pd.read_json('/kaggle/input/stanford-covid-vaccine/train.json', lines=True)
test = pd.read_json('/kaggle/input/stanford-covid-vaccine/test.json', lines=True)
sample_sub = pd.read_csv('/kaggle/input/stanford-covid-vaccine/sample_submission.csv')
aug_df = pd.read_csv("../input/how-to-generate-augmentation-data/aug_data.csv")
# Load ordered ids to generate submission format fast at the end of the NB
with open('../input/tmp-ids-data/public_ids.pkl', 'rb') as f:
public_id_index = pickle.load(f)
with open('../input/tmp-ids-data/private_ids.pkl', 'rb') as f:
private_id_index = pickle.load(f)
# Generate basic statistical features from base pair probabilities.
def read_bpps_sum(df):
bpps_arr = []
for mol_id in df.id.to_list():
bpps_arr.append(np.load(f"../input/stanford-covid-vaccine/bpps/{mol_id}.npy").max(axis=1))
return bpps_arr
def read_bpps_max(df):
bpps_arr = []
for mol_id in df.id.to_list():
bpps_arr.append(np.load(f"../input/stanford-covid-vaccine/bpps/{mol_id}.npy").sum(axis=1))
return bpps_arr
def read_bpps_nb(df):
# normalized non-zero number
# from https://www.kaggle.com/symyksr/openvaccine-deepergcn
bpps_nb_mean = 0.077522 # mean of bpps_nb across all training data
bpps_nb_std = 0.08914 # std of bpps_nb across all training data
bpps_arr = []
for mol_id in df.id.to_list():
bpps = np.load(f"../input/stanford-covid-vaccine/bpps/{mol_id}.npy")
bpps_nb = (bpps > 0).sum(axis=0) / bpps.shape[0]
bpps_nb = (bpps_nb - bpps_nb_mean) / bpps_nb_std
bpps_arr.append(bpps_nb)
return bpps_arr
train['bpps_sum'] = read_bpps_sum(train)
test['bpps_sum'] = read_bpps_sum(test)
train['bpps_max'] = read_bpps_max(train)
test['bpps_max'] = read_bpps_max(test)
train['bpps_nb'] = read_bpps_nb(train)
test['bpps_nb'] = read_bpps_nb(test)
# Concatanate 3 sequence vectors together to feed the NN
def preprocess_inputs(df, cols=['sequence', 'structure', 'predicted_loop_type']):
base_seqs=np.transpose(
np.array(
df[cols]
.applymap(lambda seq: [TOKEN_TO_INT[x] for x in seq])
.values
.tolist()
),
(0, 2, 1)
)
bpps_sum_fea = np.array(df['bpps_sum'].to_list())[:,:,np.newaxis]
bpps_max_fea = np.array(df['bpps_max'].to_list())[:,:,np.newaxis]
bpps_nb_fea = np.array(df['bpps_nb'].to_list())[:,:,np.newaxis]
return np.concatenate([base_seqs,bpps_sum_fea,bpps_max_fea,bpps_nb_fea], 2)
# Concatanate Measurement errors to use as weights in learning step
def preprocess_errors(df, cols=['reactivity_error', 'deg_error_Mg_pH10', 'deg_error_pH10','deg_error_Mg_50C', 'deg_error_50C']):
error1 = np.array(df['reactivity_error'].to_list())[:,:,np.newaxis]
error2 = np.array(df['deg_error_Mg_pH10'].to_list())[:,:,np.newaxis]
error3 = np.array(df['deg_error_pH10'].to_list())[:,:,np.newaxis]
error4 = np.array(df['deg_error_Mg_50C'].to_list())[:,:,np.newaxis]
error5 = np.array(df['deg_error_50C'].to_list())[:,:,np.newaxis]
return np.concatenate([error1,error2,error3,error4,error5], 2)
# Augment training data by using the generated molecules with Arnie package
# Thanks for tito again for publishing this.
def aug_data(df):
target_df = df.copy()
new_df = aug_df[aug_df['id'].isin(target_df['id'])]
del target_df['structure']
del target_df['predicted_loop_type']
new_df = new_df.merge(target_df, on=['id','sequence'], how='left')
df['cnt'] = df['id'].map(new_df[['id','cnt']].set_index('id').to_dict()['cnt'])
df['log_gamma'] = 100
df['score'] = 1.0
df = df.append(new_df[df.columns])
return df
# Put 2 dataframes into same order
def align(df1, df2, col_to_align):
assert df1.shape[0] == df2.shape[0], "Not same number of Rows"
df2 = df2.set_index(col_to_align)
df2 = df2.reindex(df1[col_to_align].values.tolist()).reset_index()
df1 = df1.reset_index(drop=True)
assert sum(df1[col_to_align] != df2[col_to_align])==0, "Not all aligned"
return df1, df2
train = aug_data(train)
#test = aug_data(test)
train.loc[train.log_gamma < 100, ERROR] = train.loc[train.log_gamma < 100,ERROR].applymap(lambda seq: [3*x for x in seq] )
# extract augmentad data from original train
train_augment = train.loc[train.log_gamma < 100,]
train = train.loc[train.log_gamma == 100,]
# Select training and augmentation examples with signalto noise ratio>1
TRAIN_FILTERED = train.loc[train.signal_to_noise>1]
TRAIN_AUGMENT = train_augment.loc[train_augment.signal_to_noise>1]
TRAIN_FILTERED, TRAIN_AUGMENT = align(TRAIN_FILTERED,TRAIN_AUGMENT, "id")
# Create tensors and store in "device"
TRAIN_INPUTS = torch.tensor(preprocess_inputs(TRAIN_FILTERED)).to(device)
print("input shape: ", TRAIN_INPUTS.shape)
AUGMENT_INPUTS = torch.tensor(preprocess_inputs(TRAIN_AUGMENT)).to(device)
print("Augment shape: ", AUGMENT_INPUTS.shape)
TRAIN_ERRORS = torch.tensor(preprocess_errors(TRAIN_FILTERED)).to(device)
print("Errors shape: ", TRAIN_INPUTS.shape)
TRAIN_LABELS = torch.tensor(
np.array(TRAIN_FILTERED[TARGET].values.tolist()).transpose(0, 2, 1)
).float().to(device)
print("output shape: ", TRAIN_LABELS.shape)
AUGMENT_LABELS = torch.tensor(
np.array(TRAIN_AUGMENT[TARGET].values.tolist()).transpose(0, 2, 1)
).float().to(device)
print("Augment labels shape: ", AUGMENT_LABELS.shape)
#Get different test sets and process each
public_df = test.query("seq_length == 107").copy()
private_df = test.query("seq_length == 130").copy()
PUBLIC_INPUTS = torch.tensor(preprocess_inputs(public_df)).to(device)
PRIVATE_INPUTS = torch.tensor(preprocess_inputs(private_df)).to(device)
# Create Data Loaders for test data only. Dataset class will be modified for training examlples to return index when called the get method.
PUBLIC_LOADER = DataLoader(TensorDataset(PUBLIC_INPUTS), shuffle=False, batch_size=BATCH_SIZE)
PRIVATE_LOADER = DataLoader(TensorDataset(PRIVATE_INPUTS), shuffle=False, batch_size=BATCH_SIZE)
FOLD_ITERATOR = KFold(FOLDS, shuffle=True, random_state=2020)
## Validation only consider examples that can pass signal to noise filter
new_filtered = TRAIN_FILTERED.reset_index(drop=True)
idx_filter = new_filtered.loc[new_filtered.SN_filter==0].index.tolist()
# Modify the get method of Pytorch Dataset class to retun index as well.
class new_dataset(Dataset):
def __init__(self, train, test):
super(new_dataset, self).__init__()
self.dataset = TensorDataset(train, test)
def __getitem__(self, index):
data, target = self.dataset[index]
return data, target, index
def __len__(self):
return len(self.dataset)
# Create LSTM architecture
class LSTM_model(nn.Module):
def __init__(
self, seq_len=107, pred_len=68, dropout=0.5, embed_dim=100, hidden_dim=128, hidden_layers=3, num_target_class=5
):
"""
seq_len: Length of input m-RNA
pred_len: Number of bases will be scored
dropout: Dropout ratio in LSTM
embed_dim: Number of dimensions to map each element of base seq., loop type and base structure sequences.
if 100, 100x3 total i.e for each of sequence elements.
hidden_dim: Number of hidden layer nodes in LSTM.
hidden_layers: Number of hidden layers in LSTM.
num_target_class: Number of target class elements
"""
super(LSTM_model, self).__init__()
self.pred_len = pred_len
self.embeding = nn.Embedding(num_embeddings=len(TOKEN_TO_INT), embedding_dim=embed_dim)
self.lstm = nn.LSTM(
input_size=embed_dim * 3 +3, # +3 for 3 bpps features
hidden_size=hidden_dim,
num_layers=hidden_layers,
dropout=dropout,
bidirectional=True,
batch_first=True,
)
self.linear = nn.Linear(hidden_dim * 2, num_target_class)
def forward(self, seqs ):
# Seperate categorical and numerical features to feed embedding with categoricals
cat_seqs = seqs[:,:,:3].type(torch.LongTensor).to(device)
num_seqs = seqs[:,:,3:].type(torch.float)
embed = self.embeding(cat_seqs)
# Reduce one dimension of embedding output and concat with numerical features i.e bpps features
reshaped = torch.reshape(embed, (-1, embed.shape[1], embed.shape[2] * embed.shape[3]))
reshaped = torch.cat((reshaped, num_seqs), axis=2)
output, hidden = self.lstm(reshaped)
# Ignore bases that will not scored
truncated = output[:, : self.pred_len, :]
# Feed into lienar layer
out = self.linear(truncated)
return out
# Use Mean Squared Error as loss function. reduction="none" to allow weighting loss for individual examples.
mse_loss = nn.MSELoss(reduction="none")
def compute_loss(batch_X, batch_Y, batch_weights, model, optimizer=None, is_train=True):
# Set train/inference mode.
model.train(is_train)
# Make sure model is in same device with data
model.to(device)
# Make predictions
pred_Y = model(batch_X)
# Calculate MSE for current batch -- Doesn't reduce to single MSE value. Keeps squared errors for each example in batch seperately.
loss = mse_loss(pred_Y, batch_Y)
#### Weight each squared error using measurement error ####
loss =(loss * batch_weights / batch_weights.sum()).sum()
# Update model if training
if is_train:
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Return loss value -- not a tensor
return loss.item()
# Take column-wise mean of RMSE
def MCRMSE(pred, label, seq_scored=68, index_to_keep=[0,1,3]):
diff = torch.pow((pred - label),2)
diff = diff[:,:seq_scored,index_to_keep]
diff = torch.sum(diff, (0,1))/(diff.shape[0]*diff.shape[1])
diff = torch.sqrt(diff)
diff = torch.sum(diff) / 3
return diff
# Cross-Validation Loop for LSTM
def LSTM_CV():
# Dataframe to track CV results
store = pd.DataFrame(columns=["Model","K", "Val_Index","Val_MCRMSE", "Train_MCRMSE", "Val_Preds", "Public_Preds", "Private_Preds"])
model_name = "LSTM"
# For each fold...
for k, (train_index, val_index) in enumerate(FOLD_ITERATOR.split(TRAIN_INPUTS)):
## Weights are based on 1/log() transformation of measurement errors
weights = 1/(torch.log(TRAIN_ERRORS[train_index] +1.1))
## Assign half importance to augmented examples. 1/2 is selected by intiution.
## Optimisation would potentially lead to better performance.
weights = torch.cat((weights,weights/2),0)
## Concat predictors and labels of augmented and original training set.
fold_inputs = torch.cat((TRAIN_INPUTS[train_index],AUGMENT_INPUTS[train_index]),0)
fold_labels = torch.cat((TRAIN_LABELS[train_index],TRAIN_LABELS[train_index]),0)
train_loader = DataLoader(new_dataset(fold_inputs,fold_labels),shuffle=True, batch_size=BATCH_SIZE)
## Make sure each validation set records pass SN_Filter
val_index = np.array([i for i in val_index if i not in idx_filter])
## Turn array into Torch TensorDataset
val_set = TensorDataset(TRAIN_INPUTS[val_index], TRAIN_LABELS[val_index])
val_loader = DataLoader(val_set,shuffle=False, batch_size=BATCH_SIZE)
## Create model class and optimiser
model = LSTM_model(seq_len=107, pred_len=68).to(device)
optimizer = optim.Adam(model.parameters(), lr=LR)
## Train the model and return weights
model_state = training(model, device, train_loader, val_set, weights, optimizer, one_epoch=False,EPOCHS=EPOCHS)
## Make predictions on train and validation set
train_preds = pred(model, train_loader)
val_preds = pred(model, val_loader)
## Calculate validation and train MCRMSE
val_MCRMSE = MCRMSE(val_preds, TRAIN_LABELS[val_index])
train_MCRMSE = MCRMSE(train_preds[:len(train_index),:,:], TRAIN_LABELS[train_index])
## Private part of test data requires different NN architecture for inference as m-RNA length is different.
model = LSTM_model(seq_len=107, pred_len=68)
model.load_state_dict(model_state)
model.to(device)
public_preds = pred(model, PUBLIC_LOADER)
model = LSTM_model(seq_len=130, pred_len=91)
model.load_state_dict(model_state)
model.to(device)
private_preds = pred(model, PRIVATE_LOADER)
# Store results of the fold to store dataframe
store = store.append(dict(zip(store.columns.tolist(), (model_name, k, val_index , val_MCRMSE, train_MCRMSE, val_preds, public_preds, private_preds))), ignore_index=True)
return store
# Define training Loop steps to use in cross-validation
def training(model, device, train_loader, val_dataset, weights, optimizer, one_epoch=False, EPOCHS=90):
# Track model convergence using weights&biases
wandb.watch(model, log="all")
# Run one_epoch if running for error checking
if one_epoch:
EPOCHS=1
# For each epoch...
for epoch in tqdm(range(EPOCHS)):
## For each batch...
for data, label, index in train_loader: # train_loader gives index as well due to modification in Dataset class
### Get weights of the batch
batch_weights = weights[index]
### Update model and return training loss
train_batch_loss = compute_loss(data, label, batch_weights, model, optimizer=optimizer, is_train=True)
### Every validation examples weighted by 1 as their affect on RMSE are equal and no learning going in here.
batch_weights = torch.ones_like(val_dataset[:][1]).to(device)
### Don't update model and return validation loss
val_loss = compute_loss(val_dataset[:][0], val_dataset[:][1], batch_weights, model, optimizer=optimizer, is_train=False)
### Log loss values and epoch to weights&biases
wandb.log({"train_batch_loss":train_batch_loss,"val_loss":val_loss,"Epoch":epoch})
# Return model weights
return model.state_dict()
def pred(model, test_loader):
# Tell Torch not to track gradients as doing inference here.
# If not, memory can overflow as Torch goes on storing gradients without zero_grad() call.
with torch.no_grad():
# Set inference mode
model.eval()
# Empty tensor to store predictions in "device"
preds = torch.empty((0, model.pred_len, 5), device=device)
# concat predictions for entire test data
for data in test_loader:
data = data[0].to(device)
pred = model(data).to(device)
preds = torch.cat([preds, pred], axis=0)
return preds
# Turn tensor outputs into single MCRMSE value
def tensor_to_item(df):
df[["Val_MCRMSE","Train_MCRMSE"]] = df[["Val_MCRMSE","Train_MCRMSE"]].applymap(lambda x: x.item())
return df
# Average predictions of each fold on test data
def create_submission(df,FOLDS):
df_preds=df[["Public_Preds", "Private_Preds"]]
publics = torch.zeros_like(df["Public_Preds"][0])
privates = torch.zeros_like(df["Private_Preds"][0])
for fold in range(FOLDS):
publics += df["Public_Preds"][fold]/FOLDS
privates += df["Private_Preds"][fold]/FOLDS
return publics, privates
# Replicate sample_submission format
def format_submission(public_preds, private_preds, public_id_index, private_id_index):
submission_file = pd.read_csv("../input/stanford-covid-vaccine/sample_submission.csv")
for i, (start, end) in enumerate(public_id_index):
submission_file.iloc[start:start+68,1:] = np.array(public_preds[i,])
for i, (start, end) in enumerate(private_id_index):
submission_file.iloc[start:start+91,1:] = np.array(private_preds[i,])
return submission_file
# Train LSTM
track_CV = LSTM_CV()
track_CV = tensor_to_item(track_CV)
print(f"Model: {track_CV.Model[0]:.3s}, Validation MCRMSE: {track_CV.Val_MCRMSE.mean():.3f}")
# Average predictions of CV folds on public and private test
pred_publics, pred_privates =create_submission(track_CV, FOLDS)
# Take predictions to CPU device and resturucture
final_submission = format_submission(pred_publics.cpu(), pred_privates.cpu(), public_id_index, private_id_index)
# Generate submission file
final_submission.to_csv("final_submission.csv", index=False)