【Jane Street Market Prediction】CNNモデルを作ってみた

前回の記事では、主成分分析とクラスタリングから次元削減を行いLightGBMによりモデルを作成しました。

今回は、PytorchでCNN(1次元畳み込み)を用いたモデルを構築してみたので記事にまとめます。
今回構築したモデルは、単純なCNNとTCN(Temporal Convolutional Network)です。
結果はいまいちだったのでご参考程度にしていただけると幸いです。

※本記事は、公開しているNotebookをまとめたものとなっております。
・CNNの学習に関するNotebook ⇒ こちら,　推論に関するNotebook ⇒ こちら
・TCNの学習に関するNotebook ⇒ こちら,　推論に関するNotebook ⇒ こちら

Dataset Class
DataLoader Class
モデル構築
1. CNN
2. TCN
損失関数と最適化
1. モデルの学習
推論
結果
まとめ

Dataset Class

Pytorchでモデルを構築する為にデータセットクラスを作成します。
CNNでは時系列データをひとまとまりとして取り扱うため、ある期間幅(window_size)を指定して、行列(window_size×説明変数の数)をインプットデータとして返すデータセットクラスを作成しました。
インプットデータのリクエスト時には整形処理をするように工夫しました。
こうすることで、メモリに乗らないような大規模なデータセットでも対応できるようになります。
コードは、以下の通り。

from torch.utils.data import Dataset
from torch import nn
 
class JSMP_Dataset(Dataset):
    """Sliding-window dataset built from the Jane Street training CSV.

    Item ``i`` is the pair ``(data, label)`` where ``data`` holds rows
    ``i .. i + window_size - 1`` of the feature columns and ``label`` holds
    the binarised response columns of the last row in that window.

    Attributes:
        features: Names of the CSV columns whose name contains "feature".
        f_mean: Column means of ``features[1:]`` taken after the NaN fill.
        X_train: Float tensor with one row per CSV row (feature columns).
        y_train: Float tensor of 0/1 labels, one column per response column.
    """

    def __init__(self, file_path, window_size):
        """Read the CSV, impute missing values and cache tensors.

        Args:
            file_path: Path of the CSV file passed to ``pd.read_csv``.
            window_size: Number of consecutive rows in every window.
        """
        # variables
        self.file_path = file_path
        self.window_size = window_size

        # read csv
        train = pd.read_csv(file_path)

        # pre processing: keep rows with date > 85, fill NaNs with column means.
        # NOTE: rows with weight == 0 are intentionally kept (no weight filter).
        train = train.query('date > 85').reset_index(drop=True)
        train.fillna(train.mean(), inplace=True)

        resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp', 'resp_4']
        self.features = [c for c in train.columns if "feature" in c]
        # The first feature column is excluded from the stored means.
        self.f_mean = np.mean(train[self.features[1:]].values, axis=0)

        feature_values = train[self.features].values
        # A response is labelled 1 when it is strictly positive, else 0.
        label_values = np.stack([(train[c] > 0).astype('int') for c in resp_cols]).T

        self.X_train = torch.from_numpy(feature_values).float()
        self.y_train = torch.from_numpy(label_values).float()

        # reduce memory
        del train
        gc.collect()

    def __len__(self):
        """Return the number of complete windows (never negative)."""
        # N rows contain N - window_size + 1 full windows; the previous
        # "N - window_size" silently dropped the most recent one.
        return max(0, len(self.X_train) - self.window_size + 1)

    def __getitem__(self, i):
        """Return ``(window, label)`` for window index ``i``.

        Raises:
            IndexError: If ``i`` is outside ``[-len(self), len(self))``.
        """
        n_windows = len(self)
        if i < 0:
            i += n_windows
        if not 0 <= i < n_windows:
            raise IndexError('window index out of range')

        stop = i + self.window_size
        data = self.X_train[i:stop, :]
        # The label belongs to the last row inside the window.
        label = self.y_train[stop - 1]

        return data, label
# Build the dataset: each sample is a window of 20 consecutive rows taken
# from the competition's training CSV.
window_size = 20
file_path = '/kaggle/input/jane-street-market-prediction/train.csv'
ds = JSMP_Dataset(file_path, window_size)

DataLoader Class

モデルの学習・検証時にデータセットからバッチサイズ分のデータを取り出すためのクラスを作成します。

今回は、バッチサイズを4096、学習データを全体の8割・残りを検証データとしております。
コードは以下の通り。

from torch.utils.data.dataset import Subset
# Mini-batch size; it was referenced below without ever being defined.
batch_size = 4096

# Split by position: the first 80% of the windows are used for training and
# the remaining 20% for validation.
n_samples = len(ds)
train_size = int(n_samples * 0.8)
train_ds = Subset(ds, list(range(train_size)))
valid_ds = Subset(ds, list(range(train_size, n_samples)))
print('train size:',len(train_ds))
print('valid size:',len(valid_ds))

# make DataLoader
train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
# Validation only aggregates loss/accuracy over the whole subset, so the
# order of the batches is irrelevant and shuffling is unnecessary.
valid_dataloader = torch.utils.data.DataLoader(valid_ds, batch_size=batch_size, shuffle=False)

# dict keyed by the phase names used in the training loop
dataloaders_dict = {'train': train_dataloader,
                    'val'  : valid_dataloader}

モデル構築

入力データは、(直近からwindow_size時点分の行数)×(説明変数の数)の行列となっております。
今回は、このデータを畳み込み層を用いて行数を圧縮させ、全結合層に通すようなネットワークを構築してみました。

モデルは、簡単なCNNと時系列データで効果があるといわれているTCNを用いてみました。
以下に、それぞれのネットワークを載せておきます。

CNN

class Net(nn.Module):
    """1-D CNN that maps a ``(N, window_size, n_features)`` batch to ``n_outputs`` probabilities.

    The window rows are used as the channel dimension of the convolutions,
    which therefore slide along the feature axis. Four kernel-3/padding-1
    convolutions shrink the channels to 2 while keeping the length, so the
    flattened size fed to the fully connected layers is ``2 * n_features``.

    Args:
        window_size: Number of rows per window (input channels). Default 20.
        n_features: Length of every row. Default 130.
        n_outputs: Number of sigmoid outputs. Default 5.

    The defaults reproduce the original fixed-size network, including its
    parameter names, so existing checkpoints still load.
    """

    def __init__(self, window_size=20, n_features=130, n_outputs=5):
        super(Net, self).__init__()

        self.bn0 = nn.BatchNorm1d(window_size)
        self.dropout = nn.Dropout(p=0.2)

        self.conv1 = nn.Conv1d(in_channels=window_size, out_channels=16, kernel_size=3, padding=1)
        self.relu = nn.ReLU()

        self.conv2 = nn.Conv1d(in_channels=16, out_channels=8, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(in_channels=8, out_channels=4, kernel_size=3, padding=1)
        self.conv4 = nn.Conv1d(in_channels=4, out_channels=2, kernel_size=3, padding=1)

        # conv4 emits 2 channels of length n_features.
        flat_size = 2 * n_features
        self.fc1 = nn.Linear(flat_size, flat_size)
        self.fc2 = nn.Linear(flat_size, n_outputs)

    def forward(self, x):
        """Return per-output probabilities of shape ``(N, n_outputs)``."""
        x = self.bn0(x)
        x = self.dropout(x)

        # conv -> ReLU -> dropout, repeated for the four convolutions.
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = conv(x)
            x = self.relu(x)
            x = self.dropout(x)

        x = torch.flatten(x, start_dim=1)

        # NOTE(review): fc1 is applied twice (shared weights, no activation
        # in between). This looks like a copy-paste slip, but it is kept
        # as-is so that already-trained checkpoints give the same outputs.
        x = self.fc1(x)
        x = self.dropout(x)

        x = self.fc1(x)
        x = self.dropout(x)

        x = self.fc2(x)
        return torch.sigmoid(x)
# Instantiate the CNN and print its layer summary.
net = Net()
print(net)

TCN

import torch
import torch.nn as nn
from torch.nn.utils import weight_norm
class Chomp1d(nn.Module):
    """Drop the last ``chomp_size`` elements along the final axis of a 3-D tensor.

    Args:
        chomp_size: Number of trailing elements to remove. ``0`` leaves the
            input length unchanged.
    """

    def __init__(self, chomp_size):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size

    def forward(self, x):
        """Return a contiguous copy/view of ``x`` without its trailing elements."""
        if self.chomp_size == 0:
            # ``x[:, :, :-0]`` is ``x[:, :, :0]`` and would return an empty
            # tensor, so the no-trim case has to be handled explicitly.
            return x.contiguous()
        return x[:, :, :-self.chomp_size].contiguous()
class TemporalBlock(nn.Module):
    """Residual block made of two weight-normalised dilated 1-D convolutions.

    Each convolution is followed by ``Chomp1d(padding)``, ReLU and dropout.
    The block input is added to the result (through a 1x1 convolution when
    the channel counts differ) and passed through a final ReLU.

    Args:
        n_inputs: Channels of the incoming tensor.
        n_outputs: Channels produced by the block.
        kernel_size: Kernel size of both convolutions.
        stride: Stride of both convolutions.
        dilation: Dilation of both convolutions.
        padding: Padding of both convolutions; also the amount chomped.
        dropout: Dropout probability after each ReLU.
    """

    def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
        super().__init__()
        conv_options = dict(stride=stride, padding=padding, dilation=dilation)

        # First stage: conv -> chomp -> ReLU -> dropout.
        self.conv1 = weight_norm(nn.Conv1d(n_inputs, n_outputs, kernel_size, **conv_options))
        self.chomp1 = Chomp1d(padding)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout)

        # Second stage: same layout, channel count stays at n_outputs.
        self.conv2 = weight_norm(nn.Conv1d(n_outputs, n_outputs, kernel_size, **conv_options))
        self.chomp2 = Chomp1d(padding)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout)

        stages = [
            self.conv1, self.chomp1, self.relu1, self.dropout1,
            self.conv2, self.chomp2, self.relu2, self.dropout2,
        ]
        self.net = nn.Sequential(*stages)

        # A 1x1 convolution aligns the residual only when channels differ.
        if n_inputs != n_outputs:
            self.downsample = nn.Conv1d(n_inputs, n_outputs, 1)
        else:
            self.downsample = None
        self.relu = nn.ReLU()
        self.init_weights()

    def init_weights(self):
        """Fill the convolution weights with N(0, 0.01) samples."""
        convs = [self.conv1, self.conv2]
        if self.downsample is not None:
            convs.append(self.downsample)
        for conv in convs:
            conv.weight.data.normal_(0, 0.01)

    def forward(self, x):
        """Return ``relu(net(x) + residual(x))``."""
        out = self.net(x)
        if self.downsample is None:
            res = x
        else:
            res = self.downsample(x)
        return self.relu(out + res)
class TemporalConvNet(nn.Module):
    """Stack of ``TemporalBlock`` layers whose dilation doubles at every level.

    Args:
        num_inputs: Channels of the tensor fed to the first block.
        num_channels: Output channels of each block, in order.
        kernel_size: Kernel size shared by all blocks.
        dropout: Dropout probability shared by all blocks.
    """

    def __init__(self, num_inputs, num_channels, kernel_size=2, dropout=0.2):
        super().__init__()
        blocks = []
        previous_width = num_inputs
        for level, width in enumerate(num_channels):
            dilation_size = 2 ** level
            block = TemporalBlock(
                previous_width,
                width,
                kernel_size,
                stride=1,
                dilation=dilation_size,
                # Padding equals the amount each block chomps off again.
                padding=(kernel_size - 1) * dilation_size,
                dropout=dropout,
            )
            blocks.append(block)
            previous_width = width
        self.network = nn.Sequential(*blocks)

    def forward(self, x):
        """Run ``x`` through every block in sequence."""
        return self.network(x)
class TCN(nn.Module):
    """Temporal convolutional network followed by one sigmoid-activated linear layer.

    Args:
        input_size: Channels of the input tensor (``C_in``).
        output_size: Number of outputs of the final linear layer.
        num_channels: Output channels of each temporal block, in order.
        kernel_size: Kernel size used by every temporal block.
        dropout: Dropout probability used by every temporal block.
        seq_len: Length ``L_in`` of the input's last axis. The temporal
            blocks preserve this length, so the flattened size is
            ``seq_len * num_channels[-1]``. Defaults to 130, the value that
            was previously hard-coded.
    """

    def __init__(self, input_size, output_size, num_channels, kernel_size, dropout, seq_len=130):
        super(TCN, self).__init__()
        self.tcn = TemporalConvNet(input_size, num_channels, kernel_size=kernel_size, dropout=dropout)
        self.fc = nn.Linear(seq_len * num_channels[-1], output_size)

    def forward(self, inputs):
        """Inputs have to have dimension (N, C_in, L_in)"""
        y1 = self.tcn(inputs)  # input should have dimension (N, C, L)
        # Collapse channels and length into one vector per sample.
        y1 = torch.flatten(y1, start_dim=1)
        o = self.fc(y1)
        return torch.sigmoid(o)
# Instantiate the TCN: 20 input channels, four temporal blocks that narrow
# the channels 16 -> 8 -> 4 -> 2, and 5 outputs; then print its layers.
# NOTE(review): this assigns the same `net` name as the CNN cell — presumably
# only one of the two cells is run per notebook; confirm.
net = TCN(input_size=20, output_size=5, num_channels=[16, 8, 4, 2], kernel_size=2, dropout=0.5)
print(net)

損失関数と最適化

今回は、2値分類なのでBinary Cross Entropyを選択しました。
最適化にはよく使われるAdamを利用しました。

import torch.optim as optim
# Binary cross-entropy applied to the network's sigmoid outputs.
criterion = nn.BCELoss() # Binary Cross Entropy
# Adam over all parameters of the current `net`, learning rate 1e-3.
optimizer = optim.Adam(net.parameters(), lr=0.001)

モデルの学習

from tqdm import tqdm
from pytorch_lightning.metrics import Accuracy
def train_model(net, dataloader_dict, criterion, optimizer, num_epochs):
    """Train ``net`` and save the weights with the lowest validation loss.

    Every epoch runs a 'train' phase followed by a 'val' phase. After the
    'val' phase the state dict is written to ``./model.mdl`` when the
    validation loss improved (the first epoch always saves).

    Args:
        net: Model to optimise; it is moved to the selected device.
        dataloader_dict: Mapping with the keys 'train' and 'val', each
            yielding ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer that updates the parameters of ``net``.
        num_epochs: Number of epochs to run.
    """
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print('use devise:', device)

    net.to(device)
    accuracy = Accuracy(compute_on_step=False).to(device)
    #torch.backends.cudnn.deterministic = True

    best_val_loss = None

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('--------------------------')

        for phase in ['train', 'val']:
            is_train = phase == 'train'
            # Dropout and BatchNorm must run in training mode only while
            # training; validation has to use their evaluation behaviour.
            net.train(is_train)
            # Start every phase from a clean metric state so that the
            # reported accuracy covers this phase of this epoch only.
            accuracy.reset()

            loss_sum = 0.0
            n_seen = 0

            for inputs, labels in tqdm(dataloader_dict[phase]):

                inputs = inputs.to(device)
                labels = labels.to(device)
                # init optimizer: reset the accumulated gradients to zero
                optimizer.zero_grad()

                with torch.set_grad_enabled(is_train):
                    outputs = net(inputs)
                    loss = criterion(outputs, labels)
                    if is_train:
                        loss.backward()
                        optimizer.step()

                    # Calculate Score: the median across the output columns
                    # is thresholded at 0.5 and compared with the median of
                    # the label columns.
                    y_hat = (torch.median(outputs, axis=1).values > 0.5).long()
                    y = torch.median(labels, axis=1).values.long()
                    accuracy(y_hat, y)

                    # Weight the batch loss by the batch size so the last,
                    # possibly smaller, batch is not over-represented.
                    loss_sum += loss.item() * inputs.size(0)
                    n_seen += inputs.size(0)

            # Mean loss per sample (previously the raw sum was reported).
            epoch_loss = loss_sum / max(n_seen, 1)

            # print Score
            epoch_accuracy = accuracy.compute()
            print('{} Loss: {:.4f} Acc:{:.4f}'.format(phase, epoch_loss, epoch_accuracy))

        # save model: 'val' is the last phase, so epoch_loss is the
        # validation loss of this epoch at this point.
        if epoch == 0 or epoch_loss < best_val_loss:
            best_val_loss = epoch_loss
            print('Best score updated. New model was saved.')
            torch.save(net.state_dict(), './model.mdl')
# Run the training loop for 100 epochs with the loaders, loss and optimizer
# defined above.
num_epochs = 100
train_model(net, dataloaders_dict, criterion, optimizer, num_epochs)

エポック数を100としましたが、そこまでやる必要はなさそうでした。

推論

今回は入力データが行列型としているため、推論時は少し工夫が必要となります。
指定したwindow_sizeの時点までは、推論を行わずデータを行列に追加し、window_sizeの時点以降からは、行列の先頭と末尾を更新する処理とする必要があります。
詳細は、以下のコードをご確認ください。

# load model
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('use devise:', device)
net.load_state_dict(torch.load('/kaggle/input/jsmp-cnn-pytorch/model.mdl', map_location=torch.device('cpu') ))
# The inputs below are moved to `device`, so the model has to live there too;
# otherwise the forward pass fails with a device mismatch on GPU.
net.to(device)
# Evaluation mode: disables dropout and uses BatchNorm running statistics.
net.eval()
th = 0.5
import janestreet
env = janestreet.make_env()
for i, (test_df, pred_df) in enumerate(env.iter_test()):
    x_tt = test_df.loc[:, ds.features].values
    # Replace NaNs (all columns except the first) with the training means.
    if np.isnan(x_tt[:, 1:].sum()):
        x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * ds.f_mean

    # make window data: grow the buffer until it holds window_size rows,
    # then drop the oldest row each time a new one is appended.
    if i == 0:
        x_window = x_tt.copy()
    elif i < window_size:
        x_window = np.concatenate([x_window, x_tt], axis=0)
    else:
        x_window = np.concatenate([x_window[1:, :], x_tt], axis=0)

    if i < window_size - 1:
        # Not enough history for a full window yet: take no action.
        pred_df.action = 0
    else:
        # prediction
        if test_df['weight'].item() > 0:
            # No gradients are needed at inference time.
            with torch.no_grad():
                inputs = torch.Tensor(x_window).unsqueeze(0).to(device)
                outputs = net(inputs)
                pred = (torch.median(outputs, axis=1).values > th).long()
            pred_df.action = pred.item()
            #print(pred.item())
        else:
            # Rows with non-positive weight get action 0 without a forward pass.
            pred_df.action = 0

    env.predict(pred_df)