[Paper Implementation] DeepFM: A Factorization-Machine based Neural Network for CTR Prediction (2017)

DeepFM

torchfm

  • Can be installed directly with pip install torchfm
!pip install torchfm
Collecting torchfm
  Downloading torchfm-0.7.0.tar.gz (9.6 kB)
Building wheels for collected packages: torchfm
  Building wheel for torchfm (setup.py) ... done
  Created wheel for torchfm: filename=torchfm-0.7.0-py3-none-any.whl size=18357 sha256=e1cb287c46b8bbfa3ba8df350ededd5ba5b3dab456075104627d764945ff4307
  Stored in directory: /root/.cache/pip/wheels/5e/e9/25/1ae407681ff67b4334513b7793ced4c66a4bf37ee99ed31ffe
Successfully built torchfm
Installing collected packages: torchfm
Successfully installed torchfm-0.7.0
import torchfm

Inspecting the torchfm library

import numpy as np
import torch
import torch.nn.functional as F
# DeepFM components as implemented in torchfm


class FeaturesLinear(torch.nn.Module):

    # model initialization
    def __init__(self, field_dims, output_dim=1):
        super().__init__()

        # build one embedding table (named fc) with a single first-order weight per feature
        self.fc = torch.nn.Embedding(sum(field_dims), output_dim) # sum(field_dims): e.g., 100 users + 10,000 items -> 10,100 rows

        self.bias = torch.nn.Parameter(torch.zeros((output_dim,)))
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64) # field offsets: mark where each field (e.g., user vs. movie) starts in the shared table

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return torch.sum(self.fc(x), dim=1) + self.bias
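
# Quick sketch (added example, not part of torchfm): how the offsets turn per-field
# indices into positions in one shared embedding table. With field_dims = [3, 4]
# (3 users, 4 movies), the raw input (user=2, movie=1) becomes global indices (2, 3 + 1).
_example_field_dims = [3, 4]
_example_offsets = np.array((0, *np.cumsum(_example_field_dims)[:-1]), dtype=np.int64)  # -> [0, 3]
_example_x = torch.tensor([[2, 1]])
print(_example_x + _example_x.new_tensor(_example_offsets).unsqueeze(0))  # tensor([[2, 4]])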

# hidden (embedding) layer shared by the FM and deep components
class FeaturesEmbedding(torch.nn.Module):

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        self.embedding = torch.nn.Embedding(sum(field_dims), embed_dim) # unlike FeaturesLinear, the output is embed_dim wide: this is the shared latent (hidden) representation ## point to tweak when changing the embedding setup
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.int64)
        torch.nn.init.xavier_uniform_(self.embedding.weight.data) ## point to tweak when changing how the embedding weights are initialized

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return self.embedding(x)


class FactorizationMachine(torch.nn.Module):

    def __init__(self, reduce_sum=True):
        super().__init__()
        self.reduce_sum = reduce_sum

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, num_fields, embed_dim)``
        """
        square_of_sum = torch.sum(x, dim=1) ** 2 # (sum_i v_i)^2: first term of FM's linear-time pairwise trick
        sum_of_square = torch.sum(x ** 2, dim=1) # sum_i v_i^2: second term of FM's linear-time pairwise trick
        ix = square_of_sum - sum_of_square
        if self.reduce_sum:
            ix = torch.sum(ix, dim=1, keepdim=True)
        return 0.5 * ix
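
# Quick check (added example, not part of torchfm): the square-of-sum minus sum-of-square
# trick above equals the explicit pairwise interaction sum over fields i < j of <v_i, v_j>.
_v = torch.randn(2, 5, 8)  # (batch_size, num_fields, embed_dim)
_fast = FactorizationMachine(reduce_sum=True)(_v)
_naive = torch.zeros(2, 1)
for _i in range(5):
    for _j in range(_i + 1, 5):
        _naive += (_v[:, _i, :] * _v[:, _j, :]).sum(dim=1, keepdim=True)
print(torch.allclose(_fast, _naive, atol=1e-5))  # True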


class MultiLayerPerceptron(torch.nn.Module):

    def __init__(self, input_dim, embed_dims, dropout, output_layer=True):
        super().__init__()
        layers = list()
        for embed_dim in embed_dims:
            layers.append(torch.nn.Linear(input_dim, embed_dim))
            layers.append(torch.nn.BatchNorm1d(embed_dim))
            layers.append(torch.nn.ReLU())
            layers.append(torch.nn.Dropout(p=dropout))
            input_dim = embed_dim # the next layer's input size is this layer's output size
        if output_layer:
            layers.append(torch.nn.Linear(input_dim, 1))
        self.mlp = torch.nn.Sequential(*layers)

    def forward(self, x):
        """
        :param x: Float tensor of size ``(batch_size, embed_dim)``
        """
        return self.mlp(x)
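
# Shape check (added example): with input_dim=32 and embed_dims=(16, 16) the MLP maps
# a (batch_size, 32) tensor to one logit per row.
_mlp = MultiLayerPerceptron(32, (16, 16), dropout=0.2)
print(_mlp(torch.randn(4, 32)).shape)  # torch.Size([4, 1])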

class DeepFactorizationMachineModel(torch.nn.Module):
    """
    A pytorch implementation of DeepFM.

    Reference:
        H Guo, et al. DeepFM: A Factorization-Machine based Neural Network for CTR Prediction, 2017.
    """

    def __init__(self, field_dims, embed_dim, mlp_dims, dropout):
        super().__init__()
        self.linear = FeaturesLinear(field_dims)
        self.fm = FactorizationMachine(reduce_sum=True)
        self.embedding = FeaturesEmbedding(field_dims, embed_dim)
        self.embed_output_dim = len(field_dims) * embed_dim
        self.mlp = MultiLayerPerceptron(self.embed_output_dim, mlp_dims, dropout)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        embed_x = self.embedding(x) # shared embedding: the same embedded x feeds both the FM and the MLP
        x = self.linear(x) + self.fm(embed_x) + self.mlp(embed_x.view(-1, self.embed_output_dim))
        return torch.sigmoid(x.squeeze(1))
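
Before loading real data, a quick sanity check (my own toy example, with made-up field sizes) confirms the expected shapes: the model takes a Long tensor of shape (batch_size, num_fields) and returns one sigmoid probability per row.

toy_field_dims = [10, 20]  # 2 toy fields: 10 "users", 20 "movies"
toy_model = DeepFactorizationMachineModel(toy_field_dims, embed_dim=4, mlp_dims=(8, 8), dropout=0.2)
toy_x = torch.stack([torch.randint(0, 10, (16,)), torch.randint(0, 20, (16,))], dim=1)
print(toy_model(toy_x).shape)  # torch.Size([16]): values in (0, 1)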

Load dataset and Train model

from google.colab import drive
drive.mount('/content/drive')
data_path = '/content/drive/MyDrive/capstone/data/kmrd/kmr_dataset/datafile/kmrd-small'
Mounted at /content/drive
import os
import pandas as pd
import torch.utils.data

class KMRDDataset(torch.utils.data.Dataset):
    def __init__(self, data_path):
        data = pd.read_csv(os.path.join(data_path,'rates.csv'))[:10000] # use only the first 10,000 rows due to Colab resource limits
        
        user_to_index = {original: idx for idx, original in enumerate(data.user.unique())}
        movie_to_index = {original: idx for idx, original in enumerate(data.movie.unique())}
        data['user'] = data['user'].apply(lambda x: user_to_index[x])
        data['movie'] = data['movie'].apply(lambda x: movie_to_index[x])
        # df [user, movie, rate] -> tuple list (user, movie, rate)
        data = data.to_numpy()[:, :3]

        self.items = data[:, :2].astype(np.int64)  # user/movie columns; indices were already re-mapped to start from 0
        self.targets = self.__preprocess_target(data[:, 2]).astype(np.float32)
        self.field_dims = np.max(self.items, axis=0) + 1 # field size = max index per field + 1 (number of users, number of movies)
        self.user_field_idx = np.array((0, ), dtype=np.int64)
        self.item_field_idx = np.array((1,), dtype=np.int64)

    def __len__(self):
        return self.targets.shape[0]

    def __getitem__(self, index):
        return self.items[index], self.targets[index]

    def __preprocess_target(self, target):
        target[target <= 9] = 0 # KMRD ratings are heavily skewed toward 9 and 10, so binarize: only a rating of 10 counts as positive
        target[target > 9] = 1
        return target
dataset = KMRDDataset(data_path=data_path)
print(dataset.item_field_idx)
print(dataset.field_dims) # 466 users, 532 movies
print(sum(dataset.field_dims))
print(torch.nn.Embedding(sum(dataset.field_dims), 16)) # embedding dimension k = 16
print(torch.nn.Parameter(torch.zeros((1,))))
print(np.array((0, *np.cumsum(dataset.field_dims)[:-1]), dtype=np.int64)) # check the offsets: indices 0-465 are users, 466 onward are movies
[1]
[466 532]
998
Embedding(998, 16)
Parameter containing:
tensor([0.], requires_grad=True)
[  0 466]
train_length = int(len(dataset) * 0.8)
valid_length = int(len(dataset) * 0.1)
test_length = len(dataset) - train_length - valid_length
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(
    dataset, (train_length, valid_length, test_length))
from torch.utils.data import DataLoader

train_data_loader = DataLoader(train_dataset, batch_size=16)
valid_data_loader = DataLoader(valid_dataset, batch_size=16)
test_data_loader = DataLoader(test_dataset, batch_size=1)
print(dataset.items) # [user movie]
print(dataset.targets) # binarized labels (1 only when the rating is 10)
[[  0   0]
 [  0   1]
 [  0   2]
 ...
 [465  15]
 [465  15]
 [465 338]]
[0. 0. 0. ... 0. 0. 0.]
model = DeepFactorizationMachineModel(dataset.field_dims, embed_dim=16, mlp_dims=(16, 16), dropout=0.2)
model
DeepFactorizationMachineModel(
  (linear): FeaturesLinear(
    (fc): Embedding(998, 1)
  )
  (fm): FactorizationMachine()
  (embedding): FeaturesEmbedding(
    (embedding): Embedding(998, 16)
  )
  (mlp): MultiLayerPerceptron(
    (mlp): Sequential(
      (0): Linear(in_features=32, out_features=16, bias=True)
      (1): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Dropout(p=0.2, inplace=False)
      (4): Linear(in_features=16, out_features=16, bias=True)
      (5): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (6): ReLU()
      (7): Dropout(p=0.2, inplace=False)
      (8): Linear(in_features=16, out_features=1, bias=True)
    )
  )
)
criterion = torch.nn.BCELoss() # targets are 0/1, so binary cross-entropy
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001, weight_decay=1e-6)
import tqdm
log_interval = 100

model.train()
total_loss = 0
tk0 = tqdm.tqdm(train_data_loader, smoothing=0, mininterval=1.0)
for i, (fields, target) in enumerate(tk0):
    # fields, target = fields.to(device), target.to(device)
    y = model(fields)
    loss = criterion(y, target.float())
    model.zero_grad()
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
    if (i + 1) % log_interval == 0:
        tk0.set_postfix(loss=total_loss / log_interval)
        total_loss = 0
100%|██████████| 500/500 [00:01<00:00, 342.01it/s, loss=0.606]
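
The run above is a single training epoch. As a follow-up, a minimal validation pass (my addition; it assumes scikit-learn is installed) can report AUC on the held-out split:

from sklearn.metrics import roc_auc_score

model.eval()
targets, predicts = [], []
with torch.no_grad():
    for fields, target in valid_data_loader:
        y = model(fields)
        targets.extend(target.tolist())
        predicts.extend(y.tolist())
print('validation AUC:', roc_auc_score(targets, predicts))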