diff --git a/.gitignore b/.gitignore index 7bbc71c..038a6cc 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ +# Data files and folders +download/ +training_set/ +nf_prize_dataset.tar.gz +Netflix/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/README.md b/README.md index ab7d7d2..a1b893e 100644 --- a/README.md +++ b/README.md @@ -22,29 +22,16 @@ $ python -m unittest test/test_model.py ### Get the data +**Note: Run all these commands within your `DeepRecommender` folder** + [Netflix prize](http://netflixprize.com/) -* ```$ mkdir -p ~/Recommendations``` you can use any other folder name -* Download from [here](http://academictorrents.com/details/9b13183dc4d60676b773c9e2cd6de5e5542cee9a) to ```~/Recommendations``` -* ```$ cd ~/Recommendations``` -* ```$ tar -xvf nf_prize_dataset.tar.gz``` -* ```$ tar -xf download/training_set.tar ``` -* Create necessary folders +* Download from [here](http://academictorrents.com/details/9b13183dc4d60676b773c9e2cd6de5e5542cee9a) into your ```DeepRecommender``` folder ``` -mkdir -p Netflix/N3M_TRAIN -mkdir -p Netflix/N3M_VALID -mkdir -p Netflix/N3M_TEST -mkdir -p Netflix/N6M_TRAIN -mkdir -p Netflix/N6M_VALID -mkdir -p Netflix/N6M_TEST -mkdir -p Netflix/N1Y_TRAIN -mkdir -p Netflix/N1Y_VALID -mkdir -p Netflix/N1Y_TEST -mkdir -p Netflix/NF_TRAIN -mkdir -p Netflix/NF_VALID -mkdir -p Netflix/NF_TEST +$ tar -xvf nf_prize_dataset.tar.gz +$ tar -xf download/training_set.tar +$ python ./data_utils/netflix_data_convert.py training_set Netflix ``` -* ```$ python ~/repos/DeepRecoEncoders/data_utils/netflix_data_convert.py training_set Netflix```. Here ```~/repos/DeepRecoEncoders''' is a path to this repo. #### Data stats | Dataset | Netflix 3 months | Netflix 6 months | Netflix 1 year | Netflix full | @@ -62,7 +49,7 @@ mkdir -p Netflix/NF_TEST ### Train the model In this example, the model will be trained for 12 epochs. In paper we train for 102. ``` -python ~/repos/DeepRecoEncoders/run.py --gpu_ids 0 \ +python run.py --gpu_ids 0 \ --path_to_train_data Netflix/NF_TRAIN \ --path_to_eval_data Netflix/NF_VALID \ --hidden_layers 512,512,1024 \ @@ -86,7 +73,7 @@ $ tensorboard --logdir=model_save ### Run inference on the Test set ``` -python ~/repos/DeepRecoEncoders/infer.py \ +python infer.py \ --path_to_train_data Netflix/NF_TRAIN \ --path_to_eval_data Netflix/NF_TEST \ --hidden_layers 512,512,1024 \ @@ -98,7 +85,7 @@ python ~/repos/DeepRecoEncoders/infer.py \ ### Compute Test RMSE ``` -python ~/repos/DeepRecoEncoders/compute_RMSE.py --path_to_predictions=preds.txt +python compute_RMSE.py --path_to_predictions=preds.txt ``` After 12 epochs you should get RMSE around 0.927. Train longer to get below 0.92 diff --git a/data_utils/netflix_data_convert.py b/data_utils/netflix_data_convert.py index 5d82523..2ca46f6 100644 --- a/data_utils/netflix_data_convert.py +++ b/data_utils/netflix_data_convert.py @@ -1,5 +1,5 @@ # Copyright (c) 2017 NVIDIA Corporation -from os import listdir, path +from os import listdir, path, makedirs import random import sys import time @@ -75,6 +75,13 @@ def create_NETFLIX_data_timesplit(all_data, def main(args): + # create necessary folders: + for output_dir in [ + "Netflix/N3M_TRAIN", "Netflix/N3M_VALID", "Netflix/N3M_TEST", "Netflix/N6M_TRAIN", + "Netflix/N6M_VALID", "Netflix/N6M_TEST", "Netflix/N1Y_TRAIN", "Netflix/N1Y_VALID", + "Netflix/N1Y_TEST", "Netflix/NF_TRAIN", "Netflix/NF_VALID", "Netflix/NF_TEST"]: + makedirs(output_dir, exist_ok=True) + user2id_map = dict() item2id_map = dict() userId = 0 diff --git a/infer.py b/infer.py index 53caede..4c344d7 100644 --- a/infer.py +++ b/infer.py @@ -3,13 +3,9 @@ import argparse from reco_encoder.data import input_layer from reco_encoder.model import model -import torch.optim as optim -import torch.nn as nn from torch.autograd import Variable import copy -import time from pathlib import Path -import numpy as np parser = argparse.ArgumentParser(description='RecoEncoder') @@ -92,4 +88,3 @@ def main(): if __name__ == '__main__': main() - diff --git a/reco_encoder/__init__.py b/reco_encoder/__init__.py index 68d1463..bad4325 100644 --- a/reco_encoder/__init__.py +++ b/reco_encoder/__init__.py @@ -1,3 +1 @@ # Copyright (c) 2017 NVIDIA Corporation -from . import data -from . import model \ No newline at end of file diff --git a/reco_encoder/data/__init__.py b/reco_encoder/data/__init__.py index c61fef8..bad4325 100644 --- a/reco_encoder/data/__init__.py +++ b/reco_encoder/data/__init__.py @@ -1,2 +1 @@ # Copyright (c) 2017 NVIDIA Corporation -from . import input_layer \ No newline at end of file diff --git a/reco_encoder/model/__init__.py b/reco_encoder/model/__init__.py index 2932d87..bad4325 100644 --- a/reco_encoder/model/__init__.py +++ b/reco_encoder/model/__init__.py @@ -1,3 +1 @@ # Copyright (c) 2017 NVIDIA Corporation -from .model import AutoEncoder -from .model import MSEloss \ No newline at end of file diff --git a/test/context.py b/test/context.py index 856808b..712b336 100644 --- a/test/context.py +++ b/test/context.py @@ -2,5 +2,3 @@ import os import sys sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) - -import reco_encoder \ No newline at end of file diff --git a/test/data_layer_tests.py b/test/data_layer_tests.py index 1421ed3..b198a47 100644 --- a/test/data_layer_tests.py +++ b/test/data_layer_tests.py @@ -1,6 +1,5 @@ # Copyright (c) 2017 NVIDIA Corporation import unittest -import sys from .context import reco_encoder class UserItemRecDataProviderTest(unittest.TestCase): diff --git a/test/test_model.py b/test/test_model.py index 60168d9..2403cff 100644 --- a/test/test_model.py +++ b/test/test_model.py @@ -3,10 +3,8 @@ import sys sys.path.append('data') sys.path.append('model') -import torch from .context import reco_encoder import torch.optim as optim -import torch.nn as nn from torch.autograd import Variable class iRecAutoEncoderTest(unittest.TestCase):