From 0da0b606ba67be84c262370e00b822cc81eaa828 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Wed, 25 Apr 2018 10:49:08 -0700 Subject: [PATCH 1/9] not yet running mp recipe --- reco_encoder/model/model.py | 2 +- run.py | 75 ++++++++++++++++++++++++++++--------- 2 files changed, 59 insertions(+), 18 deletions(-) diff --git a/reco_encoder/model/model.py b/reco_encoder/model/model.py index 339a32b..cd83d9c 100644 --- a/reco_encoder/model/model.py +++ b/reco_encoder/model/model.py @@ -32,7 +32,7 @@ def MSEloss(inputs, targets, size_avarage=False): mask = targets != 0 num_ratings = torch.sum(mask.float()) criterion = nn.MSELoss(size_average=size_avarage) - return criterion(inputs * mask.float(), targets), Variable(torch.Tensor([1.0])) if size_avarage else num_ratings + return criterion(inputs * mask.float().half(), targets), Variable(torch.Tensor([1.0]).half()) if size_avarage else num_ratings class AutoEncoder(nn.Module): def __init__(self, layer_sizes, nl_type='selu', is_constrained=True, dp_drop_prob=0.0, last_layer_activations=True): diff --git a/run.py b/run.py index 0de4e04..3ca8d84 100644 --- a/run.py +++ b/run.py @@ -65,12 +65,12 @@ def do_eval(encoder, evaluation_data_layer): denom = 0.0 total_epoch_loss = 0.0 for i, (eval, src) in enumerate(evaluation_data_layer.iterate_one_epoch_eval()): - inputs = Variable(src.cuda().to_dense() if use_gpu else src.to_dense()) - targets = Variable(eval.cuda().to_dense() if use_gpu else eval.to_dense()) + inputs = Variable(src.cuda().to_dense().half() if use_gpu else src.to_dense().half()) + targets = Variable(eval.cuda().to_dense().half() if use_gpu else eval.to_dense().half()) outputs = encoder(inputs) loss, num_ratings = model.MSEloss(outputs, targets) total_epoch_loss += loss.data[0] - denom += num_ratings.data[0] + denom += num_ratings.data.half()[0] return sqrt(total_epoch_loss / denom) def log_var_and_grad_summaries(logger, layers, global_step, prefix, log_histograms=False): @@ -85,20 +85,48 @@ def 
log_var_and_grad_summaries(logger, layers, global_step, prefix, log_histogra """ for ind, w in enumerate(layers): # Variables - w_var = w.data.cpu().numpy() + w_var = w.data.float().cpu().numpy() logger.scalar_summary("Variables/FrobNorm/{}_{}".format(prefix, ind), np.linalg.norm(w_var), global_step) if log_histograms: - logger.histo_summary(tag="Variables/{}_{}".format(prefix, ind), values=w.data.cpu().numpy(), + logger.histo_summary(tag="Variables/{}_{}".format(prefix, ind), values=w.data.cpu().numpy(), step=global_step) # Gradients - w_grad = w.grad.data.cpu().numpy() + w_grad = w.grad.float().data.cpu().numpy() logger.scalar_summary("Gradients/FrobNorm/{}_{}".format(prefix, ind), np.linalg.norm(w_grad), global_step) if log_histograms: - logger.histo_summary(tag="Gradients/{}_{}".format(prefix, ind), values=w.grad.data.cpu().numpy(), - step=global_step) + logger.histo_summary(tag="Gradients/{}_{}".format(prefix, ind), values=w.grad.float().data.cpu().numpy(), + step=global_step) + + +###### +def prep_param_lists(model): + model_params = [p for p in model.parameters() if p.requires_grad] + master_params = [p.detach().clone().float() for p in model_params] + for p in master_params: + print("AAAAAAAAA") + print(p) + print("AAAAAAAAA") + p.requires_grad = True + return model_params, master_params + +def master_params_to_model_params(model_params, master_params): + for model, master in zip(model_params, master_params): + model.data.copy_(master.data) + +def model_grads_to_master_grads(model_params, master_params): + for model, master in zip(model_params, master_params): + if master.grad is None: + master.grad = Variable( + master.data.new(*master.data.size()) + ) + master.grad.data.copy_(model.grad.data) + +###### + + def main(): logger = Logger(args.logdir) @@ -127,6 +155,8 @@ def main(): is_constrained=args.constrained, dp_drop_prob=args.drop_prob, last_layer_activations=not args.skip_last_layer_nl) + + os.makedirs(args.logdir, exist_ok=True) model_checkpoint 
= args.logdir + "/model" path_to_model = Path(model_checkpoint) @@ -147,27 +177,31 @@ def main(): rencoder = nn.DataParallel(rencoder, device_ids=gpu_ids) - if use_gpu: rencoder = rencoder.cuda() + if use_gpu: rencoder = rencoder.cuda().half() + + ########## + model_params, master_params = prep_param_lists(rencoder) + ########## if args.optimizer == "adam": - optimizer = optim.Adam(rencoder.parameters(), + optimizer = optim.Adam(master_params,#rencoder.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer == "adagrad": - optimizer = optim.Adagrad(rencoder.parameters(), + optimizer = optim.Adagrad(master_params, #rencoder.parameters(), lr=args.lr, weight_decay=args.weight_decay) elif args.optimizer == "momentum": - optimizer = optim.SGD(rencoder.parameters(), + optimizer = optim.SGD(master_params,#rencoder.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) scheduler = MultiStepLR(optimizer, milestones=[24, 36, 48, 66, 72], gamma=0.5) elif args.optimizer == "rmsprop": - optimizer = optim.RMSprop(rencoder.parameters(), + optimizer = optim.RMSprop(master_params,#rencoder.parameters(), lr=args.lr, momentum=0.9, weight_decay=args.weight_decay) else: - raise ValueError('Unknown optimizer kind') + raise ValueError('Unknown optimizer kind') t_loss = 0.0 t_loss_denom = 0.0 @@ -185,13 +219,20 @@ def main(): if args.optimizer == "momentum": scheduler.step() for i, mb in enumerate(data_layer.iterate_one_epoch()): - inputs = Variable(mb.cuda().to_dense() if use_gpu else mb.to_dense()) + inputs = Variable(mb.cuda().to_dense().half() if use_gpu else mb.to_dense()) optimizer.zero_grad() outputs = rencoder(inputs) loss, num_ratings = model.MSEloss(outputs, inputs) - loss = loss / num_ratings + loss = loss / num_ratings.half() loss.backward() + ## + model_grads_to_master_grads(model_params, master_params) + ## optimizer.step() + ## + master_params_to_model_params(model_params, master_params) + ## + global_step += 1 t_loss += loss.data[0] 
t_loss_denom += 1 @@ -220,7 +261,7 @@ def main(): optimizer.zero_grad() outputs = rencoder(inputs) loss, num_ratings = model.MSEloss(outputs, inputs) - loss = loss / num_ratings + loss = loss / num_ratings.half() loss.backward() optimizer.step() From 739483d6d0f02b816dac178de3b6aa1e2d8787e8 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Wed, 25 Apr 2018 12:46:28 -0700 Subject: [PATCH 2/9] add loss_scale --- run.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/run.py b/run.py index 3ca8d84..4cac02d 100644 --- a/run.py +++ b/run.py @@ -15,6 +15,8 @@ import numpy as np import os +scale_factor = 0.01 + parser = argparse.ArgumentParser(description='RecoEncoder') parser.add_argument('--lr', type=float, default=0.00001, metavar='N', help='learning rate') @@ -104,11 +106,8 @@ def log_var_and_grad_summaries(logger, layers, global_step, prefix, log_histogra ###### def prep_param_lists(model): model_params = [p for p in model.parameters() if p.requires_grad] - master_params = [p.detach().clone().float() for p in model_params] + master_params = [p.clone().float().detach() for p in model_params] for p in master_params: - print("AAAAAAAAA") - print(p) - print("AAAAAAAAA") p.requires_grad = True return model_params, master_params @@ -224,9 +223,14 @@ def main(): outputs = rencoder(inputs) loss, num_ratings = model.MSEloss(outputs, inputs) loss = loss / num_ratings.half() - loss.backward() + scaled_loss = scale_factor * loss.float() + scaled_loss.backward() + #loss.backward() + ## model_grads_to_master_grads(model_params, master_params) + for param in master_params: + param.grad.data.mul_(1./scale_factor) ## optimizer.step() ## From 619334aefbfe5af12375bdcb89c8a899203e1f53 Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Wed, 25 Apr 2018 12:59:23 -0700 Subject: [PATCH 3/9] update readme --- README.md | 4 +++- run.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fc141d8..4993bbb 100644 
--- a/README.md +++ b/README.md @@ -63,12 +63,14 @@ python run.py --gpu_ids 0 \ --optimizer momentum \ --lr 0.005 \ --weight_decay 0 \ ---aug_step 1 \ +--aug_step 0 \ --noise_prob 0 \ --num_epochs 12 \ --summary_frequency 1000 ``` +WARNING: aug_step is not working in mixed_precision mode yet, so keep it to 0. + Note that you can run Tensorboard in parallel ``` $ tensorboard --logdir=model_save diff --git a/run.py b/run.py index 4cac02d..b757aba 100644 --- a/run.py +++ b/run.py @@ -15,7 +15,7 @@ import numpy as np import os -scale_factor = 0.01 +scale_factor = 128.0 parser = argparse.ArgumentParser(description='RecoEncoder') parser.add_argument('--lr', type=float, default=0.00001, metavar='N', From 3ee7f7e0bf967469eb730b6b966b305d42260a2e Mon Sep 17 00:00:00 2001 From: Oleksii Kuchaiev Date: Fri, 27 Apr 2018 15:42:20 -0700 Subject: [PATCH 4/9] mp work --- README.md | 4 +--- reco_encoder/model/model.py | 20 +++++++++++--------- run.py | 12 +++++++++++- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 4993bbb..fc141d8 100644 --- a/README.md +++ b/README.md @@ -63,14 +63,12 @@ python run.py --gpu_ids 0 \ --optimizer momentum \ --lr 0.005 \ --weight_decay 0 \ ---aug_step 0 \ +--aug_step 1 \ --noise_prob 0 \ --num_epochs 12 \ --summary_frequency 1000 ``` -WARNING: aug_step is not working in mixed_precision mode yet, so keep it to 0. 
- Note that you can run Tensorboard in parallel ``` $ tensorboard --logdir=model_save diff --git a/reco_encoder/model/model.py b/reco_encoder/model/model.py index cd83d9c..23c8baa 100644 --- a/reco_encoder/model/model.py +++ b/reco_encoder/model/model.py @@ -8,25 +8,27 @@ def activation(input, kind): #print("Activation: {}".format(kind)) if kind == 'selu': - return F.selu(input) + res = F.selu(input) elif kind == 'relu': - return F.relu(input) + res = F.relu(input) elif kind == 'relu6': - return F.relu6(input) + res = F.relu6(input) elif kind == 'sigmoid': - return F.sigmoid(input) + res = F.sigmoid(input) elif kind == 'tanh': - return F.tanh(input) + res = F.tanh(input) elif kind == 'elu': - return F.elu(input) + res = F.elu(input) elif kind == 'lrelu': - return F.leaky_relu(input) + res = F.leaky_relu(input) elif kind == 'swish': - return input*F.sigmoid(input) + res = input*F.sigmoid(input) elif kind == 'none': - return input + res = input else: raise ValueError('Unknown non-linearity type') + return torch.max(res, torch.ones_like(res)*6.0) + #return torch.max(res, torch.HalfTensor(1).fill_(6.0)) def MSEloss(inputs, targets, size_avarage=False): mask = targets != 0 diff --git a/run.py b/run.py index b757aba..c8ce713 100644 --- a/run.py +++ b/run.py @@ -266,8 +266,18 @@ def main(): outputs = rencoder(inputs) loss, num_ratings = model.MSEloss(outputs, inputs) loss = loss / num_ratings.half() - loss.backward() + scaled_loss = scale_factor * loss.float() + scaled_loss.backward() + #loss.backward() + ## + model_grads_to_master_grads(model_params, master_params) + for param in master_params: + param.grad.data.mul_(1. 
/ scale_factor) + ## optimizer.step() + ## + master_params_to_model_params(model_params, master_params) + ## e_end_time = time.time() print('Total epoch {} finished in {} seconds with TRAINING RMSE loss: {}' From b6455c94bc3c925172f1fe974628d668b762f3d2 Mon Sep 17 00:00:00 2001 From: David Garcia Date: Mon, 30 Apr 2018 09:27:14 -0700 Subject: [PATCH 5/9] Removed dependency with tensorflow --- logger.py | 53 ++++------------------------------------------------- 1 file changed, 4 insertions(+), 49 deletions(-) mode change 100644 => 100755 logger.py diff --git a/logger.py b/logger.py old mode 100644 new mode 100755 index 4827ccc..1fb7151 --- a/logger.py +++ b/logger.py @@ -1,7 +1,6 @@ # THIS FILE IS COPY-PASTED FROM HERE: https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/04-utils/tensorboard # Code referenced from https://gist.github.com/gyglim/1f8dfb1b5c82627ae3efcfbbadb9f514 -import tensorflow as tf import numpy as np import scipy.misc @@ -14,60 +13,16 @@ class Logger(object): def __init__(self, log_dir): """Create a summary writer logging to log_dir.""" - self.writer = tf.summary.FileWriter(log_dir) + pass def scalar_summary(self, tag, value, step): """Log a scalar variable.""" - summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) - self.writer.add_summary(summary, step) + pass def image_summary(self, tag, images, step): """Log a list of images.""" - - img_summaries = [] - for i, img in enumerate(images): - # Write the image to a string - try: - s = StringIO() - except: - s = BytesIO() - scipy.misc.toimage(img).save(s, format="png") - - # Create an Image object - img_sum = tf.Summary.Image(encoded_image_string=s.getvalue(), - height=img.shape[0], - width=img.shape[1]) - # Create a Summary value - img_summaries.append(tf.Summary.Value(tag='%s/%d' % (tag, i), image=img_sum)) - - # Create and write Summary - summary = tf.Summary(value=img_summaries) - self.writer.add_summary(summary, step) + pass def histo_summary(self, tag, 
values, step, bins=1000): """Log a histogram of the tensor of values.""" - - # Create a histogram using numpy - counts, bin_edges = np.histogram(values, bins=bins) - - # Fill the fields of the histogram proto - hist = tf.HistogramProto() - hist.min = float(np.min(values)) - hist.max = float(np.max(values)) - hist.num = int(np.prod(values.shape)) - hist.sum = float(np.sum(values)) - hist.sum_squares = float(np.sum(values ** 2)) - - # Drop the start of the first bin - bin_edges = bin_edges[1:] - - # Add bin edges and counts - for edge in bin_edges: - hist.bucket_limit.append(edge) - for c in counts: - hist.bucket.append(c) - - # Create and write Summary - summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)]) - self.writer.add_summary(summary, step) - self.writer.flush() \ No newline at end of file + pass From b07301b00dffa774a60892f11ba720013f9b5042 Mon Sep 17 00:00:00 2001 From: David Garcia Date: Mon, 30 Apr 2018 10:13:23 -0700 Subject: [PATCH 6/9] Compute MSE loss in fp32. 
Call model.zero_grad() to clear fp16 gradients instead of master gradients --- .gitignore | 0 AutoEncoder.png | Bin LICENSE | 0 README.md | 0 azkaban/AutoRec/ConstrainedRecoEncoder.job | 0 .../AutoRec/ConstrainedRecoEncoderNoLastLayerNl.job | 0 azkaban/AutoRec/RecoEncoder.job | 0 azkaban/AutoRec/RecoEncoderNoLastLayerNl.job | 0 azkaban/AutoRec/done.job | 0 azkaban/AutoRec/netflix_data_preprocess.job | 0 azkaban/AutoRecAllSplits/RecoEncoder1Y.job | 0 azkaban/AutoRecAllSplits/RecoEncoderN3m.job | 0 azkaban/AutoRecAllSplits/RecoEncoderN6m.job | 0 azkaban/AutoRecAllSplits/RecoEncoderNF.job | 0 azkaban/AutoRecAllSplits/done.job | 0 .../AutoRecAllSplits/netflix_data_preprocess.job | 0 compute_RMSE.py | 0 data_utils/movie_lense_data_converter.py | 0 data_utils/netflix_data_convert.py | 0 infer.py | 0 reco_encoder/__init__.py | 0 reco_encoder/data/__init__.py | 0 reco_encoder/data/input_layer.py | 0 reco_encoder/model/__init__.py | 0 reco_encoder/model/model.py | 2 +- run.py | 10 +++++----- test/__init__.py | 0 test/data_layer_tests.py | 0 ...199-f683aa3b-8840-4835-b8bc-a8d1eaa11c78.txt.crc | Bin test/testData_iRec/_SUCCESS | 0 ...t-00000-f683aa3b-8840-4835-b8bc-a8d1eaa11c78.txt | 0 ...t-00003-f683aa3b-8840-4835-b8bc-a8d1eaa11c78.txt | 0 test/testData_uRec/._SUCCESS.crc | Bin ...000-4a844096-8dd9-425e-9d9d-bd9062cc6940.txt.crc | Bin ...t-00161-4a844096-8dd9-425e-9d9d-bd9062cc6940.txt | 0 ...t-00196-4a844096-8dd9-425e-9d9d-bd9062cc6940.txt | 0 ...t-00199-4a844096-8dd9-425e-9d9d-bd9062cc6940.txt | 0 test/test_model.py | 0 38 files changed, 6 insertions(+), 6 deletions(-) mode change 100644 => 100755 .gitignore mode change 100644 => 100755 AutoEncoder.png mode change 100644 => 100755 LICENSE mode change 100644 => 100755 README.md mode change 100644 => 100755 azkaban/AutoRec/ConstrainedRecoEncoder.job mode change 100644 => 100755 azkaban/AutoRec/ConstrainedRecoEncoderNoLastLayerNl.job mode change 100644 => 100755 azkaban/AutoRec/RecoEncoder.job mode change 100644 => 100755 
azkaban/AutoRec/RecoEncoderNoLastLayerNl.job mode change 100644 => 100755 azkaban/AutoRec/done.job mode change 100644 => 100755 azkaban/AutoRec/netflix_data_preprocess.job mode change 100644 => 100755 azkaban/AutoRecAllSplits/RecoEncoder1Y.job mode change 100644 => 100755 azkaban/AutoRecAllSplits/RecoEncoderN3m.job mode change 100644 => 100755 azkaban/AutoRecAllSplits/RecoEncoderN6m.job mode change 100644 => 100755 azkaban/AutoRecAllSplits/RecoEncoderNF.job mode change 100644 => 100755 azkaban/AutoRecAllSplits/done.job mode change 100644 => 100755 azkaban/AutoRecAllSplits/netflix_data_preprocess.job mode change 100644 => 100755 compute_RMSE.py mode change 100644 => 100755 data_utils/movie_lense_data_converter.py mode change 100644 => 100755 data_utils/netflix_data_convert.py mode change 100644 => 100755 infer.py mode change 100644 => 100755 reco_encoder/__init__.py mode change 100644 => 100755 reco_encoder/data/__init__.py mode change 100644 => 100755 reco_encoder/data/input_layer.py mode change 100644 => 100755 reco_encoder/model/__init__.py mode change 100644 => 100755 reco_encoder/model/model.py mode change 100644 => 100755 run.py mode change 100644 => 100755 test/__init__.py mode change 100644 => 100755 test/data_layer_tests.py mode change 100644 => 100755 test/testData_iRec/.part-00199-f683aa3b-8840-4835-b8bc-a8d1eaa11c78.txt.crc mode change 100644 => 100755 test/testData_iRec/_SUCCESS mode change 100644 => 100755 test/testData_iRec/part-00000-f683aa3b-8840-4835-b8bc-a8d1eaa11c78.txt mode change 100644 => 100755 test/testData_iRec/part-00003-f683aa3b-8840-4835-b8bc-a8d1eaa11c78.txt mode change 100644 => 100755 test/testData_uRec/._SUCCESS.crc mode change 100644 => 100755 test/testData_uRec/.part-00000-4a844096-8dd9-425e-9d9d-bd9062cc6940.txt.crc mode change 100644 => 100755 test/testData_uRec/part-00161-4a844096-8dd9-425e-9d9d-bd9062cc6940.txt mode change 100644 => 100755 test/testData_uRec/part-00196-4a844096-8dd9-425e-9d9d-bd9062cc6940.txt mode change 100644 
=> 100755 test/testData_uRec/part-00199-4a844096-8dd9-425e-9d9d-bd9062cc6940.txt mode change 100644 => 100755 test/test_model.py diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/AutoEncoder.png b/AutoEncoder.png old mode 100644 new mode 100755 diff --git a/LICENSE b/LICENSE old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/azkaban/AutoRec/ConstrainedRecoEncoder.job b/azkaban/AutoRec/ConstrainedRecoEncoder.job old mode 100644 new mode 100755 diff --git a/azkaban/AutoRec/ConstrainedRecoEncoderNoLastLayerNl.job b/azkaban/AutoRec/ConstrainedRecoEncoderNoLastLayerNl.job old mode 100644 new mode 100755 diff --git a/azkaban/AutoRec/RecoEncoder.job b/azkaban/AutoRec/RecoEncoder.job old mode 100644 new mode 100755 diff --git a/azkaban/AutoRec/RecoEncoderNoLastLayerNl.job b/azkaban/AutoRec/RecoEncoderNoLastLayerNl.job old mode 100644 new mode 100755 diff --git a/azkaban/AutoRec/done.job b/azkaban/AutoRec/done.job old mode 100644 new mode 100755 diff --git a/azkaban/AutoRec/netflix_data_preprocess.job b/azkaban/AutoRec/netflix_data_preprocess.job old mode 100644 new mode 100755 diff --git a/azkaban/AutoRecAllSplits/RecoEncoder1Y.job b/azkaban/AutoRecAllSplits/RecoEncoder1Y.job old mode 100644 new mode 100755 diff --git a/azkaban/AutoRecAllSplits/RecoEncoderN3m.job b/azkaban/AutoRecAllSplits/RecoEncoderN3m.job old mode 100644 new mode 100755 diff --git a/azkaban/AutoRecAllSplits/RecoEncoderN6m.job b/azkaban/AutoRecAllSplits/RecoEncoderN6m.job old mode 100644 new mode 100755 diff --git a/azkaban/AutoRecAllSplits/RecoEncoderNF.job b/azkaban/AutoRecAllSplits/RecoEncoderNF.job old mode 100644 new mode 100755 diff --git a/azkaban/AutoRecAllSplits/done.job b/azkaban/AutoRecAllSplits/done.job old mode 100644 new mode 100755 diff --git a/azkaban/AutoRecAllSplits/netflix_data_preprocess.job b/azkaban/AutoRecAllSplits/netflix_data_preprocess.job old mode 100644 new mode 100755 diff --git 
a/compute_RMSE.py b/compute_RMSE.py old mode 100644 new mode 100755 diff --git a/data_utils/movie_lense_data_converter.py b/data_utils/movie_lense_data_converter.py old mode 100644 new mode 100755 diff --git a/data_utils/netflix_data_convert.py b/data_utils/netflix_data_convert.py old mode 100644 new mode 100755 diff --git a/infer.py b/infer.py old mode 100644 new mode 100755 diff --git a/reco_encoder/__init__.py b/reco_encoder/__init__.py old mode 100644 new mode 100755 diff --git a/reco_encoder/data/__init__.py b/reco_encoder/data/__init__.py old mode 100644 new mode 100755 diff --git a/reco_encoder/data/input_layer.py b/reco_encoder/data/input_layer.py old mode 100644 new mode 100755 diff --git a/reco_encoder/model/__init__.py b/reco_encoder/model/__init__.py old mode 100644 new mode 100755 diff --git a/reco_encoder/model/model.py b/reco_encoder/model/model.py old mode 100644 new mode 100755 index 23c8baa..6200688 --- a/reco_encoder/model/model.py +++ b/reco_encoder/model/model.py @@ -34,7 +34,7 @@ def MSEloss(inputs, targets, size_avarage=False): mask = targets != 0 num_ratings = torch.sum(mask.float()) criterion = nn.MSELoss(size_average=size_avarage) - return criterion(inputs * mask.float().half(), targets), Variable(torch.Tensor([1.0]).half()) if size_avarage else num_ratings + return criterion(inputs.float() * mask.float(), targets.float()), Variable(torch.Tensor([1.0])) if size_avarage else num_ratings class AutoEncoder(nn.Module): def __init__(self, layer_sizes, nl_type='selu', is_constrained=True, dp_drop_prob=0.0, last_layer_activations=True): diff --git a/run.py b/run.py old mode 100644 new mode 100755 index c8ce713..5488caf --- a/run.py +++ b/run.py @@ -72,7 +72,7 @@ def do_eval(encoder, evaluation_data_layer): outputs = encoder(inputs) loss, num_ratings = model.MSEloss(outputs, targets) total_epoch_loss += loss.data[0] - denom += num_ratings.data.half()[0] + denom += num_ratings.data.float()[0] return sqrt(total_epoch_loss / denom) def 
log_var_and_grad_summaries(logger, layers, global_step, prefix, log_histograms=False): @@ -219,10 +219,10 @@ def main(): scheduler.step() for i, mb in enumerate(data_layer.iterate_one_epoch()): inputs = Variable(mb.cuda().to_dense().half() if use_gpu else mb.to_dense()) - optimizer.zero_grad() + rencoder.zero_grad() outputs = rencoder(inputs) loss, num_ratings = model.MSEloss(outputs, inputs) - loss = loss / num_ratings.half() + loss = loss / num_ratings.float() scaled_loss = scale_factor * loss.float() scaled_loss.backward() #loss.backward() @@ -262,10 +262,10 @@ def main(): inputs = Variable(outputs.data) if args.noise_prob > 0.0: inputs = dp(inputs) - optimizer.zero_grad() + rencoder.zero_grad() outputs = rencoder(inputs) loss, num_ratings = model.MSEloss(outputs, inputs) - loss = loss / num_ratings.half() + loss = loss / num_ratings.float() scaled_loss = scale_factor * loss.float() scaled_loss.backward() #loss.backward() diff --git a/test/__init__.py b/test/__init__.py old mode 100644 new mode 100755 diff --git a/test/data_layer_tests.py b/test/data_layer_tests.py old mode 100644 new mode 100755 diff --git a/test/testData_iRec/.part-00199-f683aa3b-8840-4835-b8bc-a8d1eaa11c78.txt.crc b/test/testData_iRec/.part-00199-f683aa3b-8840-4835-b8bc-a8d1eaa11c78.txt.crc old mode 100644 new mode 100755 diff --git a/test/testData_iRec/_SUCCESS b/test/testData_iRec/_SUCCESS old mode 100644 new mode 100755 diff --git a/test/testData_iRec/part-00000-f683aa3b-8840-4835-b8bc-a8d1eaa11c78.txt b/test/testData_iRec/part-00000-f683aa3b-8840-4835-b8bc-a8d1eaa11c78.txt old mode 100644 new mode 100755 diff --git a/test/testData_iRec/part-00003-f683aa3b-8840-4835-b8bc-a8d1eaa11c78.txt b/test/testData_iRec/part-00003-f683aa3b-8840-4835-b8bc-a8d1eaa11c78.txt old mode 100644 new mode 100755 diff --git a/test/testData_uRec/._SUCCESS.crc b/test/testData_uRec/._SUCCESS.crc old mode 100644 new mode 100755 diff --git a/test/testData_uRec/.part-00000-4a844096-8dd9-425e-9d9d-bd9062cc6940.txt.crc 
b/test/testData_uRec/.part-00000-4a844096-8dd9-425e-9d9d-bd9062cc6940.txt.crc old mode 100644 new mode 100755 diff --git a/test/testData_uRec/part-00161-4a844096-8dd9-425e-9d9d-bd9062cc6940.txt b/test/testData_uRec/part-00161-4a844096-8dd9-425e-9d9d-bd9062cc6940.txt old mode 100644 new mode 100755 diff --git a/test/testData_uRec/part-00196-4a844096-8dd9-425e-9d9d-bd9062cc6940.txt b/test/testData_uRec/part-00196-4a844096-8dd9-425e-9d9d-bd9062cc6940.txt old mode 100644 new mode 100755 diff --git a/test/testData_uRec/part-00199-4a844096-8dd9-425e-9d9d-bd9062cc6940.txt b/test/testData_uRec/part-00199-4a844096-8dd9-425e-9d9d-bd9062cc6940.txt old mode 100644 new mode 100755 diff --git a/test/test_model.py b/test/test_model.py old mode 100644 new mode 100755 From 8442fddd66090b7d6c628ce2254ab41e9bdb2f6c Mon Sep 17 00:00:00 2001 From: David Garcia Date: Mon, 30 Apr 2018 11:23:19 -0700 Subject: [PATCH 7/9] Disable hard saturation --- reco_encoder/model/model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/reco_encoder/model/model.py b/reco_encoder/model/model.py index 6200688..3328a0c 100755 --- a/reco_encoder/model/model.py +++ b/reco_encoder/model/model.py @@ -27,8 +27,7 @@ def activation(input, kind): res = input else: raise ValueError('Unknown non-linearity type') - return torch.max(res, torch.ones_like(res)*6.0) - #return torch.max(res, torch.HalfTensor(1).fill_(6.0)) + return res def MSEloss(inputs, targets, size_avarage=False): mask = targets != 0 From 5394c512e008c04c5a43db4f45ac44f88ed8bece Mon Sep 17 00:00:00 2001 From: David Garcia Date: Mon, 30 Apr 2018 12:23:17 -0700 Subject: [PATCH 8/9] Add periodic flushing of stdout --- run.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/run.py b/run.py index 5488caf..3248e9e 100755 --- a/run.py +++ b/run.py @@ -14,6 +14,7 @@ from math import sqrt import numpy as np import os +import sys scale_factor = 128.0 @@ -243,6 +244,7 @@ def main(): if i % args.summary_frequency == 0: print('[%d, %5d] 
RMSE: %.7f' % (epoch, i, sqrt(t_loss / t_loss_denom))) + sys.stdout.flush() logger.scalar_summary("Training_RMSE", sqrt(t_loss/t_loss_denom), global_step) t_loss = 0 t_loss_denom = 0.0 @@ -282,6 +284,7 @@ def main(): e_end_time = time.time() print('Total epoch {} finished in {} seconds with TRAINING RMSE loss: {}' .format(epoch, e_end_time - e_start_time, sqrt(total_epoch_loss/denom))) + sys.stdout.flush() logger.scalar_summary("Training_RMSE_per_epoch", sqrt(total_epoch_loss/denom), epoch) logger.scalar_summary("Epoch_time", e_end_time - e_start_time, epoch) if epoch % 3 == 0 or epoch == args.num_epochs - 1: From dc5d5e99a17fa6e179b53ab9d50bbc3446e3f773 Mon Sep 17 00:00:00 2001 From: David Garcia Date: Mon, 30 Apr 2018 13:56:18 -0700 Subject: [PATCH 9/9] Revert "Removed dependency with tensorflow" This reverts commit b6455c94bc3c925172f1fe974628d668b762f3d2. --- logger.py | 53 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 4 deletions(-) mode change 100755 => 100644 logger.py diff --git a/logger.py b/logger.py old mode 100755 new mode 100644 index 1fb7151..4827ccc --- a/logger.py +++ b/logger.py @@ -1,6 +1,7 @@ # THIS FILE IS COPY-PASTED FROM HERE: https://github.com/yunjey/pytorch-tutorial/tree/master/tutorials/04-utils/tensorboard # Code referenced from https://gist.github.com/gyglim/1f8dfb1b5c82627ae3efcfbbadb9f514 +import tensorflow as tf import numpy as np import scipy.misc @@ -13,16 +14,60 @@ class Logger(object): def __init__(self, log_dir): """Create a summary writer logging to log_dir.""" - pass + self.writer = tf.summary.FileWriter(log_dir) def scalar_summary(self, tag, value, step): """Log a scalar variable.""" - pass + summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)]) + self.writer.add_summary(summary, step) def image_summary(self, tag, images, step): """Log a list of images.""" - pass + + img_summaries = [] + for i, img in enumerate(images): + # Write the image to a string + try: + s = 
StringIO() + except: + s = BytesIO() + scipy.misc.toimage(img).save(s, format="png") + + # Create an Image object + img_sum = tf.Summary.Image(encoded_image_string=s.getvalue(), + height=img.shape[0], + width=img.shape[1]) + # Create a Summary value + img_summaries.append(tf.Summary.Value(tag='%s/%d' % (tag, i), image=img_sum)) + + # Create and write Summary + summary = tf.Summary(value=img_summaries) + self.writer.add_summary(summary, step) def histo_summary(self, tag, values, step, bins=1000): """Log a histogram of the tensor of values.""" - pass + + # Create a histogram using numpy + counts, bin_edges = np.histogram(values, bins=bins) + + # Fill the fields of the histogram proto + hist = tf.HistogramProto() + hist.min = float(np.min(values)) + hist.max = float(np.max(values)) + hist.num = int(np.prod(values.shape)) + hist.sum = float(np.sum(values)) + hist.sum_squares = float(np.sum(values ** 2)) + + # Drop the start of the first bin + bin_edges = bin_edges[1:] + + # Add bin edges and counts + for edge in bin_edges: + hist.bucket_limit.append(edge) + for c in counts: + hist.bucket.append(c) + + # Create and write Summary + summary = tf.Summary(value=[tf.Summary.Value(tag=tag, histo=hist)]) + self.writer.add_summary(summary, step) + self.writer.flush() \ No newline at end of file