From 1a988ac6bfc5415559e9895a328626fd561410a2 Mon Sep 17 00:00:00 2001 From: philippe-thomas Date: Fri, 2 Aug 2019 16:31:22 +0200 Subject: [PATCH 1/5] Moved generator stuff to separate folder --- .../generatorCreateTrainTestSplit.py | 0 generatorPreprocess.py => customModels/generatorPreprocess.py | 4 ++-- generatorTrain.py => customModels/generatorTrain.py | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename generatorCreateTrainTestSplit.py => customModels/generatorCreateTrainTestSplit.py (100%) rename generatorPreprocess.py => customModels/generatorPreprocess.py (95%) rename generatorTrain.py => customModels/generatorTrain.py (100%) diff --git a/generatorCreateTrainTestSplit.py b/customModels/generatorCreateTrainTestSplit.py similarity index 100% rename from generatorCreateTrainTestSplit.py rename to customModels/generatorCreateTrainTestSplit.py diff --git a/generatorPreprocess.py b/customModels/generatorPreprocess.py similarity index 95% rename from generatorPreprocess.py rename to customModels/generatorPreprocess.py index d147ca3..6d06987 100644 --- a/generatorPreprocess.py +++ b/customModels/generatorPreprocess.py @@ -16,8 +16,8 @@ binaryPath= 'data/binaries/' #Place where the serialized training data is modelPath= 'data/models/' #Place to store the models unknownClass = "unknownLocation" #place holder for unknown classes -trainFile="/home/philippe/PycharmProjects/geolocation/train.json.gz" -testFile="/home/philippe/PycharmProjects/geolocation/test.json.gz" +trainFile="/home/philippe/Desktop/train.json.gz" +testFile="/home/philippe/Desktop/test.json.gz" diff --git a/generatorTrain.py b/customModels/generatorTrain.py similarity index 100% rename from generatorTrain.py rename to customModels/generatorTrain.py From d3d4bc830bd1688bdd656302bb904599f3a67ea9 Mon Sep 17 00:00:00 2001 From: philippe-thomas Date: Mon, 12 Aug 2019 09:23:30 +0200 Subject: [PATCH 2/5] Prettier printout in webservice JSON --- webservice.py | 10 ++++++---- 1 file changed, 6 
insertions(+), 4 deletions(-) diff --git a/webservice.py b/webservice.py index 4e80edb..3be6e82 100644 --- a/webservice.py +++ b/webservice.py @@ -62,7 +62,7 @@ def predictText(): predict = textBranch.predict(textSequences) # Print the topN - result = [] + hits = [] for index in reversed(predict.argsort()[0][-maxCities:]): print("%s with score=%.3f" % (colnames[index], float(predict[0][index]))) my_dict = { @@ -71,9 +71,11 @@ def predictText(): 'lat': placeMedian[colnames[index]][0], 'lon': placeMedian[colnames[index]][1] } - result.append(json.dumps(my_dict, indent=4)) - print(result) - return Response(json.dumps(result, indent=4), mimetype='application/json') + hits.append(my_dict) + x= {"query":text, + "results":hits} + print(hits) + return Response(json.dumps(x, indent=4), mimetype='application/json') #Has some issues with json escape character // """ From 94f0837453baaa942f79be5e0ffb32d5fe71b4c7 Mon Sep 17 00:00:00 2001 From: philippe-thomas Date: Mon, 12 Aug 2019 12:25:10 +0200 Subject: [PATCH 3/5] Minor changes to automatically generate validation-set --- TrainIndividualModels.py | 32 ++++++++++++++++++-------------- TrainMergedModel.py | 10 ++++++++-- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/TrainIndividualModels.py b/TrainIndividualModels.py index 3682b0c..daaa732 100644 --- a/TrainIndividualModels.py +++ b/TrainIndividualModels.py @@ -30,6 +30,9 @@ file = open(binaryPath +"data.obj",'rb') trainDescription, trainLocation, trainDomain, trainTld, trainSource, trainTexts, trainUserName, trainTZ, trainUtc, trainUserLang, trainCreatedAt= pickle.load(file) +#Shuffle train-data +trainDescription, trainLocation, trainDomain, trainTld, trainSource, trainTexts, trainUserName, trainTZ, trainUtc, trainUserLang, trainCreatedAt, classes = shuffle(trainDescription, trainLocation, trainDomain, trainTld, trainSource, trainTexts, trainUserName, trainTZ, trainUtc, trainUserLang, trainCreatedAt, classes, random_state=1202) + ##################Train #
create the model batch_size = 256 @@ -41,6 +44,7 @@ textEmbeddings = 100 nameEmbeddings = 100 tzEmbeddings = 50 +validation_split = 0.01 #91279 samples for validation #callbacks = [ # EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=6, verbose=1, restore_best_weights=True), @@ -70,7 +74,7 @@ start = time.time() descriptionHistory = descriptionModel.fit(trainDescription, classes, epochs=nb_epoch, batch_size=batch_size, - verbose=verbosity + verbose=verbosity, validation_split=validation_split ) print("descriptionBranch finished after " +str(datetime.timedelta(seconds=round(time.time() - start)))) descriptionModel.save(modelPath +'descriptionBranchNorm.h5') @@ -96,7 +100,7 @@ start = time.time() sourceHistory = domainModel.fit(trainDomain, classes, epochs=nb_epoch, batch_size=batch_size, - verbose=verbosity + verbose=verbosity, validation_split=validation_split ) print("tldBranch finished after " +str(datetime.timedelta(seconds=round(time.time() - start)))) domainModel.save(modelPath + 'domainBranch.h5') @@ -121,7 +125,7 @@ start = time.time() sourceHistory = tldBranchModel.fit(trainTld, classes, epochs=nb_epoch, batch_size=batch_size, - verbose=verbosity + verbose=verbosity, validation_split=validation_split ) print("tldBranch finished after " +str(datetime.timedelta(seconds=round(time.time() - start)))) tldBranchModel.save(modelPath + 'tldBranch.h5') @@ -139,7 +143,7 @@ start = time.time() sourceHistory = linkModel.fit(np.concatenate((trainDomain, trainTld), axis=1), classes, epochs=nb_epoch, batch_size=batch_size, - verbose=verbosity + verbose=verbosity, validation_split=validation_split ) print("linkModel finished after " +str(datetime.timedelta(seconds=round(time.time() - start)))) linkModel.save(modelPath + 'linkModel.h5') @@ -166,7 +170,7 @@ start = time.time() locationHistory = locationModel.fit(trainLocation, classes, epochs=nb_epoch, batch_size=batch_size, - verbose=verbosity + verbose=verbosity, validation_split=validation_split ) 
print("locationHistory finished after " +str(datetime.timedelta(seconds=round(time.time() - start)))) locationModel.save(modelPath +'locationBranchNorm.h5') @@ -191,7 +195,7 @@ start = time.time() sourceHistory = sourceModel.fit(trainSource, classes, epochs=nb_epoch, batch_size=batch_size, - verbose=verbosity + verbose=verbosity, validation_split=validation_split ) print("sourceBranch finished after " +str(datetime.timedelta(seconds=round(time.time() - start)))) sourceModel.save(modelPath +'sourceBranch.h5') @@ -219,7 +223,7 @@ start = time.time() textHistory = textModel.fit(trainTexts, classes, epochs=nb_epoch, batch_size=batch_size, - verbose=verbosity + verbose=verbosity, validation_split=validation_split ) print("textBranch finished after " +str(datetime.timedelta(seconds=round(time.time() - start)))) textModel.save(modelPath +'textBranchNorm.h5') @@ -247,7 +251,7 @@ start = time.time() nameHistory = nameModel.fit(trainUserName, classes, epochs=nb_epoch, batch_size=batch_size, - verbose=verbosity + verbose=verbosity, validation_split=validation_split ) print("nameBranch finished after " +str(datetime.timedelta(seconds=round(time.time() - start)))) nameModel.save(modelPath +'nameBranchNorm.h5') @@ -274,7 +278,7 @@ start = time.time() tzHistory = tzBranchModel.fit(trainTZ, classes, epochs=nb_epoch, batch_size=batch_size, - verbose=verbosity + verbose=verbosity, validation_split=validation_split ) print("tzBranch finished after " +str(datetime.timedelta(seconds=round(time.time() - start)))) tzBranchModel.save(modelPath +'tzBranchNorm.h5') @@ -300,7 +304,7 @@ start = time.time() utcHistory = utcBranchModel.fit(trainUtc, classes, epochs=nb_epoch, batch_size=batch_size, - verbose=verbosity + verbose=verbosity, validation_split=validation_split ) print("utcBranch finished after " +str(datetime.timedelta(seconds=round(time.time() - start)))) utcBranchModel.save(modelPath +'utcBranch.h5') @@ -323,7 +327,7 @@ start = time.time() userLangHistory = 
userLangModel.fit(trainUserLang, classes, epochs=nb_epoch, batch_size=batch_size, - verbose=verbosity + verbose=verbosity, validation_split=validation_split ) print("userLangBranch finished after " +str(datetime.timedelta(seconds=round(time.time() - start)))) userLangModel.save(modelPath +'userLangBranch.h5') @@ -339,7 +343,7 @@ timeHistory = tweetTimeModel.fit(trainCreatedAt, classes, epochs=nb_epoch, batch_size=batch_size, - verbose=verbosity + verbose=verbosity, validation_split=validation_split ) print("tweetTimeModel finished after " +str(datetime.timedelta(seconds=round(time.time() - start)))) tweetTimeModel.save(modelPath + 'tweetTimeBranch.h5') @@ -364,7 +368,7 @@ timeHistory = tweetTimeModel.fit(trainCreatedAt, classes, epochs=nb_epoch, batch_size=batch_size, - verbose=verbosity + verbose=verbosity, validation_split=validation_split ) print("tweetTimeModel finished after " +str(datetime.timedelta(seconds=round(time.time() - start)))) tweetTimeModel.save(modelPath + 'tweetTimeBranch.h5') @@ -387,7 +391,7 @@ categorialModelHistory = categorialModel.fit(trainData, classes, epochs=nb_epoch, batch_size=batch_size, - verbose=verbosity + verbose=verbosity, validation_split=validation_split ) print("categorialModel finished after " +str(datetime.timedelta(time.time() - start))) categorialModel.save(modelPath + 'categorialModel.h5') \ No newline at end of file diff --git a/TrainMergedModel.py b/TrainMergedModel.py index f2332ed..603d88b 100644 --- a/TrainMergedModel.py +++ b/TrainMergedModel.py @@ -8,6 +8,7 @@ import time import os +from sklearn.utils import shuffle os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # see issue #152 os.environ["CUDA_VISIBLE_DEVICES"] = "" @@ -43,9 +44,14 @@ file = open(binaryPath +"data.obj",'rb') trainDescription, trainLocation, trainDomain, trainTld, trainSource, trainTexts, trainUserName, trainTZ, trainUtc, trainUserLang, trainCreatedAt= pickle.load(file) +#Shuffle train-data +trainDescription, trainLocation, trainDomain, trainTld, 
trainSource, trainTexts, trainUserName, trainTZ, trainUtc, trainUserLang, trainCreatedAt, classes = shuffle(trainDescription, trainLocation, trainDomain, trainTld, trainSource, trainTexts, trainUserName, trainTZ, trainUtc, trainUserLang, trainCreatedAt, classes, random_state=1202) + + # create the model batch_size = 256 nb_epoch = 3 +validation_split = 0.01 #91279 samples for validation ##Convert data into one hot encodings @@ -148,7 +154,7 @@ finalHistory = final_model.fit([trainDescription, trainDomain, trainTld, trainLocation, trainSource, trainTexts, trainUserName, trainTZ, trainUtc, trainUserLang, trainCreatedAt], classes, epochs=nb_epoch, batch_size=batch_size, - verbose=2 + verbose=2, validation_split=validation_split ) end = time.time() print("final_model finished after " +str(datetime.timedelta(seconds=time.time() - start))) @@ -168,7 +174,7 @@ finalHistory = final_model.fit([trainDescription, trainDomain, trainTld, trainLocation, trainSource, trainTexts, trainUserName, trainTZ, trainUtc, trainUserLang, trainCreatedAt], classes, epochs=nb_epoch, batch_size=batch_size, - verbose=2 + verbose=2, validation_split=validation_split ) end = time.time() print("final_model finished after " +str(datetime.timedelta(seconds=time.time() - start))) From 09912892cbac66f25804c3df29d65d6187a3c340 Mon Sep 17 00:00:00 2001 From: philippe-thomas Date: Mon, 12 Aug 2019 12:25:28 +0200 Subject: [PATCH 4/5] Adding docker --- Dockerfile | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..69d3132 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,41 @@ +FROM python:3.6 + +COPY *.py /app/ +COPY requirements.txt app/ +COPY data/binaries/ /app/data/binaries/ +COPY data/models/textBranchNorm.h5 /app/data/models/ + +WORKDIR /app +RUN pip install -r requirements.txt + + +EXPOSE 5000 +CMD ["python", "./webservice.py"] + + +###Some commands I used for building this
docker container + +##Build docker container from 'Dockerfile' +#docker build -t geoloc . + +##Execute docker container +#docker run -d -p 5000:5000 --network host geoloc + +##Contact docker webservice +#http://127.0.0.1:5000/predictText?text=Montmartre%20is%20truly%20beautiful + +########################### Other commands ########################### +##list images +#docker images -a + +##ls for docker +#docker container ls + +#Interactive execution of docker container +#docker exec -i -t 3411bb89b103 /bin/bash + + + + + + From 664332cbabf4ff4d384d852fd137acdaed4332ae Mon Sep 17 00:00:00 2001 From: philippe-thomas Date: Mon, 12 Aug 2019 12:40:35 +0200 Subject: [PATCH 5/5] Modified docker image --- Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Dockerfile b/Dockerfile index 69d3132..0ca3602 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,6 +24,9 @@ CMD ["python", "./webservice.py"] ##Contact docker webservice #http://127.0.0.1:5000/predictText?text=Montmartre%20is%20truly%20beautiful +##Export docker container +#docker save geoloc > geolocV2.tar + ########################### Other commands ########################### ##list images #docker images -a