9 Copyright (C) e2its - All Rights Reserved 10 * Unauthorized copying of this file, via any medium is strictly prohibited 11 * Proprietary and confidential 13 * This file is part of gDayF project. 15 * Written by Jose L. Sanchez <e2its.es@gmail.com>, 2016-2019 21 from collections
import OrderedDict
as OrderedDict
22 from pandas
import DataFrame
as DataFrame
23 from hashlib
import md5
as md5
24 from py4j.protocol
import Py4JJavaError
29 from pyspark.sql
import SparkSession
30 from pyspark.ml
import Pipeline
31 from pyspark.ml
import PipelineModel
32 from pyspark.ml.feature
import VectorAssembler
33 from pyspark.ml.feature
import VectorIndexer
39 from pyspark.ml.feature
import StringIndexer
40 from pyspark.ml.feature
import IndexToString
41 from pyspark.ml.feature
import OneHotEncoder
42 from pyspark.sql.utils
import IllegalArgumentException
44 print(
"Successfully imported all Spark modules")
45 except ImportError
as e:
46 print(
"Error importing Spark Modules", e)
83 self.
_labels = self.
_ec.labels.get_config()[
'messages'][
'corehandler']
114 except Py4JJavaError:
115 print(
'Apache Spark Cluster not working')
118 def addColumnIndex(dataframe): 119 # Create new column names 120 oldColumns = dataframe.schema.names 121 newColumns = oldColumns + ["columnindex"] 124 df_indexed = df.rdd.zipWithIndex().map(lambda row, columnindex: \ 125 row + (columnindex,)).toDF() 127 # Rename all the columns 128 new_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx], 129 newColumns[idx]), range(len(oldColumns)), 139 spark = SparkSession.builder.master(self.
url +
'[' + str(self.
nthreads) +
']')\
140 .appName(
'job_gdayf_'+self.
url+
'_' + time.strftime(
"%b-%d-%Y_%H:%M:%S-%z", time.localtime())) \
147 log4j.LogManager.getRootLogger().setLevel(eval(self.
_config[
'frameworks'][self.
_framework][
'conf'][
'log']))
149 except Py4JJavaError
as connection_error:
150 self.
_logging.log_critical(
'gDayF',
"sparkHandler", self.
_labels[
"failed_conn"])
151 raise connection_error
164 def _get_dtype(list_dtypes, column):
165 for element
in list_dtypes:
166 if element[0] == column:
184 def _get_temporal_objects_ids(self, model_id, nfolds):
200 def _get_model_from_load_path(self, ar_metadata):
206 assert isinstance(ar_metadata[
'load_path'], list)
207 except AssertionError:
210 while ar_metadata[
'load_path']
is not None and counter_storage < len(ar_metadata[
'load_path'])
and load_fails:
212 if ar_metadata[
'load_path'][counter_storage][
'hash_value']
is None or \
213 hash_key(ar_metadata[
'load_path'][counter_storage][
'hash_type'],
214 ar_metadata[
'load_path'][counter_storage][
'value']) == \
215 ar_metadata[
'load_path'][counter_storage][
'hash_value']:
217 self.
_model_base = PipelineModel.load(ar_metadata[
'load_path'][counter_storage][
'value'])
221 if ar_metadata[
'load_path'][counter_storage][
'hash_value']
is not None:
224 ar_metadata[
'load_path'][counter_storage][
'hash_value'] +
' - ' +
225 hash_key(ar_metadata[
'load_path'][counter_storage][
'hash_type'],
226 ar_metadata[
'load_path'][counter_storage][
'value'])
228 except Py4JJavaError:
230 self.
_labels[
"abort"], ar_metadata[
'load_path'][counter_storage][
'value'])
240 for _, sparkdataframe
in iterator.items():
241 sparkdataframe.unpersist()
242 del self.
_ec.spark_temporal_data_frames
243 self.
_ec.spark_temporal_data_frames = dict()
251 assert type_
in [
'PoC',
'train',
'predict']
257 load_path.append(self.
hdfs)
258 load_path.append(
'/')
260 load_path.append(
'/')
261 load_path.append(base_ar[
'model_id'])
262 load_path.append(
'/')
263 load_path.append(type_)
264 load_path.append(
'/')
265 load_path.append(str(base_ar[
'timestamp']))
266 load_path.append(
'/')
267 return ''.join(load_path)
272 load_path.append(
'/')
274 load_path.append(
'/')
275 load_path.append(base_ar[
'model_id'])
276 load_path.append(
'/')
277 load_path.append(type_)
278 load_path.append(
'/')
279 load_path.append(str(base_ar[
'timestamp']))
280 load_path.append(
'/')
281 return ''.join(load_path)
294 def _get_evaluator(analysis_type, objective_column=None):
295 if objective_column
is None:
296 if analysis_type ==
'clustering':
299 if analysis_type ==
'binomial':
301 return BinaryClassificationEvaluator(labelCol=objective_column)
302 elif analysis_type ==
'multinomial':
303 return MulticlassClassificationEvaluator(labelCol=objective_column)
304 elif analysis_type ==
'regression':
305 return RegressionEvaluator(labelCol=objective_column)
314 def _generate_execution_metrics(self, dataframe, antype, objective_column):
315 if antype ==
'binomial':
316 model_metrics = BinomialMetricMetadata()
317 elif antype ==
'multinomial':
318 model_metrics = MultinomialMetricMetadata()
319 elif antype ==
'regression':
320 model_metrics = RegressionMetricMetadata()
321 elif antype ==
'anomalies':
322 model_metrics = AnomaliesMetricMetadata()
323 elif antype ==
'clustering':
324 model_metrics = ClusteringMetricMetadata()
328 evaluator = self.
_get_evaluator(analysis_type=antype, objective_column=objective_column)
330 if isinstance(model_metrics, ClusteringMetricMetadata):
331 model_metrics.set_metrics(model=self.
_model_base.stages[-1], data=dataframe)
333 model_metrics.set_metrics(evaluator=evaluator, data=dataframe, objective_column=objective_column)
339 def _generate_scoring_history(self):
340 result_dataframe =
None 341 if isinstance(self.
_model_base.stages[-1], GBTRegressionModel)
or \
342 isinstance(self.
_model_base.stages[-1], RandomForestRegressionModel)
or \
343 isinstance(self.
_model_base.stages[-1], GBTClassificationModel):
345 for itera
in range(0, len(self.
_model_base.stages[-1].trees)):
346 maximo = max(maximo, self.
_model_base.stages[-1].trees[itera].depth)
348 result_dataframe = DataFrame(data={
'trees': self.
_model_base.stages[-1].getNumTrees,
350 'total_nodes': self.
_model_base.stages[-1].totalNumNodes,
351 'numFeatures': self.
_model_base.stages[-1].numFeatures},
352 index=[0]).to_json(orient=
'split')
353 elif isinstance(self.
_model_base.stages[-1], RandomForestClassificationModel):
355 for itera
in range(0, len(self.
_model_base.stages[-1].trees)):
356 maximo = max(maximo, self.
_model_base.stages[-1].trees[itera].depth)
358 result_dataframe = DataFrame(data={
'trees': self.
_model_base.stages[-1].getNumTrees,
360 'total_nodes': self.
_model_base.stages[-1].totalNumNodes,
361 'numFeatures': self.
_model_base.stages[-1].numFeatures,
362 'numClasses': self.
_model_base.stages[-1].numClasses},
363 index=[0]).to_json(orient=
'split')
364 elif isinstance(self.
_model_base.stages[-1], DecisionTreeRegressionModel):
365 result_dataframe = DataFrame(data={
'trees': 1,
367 'total_nodes': self.
_model_base.stages[-1].numNodes,
368 'numFeatures': self.
_model_base.stages[-1].numFeatures},
369 index=[0]).to_json(orient=
'split')
370 elif isinstance(self.
_model_base.stages[-1], NaiveBayesModel):
371 result_dataframe = DataFrame(data={
'numFeatures': self.
_model_base.stages[-1].numFeatures,
372 'numClasses': self.
_model_base.stages[-1].numClasses},
373 index=[0]).to_json(orient=
'split')
374 elif isinstance(self.
_model_base.stages[-1], DecisionTreeClassificationModel):
375 result_dataframe = DataFrame(data={
'trees': 1,
377 'total_nodes': self.
_model_base.stages[-1].numNodes,
378 'numFeatures': self.
_model_base.stages[-1].numFeatures,
379 'numClasses': self.
_model_base.stages[-1].numClasses},
380 index=[0]).to_json(orient=
'split')
381 elif isinstance(self.
_model_base.stages[-1], GeneralizedLinearRegressionModel):
383 result_dataframe = DataFrame(data={
'aic': summary.aic,
384 'intercept': str(self.
_model_base.stages[-1].intercept),
385 'degreesOfFreedom': summary.degreesOfFreedom,
386 'numInstances': summary.numInstances,
387 'rank': summary.rank,
388 'dispersion': summary.dispersion,
389 'nullDeviance': summary.nullDeviance,
390 'residuals': summary.residuals,
391 'numFeatures': self.
_model_base.stages[-1].numFeatures},
392 index=[0]).to_json(orient=
'split')
393 elif isinstance(self.
_model_base.stages[-1], LinearRegressionModel):
395 result_dataframe = DataFrame(data={
'coefifients': str(self.
_model_base.stages[-1].coefficients),
396 'degreesOfFreedom': summary.degreesOfFreedom,
397 'numInstances': summary.numInstances,
398 'totalIterations': summary.totalIterations,
399 'devianceResiduals': str(summary.devianceResiduals),
400 'explainedVariance': summary.explainedVariance,
401 'numFeatures': self.
_model_base.stages[-1].numFeatures},
402 index=[0]).to_json(orient=
'split')
403 elif isinstance(self.
_model_base.stages[-1], LinearSVCModel):
404 result_dataframe = DataFrame(data={
'coefifients': str(self.
_model_base.stages[-1].coefficients),
405 'intercept': self.
_model_base.stages[-1].intercept,
406 'numClasses': self.
_model_base.stages[-1].numClasses,
407 'numFeatures': self.
_model_base.stages[-1].numFeatures},
408 index=[0]).to_json(orient=
'split')
409 elif isinstance(self.
_model_base.stages[-1], LogisticRegressionModel):
412 result_dataframe = DataFrame(data={
'coefifients': str(self.
_model_base.stages[-1].coefficients),
413 'intercept': self.
_model_base.stages[-1].intercept,
414 'totalIterations': summary.totalIterations,
415 'roc': summary.roc.toPandas().to_json(orient=
'split'),
416 'pr': summary.pr.toPandas().to_json(orient=
'split')},
417 index=[0]).to_json(orient=
'split')
419 result_dataframe = DataFrame(data={
'coefifientsMatrix': str(self.
_model_base.stages[-1].coefficientMatrix),
420 'interceptVector': str(self.
_model_base.stages[-1].interceptVector)},
421 index=[0]).to_json(orient=
'split')
422 elif isinstance(self.
_model_base.stages[-1], BisectingKMeansModel)
or \
423 isinstance(self.
_model_base.stages[-1], KMeansModel):
425 result_dataframe = DataFrame(data={
'clusterCenters': str(self.
_model_base.stages[-1].clusterCenters()),
426 'clusterSizes': str(summary.clusterSizes),
428 index=[0]).to_json(orient=
'split')
431 if result_dataframe
is not None:
432 return json.loads(result_dataframe, object_pairs_hook=OrderedDict)
440 def _generate_importance_variables(self, column_chain):
441 var_importance = OrderedDict()
442 for columns
in column_chain:
444 var_importance[columns] = self.
_model_base.stages[-1].featureImportances[column_chain.index(columns)]
445 except AttributeError:
446 var_importance[columns] =
None 447 return var_importance
452 def _generate_model_metrics(self):
453 if isinstance(self.
_model_base.stages[-1], LogisticRegressionModel)
or \
454 isinstance(self.
_model_base.stages[-1], LinearRegressionModel):
457 return json.loads(DataFrame(summary.objectiveHistory, columns=[
'Metrics']).to_json(orient=
'split'),
458 object_pairs_hook=OrderedDict)
461 elif isinstance(self.
_model_base.stages[-1], NaiveBayesModel):
462 metrics = OrderedDict()
463 metrics[
'pi'] = json.loads(DataFrame(self.
_model_base.stages[-1].pi.values).to_json(orient=
'split'),
464 object_pairs_hook=OrderedDict)
465 metrics[
'theta'] = json.loads(DataFrame(self.
_model_base.stages[-1].theta.values).to_json(orient=
'split'),
466 object_pairs_hook=OrderedDict)
478 def _accuracy(self, objective, dataframe, tolerance=0.0):
480 fmin = eval(
"lambda x: x - " + str(tolerance / 2))
481 fmax = eval(
"lambda x: x + " + str(tolerance / 2))
483 resultado_train = dataframe.select(
"prediction", objective)
485 accuracy = resultado_train.filter(resultado_train.prediction >= fmin(resultado_train[objective])) \
486 .filter(resultado_train.prediction <= fmax(resultado_train[objective])).count() \
487 / float(resultado_train.count())
501 def _predict_accuracy(self, objective, dataframe, tolerance=0.0):
506 columns = prediccion.columns
507 if objective
in columns:
508 accuracy = self.
_accuracy(objective=objective, dataframe=prediccion, tolerance=tolerance)
509 return accuracy, prediccion
516 def _predict_clustering(self, dataframe, objective=None):
523 def _generate_params(self, modeldef):
525 Generate model params for this model. 526 :return (status (success 0, error 1) , OrderedDict(full_stack_parameters)) 528 full_stack_params = OrderedDict()
529 for key, item
in modeldef.extractParamMap().items():
530 full_stack_params[str(key)[str(key).find(
'__') + 2:]] = item
531 return 0, full_stack_params
541 struct_ar = OrderedDict(json.load(algorithm_description))
544 return (
'Necesario cargar un modelo valid o ar.json valido')
546 return struct_ar[
'metrics'][source][metric]
560 if base_ns
is not None:
561 data_norm = dataframe.copy(deep=
True)
565 if not exist_objective:
566 base_ns = normalizer.filter_objective_base(normalizemd=base_ns)
567 if filtering ==
'STANDARDIZE':
568 base_ns = normalizer.filter_standardize(normalizemd=base_ns, model_id=model_id)
569 elif filtering ==
'DROP':
570 base_ns = normalizer.filter_drop_missing(normalizemd=base_ns)
571 data_norm = normalizer.normalizeDataFrame(data_norm, base_ns)
574 df_metadata.getDataFrameMetadata(dataframe=data_norm, typedf=
'pandas')
575 df_metadata_hash_value = md5(json.dumps(df_metadata).encode(
'utf-8')).hexdigest()
576 return data_norm, df_metadata, df_metadata_hash_value,
True, base_ns
579 df_metadata.getDataFrameMetadata(dataframe=dataframe, typedf=
'pandas')
580 df_metadata_hash_value = md5(json.dumps(df_metadata).encode(
'utf-8')).hexdigest()
581 return dataframe, df_metadata, df_metadata_hash_value,
False, base_ns
593 aux_ns = normalizer.define_special_spark_naive_norm(dataframe_metadata=df_metadata)
604 assert isinstance(training_pframe, DataFrame)
605 assert isinstance(base_ar, ArMetadata)
609 for pname, pvalue
in kwargs.items():
610 if pname ==
'filtering':
611 assert isinstance(pvalue, str)
615 analysis_id = self.
_ec.get_id_analysis()
618 objective_column = base_ar[
'objective_column']
619 if objective_column
is None:
626 if "test_frame" in kwargs.keys():
627 test_pframe = kwargs[
'test_frame']
631 base_ns = get_model_ns(base_ar)
632 modelid = base_ar[
'model_parameters'][
'spark'][
'model']
633 artype = base_ar[
'model_parameters'][
'spark'][
'types'][0][
"type"]
636 self.
_labels[
"st_analysis"], modelid)
638 assert isinstance(base_ns, list)
or base_ns
is None 641 data_initial.getDataFrameMetadata(dataframe=training_pframe, typedf=
'pandas')
642 training_pframe, data_normalized, train_hash_value, norm_executed, base_ns = \
644 filtering=filtering, exist_objective=
True)
646 if modelid ==
'NaiveBayes' and artype ==
'multinomial':
647 training_pframe, data_normalized, train_hash_value, aux_norm_executed, aux_norm = \
651 if aux_norm
is not None:
652 base_ns.extend(aux_norm)
653 norm_executed = norm_executed | aux_norm_executed
655 if base_ar[
'round'] == 1:
656 aux_ns =
Normalizer(self.
_ec).define_ignored_columns(data_normalized, objective_column)
657 if aux_ns
is not None:
658 base_ns.extend(aux_ns)
660 df_metadata = data_initial
661 if not norm_executed:
662 data_normalized =
None 667 str(data_initial[
'correlation'][objective_column]))
672 str(data_initial[
'correlation']))
674 df_metadata = data_normalized
675 base_ar[
'normalizations_set'] = base_ns
680 str(data_normalized[
'correlation'][objective_column]))
685 str(data_initial[
'correlation']))
686 if test_pframe
is not None:
687 test_pframe, _, test_hash_value, _, _ = self.
execute_normalization(dataframe=test_pframe, base_ns=base_ns,
688 model_id=modelid, filtering=filtering,
689 exist_objective=
True)
691 training_frame = self.
_get_dataframe(pframe=training_pframe, hash_value=train_hash_value, type=
"train")
693 if "test_frame" in kwargs.keys():
694 '''test_frame = self._spark_session.createDataFrame(test_frame).cache() 695 self._logging.log_info(analysis_id, self._spark_session.sparkContext.applicationId, 696 self._labels["parsing_to_spark"], 697 'test_frame (' + str(test_frame.count()) + ')')''' 698 test_frame = self.
_get_dataframe(pframe=test_pframe, hash_value=test_hash_value, type=
"test")
700 if supervised
and artype ==
'regression':
704 objective_column +
' - ' + self.
_get_dtype(training_frame.dtypes, objective_column))
711 self.
_labels[
"action_type"], base_ar[
'type'])
714 final_ar_model = copy.deepcopy(base_ar)
715 final_ar_model[
'status'] = self.
_labels[
'failed_op']
716 final_ar_model[
'model_parameters'][
'spark'][
'id'] = self.
_spark_session.version
717 model_timestamp = str(time.time())
718 final_ar_model[
'data_initial'] = data_initial
719 final_ar_model[
'data_normalized'] = data_normalized
721 model_id = modelid +
'_' + model_timestamp
726 analysis_type = base_ar[
'model_parameters'][
'spark'][
'types'][0][
'type']
728 base_ar[
'model_parameters'][
'spark'][
'types'][0][
'type'])
730 '''Generate commands pipeline : model and model.train()''' 731 invalid_types = [
'string']
732 transformation_chain = list()
733 column_chain = list()
735 ignored_columns = norm.ignored_columns(base_ns)
737 for element
in training_frame.dtypes:
738 if element[0]
not in ignored_columns:
739 if element[1]
in invalid_types
or (modelid ==
'NaiveBayes' and artype ==
'binomial'):
740 transformation_chain.append(StringIndexer() \
741 .setInputCol(element[0]) \
742 .setOutputCol(element[0] +
'_to_index')
743 .setHandleInvalid(
"keep"))
744 column_rename = element[0] +
'_to_index' 745 if element[0] != objective_column:
746 transformation_chain.append(OneHotEncoder() \
747 .setInputCol(element[0] +
'_to_index') \
748 .setOutputCol(element[0] +
'_to_onehot'))
749 column_rename = element[0] +
'_to_onehot' 751 objective_column = column_rename
752 decoder = len(transformation_chain) - 1
753 column_chain.append(column_rename)
755 column_chain.append(element[0])
757 ''' Packaging Features ''' 760 column_chain.remove(objective_column)
762 for column
in ignored_columns:
763 column_chain.remove(column)
767 transformation_chain.append(VectorAssembler().setInputCols(column_chain).setOutputCol(
'features'))
770 trc_pipeline = Pipeline(stages=transformation_chain.copy())
772 model_command = list()
773 model_command.append(modelid)
774 model_command.append(
"(")
775 model_command.append(
"featuresCol=\'features\'")
778 model_command.append(
", labelCol=\'%s\'" % objective_column)
782 model_command.append(
")")
783 model_command =
''.join(model_command)
786 modeldef = eval(model_command)
788 self.
_labels[
"gmodel"], model_command)
790 transformation_chain.append(modeldef)
791 pipeline = Pipeline(stages=transformation_chain)
792 grid = ParamGridBuilder().build()
793 antype = base_ar[
'model_parameters'][
'spark'][
'types'][0][
'type']
797 if training_pframe.count(axis=0).all() <= \
798 self.
_config[
'frameworks'][
'spark'][
'conf'][
'validation_frame_threshold']:
800 model = CrossValidator(estimator=pipeline,
801 estimatorParamMaps=grid,
803 objective_column=objective_column),
804 numFolds=self.
_config[
'frameworks'][
'spark'][
'conf'][
'nfolds'],
805 seed=int(base_ar[
'timestamp']))
807 model = TrainValidationSplit(estimator=pipeline,
808 estimatorParamMaps=grid,
810 objective_column=objective_column),
811 tranRation=self.
_config[
'frameworks'][
'spark'][
'conf'][
'validation_frame_ratio'],
812 seed=int(base_ar[
'timestamp']))
815 trc_dataframe = trc_pipeline.fit(training_frame).transform(training_frame)
818 self.
_labels[
"trc:label_cardinality"],
819 "( " + objective_column +
"," +
820 str(trc_dataframe.select(objective_column).distinct().count()) +
822 self.
_model_base = model.fit(training_frame).bestModel
827 final_ar_model[
'status'] = self.
_labels[
"success_op"]
830 final_ar_model[
'execution_seconds'] = time.time() - start
831 final_ar_model[
'model_parameters'][
'spark'][
'parameters'][
'model_id'] =
ParameterMetadata()
832 final_ar_model[
'model_parameters'][
'spark'][
'parameters'][
'model_id'].set_value(value=model_id,
836 final_ar_model[
'ignored_parameters'], \
837 final_ar_model[
'full_parameters_stack'] = self.
_generate_params(modeldef=modeldef)
845 str(final_ar_model[
'execution_seconds']))
855 prediction_train = self.
_model_base.transform(training_frame)
857 final_ar_model[
'metrics'][
'execution'][
'train'] = \
859 antype=analysis_type,
860 objective_column=objective_column)
861 if test_frame
is not None:
862 prediction_test = self.
_model_base.transform(test_frame)
864 final_ar_model[
'metrics'][
'execution'][
'test'] = \
866 antype=analysis_type,
867 objective_column=objective_column)
869 final_ar_model[
'metrics'][
'execution'][
'predict'] = OrderedDict()
870 final_ar_model[
'metrics'][
'execution'][
'predict'][
'decoder'] = decoder
872 final_ar_model[
'metrics'][
'accuracy'] = OrderedDict()
873 final_ar_model[
'metrics'][
'accuracy'] = OrderedDict()
876 final_ar_model[
'metrics'][
'accuracy'][
'train'] = \
877 self.
_accuracy(objective=objective_column, dataframe=prediction_train, tolerance=tolerance)
880 model_id +
' - ' + str(final_ar_model[
'metrics'][
'accuracy'][
'train']))
881 final_ar_model[
'tolerance'] = tolerance
883 final_ar_model[
'metrics'][
'accuracy'][
'train'] = 0.0
885 if test_frame
is not None:
886 prediction_test = self.
_model_base.transform(test_frame)
888 final_ar_model[
'metrics'][
'accuracy'][
'test'] = \
889 self.
_accuracy(objective=objective_column, dataframe=prediction_test, tolerance=tolerance)
891 train_balance = self.
_config[
'frameworks'][
'spark'][
'conf'][
'train_balance_metric']
892 test_balance = 1 - train_balance
893 final_ar_model[
'metrics'][
'accuracy'][
'combined'] = \
894 (final_ar_model[
'metrics'][
'accuracy'][
'train']*train_balance +
895 final_ar_model[
'metrics'][
'accuracy'][
'test']*test_balance)
899 model_id +
' - ' + str(final_ar_model[
'metrics'][
'accuracy'][
'test']))
903 model_id +
' - ' + str(final_ar_model[
'metrics'][
'accuracy'][
'combined']))
906 final_ar_model[
'metrics'][
'accuracy'][
'test'] = 0.0
907 final_ar_model[
'metrics'][
'accuracy'][
'combined'] = 0.0
913 self.
_labels[
"gmodel_metric"], model_id)
919 self.
_labels[
"gvar_metric"], model_id)
925 self.
_labels[
"gsco_metric"], model_id)
927 final_ar_model[
'status'] = self.
_labels[
'success_op']
929 except Exception
as execution_error:
930 for handler
in self.
_logging.logger.handlers:
933 final_ar_model[
'execution_seconds'] = time.time() - start
935 final_ar_model[
'model_parameters'][
'spark'][
'parameters'][
'model_id'] =
ParameterMetadata()
936 final_ar_model[
'model_parameters'][
'spark'][
'parameters'][
'model_id'].set_value(value=model_id,
943 final_ar_model[
'ignored_parameters'], \
944 final_ar_model[
'full_parameters_stack'] = self.
_generate_params(modeldef=modeldef)
946 final_ar_model[
'ignored_parameters'], \
947 final_ar_model[
'full_parameters_stack'] = self.
_generate_params(modeldef=modeldef)
949 final_ar_model[
'status'] = self.
_labels[
"failed_op"]
950 self.
_logging.log_critical(analysis_id,
953 repr(execution_error))
954 final_ar_model[
'metrics'] = OrderedDict()
955 final_ar_model[
'metrics'][
'accuracy'] = OrderedDict()
956 final_ar_model[
'metrics'][
'accuracy'][
'train'] = 0.0
957 final_ar_model[
'metrics'][
'accuracy'][
'test'] = 0.0
958 final_ar_model[
'metrics'][
'accuracy'][
'combined'] = 0.0
959 final_ar_model[
'metrics'][
'execution'] = OrderedDict()
960 final_ar_model[
'metrics'][
'execution'][
'train'] = OrderedDict()
961 final_ar_model[
'metrics'][
'execution'][
'train'][
'RMSE'] = 1e+16
962 final_ar_model[
'metrics'][
'execution'][
'train'][
'tot_withinss'] = 1e+16
963 final_ar_model[
'metrics'][
'execution'][
'train'][
'betweenss'] = 1e+16
964 final_ar_model[
'metrics'][
'execution'][
'test'] = OrderedDict()
965 final_ar_model[
'metrics'][
'execution'][
'test'][
'RMSE'] = 1e+16
966 final_ar_model[
'metrics'][
'execution'][
'test'][
'tot_withinss'] = 1e+16
967 final_ar_model[
'metrics'][
'execution'][
'test'][
'betweenss'] = 1e+16
970 generate_json_path(self.
_ec, final_ar_model)
971 self.
_persistence.store_json(storage_json=final_ar_model[
'json_path'], ar_json=final_ar_model)
975 self.
_labels[
"model_stored"], model_id)
982 for handler
in self.
_logging.logger.handlers:
984 return analysis_id, final_ar_model
990 def _get_dataframe(self, pframe, hash_value, type):
996 self.
_labels[
"parsing_to_spark"],
997 type +
'_frame(' + str(frame.count()) +
')')
1003 self.
_logging.log_info(self.
_ec.get_id_analysis(),
1005 self.
_labels[
"getting_from_spark"],
1006 type +
'_frame(' + str(frame.count()) +
')')
1015 fw = get_model_fw(armetadata)
1016 model_id = armetadata[
'model_parameters'][fw][
'parameters'][
'model_id'][
'value']
1019 armetadata[
'status'] = self.
_labels[
"success_op"]
1022 for each_storage_type
in load_storage.get_load_path():
1023 source_data = list()
1024 primary_path = self.
_config[
'storage'][each_storage_type[
'type']][
'value']
1025 source_data.append(primary_path)
1026 source_data.append(
'/')
1027 source_data.append(armetadata[
'user_id'])
1028 source_data.append(
'/')
1029 source_data.append(armetadata[
'workflow_id'])
1030 source_data.append(
'/')
1031 source_data.append(armetadata[
'model_id'])
1032 source_data.append(
'/')
1033 source_data.append(fw)
1034 source_data.append(
'/')
1035 source_data.append(armetadata[
'type'])
1036 source_data.append(
'/')
1037 source_data.append(str(armetadata[
'timestamp']))
1038 source_data.append(
'/')
1040 load_path =
''.join(source_data) + each_storage_type[
'value']+
'/' 1041 self.
_persistence.mkdir(type=each_storage_type[
'type'], path=load_path,
1042 grants=self.
_config[
'storage'][
'grants'])
1043 if each_storage_type[
'type'] ==
'hdfs':
1044 load_path = self.
_config[
'storage'][each_storage_type[
'type']][
'uri'] + load_path
1048 load_storage.append(value=load_path + model_id + self.
_get_ext(),
1049 fstype=each_storage_type[
'type'], hash_type=each_storage_type[
'hash_type'])
1052 armetadata[
'load_path'] = load_storage
1054 self.
_logging.log_exec(self.
_ec.get_id_analysis(),
1058 self.
_persistence.store_json(storage_json=armetadata[
'json_path'], ar_json=armetadata)
1059 self.
_logging.log_info(self.
_ec.get_id_analysis(),
1061 self.
_labels[
"model_stored"], model_id)
1071 fw = get_model_fw(armetadata)
1072 model_id = armetadata[
'model_parameters'][fw][
'parameters'][
'model_id'][
'value']
1074 load_fail, from_disk = self.
_get_model(base_ar=armetadata, base_model_id=model_id, remove_model=from_disk)
1088 def predict(self, predict_frame, base_ar, **kwargs):
1090 for pname, pvalue
in kwargs.items():
1093 remove_model =
False 1094 model_timestamp = str(time.time())
1095 self.
_ec.set_id_analysis(base_ar[
'model_id'])
1096 analysis_id = self.
_ec.get_id_analysis()
1097 base_model_id = base_ar[
'model_parameters'][
'spark'][
'parameters'][
'model_id'][
'value'] + self.
_get_ext()
1098 model_id = base_model_id +
'_' + model_timestamp
1099 antype = base_ar[
'model_parameters'][
'spark'][
'types'][0][
'type']
1101 modelid = base_ar[
'model_parameters'][
'spark'][
'model']
1102 base_ns = get_model_ns(base_ar)
1105 load_fails, remove_model = self.
_get_model(base_ar, base_model_id, remove_model)
1109 self.
_labels[
"no_models"], base_model_id)
1110 base_ar[
'status'] = self.
_labels[
'failed_op']
1113 objective_column = base_ar[
'objective_column']
1115 exist_objective =
True 1116 if objective_column
is None:
1117 exist_objective =
False 1122 tolerance = base_ar[
'tolerance']
1125 data_initial.getDataFrameMetadata(dataframe=predict_frame, typedf=
'pandas')
1126 base_ar[
'data_initial'] = data_initial
1128 if objective_column
in list(predict_frame.columns.values):
1131 self.
_labels[
"cor_struct"], str(data_initial[
'correlation'][objective_column]))
1134 self.
_labels[
"cor_struct"], str(data_initial[
'correlation']))
1135 npredict_frame, data_normalized, _, norm_executed, _ = self.
execute_normalization(dataframe=predict_frame,
1139 exist_objective=
True)
1142 npredict_frame, data_normalized, _, norm_executed, _ = self.
execute_normalization(dataframe=predict_frame,
1146 exist_objective=
False)
1148 if not norm_executed:
1150 'No Normalizations Required')
1153 '''predict_frame = self._spark_session.createDataFrame(predict_frame).cache() 1154 self._logging.log_info(analysis_id, self._spark_session.sparkContext.applicationId, self._labels["parsing_to_spark"], 1155 'test_frame (' + str(predict_frame.count()) + ')')''' 1157 base_ar[
'data_normalized'] = data_normalized
1158 if objective_column
in list(npredict_frame.columns.values):
1161 str(data_normalized[
'correlation'][objective_column]))
1164 str(data_normalized[
'correlation']))
1167 npredict_frame = self.
_spark_session.createDataFrame(npredict_frame).cache()
1169 'test_frame (' + str(npredict_frame.count()) +
')')
1171 base_ar[
'type'] =
'predict' 1173 self.
_labels[
"action_type"], base_ar[
'type'])
1175 base_ar[
'timestamp'] = model_timestamp
1178 self.
_labels[
'st_predict_model'],
1180 if base_ar[
'metrics'][
'execution'][
'predict'][
'decoder']
is not None:
1181 decoder = self.
_model_base.stages[base_ar[
'metrics'][
'execution'][
'predict'][
'decoder']]
1185 if objective_column
in npredict_frame.columns:
1186 for element
in npredict_frame.dtypes:
1187 if element[0] == objective_column:
1188 if element[1] ==
'string':
1189 objective_column = objective_column +
'_to_index' 1190 objective_type =
'double' 1192 objective_type = element[1]
1194 objective_type =
None 1199 accuracy, prediction_dataframe = self.
_predict_accuracy(objective_column, npredict_frame,
1200 tolerance=tolerance)
1202 base_ar[
'execution_seconds'] = time.time() - start
1203 base_ar[
'tolerance'] = tolerance
1207 if antype ==
'clustering':
1213 base_ar[
'execution_seconds'] = time.time() - start
1217 if not exist_objective
or objective_type
is not None:
1219 self.
_labels[
"gexec_metric"], model_id)
1222 dataframe=prediction_dataframe,
1223 objective_column=objective_column,
1225 if objective_column
in prediction_dataframe.columns:
1226 base_ar[
'metrics'][
'accuracy'][
'predict'] = accuracy
1227 self.
_logging.log_info(analysis_id,
1229 base_model_id +
' - ' + str(base_ar[
'metrics'][
'accuracy'][
'predict']))
1231 base_ar[
'status'] = self.
_labels[
'success_op']
1233 if decoder
is not None:
1234 labelConverter = IndexToString(inputCol=
"prediction", outputCol=
"predictedLabel",
1235 labels=decoder.labels)
1236 prediction_dataframe = labelConverter.transform(prediction_dataframe).drop(
"prediction")
1237 prediction_dataframe = prediction_dataframe.withColumnRenamed(
"predictedLabel",
"predict")
1239 prediction_dataframe = prediction_dataframe.withColumnRenamed(
"prediction",
"predict")
1242 command.append(
"prediction_dataframe.select(")
1243 command.append(
'\"predict\"')
1244 if antype
in [
'binomial',
'multinomial']:
1245 command.append(
', \"probability\"')
1246 command.append(
").toPandas()")
1248 presults = eval(
"".join(command))
1249 prediction =predict_frame.copy()
1250 prediction[
'predict'] = presults.loc[:,
'predict']
1251 if antype
in [
'binomial',
'multinomial']:
1252 prediction[
'probability'] = presults.loc[:,
'probability']
1255 prediction_json = OrderedDict()
1256 prediction_json[
'metadata'] = OrderedDict()
1257 prediction_json[
'metadata'][
'user_id'] = self.
_ec.get_id_user()
1258 prediction_json[
'metadata'][
'timestamp'] = model_timestamp
1259 prediction_json[
'metadata'][
'workflow_id'] = self.
_ec.get_id_workflow()
1260 prediction_json[
'metadata'][
'analysis_id'] = self.
_ec.get_id_analysis()
1261 prediction_json[
'metadata'][
'model_id'] = base_ar[
'model_parameters'][
'spark'][
'parameters'][
'model_id'][
'value']
1264 prediction_json[
'data'] = OrderedDict()
1265 if isinstance(prediction, DataFrame):
1266 prediction_json[
'data'] = prediction.to_dict(orient=
'records')
1268 prediction_json[
'data'] = OrderedDict(prediction)
1271 generate_json_path(self.
_ec, base_ar,
'prediction')
1272 self.
_persistence.store_json(storage_json=base_ar[
'prediction_path'], ar_json=base_ar, other=prediction_json)
1273 self.
_logging.log_exec(analysis_id,
1275 self.
_labels[
"prediction_stored"], model_id)
1278 generate_json_path(self.
_ec, base_ar)
1279 self.
_persistence.store_json(storage_json=base_ar[
'json_path'], ar_json=base_ar)
1280 self.
_logging.log_exec(analysis_id,
1282 self.
_labels[
"model_stored"], model_id)
1284 self.
_logging.log_info(analysis_id,
1286 self.
_labels[
"end"], model_id)
1287 for handler
in self.
_logging.logger.handlers:
1290 return prediction, base_ar
1298 def _get_model(self, base_ar, base_model_id, remove_model):
1301 return load_fails, remove_model
1308 remove_fails =
False 1310 assert isinstance(arlist, list)
1311 except AssertionError:
1315 for ar_metadata
in arlist:
1317 assert isinstance(ar_metadata[
'load_path'], list)
1318 except AssertionError:
1321 _, ar_metadata[
'load_path'] = persistence.remove_file(load_path=ar_metadata[
'load_path'])
1323 if len(ar_metadata[
'load_path']) == 0:
1324 ar_metadata[
'load_path'] =
None 1328 persistence.store_json(storage_json=ar_metadata[
"json_path"], ar_json=ar_metadata)
1338 for key, value
in each_model[
'parameters'].items():
1339 if value[
'seleccionable']:
1340 if isinstance(value[
'value'], str):
1341 model_command.append(
", %s=\'%s\'" % (key, value[
'value']))
1343 if value
is not None:
1344 model_command.append(
", %s=%s" % (key, value[
'value']))
1352 if isinstance(tolerance, dict):
1353 if tolerance[
'enable_fixed']:
1354 threshold = tolerance[
'fixed']
1358 for each_column
in columns:
1359 if each_column[
"name"] == objective_column
and each_column[
"type"]
in DTYPES:
1360 min_val = float(each_column[
"min"])
1361 max_val = float(each_column[
"max"])
1362 if min_val
is None or max_val
is None:
1365 threshold = (max_val - min_val) * tolerance[
'percentage']
1367 threshold = tolerance
def _generate_execution_metrics(self, dataframe, antype, objective_column)
Generate execution metrics for the correct model.
def _generate_importance_variables(self, column_chain)
Generate variable importance metrics.
def delete_frames(self)
Not Used: Remove used dataframes during analysis execution_.
def define_special_spark_naive_norm(self, df_metadata)
Method to generate special normalizations for Naive Bayes non-negative-value restrictions.
def generate_commands_parameters(each_model, model_command)
Auxiliary function (procedure) to generate model and train chain parameters to execute models. Modify m...
Define all objects, functions and structs related to common utilities not associated to one concrete ...
def load_model(self, armetadata)
Method to load model from persistence layer by armetadata.
def predict(self, predict_frame, base_ar, kwargs)
Main method to execute predictions over training models. Take the ar.json and execute predictions i...
Define all objects, functions and structures related to logging events on DayF product logs...
def shutdown_cluster(cls)
Class Method for cluster shutdown.
def order_training(self, training_pframe, base_ar, kwargs)
Main method to execute sets of analysis and normalizations base on params.
Class oriented to manage all messages and interaction with DayF product logs.
def _generate_model_metrics(self)
Generate model summary metrics.
def _predict_clustering(self, dataframe, objective=None)
Generate detected anomalies on dataframe.
def __init__(self, e_c)
Constructor. Initialize all framework variables and start or connect to the Spark cluster. Additionally sta...
def _generate_scoring_history(self)
Generate model scoring_history metrics.
def _generate_params(self, modeldef)
Generate model full values parameters for execution analysis.
def get_metric(self, algorithm_description, metric, source)
Get one specific metric from the execution metrics. Not tested yet.
def _get_evaluator(analysis_type, objective_column=None)
Get Evaluator for model.
def _get_model(self, base_ar, base_model_id, remove_model)
Internal method to get a Spark model from server or file, transparent to the user.
def _get_ext(self)
Generate extension for different saving modes.
def generate_base_path(self, base_ar, type_)
Generate base path to store all files [models, logs, json] relative to it.
Define all objects, functions and structures related to physically store information on persistence s...
def _get_dataframe(self, pframe, hash_value, type)
Method to parse and reuse Spark Dataframes.
def get_tolerance(columns, objective_column, tolerance=0.0)
Auxiliary function to get the level of tolerance for regression analysis.
def _get_dtype(list_dtypes, column)
Get Spark dtype for column.
Define common execution base structure as OrderedDict() of common datasets in a unified way...
def _accuracy(self, objective, dataframe, tolerance=0.0)
Generate accuracy metrics for model for regression assume tolerance on results equivalent to 2*tolera...
def get_external_model(self, ar_metadata, type)
Generate pdml model class_.
def _predict_accuracy(self, objective, dataframe, tolerance=0.0)
Generate accuracy metrics for model for regression assume tolerance on results equivalent to 2*tolera...
def _get_model_from_load_path(self, ar_metadata)
Not Used: Load a model in sparkCluster from disk.
def __del__(self)
Destructor.
def remove_models(self, arlist)
Method to remove list of model from disk.
def store_model(self, armetadata)
Method to save model to persistence layer from armetadata.
def is_alive(self)
Is alive_connection method.
def connect(self)
Connection method to the cluster. If the cluster is up, connect to it; otherwise start a cluster...
Class oriented to manage normalizations on dataframes for improvements on accuracy.
Class to manage transient information between all persistence options and models in a unified way...
def execute_normalization(self, dataframe, base_ns, model_id, filtering='NONE', exist_objective=True)
Method to execute normalizations base on params.