9 Copyright (C) e2its - All Rights Reserved 10 * Unauthorized copying of this file, via any medium is strictly prohibited 11 * Proprietary and confidential 13 * This file is part of gDayF project. 15 * Written by Jose L. Sanchez <e2its.es@gmail.com>, 2016-2019 21 from collections
import OrderedDict
as OrderedDict
23 from pandas
import DataFrame
as DataFrame
24 from hashlib
import md5
as md5
25 from pathlib
import Path
27 from h2o
import H2OFrame
as H2OFrame
28 from h2o
import cluster
as cluster
29 from h2o
import connect
as connect
30 from h2o
import connection
as connection
31 from h2o
import init
as init
32 from h2o
import load_model
as load_model
33 from h2o
import save_model
as save_model
34 from h2o.exceptions
import H2OError, H2OServerError
35 from h2o.exceptions
import H2OConnectionError
36 from h2o
import ls
as H2Olist
37 from h2o
import frames
as H2Oframes
38 from h2o
import get_model
39 from h2o
import remove
as H2Oremove
40 from h2o
import api
as H2Oapi
41 from h2o
import get_frame
42 from h2o
import download_pojo
44 from h2o.estimators.gbm
import H2OGradientBoostingEstimator
45 from h2o.estimators.glm
import H2OGeneralizedLinearEstimator
46 from h2o.estimators.deeplearning
import H2ODeepLearningEstimator
47 from h2o.estimators.random_forest
import H2ORandomForestEstimator
48 from h2o.estimators.naive_bayes
import H2ONaiveBayesEstimator
49 from h2o.estimators.deeplearning
import H2OAutoEncoderEstimator
50 from h2o.estimators.kmeans
import H2OKMeansEstimator
84 self.
_labels = self.
_ec.labels.get_config()[
'messages'][
'corehandler']
109 H2Oapi(
"POST /3/GarbageCollect")
119 print(
'H20-cluster not working')
134 self.
_logging.log_critical(
'gDayF',
"H2OHandler", self.
_labels[
"failed_conn"])
149 except H2OConnectionError:
158 def _get_temporal_objects_ids(self, model_id, nfolds):
160 if nfolds
is not None:
161 for iter
in range(0, nfolds):
162 models_ids.append(model_id +
'_cv_' + str(iter+1))
163 for element
in H2Oframes()[
'frames']:
164 name = element[
'frame_id'][
'name']
165 if name.find(
'reconstruction_error_') != -1
and name.find(model_id) != -1:
166 models_ids.append(name)
167 elif name.find(
'predictions_') != -1
and name.find(model_id) != -1:
168 models_ids.append(name)
178 fw = get_model_fw(ar_metadata)
179 model_id = ar_metadata[
'model_parameters'][fw][
'parameters'][
'model_id'][
'value']
180 analysis_id = ar_metadata[
'model_id']
181 config = self.
_ec.config.get_config()[
'frameworks'][fw][
'conf']
182 base_model_id = model_id +
'.model' 183 load_fails, remove_model = self.
_get_model(ar_metadata, base_model_id, remove_model)
187 self.
_labels[
"no_models"], ar_metadata)
190 for each_storage_type
in load_storage.get_load_path():
191 if each_storage_type[
'type'] ==
'localfs':
193 primary_path = self.
_config[
'storage'][each_storage_type[
'type']][
'value']
194 source_data.append(primary_path)
195 source_data.append(
'/')
196 source_data.append(ar_metadata[
'user_id'])
197 source_data.append(
'/')
198 source_data.append(ar_metadata[
'workflow_id'])
199 source_data.append(
'/')
200 source_data.append(ar_metadata[
'model_id'])
201 source_data.append(
'/')
202 source_data.append(config[
'download_dir'])
203 source_data.append(
'/')
204 source_data.append(model_id)
206 download_path =
''.join(source_data)
209 self.
_labels[
"down_path"], download_path)
211 persistence.mkdir(type=each_storage_type[
'type'], path=str(download_path),
212 grants=self.
_config[
'storage'][
'grants'])
214 if type.upper() ==
'MOJO':
216 file_path = self.
_model_base.download_mojo(path=str(download_path), get_genmodel_jar=
True)
218 self.
_labels[
"success_op"], file_path)
222 self.
_labels[
"failed_op"], download_path)
225 file_path = download_pojo(self.
_model_base, path=str(download_path), get_jar=
True)
227 self.
_labels[
"success_op"], file_path)
231 self.
_labels[
"failed_op"], download_path)
246 def _get_model_from_load_path(self, ar_metadata):
252 assert isinstance(ar_metadata[
'load_path'], list)
253 except AssertionError:
256 while ar_metadata[
'load_path']
is not None and counter_storage < len(ar_metadata[
'load_path'])
and load_fails:
258 if ar_metadata[
'load_path'][counter_storage][
'hash_value']
is None or \
259 hash_key(ar_metadata[
'load_path'][counter_storage][
'hash_type'],
260 ar_metadata[
'load_path'][counter_storage][
'value']) == \
261 ar_metadata[
'load_path'][counter_storage][
'hash_value']:
268 self.
_labels[
"abort"], ar_metadata[
'load_path'][counter_storage][
'value'])
270 if ar_metadata[
'load_path'][counter_storage][
'hash_value']
is not None:
272 ar_metadata[
'load_path'][counter_storage][
'hash_value'] +
' - ' +
273 hash_key(ar_metadata[
'load_path'][counter_storage][
'hash_type'],
274 ar_metadata[
'load_path'][counter_storage][
'value'])
296 assert type_
in [
'PoC',
'train',
'predict']
302 load_path.append(self.
hdfs)
303 load_path.append(
'/')
305 load_path.append(
'/')
306 load_path.append(base_ar[
'model_id'])
307 load_path.append(
'/')
308 load_path.append(type_)
309 load_path.append(
'/')
310 load_path.append(str(base_ar[
'timestamp']))
311 load_path.append(
'/')
312 return ''.join(load_path)
317 load_path.append(
'/')
319 load_path.append(
'/')
320 load_path.append(base_ar[
'model_id'])
321 load_path.append(
'/')
322 load_path.append(type_)
323 load_path.append(
'/')
324 load_path.append(str(base_ar[
'timestamp']))
325 load_path.append(
'/')
326 return ''.join(load_path)
345 def _generate_execution_metrics(self, dataframe, source, antype):
346 if antype ==
'binomial':
347 model_metrics = BinomialMetricMetadata()
348 elif antype ==
'multinomial':
349 model_metrics = MultinomialMetricMetadata()
350 elif antype ==
'regression':
351 model_metrics = RegressionMetricMetadata()
352 elif antype ==
'anomalies':
353 model_metrics = AnomaliesMetricMetadata()
354 elif antype ==
'clustering':
355 model_metrics = ClusteringMetricMetadata()
359 if dataframe
is not None and antype ==
'anomalies':
363 columns.append(item[0])
365 for items
in dataframe.columns:
369 difference = dataframe[x] - result1
370 anomalies_threshold = OrderedDict()
371 anomalies_threshold[
'global_mse'] = OrderedDict()
372 reconstruction = self.
_model_base.anomaly(dataframe)
373 anomalies_threshold[
'global_mse'][
'max'] = reconstruction.max()
374 anomalies_threshold[
'global_mse'][
'min'] = reconstruction.min()
376 anomalies_threshold[
'columns'] = OrderedDict()
377 for col
in difference.columns:
378 anomalies_threshold[
'columns'][col] = OrderedDict()
379 anomalies_threshold[
'columns'][col][
'max'] = difference[col].max()
380 anomalies_threshold[
'columns'][col][
'min'] = difference[col].min()
383 return anomalies_threshold
385 elif dataframe
is not None:
386 perf_metrics = self.
_model_base.model_performance(dataframe)
388 if source ==
'valid':
389 perf_metrics = self.
_model_base.model_performance(valid=
True)
390 elif source ==
'xval':
391 perf_metrics = self.
_model_base.model_performance(xval=
True)
393 perf_metrics = self.
_model_base.model_performance(train=
True)
394 model_metrics.set_metrics(perf_metrics)
395 except H2OServerError:
407 def _generate_model_metrics(self):
409 return json.loads(self.
_model_base.summary().as_data_frame().drop(
"", axis=1).to_json(orient=
'split'),
410 object_pairs_hook=OrderedDict)
415 def _generate_importance_variables(self):
419 aux[each_value[0]] = each_value[2]
427 def _generate_scoring_history(self):
429 if model_scoring
is None:
433 return json.loads(model_scoring.drop(
"", axis=1).to_json(orient=
'split'),
434 object_pairs_hook=OrderedDict)
445 def _accuracy(self, objective, dataframe, antype, base_type, tolerance=0.0):
450 if antype ==
'regression':
452 fmin = eval(
"lambda x: x - " + str(tolerance/2))
453 fmax = eval(
"lambda x: x + " + str(tolerance/2))
454 success = (dataframe[objective].apply(fmin).asnumeric() <= \
455 prediccion[
'predict'].asnumeric())
and \
456 (prediccion[
'predict'].asnumeric() <= \
457 dataframe[objective].apply(fmax).asnumeric())
463 success = prediccion[0] == dataframe[objective]
466 if accuracy
not in [0.0, -1.0]:
467 accuracy = success.sum() / dataframe.nrows
489 def _predict_accuracy(self, objective, odataframe, dataframe, antype, tolerance=0.0):
491 dataframe_cols = odataframe.columns
494 self.
_frame_list.append(prediction_dataframe.frame_id)
495 prediccion = odataframe.cbind(prediction_dataframe)
497 prediction_columns = prediccion.columns
498 if dataframe.columns
is not None:
499 dataframe_cols.extend(dataframe.columns)
500 for element
in dataframe_cols:
502 prediction_columns.remove(element)
505 predictor_col = prediction_columns[0]
507 if objective
in dataframe.columns:
508 if antype ==
'regression':
510 fmin = eval(
"lambda x: x - " + str(tolerance/2))
511 fmax = eval(
"lambda x: x + " + str(tolerance/2))
512 success = (prediccion[objective].apply(fmin).asnumeric() <= \
513 prediccion[predictor_col].asnumeric())
and \
514 (prediccion[predictor_col].asnumeric() <=
515 prediccion[objective].apply(fmax).asnumeric())
521 success = prediccion[predictor_col] == prediccion[objective]
523 if accuracy
not in [0.0, -1.0]:
524 accuracy = success.sum() / dataframe.nrows
530 return accuracy, prediccion
538 def _predict_anomalies(self, odataframe, dataframe, anomalies_thresholds):
545 columns.append(item[0])
547 for items
in dataframe.columns:
551 difference = dataframe[x] - result1
554 anomalies = OrderedDict()
555 anomalies[
'columns'] = OrderedDict()
557 if anomalies_thresholds[
'columns'][col][
'max'] < 0:
561 if anomalies_thresholds[
'columns'][col][
'min'] < 0:
566 temp_anomalies = odataframe[difference[col] > (anomalies_thresholds[
'columns'][col][
'max'] * max)
568 difference[col] < (anomalies_thresholds[
'columns'][col][
'min'] * min)]
570 if temp_anomalies.nrows > 0:
571 anomalies[
'columns'][col] = json.loads(temp_anomalies.as_data_frame(use_pandas=
True)
572 .to_json(orient=
'records'), object_pairs_hook=OrderedDict)
574 anomalies[
'global_mse'] = OrderedDict()
575 if anomalies_thresholds[
'global_mse'][
'max'] < 0:
579 if anomalies_thresholds[
'global_mse'][
'min'] < 0:
587 anomalyframe = dataframe.cbind(anomalyframe)
591 temp_anomalies = odataframe[anomalyframe[
'Reconstruction.MSE'] > (anomalies_thresholds[
'global_mse'][
'max'] * max)
593 anomalyframe[
'Reconstruction.MSE'] < (anomalies_thresholds[
'global_mse'][
'min'] *min)]
595 if temp_anomalies.nrows > 0:
596 anomalies[
'global_mse'] = json.loads(temp_anomalies.as_data_frame(use_pandas=
True).
597 to_json(orient=
'records'), object_pairs_hook=OrderedDict)
609 def _predict_clustering(self, odataframe, dataframe, objective=None):
612 dataframe_cols = dataframe.columns
614 self.
_frame_list.append(prediction_dataframe.frame_id)
615 prediccion = odataframe.cbind(prediction_dataframe)
616 self.
_frame_list.append(prediction_dataframe.frame_id)
618 return accuracy, prediccion
623 def _generate_params(self):
625 Generate model params for this model. 626 :return (status (success 0, error 1) , OrderedDict(full_stack_parameters)) 629 full_stack_params = OrderedDict()
630 for key, values
in params.items():
631 if key
not in [
'model_id',
'training_frame',
'validation_frame',
'response_column']:
632 full_stack_params[key] = values[
'actual_value']
633 return (0, full_stack_params)
643 struct_ar = OrderedDict(json.load(algorithm_description))
646 return (
'Necesario cargar un modelo valid o ar.json valido')
648 return struct_ar[
'metrics'][source][metric]
660 def need_factor(self, atype, objective_column, training_frame=None, valid_frame=None, predict_frame=None):
661 analysis_id = self.
_ec.get_id_analysis()
662 if atype
in [
'binomial',
'multinomial']:
663 if training_frame
is not None:
664 if training_frame[objective_column].types[objective_column]
in DTYPES:
665 training_frame[objective_column] = training_frame[objective_column].asfactor()
668 ' train : ' + objective_column)
670 training_frame[objective_column] = training_frame[objective_column].ascharacter().asfactor()
672 ' train : ' + objective_column)
673 if valid_frame
is not None:
674 if valid_frame[objective_column].types[objective_column]
in DTYPES:
675 valid_frame[objective_column] = valid_frame[objective_column].asfactor()
677 ' validation : ' + objective_column)
679 valid_frame[objective_column] = valid_frame[objective_column].ascharacter().asfactor()
681 ' validation : ' + objective_column)
682 if predict_frame
is not None and objective_column
in predict_frame.columns:
683 if predict_frame[objective_column].types[objective_column]
in DTYPES:
684 predict_frame[objective_column] = predict_frame[objective_column].asfactor()
686 ' predict : ' + objective_column)
688 predict_frame[objective_column] = predict_frame[objective_column].ascharacter().asfactor()
690 ' predict : ' + objective_column)
702 if base_ns
is not None:
703 data_norm = dataframe.copy(deep=
True)
707 if not exist_objective:
708 base_ns = normalizer.filter_objective_base(normalizemd=base_ns)
709 if filtering ==
'STANDARDIZE':
710 base_ns = normalizer.filter_standardize(normalizemd=base_ns, model_id=model_id)
711 elif filtering ==
'DROP':
712 base_ns = normalizer.filter_drop_missing(normalizemd=base_ns)
713 data_norm = normalizer.normalizeDataFrame(data_norm, base_ns)
716 df_metadata.getDataFrameMetadata(dataframe=data_norm, typedf=
'pandas')
717 df_metadata_hash_value = md5(json.dumps(df_metadata).encode(
'utf-8')).hexdigest()
718 return data_norm, df_metadata, df_metadata_hash_value,
True, base_ns
721 df_metadata.getDataFrameMetadata(dataframe=dataframe, typedf=
'pandas')
722 df_metadata_hash_value = md5(json.dumps(df_metadata).encode(
'utf-8')).hexdigest()
723 return dataframe, df_metadata, df_metadata_hash_value,
False, base_ns
734 assert isinstance(training_pframe, DataFrame)
735 assert isinstance(base_ar, ArMetadata)
737 analysis_id = self.
_ec.get_id_analysis()
739 for pname, pvalue
in kwargs.items():
740 if pname ==
'filtering':
741 assert isinstance(pvalue, str)
747 objective_column = base_ar[
'objective_column']
748 if objective_column
is None:
751 train_parameters_list = [
'max_runtime_secs',
'fold_column',
752 'weights_column',
'offset_column']
756 if "test_frame" in kwargs.keys():
757 test_frame = kwargs[
'test_frame']
761 base_ns = get_model_ns(base_ar)
762 modelid = base_ar[
'model_parameters'][
'h2o'][
'model']
766 assert isinstance(base_ns, list)
or base_ns
is None 769 data_initial.getDataFrameMetadata(dataframe=training_pframe, typedf=
'pandas')
770 training_pframe, data_normalized, train_hash_value, norm_executed, base_ns = \
772 filtering=filtering, exist_objective=
True)
774 if base_ar[
'round'] == 1:
775 aux_ns =
Normalizer(self.
_ec).define_ignored_columns(data_normalized, objective_column)
776 if aux_ns
is not None:
777 base_ns.extend(aux_ns)
779 df_metadata = data_initial
780 if not norm_executed:
781 data_normalized =
None 784 str(data_initial[
'correlation'][objective_column]))
787 str(data_initial[
'correlation']))
790 df_metadata = data_normalized
791 base_ar[
'normalizations_set'] = base_ns
794 str(data_normalized[
'correlation'][objective_column]))
797 str(data_initial[
'correlation']))
798 if test_frame
is not None:
800 model_id=modelid, filtering=filtering,
801 exist_objective=
True)
803 h2o_elements = H2Olist()
804 if len(h2o_elements[h2o_elements[
'key'] ==
'train_' + analysis_id +
'_' + str(train_hash_value)]):
805 if training_pframe.count(axis=0).all() > \
806 self.
_config[
'frameworks'][
'h2o'][
'conf'][
'validation_frame_threshold']:
807 training_frame = get_frame(
'train_' + analysis_id +
'_' + str(train_hash_value))
809 valid_frame = get_frame(
'valid_' + analysis_id +
'_' + str(train_hash_value))
811 'training_frame(' + str(training_frame.nrows) +
812 ') validating_frame(' + str(valid_frame.nrows) +
')')
814 training_frame = get_frame(
'train_' + analysis_id +
'_' + str(train_hash_value))
816 'training_frame(' + str(training_frame.nrows) +
')')
817 if "test_frame" in kwargs.keys():
818 test_frame = get_frame(
'test_' + analysis_id +
'_' + str(train_hash_value))
820 'test_frame (' + str(test_frame.nrows) +
')')
822 if training_pframe.count(axis=0).all() > \
823 self.
_config[
'frameworks'][
'h2o'][
'conf'][
'validation_frame_threshold']:
824 training_frame, valid_frame = \
825 H2OFrame(python_obj=training_pframe).\
826 split_frame(ratios=[self.
_config[
'frameworks'][
'h2o'][
'conf'][
'validation_frame_ratio']],
827 destination_frames=[
'train_' + analysis_id +
'_' + str(train_hash_value),
828 'valid_' + analysis_id +
'_' + str(train_hash_value)])
830 'training_frame(' + str(training_frame.nrows) +
831 ') validating_frame(' + str(valid_frame.nrows) +
')')
836 H2OFrame(python_obj=training_pframe,
837 destination_frame=
'train_' + analysis_id +
'_' + str(train_hash_value))
839 'training_frame(' + str(training_frame.nrows) +
')')
842 if "test_frame" in kwargs.keys():
843 test_frame = H2OFrame(python_obj=test_frame,
844 destination_frame=
'test_' + analysis_id +
'_' + str(train_hash_value))
846 'test_frame (' + str(test_frame.nrows) +
')')
850 self.
need_factor(atype=base_ar[
'model_parameters'][
'h2o'][
'types'][0][
'type'],
851 training_frame=training_frame,
852 valid_frame=valid_frame,
853 predict_frame=test_frame,
854 objective_column=objective_column)
858 objective_column +
' - ' + training_frame.type(objective_column))
866 final_ar_model = copy.deepcopy(base_ar)
867 final_ar_model[
'status'] = self.
_labels[
'failed_op']
868 final_ar_model[
'model_parameters'][
'h2o'][
'id'] = cluster().version
869 model_timestamp = str(time.time())
870 final_ar_model[
'data_initial'] = data_initial
871 final_ar_model[
'data_normalized'] = data_normalized
874 model_id = modelid +
'_' + model_timestamp
877 analysis_type = base_ar[
'model_parameters'][
'h2o'][
'types'][0][
'type']
879 base_ar[
'model_parameters'][
'h2o'][
'types'][0][
'type'])
881 ''' Generating and executing Models ''' 883 x = training_frame.col_names
885 x.remove(objective_column)
887 '''Generate commands: model and model.train()''' 888 model_command = list()
889 model_command.append(modelid)
890 model_command.append(
"(")
891 model_command.append(
"training_frame=training_frame")
892 train_command = list()
895 train_command.append(
"self._model_base.train(y=\'%s\', " % objective_column)
897 train_command.append(
"self._model_base.train(")
899 train_command.append(
"training_frame=training_frame")
900 if valid_frame
is not None:
901 model_command.append(
", validation_frame=valid_frame")
902 train_command.append(
", validation_frame=valid_frame")
903 model_command.append(
", model_id=\'%s%s\'" % (model_id, self.
_get_ext()))
906 train_command.append(
", ignored_columns=%s" % str(norm.ignored_columns(base_ns)))
910 train_parameters_list)
911 model_command.append(
")")
912 model_command =
''.join(model_command)
913 train_command.append(
")")
914 train_command =
''.join(train_command)
921 for each_storage_type
in final_ar_model[
'log_path'].get_log_path():
922 log_path = base_path + each_storage_type[
'value'] +
'/' + model_id +
'.log' 923 final_ar_model[
'log_path'].append(value=log_path, fstype=each_storage_type[
'type'],
924 hash_type=each_storage_type[
'hash_type'])
925 self.
_persistence.mkdir(type=final_ar_model[
'log_path'][0][
'type'],
926 grants=self.
_config[
'storage'][
'grants'],
927 path=path.dirname(final_ar_model[
'log_path'][0][
'value']))
928 connection().start_logging(final_ar_model[
'log_path'][0][
'value'])
934 final_ar_model[
'status'] =
'Executed' 936 except OSError
as execution_error:
939 repr(execution_error))
942 final_ar_model[
'model_parameters'][
'h2o'][
'parameters'][
'nfolds']
948 final_ar_model[
'model_parameters'][
'h2o'][
'parameters'][
'model_id'] =
ParameterMetadata()
949 final_ar_model[
'model_parameters'][
'h2o'][
'parameters'][
'model_id'].set_value(value=model_id,
952 final_ar_model[
'execution_seconds'] = time.time() - start
955 str(final_ar_model[
'execution_seconds']))
958 connection().stop_logging()
959 self.
_persistence.store_file(filename=final_ar_model[
'log_path'][0][
'value'],
960 storage_json=final_ar_model[
'log_path'])
963 final_ar_model[
'ignored_parameters'], \
976 antype=analysis_type)
977 final_ar_model[
'metrics'][
'accuracy'] = OrderedDict()
981 final_ar_model[
'metrics'][
'accuracy'][
'train'] = \
982 self.
_accuracy(objective_column, training_frame, antype=analysis_type, tolerance=tolerance,
983 base_type=training_frame.type(objective_column))
985 model_id +
' - ' + str(final_ar_model[
'metrics'][
'accuracy'][
'train']))
986 final_ar_model[
'tolerance'] = tolerance
989 final_ar_model[
'metrics'][
'accuracy'][
'train'] = 0.0
991 final_ar_model[
'metrics'][
'execution'][
'xval'] = \
994 if valid_frame
is not None:
996 final_ar_model[
'metrics'][
'execution'][
'valid'] = \
1001 final_ar_model[
'metrics'][
'accuracy'][
'valid'] = \
1002 self.
_accuracy(objective_column, valid_frame,
1003 antype=analysis_type, tolerance=tolerance,
1004 base_type=valid_frame.type(objective_column))
1007 final_ar_model[
'metrics'][
'accuracy'][
'valid'] = 0.0
1009 if test_frame
is not None:
1012 antype=analysis_type)
1015 final_ar_model[
'metrics'][
'accuracy'][
'test'] = \
1016 self.
_accuracy(objective_column, test_frame, antype=analysis_type, tolerance=tolerance,
1017 base_type=test_frame.type(objective_column))
1019 train_balance = self.
_config[
'frameworks'][
'h2o'][
'conf'][
'train_balance_metric']
1020 test_balance = 1 - train_balance
1021 final_ar_model[
'metrics'][
'accuracy'][
'combined'] = \
1022 (final_ar_model[
'metrics'][
'accuracy'][
'train']*train_balance +
1023 final_ar_model[
'metrics'][
'accuracy'][
'test']*test_balance)
1025 model_id +
' - ' + str(final_ar_model[
'metrics'][
'accuracy'][
'test']))
1028 model_id +
' - ' + str(final_ar_model[
'metrics'][
'accuracy'][
'combined']))
1031 final_ar_model[
'metrics'][
'accuracy'][
'test'] = 0.0
1032 final_ar_model[
'metrics'][
'accuracy'][
'combined'] = 0.0
1036 except Exception
as execution_error:
1038 repr(execution_error))
1040 final_ar_model[
'metrics'] = OrderedDict()
1041 final_ar_model[
'metrics'][
'accuracy'] = OrderedDict()
1042 final_ar_model[
'metrics'][
'accuracy'][
'train'] = -1.0
1043 final_ar_model[
'metrics'][
'accuracy'][
'test'] = -1.0
1044 final_ar_model[
'metrics'][
'accuracy'][
'combined'] = -1.0
1045 final_ar_model[
'metrics'][
'execution'] = OrderedDict()
1046 final_ar_model[
'metrics'][
'execution'][
'train'] = OrderedDict()
1047 final_ar_model[
'metrics'][
'execution'][
'train'][
'RMSE'] = 1e+16
1048 final_ar_model[
'metrics'][
'execution'][
'train'][
'tot_withinss'] = 1e+16
1049 final_ar_model[
'metrics'][
'execution'][
'train'][
'betweenss'] = 1e+16
1050 final_ar_model[
'metrics'][
'execution'][
'test'] = OrderedDict()
1051 final_ar_model[
'metrics'][
'execution'][
'test'][
'RMSE'] = 1e+16
1052 final_ar_model[
'metrics'][
'execution'][
'test'][
'tot_withinss'] = 1e+16
1053 final_ar_model[
'metrics'][
'execution'][
'test'][
'betweenss'] = 1e+16
1060 if analysis_type ==
'anomalies':
1063 antype=analysis_type)
1074 final_ar_model[
'status'] = self.
_labels[
'success_op']
1077 final_ar_model[
'status'] = self.
_labels[
'failed_op']
1078 final_ar_model[
'metrics'] = OrderedDict()
1079 final_ar_model[
'metrics'][
'accuracy'] = OrderedDict()
1080 final_ar_model[
'metrics'][
'accuracy'][
'train'] = -1.0
1081 final_ar_model[
'metrics'][
'accuracy'][
'test'] = -1.0
1082 final_ar_model[
'metrics'][
'accuracy'][
'combined'] = -1.0
1083 final_ar_model[
'metrics'][
'execution'] = OrderedDict()
1084 final_ar_model[
'metrics'][
'execution'][
'train'] = OrderedDict()
1085 final_ar_model[
'metrics'][
'execution'][
'train'][
'RMSE'] = 1e+16
1086 final_ar_model[
'metrics'][
'execution'][
'train'][
'tot_withinss'] = 1e+16
1087 final_ar_model[
'metrics'][
'execution'][
'train'][
'betweenss'] = 1e+16
1088 final_ar_model[
'metrics'][
'execution'][
'test'] = OrderedDict()
1089 final_ar_model[
'metrics'][
'execution'][
'test'][
'RMSE'] = 1e+16
1090 final_ar_model[
'metrics'][
'execution'][
'test'][
'tot_withinss'] = 1e+16
1091 final_ar_model[
'metrics'][
'execution'][
'test'][
'betweenss'] = 1e+16
1093 generate_json_path(self.
_ec, final_ar_model)
1094 self.
_persistence.store_json(storage_json=final_ar_model[
'json_path'], ar_json=final_ar_model)
1103 for handler
in self.
_logging.logger.handlers:
1110 final_ar_model[
'model_parameters'][
'h2o'][
'parameters'][
'nfolds'][
'value']))
1115 self.
_logging.log_exec(analysis_id,
1119 H2Oapi(
"POST /3/GarbageCollect")
1120 return analysis_id, final_ar_model
1128 fw = get_model_fw(armetadata)
1129 model_id = armetadata[
'model_parameters'][fw][
'parameters'][
'model_id'][
'value']
1131 if model_id + self.
_get_ext()
not in H2Olist()[
'key'].tolist():
1134 model_base = get_model(model_id + self.
_get_ext())
1137 armetadata[
'status'] = self.
_labels[
"success_op"]
1140 for each_storage_type
in load_storage.get_load_path():
1141 source_data = list()
1142 primary_path = self.
_config[
'storage'][each_storage_type[
'type']][
'value']
1143 source_data.append(primary_path)
1144 source_data.append(
'/')
1145 source_data.append(armetadata[
'user_id'])
1146 source_data.append(
'/')
1147 source_data.append(armetadata[
'workflow_id'])
1148 source_data.append(
'/')
1149 source_data.append(armetadata[
'model_id'])
1150 source_data.append(
'/')
1151 source_data.append(fw)
1152 source_data.append(
'/')
1153 source_data.append(armetadata[
'type'])
1154 source_data.append(
'/')
1155 source_data.append(str(armetadata[
'timestamp']))
1156 source_data.append(
'/')
1158 load_path =
''.join(source_data) + each_storage_type[
'value']+
'/' 1159 self.
_persistence.mkdir(type=each_storage_type[
'type'], path=load_path,
1160 grants=self.
_config[
'storage'][
'grants'])
1161 if each_storage_type[
'type'] ==
'hdfs':
1162 load_path = self.
_config[
'storage'][each_storage_type[
'type']][
'uri'] + load_path
1165 download_pojo(model=model_base, path=load_path, get_jar=
True)
1167 self.
_model_base.download_mojo(path=load_path, get_genmodel_jar=
True)
1169 save_model(model=model_base, path=load_path, force=
True)
1170 load_storage.append(value=load_path + model_id + self.
_get_ext(),
1171 fstype=each_storage_type[
'type'], hash_type=each_storage_type[
'hash_type'])
1174 armetadata[
'load_path'] = load_storage
1178 self.
_persistence.store_json(storage_json=armetadata[
'json_path'], ar_json=armetadata)
1189 fw = get_model_fw(armetadata)
1190 model_id = armetadata[
'model_parameters'][fw][
'parameters'][
'model_id'][
'value']
1192 load_fail, from_disk = self.
_get_model(base_ar=armetadata, base_model_id=model_id, remove_model=from_disk)
1206 def predict(self, predict_frame, base_ar, **kwargs):
1208 for pname, pvalue
in kwargs.items():
1211 remove_model =
False 1212 model_timestamp = str(time.time())
1213 self.
_ec.set_id_analysis(base_ar[
'model_id'])
1214 analysis_id = self.
_ec.get_id_analysis()
1215 base_model_id = base_ar[
'model_parameters'][
'h2o'][
'parameters'][
'model_id'][
'value'] +
'.model' 1216 model_id = base_model_id +
'_' + model_timestamp
1218 antype = base_ar[
'model_parameters'][
'h2o'][
'types'][0][
'type']
1220 modelid = base_ar[
'model_parameters'][
'h2o'][
'model']
1221 base_ns = get_model_ns(base_ar)
1224 load_fails, remove_model = self.
_get_model(base_ar, base_model_id, remove_model)
1228 self.
_labels[
"no_models"], base_model_id)
1229 base_ar[
'status'] = self.
_labels[
'failed_op']
1232 objective_column = base_ar[
'objective_column']
1234 exist_objective =
True 1235 if objective_column
is None:
1236 exist_objective =
False 1241 tolerance = base_ar[
'tolerance']
1244 data_initial.getDataFrameMetadata(dataframe=predict_frame, typedf=
'pandas')
1245 base_ar[
'data_initial'] = data_initial
1247 if objective_column
in list(predict_frame.columns.values):
1250 self.
_labels[
"cor_struct"], str(data_initial[
'correlation'][objective_column]))
1253 self.
_labels[
"cor_struct"], str(data_initial[
'correlation']))
1254 npredict_frame, data_normalized, _, norm_executed, _ = self.
execute_normalization(dataframe=predict_frame,
1258 exist_objective=
True)
1261 npredict_frame, data_normalized, _, norm_executed, _ = self.
execute_normalization(dataframe=predict_frame,
1265 exist_objective=
False)
1267 if not norm_executed:
1269 'No Normalizations Required')
1272 predict_frame = H2OFrame(python_obj=predict_frame,
1273 destination_frame=
'predict_frame_' + base_ar[
'model_id'])
1275 'test_frame (' + str(predict_frame.nrows) +
')')
1278 base_ar[
'data_normalized'] = data_normalized
1279 if objective_column
in list(npredict_frame.columns.values):
1282 str(data_normalized[
'correlation'][objective_column]))
1285 str(data_normalized[
'correlation']))
1288 npredict_frame = H2OFrame(python_obj=npredict_frame,
1289 destination_frame=
'npredict_frame_' + base_ar[
'model_id'])
1291 'test_frame (' + str(npredict_frame.nrows) +
')')
1295 self.
need_factor(atype=base_ar[
'model_parameters'][
'h2o'][
'types'][0][
'type'],
1296 objective_column=objective_column, predict_frame=npredict_frame)
1298 base_ar[
'type'] =
'predict' 1300 self.
_labels[
"action_type"], base_ar[
'type'])
1302 base_ar[
'timestamp'] = model_timestamp
1304 for each_storage_type
in base_ar[
'log_path']:
1305 each_storage_type[
'value'] = each_storage_type[
'value'].replace(
'train',
'predict') \
1306 .replace(
'.log',
'_' + model_timestamp +
'.log')
1308 self.
_persistence.mkdir(type=base_ar[
'log_path'][0][
'type'],
1309 grants=self.
_config[
'storage'][
'grants'],
1310 path=path.dirname(base_ar[
'log_path'][0][
'value']))
1311 connection().start_logging(base_ar[
'log_path'][0][
'value'])
1314 self.
_labels[
'st_predict_model'],
1317 if objective_column
in npredict_frame.columns:
1318 objective_type = npredict_frame.type(objective_column)
1320 objective_type =
None 1325 ''' Bug Fixing 02/04/2018 1326 if predict_frame.nrows == npredict_frame.nrows: 1327 accuracy, prediction_dataframe = self._predict_accuracy(objective_column, predict_frame, npredict_frame, 1329 tolerance=tolerance, base_type=objective_type) 1331 accuracy, prediction_dataframe = self._predict_accuracy(objective_column, npredict_frame, npredict_frame, 1333 tolerance=tolerance, base_type=objective_type)''' 1335 accuracy, prediction_dataframe = self.
_predict_accuracy(objective_column, predict_frame, npredict_frame,
1336 antype=antype, tolerance=tolerance)
1337 self.
_frame_list.append(prediction_dataframe.frame_id)
1339 base_ar[
'execution_seconds'] = time.time() - start
1340 base_ar[
'tolerance'] = tolerance
1342 prediction_dataframe = prediction_dataframe.as_data_frame(use_pandas=
True)
1344 if antype ==
'anomalies':
1347 base_ar[
'metrics'][
'anomalies'])
1350 base_ar[
'metrics'][
'anomalies'])
1351 base_ar[
'execution_seconds'] = time.time() - start
1353 if antype ==
'clustering':
1358 self.
_frame_list.append(prediction_dataframe.frame_id)
1360 base_ar[
'execution_seconds'] = time.time() - start
1361 prediction_dataframe = prediction_dataframe.as_data_frame(use_pandas=
True)
1364 connection().stop_logging()
1365 self.
_persistence.store_file(filename=base_ar[
'log_path'][0][
'value'],
1366 storage_json=base_ar[
'log_path'])
1368 if not exist_objective
or objective_type
is not None:
1371 source=
None, antype=antype)
1372 if objective_column
in npredict_frame.columns:
1373 base_ar[
'metrics'][
'accuracy'][
'predict'] = accuracy
1375 base_model_id +
' - ' + str(base_ar[
'metrics'][
'accuracy'][
'predict']))
1377 base_ar[
'status'] = self.
_labels[
'success_op']
1379 if antype ==
'anomalies':
1380 prediction = predict_anomalies
1382 prediction = prediction_dataframe
1385 prediction_json = OrderedDict()
1386 prediction_json[
'metadata'] = OrderedDict()
1387 prediction_json[
'metadata'][
'user_id'] = self.
_ec.get_id_user()
1388 prediction_json[
'metadata'][
'timestamp'] = model_timestamp
1389 prediction_json[
'metadata'][
'workflow_id'] = self.
_ec.get_id_workflow()
1390 prediction_json[
'metadata'][
'analysis_id'] = self.
_ec.get_id_analysis()
1391 prediction_json[
'metadata'][
'model_id'] = base_ar[
'model_parameters'][
'h2o'][
'parameters'][
'model_id'][
1395 prediction_json[
'data'] = OrderedDict()
1396 if isinstance(prediction, DataFrame):
1397 prediction_json[
'data'] = prediction.to_dict(orient=
'records')
1399 prediction_json[
'data'] = OrderedDict(prediction)
1402 generate_json_path(self.
_ec, base_ar,
'prediction')
1403 self.
_persistence.store_json(storage_json=base_ar[
'prediction_path'], ar_json=base_ar, other=prediction_json)
1407 generate_json_path(self.
_ec, base_ar)
1408 self.
_persistence.store_json(storage_json=base_ar[
'json_path'], ar_json=base_ar)
1412 for handler
in self.
_logging.logger.handlers:
1417 H2Oremove(npredict_frame)
1419 H2Oremove(predict_frame)
1421 self.
_logging.log_critical(analysis_id,
1428 self.
_logging.log_critical(analysis_id,
1431 H2Oapi(
"POST /3/GarbageCollect")
1433 return prediction, base_ar
1441 def _get_model(self, base_ar, base_model_id, remove_model):
1442 if base_model_id
in H2Olist()[
'key'].tolist():
1451 return load_fails, remove_model
1459 assert isinstance(arlist, list)
1460 except AssertionError:
1462 for armetadata
in arlist:
1463 fw = get_model_fw(armetadata)
1464 model_id = armetadata[
'model_parameters'][fw][
'parameters'][
'model_id'][
'value']+
'.model' 1465 if model_id
in H2Olist()[
'key'].tolist():
1468 remove_fails =
False 1480 for ar_metadata
in arlist:
1482 assert isinstance(ar_metadata[
'load_path'], list)
1483 except AssertionError:
1486 _, ar_metadata[
'load_path'] = persistence.remove_file(load_path=ar_metadata[
'load_path'])
1488 if len(ar_metadata[
'load_path']) == 0:
1489 ar_metadata[
'load_path'] =
None 1493 persistence.store_json(storage_json=ar_metadata[
"json_path"], ar_json=ar_metadata)
1508 for key, value
in each_model[
'parameters'].items():
1509 if value[
'seleccionable']:
1510 if isinstance(value[
'value'], str):
1511 if key
in train_parameters_list
and value
is not None:
1512 train_command.append(
", %s=\'%s\'" % (key, value[
'value']))
1514 model_command.append(
", %s=\'%s\'" % (key, value[
'value']))
1516 if key
in train_parameters_list
and value
is not None:
1517 train_command.append(
", %s=%s" % (key, value[
'value']))
1519 model_command.append(
", %s=%s" % (key, value[
'value']))
1527 if isinstance(tolerance, dict):
1528 if tolerance[
'enable_fixed']:
1529 threshold = tolerance[
'fixed']
1533 for each_column
in columns:
1534 if each_column[
"name"] == objective_column
and each_column[
"type"]
in DTYPES:
1535 min_val = float(each_column[
"min"])
1536 max_val = float(each_column[
"max"])
1537 if min_val
is None or max_val
is None:
1540 threshold = (max_val - min_val) * tolerance[
'percentage']
1542 threshold = tolerance
def _get_ext(self)
Generate the file extension for the different saving modes.
def _get_model_from_load_path(self, ar_metadata)
Load a model in H2OCluster from disk.
def execute_normalization(self, dataframe, base_ns, model_id, filtering='NONE', exist_objective=True)
Method to execute normalizations based on params.
def _get_model(self, base_ar, base_model_id, remove_model)
Internal method to get an H2Omodel from server or file transparent to user.
def _generate_scoring_history(self)
Generate model scoring_history metrics.
def get_metric(self, algorithm_description, metric, source)
Get one specific metric from the execution metrics. Not tested yet.
def remove_memory_models(self, arlist)
Method to remove list of model from server.
def _predict_accuracy(self, objective, odataframe, dataframe, antype, tolerance=0.0)
Generate accuracy metrics for model for regression assume tolerance on results equivalent to 2*tolera...
def _predict_anomalies(self, odataframe, dataframe, anomalies_thresholds)
Generate detected anomalies on dataframe.
def store_model(self, armetadata)
Method to save model to persistence layer from armetadata.
def _generate_params(self)
Generate model full values parameters for execution analysis.
def generate_commands_parameters(each_model, model_command, train_command, train_parameters_list)
Auxiliary function (procedure) to generate model and train chain parameters to execute models. Modify m...
Define all objects, functions and structs related to common utilities not associated to one concrete ...
def _predict_clustering(self, odataframe, dataframe, objective=None)
Generate detected clustering on dataframe.
Define all objects, functions and structures related to logging events on DayF product logs...
def get_tolerance(columns, objective_column, tolerance=0.0)
Auxiliary function to get the level of tolerance for regression analysis.
Class oriented to manage all messages and interaction with DayF product logs.
def _accuracy(self, objective, dataframe, antype, base_type, tolerance=0.0)
Generate accuracy metrics for model for regression assume tolerance on results equivalent to 2*tolera...
def _generate_execution_metrics(self, dataframe, source, antype)
Generate execution metrics for the correct model.
def _generate_model_metrics(self)
Generate model summary metrics.
def generate_base_path(self, base_ar, type_)
Generate base path to store all files [models, logs, json] relative to it.
def order_training(self, training_pframe, base_ar, kwargs)
Main method to execute sets of analysis and normalizations based on params.
def shutdown_cluster(cls)
Class Method for cluster shutdown.
def remove_models(self, arlist)
Method to remove list of model from server.
def _generate_importance_variables(self)
Generate variable importance metrics.
Define all objects, functions and structures related to physically store information on persistence s...
def _get_temporal_objects_ids(self, model_id, nfolds)
Generate list of models_id for internal cross-validation objects.
Define common execution base structure as OrderedDict() of common datasets in a unified way...
def predict(self, predict_frame, base_ar, kwargs)
Main method to execute predictions over training models. Take the ar.json and execute predictions i...
def get_external_model(self, ar_metadata, type)
Generate the Java model class.
def __init__(self, e_c)
Constructor. Initializes all framework variables and starts or connects to an H2O cluster. Additionally start...
def need_factor(self, atype, objective_column, training_frame=None, valid_frame=None, predict_frame=None)
Auxiliary Method to convert numerical and string columns on H2OFrame to enum (factor) for classificat...
def __del__(self)
Destructor.
Class oriented to manage normalizations on dataframes for improvements on accuracy.
def delete_frames(self)
Remove dataframes used during analysis execution.
Class to manage transient information between all persistence options and models in a unified way...
def load_model(self, armetadata)
Method to load model from persistence layer by armetadata.
def connect(self)
Connection method to the cluster. If the cluster is up, connect to it; otherwise start a new cluster...
def is_alive(self)
Check whether the cluster connection is alive.