7 Copyright (C) e2its - All Rights Reserved 8 * Unauthorized copying of this file, via any medium is strictly prohibited 9 * Proprietary and confidential 11 * This file is part of gDayF project. 13 * Written by Jose L. Sanchez <e2its.es@gmail.com>, 2016-2019 24 from collections
import OrderedDict
26 from hashlib
import md5
as md5
27 from json
import dumps
28 from copy
import deepcopy
47 def __init__(self, e_c, deep_impact=5, metric='accuracy', dataframe_name='', hash_dataframe=''):
49 self.
_labels = self.
_ec.labels.get_config()[
'messages'][
'adviser']
50 self.
_config = self.
_ec.config.get_config()[
'optimizer']
74 if objective_column
is None:
76 self.
_logging.log_exec(self.
_ec.get_id_analysis(),
'AdviserAStar',
78 str(atype) +
' (' + str(self.
deepness) +
')')
82 objective_column=objective_column,
85 return self.
analysispoc(dataframe_metadata, objective_column, amode=FAST)
86 if amode
in [FAST, NORMAL]:
87 return self.
analysisnormal(dataframe_metadata, objective_column, amode=amode)
88 elif amode
in [FAST_PARANOIAC, PARANOIAC]:
91 if amode
in [ANOMALIES]:
94 elif amode
in [CLUSTERING]:
111 fw_model_list = list()
116 for indexer
in range(0, aux_loop_controller):
119 if model[
'status'] ==
'Executed':
120 model_type = model[
'model_parameters'][get_model_fw(model)][
'model']
121 if model_type
not in best_models
and len(best_models) < self.
_config[
'adviser_L2_wide']:
123 best_models.append(model_type)
125 ''' If all optimize_models doesn't return new models 126 register it as evaluated and seleted''' 127 best_models.append(model_type)
132 fw_model_list = list()
137 for indexer
in range(0, aux_loop_controller):
140 if model[
'status'] ==
'Executed':
141 model_type = model[
'model_parameters'][get_model_fw(model)][
'model']
142 if model_type
not in best_models
and len(best_models) < self.
_config[
'adviser_normal_wide']:
145 best_models.append(model_type)
147 ''' If all optimize_models doesn't return new models 148 register it as evaluated and seleted''' 149 best_models.append(model_type)
151 '''' Modified 20/09/2017 152 # Get two most potential best models 153 fw_model_list = list() 154 for indexer in range(0, 2): 156 fw_model_list.extend(self.optimize_models(self.analysis_recommendation_order[indexer])) 159 #if fw_model_list is not None:''' 172 def analysispoc(self, dataframe_metadata, objective_column, amode):
180 fw_model_list = list()
181 for indexer
in range(0, 1):
208 fw_model_list = list()
213 for indexer
in range(0, aux_loop_controller):
217 if model[
'status'] ==
'Executed':
218 model_type = model[
'model_parameters'][get_model_fw(model)][
'model']
219 if model_type
not in best_models:
222 best_models.append(model_type)
225 ''' If all optimize_models doesn't return new models 226 pass and look for next best model on this type''' 248 fw_model_list = list()
253 for indexer
in range(0, aux_loop_controller):
257 if model[
'status'] ==
'Executed':
258 model_type = model[
'model_parameters'][get_model_fw(model)][
'model']
259 if model_type
not in best_models:
262 best_models.append(model_type)
265 ''' If all optimize_models doesn't return new models 266 pass and look for next best model on this type''' 289 fw_model_list = list()
294 for indexer
in range(0, aux_loop_controller):
298 if model[
'status'] ==
'Executed':
299 model_type = model[
'model_parameters'][get_model_fw(model)][
'model']
300 if model_type
not in best_models:
303 best_models.append(model_type)
306 ''' If all optimize_models doesn't return new models 307 pass and look for next best model on this type''' 330 fw_model_list = list()
335 for indexer
in range(0, aux_loop_controller):
339 if model[
'status'] ==
'Executed':
340 model_type = model[
'model_parameters'][get_model_fw(model)][
'model']
347 ''' If all optimize_models doesn't return new models 348 pass and look for next best model on this type''' 349 best_models.append(model_type)
362 version = self.
_ec.config.get_config()[
'common'][
'version']
363 for ar_metadata
in list_ar_metadata:
367 self.
_ec.set_id_analysis(ar_metadata[
'model_id'])
368 ar_structure[
'predecessor'] = ar_metadata[
'model_parameters'][get_model_fw(ar_metadata)] \
369 [
'parameters'][
'model_id'][
'value']
370 ar_structure[
'round'] = int(ar_metadata[
'round']) + 1
372 ar_structure[
'predecessor'] =
'root' 374 ar_structure[
'model_id'] = self.
_ec.get_id_analysis()
375 ar_structure[
'version'] = version
376 ar_structure[
'user_id'] = self.
_ec.get_id_user()
377 ar_structure[
'workflow_id'] = ar_metadata[
'workflow_id']
378 ar_structure[
'objective_column'] = ar_metadata[
'objective_column']
379 ar_structure[
'timestamp'] = self.
timestamp 380 ar_structure[
'normalizations_set'] = ar_metadata[
'normalizations_set']
383 ar_structure[
'data_initial'] = dataframe_metadata
384 ar_structure[
'data_normalized'] =
None 385 ar_structure[
'model_parameters'] = ar_metadata[
'model_parameters']
386 ar_structure[
'ignored_parameters'] =
None 387 ar_structure[
'full_parameters_stack'] =
None 388 ar_structure[
'status'] = -1
398 version = self.
_ec.config.get_config()[
'common'][
'version']
400 if objective_column
is None:
406 aux_model_list = list()
410 minimal_nmd = norm.define_minimal_norm(dataframe_metadata=dataframe_metadata,
411 objective_column=objective_column,
413 for fw, model, _
in fw_model_list:
414 aux_model_list.append((fw, model, deepcopy(minimal_nmd)))
415 fw_model_list = aux_model_list
417 self.
applicability(fw_model_list, nrows=dataframe_metadata[
'rowcount'], ncols=dataframe_metadata[
'cols'])
419 nmd = norm.define_normalizations(dataframe_metadata=dataframe_metadata,
420 objective_column=objective_column,
425 for fw, model, _
in fw_model_list:
426 if minimal_nmd
is not None and len(minimal_nmd) > 0:
427 whole_nmd = deepcopy(minimal_nmd)
428 whole_nmd.extend(deepcopy(nmd))
429 nmdlist.append((fw, model, whole_nmd))
431 nmdlist.append((fw, model, deepcopy(nmd)))
433 fw_model_list.extend(nmdlist)
435 for fw, model_params, norm_sets
in fw_model_list:
437 if not(norm_sets
is not None and len(norm_sets) > 0
and compare_sorted_list_dict(norm_sets, minimal_nmd) \
438 and model_params[
'only_standardize'])\
439 or ((norm_sets
is None or len(norm_sets) == 0)
and model_params[
'only_standardize']):
441 ar_structure[
'model_id'] = self.
_ec.get_id_analysis()
442 ar_structure[
'version'] = version
443 ar_structure[
'user_id'] = self.
_ec.get_id_user()
444 ar_structure[
'workflow_id'] = self.
_ec.get_id_workflow()
445 ar_structure[
'objective_column'] = objective_column
446 ar_structure[
'timestamp'] = self.
timestamp 447 ar_structure[
'normalizations_set'] = norm_sets
450 ar_structure[
'data_initial'] = dataframe_metadata
451 ar_structure[
'data_normalized'] =
None 452 ar_structure[
'model_parameters'] = OrderedDict()
453 ar_structure[
'model_parameters'][fw] = model_params
454 ar_structure[
'ignored_parameters'] =
None 455 ar_structure[
'full_parameters_stack'] =
None 456 ar_structure[
'predecessor'] =
'root' 457 ar_structure[
'status'] = -1
474 config = self.
_config[
'AdviserStart_rules'][
'common']
475 for each_column
in dataframe_metadata[
'columns']:
476 if each_column[
'name'] == objective_column:
478 if each_column[
'missed'] != 0:
479 cardinality = int(each_column[
'cardinality']) - 1
481 cardinality = int(each_column[
'cardinality'])
483 if cardinality == 2
and (atype ==
'binomial' or atype
is None):
484 if atype
is not None:
485 self.
_logging.log_info(self.
_ec.get_id_analysis(),
'AdviserAStar',
486 self.
_labels[
"sucess_specific"],
'%s-%s' % (cardinality, atype))
488 elif atype
is not None:
489 if atype ==
'regression':
490 self.
_logging.log_info(self.
_ec.get_id_analysis(),
'AdviserAStar',
491 self.
_labels[
"sucess_specific"],
'%s-%s' % (cardinality, atype))
493 if atype ==
'multinomial':
494 self.
_logging.log_info(self.
_ec.get_id_analysis(),
'AdviserAStar',
495 self.
_labels[
"sucess_specific"],
'%s-%s' % (cardinality, atype))
498 self.
_logging.log_info(self.
_ec.get_id_analysis(),
'AdviserAStar',
499 self.
_labels[
"failed_specific"],
'%s-%s' % (cardinality, atype))
501 if each_column[
'type']
not in DTYPES:
504 elif cardinality <= config[
'multi_cardinality_limit'] \
505 and cardinality <= (dataframe_metadata[
'rowcount']*config[
'multi_limit']):
510 self.
_logging.log_critical(self.
_ec.get_id_analysis(),
'AdviserAStar',
511 self.
_labels[
"failed_mselection"],
'%s-%s' % (cardinality, atype))
519 base = self.
_config[
'common'][
'base_increment']
521 variabilizations = df_metadata[
'rowcount'] * df_metadata[
'cols']
522 for _, pvalue
in base.items():
523 if variabilizations > pvalue[
'base']
and increment < pvalue[
'increment']:
524 increment = pvalue[
'increment']
525 self.
_logging.log_info(self.
_ec.get_id_analysis(),
'AdviserAStar', self.
_labels[
"inc_application"],
538 for fw, fw_value
in defaultframeworks.items():
539 if fw_value[
'conf'][
'enabled']:
540 wfw_module = importlib.import_module(self.
_frameworks[fw][
'conf'][
'framework_metadata_module'])
541 wfw = eval(
'wfw_module.' + self.
_frameworks[fw][
'conf'][
'framework_metadata_class']
542 +
'(defaultframeworks)')
543 for each_base_model
in wfw.get_default():
544 if each_base_model[
'enabled']:
545 for each_type
in each_base_model[
'types']:
546 if each_type[
'active']
and each_type[
'type'] == atype[0][
'type']:
547 model_module = importlib.import_module(self.
_frameworks[fw][
'conf'][
'model_metadata_module'])
548 modelbase = eval(
'model_module.' + self.
_frameworks[fw][
'conf'][
'model_metadata_class']
550 model = modelbase.generate_models(each_base_model[
'model'], atype, amode, increment)
551 wfw.models.append(model)
552 model_list.append((fw, model,
None))
562 fw_config = self.
_ec.config.get_config()[
'frameworks']
563 exclude_model = list()
564 for iterator
in range(0, len(model_list)):
565 fw = model_list[iterator][0]
566 model = model_list[iterator][1]
567 if fw_config[fw][
'conf'][
'min_rows_enabled']
and (nrows < model[
'min_rows_applicability']):
568 self.
_logging.log_info(self.
_ec.get_id_analysis(),
'AdviserAStar', self.
_labels[
"exc_applicability"],
569 model[
'model'] +
' - ' +
'rows < ' +
570 str(model[
'min_rows_applicability']))
571 exclude_model.append(model_list[iterator])
572 if fw_config[fw][
'conf'][
'max_cols_enabled']
and model[
'max_cols_applicability']
is not None \
573 and(ncols > model[
'max_cols_applicability']):
574 self.
_logging.log_info(self.
_ec.get_id_analysis(),
'AdviserAStar', self.
_labels[
"exc_applicability"],
575 model[
'model'] +
' - ' +
'cols > ' +
576 str(model[
'max_cols_applicability']))
577 exclude_model.append(model_list[iterator])
578 for model
in exclude_model:
579 model_list.remove(model)
587 return float(model[
'metrics'][
'accuracy'][
'train']),\
588 1/float(model[
'metrics'][
'execution'][
'train'][
'RMSE']),\
590 except ZeroDivisionError:
591 return float(model[
'metrics'][
'accuracy'][
'train']), \
595 return -1.0, -1.0, 1.0
597 return -1.0, -1.0, 1.0
605 return float(model[
'metrics'][
'accuracy'][
'test']),\
606 1/float(model[
'metrics'][
'execution'][
'test'][
'RMSE']),\
608 except ZeroDivisionError:
609 return float(model[
'metrics'][
'accuracy'][
'test']), \
613 return -1.0, -1.0, 1.0
615 return -1.0, -1.0, 1.0
623 return float(model[
'metrics'][
'accuracy'][
'combined']),\
624 1/float(model[
'metrics'][
'execution'][
'train'][
'RMSE']),\
626 except ZeroDivisionError:
627 return float(model[
'metrics'][
'accuracy'][
'combined']), \
631 return -1.0, -1.0, 1.0
633 return -1.0, -1.0, 1.0
640 if str(float(model[
'metrics'][
'execution'][
'train'][
'RMSE'])).lower() ==
'nan':
643 rmse = float(model[
'metrics'][
'execution'][
'train'][
'RMSE'])
646 1/float(model[
'metrics'][
'accuracy'][
'combined']),\
648 except ZeroDivisionError:
653 return 1e+16, 1e+16, 0.0
655 return 1e+16, 1e+16, 0.0
662 if str(float(model[
'metrics'][
'execution'][
'test'][
'RMSE'])).lower() ==
'nan':
665 rmse = float(model[
'metrics'][
'execution'][
'test'][
'RMSE'])
668 1/float(model[
'metrics'][
'accuracy'][
'combined']),\
670 except ZeroDivisionError:
675 return 1e+16, 1e+16, 0.0
677 return 1e+16, 1e+16, 0.0
686 return float(model[
'metrics'][
'execution'][
'train'][
'tot_withinss']), \
687 1/float(model[
'metrics'][
'execution'][
'train'][
'betweenss']), \
689 except ZeroDivisionError:
690 return float(model[
'metrics'][
'execution'][
'train'][
'tot_withinss']), \
694 return float(model[
'metrics'][
'execution'][
'train'][
'tot_withinss']), \
698 return 1e+16, 1e+16, 0.0
706 return float(model[
'metrics'][
'execution'][
'train'][
'r2']),\
707 1/float(model[
'metrics'][
'execution'][
'train'][
'RMSE']),\
709 except ZeroDivisionError:
710 return float(model[
'metrics'][
'execution'][
'train'][
'r2']), \
714 return -1.0, -1.0, 1.0
716 return -1.0, -1.0, 1.0
724 return float(model[
'metrics'][
'execution'][
'test'][
'r2']),\
725 1/float(model[
'metrics'][
'execution'][
'test'][
'RMSE']),\
727 except ZeroDivisionError:
728 return float(model[
'metrics'][
'execution'][
'test'][
'r2']), \
732 return -1.0, -1.0, 1.0
734 return -1.0, -1.0, 1.0
741 if self.
metric ==
'train_accuracy':
743 elif self.
metric ==
'test_accuracy':
745 elif self.
metric ==
'combined_accuracy':
747 elif self.
metric ==
'cdistance':
749 elif self.
metric ==
'train_rmse':
751 elif self.
metric ==
'test_rmse':
753 elif self.
metric ==
'train_r2':
754 return sorted(model_list, key=self.
get_train_r2, reverse=
True)
755 elif self.
metric ==
'test_r2':
756 return sorted(model_list, key=self.
get_test_r2, reverse=
True)
767 fw = get_model_fw(model)
768 for parm, parm_value
in model[
'model_parameters'][fw][
'parameters'].items():
769 if isinstance(parm_value, OrderedDict)
and parm !=
'model_id':
770 vector.append(parm_value[
'value'])
772 if normalization_set == [
None]:
773 norm_vector = normalization_set
775 for normalization
in normalization_set:
776 norm_vector.append(md5(dumps(normalization).encode(
'utf8')).hexdigest())
778 return fw, model[
'model_parameters'][fw][
'model'], vector, norm_vector
786 while not analyzed
and len(aux_analized_models) > 0:
787 analyzed = analyzed
or self.
compare_vectors(vector, aux_analized_models.pop())
796 return vector1[0] == vector2[0]
and vector1[1] == vector2[1] \
797 and vector1[2] == vector2[2]
and vector1[3] == vector2[3]
805 model_list.append(model)
807 self.
_logging.log_info(self.
_ec.get_id_analysis(),
'AdviserAStar', self.
_labels[
"new_vector"], str(vector))
810 self.
_logging.log_info(self.
_ec.get_id_analysis(),
'AdviserAStar', self.
_labels[
"exc_vector"], str(vector))
def analysisnormal(self, dataframe_metadata, objective_column, amode)
Method oriented to execute smart normal and fast analysis.
Class focused on execute A* based analysis on three modalities of working Fast: 1 level analysis over...
def get_train_r2(model)
Method to get the train r2 score for a generic model.
def compare_vectors(vector1, vector2)
Compare two execution vectors.
def applicability(self, model_list, nrows, ncols)
Method oriented to filter out models that are not applicable: too few rows (min_rows_applicability) or too many columns (max_cols_applicability).
def generate_vectors(self, model, normalization_set)
Store executed model base parameters to check past executions.
Define all objects, functions and structs related to common utilities not associated to one concrete ...
def get_cdistance(model)
Method get clustering distance for generic model.
def base_iteration(self, amode, dataframe_metadata, objective_column)
Method oriented to select initial candidate models.
Define all objects, functions and structures related to logging event on DayF product logs...
Class oriented to manage all messages and interaction with DayF product logs.
def get_combined_accuracy(model)
Method get averaged train and test accuracy for generic model.
def safe_append(self, model_list, model)
Check if model is previously executed.
def get_train_rmse(model)
Method get rmse for generic model.
def analysis_specific(self, dataframe_metadata, list_ar_metadata)
Method oriented to execute new analysis.
def analysisclustering(self, dataframe_metadata, objective_column, amode)
Method oriented to execute unsupervised clustering models.
def is_executed(self, vector)
Check if model has been executed or is planned to execute.
def get_test_r2(model)
Method to get the test r2 score for a generic model.
analysis_recommendation_order
def analysisanomalies(self, dataframe_metadata, objective_column, amode)
Method oriented to execute unsupervised anomalies models.
def base_specific(self, dataframe_metadata, list_ar_metadata)
Method oriented to generate specific candidate metadata.
def priorize_models(self, model_list)
Method managing scoring algorithm results params: results for Handlers (gdayf.handlers) ...
def get_size_increment(self, df_metadata)
Method oriented to analyze get increments on effort based on DF_metadata structure.
def get_analysis_objective(self, dataframe_metadata, objective_column, atype=None)
Method oriented to analyze DFmetadata and select analysis objective.
def analysisparanoiac(self, dataframe_metadata, objective_column, amode)
Method oriented to execute smart normal and fast analysis.
def get_test_accuracy(model)
Method get test accuracy for generic model.
def __init__(self, e_c, deep_impact=5, metric='accuracy', dataframe_name='', hash_dataframe='')
Constructor.
Class oriented to manage normalizations on dataframes for improvements on accuracy.
def analysispoc(self, dataframe_metadata, objective_column, amode)
Method oriented to execute poc analysis.
def get_train_accuracy(model)
Method get train accuracy for generic model.
def load_frameworks(self)
Method oriented to get frameworks default values from config.
def get_candidate_models(self, atype, amode, increment=1.0)
Method oriented to analyze choose models candidate and select analysis objective. ...
def get_test_rmse(model)
Method get test rmse for generic model.
def set_recommendations(self, dataframe_metadata, objective_column, amode=POC, atype=None)
Main method oriented to execute smart analysis.