6 Copyright (C) e2its - All Rights Reserved 7 * Unauthorized copying of this file, via any medium is strictly prohibited 8 * Proprietary and confidential 10 * This file is part of gDayF project. 12 * Written by Jose L. Sanchez <e2its.es@gmail.com>, 2016-2019 22 from json.decoder
import JSONDecodeError
23 from json
import load, dumps
24 from collections
import OrderedDict
25 from pandas
import DataFrame, set_option
27 from pathlib
import Path
35 self.
_ec = E_C(user_id=user_id)
37 self.
_labels = self.
_ec.labels.get_config()[
'messages'][
'workflow']
def workflow(self, datapath, workflow, prefix=None, remove_models=EACH_BEST):
    """Execute a workflow definition, dispatching each entry to train or predict.

    :param datapath: input dataset (file path string or DataFrame) forwarded to sub-workflows
    :param workflow: workflow definition — path to a JSON file, or an already-parsed mapping
    :param prefix: optional prefix used when composing each entry's result prefix
    :param remove_models: model-cleanup policy forwarded to train/predict sub-workflows

    NOTE(review): reconstructed from a line-shredded source; the lines elided by
    the extraction (the `else` branch taking an in-memory workflow, the
    `prefix is None` split, and the dispatch-call heads) were re-derived from
    context — confirm against the original repository.
    """
    if isinstance(workflow, str):
        # Workflow given as a file path: parse it, preserving key order.
        # Fix: the original opened the file without ever closing it; use a
        # context manager so the handle is released.
        with open(workflow, 'r') as wf_file:
            wf = load(wf_file, object_hook=OrderedDict)
        if self._ec.get_id_workflow() == 'default':
            # Derive the workflow id from the definition filename.
            self._ec.set_id_workflow(Path(workflow).stem + '_' + self.timestamp)
    else:
        # presumably an already-parsed workflow mapping — TODO confirm
        wf = workflow
        if self._ec.get_id_workflow() == 'default':
            self._ec.set_id_workflow(self._ec.get_id_workflow() + '_' + self.timestamp)
    for wkey, wvalue in wf.items():
        # Result prefix: "<objective_column>_<entry key>", optionally under the caller's prefix.
        if prefix is None:
            _prefix = xstr(wvalue['parameters']['objective_column']) + '_' + wkey
        else:
            _prefix = prefix + '_' + xstr(wvalue['parameters']['objective_column']) + '_' + wkey
        if wvalue['parameters']['mode'] == "train":
            self.train_workflow(datapath=datapath, wkey=wkey, workflow=wvalue,
                                prefix=_prefix, remove_models=remove_models)
        elif wvalue['parameters']['mode'] == "predict":
            self.predict_workflow(datapath=datapath, wkey=wkey, workflow=wvalue,
                                  prefix=_prefix, remove_models=remove_models)
        else:
            # Unknown mode: nothing to execute for this entry.
            self._logging.log_info('gDayF', "Workflow", self._labels["nothing_to_do"])
86 def train_workflow(self, datapath, wkey, workflow, prefix='main', remove_models=EACH_BEST):
87 set_option(
'display.max_rows', 500)
88 set_option(
'display.max_columns', 500)
89 set_option(
'display.width', 1000)
99 if controller.config_checks():
100 variables = dataset.columns.tolist()
103 if wf[
"data"][
"filtered_columns"]
is not None:
104 for delete
in wf[
"data"][
"filtered_columns"]:
106 variables.remove(delete)
108 self.
_logging.log_info(
'gDayF',
"Workflow", self.
_labels[
"failed_var"], delete)
109 self.
_logging.log_info(
'gDayF',
"Workflow", self.
_labels[
"variables_desc"], variables)
110 if wf[
"data"][
"for_each"]
is not None:
111 fe_column = wf[
"data"][
"for_each"]
112 fe_data_exclusions = wf[
"data"][
"for_each_exclusions"]
113 fe_filtered_data = wf[
"data"][
"filtered_data"]
114 fe_parameters = wf[
"parameters"]
117 for each
in eval(
'dataset.'+fe_column+
'.unique()'):
118 if fe_data_exclusions
is None or each
not in fe_data_exclusions:
119 aux_dataset = eval(
'dataset[dataset.' + fe_column +
'== each]')
120 pfix = xstr(prefix +
'_' + str(each))
123 if fe_filtered_data
is not None:
124 qcolumn = fe_filtered_data[
"column"]
125 quantile = aux_dataset[qcolumn].quantile(q=fe_filtered_data[
"quantile"])
126 aux_dataset = eval(
'aux_dataset.loc[aux_dataset.' + qcolumn +
'<= ' + str(quantile) +
']')
127 pfix = xstr(pfix +
'_' + str(fe_filtered_data[
"quantile"]))
130 if fe_parameters
is not None:
131 source_parameters = list()
132 source_parameters.append(
'controller.exec_analysis(')
133 source_parameters.append(
'datapath=aux_dataset.loc[:, variables]')
134 for ikey, ivalue
in fe_parameters.items():
135 source_parameters.append(
',')
136 source_parameters.append(ikey)
137 source_parameters.append(
'=')
138 if isinstance(ivalue, str)
and ikey !=
"amode":
139 source_parameters.append(
'\'')
140 source_parameters.append(ivalue)
141 source_parameters.append(
'\'')
143 source_parameters.append(str(ivalue))
144 source_parameters.append(
')')
146 self.
_logging.log_info(
'gDayF',
"Workflow", self.
_labels[
"desc_operation"],
147 ''.join(source_parameters))
148 status, recomendations = eval(
''.join(source_parameters))
149 controller.remove_models(recomendations, mode=remove_models)
150 controller.reconstruct_execution_tree(recomendations, metric=fe_parameters[
'metric'],
154 table_model_list = controller.table_model_list(ar_list=recomendations,
155 metric=eval(fe_parameters[
'metric']))
156 self.
_logging.log_info(
'gDayF',
'workflow', self.
_labels[
"results"]+
'\n',
157 table_model_list.to_string(justify=
"left"))
160 if self.
_config[
'common'][
'workflow_summary_enabled']:
161 filename = self.
storage_path(
'train', str(pfix) +
'_' +
'train_performance' 163 table_model_list.to_excel(filename, index=
False, sheet_name=
'performance')
166 prediction_frame = controller.exec_prediction(datapath=aux_dataset,
167 model_file=recomendations[0][
'json_path'][0][
'value'])
169 if 'predict' in prediction_frame.columns.values:
170 prediction_frame.rename(columns={
"predict": wkey}, inplace=
True)
171 elif 'prediction' in prediction_frame.columns.values:
172 prediction_frame.rename(columns={
"prediction": wkey}, inplace=
True)
174 self.
_logging.log_info(
'gDayF',
'workflow', self.
_labels[
"results"]+
'\n',
175 prediction_frame.to_string(index_names=
False, justify=
"left"))
177 '''filename = self.storage_path('train', wkey + '_' 178 + str(pfix) + '_' + 'prediction', 'xls')''' 179 if self.
_config[
'common'][
'workflow_summary_enabled']:
180 filename = self.
storage_path(
'train', str(pfix) +
'_' +
'prediction',
'xls')
181 prediction_frame.to_excel(filename, index=
False, sheet_name=
'train_prediction')
184 except AttributeError
as oexecution_error:
185 self.
_logging.log_info(
'gDayF',
"Workflow", self.
_labels[
"failed_model"],
186 str(repr(oexecution_error)))
189 if fe_next
is not None and prediction_frame
is not None:
190 self.
workflow(prediction_frame, fe_next, pfix, remove_models=remove_models)
191 except Exception
as oexecution_error:
192 self.
_logging.log_critical(
'gDayF',
"Workflow", self.
_labels[
"failed_wf"], str(fe_next))
194 aux_dataset = dataset
196 if wf[
"data"][
"filtered_data"]
is not None:
197 qcolumn = wf[
"data"][
"filtered_data"][
"column"]
198 quantile = aux_dataset[[qcolumn]].quatile([wf[
"data"][
"filtered_data"][
"quantile"]])
199 aux_dataset = aux_dataset.query(
'%s <= %s' % (qcolumn, quantile))
201 if wf[
'parameters']
is not None:
202 source_parameters = list()
203 source_parameters.append(
'controller.exec_analysis(')
204 source_parameters.append(
'datapath=aux_dataset.loc[:, variables]')
205 for ikey, ivalue
in wf[
'parameters'].items():
206 source_parameters.append(
',')
207 source_parameters.append(ikey)
208 source_parameters.append(
'=')
209 if isinstance(ivalue, str)
and ikey !=
"amode":
210 source_parameters.append(
'\'')
211 source_parameters.append(ivalue)
212 source_parameters.append(
'\'')
214 source_parameters.append(str(ivalue))
215 source_parameters.append(
')')
216 self.
_logging.log_info(
'gDayF',
"Workflow", self.
_labels[
"desc_operation"],
217 ''.join(source_parameters))
218 status, recomendations = eval(
''.join(source_parameters))
219 controller.remove_models(recomendations, mode=remove_models)
220 controller.reconstruct_execution_tree(recomendations, metric=wf[
'parameters'][
'metric'], store=
True)
222 model_id = recomendations[0][
'model_id']
223 table_model_list = controller.table_model_list(ar_list=recomendations,
224 metric=eval(wf[
'parameters'][
'metric']))
225 self.
_logging.log_info(
'gDayF',
'workflow', self.
_labels[
"results"]+
'\n',
226 table_model_list.to_string(justify=
"left"))
228 if self.
_config[
'common'][
'workflow_summary_enabled']:
229 '''filename = self.storage_path('train', wkey + '_' + str(pfix) + '_' 230 + 'train_performance', 'xls')''' 231 filename = self.
storage_path(
'train', str(pfix) +
'_' +
'train_performance',
'xls')
232 table_model_list.to_excel(filename, index=
False, sheet_name=
"performace")
235 prediction_frame = controller.exec_prediction(datapath=aux_dataset,
236 model_file=recomendations[0][
'json_path'][0][
'value'])
238 if 'predict' in prediction_frame.columns.values:
239 prediction_frame.rename(columns={
"predict": wkey}, inplace=
True)
240 elif 'prediction' in prediction_frame.columns.values:
241 prediction_frame.rename(columns={
"prediction": wkey}, inplace=
True)
243 self.
_logging.log_info(
'gDayF',
'workflow', self.
_labels[
"results"]+
'\n',
244 prediction_frame.to_string(index_names=
False, justify=
"left"))
246 '''filename = self.storage_path('train', wkey + '_' + str(pfix) + '_' 247 + 'prediction', 'xls')''' 248 if self.
_config[
'common'][
'workflow_summary_enabled']:
249 filename = self.
storage_path(
'train', str(pfix) +
'_' +
'prediction',
'xls')
250 prediction_frame.to_excel(filename, index=
False, sheet_name=
"train_prediction")
253 except AttributeError
as oexecution_error:
254 self.
_logging.log_info(
'gDayF',
"Workflow", self.
_labels[
"failed_model"],
255 str(repr(oexecution_error)))
257 if wf[
'Next']
is not None and prediction_frame
is not None:
259 self.
workflow(datapath=prediction_frame, workflow=wf[
'Next'],
260 prefix=pfix, remove_models=remove_models)
261 except Exception
as oexecution_error:
262 self.
_logging.log_critical(
'gDayF',
"Workflow", self.
_labels[
"failed_wf"], str(wf[
'Next']))
263 self.
_logging.log_critical(
'gDayF',
"Workflow", self.
_labels[
"failed_wf"],
264 repr(oexecution_error))
267 controller.clean_handlers()
279 def predict_workflow(self, datapath, wkey, workflow, prefix='main', workflow_id='default', remove_models=EACH_BEST):
280 set_option(
'display.height', 1000)
281 set_option(
'display.max_rows', 500)
282 set_option(
'display.max_columns', 500)
283 set_option(
'display.width', 1000)
289 if isinstance(workflow, str):
290 file = open(workflow,
'r') 291 wf = load(file, object_hook=OrderedDict) 296 if controller.config_checks():
297 variables = dataset.columns.tolist()
300 if wf[
"model"]
is not None and \
301 (isinstance(wf[
"model"], str)
or isinstance(wf[
"model"], dict)):
303 if wf[
"data"][
"filtered_columns"]
is not None:
304 for delete
in wf[
"data"][
"filtered_columns"]:
306 variables.remove(delete)
308 self.
_logging.log_info(
'gDayF',
"Workflow", self.
_labels[
"failed_var"], delete)
310 self.
_logging.log_info(
'gDayF',
"Workflow", self.
_labels[
"variables_desc"], variables)
312 if wf[
"data"][
"for_each"]
is not None:
313 fe_column = wf[
"data"][
"for_each"]
314 fe_data_exclusions = wf[
"data"][
"for_each_exclusions"]
315 fe_filtered_data = wf[
"data"][
"filtered_data"]
318 for each
in eval(
'dataset.' + fe_column +
'.unique()'):
319 if fe_data_exclusions
is None or each
not in fe_data_exclusions:
320 aux_dataset = eval(
'dataset[dataset.' + fe_column +
'== each]')
321 pfix = xstr(prefix +
'_' + str(each))
323 if fe_filtered_data
is not None:
324 qcolumn = fe_filtered_data[
"column"]
325 quantile = aux_dataset[qcolumn].quantile(q=fe_filtered_data[
"quantile"])
326 aux_dataset = eval(
'aux_dataset.loc[aux_dataset.' + qcolumn +
'<= ' + str(quantile) +
']')
327 pfix = xstr(pfix +
'_' + str(fe_filtered_data[
"quantile"]))
329 prediction_frame = controller.exec_prediction(datapath=aux_dataset,
330 model_file=wf[
"model"][str(each)])
332 if 'predict' in prediction_frame.columns.values:
333 prediction_frame.rename(columns={
"predict": wkey}, inplace=
True)
334 elif 'prediction' in prediction_frame.columns.values:
335 prediction_frame.rename(columns={
"prediction": wkey}, inplace=
True)
336 except AttributeError:
337 self.
_logging.log_info(
'gDayF',
"Workflow", self.
_labels[
"anomalies_operation"])
339 self.
_logging.log_info(
'gDayF',
'workflow', self.
_labels[
"results"]+
'\n',
340 prediction_frame.to_string(index_names=
False, justify=
"left"))
343 if isinstance(prediction_frame, DataFrame) \
344 and self.
_config[
'common'][
'workflow_summary_enabled']:
345 '''filename = self.storage_path('predict', wkey + '_' 346 + str(pfix) + '_' + 'prediction', 'xls')''' 347 filename = self.
storage_path(
'predict', str(pfix) +
'_' +
348 str(self.
timestamp) +
'_' +
'prediction',
'xls')
349 prediction_frame.to_excel(filename, index=
False, sheet_name=
"prediction")
351 elif self.
_config[
'common'][
'workflow_summary_enabled']:
352 for ikey, ivalue
in prediction_frame[
'columns'].items():
353 ppDF = decode_ordered_dict_to_dataframe(ivalue)
354 if isinstance(ppDF, DataFrame):
355 '''filename = self.storage_path('predict', wkey + '_' 356 + str(pfix) + '_' + 'prediction_' + ikey, 'xls')''' 357 filename = self.
storage_path(
'predict', str(pfix) +
'_' +
359 'prediction_' + ikey,
'xls')
360 ppDF.to_excel(filename, index=
False, sheet_name=
"prediction")
363 filename = self.
storage_path(
'predict', str(pfix) +
'_' +
364 str(self.
timestamp) +
'_' +
'_prediction',
'json')
365 with open(filename,
'w')
as f:
366 f.write(dumps(prediction_frame[
'global_mse']))
368 except AttributeError:
369 self.
_logging.log_info(
'gDayF',
"Workflow", self.
_labels[
"anomalies_operation"],
373 if fe_next
is not None and prediction_frame
is not None:
374 self.
workflow(prediction_frame, fe_next, pfix, remove_models=remove_models)
375 except Exception
as oexecution_error:
376 self.
_logging.log_critical(
'gDayF',
"Workflow", self.
_labels[
"failed_wf"], str(fe_next))
377 self.
_logging.log_critical(
'gDayF',
"Workflow", self.
_labels[
"failed_wf"],
378 repr(oexecution_error))
380 aux_dataset = dataset
382 prediction_frame = controller.exec_prediction(datapath=aux_dataset, model_file=wf[
"model"])
383 if 'predict' in prediction_frame.columns.values:
384 prediction_frame.rename(columns={
"predict": wkey}, inplace=
True)
385 elif 'prediction' in prediction_frame.columns.values:
386 prediction_frame.rename(columns={
"prediction": wkey}, inplace=
True)
388 self.
_logging.log_info(
'gDayF',
'workflow', self.
_labels[
"results"]+
'\n',
389 prediction_frame.to_string(index_names=
False, justify=
"left"))
390 if isinstance(prediction_frame, DataFrame)
and self.
_config[
'common'][
'workflow_summary_enabled']:
392 str(self.
timestamp) +
'_' +
'_prediction',
'xls')
393 prediction_frame.to_excel(filename, index=
False, sheet_name=
"prediction")
395 elif self.
_config[
'common'][
'workflow_summary_enabled']:
396 for ikey, ivalue
in prediction_frame[
'columns'].items():
397 ppDF = decode_ordered_dict_to_dataframe(ivalue)
398 if isinstance(ppDF, DataFrame):
399 filename = self.
storage_path(
'predict', str(pfix) +
'_' +
400 str(self.
timestamp) +
'_' +
'prediction_' + ikey,
'xls')
401 ppDF.to_excel(filename, index=
False, sheet_name=
"prediction")
404 filename = self.
storage_path(
'predict', str(pfix) +
'_' +
405 str(self.
timestamp) +
'_' +
'_prediction',
'json')
406 with open(filename,
'w')
as f:
407 f.write(dumps(prediction_frame))
410 if wf[
'Next']
is not None and prediction_frame
is not None:
412 self.
workflow(datapath=prediction_frame, workflow=wf[
'Next'], prefix=pfix,
413 remove_models=remove_models)
414 except Exception
as oexecution_error:
415 self.
_logging.log_critical(
'gDayF',
"Workflow", self.
_labels[
"failed_wf"], str(wf[
'Next']))
416 self.
_logging.log_critical(
'gDayF',
"Workflow", self.
_labels[
"failed_wf"],
417 repr(oexecution_error))
419 controller.clean_handlers()
426 if isinstance(datapath, str):
428 self.
_logging.log_info(
'gDayF',
"Workflow", self.
_labels[
"input_param"], datapath)
430 return None, pd_dataset.copy()
431 except [IOError, OSError, JSONDecodeError]:
432 self.
_logging.log_critical(
'gDayF',
"Workflow", self.
_labels[
"failed_input"], datapath)
433 return self.
_labels[
'failed_input'],
None 434 elif isinstance(datapath, DataFrame):
435 self.
_logging.log_info(
'gDayF',
"Controller", self.
_labels[
"input_param"], str(datapath.shape))
436 return None, datapath
438 self.
_logging.log_critical(
'gDayF',
"Workflow", self.
_labels[
"failed_input"], datapath)
439 return self.
_labels[
'failed_input'],
None 448 if self.
_config[
'common'][
'workflow_summary_enabled']:
452 for each_storage_type
in load_storage.get_load_path(include=include):
453 if each_storage_type[
'type'] ==
'localfs':
455 primary_path = self.
_config[
'storage'][each_storage_type[
'type']][
'value']
456 source_data.append(primary_path)
457 source_data.append(
'/')
458 source_data.append(self.
_ec.get_id_user())
459 source_data.append(
'/')
460 source_data.append(self.
_ec.get_id_workflow())
461 source_data.append(
'/')
462 source_data.append(self.
_config[
'common'][
'workflow_summary_dir'])
463 source_data.append(
'/')
464 source_data.append(mode)
465 source_data.append(
'/')
468 path=
''.join(source_data), grants=self.
_config[
'storage'][
'grants'])
469 source_data.append(filename)
470 source_data.append(
'.' + filetype)
472 return ''.join(source_data)
482 for each_storage_type
in load_storage:
483 if each_storage_type[
'type']
in [
'localfs',
'hdfs']:
485 primary_path = self.
_config[
'storage'][each_storage_type[
'type']][
'value']
486 source_data.append(primary_path)
487 source_data.append(
'/')
488 source_data.append(self.
_ec.get_id_user())
489 source_data.append(
'/')
490 source_data.append(self.
_ec.get_id_workflow())
491 source_data.append(
'/')
492 source_data.append(self.
_config[
'common'][
'workflow_summary_dir'])
493 source_data.append(
'/')
494 source_data.append(mode)
495 source_data.append(
'/')
497 '''if each_storage_type['type'] == 'hdfs': 498 source_data = self._config['storage'][each_storage_type['type']]['uri'] + ''.join(source_data)''' 499 each_storage_type[
'value'] =
''.join(source_data)
501 persistence.mkdir(type=each_storage_type[
'type'], path=each_storage_type[
'value'],
502 grants=self.
_config[
'storage'][
'grants'])
503 each_storage_type[
'value'] = each_storage_type[
'value'] + Path(filename).name
505 persistence.store_file(storage_json=load_storage, filename=filename)
def storage_path(self, mode, filename, filetype)
Method managing storage path.
def replicate_file(self, mode, filename)
Method replicating files from the primary storage path to the others.
Define all objects, functions and structs related to common utilities not associated to one concrete ...
Define all objects, functions and structures related to logging events on DayF product logs...
Class oriented to manage all messages and interaction with DayF product logs.
def __init__(self, user_id='PoC_gDayF')
Constructor.
Define all objects, functions and structures related to physically store information on persistence s...
def train_workflow(self, datapath, wkey, workflow, prefix='main', remove_models=EACH_BEST)
Method leading train workflow executions.
Core class oriented to manage the communication and execution message passing for all components on syst...
def check_path(self, datapath)
Method managing dataset load from datapath:
Core class oriented to manage pipeline of workflows execution orchestrating the execution of actions ...
def predict_workflow(self, datapath, wkey, workflow, prefix='main', workflow_id='default', remove_models=EACH_BEST)
Method leading predict workflow executions.
Define all global objects, functions and structs related to a specific experiment.
def workflow(self, datapath, workflow, prefix=None, remove_models=EACH_BEST)
Method leading workflow executions.
Class to manage transient information between all persistence options and models in a unified way...
Define all objects, functions and structures related to managing and executing actions over DayF core and...