DayF core  1.2.1.2
DayF (Decision at your Fingertips) is an AutoML freeware development framework that lets developers work with Machine Learning models without any AI expertise, simply by taking a csv dataset and the objective column
workflow.py
1 
4 
5 '''
6 Copyright (C) e2its - All Rights Reserved
7  * Unauthorized copying of this file, via any medium is strictly prohibited
8  * Proprietary and confidential
9  *
10  * This file is part of gDayF project.
11  *
12  * Written by Jose L. Sanchez <e2its.es@gmail.com>, 2016-2019
13 '''
14 
15 from gdayf.core.controller import Controller
16 from gdayf.logs.logshandler import LogsHandler
17 from gdayf.common.storagemetadata import StorageMetadata
18 from gdayf.persistence.persistencehandler import PersistenceHandler
19 from gdayf.common.utils import decode_ordered_dict_to_dataframe, xstr
20 from gdayf.common.constants import *
21 from gdayf.handlers.inputhandler import inputHandlerCSV
22 from json.decoder import JSONDecodeError
23 from json import load, dumps
24 from collections import OrderedDict
25 from pandas import DataFrame, set_option
26 from time import time
27 from pathlib import Path
28 from gdayf.core.experiment_context import Experiment_Context as E_C
29 
30 
class Workflow(object):
    ## Core class oriented to manage a pipeline of workflow executions,
    #  orchestrating train and predict actions over the gDayF core.

    ## Constructor.
    # Builds the per-user experiment context and wires up configuration,
    # message labels and the logging subsystem for this workflow instance.
    # @param self object pointer
    # @param user_id user identifier used to create the experiment context
    def __init__(self, user_id='PoC_gDayF'):
        # Timestamp string reused to build unique workflow ids and filenames
        self.timestamp = str(time())
        self._ec = E_C(user_id=user_id)
        self._config = self._ec.config.get_config()
        self._labels = self._ec.labels.get_config()['messages']['workflow']
        self._logging = LogsHandler(self._ec)
42 
49 
50  def workflow(self, datapath, workflow, prefix=None, remove_models=EACH_BEST):
51 
52  if isinstance(workflow, str):
53  file = open(workflow, 'r')
54  wf = load(file, object_hook=OrderedDict)
55  if self._ec.get_id_workflow() == 'default':
56  self._ec.set_id_workflow(Path(workflow).stem + '_' + self.timestamp)
57  else:
58  wf = workflow
59  if self._ec.get_id_workflow() == 'default':
60  self._ec.set_id_workflow(self._ec.get_id_workflow() + '_' + self.timestamp)
61 
62  for wkey, wvalue in wf.items():
63  if prefix is None:
64  #_prefix = xstr(wvalue['parameters']['objective_column'])
65  _prefix = xstr(wvalue['parameters']['objective_column']) + '_' + wkey
66  else:
67  #_prefix = prefix + '_' + xstr(wvalue['parameters']['objective_column'])
68  _prefix = prefix + '_' + xstr(wvalue['parameters']['objective_column']) + '_' + wkey
69  if wvalue['parameters']['mode'] == "train":
70  self.train_workflow(datapath=datapath, wkey=wkey, workflow=wvalue,
71  prefix=_prefix, remove_models=remove_models)
72  elif wvalue['parameters']['mode'] == "predict":
73  self.predict_workflow(datapath=datapath, wkey=wkey, workflow=wvalue,
74  prefix=_prefix, remove_models=remove_models)
75  else:
76  self._logging.log_info('gDayF', "Workflow", self._labels["nothing_to_do"])
77 
78 
85 
86  def train_workflow(self, datapath, wkey, workflow, prefix='main', remove_models=EACH_BEST):
87  set_option('display.max_rows', 500)
88  set_option('display.max_columns', 500)
89  set_option('display.width', 1000)
90 
91  wf = workflow
92  pfix = prefix
93 
94  error, dataset = self.check_path(datapath)
95  if dataset is None:
96  return error
97 
98  controller = Controller(e_c=self._ec)
99  if controller.config_checks():
100  variables = dataset.columns.tolist()
101 
102  #for wkey, wvalue in wf.items():
103  if wf["data"]["filtered_columns"] is not None:
104  for delete in wf["data"]["filtered_columns"]:
105  try:
106  variables.remove(delete)
107  except Exception:
108  self._logging.log_info('gDayF', "Workflow", self._labels["failed_var"], delete)
109  self._logging.log_info('gDayF', "Workflow", self._labels["variables_desc"], variables)
110  if wf["data"]["for_each"] is not None:
111  fe_column = wf["data"]["for_each"]
112  fe_data_exclusions = wf["data"]["for_each_exclusions"]
113  fe_filtered_data = wf["data"]["filtered_data"]
114  fe_parameters = wf["parameters"]
115  fe_next = wf["Next"]
116 
117  for each in eval('dataset.'+fe_column+'.unique()'):
118  if fe_data_exclusions is None or each not in fe_data_exclusions:
119  aux_dataset = eval('dataset[dataset.' + fe_column + '== each]')
120  pfix = xstr(prefix + '_' + str(each))
121 
122 
123  if fe_filtered_data is not None:
124  qcolumn = fe_filtered_data["column"]
125  quantile = aux_dataset[qcolumn].quantile(q=fe_filtered_data["quantile"])
126  aux_dataset = eval('aux_dataset.loc[aux_dataset.' + qcolumn + '<= ' + str(quantile) + ']')
127  pfix = xstr(pfix + '_' + str(fe_filtered_data["quantile"]))
128 
129 
130  if fe_parameters is not None:
131  source_parameters = list()
132  source_parameters.append('controller.exec_analysis(')
133  source_parameters.append('datapath=aux_dataset.loc[:, variables]')
134  for ikey, ivalue in fe_parameters.items():
135  source_parameters.append(',')
136  source_parameters.append(ikey)
137  source_parameters.append('=')
138  if isinstance(ivalue, str) and ikey != "amode":
139  source_parameters.append('\'')
140  source_parameters.append(ivalue)
141  source_parameters.append('\'')
142  else:
143  source_parameters.append(str(ivalue))
144  source_parameters.append(')')
145 
146  self._logging.log_info('gDayF', "Workflow", self._labels["desc_operation"],
147  ''.join(source_parameters))
148  status, recomendations = eval(''.join(source_parameters))
149  controller.remove_models(recomendations, mode=remove_models)
150  controller.reconstruct_execution_tree(recomendations, metric=fe_parameters['metric'],
151  store=True)
152 
153  #model_id = recomendations[0]['model_id']
154  table_model_list = controller.table_model_list(ar_list=recomendations,
155  metric=eval(fe_parameters['metric']))
156  self._logging.log_info('gDayF', 'workflow', self._labels["results"]+'\n',
157  table_model_list.to_string(justify="left"))
158 
159  #filename = self.storage_path('train', wkey + '_' + str(pfix) + '_' + 'train_performance'
160  if self._config['common']['workflow_summary_enabled']:
161  filename = self.storage_path('train', str(pfix) + '_' + 'train_performance'
162  , 'xls')
163  table_model_list.to_excel(filename, index=False, sheet_name='performance')
164  self.replicate_file('train', filename=filename)
165 
166  prediction_frame = controller.exec_prediction(datapath=aux_dataset,
167  model_file=recomendations[0]['json_path'][0]['value'])
168  try:
169  if 'predict' in prediction_frame.columns.values:
170  prediction_frame.rename(columns={"predict": wkey}, inplace=True)
171  elif 'prediction' in prediction_frame.columns.values:
172  prediction_frame.rename(columns={"prediction": wkey}, inplace=True)
173 
174  self._logging.log_info('gDayF', 'workflow', self._labels["results"]+'\n',
175  prediction_frame.to_string(index_names=False, justify="left"))
176 
177  '''filename = self.storage_path('train', wkey + '_'
178  + str(pfix) + '_' + 'prediction', 'xls')'''
179  if self._config['common']['workflow_summary_enabled']:
180  filename = self.storage_path('train', str(pfix) + '_' + 'prediction', 'xls')
181  prediction_frame.to_excel(filename, index=False, sheet_name='train_prediction')
182  self.replicate_file('train', filename=filename)
183 
184  except AttributeError as oexecution_error:
185  self._logging.log_info('gDayF', "Workflow", self._labels["failed_model"],
186  str(repr(oexecution_error)))
187 
188  try:
189  if fe_next is not None and prediction_frame is not None:
190  self.workflow(prediction_frame, fe_next, pfix, remove_models=remove_models)
191  except Exception as oexecution_error:
192  self._logging.log_critical('gDayF', "Workflow", self._labels["failed_wf"], str(fe_next))
193  else:
194  aux_dataset = dataset
195 
196  if wf["data"]["filtered_data"] is not None:
197  qcolumn = wf["data"]["filtered_data"]["column"]
198  quantile = aux_dataset[[qcolumn]].quatile([wf["data"]["filtered_data"]["quantile"]])
199  aux_dataset = aux_dataset.query('%s <= %s' % (qcolumn, quantile))
200 
201  if wf['parameters'] is not None:
202  source_parameters = list()
203  source_parameters.append('controller.exec_analysis(')
204  source_parameters.append('datapath=aux_dataset.loc[:, variables]')
205  for ikey, ivalue in wf['parameters'].items():
206  source_parameters.append(',')
207  source_parameters.append(ikey)
208  source_parameters.append('=')
209  if isinstance(ivalue, str) and ikey != "amode":
210  source_parameters.append('\'')
211  source_parameters.append(ivalue)
212  source_parameters.append('\'')
213  else:
214  source_parameters.append(str(ivalue))
215  source_parameters.append(')')
216  self._logging.log_info('gDayF', "Workflow", self._labels["desc_operation"],
217  ''.join(source_parameters))
218  status, recomendations = eval(''.join(source_parameters))
219  controller.remove_models(recomendations, mode=remove_models)
220  controller.reconstruct_execution_tree(recomendations, metric=wf['parameters']['metric'], store=True)
221 
222  model_id = recomendations[0]['model_id']
223  table_model_list = controller.table_model_list(ar_list=recomendations,
224  metric=eval(wf['parameters']['metric']))
225  self._logging.log_info('gDayF', 'workflow', self._labels["results"]+'\n',
226  table_model_list.to_string(justify="left"))
227 
228  if self._config['common']['workflow_summary_enabled']:
229  '''filename = self.storage_path('train', wkey + '_' + str(pfix) + '_'
230  + 'train_performance', 'xls')'''
231  filename = self.storage_path('train', str(pfix) + '_' + 'train_performance', 'xls')
232  table_model_list.to_excel(filename, index=False, sheet_name="performace")
233  self.replicate_file('train', filename=filename)
234 
235  prediction_frame = controller.exec_prediction(datapath=aux_dataset,
236  model_file=recomendations[0]['json_path'][0]['value'])
237  try:
238  if 'predict' in prediction_frame.columns.values:
239  prediction_frame.rename(columns={"predict": wkey}, inplace=True)
240  elif 'prediction' in prediction_frame.columns.values:
241  prediction_frame.rename(columns={"prediction": wkey}, inplace=True)
242 
243  self._logging.log_info('gDayF', 'workflow', self._labels["results"]+'\n',
244  prediction_frame.to_string(index_names=False, justify="left"))
245 
246  '''filename = self.storage_path('train', wkey + '_' + str(pfix) + '_'
247  + 'prediction', 'xls')'''
248  if self._config['common']['workflow_summary_enabled']:
249  filename = self.storage_path('train', str(pfix) + '_' + 'prediction', 'xls')
250  prediction_frame.to_excel(filename, index=False, sheet_name="train_prediction")
251  self.replicate_file('train', filename=filename)
252 
253  except AttributeError as oexecution_error:
254  self._logging.log_info('gDayF', "Workflow", self._labels["failed_model"],
255  str(repr(oexecution_error)))
256 
257  if wf['Next'] is not None and prediction_frame is not None:
258  try:
259  self.workflow(datapath=prediction_frame, workflow=wf['Next'],
260  prefix=pfix, remove_models=remove_models)
261  except Exception as oexecution_error:
262  self._logging.log_critical('gDayF', "Workflow", self._labels["failed_wf"], str(wf['Next']))
263  self._logging.log_critical('gDayF', "Workflow", self._labels["failed_wf"],
264  repr(oexecution_error))
265 
266 
267  controller.clean_handlers()
268  del controller
269 
270 
278 
279  def predict_workflow(self, datapath, wkey, workflow, prefix='main', workflow_id='default', remove_models=EACH_BEST):
280  set_option('display.height', 1000)
281  set_option('display.max_rows', 500)
282  set_option('display.max_columns', 500)
283  set_option('display.width', 1000)
284 
285  error, dataset = self.check_path(datapath)
286  if dataset is None:
287  return error
288 
289  if isinstance(workflow, str):
290  file = open(workflow, 'r')
291  wf = load(file, object_hook=OrderedDict)
292  else:
293  wf = workflow
294  pfix = xstr(prefix)
295  controller = Controller(e_c=self._ec)
296  if controller.config_checks():
297  variables = dataset.columns.tolist()
298 
299  #for wkey, wvalue in wf.items():
300  if wf["model"] is not None and \
301  (isinstance(wf["model"], str) or isinstance(wf["model"], dict)):
302 
303  if wf["data"]["filtered_columns"] is not None:
304  for delete in wf["data"]["filtered_columns"]:
305  try:
306  variables.remove(delete)
307  except Exception:
308  self._logging.log_info('gDayF', "Workflow", self._labels["failed_var"], delete)
309 
310  self._logging.log_info('gDayF', "Workflow", self._labels["variables_desc"], variables)
311 
312  if wf["data"]["for_each"] is not None:
313  fe_column = wf["data"]["for_each"]
314  fe_data_exclusions = wf["data"]["for_each_exclusions"]
315  fe_filtered_data = wf["data"]["filtered_data"]
316  fe_next = wf["Next"]
317 
318  for each in eval('dataset.' + fe_column + '.unique()'):
319  if fe_data_exclusions is None or each not in fe_data_exclusions:
320  aux_dataset = eval('dataset[dataset.' + fe_column + '== each]')
321  pfix = xstr(prefix + '_' + str(each))
322 
323  if fe_filtered_data is not None:
324  qcolumn = fe_filtered_data["column"]
325  quantile = aux_dataset[qcolumn].quantile(q=fe_filtered_data["quantile"])
326  aux_dataset = eval('aux_dataset.loc[aux_dataset.' + qcolumn + '<= ' + str(quantile) + ']')
327  pfix = xstr(pfix + '_' + str(fe_filtered_data["quantile"]))
328 
329  prediction_frame = controller.exec_prediction(datapath=aux_dataset,
330  model_file=wf["model"][str(each)])
331  try:
332  if 'predict' in prediction_frame.columns.values:
333  prediction_frame.rename(columns={"predict": wkey}, inplace=True)
334  elif 'prediction' in prediction_frame.columns.values:
335  prediction_frame.rename(columns={"prediction": wkey}, inplace=True)
336  except AttributeError:
337  self._logging.log_info('gDayF', "Workflow", self._labels["anomalies_operation"])
338 
339  self._logging.log_info('gDayF', 'workflow', self._labels["results"]+'\n',
340  prediction_frame.to_string(index_names=False, justify="left"))
341 
342  try:
343  if isinstance(prediction_frame, DataFrame) \
344  and self._config['common']['workflow_summary_enabled']:
345  '''filename = self.storage_path('predict', wkey + '_'
346  + str(pfix) + '_' + 'prediction', 'xls')'''
347  filename = self.storage_path('predict', str(pfix) + '_' +
348  str(self.timestamp) + '_' + 'prediction', 'xls')
349  prediction_frame.to_excel(filename, index=False, sheet_name="prediction")
350  self.replicate_file('predict', filename=filename)
351  elif self._config['common']['workflow_summary_enabled']:
352  for ikey, ivalue in prediction_frame['columns'].items():
353  ppDF = decode_ordered_dict_to_dataframe(ivalue)
354  if isinstance(ppDF, DataFrame):
355  '''filename = self.storage_path('predict', wkey + '_'
356  + str(pfix) + '_' + 'prediction_' + ikey, 'xls')'''
357  filename = self.storage_path('predict', str(pfix) + '_' +
358  str(self.timestamp) + '_' +
359  'prediction_' + ikey, 'xls')
360  ppDF.to_excel(filename, index=False, sheet_name="prediction")
361  self.replicate_file('predict', filename=filename)
362 
363  filename = self.storage_path('predict', str(pfix) + '_' +
364  str(self.timestamp) + '_' + '_prediction', 'json')
365  with open(filename, 'w') as f:
366  f.write(dumps(prediction_frame['global_mse']))
367  self.replicate_file('predict', filename=filename)
368  except AttributeError:
369  self._logging.log_info('gDayF', "Workflow", self._labels["anomalies_operation"],
370  prediction_frame)
371 
372  try:
373  if fe_next is not None and prediction_frame is not None:
374  self.workflow(prediction_frame, fe_next, pfix, remove_models=remove_models)
375  except Exception as oexecution_error:
376  self._logging.log_critical('gDayF', "Workflow", self._labels["failed_wf"], str(fe_next))
377  self._logging.log_critical('gDayF', "Workflow", self._labels["failed_wf"],
378  repr(oexecution_error))
379  else:
380  aux_dataset = dataset
381 
382  prediction_frame = controller.exec_prediction(datapath=aux_dataset, model_file=wf["model"])
383  if 'predict' in prediction_frame.columns.values:
384  prediction_frame.rename(columns={"predict": wkey}, inplace=True)
385  elif 'prediction' in prediction_frame.columns.values:
386  prediction_frame.rename(columns={"prediction": wkey}, inplace=True)
387 
388  self._logging.log_info('gDayF', 'workflow', self._labels["results"]+'\n',
389  prediction_frame.to_string(index_names=False, justify="left"))
390  if isinstance(prediction_frame, DataFrame) and self._config['common']['workflow_summary_enabled']:
391  filename = self.storage_path('predict', str(pfix) +
392  str(self.timestamp) + '_' + '_prediction', 'xls')
393  prediction_frame.to_excel(filename, index=False, sheet_name="prediction")
394  self.replicate_file('predict', filename=filename)
395  elif self._config['common']['workflow_summary_enabled']:
396  for ikey, ivalue in prediction_frame['columns'].items():
397  ppDF = decode_ordered_dict_to_dataframe(ivalue)
398  if isinstance(ppDF, DataFrame):
399  filename = self.storage_path('predict', str(pfix) + '_' +
400  str(self.timestamp) + '_' + 'prediction_' + ikey, 'xls')
401  ppDF.to_excel(filename, index=False, sheet_name="prediction")
402  self.replicate_file('predict', filename=filename)
403 
404  filename = self.storage_path('predict', str(pfix) + '_' +
405  str(self.timestamp) + '_' + '_prediction', 'json')
406  with open(filename, 'w') as f:
407  f.write(dumps(prediction_frame))
408  self.replicate_file('predict', filename=filename)
409 
410  if wf['Next'] is not None and prediction_frame is not None:
411  try:
412  self.workflow(datapath=prediction_frame, workflow=wf['Next'], prefix=pfix,
413  remove_models=remove_models)
414  except Exception as oexecution_error:
415  self._logging.log_critical('gDayF', "Workflow", self._labels["failed_wf"], str(wf['Next']))
416  self._logging.log_critical('gDayF', "Workflow", self._labels["failed_wf"],
417  repr(oexecution_error))
418 
419  controller.clean_handlers()
420  del controller
421 
422 
425  def check_path(self, datapath):
426  if isinstance(datapath, str):
427  try:
428  self._logging.log_info('gDayF', "Workflow", self._labels["input_param"], datapath)
429  pd_dataset = inputHandlerCSV().inputCSV(filename=datapath)
430  return None, pd_dataset.copy()
431  except [IOError, OSError, JSONDecodeError]:
432  self._logging.log_critical('gDayF', "Workflow", self._labels["failed_input"], datapath)
433  return self._labels['failed_input'], None
434  elif isinstance(datapath, DataFrame):
435  self._logging.log_info('gDayF', "Controller", self._labels["input_param"], str(datapath.shape))
436  return None, datapath
437  else:
438  self._logging.log_critical('gDayF', "Workflow", self._labels["failed_input"], datapath)
439  return self._labels['failed_input'], None
440 
441 
446  def storage_path(self, mode, filename, filetype):
447  load_storage = StorageMetadata(self._ec)
448  if self._config['common']['workflow_summary_enabled']:
449  include = True
450  else:
451  include = False
452  for each_storage_type in load_storage.get_load_path(include=include):
453  if each_storage_type['type'] == 'localfs':
454  source_data = list()
455  primary_path = self._config['storage'][each_storage_type['type']]['value']
456  source_data.append(primary_path)
457  source_data.append('/')
458  source_data.append(self._ec.get_id_user())
459  source_data.append('/')
460  source_data.append(self._ec.get_id_workflow())
461  source_data.append('/')
462  source_data.append(self._config['common']['workflow_summary_dir'])
463  source_data.append('/')
464  source_data.append(mode)
465  source_data.append('/')
466 
467  PersistenceHandler(self._ec).mkdir(type=each_storage_type['type'],
468  path=''.join(source_data), grants=self._config['storage']['grants'])
469  source_data.append(filename)
470  source_data.append('.' + filetype)
471 
472  return ''.join(source_data)
473  return None
474 
475 
479  def replicate_file(self, mode, filename):
480  load_storage = StorageMetadata(self._ec).get_json_path()
481  persistence = PersistenceHandler(self._ec)
482  for each_storage_type in load_storage:
483  if each_storage_type['type'] in ['localfs', 'hdfs']:
484  source_data = list()
485  primary_path = self._config['storage'][each_storage_type['type']]['value']
486  source_data.append(primary_path)
487  source_data.append('/')
488  source_data.append(self._ec.get_id_user())
489  source_data.append('/')
490  source_data.append(self._ec.get_id_workflow())
491  source_data.append('/')
492  source_data.append(self._config['common']['workflow_summary_dir'])
493  source_data.append('/')
494  source_data.append(mode)
495  source_data.append('/')
496 
497  '''if each_storage_type['type'] == 'hdfs':
498  source_data = self._config['storage'][each_storage_type['type']]['uri'] + ''.join(source_data)'''
499  each_storage_type['value'] = ''.join(source_data)
500 
501  persistence.mkdir(type=each_storage_type['type'], path=each_storage_type['value'],
502  grants=self._config['storage']['grants'])
503  each_storage_type['value'] = each_storage_type['value'] + Path(filename).name
504 
505  persistence.store_file(storage_json=load_storage, filename=filename)
506  del persistence
507 
508 
509 
def storage_path(self, mode, filename, filetype)
Method managing storage path.
Definition: workflow.py:446
def replicate_file(self, mode, filename)
Method that replicates files from primary storage to the others.
Definition: workflow.py:479
Define all objects, functions and structs related to common utilities not associated to one concrete ...
Definition: utils.py:1
Define all objects, functions and structures related to logging event on DayF product logs...
Definition: logshandler.py:1
Class storage metadata format [{value: , fstype:[&#39;localfs&#39;, &#39;hdfs&#39;, &#39;mongoDB&#39;], hash_value : ""...
Class oriented to manage all messages and interaction with DayF product logs.
Definition: logshandler.py:23
def __init__(self, user_id='PoC_gDayF')
Constructor.
Definition: workflow.py:34
Define all objects, functions and structures related to physically store information on persistence s...
def train_workflow(self, datapath, wkey, workflow, prefix='main', remove_models=EACH_BEST)
Method leading train workflow executions.
Definition: workflow.py:86
Core class oriented to manage the communication and execution message passing for all components on syst...
Definition: controller.py:41
def check_path(self, datapath)
Method managing dataset load from datapath:
Definition: workflow.py:425
Core class oriented to manage pipeline of workflows execution orchestrating the execution of actions ...
Definition: workflow.py:32
def predict_workflow(self, datapath, wkey, workflow, prefix='main', workflow_id='default', remove_models=EACH_BEST)
Method leading predict workflow executions.
Definition: workflow.py:279
Define all global objects, functions and structs related with an specific experiment.
Define all objects, functions and structured related to adding storage information metadata (json str...
def workflow(self, datapath, workflow, prefix=None, remove_models=EACH_BEST)
Method leading workflow executions.
Definition: workflow.py:50
Class to manage transient information between all persistence options and models in a unified way...
Define all objects, functions and structured related to manage and execute actions over DayF core and...
Definition: controller.py:1