gDayF core 1.2.1.2
gDayF (Decision at your Fingertips) is a freeware AutoML development framework that lets developers work with Machine Learning models without any AI expertise, starting from just a CSV dataset and the objective column.
h2ohandler.py
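The listing below is the H2O backend handler of gDayF. As a quick orientation, a minimal driving sketch follows (assuming a gDayF execution context e_c and a prepared ArMetadata template base_ar; both are supplied by the gDayF core, and this wiring is illustrative rather than an official entry point):

    # Illustrative sketch only: e_c (execution context) and base_ar (ArMetadata
    # template) are assumed to come from the gDayF core, not built by hand.
    import pandas as pd

    handler = H2OHandler(e_c)            # binds config, storage and logging
    handler.connect()                    # attach to (or start) the H2O cluster
    _, result_ar = handler.order_training(pd.read_csv('dataset.csv'), base_ar)
    prediction, _ = handler.predict(pd.read_csv('new_rows.csv'), result_ar)
    handler.delete_frames()              # drop temporary H2OFrames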
'''
Copyright (C) e2its - All Rights Reserved
 * Unauthorized copying of this file, via any medium is strictly prohibited
 * Proprietary and confidential
 *
 * This file is part of gDayF project.
 *
 * Written by Jose L. Sanchez <e2its.es@gmail.com>, 2016-2019
'''

import copy
import json
import time
from collections import OrderedDict
from os import path
from pandas import DataFrame
from hashlib import md5
from pathlib import Path

from h2o import H2OFrame
from h2o import cluster
from h2o import connect
from h2o import connection
from h2o import init
from h2o import load_model
from h2o import save_model
from h2o.exceptions import H2OError, H2OServerError
from h2o.exceptions import H2OConnectionError
from h2o import ls as H2Olist
from h2o import frames as H2Oframes
from h2o import get_model
from h2o import remove as H2Oremove
from h2o import api as H2Oapi
from h2o import get_frame
from h2o import download_pojo
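# The estimator classes below are not referenced by name elsewhere in this
# file: order_training() assembles the constructor call as a string and
# instantiates it via eval(model_command), so these imports must stay in scope.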
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.naive_bayes import H2ONaiveBayesEstimator
from h2o.estimators.deeplearning import H2OAutoEncoderEstimator
from h2o.estimators.kmeans import H2OKMeansEstimator

from gdayf.common.normalizationset import NormalizationSet
from gdayf.common.constants import DTYPES
from gdayf.common.storagemetadata import StorageMetadata
from gdayf.common.utils import hash_key
from gdayf.logs.logshandler import LogsHandler
from gdayf.handler_metrics.h2obinomialmetricmetadata import H2OBinomialMetricMetadata as BinomialMetricMetadata
from gdayf.metrics.metricmetadata import MetricMetadata
from gdayf.metrics.executionmetriccollection import ExecutionMetricCollection
from gdayf.handler_metrics.h2oregressionmetricmetadata import H2ORegressionMetricMetadata as RegressionMetricMetadata
from gdayf.handler_metrics.h2omultinomialmetricmetadata import H2OMultinomialMetricMetadata as MultinomialMetricMetadata
from gdayf.handler_metrics.h2oanomaliesmetricmetadata import H2OAnomaliesMetricMetadata as AnomaliesMetricMetadata
from gdayf.handler_metrics.h2oclusteringmetricmetadata import H2OClusteringMetricMetadata as ClusteringMetricMetadata
from gdayf.persistence.persistencehandler import PersistenceHandler
from gdayf.common.dfmetada import DFMetada
from gdayf.common.utils import get_model_ns
from gdayf.common.armetadata import ArMetadata
from gdayf.models.parametersmetadata import ParameterMetadata
from gdayf.normalizer.normalizer import Normalizer
from gdayf.common.utils import get_model_fw
from gdayf.common.storagemetadata import generate_json_path


class H2OHandler(object):
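    '''
    gDayF handler for the H2O.ai framework: manages the H2O cluster session,
    turns pandas DataFrames into H2OFrames, builds and trains estimators
    (GBM, GLM, DeepLearning, RandomForest, NaiveBayes, AutoEncoder, KMeans),
    generates execution and model metrics, and persists models and ar.json
    results through PersistenceHandler and StorageMetadata.
    '''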

    def __init__(self, e_c):
        self._ec = e_c
        self._framework = 'h2o'
        self._config = self._ec.config.get_config()
        self._labels = self._ec.labels.get_config()['messages']['corehandler']
        self.localfs = self._config['storage']['localfs']['value']
        self.hdfs = self._config['storage']['hdfs']['value']
        self.mongoDB = self._config['storage']['mongoDB']['value']
        self.primary_path = self._config['storage'][self._config['storage']['primary_path']]['value']
        self.url = self._config['frameworks'][self._framework]['conf']['url']
        self.nthreads = self._config['frameworks'][self._framework]['conf']['nthreads']
        self.ice_root = self._config['frameworks'][self._framework]['conf']['ice_root']
        self.max_mem_size = self._config['frameworks'][self._framework]['conf']['max_mem_size']
        self.start_h2o = self._config['frameworks'][self._framework]['conf']['start_h2o']
        self._debug = self._config['frameworks'][self._framework]['conf']['debug']
        self._save_model = self._config['frameworks'][self._framework]['conf']['save_model']
        self._autosaved = self._config['frameworks'][self._framework]['conf']['autosaved']
        self._tolerance = self._config['frameworks'][self._framework]['conf']['tolerance']
        self._anomaly_threshold = self._config['frameworks'][self._framework]['conf']['anomaly_threshold']
        self._model_base = None
        self._h2o_session = None
        self._persistence = PersistenceHandler(e_c=self._ec)
        self._logging = LogsHandler(e_c=self._ec, module=__name__)
        self._frame_list = list()

    def __del__(self):
        if self._h2o_session is not None and self.is_alive():
            H2Oapi("POST /3/GarbageCollect")
            self._h2o_session.close()

    @classmethod
    def shutdown_cluster(cls):
        # NOTE: the method name 'shutdown_cluster' is an assumption; the
        # original 'def' line is missing from this listing.
        try:
            cluster().shutdown()
        except:
            print('H2O-cluster not working')

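    # Attach to a running H2O cluster at the configured URL; if that fails,
    # bootstrap one via init() with the configured nthreads/ice_root/max_mem_size.
    # Returns True when this handler started the cluster itself.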
    def connect(self):
        initiated = False
        try:
            self._h2o_session = connect(url=self.url)
        except H2OError:
            try:
                init(url=self.url, nthreads=self.nthreads, ice_root=self.ice_root, max_mem_size=self.max_mem_size)
                self._h2o_session = connection()
                initiated = True
            except H2OError:
                self._logging.log_critical('gDayF', "H2OHandler", self._labels["failed_conn"])
                raise Exception
        finally:
            self._logging.log_info('gDayF', "H2OHandler", self._labels["start"])
            self._logging.log_info('gDayF', "H2OHandler", self._labels["framework"], self._framework)
            self._logging.log_info('gDayF', "H2OHandler", self._labels["sess"], self._h2o_session.session_id())
            return initiated

    def is_alive(self):
        if self._h2o_session is None:
            return False
        else:
            try:
                self._h2o_session.session_id()
            except H2OConnectionError:
                return False
            return self._h2o_session.cluster.is_running()

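    # Collect the ids of temporary H2O objects tied to a model: per-fold cross
    # validation models plus 'reconstruction_error_*' and 'predictions_*' frames.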
    def _get_temporal_objects_ids(self, model_id, nfolds):
        models_ids = list()
        if nfolds is not None:
            for iter in range(0, nfolds):
                models_ids.append(model_id + '_cv_' + str(iter + 1))
        for element in H2Oframes()['frames']:
            name = element['frame_id']['name']
            if name.find('reconstruction_error_') != -1 and name.find(model_id) != -1:
                models_ids.append(name)
            elif name.find('predictions_') != -1 and name.find(model_id) != -1:
                models_ids.append(name)
        return models_ids

    def get_external_model(self, ar_metadata, type):
        remove_model = False
        fw = get_model_fw(ar_metadata)
        model_id = ar_metadata['model_parameters'][fw]['parameters']['model_id']['value']
        analysis_id = ar_metadata['model_id']
        config = self._ec.config.get_config()['frameworks'][fw]['conf']
        base_model_id = model_id + '.model'
        load_fails, remove_model = self._get_model(ar_metadata, base_model_id, remove_model)

        if load_fails:
            self._logging.log_critical(self._h2o_session.session_id,
                                       self._labels["no_models"], ar_metadata)
            return None
        load_storage = StorageMetadata(self._ec)
        for each_storage_type in load_storage.get_load_path():
            if each_storage_type['type'] == 'localfs':
                source_data = list()
                primary_path = self._config['storage'][each_storage_type['type']]['value']
                source_data.append(primary_path)
                source_data.append('/')
                source_data.append(ar_metadata['user_id'])
                source_data.append('/')
                source_data.append(ar_metadata['workflow_id'])
                source_data.append('/')
                source_data.append(ar_metadata['model_id'])
                source_data.append('/')
                source_data.append(config['download_dir'])
                source_data.append('/')
                source_data.append(model_id)

                download_path = ''.join(source_data)

                self._logging.log_info(analysis_id, self._h2o_session.session_id,
                                       self._labels["down_path"], download_path)
                persistence = PersistenceHandler(self._ec)
                persistence.mkdir(type=each_storage_type['type'], path=str(download_path),
                                  grants=self._config['storage']['grants'])

                if type.upper() == 'MOJO':
                    try:
                        file_path = self._model_base.download_mojo(path=str(download_path), get_genmodel_jar=True)
                        self._logging.log_info(analysis_id, self._h2o_session.session_id,
                                               self._labels["success_op"], file_path)
                    except H2OError:
                        load_fails = True
                        self._logging.log_critical(analysis_id, self._h2o_session.session_id,
                                                   self._labels["failed_op"], download_path)
                else:
                    try:
                        file_path = download_pojo(self._model_base, path=str(download_path), get_jar=True)
                        self._logging.log_info(analysis_id, self._h2o_session.session_id,
                                               self._labels["success_op"], file_path)
                    except H2OError:
                        load_fails = True
                        self._logging.log_critical(analysis_id, self._h2o_session.session_id,
                                                   self._labels["failed_op"], download_path)
        try:
            if self._model_base is not None and remove_model:
                H2Oremove(self._model_base.model_id)
        except H2OError:
            self._logging.log_exec(analysis_id,
                                   self._h2o_session.session_id, self._labels["delete_objects"],
                                   self._model_base.model_id)

        return load_fails

    def _get_model_from_load_path(self, ar_metadata):
        load_fails = True
        counter_storage = 0
        # Checking file source versus hash_value

        try:
            assert isinstance(ar_metadata['load_path'], list)
        except AssertionError:
            return load_fails

        while ar_metadata['load_path'] is not None and counter_storage < len(ar_metadata['load_path']) and load_fails:

            if ar_metadata['load_path'][counter_storage]['hash_value'] is None or \
                    hash_key(ar_metadata['load_path'][counter_storage]['hash_type'],
                             ar_metadata['load_path'][counter_storage]['value']) == \
                    ar_metadata['load_path'][counter_storage]['hash_value']:
                try:
                    self._model_base = load_model(ar_metadata['load_path'][counter_storage]['value'])
                    if self._model_base is not None:
                        load_fails = False
                except H2OError:
                    self._logging.log_critical(self._ec.get_id_analysis(), self._h2o_session.session_id,
                                               self._labels["abort"], ar_metadata['load_path'][counter_storage]['value'])

                if ar_metadata['load_path'][counter_storage]['hash_value'] is not None:
                    self._logging.log_info(self._ec.get_id_analysis(), self._h2o_session.session_id, self._labels["hk_check"],
                                           ar_metadata['load_path'][counter_storage]['hash_value'] + ' - ' +
                                           hash_key(ar_metadata['load_path'][counter_storage]['hash_type'],
                                                    ar_metadata['load_path'][counter_storage]['value']))
            counter_storage += 1
        return load_fails

    def delete_frames(self):
        for frame_id in self._frame_list:
            try:
                H2Oremove(frame_id)
            except H2OError:
                self._logging.log_exec(self._ec.get_id_analysis(),
                                       self._h2o_session.session_id, self._labels["delete_frames"],
                                       frame_id)

    def generate_base_path(self, base_ar, type_):
        assert type_ in ['PoC', 'train', 'predict']
        if self.primary_path == self.mongoDB:
            return None
        elif self.primary_path == self.hdfs:
            # Generating base_path
            load_path = list()
            load_path.append(self.hdfs)
            load_path.append('/')
            load_path.append(self._framework)
            load_path.append('/')
            load_path.append(base_ar['model_id'])
            load_path.append('/')
            load_path.append(type_)
            load_path.append('/')
            load_path.append(str(base_ar['timestamp']))
            load_path.append('/')
            return ''.join(load_path)
        else:
            # Generating base_path
            load_path = list()
            load_path.append(self.localfs)
            load_path.append('/')
            load_path.append(self._framework)
            load_path.append('/')
            load_path.append(base_ar['model_id'])
            load_path.append('/')
            load_path.append(type_)
            load_path.append('/')
            load_path.append(str(base_ar['timestamp']))
            load_path.append('/')
            return ''.join(load_path)

    def _get_ext(self):
        if self._save_model == 'POJO':
            return '.pojo'
        elif self._save_model == 'MOJO':
            return '.mojo'
        else:
            return '.model'

    def _generate_execution_metrics(self, dataframe, source, antype):
        if antype == 'binomial':
            model_metrics = BinomialMetricMetadata()
        elif antype == 'multinomial':
            model_metrics = MultinomialMetricMetadata()
        elif antype == 'regression':
            model_metrics = RegressionMetricMetadata()
        elif antype == 'anomalies':
            model_metrics = AnomaliesMetricMetadata()
        elif antype == 'clustering':
            model_metrics = ClusteringMetricMetadata()
        else:
            model_metrics = MetricMetadata()
        try:
            if dataframe is not None and antype == 'anomalies':
                result1 = self._model_base.predict(dataframe)
                columns = list()
                for item in self._model_base.varimp():
                    columns.append(item[0])
                x = list()
                for items in dataframe.columns:
                    if items in columns:
                        x.append(items)

                difference = dataframe[x] - result1
                anomalies_threshold = OrderedDict()
                anomalies_threshold['global_mse'] = OrderedDict()
                reconstruction = self._model_base.anomaly(dataframe)
                anomalies_threshold['global_mse']['max'] = reconstruction.max()
                anomalies_threshold['global_mse']['min'] = reconstruction.min()

                anomalies_threshold['columns'] = OrderedDict()
                for col in difference.columns:
                    anomalies_threshold['columns'][col] = OrderedDict()
                    anomalies_threshold['columns'][col]['max'] = difference[col].max()
                    anomalies_threshold['columns'][col]['min'] = difference[col].min()

                H2Oremove(self._get_temporal_objects_ids(model_id=self._model_base.model_id, nfolds=None))
                return anomalies_threshold

            elif dataframe is not None:
                perf_metrics = self._model_base.model_performance(dataframe)
            else:
                if source == 'valid':
                    perf_metrics = self._model_base.model_performance(valid=True)
                elif source == 'xval':
                    perf_metrics = self._model_base.model_performance(xval=True)
                else:
                    perf_metrics = self._model_base.model_performance(train=True)
            model_metrics.set_metrics(perf_metrics)
        except H2OServerError:
            self._logging.log_exec(self._ec.get_id_analysis(), self._h2o_session.session_id, self._labels["gexec_metric"],
                                   self._labels["failed_op"])
            raise

        H2Oremove(self._get_temporal_objects_ids(model_id=self._model_base.model_id, nfolds=None))
        return model_metrics

    def _generate_model_metrics(self):
        # Change 27/01/2018 sprint 6
        return json.loads(self._model_base.summary().as_data_frame().drop("", axis=1).to_json(orient='split'),
                          object_pairs_hook=OrderedDict)

    def _generate_importance_variables(self):
        aux = OrderedDict()
        try:
            for each_value in self._model_base.varimp():
                aux[each_value[0]] = each_value[2]
        except TypeError:
            pass
        return aux

    def _generate_scoring_history(self):
        model_scoring = self._model_base.scoring_history()
        if model_scoring is None:
            return None
        else:
            # Change 27/01/2018 sprint 6
            return json.loads(model_scoring.drop("", axis=1).to_json(orient='split'),
                              object_pairs_hook=OrderedDict)

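    # Accuracy as the fraction of rows predicted correctly: for regression a
    # prediction counts as a hit when it falls within +-tolerance/2 of the
    # objective value; for classification it must match the objective exactly.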
    def _accuracy(self, objective, dataframe, antype, base_type, tolerance=0.0):
        accuracy = -1.0
        try:
            prediccion = self._model_base.predict(dataframe)

            if antype == 'regression':
                try:
                    fmin = eval("lambda x: x - " + str(tolerance / 2))
                    fmax = eval("lambda x: x + " + str(tolerance / 2))
                    # '&' gives the element-wise AND needed between H2OFrame comparisons
                    success = (dataframe[objective].apply(fmin).asnumeric() <=
                               prediccion['predict'].asnumeric()) & \
                              (prediccion['predict'].asnumeric() <=
                               dataframe[objective].apply(fmax).asnumeric())
                    accuracy = "Valid"
                except Exception:
                    accuracy = -1.0
            else:
                tolerance = 0.0
                success = prediccion[0] == dataframe[objective]
                accuracy = "Valid"

            if accuracy not in [0.0, -1.0]:
                accuracy = success.sum() / dataframe.nrows

            self._frame_list.append(prediccion.frame_id)

            self._logging.log_exec(self._ec.get_id_analysis(), self._h2o_session.session_id, self._labels["tolerance"],
                                   str(tolerance))
            return accuracy
        except OSError:
            self._logging.log_exec(self._ec.get_id_analysis(), self._h2o_session.session_id, self._labels["model_pacc"],
                                   self._labels["failed_op"])
            raise

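    # Same accuracy measure as _accuracy(), but computed over the original
    # dataframe cbind-ed with the prediction column; returns (accuracy, frame).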
    def _predict_accuracy(self, objective, odataframe, dataframe, antype, tolerance=0.0):
        accuracy = -1.0
        dataframe_cols = odataframe.columns

        prediction_dataframe = self._model_base.predict(dataframe)
        self._frame_list.append(prediction_dataframe.frame_id)
        prediccion = odataframe.cbind(prediction_dataframe)
        self._frame_list.append(prediccion.frame_id)
        prediction_columns = prediccion.columns
        if dataframe.columns is not None:
            dataframe_cols.extend(dataframe.columns)
        for element in dataframe_cols:
            try:
                prediction_columns.remove(element)
            except ValueError:
                pass
        predictor_col = prediction_columns[0]

        if objective in dataframe.columns:
            if antype == 'regression':
                try:
                    fmin = eval("lambda x: x - " + str(tolerance / 2))
                    fmax = eval("lambda x: x + " + str(tolerance / 2))
                    # '&' gives the element-wise AND needed between H2OFrame comparisons
                    success = (prediccion[objective].apply(fmin).asnumeric() <=
                               prediccion[predictor_col].asnumeric()) & \
                              (prediccion[predictor_col].asnumeric() <=
                               prediccion[objective].apply(fmax).asnumeric())
                    accuracy = "Valid"
                except Exception:
                    accuracy = 0.0
            else:
                tolerance = 0.0
                success = prediccion[predictor_col] == prediccion[objective]
                accuracy = "Valid"
            if accuracy not in [0.0, -1.0]:
                accuracy = success.sum() / dataframe.nrows

        self._logging.log_info(self._ec.get_id_analysis(), self._h2o_session.session_id, self._labels["tolerance"],
                               str(tolerance))

        return accuracy, prediccion

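    # Flag rows as anomalous when a column reconstruction error, or the global
    # Reconstruction.MSE, leaves the [min, max] band learned at training time,
    # widened or narrowed by the configured anomaly_threshold factors.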
    def _predict_anomalies(self, odataframe, dataframe, anomalies_thresholds):

        result1 = self._model_base.predict(dataframe)
        self._frame_list.append(result1.frame_id)

        columns = list()
        for item in self._model_base.varimp():
            columns.append(item[0])
        x = list()
        for items in dataframe.columns:
            if items in columns:
                x.append(items)

        difference = dataframe[x] - result1
        self._frame_list.append(difference.frame_id)

        anomalies = OrderedDict()
        anomalies['columns'] = OrderedDict()
        for col in x:
            if anomalies_thresholds['columns'][col]['max'] < 0:
                max = (1 - self._anomaly_threshold['columns'])
            else:
                max = (1 + self._anomaly_threshold['columns'])
            if anomalies_thresholds['columns'][col]['min'] < 0:
                min = (1 + self._anomaly_threshold['columns'])
            else:
                min = (1 - self._anomaly_threshold['columns'])

            # '|' gives the element-wise OR needed to combine H2OFrame masks
            temp_anomalies = odataframe[(difference[col] > (anomalies_thresholds['columns'][col]['max'] * max))
                                        |
                                        (difference[col] < (anomalies_thresholds['columns'][col]['min'] * min))]
            self._frame_list.append(temp_anomalies.frame_id)
            if temp_anomalies.nrows > 0:
                anomalies['columns'][col] = json.loads(temp_anomalies.as_data_frame(use_pandas=True)
                                                       .to_json(orient='records'), object_pairs_hook=OrderedDict)

        anomalies['global_mse'] = OrderedDict()
        if anomalies_thresholds['global_mse']['max'] < 0:
            max = (1 - self._anomaly_threshold['global_mse'])
        else:
            max = (1 + self._anomaly_threshold['global_mse'])
        if anomalies_thresholds['global_mse']['min'] < 0:
            min = (1 + self._anomaly_threshold['global_mse'])
        else:
            min = (1 - self._anomaly_threshold['global_mse'])

        anomalyframe = self._model_base.anomaly(dataframe)
        self._frame_list.append(anomalyframe.frame_id)

        anomalyframe = dataframe.cbind(anomalyframe)
        self._frame_list.append(anomalyframe.frame_id)

        temp_anomalies = odataframe[(anomalyframe['Reconstruction.MSE'] > (anomalies_thresholds['global_mse']['max'] * max))
                                    |
                                    (anomalyframe['Reconstruction.MSE'] < (anomalies_thresholds['global_mse']['min'] * min))]
        self._frame_list.append(temp_anomalies.frame_id)
        if temp_anomalies.nrows > 0:
            anomalies['global_mse'] = json.loads(temp_anomalies.as_data_frame(use_pandas=True).
                                                 to_json(orient='records'), object_pairs_hook=OrderedDict)

        H2Oremove(self._get_temporal_objects_ids(model_id=self._model_base.model_id, nfolds=None))

        return anomalies

    def _predict_clustering(self, odataframe, dataframe, objective=None):

        accuracy = -1.0
        dataframe_cols = dataframe.columns
        prediction_dataframe = self._model_base.predict(dataframe)
        self._frame_list.append(prediction_dataframe.frame_id)
        prediccion = odataframe.cbind(prediction_dataframe)
        self._frame_list.append(prediccion.frame_id)

        return accuracy, prediccion

    def _generate_params(self):
        """
        Generate model params for this model.
        :return (status (success 0, error 1) , OrderedDict(full_stack_parameters))
        """
        params = self._model_base.get_params()
        full_stack_params = OrderedDict()
        for key, values in params.items():
            if key not in ['model_id', 'training_frame', 'validation_frame', 'response_column']:
                full_stack_params[key] = values['actual_value']
        return (0, full_stack_params)

    def get_metric(self, algorithm_description, metric, source):  # not tested
        try:
            struct_ar = OrderedDict(json.load(algorithm_description))
        except:
            self._logging.log_critical('gDayF', self._h2o_session.session_id(), self._labels["ar_error"])
            return ('A valid model or a valid ar.json must be loaded')
        try:
            return struct_ar['metrics'][source][metric]
        except KeyError:
            return 'Not Found'

    def need_factor(self, atype, objective_column, training_frame=None, valid_frame=None, predict_frame=None):
        analysis_id = self._ec.get_id_analysis()
        if atype in ['binomial', 'multinomial']:
            if training_frame is not None:
                if training_frame[objective_column].types[objective_column] in DTYPES:
                    training_frame[objective_column] = training_frame[objective_column].asfactor()
                    self._logging.log_info(analysis_id, self._h2o_session.session_id,
                                           self._labels["factoring"],
                                           ' train : ' + objective_column)
                else:
                    training_frame[objective_column] = training_frame[objective_column].ascharacter().asfactor()
                    self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["factoring"],
                                           ' train : ' + objective_column)
            if valid_frame is not None:
                if valid_frame[objective_column].types[objective_column] in DTYPES:
                    valid_frame[objective_column] = valid_frame[objective_column].asfactor()
                    self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["factoring"],
                                           ' validation : ' + objective_column)
                else:
                    valid_frame[objective_column] = valid_frame[objective_column].ascharacter().asfactor()
                    self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["factoring"],
                                           ' validation : ' + objective_column)
            if predict_frame is not None and objective_column in predict_frame.columns:
                if predict_frame[objective_column].types[objective_column] in DTYPES:
                    predict_frame[objective_column] = predict_frame[objective_column].asfactor()
                    self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["factoring"],
                                           ' predict : ' + objective_column)
                else:
                    predict_frame[objective_column] = predict_frame[objective_column].ascharacter().asfactor()
                    self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["factoring"],
                                           ' predict : ' + objective_column)

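    # Apply the normalization set to a copy of the dataframe and return
    # (dataframe, DFMetada, metadata md5 hash, executed flag, normalization set);
    # the executed flag is False when base_ns is None and nothing was changed.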
    def execute_normalization(self, dataframe, base_ns, model_id, filtering='NONE', exist_objective=True):
        if base_ns is not None:
            data_norm = dataframe.copy(deep=True)
            self._logging.log_exec(self._ec.get_id_analysis(),
                                   self._h2o_session.session_id, self._labels["exec_norm"], str(base_ns))
            normalizer = Normalizer(self._ec)
            if not exist_objective:
                base_ns = normalizer.filter_objective_base(normalizemd=base_ns)
            if filtering == 'STANDARDIZE':
                base_ns = normalizer.filter_standardize(normalizemd=base_ns, model_id=model_id)
            elif filtering == 'DROP':
                base_ns = normalizer.filter_drop_missing(normalizemd=base_ns)
            data_norm = normalizer.normalizeDataFrame(data_norm, base_ns)
            del normalizer
            df_metadata = DFMetada()
            df_metadata.getDataFrameMetadata(dataframe=data_norm, typedf='pandas')
            df_metadata_hash_value = md5(json.dumps(df_metadata).encode('utf-8')).hexdigest()
            return data_norm, df_metadata, df_metadata_hash_value, True, base_ns
        else:
            df_metadata = DFMetada()
            df_metadata.getDataFrameMetadata(dataframe=dataframe, typedf='pandas')
            df_metadata_hash_value = md5(json.dumps(df_metadata).encode('utf-8')).hexdigest()
            return dataframe, df_metadata, df_metadata_hash_value, False, base_ns

        # base_ns = json.load(normalization, object_pairs_hook=NormalizationSet)

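    # Train one H2O model described by base_ar on a pandas DataFrame: normalize,
    # build train/valid/test H2OFrames, assemble and eval() the estimator and
    # train() commands, then collect metrics and persist the resulting ar.json.
    # Returns (analysis_id, final_ar_model).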
    def order_training(self, training_pframe, base_ar, **kwargs):
        assert isinstance(training_pframe, DataFrame)
        assert isinstance(base_ar, ArMetadata)

        analysis_id = self._ec.get_id_analysis()
        filtering = 'NONE'
        for pname, pvalue in kwargs.items():
            if pname == 'filtering':
                assert isinstance(pvalue, str)
                filtering = pvalue

        # python train parameters effective
        supervised = True
        aborted = False
        objective_column = base_ar['objective_column']
        if objective_column is None:
            supervised = False

        train_parameters_list = ['max_runtime_secs', 'fold_column',
                                 'weights_column', 'offset_column']

        valid_frame = None

        if "test_frame" in kwargs.keys():
            test_frame = kwargs['test_frame']
        else:
            test_frame = None

        base_ns = get_model_ns(base_ar)
        modelid = base_ar['model_parameters']['h2o']['model']
        self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["st_analysis"], modelid)

        assert isinstance(base_ns, list) or base_ns is None
        # Applying Normalizations
        data_initial = DFMetada()
        data_initial.getDataFrameMetadata(dataframe=training_pframe, typedf='pandas')
        training_pframe, data_normalized, train_hash_value, norm_executed, base_ns = \
            self.execute_normalization(dataframe=training_pframe, base_ns=base_ns, model_id=modelid,
                                       filtering=filtering, exist_objective=True)

        if base_ar['round'] == 1:
            aux_ns = Normalizer(self._ec).define_ignored_columns(data_normalized, objective_column)
            if aux_ns is not None:
                base_ns.extend(aux_ns)

        df_metadata = data_initial
        if not norm_executed:
            data_normalized = None
            try:
                self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["cor_struct"],
                                       str(data_initial['correlation'][objective_column]))
            except KeyError:
                self._logging.log_exec(analysis_id, self._h2o_session.session_id, self._labels["cor_struct"],
                                       str(data_initial['correlation']))
        else:
            df_metadata = data_normalized
            base_ar['normalizations_set'] = base_ns
            try:
                self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["cor_struct"],
                                       str(data_normalized['correlation'][objective_column]))
            except KeyError:
                self._logging.log_exec(analysis_id, self._h2o_session.session_id, self._labels["cor_struct"],
                                       str(data_initial['correlation']))
            if test_frame is not None:
                test_frame, _, _, _, _ = self.execute_normalization(dataframe=test_frame, base_ns=base_ns,
                                                                    model_id=modelid, filtering=filtering,
                                                                    exist_objective=True)

        h2o_elements = H2Olist()
        if len(h2o_elements[h2o_elements['key'] == 'train_' + analysis_id + '_' + str(train_hash_value)]):
            if training_pframe.count(axis=0).all() > \
                    self._config['frameworks']['h2o']['conf']['validation_frame_threshold']:
                training_frame = get_frame('train_' + analysis_id + '_' + str(train_hash_value))
                valid_frame = get_frame('valid_' + analysis_id + '_' + str(train_hash_value))
                self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["getting_from_h2o"],
                                       'training_frame(' + str(training_frame.nrows) +
                                       ') validating_frame(' + str(valid_frame.nrows) + ')')
            else:
                training_frame = get_frame('train_' + analysis_id + '_' + str(train_hash_value))
                self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["getting_from_h2o"],
                                       'training_frame(' + str(training_frame.nrows) + ')')
            if "test_frame" in kwargs.keys():
                test_frame = get_frame('test_' + analysis_id + '_' + str(train_hash_value))
                self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["getting_from_h2o"],
                                       'test_frame (' + str(test_frame.nrows) + ')')
        else:
            if training_pframe.count(axis=0).all() > \
                    self._config['frameworks']['h2o']['conf']['validation_frame_threshold']:
                training_frame, valid_frame = \
                    H2OFrame(python_obj=training_pframe).\
                    split_frame(ratios=[self._config['frameworks']['h2o']['conf']['validation_frame_ratio']],
                                destination_frames=['train_' + analysis_id + '_' + str(train_hash_value),
                                                    'valid_' + analysis_id + '_' + str(train_hash_value)])
                self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["parsing_to_h2o"],
                                       'training_frame(' + str(training_frame.nrows) +
                                       ') validating_frame(' + str(valid_frame.nrows) + ')')
                self._frame_list.append(training_frame.frame_id)
                self._frame_list.append(valid_frame.frame_id)
            else:
                training_frame = \
                    H2OFrame(python_obj=training_pframe,
                             destination_frame='train_' + analysis_id + '_' + str(train_hash_value))
                self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["parsing_to_h2o"],
                                       'training_frame(' + str(training_frame.nrows) + ')')
                self._frame_list.append(training_frame.frame_id)

            if "test_frame" in kwargs.keys():
                test_frame = H2OFrame(python_obj=test_frame,
                                      destination_frame='test_' + analysis_id + '_' + str(train_hash_value))
                self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["parsing_to_h2o"],
                                       'test_frame (' + str(test_frame.nrows) + ')')
                self._frame_list.append(test_frame.frame_id)

        if supervised:
            self.need_factor(atype=base_ar['model_parameters']['h2o']['types'][0]['type'],
                             training_frame=training_frame,
                             valid_frame=valid_frame,
                             predict_frame=test_frame,
                             objective_column=objective_column)

        # Initializing base structures
        self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["objective"],
                               objective_column + ' - ' + training_frame.type(objective_column))

        tolerance = get_tolerance(df_metadata['columns'], objective_column, self._tolerance)

        # Generating base_path
        self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["action_type"], base_ar['type'])
        base_path = self.generate_base_path(base_ar, base_ar['type'])

        final_ar_model = copy.deepcopy(base_ar)
        final_ar_model['status'] = self._labels['failed_op']
        final_ar_model['model_parameters']['h2o']['id'] = cluster().version
        model_timestamp = str(time.time())
        final_ar_model['data_initial'] = data_initial
        final_ar_model['data_normalized'] = data_normalized

        model_id = modelid + '_' + model_timestamp
        self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["model_id"], model_id)

        analysis_type = base_ar['model_parameters']['h2o']['types'][0]['type']
        self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["amode"],
                               base_ar['model_parameters']['h2o']['types'][0]['type'])

        ''' Generating and executing Models '''
        # 06/06/2017: Use X less ignored_columns on train
        x = training_frame.col_names
        if supervised:
            x.remove(objective_column)

        '''Generate commands: model and model.train()'''
        model_command = list()
        model_command.append(modelid)
        model_command.append("(")
        model_command.append("training_frame=training_frame")
        train_command = list()
        # 06/06/2017: Use ignore_columns instead X on train
        if supervised:
            train_command.append("self._model_base.train(y='%s', " % objective_column)
        else:
            train_command.append("self._model_base.train(")

        train_command.append("training_frame=training_frame")
        if valid_frame is not None:
            model_command.append(", validation_frame=valid_frame")
            train_command.append(", validation_frame=valid_frame")
        model_command.append(", model_id='%s%s'" % (model_id, self._get_ext()))

        norm = Normalizer(self._ec)
        train_command.append(", ignored_columns=%s" % str(norm.ignored_columns(base_ns)))
        del norm

        generate_commands_parameters(base_ar['model_parameters']['h2o'], model_command, train_command,
                                     train_parameters_list)
        model_command.append(")")
        model_command = ''.join(model_command)
        train_command.append(")")
        train_command = ''.join(train_command)

        self._logging.log_exec(analysis_id, self._h2o_session.session_id, self._labels["gmodel"], model_command)

        # Generating model
        if self._debug:
            final_ar_model['log_path'] = StorageMetadata(self._ec)
            for each_storage_type in final_ar_model['log_path'].get_log_path():
                log_path = base_path + each_storage_type['value'] + '/' + model_id + '.log'
                final_ar_model['log_path'].append(value=log_path, fstype=each_storage_type['type'],
                                                  hash_type=each_storage_type['hash_type'])
            self._persistence.mkdir(type=final_ar_model['log_path'][0]['type'],
                                    grants=self._config['storage']['grants'],
                                    path=path.dirname(final_ar_model['log_path'][0]['value']))
            connection().start_logging(final_ar_model['log_path'][0]['value'])

        self._model_base = eval(model_command)
        start = time.time()
        try:
            eval(train_command)
            final_ar_model['status'] = 'Executed'

        except OSError as execution_error:
            aborted = True
            self._logging.log_critical(analysis_id, self._h2o_session.session_id, self._labels["abort"],
                                       repr(execution_error))
            try:
                H2Oremove(self._get_temporal_objects_ids(self._model_base.model_id,
                                                         final_ar_model['model_parameters']['h2o']['parameters']['nfolds']['value']))
            except KeyError:
                H2Oremove(self._get_temporal_objects_ids(self._model_base.model_id, None))

        # Generating aditional model parameters Model_ID
        final_ar_model['model_parameters']['h2o']['parameters']['model_id'] = ParameterMetadata()
        final_ar_model['model_parameters']['h2o']['parameters']['model_id'].set_value(value=model_id,
                                                                                      seleccionable=False,
                                                                                      type="String")
        final_ar_model['execution_seconds'] = time.time() - start
        self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["tmodel"], model_id)
        self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["exec_time"],
                               str(final_ar_model['execution_seconds']))

        if self._debug:
            connection().stop_logging()
            self._persistence.store_file(filename=final_ar_model['log_path'][0]['value'],
                                         storage_json=final_ar_model['log_path'])

        # Filling whole json ar.json
        final_ar_model['ignored_parameters'], \
            final_ar_model['full_parameters_stack'] = self._generate_params()

        if not aborted:
            try:
                # Generating execution metrics
                final_ar_model['metrics']['execution'] = ExecutionMetricCollection()

                self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["gexec_metric"], model_id)

                final_ar_model['metrics']['execution']['train'] = self._generate_execution_metrics(dataframe=None,
                                                                                                   source='train',
                                                                                                   antype=analysis_type)
                final_ar_model['metrics']['accuracy'] = OrderedDict()

                if supervised:
                    final_ar_model['metrics']['accuracy']['train'] = \
                        self._accuracy(objective_column, training_frame, antype=analysis_type, tolerance=tolerance,
                                       base_type=training_frame.type(objective_column))
                    self._logging.log_exec(analysis_id, self._h2o_session.session_id, self._labels["model_tacc"],
                                           model_id + ' - ' + str(final_ar_model['metrics']['accuracy']['train']))
                    final_ar_model['tolerance'] = tolerance
                else:
                    final_ar_model['metrics']['accuracy']['train'] = 0.0

                final_ar_model['metrics']['execution']['xval'] = \
                    self._generate_execution_metrics(dataframe=None, source='xval', antype=analysis_type)

                if valid_frame is not None:
                    final_ar_model['metrics']['execution']['valid'] = \
                        self._generate_execution_metrics(dataframe=None, source='valid', antype=analysis_type)

                    if supervised:
                        final_ar_model['metrics']['accuracy']['valid'] = \
                            self._accuracy(objective_column, valid_frame,
                                           antype=analysis_type, tolerance=tolerance,
                                           base_type=valid_frame.type(objective_column))
                    else:
                        final_ar_model['metrics']['accuracy']['valid'] = 0.0

                if test_frame is not None:
                    final_ar_model['metrics']['execution']['test'] = self._generate_execution_metrics(dataframe=test_frame,
                                                                                                      source=None,
                                                                                                      antype=analysis_type)
                    if supervised:
                        final_ar_model['metrics']['accuracy']['test'] = \
                            self._accuracy(objective_column, test_frame, antype=analysis_type, tolerance=tolerance,
                                           base_type=test_frame.type(objective_column))

                        train_balance = self._config['frameworks']['h2o']['conf']['train_balance_metric']
                        test_balance = 1 - train_balance
                        final_ar_model['metrics']['accuracy']['combined'] = \
                            (final_ar_model['metrics']['accuracy']['train'] * train_balance +
                             final_ar_model['metrics']['accuracy']['test'] * test_balance)
                        self._logging.log_exec(analysis_id, self._h2o_session.session_id, self._labels["model_pacc"],
                                               model_id + ' - ' + str(final_ar_model['metrics']['accuracy']['test']))

                        self._logging.log_exec(analysis_id, self._h2o_session.session_id, self._labels["model_cacc"],
                                               model_id + ' - ' + str(final_ar_model['metrics']['accuracy']['combined']))
                    else:
                        final_ar_model['metrics']['accuracy']['test'] = 0.0
                        final_ar_model['metrics']['accuracy']['combined'] = 0.0

                # final_ar_model['tolerance'] = tolerance

            except Exception as execution_error:
                self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["abort"],
                                       repr(execution_error))

                final_ar_model['metrics'] = OrderedDict()
                final_ar_model['metrics']['accuracy'] = OrderedDict()
                final_ar_model['metrics']['accuracy']['train'] = -1.0
                final_ar_model['metrics']['accuracy']['test'] = -1.0
                final_ar_model['metrics']['accuracy']['combined'] = -1.0
                final_ar_model['metrics']['execution'] = OrderedDict()
                final_ar_model['metrics']['execution']['train'] = OrderedDict()
                final_ar_model['metrics']['execution']['train']['RMSE'] = 1e+16
                final_ar_model['metrics']['execution']['train']['tot_withinss'] = 1e+16
                final_ar_model['metrics']['execution']['train']['betweenss'] = 1e+16
                final_ar_model['metrics']['execution']['test'] = OrderedDict()
                final_ar_model['metrics']['execution']['test']['RMSE'] = 1e+16
                final_ar_model['metrics']['execution']['test']['tot_withinss'] = 1e+16
                final_ar_model['metrics']['execution']['test']['betweenss'] = 1e+16

            # Generating model metrics
            final_ar_model['metrics']['model'] = self._generate_model_metrics()
            self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["gmodel_metric"], model_id)

            # Generating anomalies metrics
            if analysis_type == 'anomalies':
                final_ar_model['metrics']['anomalies'] = self._generate_execution_metrics(dataframe=training_frame[x],
                                                                                          source='train',
                                                                                          antype=analysis_type)
                self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["ganomaly_metric"], model_id)

            # Generating Variable importance
            final_ar_model['metrics']['var_importance'] = self._generate_importance_variables()
            self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["gvar_metric"], model_id)

            # Generating scoring_history
            final_ar_model['metrics']['scoring'] = self._generate_scoring_history()
            self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["gsco_metric"], model_id)

            final_ar_model['status'] = self._labels['success_op']

        else:
            final_ar_model['status'] = self._labels['failed_op']
            final_ar_model['metrics'] = OrderedDict()
            final_ar_model['metrics']['accuracy'] = OrderedDict()
            final_ar_model['metrics']['accuracy']['train'] = -1.0
            final_ar_model['metrics']['accuracy']['test'] = -1.0
            final_ar_model['metrics']['accuracy']['combined'] = -1.0
            final_ar_model['metrics']['execution'] = OrderedDict()
            final_ar_model['metrics']['execution']['train'] = OrderedDict()
            final_ar_model['metrics']['execution']['train']['RMSE'] = 1e+16
            final_ar_model['metrics']['execution']['train']['tot_withinss'] = 1e+16
            final_ar_model['metrics']['execution']['train']['betweenss'] = 1e+16
            final_ar_model['metrics']['execution']['test'] = OrderedDict()
            final_ar_model['metrics']['execution']['test']['RMSE'] = 1e+16
            final_ar_model['metrics']['execution']['test']['tot_withinss'] = 1e+16
            final_ar_model['metrics']['execution']['test']['betweenss'] = 1e+16

        generate_json_path(self._ec, final_ar_model)
        self._persistence.store_json(storage_json=final_ar_model['json_path'], ar_json=final_ar_model)

        self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["model_stored"], model_id)
        self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["end"], model_id)

        if self._autosaved and not aborted:
            self.store_model(armetadata=final_ar_model)
            self.remove_memory_models([final_ar_model])

        for handler in self._logging.logger.handlers:
            handler.flush()
        # Cleaning H2OCluster
        try:
            if self._model_base is not None:
                try:
                    H2Oremove(self._get_temporal_objects_ids(self._model_base.model_id,
                                                             final_ar_model['model_parameters']['h2o']['parameters']['nfolds']['value']))
                except KeyError:
                    H2Oremove(self._get_temporal_objects_ids(self._model_base.model_id, None))
                # H2Oremove(self._model_base.model_id)
        except H2OError:
            self._logging.log_exec(analysis_id,
                                   self._h2o_session.session_id, self._labels["delete_objects"],
                                   self._model_base.model_id)

        H2Oapi("POST /3/GarbageCollect")
        return analysis_id, final_ar_model

    def store_model(self, armetadata):
        saved_model = False

        fw = get_model_fw(armetadata)
        model_id = armetadata['model_parameters'][fw]['parameters']['model_id']['value']

        if model_id + self._get_ext() not in H2Olist()['key'].tolist():
            return saved_model
        else:
            model_base = get_model(model_id + self._get_ext())

        # Updating status
        armetadata['status'] = self._labels["success_op"]
        # Generating load_path
        load_storage = StorageMetadata(self._ec)
        for each_storage_type in load_storage.get_load_path():
            source_data = list()
            primary_path = self._config['storage'][each_storage_type['type']]['value']
            source_data.append(primary_path)
            source_data.append('/')
            source_data.append(armetadata['user_id'])
            source_data.append('/')
            source_data.append(armetadata['workflow_id'])
            source_data.append('/')
            source_data.append(armetadata['model_id'])
            source_data.append('/')
            source_data.append(fw)
            source_data.append('/')
            source_data.append(armetadata['type'])
            source_data.append('/')
            source_data.append(str(armetadata['timestamp']))
            source_data.append('/')

            load_path = ''.join(source_data) + each_storage_type['value'] + '/'
            self._persistence.mkdir(type=each_storage_type['type'], path=load_path,
                                    grants=self._config['storage']['grants'])
            if each_storage_type['type'] == 'hdfs':
                load_path = self._config['storage'][each_storage_type['type']]['uri'] + load_path

            if self._get_ext() == '.pojo':
                download_pojo(model=model_base, path=load_path, get_jar=True)
            elif self._get_ext() == '.mojo':
                self._model_base.download_mojo(path=load_path, get_genmodel_jar=True)
            else:
                save_model(model=model_base, path=load_path, force=True)
            load_storage.append(value=load_path + model_id + self._get_ext(),
                                fstype=each_storage_type['type'], hash_type=each_storage_type['hash_type'])
            saved_model = True

        armetadata['load_path'] = load_storage

        self._logging.log_exec(self._h2o_session.session_id, self._labels["msaved"], model_id)

        self._persistence.store_json(storage_json=armetadata['json_path'], ar_json=armetadata)
        self._logging.log_info(self._h2o_session.session_id, self._labels["model_stored"], model_id)

        return saved_model

    def load_model(self, armetadata):
        from_disk = False

        fw = get_model_fw(armetadata)
        model_id = armetadata['model_parameters'][fw]['parameters']['model_id']['value']

        load_fail, from_disk = self._get_model(base_ar=armetadata, base_model_id=model_id, remove_model=from_disk)
        if load_fail:
            return None
        else:
            return armetadata

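    # Run a prediction with a previously trained model described by base_ar:
    # load the model (from memory or disk), normalize the incoming dataframe,
    # dispatch per analysis type (supervised accuracy, anomalies, clustering),
    # and persist predict.json plus the updated ar.json.
    # Returns (prediction, base_ar).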
    def predict(self, predict_frame, base_ar, **kwargs):

        for pname, pvalue in kwargs.items():
            pass

        remove_model = False
        model_timestamp = str(time.time())
        self._ec.set_id_analysis(base_ar['model_id'])
        analysis_id = self._ec.get_id_analysis()
        base_model_id = base_ar['model_parameters']['h2o']['parameters']['model_id']['value'] + '.model'
        model_id = base_model_id + '_' + model_timestamp

        antype = base_ar['model_parameters']['h2o']['types'][0]['type']

        modelid = base_ar['model_parameters']['h2o']['model']
        base_ns = get_model_ns(base_ar)

        # Checking file source versus hash_value
        load_fails, remove_model = self._get_model(base_ar, base_model_id, remove_model)

        if load_fails or self._model_base is None:
            self._logging.log_critical(analysis_id, self._h2o_session.session_id,
                                       self._labels["no_models"], base_model_id)
            base_ar['status'] = self._labels['failed_op']  # Default Failed Operation Code
            return None

        objective_column = base_ar['objective_column']

        exist_objective = True
        if objective_column is None:
            exist_objective = False
        if exist_objective:
            self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["objective"],
                                   objective_column)
            # Recovering tolerance
            tolerance = base_ar['tolerance']

        data_initial = DFMetada()
        data_initial.getDataFrameMetadata(dataframe=predict_frame, typedf='pandas')
        base_ar['data_initial'] = data_initial

        if objective_column in list(predict_frame.columns.values):
            try:
                self._logging.log_info(analysis_id, self._h2o_session.session_id,
                                       self._labels["cor_struct"], str(data_initial['correlation'][objective_column]))
            except KeyError:
                self._logging.log_exec(analysis_id, self._h2o_session.session_id,
                                       self._labels["cor_struct"], str(data_initial['correlation']))
            npredict_frame, data_normalized, _, norm_executed, _ = self.execute_normalization(dataframe=predict_frame,
                                                                                              base_ns=base_ns,
                                                                                              model_id=modelid,
                                                                                              filtering='DROP',
                                                                                              exist_objective=True)
        else:
            npredict_frame, data_normalized, _, norm_executed, _ = self.execute_normalization(dataframe=predict_frame,
                                                                                              base_ns=base_ns,
                                                                                              model_id=modelid,
                                                                                              filtering='DROP',
                                                                                              exist_objective=False)
        if not norm_executed:
            self._logging.log_exec(analysis_id, self._h2o_session.session_id, self._labels["exec_norm"],
                                   'No Normalizations Required')
        else:
            # Transforming original dataframe to H2OFrame
            predict_frame = H2OFrame(python_obj=predict_frame,
                                     destination_frame='predict_frame_' + base_ar['model_id'])
            self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["parsing_to_h2o"],
                                   'test_frame (' + str(predict_frame.nrows) + ')')
            self._frame_list.append(predict_frame.frame_id)

            base_ar['data_normalized'] = data_normalized
            if objective_column in list(npredict_frame.columns.values):
                try:
                    self._logging.log_exec(analysis_id, self._h2o_session.session_id, self._labels["cor_struct"],
                                           str(data_normalized['correlation'][objective_column]))
                except KeyError:
                    self._logging.log_exec(analysis_id, self._h2o_session.session_id, self._labels["no_cor_struct"],
                                           str(data_normalized['correlation']))

        # Transforming to H2OFrame
        npredict_frame = H2OFrame(python_obj=npredict_frame,
                                  destination_frame='npredict_frame_' + base_ar['model_id'])
        self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["parsing_to_h2o"],
                               'test_frame (' + str(npredict_frame.nrows) + ')')
        self._frame_list.append(npredict_frame.frame_id)

        if exist_objective:
            self.need_factor(atype=base_ar['model_parameters']['h2o']['types'][0]['type'],
                             objective_column=objective_column, predict_frame=npredict_frame)

        base_ar['type'] = 'predict'
        self._logging.log_info(analysis_id, self._h2o_session.session_id,
                               self._labels["action_type"], base_ar['type'])

        base_ar['timestamp'] = model_timestamp
        if self._debug:
            for each_storage_type in base_ar['log_path']:
                each_storage_type['value'] = each_storage_type['value'].replace('train', 'predict') \
                    .replace('.log', '_' + model_timestamp + '.log')

            self._persistence.mkdir(type=base_ar['log_path'][0]['type'],
                                    grants=self._config['storage']['grants'],
                                    path=path.dirname(base_ar['log_path'][0]['value']))
            connection().start_logging(base_ar['log_path'][0]['value'])

        self._logging.log_info(analysis_id, self._h2o_session.session_id,
                               self._labels['st_predict_model'],
                               base_model_id)

        if objective_column in npredict_frame.columns:
            objective_type = npredict_frame.type(objective_column)
        else:
            objective_type = None

        start = time.time()
        if exist_objective:

            ''' Bug Fixing 02/04/2018
            if predict_frame.nrows == npredict_frame.nrows:
                accuracy, prediction_dataframe = self._predict_accuracy(objective_column, predict_frame, npredict_frame,
                                                                        antype=antype,
                                                                        tolerance=tolerance, base_type=objective_type)
            else:
                accuracy, prediction_dataframe = self._predict_accuracy(objective_column, npredict_frame, npredict_frame,
                                                                        antype=antype,
                                                                        tolerance=tolerance, base_type=objective_type)'''

            accuracy, prediction_dataframe = self._predict_accuracy(objective_column, predict_frame, npredict_frame,
                                                                    antype=antype, tolerance=tolerance)
            self._frame_list.append(prediction_dataframe.frame_id)

            base_ar['execution_seconds'] = time.time() - start
            base_ar['tolerance'] = tolerance

            prediction_dataframe = prediction_dataframe.as_data_frame(use_pandas=True)
        else:
            if antype == 'anomalies':
                if norm_executed:
                    predict_anomalies = self._predict_anomalies(predict_frame, npredict_frame,
                                                                base_ar['metrics']['anomalies'])
                else:
                    predict_anomalies = self._predict_anomalies(npredict_frame, npredict_frame,
                                                                base_ar['metrics']['anomalies'])
                base_ar['execution_seconds'] = time.time() - start

            if antype == 'clustering':
                if norm_executed:
                    accuracy, prediction_dataframe = self._predict_clustering(predict_frame, npredict_frame)
                else:
                    accuracy, prediction_dataframe = self._predict_clustering(npredict_frame, npredict_frame)
                self._frame_list.append(prediction_dataframe.frame_id)

                base_ar['execution_seconds'] = time.time() - start
                prediction_dataframe = prediction_dataframe.as_data_frame(use_pandas=True)

        if self._debug:
            connection().stop_logging()
            self._persistence.store_file(filename=base_ar['log_path'][0]['value'],
                                         storage_json=base_ar['log_path'])

        if not exist_objective or objective_type is not None:
            self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["gexec_metric"], model_id)
            base_ar['metrics']['execution'][base_ar['type']] = self._generate_execution_metrics(dataframe=npredict_frame,
                                                                                                source=None, antype=antype)
            if objective_column in npredict_frame.columns:
                base_ar['metrics']['accuracy']['predict'] = accuracy
                self._logging.log_info(analysis_id, self._h2o_session.session_id, self._labels["model_pacc"],
                                       base_model_id + ' - ' + str(base_ar['metrics']['accuracy']['predict']))

        base_ar['status'] = self._labels['success_op']

        if antype == 'anomalies':
            prediction = predict_anomalies
        else:
            prediction = prediction_dataframe

        # writing metadata predict.json file
        prediction_json = OrderedDict()
        prediction_json['metadata'] = OrderedDict()
        prediction_json['metadata']['user_id'] = self._ec.get_id_user()
        prediction_json['metadata']['timestamp'] = model_timestamp
        prediction_json['metadata']['workflow_id'] = self._ec.get_id_workflow()
        prediction_json['metadata']['analysis_id'] = self._ec.get_id_analysis()
        prediction_json['metadata']['model_id'] = base_ar['model_parameters']['h2o']['parameters']['model_id']['value']

        # writing data predict.json file
        prediction_json['data'] = OrderedDict()
        if isinstance(prediction, DataFrame):
            prediction_json['data'] = prediction.to_dict(orient='records')
        else:
            prediction_json['data'] = OrderedDict(prediction)

        # writing file predict.json file
        generate_json_path(self._ec, base_ar, 'prediction')
        self._persistence.store_json(storage_json=base_ar['prediction_path'], ar_json=base_ar, other=prediction_json)
        self._logging.log_exec(self._ec.get_id_analysis(), self._h2o_session.session_id, self._labels["model_stored"], model_id)

        # writing ar.json file
        generate_json_path(self._ec, base_ar)
        self._persistence.store_json(storage_json=base_ar['json_path'], ar_json=base_ar)
        self._logging.log_exec(self._ec.get_id_analysis(), self._h2o_session.session_id, self._labels["model_stored"], model_id)

        self._logging.log_info(self._ec.get_id_analysis(), self._h2o_session.session_id, self._labels["end"], model_id)
        for handler in self._logging.logger.handlers:
            handler.flush()

        # Cleaning H2OCluster
        try:
            H2Oremove(npredict_frame)
            if norm_executed:
                H2Oremove(predict_frame)
        except H2OError:
            self._logging.log_critical(analysis_id,
                                       self._h2o_session.session_id, self._labels["delete_frame"],
                                       self._model_base.model_id)
        try:
            if self._model_base is not None and remove_model:
                H2Oremove(self._model_base.model_id)
        except H2OError:
            self._logging.log_critical(analysis_id,
                                       self._h2o_session.session_id, self._labels["delete_objects"],
                                       self._model_base.model_id)
        H2Oapi("POST /3/GarbageCollect")

        return prediction, base_ar
1435 
1441  def _get_model(self, base_ar, base_model_id, remove_model):
1442  if base_model_id in H2Olist()['key'].tolist():
1443  self._model_base = get_model(base_model_id)
1444  if self._model_base is None:
1445  load_fails = True
1446  else:
1447  load_fails = False
1448  else:
1449  load_fails = self._get_model_from_load_path(base_ar)
1450  remove_model = True
1451  return load_fails, remove_model
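  # Note on the return values: load_fails is True only when the model could
  # be obtained neither from the running cluster nor via its load path on
  # disk; remove_model is forced to True when the model had to be loaded
  # from disk.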
1452 
1453 
1455  ## Method to remove a list of models from the H2O server (cluster memory)
1456  def remove_memory_models(self, arlist):
1457  remove_fails = True
1458  try:
1459  assert isinstance(arlist, list)
1460  except AssertionError:
1461  return remove_fails
1462  for armetadata in arlist:
1463  fw = get_model_fw(armetadata)
1464  model_id = armetadata['model_parameters'][fw]['parameters']['model_id']['value']+'.model'
1465  if model_id in H2Olist()['key'].tolist():
1466  H2Oremove(model_id)
1467  H2Oremove(self._get_temporal_objects_ids(model_id=model_id, nfolds=None))
1468  remove_fails = False
1469 
1470  return remove_fails
1471 
1472 
1474  ## Method to remove a list of models from the server and, when autosaved, their stored copies on the persistence layer
1475  def remove_models(self, arlist):
1476  remove_fails = True
1477 
1478  if self._autosaved:
1479  persistence = PersistenceHandler(self._ec)
1480  for ar_metadata in arlist:
1481  try:
1482  assert isinstance(ar_metadata['load_path'], list)
1483  except AssertionError:
1484  return remove_fails
1485 
1486  _, ar_metadata['load_path'] = persistence.remove_file(load_path=ar_metadata['load_path'])
1487 
1488  if len(ar_metadata['load_path']) == 0:
1489  ar_metadata['load_path'] = None
1490  remove_fails = False  # every physical copy of the model was removed
1491  else:
1492  remove_fails = True
1493  persistence.store_json(storage_json=ar_metadata["json_path"], ar_json=ar_metadata)
1494 
1495  del persistence
1496  else:
1497  remove_fails = self.remove_memory_models(arlist)
1498 
1499  return remove_fails
1500 
1501 
1505  ## Auxiliary function (procedure) to generate the model and train command parameter chains used to execute models
1506  # Modifies model_command and train_command in place
1507 def generate_commands_parameters(each_model, model_command, train_command, train_parameters_list):
1508  for key, value in each_model['parameters'].items():
1509  if value['seleccionable']:
1510  if isinstance(value['value'], str):
1511  if key in train_parameters_list and value['value'] is not None:  # check the parameter payload, not the wrapper dict
1512  train_command.append(", %s=\'%s\'" % (key, value['value']))
1513  else:
1514  model_command.append(", %s=\'%s\'" % (key, value['value']))
1515  else:
1516  if key in train_parameters_list and value['value'] is not None:
1517  train_command.append(", %s=%s" % (key, value['value']))
1518  else:
1519  model_command.append(", %s=%s" % (key, value['value']))
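# One plausible wiring of this procedure (a sketch under assumed names, not
# the project's confirmed call site): the caller seeds each list with the
# opening of the estimator constructor and of its train() call, lets this
# procedure append the ", key=value" fragments, then closes and evaluates
# the joined strings:
#
#   model_command = ["H2OGradientBoostingEstimator(model_id='gbm_test'"]
#   train_command = [".train(y='objective', training_frame=training_frame"]
#   generate_commands_parameters(each_model, model_command, train_command,
#                                train_parameters_list)
#   model_command.append(")")
#   model = eval("".join(model_command))
#   # train_command is closed and evaluated analogously against `model`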
1520 
1521 
1525  ## Auxiliary function to get the level of tolerance for regression analysis
1526 def get_tolerance(columns, objective_column, tolerance=0.0):
1527  if isinstance(tolerance, dict):
1528  if tolerance['enable_fixed']:
1529  threshold = tolerance['fixed']
1530  else:
1531  min_val = None
1532  max_val = None
1533  for each_column in columns:
1534  if each_column["name"] == objective_column and each_column["type"] in DTYPES:
1535  min_val = float(each_column["min"])
1536  max_val = float(each_column["max"])
1537  if min_val is None or max_val is None:
1538  threshold = 0
1539  else:
1540  threshold = (max_val - min_val) * tolerance['percentage']
1541  else:
1542  threshold = tolerance
1543  return threshold
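# Worked example (illustrative values; assumes "float64" is listed in DTYPES):
# with the objective column spanning [0.0, 100.0] and a percentage tolerance
# of 2%, the returned threshold is (100.0 - 0.0) * 0.02 = 2.0:
#
#   columns = [{"name": "price", "type": "float64",
#               "min": "0.0", "max": "100.0"}]
#   threshold = get_tolerance(columns, "price",
#                             tolerance={"enable_fixed": False, "fixed": 1.0,
#                                        "percentage": 0.02})
#   # threshold == 2.0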