DayF core  1.2.1.2
DayF (Decision at your Fingertips) is an AutoML freeware development framework that lets developers work with Machine Learning models without any AI expertise, simply by providing a CSV dataset and the objective column
sparkhandler.py
1 
7 
8 '''
9 Copyright (C) e2its - All Rights Reserved
10  * Unauthorized copying of this file, via any medium is strictly prohibited
11  * Proprietary and confidential
12  *
13  * This file is part of gDayF project.
14  *
15  * Written by Jose L. Sanchez <e2its.es@gmail.com>, 2016-2019
16 '''
17 
18 import copy
19 import json
20 import time
21 from collections import OrderedDict as OrderedDict
22 from pandas import DataFrame as DataFrame
23 from hashlib import md5 as md5
24 from py4j.protocol import Py4JJavaError
25 
26 
27 try:
28  # Now we are ready to import Spark Modules
29  from pyspark.sql import SparkSession
30  from pyspark.ml import Pipeline
31  from pyspark.ml import PipelineModel
32  from pyspark.ml.feature import VectorAssembler
33  from pyspark.ml.feature import VectorIndexer
34  from pyspark.ml.classification import *
35  from pyspark.ml.regression import *
36  from pyspark.ml.clustering import *
37  from pyspark.ml.evaluation import *
38  from pyspark.ml.tuning import *
39  from pyspark.ml.feature import StringIndexer
40  from pyspark.ml.feature import IndexToString
41  from pyspark.ml.feature import OneHotEncoder
42  from pyspark.sql.utils import IllegalArgumentException
43 
44  print("Successfully imported all Spark modules")
45 except ImportError as e:
46  print("Error importing Spark Modules", e)
47  exit(1)
48 
49 
50 from gdayf.common.normalizationset import NormalizationSet
51 from gdayf.common.constants import DTYPES
52 from gdayf.common.storagemetadata import StorageMetadata
53 from gdayf.common.utils import hash_key
54 from gdayf.logs.logshandler import LogsHandler
55 from gdayf.handler_metrics.sparkbinomialmetricmetadata import SparkBinomialMetricMetadata as BinomialMetricMetadata
56 from gdayf.metrics.metricmetadata import MetricMetadata
57 from gdayf.metrics.executionmetriccollection import ExecutionMetricCollection
58 from gdayf.handler_metrics.sparkregressionmetricmetadata import SparkRegressionMetricMetadata as RegressionMetricMetadata
59 from gdayf.handler_metrics.sparkmultinomialmetricmetadata import SparkMultinomialMetricMetadata as MultinomialMetricMetadata
60 from gdayf.handler_metrics.sparkanomaliesmetricmetadata import SparkAnomaliesMetricMetadata as AnomaliesMetricMetadata
61 from gdayf.handler_metrics.sparkclusteringmetricmetadata import SparkClusteringMetricMetadata as ClusteringMetricMetadata
62 from gdayf.persistence.persistencehandler import PersistenceHandler
63 from gdayf.common.dfmetada import DFMetada
64 from gdayf.common.utils import get_model_ns
65 from gdayf.common.armetadata import ArMetadata
66 from gdayf.models.parametersmetadata import ParameterMetadata
67 from gdayf.normalizer.normalizer import Normalizer
68 from gdayf.common.utils import get_model_fw
69 from gdayf.common.storagemetadata import generate_json_path
70 
71 
72 class sparkHandler(object):
73 
74 
79  def __init__(self, e_c):
80  self._ec = e_c
81  self._framework = 'spark'
82  self._config = self._ec.config.get_config()
83  self._labels = self._ec.labels.get_config()['messages']['corehandler']
84  self.localfs = self._config['storage']['localfs']['value']
85  self.hdfs =self._config['storage']['hdfs']['value']
86  self.mongoDB = self._config['storage']['mongoDB']['value']
87  self.primary_path = self._config['storage'][self._config['storage']['primary_path']]['value']
88  self.url = self._config['frameworks'][self._framework]['conf']['master']
89  self.nthreads = self._config['frameworks'][self._framework]['conf']['nthreads']
90  self.spark_warehouse_dir = self._config['frameworks'][self._framework]['conf']['spark_warehouse_dir']
91  self.spark_executor_mem = self._config['frameworks'][self._framework]['conf']['spark.executor.memory']
92  self.spark_driver_mem = self._config['frameworks'][self._framework]['conf']['spark.driver.memory']
93  self.start_spark = self._config['frameworks'][self._framework]['conf']['start_spark']
94  self._save_model = self._config['frameworks'][self._framework]['conf']['save_model']
95  self._tolerance = self._config['frameworks'][self._framework]['conf']['tolerance']
96  self._model_base = None
97  self._spark_session = None
99  self._logging = LogsHandler(self._ec, __name__)
100  self._frame_list = self._ec.spark_temporal_data_frames
101 
102 
103  def __del__(self):
104  if self._spark_session is not None and self.is_alive():
105  self._spark_session.stop()
106 
107 
110  @classmethod
112  try:
113  pass
114  except Py4JJavaError:
115  print('Apache Spark Cluster not working')
116 
117  ''''@staticmethod
118  def addColumnIndex(dataframe):
119  # Create new column names
120  oldColumns = dataframe.schema.names
121  newColumns = oldColumns + ["columnindex"]
122 
123  # Add Column index
124  df_indexed = df.rdd.zipWithIndex().map(lambda row, columnindex: \
125  row + (columnindex,)).toDF()
126 
127  # Rename all the columns
128  new_df = reduce(lambda data, idx: data.withColumnRenamed(oldColumns[idx],
129  newColumns[idx]), range(len(oldColumns)),
130  df_indexed)
131  return new_df'''
132 
133 
    def connect(self):
        """Start or attach to the configured SparkSession.

        Builds a session from the handler configuration (master URL, thread
        count, executor/driver memory, warehouse dir), stores it in
        ``self._spark_session`` and sets the root log4j level from the
        configured 'log' expression.

        :return: bool — always False: ``initiated`` is never set to True.
            NOTE(review): callers appear to rely on ``self._spark_session``
            rather than this flag — confirm before changing.
        :raises Py4JJavaError: when the cluster connection fails.
        """
        initiated = False
        try:
            spark = SparkSession.builder.master(self.url + '[' + str(self.nthreads) + ']')\
                .appName('job_gdayf_'+self.url+'_' + time.strftime("%b-%d-%Y_%H:%M:%S-%z", time.localtime())) \
                .config("spark.executor.memory", self.spark_executor_mem) \
                .config("spark.driver.memory", self.spark_driver_mem) \
                .config("spark.sql.warehouse.dir", self.spark_warehouse_dir)

            self._spark_session = spark.getOrCreate()
            # eval() resolves the configured level expression in this scope,
            # where 'log4j' is bound (config value is trusted, not user input).
            log4j = self._spark_session.sparkContext._jvm.org.apache.log4j
            log4j.LogManager.getRootLogger().setLevel(eval(self._config['frameworks'][self._framework]['conf']['log']))

        except Py4JJavaError as connection_error:
            self._logging.log_critical('gDayF', "sparkHandler", self._labels["failed_conn"])
            raise connection_error
        finally:
            # NOTE(review): this runs even when the connection failed; the last
            # log line would then dereference a None session — confirm intended.
            self._logging.log_info('gDayF', "sparkHandler", self._labels["start"])
            self._logging.log_info('gDayF', "sparkHandler", self._labels["framework"], self._framework)
            self._logging.log_info('gDayF', "sparkHandler", self._labels["sess"],
                                   self._spark_session.sparkContext.applicationId)
        return initiated
158 
159 
163  @ staticmethod
164  def _get_dtype(list_dtypes, column):
165  for element in list_dtypes:
166  if element[0] == column:
167  return element[1]
168  return None
169 
170 
171  def is_alive(self):
172  if self._spark_session is None:
173  return False
174  elif self._spark_session._instantiatedSession is None:
175  return False
176  else:
177  return True
178 
179 
    def _get_temporal_objects_ids(self, model_id, nfolds):
        """Placeholder for collecting temporal-object identifiers for a model.

        Intentionally a no-op for the Spark framework; always returns None.

        :param model_id: model identifier (unused).
        :param nfolds: number of cross-validation folds (unused).
        """
        pass
186 
187 
    def get_external_model(self, ar_metadata, type):
        """Importing external models is not supported for the Spark framework.

        :param ar_metadata: analysis report metadata (unused).
        :param type: model type selector (unused).
        :return: always False.
        """
        return False
195 
196 
200  def _get_model_from_load_path(self, ar_metadata):
201  load_fails = True
202  counter_storage = 0
203  # Checking file source versus hash_value
204 
205  try:
206  assert isinstance(ar_metadata['load_path'], list)
207  except AssertionError:
208  return load_fails
209 
210  while ar_metadata['load_path'] is not None and counter_storage < len(ar_metadata['load_path']) and load_fails:
211 
212  if ar_metadata['load_path'][counter_storage]['hash_value'] is None or \
213  hash_key(ar_metadata['load_path'][counter_storage]['hash_type'],
214  ar_metadata['load_path'][counter_storage]['value']) == \
215  ar_metadata['load_path'][counter_storage]['hash_value']:
216  try:
217  self._model_base = PipelineModel.load(ar_metadata['load_path'][counter_storage]['value'])
218  if self._model_base is not None:
219  load_fails = False
220 
221  if ar_metadata['load_path'][counter_storage]['hash_value'] is not None:
222  self._logging.log_info(self._ec.get_id_analysis(), self._spark_session.sparkContext.applicationId,
223  self._labels["hk_check"],
224  ar_metadata['load_path'][counter_storage]['hash_value'] + ' - ' +
225  hash_key(ar_metadata['load_path'][counter_storage]['hash_type'],
226  ar_metadata['load_path'][counter_storage]['value'])
227  )
228  except Py4JJavaError:
229  self._logging.log_critical(self._ec.get_id_analysis(), self._spark_session.sparkContext.applicationId,
230  self._labels["abort"], ar_metadata['load_path'][counter_storage]['value'])
231 
232  counter_storage += 1
233  return load_fails
234 
235 
238  def delete_frames(self):
239  for _, iterator in self._frame_list.items():
240  for _, sparkdataframe in iterator.items():
241  sparkdataframe.unpersist()
242  del self._ec.spark_temporal_data_frames
243  self._ec.spark_temporal_data_frames = dict()
244 
245 
250  def generate_base_path(self, base_ar, type_):
251  assert type_ in ['PoC', 'train', 'predict']
252  if self.primary_path == self.mongoDB:
253  return None
254  elif self.primary_path == self.hdfs:
255  # Generating base_path
256  load_path = list()
257  load_path.append(self.hdfs)
258  load_path.append('/')
259  load_path.append(self._framework)
260  load_path.append('/')
261  load_path.append(base_ar['model_id'])
262  load_path.append('/')
263  load_path.append(type_)
264  load_path.append('/')
265  load_path.append(str(base_ar['timestamp']))
266  load_path.append('/')
267  return ''.join(load_path)
268  else:
269  # Generating base_path
270  load_path = list()
271  load_path.append(self.localfs)
272  load_path.append('/')
273  load_path.append(self._framework)
274  load_path.append('/')
275  load_path.append(base_ar['model_id'])
276  load_path.append('/')
277  load_path.append(type_)
278  load_path.append('/')
279  load_path.append(str(base_ar['timestamp']))
280  load_path.append('/')
281  return ''.join(load_path)
282 
283 
    def _get_ext(self):
        """Return the file extension used for persisted Spark models."""
        return '.spark'
288 
289 
293  @ staticmethod
294  def _get_evaluator(analysis_type, objective_column=None):
295  if objective_column is None:
296  if analysis_type == 'clustering':
297  return None
298  else:
299  if analysis_type == 'binomial':
300  #print('TRC' + objective_column)
301  return BinaryClassificationEvaluator(labelCol=objective_column)
302  elif analysis_type == 'multinomial':
303  return MulticlassClassificationEvaluator(labelCol=objective_column)
304  elif analysis_type == 'regression':
305  return RegressionEvaluator(labelCol=objective_column)
306  return None
307 
308 
314  def _generate_execution_metrics(self, dataframe, antype, objective_column):
315  if antype == 'binomial':
316  model_metrics = BinomialMetricMetadata()
317  elif antype == 'multinomial':
318  model_metrics = MultinomialMetricMetadata()
319  elif antype == 'regression':
320  model_metrics = RegressionMetricMetadata()
321  elif antype == 'anomalies':
322  model_metrics = AnomaliesMetricMetadata()
323  elif antype == 'clustering':
324  model_metrics = ClusteringMetricMetadata()
325  else:
326  model_metrics = MetricMetadata()
327 
328  evaluator = self._get_evaluator(analysis_type=antype, objective_column=objective_column)
329 
330  if isinstance(model_metrics, ClusteringMetricMetadata):
331  model_metrics.set_metrics(model=self._model_base.stages[-1], data=dataframe)
332  else:
333  model_metrics.set_metrics(evaluator=evaluator, data=dataframe, objective_column=objective_column)
334  return model_metrics
335 
336 
    def _generate_scoring_history(self):
        """Build a per-model-family structural/fit summary ('scoring history').

        Inspects the last pipeline stage and serializes family-specific
        descriptors (tree counts/depths, coefficients, cluster centers, ...)
        through a one-row pandas DataFrame (orient='split').

        :return: OrderedDict with the summary, or None for model families not
            handled below.
        """
        result_dataframe = None
        # Tree ensembles (GBT/RF regression, GBT classification): tree count,
        # deepest tree, total nodes and feature count.
        if isinstance(self._model_base.stages[-1], GBTRegressionModel) or \
                isinstance(self._model_base.stages[-1], RandomForestRegressionModel) or \
                isinstance(self._model_base.stages[-1], GBTClassificationModel):
            maximo = 0
            for itera in range(0, len(self._model_base.stages[-1].trees)):
                maximo = max(maximo, self._model_base.stages[-1].trees[itera].depth)

            result_dataframe = DataFrame(data={'trees': self._model_base.stages[-1].getNumTrees,
                                               'max_depth': maximo,
                                               'total_nodes': self._model_base.stages[-1].totalNumNodes,
                                               'numFeatures': self._model_base.stages[-1].numFeatures},
                                         index=[0]).to_json(orient='split')
        # RF classifier: as above plus class count.
        elif isinstance(self._model_base.stages[-1], RandomForestClassificationModel):
            maximo = 0
            for itera in range(0, len(self._model_base.stages[-1].trees)):
                maximo = max(maximo, self._model_base.stages[-1].trees[itera].depth)

            result_dataframe = DataFrame(data={'trees': self._model_base.stages[-1].getNumTrees,
                                               'max_depth': maximo,
                                               'total_nodes': self._model_base.stages[-1].totalNumNodes,
                                               'numFeatures': self._model_base.stages[-1].numFeatures,
                                               'numClasses': self._model_base.stages[-1].numClasses},
                                         index=[0]).to_json(orient='split')
        # Single decision trees.
        elif isinstance(self._model_base.stages[-1], DecisionTreeRegressionModel):
            result_dataframe = DataFrame(data={'trees': 1,
                                               'max_depth': self._model_base.stages[-1].depth,
                                               'total_nodes': self._model_base.stages[-1].numNodes,
                                               'numFeatures': self._model_base.stages[-1].numFeatures},
                                         index=[0]).to_json(orient='split')
        elif isinstance(self._model_base.stages[-1], NaiveBayesModel):
            result_dataframe = DataFrame(data={'numFeatures': self._model_base.stages[-1].numFeatures,
                                               'numClasses': self._model_base.stages[-1].numClasses},
                                         index=[0]).to_json(orient='split')
        elif isinstance(self._model_base.stages[-1], DecisionTreeClassificationModel):
            result_dataframe = DataFrame(data={'trees': 1,
                                               'max_depth': self._model_base.stages[-1].depth,
                                               'total_nodes': self._model_base.stages[-1].numNodes,
                                               'numFeatures': self._model_base.stages[-1].numFeatures,
                                               'numClasses': self._model_base.stages[-1].numClasses},
                                         index=[0]).to_json(orient='split')
        elif isinstance(self._model_base.stages[-1], GeneralizedLinearRegressionModel):
            summary = self._model_base.stages[-1].summary
            result_dataframe = DataFrame(data={'aic': summary.aic,
                                               'intercept': str(self._model_base.stages[-1].intercept),
                                               'degreesOfFreedom': summary.degreesOfFreedom,
                                               'numInstances': summary.numInstances,
                                               'rank': summary.rank,
                                               'dispersion': summary.dispersion,
                                               'nullDeviance': summary.nullDeviance,
                                               'residuals': summary.residuals,
                                               'numFeatures': self._model_base.stages[-1].numFeatures},
                                         index=[0]).to_json(orient='split')
        elif isinstance(self._model_base.stages[-1], LinearRegressionModel):
            summary = self._model_base.stages[-1].summary
            # NOTE: 'coefifients' key name is a historical typo kept for
            # compatibility with stored ar.json documents.
            result_dataframe = DataFrame(data={'coefifients': str(self._model_base.stages[-1].coefficients),
                                               'degreesOfFreedom': summary.degreesOfFreedom,
                                               'numInstances': summary.numInstances,
                                               'totalIterations': summary.totalIterations,
                                               'devianceResiduals': str(summary.devianceResiduals),
                                               'explainedVariance': summary.explainedVariance,
                                               'numFeatures': self._model_base.stages[-1].numFeatures},
                                         index=[0]).to_json(orient='split')
        elif isinstance(self._model_base.stages[-1], LinearSVCModel):
            result_dataframe = DataFrame(data={'coefifients': str(self._model_base.stages[-1].coefficients),
                                               'intercept': self._model_base.stages[-1].intercept,
                                               'numClasses': self._model_base.stages[-1].numClasses,
                                               'numFeatures': self._model_base.stages[-1].numFeatures},
                                         index=[0]).to_json(orient='split')
        elif isinstance(self._model_base.stages[-1], LogisticRegressionModel):
            # summary access can raise RuntimeError (presumably when no binary
            # training summary is available — confirm against pyspark docs);
            # fall back to the raw coefficient matrix / intercept vector.
            try:
                summary = self._model_base.stages[-1].summary
                result_dataframe = DataFrame(data={'coefifients': str(self._model_base.stages[-1].coefficients),
                                                   'intercept': self._model_base.stages[-1].intercept,
                                                   'totalIterations': summary.totalIterations,
                                                   'roc': summary.roc.toPandas().to_json(orient='split'),
                                                   'pr': summary.pr.toPandas().to_json(orient='split')},
                                             index=[0]).to_json(orient='split')
            except RuntimeError:
                result_dataframe = DataFrame(data={'coefifientsMatrix': str(self._model_base.stages[-1].coefficientMatrix),
                                                   'interceptVector': str(self._model_base.stages[-1].interceptVector)},
                                             index=[0]).to_json(orient='split')
        elif isinstance(self._model_base.stages[-1], BisectingKMeansModel) or \
                isinstance(self._model_base.stages[-1], KMeansModel):
            summary = self._model_base.stages[-1].summary
            result_dataframe = DataFrame(data={'clusterCenters': str(self._model_base.stages[-1].clusterCenters()),
                                               'clusterSizes': str(summary.clusterSizes),
                                               'k': summary.k},
                                         index=[0]).to_json(orient='split')

        # Change 27/01/2018 sprint 6
        if result_dataframe is not None:
            # Round-trip through json to return an OrderedDict structure
            return json.loads(result_dataframe, object_pairs_hook=OrderedDict)
        else:
            return None
435 
436 
440  def _generate_importance_variables(self, column_chain):
441  var_importance = OrderedDict()
442  for columns in column_chain:
443  try:
444  var_importance[columns] = self._model_base.stages[-1].featureImportances[column_chain.index(columns)]
445  except AttributeError:
446  var_importance[columns] = None
447  return var_importance
448 
449 
    def _generate_model_metrics(self):
        """Collect training metrics specific to the fitted model family.

        LogisticRegression/LinearRegression: per-iteration objective (loss)
        history from the training summary, when available. NaiveBayes: the
        model's 'pi' and 'theta' matrices (presumably class priors and feature
        likelihoods — confirm against pyspark docs). Other families yield None.

        :return: OrderedDict (json round-trip of a pandas DataFrame) or None.
        """
        if isinstance(self._model_base.stages[-1], LogisticRegressionModel) or \
                isinstance(self._model_base.stages[-1], LinearRegressionModel):
            try:
                summary = self._model_base.stages[-1].summary
                return json.loads(DataFrame(summary.objectiveHistory, columns=['Metrics']).to_json(orient='split'),
                                  object_pairs_hook=OrderedDict)
            except RuntimeError:
                # summary access failed (e.g. no training summary available)
                return None
        elif isinstance(self._model_base.stages[-1], NaiveBayesModel):
            metrics = OrderedDict()
            metrics['pi'] = json.loads(DataFrame(self._model_base.stages[-1].pi.values).to_json(orient='split'),
                                       object_pairs_hook=OrderedDict)
            metrics['theta'] = json.loads(DataFrame(self._model_base.stages[-1].theta.values).to_json(orient='split'),
                                          object_pairs_hook=OrderedDict)

            return metrics
        return None
470 
471 
478  def _accuracy(self, objective, dataframe, tolerance=0.0):
479 
480  fmin = eval("lambda x: x - " + str(tolerance / 2))
481  fmax = eval("lambda x: x + " + str(tolerance / 2))
482 
483  resultado_train = dataframe.select("prediction", objective)
484 
485  accuracy = resultado_train.filter(resultado_train.prediction >= fmin(resultado_train[objective])) \
486  .filter(resultado_train.prediction <= fmax(resultado_train[objective])).count() \
487  / float(resultado_train.count())
488 
489  self._logging.log_exec(self._ec.get_id_analysis(), self._spark_session.sparkContext.applicationId, self._labels["tolerance"],
490  str(tolerance))
491  return accuracy
492 
493 
501  def _predict_accuracy(self, objective, dataframe, tolerance=0.0):
502  accuracy = -1.0
503  #bug SPARK-14948
504  #prediccion = dataframe.withColumn('prediction', self._model_base.transform(dataframe).prediction)
505  prediccion = self._model_base.transform(dataframe)
506  columns = prediccion.columns
507  if objective in columns:
508  accuracy = self._accuracy(objective=objective, dataframe=prediccion, tolerance=tolerance)
509  return accuracy, prediccion
510 
511 
    def _predict_clustering(self, dataframe, objective=None):
        """Run clustering prediction by delegating to _predict_accuracy.

        :param dataframe: Spark DataFrame to predict over.
        :param objective: usually None for clustering; forwarded as-is.
        :return: (accuracy, predictions) tuple from _predict_accuracy.
        """
        return self._predict_accuracy(objective=objective, dataframe=dataframe)
518 
519 
523  def _generate_params(self, modeldef):
524  """
525  Generate model params for this model.
526  :return (status (success 0, error 1) , OrderedDict(full_stack_parameters))
527  """
528  full_stack_params = OrderedDict()
529  for key, item in modeldef.extractParamMap().items():
530  full_stack_params[str(key)[str(key).find('__') + 2:]] = item
531  return 0, full_stack_params
532 
533 
539  def get_metric(self, algorithm_description, metric, source): # not tested
540  try:
541  struct_ar = OrderedDict(json.load(algorithm_description))
542  except:
543  self._logging.log_critical('gDayF', self._spark_session.sparkContext.applicationId(), self._labels["ar_error"])
544  return ('Necesario cargar un modelo valid o ar.json valido')
545  try:
546  return struct_ar['metrics'][source][metric]
547  except KeyError:
548  return 'Not Found'
549 
550 
    def execute_normalization(self, dataframe, base_ns, model_id, filtering='NONE', exist_objective=True):
        """Apply the normalization set *base_ns* to a pandas dataframe.

        :param dataframe: pandas DataFrame to normalize.
        :param base_ns: normalization set, or None to skip normalization.
        :param model_id: model identifier, used by STANDARDIZE filtering.
        :param filtering: 'NONE', 'STANDARDIZE' (Normalizer.filter_standardize)
            or 'DROP' (Normalizer.filter_drop_missing).
        :param exist_objective: when False, objective-related normalizations
            are filtered out first.
        :return: (dataframe, df_metadata, md5_of_metadata, executed_flag, base_ns)
        """
        if base_ns is not None:
            # Deep copy so the caller's dataframe is left untouched
            data_norm = dataframe.copy(deep=True)
            self._logging.log_exec(self._ec.get_id_analysis(),
                                   self._spark_session.sparkContext.applicationId, self._labels["exec_norm"], str(base_ns))
            normalizer = Normalizer(self._ec)
            if not exist_objective:
                base_ns = normalizer.filter_objective_base(normalizemd=base_ns)
            if filtering == 'STANDARDIZE':
                base_ns = normalizer.filter_standardize(normalizemd=base_ns, model_id=model_id)
            elif filtering == 'DROP':
                base_ns = normalizer.filter_drop_missing(normalizemd=base_ns)
            data_norm = normalizer.normalizeDataFrame(data_norm, base_ns)
            del normalizer
            df_metadata = DFMetada()
            df_metadata.getDataFrameMetadata(dataframe=data_norm, typedf='pandas')
            # Metadata hash identifies this exact normalized frame downstream
            df_metadata_hash_value = md5(json.dumps(df_metadata).encode('utf-8')).hexdigest()
            return data_norm, df_metadata, df_metadata_hash_value, True, base_ns
        else:
            # No normalization requested: return the original frame with fresh metadata
            df_metadata = DFMetada()
            df_metadata.getDataFrameMetadata(dataframe=dataframe, typedf='pandas')
            df_metadata_hash_value = md5(json.dumps(df_metadata).encode('utf-8')).hexdigest()
            return dataframe, df_metadata, df_metadata_hash_value, False, base_ns

        #base_ns = json.load(normalization, object_pairs_hook=NormalizationSet)
584 
585 
589  def define_special_spark_naive_norm(self, df_metadata):
590  self._logging.log_exec(self._ec.get_id_analysis(),
591  self._spark_session.sparkContext.applicationId, self._labels["def_naive_norm"])
592  normalizer = Normalizer(self._ec)
593  aux_ns = normalizer.define_special_spark_naive_norm(dataframe_metadata=df_metadata)
594  del normalizer
595  return aux_ns
596 
597 
603  def order_training(self, training_pframe, base_ar, **kwargs):
604  assert isinstance(training_pframe, DataFrame)
605  assert isinstance(base_ar, ArMetadata)
606 
607  filtering = 'NONE'
608 
609  for pname, pvalue in kwargs.items():
610  if pname == 'filtering':
611  assert isinstance(pvalue, str)
612  filtering = pvalue
613 
614  # python train parameters effective
615  analysis_id = self._ec.get_id_analysis()
616  supervised = True
617  tolerance = 0.0
618  objective_column = base_ar['objective_column']
619  if objective_column is None:
620  supervised = False
621 
622  #valid_frame = None
623  test_frame = None
624 
625 
626  if "test_frame" in kwargs.keys():
627  test_pframe = kwargs['test_frame']
628  else:
629  test_pframe = None
630 
631  base_ns = get_model_ns(base_ar)
632  modelid = base_ar['model_parameters']['spark']['model']
633  artype = base_ar['model_parameters']['spark']['types'][0]["type"]
634  self._logging.log_info(analysis_id,
635  self._spark_session.sparkContext.applicationId,
636  self._labels["st_analysis"], modelid)
637 
638  assert isinstance(base_ns, list) or base_ns is None
639  # Applying Normalizations
640  data_initial = DFMetada()
641  data_initial.getDataFrameMetadata(dataframe=training_pframe, typedf='pandas')
642  training_pframe, data_normalized, train_hash_value, norm_executed, base_ns = \
643  self.execute_normalization(dataframe=training_pframe, base_ns=base_ns, model_id=modelid,
644  filtering=filtering, exist_objective=True)
645 
646  if modelid == 'NaiveBayes' and artype == 'multinomial':
647  training_pframe, data_normalized, train_hash_value, aux_norm_executed, aux_norm = \
648  self.execute_normalization(dataframe=training_pframe,
649  base_ns=self.define_special_spark_naive_norm(data_normalized),
650  model_id=modelid)
651  if aux_norm is not None:
652  base_ns.extend(aux_norm)
653  norm_executed = norm_executed | aux_norm_executed
654 
655  if base_ar['round'] == 1:
656  aux_ns = Normalizer(self._ec).define_ignored_columns(data_normalized, objective_column)
657  if aux_ns is not None:
658  base_ns.extend(aux_ns)
659 
660  df_metadata = data_initial
661  if not norm_executed:
662  data_normalized = None
663  try:
664  self._logging.log_info(analysis_id,
665  self._spark_session.sparkContext.applicationId,
666  self._labels["cor_struct"],
667  str(data_initial['correlation'][objective_column]))
668  except KeyError:
669  self._logging.log_exec(analysis_id,
670  self._spark_session.sparkContext.applicationId,
671  self._labels["cor_struct"],
672  str(data_initial['correlation']))
673  else:
674  df_metadata = data_normalized
675  base_ar['normalizations_set'] = base_ns
676  try:
677  self._logging.log_info(analysis_id,
678  self._spark_session.sparkContext.applicationId,
679  self._labels["cor_struct"],
680  str(data_normalized['correlation'][objective_column]))
681  except KeyError:
682  self._logging.log_exec(analysis_id,
683  self._spark_session.sparkContext.applicationId,
684  self._labels["cor_struct"],
685  str(data_initial['correlation']))
686  if test_pframe is not None:
687  test_pframe, _, test_hash_value, _, _ = self.execute_normalization(dataframe=test_pframe, base_ns=base_ns,
688  model_id=modelid, filtering=filtering,
689  exist_objective=True)
690 
691  training_frame = self._get_dataframe(pframe=training_pframe, hash_value=train_hash_value, type="train")
692 
693  if "test_frame" in kwargs.keys():
694  '''test_frame = self._spark_session.createDataFrame(test_frame).cache()
695  self._logging.log_info(analysis_id, self._spark_session.sparkContext.applicationId,
696  self._labels["parsing_to_spark"],
697  'test_frame (' + str(test_frame.count()) + ')')'''
698  test_frame = self._get_dataframe(pframe=test_pframe, hash_value=test_hash_value, type="test")
699 
700  if supervised and artype == 'regression':
701  # Initializing base structures
702  self._logging.log_info(analysis_id,
703  self._spark_session.sparkContext.applicationId, self._labels["objective"],
704  objective_column + ' - ' + self._get_dtype(training_frame.dtypes, objective_column))
705 
706  tolerance = get_tolerance(df_metadata['columns'], objective_column, self._tolerance)
707 
708  # Generating base_path
709  self._logging.log_info(analysis_id,
710  self._spark_session.sparkContext.applicationId,
711  self._labels["action_type"], base_ar['type'])
712  base_path = self.generate_base_path(base_ar, base_ar['type'])
713 
714  final_ar_model = copy.deepcopy(base_ar)
715  final_ar_model['status'] = self._labels['failed_op']
716  final_ar_model['model_parameters']['spark']['id'] = self._spark_session.version
717  model_timestamp = str(time.time())
718  final_ar_model['data_initial'] = data_initial
719  final_ar_model['data_normalized'] = data_normalized
720 
721  model_id = modelid + '_' + model_timestamp
722  self._logging.log_info(analysis_id, self._spark_session.sparkContext.applicationId,
723  self._labels["model_id"],
724  model_id)
725 
726  analysis_type = base_ar['model_parameters']['spark']['types'][0]['type']
727  self._logging.log_info(analysis_id, self._spark_session.sparkContext.applicationId, self._labels["amode"],
728  base_ar['model_parameters']['spark']['types'][0]['type'])
729 
730  '''Generate commands pipeline : model and model.train()'''
731  invalid_types = ['string']
732  transformation_chain = list()
733  column_chain = list()
734  norm = Normalizer(self._ec)
735  ignored_columns = norm.ignored_columns(base_ns)
736  decoder = None
737  for element in training_frame.dtypes:
738  if element[0] not in ignored_columns:
739  if element[1] in invalid_types or (modelid == 'NaiveBayes' and artype == 'binomial'):
740  transformation_chain.append(StringIndexer() \
741  .setInputCol(element[0]) \
742  .setOutputCol(element[0] + '_to_index')
743  .setHandleInvalid("keep"))
744  column_rename = element[0] + '_to_index'
745  if element[0] != objective_column:
746  transformation_chain.append(OneHotEncoder() \
747  .setInputCol(element[0] + '_to_index') \
748  .setOutputCol(element[0] + '_to_onehot'))
749  column_rename = element[0] + '_to_onehot'
750  else:
751  objective_column = column_rename
752  decoder = len(transformation_chain) - 1
753  column_chain.append(column_rename)
754  else:
755  column_chain.append(element[0])
756  del norm
757  ''' Packaging Features '''
758 
759  try:
760  column_chain.remove(objective_column)
761  # Remove ignored_columns
762  for column in ignored_columns:
763  column_chain.remove(column)
764 
765  except ValueError:
766  pass
767  transformation_chain.append(VectorAssembler().setInputCols(column_chain).setOutputCol('features'))
768 
769  #Only for trace issues
770  trc_pipeline = Pipeline(stages=transformation_chain.copy())
771  ''' Compose Model'''
772  model_command = list()
773  model_command.append(modelid)
774  model_command.append("(")
775  model_command.append("featuresCol=\'features\'")
776 
777  if supervised:
778  model_command.append(", labelCol=\'%s\'" % objective_column)
779 
780  generate_commands_parameters(base_ar['model_parameters']['spark'], model_command)
781 
782  model_command.append(")")
783  model_command = ''.join(model_command)
784  #print('TRC:' + model_command)
785 
786  modeldef = eval(model_command)
787  self._logging.log_exec(analysis_id, self._spark_session.sparkContext.applicationId,
788  self._labels["gmodel"], model_command)
789 
790  transformation_chain.append(modeldef)
791  pipeline = Pipeline(stages=transformation_chain)
792  grid = ParamGridBuilder().build()
793  antype = base_ar['model_parameters']['spark']['types'][0]['type']
794  aborted = False
795  try:
796  if supervised:
797  if training_pframe.count(axis=0).all() <= \
798  self._config['frameworks']['spark']['conf']['validation_frame_threshold']:
799 
800  model = CrossValidator(estimator=pipeline,
801  estimatorParamMaps=grid,
802  evaluator=self._get_evaluator(analysis_type=antype,
803  objective_column=objective_column),
804  numFolds=self._config['frameworks']['spark']['conf']['nfolds'],
805  seed=int(base_ar['timestamp']))
806  else:
807  model = TrainValidationSplit(estimator=pipeline,
808  estimatorParamMaps=grid,
809  evaluator=self._get_evaluator(analysis_type=antype,
810  objective_column=objective_column),
811  tranRation=self._config['frameworks']['spark']['conf']['validation_frame_ratio'],
812  seed=int(base_ar['timestamp']))
813  start = time.time()
814 
815  trc_dataframe = trc_pipeline.fit(training_frame).transform(training_frame)
816  self._logging.log_info(analysis_id,
817  self._spark_session.sparkContext.applicationId,
818  self._labels["trc:label_cardinality"],
819  "( " + objective_column + "," +
820  str(trc_dataframe.select(objective_column).distinct().count()) +
821  " )")
822  self._model_base = model.fit(training_frame).bestModel
823  else:
824  model = pipeline
825  start = time.time()
826  self._model_base = model.fit(training_frame)
827  final_ar_model['status'] = self._labels["success_op"]
828 
829  # Generating aditional model parameters Model_ID
830  final_ar_model['execution_seconds'] = time.time() - start
831  final_ar_model['model_parameters']['spark']['parameters']['model_id'] = ParameterMetadata()
832  final_ar_model['model_parameters']['spark']['parameters']['model_id'].set_value(value=model_id,
833  seleccionable=False,
834  type="str")
835  # Filling whole json ar.json
836  final_ar_model['ignored_parameters'], \
837  final_ar_model['full_parameters_stack'] = self._generate_params(modeldef=modeldef)
838 
839  self._logging.log_info(analysis_id,
840  self._spark_session.sparkContext.applicationId, self._labels["tmodel"],
841  model_id)
842  self._logging.log_info(analysis_id,
843  self._spark_session.sparkContext.applicationId,
844  self._labels["exec_time"],
845  str(final_ar_model['execution_seconds']))
846 
847 
848  # Generating execution metrics
849  final_ar_model['metrics']['execution'] = ExecutionMetricCollection()
850 
851  self._logging.log_info(analysis_id,
852  self._spark_session.sparkContext.applicationId, self._labels["gexec_metric"],
853  model_id)
854 
855  prediction_train = self._model_base.transform(training_frame)
856 
857  final_ar_model['metrics']['execution']['train'] = \
858  self._generate_execution_metrics(dataframe=prediction_train,
859  antype=analysis_type,
860  objective_column=objective_column)
861  if test_frame is not None:
862  prediction_test = self._model_base.transform(test_frame)
863 
864  final_ar_model['metrics']['execution']['test'] = \
865  self._generate_execution_metrics(dataframe=prediction_test,
866  antype=analysis_type,
867  objective_column=objective_column)
868 
869  final_ar_model['metrics']['execution']['predict'] = OrderedDict()
870  final_ar_model['metrics']['execution']['predict']['decoder'] = decoder
871 
872  final_ar_model['metrics']['accuracy'] = OrderedDict()
873  final_ar_model['metrics']['accuracy'] = OrderedDict()
874 
875  if supervised:
876  final_ar_model['metrics']['accuracy']['train'] = \
877  self._accuracy(objective=objective_column, dataframe=prediction_train, tolerance=tolerance)
878  self._logging.log_exec(analysis_id,
879  self._spark_session.sparkContext.applicationId, self._labels["model_tacc"],
880  model_id + ' - ' + str(final_ar_model['metrics']['accuracy']['train']))
881  final_ar_model['tolerance'] = tolerance
882  else:
883  final_ar_model['metrics']['accuracy']['train'] = 0.0
884 
885  if test_frame is not None:
886  prediction_test = self._model_base.transform(test_frame)
887  if supervised:
888  final_ar_model['metrics']['accuracy']['test'] = \
889  self._accuracy(objective=objective_column, dataframe=prediction_test, tolerance=tolerance)
890 
891  train_balance = self._config['frameworks']['spark']['conf']['train_balance_metric']
892  test_balance = 1 - train_balance
893  final_ar_model['metrics']['accuracy']['combined'] = \
894  (final_ar_model['metrics']['accuracy']['train']*train_balance +
895  final_ar_model['metrics']['accuracy']['test']*test_balance)
896 
897  self._logging.log_exec(analysis_id, self._spark_session.sparkContext.applicationId,
898  self._labels["model_pacc"],
899  model_id + ' - ' + str(final_ar_model['metrics']['accuracy']['test']))
900 
901  self._logging.log_exec(analysis_id, self._spark_session.sparkContext.applicationId,
902  self._labels["model_cacc"],
903  model_id + ' - ' + str(final_ar_model['metrics']['accuracy']['combined']))
904  else:
905 
906  final_ar_model['metrics']['accuracy']['test'] = 0.0
907  final_ar_model['metrics']['accuracy']['combined'] = 0.0
908 
909  # Generating model metrics
910  final_ar_model['metrics']['model'] = self._generate_model_metrics()
911  self._logging.log_info(analysis_id,
912  self._spark_session.sparkContext.applicationId,
913  self._labels["gmodel_metric"], model_id)
914 
915  # Generating Variable importance
916  final_ar_model['metrics']['var_importance'] = self._generate_importance_variables(column_chain=column_chain)
917  self._logging.log_info(analysis_id,
918  self._spark_session.sparkContext.applicationId,
919  self._labels["gvar_metric"], model_id)
920 
921  # Generating scoring_history
922  final_ar_model['metrics']['scoring'] = self._generate_scoring_history()
923  self._logging.log_info(analysis_id,
924  self._spark_session.sparkContext.applicationId,
925  self._labels["gsco_metric"], model_id)
926 
927  final_ar_model['status'] = self._labels['success_op']
928 
929  except Exception as execution_error:
930  for handler in self._logging.logger.handlers:
931  handler.flush()
932  # Generating aditional model parameters Model_ID
933  final_ar_model['execution_seconds'] = time.time() - start
934  aborted = True
935  final_ar_model['model_parameters']['spark']['parameters']['model_id'] = ParameterMetadata()
936  final_ar_model['model_parameters']['spark']['parameters']['model_id'].set_value(value=model_id,
937  seleccionable=False,
938  type="str")
939  self._logging.log_info(analysis_id,
940  self._spark_session.sparkContext.applicationId, self._labels["abort_data_nc"],
941  model_id)
942  # Filling whole json ar.json
943  final_ar_model['ignored_parameters'], \
944  final_ar_model['full_parameters_stack'] = self._generate_params(modeldef=modeldef)
945  # Filling whole json ar.json
946  final_ar_model['ignored_parameters'], \
947  final_ar_model['full_parameters_stack'] = self._generate_params(modeldef=modeldef)
948 
949  final_ar_model['status'] = self._labels["failed_op"]
950  self._logging.log_critical(analysis_id,
951  self._spark_session.sparkContext.applicationId,
952  self._labels["abort"],
953  repr(execution_error))
954  final_ar_model['metrics'] = OrderedDict()
955  final_ar_model['metrics']['accuracy'] = OrderedDict()
956  final_ar_model['metrics']['accuracy']['train'] = 0.0
957  final_ar_model['metrics']['accuracy']['test'] = 0.0
958  final_ar_model['metrics']['accuracy']['combined'] = 0.0
959  final_ar_model['metrics']['execution'] = OrderedDict()
960  final_ar_model['metrics']['execution']['train'] = OrderedDict()
961  final_ar_model['metrics']['execution']['train']['RMSE'] = 1e+16
962  final_ar_model['metrics']['execution']['train']['tot_withinss'] = 1e+16
963  final_ar_model['metrics']['execution']['train']['betweenss'] = 1e+16
964  final_ar_model['metrics']['execution']['test'] = OrderedDict()
965  final_ar_model['metrics']['execution']['test']['RMSE'] = 1e+16
966  final_ar_model['metrics']['execution']['test']['tot_withinss'] = 1e+16
967  final_ar_model['metrics']['execution']['test']['betweenss'] = 1e+16
968 
969  finally:
970  generate_json_path(self._ec, final_ar_model)
971  self._persistence.store_json(storage_json=final_ar_model['json_path'], ar_json=final_ar_model)
972 
973  self._logging.log_info(analysis_id,
974  self._spark_session.sparkContext.applicationId,
975  self._labels["model_stored"], model_id)
976  self._logging.log_info(analysis_id,
977  self._spark_session.sparkContext.applicationId,
978  self._labels["end"], model_id)
979  if not aborted:
980  self.store_model(final_ar_model)
981 
982  for handler in self._logging.logger.handlers:
983  handler.flush()
984  return analysis_id, final_ar_model
985 
986 
990  def _get_dataframe(self, pframe, hash_value, type):
991 
992  if self._frame_list.get(hash_value) is None or self._frame_list[hash_value].get(type) is None:
993  frame = self._spark_session.createDataFrame(pframe).cache()
994  self._logging.log_info(self._ec.get_id_analysis(),
995  self._spark_session.sparkContext.applicationId,
996  self._labels["parsing_to_spark"],
997  type + '_frame(' + str(frame.count()) + ')')
998  if self._frame_list.get(hash_value) is None:
999  self._frame_list[hash_value] = dict()
1000  self._frame_list[hash_value][type] = frame
1001  else:
1002  frame = self._frame_list[hash_value][type]
1003  self._logging.log_info(self._ec.get_id_analysis(),
1004  self._spark_session.sparkContext.applicationId,
1005  self._labels["getting_from_spark"],
1006  type + '_frame(' + str(frame.count()) + ')')
1007  return frame
1008 
1009 
1012  def store_model(self, armetadata):
1013  saved_model = False
1014 
1015  fw = get_model_fw(armetadata)
1016  model_id = armetadata['model_parameters'][fw]['parameters']['model_id']['value']
1017 
1018  #Updating status
1019  armetadata['status'] = self._labels["success_op"]
1020  # Generating load_path
1021  load_storage = StorageMetadata(self._ec)
1022  for each_storage_type in load_storage.get_load_path():
1023  source_data = list()
1024  primary_path = self._config['storage'][each_storage_type['type']]['value']
1025  source_data.append(primary_path)
1026  source_data.append('/')
1027  source_data.append(armetadata['user_id'])
1028  source_data.append('/')
1029  source_data.append(armetadata['workflow_id'])
1030  source_data.append('/')
1031  source_data.append(armetadata['model_id'])
1032  source_data.append('/')
1033  source_data.append(fw)
1034  source_data.append('/')
1035  source_data.append(armetadata['type'])
1036  source_data.append('/')
1037  source_data.append(str(armetadata['timestamp']))
1038  source_data.append('/')
1039 
1040  load_path = ''.join(source_data) + each_storage_type['value']+'/'
1041  self._persistence.mkdir(type=each_storage_type['type'], path=load_path,
1042  grants=self._config['storage']['grants'])
1043  if each_storage_type['type'] == 'hdfs':
1044  load_path = self._config['storage'][each_storage_type['type']]['uri'] + load_path
1045 
1046  self._model_base.write().overwrite().save(path=load_path + model_id + self._get_ext())
1047 
1048  load_storage.append(value=load_path + model_id + self._get_ext(),
1049  fstype=each_storage_type['type'], hash_type=each_storage_type['hash_type'])
1050  saved_model = True
1051 
1052  armetadata['load_path'] = load_storage
1053 
1054  self._logging.log_exec(self._ec.get_id_analysis(),
1055  self._spark_session.sparkContext.applicationId, self._labels["msaved"],
1056  model_id)
1057 
1058  self._persistence.store_json(storage_json=armetadata['json_path'], ar_json=armetadata)
1059  self._logging.log_info(self._ec.get_id_analysis(),
1060  self._spark_session.sparkContext.applicationId,
1061  self._labels["model_stored"], model_id)
1062 
1063  return saved_model
1064 
1065 
1068  def load_model(self, armetadata):
1069  from_disk = False
1070 
1071  fw = get_model_fw(armetadata)
1072  model_id = armetadata['model_parameters'][fw]['parameters']['model_id']['value']
1073 
1074  load_fail, from_disk = self._get_model(base_ar=armetadata, base_model_id=model_id, remove_model=from_disk)
1075  if load_fail:
1076  return None
1077  else:
1078  return armetadata
1079 
1080 
1088  def predict(self, predict_frame, base_ar, **kwargs):
1089 
1090  for pname, pvalue in kwargs.items():
1091  None
1092 
1093  remove_model = False
1094  model_timestamp = str(time.time())
1095  self._ec.set_id_analysis(base_ar['model_id'])
1096  analysis_id = self._ec.get_id_analysis()
1097  base_model_id = base_ar['model_parameters']['spark']['parameters']['model_id']['value'] + self._get_ext()
1098  model_id = base_model_id + '_' + model_timestamp
1099  antype = base_ar['model_parameters']['spark']['types'][0]['type']
1100 
1101  modelid = base_ar['model_parameters']['spark']['model']
1102  base_ns = get_model_ns(base_ar)
1103 
1104  #Checking file source versus hash_value
1105  load_fails, remove_model = self._get_model(base_ar, base_model_id, remove_model)
1106 
1107  if load_fails or self._model_base is None:
1108  self._logging.log_critical(analysis_id, self._spark_session.sparkContext.applicationId,
1109  self._labels["no_models"], base_model_id)
1110  base_ar['status'] = self._labels['failed_op'] # Default Failed Operation Code
1111  return None
1112 
1113  objective_column = base_ar['objective_column']
1114 
1115  exist_objective = True
1116  if objective_column is None:
1117  exist_objective = False
1118  if exist_objective:
1119  self._logging.log_info(analysis_id, self._spark_session.sparkContext.applicationId, self._labels["objective"],
1120  objective_column)
1121  # Recovering tolerance
1122  tolerance = base_ar['tolerance']
1123 
1124  data_initial = DFMetada()
1125  data_initial.getDataFrameMetadata(dataframe=predict_frame, typedf='pandas')
1126  base_ar['data_initial'] = data_initial
1127 
1128  if objective_column in list(predict_frame.columns.values):
1129  try:
1130  self._logging.log_info(analysis_id, self._spark_session.sparkContext.applicationId,
1131  self._labels["cor_struct"], str(data_initial['correlation'][objective_column]))
1132  except KeyError:
1133  self._logging.log_exec(analysis_id, self._spark_session.sparkContext.applicationId,
1134  self._labels["cor_struct"], str(data_initial['correlation']))
1135  npredict_frame, data_normalized, _, norm_executed, _ = self.execute_normalization(dataframe=predict_frame,
1136  base_ns=base_ns,
1137  model_id=modelid,
1138  filtering='DROP',
1139  exist_objective=True)
1140 
1141  else:
1142  npredict_frame, data_normalized, _, norm_executed, _ = self.execute_normalization(dataframe=predict_frame,
1143  base_ns=base_ns,
1144  model_id=modelid,
1145  filtering='DROP',
1146  exist_objective=False)
1147 
1148  if not norm_executed:
1149  self._logging.log_exec(analysis_id, self._spark_session.sparkContext.applicationId, self._labels["exec_norm"],
1150  'No Normalizations Required')
1151  else:
1152  # Transforming original dataframe to sparkFrame
1153  '''predict_frame = self._spark_session.createDataFrame(predict_frame).cache()
1154  self._logging.log_info(analysis_id, self._spark_session.sparkContext.applicationId, self._labels["parsing_to_spark"],
1155  'test_frame (' + str(predict_frame.count()) + ')')'''
1156 
1157  base_ar['data_normalized'] = data_normalized
1158  if objective_column in list(npredict_frame.columns.values):
1159  try:
1160  self._logging.log_exec(analysis_id, self._spark_session.sparkContext.applicationId, self._labels["cor_struct"],
1161  str(data_normalized['correlation'][objective_column]))
1162  except KeyError:
1163  self._logging.log_exec(analysis_id, self._spark_session.sparkContext.applicationId, self._labels["no_cor_struct"],
1164  str(data_normalized['correlation']))
1165 
1166  #Transforming to sparkFrame
1167  npredict_frame = self._spark_session.createDataFrame(npredict_frame).cache()
1168  self._logging.log_info(analysis_id, self._spark_session.sparkContext.applicationId, self._labels["parsing_to_spark"],
1169  'test_frame (' + str(npredict_frame.count()) + ')')
1170 
1171  base_ar['type'] = 'predict'
1172  self._logging.log_info(self._ec.get_id_analysis(), self._spark_session.sparkContext.applicationId,
1173  self._labels["action_type"], base_ar['type'])
1174 
1175  base_ar['timestamp'] = model_timestamp
1176 
1177  self._logging.log_info(analysis_id, self._spark_session.sparkContext.applicationId,
1178  self._labels['st_predict_model'],
1179  base_model_id)
1180  if base_ar['metrics']['execution']['predict']['decoder'] is not None:
1181  decoder = self._model_base.stages[base_ar['metrics']['execution']['predict']['decoder']]
1182  else:
1183  decoder = None
1184 
1185  if objective_column in npredict_frame.columns:
1186  for element in npredict_frame.dtypes:
1187  if element[0] == objective_column:
1188  if element[1] == 'string':
1189  objective_column = objective_column + '_to_index'
1190  objective_type = 'double'
1191  else:
1192  objective_type = element[1]
1193  else:
1194  objective_type = None
1195 
1196  start = time.time()
1197 
1198  if exist_objective:
1199  accuracy, prediction_dataframe = self._predict_accuracy(objective_column, npredict_frame,
1200  tolerance=tolerance)
1201 
1202  base_ar['execution_seconds'] = time.time() - start
1203  base_ar['tolerance'] = tolerance
1204 
1205  #prediction_dataframe = prediction_dataframe.toPandas()
1206  else:
1207  if antype == 'clustering':
1208  if norm_executed:
1209  accuracy, prediction_dataframe = self._predict_clustering(npredict_frame)
1210  else:
1211  accuracy, prediction_dataframe = self._predict_clustering(npredict_frame)
1212 
1213  base_ar['execution_seconds'] = time.time() - start
1214  #prediction_dataframe = prediction_dataframe.toPandas()
1215 
1216 
1217  if not exist_objective or objective_type is not None:
1218  self._logging.log_info(analysis_id, self._spark_session.sparkContext.applicationId,
1219  self._labels["gexec_metric"], model_id)
1220 
1221  base_ar['metrics']['execution'][base_ar['type']] = self._generate_execution_metrics( \
1222  dataframe=prediction_dataframe,
1223  objective_column=objective_column,
1224  antype=antype)
1225  if objective_column in prediction_dataframe.columns:
1226  base_ar['metrics']['accuracy']['predict'] = accuracy
1227  self._logging.log_info(analysis_id,
1228  self._spark_session.sparkContext.applicationId, self._labels["model_pacc"],
1229  base_model_id + ' - ' + str(base_ar['metrics']['accuracy']['predict']))
1230 
1231  base_ar['status'] = self._labels['success_op']
1232 
1233  if decoder is not None:
1234  labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
1235  labels=decoder.labels)
1236  prediction_dataframe = labelConverter.transform(prediction_dataframe).drop("prediction")
1237  prediction_dataframe = prediction_dataframe.withColumnRenamed("predictedLabel", "predict")
1238  else:
1239  prediction_dataframe = prediction_dataframe.withColumnRenamed("prediction", "predict")
1240 
1241  command = list()
1242  command.append("prediction_dataframe.select(")
1243  command.append('\"predict\"')
1244  if antype in ['binomial', 'multinomial']:
1245  command.append(', \"probability\"')
1246  command.append(").toPandas()")
1247 
1248  presults = eval("".join(command))
1249  prediction =predict_frame.copy()
1250  prediction['predict'] = presults.loc[:, 'predict']
1251  if antype in ['binomial', 'multinomial']:
1252  prediction['probability'] = presults.loc[:, 'probability']
1253 
1254  # writing metadata predict.json file
1255  prediction_json = OrderedDict()
1256  prediction_json['metadata'] = OrderedDict()
1257  prediction_json['metadata']['user_id'] = self._ec.get_id_user()
1258  prediction_json['metadata']['timestamp'] = model_timestamp
1259  prediction_json['metadata']['workflow_id'] = self._ec.get_id_workflow()
1260  prediction_json['metadata']['analysis_id'] = self._ec.get_id_analysis()
1261  prediction_json['metadata']['model_id'] = base_ar['model_parameters']['spark']['parameters']['model_id']['value']
1262 
1263  # writing data predict.json file
1264  prediction_json['data'] = OrderedDict()
1265  if isinstance(prediction, DataFrame):
1266  prediction_json['data'] = prediction.to_dict(orient='records')
1267  else:
1268  prediction_json['data'] = OrderedDict(prediction)
1269 
1270  # writing file predict.json file
1271  generate_json_path(self._ec, base_ar, 'prediction')
1272  self._persistence.store_json(storage_json=base_ar['prediction_path'], ar_json=base_ar, other=prediction_json)
1273  self._logging.log_exec(analysis_id,
1274  self._spark_session.sparkContext.applicationId,
1275  self._labels["prediction_stored"], model_id)
1276 
1277  # writing ar.json file
1278  generate_json_path(self._ec, base_ar)
1279  self._persistence.store_json(storage_json=base_ar['json_path'], ar_json=base_ar)
1280  self._logging.log_exec(analysis_id,
1281  self._spark_session.sparkContext.applicationId,
1282  self._labels["model_stored"], model_id)
1283 
1284  self._logging.log_info(analysis_id,
1285  self._spark_session.sparkContext.applicationId,
1286  self._labels["end"], model_id)
1287  for handler in self._logging.logger.handlers:
1288  handler.flush()
1289 
1290  return prediction, base_ar
1291 
1292 
1298  def _get_model(self, base_ar, base_model_id, remove_model):
1299  load_fails = self._get_model_from_load_path(base_ar)
1300  remove_model = True
1301  return load_fails, remove_model
1302 
1303 
1307  def remove_models(self, arlist):
1308  remove_fails = False
1309  try:
1310  assert isinstance(arlist, list)
1311  except AssertionError:
1312  return remove_fails
1313 
1314  persistence = PersistenceHandler(self._ec)
1315  for ar_metadata in arlist:
1316  try:
1317  assert isinstance(ar_metadata['load_path'], list)
1318  except AssertionError:
1319  return True
1320 
1321  _, ar_metadata['load_path'] = persistence.remove_file(load_path=ar_metadata['load_path'])
1322 
1323  if len(ar_metadata['load_path']) == 0:
1324  ar_metadata['load_path'] = None
1325  else:
1326  remove_fails = True
1327 
1328  persistence.store_json(storage_json=ar_metadata["json_path"], ar_json=ar_metadata)
1329 
1330  del persistence
1331  return remove_fails
1332 
1333 
def generate_commands_parameters(each_model, model_command):
    """Append ", key=value" command fragments for every selectable parameter.

    Modifies model_command in place. String values are single-quoted; other
    values are rendered verbatim. Parameters whose value is None are skipped
    so the model's own default applies.

    :param each_model: model definition dict with a 'parameters' mapping of
                       {name: {'seleccionable': bool, 'value': any}}
    :param model_command: list of command fragments, extended in place
    """
    for key, value in each_model['parameters'].items():
        if value['seleccionable']:
            if isinstance(value['value'], str):
                model_command.append(", %s='%s'" % (key, value['value']))
            elif value['value'] is not None:
                # Bug fix: the original tested `value is not None`, which is
                # always true for dict items; the intended guard is on the
                # parameter's value, skipping unset (None) parameters.
                model_command.append(", %s=%s" % (key, value['value']))
1345 
1346 
def get_tolerance(columns, objective_column, tolerance=0.0):
    """Compute the accuracy tolerance threshold for regression analysis.

    :param columns: column metadata dicts (with 'name', 'type', 'min', 'max')
    :param objective_column: name of the objective column
    :param tolerance: either a plain number (returned as-is) or a dict with
                      'enable_fixed'/'fixed' or 'percentage' of the objective
                      column's value range
    :return: numeric threshold
    """
    if not isinstance(tolerance, dict):
        # Plain numeric tolerance: use it directly
        return tolerance
    if tolerance['enable_fixed']:
        return tolerance['fixed']
    # Percentage mode: derive the threshold from the objective's value range
    bounds = None
    for column in columns:
        if column["name"] == objective_column and column["type"] in DTYPES:
            bounds = (float(column["min"]), float(column["max"]))
    if bounds is None:
        return 0
    return (bounds[1] - bounds[0]) * tolerance['percentage']
Define all objects, functions and structured related to Analysis_Results for one execution (final jso...
Definition: armetadata.py:1
def _generate_execution_metrics(self, dataframe, antype, objective_column)
Generate execution metrics for the correct model.
Class DFMetadata manage the Data Analysis results structs on OrderedDict format and exportable to jso...
Definition: dfmetada.py:28
def _generate_importance_variables(self, column_chain)
Generate variable importance metrics.
Define all objects, functions and structured related to Data Analysis of input data on OrderedDict fo...
Definition: dfmetada.py:1
def delete_frames(self)
Not Used: Remove used dataframes during analysis execution_.
def define_special_spark_naive_norm(self, df_metadata)
Method to generate special normalizations for Naive non negative work restrictions.
def generate_commands_parameters(each_model, model_command)
auxiliary function (procedure) to generate model and train chain paramters to execute models Modify m...
Define all objects, functions and structs related to common utilities not associated to one concrete ...
Definition: utils.py:1
Define Base Metric object as OrderedDict() of common measures for all metrics types on an unified way...
def load_model(self, armetadata)
Method to load model from persistence layer by armetadata.
def predict(self, predict_frame, base_ar, kwargs)
Main method to execute predictions over traning models Take the ar.json for and execute predictions i...
Define all objects, functions and structures related to logging event on DayF product logs...
Definition: logshandler.py:1
Class Base for metricts as OrderedDict.
def shutdown_cluster(cls)
Class Method for cluster shutdown.
Class storage metadata format [{value: , fstype:['localfs', 'hdfs', 'mongoDB'], hash_value : ""...
def order_training(self, training_pframe, base_ar, kwargs)
Main method to execute sets of analysis and normalizations base on params.
Class oriented to manage all messages and interaction with DayF product logs.
Definition: logshandler.py:23
def _generate_model_metrics(self)
Generate model summary metrics.
def _predict_clustering(self, dataframe, objective=None)
Generate detected anomalies on dataframe.
def __init__(self, e_c)
Constructor Initialize all framework variables and starts or connect to spark cluster Aditionally sta...
Definition: sparkhandler.py:79
def _generate_scoring_history(self)
Generate model scoring_history metrics.
def _generate_params(self, modeldef)
Generate model full values parameters for execution analysis.
def get_metric(self, algorithm_description, metric, source)
Get one especific metric for execution metrics Not tested yet.
def _get_evaluator(analysis_type, objective_column=None)
Get Evaluator for model.
Define Regression Metric object as OrderedDict() of common measures for all frameworks on an unified ...
def _get_model(self, base_ar, base_model_id, remove_model)
Internal method to get an sparkmodel from server or file transparent to user.
def _get_ext(self)
Generate extension for diferente saving modes.
def generate_base_path(self, base_ar, type_)
Generate base path to store all files [models, logs, json] relative to it.
Define all objects, functions and structures related to physically store information on persistence s...
Define Multinomial Metric object as OrderedDict() of common measures for all frameworks on an unified...
def _get_dataframe(self, pframe, hash_value, type)
Method to parse and reuse Spark Dataframes.
def get_tolerance(columns, objective_column, tolerance=0.0)
Auxiliary function to get the level of tolerance for regression analysis.
def _get_dtype(list_dtypes, column)
Get Spark dtype for column.
Define common execution base structure as OrderedDict() of common datasets on an unified way...
def _accuracy(self, objective, dataframe, tolerance=0.0)
Generate accuracy metrics for model for regression assume tolerance on results equivalent to 2*tolera...
def get_external_model(self, ar_metadata, type)
Generate pdml model class_.
def _predict_accuracy(self, objective, dataframe, tolerance=0.0)
Generate accuracy metrics for model for regression assume tolerance on results equivalent to 2*tolera...
def _get_model_from_load_path(self, ar_metadata)
Not Used: Load a model in sparkCluster from disk.
def remove_models(self, arlist)
Method to remove list of model from disk.
def store_model(self, armetadata)
Method to save model to persistence layer from armetadata.
Define Regression Metric object as OrderedDict() of common measures for all frameworks on an unified ...
def is_alive(self)
Is alive_connection method.
Define all objects, functions and structured related to manage Model Parameters Structure: OrderedDic...
Define Binomial Metric object as OrderedDict() of common measures for all frameworks on an unified wa...
def connect(self)
Connexion_method to cluster If cluster is up connect to cluster on another case start a cluster...
Class oriented to manage normalizations on dataframes for improvements on accuracy.
Definition: normalizer.py:26
Define all objects, functions and structured related to adding storage information metadata (json str...
Class to manage trasient information between all persistence options and models on an unified way...
Define Regression Metric object as OrderedDict() of common measures for all frameworks on an unified ...
def execute_normalization(self, dataframe, base_ns, model_id, filtering='NONE', exist_objective=True)
Method to execute normalizations base on params.