DayF core  1.2.1.2
DayF (Decision at your Fingertips) is an AutoML freeware development framework that lets developers work with Machine Learning models without any AI expertise, simply by taking a CSV dataset and the objective column
normalizer.py
1 
2 
3 '''
4 Copyright (C) e2its - All Rights Reserved
5  * Unauthorized copying of this file, via any medium is strictly prohibited
6  * Proprietary and confidential
7  *
8  * This file is part of gDayF project.
9  *
10  * Written by Jose L. Sanchez <e2its.es@gmail.com>, 2016-2019
11 '''
12 
13 import numpy as np
14 import pandas as pd
15 from datetime import datetime
16 from collections import OrderedDict
17 from gdayf.conf.loadconfig import LoadConfig
18 from gdayf.conf.loadconfig import LoadLabels
19 from gdayf.logs.logshandler import LogsHandler
20 from gdayf.common.constants import DTYPES, NO_STANDARDIZE
21 from gdayf.common.normalizationset import NormalizationSet
22 from copy import deepcopy
23 
24 
25 
class Normalizer (object):
    """Apply and manage normalization operations over pandas DataFrames.

    Normalization "sets" are one-key dicts {column_name: NormalizationSet}
    produced by the define_* methods and consumed by normalizeDataFrame.
    Configuration and log labels come from the execution context's
    'normalizer' sections.
    """

    def __init__(self, e_c):
        """Constructor.

        :param e_c: execution context providing `config` and `labels`
            loaders plus everything LogsHandler needs
        """
        self._ec = e_c
        # 'normalizer' section of the product configuration
        self._config = self._ec.config.get_config()['normalizer']
        # localized log-message labels for this subsystem
        self._labels = self._ec.labels.get_config()['messages']['normalizer']
        self._logging = LogsHandler(self._ec, __name__)
36 
    def define_normalizations(self, dataframe_metadata, an_objective, objective_column):
        """Build the list of non-minimal normalization sets for a dataset.

        Marks heavily-missing columns as ignored and schedules std/mean
        standardization for numeric columns whose spread exceeds the
        configured threshold (clustering and non-clustering objectives use
        separate enable flags).

        :param dataframe_metadata: dict with 'type', 'rowcount' and
            'columns' (per-column stats: name, type, missed, cardinality,
            mean, std, min, max)
        :param an_objective: list of objective descriptors; only
            an_objective[0]['type'] is consulted
        :param objective_column: target column name, never normalized
        :return: list of {column: NormalizationSet} entries, or None when
            disabled, empty, or the metadata is not pandas-typed
        """
        if not self._config['non_minimal_normalizations_enabled']:
            return None
        else:
            df_type = dataframe_metadata['type']
            rowcount = dataframe_metadata['rowcount']
            #cols = dataframe_metadata['cols']
            columns = dataframe_metadata['columns']
            norms = list()
            # NOTE(review): a single NormalizationSet instance is reused for
            # every column; assumes each set_* call resets the descriptor
            # before copy() — confirm in NormalizationSet.
            normoption = NormalizationSet()
            if df_type == 'pandas':
                for description in columns:
                    col = description['name']
                    if col != objective_column:
                        # Ignore columns whose missing ratio crosses the
                        # configured exclusion threshold
                        if int(description['missed']) > 0 and \
                                (int(description['missed'])/rowcount >= self._config['exclusion_missing_threshold']):
                            normoption.set_ignore_column()
                            norms.append({col: normoption.copy()})
                        # Standardization for clustering objectives
                        if self._config['clustering_standardize_enabled'] and an_objective[0]['type'] in ['clustering'] \
                                and description['type'] in DTYPES \
                                and int(description['cardinality']) > 1 and description['mean'] != 0.0 and \
                                description['std'] != 1.0 \
                                and (
                                float(description['std']) / (float(description['max']) - float(description['min']))) \
                                > self._config['std_threshold']:
                            normoption.set_stdmean(description['mean'], description['std'])
                            norms.append({col: normoption.copy()})
                        # Standardization for every other objective type
                        if self._config['standardize_enabled'] and description['type'] in DTYPES \
                                and an_objective[0]['type'] not in ['clustering']\
                                and int(description['cardinality']) > 1 and description['mean'] != 0.0 and \
                                description['std'] != 1.0 \
                                and(float(description['std']) / (float(description['max']) - float(description['min']))) \
                                > self._config['std_threshold']:
                            normoption.set_stdmean(description['mean'], description['std'])
                            norms.append({col: normoption.copy()})

                self._logging.log_exec('gDayF', "Normalizer", self._labels["norm_set_establish"], norms)
                if len(norms) != 0:
                    return norms.copy()
                else:
                    return None
            else:
                return None
84 
85 
89  def define_ignored_columns(self, dataframe_metadata, objective_column):
90  if not self._config['non_minimal_normalizations_enabled']:
91  return None
92  else:
93  df_type = dataframe_metadata['type']
94  rowcount = dataframe_metadata['rowcount']
95  # cols = dataframe_metadata['cols']
96  columns = dataframe_metadata['columns']
97  norms = list()
98  normoption = NormalizationSet()
99  if df_type == 'pandas':
100  for description in columns:
101  col = description['name']
102  if col != objective_column:
103  if int(description['cardinality']) == 1:
104  normoption.set_ignore_column()
105  norms.append({col: normoption.copy()})
106  elif self._config['datetime_columns_management'] is not None \
107  and self._config['datetime_columns_management'] \
108  and description['type'] == 'datetime64[ns]':
109  normoption.set_ignore_column()
110  norms.append({col: normoption.copy()})
111  self._logging.log_exec('gDayF', "Normalizer", self._labels["ignored_set_establish"], norms)
112  if len(norms) != 0:
113  return norms.copy()
114  else:
115  return None
116  else:
117  return None
118 
119 
124  def define_special_spark_naive_norm(self, dataframe_metadata):
125  df_type = dataframe_metadata['type']
126  if df_type == 'pandas':
127  norms = list()
128  normoption = NormalizationSet()
129  columns = dataframe_metadata['columns']
130  norms = list()
131  for description in columns:
132  col = description['name']
133  if description['min'] is not None and float(description['min']) < 0.0:
134  normoption.set_offset(offset=abs(float(description['min']))
135  * self._config['special_spark_naive_offset'])
136  norms.append({col: normoption.copy()})
137 
138  return norms.copy()
139  else:
140  return None
141 
142 
    def define_minimal_norm(self, dataframe_metadata, an_objective, objective_column):
        """Build the minimal normalization sets (base type coercion,
        drop-missing on the objective, missing-value imputation).

        :param dataframe_metadata: dict with 'type', 'rowcount', 'columns'
        :param an_objective: list of objective descriptors; only
            an_objective[0]['type'] is consulted
        :param objective_column: target column name, or None for
            unsupervised use
        :return: [None] when disabled; otherwise a list of
            {column: NormalizationSet} entries.
            NOTE(review): when objective_column is set and df_type is not
            'pandas' this falls through and returns None implicitly —
            confirm that callers accept None here.
        """
        df_type = dataframe_metadata['type']
        if not self._config['minimal_normalizations_enabled']:
            return [None]
        elif objective_column is None:
            # Unsupervised: only base coercion of object columns (without
            # datetime expansion)
            norms = list()
            normoption = NormalizationSet()
            columns = dataframe_metadata['columns']
            for description in columns:
                col = description['name']
                if description['type'] == "object" and self._config['base_normalization_enabled']:
                    normoption.set_base(datetime=False)
                    norms.append({col: normoption.copy()})
            return norms.copy()
        else:
            if df_type == 'pandas':
                rowcount = dataframe_metadata['rowcount']
                norms = list()
                normoption = NormalizationSet()
                # Rows with a missing objective value are always dropped
                normoption.set_drop_missing()
                norms.append({objective_column: normoption.copy()})

                columns = dataframe_metadata['columns']
                for description in columns:
                    col = description['name']
                    if col != objective_column:
                        if description['type'] == "object" and self._config['base_normalization_enabled']:
                            normoption.set_base()
                            norms.append({col: normoption.copy()})
                        # Impute columns whose missing ratio crosses the
                        # exclusion threshold; the strategy depends on the
                        # objective type
                        if int(description['missed']) > 0 and \
                                (int(description['missed'])/rowcount >= self._config['exclusion_missing_threshold']):
                            if an_objective[0]['type'] in ['binomial', 'multinomial'] and self._config['manage_on_train_errors']:
                                # Classification: per-class mean imputation
                                normoption.set_mean_missing_values(objective_column, full=False)
                                norms.append({col: normoption.copy()})
                            elif an_objective[0]['type'] in ['regression'] and self._config['manage_on_train_errors']:
                                # Regression: interpolated (progressive) imputation
                                normoption.set_progressive_missing_values(objective_column)
                                norms.append({col: normoption.copy()})
                            elif an_objective[0]['type'] in ['anomalies']:
                                normoption.set_mean_missing_values(objective_column, full=True)
                                norms.append({col: normoption.copy()})
                            else:
                                # Default: global mean imputation
                                normoption.set_mean_missing_values(objective_column, full=True)
                                norms.append({col: normoption.copy()})
                return norms.copy()
191 
192 
196  def filter_standardize(self, normalizemd, model_id):
197  filter_normalized = list()
198  for norm_set in normalizemd:
199  if norm_set is not None:
200  col = list(norm_set.keys())[0]
201  norms = norm_set.get(col)
202  if norms['class'] == 'stdmean' and model_id in NO_STANDARDIZE:
203  self._logging.log_info('gDayF', "Normalizer", self._labels["excluding"],
204  col + ' - ' + norms['class'])
205  else:
206  filter_normalized.append(norm_set)
207  return filter_normalized
208 
209 
212  def filter_drop_missing(self, normalizemd):
213  filter_normalized = list()
214  for norm_set in normalizemd:
215  if norm_set is not None:
216  col = list(norm_set.keys())[0]
217  norms = norm_set.get(col)
218  if norms['class'] == 'drop_missing':
219  self._logging.log_exec('gDayF', "Normalizer", self._labels["excluding"],
220  col + ' - ' + norms['class'])
221  else:
222  filter_normalized.append(norm_set)
223 
224  return filter_normalized
225 
226 
229  def filter_objective_base(self, normalizemd):
230  filter_normalized = list()
231  for norm_set in normalizemd:
232  if norm_set is not None:
233  col = list(norm_set.keys())[0]
234  norms = norm_set.get(col)
235  if norms['class'] == 'progressive_missing_values' or \
236  (norms['class'] == 'mean_missing_values' and not norms['objective']['full']):
237  self._logging.log_exec('gDayF', "Normalizer", self._labels["excluding"],
238  col + ' - ' + norms['class'])
239  else:
240  filter_normalized.append(norm_set)
241 
242  return filter_normalized
243 
244 
    def normalizeDataFrame(self, df, normalizemd):
        """Apply an ordered list of normalization sets to a DataFrame.

        Each entry of normalizemd is either None (skipped) or a one-key
        dict {column_name: descriptor}; the descriptor's 'class' selects
        the operation and 'objective' carries its parameters.

        :param df: pandas DataFrame to normalize (anything else is
            returned untouched)
        :param normalizemd: iterable of normalization sets
        :return: a normalized copy of df, or df itself when not a DataFrame
        """
        self._logging.log_info('gDayF', "Normalizer", self._labels["start_data_norm"], str(df.shape))
        if isinstance(df, pd.DataFrame):
            dataframe = df.copy()
            for norm_set in normalizemd:
                if norm_set is not None:
                    # Each set holds exactly one column -> descriptor pair
                    col = list(norm_set.keys())[0]
                    norms = norm_set.get(col)
                    if norms['class'] == 'base':
                        dataframe.loc[:, col] = self.normalizeBase(dataframe.loc[:, col])
                        self._logging.log_info('gDayF', "Normalizer", self._labels["applying"],
                                               col + ' - ' + norms['class'])
                        # If coercion produced a datetime column, optionally
                        # expand it into derived feature columns
                        if dataframe[col].dtype == '<M8[ns]' and norms['datetime']:
                            dataframe = self.normalizeDateTime(dataframe=dataframe, date_column=col)
                            if self._config['datetime_columns_management'] is not None \
                                    and self._config['datetime_columns_management']['enable']:
                                self._logging.log_info('gDayF', "Normalizer", self._labels["applying"],
                                                       col + ' - ' + str(self._config['datetime_columns_management']
                                                                         ['filter']))
                    elif norms['class'] == 'drop_missing':
                        try:
                            dataframe = self.normalizeDropMissing(dataframe, col)
                            self._logging.log_info('gDayF', "Normalizer", self._labels["applying"],
                                                   col + ' - ' + norms['class'])
                        except KeyError:
                            # Column not present (e.g. already excluded upstream)
                            self._logging.log_info('gDayF', "Normalizer", self._labels["excluding"],
                                                   col + ' - ' + norms['class'])
                    elif norms['class'] == 'stdmean':
                        dataframe.loc[:, col] = self.normalizeStdMean(dataframe.loc[:, col],
                                                                      norms['objective']['mean'],
                                                                      norms['objective']['std']
                                                                      )
                        self._logging.log_info('gDayF', "Normalizer", self._labels["applying"],
                                               col + ' - ' + norms['class'] + ' ( ' +
                                               str(norms['objective']['mean']) + ',' +
                                               str(norms['objective']['std']) + ' ) ')
                    elif norms['class'] == 'working_range':
                        dataframe.loc[:, col] = self.normalizeWorkingRange(dataframe.loc[:, col],
                                                                           norms['objective']['minval'],
                                                                           norms['objective']['maxval'],
                                                                           norms['objective']['minrange'],
                                                                           norms['objective']['maxrange']
                                                                           )
                        self._logging.log_info('gDayF', "Normalizer", self._labels["applying"],
                                               col + ' - ' + norms['class'] + ' ( ' +
                                               str(norms['objective']['minval']) + ',' +
                                               str(norms['objective']['maxval']) + ' ) ')
                    elif norms['class'] == 'offset':
                        dataframe.loc[:, col] = self.normalizeOffset(dataframe.loc[:, col],
                                                                     norms['objective']['offset'])
                        self._logging.log_info('gDayF', "Normalizer", self._labels["applying"],
                                               col + ' - ' + norms['class'] + ' ( ' +
                                               str(norms['objective']['offset']) + ' )')
                    elif norms['class'] == 'discretize':
                        dataframe.loc[:, col] = self.normalizeDiscretize(dataframe.loc[:, col],
                                                                         norms['objective']['buckets_number'],
                                                                         norms['objective']['fixed_size'])
                        self._logging.log_info('gDayF', "Normalizer", self._labels["applying"],
                                               col + ' - ' + norms['class'] + ' ( ' +
                                               str(norms['objective']['buckets_number']) + ',' +
                                               str(norms['objective']['fixed_size']) + ' ) ')
                    elif norms['class'] == 'aggregation':
                        dataframe.loc[:, col] = self.normalizeAgregation(dataframe.loc[:, col],
                                                                         norms['objective']['bucket_ratio'])
                        self._logging.log_info('gDayF', "Normalizer", self._labels["applying"],
                                               col + ' - ' + norms['class'] + ' ( ' +
                                               str(norms['objective']['bucket_ratio']) + ' ) ')
                    elif norms['class'] == 'fixed_missing_values':
                        dataframe.loc[:, col] = self.fixedMissingValues(dataframe.loc[:, col], norms['objective']['value'])
                        self._logging.log_info('gDayF', "Normalizer", self._labels["applying"],
                                               col + ' - ' + norms['class'] + ' ( ' +
                                               str(norms['objective']['value']) + ' ) ')
                    elif norms['class'] == 'mean_missing_values':
                        dataframe = self.meanMissingValues(dataframe,
                                                           col,
                                                           norms['objective']['objective_column'],
                                                           norms['objective']['full']
                                                           )
                        # Coerce to string only for the log message below
                        if norms['objective']['objective_column'] is None:
                            norms['objective']['objective_column'] = 'None'
                        self._logging.log_info('gDayF', "Normalizer", self._labels["applying"],
                                               col + ' - ' + norms['class'] + ' ( ' +
                                               norms['objective']['objective_column'] + ',' +
                                               str(norms['objective']['full']) + ' ) ')
                    elif norms['class'] == 'progressive_missing_values':
                        dataframe = self.progressiveMissingValues(dataframe,
                                                                  col,
                                                                  norms['objective']['objective_column'])
                        self._logging.log_info('gDayF', "Normalizer", self._labels["applying"],
                                               col + ' - ' + norms['class'] + ' ( ' +
                                               norms['objective']['objective_column'] + ' ) ')
                    elif norms['class'] == 'ignore_column':
                        # Ignored columns are handled by ignored_columns(); no-op here
                        pass
                    #elif norms['class'] == 'binary_encoding':
                        #self.normalizeBinaryEncoding(dataframe[col])
                    else:
                        self._logging.log_info('gDayF', "Normalizer", self._labels["nothing_to_do"])
            return dataframe
        else:
            return df
349 
350 
353  def ignored_columns(self, normalizemd):
354  ignored_list = list()
355  if normalizemd is not None:
356  for elements in normalizemd:
357  for col, value in elements.items():
358  if value['class'] == 'ignore_column':
359  ignored_list.append(col)
360  self._logging.log_info('gDayF', "Normalizer", self._labels["ignored_list"], ignored_list)
361  return ignored_list.copy()
362 
363 
367  def normalizeBase(self, dataframe):
368  if dataframe.dtype == np.object:
369  try:
370  return pd.to_numeric(dataframe)
371  except ValueError:
372  try:
373  return pd.to_datetime(dataframe)
374  except ValueError:
375  return pd.Categorical(dataframe)
376 
377 
382  def normalizeDropMissing(self, dataframe, col):
383  return dataframe.dropna(axis=0, subset=[col])
384 
385 
391  def normalizeWorkingRange(self, dataframe, minval=-1.0, maxval=1.0, minrange = -1.0, maxrange = 1.0):
392  assert(maxval > minval)
393  if dataframe.dtype != np.object:
394  if dataframe.dtype != np.object:
395  convert_factor = (maxrange - minrange) / (maxval - minval)
396  dataframe = dataframe.astype(np.float16)
397  dataframe = dataframe.apply(lambda x: (x-minval) * convert_factor + minrange)
398  return dataframe.copy()
399 
400 
406  def normalizeOffset(self, dataframe, offset=0):
407  if dataframe.dtype != np.object:
408  dataframe = offset + dataframe
409  return dataframe.copy()
410 
411 
416  def normalizeAgregation(self, dataframe, br=0.25):
417  if (dataframe.dtype != np.object):
418  buckets = int(1 / (br/2))
419  q, bins = pd.qcut(dataframe.iloc[:], buckets, retbins=True)
420  if dataframe.dtype != np.int:
421  dataframe[dataframe <= bins[1]] = np.int(dataframe[dataframe <= bins[1]].mean().copy())
422  dataframe[dataframe >= bins[-2]] = np.int(dataframe[dataframe >= bins[-2]].mean().copy())
423  else:
424  dataframe[dataframe <= bins[1]] = dataframe[dataframe <= bins[1]].mean().copy()
425  dataframe[dataframe <= bins[-2]] = dataframe[dataframe <= bins[-2]].mean().copy()
426  return dataframe.copy()
427 
428 
432  def normalizeBinaryEncoding(self, dataframe):
433  return dataframe.copy()
434 
435 
441  def normalizeStdMean(self, dataframe, mean, std):
442  if dataframe.dtype != np.object and dataframe.dtype != "datetime64[ns]":
443  try:
444  dataframe = dataframe.astype(np.float64)
445  dataframe = dataframe.apply(lambda x: x - float(mean))
446  dataframe = dataframe.apply(lambda x: x / float(std))
447  except ZeroDivisionError:
448  dataframe = dataframe.apply(lambda x: x + float(mean))
449  #dataframe = preprocessing.scale(dataframe)
450  return dataframe.copy()
451 
452 
458  def normalizeDiscretize(self, dataframe, buckets_number, fixed_size):
459  #Un número de buckets de tamaño fixed_size
460  if fixed_size:
461  return pd.qcut(dataframe, buckets_number)
462  else:
463  return pd.cut(dataframe, buckets_number)
464 
465 
470  def fixedMissingValues(self, dataframe, value=0.0):
471  return dataframe.fillna(value)
472 
473 
480  def meanMissingValues(self, dataframe, col, objective_col, full=False):
481  if full:
482  return dataframe.fillna(dataframe.mean())
483  else:
484  nullfalse = dataframe[dataframe[:][col].notnull()][[col, objective_col]]
485  if objective_col in DTYPES:
486  nullfalse_gb = nullfalse.groupby(objective_col).mean()
487  else:
488  nullfalse_gb = nullfalse.groupby(objective_col).agg(lambda x: x.value_counts().index[0])
489  for index, row in dataframe[dataframe[:][col].isnull()].iterrows():
490  row = row.copy()
491  if nullfalse_gb.index.isin([row[objective_col]]).any():
492  dataframe.loc[index, col] = nullfalse_gb.loc[row[objective_col], col]
493  return dataframe.copy()
494 
495 
    def progressiveMissingValues(self, dataframe, col, objective_col):
        """Impute missing values of `col` by linearly interpolating the
        per-objective-value group means of the non-missing rows.

        :param dataframe: pandas DataFrame (mutated in place, copy returned)
        :param col: column to impute
        :param objective_col: numeric column whose ordering drives the
            interpolation — presumably the regression target; confirm
            against define_minimal_norm usage
        :return: imputed copy of the DataFrame
        """
        # Non-missing rows ordered by objective value, restricted to the
        # two columns of interest
        nullfalse = dataframe[dataframe[:][col].notnull()].sort_values(objective_col,
                                                                       axis=0,
                                                                       ascending=True)[[col, objective_col]]
        # Mean of `col` for each distinct objective value
        nullfalse_gb = nullfalse.groupby(objective_col).mean()
        for index, row in dataframe[dataframe[:][col].isnull()].iterrows():
            row = row.copy()
            if nullfalse_gb.index.isin([row[objective_col]]).any():
                # Exact objective value seen among non-missing rows: use its
                # group mean directly
                dataframe.loc[index, col] = nullfalse_gb.loc[row[objective_col], col]
            else:
                # Nearest group indices strictly above/below this objective
                # value (NaN when none exists on that side)
                index_max = nullfalse_gb.index.where(nullfalse_gb.index > row[objective_col]).min()
                index_min = nullfalse_gb.index.where(nullfalse_gb.index < row[objective_col]).max()
                try:
                    if index_min is np.nan and index_max is np.nan \
                            or index_min is None or index_max is None:
                        # NOTE(review): this branch only passes and then falls
                        # through to the checks below, where a None neighbor
                        # can still be dereferenced — the except TypeError
                        # below appears to be the safety net; confirm.
                        pass
                    if index_min is np.nan or index_min is None:
                        # No lower neighbor: clamp to the upper group mean
                        dataframe.loc[index, col] = nullfalse_gb.loc[index_max, col]
                    elif index_max is np.nan or index_max is None:
                        # No upper neighbor: clamp to the lower group mean
                        dataframe.loc[index, col] = nullfalse_gb.loc[index_min, col]
                    else:
                        # Linear interpolation between the two neighboring
                        # group means
                        minimal = min(nullfalse_gb.loc[index_min, col], nullfalse_gb.loc[index_max, col])
                        maximal = max(nullfalse_gb.loc[index_min, col], nullfalse_gb.loc[index_max, col])
                        b = maximal - minimal
                        a = index_max - index_min
                        x = (row[objective_col] - index_min) / a
                        offset = b * x
                        dataframe.loc[index, col] = minimal + offset
                except TypeError:
                    # Non-numeric arithmetic or a NaN/None neighbor lookup:
                    # leave the value missing
                    pass
        return dataframe.copy()
533 
534 
539  def normalizeDateTime(self, dataframe, date_column=None):
540  datetime_columns_management = self._config['datetime_columns_management']
541  if date_column is not None:
542  if datetime_columns_management is not None and datetime_columns_management['enable']:
543  for element in datetime_columns_management['filter']:
544  try:
545  if element not in ['weekday', 'weeknumber']:
546  dataframe[date_column + '_' + element] = dataframe.loc[:, date_column]\
547  .transform(lambda x: eval('x.' + element))
548  elif element == 'weekday':
549  dataframe[date_column + '_' + element] = dataframe.loc[:, date_column]\
550  .transform(lambda x: x.isoweekday())
551  elif element == 'weeknumber':
552  dataframe[date_column + '_' + element] = dataframe.loc[:, date_column]\
553  .transform(lambda x: x.isocalendar()[1])
554  except AttributeError:
555  print('TRC: invalid configuration:' + element)
556  pass
557  return dataframe.copy()
558 
559 
560 
561 
def normalizeDateTime(self, dataframe, date_column=None)
Internal method oriented to manage date_time conversions to pattern.
Definition: normalizer.py:539
def normalizeWorkingRange(self, dataframe, minval=-1.0, maxval=1.0, minrange=-1.0, maxrange=1.0)
Internal method oriented to manage Working range normalizations on a [closed, closed] interval...
Definition: normalizer.py:391
def define_special_spark_naive_norm(self, dataframe_metadata)
Method oriented to specify special non-negative (offset) data normalizations for Spark naive models.
Definition: normalizer.py:124
def define_ignored_columns(self, dataframe_metadata, objective_column)
Method oriented to specificate ignored_columns.
Definition: normalizer.py:89
def filter_drop_missing(self, normalizemd)
Method oriented to filter drop_missing operations on non standardize algorithms.
Definition: normalizer.py:212
Define all objects, functions and structures related to logging event on DayF product logs...
Definition: logshandler.py:1
Class oriented to manage all messages and interaction with DayF product logs.
Definition: logshandler.py:23
def normalizeBase(self, dataframe)
Internal method oriented to manage base normalizations (coercion of object columns to numeric/datetime/categorical).
Definition: normalizer.py:367
def normalizeOffset(self, dataframe, offset=0)
Internal method oriented to apply a constant offset normalization to a numeric column.
Definition: normalizer.py:406
def define_minimal_norm(self, dataframe_metadata, an_objective, objective_column)
Method oriented to specify minimal data normalizations.
Definition: normalizer.py:147
def fixedMissingValues(self, dataframe, value=0.0)
Internal method oriented to manage imputation for missing values to fixed value.
Definition: normalizer.py:470
def normalizeDropMissing(self, dataframe, col)
Internal method oriented to drop rows with missing (NaN) values from the dataset.
Definition: normalizer.py:382
def normalizeDiscretize(self, dataframe, buckets_number, fixed_size)
Internal method oriented to manage bucketing for discretize.
Definition: normalizer.py:458
def normalizeDataFrame(self, df, normalizemd)
Main method oriented to define and manage normalizations sets applying normalizations.
Definition: normalizer.py:249
def ignored_columns(self, normalizemd)
Method oriented to generate ignored_column_list on issues where missed > exclusion_missing_threshold...
Definition: normalizer.py:353
Class NormalizationSet manage the Normalizations metadata as OrderedDict supporting Normalizer Class ...
def __init__(self, e_c)
Constructor.
Definition: normalizer.py:30
def normalizeBinaryEncoding(self, dataframe)
Internal method oriented to manage Binary encodings.
Definition: normalizer.py:432
def filter_objective_base(self, normalizemd)
Method oriented to filter filling_missing operations dependent of objective_column.
Definition: normalizer.py:229
def define_normalizations(self, dataframe_metadata, an_objective, objective_column)
Method oriented to specificate data_normalizations.
Definition: normalizer.py:41
def filter_standardize(self, normalizemd, model_id)
Method oriented to filter stdmean operations on non standardize algorithms.
Definition: normalizer.py:196
def normalizeAgregation(self, dataframe, br=0.25)
Internal method oriented to manage bucket ratio normalizations head - tail.
Definition: normalizer.py:416
def progressiveMissingValues(self, dataframe, col, objective_col)
Internal method oriented to manage progressive imputations for missing values.
Definition: normalizer.py:502
def meanMissingValues(self, dataframe, col, objective_col, full=False)
Internal method oriented to manage imputation for missing values to mean value.
Definition: normalizer.py:480
Class oriented to manage normalizations on dataframes for improvements on accuracy.
Definition: normalizer.py:26
Define all objects, functions and structs related to load on system all configuration parameter from ...
Definition: loadconfig.py:1
def normalizeStdMean(self, dataframe, mean, std)
Internal method oriented to manage mean and std normalizations.
Definition: normalizer.py:441