4 Copyright (C) e2its - All Rights Reserved 5 * Unauthorized copying of this file, via any medium is strictly prohibited 6 * Proprietary and confidential 8 * This file is part of gDayF project. 10 * Written by Jose L. Sanchez <e2its.es@gmail.com>, 2016-2019 15 from datetime
import datetime
16 from collections
import OrderedDict
22 from copy
import deepcopy
32 self.
_config = self.
_ec.config.get_config()[
'normalizer']
33 self.
_labels = self.
_ec.labels.get_config()[
'messages'][
'normalizer']
42 if not self.
_config[
'non_minimal_normalizations_enabled']:
45 df_type = dataframe_metadata[
'type']
46 rowcount = dataframe_metadata[
'rowcount']
48 columns = dataframe_metadata[
'columns']
51 if df_type ==
'pandas':
52 for description
in columns:
53 col = description[
'name']
54 if col != objective_column:
55 if int(description[
'missed']) > 0
and \
56 (int(description[
'missed'])/rowcount >= self.
_config[
'exclusion_missing_threshold']):
57 normoption.set_ignore_column()
58 norms.append({col: normoption.copy()})
59 if self.
_config[
'clustering_standardize_enabled']
and an_objective[0][
'type']
in [
'clustering'] \
60 and description[
'type']
in DTYPES \
61 and int(description[
'cardinality']) > 1
and description[
'mean'] != 0.0
and \
62 description[
'std'] != 1.0 \
64 float(description[
'std']) / (float(description[
'max']) - float(description[
'min']))) \
65 > self.
_config[
'std_threshold']:
66 normoption.set_stdmean(description[
'mean'], description[
'std'])
67 norms.append({col: normoption.copy()})
68 if self.
_config[
'standardize_enabled']
and description[
'type']
in DTYPES \
69 and an_objective[0][
'type']
not in [
'clustering']\
70 and int(description[
'cardinality']) > 1
and description[
'mean'] != 0.0
and \
71 description[
'std'] != 1.0 \
72 and(float(description[
'std']) / (float(description[
'max']) - float(description[
'min']))) \
73 > self.
_config[
'std_threshold']:
74 normoption.set_stdmean(description[
'mean'], description[
'std'])
75 norms.append({col: normoption.copy()})
77 self.
_logging.log_exec(
'gDayF',
"Normalizer", self.
_labels[
"norm_set_establish"], norms)
90 if not self.
_config[
'non_minimal_normalizations_enabled']:
93 df_type = dataframe_metadata[
'type']
94 rowcount = dataframe_metadata[
'rowcount']
96 columns = dataframe_metadata[
'columns']
99 if df_type ==
'pandas':
100 for description
in columns:
101 col = description[
'name']
102 if col != objective_column:
103 if int(description[
'cardinality']) == 1:
104 normoption.set_ignore_column()
105 norms.append({col: normoption.copy()})
106 elif self.
_config[
'datetime_columns_management']
is not None \
107 and self.
_config[
'datetime_columns_management'] \
108 and description[
'type'] ==
'datetime64[ns]':
109 normoption.set_ignore_column()
110 norms.append({col: normoption.copy()})
111 self.
_logging.log_exec(
'gDayF',
"Normalizer", self.
_labels[
"ignored_set_establish"], norms)
125 df_type = dataframe_metadata[
'type']
126 if df_type ==
'pandas':
129 columns = dataframe_metadata[
'columns']
131 for description
in columns:
132 col = description[
'name']
133 if description[
'min']
is not None and float(description[
'min']) < 0.0:
134 normoption.set_offset(offset=abs(float(description[
'min']))
135 * self.
_config[
'special_spark_naive_offset'])
136 norms.append({col: normoption.copy()})
148 df_type = dataframe_metadata[
'type']
149 if not self.
_config[
'minimal_normalizations_enabled']:
151 elif objective_column
is None:
154 columns = dataframe_metadata[
'columns']
155 for description
in columns:
156 col = description[
'name']
157 if description[
'type'] ==
"object" and self.
_config[
'base_normalization_enabled']:
158 normoption.set_base(datetime=
False)
159 norms.append({col: normoption.copy()})
162 if df_type ==
'pandas':
163 rowcount = dataframe_metadata[
'rowcount']
166 normoption.set_drop_missing()
167 norms.append({objective_column: normoption.copy()})
169 columns = dataframe_metadata[
'columns']
170 for description
in columns:
171 col = description[
'name']
172 if col != objective_column:
173 if description[
'type'] ==
"object" and self.
_config[
'base_normalization_enabled']:
174 normoption.set_base()
175 norms.append({col: normoption.copy()})
176 if int(description[
'missed']) > 0
and \
177 (int(description[
'missed'])/rowcount >= self.
_config[
'exclusion_missing_threshold']):
178 if an_objective[0][
'type']
in [
'binomial',
'multinomial']
and self.
_config[
'manage_on_train_errors']:
179 normoption.set_mean_missing_values(objective_column, full=
False)
180 norms.append({col: normoption.copy()})
181 elif an_objective[0][
'type']
in [
'regression']
and self.
_config[
'manage_on_train_errors']:
182 normoption.set_progressive_missing_values(objective_column)
183 norms.append({col: normoption.copy()})
184 elif an_objective[0][
'type']
in [
'anomalies']:
185 normoption.set_mean_missing_values(objective_column, full=
True)
186 norms.append({col: normoption.copy()})
188 normoption.set_mean_missing_values(objective_column, full=
True)
189 norms.append({col: normoption.copy()})
def filter_standardize(self, normalizemd, model_id):
    """Drop 'stdmean' normalization entries for models that must not be standardized.

    NOTE(review): reconstructed from a fragmentary source; the ``else`` branch
    (original line 205) is inferred — without it the function would append every
    entry and filter nothing. Verify against version control.

    :param normalizemd: iterable of {column: norm-options} mappings (entries may be None)
    :param model_id: model identifier tested for membership in NO_STANDARDIZE
    :return: list with the matching 'stdmean' entries removed (and logged)
    """
    filter_normalized = list()
    for norm_set in normalizemd:
        if norm_set is not None:
            col = list(norm_set.keys())[0]
            norms = norm_set.get(col)
            if norms['class'] == 'stdmean' and model_id in NO_STANDARDIZE:
                # Excluded entries are only logged, never kept.
                self._logging.log_info('gDayF', "Normalizer", self._labels["excluding"],
                                       col + ' - ' + norms['class'])
            else:
                filter_normalized.append(norm_set)
    return filter_normalized
def filter_drop_missing(self, normalizemd):
    """Drop 'drop_missing' normalization entries from a normalization set list.

    NOTE(review): reconstructed from a fragmentary source; the ``else`` branch
    (original line 221) is inferred — without it the function would append every
    entry and filter nothing. Verify against version control.

    :param normalizemd: iterable of {column: norm-options} mappings (entries may be None)
    :return: list with the 'drop_missing' entries removed (and logged)
    """
    filter_normalized = list()
    for norm_set in normalizemd:
        if norm_set is not None:
            col = list(norm_set.keys())[0]
            norms = norm_set.get(col)
            if norms['class'] == 'drop_missing':
                # Excluded entries are only logged, never kept.
                self._logging.log_exec('gDayF', "Normalizer", self._labels["excluding"],
                                       col + ' - ' + norms['class'])
            else:
                filter_normalized.append(norm_set)
    return filter_normalized
def filter_objective_base(self, normalizemd):
    """Drop filling-missing normalizations that depend on an objective column.

    Removes 'progressive_missing_values' entries and the non-full
    'mean_missing_values' entries (those keyed on the objective column).

    NOTE(review): reconstructed from a fragmentary source; the ``else`` branch
    (original line 239) is inferred — without it the function would append every
    entry and filter nothing. Verify against version control.

    :param normalizemd: iterable of {column: norm-options} mappings (entries may be None)
    :return: list with the objective-dependent entries removed (and logged)
    """
    filter_normalized = list()
    for norm_set in normalizemd:
        if norm_set is not None:
            col = list(norm_set.keys())[0]
            norms = norm_set.get(col)
            if norms['class'] == 'progressive_missing_values' or \
                    (norms['class'] == 'mean_missing_values' and not norms['objective']['full']):
                # Excluded entries are only logged, never kept.
                self._logging.log_exec('gDayF', "Normalizer", self._labels["excluding"],
                                       col + ' - ' + norms['class'])
            else:
                filter_normalized.append(norm_set)
    return filter_normalized
250 self.
_logging.log_info(
'gDayF',
"Normalizer", self.
_labels[
"start_data_norm"], str(df.shape))
251 if isinstance(df, pd.DataFrame):
252 dataframe = df.copy()
253 for norm_set
in normalizemd:
254 if norm_set
is not None:
255 col = list(norm_set.keys())[0]
256 norms = norm_set.get(col)
257 if norms[
'class'] ==
'base':
258 dataframe.loc[:, col] = self.
normalizeBase(dataframe.loc[:, col])
259 self.
_logging.log_info(
'gDayF',
"Normalizer", self.
_labels[
"applying"],
260 col +
' - ' + norms[
'class'])
261 if dataframe[col].dtype ==
'<M8[ns]' and norms[
'datetime']:
263 if self.
_config[
'datetime_columns_management']
is not None \
264 and self.
_config[
'datetime_columns_management'][
'enable']:
265 self.
_logging.log_info(
'gDayF',
"Normalizer", self.
_labels[
"applying"],
266 col +
' - ' + str(self.
_config[
'datetime_columns_management']
268 elif norms[
'class'] ==
'drop_missing':
271 self.
_logging.log_info(
'gDayF',
"Normalizer", self.
_labels[
"applying"],
272 col +
' - ' + norms[
'class'])
274 self.
_logging.log_info(
'gDayF',
"Normalizer", self.
_labels[
"excluding"],
275 col +
' - ' + norms[
'class'])
276 elif norms[
'class'] ==
'stdmean':
278 norms[
'objective'][
'mean'],
279 norms[
'objective'][
'std']
281 self.
_logging.log_info(
'gDayF',
"Normalizer", self.
_labels[
"applying"],
282 col +
' - ' + norms[
'class'] +
' ( ' +
283 str(norms[
'objective'][
'mean']) +
',' +
284 str(norms[
'objective'][
'std']) +
' ) ')
285 elif norms[
'class'] ==
'working_range':
287 norms[
'objective'][
'minval'],
288 norms[
'objective'][
'maxval'],
289 norms[
'objective'][
'minrange'],
290 norms[
'objective'][
'maxrange']
292 self.
_logging.log_info(
'gDayF',
"Normalizer", self.
_labels[
"applying"],
293 col +
' - ' + norms[
'class'] +
' ( ' +
294 str(norms[
'objective'][
'minval']) +
',' +
295 str(norms[
'objective'][
'maxval']) +
' ) ')
296 elif norms[
'class'] ==
'offset':
298 norms[
'objective'][
'offset'])
299 self.
_logging.log_info(
'gDayF',
"Normalizer", self.
_labels[
"applying"],
300 col +
' - ' + norms[
'class'] +
' ( ' +
301 str(norms[
'objective'][
'offset']) +
' )')
302 elif norms[
'class'] ==
'discretize':
304 norms[
'objective'][
'buckets_number'],
305 norms[
'objective'][
'fixed_size'])
306 self.
_logging.log_info(
'gDayF',
"Normalizer", self.
_labels[
"applying"],
307 col +
' - ' + norms[
'class'] +
' ( ' +
308 str(norms[
'objective'][
'buckets_number']) +
',' +
309 str(norms[
'objective'][
'fixed_size']) +
' ) ')
310 elif norms[
'class'] ==
'aggregation':
312 norms[
'objective'][
'bucket_ratio'])
313 self.
_logging.log_info(
'gDayF',
"Normalizer", self.
_labels[
"applying"],
314 col +
' - ' + norms[
'class'] +
' ( ' +
315 str(norms[
'objective'][
'bucket_ratio']) +
' ) ')
316 elif norms[
'class'] ==
'fixed_missing_values':
317 dataframe.loc[:, col] = self.
fixedMissingValues(dataframe.loc[:, col], norms[
'objective'][
'value'])
318 self.
_logging.log_info(
'gDayF',
"Normalizer", self.
_labels[
"applying"],
319 col +
' - ' + norms[
'class'] +
' ( ' +
320 str(norms[
'objective'][
'value']) +
' ) ')
321 elif norms[
'class'] ==
'mean_missing_values':
324 norms[
'objective'][
'objective_column'],
325 norms[
'objective'][
'full']
327 if norms[
'objective'][
'objective_column']
is None:
328 norms[
'objective'][
'objective_column'] =
'None' 329 self.
_logging.log_info(
'gDayF',
"Normalizer", self.
_labels[
"applying"],
330 col +
' - ' + norms[
'class'] +
' ( ' +
331 norms[
'objective'][
'objective_column'] +
',' +
332 str(norms[
'objective'][
'full']) +
' ) ')
333 elif norms[
'class'] ==
'progressive_missing_values':
336 norms[
'objective'][
'objective_column'])
337 self.
_logging.log_info(
'gDayF',
"Normalizer", self.
_labels[
"applying"],
338 col +
' - ' + norms[
'class'] +
' ( ' +
339 norms[
'objective'][
'objective_column'] +
' ) ')
340 elif norms[
'class'] ==
'ignore_column':
345 self.
_logging.log_info(
'gDayF',
"Normalizer", self.
_labels[
"nothing_to_do"])
def ignored_columns(self, normalizemd):
    """Collect the column names carrying an 'ignore_column' normalization.

    :param normalizemd: iterable of {column: norm-options} mappings, or None
    :return: a copy of the list of ignored column names (logged before return)
    """
    ignored_list = []
    if normalizemd is not None:
        # Flatten every mapping and keep only the 'ignore_column' entries.
        ignored_list = [col
                        for elements in normalizemd
                        for col, value in elements.items()
                        if value['class'] == 'ignore_column']
    self._logging.log_info('gDayF', "Normalizer", self._labels["ignored_list"], ignored_list)
    return ignored_list.copy()
368 if dataframe.dtype == np.object:
370 return pd.to_numeric(dataframe)
373 return pd.to_datetime(dataframe)
375 return pd.Categorical(dataframe)
383 return dataframe.dropna(axis=0, subset=[col])
def normalizeWorkingRange(self, dataframe, minval=-1.0, maxval=1.0, minrange=-1.0, maxrange=1.0):
    """Linearly rescale a numeric Series from [minval, maxval] onto [minrange, maxrange].

    Object-dtype Series are returned unchanged (as a copy).

    Fixes applied:
      * the original tested ``dataframe.dtype != np.object`` twice in a row
        (lines 393-394) — the redundant duplicate check is removed;
      * the ``np.object`` alias was removed in NumPy 1.24; plain ``object``
        is the supported spelling and is behaviorally identical.

    :param dataframe: pandas Series to rescale; it is cast to float16 first
        (NOTE(review): float16 loses precision for magnitudes above ~2048 —
        confirm this narrowing is intentional)
    :param minval: expected minimum of the input range (maxval must exceed it)
    :param maxval: expected maximum of the input range
    :param minrange: target minimum of the output range
    :param maxrange: target maximum of the output range
    :return: a rescaled copy of the Series
    """
    assert (maxval > minval)
    if dataframe.dtype != object:
        convert_factor = (maxrange - minrange) / (maxval - minval)
        dataframe = dataframe.astype(np.float16)
        dataframe = dataframe.apply(lambda x: (x - minval) * convert_factor + minrange)
    return dataframe.copy()
def normalizeOffset(self, dataframe, offset=0):
    """Shift every value of a numeric Series by *offset*.

    Object-dtype Series pass through untouched (as a copy).

    Fix applied: the ``np.object`` alias was removed in NumPy 1.24; plain
    ``object`` is the supported, behaviorally identical spelling.

    :param dataframe: pandas Series to shift
    :param offset: additive offset (default 0, i.e. no-op)
    :return: a shifted copy of the Series
    """
    if dataframe.dtype != object:
        dataframe = offset + dataframe
    return dataframe.copy()
def normalizeAgregation(self, dataframe, br=0.25):
    """Clamp the head and tail quantile buckets of a numeric Series to their means.

    The Series is split into ``int(1 / (br / 2))`` quantile buckets; values in
    the first bucket are replaced by that bucket's mean, values in the last
    bucket by that bucket's mean (truncated to int for non-integer dtypes, as
    in the original).

    NOTE(review): reconstructed from a fragmentary source; the ``else``
    (original line 423) is inferred from the two symmetric branches.
    Fixes applied:
      * tail clamp in the integer branch used ``dataframe <= bins[-2]``
        (original line 425) where the head/tail symmetry of every other
        branch shows it must be ``>= bins[-2]`` — the bug made the tail
        clamp re-clamp the head instead;
      * ``np.object`` / ``np.int`` aliases (removed in NumPy 1.24 / 1.20)
        replaced by ``object`` / ``np.int_`` / built-in ``int``.

    :param dataframe: pandas Series (mutated in place, copy returned)
    :param br: bucket ratio controlling how many quantile buckets are built
    :return: a copy of the clamped Series
    """
    if dataframe.dtype != object:
        buckets = int(1 / (br / 2))
        q, bins = pd.qcut(dataframe.iloc[:], buckets, retbins=True)
        if dataframe.dtype != np.int_:
            dataframe[dataframe <= bins[1]] = int(dataframe[dataframe <= bins[1]].mean())
            dataframe[dataframe >= bins[-2]] = int(dataframe[dataframe >= bins[-2]].mean())
        else:
            dataframe[dataframe <= bins[1]] = dataframe[dataframe <= bins[1]].mean()
            dataframe[dataframe >= bins[-2]] = dataframe[dataframe >= bins[-2]].mean()
    return dataframe.copy()
433 return dataframe.copy()
def normalizeStdMean(self, dataframe, mean, std):
    """Standardize a numeric Series: subtract *mean*, then divide by *std*.

    Object-dtype and ``datetime64[ns]`` Series are returned unchanged (as a
    copy).  If a ZeroDivisionError is raised during the division step, the
    subtraction is rolled back by re-adding *mean* (NOTE(review): element-wise
    division of np.float64 by 0.0 yields inf rather than raising, so the
    rollback only covers non-vectorized corner cases — confirm intent).

    NOTE(review): reconstructed from a fragmentary source; the ``try:``
    (original line 443) is implied by the ``except ZeroDivisionError`` at 447.
    Fix applied: ``np.object`` alias (removed in NumPy 1.24) replaced by
    plain ``object``.

    :param dataframe: pandas Series to standardize
    :param mean: value subtracted from every element
    :param std: value every element is divided by
    :return: a standardized copy of the Series
    """
    if dataframe.dtype != object and dataframe.dtype != "datetime64[ns]":
        try:
            dataframe = dataframe.astype(np.float64)
            dataframe = dataframe.apply(lambda x: x - float(mean))
            dataframe = dataframe.apply(lambda x: x / float(std))
        except ZeroDivisionError:
            dataframe = dataframe.apply(lambda x: x + float(mean))
    return dataframe.copy()
461 return pd.qcut(dataframe, buckets_number)
463 return pd.cut(dataframe, buckets_number)
471 return dataframe.fillna(value)
482 return dataframe.fillna(dataframe.mean())
484 nullfalse = dataframe[dataframe[:][col].notnull()][[col, objective_col]]
485 if objective_col
in DTYPES:
486 nullfalse_gb = nullfalse.groupby(objective_col).mean()
488 nullfalse_gb = nullfalse.groupby(objective_col).agg(
lambda x: x.value_counts().index[0])
489 for index, row
in dataframe[dataframe[:][col].isnull()].iterrows():
491 if nullfalse_gb.index.isin([row[objective_col]]).any():
492 dataframe.loc[index, col] = nullfalse_gb.loc[row[objective_col], col]
493 return dataframe.copy()
503 nullfalse = dataframe[dataframe[:][col].notnull()].sort_values(objective_col,
505 ascending=
True)[[col, objective_col]]
506 nullfalse_gb = nullfalse.groupby(objective_col).mean()
507 for index, row
in dataframe[dataframe[:][col].isnull()].iterrows():
509 if nullfalse_gb.index.isin([row[objective_col]]).any():
510 dataframe.loc[index, col] = nullfalse_gb.loc[row[objective_col], col]
512 index_max = nullfalse_gb.index.where(nullfalse_gb.index > row[objective_col]).min()
513 index_min = nullfalse_gb.index.where(nullfalse_gb.index < row[objective_col]).max()
515 if index_min
is np.nan
and index_max
is np.nan \
516 or index_min
is None or index_max
is None:
518 if index_min
is np.nan
or index_min
is None:
519 dataframe.loc[index, col] = nullfalse_gb.loc[index_max, col]
520 elif index_max
is np.nan
or index_max
is None:
521 dataframe.loc[index, col] = nullfalse_gb.loc[index_min, col]
523 minimal = min(nullfalse_gb.loc[index_min, col], nullfalse_gb.loc[index_max, col])
524 maximal = max(nullfalse_gb.loc[index_min, col], nullfalse_gb.loc[index_max, col])
525 b = maximal - minimal
526 a = index_max - index_min
527 x = (row[objective_col] - index_min) / a
529 dataframe.loc[index, col] = minimal + offset
532 return dataframe.copy()
def normalizeDateTime(self, dataframe, date_column=None):
    """Expand *date_column* into derived columns driven by configuration.

    For each element of ``_config['datetime_columns_management']['filter']``
    a new column ``'<date_column>_<element>'`` is added: 'weekday' maps to
    ``isoweekday()``, 'weeknumber' to ``isocalendar()[1]``, and any other
    element is read as a plain attribute of the timestamp via ``eval``.
    Invalid filter entries raise AttributeError and are reported and skipped.

    NOTE(review): reconstructed from a fragmentary source; the ``try:``
    (original line 544) is implied by the ``except AttributeError`` at 554.
    SECURITY NOTE: ``eval('x.' + element)`` executes configuration-derived
    text; this is safe only while the filter list comes from trusted config
    (``getattr(x, element)`` would be the safer equivalent for attributes).

    :param dataframe: pandas DataFrame to extend (mutated; a copy is returned)
    :param date_column: name of the datetime column to expand, or None (no-op)
    :return: a copy of the (possibly extended) DataFrame
    """
    datetime_columns_management = self._config['datetime_columns_management']
    if date_column is not None:
        if datetime_columns_management is not None and datetime_columns_management['enable']:
            for element in datetime_columns_management['filter']:
                try:
                    if element not in ['weekday', 'weeknumber']:
                        dataframe[date_column + '_' + element] = dataframe.loc[:, date_column]\
                            .transform(lambda x: eval('x.' + element))
                    elif element == 'weekday':
                        dataframe[date_column + '_' + element] = dataframe.loc[:, date_column]\
                            .transform(lambda x: x.isoweekday())
                    elif element == 'weeknumber':
                        dataframe[date_column + '_' + element] = dataframe.loc[:, date_column]\
                            .transform(lambda x: x.isocalendar()[1])
                except AttributeError:
                    print('TRC: invalid configuration:' + element)
    return dataframe.copy()
def normalizeDateTime(self, dataframe, date_column=None)
Internal method oriented to manage date_time conversions to pattern.
def normalizeWorkingRange(self, dataframe, minval=-1.0, maxval=1.0, minrange=-1.0, maxrange=1.0)
Internal method oriented to manage Working range normalizations on a [closed, closed] interval...
def define_special_spark_naive_norm(self, dataframe_metadata)
Method oriented to specify minimal data normalizations.
def define_ignored_columns(self, dataframe_metadata, objective_column)
Method oriented to specify ignored_columns.
def filter_drop_missing(self, normalizemd)
Method oriented to filter drop_missing operations on non standardize algorithms.
Define all objects, functions and structures related to logging event on DayF product logs...
Class oriented to manage all messages and interaction with DayF product logs.
def normalizeBase(self, dataframe)
Internal method oriented to manage drop NaN values from dataset.
def normalizeOffset(self, dataframe, offset=0)
Internal method oriented to manage Working range normalizations on a [closed, closed] interval...
def define_minimal_norm(self, dataframe_metadata, an_objective, objective_column)
Method oriented to specify special non-negative data normalizations.
def fixedMissingValues(self, dataframe, value=0.0)
Internal method oriented to manage imputation for missing values to fixed value.
def normalizeDropMissing(self, dataframe, col)
Internal method oriented to manage base normalizations.
def normalizeDiscretize(self, dataframe, buckets_number, fixed_size)
Internal method oriented to manage bucketing for discretize.
def normalizeDataFrame(self, df, normalizemd)
Main method oriented to define and manage normalizations sets applying normalizations.
def ignored_columns(self, normalizemd)
Method oriented to generate ignored_column_list on issues where missed > exclusion_missing_threshold...
Class NormalizationSet manage the Normalizations metadata as OrderedDict supporting Normalizer Class ...
def __init__(self, e_c)
Constructor.
def normalizeBinaryEncoding(self, dataframe)
Internal method oriented to manage Binary encodings.
def filter_objective_base(self, normalizemd)
Method oriented to filter filling_missing operations dependent of objective_column.
def define_normalizations(self, dataframe_metadata, an_objective, objective_column)
Method oriented to specify data normalizations.
def filter_standardize(self, normalizemd, model_id)
Method oriented to filter stdmean operations on non standardize algorithms.
def normalizeAgregation(self, dataframe, br=0.25)
Internal method oriented to manage bucket ratio normalizations head - tail.
def progressiveMissingValues(self, dataframe, col, objective_col)
Internal method oriented to manage progressive imputations for missing values.
def meanMissingValues(self, dataframe, col, objective_col, full=False)
Internal method oriented to manage imputation for missing values to mean value.
Class oriented to manage normalizations on dataframes for improvements on accuracy.
Define all objects, functions and structs related to load on system all configuration parameter from ...
def normalizeStdMean(self, dataframe, mean, std)
Internal method oriented to manage mean and std normalizations.