DayF core  1.2.1.2
DayF (Decision at your Fingertips) is a freeware AutoML development framework that lets developers work with Machine Learning models without any prior AI knowledge, simply by providing a CSV dataset and the objective column
dfmetada.py
1 
4 
5 '''
6 Copyright (C) e2its - All Rights Reserved
7  * Unauthorized copying of this file, via any medium is strictly prohibited
8  * Proprietary and confidential
9  *
10  * This file is part of gDayF project.
11  *
12  * Written by Jose L. Sanchez del Coso <e2its.es@gmail.com>, 2016-2019
13 '''
14 
15 from json import dumps, load
16 from collections import OrderedDict
17 from gdayf.common.constants import DTYPES
18 from gdayf.conf.loadconfig import LoadConfig
19 from copy import deepcopy
20 from pandas import cut
21 from hashlib import md5 as md5
22 from numpy import isnan
23 from os import path
24 import operator
25 
26 
27 
class DFMetada(OrderedDict):
    """Ordered, JSON-serializable summary of a pandas DataFrame.

    The instance is itself an OrderedDict. The constructor creates the keys
    ``type``, ``rowcount``, ``cols``, ``timeformat`` and ``columns``;
    ``getDataFrameMetadata`` fills them and adds ``correlation`` and
    ``covariance``. ``pop`` and ``popitem`` are overridden so they never
    remove anything.
    """

    def __init__(self):
        """Create an empty metadata structure and load its configuration.

        The ``dfmetadata`` section of ``config.json`` (looked up in
        ``../../../.config`` relative to this module) is stored in
        ``self._config``.

        Raises:
            IOError: if the configuration file does not exist or cannot be
                read.
            KeyError: if the file has no ``dfmetadata`` section.
        """
        OrderedDict.__init__(self)
        self._config = None
        configpath = path.join(path.dirname(__file__), '../../../.config')
        configfile = path.join(configpath, 'config.json')

        # @var _configfile protected member variable to store configfile path
        self._configfile = configfile
        if not path.exists(self._configfile):
            raise IOError('DFMetada configuration file not found: %s'
                          % self._configfile)
        # The encoding belongs to open(): json.load() no longer accepts an
        # ``encoding`` keyword (removed in Python 3.9). Read errors propagate
        # unchanged so the caller still sees the original cause.
        with open(self._configfile, 'rt', encoding='utf8') as f:
            self._config = load(f, object_hook=OrderedDict)["dfmetadata"]

        self['type'] = None
        self['rowcount'] = None
        self['cols'] = None
        self['timeformat'] = None
        self['columns'] = list()

    def getDataFrameMetadata(self, dataframe, typedf):
        """Analyse ``dataframe`` and store the result in this object.

        Args:
            dataframe: pandas DataFrame to describe.
            typedf: label stored (as a string) under ``type``.

        Returns:
            ``self``, with ``columns`` holding one entry per column and
            ``correlation`` / ``covariance`` holding nested dicts.
        """
        self['type'] = '%s' % typedf
        # NOTE(review): the "- 1" is kept from the original implementation;
        # shape[0] is already the number of rows, so confirm that consumers
        # really expect rows - 1 here.
        self['rowcount'] = dataframe.shape[0] - 1
        self['cols'] = dataframe.shape[1]
        self['timeformat'] = None
        # Rebuilt from scratch so that repeated calls on the same instance do
        # not accumulate duplicated column entries.
        self['columns'] = [self._column_metadata(dataframe[col])
                           for col in dataframe.columns]

        # Restrict to numeric/bool columns explicitly: older pandas dropped
        # the other columns silently, pandas 2.x raises on them instead.
        numeric = dataframe.select_dtypes(include=['number', 'bool'])
        threshold = self._config['correlation_threshold']
        # Keep only meaningful pairs: drop self-correlation, NaN values and
        # anything whose absolute value is below the configured threshold.
        self['correlation'] = {
            key: {subkey: subvalue
                  for subkey, subvalue in row.items()
                  if key != subkey
                  and not isnan(subvalue)
                  and abs(subvalue) >= threshold}
            for key, row in numeric.corr().to_dict().items()
        }
        self['covariance'] = numeric.cov().to_dict()
        return self

    def _column_metadata(self, series):
        """Build the ordered metadata entry for one DataFrame column."""
        counts = series.value_counts()
        summary = series.describe()

        info = OrderedDict()
        info['name'] = series.name
        info['type'] = str(series.dtype)
        for comp in ('min', 'max', 'mean', 'std', '25%', '50%', '75%'):
            try:
                info[comp] = float(summary[comp])
            except (KeyError, TypeError, ValueError):
                # Statistic missing for this dtype, or not expressible as a
                # float (e.g. a Timestamp).
                info[comp] = None
        if info['type'] in DTYPES:
            info['zeros'] = float((series == 0).sum())
        else:
            info['zeros'] = None
        info['missed'] = float(series.isnull().sum())
        # value_counts() ignores nulls, so its length is the number of
        # distinct non-null values.
        info['cardinality'] = float(len(counts))
        info['histogram'] = self._histogram(series, counts)
        info['distribution'] = 'Not implemented yet'
        return info

    def _histogram(self, series, counts):
        """Return the histogram of ``series`` given its value counts.

        Up to ``cardinality_limit`` distinct values: one bucket per value.
        Above it: ``cardinality_limit`` equal-width bins, or summary
        statistics of the value counts when the column cannot be binned.
        """
        limit = self._config["cardinality_limit"]
        histogram = OrderedDict()
        if len(counts) <= limit:
            buckets = sorted(counts.to_dict().items(),
                             key=operator.itemgetter(0))
        else:
            try:
                buckets = sorted(
                    cut(series, limit).value_counts().to_dict().items(),
                    key=operator.itemgetter(0))
            except (TypeError, ValueError):
                # Column cannot be binned: describe the frequency
                # distribution of its values instead.
                for stat in ('max', 'min', 'mean', 'std'):
                    histogram[stat] = float(getattr(counts, stat)())
                return histogram
        for label, frequency in buckets:
            histogram[str(label)] = float(frequency)
        return histogram

    def pop(self, key, default=None):
        """Disabled removal: leaves the content untouched and returns 1."""
        return 1

    def popitem(self, last=True):
        """Disabled removal: leaves the content untouched and returns 1."""
        return 1
124 
125 
def compare_dict(dict1, dict2):
    """Tell whether two dicts serialize to the same JSON text.

    Args:
        dict1: first mapping, or None.
        dict2: second mapping, or None.

    Returns:
        True when both arguments are None, or when both serialize to the
        same JSON document; False otherwise (including when only one of them
        is None). Key order is part of the serialization, so mappings with
        the same items in a different order compare as different.
    """
    if dict1 is None or dict2 is None:
        return dict1 is None and dict2 is None
    # Compare the serialized text itself. The previous implementation
    # compared two md5 hash *objects* with ==, which is an identity check and
    # therefore never reported two dicts as equal.
    return dumps(OrderedDict(dict1)) == dumps(OrderedDict(dict2))
138 
if __name__ == "__main__":
    # Manual smoke test: load a sample CSV and print its metadata, first as a
    # plain OrderedDict and then as indented JSON.
    from gdayf.handlers.inputhandler import inputHandlerCSV
    from gdayf.core.experiment_context import Experiment_Context

    # NOTE(review): e_c is not used below; it is kept from the original
    # script, presumably for the setup done by its constructor - confirm.
    e_c = Experiment_Context(user_id='Crulogic')

    source_file = path.join(
        path.dirname(__file__),
        '../../../../../source data/Transformados-PDI/Crulogic-2017/',
        "Crulogic-17-18.csv")

    pd_train_dataset = inputHandlerCSV().inputCSV(source_file)

    # Analyse once and reuse the result for both printouts instead of running
    # the analysis a second time on the same object.
    m = DFMetada()
    metadata = m.getDataFrameMetadata(pd_train_dataset, 'pandas')
    print(OrderedDict(metadata))
    print(dumps(metadata, indent=4))
159 
Class DFMetadata manages the data analysis result structures in OrderedDict format, exportable to jso...
Definition: dfmetada.py:28
def getDataFrameMetadata(self, dataframe, typedf)
Takes a dataframe in pandas format and returns an equivalent DFmetadata object.
Definition: dfmetada.py:60
def __init__(self)
The constructor generates an empty DFMetada object with all elements initialized to the correct types...
Definition: dfmetada.py:32
def compare_dict(dict1, dict2)
Function that compares two dicts based on a hash key of their JSON transformations
Definition: dfmetada.py:129
Defines all global objects, functions and structs related to a specific experiment.
Defines all objects, functions and structs needed to load into the system all configuration parameters from ...
Definition: loadconfig.py:1