6 Copyright (C) e2its - All Rights Reserved 7 * Unauthorized copying of this file, via any medium is strictly prohibited 8 * Proprietary and confidential 10 * This file is part of gDayF project. 12 * Written by Jose L. Sanchez del Coso <e2its.es@gmail.com>, 2016-2019 15 from json
import dumps, load
16 from collections
import OrderedDict
19 from copy
import deepcopy
20 from pandas
import cut
21 from hashlib
import md5
as md5
22 from numpy
import isnan
33 OrderedDict.__init__(self)
35 configpath = path.join(path.dirname(__file__),
'../../../.config')
36 configfile = path.join(configpath,
'config.json')
41 with open(configfile,
'rt')
as f:
43 self.
_config = load(f, object_hook=OrderedDict, encoding=
'utf8')[
"dfmetadata"]
50 self[
'rowcount'] =
None 52 self[
'timeformat'] =
None 53 self[
'columns'] = list()
61 self[
'type'] =
'%s' % typedf
62 self[
'rowcount'] = dataframe.shape[0] - 1
63 self[
'cols'] = dataframe.shape[1]
64 self[
'timeformat'] =
None 65 for col
in dataframe.columns:
66 summary = dataframe[col].describe()
67 auxdict = OrderedDict()
68 auxdict[
'name'] = dataframe[col].name
69 auxdict[
'type'] = str(dataframe[col].dtype)
70 for comp
in [
'min',
'max',
'mean',
'std',
'25%',
'50%',
'75%']:
72 auxdict[comp] = float(summary[comp])
75 if auxdict[
'type']
in DTYPES:
76 auxdict[
'zeros'] = float(dataframe[dataframe.loc[:, col] == 0][col].count())
78 auxdict[
'zeros'] =
None 79 auxdict[
'missed'] = float(dataframe[col].isnull().values.ravel().sum())
80 auxdict[
'cardinality'] = float(dataframe.loc[:, col].value_counts().describe()[
'count'])
81 auxdict[
'histogram'] = OrderedDict()
82 cardinality_limit = self.
_config[
"cardinality_limit"]
83 if int(auxdict[
'cardinality']) <= cardinality_limit:
84 hist = dataframe.loc[:, col].value_counts().to_dict()
85 for tupla
in sorted(hist.items(), key=operator.itemgetter(0)):
86 auxdict[
'histogram'][str(tupla[0])] = float(tupla[1])
90 hist = cut(dataframe.loc[:, col], cardinality_limit).value_counts().to_dict()
91 for tupla
in sorted(hist.items(), key=operator.itemgetter(0)):
92 auxdict[
'histogram'][str(tupla[0])] = float(tupla[1])
95 auxHist = dataframe[col].value_counts()
96 auxdict[
'histogram'][
'max'] = float(auxHist.max())
97 auxdict[
'histogram'][
'min'] = float(auxHist.min())
98 auxdict[
'histogram'][
'mean'] = float(auxHist.mean())
99 auxdict[
'histogram'][
'std'] = float(auxHist.std())
102 auxHist = dataframe[col].value_counts()
103 auxdict[
'histogram'][
'max'] = float(auxHist.max())
104 auxdict[
'histogram'][
'min'] = float(auxHist.min())
105 auxdict[
'histogram'][
'mean'] = float(auxHist.mean())
106 auxdict[
'histogram'][
'std'] = float(auxHist.std())
109 auxdict[
'distribution'] =
'Not implemented yet' 110 self[
'columns'].append(auxdict)
111 self[
'correlation'] = dataframe.corr().to_dict()
112 for key, value
in deepcopy(self[
'correlation']).items():
113 for subkey, subvalue
in value.items():
114 if (self.
_config[
'correlation_threshold'] > abs(subvalue))
or key == subkey
or isnan(subvalue):
115 self[
'correlation'][key].pop(subkey)
116 self[
'covariance'] = dataframe.cov().to_dict()
119 def pop(self, key, default=None):
122 def popitem(self, last=True):
130 if dict1
is None or dict2
is None:
131 return dict1
is None and dict2
is None 133 ddict1 = dumps(OrderedDict(dict1))
134 ddict2 = dumps(OrderedDict(dict2))
137 return md5(ddict1.encode(
'utf-8')) == md5(ddict2.encode(
'utf-8'))
139 if __name__ ==
"__main__":
141 from pandas
import concat
150 source_data.append(path.join(path.dirname(__file__),
151 '../../../../../source data/Transformados-PDI/Crulogic-2017/'))
152 source_data.append(
"Crulogic-17-18.csv")
157 print(OrderedDict(m.getDataFrameMetadata(pd_train_dataset,
'pandas')))
158 print(dumps(m.getDataFrameMetadata(pd_train_dataset,
'pandas'), indent=4))
Define all global objects, functions and structs related to a specific experiment.
Define all objects, functions and structs related to loading all configuration parameters into the system from ...