6 Copyright (C) e2its - All Rights Reserved 7 * Unauthorized copying of this file, via any medium, is strictly prohibited 8 * Proprietary and confidential 10 * This file is part of the gDayF project. 12 * Written by Jose L. Sanchez <e2its.es@gmail.com>, 2016-2019 17 $betweenss: the between-cluster sum of squares. In effect, it is the mean of the squared distances between cluster centers. 18 One expects this ratio to be as high as possible, since we would like to have heterogeneous clusters. 19 \( \frac{2}{p\,(p-1)} \sum_{m} \sum_{n} \lvert C_{m}^{P} - C_{n}^{P} \rvert^{2} \) 21 $withinss: the within-cluster sum of squares. It yields a vector with one value for each cluster. 22 One expects this ratio to be as low as possible for each cluster, 23 since we would like to have homogeneity within the clusters. 24 \( \frac{1}{p} \sum_{m} \lvert X_{m} - C \rvert^{2} \) 26 Some equalities that may help understanding: 27 $tot.withinss = sum ( $withinss ) 28 $totss = $tot.withinss + $betweenss 31 from pandas
import DataFrame
32 from collections
import OrderedDict
46 MetricMetadata.__init__(self)
47 self[
'betweenss'] =
None 48 self[
'tot_withinss'] =
None 50 self[
'centroid_stats'] =
None 61 for parameter, _
in self.items():
63 if perf_metrics
is not None:
64 if parameter ==
'centroid_stats':
65 self[parameter] = perf_metrics._metric_json[parameter].as_data_frame()
66 self[
'k'] = int(self[parameter][
'centroid'].max())
67 self[parameter] = json.loads(self[parameter].to_json(orient=
'split'),
68 object_pairs_hook=OrderedDict)
70 self[parameter] = perf_metrics._metric_json[parameter]
71 except KeyError
as kexecution_error:
74 except AttributeError
as aexecution_error:
75 print(
'Trace: ' + repr(aexecution_error))
76 except TypeError
as texecution_error:
77 print(
'Trace: ' + repr(texecution_error))
87 if model
is not None and data
is not None:
88 self[
'clusterCenters'] = DataFrame(model.clusterCenters())
89 self[
'k'] = self[
'clusterCenters'].shape[0]
90 self[
'clusterCenters'] = json.loads(DataFrame(model.clusterCenters()).to_json(orient=
'split'),
91 object_pairs_hook=OrderedDict)
92 self[
'bettweenss'] = 1e15
94 self[
'tot_withinss'] = model.computeCost(data)
95 self[
'nobs'] = data.count()
96 self[
'model_category'] =
'Clustering' 97 self[
'predictions'] =
None 98 self[
'RMSE'] = 10e+308
99 self[
'scoring_time'] = int(time.time() - start)