I'm still in the process of understanding the core algorithm, but from my current understanding it seems odd for the explainer to return an empty anchor. Is that by design?
Below is the sample code used for experimenting:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA, TruncatedSVD
import time
# Classifier Libraries
from sklearn.ensemble import RandomForestClassifier
import collections
# Other Libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")
# data source: https://www.kaggle.com/mlg-ulb/creditcardfraud
# All continuous features
df = pd.read_csv("../creditcard.csv")
colors = ["#0101DF", "#DF0101"]
sns.countplot('Class', data=df, palette=colors)
plt.title('Class Distributions \n (0: No Fraud || 1: Fraud)', fontsize=14)
print('No Frauds', round(df['Class'].value_counts()[0]/len(df) * 100,2), '% of the dataset')
print('Frauds', round(df['Class'].value_counts()[1]/len(df) * 100,2), '% of the dataset')
from sklearn.preprocessing import StandardScaler, RobustScaler
# RobustScaler is less prone to outliers
rob_scaler = RobustScaler()
df['scaled_amount'] = rob_scaler.fit_transform(df['Amount'].values.reshape(-1,1))
df['scaled_time'] = rob_scaler.fit_transform(df['Time'].values.reshape(-1,1))
df.drop(['Time','Amount'], axis=1, inplace=True)
from sklearn.model_selection import StratifiedShuffleSplit
X = df.drop('Class', axis=1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)
print('No Frauds', round(y_train.value_counts()[0]/len(y_train) * 100,2), '% of the dataset')
print('Frauds', round(y_train.value_counts()[1]/len(y_train) * 100,2), '% of the dataset')
# Oversample under represented class
sm = SMOTE(sampling_strategy='minority', random_state=42)
Xsm_train, ysm_train = sm.fit_resample(X_train, y_train)
print('No Frauds', round(ysm_train.value_counts()[0]/len(ysm_train) * 100,2), '% of the dataset')
print('Frauds', round(ysm_train.value_counts()[1]/len(ysm_train) * 100,2), '% of the dataset')
from xgboost import XGBClassifier
classifiers = {
    # "BoostingTrees": XGBClassifier(objective='reg:logistic')
    "RF": RandomForestClassifier(n_estimators=50, n_jobs=5)
}
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(random_state=42, shuffle=True)
for key, classifier in classifiers.items():
    classifier.fit(Xsm_train.to_numpy(), ysm_train.to_numpy())
    training_score = cross_val_score(classifier, Xsm_train, ysm_train, cv=cv)
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", round(training_score.mean(), 2) * 100, "% accuracy score")
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
from sklearn.metrics import classification_report
y_pred = classifiers['RF'].predict(X_test)
target_names = ['No Fraud', 'Fraud']
print(classification_report(y_test, y_pred, target_names=target_names))
# A random data row
idx = 20
print(X_test[idx:idx+1])
print(f"Ground Truth: {y_test[idx]}")
print(f"Predicted Label: {classifiers['RF'].predict(X_test[idx:idx+1])}")
from anchor import utils
from anchor import anchor_tabular
class_names = ['Not Fraud', 'Fraud']
feature_names = list(X_test.columns)
# Also tried with the 'quartile' discretizer
anchor = anchor_tabular.AnchorTabularExplainer(class_names, feature_names,
                                               X_test.to_numpy(), {}, discretizer='decile')
np.random.seed(1)
c = classifiers['RF']
print('Prediction: ', anchor.class_names[c.predict(X_test.iloc[idx:idx+1])[0]])
exp = anchor.explain_instance(X_test.loc[idx, feature_names].values, c.predict, threshold=0.67, batch_size=5000)
print('Anchor: %s' % (' AND '.join(exp.names())))
print('Precision: %.2f' % exp.precision())
print('Coverage: %.2f' % exp.coverage())
Output:
Anchor:
Precision: 1.00
Coverage: 1.00
Output of exp_map, in case it's helpful:
{'feature': [],
'mean': [],
'precision': [],
'coverage': [],
'examples': [],
'all_precision': 0.999000199960008,
'num_preds': 5001,
'names': [],
'instance': array([-0.72322052, -0.29332573, 2.47559774, -2.5954306 , -1.13671485,
-0.53469756, -0.30401351, -0.13267745, -2.23157085, 1.32801337,
1.28661056, -0.4945589 , 0.43808759, -0.74606032, -0.42314735,
0.0285372 , -0.14005478, 0.76874186, -0.74326881, -0.11283696,
-0.03239568, 0.45303638, -0.29160164, 0.51677789, 0.33426919,
-0.23986655, 0.01333135, -0.09690853, -0.12017047, -0.66296597]),
'prediction': 0}
Note: one does get a non-empty anchor if the threshold is lowered to 0.2.
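For reference, a minimal sketch of that lower-threshold call, continuing from the code above (it reuses the anchor, c, X_test, feature_names, idx and exp objects defined there; the variable name exp_low is just my own):
# Re-run the explanation with the lower precision threshold mentioned above
exp_low = anchor.explain_instance(X_test.loc[idx, feature_names].values,
                                  c.predict, threshold=0.2, batch_size=5000)
print('Anchor: %s' % (' AND '.join(exp_low.names())))
print('Precision: %.2f' % exp_low.precision())
print('Coverage: %.2f' % exp_low.coverage())
# For comparison: the unconditioned precision reported in exp_map above
print('all_precision: %.4f' % exp.exp_map['all_precision'])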