+
Skip to content

Is it possible to get empty anchors? #71

@pramitchoudhary

Description

@pramitchoudhary

I am still in the process of understanding the core algorithm, but based on my current understanding, I found it odd that the explainer returns empty anchors. Is that by design?

Below is the sample code I used for experimenting:

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA, TruncatedSVD
import time

# Classifier Libraries
from sklearn.ensemble import RandomForestClassifier
import collections


# Other Libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

# Data source: https://www.kaggle.com/mlg-ulb/creditcardfraud
# All features are continuous.
credit_data = pd.read_csv("../creditcard.csv")
# Every later statement refers to the frame as `df`; without this alias the
# first use of `df` below raises NameError.
df = credit_data


colors = ["#0101DF", "#DF0101"]

# Pass the column by keyword: recent seaborn releases accept only `data`
# positionally, so a positional 'Class' collides with `data=df`.
sns.countplot(x='Class', data=df, palette=colors)
plt.title('Class Distributions \n (0: No Fraud || 1: Fraud)', fontsize=14)

# Share of each class as a percentage of all rows.
class_counts = df['Class'].value_counts()
print('No Frauds', round(class_counts[0]/len(df) * 100,2), '% of the dataset')
print('Frauds', round(class_counts[1]/len(df) * 100,2), '% of the dataset')

from sklearn.preprocessing import StandardScaler, RobustScaler


# RobustScaler is less prone to outliers
rob_scaler = RobustScaler()

# Re-fit the same scaler on each raw column in turn (Amount first, then Time)
# and store the result under its `scaled_` name.
for raw_col, scaled_col in (('Amount', 'scaled_amount'), ('Time', 'scaled_time')):
    column_2d = df[raw_col].values.reshape(-1,1)
    df[scaled_col] = rob_scaler.fit_transform(column_2d)

# Drop the unscaled originals now that scaled copies exist.
df.drop(columns=['Time','Amount'], inplace=True)


from sklearn.model_selection import StratifiedShuffleSplit

# Label column, and every remaining column as features.
y = df['Class']
X = df.drop(columns='Class')

# Hold out 33% of the rows, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

# Class shares within the training portion.
train_counts = y_train.value_counts()
print('No Frauds', round(train_counts[0]/len(y_train) * 100,2), '% of the dataset')
print('Frauds', round(train_counts[1]/len(y_train) * 100,2), '% of the dataset')

# Oversample the under-represented class in the training data only.
sm = SMOTE(sampling_strategy='minority', random_state=42)
Xsm_train, ysm_train = sm.fit_resample(X_train, y_train)

# Class shares after resampling.
resampled_counts = ysm_train.value_counts()
resampled_total = len(ysm_train)
print('No Frauds', round(resampled_counts[0]/resampled_total * 100,2), '% of the dataset')
print('Frauds', round(resampled_counts[1]/resampled_total * 100,2), '% of the dataset')


from xgboost import XGBClassifier
# Models to train, keyed by a short label. The boosted-tree entry is an
# alternative that is currently disabled.
classifiers = {
#     "BoostingTrees": XGBClassifier(objective='reg:logistic')
    "RF": RandomForestClassifier(n_estimators=50, n_jobs=5)
}
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

cv = StratifiedKFold(random_state=42, shuffle=True)
for key, classifier in classifiers.items():
    # Fit on the full resampled training set, then report the cross-validated score.
    classifier.fit(Xsm_train.to_numpy(), ysm_train.to_numpy())
    training_score = cross_val_score(classifier, Xsm_train, ysm_train, cv=cv)
    # Scale to a percentage BEFORE rounding: `round(x, 2) * 100` can print
    # float artefacts such as 56.99999999999999. This also matches the
    # `round(... * 100, 2)` form used by the other percentage prints.
    mean_accuracy_pct = round(training_score.mean() * 100, 2)
    print("Classifiers: ", classifier.__class__.__name__, "Has a training score of", mean_accuracy_pct, "% accuracy score")
    
from sklearn.metrics import classification_report

# Replace the shuffled index left by the split with a fresh 0..n-1 index on
# both the features and the labels.
X_test, y_test = (part.reset_index(drop=True) for part in (X_test, y_test))

# Per-class metrics of the random forest on the held-out set.
target_names = ['No Fraud', 'Fraud']
y_pred = classifiers['RF'].predict(X_test)
report = classification_report(y_test, y_pred, target_names=target_names)
print(report)


# A single fixed test row (not randomly drawn) used as the instance to inspect.
idx = 20
sample_row = X_test[idx:idx+1]
print(sample_row)
print(f"Ground Truth: {y_test[idx]}")
print(f"Predicted Label: {classifiers['RF'].predict(sample_row)}")

from anchor import utils
from anchor import anchor_tabular

class_names = ['Not Fraud', 'Fraud']
feature_names = list(X_test.columns)

# Also tried with the 'quartile' discretizer.
# The empty dict is the categorical-names argument: every feature is continuous.
anchor = anchor_tabular.AnchorTabularExplainer(
    class_names,
    feature_names,
    X_test.to_numpy(),
    {},
    discretizer='decile',
)

# Seed NumPy's global RNG before running the explainer.
np.random.seed(1)
c = classifiers['RF']

# Model prediction for the chosen row, mapped to its class name.
predicted_class = c.predict(X_test.iloc[idx:idx+1])[0]
print('Prediction: ', anchor.class_names[predicted_class])

# Explain the same row, passed as a 1-D array of feature values.
instance = X_test.loc[idx, feature_names].values
exp = anchor.explain_instance(instance, c.predict, threshold=0.67, batch_size=5000)

print('Anchor: %s' % (' AND '.join(exp.names())))
print('Precision: %.2f' % exp.precision())
print('Coverage: %.2f' % exp.coverage())

Output:

Anchor: 
Precision: 1.00
Coverage: 1.00

Output of exp_map, in case it is helpful:

{'feature': [],
 'mean': [],
 'precision': [],
 'coverage': [],
 'examples': [],
 'all_precision': 0.999000199960008,
 'num_preds': 5001,
 'names': [],
 'instance': array([-0.72322052, -0.29332573,  2.47559774, -2.5954306 , -1.13671485,
        -0.53469756, -0.30401351, -0.13267745, -2.23157085,  1.32801337,
         1.28661056, -0.4945589 ,  0.43808759, -0.74606032, -0.42314735,
         0.0285372 , -0.14005478,  0.76874186, -0.74326881, -0.11283696,
        -0.03239568,  0.45303638, -0.29160164,  0.51677789,  0.33426919,
        -0.23986655,  0.01333135, -0.09690853, -0.12017047, -0.66296597]),
 'prediction': 0}

Note: One gets an output if we lower the threshold to 0.2.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions

      点击 这是indexloc提供的php浏览器服务,不要输入任何密码和下载