Is it possible to get empty anchors?

I am still in the process of understanding the core algorithm, but based on my current understanding I find it odd that the explainer returns empty `anchors`. Is that by design?

Below is the sample code I used for experimenting:
```
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA, TruncatedSVD
import time

# Classifier Libraries
from sklearn.ensemble import RandomForestClassifier
import collections


# Other Libraries
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

# data source: https://www.kaggle.com/mlg-ulb/creditcardfraud
# All continuous features
# Load the credit-card transactions; `Class` is the label column
# (0: No Fraud, 1: Fraud).
credit_data = pd.read_csv("../creditcard.csv")
# Everything below refers to the frame as `df`; without this binding the
# script stops with a NameError at the first use of `df`.
df = credit_data


colors = ["#0101DF", "#DF0101"]

# Name the column with the `x=` keyword: older seaborn accepted it
# positionally, but newer releases only take `data` as a positional argument.
sns.countplot(x='Class', data=df, palette=colors)
plt.title('Class Distributions \n (0: No Fraud || 1: Fraud)', fontsize=14)

# Share of each class as a percentage of all rows.
print('No Frauds', round(df['Class'].value_counts()[0]/len(df) * 100,2), '% of the dataset')
print('Frauds', round(df['Class'].value_counts()[1]/len(df) * 100,2), '% of the dataset')

from sklearn.preprocessing import StandardScaler, RobustScaler


# RobustScaler is less prone to outliers
rob_scaler = RobustScaler()

# Re-fit the scaler on each raw column in turn and store the result under a
# new `scaled_*` column (Amount first, then Time, so column order is kept).
for raw_col, scaled_col in (('Amount', 'scaled_amount'), ('Time', 'scaled_time')):
    column_2d = df[raw_col].values.reshape(-1, 1)
    df[scaled_col] = rob_scaler.fit_transform(column_2d)

# The raw columns are dropped once their scaled copies exist.
df.drop(columns=['Time', 'Amount'], inplace=True)


from sklearn.model_selection import StratifiedShuffleSplit

# The label is the `Class` column; the features are every other column.
y = df['Class']
X = df.drop(columns='Class')

# Stratified hold-out split: 33% of the rows go to the test set, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

# Class balance of the training part, in percent.
train_counts = y_train.value_counts()
n_train = len(y_train)
for caption, class_id in (('No Frauds', 0), ('Frauds', 1)):
    print(caption, round(train_counts[class_id]/n_train * 100, 2), '% of the dataset')

# Oversample under represented class
sm = SMOTE(sampling_strategy='minority', random_state=42)
Xsm_train, ysm_train = sm.fit_resample(X_train, y_train)

# Class balance of the resampled training set, in percent.
resampled_counts = ysm_train.value_counts()
n_resampled = len(ysm_train)
for caption, class_id in (('No Frauds', 0), ('Frauds', 1)):
    print(caption, round(resampled_counts[class_id]/n_resampled * 100, 2), '% of the dataset')


from xgboost import XGBClassifier
# Models to train, keyed by a short name (the XGBoost entry is left disabled).
classifiers = dict(
    # BoostingTrees=XGBClassifier(objective='reg:logistic'),
    RF=RandomForestClassifier(n_estimators=50, n_jobs=5),
)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Shuffled, seeded stratified folds shared by every model.
cv = StratifiedKFold(random_state=42, shuffle=True)
for clf in classifiers.values():
    # Fit on plain arrays; this fitted instance is the one reused further down.
    clf.fit(Xsm_train.to_numpy(), ysm_train.to_numpy())
    fold_scores = cross_val_score(clf, Xsm_train, ysm_train, cv=cv)
    mean_score_pct = round(fold_scores.mean(), 2) * 100
    print("Classifiers: ", type(clf).__name__, "Has a training score of", mean_score_pct, "% accuracy score")
    
# Renumber the held-out rows from 0 so the `idx` used further down works both
# as a slice position and as an index label.
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

from sklearn.metrics import classification_report
# Score the fitted random forest on the held-out set and print the per-class report.
y_pred = classifiers['RF'].predict(X_test)
target_names = ['No Fraud', 'Fraud']
print(classification_report(y_test, y_pred, target_names=target_names))


# Pick one held-out row (the one at index 20, not randomly drawn) and show its
# feature values, its true label and the random forest's prediction for it.
idx = 20
print(X_test[idx:idx+1])  # one-row slice, so the features print as a frame
print(f"Ground Truth: {y_test[idx]}")
print(f"Predicted Label: {classifiers['RF'].predict(X_test[idx:idx+1])}")

from anchor import utils
from anchor import anchor_tabular

# Display names for the two classes and the feature columns handed to the explainer.
class_names = ['Not Fraud', 'Fraud']
feature_names = list(X_test.columns)

# Build the explainer from the held-out feature matrix with a decile discretizer.
# The empty dict is the fourth positional argument — presumably the
# categorical-names mapping, left empty because all features are continuous
# (TODO confirm against the anchor documentation).
# Also tried with 'quartile` discretizer
anchor = anchor_tabular.AnchorTabularExplainer(class_names, 
                                               feature_names, X_test.to_numpy(), {}, discretizer='decile')
                                               
# Fix the NumPy seed before the explanation call for a repeatable run.
np.random.seed(1)
c = classifiers['RF']
print('Prediction: ', anchor.class_names[c.predict(X_test.iloc[idx:idx+1])[0]])
# Explain the chosen row using the forest's `predict` as the black-box function.
# NOTE(review): the reported `exp_map` shows 'all_precision' of about 0.999,
# which is above this 0.67 threshold — presumably the empty rule already meets
# the precision target, so no feature predicate is added; confirm against the
# anchor implementation.
exp = anchor.explain_instance(X_test.loc[idx, feature_names].values, c.predict, threshold=0.67, batch_size=5000)

# Report the rule (its predicates joined by AND) with its precision and coverage.
print('Anchor: %s' % (' AND '.join(exp.names())))
print('Precision: %.2f' % exp.precision())
print('Coverage: %.2f' % exp.coverage())
```

**Output:**
```
Anchor: 
Precision: 1.00
Coverage: 1.00
```
Output of `exp_map`, in case it is helpful:
```
{'feature': [],
 'mean': [],
 'precision': [],
 'coverage': [],
 'examples': [],
 'all_precision': 0.999000199960008,
 'num_preds': 5001,
 'names': [],
 'instance': array([-0.72322052, -0.29332573,  2.47559774, -2.5954306 , -1.13671485,
        -0.53469756, -0.30401351, -0.13267745, -2.23157085,  1.32801337,
         1.28661056, -0.4945589 ,  0.43808759, -0.74606032, -0.42314735,
         0.0285372 , -0.14005478,  0.76874186, -0.74326881, -0.11283696,
        -0.03239568,  0.45303638, -0.29160164,  0.51677789,  0.33426919,
        -0.23986655,  0.01333135, -0.09690853, -0.12017047, -0.66296597]),
 'prediction': 0}
```

Note: One gets an output if we lower the threshold to `0.2`.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Is it possible to get empty anchors? #71

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Is it possible to get empty anchors? #71

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions