In [ ]:

'''
                  precision    recall  f1-score   support

1)Original       0.0       1.00      1.00      1.00     85295
               1.0       0.85      0.84      0.84       148

2)with divide    0.0       1.00      1.00      1.00     85295
corr items     1.0       0.89*     0.86      0.88*      148
               
3)with           0.0       1.00      1.00      1.00     85295
corr inputs    1.0       0.82      0.89*     0.86       148          

------------------------Over Sampling-----------------------
with 1).       0.0       1.00      1.00      1.00     85295
SMOTE          1.0       0.67      0.91      0.77       148

with 1).       0.0       1.00      1.00      1.00     85295
ADASYN         1.0       0.68      0.91      0.77       148

with 2).       0.0       1.00      1.00      1.00     85295
ADASYN         1.0       0.69      0.92      0.79       148

-----------------------Under Sampling-----------------------         
with 1).       0.0       1.00      1.00      1.00     85295
TOMEK LINK     1.0       0.85      0.84      0.84       148

with 1).       0.0       1.00      1.00      1.00     85295
ENN            1.0       0.79      0.89      0.83       148

with 2).       0.0       1.00      1.00      1.00     85295
ENN            1.0       0.78      0.89      0.83       148
'''

In [1]:

import sys
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as K

from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization, concatenate
from tensorflow.keras.models import Model

SEED=1111
tf.random.set_seed(SEED)

In [2]:

data = pd.read_csv('./data/creditcard.csv')

In [3]:

data.head()

Out[3]:

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	...	V21	V22	V23	V24	V25	V26	V27	V28	Amount
0	0.0	-1.359807	-0.072781	2.536347	1.378155	-0.338321	0.462388	0.239599	0.098698	0.363787	...	-0.018307	0.277838	-0.110474	0.066928	0.128539	-0.189115	0.133558	-0.021053	149.62
1	0.0	1.191857	0.266151	0.166480	0.448154	0.060018	-0.082361	-0.078803	0.085102	-0.255425	...	-0.225775	-0.638672	0.101288	-0.339846	0.167170	0.125895	-0.008983	0.014724	2.69
2	1.0	-1.358354	-1.340163	1.773209	0.379780	-0.503198	1.800499	0.791461	0.247676	-1.514654	...	0.247998	0.771679	0.909412	-0.689281	-0.327642	-0.139097	-0.055353	-0.059752	378.66
3	1.0	-0.966272	-0.185226	1.792993	-0.863291	-0.010309	1.247203	0.237609	0.377436	-1.387024	...	-0.108300	0.005274	-0.190321	-1.175575	0.647376	-0.221929	0.062723	0.061458	123.50
4	2.0	-1.158233	0.877737	1.548718	0.403034	-0.407193	0.095921	0.592941	-0.270533	0.817739	...	-0.009431	0.798278	-0.137458	0.141267	-0.206010	0.502292	0.219422	0.215153	69.99

5 rows × 31 columns

In [4]:

round(data.describe(), 3)

Out[4]:

	Time	V1	V2	V3	V4	V5	V6	V7	V8	V9	...	V21	V22	V23	V24	V25	V26	V27	V28	Amount	Class
count	284807.000	284807.000	284807.000	284807.000	284807.000	284807.000	284807.000	284807.000	284807.000	284807.000	...	284807.000	284807.000	284807.000	284807.000	284807.000	284807.000	284807.000	284807.000	284807.000	284807.000
mean	94813.860	0.000	0.000	-0.000	0.000	0.000	0.000	-0.000	0.000	-0.000	...	0.000	-0.000	0.000	0.000	0.000	0.000	-0.000	-0.000	88.350	0.002
std	47488.146	1.959	1.651	1.516	1.416	1.380	1.332	1.237	1.194	1.099	...	0.735	0.726	0.624	0.606	0.521	0.482	0.404	0.330	250.120	0.042
min	0.000	-56.408	-72.716	-48.326	-5.683	-113.743	-26.161	-43.557	-73.217	-13.434	...	-34.830	-10.933	-44.808	-2.837	-10.295	-2.605	-22.566	-15.430	0.000	0.000
25%	54201.500	-0.920	-0.599	-0.890	-0.849	-0.692	-0.768	-0.554	-0.209	-0.643	...	-0.228	-0.542	-0.162	-0.355	-0.317	-0.327	-0.071	-0.053	5.600	0.000
50%	84692.000	0.018	0.065	0.180	-0.020	-0.054	-0.274	0.040	0.022	-0.051	...	-0.029	0.007	-0.011	0.041	0.017	-0.052	0.001	0.011	22.000	0.000
75%	139320.500	1.316	0.804	1.027	0.743	0.612	0.399	0.570	0.327	0.597	...	0.186	0.529	0.148	0.440	0.351	0.241	0.091	0.078	77.165	0.000
max	172792.000	2.455	22.058	9.383	16.875	34.802	73.302	120.589	20.007	15.595	...	27.203	10.503	22.528	4.585	7.520	3.517	31.612	33.848	25691.160	1.000

8 rows × 31 columns

In [5]:

from sklearn.preprocessing import StandardScaler, RobustScaler

In [6]:

data['Scaled_Amount'] = RobustScaler().fit_transform(data['Amount'].values.reshape(-1,1))
data['Scaled_Time'] = RobustScaler().fit_transform(data['Time'].values.reshape(-1,1))
data.drop(['Time','Amount'], axis=1,inplace=True)

In [7]:

data.Class = data.Class.astype('float')

In [8]:

x_train,x_test, y_train,y_test = train_test_split(
    data.drop('Class',axis=1),
    data.Class,
    test_size=.3,
    stratify=data.Class,
    shuffle=True,
    random_state=SEED
)

In [9]:

x_train,x_val, y_train, y_val = train_test_split(
    data.drop('Class', axis=1),
    data.Class,
    test_size=.2,
    stratify=data.Class,
    shuffle=True,
    random_state=SEED
)

In [10]:

fraud_data = data[data.Class==1]
normal_data = data[data.Class==0]

n_fraud, n_normal, n_data = len(fraud_data), len(normal_data), len(data)
print('Normal:\t', round(100*n_normal/n_data, 3),'%')
print('Fraud:\t', round(100*n_fraud/n_data, 3),'%')

Normal:	 99.827 %
Fraud:	 0.173 %

In [11]:

plt.style.use('fivethirtyeight')

plt.figure()
plt.tight_layout()
n_columns = len(data.columns)-1
nrows=n_columns//4+(1 if n_columns%4 != 0 else 0)
fig, axes = plt.subplots(nrows,4, figsize=(28,nrows*7))

i = 0
for col in data.columns:
    if col == 'Class': continue
    i += 1
    plt.subplot(nrows,4,i)
    sns.distplot(normal_data[col], label='Normal', 
                 kde_kws={"color": "b", "lw": 1, "alpha":1})
    sns.distplot(fraud_data[col], label='Fraud',
                 kde_kws={"color": "r", "lw": 1, "alpha":1})
    plt.legend()

plt.show()

<Figure size 432x288 with 0 Axes>

In [11]:

plt.figure(figsize=(10,7))
sns.heatmap(normal_data.corr(), cmap='RdBu', center=0)

Out[11]:

<AxesSubplot:>

In [11]:

normal_corr = normal_data.drop('Class',axis=1).corr()
fraud_corr = fraud_data.drop('Class',axis=1).corr()

In [12]:

fig, axes = plt.subplots(1,2, figsize=(26,10))
sns.heatmap(normal_data.drop('Class',axis=1).sample(n_fraud).corr(), cmap='RdBu', ax=axes[0], center=0)
#sns.heatmap(normal_corr, cmap='RdBu', ax=axes[0], center=0)
axes[0].set_title('Normal')

sns.heatmap(fraud_corr, cmap='RdBu', ax=axes[1], center=0)
axes[1].set_title('Fraud')

# Fraud Data에만 두드러지는 선형적 상관관계가 보임
# 이 차이를 활용할 수 있을까?

Out[12]:

Text(0.5, 1.0, 'Fraud')

In [13]:

from collections import defaultdict

diff_corr = abs(normal_corr-fraud_corr)
columns = diff_corr.columns

diff_dict = defaultdict(int)
n_repeat = n_normal//n_fraud
for _ in range(n_repeat):
    sample_normal_corr = normal_data.drop('Class',axis=1).sample(n_fraud,replace=False).corr()
    diff_corr = abs(sample_normal_corr - fraud_corr)
    for i, col in enumerate(columns):
        for j in range(i):
            diff_dict[columns[j]+'/'+columns[i]] += diff_corr[columns[j]][columns[i]]

for key in diff_dict:
    diff_dict[key] /= n_repeat

corr_list = sorted(diff_dict.items(),key=lambda x:-x[1])[:20]

corr_items = []
for key, degree in corr_list:
    a, b = key.split('/')
    corr_items.append((a,b))

In [14]:

corr_list

Out[14]:

[('V16/V17', 1.1098562118798299),
 ('V17/V18', 1.0714954444257931),
 ('V12/V17', 1.0164098720753347),
 ('V16/V18', 1.000350677473375),
 ('V1/V3', 0.9779039533385792),
 ('V12/V16', 0.9717030496360538),
 ('V7/V10', 0.9570532768529891),
 ('V3/V5', 0.9564447061784479),
 ('V3/V7', 0.9549581398619433),
 ('V11/V12', 0.9531436615879483),
 ('V1/V7', 0.9468827099902223),
 ('V10/V17', 0.9454250931764953),
 ('V10/V12', 0.9327710615088803),
 ('V11/V14', 0.9291155590155674),
 ('V1/V5', 0.9284218042504773),
 ('V2/V7', 0.9206545142650776),
 ('V12/V14', 0.9171922094486237),
 ('V9/V10', 0.9162483913700069),
 ('V21/V22', 0.9099059014655518),
 ('V2/V3', 0.9006638934446911)]

In [32]:

params_RF= {
    'n_estimators':[100],
    'max_depth':[10,15],
    'min_samples_leaf':[1,2,4],
    'min_samples_split':[2,4]
}

In [33]:

model_RF = RandomForestClassifier(random_state=SEED, n_jobs=-1)
grid_cv = GridSearchCV(model_RF, param_grid=params_RF,
                       cv=3,
                       n_jobs=-1, scoring='f1',
                       verbose=0)
grid_cv.fit(x_train, y_train)

Out[33]:

GridSearchCV(cv=3,
             estimator=RandomForestClassifier(n_jobs=-1, random_state=1111),
             n_jobs=-1,
             param_grid={'max_depth': [10, 15], 'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 4], 'n_estimators': [100]},
             scoring='f1')

In [34]:

print('Best Params:',grid_cv.best_params_)
print('Best F1-Score: ', round(grid_cv.best_score_,3))

Best Params: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best F1-Score:  0.858

In [35]:

model_RF1 = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=SEED,
    n_jobs=-1
)
model_RF1.fit(x_train,y_train)

Out[35]:

RandomForestClassifier(max_depth=15, n_jobs=-1, random_state=1111)

In [36]:

print(confusion_matrix(y_val, model_RF1.predict(x_val)))
print(classification_report(y_val,model_RF1.predict(x_val)))

[[56863     1]
 [   20    78]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.99      0.80      0.88        98

    accuracy                           1.00     56962
   macro avg       0.99      0.90      0.94     56962
weighted avg       1.00      1.00      1.00     56962

In [37]:

print(confusion_matrix(y_test, model_RF1.predict(x_test)))
print(classification_report(y_test,model_RF1.predict(x_test)))

[[85294     1]
 [   27   121]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.99      0.82      0.90       148

    accuracy                           1.00     85443
   macro avg       1.00      0.91      0.95     85443
weighted avg       1.00      1.00      1.00     85443

In [38]:

plt.figure(figsize=(30,20))
sns.barplot(x=data.drop('Class',axis=1).columns,
            y=model_RF1.feature_importances_)

Out[38]:

<AxesSubplot:>

In [195]:

column_index = {column:i for i,column in enumerate(x_train.columns)}

In [14]:

def print_score(model, x,y):
    pred = model.predict(x)
    y_pred = np.round(pred)
    print(confusion_matrix(y,y_pred))
    print(classification_report(y,y_pred))

In [15]:

def get_inputs(x_train):
    corr_inputs = []

    for key_a, key_b in corr_items:
        _relative = x_train[key_a].to_numpy()/(x_train[key_b].to_numpy()+1e-8)
        corr_inputs.append(_relative)

    base_inputs = np.array(x_train)
    corr_inputs = np.array(corr_inputs).T
    return base_inputs.astype('float'), corr_inputs.astype('float')

In [16]:

def visualize_history(history):
    fig, axes = plt.subplots(1,3, figsize=(20,5))

    axes[0].plot(history.history['loss'], label='loss')
    axes[0].plot(history.history['val_loss'], label='val_loss')
    axes[0].legend()

    axes[1].plot(history.history['recall'], label='recall')
    axes[1].plot(history.history['val_recall'], label='val_recall')

    axes[1].plot(history.history['precision'], label='precision')
    axes[1].plot(history.history['val_precision'], label='val_precision')
    axes[1].legend()

    axes[2].plot(history.history['f1_score'], label='f1_score')
    axes[2].plot(history.history['val_f1_score'], label='val_f1_score1')
    axes[2].legend()

    plt.show()

In [17]:

def f1_score(y_true, y_pred):
    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0) + K.epsilon()
    ground_positives = K.sum(y_true, axis=0) + K.epsilon()
    pred_positives = K.sum(K.round(K.clip(y_pred, 0, 1)), axis=0) + K.epsilon()

    precision = tp / (pred_positives+K.epsilon())
    recall = tp / (ground_positives+K.epsilon())
    
    f1 = 2*((precision*recall)/(precision+recall+K.epsilon()))

    return f1

def f1_loss(y_true, y_pred):
    tp = K.sum(y_true * y_pred, axis=0) + K.epsilon()
    ground_positives = K.sum(y_true, axis=0) + K.epsilon()
    ground_negatives = K.sum(1-y_true, axis=0) + K.epsilon()
    pred_positives = K.sum(y_pred, axis=0) + K.epsilon()

    ratio = ground_negatives/ground_positives
    
    precision = tp / (pred_positives+K.epsilon())
    recall = tp / (ground_positives+K.epsilon())
    
    f1 = 2*((precision*recall)/(precision+recall+K.epsilon()))
    
    weighted_f1 = f1*ground_positives / K.sum(ground_positives+K.epsilon())
    weighted_f1 = K.sum(weighted_f1)
    
    return 1-weighted_f1

In [18]:

# model with NN
# create function for Over/Under sampling
def model_NN(x_train, y_train, class_weight=False):
    DEEP = 5

    base_inputs, corr_inputs = get_inputs(x_train)
    base_input = Input(shape=(base_inputs.shape[-1],), name='base_input')

    base_x = Dense(512, activation='tanh', name='tanh',
                   kernel_initializer='he_normal')(base_input)
    base_x = Dropout(0.5)(base_x)

    for _ in range(DEEP):
        base_x = Dense(512, activation='relu', 
                       kernel_initializer='he_normal')(base_x)
        base_x = Dropout(0.6)(base_x)

    outputs = Dense(1, activation='sigmoid')(base_x)

    model = Model(inputs=base_input, outputs=outputs)
    model.compile(
                optimizer=keras.optimizers.Adam(learning_rate=0.005),
                #optimizer=keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
                #optimizer=keras.optimizers.RMSprop(learning_rate=0.0001),
                #loss=f1_loss,
                loss='binary_crossentropy', 
                metrics=[
                      f1_score,
                      keras.metrics.Precision(name='precision'), 
                      keras.metrics.Recall(name='recall'),
                      keras.metrics.TrueNegatives(name='tn'),
                      keras.metrics.TruePositives(name='tp')
                ])

    epochs = 100
    batch_size = 4096

    val_base_inputs, val_corr_inputs = get_inputs(x_val)
    tbase_inputs, tcorr_inputs = get_inputs(x_test)

    with tf.device('/gpu:0'):
        history = model.fit(base_inputs, y_train,
                  validation_data=(val_base_inputs, y_val),
                  # weight 조정으로 recall / precision trade
                  class_weight={0:1, 1:(n_data/n_fraud)**.5/2 if class_weight else 1},
                  callbacks = [
                               keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2, min_lr=0.0000001, patience=10, verbose=2, 
                                                                 mode='min'),
                               keras.callbacks.ModelCheckpoint(monitor='val_f1_score', filepath='./checkpoint/NN', save_best_only=True, 
                                                               mode='max')

                               #,keras.callbacks.EarlyStopping(monitor='val_loss', patience=100, verbose=1)
                              ],
                  #callbacks = [c1,c2],
                  epochs=epochs,
                  batch_size=batch_size,
                  verbose=1)
    return model, history

In [ ]:

model, history = model_NN(x_train, y_train, class_weight=True)

In [131]:

best_model = keras.models.load_model('./checkpoint/NN', custom_objects={'f1_score':f1_score})

In [ ]:

# validation data
print_score(best_model, val_base_inputs, y_val)
print_score(model, val_base_inputs, y_val)

In [133]:

# test data
print_score(best_model, tbase_inputs, y_test)
print_score(model, tbase_inputs, y_test)

[[85271    24]
 [   22   126]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     85295
         1.0       0.84      0.85      0.85       148

    accuracy                           1.00     85443
   macro avg       0.92      0.93      0.92     85443
weighted avg       1.00      1.00      1.00     85443

[[85273    22]
 [   24   124]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     85295
         1.0       0.85      0.84      0.84       148

    accuracy                           1.00     85443
   macro avg       0.92      0.92      0.92     85443
weighted avg       1.00      1.00      1.00     85443

In [134]:

# binary_crossentropy loss 사용 시, 0.51을 1로 판단하면 loss 크고 099를 1로 판단하면 loss 작음
# f1_score는 수렴
# validation set에서의 f1_score가 더 높은 이유는 dropout layer 때문일 가능성이 있음
visualize_history(history)

In [25]:

def schedule(epoch, lr):
    if epoch==15:
        return lr * 0.5
    else:
        return lr

In [ ]:

# model with divided corr_items
# 선형적 상관관계 col_a = a * col_b -> col_a/col_b = a

DEEP = 5

base_inputs, corr_inputs = get_inputs(x_train)

dense_layers= []

input_base = Input(shape=(base_inputs.shape[-1],), name='base_input')
input_corr = Input(shape=(corr_inputs.shape[-1],), name='corr_input')

inputs = [input_base,input_corr]
input_concat = concatenate([input_base, input_corr])

x_concat = Dense(512, activation='tanh', kernel_initializer='glorot_normal', name='concat_tanh')(input_concat)
x_concat = Dropout(0.5)(x_concat)

for _ in range(DEEP):
    x_concat = Dense(512, activation='relu', kernel_initializer='he_normal')(x_concat)
    x_concat = Dropout(0.5)(x_concat)
    
outputs = Dense(1, activation='sigmoid', name='output')(x_concat)

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.005),
              loss='binary_crossentropy', 
              metrics=[
                  f1_score,
                  keras.metrics.Precision(name='precision'), 
                  keras.metrics.Recall(name='recall'),
                  keras.metrics.TrueNegatives(name='tn'),
                  keras.metrics.TruePositives(name='tp'),
                  'acc'
              ])

epochs = 100
batch_size = 4096

val_base_inputs, val_corr_inputs = get_inputs(x_val)
tbase_inputs, tcorr_inputs = get_inputs(x_test)

with tf.device('/gpu:0'):
    history = model.fit([base_inputs, corr_inputs], y_train,
              validation_data=([val_base_inputs, val_corr_inputs],y_val),
                        
              # weight 조정으로 recall / precision trade
              class_weight={0:1, 1:(n_normal/n_fraud)**.5/2},
              callbacks = [keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2, min_lr=0.0000001, patience=5, verbose=2,
                                                             mode='min'),
                           keras.callbacks.ModelCheckpoint(monitor='val_f1_score', filepath='./checkpoint/NN_interact', 
                                                           save_best_only=True,
                                                           mode='max'),
                           #keras.callbacks.EarlyStopping(monitor='val_f1_score', patience=100, mode='max', verbose=1)
                          ],
              epochs=epochs,
              batch_size=batch_size,
              verbose=1)

In [ ]:

model.summary()

In [83]:

best_model = keras.models.load_model('./checkpoint/NN_interact', custom_objects={'f1_score':f1_score})

In [ ]:

# validation data
print_score(best_model, [val_base_inputs, val_corr_inputs], y_val)
print_score(model, [val_base_inputs, val_corr_inputs], y_val)

In [85]:

# test data
print_score(best_model, [tbase_inputs, tcorr_inputs], y_test)
print_score(model, [tbase_inputs, tcorr_inputs], y_test)

[[85283    12]
 [   21   127]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     85295
         1.0       0.91      0.86      0.89       148

    accuracy                           1.00     85443
   macro avg       0.96      0.93      0.94     85443
weighted avg       1.00      1.00      1.00     85443

[[85279    16]
 [   20   128]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     85295
         1.0       0.89      0.86      0.88       148

    accuracy                           1.00     85443
   macro avg       0.94      0.93      0.94     85443
weighted avg       1.00      1.00      1.00     85443

In [86]:

visualize_history(history)

In [51]:

def get_inputs2(x_train):
    corr_inputs = []
    
    for key_a, key_b in corr_items:
        _relative = np.append(x_train[key_a].to_numpy(), x_train[key_b].to_numpy()).reshape(-1,2, order='F')
        corr_inputs.append(_relative)
    
    base_inputs = np.array(x_train)
    corr_inputs = np.array(corr_inputs)
    
    return base_inputs.astype('float'), corr_inputs.astype('float')

In [ ]:

# model with sigmoid interaction columns
# 선형적 관계를 가진 컬럼 -> sigmoid -> + base_inputs

DEEP = 5

base_inputs, corr_inputs = get_inputs2(x_train)
train_input = [base_inputs] + [corr_input for corr_input in corr_inputs]

inputs = []
dense_layers= []

input_base = Input(shape=(base_inputs.shape[-1],), name='base_input')
inputs.append(input_base)
    
for i, corr_input in enumerate(corr_inputs):
    _input = Input(shape=(corr_input.shape[-1],), name='corr_input_'+str(i))
    _x = Dense(1, activation='sigmoid', name='corr_dense_'+str(i))(_input)
    inputs.append(_input)
    dense_layers.append(_x)
    

input_concat = concatenate([input_base] + dense_layers)
x_concat = Dense(512, kernel_initializer='glorot_normal', activation='tanh', name='conact_tanh')(input_concat)
x_concat = Dropout(0.3)(x_concat)
for _ in range(DEEP):
    x_concat = Dense(512, activation='relu', kernel_initializer='he_normal')(x_concat)
    x_concat = Dropout(0.4)(x_concat)
    
outputs = Dense(1, activation='sigmoid', name='outputs')(x_concat)

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.005),
              loss='binary_crossentropy', 
              #loss_weights={'output_dense':0.5, 'output_base':0.5},
              metrics=[
                  f1_score,
                  keras.metrics.Precision(name='precision'), 
                  keras.metrics.Recall(name='recall'),
                  keras.metrics.TrueNegatives(name='tn'),
                  keras.metrics.TruePositives(name='tp'),
                  'acc'
              ])

epochs = 100
batch_size = 4096

val_base_inputs, val_corr_inputs = get_inputs2(x_val)
val_input = [val_base_inputs] + [corr_input for corr_input in val_corr_inputs]

tbase_inputs, tcorr_inputs = get_inputs2(x_test)
test_input = [tbase_inputs] + [corr_input for corr_input in tcorr_inputs]

with tf.device('/gpu:0'):
    history = model.fit(train_input, y_train,
              validation_data=(val_input,y_val),
              # weight 조정으로 recall / precision trade
              class_weight={0:1, 1:(n_data/n_fraud)**.5/2},
              callbacks = [
                           keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2, min_lr=0.000001, patience=5, 
                                                             verbose=2, mode='min'),
                           keras.callbacks.ModelCheckpoint(monitor='val_f1_score', filepath='./checkpoint/NN_inter_col', save_best_only=True, 
                                                           mode='max'),
                           #keras.callbacks.EarlyStopping(monitor='val_f1_score', patience=100, verbose=1)
                          ],
              epochs=epochs,
              batch_size=batch_size,
              verbose=1)

In [115]:

best_model = keras.models.load_model('./checkpoint/NN_inter_col', custom_objects={'f1_score':f1_score})

In [ ]:

# validation data
print_score(best_model, val_input, y_val)
print_score(model, val_input, y_val)

In [117]:

# test data
print_score(best_model, test_input, y_test)
print_score(model, test_input, y_test)

[[85271    24]
 [   21   127]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     85295
         1.0       0.84      0.86      0.85       148

    accuracy                           1.00     85443
   macro avg       0.92      0.93      0.92     85443
weighted avg       1.00      1.00      1.00     85443

[[85267    28]
 [   16   132]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     85295
         1.0       0.82      0.89      0.86       148

    accuracy                           1.00     85443
   macro avg       0.91      0.95      0.93     85443
weighted avg       1.00      1.00      1.00     85443

In [118]:

visualize_history(history)

In [179]:

def get_frauds(inputs):
    return inputs[inputs.index.isin(fraud_data.index)]
def get_normals(inputs):
    return inputs[inputs.index.isin(normal_data.index)]
def mse(x, y):
    return np.mean(np.power(x-y,2),axis=1)

In [193]:

class AutoEncoder():
    def __init__(self, units: list, n_factors: int, batch_size=16, epochs=200, callbacks={}):
        self.units = units
        self.n_factors = n_factors
        self.batch_size = batch_size
        self.epochs = epochs
        self.callbacks = callbacks
        
    def fit(self, inputs):
        normals = get_normals(inputs)
        self.inputs = Input(shape=(normals.shape[-1],))
        
        x = self.inputs
        for i, unit in enumerate(self.units):
            activation = 'tanh' if i==0 else 'relu'
            x = Dense(unit, activation=activation, kernel_initializer='he_normal')(x)
        
        x = Dense(self.n_factors, activation='relu')(x)
        
        for i, unit in enumerate(self.units[::-1]):
            activation = 'tanh' if i==0 else 'relu'
            x = Dense(unit, activation=activation,  kernel_initializer='he_normal')(x)

        self.outputs = Dense(normals.shape[-1], activation='relu')(x)
        
        model = Model(self.inputs, self.outputs)
        model.compile(loss='mse', optimizer='adam', metrics=['acc'])

        model.fit(normals, normals,
                  batch_size = self.batch_size,
                  epochs = self.epochs,
                  validation_split=0.2,
                  callbacks = self.callbacks)
        return model

In [ ]:

callbacks = keras.callbacks.EarlyStopping(patience=10)
autoencoder = AutoEncoder(units=[512,256,256,128], n_factors=16, batch_size=256)
autoencoder = autoencoder.fit(x_train)

In [207]:

normal_mse = np.array(mse(get_normals(x_test),autoencoder.predict(get_normals(x_test))))
fraud_mse = np.array(mse(get_frauds(x_test),autoencoder.predict(get_frauds(x_test))))

In [213]:

THRESHOLD = 2
print(np.sum(normal_mse<=THRESHOLD), np.sum(normal_mse>THRESHOLD))
print(np.sum(fraud_mse<=THRESHOLD), np.sum(fraud_mse>THRESHOLD))

82820 2475
25 123

In [198]:

plt.figure(figsize=(30,10))
plt.plot(get_normals(x_test).index,mse(get_normals(x_test),autoencoder.predict(get_normals(x_test))), marker='o', linestyle='',)
plt.plot(get_frauds(x_test).index,mse(get_frauds(x_test),autoencoder.predict(get_frauds(x_test))), color='r', marker='o', linestyle='',)
plt.show()

In [21]:

import imblearn

In [22]:

smote = imblearn.over_sampling.SMOTE(n_jobs=-1, random_state=SEED)
adasyn = imblearn.over_sampling.ADASYN(n_jobs=-1, random_state=SEED)

In [23]:

x_res_smote, y_res_smote = smote.fit_resample(x_train, y_train)
x_res_adasyn, y_res_adasyn = smote.fit_resample(x_train,y_train)

In [ ]:

model,history = model_NN(x_res_smote, y_res_smote)

In [146]:

print_score(model,x_test,y_test)

[[85227    68]
 [   13   135]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     85295
         1.0       0.67      0.91      0.77       148

    accuracy                           1.00     85443
   macro avg       0.83      0.96      0.88     85443
weighted avg       1.00      1.00      1.00     85443

In [147]:

visualize_history(history)

In [ ]:

model, history = model_NN(x_res_adasyn, y_res_adasyn)

In [141]:

print_score(model,x_test,y_test)

[[85231    64]
 [   14   134]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     85295
         1.0       0.68      0.91      0.77       148

    accuracy                           1.00     85443
   macro avg       0.84      0.95      0.89     85443
weighted avg       1.00      1.00      1.00     85443

In [142]:

visualize_history(history)

In [ ]:

model, history = model_NN_corr(x_res_adasyn, y_res_adasyn)

In [42]:

print_score(model, [get_inputs(x_test)], y_test)

[[85234    61]
 [   12   136]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     85295
         1.0       0.69      0.92      0.79       148

    accuracy                           1.00     85443
   macro avg       0.85      0.96      0.89     85443
weighted avg       1.00      1.00      1.00     85443

In [43]:

visualize_history(history)

In [44]:

tomek =imblearn.under_sampling.TomekLinks(n_jobs=-1)
enn = imblearn.under_sampling.EditedNearestNeighbours(n_jobs=-1)

In [46]:

x_res_tomek,y_res_tomek = tomek.fit_resample(x_train,y_train)
x_res_enn, y_res_enn  = enn.fit_resample(x_train,y_train)

In [ ]:

model,history = model_NN(x_res_tomek, y_res_tomek, class_weight=True)

In [ ]:

print_score(model, x_test, y_test)

In [ ]:

visualize_history(history)

In [ ]:

model, history = model_NN(x_res_enn, y_res_enn, class_weight=True)

In [ ]:

print_score(model, x_test, y_test)

In [ ]:

visualize_history(history)

In [ ]:

model, history = model_NN_corr(x_res_enn, y_res_enn, class_weight=True)

In [48]:

print_score(model, [get_inputs(x_test)], y_test)

[[85259    36]
 [   17   131]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     85295
         1.0       0.78      0.89      0.83       148

    accuracy                           1.00     85443
   macro avg       0.89      0.94      0.92     85443
weighted avg       1.00      1.00      1.00     85443

In [49]:

visualize_history(history)

-

Credit Card Fraud Detection / IPYNB

============================================¶

Oversampling & Undersampling¶

댓글

티스토리툴바