In [ ]:
'''
precision recall f1-score support
1)Original 0.0 1.00 1.00 1.00 85295
1.0 0.85 0.84 0.84 148
2)with divide 0.0 1.00 1.00 1.00 85295
corr items 1.0 0.89* 0.86 0.88* 148
3)with 0.0 1.00 1.00 1.00 85295
corr inputs 1.0 0.82 0.89* 0.86 148
------------------------Over Sampling-----------------------
with 1). 0.0 1.00 1.00 1.00 85295
SMOTE 1.0 0.67 0.91 0.77 148
with 1). 0.0 1.00 1.00 1.00 85295
ADASYN 1.0 0.68 0.91 0.77 148
with 2). 0.0 1.00 1.00 1.00 85295
ADASYN 1.0 0.69 0.92 0.79 148
-----------------------Under Sampling-----------------------
with 1). 0.0 1.00 1.00 1.00 85295
TOMEK LINK 1.0 0.85 0.84 0.84 148
with 1). 0.0 1.00 1.00 1.00 85295
ENN 1.0 0.79 0.89 0.83 148
with 2). 0.0 1.00 1.00 1.00 85295
ENN 1.0 0.78 0.89 0.83 148
'''
In [1]:
import sys
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization, concatenate
from tensorflow.keras.models import Model
SEED=1111
tf.random.set_seed(SEED)
In [2]:
data = pd.read_csv('./data/creditcard.csv')
In [3]:
data.head()
Out[3]:
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | 0 |
1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | 0 |
2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | 0 |
3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | 0 |
4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | 0 |
5 rows × 31 columns
In [4]:
round(data.describe(), 3)
Out[4]:
Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 284807.000 | 284807.000 | 284807.000 | 284807.000 | 284807.000 | 284807.000 | 284807.000 | 284807.000 | 284807.000 | 284807.000 | ... | 284807.000 | 284807.000 | 284807.000 | 284807.000 | 284807.000 | 284807.000 | 284807.000 | 284807.000 | 284807.000 | 284807.000 |
mean | 94813.860 | 0.000 | 0.000 | -0.000 | 0.000 | 0.000 | 0.000 | -0.000 | 0.000 | -0.000 | ... | 0.000 | -0.000 | 0.000 | 0.000 | 0.000 | 0.000 | -0.000 | -0.000 | 88.350 | 0.002 |
std | 47488.146 | 1.959 | 1.651 | 1.516 | 1.416 | 1.380 | 1.332 | 1.237 | 1.194 | 1.099 | ... | 0.735 | 0.726 | 0.624 | 0.606 | 0.521 | 0.482 | 0.404 | 0.330 | 250.120 | 0.042 |
min | 0.000 | -56.408 | -72.716 | -48.326 | -5.683 | -113.743 | -26.161 | -43.557 | -73.217 | -13.434 | ... | -34.830 | -10.933 | -44.808 | -2.837 | -10.295 | -2.605 | -22.566 | -15.430 | 0.000 | 0.000 |
25% | 54201.500 | -0.920 | -0.599 | -0.890 | -0.849 | -0.692 | -0.768 | -0.554 | -0.209 | -0.643 | ... | -0.228 | -0.542 | -0.162 | -0.355 | -0.317 | -0.327 | -0.071 | -0.053 | 5.600 | 0.000 |
50% | 84692.000 | 0.018 | 0.065 | 0.180 | -0.020 | -0.054 | -0.274 | 0.040 | 0.022 | -0.051 | ... | -0.029 | 0.007 | -0.011 | 0.041 | 0.017 | -0.052 | 0.001 | 0.011 | 22.000 | 0.000 |
75% | 139320.500 | 1.316 | 0.804 | 1.027 | 0.743 | 0.612 | 0.399 | 0.570 | 0.327 | 0.597 | ... | 0.186 | 0.529 | 0.148 | 0.440 | 0.351 | 0.241 | 0.091 | 0.078 | 77.165 | 0.000 |
max | 172792.000 | 2.455 | 22.058 | 9.383 | 16.875 | 34.802 | 73.302 | 120.589 | 20.007 | 15.595 | ... | 27.203 | 10.503 | 22.528 | 4.585 | 7.520 | 3.517 | 31.612 | 33.848 | 25691.160 | 1.000 |
8 rows × 31 columns
In [5]:
from sklearn.preprocessing import StandardScaler, RobustScaler
In [6]:
data['Scaled_Amount'] = RobustScaler().fit_transform(data['Amount'].values.reshape(-1,1))
data['Scaled_Time'] = RobustScaler().fit_transform(data['Time'].values.reshape(-1,1))
data.drop(['Time','Amount'], axis=1,inplace=True)
In [7]:
data.Class = data.Class.astype('float')
In [8]:
x_train,x_test, y_train,y_test = train_test_split(
data.drop('Class',axis=1),
data.Class,
test_size=.3,
stratify=data.Class,
shuffle=True,
random_state=SEED
)
In [9]:
x_train,x_val, y_train, y_val = train_test_split(
data.drop('Class', axis=1),
data.Class,
test_size=.2,
stratify=data.Class,
shuffle=True,
random_state=SEED
)
In [10]:
fraud_data = data[data.Class==1]
normal_data = data[data.Class==0]
n_fraud, n_normal, n_data = len(fraud_data), len(normal_data), len(data)
print('Normal:\t', round(100*n_normal/n_data, 3),'%')
print('Fraud:\t', round(100*n_fraud/n_data, 3),'%')
Normal: 99.827 % Fraud: 0.173 %
In [11]:
plt.style.use('fivethirtyeight')
plt.figure()
plt.tight_layout()
n_columns = len(data.columns)-1
nrows=n_columns//4+(1 if n_columns%4 != 0 else 0)
fig, axes = plt.subplots(nrows,4, figsize=(28,nrows*7))
i = 0
for col in data.columns:
if col == 'Class': continue
i += 1
plt.subplot(nrows,4,i)
sns.distplot(normal_data[col], label='Normal',
kde_kws={"color": "b", "lw": 1, "alpha":1})
sns.distplot(fraud_data[col], label='Fraud',
kde_kws={"color": "r", "lw": 1, "alpha":1})
plt.legend()
plt.show()
<Figure size 432x288 with 0 Axes>
In [11]:
plt.figure(figsize=(10,7))
sns.heatmap(normal_data.corr(), cmap='RdBu', center=0)
Out[11]:
<AxesSubplot:>
In [11]:
normal_corr = normal_data.drop('Class',axis=1).corr()
fraud_corr = fraud_data.drop('Class',axis=1).corr()
In [12]:
fig, axes = plt.subplots(1,2, figsize=(26,10))
sns.heatmap(normal_data.drop('Class',axis=1).sample(n_fraud).corr(), cmap='RdBu', ax=axes[0], center=0)
#sns.heatmap(normal_corr, cmap='RdBu', ax=axes[0], center=0)
axes[0].set_title('Normal')
sns.heatmap(fraud_corr, cmap='RdBu', ax=axes[1], center=0)
axes[1].set_title('Fraud')
# Fraud Data에만 두드러지는 선형적 상관관계가 보임
# 이 차이를 활용할 수 있을까?
Out[12]:
Text(0.5, 1.0, 'Fraud')
In [13]:
from collections import defaultdict
diff_corr = abs(normal_corr-fraud_corr)
columns = diff_corr.columns
diff_dict = defaultdict(int)
n_repeat = n_normal//n_fraud
for _ in range(n_repeat):
sample_normal_corr = normal_data.drop('Class',axis=1).sample(n_fraud,replace=False).corr()
diff_corr = abs(sample_normal_corr - fraud_corr)
for i, col in enumerate(columns):
for j in range(i):
diff_dict[columns[j]+'/'+columns[i]] += diff_corr[columns[j]][columns[i]]
for key in diff_dict:
diff_dict[key] /= n_repeat
corr_list = sorted(diff_dict.items(),key=lambda x:-x[1])[:20]
corr_items = []
for key, degree in corr_list:
a, b = key.split('/')
corr_items.append((a,b))
In [14]:
corr_list
Out[14]:
[('V16/V17', 1.1098562118798299), ('V17/V18', 1.0714954444257931), ('V12/V17', 1.0164098720753347), ('V16/V18', 1.000350677473375), ('V1/V3', 0.9779039533385792), ('V12/V16', 0.9717030496360538), ('V7/V10', 0.9570532768529891), ('V3/V5', 0.9564447061784479), ('V3/V7', 0.9549581398619433), ('V11/V12', 0.9531436615879483), ('V1/V7', 0.9468827099902223), ('V10/V17', 0.9454250931764953), ('V10/V12', 0.9327710615088803), ('V11/V14', 0.9291155590155674), ('V1/V5', 0.9284218042504773), ('V2/V7', 0.9206545142650776), ('V12/V14', 0.9171922094486237), ('V9/V10', 0.9162483913700069), ('V21/V22', 0.9099059014655518), ('V2/V3', 0.9006638934446911)]
In [32]:
params_RF= {
'n_estimators':[100],
'max_depth':[10,15],
'min_samples_leaf':[1,2,4],
'min_samples_split':[2,4]
}
In [33]:
model_RF = RandomForestClassifier(random_state=SEED, n_jobs=-1)
grid_cv = GridSearchCV(model_RF, param_grid=params_RF,
cv=3,
n_jobs=-1, scoring='f1',
verbose=0)
grid_cv.fit(x_train, y_train)
Out[33]:
GridSearchCV(cv=3, estimator=RandomForestClassifier(n_jobs=-1, random_state=1111), n_jobs=-1, param_grid={'max_depth': [10, 15], 'min_samples_leaf': [1, 2, 4], 'min_samples_split': [2, 4], 'n_estimators': [100]}, scoring='f1')
In [34]:
print('Best Params:',grid_cv.best_params_)
print('Best F1-Score: ', round(grid_cv.best_score_,3))
Best Params: {'max_depth': 15, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100} Best F1-Score: 0.858
In [35]:
model_RF1 = RandomForestClassifier(
n_estimators=100,
max_depth=15,
min_samples_leaf=1,
min_samples_split=2,
random_state=SEED,
n_jobs=-1
)
model_RF1.fit(x_train,y_train)
Out[35]:
RandomForestClassifier(max_depth=15, n_jobs=-1, random_state=1111)
In [36]:
print(confusion_matrix(y_val, model_RF1.predict(x_val)))
print(classification_report(y_val,model_RF1.predict(x_val)))
[[56863 1] [ 20 78]] precision recall f1-score support 0 1.00 1.00 1.00 56864 1 0.99 0.80 0.88 98 accuracy 1.00 56962 macro avg 0.99 0.90 0.94 56962 weighted avg 1.00 1.00 1.00 56962
In [37]:
print(confusion_matrix(y_test, model_RF1.predict(x_test)))
print(classification_report(y_test,model_RF1.predict(x_test)))
[[85294 1] [ 27 121]] precision recall f1-score support 0 1.00 1.00 1.00 85295 1 0.99 0.82 0.90 148 accuracy 1.00 85443 macro avg 1.00 0.91 0.95 85443 weighted avg 1.00 1.00 1.00 85443
In [38]:
plt.figure(figsize=(30,20))
sns.barplot(x=data.drop('Class',axis=1).columns,
y=model_RF1.feature_importances_)
Out[38]:
<AxesSubplot:>
In [195]:
column_index = {column:i for i,column in enumerate(x_train.columns)}
In [14]:
def print_score(model, x,y):
pred = model.predict(x)
y_pred = np.round(pred)
print(confusion_matrix(y,y_pred))
print(classification_report(y,y_pred))
In [15]:
def get_inputs(x_train):
corr_inputs = []
for key_a, key_b in corr_items:
_relative = x_train[key_a].to_numpy()/(x_train[key_b].to_numpy()+1e-8)
corr_inputs.append(_relative)
base_inputs = np.array(x_train)
corr_inputs = np.array(corr_inputs).T
return base_inputs.astype('float'), corr_inputs.astype('float')
In [16]:
def visualize_history(history):
fig, axes = plt.subplots(1,3, figsize=(20,5))
axes[0].plot(history.history['loss'], label='loss')
axes[0].plot(history.history['val_loss'], label='val_loss')
axes[0].legend()
axes[1].plot(history.history['recall'], label='recall')
axes[1].plot(history.history['val_recall'], label='val_recall')
axes[1].plot(history.history['precision'], label='precision')
axes[1].plot(history.history['val_precision'], label='val_precision')
axes[1].legend()
axes[2].plot(history.history['f1_score'], label='f1_score')
axes[2].plot(history.history['val_f1_score'], label='val_f1_score1')
axes[2].legend()
plt.show()
In [17]:
def f1_score(y_true, y_pred):
tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=0) + K.epsilon()
ground_positives = K.sum(y_true, axis=0) + K.epsilon()
pred_positives = K.sum(K.round(K.clip(y_pred, 0, 1)), axis=0) + K.epsilon()
precision = tp / (pred_positives+K.epsilon())
recall = tp / (ground_positives+K.epsilon())
f1 = 2*((precision*recall)/(precision+recall+K.epsilon()))
return f1
def f1_loss(y_true, y_pred):
tp = K.sum(y_true * y_pred, axis=0) + K.epsilon()
ground_positives = K.sum(y_true, axis=0) + K.epsilon()
ground_negatives = K.sum(1-y_true, axis=0) + K.epsilon()
pred_positives = K.sum(y_pred, axis=0) + K.epsilon()
ratio = ground_negatives/ground_positives
precision = tp / (pred_positives+K.epsilon())
recall = tp / (ground_positives+K.epsilon())
f1 = 2*((precision*recall)/(precision+recall+K.epsilon()))
weighted_f1 = f1*ground_positives / K.sum(ground_positives+K.epsilon())
weighted_f1 = K.sum(weighted_f1)
return 1-weighted_f1
In [18]:
# model with NN
# create function for Over/Under sampling
def model_NN(x_train, y_train, class_weight=False):
DEEP = 5
base_inputs, corr_inputs = get_inputs(x_train)
base_input = Input(shape=(base_inputs.shape[-1],), name='base_input')
base_x = Dense(512, activation='tanh', name='tanh',
kernel_initializer='he_normal')(base_input)
base_x = Dropout(0.5)(base_x)
for _ in range(DEEP):
base_x = Dense(512, activation='relu',
kernel_initializer='he_normal')(base_x)
base_x = Dropout(0.6)(base_x)
outputs = Dense(1, activation='sigmoid')(base_x)
model = Model(inputs=base_input, outputs=outputs)
model.compile(
optimizer=keras.optimizers.Adam(learning_rate=0.005),
#optimizer=keras.optimizers.SGD(learning_rate=0.01, momentum=0.9),
#optimizer=keras.optimizers.RMSprop(learning_rate=0.0001),
#loss=f1_loss,
loss='binary_crossentropy',
metrics=[
f1_score,
keras.metrics.Precision(name='precision'),
keras.metrics.Recall(name='recall'),
keras.metrics.TrueNegatives(name='tn'),
keras.metrics.TruePositives(name='tp')
])
epochs = 100
batch_size = 4096
val_base_inputs, val_corr_inputs = get_inputs(x_val)
tbase_inputs, tcorr_inputs = get_inputs(x_test)
with tf.device('/gpu:0'):
history = model.fit(base_inputs, y_train,
validation_data=(val_base_inputs, y_val),
# weight 조정으로 recall / precision trade
class_weight={0:1, 1:(n_data/n_fraud)**.5/2 if class_weight else 1},
callbacks = [
keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2, min_lr=0.0000001, patience=10, verbose=2,
mode='min'),
keras.callbacks.ModelCheckpoint(monitor='val_f1_score', filepath='./checkpoint/NN', save_best_only=True,
mode='max')
#,keras.callbacks.EarlyStopping(monitor='val_loss', patience=100, verbose=1)
],
#callbacks = [c1,c2],
epochs=epochs,
batch_size=batch_size,
verbose=1)
return model, history
In [ ]:
model, history = model_NN(x_train, y_train, class_weight=True)
In [131]:
best_model = keras.models.load_model('./checkpoint/NN', custom_objects={'f1_score':f1_score})
In [ ]:
# validation data
print_score(best_model, val_base_inputs, y_val)
print_score(model, val_base_inputs, y_val)
In [133]:
# test data
print_score(best_model, tbase_inputs, y_test)
print_score(model, tbase_inputs, y_test)
[[85271 24] [ 22 126]] precision recall f1-score support 0.0 1.00 1.00 1.00 85295 1.0 0.84 0.85 0.85 148 accuracy 1.00 85443 macro avg 0.92 0.93 0.92 85443 weighted avg 1.00 1.00 1.00 85443 [[85273 22] [ 24 124]] precision recall f1-score support 0.0 1.00 1.00 1.00 85295 1.0 0.85 0.84 0.84 148 accuracy 1.00 85443 macro avg 0.92 0.92 0.92 85443 weighted avg 1.00 1.00 1.00 85443
In [134]:
# binary_crossentropy loss 사용 시, 0.51을 1로 판단하면 loss 크고 099를 1로 판단하면 loss 작음
# f1_score는 수렴
# validation set에서의 f1_score가 더 높은 이유는 dropout layer 때문일 가능성이 있음
visualize_history(history)
In [25]:
def schedule(epoch, lr):
if epoch==15:
return lr * 0.5
else:
return lr
In [ ]:
# model with divided corr_items
# 선형적 상관관계 col_a = a * col_b -> col_a/col_b = a
DEEP = 5
base_inputs, corr_inputs = get_inputs(x_train)
dense_layers= []
input_base = Input(shape=(base_inputs.shape[-1],), name='base_input')
input_corr = Input(shape=(corr_inputs.shape[-1],), name='corr_input')
inputs = [input_base,input_corr]
input_concat = concatenate([input_base, input_corr])
x_concat = Dense(512, activation='tanh', kernel_initializer='glorot_normal', name='concat_tanh')(input_concat)
x_concat = Dropout(0.5)(x_concat)
for _ in range(DEEP):
x_concat = Dense(512, activation='relu', kernel_initializer='he_normal')(x_concat)
x_concat = Dropout(0.5)(x_concat)
outputs = Dense(1, activation='sigmoid', name='output')(x_concat)
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.005),
loss='binary_crossentropy',
metrics=[
f1_score,
keras.metrics.Precision(name='precision'),
keras.metrics.Recall(name='recall'),
keras.metrics.TrueNegatives(name='tn'),
keras.metrics.TruePositives(name='tp'),
'acc'
])
epochs = 100
batch_size = 4096
val_base_inputs, val_corr_inputs = get_inputs(x_val)
tbase_inputs, tcorr_inputs = get_inputs(x_test)
with tf.device('/gpu:0'):
history = model.fit([base_inputs, corr_inputs], y_train,
validation_data=([val_base_inputs, val_corr_inputs],y_val),
# weight 조정으로 recall / precision trade
class_weight={0:1, 1:(n_normal/n_fraud)**.5/2},
callbacks = [keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2, min_lr=0.0000001, patience=5, verbose=2,
mode='min'),
keras.callbacks.ModelCheckpoint(monitor='val_f1_score', filepath='./checkpoint/NN_interact',
save_best_only=True,
mode='max'),
#keras.callbacks.EarlyStopping(monitor='val_f1_score', patience=100, mode='max', verbose=1)
],
epochs=epochs,
batch_size=batch_size,
verbose=1)
In [ ]:
model.summary()
In [83]:
best_model = keras.models.load_model('./checkpoint/NN_interact', custom_objects={'f1_score':f1_score})
In [ ]:
# validation data
print_score(best_model, [val_base_inputs, val_corr_inputs], y_val)
print_score(model, [val_base_inputs, val_corr_inputs], y_val)
In [85]:
# test data
print_score(best_model, [tbase_inputs, tcorr_inputs], y_test)
print_score(model, [tbase_inputs, tcorr_inputs], y_test)
[[85283 12] [ 21 127]] precision recall f1-score support 0.0 1.00 1.00 1.00 85295 1.0 0.91 0.86 0.89 148 accuracy 1.00 85443 macro avg 0.96 0.93 0.94 85443 weighted avg 1.00 1.00 1.00 85443 [[85279 16] [ 20 128]] precision recall f1-score support 0.0 1.00 1.00 1.00 85295 1.0 0.89 0.86 0.88 148 accuracy 1.00 85443 macro avg 0.94 0.93 0.94 85443 weighted avg 1.00 1.00 1.00 85443
In [86]:
visualize_history(history)
In [51]:
def get_inputs2(x_train):
corr_inputs = []
for key_a, key_b in corr_items:
_relative = np.append(x_train[key_a].to_numpy(), x_train[key_b].to_numpy()).reshape(-1,2, order='F')
corr_inputs.append(_relative)
base_inputs = np.array(x_train)
corr_inputs = np.array(corr_inputs)
return base_inputs.astype('float'), corr_inputs.astype('float')
In [ ]:
# model with sigmoid interaction columns
# 선형적 관계를 가진 컬럼 -> sigmoid -> + base_inputs
DEEP = 5
base_inputs, corr_inputs = get_inputs2(x_train)
train_input = [base_inputs] + [corr_input for corr_input in corr_inputs]
inputs = []
dense_layers= []
input_base = Input(shape=(base_inputs.shape[-1],), name='base_input')
inputs.append(input_base)
for i, corr_input in enumerate(corr_inputs):
_input = Input(shape=(corr_input.shape[-1],), name='corr_input_'+str(i))
_x = Dense(1, activation='sigmoid', name='corr_dense_'+str(i))(_input)
inputs.append(_input)
dense_layers.append(_x)
input_concat = concatenate([input_base] + dense_layers)
x_concat = Dense(512, kernel_initializer='glorot_normal', activation='tanh', name='conact_tanh')(input_concat)
x_concat = Dropout(0.3)(x_concat)
for _ in range(DEEP):
x_concat = Dense(512, activation='relu', kernel_initializer='he_normal')(x_concat)
x_concat = Dropout(0.4)(x_concat)
outputs = Dense(1, activation='sigmoid', name='outputs')(x_concat)
model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.005),
loss='binary_crossentropy',
#loss_weights={'output_dense':0.5, 'output_base':0.5},
metrics=[
f1_score,
keras.metrics.Precision(name='precision'),
keras.metrics.Recall(name='recall'),
keras.metrics.TrueNegatives(name='tn'),
keras.metrics.TruePositives(name='tp'),
'acc'
])
epochs = 100
batch_size = 4096
val_base_inputs, val_corr_inputs = get_inputs2(x_val)
val_input = [val_base_inputs] + [corr_input for corr_input in val_corr_inputs]
tbase_inputs, tcorr_inputs = get_inputs2(x_test)
test_input = [tbase_inputs] + [corr_input for corr_input in tcorr_inputs]
with tf.device('/gpu:0'):
history = model.fit(train_input, y_train,
validation_data=(val_input,y_val),
# weight 조정으로 recall / precision trade
class_weight={0:1, 1:(n_data/n_fraud)**.5/2},
callbacks = [
keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.2, min_lr=0.000001, patience=5,
verbose=2, mode='min'),
keras.callbacks.ModelCheckpoint(monitor='val_f1_score', filepath='./checkpoint/NN_inter_col', save_best_only=True,
mode='max'),
#keras.callbacks.EarlyStopping(monitor='val_f1_score', patience=100, verbose=1)
],
epochs=epochs,
batch_size=batch_size,
verbose=1)
In [115]:
best_model = keras.models.load_model('./checkpoint/NN_inter_col', custom_objects={'f1_score':f1_score})
In [ ]:
# validation data
print_score(best_model, val_input, y_val)
print_score(model, val_input, y_val)
In [117]:
# test data
print_score(best_model, test_input, y_test)
print_score(model, test_input, y_test)
[[85271 24] [ 21 127]] precision recall f1-score support 0.0 1.00 1.00 1.00 85295 1.0 0.84 0.86 0.85 148 accuracy 1.00 85443 macro avg 0.92 0.93 0.92 85443 weighted avg 1.00 1.00 1.00 85443 [[85267 28] [ 16 132]] precision recall f1-score support 0.0 1.00 1.00 1.00 85295 1.0 0.82 0.89 0.86 148 accuracy 1.00 85443 macro avg 0.91 0.95 0.93 85443 weighted avg 1.00 1.00 1.00 85443
In [118]:
visualize_history(history)
============================================¶
In [179]:
def get_frauds(inputs):
return inputs[inputs.index.isin(fraud_data.index)]
def get_normals(inputs):
return inputs[inputs.index.isin(normal_data.index)]
def mse(x, y):
return np.mean(np.power(x-y,2),axis=1)
In [193]:
class AutoEncoder():
def __init__(self, units: list, n_factors: int, batch_size=16, epochs=200, callbacks={}):
self.units = units
self.n_factors = n_factors
self.batch_size = batch_size
self.epochs = epochs
self.callbacks = callbacks
def fit(self, inputs):
normals = get_normals(inputs)
self.inputs = Input(shape=(normals.shape[-1],))
x = self.inputs
for i, unit in enumerate(self.units):
activation = 'tanh' if i==0 else 'relu'
x = Dense(unit, activation=activation, kernel_initializer='he_normal')(x)
x = Dense(self.n_factors, activation='relu')(x)
for i, unit in enumerate(self.units[::-1]):
activation = 'tanh' if i==0 else 'relu'
x = Dense(unit, activation=activation, kernel_initializer='he_normal')(x)
self.outputs = Dense(normals.shape[-1], activation='relu')(x)
model = Model(self.inputs, self.outputs)
model.compile(loss='mse', optimizer='adam', metrics=['acc'])
model.fit(normals, normals,
batch_size = self.batch_size,
epochs = self.epochs,
validation_split=0.2,
callbacks = self.callbacks)
return model
In [ ]:
callbacks = keras.callbacks.EarlyStopping(patience=10)
autoencoder = AutoEncoder(units=[512,256,256,128], n_factors=16, batch_size=256)
autoencoder = autoencoder.fit(x_train)
In [207]:
normal_mse = np.array(mse(get_normals(x_test),autoencoder.predict(get_normals(x_test))))
fraud_mse = np.array(mse(get_frauds(x_test),autoencoder.predict(get_frauds(x_test))))
In [213]:
THRESHOLD = 2
print(np.sum(normal_mse<=THRESHOLD), np.sum(normal_mse>THRESHOLD))
print(np.sum(fraud_mse<=THRESHOLD), np.sum(fraud_mse>THRESHOLD))
82820 2475 25 123
In [198]:
plt.figure(figsize=(30,10))
plt.plot(get_normals(x_test).index,mse(get_normals(x_test),autoencoder.predict(get_normals(x_test))), marker='o', linestyle='',)
plt.plot(get_frauds(x_test).index,mse(get_frauds(x_test),autoencoder.predict(get_frauds(x_test))), color='r', marker='o', linestyle='',)
plt.show()
Oversampling & Undersampling¶
In [21]:
import imblearn
In [22]:
smote = imblearn.over_sampling.SMOTE(n_jobs=-1, random_state=SEED)
adasyn = imblearn.over_sampling.ADASYN(n_jobs=-1, random_state=SEED)
In [23]:
x_res_smote, y_res_smote = smote.fit_resample(x_train, y_train)
x_res_adasyn, y_res_adasyn = smote.fit_resample(x_train,y_train)
In [ ]:
model,history = model_NN(x_res_smote, y_res_smote)
In [146]:
print_score(model,x_test,y_test)
[[85227 68] [ 13 135]] precision recall f1-score support 0.0 1.00 1.00 1.00 85295 1.0 0.67 0.91 0.77 148 accuracy 1.00 85443 macro avg 0.83 0.96 0.88 85443 weighted avg 1.00 1.00 1.00 85443
In [147]:
visualize_history(history)
In [ ]:
model, history = model_NN(x_res_adasyn, y_res_adasyn)
In [141]:
print_score(model,x_test,y_test)
[[85231 64] [ 14 134]] precision recall f1-score support 0.0 1.00 1.00 1.00 85295 1.0 0.68 0.91 0.77 148 accuracy 1.00 85443 macro avg 0.84 0.95 0.89 85443 weighted avg 1.00 1.00 1.00 85443
In [142]:
visualize_history(history)
In [ ]:
model, history = model_NN_corr(x_res_adasyn, y_res_adasyn)
In [42]:
print_score(model, [get_inputs(x_test)], y_test)
[[85234 61] [ 12 136]] precision recall f1-score support 0.0 1.00 1.00 1.00 85295 1.0 0.69 0.92 0.79 148 accuracy 1.00 85443 macro avg 0.85 0.96 0.89 85443 weighted avg 1.00 1.00 1.00 85443
In [43]:
visualize_history(history)
In [44]:
tomek =imblearn.under_sampling.TomekLinks(n_jobs=-1)
enn = imblearn.under_sampling.EditedNearestNeighbours(n_jobs=-1)
In [46]:
x_res_tomek,y_res_tomek = tomek.fit_resample(x_train,y_train)
x_res_enn, y_res_enn = enn.fit_resample(x_train,y_train)
In [ ]:
model,history = model_NN(x_res_tomek, y_res_tomek, class_weight=True)
In [ ]:
print_score(model, x_test, y_test)
In [ ]:
visualize_history(history)
In [ ]:
model, history = model_NN(x_res_enn, y_res_enn, class_weight=True)
In [ ]:
print_score(model, x_test, y_test)
In [ ]:
visualize_history(history)
In [ ]:
model, history = model_NN_corr(x_res_enn, y_res_enn, class_weight=True)
In [48]:
print_score(model, [get_inputs(x_test)], y_test)
[[85259 36] [ 17 131]] precision recall f1-score support 0.0 1.00 1.00 1.00 85295 1.0 0.78 0.89 0.83 148 accuracy 1.00 85443 macro avg 0.89 0.94 0.92 85443 weighted avg 1.00 1.00 1.00 85443
In [49]:
visualize_history(history)
댓글