from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import tflearn
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import pandas as pd
import zipfile
from sklearn.metrics import average_precision_score, recall_score, precision_score, f1_score


def unzip(path_to_zip_file, directory_to_extract_to):
    zip_ref = zipfile.ZipFile(path_to_zip_file, 'r')
    zip_ref.extractall(directory_to_extract_to)
    zip_ref.close()

def report_evaluation_metrics(y_true, y_pred):
    average_precision = average_precision_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, labels=[0, 1], pos_label=1)
    recall = recall_score(y_true, y_pred, labels=[0, 1], pos_label=1)
    f1 = f1_score(y_true, y_pred, labels=[0, 1], pos_label=1)

    print('Average precision-recall score: {0:0.2f}'.format(average_precision))
    print('Precision: {0:0.2f}'.format(precision))
    print('Recall: {0:0.2f}'.format(recall))
    print('F1: {0:0.2f}'.format(f1))

LABELS = ["Normal", "Fraud"]


def plot_confusion_matrix(y_true, y_pred):
    conf_matrix = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(12, 12))
    sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d")
    plt.title("Confusion matrix")
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    plt.show()

def plot_training_history(history):
    if history is None:
        return
    plt.plot(history['loss'])
    plt.plot(history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper right')
    plt.show()

def visualize_anomaly(y_true, reconstruction_error, threshold):
    error_df = pd.DataFrame({'reconstruction_error': reconstruction_error,
                             'true_class': y_true})
    print(error_df.describe())

    groups = error_df.groupby('true_class')
    fig, ax = plt.subplots()

    for name, group in groups:
        ax.plot(group.index, group.reconstruction_error, marker='o', ms=3.5, linestyle='',
                label="Fraud" if name == 1 else "Normal")

    ax.hlines(threshold, ax.get_xlim()[0], ax.get_xlim()[1], colors="r", zorder=100, label='Threshold')
    ax.legend()
    plt.title("Reconstruction error for different classes")
    plt.ylabel("Reconstruction error")
    plt.xlabel("Data point index")
    plt.show()

def visualize_reconstruction_error(reconstruction_error, threshold):
    plt.plot(reconstruction_error, marker='o', ms=3.5, linestyle='',
             label='Point')

    plt.hlines(threshold, xmin=0, xmax=len(reconstruction_error) - 1, colors="r", zorder=100, label='Threshold')
    plt.legend()
    plt.title("Reconstruction error")
    plt.ylabel("Reconstruction error")
    plt.xlabel("Data point index")
    plt.show()

def preprocess_data(csv_data):
    credit_card_data = csv_data.drop(labels=['Class', 'Time'], axis=1)
    credit_card_data['Amount'] = StandardScaler().fit_transform(credit_card_data['Amount'].values.reshape(-1, 1))
    # print(credit_card_data.head())
    credit_card_np_data = credit_card_data.values  # DataFrame.as_matrix() was removed in pandas 1.0
    y_true = csv_data['Class'].values
    return credit_card_np_data, y_true

def main():
    seed = 42
    np.random.seed(seed)

    data_dir_path = './data'
    model_dir_path = './models'

    unzip(data_dir_path + '/creditcardfraud.zip', data_dir_path)
    csv_data = pd.read_csv(data_dir_path + '/creditcard.csv')
    estimated_negative_sample_ratio = 1 - csv_data['Class'].sum() / csv_data['Class'].count()
    print(estimated_negative_sample_ratio)
    X, Y = preprocess_data(csv_data)
    print("sample data: X:{} Y:{}".format(X[:3], Y[:3]))
    print(X.shape)

    # detect anomaly for the test data
    Ypred = []
    _, testX, _, testY = train_test_split(X, Y, test_size=0.2, random_state=seed)

    blackY_indices = np.where(Y)[0]
    print(blackY_indices[:3], "sample fraud credit data")
    assert Y[blackY_indices[0]]
    assert Y[blackY_indices[-1]]

    # X, Y, testX, testY = mnist.load_data(one_hot=True)

    # Params
    original_dim = len(X[0])  # 29 input features: V1..V28 plus the scaled Amount
    print("dim: {}".format(original_dim))

    # Building the encoder
    encoder = tflearn.input_data(shape=[None, original_dim])
    encoder = tflearn.fully_connected(encoder, 8)
    encoder = tflearn.fully_connected(encoder, 4)

    # Building the decoder
    decoder = tflearn.fully_connected(encoder, 8)
    decoder = tflearn.fully_connected(decoder, original_dim, activation='sigmoid')

    # Regression, with mean square error
    net = tflearn.regression(decoder, optimizer='adam', learning_rate=0.001,
                             loss='mean_square', metric=None)

    # Training the auto encoder
    training_model = tflearn.DNN(net, tensorboard_verbose=0)
    training_model.fit(X, X, n_epoch=100, validation_set=(testX, testX),
                       run_id="auto_encoder", batch_size=256)

    # Alternative VAE implementation, kept disabled inside this string literal:
    """
    hidden_dim = 4  # original_dim//2
    latent_dim = 2

    # Building the encoder
    encoder = tflearn.input_data(shape=[None, original_dim], name='input_data')
    encoder = tflearn.fully_connected(encoder, hidden_dim, activation='relu')
    z_mean = tflearn.fully_connected(encoder, latent_dim)
    z_std = tflearn.fully_connected(encoder, latent_dim)

    # Sampler: Normal (gaussian) random distribution
    eps = tf.random_normal(tf.shape(z_std), dtype=tf.float32, mean=0., stddev=1.0,
                           name='epsilon')
    z = z_mean + tf.exp(z_std / 2) * eps

    # Building the decoder (with scope to re-use these layers later)
    decoder = tflearn.fully_connected(z, hidden_dim, activation='relu',
                                      scope='decoder_h')
    decoder = tflearn.fully_connected(decoder, original_dim, activation='sigmoid',
                                      scope='decoder_out')

    # Define VAE Loss
    def vae_loss(x_reconstructed, x_true):
        # Reconstruction loss
        encode_decode_loss = x_true * tf.log(1e-10 + x_reconstructed) \
                             + (1 - x_true) * tf.log(1e-10 + 1 - x_reconstructed)
        encode_decode_loss = -tf.reduce_sum(encode_decode_loss, 1)
        # KL Divergence loss
        kl_div_loss = 1 + z_std - tf.square(z_mean) - tf.exp(z_std)
        kl_div_loss = -0.5 * tf.reduce_sum(kl_div_loss, 1)
        return tf.reduce_mean(encode_decode_loss + kl_div_loss)

    net = tflearn.regression(decoder, optimizer='rmsprop', learning_rate=0.001,
                             loss=vae_loss, metric=None, name='target_out')

    # We will need 2 models: one for training that will learn the latent
    # representation, and one that can take random normal noise as input and
    # use the decoder part of the network to generate new samples

    # Train the VAE
    training_model = tflearn.DNN(net, tensorboard_verbose=0)
    training_model.fit({'input_data': X}, {'target_out': X}, n_epoch=10,
                       validation_set=(testX, testX), batch_size=256, run_id="vae")

    # Build a generator (re-using the decoding layers)
    # Input data is a normal (gaussian) random distribution (with dim = latent_dim)
    # input_noise = tflearn.input_data(shape=[None, latent_dim], name='input_noise')
    # decoder = tflearn.fully_connected(input_noise, hidden_dim, activation='relu',
    #                                   scope='decoder_h', reuse=True)
    # decoder = tflearn.fully_connected(decoder, original_dim, activation='sigmoid',
    #                                   scope='decoder_out', reuse=True)
    # just for generating new data
    # generator_model = tflearn.DNN(decoder, session=training_model.session)
    """
    print("training sample predict:")
    print(training_model.predict(X[:3]))

    # pred_x_test = training_model.predict(testX)

    reconstruction_error = []
    anomaly_information, adjusted_threshold = get_anomaly(training_model, X, estimated_negative_sample_ratio)
    tp = fp = tn = fn = 0
    blackY_indices = set(blackY_indices)
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        predicted_label = 1 if is_anomaly else 0
        if is_anomaly:
            if idx in blackY_indices:
                tp += 1
            else:
                fp += 1
        else:
            if idx in blackY_indices:
                fn += 1
            else:
                tn += 1
        Ypred.append(predicted_label)
        reconstruction_error.append(dist)

    print("blackY_indices len: {}, actual fraud cnt: {}, detected fraud cnt: {}".format(len(blackY_indices), tp + fn, tp))
    precision = float(tp) / (tp + fp)
    hit_rate = float(tp) / (tp + fn)
    accuracy = float(tp + tn) / (tp + tn + fp + fn)
    print('precision = {}, hit_rate = {}, accuracy = {}'.format(precision, hit_rate, accuracy))

    report_evaluation_metrics(Y, Ypred)
    # plot_training_history(history)
    visualize_anomaly(Y, reconstruction_error, adjusted_threshold)
    plot_confusion_matrix(Y, Ypred)

def get_anomaly(model, data, estimated_negative_sample_ratio):
    target_data = model.predict(data)
    scores = np.linalg.norm(data - target_data, axis=-1)
    scores2 = np.array(scores)
    """
    >>> np.linalg.norm(np.array([[1,1,1],[2,2,2]]) - np.array([[0,0,0],[0,0,0]]), axis=-1)
    array([1.73205081, 3.46410162])
    >>> 3.46 * 3.46
    11.9716
    """
    scores.sort()
    cut_point = int(estimated_negative_sample_ratio * len(scores))
    threshold = scores[cut_point]
    print('estimated threshold is ' + str(threshold))
    return zip(scores2 >= threshold, scores2), threshold


if __name__ == '__main__':
    main()
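The thresholding rule in get_anomaly is the core of the detector, so here is a minimal, self-contained sketch of the same cut-point logic on made-up scores (the numbers below are purely illustrative):

import numpy as np

# Made-up reconstruction errors; the last two points are obvious outliers.
scores = np.array([0.20, 0.22, 0.25, 0.28, 0.30, 0.31, 0.35, 0.40, 5.0, 6.1])
negative_ratio = 0.8  # pretend ~80% of samples are normal

sorted_scores = np.sort(scores)
cut_point = int(negative_ratio * len(sorted_scores))  # index 8
threshold = sorted_scores[cut_point]                  # 5.0

print(threshold)            # 5.0
print(scores >= threshold)  # only the two outliers are flagged

On the credit card data estimated_negative_sample_ratio comes out around 0.998, so roughly the top 0.2% of reconstruction errors get flagged no matter what; whenever the error distributions of the two classes overlap, a large share of those flagged points are normal transactions, which is why precision can stay low even when recall looks reasonable.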

Result plots: the reconstruction-error scatter from visualize_anomaly and the confusion matrix (images omitted).

Using a VAE:
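For orientation, the VAE version below minimizes the usual negative ELBO, with the KL term in the closed form implemented by calculate_regularization_loss (the code stores logvar = log sigma^2, so tf.exp(logvar) = sigma^2):

\mathcal{L} = \underbrace{\mathbb{E}\left[\lVert x - \hat{x} \rVert^2\right]}_{\text{reconstruction (mean square)}} \;-\; \frac{1}{2} \sum_{j} \left( 1 + \log \sigma_j^2 - \mu_j^2 - \sigma_j^2 \right)

The latent code is drawn with the reparameterization trick, z = \mu + e^{\log\sigma^2 / 2}\,\epsilon with \epsilon \sim \mathcal{N}(0, I), which is exactly what sample() computes.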

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import tensorflow as tf
import tflearn
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import pandas as pd
import zipfile
from sklearn.metrics import average_precision_score, recall_score, precision_score, f1_score


def unzip(path_to_zip_file, directory_to_extract_to):
    zip_ref = zipfile.ZipFile(path_to_zip_file, 'r')
    zip_ref.extractall(directory_to_extract_to)
    zip_ref.close()

def report_evaluation_metrics(y_true, y_pred):
    average_precision = average_precision_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, labels=[0, 1], pos_label=1)
    recall = recall_score(y_true, y_pred, labels=[0, 1], pos_label=1)
    f1 = f1_score(y_true, y_pred, labels=[0, 1], pos_label=1)

    print('Average precision-recall score: {0:0.2f}'.format(average_precision))
    print('Precision: {0:0.2f}'.format(precision))
    print('Recall: {0:0.2f}'.format(recall))
    print('F1: {0:0.2f}'.format(f1))

LABELS = ["Normal", "Fraud"]


def plot_confusion_matrix(y_true, y_pred):
    conf_matrix = confusion_matrix(y_true, y_pred)

    plt.figure(figsize=(12, 12))
    sns.heatmap(conf_matrix, xticklabels=LABELS, yticklabels=LABELS, annot=True, fmt="d")
    plt.title("Confusion matrix")
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    plt.show()

def plot_training_history(history):
    if history is None:
        return
    plt.plot(history['loss'])
    plt.plot(history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper right')
    plt.show()

def visualize_anomaly(y_true, reconstruction_error, threshold):
    error_df = pd.DataFrame({'reconstruction_error': reconstruction_error,
                             'true_class': y_true})
    print(error_df.describe())

    groups = error_df.groupby('true_class')
    fig, ax = plt.subplots()

    for name, group in groups:
        ax.plot(group.index, group.reconstruction_error, marker='o', ms=3.5, linestyle='',
                label="Fraud" if name == 1 else "Normal")

    ax.hlines(threshold, ax.get_xlim()[0], ax.get_xlim()[1], colors="r", zorder=100, label='Threshold')
    ax.legend()
    plt.title("Reconstruction error for different classes")
    plt.ylabel("Reconstruction error")
    plt.xlabel("Data point index")
    plt.show()

def visualize_reconstruction_error(reconstruction_error, threshold):
    plt.plot(reconstruction_error, marker='o', ms=3.5, linestyle='',
             label='Point')

    plt.hlines(threshold, xmin=0, xmax=len(reconstruction_error) - 1, colors="r", zorder=100, label='Threshold')
    plt.legend()
    plt.title("Reconstruction error")
    plt.ylabel("Reconstruction error")
    plt.xlabel("Data point index")
    plt.show()

def preprocess_data(csv_data):
    credit_card_data = csv_data.drop(labels=['Class', 'Time'], axis=1)
    credit_card_data['Amount'] = StandardScaler().fit_transform(credit_card_data['Amount'].values.reshape(-1, 1))
    # print(credit_card_data.head())
    credit_card_np_data = credit_card_data.values  # DataFrame.as_matrix() was removed in pandas 1.0
    y_true = csv_data['Class'].values
    return credit_card_np_data, y_true

# encoder
def encode(input_x, encoder_hidden_dim, latent_dim):
    """
    Keras equivalent (build encoder model):
        inputs = Input(shape=input_shape, name='encoder_input')
        x = Dense(intermediate_dim, activation='relu')(inputs)
        z_mean = Dense(latent_dim, name='z_mean')(x)
        z_log_var = Dense(latent_dim, name='z_log_var')(x)
    """
    encoder = tflearn.fully_connected(input_x, encoder_hidden_dim, activation='relu')
    mu_encoder = tflearn.fully_connected(encoder, latent_dim, activation='linear')
    logvar_encoder = tflearn.fully_connected(encoder, latent_dim, activation='linear')
    return mu_encoder, logvar_encoder

# decoder
def decode(z, decoder_hidden_dim, input_dim):
    """
    Keras equivalent (build decoder model):
        latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
        x = Dense(intermediate_dim, activation='relu')(latent_inputs)
        outputs = Dense(original_dim, activation='sigmoid')(x)
    """
    decoder = tflearn.fully_connected(z, decoder_hidden_dim, activation='relu')
    x_hat = tflearn.fully_connected(decoder, input_dim, activation='linear')
    return x_hat

# sampler (reparameterization trick)
def sample(mu, logvar):
    """
    Keras equivalent:
        z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

        # reparameterization trick
        # instead of sampling from Q(z|X), sample eps = N(0, I)
        # z = z_mean + sqrt(var) * eps
        def sampling(args):
            z_mean, z_log_var = args
            batch = K.shape(z_mean)[0]
            dim = K.int_shape(z_mean)[1]
            # by default, random_normal has mean=0 and std=1.0
            epsilon = K.random_normal(shape=(batch, dim))
            return z_mean + K.exp(0.5 * z_log_var) * epsilon
    """
    epsilon = tf.random_normal(tf.shape(logvar), dtype=tf.float32, name='epsilon')
    # std_encoder = tf.exp(tf.multiply(0.5, logvar))
    # z = tf.add(mu, tf.multiply(std_encoder, epsilon))
    z = mu + tf.exp(logvar / 2) * epsilon
    return z

# loss function (KL regularization term)
def calculate_regularization_loss(mu, logvar):
    kl_divergence = -0.5 * tf.reduce_sum(1 + logvar - tf.square(mu) - tf.exp(logvar), axis=1)
    return kl_divergence


# loss function (reconstruction term)
def calculate_reconstruction_loss(x_hat, input_x):
    mse = tflearn.objectives.mean_square(x_hat, input_x)
    return mse

def main():
    seed = 42
    np.random.seed(seed)

    data_dir_path = './data'
    model_dir_path = './models'

    unzip(data_dir_path + '/creditcardfraud.zip', data_dir_path)
    csv_data = pd.read_csv(data_dir_path + '/creditcard.csv')
    estimated_negative_sample_ratio = 1 - csv_data['Class'].sum() / csv_data['Class'].count()
    print(estimated_negative_sample_ratio)
    X, Y = preprocess_data(csv_data)
    print("sample data: X:{} Y:{}".format(X[:3], Y[:3]))
    print(X.shape)

    # detect anomaly for the test data
    Ypred = []
    _, testX, _, testY = train_test_split(X, Y, test_size=0.2, random_state=seed)

    blackY_indices = np.where(Y)[0]
    print(blackY_indices[:3], "sample fraud credit data")
    assert Y[blackY_indices[0]]
    assert Y[blackY_indices[-1]]

    # X, Y, testX, testY = mnist.load_data(one_hot=True)

    # Params
    original_dim = len(X[0])  # 29 input features: V1..V28 plus the scaled Amount
    print("dim: {}".format(original_dim))

    # The plain autoencoder version, kept disabled inside this string literal:
    """
    # Building the encoder
    encoder = tflearn.input_data(shape=[None, original_dim])
    encoder = tflearn.fully_connected(encoder, 8)
    encoder = tflearn.fully_connected(encoder, 4)

    # Building the decoder
    decoder = tflearn.fully_connected(encoder, 8)
    decoder = tflearn.fully_connected(decoder, original_dim, activation='sigmoid')

    # Regression, with mean square error
    net = tflearn.regression(decoder, optimizer='adam', learning_rate=0.001,
                             loss='mean_square', metric=None)

    # Training the auto encoder
    training_model = tflearn.DNN(net, tensorboard_verbose=0)
    training_model.fit(X, X, n_epoch=100, validation_set=(testX, testX),
                       run_id="auto_encoder", batch_size=256)
    """
    hidden_dim = 8  # original_dim//2
    latent_dim = 4
    input_x = tflearn.input_data(shape=(None, original_dim), name='input_x')
    mu, logvar = encode(input_x, hidden_dim, latent_dim)
    z = sample(mu, logvar)
    x_hat = decode(z, hidden_dim, original_dim)

    regularization_loss = calculate_regularization_loss(mu, logvar)
    reconstruction_loss = calculate_reconstruction_loss(x_hat, input_x)
    target = tf.reduce_mean(tf.add(regularization_loss, reconstruction_loss))

    net = tflearn.regression(x_hat, optimizer='rmsprop', learning_rate=0.001,
                             loss=target, metric=None, name='target_out')

    # We will need 2 models: one for training that will learn the latent
    # representation, and one that can take random normal noise as input and
    # use the decoder part of the network to generate new samples

    # Train the VAE
    training_model = tflearn.DNN(net, tensorboard_verbose=0)
    training_model.fit({'input_x': X}, {'target_out': X}, n_epoch=30,
                       validation_set=(testX, testX), batch_size=256, run_id="vae")

    """
    # Build a generator (re-using the decoding layers)
    # Input data is a normal (gaussian) random distribution (with dim = latent_dim)
    # input_noise = tflearn.input_data(shape=[None, latent_dim], name='input_noise')
    # decoder = tflearn.fully_connected(input_noise, hidden_dim, activation='relu',
    #                                   scope='decoder_h', reuse=True)
    # decoder = tflearn.fully_connected(decoder, original_dim, activation='sigmoid',
    #                                   scope='decoder_out', reuse=True)
    # just for generating new data
    # generator_model = tflearn.DNN(decoder, session=training_model.session)
    """

    print("training sample predict:")
    print(training_model.predict(X[:3]))

    # pred_x_test = training_model.predict(testX)

    reconstruction_error = []
    anomaly_information, adjusted_threshold = get_anomaly(training_model, X, estimated_negative_sample_ratio)
    tp = fp = tn = fn = 0
    blackY_indices = set(blackY_indices)
    for idx, (is_anomaly, dist) in enumerate(anomaly_information):
        predicted_label = 1 if is_anomaly else 0
        if is_anomaly:
            if idx in blackY_indices:
                tp += 1
            else:
                fp += 1
        else:
            if idx in blackY_indices:
                fn += 1
            else:
                tn += 1
        Ypred.append(predicted_label)
        reconstruction_error.append(dist)

    print("blackY_indices len: {}, actual fraud cnt: {}, detected fraud cnt: {}".format(len(blackY_indices), tp + fn, tp))
    precision = float(tp) / (tp + fp)
    hit_rate = float(tp) / (tp + fn)
    accuracy = float(tp + tn) / (tp + tn + fp + fn)
    print('precision = {}, hit_rate = {}, accuracy = {}'.format(precision, hit_rate, accuracy))

    report_evaluation_metrics(Y, Ypred)
    # plot_training_history(history)
    visualize_anomaly(Y, reconstruction_error, adjusted_threshold)
    plot_confusion_matrix(Y, Ypred)

def get_anomaly(model, data, estimated_negative_sample_ratio):
    target_data = model.predict(data)
    scores = np.linalg.norm(data - target_data, axis=-1)
    scores2 = np.array(scores)
    """
    >>> np.linalg.norm(np.array([[1,1,1],[2,2,2]]) - np.array([[0,0,0],[0,0,0]]), axis=-1)
    array([1.73205081, 3.46410162])
    >>> 3.46 * 3.46
    11.9716
    """
    scores.sort()
    cut_point = int(estimated_negative_sample_ratio * len(scores))
    threshold = scores[cut_point]
    print('estimated threshold is ' + str(threshold))
    return zip(scores2 >= threshold, scores2), threshold


if __name__ == '__main__':
    main()
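As a quick sanity check, the arithmetic inside sample() can be replayed in plain numpy (the mu and logvar values here are hypothetical, not taken from a trained model):

import numpy as np

rng = np.random.RandomState(42)
mu = np.array([[0.5, -1.0]])      # hypothetical encoder means, latent_dim = 2
logvar = np.array([[0.0, -2.0]])  # hypothetical log-variances; std = exp(logvar / 2)

eps = rng.normal(size=logvar.shape)  # eps ~ N(0, I)
z = mu + np.exp(logvar / 2.0) * eps  # same formula as sample()
print(z)

Because the randomness enters only through eps, TensorFlow can backpropagate through mu and logvar as ordinary deterministic inputs, which is the entire point of the reparameterization trick.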
