我的代码-statistic analysis
# coding: utf-8
# In[1]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd
# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder
# File system management
import os
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')
# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
# In[3]:
# Read the pivoted feature table from disk and report its dimensions
DATA_PATH = r'D:\Users\sgg91044\Desktop\MEP_no_defect_data_pivot.csv'
app_test = pd.read_csv(DATA_PATH)
print('Testing data shape: ', app_test.shape)
# Notebook preview of up to the first 20000 rows (no effect when run as a script)
app_test.head(20000)
# In[4]:
# Class balance of the label column: counts, then a histogram
target_column = app_test['Target']
target_column.value_counts()
target_column.astype(int).plot.hist();
# In[5]:
# Function to calculate missing values by column
def missing_values_table(app_test):
    """Summarize the missing values of a dataframe, column by column.

    Also prints how many columns the dataframe has and how many of them
    contain missing values.

    Args:
        app_test (pd.DataFrame): dataframe to inspect.

    Returns:
        pd.DataFrame: one row per column that contains at least one missing
        value, with the columns 'Missing Values' (count) and
        '% of Total Values' (share of rows, rounded to one decimal), sorted
        by percentage in descending order.
    """
    # Count the missing entries once and reuse the result for the percentage
    missing_count = app_test.isnull().sum()
    row_count = len(app_test)
    # An empty dataframe has nothing missing; skip the 0/0 division whose NaN
    # result would make every column look incomplete
    if row_count:
        missing_percent = 100 * missing_count / row_count
    else:
        missing_percent = missing_count.astype(float)
    # Side-by-side table; `keys` names the two resulting columns
    summary = pd.concat(
        [missing_count, missing_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )
    # Keep only the columns that actually miss values, worst first
    summary = summary[summary['Missing Values'] != 0]
    summary = summary.sort_values('% of Total Values', ascending=False).round(1)
    # Print some summary information
    print("Your selected dataframe has " + str(app_test.shape[1]) + " columns.\n"
          "There are " + str(summary.shape[0]) + " columns that have missing values.")
    return summary
# In[6]:
# Missing values statistics
missing_values = missing_values_table(app_test)
missing_values.head(20)
# In[7]:
# Columns whose missing entries are replaced by the column median.
# Each column is listed once; the original hand-written version repeated
# 'THR3_MEAN_SLOPE' and carried one median variable per column.
MEDIAN_IMPUTED_COLUMNS = [
    'RR13_MAX.',
    'ETCM_PHC4',
    'HELK_MEAN',
    'PBK4',
    'ETCM_PHB4',
    'ETCM_PHA4',
    'THR3_MAX.',
    'THR3_MEAN',
    'RR23_MEAN',
    'RR13_MEAN',
    'THR3_MEAN_DIFF',
    'THR3_MEAN_SLOPE',
    'THR3_MAX._DIFF',
    'LOWERCHM_PRESS',
    'HELK_MAX.',
    'HELK_SD',
    'THR3_SD',
    'RR23_MAX.',
]
# Series of medians indexed by column name
column_medians = app_test[MEDIAN_IMPUTED_COLUMNS].median()
# In[8]:
# fillna with a Series fills each column with the value stored under its
# name; columns outside the list are left untouched
app_test = app_test.fillna(column_medians)
app_test
# In[9]:
# Correlation of every column with the label, in ascending order
correlation_matrix = app_test.corr()
correlations = correlation_matrix['Target'].sort_values()
# With ascending order the tail holds the largest values and the head the smallest
print('Most Positive Correlations:\n', correlations.tail(20))
print('\nMost Negative Correlations:\n', correlations.head(20))
# In[10]:
# Compare the ETCM_PHC4 distribution of the two target classes
plt.figure(figsize = (20, 8))
# One KDE curve per class: 0 = non-defected wafers, 1 = defected wafers
for target_value in (0, 1):
    class_values = app_test.loc[app_test['Target'] == target_value, 'ETCM_PHC4']
    sns.kdeplot(class_values, label = 'target == %d' % target_value)
# Axis labels and title
plt.xlabel('ETCM_PHC4')
plt.ylabel('Density')
plt.title('Distribution of ETCM_PHC4');
# In[11]:
# Box plot of HELK_SD against the target label
sns.boxplot(x='HELK_SD', y='Target', data=app_test)
# seaborn exposes no `plot` attribute; the figure is shown through matplotlib
plt.show()
# In[ ]:
# Scatter plot of the target label against PBK4
scatter_style = dict(alpha = 0.5, color = 'red')
app_test.plot(kind='scatter', x='PBK4', y='Target', **scatter_style)
# Axis labels and title
plt.xlabel('PBK4')
plt.ylabel('Target')
plt.title('PBK4 Target Scatter Plot')
# In[ ]:
# Compare the HELK_MEAN distribution of the two target classes
plt.figure(figsize = (20, 8))
# One KDE curve per class: 0 = non-defected wafers, 1 = defected wafers
for target_value in (0, 1):
    class_values = app_test.loc[app_test['Target'] == target_value, 'HELK_MEAN']
    sns.kdeplot(class_values, label = 'target == %d' % target_value)
# Axis labels and title
plt.xlabel('HELK_MEAN')
plt.ylabel('Density')
plt.title('Distribution of HELK_MEAN');
# In[ ]:
# Features whose per-class distributions are compared
kde_features = [
    'LOWERCHM_PRESS',
    'RR13_MEAN',
    'RR13_MAX.',
    'RR23_MEAN',
    'THR3_MAX.',
    'THR3_MEAN',
    'RR23_MAX.',
    'PBK4',
    'THR3_MEAN_DIFF',
    'HELK_MEAN',
    'THR3_MEAN_SLOPE',
    'THR3_MAX._DIFF',
]
plt.figure(figsize = (15, 50))
# One subplot row per feature; deriving the row count from the list avoids the
# empty trailing slot left by a hard-coded count that does not match the list
for i, feature in enumerate(kde_features):
    plt.subplot(len(kde_features), 1, i + 1)
    # plot non-defected wafers
    sns.kdeplot(app_test.loc[app_test['Target'] == 0, feature], label = 'Target == 0')
    # plot defected wafers
    sns.kdeplot(app_test.loc[app_test['Target'] == 1, feature], label = 'Target == 1')
    # Label the plots
    plt.title('Distribution of %s by Target Value' % feature)
    plt.xlabel('%s' % feature)
    plt.ylabel('Density')
# Adjust the spacing once, after all subplots exist
plt.tight_layout(h_pad = 0.1)
# In[ ]:
# Bin the parameter 'ETCM_PHC4' data
# Work on an explicit copy so that adding the bin column does not write into
# a slice of app_test (chained assignment)
ETCM_PHC4 = app_test[['Target', 'ETCM_PHC4']].copy()
# 17 edges between 200 and 1000 give 16 equal-width bins of 50
ETCM_PHC4['VALUE_BINNED'] = pd.cut(ETCM_PHC4['ETCM_PHC4'], bins = np.linspace(200, 1000, num = 17))
ETCM_PHC4.head(20)
# In[ ]:
# Group by the bin and calculate averages
ETCM_PHC4_groups = ETCM_PHC4.groupby('VALUE_BINNED').mean()
ETCM_PHC4_groups
# In[ ]:
plt.figure(figsize = (8, 8))
# Bar plot of the mean target per ETCM_PHC4 bin, expressed in percent
plt.bar(ETCM_PHC4_groups.index.astype(str), 100 * ETCM_PHC4_groups['Target'])
# Plot labeling
plt.xticks(rotation = 75)
plt.xlabel('ETCM_PHC4 value groups')
plt.ylabel('Possibility to be defected (%)')
plt.title('Possibility to be defected by ETCM_PHC4');
# In[ ]:
# Bin the parameter 'HELK_MEAN' data
# Work on an explicit copy so that adding the bin column does not write into
# a slice of app_test (chained assignment)
HELK_MEAN = app_test[['Target', 'HELK_MEAN']].copy()
# 8 edges between 0 and 17.5 give 7 equal-width bins of 2.5
HELK_MEAN['VALUE_BINNED'] = pd.cut(HELK_MEAN['HELK_MEAN'], bins = np.linspace(0, 17.5, num = 8))
plt.figure(figsize = (8, 8))
# Group by the bin and calculate averages
HELK_MEAN_groups = HELK_MEAN.groupby('VALUE_BINNED').mean()
# Bar plot of the mean target per HELK_MEAN bin, expressed in percent
plt.bar(HELK_MEAN_groups.index.astype(str), 100 * HELK_MEAN_groups['Target'])
# Plot labeling
plt.xticks(rotation = 75)
plt.xlabel('HELK_MEAN value groups')
plt.ylabel('Possibility to be defected (%)')
plt.title('Possibility to be defected by HELK_MEAN');
# In[13]:
# Correlation matrix of the target and a hand-picked subset of parameters
ext_columns = [
    'Target',
    'RR13_MAX.',
    'ETCM_PHC4',
    'HELK_MEAN',
    'PBK4',
    'ETCM_PHB4',
    'ETCM_PHA4',
    'HELK_MAX.',
    'HELK_SD',
    'THR3_SD',
]
ext_data = app_test[ext_columns]
ext_data_corrs = ext_data.corr()
ext_data_corrs
# In[14]:
plt.figure(figsize = (10, 10))
# Annotated heatmap of the matrix; the colour scale spans 0 to 1
sns.heatmap(
    ext_data_corrs,
    cmap = plt.cm.RdYlBu_r,
    vmin = 0,
    vmax = 1,
    annot = True,
)
plt.title('Correlation Heatmap');
# In[ ]:
plt.figure(figsize = (10, 12))
# One shaded KDE panel per ETCM parameter, stacked vertically
etcm_sources = ['ETCM_PHA4', 'ETCM_PHB4', 'ETCM_PHC4']
for position, source in enumerate(etcm_sources, start = 1):
    plt.subplot(3, 1, position)
    is_defected = app_test['Target'] == 1
    is_clean = app_test['Target'] == 0
    # non-defected first, then defected, so the curves are drawn in the same order as before
    sns.kdeplot(app_test.loc[is_clean, source], label = 'target == 0',shade=True)
    sns.kdeplot(app_test.loc[is_defected, source], label = 'target == 1',shade=True)
    # Title and axis labels of this panel
    plt.title('Distribution of %s by Target Value' % source)
    plt.xlabel('%s' % source)
    plt.ylabel('Density')
# Spacing between the panels, applied after all of them exist
plt.tight_layout(h_pad = 2.5)
# In[ ]:
# Data for the pairs plot: ext_data without the columns listed below
columns_to_skip = ['RR13_MAX.', 'HELK_MAX.', 'HELK_SD', 'THR3_SD']
plot_data = ext_data.drop(columns = columns_to_skip).copy()
# Function to calculate correlation coefficient between two columns
def corr_func(x, y, **kwargs):
    """Write the correlation coefficient of x and y onto the current axes.

    Extra keyword arguments are accepted and ignored.
    NOTE(review): presumably accepted so the function can be handed to a
    seaborn grid mapper; nothing in this file calls it - confirm.
    """
    # Off-diagonal entry of the 2x2 matrix returned by np.corrcoef
    coefficient = np.corrcoef(x, y)[0][1]
    text = "r = {:.2f}".format(coefficient)
    axes = plt.gca()
    # Position is given as a fraction of the axes, not in data coordinates
    axes.annotate(text, xy=(.2, .8), xycoords=axes.transAxes, size = 20)
# Create the pairgrid object over every column except the label, coloured by the label
# NOTE(review): `size` was renamed `height` in newer seaborn releases - confirm against the installed version
pair_vars = [column for column in plot_data.columns if column != 'Target']
grid = sns.PairGrid(
    data = plot_data,
    size = 3,
    diag_sharey = False,
    hue = 'Target',
    vars = pair_vars,
)
# Upper triangle: scatter plots
grid.map_upper(plt.scatter, alpha = 0.2)
# Diagonal: kernel density estimate of each variable
grid.map_diag(sns.kdeplot)
# Lower triangle: density plots
grid.map_lower(sns.kdeplot, cmap = plt.cm.OrRd_r);
plt.suptitle('Ext Source Features Pairs Plot', size = 32, y = 1.05);
# In[12]:
import seaborn as sns
# Default pairplot over the columns of the dataframe
sns.pairplot(app_test)
# In[224]:
# Write the table to disk; index=True stores the row index as the first CSV column
CTM_DATA_PATH = r'D:\Users\sgg91044\Downloads\SEDA\CTM_data.csv'
app_test.to_csv(CTM_DATA_PATH, index=True, header=True)
# In[243]:
# NOTE(review): Imputer, TfidfVectorizer and LogisticRegression are not used in
# this script, and `Imputer` / `sklearn.linear_model.logistic` are unavailable
# in newer scikit-learn releases - confirm against the installed version
from sklearn.preprocessing import MinMaxScaler, Imputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
# Same module the next cell imports train_test_split from
from sklearn.model_selection import train_test_split
# The file was written with index=True, so its first column is the saved row
# index; read it back as the index instead of letting it become a feature
CTM_data = pd.read_csv(r'D:\Users\sgg91044\Downloads\SEDA\CTM_data.csv', index_col=0)
CTM_data['Target'] = CTM_data['Target'].astype('float')
# Labels are kept as a one-column dataframe; later cells read y_test['Target']
CTM_data_Target = CTM_data[['Target']]
# Everything except the label is a feature
CTM_data_Columns = CTM_data.drop(columns = ['Target'])
# 70/30 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(CTM_data_Columns, CTM_data_Target, test_size=0.3, random_state=0)
# In[244]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)
# In[245]:
# y_train is a one-column dataframe: it has no ravel(), and the builtin sum()
# over it would iterate the column labels. Use the 1-D 'Target' series.
train_labels = y_train['Target']
print("Before OverSampling, counts of label '1': {}".format((train_labels == 1).sum()))
print("Before OverSampling, counts of label '0': {} \n".format((train_labels == 0).sum()))
sm = SMOTE(random_state=2)
# NOTE(review): newer imbalanced-learn releases name this method fit_resample - confirm against the installed version
X_train_res, y_train_res = sm.fit_sample(X_train, train_labels.values)
# NOTE(review): the model below is fitted on X_train / y_train, not on these resampled arrays - confirm intent
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))
# In[228]:
# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1))
# Learn the per-feature min/max from the training split only ...
scaler.fit(X_train)
X_train = scaler.transform(X_train)
# ... and apply that same mapping to the test split. Refitting on the test
# split would scale the two splits differently and feed the model
# inconsistent feature values.
X_test = scaler.transform(X_test)
print('Training data shape: ', X_train.shape)
print('Testing data shape: ', X_test.shape)
# In[229]:
from sklearn.ensemble import RandomForestClassifier
# Make the random forest classifier
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)
# In[203]:
y_test
# In[234]:
# Train on the training data; y_train is a one-column dataframe, so pass its
# 1-D 'Target' series as the label vector
random_forest.fit(X_train, y_train['Target'])
# Extract feature importances
feature_importance_values = random_forest.feature_importances_
# Feature names come from the column frame the train/test splits were cut
# from (`features` was never defined in the original cell)
features = list(CTM_data_Columns.columns)
feature_importances = pd.DataFrame({'feature': features, 'importance': feature_importance_values})
# Make predictions on the test data
predictions = random_forest.predict(X_test)
# In[232]:
predictions
predictions.shape
# In[233]:
import pandas as pd
# The first argument supplies the row index, the second the column index
confusion_table = pd.crosstab(
    y_test['Target'],
    predictions,
    rownames=['Target'],
    colnames=['preds'],
)
print(confusion_table)
# Make a submission dataframe
# In[165]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model.logistic import LogisticRegression
# model_selection is the module the SMOTE cell already imports from
from sklearn.model_selection import train_test_split,cross_val_score
# In[190]:
# cross_val_score expects (estimator, X, y). The original call passed the test
# labels as X and an undefined name `submit` as y.
# NOTE(review): cross-validating on the training split is an assumption about
# the intent - confirm
cv_scores = cross_val_score(random_forest, X_train, y_train['Target'])
cv_scores
# In[235]:
def plot_feature_importances(df, top_n = 20):
    """Plot the most important features as a horizontal bar chart.

    Works with any importance measure for which higher means more important.

    Args:
        df (dataframe): feature importances. Must hold the feature names in a
            column called `feature` and the importances in a column called
            `importance`.
        top_n (int): number of features shown in the chart. Defaults to 20.

    Returns:
        df (dataframe): all features sorted by importance (highest to lowest),
        with an added `importance_normalized` column that sums to one. The
        previous index is kept as an `index` column by reset_index().
    """
    # Sort features according to importance
    df = df.sort_values('importance', ascending = False).reset_index()
    # Normalize the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()
    top = df.head(top_n)
    # Reversed bar positions put the most important feature at the top
    positions = list(reversed(list(top.index)))
    # Make a horizontal bar chart of feature importances
    plt.figure(figsize = (10, 6))
    ax = plt.subplot()
    ax.barh(positions, top['importance_normalized'],
            align = 'center', edgecolor = 'k')
    # Set the yticks and labels
    ax.set_yticks(positions)
    ax.set_yticklabels(top['feature'])
    # Plot labeling
    plt.xlabel('Normalized Importance')
    plt.title('Feature Importances')
    plt.show()
    return df
# In[236]:
# Plot the random forest importances and keep the table returned by the plot function
feature_importances_sorted = plot_feature_importances(feature_importances)
我的代码-statistic analysis的更多相关文章
- Web 检测代码 web analysis 开源 open source
1. Grape Web Statistics Grape Web Statistics is a fairly simple piece of analytics software. Grape i ...
- Android内存等信息
1. Linux中proc目录下文件详解 http://wenku.baidu.com/view/2ce89f00a6c30c2259019ef1.html 2. Android系统/proc目录详解 ...
- 结构-行为-样式-angularJs 指令实现滚动文字
最近在做XX项目的大屏展示页面,有一个表格需要用到这个滚动效果,于是就写了个指令,记录下,共同学习. Html代码: <td word-roll tword="item"&g ...
- 以太坊智能合约虚拟机(EVM)原理与实现
以太坊 EVM原理与实现 以太坊底层通过EVM模块支持合约的执行与调用,调用时根据合约地址获取到代码,生成环境后载入到EVM中运行.通常智能合约的开发流程是用Solidity编写逻辑代码,再通过编译 ...
- 20165227 2017-2018-2《Java程序设计》课程总结
20165227 2017-2018-2<Java程序设计>课程总结 每周作业链接汇总 预备作业1 简要内容: 记忆深刻的老师 我期望的师生关系 对于Java学习的看法 预备作业2 简要内 ...
- C++解析头文件-Qt自动生成信号定义
目录 一.概述 二.实现思路 三.代码讲解 1.类图 2.QtCppDescription 3.测试 四.源代码 一.概述 上一篇文章C++解析头文件-Qt自动生成信号声明我们主要讲解了怎么去解析C+ ...
- 344. Reverse String【easy】
344. Reverse String[easy] Write a function that takes a string as input and returns the string rever ...
- 手记系列之二 ----- 关于IDEA的一些使用方法经验
前言 本篇文章主要介绍的关于本人在使用IDEA的一些使用方法,一些常用设置,一些插件推荐和使用.请注意,本文特长,2w多字加上几十张图片,建议收藏观看~ 前提准备 idea官网: https://ww ...
- 数据关联分析 association analysis (Apriori算法,python代码)
1基本概念 购物篮事务(market basket transaction),如下表,表中每一行对应一个事务,包含唯一标识TID,和购买的商品集合.本文介绍一种称为关联分析(association a ...
随机推荐
- Linux c使用gumbo库解析页面表单信息(一)
一.gumbo介绍 gumbo是一个由谷歌开发的,能够解析html页面的库.功能稳定可靠,使用起来十分方便. 二.gumbo安装 (1)从https://github.com/google/gumbo ...
- numpy中的stack操作:hstack()、vstack()、stack()、dstack()、vsplit()、concatenate()
stack():沿着新的轴加入一系列数组. vstack():堆栈数组垂直顺序(行) hstack():堆栈数组水平顺序(列). dstack():堆栈数组按顺序深入(沿第三维). concatena ...
- 链接中 href='#' 和 href='###' 的区别
<a> 标签 + onclick='{jscode}' 是很常用的一种 js 运用方式,而不使用 href='javascript:{jscode}' 是为了兼容多种浏览器对 <a& ...
- SQL语句:如何让字符串转化数字
和前端联调的时候,突然出现一个状况,新增数据的时候,一直报系统错误,写下此文,留以后反复温习.菜鸟程序员一名~ 项目内容:新增产品信息 具体实现:1 获取基础信息,创建产品(调用接口传入的产品类型,如 ...
- java基础知识—数组
1.数组:是一个变量,存储相同数据类型的一组数据. 2.数据的优点:减少代码量.易查找. 3.数组的使用步骤: 1)声明数组:int scores []: 2)开辟空间:scores = new in ...
- 使用Babel将单独的js文件 中的 ES6转码为ES5
如果你并没有接触过ES6,当你看到下面的代码时,肯定是有点懵逼的(这是什么鬼?心中一万头神兽奔腾而过),但是你没看错,这就是ES6.不管你看不看它,它都在这里. 1 2 3 4 5 6 7 8 9 ...
- oraclesql语句笔记
1. ORA-00947:Not enough values 原因:values没有写足够的值与select()中的字段对应 2.查看一张表中共有多少个字段 select count(*) from ...
- 第九章:Servlet工作原理解析
9.1 从Servlet容器说起 Servlet和Servlet容器的关系,就像枪和子弹的关系,彼此依存又互相独立发展,这一切都是为了适应工业化生产.从技术角度来说,是为了解耦,通过标准化接口来互相协 ...
- Moya https配置方法
准备 iOS做https适配时对服务器是有一定要求的,服务端必须要是一个符合ATS(App Transport Security)要求的HTTPS.简单说要满足以下几个要求: 1.Transpor ...
- oracle 语句导出 导入一张表语句
导出: exp system/midsoft@bafy0929 file=d:\dzbl_models_temp.dmp tables=(emr.dzbl_models_temp) ; 导入:imp ...