# coding: utf-8

# In[1]:

# numpy and pandas for data manipulation
import numpy as np
import pandas as pd

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# In[3]:

# Test data features
app_test = pd.read_csv(r'D:\Users\sgg91044\Desktop\MEP_no_defect_data_pivot.csv')
print('Testing data shape: ', app_test.shape)
app_test.head(20000)

# In[4]:

app_test['Target'].value_counts()
app_test['Target'].astype(int).plot.hist();
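
# Added sketch: print the class proportions explicitly, which is useful context for the
# SMOTE oversampling applied later in the notebook.
print(app_test['Target'].value_counts(normalize=True))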

# In[5]:

# Function to calculate missing values by column
def missing_values_table(df):
    # Total missing values
    mis_val = df.isnull().sum()

    # Percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)

    # Make a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)

    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0: 'Missing Values', 1: '% of Total Values'})

    # Sort the table by percentage of missing, descending
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:, 1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    # Return the dataframe with missing information
    return mis_val_table_ren_columns

# In[6]:

# Missing values statistics
missing_values = missing_values_table(app_test)
missing_values.head(20)

# In[7]:

RR13_MAX_median = app_test['RR13_MAX.'].median()
ETCM_PHC4_median = app_test['ETCM_PHC4'].median()
HELK_MEAN_median = app_test['HELK_MEAN'].median()
PBK4_median = app_test['PBK4'].median()
ETCM_PHB4_median = app_test['ETCM_PHB4'].median()
ETCM_PHA4_median = app_test['ETCM_PHA4'].median()
THR3_MAX_median = app_test['THR3_MAX.'].median()
THR3_MEAN_median = app_test['THR3_MEAN'].median()
RR23_MEAN_median = app_test['RR23_MEAN'].median()
RR13_MEAN_median = app_test['RR13_MEAN'].median()
THR3_MEAN_DIFF_median = app_test['THR3_MEAN_DIFF'].median()
THR3_MEAN_SLOPE_median = app_test['THR3_MEAN_SLOPE'].median()
THR3_MAX_DIFF_median = app_test['THR3_MAX._DIFF'].median()
LOWERCHM_PRESS_median = app_test['LOWERCHM_PRESS'].median()
HELK_MAX_median = app_test['HELK_MAX.'].median()
#HELK_MIN_median = app_test['HELK_MIN.'].median()
HELK_SD_median = app_test['HELK_SD'].median()
THR3_SD_median = app_test['THR3_SD'].median()
RR23_MAX_median = app_test['RR23_MAX.'].median()

# In[8]:

app_test = app_test.fillna({'RR13_MAX.': RR13_MAX_median,
                            'ETCM_PHC4': ETCM_PHC4_median,
                            'HELK_MEAN': HELK_MEAN_median,
                            'PBK4': PBK4_median,
                            'ETCM_PHB4': ETCM_PHB4_median,
                            'ETCM_PHA4': ETCM_PHA4_median,
                            'THR3_MAX.': THR3_MAX_median,
                            'THR3_MEAN': THR3_MEAN_median,
                            'RR23_MEAN': RR23_MEAN_median,
                            'RR13_MEAN': RR13_MEAN_median,
                            'THR3_MEAN_DIFF': THR3_MEAN_DIFF_median,
                            'THR3_MEAN_SLOPE': THR3_MEAN_SLOPE_median,
                            'THR3_MAX._DIFF': THR3_MAX_DIFF_median,
                            'LOWERCHM_PRESS': LOWERCHM_PRESS_median,
                            'HELK_MAX.': HELK_MAX_median,
                            'HELK_SD': HELK_SD_median,
                            'THR3_SD': THR3_SD_median,
                            'RR23_MAX.': RR23_MAX_median})
app_test
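
# In[ ]:

# Optional alternative (sketch): the same median imputation can be written as one pass
# over every numeric parameter column instead of hard-coding each median. Note it would
# also fill columns deliberately skipped above (e.g. 'HELK_MIN.'), so it is shown for
# reference rather than applied to app_test.
def impute_medians(df, exclude=('Target',)):
    # Fill NaNs in every numeric column (except the excluded ones) with that column's median
    num_cols = df.select_dtypes(include=[np.number]).columns.difference(exclude)
    return df.fillna(df[num_cols].median())

# app_test = impute_medians(app_test)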

# In[9]:

# Find correlations with the target and sort
correlations = app_test.corr()['Target'].sort_values()

# Display correlations
print('Most Positive Correlations:\n', correlations.tail(20))
print('\nMost Negative Correlations:\n', correlations.head(20))

# In[10]:

# Examine the relationship between parameter 'ETCM_PHC4' and the target

plt.figure(figsize = (20, 8))

# KDE plot of wafers that were non-defective
sns.kdeplot(app_test.loc[app_test['Target'] == 0, 'ETCM_PHC4'], label = 'target == 0')
# KDE plot of wafers that were defective
sns.kdeplot(app_test.loc[app_test['Target'] == 1, 'ETCM_PHC4'], label = 'target == 1')

# Labeling of plot
plt.xlabel('ETCM_PHC4'); plt.ylabel('Density'); plt.title('Distribution of ETCM_PHC4');

# In[11]:

sns.boxplot(x='HELK_SD', y='Target', data=app_test)

plt.show()

# In[ ]:

# Scatter Plot
app_test.plot(kind='scatter', x='PBK4', y='Target',alpha = 0.5,color = 'red')
plt.xlabel('PBK4') # label = name of label
plt.ylabel('Target')
plt.title('PBK4 Target Scatter Plot')

# In[ ]:

# Examine the relationship between parameter 'HELK_MEAN' and the target

plt.figure(figsize = (20, 8))

# KDE plot of wafers that were non-defective
sns.kdeplot(app_test.loc[app_test['Target'] == 0, 'HELK_MEAN'], label = 'target == 0')
# KDE plot of wafers that were defective
sns.kdeplot(app_test.loc[app_test['Target'] == 1, 'HELK_MEAN'], label = 'target == 1')

# Labeling of plot
plt.xlabel('HELK_MEAN'); plt.ylabel('Density'); plt.title('Distribution of HELK_MEAN');

# In[ ]:

plt.figure(figsize = (15, 50))
# iterate through the parameter features
for i, feature in enumerate(['LOWERCHM_PRESS',
                             'RR13_MEAN',
                             'RR13_MAX.',
                             'RR23_MEAN',
                             'THR3_MAX.',
                             'THR3_MEAN',
                             'RR23_MAX.',
                             'PBK4',
                             'THR3_MEAN_DIFF',
                             'HELK_MEAN',
                             'THR3_MEAN_SLOPE',
                             'THR3_MAX._DIFF']):

    # create a new subplot for each feature
    plt.subplot(12, 1, i + 1)
    # plot non-defective wafers
    sns.kdeplot(app_test.loc[app_test['Target'] == 0, feature], label = 'Target == 0')
    # plot defective wafers
    sns.kdeplot(app_test.loc[app_test['Target'] == 1, feature], label = 'Target == 1')

    # Label the plots
    plt.title('Distribution of %s by Target Value' % feature)
    plt.xlabel('%s' % feature); plt.ylabel('Density');

plt.tight_layout(h_pad = 0.1)

# In[ ]:

# Bin the parameter 'ETCM_PHC4' values
ETCM_PHC4 = app_test[['Target', 'ETCM_PHC4']].copy()
ETCM_PHC4['VALUE_BINNED'] = pd.cut(ETCM_PHC4['ETCM_PHC4'], bins = np.linspace(200, 1000, num = 17))
ETCM_PHC4.head(20)

# In[ ]:

# Group by the bin and calculate averages
ETCM_PHC4_groups = ETCM_PHC4.groupby('VALUE_BINNED').mean()
ETCM_PHC4_groups

# In[ ]:

plt.figure(figsize = (8, 8))

# Graph the value bins and the average of the target as a bar plot
plt.bar(ETCM_PHC4_groups.index.astype(str), 100 * ETCM_PHC4_groups['Target'])

# Plot labeling
plt.xticks(rotation = 75); plt.xlabel('ETCM_PHC4 value groups'); plt.ylabel('Probability of defect (%)')
plt.title('Probability of defect by ETCM_PHC4');

# In[ ]:

# Bin the parameter 'HELK_MEAN' values
HELK_MEAN = app_test[['Target', 'HELK_MEAN']].copy()
HELK_MEAN['VALUE_BINNED'] = pd.cut(HELK_MEAN['HELK_MEAN'], bins = np.linspace(0, 17.5, num = 8))
plt.figure(figsize = (8, 8))

# Group by the bin and calculate averages
HELK_MEAN_groups = HELK_MEAN.groupby('VALUE_BINNED').mean()
# Graph the value bins and the average of the target as a bar plot
plt.bar(HELK_MEAN_groups.index.astype(str), 100 * HELK_MEAN_groups['Target'])

# Plot labeling
plt.xticks(rotation = 75); plt.xlabel('HELK_MEAN value groups'); plt.ylabel('Probability of defect (%)')
plt.title('Probability of defect by HELK_MEAN');
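
# In[ ]:

# Added helper (sketch): the two binning cells above repeat the same pattern, so a small
# function can bin any parameter and plot the defect rate per bin. The bin edges are
# whatever the caller chooses; the commented example reproduces the ETCM_PHC4 plot.
def defect_rate_by_bin(df, feature, bins):
    # Bin the feature, then take the mean of the binary Target per bin (= defect rate in %)
    binned = pd.cut(df[feature], bins=bins)
    rate = df.groupby(binned)['Target'].mean() * 100

    plt.figure(figsize=(8, 8))
    plt.bar(rate.index.astype(str), rate.values)
    plt.xticks(rotation=75)
    plt.xlabel('%s value groups' % feature)
    plt.ylabel('Probability of defect (%)')
    plt.title('Probability of defect by %s' % feature)
    return rate

# Example usage:
# defect_rate_by_bin(app_test, 'ETCM_PHC4', np.linspace(200, 1000, num=17))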

# In[13]:

# Extract the parameter variables and show correlations
ext_data = app_test[['Target', 'RR13_MAX.', 'ETCM_PHC4', 'HELK_MEAN', 'PBK4',
                     'ETCM_PHB4', 'ETCM_PHA4', 'HELK_MAX.', 'HELK_SD', 'THR3_SD']]
ext_data_corrs = ext_data.corr()
ext_data_corrs

# In[14]:

plt.figure(figsize = (10, 10))

# Heatmap of correlations
sns.heatmap(ext_data_corrs, cmap = plt.cm.RdYlBu_r, vmin =0, annot = True, vmax = 1)
plt.title('Correlation Heatmap');

# In[ ]:

plt.figure(figsize = (10, 12))

# iterate through the three ETCM phase parameters
for i, source in enumerate(['ETCM_PHA4', 'ETCM_PHB4', 'ETCM_PHC4']):

    # create a new subplot for each parameter
    plt.subplot(3, 1, i + 1)
    # plot non-defective wafers
    sns.kdeplot(app_test.loc[app_test['Target'] == 0, source], label = 'target == 0', shade=True)
    # plot defective wafers
    sns.kdeplot(app_test.loc[app_test['Target'] == 1, source], label = 'target == 1', shade=True)

    # Label the plots
    plt.title('Distribution of %s by Target Value' % source)
    plt.xlabel('%s' % source); plt.ylabel('Density');

plt.tight_layout(h_pad = 2.5)

# In[ ]:

# Copy the data for plotting
plot_data = ext_data.drop(columns = ['RR13_MAX.',
                                     'HELK_MAX.',
                                     'HELK_SD',
                                     'THR3_SD']).copy()

# Function to calculate the correlation coefficient between two columns
def corr_func(x, y, **kwargs):
    r = np.corrcoef(x, y)[0][1]
    ax = plt.gca()
    ax.annotate("r = {:.2f}".format(r),
                xy=(.2, .8), xycoords=ax.transAxes,
                size = 20)

# Create the pairgrid object
grid = sns.PairGrid(data = plot_data, height = 3, diag_sharey=False,
                    hue = 'Target',
                    vars = [x for x in list(plot_data.columns) if x != 'Target'])

# Upper is a scatter plot
grid.map_upper(plt.scatter, alpha = 0.2)

# Diagonal is a KDE plot
grid.map_diag(sns.kdeplot)

# Lower is a 2D density plot
grid.map_lower(sns.kdeplot, cmap = plt.cm.OrRd_r);

plt.suptitle('Parameter Features Pairs Plot', size = 32, y = 1.05);
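
# Note (sketch): corr_func above is defined but never mapped onto the grid; if the pairwise
# correlation annotation is wanted, it could be added with, e.g.:
# grid.map_upper(corr_func)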

# In[12]:

# Create the default pairplot
sns.pairplot(app_test)

# In[224]:

app_test.to_csv(r'D:\Users\sgg91044\Downloads\SEDA\CTM_data.csv', index=False, header=True)

# In[243]:

from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

CTM_data=pd.read_csv(r'D:\Users\sgg91044\Downloads\SEDA\CTM_data.csv')
# Median imputation of missing values
#RR23_MAX_median = CTM_data['RR23_MAX.'].median()
#CTM_data=CTM_data.fillna({'RR23_MAX.': RR23_MAX_median})

# Drop the target from the training data
#CTM_data['Target_C']=CTM_data['Target'].astype('category')
#CTM_data['Target_C'].cat.categories=['noMEP','MEP']
#le = LabelEncoder()
#le_count = 0
#le.fit(CTM_data['Target_C'])
#CTM_data['Target_C'] = le.transform(CTM_data['Target_C'])
CTM_data['Target']=CTM_data['Target'].astype('float')
CTM_data_Target= CTM_data[['Target']]
CTM_data_Columns = CTM_data.drop(columns = ['Target'])
X_train, X_test, y_train, y_test = train_test_split(CTM_data_Columns,CTM_data_Target,test_size=0.3, random_state=0)

# In[244]:

from imblearn.over_sampling import SMOTE

print("X_train dataset shape: ", X_train.shape)
print("y_train dataset shape: ", y_train.shape)
print("X_test dataset shape: ", X_test.shape)
print("y_test dataset shape: ", y_test.shape)

# In[245]:

print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

sm = SMOTE(random_state=2)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

print("After OverSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res==0)))

# In[228]:

#Y_test = test[['Target_C']]
#Y_train = test.drop(columns = ['Target','Target_C'])

# Feature names
#features = list(train.columns)
#y, _ = pd.factorize(app_train['Target'])

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1))

# Fit on the training data
#imputer.fit(train)

# Transform both training and testing data
#train = imputer.transform(train)
#test = imputer.transform(test)

# Fit the scaler on the training data only, then apply the same transformation
# to both the training and testing data (avoids test-set leakage)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
print('Training data shape: ', X_train.shape)
print('Testing data shape: ', X_test.shape)

# In[229]:

from sklearn.ensemble import RandomForestClassifier

# Make the random forest classifier
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)
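
# In[ ]:

# Optional sketch (not the original workflow): wrapping the scaler and the classifier in
# a single Pipeline keeps the scaler fitted only on the training fold of any split or
# cross-validation, which avoids test-set leakage by construction.
from sklearn.pipeline import Pipeline

rf_pipeline = Pipeline([
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=50, n_jobs=-1)),
])
# rf_pipeline.fit(X_train, y_train.values.ravel())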

# In[203]:

y_test

# In[234]:

# Train on the training data
random_forest.fit(X_train, y_train.values.ravel())

# Extract feature importances
features = list(CTM_data_Columns.columns)
feature_importance_values = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature': features, 'importance': feature_importance_values})

# Make predictions on the test data
predictions = random_forest.predict(X_test)

# In[232]:

predictions
predictions.shape

# In[233]:

# The first argument gives the row index, the second the column index
print(pd.crosstab(y_test['Target'], predictions, rownames=['Target'], colnames=['preds']))
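
# In[ ]:

# Added sketch: with an imbalanced target, per-class precision and recall are more
# informative than the raw crosstab alone.
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test.values.ravel(), predictions))
print(classification_report(y_test.values.ravel(), predictions, digits=3))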

# In[165]:

from sklearn.model_selection import cross_val_score

# In[190]:

# Cross-validated score of the random forest (the original call referenced an undefined
# `submit`; scoring on the held-out test split is assumed here)
cross_val_score(random_forest, X_test, y_test.values.ravel(), cv=5)

# In[235]:

def plot_feature_importances(df):
    """
    Plot importances returned by a model. This can work with any measure of
    feature importance provided that higher importance is better.

    Args:
        df (dataframe): feature importances. Must have the features in a column
        called `feature` and the importances in a column called `importance`

    Returns:
        shows a plot of the 20 most important features

        df (dataframe): feature importances sorted by importance (highest to lowest)
        with a column for normalized importance
    """

    # Sort features according to importance
    df = df.sort_values('importance', ascending = False).reset_index()

    # Normalize the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()

    # Make a horizontal bar chart of feature importances
    plt.figure(figsize = (10, 6))
    ax = plt.subplot()

    # Need to reverse the index to plot most important on top
    ax.barh(list(reversed(list(df.index[:20]))),
            df['importance_normalized'].head(20),
            align = 'center', edgecolor = 'k')

    # Set the yticks and labels
    ax.set_yticks(list(reversed(list(df.index[:20]))))
    ax.set_yticklabels(df['feature'].head(20))

    # Plot labeling
    plt.xlabel('Normalized Importance'); plt.title('Feature Importances')
    plt.show()

    return df

# In[236]:

feature_importances_sorted = plot_feature_importances(feature_importances)
