python数据分析Adult-Salary预测
具体文档戳下方网站
代码如下:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats # adult_data = pd.read_csv("adult.data")
# Column labels shared by both data files (the files are read without a header row).
Names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

# Read both files with header=None; the literal ' ?' (leading space included)
# is parsed as a missing value.
adult_test = pd.read_csv("adult.test", na_values=' ?', header=None)
adult_train = pd.read_csv("adult.data", na_values=' ?', header=None)

# Attach the same column labels to both frames.
adult_train.columns = Names
adult_test.columns = Names

# Print the dtype / non-null summary of the training frame.
adult_train.info()
# print(adult_train.head())
# -------------------------------------------数据查看-------------------------------------------
#
# # ------------------------------对分类型数据的处理------------------------------
#
# # --------------对workclass的处理----------------
# work_income_less = adult_train['workclass'][adult_train['income'] == ' <=50K'].value_counts()
# work_income_more = adult_train['workclass'][adult_train['income'] == ' >50K'].value_counts()
# work_income = pd.concat([work_income_less, work_income_more], axis=1, sort=True)
# work_income.columns = ['less than 50K', 'more than 50K']
# work_income.fillna(0, inplace=True)
# work_income.plot(kind='bar', figsize=(10, 6))
# plt.xticks(rotation=45, size=6)
# plt.title('Salery about Workclass', size=20)
# # plt.savefig("Salery_about_workclass.png")
# # 独立性检验
# from scipy.stats import chi2_contingency
# g, p, dof, expected = chi2_contingency(work_income)
# '''
# g: 卡方统计量 (chi-square statistic),不是z值
# p: 概率值
# dof: 自由度
# expected: 期望值
# '''
# # 绘制自由度为dof的卡方分布表
# from scipy.stats import chi2
# import numpy as np
# x = np.linspace(chi2.ppf(0.01, dof), chi2.ppf(0.99, dof), 100)
# plt.figure()
# plt.plot(x,chi2.pdf(x,dof))
#
# # 以95%置信区间,查看小概率事件区间
# plt.axvline(chi2.ppf(0.975, dof), color='r')
# plt.axvline(chi2.ppf(0.025, dof), color='r')
# plt.title('chi2 distribution'+' whose dof is '+str(dof),size=20)
# plt.text(chi2.ppf(0.975, dof), 0.02, 'q=0.95,z='+str(round(chi2.ppf(0.975, dof), 2)), ha='right', va='top', color='g', alpha=0.8, size=15)
# plt.text(chi2.ppf(0.025, dof), 0.02, 'q=0.05,z='+str(round(chi2.ppf(0.025, dof), 2)), ha='left', va='top', color='g', alpha=0.8, size=15)
# plt.xlabel('g='+str(round(g,2)), color='orange',size=15)
# # plt.savefig("W_and_Income_chi2_distribution.png")
#
# # --------------对education的处理----------------
# education_income_less = adult_train['education'][adult_train['income'] == ' <=50K'].value_counts()
# education_income_more = adult_train['education'][adult_train['income'] == ' >50K'].value_counts()
# education_income = pd.concat([education_income_less, education_income_more], axis=1, sort=True)
# education_income.columns = ['less than 50K', 'more than 50K']
# education_income.fillna(0, inplace=True)
# education_income.plot(kind='bar', figsize=(10, 6))
# plt.xticks(rotation=45, size=6)
# plt.title('Salery about education', size=20)
# # plt.savefig("Salery_about_education.png")
#
# # 独立性检验
# from scipy.stats import chi2_contingency
# g, p, dof, expected = chi2_contingency(education_income)
# '''
# g: 卡方统计量 (chi-square statistic),不是z值
# p: 概率值
# dof: 自由度
# expected: 期望值
# '''
# from scipy.stats import chi2
# import numpy as np
# x = np.linspace(chi2.ppf(0.01, dof), chi2.ppf(0.99, dof), 100)
# plt.figure()
# plt.plot(x,chi2.pdf(x,dof))
#
# # 以95%置信区间,查看小概率事件区间
# plt.axvline(chi2.ppf(0.975, dof), color='r')
# plt.axvline(chi2.ppf(0.025, dof), color='r')
# plt.title('chi2 distribution'+' whose dof is '+str(dof),size=20)
# plt.text(chi2.ppf(0.975, dof), 0.02, 'q=0.975,z='+str(round(chi2.ppf(0.975, dof), 2)), ha='right', va='top', color='g', alpha=0.8, size=15)
# plt.text(chi2.ppf(0.025, dof), 0.02, 'q=0.025,z='+str(round(chi2.ppf(0.025, dof), 2)), ha='left', va='top', color='g', alpha=0.8, size=15)
# plt.xlabel('g='+str(round(g,2))+', '+'p='+str(round(p,2)), color='orange',size=15)
# # plt.savefig("E_and_Income_chi2_distribution.png")
#
#
#
# # --------------对marital_status的处理----------------
# marital_status_income_less = adult_train['marital-status'][adult_train['income'] == ' <=50K'].value_counts()
# marital_status_income_more = adult_train['marital-status'][adult_train['income'] == ' >50K'].value_counts()
# marital_status_income = pd.concat([marital_status_income_less, marital_status_income_more], axis=1, sort=True)
# marital_status_income.columns = ['less than 50K', 'more than 50K']
# marital_status_income.fillna(0, inplace=True)
# marital_status_income.plot(kind='bar', figsize=(10, 6))
# plt.xticks(rotation=45, size=6)
# plt.title('Salery about Marital_status', size=20)
# # plt.savefig("Salery_about_Marital_status.png")
#
# # 独立性检验
# from scipy.stats import chi2_contingency
# g, p, dof, expected = chi2_contingency(marital_status_income)
# '''
# g: 卡方统计量 (chi-square statistic),不是z值
# p: 概率值
# dof: 自由度
# expected: 期望值
# '''
# from scipy.stats import chi2
# import numpy as np
# x = np.linspace(chi2.ppf(0.01, dof), chi2.ppf(0.99, dof), 100)
# plt.figure()
# plt.plot(x,chi2.pdf(x,dof))
#
# # 以95%置信区间,查看小概率事件区间
# plt.axvline(chi2.ppf(0.975, dof), color='r')
# plt.axvline(chi2.ppf(0.025, dof), color='r')
# plt.title('chi2 distribution'+' whose dof is '+str(dof),size=20)
# plt.text(chi2.ppf(0.975, dof), 0.02, 'q=0.975,z='+str(round(chi2.ppf(0.975, dof), 2)), ha='right', va='top', color='g', alpha=0.8, size=15)
# plt.text(chi2.ppf(0.025, dof), 0.02, 'q=0.025,z='+str(round(chi2.ppf(0.025, dof), 2)), ha='left', va='top', color='g', alpha=0.8, size=15)
# plt.xlabel('g='+str(round(g,2))+', '+'p='+str(round(p,2)), color='orange',size=15)
# plt.savefig("M_and_Income_chi2_distribution.png")
#
# # --------------对occupation的处理----------------
# occupation_income_less = adult_train['occupation'][adult_train['income'] == ' <=50K'].value_counts()
# occupation_income_more = adult_train['occupation'][adult_train['income'] == ' >50K'].value_counts()
# occupation_income = pd.concat([occupation_income_less, occupation_income_more], axis=1, sort=True)
# occupation_income.columns = ['less than 50K', 'more than 50K']
# occupation_income.fillna(0, inplace=True)
# occupation_income.plot(kind='bar', figsize=(10, 6))
# plt.xticks(rotation=45, size=6)
# plt.title('Salery about Occupation', size=20)
# plt.savefig("Salery_about_Occupation.png")
#
# # 独立性检验
# from scipy.stats import chi2_contingency
# g, p, dof, expected = chi2_contingency(occupation_income)
# '''
# g: 卡方统计量 (chi-square statistic),不是z值
# p: 概率值
# dof: 自由度
# expected: 期望值
# '''
# from scipy.stats import chi2
# import numpy as np
# x = np.linspace(chi2.ppf(0.01, dof), chi2.ppf(0.99, dof), 100)
# plt.figure()
# plt.plot(x,chi2.pdf(x,dof))
#
# # 以95%置信区间,查看小概率事件区间
# plt.axvline(chi2.ppf(0.975, dof), color='r')
# plt.axvline(chi2.ppf(0.025, dof), color='r')
# plt.title('chi2 distribution'+' whose dof is '+str(dof),size=20)
# plt.text(chi2.ppf(0.975, dof), 0.02, 'q=0.975,z='+str(round(chi2.ppf(0.975, dof), 2)), ha='right', va='top', color='g', alpha=0.8, size=15)
# plt.text(chi2.ppf(0.025, dof), 0.02, 'q=0.025,z='+str(round(chi2.ppf(0.025, dof), 2)), ha='left', va='top', color='g', alpha=0.8, size=15)
# plt.xlabel('g='+str(round(g,2))+', '+'p='+str(round(p,2)), color='orange',size=15)
# plt.savefig("O_and_Income_chi2_distribution.png")
#
# # --------------对relationship的处理----------------
# relationship_income_less = adult_train['relationship'][adult_train['income'] == ' <=50K'].value_counts()
# relationship_income_more = adult_train['relationship'][adult_train['income'] == ' >50K'].value_counts()
# relationship_income = pd.concat([relationship_income_less, relationship_income_more], axis=1, sort=True)
# relationship_income.columns = ['less than 50K', 'more than 50K']
# relationship_income.fillna(0, inplace=True)
# relationship_income.plot(kind='bar', figsize=(10, 6)) # 设置图片大小
# plt.xticks(rotation=45, size=6)
# plt.title('Salery about Relationship', size=20)
# plt.savefig("Salery_about_Relationship.png")
#
# # 独立性检验
# from scipy.stats import chi2_contingency
# g, p, dof, expected = chi2_contingency(relationship_income)
# '''
# g: 卡方统计量 (chi-square statistic),不是z值
# p: 概率值
# dof: 自由度
# expected: 期望值
# '''
# from scipy.stats import chi2
# import numpy as np
# x = np.linspace(chi2.ppf(0.01, dof), chi2.ppf(0.99, dof), 100)
# plt.figure()
# plt.plot(x,chi2.pdf(x,dof))
#
# # 以95%置信区间,查看小概率事件区间
# plt.axvline(chi2.ppf(0.975, dof), color='r')
# plt.axvline(chi2.ppf(0.025, dof), color='r')
# plt.title('chi2 distribution'+' whose dof is '+str(dof),size=20)
# plt.text(chi2.ppf(0.975, dof), 0.02, 'q=0.975,z='+str(round(chi2.ppf(0.975, dof), 2)), ha='right', va='top', color='g', alpha=0.8, size=15)
# plt.text(chi2.ppf(0.025, dof), 0.02, 'q=0.025,z='+str(round(chi2.ppf(0.025, dof), 2)), ha='left', va='top', color='g', alpha=0.8, size=15)
# plt.xlabel('g='+str(round(g,2))+', '+'p='+str(round(p,2)), color='orange',size=15)
# plt.savefig("R_and_Income_chi2_distribution.png")
#
# # --------------对race的处理----------------
# race_income_less = adult_train['race'][adult_train['income'] == ' <=50K'].value_counts()
# race_income_more = adult_train['race'][adult_train['income'] == ' >50K'].value_counts()
# race_income = pd.concat([race_income_less, race_income_more], axis=1, sort=True)
# race_income.columns = ['less than 50K', 'more than 50K']
# race_income.fillna(0, inplace=True)
# race_income.plot(kind='bar', figsize=(10, 6))
# plt.xticks(rotation=45, size=6)
# plt.title('Salery about race', size=20)
# # plt.savefig("Salery_about_Race.png")
#
# # 独立性检验
# from scipy.stats import chi2_contingency
# g, p, dof, expected = chi2_contingency(race_income)
# '''
# g: 卡方统计量 (chi-square statistic),不是z值
# p: 概率值
# dof: 自由度
# expected: 期望值
# '''
# from scipy.stats import chi2
# import numpy as np
# x = np.linspace(chi2.ppf(0.01, dof), chi2.ppf(0.99, dof), 100)
# plt.figure()
# plt.plot(x,chi2.pdf(x,dof))
#
# # 以95%置信区间,查看小概率事件区间
# plt.axvline(chi2.ppf(0.975, dof), color='r')
# plt.axvline(chi2.ppf(0.025, dof), color='r')
# plt.title('chi2 distribution'+' whose dof is '+str(dof),size=20)
# plt.text(chi2.ppf(0.975, dof), 0.02, 'q=0.975,z='+str(round(chi2.ppf(0.975, dof), 2)), ha='right', va='top', color='g', alpha=0.8, size=15)
# plt.text(chi2.ppf(0.025, dof), 0.02, 'q=0.025,z='+str(round(chi2.ppf(0.025, dof), 2)), ha='left', va='top', color='g', alpha=0.8, size=15)
# plt.xlabel('g='+str(round(g,2))+', '+'p='+str(round(p,2)), color='orange',size=15)
# plt.savefig("Ra_and_Income_chi2_distribution.png")
#
# # --------------对sex的处理----------------
# sex_income_less = adult_train['sex'][adult_train['income'] == ' <=50K'].value_counts()
# sex_income_more = adult_train['sex'][adult_train['income'] == ' >50K'].value_counts()
# sex_income = pd.concat([sex_income_less, sex_income_more], axis=1, sort=True)
# sex_income.columns = ['less than 50K', 'more than 50K']
# sex_income.fillna(0, inplace=True)
# sex_income.plot(kind='bar', figsize=(10, 6))
# plt.xticks(rotation=45, size=6)
# plt.title('Salery about sex', size=20)
# plt.savefig("Salery_about_Sex.png")
#
# # 独立性检验
# from scipy.stats import chi2_contingency
# g, p, dof, expected = chi2_contingency(sex_income)
# '''
# g: 卡方统计量 (chi-square statistic),不是z值
# p: 概率值
# dof: 自由度
# expected: 期望值
# '''
# from scipy.stats import chi2
# import numpy as np
# x = np.linspace(chi2.ppf(0.01, dof), chi2.ppf(0.99, dof), 100)
# plt.figure()
# plt.plot(x,chi2.pdf(x,dof))
#
# # 以95%置信区间,查看小概率事件区间
# plt.axvline(chi2.ppf(0.975, dof), color='r')
# plt.axvline(chi2.ppf(0.025, dof), color='r')
# plt.title('chi2 distribution'+' whose dof is '+str(dof),size=20)
# plt.text(chi2.ppf(0.975, dof), 0.02, 'q=0.975,z='+str(round(chi2.ppf(0.975, dof), 2)), ha='right', va='top', color='g', alpha=0.8, size=15)
# plt.text(chi2.ppf(0.025, dof), 0.02, 'q=0.025,z='+str(round(chi2.ppf(0.025, dof), 2)), ha='left', va='top', color='g', alpha=0.8, size=15)
# plt.xlabel('g='+str(round(g,2))+', '+'p='+str(round(p,2)), color='orange',size=15)
# plt.savefig("S_and_Income_chi2_distribution.png")
#
#
# # 对native_country进行查看数据,看看国家与工资的关系
# native_country_income_less = adult_train['native-country'][adult_train['income'] == ' <=50K'].value_counts()
# native_country_income_more = adult_train['native-country'][adult_train['income'] == ' >50K'].value_counts()
# native_country_income = pd.concat([native_country_income_less, native_country_income_more], axis=1, sort=True)
# native_country_income.columns = ['less than 50K', 'more than 50K']
# native_country_income.fillna(0,inplace=True)
# print(native_country_income)
# # 独立性检验
# from scipy.stats import chi2_contingency
# g, p, dof, expected = chi2_contingency(native_country_income)
# '''
# g: 卡方统计量 (chi-square statistic),不是z值
# p: 概率值
# dof: 自由度
# expected: 期望值
# '''
# from scipy.stats import chi2
# import numpy as np
# x = np.linspace(chi2.ppf(0.01, dof), chi2.ppf(0.99, dof), 100)
# plt.figure()
# plt.plot(x,chi2.pdf(x,dof))
#
# # 以95%置信区间,查看小概率事件区间
# plt.axvline(chi2.ppf(0.975, dof), color='r')
# plt.axvline(chi2.ppf(0.025, dof), color='r')
# plt.title('chi2 distribution'+' whose dof is '+str(dof),size=20)
# plt.text(chi2.ppf(0.975, dof), 0.02, 'q=0.975,z='+str(round(chi2.ppf(0.975, dof), 2)), ha='right', va='top', color='g', alpha=0.8, size=15)
# plt.text(chi2.ppf(0.025, dof), 0.02, 'q=0.025,z='+str(round(chi2.ppf(0.025, dof), 2)), ha='left', va='top', color='g', alpha=0.8, size=15)
# plt.xlabel('g='+str(round(g,2))+', '+'p='+str(round(p,2)), color='orange',size=15)
# plt.savefig("NC_and_Income_chi2_distribution.png")
#
# -------------- Age: discretize the numeric column --------------
# pd.cut with bins=4 splits the observed age range into four equal-width
# intervals and names them with the given labels.
adult_train['age_set'] = pd.cut(
    x=adult_train['age'],
    bins=4,
    labels=['Teenager', 'young man', 'elder', 'old man'],
)
# # 离散数据绘图
# age_set_income_less = adult_train['age_set'][adult_train['income'] == ' <=50K'].value_counts()
# age_set_income_more = adult_train['age_set'][adult_train['income'] == ' >50K'].value_counts()
# age_set_income = pd.concat([age_set_income_less, age_set_income_more], axis=1, sort=True)
# age_set_income.columns = ['less than 50K', 'more than 50K']
# age_set_income.fillna(0, inplace=True)
# age_set_income.plot(kind='bar', figsize=(10, 6))
# plt.xticks(rotation=45)
# plt.title('Salery about Age Set', size=20)
# plt.savefig("Salery_about_Age_Set.png")
#
# # 绘制线性图
# plt.figure()
# adult_train['age'][adult_train['income'] == ' <=50K'].plot(kind='kde', figsize=(10, 6)) # 分布图
# adult_train['age'][adult_train['income'] == ' >50K'].plot(kind='kde')
# adult_train['age'].plot(kind='kde')
# plt.xlabel("Age")
# plt.ylabel("Density")
# plt.title("Age about higher_salery")
# plt.legend(('<=50K', '>50K','all age'), loc='best')
# plt.savefig("Age_kernel_density_estimation.png")
#
# # 绘制正态拟合曲线
# M_S = stats.norm.fit(adult_train['age']) # 正态拟合的平均值与标准差
# plt.figure()
# adult_train['age'].plot(kind='kde') # 原本的概率密度分布图
#
# normalDistribution = stats.norm(M_S[0], M_S[1]) # 绘制拟合的正态分布图
# x = np.linspace(normalDistribution.ppf(0.01), normalDistribution.ppf(0.99), 100)
# plt.plot(x, normalDistribution.pdf(x), c='orange')
# plt.xlabel('Age about Adult '+'mean='+str(round(M_S[0], 5))+',std='+str(round(M_S[1], 5)))
# plt.title('Age_NormalDistibution_Origin', size=20)
# plt.legend(['Origin', 'NormDistribution'])
# plt.savefig("Age_NormalDistibution_Origin.png")
#
#
# # --------------对education_num的处理,数据进行离散化----------------
# enum_income_less = adult_train['education-num'][adult_train['income'] == ' <=50K'].value_counts()
# enum_income_more = adult_train['education-num'][adult_train['income'] == ' >50K'].value_counts()
# enum_income = pd.concat([enum_income_less, enum_income_more], axis=1, sort=True)
# enum_income.columns = ['less than 50K', 'more than 50K']
# enum_income.fillna(0, inplace=True)
# enum_income.plot(kind='bar', figsize=(10, 6))
# plt.xticks(rotation=0)
# plt.title('Salery about e_num', size=20)
# # plt.savefig("Salery_about_e_num.png")
#
# # 独立性检验
# from scipy.stats import chi2_contingency
# g, p, dof, expected = chi2_contingency(enum_income)
# '''
# g: 卡方统计量 (chi-square statistic),不是z值
# p: 概率值
# dof: 自由度
# expected: 期望值
# '''
# from scipy.stats import chi2
# import numpy as np
# x = np.linspace(chi2.ppf(0.01, dof), chi2.ppf(0.99, dof), 100)
# plt.figure()
# plt.plot(x,chi2.pdf(x,dof))
#
# # 以95%置信区间,查看小概率事件区间
# plt.axvline(chi2.ppf(0.975, dof), color='r')
# plt.axvline(chi2.ppf(0.025, dof), color='r')
# plt.title('chi2 distribution'+' whose dof is '+str(dof),size=20)
# plt.text(chi2.ppf(0.975, dof), 0.02, 'q=0.975,z='+str(round(chi2.ppf(0.975, dof), 2)), ha='right', va='top', color='g', alpha=0.8, size=15)
# plt.text(chi2.ppf(0.025, dof), 0.02, 'q=0.025,z='+str(round(chi2.ppf(0.025, dof), 2)), ha='left', va='top', color='g', alpha=0.8, size=15)
# plt.xlabel('g='+str(round(g,2))+', '+'p='+str(round(p,2)), color='orange',size=15)
# plt.savefig("EN_and_Income_chi2_distribution.png")
#
# # --------------对hours-per-week的处理,数据进行离散化----------------
# adult_train['hours_set'] = pd.cut(adult_train['hours-per-week'], bins=4, labels=['too free', 'few', 'common', 'too busy'])
# # 离散数据绘图
# hours_income_less = adult_train['hours_set'][adult_train['income'] == ' <=50K'].value_counts()
# hours_income_more = adult_train['hours_set'][adult_train['income'] == ' >50K'].value_counts()
# hours_income = pd.concat([hours_income_less, hours_income_more], axis=1, sort=True)
# hours_income.columns = ['less than 50K', 'more than 50K']
# print(hours_income)
# hours_income.fillna(0, inplace=True)
# hours_income.plot(kind='bar', figsize=(10, 6))
# plt.xticks(rotation=0)
# plt.title('Salery about hours set', size=20)
# # plt.savefig("Salery_about_hours_set.png")
#
# # 绘制线性图
# plt.figure()
# adult_train['hours-per-week'][adult_train['income'] == ' <=50K'].plot(kind='kde', figsize=(10, 6)) # 分布图
# adult_train['hours-per-week'][adult_train['income'] == ' >50K'].plot(kind='kde')
# adult_train['hours-per-week'].plot(kind='kde')
# plt.xlabel("hours-per-week")
# plt.ylabel("Density")
# plt.title("hours-per-week about higher_salery")
# plt.legend(('<=50K', '>50K','all '), loc='best')
# plt.savefig("hours_per_week_kernel_density_estimation.png")
#
#
# # --------------对capital-gain的处理----------------
# # 绘制线性图
# plt.figure()
# adult_train['capital-gain'][adult_train['income'] == ' <=50K'].plot(kind='kde', figsize=(10, 6)) # 分布图
# adult_train['capital-gain'][adult_train['income'] == ' >50K'].plot(kind='kde')
# adult_train['capital-gain'].plot(kind='kde')
# plt.xlabel("capital-gain")
# plt.ylabel("Density")
# plt.title("capital-gain about higher_salery")
# plt.legend(('<=50K', '>50K','all '), loc='best')
# plt.savefig("capital_gain_kernel_density_estimation.png")
#
# # --------------对capital-loss的处理----------------
# # 绘制线性图
# plt.figure()
# adult_train['capital-loss'][adult_train['income'] == ' <=50K'].plot(kind='kde', figsize=(10, 6)) # 分布图
# adult_train['capital-loss'][adult_train['income'] == ' >50K'].plot(kind='kde')
# adult_train['capital-loss'].plot(kind='kde')
# plt.xlabel("capital-loss")
# plt.ylabel("Density")
# plt.title("capital-loss about higher_salery")
# plt.legend(('<=50K', '>50K','all '), loc='best')
# plt.savefig("capital_loss_kernel_density_estimation.png")
#
# # 查看capital-gain与capital-loss数据
# # print(adult_train[['capital-gain','capital-loss']].head(10))
# gain_loss = adult_train.groupby(['capital-gain','capital-loss'])
# print(gain_loss.size().head())
#
# # --------------对fnlwgt的处理----------------
# # 绘制线性图
# plt.figure()
# adult_train['fnlwgt'][adult_train['income'] == ' <=50K'].plot(kind='kde', figsize=(10, 6)) # 分布图
# adult_train['fnlwgt'][adult_train['income'] == ' >50K'].plot(kind='kde')
# adult_train['fnlwgt'].plot(kind='kde')
# plt.xlabel("fnlwgt")
# plt.ylabel("Density")
# plt.title("fnlwgt about higher_salery")
# plt.legend(('<=50K', '>50K','all '), loc='best')
# plt.savefig("fnlwgt_kernel_density_estimation.png")
# -------------------------------------------数据处理-------------------------------------------
# Drop the columns that were judged unsuitable for modelling.
adult_train_good = adult_train.drop(
    labels=['fnlwgt', 'hours-per-week', 'capital-gain', 'capital-loss'],
    axis='columns',
)
print(adult_train_good.columns)
# Missing-value rate of every column: NaN count divided by the row count.
print(adult_train_good.isnull().sum() / len(adult_train_good))
#
# # -------------异常值处理-------------
# # 3σ原则检测异常值
# # 定义3σ法则识别异常值函数
# def outRange(Ser1):
# '''
# Ser1:表示传入DataFrame的某一列。
# '''
# boolInd = (Ser1.mean()-3*Ser1.std() > Ser1) | (Ser1.mean()+3*Ser1.std() < Ser1)
# index = np.arange(Ser1.shape[0])[boolInd] # range to array
# outrange = Ser1.iloc[index]
# return outrange
#
# print(outRange(adult_train_good['age']))
# # 提取出训练集
# Build the feature matrix and the target from the cleaned training frame.
# 'income' is the label; the raw 'age' column is dropped because its binned
# form 'age_set' is one-hot encoded below.
X_train = adult_train_good.drop(['income', 'age'], axis=1)
Y_train = adult_train_good['income']

# Dummy-variable (one-hot) encoding of every remaining column.
# dummy_na=True adds an explicit NaN indicator column per feature, so missing
# values are treated as a category of their own instead of being dropped.
X_train = pd.get_dummies(
    X_train,
    prefix_sep=':',
    columns=['workclass', 'education', 'education-num',
             'marital-status', 'occupation', 'race',
             'relationship', 'sex', 'native-country',
             'age_set'],
    dummy_na=True,
)
print(X_train.head())

# drop_first=True removes the first income level, leaving one indicator column
# for the remaining level; .iloc[:, 0] takes that column as a Series.
# get_dummies yields uint8 or bool depending on the pandas version, so cast to
# int to give the 0/1 target one stable numeric dtype.
Y_train = pd.get_dummies(Y_train, drop_first=True).iloc[:, 0].astype(int)
print(Y_train)

# Next: split the data set.
# Split the encoded data, standardize it, then fit and evaluate a KNN classifier.
from sklearn.model_selection import train_test_split

# Keep references to the full encoded data set; they are used again further
# down the script (shape checks and the learning curve).
X = X_train
y = Y_train

# Hold out 20% of the rows as a test set; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=12345)

# Standardization: learn the scaling rule on the training split only, because
# the test split stands in for data that arrives later.
from sklearn.preprocessing import StandardScaler
Standard = StandardScaler().fit(X_train)
Xtrain = Standard.transform(X_train)  # apply the rule to the training split
Xtest = Standard.transform(X_test)    # apply the same rule to the test split

# Classification. The model must be fitted on the *scaled* training data:
# it predicts on the scaled test data, and KNN distances are only meaningful
# when both sides live in the same feature space.
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier().fit(Xtrain, y_train)
y_pred = clf.predict(Xtest)

# Evaluate the classifier.
from sklearn.metrics import classification_report, auc
print(classification_report(y_test, y_pred))

# Next: plot the ROC curve.
from sklearn.metrics import auc, roc_curve
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = 'SimHei'  # switch to a font that can render the Chinese axis labels

# Compute the x and y coordinates of the ROC curve.
# Use the predicted probability of the positive class (second column of
# predict_proba, i.e. clf.classes_[1]) as the score. Hard 0/1 predictions
# would collapse the curve to a single operating point and distort the AUC.
y_score = clf.predict_proba(Xtest)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
print(auc(fpr, tpr))

plt.figure(figsize=(10, 6))
plt.xlim(0, 1)      # x-axis range
plt.ylim(0.0, 1.1)  # y-axis range
plt.xlabel('假正率')
plt.ylabel('真正率')
plt.plot(fpr, tpr, linewidth=2, linestyle="-", color='red')
plt.title('Line Roc of X_train by KNeighborsClassifier()', size=20)
plt.show()

# Sanity checks on the full (unsplit) data set before it is reused below.
print(type(Standard.transform(X)))
print(type(y))
print(Standard.transform(X).shape)
print(y.shape)
# Cross-validation over k (left disabled, as in the original script).
# from sklearn.model_selection import cross_val_score
# k_score = []
# Scale the full data set with the rule learned on the training split.
X = Standard.transform(X)
# for i in range(1, 3):
#     knn = KNeighborsClassifier(n_neighbors=i)
#     scores = cross_val_score(knn, X, y, scoring='accuracy', cv=5)
#     k_score.append(scores.mean())
# print(k_score)

# Learning curve.
# The sklearn.learning_curve module was removed in scikit-learn 0.20; the
# function lives in sklearn.model_selection, which this script already uses.
from sklearn.model_selection import learning_curve

# The scorer is named 'neg_mean_squared_error' (the old 'mean_squared_error'
# name was removed). It returns negated MSE, hence the sign flip below.
train_sizes, train_loss, test_loss = learning_curve(
    clf, X, y,
    cv=10,
    scoring='neg_mean_squared_error',
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1.0],
)
# Average over the CV folds (axis=1) and flip the sign to get positive MSE.
train_loss_mean = -np.mean(train_loss, axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)

plt.plot(train_sizes, train_loss_mean, 'o-', color='r', label='Training')
plt.plot(train_sizes, test_loss_mean, 'o-', color='g', label='Cross_validation')
plt.legend(loc='best')
plt.show()
python数据分析Adult-Salary预测的更多相关文章
- 【读书笔记与思考】《python数据分析与挖掘实战》-张良均
[读书笔记与思考]<python数据分析与挖掘实战>-张良均 最近看一些机器学习相关书籍,主要是为了拓宽视野.在阅读这本书前最吸引我的地方是实战篇,我通读全书后给我印象最深的还是实战篇.基 ...
- 《Python数据分析与挖掘实战》读书笔记
大致扫了一遍,具体的代码基本都没看了,毕竟我还不懂python,并且在手机端的排版,这些代码没法看. 有收获,至少了解到以下几点: 一. Python的语法挺有意思的 有一些类似于JavaSc ...
- 小象学院Python数据分析第二期【升级版】
点击了解更多Python课程>>> 小象学院Python数据分析第二期[升级版] 主讲老师: 梁斌 资深算法工程师 查尔斯特大学(Charles Sturt University)计 ...
- Python数据分析【炼数成金15周完整课程】
点击了解更多Python课程>>> Python数据分析[炼数成金15周完整课程] 课程简介: Python是一种面向对象.直译式计算机程序设计语言.也是一种功能强大而完善的通用型语 ...
- Python数据分析简介
1,Python作为一门编程语言开发效率快,运行效率被人诟病,但是Python核心部分使用c/c++等更高效的语言来编写,还有强大的numpy, pandas, matplotlib, scipy库等应 ...
- Python数据分析常用的库总结
Python之所以能够成为数据分析与挖掘领域的最佳语言,是有其独特的优势的.因为它有很多这个领域相关的库可以用,而且很好用,比如Numpy.SciPy.Matplotlib.Pandas.Scikit ...
- 快速入门 Python 数据分析实用指南
Python 现如今已成为数据分析和数据科学使用上的标准语言和标准平台之一.那么作为一个新手小白,该如何快速入门 Python 数据分析呢? 下面根据数据分析的一般工作流程,梳理了相关知识技能以及学习 ...
- python数据分析与应用
python数据分析与应用笔记 使用sklearn构建模型 1.使用sklearn转换器处理数据 import numpy as np from sklearn.datasets import loa ...
- python数据分析中常用的库
Python是数据处理常用工具,可以处理数量级从几K至几T不等的数据,具有较高的开发效率和可维护性,还具有较强的通用性和跨平台性,这里就为大家分享几个不错的数据分析工具,需要的朋友可以参考下 Pyth ...
- 小白学 Python 数据分析(1):数据分析基础
各位同学好,小编接下来为大家分享一些有关 Python 数据分析方面的内容,希望大家能够喜欢. 人工植入广告: PS:小编最近两天偷了点懒,好久没有发原创了,最近是在 CSDN 开通了一个付费专栏,用 ...
随机推荐
- postgresql----几何类型和函数
postgresql支持的几何类型如下表: 名字 存储空间 描述 表现形式 point 16字节 平面上的点 (x,y) line 32字节 直线 {A,B,C} lseg 32字节 线段 ((x1, ...
- HTML5新特性postMessage解决跨域
window.postMessage的功能是允许程序员跨域在两个窗口/frames间发送数据信息.基本上,它就像是跨域的AJAX,但不是浏览器跟服务器之间交互,而是在两个客户端之间通信.让我们来看一下 ...
- 【Odoo 8开发教程】第一章:Odoo 8.0安装
转载请注明原文地址:https://www.cnblogs.com/cnodoo/p/10779733.html odoo有三种常见的安装方式:打包程序安装.源码安装以及Docker镜像安装. 一:打 ...
- 关于OpenCV2.4.9在VS2012上的配置
今天写这篇文章是由于自从上次电脑换硬盘后,今天再次安装OpenCV又遇到了一些问题,最后终于搞定.用的版本是2.4.9,因为第一次配置用3.0的没有配置成功,而2.4.9的配置成功. 首先当然是 ...
- http_build_query用法,挺方便的
http_build_query (PHP 5) http_build_query -- 生成 url-encoded 之后的请求字符串描述string http_build_query ( arra ...
- vi 替换
在vi编辑器中,利用 :s 命令能够实现字符串的替换.详细的使用方法如下: 1.:s/str1/str2/ 用字符串 str2 替换行中首次出现的字符串str1; 2.:s/str1/str2/ ...
- MSTECHLNK
MSTECHLNK(微软技术直通车) 时间:2017.12.16地点:微软中关村办公楼天安门会议室
- 大数据入门第十六天——流式计算之storm详解(一)入门与集群安装
一.概述 今天起就正式进入了流式计算.这里先解释一下流式计算的概念 离线计算 离线计算:批量获取数据.批量传输数据.周期性批量计算数据.数据展示 代表技术:Sqoop批量导入数据.HDFS批量存储数据 ...
- 2017-2018-2 20155315《网络对抗技术》Exp7 :网络欺诈防范
实验目的 本实践的目标理解常用网络欺诈背后的原理,以提高防范意识,并提出具体防范方法. 实验内容 简单应用SET工具建立冒名网站 ettercap DNS spoof 结合应用两种技术,用DNS sp ...
- Photoshop CS4破解方法
先在网上下载Photoshop CS4的版本,安装后按如下步骤操作即可. 激活码: 1330-1082-3503-2270-3738-6738 1330-1776-8671-6289-7706-291 ...