# coding: utf-8

# Titanic survival prediction (exported from a Jupyter notebook).

# In[128]:

get_ipython().magic(u'matplotlib inline')
import pandas as pd
from pandas import Series,DataFrame
import seaborn as sns
sns.set_style('whitegrid')
# FIX: the original called pd.set_option('display.mpl_style', 'default') here.
# That option was deprecated in pandas 0.17 and later removed, so the call
# raises OptionError on any modern pandas. The seaborn style set above already
# themes the plots, so the call is dropped rather than replaced.
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Load the Kaggle Titanic train/test CSVs from local disk.
train_df= pd.read_csv("/home/lpstudy/下载/train.csv")
test_df = pd.read_csv("/home/lpstudy/下载/test.csv")

# Notebook-style previews (only the last expression of a cell is displayed).
train_df.head()

test_df.head()
  26.  
  27. # In[129]:
  28.  
  29. train_df = train_df.drop(["Ticket","PassengerId","Name"],axis = 1)
  30. test_df = test_df.drop(["Name","Ticket"],axis =1)
  31.  
  32. # In[130]:
  33.  
  34. train_df.head()
  35.  
  36. # In[131]:
  37.  
  38. train_df["Embarked"] = train_df["Embarked"].fillna("S")
  39. #plot
  40. sns.factorplot("Embarked","Survived",data = train_df,size = 6,aspect = 2)
  41.  
  42. fig,(axis1,axis2,axis3) = plt.subplots(1,3,figsize = (15,5))
  43.  
  44. sns.countplot(x='Embarked', data=train_df, ax=axis1)
  45. sns.countplot(x='Survived', hue="Embarked", data=train_df, order=[1,0], ax=axis2)
  46.  
  47. embark_perc = train_df[["Embarked", "Survived"]].groupby(['Embarked'],as_index=False).mean()
  48. sns.barplot(x='Embarked', y='Survived', data=embark_perc,order=['S','C','Q'],ax=axis3)
  49.  
  50. embark_dummies_train = pd.get_dummies(train_df['Embarked'])
  51. embark_dummies_train.drop(['S'], axis=1, inplace=True)
  52.  
  53. embark_dummies_test = pd.get_dummies(test_df['Embarked'])
  54. embark_dummies_test.drop(['S'], axis=1, inplace=True)
  55.  
  56. train_df = train_df.join(embark_dummies_train)
  57. test_df = test_df.join(embark_dummies_test)
  58.  
  59. train_df.drop(['Embarked'], axis=1,inplace=True)
  60. test_df.drop(['Embarked'], axis=1,inplace=True)
  61.  
  62. # In[132]:
  63.  
  64. test_df["Fare"].fillna(test_df["Fare"].median(), inplace=True)
  65.  
  66. train_df['Fare'] = train_df['Fare'].astype(int)
  67. test_df['Fare'] = test_df['Fare'].astype(int)
  68.  
  69. fare_not_survived = train_df["Fare"][train_df["Survived"] == 0]
  70. fare_survived = train_df["Fare"][train_df["Survived"] == 1]
  71.  
  72. avgerage_fare = DataFrame([fare_not_survived.mean(), fare_survived.mean()])
  73. std_fare = DataFrame([fare_not_survived.std(), fare_survived.std()])
  74.  
  75. #plot
  76. train_df['Fare'].plot(kind='hist', figsize=(15,3),bins=100, xlim=(0,50))
  77.  
  78. avgerage_fare.index.names = std_fare.index.names = ["Survived"]
  79. avgerage_fare.plot(yerr=std_fare,kind='bar',legend=False)
  80.  
  81. # In[133]:
  82.  
  83. # Age
  84.  
  85. fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,4))
  86. axis1.set_title('Original Age values - Titanic')
  87. axis2.set_title('New Age values - Titanic')
  88.  
  89. average_age_titanic = train_df["Age"].mean()
  90. std_age_titanic = train_df["Age"].std()
  91. count_nan_age_titanic = train_df["Age"].isnull().sum()
  92.  
  93. # get average, std, and number of NaN values in test_df
  94. average_age_test = test_df["Age"].mean()
  95. std_age_test = test_df["Age"].std()
  96. count_nan_age_test = test_df["Age"].isnull().sum()
  97.  
  98. # generate random numbers between (mean - std) & (mean + std)
  99. rand_1 = np.random.randint(average_age_titanic - std_age_titanic, average_age_titanic + std_age_titanic, size = count_nan_age_titanic)
  100. rand_2 = np.random.randint(average_age_test - std_age_test, average_age_test + std_age_test, size = count_nan_age_test)
  101.  
  102. # plot original Age values
  103. # NOTE: drop all null values, and convert to int
  104. train_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)
  105. # test_df['Age'].dropna().astype(int).hist(bins=70, ax=axis1)
  106.  
  107. # fill NaN values in Age column with random values generated
  108. train_df["Age"][np.isnan(train_df["Age"])] = rand_1
  109. test_df["Age"][np.isnan(test_df["Age"])] = rand_2
  110.  
  111. # convert from float to int
  112. train_df['Age'] = train_df['Age'].astype(int)
  113. test_df['Age'] = test_df['Age'].astype(int)
  114.  
  115. # plot new Age Values
  116. train_df['Age'].hist(bins=70, ax=axis2)
  117. # test_df['Age'].hist(bins=70, ax=axis4)
  118.  
  119. # In[134]:
  120.  
  121. # .... continue with plot Age column
  122.  
  123. # peaks for survived/not survived passengers by their age
  124. facet = sns.FacetGrid(train_df, hue="Survived",aspect=4)
  125. facet.map(sns.kdeplot,'Age',shade= True)
  126. facet.set(xlim=(0, train_df['Age'].max()))
  127. facet.add_legend()
  128.  
  129. # average survived passengers by age
  130. fig, axis1 = plt.subplots(1,1,figsize=(18,4))
  131. average_age = train_df[["Age", "Survived"]].groupby(['Age'],as_index=False).mean()
  132. sns.barplot(x='Age', y='Survived', data=average_age)
  133.  
  134. # In[135]:
  135.  
  136. # Cabin
  137. # It has a lot of NaN values, so it won't cause a remarkable impact on prediction
  138. train_df.drop("Cabin",axis=1,inplace=True)
  139. test_df.drop("Cabin",axis=1,inplace=True)
  140.  
  141. # Family
  142.  
  143. # Instead of having two columns Parch & SibSp,
  144. # we can have only one column represent if the passenger had any family member aboard or not,
  145. # Meaning, if having any family member(whether parent, brother, ...etc) will increase chances of Survival or not.
  146. train_df['Family'] = train_df["Parch"] + train_df["SibSp"]
  147. train_df['Family'].loc[train_df['Family'] > 0] = 1
  148. train_df['Family'].loc[train_df['Family'] == 0] = 0
  149.  
  150. test_df['Family'] = test_df["Parch"] + test_df["SibSp"]
  151. test_df['Family'].loc[test_df['Family'] > 0] = 1
  152. test_df['Family'].loc[test_df['Family'] == 0] = 0
  153.  
  154. # drop Parch & SibSp
  155. train_df = train_df.drop(['SibSp','Parch'], axis=1)
  156. test_df = test_df.drop(['SibSp','Parch'], axis=1)
  157.  
  158. # plot
  159. fig, (axis1,axis2) = plt.subplots(1,2,sharex=True,figsize=(10,5))
  160.  
  161. # sns.factorplot('Family',data=train_df,kind='count',ax=axis1)
  162. sns.countplot(x='Family', data=train_df, order=[1,0], ax=axis1)
  163.  
  164. # average of survived for those who had/didn't have any family member
  165. family_perc = train_df[["Family", "Survived"]].groupby(['Family'],as_index=False).mean()
  166. sns.barplot(x='Family', y='Survived', data=family_perc, order=[1,0], ax=axis2)
  167.  
  168. axis1.set_xticklabels(["With Family","Alone"], rotation=0)
  169.  
  170. # In[136]:
  171.  
  172. # Sex
  173.  
  174. # As we see, children(age < ~16) on aboard seem to have a high chances for Survival.
  175. # So, we can classify passengers as males, females, and child
  176. def get_person(passenger):
  177. age,sex = passenger
  178. return 'child' if age < 16 else sex
  179.  
  180. train_df['Person'] = train_df[['Age','Sex']].apply(get_person,axis=1)
  181. test_df['Person'] = test_df[['Age','Sex']].apply(get_person,axis=1)
  182.  
  183. # No need to use Sex column since we created Person column
  184. train_df.drop(['Sex'],axis=1,inplace=True)
  185. test_df.drop(['Sex'],axis=1,inplace=True)
  186.  
  187. # create dummy variables for Person column, & drop Male as it has the lowest average of survived passengers
  188. person_dummies_titanic = pd.get_dummies(train_df['Person'])
  189. person_dummies_titanic.columns = ['Child','Female','Male']
  190. person_dummies_titanic.drop(['Male'], axis=1, inplace=True)
  191.  
  192. person_dummies_test = pd.get_dummies(test_df['Person'])
  193. person_dummies_test.columns = ['Child','Female','Male']
  194. person_dummies_test.drop(['Male'], axis=1, inplace=True)
  195.  
  196. train_df = train_df.join(person_dummies_titanic)
  197. test_df = test_df.join(person_dummies_test)
  198.  
  199. fig, (axis1,axis2) = plt.subplots(1,2,figsize=(10,5))
  200.  
  201. # sns.factorplot('Person',data=train_df,kind='count',ax=axis1)
  202. sns.countplot(x='Person', data=train_df, ax=axis1)
  203.  
  204. # average of survived for each Person(male, female, or child)
  205. person_perc = train_df[["Person", "Survived"]].groupby(['Person'],as_index=False).mean()
  206. sns.barplot(x='Person', y='Survived', data=person_perc, ax=axis2, order=['male','female','child'])
  207.  
  208. train_df.drop(['Person'],axis=1,inplace=True)
  209. test_df.drop(['Person'],axis=1,inplace=True)
  210.  
  211. # In[137]:
  212.  
  213. # Pclass
  214.  
  215. # sns.factorplot('Pclass',data=train_df,kind='count',order=[1,2,3])
  216. sns.factorplot('Pclass','Survived',order=[1,2,3], data=train_df,size=5)
  217.  
  218. # create dummy variables for Pclass column, & drop 3rd class as it has the lowest average of survived passengers
  219. pclass_dummies_titanic = pd.get_dummies(train_df['Pclass'])
  220. pclass_dummies_titanic.columns = ['Class_1','Class_2','Class_3']
  221. pclass_dummies_titanic.drop(['Class_3'], axis=1, inplace=True)
  222.  
  223. pclass_dummies_test = pd.get_dummies(test_df['Pclass'])
  224. pclass_dummies_test.columns = ['Class_1','Class_2','Class_3']
  225. pclass_dummies_test.drop(['Class_3'], axis=1, inplace=True)
  226.  
  227. train_df.drop(['Pclass'],axis=1,inplace=True)
  228. test_df.drop(['Pclass'],axis=1,inplace=True)
  229.  
  230. train_df = train_df.join(pclass_dummies_titanic)
  231. test_df = test_df.join(pclass_dummies_test)
  232.  
  233. # In[139]:
  234.  
  235. # define training and testing sets
  236.  
  237. X_train = train_df.drop("Survived",axis=1)
  238. Y_train = train_df["Survived"]
  239. X_test = test_df.drop("PassengerId",axis=1).copy()
  240.  
  241. # In[140]:
  242.  
  243. # Logistic Regression
  244.  
  245. logreg = LogisticRegression()
  246.  
  247. logreg.fit(X_train, Y_train)
  248.  
  249. Y_pred = logreg.predict(X_test)
  250.  
  251. logreg.score(X_train, Y_train)
  252.  
  253. # In[141]:
  254.  
  255. # Support Vector Machines
  256.  
  257. svc = SVC()
  258.  
  259. svc.fit(X_train, Y_train)
  260.  
  261. Y_pred = svc.predict(X_test)
  262.  
  263. svc.score(X_train, Y_train)
  264.  
  265. # In[142]:
  266.  
  267. # Random Forests
  268.  
  269. random_forest = RandomForestClassifier(n_estimators=100)
  270.  
  271. random_forest.fit(X_train, Y_train)
  272.  
  273. Y_pred = random_forest.predict(X_test)
  274.  
  275. random_forest.score(X_train, Y_train)
  276.  
  277. # In[143]:
  278.  
  279. # get Correlation Coefficient for each feature using Logistic Regression
  280. coeff_df = DataFrame(train_df.columns.delete(0))
  281. coeff_df.columns = ['Features']
  282. coeff_df["Coefficient Estimate"] = pd.Series(logreg.coef_[0])
  283.  
  284. # preview
  285. coeff_df
  286.  
  287. # In[ ]:

Classification and Prediction的更多相关文章

  1. 【转】Windows下使用libsvm中的grid.py和easy.py进行参数调优

    libsvm中有进行参数调优的工具grid.py和easy.py可以使用,这些工具可以帮助我们选择更好的参数,减少自己参数选优带来的烦扰. 所需工具:libsvm.gnuplot 本机环境:Windo ...

  2. A Complete Tutorial on Tree Based Modeling from Scratch (in R & Python)

    A Complete Tutorial on Tree Based Modeling from Scratch (in R & Python) MACHINE LEARNING PYTHON  ...

  3. Applied Deep Learning Resources

    Applied Deep Learning Resources A collection of research articles, blog posts, slides and code snipp ...

  4. LIBSVM的使用方法

    [原文:http://wenku.baidu.com/view/7e7b6b896529647d27285276.html] 目  录 1 Libsvm下载... 3 2 Libsvm3.0环境变量设 ...

  5. What is machine learning?

    What is machine learning? One area of technology that is helping improve the services that we use on ...

  6. 高数量类别特征(high-cardinality categorical attributes)的预处理方法

    high-cardinality categorical attributes,从字面上理解,即对于某个category特征,不同值的数量非常多,这里暂且把它叫做高数量类别属性.反之,即低数量类别属性 ...

  7. 机器学习基石8-Noise and Error

    注: 文章中所有的图片均来自台湾大学林轩田<机器学习基石>课程. 笔记原作者:红色石头 微信公众号:AI有道 上一节课,我们主要介绍了VC Dimension的概念.如果Hypothese ...

  8. Intel DAAL AI加速 ——传统决策树和随机森林

    # file: dt_cls_dense_batch.py #===================================================================== ...

  9. liblinear参数及使用方法(原创)

    开发语言:JAVA 开发工具:eclipse (下载地址 http://www.eclipse.org/downloads/) liblinear版本:liblinear-1.94.jar (下载地址 ...

随机推荐

  1. yield 与生成器

    yield的功能类似于return,但是不同之处在于它返回的是生成器. 生成器 生成器是通过一个或多个yield表达式构成的函数,每一个生成器都是一个迭代器(但是迭代器不一定是生成器). 如果一个函数 ...

  2. JQuery 提示用户名密码不为空

    $(document).ready(function(){                  //HTML()替换   HTML函数         //append()追加  :input = :t ...

  3. 支付宝RSA签名

    1.参考网上相关文章,开放php中的openssl,但使用网上例子调用openssl_pkey_new,一直报100013错误.后改用用支付宝提供的SDKdemo程序 发现使用提供的privkye,可 ...

  4. INSTALL_FAILED_SHARED_USER_INCOMPATIBLE的问题

    eclipse编译出来的apk,安装时报出INSTALL_FAILED_SHARED_USER_INCOMPATIBLE的错误. 原因:apk的AndroidManifest.xml中声明了andro ...

  5. mysql 统计sql

    1.按照月份统计数据 SELECT DATE_FORMAT(d.create_time,'%Y-%m') months,COUNT(id) AS scannum FROM detail d GROUP ...

  6. 【转】使用JMeter 完成常用的压力测试(二)

    使用JMeter 完成常用的压力测试 Login.jsp 和welcome.jsp.其中 login.jsp 负责生成 User 对象,并调用 User 的login.当 login 返回为 true ...

  7. SpringBoot中通过SpringBootServletInitializer如何实现容器初始化

    相关文章 <Servlet3.0之四:动态注册和Servlet容器初始化> <SpringBoot中通过SpringBootServletInitializer如何实现组件加载> ...

  8. 转:oracle几组重要的常见视图-v$process,v$session,v$session_wait,v$session_event

    v$process 本视图包含当前系统oracle运行的所有进程信息.常被用于将oracle或服务进程的操作系统进程ID与数据库session之间建立联系.在某些情况下非常有用: 1 如果数据库瓶颈是 ...

  9. mysql索引设计

    mysql索引设计 1.B树与B+树的区别?B-Tree:一个节点可以拥有大于2个子节点的平衡多叉树,所有关键字在整颗树中出现,包括在非叶子节点也能命中, 叶子节点之间没有链表B+Tree:每个叶子节 ...

  10. idea完成分支和主干的合并

    我们在开发一个项目的时候,我们可能会创建一个分支和一个主干.对于分支而言,我们一般在开发的时候可能会使用,但是在项目上线的时候,我们抽取的项目的地址可能是主干的svn地址,这时我们需要将分支上的svn ...