census = read.csv("census.csv")
library(caTools)
set.seed(2000)
spl = sample.split(census$over50k,SplitRatio = 0.6)
train = subset(census,spl == TRUE)
test = subset(census, spl == FALSE)
# use the logistic regression
glm = glm(over50k ~. , data = train, family = "binomial")
summary(glm) #pr(>|z|) if it is smaller than 0.1, the variables are significant

#accuracy
glm.pred = predict(glm, newdata = test, type = "response")
table(test$over50k,glm.pred >= 0.5)

(9051+1888)/nrow(test)

#baseline accuracy of test - more frequent outcome
table(test$over50k)
9713/nrow(test)

#ROC & ACU
library(ROCR)
#Then we can generate the confusion matrix
ROCpred = prediction(glm.pred, test$over50k)
plot(performance(ROCpred,measure="tpr",x.measure="fpr"),colorize = TRUE)
as.numeric(performance(ROCpred, "auc")@y.values)

#Problem 2.1 - A CART Model
library(rpart)
library(rpart.plot)
CTree = rpart(over50k ~. , data = train, method = "class")
prp(CTree)

# accuracy of the CART model
CTree.pred = predict(CTree, newdata = test, type = "class")

table(test$over50k,CTree.pred)
(9243+1596)/nrow(test)

#use another way- generate probabilities and use a threshold of 0.5 like in logistic regression
CTree.pred1 = predict(CTree, newdata = test)
p = CTree.pred1[,2] # the column of over 50k
table(test$over50k, p) # p<=0.5 it is same with the <=50k, p>0.5 means >50k

# ROC curve for the CART model - WOW
#removing the type="class" argument when making predictions
library(ROCR)
library(arulesViz)
CTree.ROCpred = prediction(CTree.pred1[,2],test$over50k)
# plot(CTree.ROCpred) can not run
plot(performance(CTree.ROCpred,measure="tpr",x.measure="fpr"),colorize = TRUE)

# to caculate the auc
as.numeric(performance(CTree.ROCpred,"auc")@y.values)

# another way to seek for auc
CTree.ROCpred2 = prediction(p,test$over50k)
as.numeric(performance(CTree.ROCpred2,"auc")@y.values)

#Problem 3.1 - A Random Forest Model
set.seed(1)
trainSmall = train[sample(nrow(train),2000),]

set.seed(1)
library(randomForest)
RFC = randomForest(over50k ~., data = trainSmall)
RFC.pred = predict(RFC,newdata = test) #using a threshold of 0.5, no need to set the type = "class"
table(test$over50k,RFC.pred)
(9586+1093)/nrow(test) # a little difference is allowed

#compute metrics that give us insight into which variables are important.
vu = varUsed(RFC, count = TRUE)
vusorted = sort(vu, decreasing = FALSE, index.return = TRUE)
dotchart(vnsorted$x, names(RFC$forest$xlevel[vusorted$ix]))

#another way to find the important variables - impurity
varImpPlot(RFC)

# select cp by Cross-validation for the CART Trees
library(caret)
library(e1071)
set.seed(2)
#Specify that we are going to use k-fold cross validation with 10 folds:
numFolds = trainControl(method = "cv", number = 10)
#Specify the grid of cp values that we wish to evaluate:
cartGrid = expand.grid(.cp = seq(0.002,0.1,0.002))
#run the train function and view the result:
tr = train(over50k ~.,data = train, method = "rpart", trControl = numFolds, tuneGrid = cartGrid)
tr # The final value used for the model was cp = 0.002.

CTree2 = rpart(over50k ~., data = train, method = "class", cp = 0.002)
CTree2.pred = predict(CTree2, newdata = test, type = "class")
table(test$over50k, CTree2.pred)
(9178+1838)/nrow(test)
prp(CTree2) # shoould be 18 splits

[Machine Learning][The Analytics Edge][Predicting Earnings from Census Data]的更多相关文章

  1. Machine Learning for Developers

    Machine Learning for Developers Most developers these days have heard of machine learning, but when ...

  2. How do I learn machine learning?

    https://www.quora.com/How-do-I-learn-machine-learning-1?redirected_qid=6578644   How Can I Learn X? ...

  3. Course Machine Learning Note

    Machine Learning Note Introduction Introduction What is Machine Learning? Two definitions of Machine ...

  4. [C2P3] Andrew Ng - Machine Learning

    ##Advice for Applying Machine Learning Applying machine learning in practice is not always straightf ...

  5. Why The Golden Age Of Machine Learning is Just Beginning

    Why The Golden Age Of Machine Learning is Just Beginning Even though the buzz around neural networks ...

  6. Introducing: Machine Learning in R(转)

    Machine learning is a branch in computer science that studies the design of algorithms that can lear ...

  7. Azure Machine Learning

    About me In my spare time, I love learning new technologies and going to hackathons. Our hackathon p ...

  8. Getting started with machine learning in Python

    Getting started with machine learning in Python Machine learning is a field that uses algorithms to ...

  9. Google's Machine Learning Crash Course #01# Introducing ML & Framing & Fundamental terminology

    INDEX Introducing ML Framing Fundamental machine learning terminology Introducing ML What you learn ...

随机推荐

  1. 源码:Java集合源码之:数组与链表(一)

    数组和链表是数据结构中最基本的部分. 数组 在java中,数组定义为一种基本类型,其可以通过下标获取到对应位置的数据.那么这种结构的数据,在内存中是怎么存放的呢? 数组在内存中是一段连续的存储单元,每 ...

  2. Xilinx------BUFG,IBUFG,BUFGP,IBUFGDS等含义以及使用

    转载-----BUFG,IBUFG,BUFGP,IBUFGDS等含义以及使用   目前,大型设计一般推荐使用同步时序电路.同步时序电路基于时钟触发沿设计,对时钟的周期.占空比.延时和抖动提出了更高的要 ...

  3. 常用命令npm,gulp, node

    npm常用命令: 检查npm模块的安装情况:(以常用模块 grunt为例说明) 1) 检查是否全局安装了模块Grunt: $npm list -g grunt 2) 列出所有已经全局安装的模块:$np ...

  4. Mac 日常使用tips

    20180725: windows标准的键盘连接了mac如何映射键盘?最大的好处是可以向后删除,还可以一键PageUP, PageDown ref: https://support.apple.com ...

  5. C#进阶系列——AOP

    一.AOP概念(转自) 老规矩,还是先看官方解释:AOP(Aspect-Oriented Programming,面向切面的编程),它是可以通过预编译方式和运行期动态代理实现在不修改源代码的情况下给程 ...

  6. 第三篇、Python函数

    1.函数和过程的定义: 1) 函数定义:函数是逻辑结构化和过程化的一种编程方法. 2) 过程定义:过程就是简单特殊没有返回值的函数. 当一个函数/过程没有使用return显示的定义返回值时,pytho ...

  7. Python oracle乱码问题

    Python使用cx_oracle连接oracle查询汉字时出现乱码 解决方式: import os os.environ['NLS_LANG'] = 'SIMPLIFIED CHINESE_CHIN ...

  8. JS event loop

    一.为什么JavaScript是单线程? JavaScript语言的一大特点就是单线程,也就是说,同一个时间只能做一件事.那么,为什么JavaScript不能有多个线程呢?这样能提高效率啊. Java ...

  9. Vue.js连接后台数据jsp页面  ̄▽ ̄

    <%@ page language="java" contentType="text/html; charset=UTF-8" pageEncoding= ...

  10. zend studio mac

    zend studio mac是一款PHP语言集成开发环境(IDE),专为开发人员研发,它包含了所有组件的开发为完整的PHP应用程序生命周期提供条件.zend studio是很多开发人员.程序员等专业 ...