census = read.csv("census.csv")
library(caTools)
set.seed(2000)
spl = sample.split(census$over50k,SplitRatio = 0.6)
train = subset(census,spl == TRUE)
test = subset(census, spl == FALSE)
# use the logistic regression
glm = glm(over50k ~. , data = train, family = "binomial")
summary(glm) #pr(>|z|) if it is smaller than 0.1, the variables are significant

#accuracy
glm.pred = predict(glm, newdata = test, type = "response")
table(test$over50k,glm.pred >= 0.5)

(9051+1888)/nrow(test)

#baseline accuracy of test - more frequent outcome
table(test$over50k)
9713/nrow(test)

#ROC & ACU
library(ROCR)
#Then we can generate the confusion matrix
ROCpred = prediction(glm.pred, test$over50k)
plot(performance(ROCpred,measure="tpr",x.measure="fpr"),colorize = TRUE)
as.numeric(performance(ROCpred, "auc")@y.values)

#Problem 2.1 - A CART Model
library(rpart)
library(rpart.plot)
CTree = rpart(over50k ~. , data = train, method = "class")
prp(CTree)

# accuracy of the CART model
CTree.pred = predict(CTree, newdata = test, type = "class")

table(test$over50k,CTree.pred)
(9243+1596)/nrow(test)

#use another way- generate probabilities and use a threshold of 0.5 like in logistic regression
CTree.pred1 = predict(CTree, newdata = test)
p = CTree.pred1[,2] # the column of over 50k
table(test$over50k, p) # p<=0.5 it is same with the <=50k, p>0.5 means >50k

# ROC curve for the CART model - WOW
#removing the type="class" argument when making predictions
library(ROCR)
library(arulesViz)
CTree.ROCpred = prediction(CTree.pred1[,2],test$over50k)
# plot(CTree.ROCpred) can not run
plot(performance(CTree.ROCpred,measure="tpr",x.measure="fpr"),colorize = TRUE)

# to caculate the auc
as.numeric(performance(CTree.ROCpred,"auc")@y.values)

# another way to seek for auc
CTree.ROCpred2 = prediction(p,test$over50k)
as.numeric(performance(CTree.ROCpred2,"auc")@y.values)

#Problem 3.1 - A Random Forest Model
set.seed(1)
trainSmall = train[sample(nrow(train),2000),]

set.seed(1)
library(randomForest)
RFC = randomForest(over50k ~., data = trainSmall)
RFC.pred = predict(RFC,newdata = test) #using a threshold of 0.5, no need to set the type = "class"
table(test$over50k,RFC.pred)
(9586+1093)/nrow(test) # a little difference is allowed

#compute metrics that give us insight into which variables are important.
vu = varUsed(RFC, count = TRUE)
vusorted = sort(vu, decreasing = FALSE, index.return = TRUE)
dotchart(vnsorted$x, names(RFC$forest$xlevel[vusorted$ix]))

#another way to find the important variables - impurity
varImpPlot(RFC)

# select cp by Cross-validation for the CART Trees
library(caret)
library(e1071)
set.seed(2)
#Specify that we are going to use k-fold cross validation with 10 folds:
numFolds = trainControl(method = "cv", number = 10)
#Specify the grid of cp values that we wish to evaluate:
cartGrid = expand.grid(.cp = seq(0.002,0.1,0.002))
#run the train function and view the result:
tr = train(over50k ~.,data = train, method = "rpart", trControl = numFolds, tuneGrid = cartGrid)
tr # The final value used for the model was cp = 0.002.

CTree2 = rpart(over50k ~., data = train, method = "class", cp = 0.002)
CTree2.pred = predict(CTree2, newdata = test, type = "class")
table(test$over50k, CTree2.pred)
(9178+1838)/nrow(test)
prp(CTree2) # shoould be 18 splits

[Machine Learning][The Analytics Edge][Predicting Earnings from Census Data]的更多相关文章

  1. Machine Learning for Developers

    Machine Learning for Developers Most developers these days have heard of machine learning, but when ...

  2. How do I learn machine learning?

    https://www.quora.com/How-do-I-learn-machine-learning-1?redirected_qid=6578644   How Can I Learn X? ...

  3. Course Machine Learning Note

    Machine Learning Note Introduction Introduction What is Machine Learning? Two definitions of Machine ...

  4. [C2P3] Andrew Ng - Machine Learning

    ##Advice for Applying Machine Learning Applying machine learning in practice is not always straightf ...

  5. Why The Golden Age Of Machine Learning is Just Beginning

    Why The Golden Age Of Machine Learning is Just Beginning Even though the buzz around neural networks ...

  6. Introducing: Machine Learning in R(转)

    Machine learning is a branch in computer science that studies the design of algorithms that can lear ...

  7. Azure Machine Learning

    About me In my spare time, I love learning new technologies and going to hackathons. Our hackathon p ...

  8. Getting started with machine learning in Python

    Getting started with machine learning in Python Machine learning is a field that uses algorithms to ...

  9. Google's Machine Learning Crash Course #01# Introducing ML & Framing & Fundamental terminology

    INDEX Introducing ML Framing Fundamental machine learning terminology Introducing ML What you learn ...

随机推荐

  1. Android App启动速度优化

    解决在桌面上点击APP图标后经过一两秒后才显示页面,以及App启动后主界面显示过慢问题 一.应用的启动方式 1.冷启动:当启动应用时,后台没有该应用的进程,这时系统会首先会创建一个新的进程分配给该应用 ...

  2. base64图片内容下载转为图片保存

    网页中的base64图片内容下载后,利用PIL转为图片保存 from skimage.io import imread from PIL import Image from cStringIO imp ...

  3. Js将数字转化为中文大写

    function number_chinese(str) { var num = parseFloat(str); var strOutput = "", strUnit = '仟 ...

  4. Kafka介绍与消息队列

    消息队列的好处: 消息队列(Message Queue) 消息: 网络中的两台计算机或者两个通讯设备之间传递的数据.例如说:文本.音乐.视频等内容. 队列:一种特殊的线性表(数据元素首尾相接),特殊之 ...

  5. html 设置input框的记忆功能(联想内容)

    autocomplete=“on/off” 1.默认情况下,autocomplete的值是on.你可以将其设置为off. 2.autocomplete属性可以放在input 元素上,也可以放在form ...

  6. 解决Tomcat启动时项目重复加载问题

    前几天一个同学项目要上线,部署到服务器时,因为客户需要通过IP直接可以访问到,所以在server.xml做了如下的配置 导致启动tomcat时候出现一个项目重复加载了两次,很容易就出现了内存溢出. 这 ...

  7. java.io.UnsupportedEncodingException

    启动项目抛错: java.io.UnsupportedEncodingException: 1 at java.lang.StringCoding.decode(StringCoding.java:1 ...

  8. leetcode79

    class Solution { public boolean exist(char[][] board, String word) { for(int i=0; i<board.length; ...

  9. Object.defineProperty(o,p,descriptor ) 理解应用

    1. Object.defineProperty  在一个对象上定义一个新属性,或修改一个已经存在的属性, 最终返回这个对象. var __define = this.__define || func ...

  10. UNITY 打包时提示sdk tools 或 sdk build tools版本低时可以直接点update 按钮进行更新

    如题.如果不更新,而选择 : use newest version installed ,打包到结尾时可能会报错,莫名其妙的java错误 而且,SDK一旦被更新后,其所在目录的 SDK MANAGER ...