R自动数据收集第二章HTML笔记2(主要关于htmlTreeParse函数)
function (file, ignoreBlanks = TRUE, handlers = NULL, replaceEntities = FALSE,
asText = FALSE, trim = TRUE, validate = FALSE, getDTD = TRUE,
isURL = FALSE, asTree = FALSE, addAttributeNamespaces = FALSE,
useInternalNodes = FALSE, isSchema = FALSE, fullNamespaceInfo = FALSE,
encoding = character(), useDotNames = length(grep("^\\.",
names(handlers)))>0, xinclude = TRUE, addFinalizer = TRUE,
error = htmlErrorHandler, isHTML = TRUE, options = integer(),
parentFirst = FALSE)
{
#asText=T则file作为XML文本处理
isMissingAsText = missing(asText)
#地址大于一个时要中止程序并抛出异常
if(length(file)>1){
file = paste(file, collapse ="\n")
if(!missing(asText)&&!asText)
stop(structure(list(message ="multiple URLs passed to xmlTreeParse. If this is the content of the file, specify asText = TRUE"),
class= c("MultipleURLError","XMLParserError",
"simpleError","error","condition")))
asText = TRUE
}
#当isURL非空时且是XML的时候,才修改URL为数值型
#比如这里isURL=1
if(missing(isURL)&&!asText)
isURL <- length(grep("^(http|ftp|file)://", file, useBytes = TRUE,
perl = TRUE))
#isHTML 默认为 TRUE
if(isHTML){
validate = FALSE
getDTD = FALSE
isSchema = FALSE
docClass ="HTMLInternalDocument"
}
else docClass = character()
#checkHandlerNames返回的是一个逻辑值,其作用是
checkHandlerNames(handlers,"DOM")
if(missing(fullNamespaceInfo)&& inherits(handlers,"RequiresNamespaceInfo"))
fullNamespaceInfo = TRUE
oldValidate = xmlValidity()
xmlValidity(validate)
on.exit(xmlValidity(oldValidate))
if(!asText && isURL == FALSE){
if(file.exists(file)== FALSE)
if(!missing(asText)&& asText == FALSE){
e = simpleError(paste("File", file,"does not exist"))
class(e)= c("FileNotFound",class(e))
stop(e)
}
else asText <- TRUE
}
if(asText && length(file)>1)
file = paste(file, collapse ="\n")
old = setEntitySubstitution(replaceEntities)
on.exit(setEntitySubstitution(old), add = TRUE)
if(asText && length(grep(sprintf("^%s?\\s*<",BOMRegExp),
file, perl = TRUE, useBytes = TRUE))==0){
if(!isHTML ||(isMissingAsText &&!inherits(file,"AsIs"))){
e = simpleError(paste("XML content does not seem to be XML:",
sQuote(file)))
class(e)= c("XMLInputError",class(e))
(if(isHTML)
warning
else stop)(e)
}
}
if(!is.logical(xinclude)){
xinclude =as.logical(xinclude)
}
if(!asText &&!isURL)
file = path.expand(as.character(file))
if(useInternalNodes && trim){
prevBlanks =.Call("RS_XML_setKeepBlanksDefault",0L,
PACKAGE ="XML")
on.exit(.Call("RS_XML_setKeepBlanksDefault", prevBlanks,
PACKAGE ="XML"), add = TRUE)
}
.oldErrorHandler = setXMLErrorHandler(error)
on.exit(.Call("RS_XML_setStructuredErrorHandler",.oldErrorHandler,
PACKAGE ="XML"), add = TRUE)
if(length(options))
options = sum(options)
ans <-.Call("RS_XML_ParseTree",as.character(file), handlers,
as.logical(ignoreBlanks),as.logical(replaceEntities),
as.logical(asText),as.logical(trim),as.logical(validate),
as.logical(getDTD),as.logical(isURL),as.logical(addAttributeNamespaces),
as.logical(useInternalNodes),as.logical(isHTML),as.logical(isSchema),
as.logical(fullNamespaceInfo),as.character(encoding),
as.logical(useDotNames), xinclude, error, addFinalizer,
as.integer(options),as.logical(parentFirst), PACKAGE ="XML")
if(!missing(handlers)&& length(handlers)&&!as.logical(asTree))
return(handlers)
if(!isSchema && length(class(ans)))
class(ans)= c(docClass, oldClass(class(ans)))
if(inherits(ans,"XMLInternalDocument"))
addDocFinalizer(ans, addFinalizer)
elseif(!getDTD &&!isSchema){
class(ans)= oldClass("XMLDocumentContent")
}
ans
}
<environment: namespace:XML>
#test
#导包
library(XML)
#链接
url <-"http://www.r-datacollection.com/materials/html/fortunes.html"
#handers函数
h2 <- list(
startElement = function(node,...){
name <- xmlName(node)
if(name %in% c("div","title")){NULL}else{node}
},
comment = function(node){NULL}
)
#正式开始,函数参数
file=url
ignoreBlanks = TRUE
handlers = h2
replaceEntities = FALSE
asText = FALSE
trim = TRUE
validate = FALSE
getDTD = TRUE
isURL = FALSE
asTree = TRUE
addAttributeNamespaces = FALSE
useInternalNodes = FALSE
isSchema = FALSE
fullNamespaceInfo = FALSE
encoding = character()
useDotNames = length(grep("^\\.", names(handlers)))>0
xinclude = TRUE
addFinalizer = TRUE
error = XML:::htmlErrorHandler
isHTML = TRUE
options = integer()
parentFirst = FALSE
#函数体部分
#asText参数没有传入
#我们将其从missing(asText)改为TRUE
isMissingAsText = TRUE
#file的长度是否大于1,大于1如果asText未传入则要报错:传入了多个URL
#不大于1,则跳过
if(length(file)>1){
file = paste(file, collapse ="\n")
if(!missing(asText)&&!asText)
stop(structure(list(message ="multiple URLs passed to xmlTreeParse. If this is the content of the file, specify asText = TRUE"),
class= c("MultipleURLError","XMLParserError",
"simpleError","error","condition")))
asText = TRUE
}
# 本来是if (missing(isURL) && !asText)
# 我们修改为if (TURE && !asText)
#isURL参数没有传递且asText参数为假才执行
if(TRUE &&!asText)
isURL <- length(grep("^(http|ftp|file)://", file, useBytes = TRUE,
perl = TRUE))
#只要有http|ftp|file中的一个协议开头,比如http://,就是URL了。
# 此时isURL=1
#是否为HTML,是的
if(isHTML){
validate = FALSE
getDTD = FALSE #从默认值T改为了F
isSchema = FALSE
docClass ="HTMLInternalDocument"
}else{docClass = character()}#否则创建空的
# class(docClass)
#返回TRUE通过检验了,否则函数会中止并抛出异常
XML:::checkHandlerNames(handlers,"DOM")
#fullNamespaceInfo参数whether to provide the namespace URI and prefix on each node
#其实就是是否在节点面前带上URI信息
#fullNamespaceInfo为空且handlers含有该属性,才执行这一步
#missing(fullNamespaceInfo)被替换为TRUE
#handlers并没有RequiresNamespaceInfo属性,所以不执行
if( TRUE && inherits(handlers,"RequiresNamespaceInfo"))
fullNamespaceInfo = TRUE
#以下两行的结果都是integer(0)
#奇妙~,先保存原有的配置
oldValidate = XML:::xmlValidity()
#当前的配置
XML:::xmlValidity(validate)
#还原原来的配置
on.exit(XML:::xmlValidity(oldValidate))
#asText为假,且isURL是假的
#结果为TFALSE,我们不必管他
if(!asText && isURL == FALSE){
if(file.exists(file)== FALSE)
#如果本地文件不存在
if(!missing(asText)&& asText == FALSE){
#抛出异常,文件不存在
e = simpleError(paste("File", file,"does not exist"))
class(e)= c("FileNotFound",class(e))
stop(e)
}
else asText <- TRUE
}
#此时asText是FALSE,这个跟我们无关
if(asText && length(file)>1)
file = paste(file, collapse ="\n")
#replaceEntities的默认值是FALSE
old = XML:::setEntitySubstitution(replaceEntities)
#old的值是FALSE
on.exit(XML:::setEntitySubstitution(old), add = TRUE)
#BOMRegExp是一个内置的常量吧
#看名字应该是基于BOM的正则表达式
#因为是FALSE,所以我们也先不去管它
if(asText && length(grep(
sprintf("^%s?\\s*<", XML:::BOMRegExp),
file, perl = TRUE, useBytes = TRUE
)
)==0)
{
if(!isHTML ||(isMissingAsText &&!inherits(file,"AsIs"))){
e = simpleError(paste("XML content does not seem to be XML:",
sQuote(file)))
class(e)= c("XMLInputError",class(e))
(if(isHTML)
warning
else stop)(e)
}
}
#xinclude默认值是TRUE
#以下三个if都是F,所以不管了
if(!is.logical(xinclude)){
xinclude =as.logical(xinclude)
}
if(!asText &&!isURL)
file = path.expand(as.character(file))
if(useInternalNodes && trim){
prevBlanks =.Call("RS_XML_setKeepBlanksDefault",0L,
PACKAGE ="XML")
on.exit(.Call("RS_XML_setKeepBlanksDefault", prevBlanks,
PACKAGE ="XML"), add = TRUE)
}
.oldErrorHandler = XML:::setXMLErrorHandler(error)
#所以,这种点开头的命名是什么鬼?
# class(.oldErrorHandler)
# [1] "list"
on.exit(.Call("RS_XML_setStructuredErrorHandler",.oldErrorHandler,
PACKAGE ="XML"), add = TRUE)
# length(options)是0,所以不执行
if(length(options))
options = sum(options)
#调用一个叫做RS_XML_ParseTree的函数
getAnywhere("RS_XML_ParseTree")
ans <-.Call("RS_XML_ParseTree",as.character(file), handlers,
as.logical(ignoreBlanks),as.logical(replaceEntities),
as.logical(asText),as.logical(trim),as.logical(validate),
as.logical(getDTD),as.logical(isURL),as.logical(addAttributeNamespaces),
as.logical(useInternalNodes),as.logical(isHTML),as.logical(isSchema),
as.logical(fullNamespaceInfo),as.character(encoding),
as.logical(useDotNames), xinclude, error, addFinalizer,
as.integer(options),as.logical(parentFirst), PACKAGE ="XML")
print(ans)
print("-------我是可爱的分割线------------------")
#这里的missing(handlers)我们就不改了哈
#毕竟只要有默认值,他都觉得是TRUE
#和我们的确传递了处理函数的效果TRUE是一样的
if(!missing(handlers)&& length(handlers)&&!as.logical(asTree))
return("呵呵")
if(!isSchema && length(class(ans)))
class(ans)= c(docClass, oldClass(class(ans)))
if(inherits(ans,"XMLInternalDocument")){
addDocFinalizer(ans, addFinalizer)
}elseif(!getDTD &&!isSchema){
print("看我的类型")
print(class(ans))
class(ans)= oldClass("XMLDocumentContent")
print(class(ans))
}
print("看条件判断的逻辑值")
print("1")
print(!missing(handlers)&& length(handlers)&&!as.logical(asTree))
print("2")
print(!isSchema && length(class(ans)))
print("3")
print(inherits(ans,"XMLInternalDocument"))
print("4")
print(!getDTD &&!isSchema)
print(class(ans))
ans
#例子1
testMissing<-function(a=TRUE,b=FALSE){
if(missing(b))
return("b is missing")
else"b is here "+b
}
testMissing(F)
# [1] "b is missing"
#例子2
if(missing(b))
return("b is missing")
# Error in missing(b) : 'missing' can only be used for arguments
#例子3
b=NULL
if(missing(b))
return("b is missing")
#没有输出
if(isHTML){
validate = FALSE
getDTD = FALSE #从默认值T改为了F
isSchema = FALSE
docClass ="HTMLInternalDocument"
}else{docClass = character()}#否则创建空的
}else docClass = character()
function(){
if(isHTML){
validate = FALSE
getDTD = FALSE #从默认值T改为了F
isSchema = FALSE
docClass ="HTMLInternalDocument"
}
else docClass = character()#否则创建空的
}
否则,会报错:
function (handlers, id ="SAX")
{
if(is.null(handlers)) #为空,则返回TRUE
return(TRUE)
ids = names(handlers) #取出handlers中的函数名
i = match(ids,GeneralHandlerNames) #匹配,返回逻辑值向量
prob = any(!is.na(i))#任一个回空才是TRUE
if(prob){
warning("future versions of the XML package will require names of general handler functions
to be prefixed by a . to distinguish them from handlers for nodes with those names. This _may_ affect the ",
paste(names(handlers)[!is.na(i)], collapse =", "))
}
#任意一个handler中的函数不是函数类型,则抛出异常
if(any(w <-!sapply(handlers,is.function)))
warning("some handlers are not functions: ", paste(names(handlers[w]),
collapse =", "))
#返回TRUE,后续代码继续运行
!prob
}
<environment: namespace:XML>
注释3.2关于GeneralHandlerNames属性
> XML:::GeneralHandlerNames
$SAX
[1]"text" "startElement"
[3]"endElement" "comment"
[5]"startDocument" "endDocument"
[7]"processingInstruction""entityDeclaration"
[9]"externalEntity"
$DOM
[1]"text" "startElement"
[3]"comment" "entity"
[5]"cdata" "processingInstruction"
分别有SAX和DOM两种方式,想起学Java时的DOM和SAX解析没?
> h2 <- list(
+ startElement = function(node,...){
+ name <- xmlName(node)
+ if(name %in% c("div","title")){NULL}else{node}
+ },
+ comment = function(node){NULL}
+)
> handlers<-h2
> ids = names(handlers)
> ids
[1]"startElement""comment"
> i = match(ids, XML:::GeneralHandlerNames)
> i
[1] NA NA
>?match
starting httpd help server ... done
>!is.na(i)#如果i中有NA,则返回F,没有NA则返回T
[1] FALSE FALSE
> prob = any(!is.na(i))#当i中的任意一个元素都不是NA的时候,prob才返回T
> prob
[1] FALSE
match returns a vector of the positions of (first) matches of its first argument in its second.
> testMatch<-c("a","c")
> testSet <- c('a','b','c')
> match(testMatch,testSet)
[1]13
x <-10
class(x)# "numeric"
oldClass(x)# NULL
#看了下文档,我个人觉得oldClass是S语言的余毒啊!!!
inherits(x,"a")#FALSE
class(x)<- c("a","b")
# x
# [1] 10
# attr(,"class")
# [1] "a" "b"
#即,x为10这个变量被赋予了两个class,分别名为"a"和 "b"
inherits(x,"a")#TRUE
inherits(x,"a", TRUE)# 1
inherits(x, c("a","b","c"), TRUE)# 1 2 0
#以下两行的结果都是integer(0)
#奇妙~,先保存原有的配置
oldValidate = XML:::xmlValidity()
#使用当前的配置
XML:::xmlValidity(validate)
#还原
on.exit(XML:::xmlValidity(oldValidate))
> XML:::xmlValidity()
integer(0)
> getAnywhere(xmlValidity)
A single object matching ‘xmlValidity’ was found
It was found in the following places
namespace:XML
with value
function (val = integer(0))
{
.Call("RS_XML_getDefaultValiditySetting",as.integer(val),
PACKAGE ="XML")
}
<environment: namespace:XML>
on.exit records the expression given as its argument as needing to be executed when the current function exits (either naturally or as the result of an error). This is useful for resetting graphical parameters or performing other cleanup actions.
> opar <- par(bg='lightblue')
> on.exit(par(opar))
> plot(c(1,2,3),c(4,5,6))#蓝色背景
> plot(c(1,2,3,4,5),runif(5))#蓝色背景
#此时关闭绘图窗口
> plot(c(1,2,3,4,5),rnorm(5))
#白色背景
plot_with_big_margins <- function(...)
{
old_pars <- par(mar = c(10,9,9,7))
on.exit(par(old_pars))
plot(...)
}
plot_with_big_margins(with(cars, speed, dist))
#不关闭图像窗口,此时再运行如下语句
plot(c(1,2,3),c(4,5,6))
对比
plot_with_big_margins <- function(...)
{
par(mar = c(10,9,9,7))
plot(...)
}
plot_with_big_margins(with(cars, speed, dist))
#不关闭图像窗口,此时再运行:
plot(c(1,2,3),c(4,5,6))
old_pars <- par(mar = c(10,9,9,7))
on.exit(par(old_pars))
不是要还原吗?为什么是把old_pars传递给par()?
> old_pars <- par(mar = c(10,9,9,7))
> old_pars
$mar
[1]5.14.14.12.1
> op <- options(stringsAsFactors = FALSE)
> op
$stringsAsFactors
[1] TRUE
par() #得到的
mar是5.14.14.12.1plot_with_big_margins <- function(...)
{
old_pars <- par(mar = c(10,9,9,7)) #原参数被保存,新参数设置生效
print(par())
,9,9,7
on.exit(par(old_pars)) #参数被还原为原来的参数
plot(...)
}
plot_with_big_margins(with(cars, speed, dist))
par()
#得到的
mar是5.14.14.12.1
my_plot <- function()
{
with(cars, plot(speed, dist))
}
save_base_plot <- function(plot_fn, file)
{
png(file)
on.exit(dev.off())
plot_fn()
}
save_base_plot(my_plot,"testcars.png")
#replaceEntities的默认值是FALSE
old = XML:::setEntitySubstitution(replaceEntities)
#old的值是FALSE
on.exit(XML:::setEntitySubstitution(old), add = TRUE)
这几行代码的功能是类似的
> XML:::setEntitySubstitution
function (val)
.Call("RS_XML_SubstituteEntitiesDefault",as.logical(val), PACKAGE ="XML")
<environment: namespace:XML>
.oldErrorHandler = XML:::setXMLErrorHandler(error)
#所以,这种点开头的命名是什么鬼?
# class(.oldErrorHandler)
# [1] "list"
on.exit(.Call("RS_XML_setStructuredErrorHandler",.oldErrorHandler,
PACKAGE ="XML"), add = TRUE)
> XML:::htmlErrorHandler
function (msg, code, domain, line, col, level, filename,class="XMLError")
{
e = makeXMLError(msg, code, domain, line, col, level, filename,
class)
dom = names(e$domain)
class(e)= c(names(e$code), sprintf("%s_Error", gsub("_FROM_",
"_", dom)),class(e))
if(e$code == xmlParserErrors["XML_IO_LOAD_ERROR"])
stop(e)
}
<environment: namespace:XML>
> XML:::setXMLErrorHandler
function (fun)
{
prev =.Call("RS_XML_getStructuredErrorHandler", PACKAGE ="XML")
sym = getNativeSymbolInfo("R_xmlStructuredErrorHandler",
"XML")$address
.Call("RS_XML_setStructuredErrorHandler", list(fun, sym),
PACKAGE ="XML")
prev
}
<environment: namespace:XML>
> getAnywhere(BOMRegExp)
A single object matching ‘BOMRegExp’ was found
It was found in the following places
namespace:XML
with value
[1]"(\\xEF\\xBB\\xBF|\\xFE\\xFF|\\xFF\\xFE)"
untar(download.packages(pkgs ="XML",
destdir =".",
type ="source")[,2])
附件列表
R自动数据收集第二章HTML笔记2(主要关于htmlTreeParse函数)的更多相关文章
- R自动数据收集第二章HTML笔记1(主要关于handler处理器函数和帮助文档所有示例)
本文知识点: 1潜在畸形页面使用htmlTreeParse函数 2startElement的用法 3闭包 4handler函数的命令和函数体主要写法 5节点的丢弃,取出,取出标签名称.属性.属 ...
- R自动数据收集第一章概述——《List of World Heritage in Danger》
导包 library(stringr) library(XML) library(maps) heritage_parsed <- htmlParse("http://en ...
- AS开发实战第二章学习笔记——其他
第二章学习笔记(1.19-1.22)像素Android支持的像素单位主要有px(像素).in(英寸).mm(毫米).pt(磅,1/72英寸).dp(与设备无关的显示单位).dip(就是dp).sp(用 ...
- #Spring实战第二章学习笔记————装配Bean
Spring实战第二章学习笔记----装配Bean 创建应用对象之间协作关系的行为通常称为装配(wiring).这也是依赖注入(DI)的本质. Spring配置的可选方案 当描述bean如何被装配时, ...
- CentOS6安装各种大数据软件 第二章:Linux各个软件启动命令
相关文章链接 CentOS6安装各种大数据软件 第一章:各个软件版本介绍 CentOS6安装各种大数据软件 第二章:Linux各个软件启动命令 CentOS6安装各种大数据软件 第三章:Linux基础 ...
- Machine Learning In Action 第二章学习笔记: kNN算法
本文主要记录<Machine Learning In Action>中第二章的内容.书中以两个具体实例来介绍kNN(k nearest neighbors),分别是: 约会对象预测 手写数 ...
- Day2 《机器学习》第二章学习笔记
这一章应该算是比价了理论的一章,我有些概率论基础,不过起初有些地方还是没看多大懂.其中有些公式的定义和模型误差的推导应该还是很眼熟的,就是之前在概率论课上提过的,不过有些模糊了,当时课上学得比较浅. ...
- Python核心编程第三版第二章学习笔记
第二章 网络编程 1.学习笔记 2.课后习题 答案是按照自己理解和查阅资料来的,不保证正确性.如由错误欢迎指出,谢谢 1. 套接字:A network socket is an endpoint of ...
- Linux第一章第二章学习笔记
第一章 Linux内核简介 1.1 Unix的历史 它是现存操作系统中最强大最优秀的系统. 设计简洁,在发布时提供原代码. 所有东西都被当做文件对待. Unix的内核和其他相关软件是用C语言编写而成的 ...
随机推荐
- oracle 自增长序列
create or replace TRIGGER "METTINGUSER".TRG_PREPN BEFORE INSERT ON "PREPROJFUN" ...
- auto_clipboard
黄山松发表于博客园:http://www.cnblogs.com/tomview/p/6137179.html #ifndef __HSS_AUTO_CLIPBOARD_HSS__#define __ ...
- 运行nltk示例 Resource u'tokenizers punkt english.pickle' not found解决
nltk安装完毕后,编写如下示例程序并运行,报Resource u'tokenizers/punkt/english.pickle' not found错误 import nltk sentence ...
- MPLS与LDP从入门到了解
多协议标签交换(MPLS)是一种用于快速转发数据包的技术,它的出现就是为了提高转发效率.因为IP转发大多靠软件进行,在转发的每一跳都要进行至少一次最长匹配查找,操作复杂导致转发速度比较慢.有些厂商借鉴 ...
- 代码管理工具 --- git的学习笔记一《git的个人开发》
重点摘要: 创建了一个文件后首先先通过git add . 添加到暂缓区,然后通过git commit -m "提交的名字" 提交到本地仓库,最后才可能push到远程仓库. 1. 个 ...
- weui 弹框中的单选效果
<!--性别修改弹框--> <div class="weui_dialog_alert" id="doctorSexDialog" style ...
- Node6.9.2 —— Http官网笔记整理
欢迎指导与讨论 : ) 序章 本文概要:http.Agent代理.http.ClientRequest客户端请求.http.server服务器.http.ServerResponse服务器相应.htt ...
- [原]在GeoServer中为OpenStreetMap数据设置OSM样式
转载请注明作者think8848和出处(http://think8848.cnblogs.com) 在前面几篇文章中,我们讲到了部署Postgresql,部署PostGis,部署GeoServer以及 ...
- easyui与ueditor合用问题
在联合使用easyui与ueditor的时候,当在dialog中通过href打开一个页面,页面中含有ueditor,如果使用dialog的close方法,可以理解为只是将dialog给隐藏了,再将点击 ...
- Scrum meeting 记录
本周Scrum Master 侯宇泰 一. 工作完成内容记录 · 陈双: 1. 后端数据库Azure Storage设计 2. Offline 上传Movie信息 3. 主页和配音页面的连接 · 鲍航 ...