/**
 * Builds a new HtmlUnit {@link WebClient} configured for crawling.
 *
 * <p>The client emulates Firefox 24, uses a timeout of 20000 (milliseconds in HtmlUnit's
 * options API), has CSS and JavaScript processing turned off, and does not throw on
 * failing HTTP status codes or on script errors. A fixed set of browser-like request
 * headers is registered on the client.
 *
 * @return the newly created, configured client
 */
private WebClient getAWebClient() {
    WebClient webClient = new WebClient(BrowserVersion.FIREFOX_24);
    webClient.getOptions().setTimeout(20000);
    // webClient.getCookieManager().setCookiesEnabled(true);
    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    webClient.getOptions().setCssEnabled(false);
    webClient.getOptions().setJavaScriptEnabled(false);

    // "text/html" was previously misspelled as "textml", which is not a valid media type.
    webClient.addRequestHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    webClient.addRequestHeader("Accept-Encoding", "gzip, deflate");
    webClient.addRequestHeader("Accept-Language", "en-US,en;q=0.5");
    webClient.addRequestHeader("Cache-Control", "max-age=0");
    webClient.addRequestHeader("Connection", "keep-alive");
    // NOTE(review): this Host value is registered on the client itself, so it presumably
    // accompanies every request regardless of the target URL - confirm that only
    // www.amazon.com pages are fetched with this client.
    webClient.addRequestHeader("Host", "www.amazon.com");
    webClient.addRequestHeader("User-Agent", "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0");
    return webClient;
}
/**
 * Crawls a single web page and returns its XML serialization.
 *
 * <p>Before fetching, the cookies of {@code webClient} are cleared and, when the file
 * named by {@code cookiePathAppendRandom()} exists, replaced by the cookies stored in
 * that file. The page is then fetched through {@code MyGetPage}.
 *
 * <p>When no page is obtained, or the serialized page is 300 characters or shorter, the
 * URL is handed to {@code exceptionFun} and a builder holding the sentinel text
 * {@code "getNullPage"} is returned instead of page content.
 *
 * @param url address of the page to crawl
 * @return the page XML, or a builder containing {@code "getNullPage"} on failure
 */
public StringBuilder crawlPage(String url) {
    final String threadName = Thread.currentThread().getName();
    logger.info(threadName + " crawl " + url);

    // Start from a clean cookie jar, then restore previously saved cookies if there are any.
    webClient.getCookieManager().clearCookies();
    logger.info(threadName + " webClient.getCookieManager().clearCookies();");
    File file = new File(cookiePathAppendRandom());
    logger.info(threadName + " File file = new File(cookiePathAppendRandom());");
    if (file.exists()) {
        loadSavedCookies(file);
    }

    logger.info(threadName + " MyGetPage start,url:" + url);
    HtmlPage page = MyGetPage(new StringBuffer(url));
    logger.info(threadName + " MyGetPage end,url:" + url);
    if (page == null) {
        // Models that failed during crawling could be collected in one list and sent back
        // to the server to be re-added to the crawl assignment queue.
        logger.info("Page null!");
        AmazonCrawlModel model = new AmazonCrawlModel(crawlId, crawlURLId, url, depth, ischange);
        exceptionFun(model);
        return new StringBuilder("getNullPage");
    }

    logger.info(threadName + " builder.append(page.asXml());");
    StringBuilder builder = new StringBuilder();
    builder.append(page.asXml());
    logger.info(threadName + " return builder;");
    logger.info(threadName + " CrawlPage $Length=" + builder.length());

    // A result this short is treated the same way as a missing page.
    if (builder.length() <= 300) {
        AmazonCrawlModel model = new AmazonCrawlModel(crawlId, crawlURLId, url, depth, ischange);
        exceptionFun(model);
        return new StringBuilder("getNullPage");
    }
    return builder;
}

/**
 * Reads the serialized cookie store from {@code cookieFile} and copies each of its
 * cookies into the cookie manager of {@code webClient}.
 *
 * <p>If the file cannot be opened or deserialized the error is logged and no cookies are
 * added; the streams are closed in every case.
 *
 * @param cookieFile file expected to hold a serialized {@code CookieStore}
 */
private void loadSavedCookies(File cookieFile) {
    // NOTE(review): this uses Java native deserialization; the cookie file must only ever
    // come from a trusted location.
    CookieStore cookieStore = null;
    FileInputStream fin = null;
    ObjectInputStream in = null;
    try {
        fin = new FileInputStream(cookieFile);
        in = new ObjectInputStream(fin);
        cookieStore = (CookieStore) in.readObject();
    } catch (IOException e) {
        // Also covers FileNotFoundException when the file vanished after the exists() check.
        logger.error(e);
    } catch (ClassNotFoundException e) {
        logger.error(e);
    } finally {
        try {
            if (in != null) {
                in.close();
            } else if (fin != null) {
                // The object stream was never created, so close the raw stream directly.
                fin.close();
            }
        } catch (IOException e) {
            logger.error(e);
        }
    }

    if (cookieStore == null) {
        // Nothing could be read; continue without restored cookies.
        return;
    }
    List<org.apache.http.cookie.Cookie> stored = cookieStore.getCookies();
    for (org.apache.http.cookie.Cookie temp : stored) {
        Cookie cookie = new Cookie(temp.getDomain(), temp.getName(), temp.getValue(), temp.getPath(),
                temp.getExpiryDate(), false);
        webClient.getCookieManager().addCookie(cookie);
    }
}
/**
 * Custom page fetch: when a captcha ("Robot Check") page is returned, the captcha is
 * recognized and submitted repeatedly until a non-captcha page comes back.
 *
 * <p>Failures are retried with a pause between attempts. {@code null} is returned when:
 * <ul>
 *   <li>the URL is malformed (retrying cannot help because the URL is not changed here);</li>
 *   <li>unknown-host failures exceed the configured {@code unknowhost_maxtrytime};</li>
 *   <li>the counter of other failures, which starts at 1, exceeds 5;</li>
 *   <li>the thread is interrupted while waiting after a failed attempt.</li>
 * </ul>
 * If the thread is interrupted while pausing after a successful fetch, the fetched page
 * is returned immediately. In both interrupt cases the interrupt status is restored.
 *
 * @param URL address to fetch; it is only read in this method
 * @return the fetched page, or {@code null} under the conditions listed above
 */
private HtmlPage MyGetPage(StringBuffer URL) {
    // Upper bound on consecutive failed clicks of the captcha submit button; once reached,
    // the failure is handed to the general retry handling below instead of spinning forever.
    final int maxClickAttempts = 5;
    final String threadName = Thread.currentThread().getName();

    HtmlPage page = null;
    boolean flag = true; // true while another fetch attempt is required
    int TryTimeCnt = 1;
    int UnknowHostTryTimeCnt = 1;
    while (flag) {
        flag = false;
        try {
            logger.info(threadName + " webClient.getPage : " + URL + ",CrawlURL_id:" + crawlURLId);
            page = webClient.getPage(URL.toString());
            Document doc = Jsoup.parse(page.asXml());
            int robotchecknum = 1;
            // NOTE(review): this loop has no upper bound; it ends only once the page title
            // is no longer "Robot Check" or an exception escapes to the handlers below.
            while (doc.select("title").text().equals("Robot Check")) {
                logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                        + " [Robot Check,URL:" + URL + "]");
                String captcha_str = AmazonGetCaptcha.GetCaptcha(new StringBuilder(doc.toString()));
                logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                        + " end AmazonGetCaptcha.GetCaptcha");
                logger.info(dayformat1.format(new Date()) + " " + threadName + " : " + captcha_str);

                logger.info(threadName + " page.getForms().get(0) Start");
                HtmlForm form = page.getForms().get(0);
                logger.info(threadName + " page.getForms().get(0) End");

                logger.info(threadName + " form.getElementsByTagName(button).get(0) Start");
                HtmlButton button = (HtmlButton) form.getElementsByTagName("button").get(0);
                logger.info(threadName + " form.getElementsByTagName(button).get(0) End");

                logger.info(threadName + " setValueAttribute Start");
                form.getInputByName("field-keywords").setValueAttribute(captcha_str);
                logger.info(threadName + " setValueAttribute End");

                logger.info(threadName + " button.click Start");
                int clickAttempts = 0;
                boolean click_flag = false;
                while (!click_flag) {
                    try {
                        page = button.click();
                        click_flag = true;
                    } catch (Exception e1) {
                        logger.error(threadName + " button.click出错了: " + e1);
                        clickAttempts++;
                        if (clickAttempts >= maxClickAttempts) {
                            // Give up on this form and let the outer handlers decide.
                            throw e1;
                        }
                    }
                }
                logger.info(threadName + " button.click end");

                String xml = page.asXml();
                while (xml == null) {
                    logger.info(threadName + " page xml null");
                    logger.info(threadName + " " + xml);
                    // Keep the refreshed page; discarding it would leave this loop unable
                    // to ever observe a different result.
                    page = (HtmlPage) page.refresh();
                    logger.info(threadName + " refresh End!");
                    xml = page.asXml();
                }
                logger.info(threadName + " button.click End");
                logger.info(threadName + " Start ParsePage!");
                doc = Jsoup.parse(xml);
                String title = doc.select("title").text();
                if (!title.equals("Robot Check")) {
                    logger.info(threadName + " " + title);
                    logger.info(threadName + " "
                            + dayformat1.format(System.currentTimeMillis()) + " [Robot Check,captcha success:"
                            + captcha_str + ",try num:" + robotchecknum + "]");
                }
                robotchecknum++;
            }
        } catch (MalformedURLException e) {
            // The URL is not modified inside this loop, so a retry would fail identically.
            logger.error(threadName + " " + e);
            return null;
        } catch (UnknownHostException e) {
            logger.error(threadName + " " + e);
            flag = true;
            logger.info("found UnknownHostException,start sleep 20 min");
            try {
                // Sleep duration comes from configuration, in minutes; computed as long to
                // avoid int overflow for large values.
                Thread.sleep(60000L * Integer.parseInt(Configuration.getProperties("unknowhost_sleeptime")));
            } catch (InterruptedException e1) {
                logger.error(threadName + " " + e1);
                Thread.currentThread().interrupt();
                return null;
            }
            logger.info("found UnknownHostException,end sleep 20 min");
            UnknowHostTryTimeCnt++; // one more failed access
            logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                    + " [UnknowHostTryTimeCnt:" + UnknowHostTryTimeCnt + "]");
            if (UnknowHostTryTimeCnt > Integer.parseInt(Configuration.getProperties("unknowhost_maxtrytime"))) {
                return null;
            }
        } catch (Exception eq) {
            // Every other failure, including failing HTTP status codes, shares one bounded
            // retry budget.
            logger.error(threadName + " " + eq);
            TryTimeCnt++; // one more failed access
            logger.info(threadName + " " + dayformat1.format(System.currentTimeMillis())
                    + " [TryTimeCnt:" + TryTimeCnt + "]");
            if (TryTimeCnt > 5) {
                return null;
            }
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                logger.error(threadName + e);
                Thread.currentThread().interrupt();
                return null;
            }
            flag = true;
        }

        // Randomized pause of 1500-1999 ms after every attempt.
        try {
            Thread.sleep(random.nextInt(500) + 1500);
        } catch (InterruptedException e) {
            logger.error(threadName + e);
            Thread.currentThread().interrupt();
            // flag is still false only when the attempt above succeeded.
            return flag ? null : page;
        }
    }
    return page;
}

Java WebClient 总结的更多相关文章

  1. Spark案例分析

    一.需求:计算网页访问量前三名 import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} /* ...

  2. C#调用JAVA接口WSSE方式用WebClient方式

    C#读取JAVA的WSSE接口的调用代码: 用webclient 方式: /// <summary> /// 调用java cxf ws_security加密的服务wcf客户端对应的加密类 ...

  3. websocket通信 实现java模拟一个client与webclient通信

    发文原由: 熟悉socket通信的同学,对于socket模拟server与client,实现相互通信, 或者使用websocket与java模拟的websocket服务器通信(比如一个聊天室),对于这 ...

  4. Java调用Http/Https接口(7,end)--WebClient调用Http/Https接口

    WebClient是Spring提供的非阻塞.响应式的Http客户端,提供同步及异步的API,将会代替RestTemplate及AsyncRestTemplate.文中所使用到的软件版本:Java 1 ...

  5. java.lang.UnsupportedClassVersionError: com/gargoylesoftware/htmlunit/WebClient : Unsupported major.minor version 52.0 (unable to load class com.gargoylesoftware.htmlunit.WebClient)

    java.lang.UnsupportedClassVersionError: com/gargoylesoftware/htmlunit/WebClient : Unsupported major. ...

  6. htmlunit学习之java.lang.NoSuchMethodError: com.gargoylesoftware.htmlunit.WebClient.getOptions()Lcom/gargoylesoftware/htmlunit/WebClientOptions;

    运行到这里就报错 java.lang.NoSuchMethodError: com.gargoylesoftware.htmlunit.WebClient.getOptions()Lcom/gargo ...

  7. webClient请求JAVA超时解决方案

    private class MyWebClient: WebClient { protected override WebRequest GetWebRequest(Uri uri) { WebReq ...

  8. C#、JAVA操作Hadoop(HDFS、Map/Reduce)真实过程概述。组件、源码下载。无法解决:Response status code does not indicate success: 500。

    一.Hadoop环境配置概述 三台虚拟机,操作系统为:Ubuntu 16.04. Hadoop版本:2.7.2 NameNode:192.168.72.132 DataNode:192.168.72. ...

  9. Atitit.http httpclient实践java c# .net php attilax总结

    Atitit.http httpclient实践java c# .net php attilax总结 1. Navtree>> net .http1 2. Httpclient理论1 2. ...

随机推荐

  1. Java字节流与字符流基本操作

    在程序中所有的数据都是以流的方式进行传输或保存的,程序需要数据时要使用输入流读取数据,而当程序需要将一些数据保存起来时,就要使用输出流. 在java.io包中流的操作主要有字节流.字符流两大类,两类都 ...

  2. MySQL学习笔记——基本语法

    SQL——结构化查询语言(Structured Query Language) 1> SQL语言不区分大小写,建议关键字用大写,但是字符串常量区分大小写 2> SQL注释:/**/多行注释 ...

  3. TortoiseSVN的相关问题

    图标覆盖问题: 创建一个保存文档的目录,创建一个工作目录,工作目录创建后,没有显示覆盖图标.解决办法如下: 找到设置,在工作目录下右键 打开界面如下: 从包含路径中设置就可以. 目录检出后把它从检出状 ...

  4. clipboard复制剪贴板功能,以及用requirejs时报错---Uncaught ReferenceError: Clipboard is not defined

    zeroclipboard是走的flash插件,手机浏览器是不支持的,所以不得不舍弃之,用clipboard,clipboard不需要flash就可以完成复制剪切等功能,而且可以兼容pc,移动端,下面 ...

  5. js中,还真不了解 console

    参考链接: https://segmentfault.com/a/1190000000481884

  6. CentOS安装wordpress权限问题

    最近在CentOS6.5上安装wordpress,遇上一个问题,安装好之后外网总是不能再网页进行配置,想了半天应该是源代码文件的权限问题,具体问题与解决如下: 如果你的wordpress安装目录是wo ...

  7. Orchard源码分析(4.1):Orchard.Environment.CollectionOrderModule类

    CollectionOrderModule类是一个Autofac模块(Module,将一系列组件和相关的功能包装在一起),而非Orchard模块.其作用是保证多个注册到容器的组件能按FIFO(Firs ...

  8. Code First 关系 Fluent API

    通过实体框架 Code First,可以使用您自己的域类表示 EF 执行查询.更改跟踪和更新函数所依赖的模型.Code First 利用称为“约定先于配置”的编程模式.这意味着 Code First ...

  9. PHP与Javascript的混合测试

    js调用php <?php $num=88; ?> <script> var a = <?php echo $num;?>; alert(a); </scri ...

  10. System.nanoTime与System.currentTimeMillis的区别

    平时产生随机数时我们经常拿时间做种子,比如用 System.currentTimeMillis的结果,但是在执行一些循环中使用了System.currentTimeMillis,那么每次的结果将会差 ...