a code snip
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.creditcloud.brick.task.CrawlTask;
import com.creditcloud.brick.task.Extractor;
import com.creditcloud.brick.task.Field;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
- @NoArgsConstructor
- @AllArgsConstructor
- @Data
- @Slf4j
- public class DataCrawler {
- String url;
- String space;
- public HashMap<String, Object> doCrawl(CrawlTask task ) {
- HashMap<String, Object> result = new HashMap<String, Object>();
- this.url = task.getUrl();
- this.space = task.getId();
- //
- String content=this.doGet(url);
- if( StringUtils.isNotEmpty(content)) {
- Extractor actor=task.getExtractor();
- if( actor != null )
- this.parse(actor, content, result);
- }
- return result;
- }
- public String doGet( String url ) {
- String data=null;
- //return Jsoup.connect(url).userAgent("Mozilla").get();
- CloseableHttpClient httpclient = HttpClients.createDefault();
- HttpGet hGet = new HttpGet(url);
- log.info(url);
- CloseableHttpResponse response = null;
- try {
- response = httpclient.execute(hGet);
- HttpEntity entity = response.getEntity();
- System.out.println(response.getStatusLine());
- log.info( response.getStatusLine().toString() );
- //
- if (entity != null) {
- System.out.println("Response content length: " + entity.getContentLength());
- data = EntityUtils.toString(entity);
- System.out.println(data);
- EntityUtils.consume(entity);
- }
- }
- catch(Exception e){
- log.error(e.getMessage());
- }
- //response
- try
- {
- if( response != null )
- response.close();
- }
- catch(Exception e) {
- log.error(e.getMessage());
- }
- //httpclient
- try {
- if( httpclient != null )
- httpclient.close();
- } catch (Exception e) {
- log.error(e.getMessage());
- }
- return data;
- }
- public String removeHtmlLabel( String input ) {
- return input.replaceAll("<[^>]+>", "").replaceAll(" "," ").trim();
- }
- //
- public ArrayList<String> match( Extractor extractor, String input ) {
- ArrayList<String> result = new ArrayList<String>();
- switch (extractor.getType()) {
- case css: //call css
- {
- Document doc = Jsoup.parse(input);
- Elements elems = doc.select(extractor.getPattern());
- for( Element elem:elems ) {
- result.add( elem.toString() );
- }
- }
- break;
- case regex: //call regex
- {
- Pattern p = Pattern.compile( extractor.getPattern());
- Matcher m = p.matcher( input );
- String matchValue = null;
- while(m.find()) {
- matchValue = StringEscapeUtils.unescapeHtml4( m.group());
- result.add(matchValue);
- }
- }
- break;
- case empty:
- result.add(input);
- break;
- }
- return result;
- }
- public void parse( Extractor extractor, String input, HashMap<String, Object> result ) {
- //1. match by css or regex
- ArrayList<String> strlist = this.match(extractor, input);
- if( strlist.isEmpty() ) {
- //result.put( extractor.getId(), null);
- return;
- }
- //2. call children extractors
- switch(extractor.getData()) {
- case array:{
- //result.setType(ResultDataType.array);
- ArrayList<HashMap<String, Object>> list = new ArrayList<HashMap<String, Object>>();
- for( String str:strlist ) {
- HashMap<String, Object> childResult = new HashMap<String, Object>();
- for( Extractor one:extractor.getChildren()) {
- this.parse(one, str, childResult);
- }
- if( childResult.isEmpty() == false )
- list.add(childResult);
- }
- if(list.isEmpty() == false )
- result.put( extractor.getId(), list );
- }
- break;
- case field:{
- for(Field fd:extractor.getFields()) {
- String val=strlist.get( fd.getIndex() );
- result.put( fd.getName(), this.removeHtmlLabel(val) );
- }
- }
- break;
- case none: {
- for( String str:strlist ) {
- for( Extractor one:extractor.getChildren()) {
- this.parse(one, str, result);
- }
- }
- }
- break;
- }
- }
- }
a code snip的更多相关文章
- CSS code snip enjoy.
<!-- information-total得是动态获取吧. --> <div class="information-mod"> <div class ...
- C# Code Snip
1.Tryf + TAB+TAB try { } finally { } 2.Prop+Tab+Tab public int MyProperty { get; set; } 3. #region + ...
- WPF整理-跨程序集访问资源
“Sometimes binary resources are defined in one assembly (typically a class library), but are needed i ...
- WPF整理-使用用户选择主题的颜色和字体
“Sometimes it's useful to use one of the selected colors or fonts the user has chosen in the Windows ...
- WPF整理-XAML访问静态属性
"XAML provides an easy way to set values of properties—type converters and the extended propert ...
- WPF整理-XAML构建后台类对象
1.XAML 接触WPF的第一眼就是XAML---XAML是用来描绘界面的.其实不然! "Actually, XAML has nothing to do with UI. It's mer ...
- Call C# in powershell
How to call C# code in powershell Powershell Command Add-Type usage of Add-Type we use Add-Type -Typ ...
- Windows Phone 8 开发必备资源
一.MVVM框架推荐 1. MVVM-Light 这个框架是我最常用的MVVM框架之一,它比Prism更轻量级,但对于一般的小应用,功能足够. 官方网站:http://mvvmlight.codepl ...
- 字符串的驻留(String Interning)
http://www.cnblogs.com/artech/archive/2007/03/04/663728.html 关于字符串的驻留的机制,对于那些了解它的人肯定会认为很简单,但是我相信会有很大 ...
随机推荐
- Entity Framework Power Tools
http://visualstudiogallery.msdn.microsoft.com/72a60b14-1581-4b9b-89f2-846072eff19d
- 深入浅出Zookeeper
能找到的一些zookeeper的资料一上来不是扯一通paxos算法就是一大坨一大坨的代码.很多人对zookeeper更多的是听过,所以这一篇文章就尝试用尽可能用精简的语言科普zookeeper. zo ...
- 第二百五十六天 how can I 坚持
今天比较闲,但是好累. 每天都会学到很多东西. 比如说,在没搞懂别人说这话之前,最好不要先表达自己的想法. 不宜妄自菲薄.不以物喜,不以己悲.hadoop. 睡觉.召生好速度啊,这么快就把我照片发给同 ...
- javaScript 类型判断
直接上例子: 1 判断是否为数组类型 2 判断是否为字符串类型 3 判断是否为数值类型 4 判断是否为日期类型 5 判断是否为函数 6 判断是否为对象 1 判断是否为数组类型 linenum < ...
- Red5实现直播
http://pxchen.iteye.com/blog/714591 发布端(Publish): var nc:NetConnection = new NetConnection(); nc.con ...
- 轻松学习 red5 教程 像视频一样很详细还有代码直接可Copy
转载自:http://blog.csdn.net/hongdianking/archive/2009/11/12/4804339.aspx 最近要做一个流媒体服务器,在网上逗留了好久决定选择 red5 ...
- python中列表,元组,字符串如何互相转换
python中有三个内建函数:列表,元组和字符串,他们之间的互相转换使用三个函数,str(),tuple()和list(),具体示例如下所示: >>> s = "xxxxx ...
- Linux的运行级别和chkconfig用法
Linux的运行级别和chkconfig用法 一.Linux的运行级别 在装MySQL的时候,才知道了Linux的运行级别这么一回事.汗…自己太水了…下面总结一下: 什么是运行级别呢?简 ...
- Activator.CreateInstance 方法 (Type) 的用法
转自:http://www.cnblogs.com/lmfeng/archive/2012/01/30/2331666.html Activator.CreateInstance 方法 (Type) ...
- 转移部分博客到CSDN之中
之前的文章一直发布在个人博客ivyxjc.xyz中, 现在将一部分博客移到csdn博客中.