1. import java.util.ArrayList;
  2. import java.util.HashMap;
  3. import java.util.regex.Matcher;
  4. import java.util.regex.Pattern;
  5.  
  6. import lombok.AllArgsConstructor;
  7. import lombok.Data;
  8. import lombok.NoArgsConstructor;
  9. import lombok.extern.slf4j.Slf4j;
  10.  
  11. import org.apache.commons.lang3.StringEscapeUtils;
  12. import org.apache.commons.lang3.StringUtils;
  13. import org.apache.http.HttpEntity;
  14. import org.apache.http.client.methods.CloseableHttpResponse;
  15. import org.apache.http.client.methods.HttpGet;
  16. import org.apache.http.impl.client.CloseableHttpClient;
  17. import org.apache.http.impl.client.HttpClients;
  18. import org.apache.http.util.EntityUtils;
  19. import org.jsoup.Jsoup;
  20. import org.jsoup.nodes.Document;
  21. import org.jsoup.nodes.Element;
  22. import org.jsoup.select.Elements;
  23.  
  24. import com.creditcloud.brick.task.CrawlTask;
  25. import com.creditcloud.brick.task.Extractor;
  26. import com.creditcloud.brick.task.Field;
  27.  
  28. @NoArgsConstructor
  29. @AllArgsConstructor
  30. @Data
  31. @Slf4j
  32. public class DataCrawler {
  33. String url;
  34. String space;
  35.  
  36. public HashMap<String, Object> doCrawl(CrawlTask task ) {
  37. HashMap<String, Object> result = new HashMap<String, Object>();
  38. this.url = task.getUrl();
  39. this.space = task.getId();
  40. //
  41. String content=this.doGet(url);
  42. if( StringUtils.isNotEmpty(content)) {
  43. Extractor actor=task.getExtractor();
  44. if( actor != null )
  45. this.parse(actor, content, result);
  46. }
  47. return result;
  48. }
  49.  
  50. public String doGet( String url ) {
  51. String data=null;
  52. //return Jsoup.connect(url).userAgent("Mozilla").get();
  53. CloseableHttpClient httpclient = HttpClients.createDefault();
  54. HttpGet hGet = new HttpGet(url);
  55. log.info(url);
  56. CloseableHttpResponse response = null;
  57. try {
  58. response = httpclient.execute(hGet);
  59. HttpEntity entity = response.getEntity();
  60. System.out.println(response.getStatusLine());
  61. log.info( response.getStatusLine().toString() );
  62. //
  63. if (entity != null) {
  64. System.out.println("Response content length: " + entity.getContentLength());
  65. data = EntityUtils.toString(entity);
  66. System.out.println(data);
  67. EntityUtils.consume(entity);
  68.  
  69. }
  70. }
  71. catch(Exception e){
  72. log.error(e.getMessage());
  73. }
  74. //response
  75. try
  76. {
  77. if( response != null )
  78. response.close();
  79. }
  80. catch(Exception e) {
  81. log.error(e.getMessage());
  82. }
  83. //httpclient
  84. try {
  85. if( httpclient != null )
  86. httpclient.close();
  87. } catch (Exception e) {
  88. log.error(e.getMessage());
  89. }
  90.  
  91. return data;
  92. }
  93.  
  94. public String removeHtmlLabel( String input ) {
  95. return input.replaceAll("<[^>]+>", "").replaceAll("&nbsp;"," ").trim();
  96. }
  97.  
  98. //
  99. public ArrayList<String> match( Extractor extractor, String input ) {
  100. ArrayList<String> result = new ArrayList<String>();
  101. switch (extractor.getType()) {
  102. case css: //call css
  103. {
  104. Document doc = Jsoup.parse(input);
  105. Elements elems = doc.select(extractor.getPattern());
  106. for( Element elem:elems ) {
  107. result.add( elem.toString() );
  108. }
  109. }
  110. break;
  111. case regex: //call regex
  112. {
  113. Pattern p = Pattern.compile( extractor.getPattern());
  114. Matcher m = p.matcher( input );
  115. String matchValue = null;
  116. while(m.find()) {
  117. matchValue = StringEscapeUtils.unescapeHtml4( m.group());
  118. result.add(matchValue);
  119. }
  120. }
  121. break;
  122. case empty:
  123. result.add(input);
  124. break;
  125. }
  126. return result;
  127. }
  128.  
  129. public void parse( Extractor extractor, String input, HashMap<String, Object> result ) {
  130. //1. match by css or regex
  131. ArrayList<String> strlist = this.match(extractor, input);
  132. if( strlist.isEmpty() ) {
  133. //result.put( extractor.getId(), null);
  134. return;
  135. }
  136. //2. call children extractors
  137. switch(extractor.getData()) {
  138. case array:{
  139. //result.setType(ResultDataType.array);
  140. ArrayList<HashMap<String, Object>> list = new ArrayList<HashMap<String, Object>>();
  141. for( String str:strlist ) {
  142. HashMap<String, Object> childResult = new HashMap<String, Object>();
  143. for( Extractor one:extractor.getChildren()) {
  144. this.parse(one, str, childResult);
  145. }
  146. if( childResult.isEmpty() == false )
  147. list.add(childResult);
  148. }
  149. if(list.isEmpty() == false )
  150. result.put( extractor.getId(), list );
  151. }
  152. break;
  153. case field:{
  154. for(Field fd:extractor.getFields()) {
  155. String val=strlist.get( fd.getIndex() );
  156. result.put( fd.getName(), this.removeHtmlLabel(val) );
  157. }
  158. }
  159. break;
  160. case none: {
  161. for( String str:strlist ) {
  162. for( Extractor one:extractor.getChildren()) {
  163. this.parse(one, str, result);
  164. }
  165. }
  166. }
  167. break;
  168. }
  169. }
  170. }

a code snip的更多相关文章

  1. CSS code snip enjoy.

    <!-- information-total得是动态获取吧. --> <div class="information-mod"> <div class ...

  2. C# Code Snip

    1.Tryf + TAB+TAB try { } finally { } 2.Prop+Tab+Tab public int MyProperty { get; set; } 3. #region + ...

  3. WPF整理-跨程序集访问资源

    “Sometimes binary resources are defined in one assembly (typically a class library), but are needed i ...

  4. WPF整理-使用用户选择主题的颜色和字体

    “Sometimes it's useful to use one of the selected colors or fonts the user has chosen in the Windows ...

  5. WPF整理-XAML访问静态属性

    "XAML provides an easy way to set values of properties—type converters and the extended propert ...

  6. WPF整理-XAML构建后台类对象

    1.XAML 接触WPF的第一眼就是XAML---XAML是用来描绘界面的.其实不然! "Actually, XAML has nothing to do with UI. It's mer ...

  7. Call C# in powershell

    How to call C# code in powershell Powershell Command Add-Type usage of Add-Type we use Add-Type -Typ ...

  8. Windows Phone 8 开发必备资源

    一.MVVM框架推荐 1. MVVM-Light 这个框架是我最常用的MVVM框架之一,它比Prism更轻量级,但对于一般的小应用,功能足够. 官方网站:http://mvvmlight.codepl ...

  9. 字符串的驻留(String Interning)

    http://www.cnblogs.com/artech/archive/2007/03/04/663728.html 关于字符串的驻留的机制,对于那些了解它的人肯定会认为很简单,但是我相信会有很大 ...

随机推荐

  1. Entity Framework Power Tools

    http://visualstudiogallery.msdn.microsoft.com/72a60b14-1581-4b9b-89f2-846072eff19d

  2. 深入浅出Zookeeper

    能找到的一些zookeeper的资料一上来不是扯一通paxos算法就是一大坨一大坨的代码.很多人对zookeeper更多的是听过,所以这一篇文章就尝试用尽可能用精简的语言科普zookeeper. zo ...

  3. 第二百五十六天 how can I 坚持

    今天比较闲,但是好累. 每天都会学到很多东西. 比如说,在没搞懂别人说这话之前,最好不要先表达自己的想法. 不宜妄自菲薄.不以物喜,不以己悲.hadoop. 睡觉.召生好速度啊,这么快就把我照片发给同 ...

  4. javaScript 类型判断

    直接上例子: 1 判断是否为数组类型 2 判断是否为字符串类型 3 判断是否为数值类型 4 判断是否为日期类型 5 判断是否为函数 6 判断是否为对象 1 判断是否为数组类型 linenum < ...

  5. Red5实现直播

    http://pxchen.iteye.com/blog/714591 发布端(Publish): var nc:NetConnection = new NetConnection(); nc.con ...

  6. 轻松学习 red5 教程 像视频一样很详细还有代码直接可Copy

    转载自:http://blog.csdn.net/hongdianking/archive/2009/11/12/4804339.aspx 最近要做一个流媒体服务器,在网上逗留了好久决定选择 red5 ...

  7. python中列表,元组,字符串如何互相转换

    python中有三个内建函数:列表,元组和字符串,他们之间的互相转换使用三个函数,str(),tuple()和list(),具体示例如下所示: >>> s = "xxxxx ...

  8. Linux的运行级别和chkconfig用法

    Linux的运行级别和chkconfig用法        一.Linux的运行级别 在装MySQL的时候,才知道了Linux的运行级别这么一回事.汗…自己太水了…下面总结一下: 什么是运行级别呢?简 ...

  9. Activator.CreateInstance 方法 (Type) 的用法

    转自:http://www.cnblogs.com/lmfeng/archive/2012/01/30/2331666.html Activator.CreateInstance 方法 (Type) ...

  10. 转移部分博客到CSDN之中

    之前的文章一直发布在个人博客ivyxjc.xyz中, 现在将一部分博客移到csdn博客中.