  import java.io.BufferedOutputStream;
  import java.io.File;
  import java.io.FileOutputStream;
  import java.io.IOException;

  import org.apache.http.HttpEntity;
  import org.apache.http.HttpResponse;
  import org.apache.http.HttpStatus;
  import org.apache.http.client.HttpClient;
  import org.apache.http.client.methods.HttpGet;
  import org.apache.http.impl.client.DefaultHttpClient;
  import org.apache.http.util.EntityUtils;
  import org.jsoup.Jsoup;
  import org.jsoup.nodes.Document;
  import org.jsoup.nodes.Element;
  import org.jsoup.select.Elements;

  17. public class JsoupParseHtml {
  19. public static String getHtmlByUrl(String url){
  20. String html = null;
  21. //创建httpClient对象
  22. HttpClient httpClient = new DefaultHttpClient();
  23. //以get方式请求该URL
  24. HttpGet httpget = new HttpGet(url);
  25. try {
  26. //得到responce对象
  27. HttpResponse responce = httpClient.execute(httpget);
  28. //返回码
  29. int resStatu = responce.getStatusLine().getStatusCode();
  30. //200正常 其他就不对
  31. if (resStatu==HttpStatus.SC_OK) {
  32. //获得相应实体
  33. HttpEntity entity = responce.getEntity();
  34. if (entity!=null) {
  35. //获得html源代码
  36. html = EntityUtils.toString(entity);
  38. }
  39. }
  40. } catch (Exception e) {
  41. System.out.println("访问【"+url+"】出现异常!");
  42. e.printStackTrace();
  43. } finally {
  44. httpClient.getConnectionManager().shutdown();
  45. }
  46. return html;
  47. }
  49. static String txtpathstr="d:\\one\\";
  51. public static void main(String[] args) throws Exception {
  53. String contents="";
  54. String urlbase="http://localhost:8080/1.htm";
  56. //String urlbase="http://www.qiushibaike.com/8hr/page/8?s=4513032";//1?s=4513032
  57. contents+=gettxtlist(urlbase);
  59. //写入文件
  60. writefile(contents);
  62. }
  64. public static String gettxtlist(String txturl) throws Exception{
  66. String content="";
  67. Document doc=jsoupconnect(txturl,360000);
  68. //Elements els= doc.select("div.content");
  70. Elements els= doc.select("html");
  72. for(Element el:els){
  73. if (el.select("body").size()>1){
  74. continue;
  75. }
  76. content+=el.text()+"\r\n";
  77. System.out.println();
  78. System.out.println(content);
  79. }
  80. return content;
  81. }
  83. public static Document jsoupconnect (String url,int timeout){
  84. Document doc=null;
  85. int retry=5;
  86. while (null==doc&&retry>0){
  87. retry--;
  88. try{
  89. doc= Jsoup.connect(url).userAgent("Mozilla/5.0 (Windows NT 6.1; rv:5.0)").timeout(timeout).get();
  90. }catch(Exception e){
  91. e.printStackTrace();
  93. }
  94. }
  95. return doc;
  96. }
  98. public static void writefile(String txtstr)throws Exception{
  99. File txtpath=new File(txtpathstr);
  100. if (!txtpath.exists()){
  101. txtpath.mkdirs();
  102. }
  103. File htxt=new File(txtpathstr+"test.txt");
  104. BufferedOutputStream outBuff = new BufferedOutputStream(new FileOutputStream(htxt));
  105. outBuff.write(txtstr.getBytes());
  106. outBuff.flush();
  107. outBuff.close();
  108. }
  110. }



