  1. package com.sinitek.sirm.common.utils;
  3. import java.io.*;
  4. import java.net.URL;
  5. import java.net.URLConnection;
  6. import java.util.*;
  7. import java.util.regex.Matcher;
  8. import java.util.regex.Pattern;
  /**
   * Minimal web crawler that downloads the images referenced by one HTML page.
   *
   * <p>{@link #main} fetches the page at {@code PAGE_URL}, extracts every {@code <img>} tag,
   * resolves each tag's {@code src} attribute against {@code IMG_LUJING} and stores the images in
   * the {@code LUJING} directory. Progress is printed to standard output, failures to standard
   * error.
   */
  public class Main {

      /**
       * Address of the page to crawl. Deliberately not named {@code URL}: a field with that name
       * obscures {@link java.net.URL} wherever the name is used in an expression.
       */
      private static final String PAGE_URL = "http://www.xxx";

      /** Base address that relative {@code src} values are resolved against. */
      private static final String IMG_LUJING = "http://xxx/";

      /** Directory the downloaded images are written to. */
      private static final String LUJING = "C:/res/";

      /** Maximum time to wait for a connection to be established, in milliseconds. */
      private static final int CONNECT_TIMEOUT_MS = 10000;

      /** Maximum time a single read may block, in milliseconds. */
      private static final int READ_TIMEOUT_MS = 30000;

      /** Charset used when the server does not announce a usable one. */
      private static final String DEFAULT_CHARSET = "UTF-8";

      /**
       * Matches one complete {@code <img ...>} tag. {@code [^>]*} keeps the match inside a single
       * tag and, unlike {@code .}, also spans line breaks inside the tag.
       */
      private static final Pattern IMG_TAG_PATTERN =
              Pattern.compile("<img\\b[^>]*>", Pattern.CASE_INSENSITIVE);

      /**
       * Matches the {@code src} attribute of a tag. The value is captured in group 1
       * (double-quoted), group 2 (single-quoted) or group 3 (unquoted). The leading {@code \s}
       * prevents attributes such as {@code data-src} from matching.
       */
      private static final Pattern IMG_SRC_PATTERN = Pattern.compile(
              "\\ssrc\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))",
              Pattern.CASE_INSENSITIVE);

      /** Extracts the charset name from a {@code Content-Type} header value. */
      private static final Pattern CHARSET_PATTERN =
              Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

      /**
       * Runs the crawler once against {@code PAGE_URL}.
       *
       * @param args ignored
       */
      public static void main(String[] args) {
          try {
              Main crawler = new Main();
              // Fetch the HTML text of the page.
              String html = crawler.getHtml(PAGE_URL);
              // Extract the <img> tags.
              List<String> imgTags = crawler.getImageUrl(html);
              // Turn the tags into absolute image addresses.
              List<String> imgSrc = crawler.getImageSrc(imgTags);
              // Download the images.
              crawler.download(imgSrc);
          } catch (Exception e) {
              // Top-level boundary: report the cause instead of hiding it.
              System.err.println("发生错误: " + e);
          }
      }

      /**
       * Downloads the document at {@code url} and returns it as text, with every line terminated
       * by {@code '\n'}.
       *
       * @param url address of the page
       * @return the page content
       * @throws Exception if the address is malformed or the page cannot be read
       */
      private String getHtml(String url) throws Exception {
          URLConnection connection = new URL(url).openConnection();
          // Without timeouts an unresponsive server would block the crawler forever.
          connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
          connection.setReadTimeout(READ_TIMEOUT_MS);

          StringBuilder html = new StringBuilder();
          // try-with-resources closes the reader and the stream even when reading fails.
          try (InputStream in = connection.getInputStream();
               BufferedReader reader =
                       new BufferedReader(openReader(in, connection.getContentType()))) {
              String line;
              while ((line = reader.readLine()) != null) {
                  html.append(line).append('\n');
              }
          }
          return html.toString();
      }

      /**
       * Wraps {@code in} in a reader that decodes with the charset announced in
       * {@code contentType}, or with {@code DEFAULT_CHARSET} when none is announced or the
       * announced one is not supported by this JVM.
       */
      private static Reader openReader(InputStream in, String contentType) throws IOException {
          if (contentType != null) {
              Matcher charset = CHARSET_PATTERN.matcher(contentType);
              if (charset.find()) {
                  try {
                      return new InputStreamReader(in, charset.group(1));
                  } catch (UnsupportedEncodingException ignored) {
                      // Unknown or illegal charset name: fall through to the default below.
                  }
              }
          }
          return new InputStreamReader(in, DEFAULT_CHARSET);
      }

      /**
       * Collects every {@code <img>} tag found in {@code html}, in document order.
       *
       * @param html page content; {@code null} is treated as an empty page
       * @return the matched tags, possibly empty, never {@code null}
       */
      private List<String> getImageUrl(String html) {
          List<String> tags = new ArrayList<>();
          if (html == null) {
              return tags;
          }
          Matcher matcher = IMG_TAG_PATTERN.matcher(html);
          while (matcher.find()) {
              tags.add(matcher.group());
          }
          return tags;
      }

      /**
       * Extracts the {@code src} value of each tag and resolves it against {@code IMG_LUJING}.
       * Absolute addresses are kept as they are. Tags without a {@code src}, with an empty
       * {@code src}, or with an address this JVM cannot handle (for example {@code data:} URIs)
       * are skipped.
       *
       * @param listimageurl {@code <img>} tags as returned by {@link #getImageUrl}
       * @return absolute image addresses, possibly empty, never {@code null}
       */
      private List<String> getImageSrc(List<String> listimageurl) {
          List<String> sources = new ArrayList<>();
          if (listimageurl == null) {
              return sources;
          }

          URL base;
          try {
              base = new URL(IMG_LUJING);
          } catch (IOException e) {
              throw new IllegalStateException("Invalid base address: " + IMG_LUJING, e);
          }

          for (String tag : listimageurl) {
              if (tag == null) {
                  continue;
              }
              Matcher matcher = IMG_SRC_PATTERN.matcher(tag);
              if (!matcher.find()) {
                  continue;
              }
              // Exactly one of the three alternatives captured the value.
              String src = matcher.group(1);
              if (src == null) {
                  src = matcher.group(2);
              }
              if (src == null) {
                  src = matcher.group(3);
              }
              src = src.trim();
              if (src.isEmpty()) {
                  continue;
              }
              try {
                  // URL resolution handles relative, root-relative and absolute values alike.
                  sources.add(new URL(base, src).toString());
              } catch (IOException e) {
                  System.err.println("Skipping unsupported image address: " + src + " (" + e + ")");
              }
          }
          return sources;
      }

      /**
       * Downloads every address in {@code listImgSrc} into the {@code LUJING} directory, creating
       * the directory when necessary. A failing download is reported and does not stop the
       * remaining ones.
       *
       * @param listImgSrc absolute image addresses
       */
      private void download(List<String> listImgSrc) {
          File targetDir = new File(LUJING);
          if (!targetDir.isDirectory() && !targetDir.mkdirs()) {
              System.err.println("下载失败: cannot create directory " + targetDir);
              return;
          }

          long batchStart = System.currentTimeMillis();
          byte[] buf = new byte[8192];
          for (String url : listImgSrc) {
              long start = System.currentTimeMillis();
              try {
                  URL address = new URL(url);
                  String imageName = fileNameOf(address);
                  System.out.println("开始下载:" + url);

                  URLConnection connection = address.openConnection();
                  connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
                  connection.setReadTimeout(READ_TIMEOUT_MS);
                  try (InputStream in = connection.getInputStream();
                       OutputStream out = new FileOutputStream(new File(targetDir, imageName))) {
                      int length;
                      while ((length = in.read(buf)) != -1) {
                          out.write(buf, 0, length);
                      }
                  }

                  System.out.println(imageName + "下载完成");
                  double seconds = (System.currentTimeMillis() - start) / 1000.0;
                  System.out.println("耗时:" + seconds + "s");
              } catch (IOException e) {
                  // Report and continue: one broken image must not abort the whole batch.
                  System.err.println("下载失败: " + url + " (" + e + ")");
              }
          }
          double totalSeconds = (System.currentTimeMillis() - batchStart) / 1000.0;
          System.out.println("总耗时:" + totalSeconds + "s");
      }

      /**
       * Derives a local file name from the last path segment of {@code address}. Query string and
       * fragment are not part of the name, and characters that are path separators or illegal in
       * Windows file names are replaced by {@code '_'} so the name cannot escape the target
       * directory.
       *
       * @throws IOException if the address has no usable last path segment
       */
      private static String fileNameOf(URL address) throws IOException {
          String path = address.getPath();
          String name = path.substring(path.lastIndexOf('/') + 1)
                  .replaceAll("[\\\\:*?\"<>|]", "_");
          if (name.isEmpty() || name.equals(".") || name.equals("..")) {
              throw new IOException("no usable file name in " + address);
          }
          return name;
      }
  }



