使用Jsoup 爬取网易首页所有的图片

package com.enation.newtest;
 
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringEscapeUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
 
// 爬取网易首页所有图片
public class Jsoup163 {
 
    public static void main(String[] args) throws Exception{
        String downloadPath = "D:\\360Downloads\\test";
        List<String> list = nameList("网易--首页");
        getPictures(list,1,downloadPath); //1代表下载一页，一页一般有30张图片
    }
 
    public static void getPictures(List<String> keywordList, int max,String downloadPath) throws Exception{ // key为关键词,max作为爬取的页数
        String gsm=Integer.toHexString(max)+"";
        String finalURL = "";
        String tempPath = "";
        for(String keyword : keywordList){
            tempPath = downloadPath;
            if(!tempPath.endsWith("\\")){
                       tempPath = downloadPath+"\\";
            }
            tempPath = tempPath+keyword+"\\";
            File f = new File(tempPath);
            if(!f.exists()){
                f.mkdirs();
            }
            int picCount = 1;
            for(int page=1;page<=max;page++) {
                sop("正在下载第"+page+"页面");
                Document document = null;
                try {
                    String url ="http://www.163.com/";
                    sop(url);
                    document = Jsoup.connect(url).data("query", "Java")//请求参数
                             .userAgent("Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)")//设置urer-agent  get();
                             .timeout(5000)
                             .get();
                    String xmlSource = document.toString();
                    xmlSource = StringEscapeUtils.unescapeHtml3(xmlSource);
                    //sop(xmlSource);
                    String reg = "<img.*src=(.*?)[^>]*?>";
                    String reg2 = "src\\s*=\\s*\"?(.*?)(\"|>|\\s+)";
                    String reg2datasrc = "data-src\\s*=\\s*\"?(.*?)(\"|>|\\s+)";
 
                    Pattern pattern = Pattern.compile(reg);
                    Pattern pattern2 = Pattern.compile(reg2);
                    Pattern pattern2datasrc = Pattern.compile(reg2datasrc);
 
                    Matcher m = pattern.matcher(xmlSource);
                    while (m.find()){
                        finalURL = m.group();
                        System.out.println(finalURL);
                        Matcher m2 = null;
                        if(finalURL.indexOf("data-src")>0){
                            m2 = pattern2datasrc.matcher(finalURL);
                        }else {
                            m2 = pattern2.matcher(finalURL);
                        }
                        if(m2.find()){
                            finalURL = m2.group(1);
                            System.out.println(finalURL);
                            if(finalURL.startsWith("http")){
                                sop(keyword+picCount+++":"+finalURL);
                                download(finalURL,tempPath);
                                sop("             下载成功");
                            }
                        }
                    }
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
       }
       sop("下载完毕");
       delMultyFile(downloadPath);
       sop("已经删除所有空图");
    }
    public static void delMultyFile(String path){
        File file = new File(path);
        if(!file.exists())
            throw new RuntimeException("File \""+path+"\" NotFound when excute the method of delMultyFile()....");
        File[] fileList = file.listFiles();
        File tempFile=null;
        for(File f : fileList){
            if(f.isDirectory()){
                delMultyFile(f.getAbsolutePath());
            }else{
                if(f.length()==0)
                    sop(f.delete()+"---"+f.getName());
            }
        }
    }
    public static List<String> nameList(String nameList){
        List<String> arr = new ArrayList<String>();
        String[] list;
        if(nameList.contains(","))
            list= nameList.split(",");
        else if(nameList.contains("、"))
            list= nameList.split("、");
        else if(nameList.contains(" "))
            list= nameList.split(" ");
        else{
            arr.add(nameList);
            return arr;
        }
        for(String s : list){
            arr.add(s);
        }
        return arr;
    }
    public static void sop(Object obj){
        System.out.println(obj);
    }
    //根据图片网络地址下载图片
      public static void download(String url,String path){
          //path = path.substring(0,path.length()-2);
          File file= null;
          File dirFile=null;
          FileOutputStream fos=null;
          HttpURLConnection httpCon = null;
          URLConnection  con = null;
          URL urlObj=null;
          InputStream in =null;
          byte[] size = new byte[1024];
          int num=0;
          try {
              String downloadName= url.substring(url.lastIndexOf("/")+1);
              dirFile = new File(path);
              if(!dirFile.exists() && path.length()>0){
                  if(dirFile.mkdir()){
                      sop("creat document file \""+path.substring(0,path.length()-1)+"\" success...\n");
                  }
              }else{
                  file = new File(path+downloadName);
                  fos = new FileOutputStream(file);
                  if(url.startsWith("http")){
                      urlObj = new URL(url);
                      con = urlObj.openConnection();
                      httpCon =(HttpURLConnection) con;
                      int  responseCode = httpCon.getResponseCode();
                      if(responseCode == 200){
                          in = httpCon.getInputStream();
                          while((num=in.read(size)) != -1){
                              for(int i=0;i<num;i++)
                                  fos.write(size[i]);
                          }
                      }else {
                        System.out.println("状态码："+responseCode+" 地址："+url);
                    }
                  }
              }
          }catch (FileNotFoundException notFoundE) {
              sop("找不到该网络图片....");
          }catch(NullPointerException nullPointerE){
              sop("找不到该网络图片....");
          }catch(IOException ioE){
              sop("产生IO异常.....");
          }catch (Exception e) {
              e.printStackTrace();
          }finally{
              try {
                  if(fos!=null){
                      fos.close();
                  }
              } catch (Exception e) {
                  e.printStackTrace();
              }
          }
      }
}

其中，关键点在于两步：先找出页面中所有图片的 img 标签，再从标签中取出图片的链接地址。若直接对 HTML 文本做匹配，可以使用下面两个正则表达式：

String reg = "<img.*src=(.*?)[^>]*?>";
String reg2 = "src\\s*=\\s*\"?(.*?)(\"|>|\\s+)";

运行结果：

使用Jsoup 爬取网易首页所有的图片的更多相关文章

Jsoup爬取带登录验证码的网站
今天学完爬虫之后想着爬一下我们学校的教务系统,可是发现登录的时候有验证码.因此研究了Jsoup爬取带验证码的网站: 大体的思路是:(需要注意的是__VIEWSTATE一直变化,所以我们每个页面都需要重 ...
Python爬虫实战教程：爬取网易新闻
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 作者: Amauri PS:如有需要Python学习资料的小伙伴可以加点击 ...
如何利用python爬取网易新闻
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. 作者: LSGOGroup PS:如有需要Python学习资料的小伙伴可以 ...
jsoup爬取某网站安全数据
jsoup爬取某网站安全数据 package com.vfsd.net; import java.io.IOException; import java.sql.SQLException; impor ...
Python爬虫实战教程：爬取网易新闻；爬虫精选高手技巧
前言本文的文字及图片来源于网络,仅供学习.交流使用,不具有任何商业用途,版权归原作者所有,如有问题请及时联系我们以作处理. stars声明很多小伙伴学习Python过程中会遇到各种烦恼问题解决不了.为 ...
初识python 之爬虫：爬取某网站的壁纸图片
用到的主要知识点:requests.get 获取网页HTMLetree.HTML 使用lxml解析器解析网页xpath 使用xpath获取网页标签信息.图片地址request.urlretrieve ...
python连续爬取多个网页的图片分别保存到不同的文件夹
python连续爬取多个网页的图片分别保存到不同的文件夹作者:vpoet mail:vpoet_sir@163.com #coding:utf-8 import urllib import ur ...
Python爬取贴吧中的图片
#看到贴吧大佬在发图,准备盗一下 #只是爬取一个帖子中的图片 1.先新建一个scrapy项目 scrapy startproject TuBaEx 2.新建一个爬虫 scrapy genspider ...
Python 爬取煎蛋网妹子图片
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Date : 2017-08-24 10:17:28 # @Author : EnderZhou (z ...

随机推荐

boxes
boxes [英][bɒksɪz][美][bɑ:ksɪz] n.盒( box的名词复数 ); 一盒; 电视; 小亭; v.把…装入盒[箱,匣]中( box的第三人称单数 ); 拳击; 以上结果来自 ...
Java核心知识点学习----多线程中的阻塞队列,ArrayBlockingQueue介绍
1.什么是阻塞队列? 所谓队列,遵循的是先进先出原则(FIFO),阻塞队列,即是数据共享时,A在写数据时,B想读同一数据,那么就将发生阻塞了. 看一下线程的四种状态,首先是新创建一个线程,然后,通过s ...
PHP declare(ticks=N); 的作用
一般用法是 declare(ticks=N);拿declare(ticks=1)来说,这句主要作用有两种: 1.Zend引擎每执行1条低级语句就去执行一次 register_tick_function ...
LeetCode(五)
Minimum Depth of Binary Tree public class Solution { public int minDepth(TreeNode root) { if(root==n ...
linux系统的目录结构
前言对于每一个Linux学习者来说,了解Linux文件系统的目录结构,是学好Linux的至关重要的一步.深入了解linux文件目录结构的标准和每个目录的详细功能,对于我们用好linux系统至关重要 ...
NSArray其中的方法--遍历,
1. ForLoop, For - in, enumerateObjects这个三个方法的区别: 遍历一个数组用For-in最快. 通过Value查询index的时候, 面对大量的数组推荐使用 enu ...
CoreLocation框架的使用---定位,求两地距离
前言: 在iOS开发中,有关导航,周边的开发,必须基于2个框架: Map Kit :用于地图展示 Core Location :用于地理定位用户隐私的保护从iOS 6开始,苹果在保护用户隐私方 ...
linux里添加locate命令
在linux里使用和find一样的功能例如 find -name xx 可以yum install mlocate 然后 updatedb 再使用locate xx 来查找xx文件
MySQL之CAST与CONVERT 函数的用法
两者具体的语法如下:CAST(value as type); CONVERT(value, type); 可以转换的类型是有限制的.这个类型可以是以下值其中的一个: 二进制,同带binary前缀的效果 ...
eclipse里面构建maven项目详解(转载)
本文来源于:http://my.oschina.net/u/1540325/blog/548530 eclipse里面构建maven项目详解 1 环境安装及分配 Maven是基于项目对象模 ...

使用Jsoup 爬取网易首页所有的图片

使用Jsoup 爬取网易首页所有的图片的更多相关文章

随机推荐

热门专题