新浪新闻页面抓取(JAVA-Jsoup)
1、使用gradle建立工程:
工程格式如下:
- include ':spider-demo'
- rootProject.name = 'my-spider-demo'
settings.gradle
- def void forceVersion(details, group, version) {
- if (details.requested.group == group) {
- details.useVersion version
- }
- }
- def void forceVersion(details, group, name, version) {
- if (details.requested.group == group && details.requested.name == name) {
- details.useVersion version
- }
- }
- allprojects { p ->
- group = 'com.my.spider'
- version = '1.0.0'
- apply plugin: 'java'
- apply plugin: 'maven'
- apply plugin: 'maven-publish'
- [compileJava, compileTestJava]*.options*.encoding = 'UTF-8'
- jar.doFirst {
- manifest {
- def manifestFile = "${projectDir}/META-INF/MANIFEST.MF"
- if (new File(manifestFile).exists())
- from (manifestFile)
- attributes 'Implementation-Title':p.name
- if (p.version.endsWith('-SNAPSHOT')) {
- attributes 'Implementation-Version': p.version + '-' + p.ext.Timestamp
- } else {
- attributes 'Implementation-Version': p.version
- }
- attributes 'Implementation-BuildDateTime':new Date()
- }
- }
- javadoc {
- options {
- encoding 'UTF-8'
- charSet 'UTF-8'
- author false
- version true
- links 'http://docs.oracle.com/javase/8/docs/api/index.html'
- memberLevel = org.gradle.external.javadoc.JavadocMemberLevel.PRIVATE
- }
- }
- if (p.name.endsWith('-api')){
- task sourcesJar(type:Jar, dependsOn:classes) {
- classifier = 'sources'
- from sourceSets.main.allSource
- }
- task javadocJar(type:Jar, dependsOn:javadoc) {
- classifier = 'javadoc'
- from javadoc.destinationDir
- }
- }
- publishing {
- repositories {
- maven {
- credentials {
- username "${repositoryUploadUsername}"
- password "${repositoryUploadPassword}"
- }
- if (version.endsWith('-SNAPSHOT')) {
- url "${repositoryUploadSnapshotUrl}"
- } else {
- url "${repositoryUploadReleaseUrl}"
- }
- }
- }
- publications {
- mavenJava(MavenPublication) {
- from components.java
- // 只有*-api才会需要发布sources和javadoc
- if (p.name.endsWith('-api')){
- artifact sourcesJar {
- classifier "sources"
- }
- artifact javadocJar {
- classifier "javadoc"
- }
- }
- }
- }
- }
- if (System.env.uploadArchives) {
- build.dependsOn publish
- }
- buildscript {
- repositories {
- maven {
- name 'Maven Repository'
- url "${repositoryMavenUrl}"
- credentials {
- username "${repositoryUsername}"
- password "${repositoryPassword}"
- }
- }
- }
- dependencies {classpath 'org.springframework.boot:spring-boot-gradle-plugin:1.4.0.RELEASE' }
- }
- afterEvaluate {Project project ->
- if (project.pluginManager.hasPlugin('java')) {
- configurations.all {
- resolutionStrategy.eachDependency {DependencyResolveDetails details ->
- forceVersion details, 'org.springframework.boot', '1.4.1.RELEASE'
- forceVersion details, 'org.slf4j', '1.7.21'
- forceVersion details, 'org.springframework', '4.3.3.RELEASE'
- }
- exclude module:'slf4j-log4j12'
- exclude module:'log4j'
- }
- dependencies {testCompile 'junit:junit:4.12' }
- }
- }
- repositories {
- maven {
- name 'Maven Repository'
- url "${repositoryMavenUrl}"
- credentials {
- username "${repositoryUsername}"
- password "${repositoryPassword}"
- }
- }
- ivy {
- name 'Ivy Repository'
- url "${repositoryIvyUrl}"
- credentials {
- username "${repositoryUsername}"
- password "${repositoryPassword}"
- }
- layout "pattern", {
- artifact '[organisation]/[module]/[revision]/[type]s/[artifact]-[revision].[ext]'
- ivy '[organisation]/[module]/[revision]/[type]s/[artifact].[ext]'
- m2compatible = true
- }
- }
- }
- // 时间戳:年月日时分
- p.ext.Timestamp = new Date().format('yyyyMMddHHmm')
- // Build Number
- p.ext.BuildNumber = System.env.BUILD_NUMBER
- if (p.ext.BuildNumber == null || "" == p.ext.BuildNumber) {
- p.ext.BuildNumber = 'x'
- }
- }
- task zipSources(type: Zip) {
- description '压缩源代码'
- project.ext.zipSourcesFile = project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-sources.zip'
- archiveName = project.ext.zipSourcesFile
- includeEmptyDirs = false
- from project.projectDir
- exclude '**/.*'
- exclude 'build/*'
- allprojects.each { p ->
- exclude '**/' + p.name + '/bin/*'
- exclude '**/' + p.name + '/build/*'
- exclude '**/' + p.name + '/data/*'
- exclude '**/' + p.name + '/work/*'
- exclude '**/' + p.name + '/logs/*'
- }
- }
- def CopySpec appCopySpec(Project prj, dstname = null) {
- if (!dstname) { dstname = prj.name }
- return copySpec{
- // Fat jar
- from (prj.buildDir.toString() + '/libs/' + prj.name + '-' + project.version + '.jar') {
- into dstname
- }
- // Configs
- from (prj.projectDir.toString() + '/config/examples') {
- into dstname + '/config'
- }
- // Windows start script
- from (prj.projectDir.toString() + '/' + prj.name + '.bat') {
- into dstname
- }
- // Unix conf script
- from (prj.projectDir.toString() + '/' + prj.name + '.conf') {
- into dstname
- rename prj.name, prj.name + '-' + project.version
- }
- }
- }
- task zipSetup(type: Zip, dependsOn: subprojects.build) {
- description '制作安装包'
- project.ext.zipSetupFile = project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-setup.zip'
- archiveName = project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-setup.zip'
- with appCopySpec(project(':spider-demo'))
- }
- import java.security.MessageDigest
/**
 * Computes the MD5 checksum of a file.
 *
 * @param file the file to hash; it is read through withInputStream, which closes the stream
 * @return the digest as a 32-character lower-case hexadecimal string
 */
def generateMD5(final file) {
    MessageDigest digest = MessageDigest.getInstance("MD5")
    file.withInputStream { is ->
        byte[] buffer = new byte[8192]
        int read
        // read() signals end of stream with -1
        while ((read = is.read(buffer)) != -1) {
            digest.update(buffer, 0, read)
        }
    }
    // BigInteger.toString(16) drops leading zeros, so pad the result back to 32 digits
    return new BigInteger(1, digest.digest()).toString(16).padLeft(32, '0')
}
// Prints the MD5 checksums of the setup and sources archives and records both in a
// "<name>-<version>-<timestamp>.<build>-md5.txt" file under build/distributions.
task md5(dependsOn: [zipSetup, zipSources]) {
    // doLast replaces the '<<' shortcut, which Gradle deprecated and later removed
    doLast {
        def distDir = "${projectDir}/build/distributions/"
        String md5_setup = generateMD5(file(distDir + project.ext.zipSetupFile))
        String md5_sources = generateMD5(file(distDir + project.ext.zipSourcesFile))
        def lines = [
            project.ext.zipSetupFile + '=' + md5_setup,
            project.ext.zipSourcesFile + '=' + md5_sources
        ]
        lines.each { line -> println line }
        def md5File = new File(distDir
            + project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-md5.txt')
        // withPrintWriter flushes and closes the writer even when a write fails
        md5File.withPrintWriter { writer ->
            lines.each { line -> writer.println line }
        }
    }
}
- build.dependsOn subprojects.build, zipSetup, zipSources, md5
build.gradle
子工程相关依赖:
- apply plugin: 'spring-boot'
- apply plugin: 'application'
- distributions {
- main {
- contents {
- from ("${projectDir}/config/examples") {
- into "config"
- }
- }
- }
- }
- distTar.enabled = false
- springBoot {
- executable = true
- mainClass = 'com.my.spider.Application'
- }
- dependencies {
- compile 'org.springframework.boot:spring-boot-starter-web:1.4.0.RELEASE'
- compile 'dom4j:dom4j:1.6.1'
- compile 'commons-httpclient:commons-httpclient:3.1'
- compileOnly 'com.h2database:h2:1.4.191'
- compile 'javax.cache:cache-api:1.0.0'
- compile 'org.jboss.resteasy:resteasy-jaxrs:3.0.14.Final'
- compile 'org.jboss.resteasy:resteasy-client:3.0.14.Final'
- // Axis
- compile 'axis:axis:1.4'
- compile 'org.jsoup:jsoup:1.10.1'
- compile 'com.alibaba:fastjson:1.2.21'
- }
build.gradle
2、代码编写:
入口:
- package com.my.spider;
- import java.io.IOException;
- import org.springframework.boot.SpringApplication;
- import org.springframework.boot.autoconfigure.SpringBootApplication;
- import org.springframework.scheduling.annotation.EnableAsync;
- import org.springframework.scheduling.annotation.EnableScheduling;
- import com.my.spider.utils.CommonProperties;
- @SpringBootApplication
- @EnableScheduling
- @EnableAsync
- public class Application {
- public static void main(String[] args) throws IOException {
- String loc = CommonProperties.loadProperties2System(System.getProperty("spring.config.location"));
- System.getProperties().setProperty("application.version", CommonProperties.getVersion(Application.class));
- System.getProperties().setProperty("app.home", loc + "/..");
- SpringApplication.run(Application.class, args);
- }
- }
- package com.my.spider.utils;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.IOException;
- import java.util.Properties;
- import org.springframework.util.StringUtils;
/**
 * Helpers for locating the application's configuration directory, publishing its
 * {@code *.properties} files as JVM system properties, and reading the build version.
 */
public final class CommonProperties {
    /** Name of the system property that holds the application home directory. */
    public static final String PPT_KEY_APP_HOME = "app.home";
    /** Home directory reported when {@link #PPT_KEY_APP_HOME} is not set. */
    public static final String DEFAULT_APP_HOME = "./";

    /**
     * Returns the application home directory.
     *
     * @return the value of the {@code app.home} system property, or {@link #DEFAULT_APP_HOME}
     *         when that property is not set
     */
    public static final String getAppHome() {
        // The lookup key is the property name; DEFAULT_APP_HOME is only the fallback value.
        return System.getProperty(PPT_KEY_APP_HOME, DEFAULT_APP_HOME);
    }

    /**
     * Loads every {@code *.properties} file found directly inside the configuration directory
     * into the JVM system properties.
     *
     * @param location configuration directory; when {@code null} or empty, {@code ./config} is
     *                 used if it is a directory, otherwise {@code ../config}
     * @return the configuration directory that was actually used
     * @throws IOException if the directory cannot be listed or a properties file cannot be read
     */
    public static String loadProperties2System(String location) throws IOException {
        String configLocation = location;
        if (configLocation == null || configLocation.isEmpty()) {
            configLocation = "./config";
            if (!new File(configLocation).isDirectory()) {
                configLocation = "../config";
            }
        }
        // listFiles() returns null when the path is not a directory or cannot be read.
        File[] entries = new File(configLocation).listFiles();
        if (entries == null) {
            throw new IOException("Config directory does not exist or cannot be read: " + configLocation);
        }
        for (File entry : entries) {
            if (!entry.isFile() || !entry.getName().endsWith(".properties")) {
                continue;
            }
            Properties ppt = new Properties();
            try (FileInputStream fi = new FileInputStream(entry)) {
                ppt.load(fi);
            }
            System.getProperties().putAll(ppt);
        }
        return configLocation;
    }

    /**
     * Reads the implementation version recorded for the package of the given class.
     *
     * @param clazz class whose package is inspected
     * @return the package's implementation version, or {@code "undefined"} when the class has no
     *         package information or no version is recorded
     */
    public static String getVersion(Class<?> clazz) {
        Package pkg = clazz.getPackage();
        String ver = (pkg != null ? pkg.getImplementationVersion() : null);
        return (ver == null ? "undefined" : ver);
    }
}
配置类:
- package com.my.spider.config;
- import org.springframework.context.annotation.ComponentScan;
- import org.springframework.context.annotation.Configuration;
- import org.springframework.scheduling.annotation.EnableScheduling;
- @EnableScheduling
- @Configuration
- @ComponentScan(basePackages = {
- "com.my.spider.rs",
- "com.my.spider.schedule"
- })
- public class AppAutoConfiguration {
- }
META-INF下spring.factories文件:
- org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
- com.my.spider.config.AppAutoConfiguration
3、功能代码:
定时任务抽象类,提供三种定时任务的调用方法:
- package com.my.spider.schedule;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- import org.springframework.beans.factory.DisposableBean;
- import org.springframework.beans.factory.InitializingBean;
- import org.springframework.scheduling.annotation.Scheduled;
- import org.springframework.stereotype.Component;
- import com.fasterxml.jackson.databind.ObjectMapper;
- @Component
- public abstract class ParentSchedule implements InitializingBean,DisposableBean{
- public static Logger logger = LoggerFactory.getLogger(ParentSchedule.class);
- public final static ObjectMapper objectMapper = new ObjectMapper();
- @Scheduled(
- initialDelayString = "${agent.task.initialDelay:1000}", //
- fixedDelayString = "${agent.task.fixedDelay:10000}")
- public void dowork(){
- execute();
- }
- //定时任务一
- public abstract void execute();
- @Scheduled(cron = "${agent.task.cron:0 0 10,14,16 * * ?}")
- public void timeTask(){
- executeTimeTask();
- }
- //定时任务三
- public abstract void executeTimeTask();
- //每天12点出发
- @Scheduled(cron = "0 0 12 * * ?")
- public void otherTask(){
- executeOtherTask();
- }
- //定时任务三
- public abstract void executeOtherTask();
- }
- package com.my.spider.utils;
- import java.util.HashMap;
- import java.util.Map;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- /**
- * 页面抓取请求的公共类
- * */
- public class HttpHtmlUtils {
- public static Logger logger = LoggerFactory.getLogger(HttpHtmlUtils.class);
- public static Map<String, String> header = new HashMap<String, String>();
- public static Map<String, String> header_a = new HashMap<String, String>();
- static {
- //设置请求头
- header.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0");
- header.put("Accept","text/javascript, text/html, application/xml, text/xml, */*");
- header.put("Accept-Language","zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
- header.put("Accept-Encoding","gzip, deflate");
- header.put("X-Requested-With","XMLHttpRequest");
- header.put("Content-Type","text/*, application/xml");
- header.put("Connection","keep-alive");
- header_a.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0");
- header_a.put("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
- header_a.put("Accept-Language","zh-CN,zh;q=0.8");
- header_a.put("Accept-Encoding","gzip, deflate, sdch");
- header_a.put("Content-Type","application/octet-stream");
- header_a.put("Connection","keep-alive");
- header_a.put("Upgrade-Insecure-Requests", "1");
- }
- }
新浪滚动新闻抓取实现下载和分析:
- package com.my.spider.schedule;
- import java.io.IOException;
- import java.text.SimpleDateFormat;
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.HashMap;
- import java.util.HashSet;
- import java.util.List;
- import java.util.Map;
- import java.util.Set;
- import org.jsoup.Connection;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- import org.springframework.beans.factory.annotation.Value;
- import org.springframework.stereotype.Component;
- import org.springframework.util.StringUtils;
- import com.my.spider.utils.FileUtils;
- import com.my.spider.utils.HttpHtmlUtils;
- @Component
- public class SinaSchedule extends ParentSchedule {
- private static Logger logger = LoggerFactory.getLogger(SinaSchedule.class);
- public static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
- public static SimpleDateFormat sdfYMD = new SimpleDateFormat("yyyy-MM-dd");
- private static int downloadtimeout = 5000;
- public static Set<String> titleSet = new HashSet<String>();
- @Value("${img.download.dir.prefix:D://testhtml}")
- public String dirpath;
- @Override
- public void afterPropertiesSet() throws Exception {
- // TODO Auto-generated method stub
- }
- // 抓取文章列表
- public static List<String> getArticleList(String url) {
- List<String> urlList = new ArrayList<String>();
- logger.debug("获取文章信息url:{},开始时间={}", url, sdf.format(new Date()));
- try {
- Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
- Document document;
- document = connect.timeout(downloadtimeout).get();
- Elements newsList = document.getElementsByClass("d_list_txt");
- if (newsList != null && newsList.size() > 0) {
- newsList = newsList.get(0).getElementsByTag("ul").get(0).getElementsByTag("li");
- for (Element el : newsList) {
- String elUrl = el.getElementsByTag("a").get(0).absUrl("href");
- String urlName = el.getElementsByTag("a").get(0).text();
- String time = el.getElementsByClass("c_time").get(0).text();
- logger.debug("获取新闻:{},访问地址:{},时间:{}",urlName,elUrl,time);
- //elUrl = el.getElementsByTag("a").get(0).attr("href");
- urlList.add(elUrl);
- }
- }
- logger.debug("获取文章列表信息:结束时间={}", sdf.format(new Date()));
- return urlList;
- } catch (IOException e) {
- logger.error("访问文章列表失败:" + url + " 原因" + e.getMessage());
- }
- return null;
- }
- // 抓取文章列表
- public static Map<String, Object> getArticleInfo(String url) {
- logger.debug("获取文章信息url:{},开始时间={}", url, sdf.format(new Date()));
- try {
- Map<String, Object> map = new HashMap<String, Object>();
- Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header);
- Document document;
- document = connect.timeout(downloadtimeout).get();
- Element titleEl = document.getElementById("artibodyTitle");
- String tilte = "";
- if (titleEl != null) {
- tilte = titleEl.text();
- }
- Elements keywords = document.getElementsByClass("article-keywords");
- String tag = "";
- StringBuffer sb = new StringBuffer();
- if (keywords != null ) {
- for (Element t : keywords.get(0).getElementsByTag("a")) {
- sb.append(t.text()).append(",");
- }
- if (!StringUtils.isEmpty(sb.toString())) {
- tag = sb.deleteCharAt(sb.lastIndexOf(",")).toString();
- }
- }
- Element contentEle = document.getElementById("artibody");
- String content = "";
- String contentText = "";
- if (contentEle != null) {
- content = contentEle.html();
- contentText = contentEle.text();
- }
- String description = "";
- Elements descEle = document.getElementsByAttributeValue("name","description");
- if (descEle != null && descEle.size() > 0) {
- description = descEle.get(0).attr("content");
- }
- List<String> imgUrls = new ArrayList<>();
- Elements imgs = contentEle.getElementsByTag("img");
- if (imgs != null && imgs.size() > 0) {
- for (Element img : imgs) {
- String imgUrl = img.attr("src");
- if (!StringUtils.isEmpty(imgUrl)) {
- imgUrls.add(imgUrl);
- }
- }
- }
- map.put("imgs", imgUrls);
- map.put("description", description);
- map.put("content", content);
- map.put("contentText", contentText);
- map.put("tag", tag);
- map.put("title", tilte);
- logger.debug("获取文章信息:结束时间={}", sdf.format(new Date()));
- return map;
- } catch (IOException e) {
- logger.error("访问文章页失败:" + url + " 原因" + e.getMessage());
- }
- return null;
- }
- @Override
- public void destroy() throws Exception {
- // TODO Auto-generated method stub
- }
- public static void main(String[] args) {
- List<String> url = new ArrayList<>();
- url.addAll(getArticleList("http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&ch=0"
- + "1&k=&offset_page=0&offset_num=0&num=60&asc=&page=1"));
- titleSet.addAll(url);
- logger.debug("此次共获取到{}个",titleSet.size());
- for (String urlStr : titleSet) {
- try {
- /*
- String htmlFile = FileUtils.downloadYunFile(urlStr, "D://testhtml//sina//"+sdfYMD.format(new Date()));
- Document document = Jsoup.parse(new File(htmlFile), "utf8");
- document.getElementsByTag("tilte");
- */
- //下载保存
- FileUtils.downloadYunFile(urlStr, "D://testhtml//sina//"+sdfYMD.format(new Date()));
- getArticleInfo(urlStr);
- } catch (Throwable e) {
- }
- }
- }
- @Override
- public void execute() {
- }
- @Override
- public void executeTimeTask() {
- // TODO Auto-generated method stub
- }
- @Override
- public void executeOtherTask() {
- // TODO Auto-generated method stub
- }
- }
下载html文件代码:
- package com.my.spider.utils;
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.io.OutputStream;
- import java.net.URI;
- import java.util.Arrays;
- import org.apache.http.client.config.RequestConfig;
- import org.apache.http.client.methods.CloseableHttpResponse;
- import org.apache.http.client.methods.HttpGet;
- import org.apache.http.impl.client.CloseableHttpClient;
- import org.apache.http.impl.client.HttpClients;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- import org.springframework.http.HttpEntity;
- import org.springframework.http.HttpHeaders;
- import org.springframework.http.HttpMethod;
- import org.springframework.http.MediaType;
- import org.springframework.http.ResponseEntity;
- import org.springframework.http.client.ClientHttpRequestFactory;
- import org.springframework.http.client.HttpComponentsClientHttpRequestFactory;
- import org.springframework.util.StreamUtils;
- import org.springframework.web.client.RestTemplate;
- import org.springframework.web.util.UriComponentsBuilder;
- import com.fasterxml.jackson.databind.ObjectMapper;
- public class FileUtils {
- private static final Logger logger = LoggerFactory.getLogger(FileUtils.class);
- private static ObjectMapper _objectMapper = new ObjectMapper();
- private static int downloadTimeout = 5000;
- public static void main(String[] args) throws Throwable {
- String filePath = "/temp/temp/test.mpg";
- String dirPrex = "/temp&Z:\\\\";
- String[] paths = dirPrex.split("&");
- System.out.println(paths[1] + filePath.substring(paths[0].length() + 1).replace("/", "\\"));
- }
- // 文件复制
- public static void copy(String src, String dest) throws IOException {
- System.out.println("正在拷贝【" + src + "】到【" + dest + "】\n");
- File destFile = new File(dest);
- if (!destFile.exists()) {
- String dir = dest.substring(0, dest.lastIndexOf(File.separator));
- File dirF = new File(dir);
- if (!dirF.exists() || !dirF.isDirectory()) {
- dirF.mkdirs();
- }
- destFile.createNewFile();
- }
- FileInputStream in = new FileInputStream(src);
- FileOutputStream out = new FileOutputStream(dest);
- byte[] buffer = new byte[40960];
- while (in.read(buffer) != -1) {
- out.write(buffer);
- out.flush();
- }
- in.close();
- out.close();
- }
- // 下载云文件
- public static String downloadYunFile(String url, String dir) throws Throwable {
- String fileName = getFileName(url);
- String filePath = dir + File.separator + fileName;
- try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
- HttpGet httpget = new HttpGet(url);
- httpget.setConfig(RequestConfig.custom() //
- .setConnectionRequestTimeout(downloadTimeout) //
- .setConnectTimeout(downloadTimeout) //
- .setSocketTimeout(downloadTimeout) //
- .build());
- try (CloseableHttpResponse response = httpclient.execute(httpget)) {
- org.apache.http.HttpEntity entity = response.getEntity();
- File desc = new File(filePath);
- File folder = desc.getParentFile();
- folder.mkdirs();
- try (InputStream is = entity.getContent(); //
- OutputStream os = new FileOutputStream(desc)) {
- StreamUtils.copy(is, os);
- }
- } catch (Throwable e) {
- throw new Throwable("文件下载失败......", e);
- }
- }
- return filePath;
- }
- public static String getFileName(String fileFullPath) {
- fileFullPath = fileFullPath.replace("/", "\\");
- return fileFullPath.substring(fileFullPath.lastIndexOf("\\") + 1, fileFullPath.length());
- }
- // 请求例子
- public void getToken(String url, String data) throws Throwable {
- RestTemplate restTemplate = new RestTemplate();
- ClientHttpRequestFactory clientFactory = new HttpComponentsClientHttpRequestFactory();
- restTemplate.setRequestFactory(clientFactory);
- HttpHeaders requestHeaders = new HttpHeaders();
- requestHeaders.setAccept(Arrays.asList(MediaType.APPLICATION_JSON_UTF8));
- requestHeaders.setContentType(MediaType.APPLICATION_JSON_UTF8);
- logger.debug("获取token的URL:" + url);
- URI uri = UriComponentsBuilder.fromUriString(url).build().encode().toUri();
- logger.debug("请求数据:{}", _objectMapper.writeValueAsString(data));
- HttpEntity<String> requestEntity = new HttpEntity<String>(data, requestHeaders);
- ResponseEntity<String> response = restTemplate.exchange(uri, HttpMethod.POST, requestEntity, String.class);
- String resp = response.getBody();
- logger.debug("请求返回值数据:{}", _objectMapper.writeValueAsString(resp));
- }
- }
4、总结:
Jsoup对于这种页面抓取很好用!当然,也可能是因为这里只实现了一个最简单的页面抓取过程!
追加一个下载音频的代码:
- package com.my.spider.service;
- import java.net.HttpURLConnection;
- import java.util.ArrayList;
- import java.util.Arrays;
- import java.util.List;
- import java.util.Map;
- import org.jsoup.Connection;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
- import org.slf4j.Logger;
- import org.slf4j.LoggerFactory;
- import org.springframework.stereotype.Service;
- import com.alibaba.fastjson.JSONObject;
- import com.my.spider.model.AudioInfo;
- import com.my.spider.utils.FileUtils;
- import com.my.spider.utils.HttpHtmlUtils;
- import com.my.spider.utils.HttpURLConnectionFactory;
- @Service
- public class XmlyAudioService {
- public static final Logger logger = LoggerFactory.getLogger(XmlyAudioService.class);
- static String url = "http://www.ximalaya.com/dq/comic/";
- static String requetUrl = "http://www.ximalaya.com/tracks/";
- public static void main(String[] args) {
- List<String> audioUrlList = new ArrayList<String>();
- int count = getCount(url);
- if(count > 1) {
- audioUrlList.addAll(getAudioList(1,url));
- for (int i = 2; i <= count; i++) {
- url = url +i+"/";
- audioUrlList.addAll(getAudioList(i,url));
- url = url.replace(i+"/", "");
- }
- }
- List<String> audioList = new ArrayList<String>();
- //解析
- if(audioUrlList.size() > 0) {
- for (String url : audioUrlList) {
- audioList.addAll(listAudio(url));
- }
- }
- System.out.println(audioUrlList.size() + "==" + audioList.size());
- List<AudioInfo> audioInfos = new ArrayList<>();
- //下载
- for (String sound_id : audioList) {
- requetUrl = requetUrl + sound_id+".json";
- System.out.println(requetUrl);
- audioInfos.add(downloadList(requetUrl));
- requetUrl = requetUrl.replace(sound_id+".json", "");
- }
- }
- //获取音频页详情
- public static List<String> getAudioList(int num,String url){
- List<String> list = new ArrayList<>();
- try {
- Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
- Document document = connect.timeout(5000).get();
- FileUtils.str2File(document.toString(), "G:\\xmly\\html\\comic" + num + ".html");
- Element el = document.getElementById("explore_album_detail_entry");
- Elements els = el.getElementsByClass("albumface");
- for (Element element : els) {
- list.add(element.absUrl("href"));
- }
- } catch (Throwable e) {
- logger.error("获取{}网页信息失败,{}",url,e.getMessage(),e);
- }
- return list;
- }
- public static List<String> listAudio(String url){
- List<String> list = new ArrayList<>();
- try {
- Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
- Document document = connect.timeout(5000).get();
- FileUtils.str2File(document.toString(), "G:\\xmly\\html\\comic_"+System.currentTimeMillis()+".html");
- Elements els = document.getElementsByClass("personal_body");
- if(els!=null && els.size() > 0) {
- String sound_ids = els.get(0).attr("sound_ids");
- list.addAll(Arrays.asList(sound_ids.split(",")));
- }
- } catch (Throwable e) {
- logger.error("获取{}网页信息失败,{}",url,e.getMessage(),e);
- }
- return list;
- }
- //
- @SuppressWarnings("unchecked")
- public static AudioInfo downloadList(String url){
- AudioInfo audioInfo = new AudioInfo();
- try {
- HttpURLConnection conn = HttpURLConnectionFactory.getConn(url);
- conn.setRequestProperty("Content-Type", "*/*; charset=utf-8");
- String audioJson = HttpURLConnectionFactory.sendGet(conn);
- Map<String,Object> map = (Map<String, Object>) JSONObject.parse(audioJson);
- audioInfo.setId(map.get("id").toString());
- audioInfo.setName(map.get("title").toString());
- audioInfo.setUrl(map.get("play_path").toString());
- try {
- FileUtils.downloadRenameFile(audioInfo.getUrl(), "G:\\xmly", audioInfo.getName()+".mp3");
- } catch (Throwable e) {
- logger.error("{}下载失败,id={}",audioInfo.getName(),audioInfo.getId());;
- }
- } catch (Throwable e) {
- logger.error(e.getMessage(),e);
- }
- return audioInfo;
- }
- //获取总页数页数
- public static int getCount(String url) {
- try {
- Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
- Document document = connect.timeout(5000).get();
- Elements els = document.getElementsByClass("pagingBar_page");
- if(els.size() < 2) {
- return 1;
- }
- Element pageCout = els.get(els.size()-2);
- return Integer.valueOf(pageCout.text());
- } catch (Throwable e) {
- e.printStackTrace();
- }
- return 0;
- }
- }
xmly.java
新浪新闻页面抓取(JAVA-Jsoup)的更多相关文章
- Python爬虫:新浪新闻详情页的数据抓取(函数版)
上一篇文章<Python爬虫:抓取新浪新闻数据>详细解说了如何抓取新浪新闻详情页的相关数据,但代码的构建不利于后续扩展,每次抓取新的详情页时都需要重新写一遍,因此,我们需要将其整理成函数, ...
- Python_网络爬虫(新浪新闻抓取)
爬取前的准备: BeautifulSoup的导入:pip install BeautifulSoup4 requests的导入:pip install requests 下载jupyter noteb ...
- selenium+BeautifulSoup+phantomjs爬取新浪新闻
一 下载phantomjs,把phantomjs.exe的文件路径加到环境变量中,也可以phantomjs.exe拷贝到一个已存在的环境变量路径中,比如我用的anaconda,我把phantomjs. ...
- python3爬虫-爬取新浪新闻首页所有新闻标题
准备工作:安装requests和BeautifulSoup4.打开cmd,输入如下命令 pip install requests pip install BeautifulSoup4 打开我们要爬取的 ...
- 门户级UGC系统的技术进化路线——新浪新闻评论系统的架构演进和经验总结(转)
add by zhj:先收藏了 摘要:评论系统是所有门户网站的核心标准服务组件之一.本文作者曾负责新浪网评论系统多年,这套系统不仅服务于门户新闻业务,还包括调查.投票等产品,经历了从单机到多机再到集群 ...
- 小爬新浪新闻AFCCL
1.任务目标: 爬取新浪新闻AFCCL的文章:文章标题.时间.来源.内容.评论数等信息. 2.目标网页: http://sports.sina.com.cn/z/AFCCL/ 3.网页分析 4.源代码 ...
- 今天写了一个简单的新浪新闻RSS操作类库
今天,有位群友问我如何获新浪新闻列表相关问题,我想,用正则表达式网页中取显然既复杂又不一定准确,现在许多大型网站都有RSS集合,所以我就跟他说用RSS应该好办一些. 一年前我写过一个RSS阅读器,不过 ...
- C# 页面抓取获取快递信息
通过页面抓取信息可以获得很多我们想要的信息,比如现在常会用到的快递查询,主要抓取的网站为http://www.kuaidi100.com/ 通过IE的网络分析我们可以得到下面信息 通过对这个网站的分析 ...
- Lance老师UI系列教程第八课->新浪新闻SlidingMenu界面的实现
UI系列教程第八课:Lance老师UI系列教程第八课->新浪新闻SlidingMenu界面的实现 今天蓝老师要讲的是关于新浪新闻侧滑界面的实现.先看看原图: 如图所示,这种侧滑效果以另一种方式替 ...
随机推荐
- 【CSS3】动画animation-关键帧keyframes
<!DOCTYPE html> <html> <head> <meta charset="utf-8"> <title> ...
- 【java】打印流的基本实现及java.io.PrintStream、java.io.PrintWriter示例
package 打印流; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; impor ...
- Oracle数据库(三)表操作,连接查询,分页
复制表 --复制表 create table new_table as select * from Product --复制表结构不要数据 在where后面跟一个不成立的条件,就会仅复制表的结构而不复 ...
- 基于阿里云的MQTT远程控制
好久没有写博客了,眼看自己的项目就要快做完了,先分享一下基于MQTT的远程控制,自己买了一个阿里的云端,然后在云端上安装了一个MQTT服务器,其实是一不小心买了两个,所以准备贡献出来一个供大家使用, ...
- 7.18 DP考试解题报告
今天的考试真的是天崩地裂,写了的三个题全炸...然而谁叫我弱+不注意细节呢???真的要扇耳光... T1:题意:一段区间的高度为这个区间中高度的最小值,给定n个宽度,求每个宽度的期望高度 40% :算 ...
- Spark源码剖析(六):Worker原理与源码剖析
上篇文章我们剖析了Master的原理和源码,知道了当Master使用资源分配算法将资源分配完成后,就会给对应的Worker发送启动Driver或者Executor的消息,那么Worker收到这些消息后 ...
- Linux第八讲随笔 -tar / 系统启动流程
linux 第八讲1.tar 参考 作用:压缩和解压文件.tar本身不具有压缩功能.他是调用压缩功能实现的. 语法:tar[必要参数][选择参数][文件] 参数:必要参数有如下: -A 新增压缩文件到 ...
- [编织消息框架][JAVA核心技术]annotation基础
应用动态代理技术要先掌握annotation技术 注解是JDK1.5之后才有的新特性,JDK1.5之后内部提供的三个注解 @Deprecated 意思是“废弃的,过时的” @Override 意思是“ ...
- SQL SERVER 常用知识整理
以前写了一些关于sql的文章,包括一些转载的,这里做下整理,方便需要时候使用 一.基础运用 SQL 数据结构操作语句 SQL 时间处理 SQL 常见函数使用 CASE WHEN THEN 小结 二.优 ...
- NFS : device is busy
unmount [ options ] -f : Force unmount (in case of an unreachable NFS system). -l : Lazy unmount. D ...