1、使用gradle建立工程:

工程格式如下:

// Name of the root project of this multi-project build.
rootProject.name = 'my-spider-demo'

// Sub-projects that take part in the build.
include ':spider-demo'

settings.gradle

import java.security.MessageDigest

// ---------------------------------------------------------------------------
// Root build script: shared configuration for every project, plus the
// packaging tasks (source zip, setup zip, MD5 checksum file).
// ---------------------------------------------------------------------------

/**
 * Forces every requested dependency of the given group to one version.
 *
 * @param details the dependency resolve details handed in by the resolution strategy
 * @param group   group id to match
 * @param version version to force
 */
def void forceVersion(details, group, version) {
    if (details.requested.group == group) {
        details.useVersion version
    }
}

/**
 * Forces one specific module (group + name) to the given version.
 */
def void forceVersion(details, group, name, version) {
    if (details.requested.group == group && details.requested.name == name) {
        details.useVersion version
    }
}

allprojects { p ->
    group = 'com.my.spider'
    version = '1.0.0'

    apply plugin: 'java'
    apply plugin: 'maven'
    apply plugin: 'maven-publish'

    [compileJava, compileTestJava]*.options*.encoding = 'UTF-8'

    // Fill in the jar manifest right before the jar is assembled.
    jar.doFirst {
        manifest {
            def manifestFile = "${projectDir}/META-INF/MANIFEST.MF"
            if (new File(manifestFile).exists()) {
                from (manifestFile)
            }

            attributes 'Implementation-Title':p.name
            if (p.version.endsWith('-SNAPSHOT')) {
                attributes 'Implementation-Version': p.version + '-' + p.ext.Timestamp
            } else {
                attributes 'Implementation-Version': p.version
            }
            attributes 'Implementation-BuildDateTime':new Date()
        }
    }

    javadoc {
        options {
            encoding 'UTF-8'
            charSet 'UTF-8'
            author false
            version true
            links 'http://docs.oracle.com/javase/8/docs/api/index.html'
            memberLevel = org.gradle.external.javadoc.JavadocMemberLevel.PRIVATE
        }
    }

    // Only *-api projects get sources and javadoc jars.
    if (p.name.endsWith('-api')){
        task sourcesJar(type:Jar, dependsOn:classes) {
            classifier = 'sources'
            from sourceSets.main.allSource
        }

        task javadocJar(type:Jar, dependsOn:javadoc) {
            classifier = 'javadoc'
            from javadoc.destinationDir
        }
    }

    publishing {
        repositories {
            maven {
                credentials {
                    username "${repositoryUploadUsername}"
                    password "${repositoryUploadPassword}"
                }

                // Snapshots and releases are uploaded to different URLs.
                if (version.endsWith('-SNAPSHOT')) {
                    url "${repositoryUploadSnapshotUrl}"
                } else {
                    url "${repositoryUploadReleaseUrl}"
                }
            }
        }
        publications {
            mavenJava(MavenPublication) {
                from components.java

                // Only *-api projects need to publish sources and javadoc.
                if (p.name.endsWith('-api')){
                    artifact sourcesJar {
                        classifier "sources"
                    }
                    artifact javadocJar {
                        classifier "javadoc"
                    }
                }
            }
        }
    }

    // Publish as part of the build when the uploadArchives environment variable is set.
    if (System.env.uploadArchives) {
        build.dependsOn publish
    }

    buildscript {
        repositories {
            maven {
                name 'Maven Repository'
                url "${repositoryMavenUrl}"
                credentials {
                    username "${repositoryUsername}"
                    password "${repositoryPassword}"
                }
            }
        }
        dependencies {
            classpath 'org.springframework.boot:spring-boot-gradle-plugin:1.4.0.RELEASE'
        }
    }

    afterEvaluate {Project project ->
        if (project.pluginManager.hasPlugin('java')) {
            configurations.all {
                // Pin the versions of the core framework groups.
                resolutionStrategy.eachDependency {DependencyResolveDetails details ->
                    forceVersion details, 'org.springframework.boot', '1.4.1.RELEASE'
                    forceVersion details, 'org.slf4j', '1.7.21'
                    forceVersion details, 'org.springframework', '4.3.3.RELEASE'
                }

                exclude module:'slf4j-log4j12'
                exclude module:'log4j'
            }

            dependencies {
                testCompile 'junit:junit:4.12'
            }
        }
    }

    repositories {
        maven {
            name 'Maven Repository'
            url "${repositoryMavenUrl}"
            credentials {
                username "${repositoryUsername}"
                password "${repositoryPassword}"
            }
        }

        ivy {
            name 'Ivy Repository'
            url "${repositoryIvyUrl}"
            credentials {
                username "${repositoryUsername}"
                password "${repositoryPassword}"
            }
            layout "pattern", {
                artifact '[organisation]/[module]/[revision]/[type]s/[artifact]-[revision].[ext]'
                ivy '[organisation]/[module]/[revision]/[type]s/[artifact].[ext]'
                m2compatible = true
            }
        }
    }

    // Timestamp: year, month, day, hour, minute.
    p.ext.Timestamp = new Date().format('yyyyMMddHHmm')
    // Build number taken from the BUILD_NUMBER environment variable; 'x' when it is not set.
    p.ext.BuildNumber = System.env.BUILD_NUMBER
    if (p.ext.BuildNumber == null || "" == p.ext.BuildNumber) {
        p.ext.BuildNumber = 'x'
    }
}

// Zips the source tree, leaving out hidden files and build/runtime directories.
task zipSources(type: Zip) {
    description '压缩源代码'
    project.ext.zipSourcesFile = project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-sources.zip'
    archiveName = project.ext.zipSourcesFile
    includeEmptyDirs = false

    from project.projectDir

    exclude '**/.*'
    exclude 'build/*'
    allprojects.each { p ->
        exclude '**/' + p.name + '/bin/*'
        exclude '**/' + p.name + '/build/*'
        exclude '**/' + p.name + '/data/*'
        exclude '**/' + p.name + '/work/*'
        exclude '**/' + p.name + '/logs/*'
    }
}

/**
 * Builds the copy spec that lays out one application inside the setup zip.
 *
 * @param prj     project whose artifacts are packaged
 * @param dstname target directory inside the archive; defaults to the project name
 */
def CopySpec appCopySpec(Project prj, dstname = null) {
    if (!dstname) { dstname = prj.name }
    return copySpec{
        // Fat jar
        from (prj.buildDir.toString() + '/libs/' + prj.name + '-' + project.version + '.jar') {
            into dstname
        }

        // Configs
        from (prj.projectDir.toString() + '/config/examples') {
            into dstname + '/config'
        }

        // Windows start script
        from (prj.projectDir.toString() + '/' + prj.name + '.bat') {
            into dstname
        }

        // Unix conf script
        from (prj.projectDir.toString() + '/' + prj.name + '.conf') {
            into dstname
            rename prj.name, prj.name + '-' + project.version
        }
    }
}

// Builds the installation package once all sub-projects are built.
task zipSetup(type: Zip, dependsOn: subprojects.build) {
    description '制作安装包'
    project.ext.zipSetupFile = project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-setup.zip'
    archiveName = project.ext.zipSetupFile

    with appCopySpec(project(':spider-demo'))
}

/**
 * Computes the MD5 digest of a file.
 *
 * @param file the file to hash
 * @return the digest as a 32-character lower-case hex string
 */
def generateMD5(final file) {
    MessageDigest digest = MessageDigest.getInstance("MD5")
    file.withInputStream(){is->
        byte[] buffer = new byte[8192]
        int read = 0
        while( (read = is.read(buffer)) != -1) {
            digest.update(buffer, 0, read)
        }
    }
    // encodeHex keeps leading zeros; BigInteger.toString(16) would drop them
    // and produce a checksum shorter than 32 characters.
    return digest.digest().encodeHex().toString()
}

// Prints the MD5 checksums of both archives and writes them to a *-md5.txt file.
task md5(dependsOn: [zipSetup, zipSources]) {
    doLast {
        String md5_setup = generateMD5(file("${projectDir}/build/distributions/" + project.ext.zipSetupFile))
        String md5_sources = generateMD5(file("${projectDir}/build/distributions/" + project.ext.zipSourcesFile))
        println project.ext.zipSetupFile + '=' + md5_setup
        println project.ext.zipSourcesFile + '=' + md5_sources

        def newFile = new File("${projectDir}/build/distributions/"
            + project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-md5.txt')
        // withPrintWriter flushes and closes the writer even if printing fails.
        newFile.withPrintWriter { printWriter ->
            printWriter.println project.ext.zipSetupFile + '=' + md5_setup
            printWriter.println project.ext.zipSourcesFile + '=' + md5_sources
        }
    }
}

build.dependsOn subprojects.build, zipSetup, zipSources, md5

build.gradle

子工程相关依赖:

// Build script of the spider-demo sub-project.
apply plugin: 'spring-boot'
apply plugin: 'application'

// Ship the example configuration inside the distribution under config/.
distributions {
    main {
        contents {
            from ("${projectDir}/config/examples") {
                into "config"
            }
        }
    }
}

// The tar distribution is switched off.
distTar.enabled = false

springBoot {
    executable = true
    mainClass = 'com.my.spider.Application'
}

dependencies {
    compile 'org.springframework.boot:spring-boot-starter-web:1.4.0.RELEASE'
    compile 'dom4j:dom4j:1.6.1'
    compile 'commons-httpclient:commons-httpclient:3.1'
    compileOnly 'com.h2database:h2:1.4.191'
    compile 'javax.cache:cache-api:1.0.0'
    compile 'org.jboss.resteasy:resteasy-jaxrs:3.0.14.Final'
    compile 'org.jboss.resteasy:resteasy-client:3.0.14.Final'
    // Axis
    compile 'axis:axis:1.4'

    compile 'org.jsoup:jsoup:1.10.1'

    compile 'com.alibaba:fastjson:1.2.21'
}

build.gradle

2、代码编写:

入口:

package com.my.spider;

import java.io.IOException;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableAsync;
import org.springframework.scheduling.annotation.EnableScheduling;

import com.my.spider.utils.CommonProperties;

/**
 * Application entry point.
 *
 * Before Spring Boot starts, the external configuration is copied into the
 * JVM system properties and two extra properties are published:
 * "application.version" and "app.home".
 */
@SpringBootApplication
@EnableScheduling
@EnableAsync
public class Application {

    /**
     * Loads the configuration, publishes the derived system properties and
     * starts the Spring application.
     *
     * @param args command-line arguments, handed to Spring Boot unchanged
     * @throws IOException if a configuration file cannot be read
     */
    public static void main(String[] args) throws IOException {
        final String requestedLocation = System.getProperty("spring.config.location");
        final String configDir = CommonProperties.loadProperties2System(requestedLocation);

        final String version = CommonProperties.getVersion(Application.class);
        System.setProperty("application.version", version);

        // The application home is the parent of the configuration directory.
        System.setProperty("app.home", configDir + "/..");

        SpringApplication.run(Application.class, args);
    }
}
package com.my.spider.utils;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;

import org.springframework.util.StringUtils;

/**
 * Helpers for loading external configuration into the JVM system properties
 * and for reading application metadata.
 */
public final class CommonProperties {

    /** Name of the system property that holds the application home directory. */
    public static final String PPT_KEY_APP_HOME = "app.home";

    /** Application home used when {@link #PPT_KEY_APP_HOME} is not set. */
    public static final String DEFAULT_APP_HOME = "./";

    /**
     * Returns the application home directory.
     *
     * @return the value of the "app.home" system property, or
     *         {@link #DEFAULT_APP_HOME} when it is not set
     */
    public static final String getAppHome() {
        // Look the value up under the property KEY; the previous code passed the
        // default value as the key and therefore always returned the default.
        return System.getProperty(PPT_KEY_APP_HOME, DEFAULT_APP_HOME);
    }

    /**
     * Loads every "*.properties" file found directly inside the configuration
     * directory into the JVM system properties.
     *
     * When {@code location} is empty, "./config" is used; if that is not an
     * existing directory, "../config" is used instead.
     *
     * @param location configuration directory, may be null or empty
     * @return the configuration directory that was actually used
     * @throws IOException if the directory cannot be listed or a file cannot be read
     */
    public static String loadProperties2System(String location) throws IOException {
        String configLocation = location;
        File cnf;
        if (!StringUtils.hasLength(configLocation)) {
            configLocation = "./config";
            cnf = new File(configLocation);
            if (!cnf.exists() || !cnf.isDirectory()) {
                configLocation = "../config";
                cnf = new File(configLocation);
            }
        } else {
            cnf = new File(configLocation);
        }

        // listFiles() returns null when the path is not a directory or cannot be
        // read; report that clearly instead of failing with a NullPointerException.
        File[] candidates = cnf.listFiles();
        if (candidates == null) {
            throw new IOException("Configuration directory cannot be listed: " + cnf.getAbsolutePath());
        }

        for (File file : candidates) {
            if (file.isFile() && file.getName().endsWith(".properties")) {
                Properties ppt = new Properties();
                try (FileInputStream fi = new FileInputStream(file)) {
                    ppt.load(fi);
                    System.getProperties().putAll(ppt);
                }
            }
        }
        return configLocation;
    }

    /**
     * Reads the implementation version of the package that contains the class.
     *
     * @param clazz class whose package is inspected
     * @return the implementation version, or "undefined" when it is not available
     */
    public static String getVersion(Class<?> clazz) {
        Package pkg = clazz.getPackage();
        String ver = (pkg != null ? pkg.getImplementationVersion() : "undefined");
        return (ver == null ? "undefined" : ver);
    }
}

配置类:

package com.my.spider.config;

import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableScheduling;

/**
 * Configuration class that enables scheduling and component-scans the
 * "com.my.spider.rs" and "com.my.spider.schedule" packages.
 */
@Configuration
@EnableScheduling
@ComponentScan(basePackages = { "com.my.spider.rs", "com.my.spider.schedule" })
public class AppAutoConfiguration {
}

META-INF下spring.factories文件:

org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
com.my.spider.config.AppAutoConfiguration

3、功能代码:

定时任务抽象类,提供三种定时任务的调用方法:

package com.my.spider.schedule;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.DisposableBean;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import com.fasterxml.jackson.databind.ObjectMapper;

/**
 * Base class for scheduled jobs. It declares three scheduled triggers, each
 * of which delegates to an abstract hook implemented by the subclass.
 */
@Component
public abstract class ParentSchedule implements InitializingBean,DisposableBean{

    public static Logger logger = LoggerFactory.getLogger(ParentSchedule.class);

    public final static ObjectMapper objectMapper = new ObjectMapper();

    // ---- Task 1: fixed-delay trigger --------------------------------------

    /**
     * Trigger configured through "agent.task.initialDelay" (default 1000)
     * and "agent.task.fixedDelay" (default 10000).
     */
    @Scheduled(
            initialDelayString = "${agent.task.initialDelay:1000}",
            fixedDelayString = "${agent.task.fixedDelay:10000}")
    public void dowork(){
        execute();
    }

    /** Task 1: work executed by the fixed-delay trigger. */
    public abstract void execute();

    // ---- Task 2: configurable cron trigger --------------------------------

    /**
     * Trigger configured through "agent.task.cron"; the default expression
     * fires at 10:00, 14:00 and 16:00 every day.
     */
    @Scheduled(cron = "${agent.task.cron:0 0 10,14,16 * * ?}")
    public void timeTask(){
        executeTimeTask();
    }

    /** Task 2: work executed by the configurable cron trigger. */
    public abstract void executeTimeTask();

    // ---- Task 3: fixed cron trigger ---------------------------------------

    /** Fires at 12:00 every day. */
    @Scheduled(cron = "0 0 12 * * ?")
    public void otherTask(){
        executeOtherTask();
    }

    /** Task 3: work executed by the daily 12:00 trigger. */
    public abstract void executeOtherTask();
}
package com.my.spider.utils;

import java.util.HashMap;
import java.util.Map; import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Shared request headers used when fetching pages.
 */
public class HttpHtmlUtils {

    public static Logger logger = LoggerFactory.getLogger(HttpHtmlUtils.class);

    /** Header set carrying a Firefox user agent and an XMLHttpRequest marker. */
    public static Map<String, String> header = buildXhrHeaders();

    /** Header set carrying a Chrome-style user agent. */
    public static Map<String, String> header_a = buildBrowserHeaders();

    /** Builds the entries of {@link #header}. */
    private static Map<String, String> buildXhrHeaders() {
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0");
        headers.put("Accept", "text/javascript, text/html, application/xml, text/xml, */*");
        headers.put("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
        headers.put("Accept-Encoding", "gzip, deflate");
        headers.put("X-Requested-With", "XMLHttpRequest");
        headers.put("Content-Type", "text/*, application/xml");
        headers.put("Connection", "keep-alive");
        return headers;
    }

    /** Builds the entries of {@link #header_a}. */
    private static Map<String, String> buildBrowserHeaders() {
        Map<String, String> headers = new HashMap<String, String>();
        headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0");
        headers.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
        headers.put("Accept-Language", "zh-CN,zh;q=0.8");
        headers.put("Accept-Encoding", "gzip, deflate, sdch");
        headers.put("Content-Type", "application/octet-stream");
        headers.put("Connection", "keep-alive");
        headers.put("Upgrade-Insecure-Requests", "1");
        return headers;
    }
}

新浪滚动新闻抓取实现下载和分析:

package com.my.spider.schedule;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set; import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils; import com.my.spider.utils.FileUtils;
import com.my.spider.utils.HttpHtmlUtils; @Component
public class SinaSchedule extends ParentSchedule { private static Logger logger = LoggerFactory.getLogger(SinaSchedule.class); public static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm"); public static SimpleDateFormat sdfYMD = new SimpleDateFormat("yyyy-MM-dd"); private static int downloadtimeout = 5000; public static Set<String> titleSet = new HashSet<String>(); @Value("${img.download.dir.prefix:D://testhtml}")
public String dirpath; @Override
public void afterPropertiesSet() throws Exception {
// TODO Auto-generated method stub } // 抓取文章列表
public static List<String> getArticleList(String url) { List<String> urlList = new ArrayList<String>();
logger.debug("获取文章信息url:{},开始时间={}", url, sdf.format(new Date())); try {
Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
Document document;
document = connect.timeout(downloadtimeout).get();
Elements newsList = document.getElementsByClass("d_list_txt");
if (newsList != null && newsList.size() > 0) {
newsList = newsList.get(0).getElementsByTag("ul").get(0).getElementsByTag("li");
for (Element el : newsList) {
String elUrl = el.getElementsByTag("a").get(0).absUrl("href");
String urlName = el.getElementsByTag("a").get(0).text();
String time = el.getElementsByClass("c_time").get(0).text();
logger.debug("获取新闻:{},访问地址:{},时间:{}",urlName,elUrl,time);
//elUrl = el.getElementsByTag("a").get(0).attr("href");
urlList.add(elUrl);
}
}
logger.debug("获取文章列表信息:结束时间={}", sdf.format(new Date()));
return urlList;
} catch (IOException e) {
logger.error("访问文章列表失败:" + url + " 原因" + e.getMessage());
}
return null;
} // 抓取文章列表
public static Map<String, Object> getArticleInfo(String url) { logger.debug("获取文章信息url:{},开始时间={}", url, sdf.format(new Date()));
try {
Map<String, Object> map = new HashMap<String, Object>();
Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header);
Document document;
document = connect.timeout(downloadtimeout).get();
Element titleEl = document.getElementById("artibodyTitle");
String tilte = ""; if (titleEl != null) {
tilte = titleEl.text();
} Elements keywords = document.getElementsByClass("article-keywords");
String tag = "";
StringBuffer sb = new StringBuffer();
if (keywords != null ) {
for (Element t : keywords.get(0).getElementsByTag("a")) {
sb.append(t.text()).append(",");
}
if (!StringUtils.isEmpty(sb.toString())) {
tag = sb.deleteCharAt(sb.lastIndexOf(",")).toString();
}
} Element contentEle = document.getElementById("artibody");
String content = "";
String contentText = "";
if (contentEle != null) {
content = contentEle.html();
contentText = contentEle.text();
}
String description = "";
Elements descEle = document.getElementsByAttributeValue("name","description");
if (descEle != null && descEle.size() > 0) {
description = descEle.get(0).attr("content");
}
List<String> imgUrls = new ArrayList<>();
Elements imgs = contentEle.getElementsByTag("img");
if (imgs != null && imgs.size() > 0) {
for (Element img : imgs) {
String imgUrl = img.attr("src");
if (!StringUtils.isEmpty(imgUrl)) {
imgUrls.add(imgUrl);
}
}
}
map.put("imgs", imgUrls);
map.put("description", description);
map.put("content", content);
map.put("contentText", contentText);
map.put("tag", tag);
map.put("title", tilte);
logger.debug("获取文章信息:结束时间={}", sdf.format(new Date())); return map;
} catch (IOException e) {
logger.error("访问文章页失败:" + url + " 原因" + e.getMessage());
}
return null;
} @Override
public void destroy() throws Exception {
// TODO Auto-generated method stub } public static void main(String[] args) {
List<String> url = new ArrayList<>();
url.addAll(getArticleList("http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&ch=0"
+ "1&k=&offset_page=0&offset_num=0&num=60&asc=&page=1"));
titleSet.addAll(url);
logger.debug("此次共获取到{}个",titleSet.size()); for (String urlStr : titleSet) {
try {
/*
String htmlFile = FileUtils.downloadYunFile(urlStr, "D://testhtml//sina//"+sdfYMD.format(new Date()));
Document document = Jsoup.parse(new File(htmlFile), "utf8");
document.getElementsByTag("tilte");
*/
//下载保存
FileUtils.downloadYunFile(urlStr, "D://testhtml//sina//"+sdfYMD.format(new Date())); getArticleInfo(urlStr); } catch (Throwable e) { } }
} @Override
public void execute() { } @Override
public void executeTimeTask() {
// TODO Auto-generated method stub } @Override
public void executeOtherTask() {
// TODO Auto-generated method stub } }

下载html文件代码:

package com.my.spider.utils;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URI;
import java.util.Arrays; import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.HttpEntity;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpMethod;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.http.client.ClientHttpRequestFactory;
import org.springframework.http.client.HttpComponentsClientHttpRequestFactory;
import org.springframework.util.StreamUtils;
import org.springframework.web.client.RestTemplate;
import org.springframework.web.util.UriComponentsBuilder;

import com.fasterxml.jackson.databind.ObjectMapper;

/**
 * File helpers: local copy, HTTP download to disk, and a sample JSON POST.
 */
public class FileUtils {

    private static final Logger logger = LoggerFactory.getLogger(FileUtils.class);

    private static ObjectMapper _objectMapper = new ObjectMapper();

    /** Timeout applied to connection request, connect and socket when downloading. */
    private static int downloadTimeout = 5000;

    /** Small manual experiment with path-prefix replacement. */
    public static void main(String[] args) throws Throwable {
        String filePath = "/temp/temp/test.mpg";
        String dirPrex = "/temp&Z:\\\\";
        String[] paths = dirPrex.split("&");
        System.out.println(paths[1] + filePath.substring(paths[0].length() + 1).replace("/", "\\"));
    }

    /**
     * Copies a file, creating the target's parent directories when needed.
     *
     * @param src  path of the file to read
     * @param dest path of the file to write (overwritten if it exists)
     * @throws IOException if reading or writing fails
     */
    public static void copy(String src, String dest) throws IOException {
        System.out.println("正在拷贝【" + src + "】到【" + dest + "】\n");
        File destFile = new File(dest);

        // getParentFile() is null for a bare file name; nothing to create then.
        File parent = destFile.getParentFile();
        if (parent != null && !parent.isDirectory()) {
            parent.mkdirs();
        }

        // try-with-resources closes both streams even when the copy fails.
        try (FileInputStream in = new FileInputStream(src);
             FileOutputStream out = new FileOutputStream(destFile)) {
            byte[] buffer = new byte[40960];
            int read;
            while ((read = in.read(buffer)) != -1) {
                // Write only the bytes actually read; writing the whole buffer
                // would append stale data to the copy.
                out.write(buffer, 0, read);
            }
        }
    }

    /**
     * Downloads a URL into a directory; the file name is the last path segment
     * of the URL as returned by {@link #getFileName(String)}.
     *
     * @param url address to download
     * @param dir target directory (created when missing)
     * @return path of the written file
     * @throws Throwable wrapping the cause when the download or the write fails
     */
    public static String downloadYunFile(String url, String dir) throws Throwable {
        String fileName = getFileName(url);
        String filePath = dir + File.separator + fileName;

        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            HttpGet httpget = new HttpGet(url);
            httpget.setConfig(RequestConfig.custom()
                    .setConnectionRequestTimeout(downloadTimeout)
                    .setConnectTimeout(downloadTimeout)
                    .setSocketTimeout(downloadTimeout)
                    .build());
            try (CloseableHttpResponse response = httpclient.execute(httpget)) {
                org.apache.http.HttpEntity entity = response.getEntity();
                if (entity == null) {
                    throw new IOException("Response has no body: " + url);
                }
                File desc = new File(filePath);
                File folder = desc.getParentFile();
                if (folder != null) {
                    folder.mkdirs();
                }
                try (InputStream is = entity.getContent();
                     OutputStream os = new FileOutputStream(desc)) {
                    StreamUtils.copy(is, os);
                }
            } catch (Throwable e) {
                throw new Throwable("文件下载失败......", e);
            }
        }
        return filePath;
    }

    /**
     * Returns the part of a path or URL after its last separator; both "/" and
     * "\" count as separators.
     */
    public static String getFileName(String fileFullPath) {
        String normalized = fileFullPath.replace("/", "\\");
        return normalized.substring(normalized.lastIndexOf("\\") + 1);
    }

    /**
     * Request example: POSTs {@code data} as JSON to {@code url} and logs the
     * response body.
     *
     * @param url  target address
     * @param data request body
     * @throws Throwable if the request or the JSON serialisation fails
     */
    public void getToken(String url, String data) throws Throwable {
        RestTemplate restTemplate = new RestTemplate();
        ClientHttpRequestFactory clientFactory = new HttpComponentsClientHttpRequestFactory();
        restTemplate.setRequestFactory(clientFactory);

        HttpHeaders requestHeaders = new HttpHeaders();
        requestHeaders.setAccept(Arrays.asList(MediaType.APPLICATION_JSON_UTF8));
        requestHeaders.setContentType(MediaType.APPLICATION_JSON_UTF8);
        logger.debug("获取token的URL:" + url);

        URI uri = UriComponentsBuilder.fromUriString(url).build().encode().toUri();

        logger.debug("请求数据:{}", _objectMapper.writeValueAsString(data));

        HttpEntity<String> requestEntity = new HttpEntity<String>(data, requestHeaders);

        ResponseEntity<String> response = restTemplate.exchange(uri, HttpMethod.POST, requestEntity, String.class);
        String resp = response.getBody();
        logger.debug("请求返回值数据:{}", _objectMapper.writeValueAsString(resp));
    }
}

4、总结:

Jsoup对于这种页面抓取很好用!当然,也可能是因为这里只实现了一个最简单的页面抓取过程!

追加一个下载音频的代码:

package com.my.spider.service;

import java.net.HttpURLConnection;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map; import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Service; import com.alibaba.fastjson.JSONObject;
import com.my.spider.model.AudioInfo;
import com.my.spider.utils.FileUtils;
import com.my.spider.utils.HttpHtmlUtils;
import com.my.spider.utils.HttpURLConnectionFactory;

/**
 * Scraper for Ximalaya audio: walks the paged category listing, collects the
 * album pages, reads the sound ids of each album and downloads every track.
 */
@Service
public class XmlyAudioService {

    public static final Logger logger = LoggerFactory.getLogger(XmlyAudioService.class);

    /** Base address of the category listing; page N (N > 1) is at url + N + "/". */
    static String url = "http://www.ximalaya.com/dq/comic/";

    /** Base address of the track metadata; a track is at requetUrl + id + ".json". */
    static String requetUrl = "http://www.ximalaya.com/tracks/";

    /** Manual run: collects album pages, then sound ids, then downloads the tracks. */
    public static void main(String[] args) {
        List<String> audioUrlList = new ArrayList<String>();
        int count = getCount(url);
        // Page 1 is the base URL itself. Starting at 1 also covers a listing with
        // a single page, which was previously skipped altogether.
        for (int page = 1; page <= count; page++) {
            // Build each page URL locally instead of editing the shared field.
            String pageUrl = (page == 1) ? url : url + page + "/";
            audioUrlList.addAll(getAudioList(page, pageUrl));
        }

        // Parse every album page for its sound ids.
        List<String> audioList = new ArrayList<String>();
        for (String albumUrl : audioUrlList) {
            audioList.addAll(listAudio(albumUrl));
        }
        System.out.println(audioUrlList.size() + "==" + audioList.size());

        // Download every track.
        List<AudioInfo> audioInfos = new ArrayList<>();
        for (String soundId : audioList) {
            String trackUrl = requetUrl + soundId + ".json";
            System.out.println(trackUrl);
            audioInfos.add(downloadList(trackUrl));
        }
    }

    /**
     * Fetches one listing page, saves its HTML to disk and returns the album
     * links found on it.
     *
     * @param num page number, used only in the name of the saved HTML file
     * @param url address of the listing page
     * @return absolute album URLs; empty when the page could not be processed
     */
    public static List<String> getAudioList(int num,String url){
        List<String> list = new ArrayList<>();
        try {
            Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
            Document document = connect.timeout(5000).get();
            FileUtils.str2File(document.toString(), "G:\\xmly\\html\\comic" + num + ".html");
            Element el = document.getElementById("explore_album_detail_entry");
            if (el == null) {
                // The expected container is missing; there is nothing to collect.
                return list;
            }
            for (Element element : el.getElementsByClass("albumface")) {
                list.add(element.absUrl("href"));
            }
        } catch (Throwable e) {
            logger.error("获取{}网页信息失败,{}",url,e.getMessage(),e);
        }
        return list;
    }

    /**
     * Fetches one album page, saves its HTML to disk and returns the sound ids
     * listed in the "sound_ids" attribute of the first "personal_body" element.
     *
     * @param url address of the album page
     * @return the sound ids; empty when none are found or the page fails
     */
    public static List<String> listAudio(String url){
        List<String> list = new ArrayList<>();
        try {
            Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
            Document document = connect.timeout(5000).get();
            FileUtils.str2File(document.toString(), "G:\\xmly\\html\\comic_"+System.currentTimeMillis()+".html");
            Elements els = document.getElementsByClass("personal_body");
            if (!els.isEmpty()) {
                String sound_ids = els.get(0).attr("sound_ids");
                for (String id : sound_ids.split(",")) {
                    String trimmed = id.trim();
                    // A missing attribute yields "", which would otherwise become a
                    // bogus request for ".json".
                    if (!trimmed.isEmpty()) {
                        list.add(trimmed);
                    }
                }
            }
        } catch (Throwable e) {
            logger.error("获取{}网页信息失败,{}",url,e.getMessage(),e);
        }
        return list;
    }

    /**
     * Reads the track metadata JSON ("id", "title", "play_path") and downloads
     * the audio file it points to.
     *
     * @param url address of the track metadata JSON
     * @return the track information; fields stay unset when the metadata could
     *         not be read
     */
    @SuppressWarnings("unchecked")
    public static AudioInfo downloadList(String url){
        AudioInfo audioInfo = new AudioInfo();
        try {
            HttpURLConnection conn = HttpURLConnectionFactory.getConn(url);
            conn.setRequestProperty("Content-Type", "*/*; charset=utf-8");
            String audioJson = HttpURLConnectionFactory.sendGet(conn);
            Map<String,Object> map = (Map<String, Object>) JSONObject.parse(audioJson);
            audioInfo.setId(map.get("id").toString());
            audioInfo.setName(map.get("title").toString());
            audioInfo.setUrl(map.get("play_path").toString());
            try {
                FileUtils.downloadRenameFile(audioInfo.getUrl(), "G:\\xmly", audioInfo.getName()+".mp3");
            } catch (Throwable e) {
                // A failed download must not discard the metadata already read;
                // the trailing exception argument preserves the cause in the log.
                logger.error("{}下载失败,id={}",audioInfo.getName(),audioInfo.getId(),e);
            }
        } catch (Throwable e) {
            logger.error(e.getMessage(),e);
        }
        return audioInfo;
    }

    /**
     * Determines the total number of listing pages.
     *
     * @param url address of the first listing page
     * @return the page count; 1 when fewer than two "pagingBar_page" elements
     *         are present; 0 when the page could not be fetched or parsed
     */
    public static int getCount(String url) {
        try {
            Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
            Document document = connect.timeout(5000).get();
            Elements els = document.getElementsByClass("pagingBar_page");
            if(els.size() < 2) {
                return 1;
            }
            // The page count is read from the second-to-last paging element.
            Element pageCout = els.get(els.size()-2);
            return Integer.parseInt(pageCout.text());
        } catch (Throwable e) {
            logger.error(e.getMessage(),e);
        }
        return 0;
    }
}

xmly.java

新浪新闻页面抓取(JAVA-Jsoup)的更多相关文章

  1. Python爬虫:新浪新闻详情页的数据抓取(函数版)

    上一篇文章<Python爬虫:抓取新浪新闻数据>详细解说了如何抓取新浪新闻详情页的相关数据,但代码的构建不利于后续扩展,每次抓取新的详情页时都需要重新写一遍,因此,我们需要将其整理成函数, ...

  2. Python_网络爬虫(新浪新闻抓取)

    爬取前的准备: BeautifulSoup的导入:pip install BeautifulSoup4 requests的导入:pip install requests 下载jupyter noteb ...

  3. selenium+BeautifulSoup+phantomjs爬取新浪新闻

    一 下载phantomjs,把phantomjs.exe的文件路径加到环境变量中,也可以phantomjs.exe拷贝到一个已存在的环境变量路径中,比如我用的anaconda,我把phantomjs. ...

  4. python3爬虫-爬取新浪新闻首页所有新闻标题

    准备工作:安装requests和BeautifulSoup4.打开cmd,输入如下命令 pip install requests pip install BeautifulSoup4 打开我们要爬取的 ...

  5. 门户级UGC系统的技术进化路线——新浪新闻评论系统的架构演进和经验总结(转)

    add by zhj:先收藏了 摘要:评论系统是所有门户网站的核心标准服务组件之一.本文作者曾负责新浪网评论系统多年,这套系统不仅服务于门户新闻业务,还包括调查.投票等产品,经历了从单机到多机再到集群 ...

  6. 小爬新浪新闻AFCCL

    1.任务目标: 爬取新浪新闻AFCCL的文章:文章标题.时间.来源.内容.评论数等信息. 2.目标网页: http://sports.sina.com.cn/z/AFCCL/ 3.网页分析 4.源代码 ...

  7. 今天写了一个简单的新浪新闻RSS操作类库

    今天,有位群友问我如何获新浪新闻列表相关问题,我想,用正则表达式网页中取显然既复杂又不一定准确,现在许多大型网站都有RSS集合,所以我就跟他说用RSS应该好办一些. 一年前我写过一个RSS阅读器,不过 ...

  8. C# 页面抓取获取快递信息

    通过页面抓取信息可以获得很多我们想要的信息,比如现在常会用到的快递查询,主要抓取的网站为http://www.kuaidi100.com/ 通过IE的网络分析我们可以得到下面信息 通过对这个网站的分析 ...

  9. Lance老师UI系列教程第八课->新浪新闻SlidingMenu界面的实现

    UI系列教程第八课:Lance老师UI系列教程第八课->新浪新闻SlidingMenu界面的实现 今天蓝老师要讲的是关于新浪新闻侧滑界面的实现.先看看原图: 如图所示,这种侧滑效果以另一种方式替 ...

随机推荐

  1. 小白的Python之路 day1 字符编码

    字符编码 python解释器在加载 .py 文件中的代码时,会对内容进行编码(默认ascill) ASCII(American Standard Code for Information Interc ...

  2. iOS 懒加载模式

    感谢: chengfang iOS开发-懒加载 1.懒加载--也称为延迟加载,即在需要的时候才加载(效率低,占用内存小).所谓懒加载,写的是其get方法. 注意:如果是懒加载的话则一定要注意先判断是否 ...

  3. Servlet过滤器简单探索

    过滤器的工作时机介于浏览器和Servlet请求处理之间,可以拦截浏览器对Servlet的请求,也可以改变Servlet对浏览器的响应. 其工作模式大概是这样的: 一.Filter的原理 在Servle ...

  4. Java I/O---序列化接口Serializable

    1.JDK API 中关于Serializable的描述 public interface Serializable 类通过实现 java.io.Serializable 接口以启用其序列化功能.未实 ...

  5. C++ 头文件系列(set)

    简介 头文件包含set.multiset两个类模版,这里要描述的概念与map非常相似,甚至连成员函数都几乎一样,所以这篇随笔会很短. set set如果翻译成中文应该是集合的意思,这里更确切的说是唯一 ...

  6. ubuntu14.04 解决屏幕亮度无法调节的问题

    sudo gedit /etc/default/grub 在打开文件中找到 GRUB_CMDLINE_LINUX="" 改成 GRUB_CMDLINE_LINUX="ac ...

  7. 502 VS 504

    本文同时发表在https://github.com/zhangyachen/zhangyachen.github.io/issues/89 首先看一下概念: 502:作为网关或者代理工作的服务器尝试执 ...

  8. AntData.ORM框架 之 读写分离

    环境准备 准备2台机器配置好Master Slaver模式 我是用vmware 2台虚拟机配置的.有需要请联系. Master:192.168.11.130 Slaver:192.168.11.133 ...

  9. bzoj 2726: [SDOI2012]任务安排

    Description 机 器上有N个需要处理的任务,它们构成了一个序列.这些任务被标号为1到N,因此序列的排列为1,2,3...N.这N个任务被分成若干批,每批包含相邻的 若干任务.从时刻0开始,这 ...

  10. HTML知识点记录

    1.input的type设置为file时,设置multiple属性可以同时选择多个文件.