1、使用gradle建立工程:

工程格式如下:

  1. include ':spider-demo' // register the spider-demo sub-project in this multi-project build
  2.  
  3. rootProject.name = 'my-spider-demo' // name of the root project

settings.gradle

  /**
   * Pins every requested dependency that belongs to {@code group} to {@code version}.
   *
   * @param details dependency-resolve details handed over by resolutionStrategy.eachDependency
   * @param group   group id to match
   * @param version version to enforce
   */
  void forceVersion(details, group, version) {
      def requested = details.requested
      if (requested.group != group) {
          return
      }
      details.useVersion(version)
  }

  /**
   * Pins one specific module ({@code group}:{@code name}) to {@code version}.
   *
   * @param details dependency-resolve details handed over by resolutionStrategy.eachDependency
   * @param group   group id to match
   * @param name    module name to match
   * @param version version to enforce
   */
  void forceVersion(details, group, name, version) {
      def requested = details.requested
      boolean sameModule = requested.group == group && requested.name == name
      if (sameModule) {
          details.useVersion(version)
      }
  }
  12.  
  // Configuration shared by the root project and every sub-project.
  allprojects { p ->
      group = 'com.my.spider'
      version = '1.0.0'

      apply plugin: 'java'
      apply plugin: 'maven'
      apply plugin: 'maven-publish'

      [compileJava, compileTestJava]*.options*.encoding = 'UTF-8'

      // Fill in the manifest right before the jar task executes, so that
      // p.ext.Timestamp (assigned at the end of this closure) is already set.
      jar.doFirst {
          manifest {
              def manifestFile = "${projectDir}/META-INF/MANIFEST.MF"
              if (new File(manifestFile).exists())
                  from (manifestFile)

              attributes 'Implementation-Title': p.name
              if (p.version.endsWith('-SNAPSHOT')) {
                  // Snapshot builds get the build timestamp appended to the version
                  attributes 'Implementation-Version': p.version + '-' + p.ext.Timestamp
              } else {
                  attributes 'Implementation-Version': p.version
              }
              attributes 'Implementation-BuildDateTime': new Date()
          }
      }

      javadoc {
          options {
              encoding 'UTF-8'
              charSet 'UTF-8'
              author false
              version true
              // -link expects the directory that contains package-list / element-list,
              // not the index.html page inside it.
              links 'http://docs.oracle.com/javase/8/docs/api/'
              memberLevel = org.gradle.external.javadoc.JavadocMemberLevel.PRIVATE
          }
      }

      if (p.name.endsWith('-api')) {
          task sourcesJar(type: Jar, dependsOn: classes) {
              classifier = 'sources'
              from sourceSets.main.allSource
          }

          task javadocJar(type: Jar, dependsOn: javadoc) {
              classifier = 'javadoc'
              from javadoc.destinationDir
          }
      }

      publishing {
          repositories {
              maven {
                  credentials {
                      username "${repositoryUploadUsername}"
                      password "${repositoryUploadPassword}"
                  }

                  // Snapshots and releases are uploaded to different repositories
                  if (version.endsWith('-SNAPSHOT')) {
                      url "${repositoryUploadSnapshotUrl}"
                  } else {
                      url "${repositoryUploadReleaseUrl}"
                  }
              }
          }
          publications {
              mavenJava(MavenPublication) {
                  from components.java

                  // Only *-api projects publish sources and javadoc artifacts
                  if (p.name.endsWith('-api')) {
                      artifact sourcesJar {
                          classifier "sources"
                      }
                      artifact javadocJar {
                          classifier "javadoc"
                      }
                  }
              }
          }
      }

      // Publish as part of the build when the uploadArchives environment variable is set
      if (System.env.uploadArchives) {
          build.dependsOn publish
      }

      buildscript {
          repositories {
              maven {
                  name 'Maven Repository'
                  url "${repositoryMavenUrl}"
                  credentials {
                      username "${repositoryUsername}"
                      password "${repositoryPassword}"
                  }
              }
          }
          dependencies { classpath 'org.springframework.boot:spring-boot-gradle-plugin:1.4.0.RELEASE' }
      }

      afterEvaluate { Project project ->
          if (project.pluginManager.hasPlugin('java')) {
              configurations.all {
                  // Align the versions of the core library groups across all configurations
                  resolutionStrategy.eachDependency { DependencyResolveDetails details ->
                      forceVersion details, 'org.springframework.boot', '1.4.1.RELEASE'
                      forceVersion details, 'org.slf4j', '1.7.21'
                      forceVersion details, 'org.springframework', '4.3.3.RELEASE'
                  }

                  exclude module: 'slf4j-log4j12'
                  exclude module: 'log4j'
              }

              dependencies { testCompile 'junit:junit:4.12' }
          }
      }

      repositories {
          maven {
              name 'Maven Repository'
              url "${repositoryMavenUrl}"
              credentials {
                  username "${repositoryUsername}"
                  password "${repositoryPassword}"
              }
          }

          ivy {
              name 'Ivy Repository'
              url "${repositoryIvyUrl}"
              credentials {
                  username "${repositoryUsername}"
                  password "${repositoryPassword}"
              }
              layout "pattern", {
                  artifact '[organisation]/[module]/[revision]/[type]s/[artifact]-[revision].[ext]'
                  ivy '[organisation]/[module]/[revision]/[type]s/[artifact].[ext]'
                  m2compatible = true
              }
          }
      }

      // Timestamp: year, month, day, hour, minute
      p.ext.Timestamp = new Date().format('yyyyMMddHHmm')
      // Build number from the BUILD_NUMBER environment variable; 'x' when it is unset or empty
      p.ext.BuildNumber = System.env.BUILD_NUMBER ?: 'x'
  }
  162.  
  // Packs the project sources into <name>-<version>-<timestamp>.<build>-sources.zip,
  // leaving out hidden files and per-project output / runtime directories.
  task zipSources(type: Zip) {
      description '压缩源代码'

      def stamp = project.ext.Timestamp + '.' + project.ext.BuildNumber
      project.ext.zipSourcesFile = [project.name, project.version, stamp, 'sources.zip'].join('-')
      archiveName = project.ext.zipSourcesFile
      includeEmptyDirs = false

      from project.projectDir

      exclude '**/.*'
      exclude 'build/*'
      // Skip the generated / runtime directories of every project in the build
      allprojects.each { p ->
          ['bin', 'build', 'data', 'work', 'logs'].each { dirName ->
              exclude '**/' + p.name + '/' + dirName + '/*'
          }
      }
  }
  181.  
  /**
   * Builds the copy spec that lays out one application inside the setup archive.
   *
   * @param prj     project whose build output and scripts are packaged
   * @param dstname target directory inside the archive; falls back to the project name
   * @return copy spec containing the jar, example configs and start scripts
   */
  def CopySpec appCopySpec(Project prj, dstname = null) {
      dstname = dstname ?: prj.name
      String projectRoot = prj.projectDir.toString()
      String versionedName = prj.name + '-' + project.version

      return copySpec {
          // Fat jar
          from(prj.buildDir.toString() + '/libs/' + versionedName + '.jar') {
              into dstname
          }

          // Configs
          from(projectRoot + '/config/examples') {
              into dstname + '/config'
          }

          // Windows start script
          from(projectRoot + '/' + prj.name + '.bat') {
              into dstname
          }

          // Unix conf script, renamed so that it carries the version like the jar does
          from(projectRoot + '/' + prj.name + '.conf') {
              into dstname
              rename prj.name, versionedName
          }
      }
  }
  207.  
  // Builds the installation package once all sub-projects have been built.
  task zipSetup(type: Zip, dependsOn: subprojects.build) {
      description '制作安装包'

      def stamp = project.ext.Timestamp + '.' + project.ext.BuildNumber
      project.ext.zipSetupFile = [project.name, project.version, stamp, 'setup.zip'].join('-')
      // Same value as zipSetupFile, written out again as in the archive name expression
      archiveName = [project.name, project.version, stamp, 'setup.zip'].join('-')

      with appCopySpec(project(':spider-demo'))
  }
  215.  
  216. import java.security.MessageDigest
  217.  
  /**
   * Computes the MD5 checksum of a file.
   *
   * @param file file to hash
   * @return the digest as a 32-character lower-case hex string
   */
  def generateMD5(final file) {
      MessageDigest digest = MessageDigest.getInstance("MD5")
      // Stream the file in 8 KiB chunks so large archives are not loaded into memory
      file.eachByte(8192) { byte[] buffer, int read ->
          digest.update(buffer, 0, read)
      }
      // encodeHex keeps leading zeros; BigInteger.toString(16) would drop them
      // and return fewer than 32 characters for some files.
      return digest.digest().encodeHex().toString()
  }
  231.  
  // Prints the MD5 checksums of the setup and sources archives and writes them to
  // build/distributions/<name>-<version>-<timestamp>.<build>-md5.txt.
  task md5(dependsOn: [zipSetup, zipSources]) {
      // doLast replaces the deprecated '<<' task action operator
      doLast {
          def distDir = "${projectDir}/build/distributions/"
          String md5_setup = generateMD5(file(distDir + project.ext.zipSetupFile))
          String md5_sources = generateMD5(file(distDir + project.ext.zipSourcesFile))

          def lines = [
              project.ext.zipSetupFile + '=' + md5_setup,
              project.ext.zipSourcesFile + '=' + md5_sources
          ]
          lines.each { println it }

          def newFile = new File(distDir
              + project.name + '-' + project.version + '-' + project.ext.Timestamp + '.' + project.ext.BuildNumber + '-md5.txt')
          // withPrintWriter flushes and closes the writer even when writing fails
          newFile.withPrintWriter { writer ->
              lines.each { writer.println it }
          }
      }
  }
  246.  
  247. build.dependsOn subprojects.build, zipSetup, zipSources, md5

build.gradle(根工程)

子工程相关依赖:

  1. apply plugin: 'spring-boot' // Spring Boot Gradle plugin; its classpath entry is declared in the root script's buildscript block
  2. apply plugin: 'application'
  3.  
  4. distributions { // add the example configuration files to the main distribution
  5. main {
  6. contents {
  7. from ("${projectDir}/config/examples") {
  8. into "config" // placed under config/ inside the distribution
  9. }
  10. }
  11. }
  12. }
  13.  
  14. distTar.enabled = false // turn off the distTar task
  15.  
  16. springBoot {
  17. executable = true // NOTE(review): presumably requests a fully executable jar - confirm against the Spring Boot 1.4 plugin docs
  18. mainClass = 'com.my.spider.Application' // application entry point class
  19. }
  20.  
  21. dependencies {
  22. compile 'org.springframework.boot:spring-boot-starter-web:1.4.0.RELEASE'
  23. compile 'dom4j:dom4j:1.6.1'
  24. compile 'commons-httpclient:commons-httpclient:3.1'
  25. compileOnly 'com.h2database:h2:1.4.191' // compileOnly: on the compile classpath only
  26. compile 'javax.cache:cache-api:1.0.0'
  27. compile 'org.jboss.resteasy:resteasy-jaxrs:3.0.14.Final'
  28. compile 'org.jboss.resteasy:resteasy-client:3.0.14.Final'
  29. // Axis
  30. compile 'axis:axis:1.4'
  31. // jsoup: HTML fetching and parsing used by the crawler classes
  32. compile 'org.jsoup:jsoup:1.10.1'
  33. // fastjson: JSON parsing
  34. compile 'com.alibaba:fastjson:1.2.21'
  35.  
  36. }

build.gradle(子工程 spider-demo)

2、代码编写:

入口:

  1. package com.my.spider;
  2.  
  3. import java.io.IOException;
  4.  
  5. import org.springframework.boot.SpringApplication;
  6. import org.springframework.boot.autoconfigure.SpringBootApplication;
  7. import org.springframework.scheduling.annotation.EnableAsync;
  8. import org.springframework.scheduling.annotation.EnableScheduling;
  9.  
  10. import com.my.spider.utils.CommonProperties;
  11.  
  12. @SpringBootApplication
  13. @EnableScheduling
  14. @EnableAsync
  15. public class Application {
  16.  
  17. public static void main(String[] args) throws IOException {
  18. String loc = CommonProperties.loadProperties2System(System.getProperty("spring.config.location"));
  19. System.getProperties().setProperty("application.version", CommonProperties.getVersion(Application.class));
  20. System.getProperties().setProperty("app.home", loc + "/..");
  21. SpringApplication.run(Application.class, args);
  22. }
  23.  
  24. }
  1. package com.my.spider.utils;
  2.  
  3. import java.io.File;
  4. import java.io.FileInputStream;
  5. import java.io.IOException;
  6. import java.util.Properties;
  7.  
  8. import org.springframework.util.StringUtils;
  9.  
  /**
   * Helpers for bootstrapping configuration: loads external {@code *.properties}
   * files into the JVM system properties and exposes application metadata.
   */
  public final class CommonProperties {

      /** System-property key that holds the application home directory. */
      public static final String PPT_KEY_APP_HOME = "app.home";

      /** Home directory used when {@link #PPT_KEY_APP_HOME} is not set. */
      public static final String DEFAULT_APP_HOME = "./";

      /**
       * @return the value of the {@code app.home} system property, or
       *         {@link #DEFAULT_APP_HOME} when it is not set
       */
      public static final String getAppHome() {
          // Look the value up by its key; the default path is only the fallback value.
          return System.getProperty(PPT_KEY_APP_HOME, DEFAULT_APP_HOME);
      }

      /**
       * Loads every {@code *.properties} file found directly inside the config
       * directory into the system properties.
       *
       * @param location config directory; when null or empty, {@code ./config} is
       *                 tried first and {@code ../config} is used as the fallback
       * @return the config directory that was used
       * @throws IOException if the directory cannot be listed or a file cannot be read
       */
      public static String loadProperties2System(String location) throws IOException {
          String configLocation = location;
          if (configLocation == null || configLocation.isEmpty()) {
              configLocation = "./config";
              if (!new File(configLocation).isDirectory()) {
                  configLocation = "../config";
              }
          }

          File cnf = new File(configLocation);
          // listFiles() returns null when the path is missing or not a directory
          File[] entries = cnf.listFiles();
          if (entries == null) {
              throw new IOException("Config location is not a readable directory: " + cnf.getAbsolutePath());
          }

          for (File file : entries) {
              if (file.isFile() && file.getName().endsWith(".properties")) {
                  Properties ppt = new Properties();
                  try (FileInputStream fi = new FileInputStream(file)) {
                      ppt.load(fi);
                      System.getProperties().putAll(ppt);
                  }
              }
          }
          return configLocation;
      }

      /**
       * @param clazz class whose package metadata is inspected
       * @return the package's Implementation-Version, or {@code "undefined"} when
       *         the package or its version is unavailable
       */
      public static String getVersion(Class<?> clazz) {
          Package pkg = clazz.getPackage();
          String ver = (pkg != null ? pkg.getImplementationVersion() : null);
          return (ver == null ? "undefined" : ver);
      }
  }

配置类:

  1. package com.my.spider.config;
  2.  
  3. import org.springframework.context.annotation.ComponentScan;
  4. import org.springframework.context.annotation.Configuration;
  5. import org.springframework.scheduling.annotation.EnableScheduling;
  6.  
  7. @EnableScheduling
  8. @Configuration
  9. @ComponentScan(basePackages = {
  10. "com.my.spider.rs",
  11. "com.my.spider.schedule"
  12. })
  13. public class AppAutoConfiguration {
  14.  
  15. }

META-INF下spring.factories文件:

  1. org.springframework.boot.autoconfigure.EnableAutoConfiguration=\
  2. com.my.spider.config.AppAutoConfiguration

3、功能代码:

定时任务抽象类,提供三种定时任务的调用方法:

  1. package com.my.spider.schedule;
  2.  
  3. import org.slf4j.Logger;
  4. import org.slf4j.LoggerFactory;
  5. import org.springframework.beans.factory.DisposableBean;
  6. import org.springframework.beans.factory.InitializingBean;
  7. import org.springframework.scheduling.annotation.Scheduled;
  8. import org.springframework.stereotype.Component;
  9.  
  10. import com.fasterxml.jackson.databind.ObjectMapper;
  11.  
  12. @Component
  13. public abstract class ParentSchedule implements InitializingBean,DisposableBean{
  14.  
  15. public static Logger logger = LoggerFactory.getLogger(ParentSchedule.class);
  16.  
  17. public final static ObjectMapper objectMapper = new ObjectMapper();
  18.  
  19. @Scheduled(
  20. initialDelayString = "${agent.task.initialDelay:1000}", //
  21. fixedDelayString = "${agent.task.fixedDelay:10000}")
  22. public void dowork(){
  23. execute();
  24. }
  25. //定时任务一
  26. public abstract void execute();
  27.  
  28. @Scheduled(cron = "${agent.task.cron:0 0 10,14,16 * * ?}")
  29. public void timeTask(){
  30. executeTimeTask();
  31. }
  32. //定时任务三
  33. public abstract void executeTimeTask();
  34.  
  35. //每天12点出发
  36. @Scheduled(cron = "0 0 12 * * ?")
  37. public void otherTask(){
  38. executeOtherTask();
  39. }
  40. //定时任务三
  41. public abstract void executeOtherTask();
  42. }
  1. package com.my.spider.utils;
  2.  
  3. import java.util.HashMap;
  4. import java.util.Map;
  5.  
  6. import org.slf4j.Logger;
  7. import org.slf4j.LoggerFactory;
  8.  
  9. /**
  10. * 页面抓取请求的公共类
  11. * */
  12. public class HttpHtmlUtils {
  13.  
  14. public static Logger logger = LoggerFactory.getLogger(HttpHtmlUtils.class);
  15.  
  16. public static Map<String, String> header = new HashMap<String, String>();
  17.  
  18. public static Map<String, String> header_a = new HashMap<String, String>();
  19.  
  20. static {
  21. //设置请求头
  22. header.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0");
  23. header.put("Accept","text/javascript, text/html, application/xml, text/xml, */*");
  24. header.put("Accept-Language","zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
  25. header.put("Accept-Encoding","gzip, deflate");
  26. header.put("X-Requested-With","XMLHttpRequest");
  27. header.put("Content-Type","text/*, application/xml");
  28. header.put("Connection","keep-alive");
  29.  
  30. header_a.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0");
  31. header_a.put("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
  32. header_a.put("Accept-Language","zh-CN,zh;q=0.8");
  33. header_a.put("Accept-Encoding","gzip, deflate, sdch");
  34. header_a.put("Content-Type","application/octet-stream");
  35. header_a.put("Connection","keep-alive");
  36. header_a.put("Upgrade-Insecure-Requests", "1");
  37. }
  38.  
  39. }

新浪滚动新闻抓取实现下载和分析:

  1. package com.my.spider.schedule;
  2.  
  3. import java.io.IOException;
  4. import java.text.SimpleDateFormat;
  5. import java.util.ArrayList;
  6. import java.util.Date;
  7. import java.util.HashMap;
  8. import java.util.HashSet;
  9. import java.util.List;
  10. import java.util.Map;
  11. import java.util.Set;
  12.  
  13. import org.jsoup.Connection;
  14. import org.jsoup.Jsoup;
  15. import org.jsoup.nodes.Document;
  16. import org.jsoup.nodes.Element;
  17. import org.jsoup.select.Elements;
  18. import org.slf4j.Logger;
  19. import org.slf4j.LoggerFactory;
  20. import org.springframework.beans.factory.annotation.Value;
  21. import org.springframework.stereotype.Component;
  22. import org.springframework.util.StringUtils;
  23.  
  24. import com.my.spider.utils.FileUtils;
  25. import com.my.spider.utils.HttpHtmlUtils;
  26.  
  27. @Component
  28. public class SinaSchedule extends ParentSchedule {
  29.  
  30. private static Logger logger = LoggerFactory.getLogger(SinaSchedule.class);
  31.  
  32. public static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm");
  33.  
  34. public static SimpleDateFormat sdfYMD = new SimpleDateFormat("yyyy-MM-dd");
  35.  
  36. private static int downloadtimeout = 5000;
  37.  
  38. public static Set<String> titleSet = new HashSet<String>();
  39.  
  40. @Value("${img.download.dir.prefix:D://testhtml}")
  41. public String dirpath;
  42.  
  43. @Override
  44. public void afterPropertiesSet() throws Exception {
  45. // TODO Auto-generated method stub
  46.  
  47. }
  48.  
  49. // 抓取文章列表
  50. public static List<String> getArticleList(String url) {
  51.  
  52. List<String> urlList = new ArrayList<String>();
  53. logger.debug("获取文章信息url:{},开始时间={}", url, sdf.format(new Date()));
  54.  
  55. try {
  56. Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
  57. Document document;
  58. document = connect.timeout(downloadtimeout).get();
  59. Elements newsList = document.getElementsByClass("d_list_txt");
  60. if (newsList != null && newsList.size() > 0) {
  61. newsList = newsList.get(0).getElementsByTag("ul").get(0).getElementsByTag("li");
  62. for (Element el : newsList) {
  63. String elUrl = el.getElementsByTag("a").get(0).absUrl("href");
  64. String urlName = el.getElementsByTag("a").get(0).text();
  65. String time = el.getElementsByClass("c_time").get(0).text();
  66. logger.debug("获取新闻:{},访问地址:{},时间:{}",urlName,elUrl,time);
  67. //elUrl = el.getElementsByTag("a").get(0).attr("href");
  68. urlList.add(elUrl);
  69. }
  70. }
  71. logger.debug("获取文章列表信息:结束时间={}", sdf.format(new Date()));
  72. return urlList;
  73. } catch (IOException e) {
  74. logger.error("访问文章列表失败:" + url + " 原因" + e.getMessage());
  75. }
  76. return null;
  77. }
  78.  
  79. // 抓取文章列表
  80. public static Map<String, Object> getArticleInfo(String url) {
  81.  
  82. logger.debug("获取文章信息url:{},开始时间={}", url, sdf.format(new Date()));
  83. try {
  84. Map<String, Object> map = new HashMap<String, Object>();
  85. Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header);
  86. Document document;
  87. document = connect.timeout(downloadtimeout).get();
  88. Element titleEl = document.getElementById("artibodyTitle");
  89. String tilte = "";
  90.  
  91. if (titleEl != null) {
  92. tilte = titleEl.text();
  93. }
  94.  
  95. Elements keywords = document.getElementsByClass("article-keywords");
  96. String tag = "";
  97. StringBuffer sb = new StringBuffer();
  98. if (keywords != null ) {
  99. for (Element t : keywords.get(0).getElementsByTag("a")) {
  100. sb.append(t.text()).append(",");
  101. }
  102. if (!StringUtils.isEmpty(sb.toString())) {
  103. tag = sb.deleteCharAt(sb.lastIndexOf(",")).toString();
  104. }
  105. }
  106.  
  107. Element contentEle = document.getElementById("artibody");
  108. String content = "";
  109. String contentText = "";
  110. if (contentEle != null) {
  111. content = contentEle.html();
  112. contentText = contentEle.text();
  113. }
  114. String description = "";
  115. Elements descEle = document.getElementsByAttributeValue("name","description");
  116. if (descEle != null && descEle.size() > 0) {
  117. description = descEle.get(0).attr("content");
  118. }
  119. List<String> imgUrls = new ArrayList<>();
  120. Elements imgs = contentEle.getElementsByTag("img");
  121. if (imgs != null && imgs.size() > 0) {
  122. for (Element img : imgs) {
  123. String imgUrl = img.attr("src");
  124. if (!StringUtils.isEmpty(imgUrl)) {
  125. imgUrls.add(imgUrl);
  126. }
  127. }
  128. }
  129. map.put("imgs", imgUrls);
  130. map.put("description", description);
  131. map.put("content", content);
  132. map.put("contentText", contentText);
  133. map.put("tag", tag);
  134. map.put("title", tilte);
  135. logger.debug("获取文章信息:结束时间={}", sdf.format(new Date()));
  136.  
  137. return map;
  138. } catch (IOException e) {
  139. logger.error("访问文章页失败:" + url + " 原因" + e.getMessage());
  140. }
  141. return null;
  142. }
  143.  
  144. @Override
  145. public void destroy() throws Exception {
  146. // TODO Auto-generated method stub
  147.  
  148. }
  149.  
  150. public static void main(String[] args) {
  151. List<String> url = new ArrayList<>();
  152. url.addAll(getArticleList("http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&ch=0"
  153. + "1&k=&offset_page=0&offset_num=0&num=60&asc=&page=1"));
  154. titleSet.addAll(url);
  155. logger.debug("此次共获取到{}个",titleSet.size());
  156.  
  157. for (String urlStr : titleSet) {
  158. try {
  159. /*
  160. String htmlFile = FileUtils.downloadYunFile(urlStr, "D://testhtml//sina//"+sdfYMD.format(new Date()));
  161. Document document = Jsoup.parse(new File(htmlFile), "utf8");
  162. document.getElementsByTag("tilte");
  163. */
  164. //下载保存
  165. FileUtils.downloadYunFile(urlStr, "D://testhtml//sina//"+sdfYMD.format(new Date()));
  166.  
  167. getArticleInfo(urlStr);
  168.  
  169. } catch (Throwable e) {
  170.  
  171. }
  172.  
  173. }
  174. }
  175.  
  176. @Override
  177. public void execute() {
  178.  
  179. }
  180.  
  181. @Override
  182. public void executeTimeTask() {
  183. // TODO Auto-generated method stub
  184.  
  185. }
  186.  
  187. @Override
  188. public void executeOtherTask() {
  189. // TODO Auto-generated method stub
  190.  
  191. }
  192.  
  193. }

下载html文件代码:

  1. package com.my.spider.utils;
  2.  
  3. import java.io.File;
  4. import java.io.FileInputStream;
  5. import java.io.FileOutputStream;
  6. import java.io.IOException;
  7. import java.io.InputStream;
  8. import java.io.OutputStream;
  9. import java.net.URI;
  10. import java.util.Arrays;
  11.  
  12. import org.apache.http.client.config.RequestConfig;
  13. import org.apache.http.client.methods.CloseableHttpResponse;
  14. import org.apache.http.client.methods.HttpGet;
  15. import org.apache.http.impl.client.CloseableHttpClient;
  16. import org.apache.http.impl.client.HttpClients;
  17. import org.slf4j.Logger;
  18. import org.slf4j.LoggerFactory;
  19. import org.springframework.http.HttpEntity;
  20. import org.springframework.http.HttpHeaders;
  21. import org.springframework.http.HttpMethod;
  22. import org.springframework.http.MediaType;
  23. import org.springframework.http.ResponseEntity;
  24. import org.springframework.http.client.ClientHttpRequestFactory;
  25. import org.springframework.http.client.HttpComponentsClientHttpRequestFactory;
  26. import org.springframework.util.StreamUtils;
  27. import org.springframework.web.client.RestTemplate;
  28. import org.springframework.web.util.UriComponentsBuilder;
  29.  
  30. import com.fasterxml.jackson.databind.ObjectMapper;
  31.  
  32. public class FileUtils {
  33.  
  34. private static final Logger logger = LoggerFactory.getLogger(FileUtils.class);
  35.  
  36. private static ObjectMapper _objectMapper = new ObjectMapper();
  37.  
  38. private static int downloadTimeout = 5000;
  39.  
  40. public static void main(String[] args) throws Throwable {
  41. String filePath = "/temp/temp/test.mpg";
  42. String dirPrex = "/temp&Z:\\\\";
  43. String[] paths = dirPrex.split("&");
  44. System.out.println(paths[1] + filePath.substring(paths[0].length() + 1).replace("/", "\\"));
  45. }
  46.  
  47. // 文件复制
  48. public static void copy(String src, String dest) throws IOException {
  49.  
  50. System.out.println("正在拷贝【" + src + "】到【" + dest + "】\n");
  51. File destFile = new File(dest);
  52. if (!destFile.exists()) {
  53. String dir = dest.substring(0, dest.lastIndexOf(File.separator));
  54. File dirF = new File(dir);
  55. if (!dirF.exists() || !dirF.isDirectory()) {
  56. dirF.mkdirs();
  57. }
  58. destFile.createNewFile();
  59. }
  60. FileInputStream in = new FileInputStream(src);
  61. FileOutputStream out = new FileOutputStream(dest);
  62. byte[] buffer = new byte[40960];
  63. while (in.read(buffer) != -1) {
  64. out.write(buffer);
  65. out.flush();
  66. }
  67. in.close();
  68. out.close();
  69. }
  70.  
  71. // 下载云文件
  72. public static String downloadYunFile(String url, String dir) throws Throwable {
  73.  
  74. String fileName = getFileName(url);
  75.  
  76. String filePath = dir + File.separator + fileName;
  77.  
  78. try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
  79. HttpGet httpget = new HttpGet(url);
  80. httpget.setConfig(RequestConfig.custom() //
  81. .setConnectionRequestTimeout(downloadTimeout) //
  82. .setConnectTimeout(downloadTimeout) //
  83. .setSocketTimeout(downloadTimeout) //
  84. .build());
  85. try (CloseableHttpResponse response = httpclient.execute(httpget)) {
  86. org.apache.http.HttpEntity entity = response.getEntity();
  87. File desc = new File(filePath);
  88. File folder = desc.getParentFile();
  89. folder.mkdirs();
  90. try (InputStream is = entity.getContent(); //
  91. OutputStream os = new FileOutputStream(desc)) {
  92. StreamUtils.copy(is, os);
  93. }
  94. } catch (Throwable e) {
  95. throw new Throwable("文件下载失败......", e);
  96. }
  97. }
  98. return filePath;
  99. }
  100.  
  101. public static String getFileName(String fileFullPath) {
  102. fileFullPath = fileFullPath.replace("/", "\\");
  103. return fileFullPath.substring(fileFullPath.lastIndexOf("\\") + 1, fileFullPath.length());
  104. }
  105.  
  106. // 请求例子
  107. public void getToken(String url, String data) throws Throwable {
  108.  
  109. RestTemplate restTemplate = new RestTemplate();
  110. ClientHttpRequestFactory clientFactory = new HttpComponentsClientHttpRequestFactory();
  111. restTemplate.setRequestFactory(clientFactory);
  112.  
  113. HttpHeaders requestHeaders = new HttpHeaders();
  114. requestHeaders.setAccept(Arrays.asList(MediaType.APPLICATION_JSON_UTF8));
  115. requestHeaders.setContentType(MediaType.APPLICATION_JSON_UTF8);
  116. logger.debug("获取token的URL:" + url);
  117.  
  118. URI uri = UriComponentsBuilder.fromUriString(url).build().encode().toUri();
  119.  
  120. logger.debug("请求数据:{}", _objectMapper.writeValueAsString(data));
  121.  
  122. HttpEntity<String> requestEntity = new HttpEntity<String>(data, requestHeaders);
  123.  
  124. ResponseEntity<String> response = restTemplate.exchange(uri, HttpMethod.POST, requestEntity, String.class);
  125. String resp = response.getBody();
  126. logger.debug("请求返回值数据:{}", _objectMapper.writeValueAsString(resp));
  127. }
  128.  
  129. }

4、总结:

Jsoup 对于这种页面抓取很好用!当然,也可能是因为这里只实现了一个最简单的页面抓取过程。

追加一个下载音频的代码:

  1. package com.my.spider.service;
  2.  
  3. import java.net.HttpURLConnection;
  4. import java.util.ArrayList;
  5. import java.util.Arrays;
  6. import java.util.List;
  7. import java.util.Map;
  8.  
  9. import org.jsoup.Connection;
  10. import org.jsoup.Jsoup;
  11. import org.jsoup.nodes.Document;
  12. import org.jsoup.nodes.Element;
  13. import org.jsoup.select.Elements;
  14. import org.slf4j.Logger;
  15. import org.slf4j.LoggerFactory;
  16. import org.springframework.stereotype.Service;
  17.  
  18. import com.alibaba.fastjson.JSONObject;
  19. import com.my.spider.model.AudioInfo;
  20. import com.my.spider.utils.FileUtils;
  21. import com.my.spider.utils.HttpHtmlUtils;
  22. import com.my.spider.utils.HttpURLConnectionFactory;
  23.  
  24. @Service
  25. public class XmlyAudioService {
  26.  
  27. public static final Logger logger = LoggerFactory.getLogger(XmlyAudioService.class);
  28. static String url = "http://www.ximalaya.com/dq/comic/";
  29. static String requetUrl = "http://www.ximalaya.com/tracks/";
  30.  
  31. public static void main(String[] args) {
  32. List<String> audioUrlList = new ArrayList<String>();
  33. int count = getCount(url);
  34. if(count > 1) {
  35. audioUrlList.addAll(getAudioList(1,url));
  36. for (int i = 2; i <= count; i++) {
  37. url = url +i+"/";
  38. audioUrlList.addAll(getAudioList(i,url));
  39. url = url.replace(i+"/", "");
  40. }
  41. }
  42. List<String> audioList = new ArrayList<String>();
  43. //解析
  44. if(audioUrlList.size() > 0) {
  45. for (String url : audioUrlList) {
  46. audioList.addAll(listAudio(url));
  47. }
  48. }
  49. System.out.println(audioUrlList.size() + "==" + audioList.size());
  50. List<AudioInfo> audioInfos = new ArrayList<>();
  51. //下载
  52. for (String sound_id : audioList) {
  53. requetUrl = requetUrl + sound_id+".json";
  54. System.out.println(requetUrl);
  55. audioInfos.add(downloadList(requetUrl));
  56. requetUrl = requetUrl.replace(sound_id+".json", "");
  57. }
  58. }
  59.  
  60. //获取音频页详情
  61. public static List<String> getAudioList(int num,String url){
  62. List<String> list = new ArrayList<>();
  63. try {
  64. Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
  65. Document document = connect.timeout(5000).get();
  66. FileUtils.str2File(document.toString(), "G:\\xmly\\html\\comic" + num + ".html");
  67. Element el = document.getElementById("explore_album_detail_entry");
  68. Elements els = el.getElementsByClass("albumface");
  69. for (Element element : els) {
  70. list.add(element.absUrl("href"));
  71. }
  72. } catch (Throwable e) {
  73. logger.error("获取{}网页信息失败,{}",url,e.getMessage(),e);
  74. }
  75. return list;
  76. }
  77.  
  78. public static List<String> listAudio(String url){
  79. List<String> list = new ArrayList<>();
  80. try {
  81. Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
  82. Document document = connect.timeout(5000).get();
  83. FileUtils.str2File(document.toString(), "G:\\xmly\\html\\comic_"+System.currentTimeMillis()+".html");
  84. Elements els = document.getElementsByClass("personal_body");
  85. if(els!=null && els.size() > 0) {
  86. String sound_ids = els.get(0).attr("sound_ids");
  87. list.addAll(Arrays.asList(sound_ids.split(",")));
  88. }
  89. } catch (Throwable e) {
  90. logger.error("获取{}网页信息失败,{}",url,e.getMessage(),e);
  91. }
  92. return list;
  93. }
  94.  
  95. //
  96. @SuppressWarnings("unchecked")
  97. public static AudioInfo downloadList(String url){
  98. AudioInfo audioInfo = new AudioInfo();
  99. try {
  100.  
  101. HttpURLConnection conn = HttpURLConnectionFactory.getConn(url);
  102. conn.setRequestProperty("Content-Type", "*/*; charset=utf-8");
  103. String audioJson = HttpURLConnectionFactory.sendGet(conn);
  104. Map<String,Object> map = (Map<String, Object>) JSONObject.parse(audioJson);
  105. audioInfo.setId(map.get("id").toString());
  106. audioInfo.setName(map.get("title").toString());
  107. audioInfo.setUrl(map.get("play_path").toString());
  108. try {
  109. FileUtils.downloadRenameFile(audioInfo.getUrl(), "G:\\xmly", audioInfo.getName()+".mp3");
  110. } catch (Throwable e) {
  111. logger.error("{}下载失败,id={}",audioInfo.getName(),audioInfo.getId());;
  112. }
  113.  
  114. } catch (Throwable e) {
  115. logger.error(e.getMessage(),e);
  116. }
  117. return audioInfo;
  118. }
  119.  
  120. //获取总页数页数
  121. public static int getCount(String url) {
  122. try {
  123. Connection connect = Jsoup.connect(url).headers(HttpHtmlUtils.header_a);
  124. Document document = connect.timeout(5000).get();
  125. Elements els = document.getElementsByClass("pagingBar_page");
  126. if(els.size() < 2) {
  127. return 1;
  128. }
  129. Element pageCout = els.get(els.size()-2);
  130. return Integer.valueOf(pageCout.text());
  131. } catch (Throwable e) {
  132. e.printStackTrace();
  133. }
  134. return 0;
  135. }
  136.  
  137. }

xmly.java

新浪新闻页面抓取(JAVA-Jsoup)的更多相关文章

  1. Python爬虫:新浪新闻详情页的数据抓取(函数版)

    上一篇文章<Python爬虫:抓取新浪新闻数据>详细解说了如何抓取新浪新闻详情页的相关数据,但代码的构建不利于后续扩展,每次抓取新的详情页时都需要重新写一遍,因此,我们需要将其整理成函数, ...

  2. Python_网络爬虫(新浪新闻抓取)

    爬取前的准备: BeautifulSoup的导入:pip install BeautifulSoup4 requests的导入:pip install requests 下载jupyter noteb ...

  3. selenium+BeautifulSoup+phantomjs爬取新浪新闻

    一 下载phantomjs,把phantomjs.exe的文件路径加到环境变量中,也可以phantomjs.exe拷贝到一个已存在的环境变量路径中,比如我用的anaconda,我把phantomjs. ...

  4. python3爬虫-爬取新浪新闻首页所有新闻标题

    准备工作:安装requests和BeautifulSoup4.打开cmd,输入如下命令 pip install requests pip install BeautifulSoup4 打开我们要爬取的 ...

  5. 门户级UGC系统的技术进化路线——新浪新闻评论系统的架构演进和经验总结(转)

    add by zhj:先收藏了 摘要:评论系统是所有门户网站的核心标准服务组件之一.本文作者曾负责新浪网评论系统多年,这套系统不仅服务于门户新闻业务,还包括调查.投票等产品,经历了从单机到多机再到集群 ...

  6. 小爬新浪新闻AFCCL

    1.任务目标: 爬取新浪新闻AFCCL的文章:文章标题.时间.来源.内容.评论数等信息. 2.目标网页: http://sports.sina.com.cn/z/AFCCL/ 3.网页分析 4.源代码 ...

  7. 今天写了一个简单的新浪新闻RSS操作类库

    今天,有位群友问我如何获新浪新闻列表相关问题,我想,用正则表达式网页中取显然既复杂又不一定准确,现在许多大型网站都有RSS集合,所以我就跟他说用RSS应该好办一些. 一年前我写过一个RSS阅读器,不过 ...

  8. C# 页面抓取获取快递信息

    通过页面抓取信息可以获得很多我们想要的信息,比如现在常会用到的快递查询,主要抓取的网站为http://www.kuaidi100.com/ 通过IE的网络分析我们可以得到下面信息 通过对这个网站的分析 ...

  9. Lance老师UI系列教程第八课->新浪新闻SlidingMenu界面的实现

    UI系列教程第八课:Lance老师UI系列教程第八课->新浪新闻SlidingMenu界面的实现 今天蓝老师要讲的是关于新浪新闻侧滑界面的实现.先看看原图: 如图所示,这种侧滑效果以另一种方式替 ...

随机推荐

  1. 【CSS3】动画animation-关键帧keyframes

    <!DOCTYPE html> <html> <head> <meta charset="utf-8"> <title> ...

  2. 【java】打印流的基本实现及java.io.PrintStream、java.io.PrintWriter示例

    package 打印流; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; impor ...

  3. Oracle数据库(三)表操作,连接查询,分页

    复制表 --复制表 create table new_table as select * from Product --复制表结构不要数据 在where后面跟一个不成立的条件,就会仅复制表的结构而不复 ...

  4. 基于阿里云的MQTT远程控制

    好久没有写博客了,眼看自己的项目就要快做完了,先分享一下基于MQTT的远程控制,自己买了一个阿里的云端,然后在云端上安装了一个MQTT服务器,其实是一不小心买了两个,所以准备贡献出来一个供大家使用, ...

  5. 7.18 DP考试解题报告

    今天的考试真的是天崩地裂,写了的三个题全炸...然而谁叫我弱+不注意细节呢???真的要扇耳光... T1:题意:一段区间的高度为这个区间中高度的最小值,给定n个宽度,求每个宽度的期望高度 40% :算 ...

  6. Spark源码剖析(六):Worker原理与源码剖析

    上篇文章我们剖析了Master的原理和源码,知道了当Master使用资源分配算法将资源分配完成后,就会给对应的Worker发送启动Driver或者Executor的消息,那么Worker收到这些消息后 ...

  7. Linux第八讲随笔 -tar / 系统启动流程

    linux 第八讲1.tar 参考 作用:压缩和解压文件.tar本身不具有压缩功能.他是调用压缩功能实现的. 语法:tar[必要参数][选择参数][文件] 参数:必要参数有如下: -A 新增压缩文件到 ...

  8. [编织消息框架][JAVA核心技术]annotation基础

    应用动态代理技术要先掌握annotation技术 注解是JDK1.5之后才有的新特性,JDK1.5之后内部提供的三个注解 @Deprecated 意思是“废弃的,过时的” @Override 意思是“ ...

  9. SQL SERVER 常用知识整理

    以前写了一些关于sql的文章,包括一些转载的,这里做下整理,方便需要时候使用 一.基础运用 SQL 数据结构操作语句 SQL 时间处理 SQL 常见函数使用 CASE WHEN THEN 小结 二.优 ...

  10. NFS : device is busy

    unmount [ options ] -f : Force unmount (in case of an unreachable NFS system). -l  : Lazy unmount. D ...