一、代码演示

如果中途中断,可在代码中筛选过滤掉已拉取的省份数据,然后重新执行

/**
 * Crawls the administrative-division code tables published on www.stats.gov.cn
 * (province, city, district/county, street/town, community/village) and stores
 * every level through {@link ProvinceService}.
 *
 * <p>The controller keeps no crawl state in fields: the base URL needed to
 * resolve relative links is derived from the URL of the page being parsed.
 *
 * @author kevin
 * @createTime 2019-11-18 19:37
 */
@RestController
public class CityController {

    /** Download attempts made for one page before its subtree is skipped. */
    private static final int MAX_FETCH_ATTEMPTS = 5;

    @Autowired
    private ProvinceService provinceService;
    @Autowired
    private HttpUtil httpUtil;

    // Row CSS classes per level: {"provincetr", "citytr", "countytr", "towntr", "villagetr"}

    /**
     * Starts a full crawl of the most recent year listed on the index page.
     *
     * @return {@code "fail"} when the index page or the year page cannot be
     *         downloaded, {@code "spider crawl end."} otherwise
     * @throws Exception if a page has an unexpected structure or a service call fails
     */
    @GetMapping("/start")
    public ResultTemplate<String> spider() throws Exception {
        String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/";
        String charset = "gb2312";
        Document rootDoc = httpUtil.get(url, charset);
        if (rootDoc == null) {
            return of("fail");
        }
        Element firstElement = rootDoc.getElementsByClass("center_list_contlist").get(0);
        // Link of the most recent year, e.g.
        // http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/index.html
        String yearHref = firstElement.select("a").get(0).attr("href");
        Document doc = httpUtil.get(yearHref, charset);
        if (doc == null) {
            // httpUtil.get reports failures as null; without this check the next line would NPE.
            return of("fail");
        }
        String yearBase = baseOf(yearHref);

        // Every <a> inside a "provincetr" row is one province.
        for (Element element : doc.getElementsByClass("provincetr")) {
            for (Element aEle : element.select("a")) {
                String name = aEle.text();
                // e.g. "11.html" -> code "11",
                // page http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/11.html
                String provincesHref = aEle.attr("href");
                String code = codeOf(provincesHref);
                String provinceUrl = yearBase + provincesHref;

                if (isAlreadyCrawled(name)) {
                    System.out.println("未执行市:" + name);
                    continue;
                }
                System.out.println("开始时间:" + LocalDateTime.now());
                System.out.println("省名称:" + name);
                DicProvince province = new DicProvince()
                        .setProvinceName(name)
                        .setProvinceCode(code)
                        .setCountryId(1196612453660643329L)
                        .setCreateDate(LocalDateTime.now())
                        .setCreateUserid(1L)
                        .setCreateUsername("admin");
                Long id = provinceService.insertProvince(province);
                getCites(provinceUrl, charset, id);
            }
        }
        return of("spider crawl end.");
    }

    /**
     * Resume filter: provinces listed here are skipped. Edit the list after an
     * interrupted run to avoid re-crawling provinces that were already stored.
     */
    private static boolean isAlreadyCrawled(String provinceName) {
        return "北京市".equals(provinceName)
                || "天津市".equals(provinceName)
                || "河北省".equals(provinceName);
    }

    /**
     * Stores every city ("citytr" row) of a province page and descends into its districts.
     *
     * @param url        absolute URL of the province page
     * @param charset    charset used to decode the page
     * @param provinceId id of the owning province record
     */
    private void getCites(String url, String charset, Long provinceId) throws Exception {
        Document rootDoc = fetchWithRetry(url, charset);
        if (rootDoc == null) {
            return;
        }
        String base = baseOf(url);
        for (Element rowElement : rootDoc.getElementsByClass("citytr")) {
            // The second link carries the city name; its href looks like "11/1101.html".
            Row row = Row.parse(rowElement, base);
            DicCity city = new DicCity()
                    .setCityName(row.name)
                    .setCityCode(row.code)
                    .setProvinceId(provinceId)
                    .setCreateDate(LocalDateTime.now())
                    .setCreateUserid(1L)
                    .setCreateUsername("admin");
            Long id = provinceService.insertCity(city);
            if (row.childUrl != null) {
                getDistrict(row.childUrl, charset, id);
            }
        }
    }

    /**
     * Stores every district/county ("countytr" row) of a city page and descends
     * into its streets. Rows without a drill-down link are stored from their
     * table cells and not descended into.
     *
     * @param url     absolute URL of the city page
     * @param charset charset used to decode the page
     * @param idDis   id of the owning city record
     */
    private void getDistrict(String url, String charset, Long idDis) throws Exception {
        Document rootDoc = fetchWithRetry(url, charset);
        if (rootDoc == null) {
            return;
        }
        String base = baseOf(url);
        for (Element rowElement : rootDoc.getElementsByClass("countytr")) {
            Row row = Row.parse(rowElement, base);
            boolean leaf = row.childUrl == null;
            if (leaf) {
                System.out.println("市辖区");
            }
            DicDistrict district = new DicDistrict()
                    .setDistrictName(row.name)
                    .setDistrictCode(row.code)
                    .setCityId(idDis)
                    .setCreateDate(LocalDateTime.now())
                    .setCreateUserid(1L)
                    .setCreateUsername("admin");
            Long id = provinceService.insertDistrict(district);
            if (leaf) {
                System.out.println("执行完毕");
            } else {
                getStreet(row.childUrl, charset, id);
            }
        }
    }

    /**
     * Stores every street/town ("towntr" row) of a district page and descends
     * into its communities.
     *
     * @param url     absolute URL of the district page
     * @param charset charset used to decode the page
     * @param idStr   id of the owning district record
     */
    private void getStreet(String url, String charset, Long idStr) throws Exception {
        Document rootDoc = fetchWithRetry(url, charset);
        if (rootDoc == null) {
            return;
        }
        String base = baseOf(url);
        for (Element rowElement : rootDoc.getElementsByClass("towntr")) {
            Row row = Row.parse(rowElement, base);
            DicStreet street = new DicStreet()
                    .setStreetName(row.name)
                    .setStreetCode(row.code)
                    .setDistrictId(idStr)
                    .setCreateDate(LocalDateTime.now())
                    .setCreateUserid(1L)
                    .setCreateUsername("admin");
            Long id = provinceService.insertStreet(street);
            if (row.childUrl != null) {
                getCommunity(row.childUrl, charset, id);
            }
        }
    }

    /**
     * Stores every community/village ("villagetr" row) of a street page. This is
     * the last level; rows are read from their three table cells.
     *
     * @param url     absolute URL of the street page
     * @param charset charset used to decode the page
     * @param idPro   id of the owning street record
     */
    private void getCommunity(String url, String charset, Long idPro) throws Exception {
        Document rootDoc = fetchWithRetry(url, charset);
        if (rootDoc == null) {
            return;
        }
        for (Element rowElement : rootDoc.getElementsByClass("villagetr")) {
            // Cell order: code, classification code, name.
            Elements cells = rowElement.select("td");
            String code = cells.get(0).text();
            String classificationCode = cells.get(1).text();
            String name = cells.get(2).text();
            DicCommunity community = new DicCommunity()
                    .setCommunityName(name)
                    .setCommunityCode(code)
                    .setClassificationCode(classificationCode)
                    .setStreetId(idPro)
                    .setCreateDate(LocalDateTime.now())
                    .setCreateUserid(1L)
                    .setCreateUsername("admin");
            provinceService.insertCommunity(community);
        }
    }

    /**
     * Downloads a page, retrying on failure up to {@link #MAX_FETCH_ATTEMPTS} times.
     *
     * @return the parsed page, or {@code null} when every attempt failed; callers
     *         skip the corresponding subtree in that case
     */
    private Document fetchWithRetry(String url, String charset) {
        for (int attempt = 1; attempt <= MAX_FETCH_ATTEMPTS; attempt++) {
            if (attempt >= 3) {
                System.out.println("循环次数:" + attempt);
            }
            try {
                Document page = httpUtil.get(url, charset);
                if (page != null) {
                    return page;
                }
            } catch (Exception e) {
                System.out.println("请求网页链接报错");
            }
        }
        System.out.println("giving up after " + MAX_FETCH_ATTEMPTS + " attempts: " + url);
        return null;
    }

    /** Returns the URL up to and including its last '/', used to resolve relative links. */
    private static String baseOf(String url) {
        return url.substring(0, url.lastIndexOf("/") + 1);
    }

    /** Extracts the code from a link target, e.g. "11/1101.html" -> "1101". */
    private static String codeOf(String href) {
        return href.substring(href.lastIndexOf("/") + 1, href.indexOf("."));
    }

    /** Code, name and optional child-page URL read from one table row. */
    private static final class Row {
        final String code;
        final String name;
        /** Absolute URL of the next level, or {@code null} when the row has no link. */
        final String childUrl;

        private Row(String code, String name, String childUrl) {
            this.code = code;
            this.name = name;
            this.childUrl = childUrl;
        }

        /**
         * Parses a row. When the row has at least two links, the second one
         * supplies the name and the code is taken from its href. Otherwise code
         * and name are read from the first two table cells.
         */
        static Row parse(Element row, String base) {
            Elements links = row.select("a");
            if (links.size() >= 2) {
                Element nameLink = links.get(1);
                String href = nameLink.attr("href");
                return new Row(codeOf(href), nameLink.text(), base + href);
            }
            Elements cells = row.select("td");
            return new Row(cells.get(0).text(), cells.get(1).text(), null);
        }
    }
}

二、HttpUtil工具类

/**
 * Small HTTP helper that downloads a page with {@link HttpURLConnection} and
 * parses it into a Jsoup {@code Document}.
 *
 * @author kevin
 * @createTime 2019-11-20 9:17
 */
@Component
public class HttpUtil {

    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36";

    /** Connect and read timeout, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 80000;

    /**
     * Performs a GET request and parses the response body.
     *
     * @param url     absolute URL to download; also used as the document base URI
     * @param charset charset name used to decode the response body
     * @return the parsed document, or {@code null} when reading or parsing the
     *         response fails (callers rely on {@code null} to trigger a retry)
     * @throws IOException if the URL is malformed or the connection cannot be set up
     */
    public Document get(String url, String charset) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        try {
            connection.setRequestMethod("GET");
            // Caching is allowed by default; always fetch a fresh copy.
            connection.setUseCaches(false);
            connection.addRequestProperty("Connection", "close");
            connection.addRequestProperty("user-agent", USER_AGENT);
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);

            // Close the response stream explicitly so a failed or partial read
            // does not leave the socket open.
            try (java.io.InputStream body = connection.getInputStream()) {
                return Jsoup.parse(body, charset, url);
            } catch (Exception e) {
                System.out.println("parse error: " + url + " (" + e + ")");
                return null;
            }
        } finally {
            connection.disconnect();
        }
    }
}

三、service部分,根据需要自行定义数据库表

/**
 * {@link ProvinceService} implementation that persists each administrative
 * level through its mapper and returns the id found on the entity afterwards.
 *
 * <p>Each insert is retried a bounded number of times. If it still does not
 * report exactly one affected row, an {@link IllegalStateException} is thrown
 * instead of retrying forever.
 *
 * @author kevin
 * @createTime 2019-11-18 20:41
 */
@Service
public class ProvinceServiceImpl implements ProvinceService {

    /** Attempts made for one insert before giving up. */
    private static final int MAX_INSERT_ATTEMPTS = 3;

    @Autowired
    private ProvinceMapper provinceMapper;
    @Autowired
    private CityMapper cityMapper;
    @Autowired
    private DistrictMapper districtMapper;
    @Autowired
    private StreetMapper streetMapper;
    @Autowired
    private CommunityMapper communityMapper;

    /**
     * Inserts a province.
     *
     * @return the province id read from the entity after the insert
     * @throws IllegalStateException if the insert keeps failing
     */
    @Override
    public Long insertProvince(DicProvince dicProvince) {
        insertWithRetry(() -> provinceMapper.insert(dicProvince), "插入省数据失败");
        return dicProvince.getProvinceId();
    }

    /**
     * Inserts a city.
     *
     * @return the city id read from the entity after the insert
     * @throws IllegalStateException if the insert keeps failing
     */
    @Override
    public Long insertCity(DicCity dicCity) {
        insertWithRetry(() -> cityMapper.insert(dicCity), "插入市数据失败");
        return dicCity.getCityId();
    }

    /**
     * Inserts a district/county.
     *
     * @return the district id read from the entity after the insert
     * @throws IllegalStateException if the insert keeps failing
     */
    @Override
    public Long insertDistrict(DicDistrict dicDistrict) {
        insertWithRetry(() -> districtMapper.insert(dicDistrict), "插入区县数据失败");
        return dicDistrict.getDistrictId();
    }

    /**
     * Inserts a street/town.
     *
     * @return the street id read from the entity after the insert
     * @throws IllegalStateException if the insert keeps failing
     */
    @Override
    public Long insertStreet(DicStreet dicStreet) {
        insertWithRetry(() -> streetMapper.insert(dicStreet), "插入街道数据失败");
        return dicStreet.getStreetId();
    }

    /**
     * Inserts a community/village.
     *
     * @return the community id read from the entity after the insert
     * @throws IllegalStateException if the insert keeps failing
     */
    @Override
    public Long insertCommunity(DicCommunity dicCommunity) {
        insertWithRetry(() -> communityMapper.insert(dicCommunity), "插入社区数据失败");
        return dicCommunity.getCommunityId();
    }

    /**
     * Runs {@code insert} until it reports exactly one affected row, at most
     * {@link #MAX_INSERT_ATTEMPTS} times.
     *
     * @param insert         the mapper call; its result is the affected-row count
     * @param failureMessage message printed on each failed attempt and used for
     *                       the final exception
     * @throws IllegalStateException when no attempt succeeded; the last caught
     *                               exception, if any, is attached as the cause
     */
    private static void insertWithRetry(java.util.concurrent.Callable<Integer> insert,
                                        String failureMessage) {
        Exception lastFailure = null;
        for (int attempt = 1; attempt <= MAX_INSERT_ATTEMPTS; attempt++) {
            try {
                int affectedRows = insert.call();
                if (affectedRows == 1) {
                    return;
                }
            } catch (Exception e) {
                lastFailure = e;
                System.out.println(failureMessage);
                e.printStackTrace();
            }
        }
        throw new IllegalStateException(
                failureMessage + " (attempts: " + MAX_INSERT_ATTEMPTS + ")", lastFailure);
    }
}

  

基于【 springBoot+jsoup】一 || 爬取全国行政区划数据的更多相关文章

  1. Java使用Jsoup之爬取博客数据应用实例

    导入Maven依赖 <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup --> <dependency> <g ...

  2. python爬虫学习之爬取全国各省市县级城市邮政编码

    实例需求:运用python语言在http://www.ip138.com/post/网站爬取全国各个省市县级城市的邮政编码,并且保存在excel文件中 实例环境:python3.7 requests库 ...

  3. Python爬取招聘网站数据,给学习、求职一点参考

    1.项目背景 随着科技的飞速发展,数据呈现爆发式的增长,任何人都摆脱不了与数据打交道,社会对于“数据”方面的人才需求也在不断增大.因此了解当下企业究竟需要招聘什么样的人才?需要什么样的技能?不管是对于 ...

  4. Java实现爬取京东手机数据

    Java实现爬取京东手机数据 最近看了某马的Java爬虫视频,看完后自己上手操作了下,基本达到了爬数据的要求,HTML页面源码也刚好复习了下,之前发布两篇关于简单爬虫的文章,也刚好用得上.项目没什么太 ...

  5. Java爬取同花顺股票数据(附源码)

    最近有小伙伴问我能不能抓取同花顺的数据,最近股票行情还不错,想把数据抓下来自己分析分析.我大A股,大家都知道的,一个概念火了,相应的股票就都大涨. 如果能及时获取股票涨跌信息,那就能在刚开始火起来的时 ...

  6. Scrapy 通过登录的方式爬取豆瓣影评数据

    Scrapy 通过登录的方式爬取豆瓣影评数据 爬虫 Scrapy 豆瓣 Fly 由于需要爬取影评数据在来做分析,就选择了豆瓣影评来抓取数据,工具使用的是Scrapy工具来实现.scrapy工具使用起来 ...

  7. selenium跳过webdriver检测并爬取天猫商品数据

    目录 简介 编写思路 使用教程 演示图片 源代码 @(文章目录) 简介 现在爬取淘宝,天猫商品数据都是需要首先进行登录的.上一节我们已经完成了模拟登录淘宝的步骤,所以在此不详细讲如何模拟登录淘宝.把关 ...

  8. 【scrapy_redis】调试后爬取了部分数据,然后重新调试时,直接被去重机制过滤掉无法重头开始爬取

    这2天遇到一个问题,之前调试的时候爬取了一些数据,结果第二天重新调试的时候发现爬虫很快结束,而且还没有报错.后来从日志里看到这个: no more duplicates will be shown ( ...

  9. node 爬虫 --- 将爬取到的数据,保存到 mysql 数据库中

    步骤一:安装必要模块 (1)cheerio模块 ,一个类似jQuery的选择器模块,分析HTML利器. (2)request模块,让http请求变的更加简单 (3)mysql模块,node连接mysq ...

随机推荐

  1. net.ipv4.tcp_fin_timeout的错误理解

        按照文档的说法,貌似长久以来我对于tcp_fin_timeout的理解都是错误的 先备份在这里,再验证 提高Linux应对短连接的负载能力 在存在大量短连接的情况下,Linux的TCP栈一般都 ...

  2. python-learning-第二季-数据处理numpy

    https://www.bjsxt.com/down/8468.html numpy-科学计算基础库 例子: import numpy as np #创建数组 a = np.arange() prin ...

  3. DECODE函数和CASE WHEN 比较

    http://blog.csdn.net/zhangbingtao2011/article/details/51384393 一,DECODE函数 其基本语法为: DECODE(value, if1, ...

  4. 123457123456#0#----com.tym.ErTongFanPai20--前拼后广--儿童FanPai_tym

    com.tym.ErTongFanPai20--前拼后广--儿童FanPai_tym

  5. HR数据分析常用的50个公式

    HR数据分析常用的50个公式 HR经常需要和数据打交道,如入职率.离职率.加班费计算等.虽然日常工作中,单个数据的计算并不麻烦,但几十上百个累计在一起,确实很容易混淆,甚至计算失误.今天小编急HR之所 ...

  6. Win10下载安装PostgreSQL 11.1

    下载地址:https://get.enterprisedb.com/postgresql/postgresql-11.1-1-windows-x64.exe Installation Director ...

  7. 【转】百万年薪挖了p8,难道是水货?

    大厦新搬进来一家创业公司,老板红光满面地提着果篮上楼拜访,说是刚拿到了投资人的钱,正准备扩充团队大干一场.那个时候的他踌躇满志,顾盼生辉.当时我想,能在这个大环境下拿到投资的公司,做的产品应该是有前景 ...

  8. 【WAP触屏】YouKu视频弹窗播放组件

    (function(window){ /* youku api : http://open.youku.com/tools 调用方法 : LM_youkuPop.open('XODI5Mzk3MDAw ...

  9. 【GStreamer开发】GStreamer基础教程04——时间管理

    目标 本教程主要讲述一些和时间相关的内容.主要包括: 1. 如何问pipeline查询到流的总时间和当前播放的时间 2. 如何在流内部实现跳转功能 介绍 GstQuery是向一个element或者pa ...

  10. sudo权限配置

    首先要禁止root的用户登录ssh  在ssh配置文件里面把root用户no掉,一般公司不允许用第三方软件直接root登陆. 一.linux给用户添加sudo权限: 有时候,linux下面运行sudo ...