原文出处http://www.yund.tech/zdetail.html?type=1&id=ee06002e2b83e7677c30aedc52d3429e

作者:jstarseven


现在的网站千奇百怪,什么样格式的都有,需要提取网页中的列表数据,有时候挨个分析处理很头疼,本文是一个页面结构分析的程序,可以分析处理页面大致列表结构。

废话不多说,我也不会说,show me code,code is terrible,so what  hahaha。-------jstarseven

1.抽取元素dom结构框架

     /**
      * Reduces an element to its bare DOM skeleton so that nodes can be compared by structure.
      *
      * <p>The element is re-parsed into a separate document, so the element passed in is not
      * modified. In that copy the score marker attribute ({@code KeysEnum.attr_scroce}) is removed,
      * every other attribute keeps its name but gets an empty value, and comments, whitespace and
      * text between tags are stripped from the serialized markup.
      *
      * @param node element whose structure should be extracted
      * @return the serialized skeleton: tags and attribute names only, with no whitespace at all
      */
     public String filterHtml(Element node) {
         Document copy = Jsoup.parse(node.outerHtml());
         for (Element item : copy.getAllElements()) {
             // Remove the score marker before iterating: removing an attribute from inside the
             // attribute loop can make the iteration skip the attribute that follows it.
             item.removeAttr(KeysEnum.attr_scroce);
             for (Attribute attribute : item.attributes()) {
                 attribute.setValue(StringUtils.EMPTY);
             }
         }
         // (?s) lets '.' cross line breaks so that multi-line comments are removed as well.
         String skeleton = copy.outerHtml().replaceAll("(?s)<!--?(.*?)-->", "");
         skeleton = skeleton.replaceAll("\\s+", "");
         // No line breaks are left at this point, so a plain '.' is enough to drop text nodes.
         skeleton = skeleton.replaceAll(">(.*?)<", "><");
         return skeleton;
     }

2.采用动态规划处理两个字符串相似度

 /**
  * Edit-distance helper used to decide whether two strings are "similar enough".
  */
 public class SimilarDegree {

     /** Similarity ratio that callers compare against when grouping strings. */
     public static final double degree = 0.8;

     /**
      * Computes the Levenshtein distance (insert, delete and replace each cost 1) by dynamic
      * programming.
      *
      * <p>Only two rows of the table are kept, sized by the shorter input, so memory is
      * proportional to the shorter string instead of the product of both lengths.
      *
      * @param source first string, must not be null
      * @param target second string, must not be null
      * @return the minimum number of single-character edits turning one string into the other
      */
     public static int EditDistance(String source, String target) {
         // The distance is symmetric, so put the shorter string on the column axis.
         String rows = source;
         String cols = target;
         if (cols.length() > rows.length()) {
             rows = target;
             cols = source;
         }
         int width = cols.length();
         int[] previous = new int[width + 1];
         int[] current = new int[width + 1];
         for (int j = 0; j <= width; j++) {
             previous[j] = j;
         }
         for (int i = 1; i <= rows.length(); i++) {
             current[0] = i;
             char rowChar = rows.charAt(i - 1);
             for (int j = 1; j <= width; j++) {
                 if (rowChar == cols.charAt(j - 1)) {
                     current[j] = previous[j - 1];
                 } else {
                     int insert = current[j - 1];
                     int delete = previous[j];
                     int replace = previous[j - 1];
                     current[j] = 1 + Math.min(replace, Math.min(insert, delete));
                 }
             }
             int[] swap = previous;
             previous = current;
             current = swap;
         }
         return previous[width];
     }

     public static void main(String[] args) {
         System.out.println(EditDistance("html > body > ul > li.proiect_item:nth-child(1) > div.item_row.item_row_title > div:nth-child(1) > a",
                 "html > body > ul > li.proiect_item:nth-child(2) > div.item_row.item_row_title > div:nth-child(1) > a"));
     }
 }

3.对网页中每个节点的一级孩子节点分类

 /**
  * Groups the first-level children of a node by DOM skeleton and counts each group.
  *
  * <p>A child joins an existing group when its skeleton (see {@code filterHtml}) is identical to
  * the group key, or when its similarity to the key exceeds {@code SimilarDegree.degree}.
  * Groups are kept in first-seen order, so when several groups are similar enough the earliest
  * one in the document wins, instead of whichever one hash order happens to visit first.
  *
  * @param node parent element whose direct children are classified
  * @return skeleton of each group's first member mapped to the number of children in the group
  */
 private Map<String, Integer> getGroupNode(Element node) {
     // Fully qualified so that this method does not depend on an extra import.
     Map<String, Integer> groups = new java.util.LinkedHashMap<String, Integer>();
     for (Element item : node.children()) {
         if (isSkippedTag(item.tagName())) {
             continue;
         }
         String key = filterHtml(item);
         Integer exact = groups.get(key);
         if (exact != null) {
             groups.put(key, exact + 1);
             continue;
         }
         String similarKey = null;
         for (String existing : groups.keySet()) {
             int dis = SimilarDegree.EditDistance(key, existing);
             // Fraction of the new key left untouched by the edit script.
             float v = (float) (key.length() - dis) / key.length();
             if (v > SimilarDegree.degree) {
                 similarKey = existing;
                 break;
             }
         }
         if (similarKey != null) {
             groups.put(similarKey, groups.get(similarKey) + 1);
         } else {
             groups.put(key, 1);
         }
     }
     return groups;
 }

 /** Returns true for child tags that are left out of the grouping (form controls, media, metadata). */
 private boolean isSkippedTag(String tagName) {
     return KeysEnum.input.equalsIgnoreCase(tagName) || KeysEnum.br.equalsIgnoreCase(tagName)
             || KeysEnum.script.equalsIgnoreCase(tagName) || KeysEnum.link.equalsIgnoreCase(tagName)
             || KeysEnum.style.equalsIgnoreCase(tagName) || KeysEnum.meta.equalsIgnoreCase(tagName)
             || KeysEnum.select.equalsIgnoreCase(tagName) || KeysEnum.option.equalsIgnoreCase(tagName)
             || KeysEnum.video.equalsIgnoreCase(tagName) || KeysEnum.audio.equalsIgnoreCase(tagName)
             || KeysEnum.textarea.equalsIgnoreCase(tagName);
 }

4.处理网页中每个元素的叶子节点

 /**
  * Collects the CSS selector of every leaf element at or below the given node.
  *
  * <p>A leaf is an element without child elements; the node itself is included when it has none.
  *
  * @param node root of the subtree to scan
  * @return selectors of all leaf elements, in the order {@code getAllElements()} yields them
  */
 public static List<String> getYeziNodeSel(Element node) {
     List<String> leafSelectors = new ArrayList<String>();
     for (Element candidate : node.getAllElements()) {
         if (!candidate.children().isEmpty()) {
             continue;
         }
         leafSelectors.add(candidate.cssSelector());
     }
     return leafSelectors;
 }

5.时间提取工具类

 /**
  * General-purpose date/time extractor (original author: jstarseven).
  *
  * <p>{@link #parse(Object)} accepts a {@link Date}, an epoch-millisecond {@link Number}, or any
  * object whose text contains a date, and returns a {@link Date} or {@code null} when nothing
  * usable is recognised. Text is matched against the expressions in {@code DPTN} in a fixed
  * order; absolute dates are limited to the years 2000-2099 and are built with a default
  * {@link Calendar}, i.e. in the JVM default time zone.
  */
 public class DateParser {
     /** Despite its name: the hour of day given to date-only results, applied only when positive. */
     private static int timezone = 0;

     /** Chinese digits, positioned so that the index of a character is its numeric value. */
     private static final String ZH_DIGITS = "〇一二三四五六七八九";

     private static final Pattern[] DPTN = {
         // [0] month-day-year or day-month-year with optional clock: "4/29/2010 1:31:00"
         Pattern.compile(
             "(\\d{1,2})[\\s\\-\\/](\\d{1,2})[\\s\\-\\/](20\\d{2})\\s{0,2}((\\d{1,2})[:\\s](\\d{1,2})[:\\s]?(\\d{1,2})?)?"),
         // [1] year-month-day with optional clock: "2010-12-17 00:23", "2017年10月12日 14时30分".
         // The look-behind stops the year from matching the tail of a longer number, which used
         // to turn "1982-01-01" into the year 2082.
         Pattern.compile(
             "(?<!\\d)((20)?\\d{2}) {0,2}[\\.\\-/年] {0,2}(\\d{1,2}) {0,2}[\\.\\-/月] {0,2}(\\d{1,2}) {0,2}[日 \\s]{0,2}((上午)|(下午))?\\s{0,2}((\\d{1,2})[:\\s时](\\d{1,2})[:\\s分]?(\\d{1,2})?)?"),
         // [2] year/MMdd: "2011/0815"
         Pattern.compile("((20)?\\d{2})/(\\d{2})(\\d{2})"),
         // [3] month-day of the current (or previous) year with optional clock: "8-23 15:24"
         Pattern.compile(
             "(\\d{1,2})[\\.\\-\\s/月](\\d{1,2})[日\\s]{0,2}((上午)|(下午))?\\s{0,2}((\\d{1,2})[:\\s](\\d{1,2})[:\\s]?(\\d{1,2})?)?"),
         // [4] clock time with optional today/yesterday/day-before prefix: "昨天 09:21"
         Pattern.compile("([今前昨]天)?\\s{0,4}(\\d{1,2})[:\\s]{1,3}(\\d{1,2})[:\\s]?(\\d{1,2})?"),
         // [5] today/yesterday/day-before without a clock: "昨天发表"
         Pattern.compile("[今前昨]天"),
         // [6] "<n or half> <unit> ago": "3天前", "半小时前"
         Pattern.compile("((\\d{1,2})|(半))\\s*个?([天秒小时分钟周月年]{1,2})前"),
         // [7] "<h> hours <m> minutes ago": "3小时26分钟前"
         Pattern.compile("(\\d{1,2})小?时(\\d{1,2})分钟?前"),
         // [8] compact yyyyMMdd: "20110815"
         Pattern.compile("(20\\d{2})[01]?(\\d{2})[012]?(\\d{2})")
     };

     /**
      * Extracts a point in time from the given value.
      *
      * @param obj a {@link Date} (returned as is), a {@link Number} of epoch milliseconds, or an
      *            object whose {@code toString()} contains a date, a time or a relative expression
      * @return the recognised instant, or {@code null} when the value is null, blank, the literal
      *         "null", matches no pattern, or matches a pattern with out-of-range fields
      */
     public static Date parse(Object obj) {
         if (obj == null) {
             return null;
         }
         if (obj instanceof Date) {
             return (Date) obj;
         }
         if (obj instanceof Number) {
             return new Date(((Number) obj).longValue());
         }
         String raw = obj.toString().trim();
         if (raw.isEmpty() || "null".equalsIgnoreCase(raw)) {
             return null;
         }
         // transZH() removes the 上午/下午 markers, so the afternoon flag must be taken first.
         boolean afternoon = raw.contains("下午");
         String str = transZH(raw);
         Calendar c = Calendar.getInstance();
         c.setTimeInMillis(System.currentTimeMillis());

         Matcher mt = DPTN[0].matcher(str);
         if (mt.find()) {
             return fromMonthDayYear(mt, c, afternoon);
         }
         mt = DPTN[1].matcher(str);
         if (mt.find()) {
             return fromYearMonthDay(mt, c, afternoon);
         }
         mt = DPTN[2].matcher(str);
         if (mt.find()) {
             return fromYearSlashMonthDay(mt, c);
         }
         mt = DPTN[3].matcher(str);
         if (mt.find()) {
             return fromMonthDay(mt, c, afternoon);
         }
         mt = DPTN[4].matcher(str);
         if (mt.find()) {
             return fromClock(mt, c, afternoon);
         }
         mt = DPTN[5].matcher(str);
         if (mt.find()) {
             shiftDay(c, mt.group(0));
             return c.getTime();
         }
         // "hours + minutes ago" has to be tried before the single-unit pattern; otherwise
         // "3小时26分钟前" is consumed by [6] as plain "26 minutes ago".
         mt = DPTN[7].matcher(str);
         if (mt.find()) {
             long elapsed = 3600000L * Integer.parseInt(mt.group(1)) + 60000L * Integer.parseInt(mt.group(2));
             return new Date(System.currentTimeMillis() - elapsed);
         }
         mt = DPTN[6].matcher(str);
         if (mt.find()) {
             return fromUnitsAgo(mt);
         }
         mt = DPTN[8].matcher(str);
         if (mt.find()) {
             int year = Integer.parseInt(mt.group(1));
             int month = Integer.parseInt(mt.group(2)) - 1;
             int date = Integer.parseInt(mt.group(3));
             if (!isSupportedYear(year) || !isValidMonthDay(month, date)) {
                 return null;
             }
             return build(c, year, month, date, null, null, null, null, afternoon);
         }
         return null;
     }

     /** Handles DPTN[0]; swaps the first two numbers when the first one cannot be a month. */
     private static Date fromMonthDayYear(Matcher mt, Calendar c, boolean afternoon) {
         int date = Integer.parseInt(mt.group(2));
         if (date == 0 || date > 31) {
             return null;
         }
         int month = Integer.parseInt(mt.group(1));
         if (month <= 0) {
             return null;
         }
         if (month > 12) {
             if (date > 12 || month >= 32) {
                 return null;
             }
             int tmp = month;
             month = date;
             date = tmp;
         }
         int year = Integer.parseInt(mt.group(3));
         if (!isSupportedYear(year)) {
             return null;
         }
         return build(c, year, month - 1, date, mt.group(4), mt.group(5), mt.group(6), mt.group(7), afternoon);
     }

     /** Handles DPTN[1]; a two-digit year is read as 20xx. */
     private static Date fromYearMonthDay(Matcher mt, Calendar c, boolean afternoon) {
         int year = expandYear(mt.group(1));
         int month = Integer.parseInt(mt.group(3)) - 1;
         int date = Integer.parseInt(mt.group(4));
         if (!isSupportedYear(year) || !isValidMonthDay(month, date)) {
             return null;
         }
         return build(c, year, month, date, mt.group(8), mt.group(9), mt.group(10), mt.group(11), afternoon);
     }

     /** Handles DPTN[2]; the result is always midnight. */
     private static Date fromYearSlashMonthDay(Matcher mt, Calendar c) {
         // Expanding by length (not by prefix) keeps the two-digit year "20" from becoming year 20.
         int year = expandYear(mt.group(1));
         int month = Integer.parseInt(mt.group(3)) - 1;
         int day = Integer.parseInt(mt.group(4));
         if (!isValidMonthDay(month, day)) {
             return null;
         }
         c.set(year, month, day, 0, 0, 0);
         return c.getTime();
     }

     /** Handles DPTN[3]; a month later than the current one is taken to belong to last year. */
     private static Date fromMonthDay(Matcher mt, Calendar c, boolean afternoon) {
         int year = c.get(Calendar.YEAR);
         int month = Integer.parseInt(mt.group(1)) - 1;
         int date = Integer.parseInt(mt.group(2));
         if (!isValidMonthDay(month, date)) {
             return null;
         }
         if (month > c.get(Calendar.MONTH)) {
             year--;
         }
         return build(c, year, month, date, mt.group(6), mt.group(7), mt.group(8), mt.group(9), afternoon);
     }

     /** Handles DPTN[4]: a clock time on today, yesterday or the day before yesterday. */
     private static Date fromClock(Matcher mt, Calendar c, boolean afternoon) {
         int hour = Integer.parseInt(mt.group(2));
         if (hour >= 24) {
             return null;
         }
         int min = Integer.parseInt(mt.group(3));
         if (min >= 60) {
             return null;
         }
         String ssec = mt.group(4);
         int sec = (ssec == null || ssec.isEmpty()) ? 0 : Integer.parseInt(ssec);
         if (afternoon && hour < 12) {
             hour += 12;
         }
         shiftDay(c, mt.group(1));
         c.set(Calendar.HOUR_OF_DAY, hour);
         c.set(Calendar.MINUTE, min);
         // Use the captured seconds (or zero) instead of keeping the current wall-clock seconds.
         c.set(Calendar.SECOND, sec);
         return c.getTime();
     }

     /** Handles DPTN[6]: a count (or "half") of one time unit before now. */
     private static Date fromUnitsAgo(Matcher mt) {
         long unit = unitMillis(mt.group(4));
         if (unit < 0) {
             return null;
         }
         String amount = mt.group(1);
         long elapsed = "半".equals(amount) ? unit / 2L : Integer.parseInt(amount) * unit;
         return new Date(System.currentTimeMillis() - elapsed);
     }

     /** Length of a unit in milliseconds (year = 365 days, month = 30 days), or -1 if unknown. */
     private static long unitMillis(String unit) {
         switch (unit) {
             case "年":
                 return 31536000000L;
             case "月":
                 return 2592000000L;
             case "周":
                 return 604800000L;
             case "天":
                 return 86400000L;
             case "小时":
             case "时":
                 return 3600000L;
             case "分钟":
             case "分":
                 return 60000L;
             case "秒":
                 return 1000L;
             default:
                 return -1L;
         }
     }

     /** Moves the calendar back one day for 昨天 and two days for 前天; anything else is a no-op. */
     private static void shiftDay(Calendar c, String day) {
         if ("昨天".equals(day)) {
             c.add(Calendar.DAY_OF_MONTH, -1);
         } else if ("前天".equals(day)) {
             c.add(Calendar.DAY_OF_MONTH, -2);
         }
     }

     /** Reads a year group, expanding a two-digit year to 20xx. */
     private static int expandYear(String year) {
         return Integer.parseInt(year.length() == 2 ? "20" + year : year);
     }

     private static boolean isSupportedYear(int year) {
         return year >= 2000 && year <= 2099;
     }

     /** Checks a zero-based month and a one-based day of month. */
     private static boolean isValidMonthDay(int month, int date) {
         return month >= 0 && month <= 11 && date >= 1 && date <= 31;
     }

     /**
      * Sets the calendar to the given date and, when a clock group was captured, to that time.
      * Without a clock the hour defaults to {@code timezone} (if positive) or midnight.
      *
      * @return the resulting date, or {@code null} when hour or minute is out of range
      */
     private static Date build(Calendar c, int year, int month, int date,
                               String clock, String hh, String mm, String ss, boolean afternoon) {
         if (clock == null || clock.isEmpty()) {
             c.set(year, month, date, timezone > 0 ? timezone : 0, 0, 0);
             return c.getTime();
         }
         int hour = Integer.parseInt(hh);
         if (hour >= 24) {
             return null;
         }
         int min = Integer.parseInt(mm);
         if (min >= 60) {
             return null;
         }
         int sec = (ss == null || ss.isEmpty()) ? 0 : Integer.parseInt(ss);
         if (afternoon && hour < 12) {
             hour += 12;
         }
         c.set(year, month, date, hour, min, sec);
         return c.getTime();
     }

     /**
      * Normalises Chinese date text: "整" becomes "0分", the 上午/下午 markers are removed, and
      * Chinese numerals are rewritten as ASCII digits.
      *
      * <p>Each "十" is converted on its own according to its neighbours (十 -> 10, 二十 -> 20,
      * 十九 -> 19, 二十三 -> 23), so it may appear several times and at either end of the text.
      */
     private static String transZH(String string) {
         String cleaned = string.replace("整", "0分").replaceAll("[上下]午", "");
         StringBuilder digits = new StringBuilder(cleaned.length());
         for (char ch : cleaned.toCharArray()) {
             int value = ZH_DIGITS.indexOf(ch);
             if (value >= 0) {
                 digits.append(value);
             } else {
                 digits.append(ch);
             }
         }
         String str = digits.toString();
         if (str.indexOf('十') < 0) {
             return str;
         }
         StringBuilder out = new StringBuilder(str.length() + 4);
         for (int i = 0; i < str.length(); i++) {
             char ch = str.charAt(i);
             if (ch != '十') {
                 out.append(ch);
                 continue;
             }
             boolean digitBefore = i > 0 && Character.isDigit(str.charAt(i - 1));
             boolean digitAfter = i + 1 < str.length() && Character.isDigit(str.charAt(i + 1));
             if (digitBefore && digitAfter) {
                 continue; // 二十三 -> 23: the neighbouring digits already carry the value
             }
             if (digitBefore) {
                 out.append('0'); // 二十 -> 20
             } else if (digitAfter) {
                 out.append('1'); // 十九 -> 19
             } else {
                 out.append("10"); // 十 -> 10
             }
         }
         return out.toString();
     }

     /** Prints the parse result for a set of sample inputs. */
     public static void main(String[] args) {
         System.out.println(parse("1982-01-01 00:00:00"));
         System.out.println(transZH("二〇一七年九月十日 上午十时整"));
         System.out.println(transZH("二〇一七年九月二十日 上午九时整"));
         System.out.println(transZH("二〇一七年九月十九日 上午九时整"));
         System.out.println(transZH("二〇一七年九月二十三日 上午九时整"));
         System.out.println("timezone=" + timezone);
         String[] testdata = { "1982-01-01 00:00:00","11-13 15:24", "2009-8-30 16:42:10", "8-23 15:24", "2周前", "3 天前", "12 分钟前", "3天前",
                 "前天 09:36", "昨天 09:21 ", "2010-12-17 00:23 ", "2010-12-17 ", "昨天 12:37 ", "2011-8-15 08:42",
                 "25-7-2011 11:43:57", "1-9-2011", "06-03", "半小时前", "今天发表", "昨天发表", "前天发表", "06-03-2010",
                 "02-01-2010 00:39", "3小时26分钟前", "2010-8-24 上午 01:17:32", "2010-8-24 下午 01:17:32", "7小时前 »",
                 "4/29/2010 1:31:00", "2012 年 1 月 31 日", "17时20分前", "2017年10月12日 14时30分", "二〇一七年九月十九日 上午九时整" };
         DateFormat df = DateFormat.getDateTimeInstance(DateFormat.MEDIUM, DateFormat.MEDIUM);
         for (String s : testdata) {
             Date d = parse(s);
             System.out.println(s + "\t\t" + (d == null ? "null" : df.format(d)));
         }
     }
 }

6.自定义比较器对网页所有元素排序,发现结果靠前的基本都是列表元素

比较器:按照疑似列表的可能性

 /**
  * Sorts candidate nodes in place so that the most list-like elements come first.
  *
  * <p>The score rewards (1) the length of the most frequent child skeleton and (2) how many
  * children share it, plus bonuses for typical list tags, children whose text parses as a date,
  * and paging markers. Every node is scored exactly once before sorting: scoring writes
  * attributes onto the elements, so re-scoring inside the comparator is expensive and could
  * return a different value for the same element in the middle of a sort.
  *
  * @param nodes    candidates to order; the same instance is sorted and returned
  * @param base_url page URL; an {@code a} element linking to it is tagged with its own text
  * @return {@code nodes}, ordered by descending score
  */
 private Elements sortBy(Elements nodes, String base_url) {
     // Identity-keyed and fully qualified so that no extra import is needed.
     final Map<Element, Double> scores = new java.util.IdentityHashMap<Element, Double>();
     for (Element node : nodes) {
         scores.put(node, reckonRate(node, base_url));
     }
     nodes.sort(new Comparator<Element>() {
         @Override
         public int compare(Element o1, Element o2) {
             return Double.compare(scores.get(o2), scores.get(o1));
         }
     });
     return nodes;
 }

 /**
  * Scores how likely an element is to be a list container and stores the score on the element
  * under {@code KeysEnum.attr_scroce}. Returns 0 for null, for elements with fewer than two
  * children, for document-level and metadata tags, for elements styled as hidden, and when
  * fewer than two children share a skeleton.
  */
 private double reckonRate(Element o, String base_url) {
     if (null == o) {
         return 0;
     }
     String tag = o.tagName();
     if (StringUtils.isNotBlank(base_url) && KeysEnum.a.equalsIgnoreCase(tag)
             && base_url.equalsIgnoreCase(o.attr(KeysEnum.attr_href))) {
         o.attr(KeysEnum.attr_list_tag_name, o.text());
     }
     if (o.children().size() < 2 || isNeverListTag(tag)) {
         o.attr(KeysEnum.attr_scroce, "0");
         return 0;
     }
     String style = o.attr(KeysEnum.style);
     if (StringUtils.isNotBlank(style) && style.contains(KeysEnum.display_none)) {
         o.attr(KeysEnum.attr_scroce, "0");
         return 0;
     }
     Map<String, Object> maxKeyDom = getMaxKeyDom(o);
     String key = (String) maxKeyDom.get(KeysEnum.max_key);
     int num = (int) maxKeyDom.get(KeysEnum.max_num);
     if (num < 2) {
         o.attr(KeysEnum.attr_scroce, "0");
         return 0;
     }
     int scroce = num * key.length();
     for (Element child : o.children()) {
         scroce += tagBonus(child.tagName());
         try {
             if (null != DateParser.parse(child.text())) {
                 scroce += 20;
             }
         } catch (Exception ignored) {
             // Best effort: text that breaks the date parser simply earns no date bonus.
         }
     }
     String text = o.text();
     if (text.contains(KeysEnum.next_page)) {
         scroce += 100;
     }
     if (text.contains(KeysEnum.start_page) || text.contains(KeysEnum.fisrt_page)) {
         scroce += 100;
     }
     if (text.contains(KeysEnum.end_page) || text.contains(KeysEnum.last_page) || text.contains(KeysEnum.final_page)) {
         scroce += 100;
     }
     o.attr(KeysEnum.attr_scroce, String.valueOf(scroce));
     return scroce;
 }

 /** Returns true for document-level and metadata tags that are never scored as lists. */
 private boolean isNeverListTag(String tag) {
     return KeysEnum.html.equalsIgnoreCase(tag) || KeysEnum.body.equalsIgnoreCase(tag)
             || KeysEnum.link.equalsIgnoreCase(tag) || KeysEnum.head.equalsIgnoreCase(tag)
             || KeysEnum.title.equalsIgnoreCase(tag) || KeysEnum.meta.equalsIgnoreCase(tag)
             || KeysEnum.script.equalsIgnoreCase(tag) || KeysEnum.style.equalsIgnoreCase(tag);
 }

 /** Bonus contributed by one direct child, depending on its tag. */
 private int tagBonus(String tag) {
     if (KeysEnum.ul.equalsIgnoreCase(tag) || KeysEnum.li.equalsIgnoreCase(tag) || KeysEnum.tr.equalsIgnoreCase(tag)) {
         return 10;
     }
     if (KeysEnum.div.equalsIgnoreCase(tag) || KeysEnum.tbody.equalsIgnoreCase(tag) || KeysEnum.table.equalsIgnoreCase(tag)) {
         return 5;
     }
     if (KeysEnum.td.equalsIgnoreCase(tag) || KeysEnum.a.equalsIgnoreCase(tag) || KeysEnum.p.equalsIgnoreCase(tag)) {
         return 1;
     }
     return 0;
 }

7.处理页面html,调用列表分析返回json结果

  /**
   * Extracts the selectors of the list-like elements of a page, plus the page's category label.
   *
   * @param document   parsed page to analyse
   * @param is_subitem when true, each list entry also carries the selectors of the list's direct
   *                   children and of the leaf elements below each child
   * @return a map holding the candidate lists under {@code KeysEnum.list} and, when one was
   *         found, the label under {@code KeysEnum.tag_name}
   * @throws Exception wrapping any failure raised during the analysis (kept as the cause)
   */
  public static Map<String, Object> dealListNode(Document document, boolean is_subitem) throws Exception {
      Map<String, Object> result = new HashMap<String, Object>();
      try {
          ListAutoFire listAutoFire = new ListAutoFire();
          Elements list_node = listAutoFire.autoFireListNodes(document);
          List<Map<String, Object>> lists = new ArrayList<Map<String, Object>>();
          if (null != list_node) {
              for (Element list_sel_item : list_node) {
                  // An element carrying the label attribute only contributes the page label.
                  String label = list_sel_item.attr(KeysEnum.attr_list_tag_name);
                  if (StringUtils.isNotBlank(label)) {
                      result.put(KeysEnum.tag_name, label);
                      continue;
                  }
                  Map<String, Object> list_dom_frame = new HashMap<String, Object>();
                  list_dom_frame.put(KeysEnum.list_sel, list_sel_item.cssSelector());
                  if (is_subitem) {
                      Map<String, List<String>> listItem = new HashMap<String, List<String>>();
                      for (Element item : list_sel_item.children()) {
                          listItem.put(item.cssSelector(), getYeziNodeSel(item));
                      }
                      list_dom_frame.put(KeysEnum.list_dom, listItem);
                  }
                  list_dom_frame.put(KeysEnum.attr_scroce, list_sel_item.attr(KeysEnum.attr_scroce));
                  lists.add(list_dom_frame);
              }
          }
          result.put(KeysEnum.list, lists);
      } catch (Exception e) {
          // Keep the original exception as the cause; e.getCause() is usually null and hid it.
          throw new Exception(KeysEnum.error_info, e);
      }
      return result;
  }

  /**
   * Analyses the structure of a web page.
   *
   * @param home_url   entry URL
   * @param list_index maximum number of list candidates to return (values below 0 count as 0)
   * @param is_subitem whether to extract the sub-items of each list element
   * @param is_ifr     whether to fetch and analyse iframes as well
   * @return the analysis result; an empty map when {@code home_url} is blank, or a map with
   *         {@code KeysEnum.error} and {@code KeysEnum.message} when the page cannot be processed
   */
  public static Map<String, Object> getWebSiteFrame(String home_url, int list_index, boolean is_subitem, boolean is_ifr) {
      Map<String, Object> result = new HashMap<String, Object>();
      if (StringUtils.isBlank(home_url)) {
          return result;
      }
      try {
          // NOTE(review): TLS certificate validation is switched off for every fetch below.
          Document html = Jsoup.connect(home_url).ignoreContentType(true).validateTLSCertificates(false).timeout(5000).get();
          if (null == html) {
              throw new Exception(KeysEnum.open_fail);
          }
          result.putAll(summarisePage(home_url, html, list_index, is_subitem));
          List<Map<String, Object>> ifrs = new ArrayList<Map<String, Object>>();
          result.put(KeysEnum.ifrs, ifrs);
          if (is_ifr) {
              Elements iframe_nodes = html.getElementsByTag(KeysEnum.iframe);
              if (null != iframe_nodes) {
                  for (Element iframe : iframe_nodes) {
                      // Resolve relative sources against the page URL; fall back to the raw value.
                      String iframe_url = iframe.absUrl(KeysEnum.attr_src);
                      if (StringUtils.isBlank(iframe_url)) {
                          iframe_url = iframe.attr(KeysEnum.attr_src);
                      }
                      if (StringUtils.isBlank(iframe_url)) {
                          continue;
                      }
                      try {
                          Document iframe_html = Jsoup.connect(iframe_url).ignoreContentType(true).validateTLSCertificates(false).timeout(5000).get();
                          if (null == iframe_html) {
                              continue;
                          }
                          ifrs.add(summarisePage(iframe_url, iframe_html, list_index, is_subitem));
                      } catch (Exception e) {
                          // A broken iframe must not fail the whole page; report it and go on.
                          e.printStackTrace();
                      }
                  }
              }
          }
      } catch (Exception e) {
          e.printStackTrace();
          result.clear();
          result.put(KeysEnum.home_url, home_url);
          result.put(KeysEnum.error, KeysEnum.error_info);
          result.put(KeysEnum.message, e.toString());
      }
      return result;
  }

  /** Runs the list analysis on one document and keeps at most {@code list_index} candidates. */
  private static Map<String, Object> summarisePage(String url, Document page, int list_index, boolean is_subitem) throws Exception {
      Map<String, Object> analysed = dealListNode(page, is_subitem);
      List<?> candidates = (List<?>) analysed.get(KeysEnum.list);
      // Clamp the limit so that a negative list_index yields an empty list, not an exception.
      int limit = Math.max(0, Math.min(list_index, candidates.size()));
      Map<String, Object> summary = new HashMap<String, Object>();
      summary.put(KeysEnum.home_url, url);
      summary.put(KeysEnum.tag_name, analysed.get(KeysEnum.tag_name));
      summary.put(KeysEnum.list, candidates.subList(0, limit));
      return summary;
  }

  /**
   * Analyses the structure of a web page without following iframes.
   *
   * @param home_url   entry URL
   * @param list_index maximum number of list candidates to return
   * @param is_subitem whether to extract the sub-items of each list element
   * @return the analysis result
   */
  public static Map<String, Object> getWebSiteFrame(String home_url, int list_index, boolean is_subitem) {
      return getWebSiteFrame(home_url, list_index, is_subitem, false);
  }

  /**
   * Analyses the structure of a web page without sub-item extraction and without iframes.
   *
   * @param home_url   entry URL
   * @param list_index maximum number of list candidates to return
   * @return the analysis result
   */
  public static Map<String, Object> getWebSiteFrame(String home_url, int list_index) {
      return getWebSiteFrame(home_url, list_index, false);
  }

  /**
   * Analyses the structure of a web page, returning at most 10 list candidates.
   *
   * @param home_url entry URL
   * @return the analysis result
   */
  public static Map<String, Object> getWebSiteFrame(String home_url) {
      return getWebSiteFrame(home_url, 10);
  }

8.生成页面分析结果标记文件

 /**
  * Fetches the page again and writes a copy in which every detected list element is highlighted.
  *
  * <p>The highlight style is always added to the page head. A {@code <base>} element pointing at
  * {@code home_url} is added only when the page has none, so relative resources keep loading
  * when the file is opened locally. Both are inserted through the DOM, so the URL is escaped as
  * an attribute value and is never interpreted as a regex replacement.
  *
  * @param siteFrame analysis result; its "list" entry is expected to hold maps with a "list_sel"
  *                  selector. A missing or non-list entry marks nothing.
  * @param home_url  URL of the analysed page
  * @param path      file the marked HTML is written to (UTF-8, overwritten)
  */
 public static void createMarkFile(Map siteFrame, String home_url, String path) {
     try {
         Document doc = Jsoup.connect(home_url).ignoreContentType(true).validateTLSCertificates(false).timeout(5000).get();
         if (null == doc) {
             return;
         }
         String style = ".mark_color {" +
                 "position:relative;" +
                 "pointer-events:none;" +
                 "left:0px;top:0px;" +
                 "display:inline-block;" +
                 "margin:-2px;width:100%;" +
                 "height:100%;" +
                 "border:dashed 2px #FF69B4;" +
                 "background-color: #43CD80;" +
                 "opacity:0.75;" +
                 "} ";
         // Error results carry no "list" entry; treat that as "nothing to mark".
         Object candidates = siteFrame.get("list");
         if (candidates instanceof List) {
             for (Object item : (List<?>) candidates) {
                 if (!(item instanceof Map)) {
                     continue;
                 }
                 Object sel = ((Map<?, ?>) item).get("list_sel");
                 if (sel instanceof String && !((String) sel).isEmpty()) {
                     doc.select((String) sel).addClass("mark_color");
                 }
             }
         }
         if (null != doc.head()) {
             if (doc.select("base").isEmpty()) {
                 doc.head().prependElement("base").attr("href", home_url);
             }
             doc.head().append("<style>" + style + "</style>");
         }
         FileUtils.writeStringToFile(new File(path), doc.html(), "UTF-8", false);
     } catch (IOException e) {
         e.printStackTrace();
     }
 }

9.上述第7步返回的结果实例:

拿cnblog首页做测试,返回结果:

字段解释:

home_url :分析的页面地址

tag_name :当前页面的类型,多数情况下不正确——我只是拿home_url和页面中各个a标签的href做比对,取匹配到的那个链接的text

list:页面中疑似列表元素

list_sel:页面中疑似列表元素的选择器

list_dom:疑似列表元素的每个一级孩子节点的选择器,以及该孩子节点下所有叶子元素的选择器(仅在is_subitem为true时返回)

ifrs:页面中包含iframe分析的结果,没有则为空

 {
"home_url": "https://www.cnblogs.com/",
"tag_name": "1",
"list": [
{
"list_sel": "#post_list",
"list_dom": {
"#post_list > div.post_item:nth-child(7)": [
"#digg_count_9500831",
"#post_list > div.post_item:nth-child(7) > div.digg > div.clear",
"#digg_tip_9500831",
"#post_list > div.post_item:nth-child(7) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(7) > div.post_item_body > p.post_item_summary",
"#post_list > div.post_item:nth-child(7) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(7) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(7) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(7) > div.clear"
],
"#post_list > div.post_item:nth-child(19)": [
"#digg_count_9499348",
"#post_list > div.post_item:nth-child(19) > div.digg > div.clear",
"#digg_tip_9499348",
"#post_list > div.post_item:nth-child(19) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(19) > div.post_item_body > p.post_item_summary > a > img.pfs",
"#post_list > div.post_item:nth-child(19) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(19) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(19) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(19) > div.clear"
],
"#post_list > div.post_item:nth-child(6)": [
"#digg_count_9500833",
"#post_list > div.post_item:nth-child(6) > div.digg > div.clear",
"#digg_tip_9500833",
"#post_list > div.post_item:nth-child(6) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(6) > div.post_item_body > p.post_item_summary",
"#post_list > div.post_item:nth-child(6) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(6) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(6) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(6) > div.clear"
],
"#post_list > div.post_item:nth-child(9)": [
"#digg_count_9500757",
"#post_list > div.post_item:nth-child(9) > div.digg > div.clear",
"#digg_tip_9500757",
"#post_list > div.post_item:nth-child(9) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(9) > div.post_item_body > p.post_item_summary > a > img.pfs",
"#post_list > div.post_item:nth-child(9) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(9) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(9) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(9) > div.clear"
],
"#post_list > div.post_item:nth-child(17)": [
"#digg_count_9495616",
"#post_list > div.post_item:nth-child(17) > div.digg > div.clear",
"#digg_tip_9495616",
"#post_list > div.post_item:nth-child(17) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(17) > div.post_item_body > p.post_item_summary > a > img.pfs",
"#post_list > div.post_item:nth-child(17) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(17) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(17) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(17) > div.clear"
],
"#post_list > div.post_item:nth-child(8)": [
"#digg_count_9500822",
"#post_list > div.post_item:nth-child(8) > div.digg > div.clear",
"#digg_tip_9500822",
"#post_list > div.post_item:nth-child(8) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(8) > div.post_item_body > p.post_item_summary > a > img.pfs",
"#post_list > div.post_item:nth-child(8) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(8) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(8) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(8) > div.clear"
],
"#post_list > div.post_item:nth-child(18)": [
"#digg_count_9499454",
"#post_list > div.post_item:nth-child(18) > div.digg > div.clear",
"#digg_tip_9499454",
"#post_list > div.post_item:nth-child(18) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(18) > div.post_item_body > p.post_item_summary > a > img.pfs",
"#post_list > div.post_item:nth-child(18) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(18) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(18) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(18) > div.clear"
],
"#post_list > div.post_item:nth-child(3)": [
"#digg_count_9500944",
"#post_list > div.post_item:nth-child(3) > div.digg > div.clear",
"#digg_tip_9500944",
"#post_list > div.post_item:nth-child(3) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(3) > div.post_item_body > p.post_item_summary > a > img.pfs",
"#post_list > div.post_item:nth-child(3) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(3) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(3) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(3) > div.clear"
],
"#post_list > div.post_item:nth-child(2)": [
"#digg_count_9500357",
"#post_list > div.post_item:nth-child(2) > div.digg > div.clear",
"#digg_tip_9500357",
"#post_list > div.post_item:nth-child(2) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(2) > div.post_item_body > p.post_item_summary",
"#post_list > div.post_item:nth-child(2) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(2) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(2) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(2) > div.clear"
],
"#post_list > div.post_item:nth-child(5)": [
"#digg_count_9500890",
"#post_list > div.post_item:nth-child(5) > div.digg > div.clear",
"#digg_tip_9500890",
"#post_list > div.post_item:nth-child(5) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(5) > div.post_item_body > p.post_item_summary > a > img.pfs",
"#post_list > div.post_item:nth-child(5) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(5) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(5) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(5) > div.clear"
],
"#post_list > div.post_item:nth-child(4)": [
"#digg_count_9500935",
"#post_list > div.post_item:nth-child(4) > div.digg > div.clear",
"#digg_tip_9500935",
"#post_list > div.post_item:nth-child(4) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(4) > div.post_item_body > p.post_item_summary > a > img.pfs",
"#post_list > div.post_item:nth-child(4) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(4) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(4) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(4) > div.clear"
],
"#post_list > div.post_item:nth-child(1)": [
"#digg_count_9501071",
"#post_list > div.post_item:nth-child(1) > div.digg > div.clear",
"#digg_tip_9501071",
"#post_list > div.post_item:nth-child(1) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(1) > div.post_item_body > p.post_item_summary > a > img.pfs",
"#post_list > div.post_item:nth-child(1) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(1) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(1) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(1) > div.clear"
],
"#post_list > div.post_item:nth-child(15)": [
"#digg_count_9403762",
"#post_list > div.post_item:nth-child(15) > div.digg > div.clear",
"#digg_tip_9403762",
"#post_list > div.post_item:nth-child(15) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(15) > div.post_item_body > p.post_item_summary > a > img.pfs",
"#post_list > div.post_item:nth-child(15) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(15) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(15) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(15) > div.clear"
],
"#post_list > div.post_item:nth-child(16)": [
"#digg_count_9499534",
"#post_list > div.post_item:nth-child(16) > div.digg > div.clear",
"#digg_tip_9499534",
"#post_list > div.post_item:nth-child(16) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(16) > div.post_item_body > p.post_item_summary > a > img.pfs",
"#post_list > div.post_item:nth-child(16) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(16) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(16) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(16) > div.clear"
],
"#post_list > div.post_item:nth-child(13)": [
"#digg_count_9465698",
"#post_list > div.post_item:nth-child(13) > div.digg > div.clear",
"#digg_tip_9465698",
"#post_list > div.post_item:nth-child(13) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(13) > div.post_item_body > p.post_item_summary > a > img.pfs",
"#post_list > div.post_item:nth-child(13) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(13) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(13) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(13) > div.clear"
],
"#post_list > div.post_item:nth-child(14)": [
"#digg_count_9498410",
"#post_list > div.post_item:nth-child(14) > div.digg > div.clear",
"#digg_tip_9498410",
"#post_list > div.post_item:nth-child(14) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(14) > div.post_item_body > p.post_item_summary > a > img.pfs",
"#post_list > div.post_item:nth-child(14) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(14) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(14) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(14) > div.clear"
],
"#post_list > div.post_item:nth-child(11)": [
"#digg_count_9500633",
"#post_list > div.post_item:nth-child(11) > div.digg > div.clear",
"#digg_tip_9500633",
"#post_list > div.post_item:nth-child(11) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(11) > div.post_item_body > p.post_item_summary",
"#post_list > div.post_item:nth-child(11) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(11) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(11) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(11) > div.clear"
],
"#post_list > div.post_item:nth-child(12)": [
"#digg_count_9500352",
"#post_list > div.post_item:nth-child(12) > div.digg > div.clear",
"#digg_tip_9500352",
"#post_list > div.post_item:nth-child(12) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(12) > div.post_item_body > p.post_item_summary > a > img.pfs",
"#post_list > div.post_item:nth-child(12) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(12) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(12) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(12) > div.clear"
],
"#post_list > div.post_item:nth-child(20)": [
"#digg_count_9499225",
"#post_list > div.post_item:nth-child(20) > div.digg > div.clear",
"#digg_tip_9499225",
"#post_list > div.post_item:nth-child(20) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(20) > div.post_item_body > p.post_item_summary > a > img.pfs",
"#post_list > div.post_item:nth-child(20) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(20) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(20) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(20) > div.clear"
],
"#post_list > div.post_item:nth-child(10)": [
"#digg_count_9500632",
"#post_list > div.post_item:nth-child(10) > div.digg > div.clear",
"#digg_tip_9500632",
"#post_list > div.post_item:nth-child(10) > div.post_item_body > h3 > a.titlelnk",
"#post_list > div.post_item:nth-child(10) > div.post_item_body > p.post_item_summary > a > img.pfs",
"#post_list > div.post_item:nth-child(10) > div.post_item_body > div.post_item_foot > a.lightblue",
"#post_list > div.post_item:nth-child(10) > div.post_item_body > div.post_item_foot > span.article_comment > a.gray",
"#post_list > div.post_item:nth-child(10) > div.post_item_body > div.post_item_foot > span.article_view > a.gray",
"#post_list > div.post_item:nth-child(10) > div.clear"
]
},
"scroce": "9860"
},
{
"list_sel": "#cate_item",
"list_dom": {
"#cate_item_108705": [
"#cate_item_108705 > a"
],
"#cate_item_108704": [
"#cate_item_108704 > a"
],
"#cate_item_108703": [
"#cate_item_108703 > a"
],
"#cate_item_4": [
"#cate_item_4 > a"
],
"#cate_item_2": [
"#cate_item_2 > a"
],
"#cate_item_108709": [
"#cate_item_108709 > a"
],
"#cate_item_0": [
"#cate_item_0 > a"
],
"#cate_item_108698": [
"#cate_item_108698 > a"
],
"#cate_item_108724": [
"#cate_item_108724 > a"
],
"#cate_item_108701": [
"#cate_item_108701 > a"
],
"#cate_item_108712": [
"#cate_item_108712 > a"
],
"#cate_item_-1": [
"#cate_item_-1 > a"
]
},
"scroce": "1248"
},
{
"list_sel": "#friend_link",
"list_dom": {
"#friend_link > a:nth-child(15)": [
"#friend_link > a:nth-child(15)"
],
"#friend_link > a:nth-child(16)": [
"#friend_link > a:nth-child(16)"
],
"#friend_link > a:nth-child(17)": [
"#friend_link > a:nth-child(17)"
],
"#friend_link > a:nth-child(18)": [
"#friend_link > a:nth-child(18)"
],
"#friend_link > a:nth-child(1)": [
"#friend_link > a:nth-child(1)"
],
"#friend_link > a:nth-child(11)": [
"#friend_link > a:nth-child(11)"
],
"#friend_link > a:nth-child(12)": [
"#friend_link > a:nth-child(12)"
],
"#friend_link > a:nth-child(3)": [
"#friend_link > a:nth-child(3)"
],
"#friend_link > a:nth-child(13)": [
"#friend_link > a:nth-child(13)"
],
"#friend_link > a:nth-child(2)": [
"#friend_link > a:nth-child(2)"
],
"#friend_link > a:nth-child(14)": [
"#friend_link > a:nth-child(14)"
],
"#friend_link > a:nth-child(19)": [
"#friend_link > a:nth-child(19)"
],
"#friend_link > a:nth-child(5)": [
"#friend_link > a:nth-child(5)"
],
"#friend_link > a:nth-child(4)": [
"#friend_link > a:nth-child(4)"
],
"#friend_link > a:nth-child(7)": [
"#friend_link > a:nth-child(7)"
],
"#friend_link > a:nth-child(6)": [
"#friend_link > a:nth-child(6)"
],
"#friend_link > a:nth-child(10)": [
"#friend_link > a:nth-child(10)"
],
"#friend_link > a:nth-child(9)": [
"#friend_link > a:nth-child(9)"
],
"#friend_link > a:nth-child(8)": [
"#friend_link > a:nth-child(8)"
]
},
"scroce": "1197"
},
{
"list_sel": "#side_nav",
"list_dom": {
"#side_nav > div.w_l:nth-child(16)": [
"#side_nav > div.w_l:nth-child(16) > h4",
"#site_stats"
],
"#side_nav > p.r_l_1:nth-child(7)": [
"#side_nav > p.r_l_1:nth-child(7)"
],
"#side_nav > p.r_l_2:nth-child(8)": [
"#side_nav > p.r_l_2:nth-child(8)"
],
"#side_nav > p.r_l_3:nth-child(9)": [
"#side_nav > p.r_l_3:nth-child(9)"
],
"#side_nav > p.r_l_1:nth-child(5)": [
"#side_nav > p.r_l_1:nth-child(5)"
],
"#side_nav > p.r_l_3:nth-child(13)": [
"#side_nav > p.r_l_3:nth-child(13)"
],
"#side_nav > p.r_l_2:nth-child(4)": [
"#side_nav > p.r_l_2:nth-child(4)"
],
"#side_nav > p.r_l_3:nth-child(19)": [
"#side_nav > p.r_l_3:nth-child(19)"
],
"#side_nav > p.r_l_3:nth-child(3)": [
"#side_nav > p.r_l_3:nth-child(3)"
],
"#side_nav > div.w_l:nth-child(6)": [
"#side_nav > div.w_l:nth-child(6) > h4",
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(1) > a",
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(2) > a",
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(3) > a",
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(4) > a",
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(5) > a",
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(6) > a"
],
"#side_nav > p.r_l_2:nth-child(18)": [
"#side_nav > p.r_l_2:nth-child(18)"
],
"#side_nav > div.l_s:nth-child(12)": [
"#side_nav > div.l_s:nth-child(12)"
],
"#cate_title_block": [
"#cate_title_title > div.cate_title",
"#cate_item_108698 > a",
"#cate_item_2 > a",
"#cate_item_108701 > a",
"#cate_item_108703 > a",
"#cate_item_108704 > a",
"#cate_item_108705 > a",
"#cate_item_108709 > a",
"#cate_item_108712 > a",
"#cate_item_108724 > a",
"#cate_item_4 > a",
"#cate_item_0 > a",
"#cate_item_-1 > a",
"#cate_title_block > div.cate_bottom",
"#cate_sub_block",
"#cate_title_block > script"
],
"#side_nav > div.l_s:nth-child(2)": [
"#side_nav > div.l_s:nth-child(2)"
],
"#side_nav > p.r_l_1:nth-child(17)": [
"#side_nav > p.r_l_1:nth-child(17)"
],
"#side_nav > p.r_l_2:nth-child(14)": [
"#side_nav > p.r_l_2:nth-child(14)"
],
"#side_nav > p.r_l_1:nth-child(15)": [
"#side_nav > p.r_l_1:nth-child(15)"
],
"#user_stats": [
"#user_stats"
],
"#side_nav > div.l_s:nth-child(10)": [
"#side_nav > div.l_s:nth-child(10)"
]
},
"scroce": "975"
},
{
"list_sel": "#paging_block > div.pager",
"list_dom": {
"#paging_block > div.pager > a.p_9.middle": [
"#paging_block > div.pager > a.p_9.middle"
],
"#paging_block > div.pager > a.p_7.middle": [
"#paging_block > div.pager > a.p_7.middle"
],
"#paging_block > div.pager > a.p_8.middle": [
"#paging_block > div.pager > a.p_8.middle"
],
"#paging_block > div.pager > a:nth-child(14)": [
"#paging_block > div.pager > a:nth-child(14)"
],
"#paging_block > div.pager > a.p_11.middle": [
"#paging_block > div.pager > a.p_11.middle"
],
"#paging_block > div.pager > a.p_3.middle": [
"#paging_block > div.pager > a.p_3.middle"
],
"#paging_block > div.pager > a.p_4.middle": [
"#paging_block > div.pager > a.p_4.middle"
],
"#paging_block > div.pager > a.p_10.middle": [
"#paging_block > div.pager > a.p_10.middle"
],
"#paging_block > div.pager > a.p_2.middle": [
"#paging_block > div.pager > a.p_2.middle"
],
"#paging_block > div.pager > a.p_5.middle": [
"#paging_block > div.pager > a.p_5.middle"
],
"#paging_block > div.pager > a.p_6.middle": [
"#paging_block > div.pager > a.p_6.middle"
],
"#paging_block > div.pager > a.p_1.current": [
"#paging_block > div.pager > a.p_1.current"
],
"#paging_block > div.pager > span.ellipsis": [
"#paging_block > div.pager > span.ellipsis"
],
"#paging_block > div.pager > a.p_200.last": [
"#paging_block > div.pager > a.p_200.last"
]
},
"scroce": "865"
},
{
"list_sel": "#main > div.post_nav_block_wrapper > ul.post_nav_block",
"list_dom": {
"#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(1)": [
"#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(1) > a.current_nav"
],
"#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(3)": [
"#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(3) > a"
],
"#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(2)": [
"#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(2) > a"
],
"#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(5)": [
"#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(5) > a"
],
"#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(4)": [
"#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(4) > a"
],
"#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(7)": [
"#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(7) > a"
],
"#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(6)": [
"#main > div.post_nav_block_wrapper > ul.post_nav_block > li:nth-child(6) > a"
]
},
"scroce": "590"
},
{
"list_sel": "#nav_menu",
"list_dom": {
"#nav_menu > a:nth-child(3)": [
"#nav_menu > a:nth-child(3)"
],
"#nav_menu > a:nth-child(2)": [
"#nav_menu > a:nth-child(2)"
],
"#nav_menu > a:nth-child(5)": [
"#nav_menu > a:nth-child(5)"
],
"#nav_menu > a:nth-child(4)": [
"#nav_menu > a:nth-child(4)"
],
"#nav_menu > a:nth-child(1)": [
"#nav_menu > a:nth-child(1)"
],
"#nav_menu > a:nth-child(7)": [
"#nav_menu > a:nth-child(7)"
],
"#nav_menu > a:nth-child(6)": [
"#nav_menu > a:nth-child(6)"
],
"#nav_menu > a:nth-child(9)": [
"#nav_menu > a:nth-child(9)"
],
"#nav_menu > a:nth-child(8)": [
"#nav_menu > a:nth-child(8)"
]
},
"scroce": "486"
},
{
"list_sel": "#side_nav > div.w_l:nth-child(6) > ul",
"list_dom": {
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(3)": [
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(3) > a"
],
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(2)": [
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(2) > a"
],
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(1)": [
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(1) > a"
],
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(6)": [
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(6) > a"
],
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(5)": [
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(5) > a"
],
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(4)": [
"#side_nav > div.w_l:nth-child(6) > ul > li:nth-child(4) > a"
]
},
"scroce": "486"
},
{
"list_sel": "#headline_block > ul",
"list_dom": {
"#headline_block > ul > li:nth-child(4)": [
"#headline_block > ul > li:nth-child(4) > a:nth-child(1)",
"#headline_block > ul > li:nth-child(4) > a.right_more"
],
"#headline_block > ul > li.editor_pick": [
"#editor_pick_count",
"#headline_block > ul > li.editor_pick > a.right_more"
],
"#headline_block > ul > li:nth-child(3)": [
"#headline_block > ul > li:nth-child(3) > a:nth-child(1)",
"#headline_block > ul > li:nth-child(3) > a.right_more"
],
"#headline_block > ul > li:nth-child(2)": [
"#headline_block > ul > li:nth-child(2) > a:nth-child(1)",
"#headline_block > ul > li:nth-child(2) > a.right_more"
]
},
"scroce": "407"
},
{
"list_sel": "#header",
"list_dom": {
"#header > p.h_r_3:nth-child(1)": [
"#header > p.h_r_3:nth-child(1)"
],
"#header > p.h_r_2:nth-child(6)": [
"#header > p.h_r_2:nth-child(6)"
],
"#header > p.h_r_1:nth-child(3)": [
"#header > p.h_r_1:nth-child(3)"
],
"#header > p.h_r_2:nth-child(2)": [
"#header > p.h_r_2:nth-child(2)"
],
"#header > p.h_r_1:nth-child(5)": [
"#header > p.h_r_1:nth-child(5)"
],
"#header > p.h_r_3:nth-child(7)": [
"#header > p.h_r_3:nth-child(7)"
],
"#header_block": [
"#logo > h1 > a > img",
"#header_block > div.clear"
]
},
"scroce": "335"
}
],
"ifrs": []
}

10.上述第8步标记文件效果:

红色虚线框起来的,是返回的json结果中list里各项的list_sel选择器所选中的元素。

分析结果统计:

处理了将近1万个网站后发现,大致的网页列表结构基本都可以识别出来,平均耗时大致在2-3s左右;因为用的是jsoup访问网页,这个时间包含了网页响应的时间,算法本身的时间复杂度还有待优化。

对于一些结构比较复杂、杂乱的网页,分析效果还有待加强;代码写得比较乱,有待优化,应该会有更好的处理方式,还请指教,相互学习交流。

转载请注明出处:https://www.cnblogs.com/jstarseven/p/9501210.html

源码地址:https://github.com/jstarseven/list-autofire


-END-

java实现网页结构分析列表发现的更多相关文章

  1. JAVA 异常类型结构分析

    JAVA 异常类型结构分析 Throwable 是所有异常类型的基类,Throwable 下一层分为两个分支,Error 和 Exception. Error 和 Exception Error Er ...

  2. java中把list列表转为arrayList以及arraylist数组截取的简单方法

    java中把list列表转为arrayList以及arraylist数组截取的简单方法 package xiaobai; import java.util.ArrayList; import java ...

  3. 用java编网页的学习流程,我的一些小心得(初学java到高深运用)

    (1)java基础:首先得会写int,String,for循环,数组,**等等(熟练各种基础的关键字,各种java自带的排序,随即等等算法)什么是封装,继承,多态,然后private,public,p ...

  4. 如何以Java实现网页截图技术

    转自   http://blog.csdn.net/cping1982/article/details/5353049 今天看到某网友关于“如何以Java实现网页截图技术”的咨询帖,由于出现该咨询的地 ...

  5. java读取网页图片路径并下载到本地

    java读取网页图片路径并下载到本地 最近公司需要爬取一些网页上的数据,自己就简单的写了一个demo,其中有一些数据是图片,需要下载下来到本地并且 将图片的路径保存到数据库,示例代码如下: packa ...

  6. Java获取系统安装软件列表

    /** * @author <a href="mailto:foohsinglong@gmail.com">kevin.long</a> * @descri ...

  7. [Python] - 使用chardet检查网页编码格式时发现的问题

    最近在使用chardet检查网页编码格式时发现如下问题: 用urllib打开网页再检查编码格式和用urllib2打开网页检查编码格式结果不一样,所以urllib2打开可能导致问题,需要关注. 查看了相 ...

  8. java中如何使用列表数组

    java中如何使用列表数组 觉得有用的话,欢迎一起讨论相互学习~Follow Me 转载链接 https://blog.csdn.net/hgtjcxy/article/details/8183519 ...

  9. JAVA字符串处理函数列表一览

    JAVA字符串处理函数列表一览   Java中的字符串也是一连串的字符.但是与许多其他的计算机语言将字符串作为字符数组处理不同,Java将字符串作为String类型对象来处理.将字符串作为内置的对象处 ...

随机推荐

  1. 网络基础-- 之 子网划分 and 一些基础解释

    子网划分的核心思想就是------   借主机为为网络位 最近几天--看了一下今天就分享一波. 首先我们先来理解一下 -----   one. 进制的转换   -----   two. IP地址 -- ...

  2. 426. Convert Binary Search Tree to Sorted Doubly Linked List把bst变成双向链表

    [抄题]: Convert a BST to a sorted circular doubly-linked list in-place. Think of the left and right po ...

  3. Python开发——函数【基础】

    函数的定义 以下规则 函数代码块以 def 关键词开头,后接函数标识符名称和圆括号(). 任何传入参数和自变量必须放在圆括号中间.圆括号之间可以用于定义参数. 函数的第一行语句可以选择性地使用文档字符 ...

  4. Python:每日一题004

    题目: 输入某年某月某日,判断这一天是这一年的第几天? 程序分析: 以3月5日为例,应该先把前两个月的加起来,然后再加上5天即本年的第几天,特殊情况,闰年且输入月份大于2时需考虑多加一天 个人的思路及 ...

  5. boost 编写finger服务

    本篇是模仿PYTHON TWISTED写一个FINGER示例. 从最简单的链接到通过接收字符串返回不同的内容 1 最简单的链接 #include <ctime> #include < ...

  6. Java-static关键字解析

    static关键字是很多朋友在编写代码和阅读代码时碰到的比较难以理解的一个关键字,也是各大公司的面试官喜欢在面试时问到的知识点之一.下面就先讲述一下static关键字的用法和平常容易误解的地方,最后列 ...

  7. 微信公众号自定义菜单中添加emoji表情

    做微信公众号开发,可能会遇到如何加入emoji表情的问题.今天在“海南旅游小管家”公众号的菜单中加入了emoji表情,特此记录备忘. 1.登录微信公众号,在左侧找到[开发者工具]菜单,点击进入,找到[ ...

  8. 团队-爬取豆瓣电影TOP250-代码设计规范

    队长博客:http://www.cnblogs.com/gengwenhao/

  9. Java流程控制语句

    流程控制语句 内容: if... if...else if...else if...else switch...case for while do...while 分支结构if 接下来要学习的if条件 ...

  10. webpack Cannot find module 'webpack/schemas/WebpackOptions.json'

    webpack-dev-server版本的问题 一直在解决这个问题,最后竟然发现...安装2.9.1版本就可以了 npm install webpack-dev-server@2.9.1