crawler_基于块儿统计正文抽取

在线查看效果：http://tool.haoshuju.cn/

import java.util.ArrayList;

import java.util.Arrays;

import java.util.List;

import java.util.regex.Pattern;

/**
 * Extracts the main body text of topic-style web pages (news, blogs, ...) in linear time.
 *
 * <p>
 * The approach is based on a <b>line-block distribution function</b>: after all markup is
 * stripped, every run of {@code BLOCKS_WIDTH} consecutive lines is measured by its number
 * of non-whitespace characters, and regions where that measure rises above a threshold are
 * taken as body text. No site-specific rules are used, to keep the extractor generic.
 * </p>
 *
 * @author Chen Xin(xchen@ir.hit.edu.cn) Created on 2009-1-11 Updated on 2010-08-09
 * @note cphmvp: pre-processing sped up roughly threefold; better handling of list pages
 *       whose links are mixed with other tags. 2014-6-5 11:53:33
 */
public class TextExtract {

    /** Number of consecutive lines whose lengths are summed into one block. */
    private static final int BLOCKS_WIDTH = 3;

    /** Lines shorter than this many characters are left out of the extracted text. */
    private static final int MIN_LINE_LENGTH = 5;

    /** Lower bound applied to the computed block-length threshold. */
    private static final int MIN_THRESHOLD = 50;

    /** Upper bound applied to the computed block-length threshold. */
    private static final int MAX_THRESHOLD = 100;

    private static final Pattern DOCTYPE = Pattern.compile("(?is)<!DOCTYPE.*?>");

    private static final Pattern HTML_COMMENT = Pattern.compile("(?is)<!--.*?-->");

    private static final Pattern SCRIPT = Pattern.compile("(?is)<script.*?>.*?</script>");

    private static final Pattern STYLE = Pattern.compile("(?is)<style.*?>.*?</style>");

    private static final Pattern ENTITY = Pattern.compile("&.{2,5};|&#.{2,5};");

    private static final Pattern SPAN_OPEN = Pattern.compile("<[sS][pP][aA][nN].*?>");

    private static final Pattern SPAN_CLOSE = Pattern.compile("</[sS][pP][aA][nN]>");

    /**
     * Three or more anchors in a row, each separated from the previous one by at most 30
     * arbitrary characters (the tolerated gap between links). Such runs are treated as
     * advertising / navigation noise.
     */
    private static final Pattern LINK_RUN = Pattern.compile(
            "<[a|A][^>]*?>[^>]+</[a|A]>(?:\\s*[\\s\\S]{0,30}\\s*<[a|A][^>]*?>[^>]+</[a|A]>){2,100}");

    /**
     * Tags with quoted attribute values, so that a '>' inside a quoted value does not end
     * the tag early.
     * NOTE(review): the greedy ".*" runs to the last quote on the same line, so on a line
     * holding several quoted tags the text between them is removed as well - confirm whether
     * that is acceptable before tightening this expression.
     */
    private static final Pattern QUOTED_TAG = Pattern.compile("<[^>'\"]*['\"].*['\"].*?>");

    private static final Pattern ANY_TAG = Pattern.compile("<.*?>");

    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Value of the {@code _flag} argument of the last parse call; not read by this class. */
    private boolean flag;

    /**
     * Block-length threshold used by the last extraction, or -1 before the first one. It is
     * recomputed for every page and clamped to [MIN_THRESHOLD, MAX_THRESHOLD]. A larger
     * value raises precision and lowers recall (helps when clusters of headlines leak into
     * the result); a smaller value lets more noise through but still captures a body that
     * consists of a single sentence.
     */
    private int threshold;

    public TextExtract() {
        flag = false;
        threshold = -1;
    }

    /**
     * Extracts the body text without checking whether the page is a directory/list page,
     * i.e. the caller already knows the page is a topic page.
     *
     * @param _html
     *            HTML source of the page
     *
     * @return extracted body text; empty when nothing qualifies or {@code _html} is null
     */
    public String parse(String _html) {
        return parse(_html, false);
    }

    /**
     * Extracts the body text of the given HTML.
     *
     * @param _html
     *            HTML source of the page
     * @param _flag
     *            intended to request a topic-page check; it is only stored, the code in this
     *            class never acts on it. {@link #parse(String)} passes false.
     *
     * @return extracted body text, one kept line per output line; empty when nothing
     *         qualifies or {@code _html} is null
     * @note 2014-06-04 cphmvp: restores the blank in "img src=" that pre-processing removes
     */
    public String parse(String _html, boolean _flag) {
        flag = _flag;
        if (_html == null) {
            // Nothing to extract; previously this ended in a NullPointerException.
            return "";
        }
        return getText(preProcess(_html)).replace("imgsrc=", "img src=");
    }

    /**
     * Strips everything that is not visible text from the page source.
     *
     * @param source
     *            raw HTML
     * @return the text with doctype, comments, scripts, styles, entities, link runs and tags
     *         removed, all blanks deleted and line ends normalised to '\n'
     */
    private static String preProcess(String source) {
        source = DOCTYPE.matcher(source).replaceAll("");
        source = HTML_COMMENT.matcher(source).replaceAll("");
        source = SCRIPT.matcher(source).replaceAll("");
        source = STYLE.matcher(source).replaceAll("");
        // Character entities become a blank (which is deleted again just below).
        source = ENTITY.matcher(source).replaceAll(" ");

        // Links are often wrapped in <span>; unwrap them so that link runs become adjacent.
        source = SPAN_OPEN.matcher(source).replaceAll("");
        source = SPAN_CLOSE.matcher(source).replaceAll("");

        // Deleting blanks first keeps the link-run expression simple (the 3x speed-up).
        source = source.replace(" ", "");
        source = LINK_RUN.matcher(source).replaceAll("");

        source = QUOTED_TAG.matcher(source).replaceAll("");
        // Two passes: removing a tag can join fragments into a new tag.
        source = ANY_TAG.matcher(source).replaceAll("");
        source = ANY_TAG.matcher(source).replaceAll("");
        source = source.replace("\r\n", "\n");
        return source;
    }

    /**
     * Selects the body lines from pre-processed text using the block-length distribution.
     *
     * <p>
     * A region opens at a block longer than the threshold (half the threshold for the very
     * first region, because a leading title block tends to be short) that is followed by a
     * non-empty block, and closes at the first empty block. A closed region that mentions a
     * copyright notice is not emitted and the scan does not reopen afterwards, so nothing
     * after such a region is emitted.
     * </p>
     *
     * @param cleaned
     *            output of {@link #preProcess(String)}
     * @return the selected lines, each followed by '\n'; empty when the text has no more
     *         than BLOCKS_WIDTH lines (e.g. the page was all script)
     */
    private String getText(String cleaned) {
        // Fixed-size view over the split array; set() is enough for in-place stripping.
        List<String> lines = Arrays.asList(cleaned.split("\n"));
        int blockCount = lines.size() - BLOCKS_WIDTH;
        if (blockCount <= 0) {
            return "";
        }

        int[] blockLengths = new int[blockCount];
        int emptyLines = 0;
        int total = 0;
        for (int i = 0; i < blockCount; i++) {
            if (lines.get(i).length() == 0) {
                emptyLines++;
            }
            int chars = 0;
            for (int j = i; j < i + BLOCKS_WIDTH; j++) {
                String stripped = WHITESPACE.matcher(lines.get(j)).replaceAll("");
                lines.set(j, stripped);
                chars += stripped.length();
            }
            blockLengths[i] = chars;
            total += chars;
        }

        // Average block length, doubled per unit of (empty / non-empty line ratio) / 2.
        int shift = (emptyLines / (lines.size() - emptyLines)) >>> 1;
        threshold = Math.min(MAX_THRESHOLD, (total / blockCount) << shift);
        threshold = Math.max(MIN_THRESHOLD, threshold);

        int start = -1;
        int end = -1;
        boolean started = false;
        boolean ended = false;
        boolean firstMatch = true;
        StringBuilder text = new StringBuilder();

        for (int i = 0; i < blockCount - 1; i++) {
            int current = blockLengths[i];

            if (!started) {
                boolean opens = false;
                if (firstMatch && current > threshold / 2
                        && anyNonZero(blockLengths, i + 1, 2)) {
                    firstMatch = false;
                    opens = true;
                } else if (current > threshold && anyNonZero(blockLengths, i + 1, 3)) {
                    opens = true;
                }
                if (opens) {
                    started = true;
                    start = i;
                    continue;
                }
            }

            if (started && (current == 0 || blockLengths[i + 1] == 0)) {
                end = i;
                ended = true;
            }

            if (ended) {
                String region = joinLines(lines, start, end);
                if (mentionsCopyright(region)) {
                    // started/ended stay set on purpose: see the method description.
                    continue;
                }
                text.append(region);
                started = false;
                ended = false;
            }
        }

        // A region that opened but never closed runs to the last line of the page.
        if (start > end) {
            String tail = joinLines(lines, start, lines.size() - 1);
            if (!mentionsCopyright(tail)) {
                text.append(tail);
            }
        }
        return text.toString();
    }

    /**
     * Tells whether any of {@code count} values starting at {@code from} is non-zero;
     * positions past the end of the array count as zero.
     */
    private static boolean anyNonZero(int[] values, int from, int count) {
        for (int k = from; k < from + count && k < values.length; k++) {
            if (values[k] != 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Concatenates lines {@code from..to} (inclusive), each followed by '\n', skipping
     * lines shorter than MIN_LINE_LENGTH.
     */
    private static String joinLines(List<String> lines, int from, int to) {
        StringBuilder buffer = new StringBuilder();
        for (int i = from; i <= to; i++) {
            String line = lines.get(i);
            if (line.length() < MIN_LINE_LENGTH) {
                continue;
            }
            buffer.append(line).append('\n');
        }
        return buffer.toString();
    }

    /** Tells whether the text carries one of the copyright markers used to reject footers. */
    private static boolean mentionsCopyright(String str) {
        return str.contains("Copyright") || str.contains("版权所有");
    }

    public static void main(String[] args) {
        System.out.println("===============");
        String s = "<img  class='fit-image' onload='javascript:if(this.width>498)this.width=498;' />hello";
        System.out.println(TextExtract.preProcess(s));
    }

}

crawler_基于块儿统计正文抽取_改进版的更多相关文章

java版正文抽取基于文字连接比
package cn.tdt.crawl.jdbc; import java.util.regex.Matcher; import java.util.regex.Pattern; import or ...
Codevs_1040_[NOIP2001]_统计单词个数_(划分型动态规划)
描述 http://codevs.cn/problem/1040/ 与Codevs_1017_乘积最大很像,都是划分型dp. 给出一个字符串和几个单词,要求将字符串划分成k段,在每一段中求共有多少单词 ...
基于Gitlab统计代码行--统计所有仓库、所有提交人的代码总行数（新增加-删除）
公司绩效考核要求,统计GITLAB仓库所有人提示有效代码行业脚本1: 统计所有仓库.所有提交人的代码总行数(新增加-删除) 脚本2: 统计所有仓库.所有提交人的代码提交汇总与删除汇总脚本3: 统计 ...
Adaboost算法的一个简单实现——基于《统计学习方法(李航)》第八章
最近阅读了李航的<统计学习方法(第二版)>,对AdaBoost算法进行了学习. 在第八章的8.1.3小节中,举了一个具体的算法计算实例.美中不足的是书上只给出了数值解,这里用代码将它实现一 ...
基于服务的SOA架构_后续篇
今天是元宵节,首先祝各位广大博友在接下来的光阴中技术更上一层,事事如意! 昨天简单介绍了一下本人在近期开发过的一个电商购物平台的架构流程和一些技术说明:今天将详细总结一下在项目中用到的各个架构技术的环 ...
【洛谷3321_BZOJ3992】[SDOI2015]序列统计（原根_多项式）
题目: 洛谷3321 分析: 一个转化思路比较神(典型?)的题-- 一个比较显然的\(O(n^3)\)暴力是用\(f[i][j]\)表示选了\(i\)个数,当前积在模\(m\)意义下为\(j\)的方案 ...
基于行块分布函数的通用网页正文内容抽取（带HTML格式）
算法思路: 假如网页正文(过滤html标签后的)有n行,以k行为一行块,总共可构成n-k+1行块: 以行号为索引号,以行块长度为索引值,形成行块稀疏矩阵: 以上面的稀疏矩阵为基础,找出其骤升骤降点,分 ...
数据挖掘：基于Spark+HanLP实现影视评论关键词抽取(1)
1. 背景近日项目要求基于爬取的影视评论信息,抽取影视的关键字信息.考虑到影视评论数据量较大,因此采用Spark处理框架.关键词提取的处理主要包含分词+算法抽取两部分.目前分词工具包较为主流的,包括 ...
我为开源做贡献，网页正文提取——Html2Article
为什么要做正文提取一般做舆情分析,都会涉及到网页正文内容提取.对于分析而言,有价值的信息是正文部分,大多数情况下,为了便于分析,需要将网页中和正文不相干的部分给剔除.可以说正文提取的好坏,直接影响了 ...

随机推荐

CareerCup它1.8 串移包括问题
[称号] 原文: 1.8 Assume you have a method isSubstring which checks if one word is a substring of another ...
解决PhpCms V9后台无法上传图片
PHPCMS V9 在近期一次更新的版本号(9.4.2)中因为代码推断失误.导致PHPCMS在后台更新文章无法上传图片而导致的bug.在PHPCMS论坛中找到了暂时解决方式,希望PHPCMS官方能尽快 ...
同ListView该接口无法通过手势滑动左右切换界面问题解决方法
同ListView该接口无法通过手势滑动左右切换界面问题解决方法问题描写叙述: 在做OnGestureListener滑动切换窗体的时候,会遇到这种问题.就是当界面中含有ListView的时候.On ...
Redis测井系统
什么是 SLOWLOG Slow log 是 Redis 用来记录查询运行时间的日志系统. 查询运行时间指的是不包含像client响应(talking).发送回复等 IO 操作,而单单是运行一个查询命 ...
POJ1458 Common Subsequence 【最长公共子序列】
Common Subsequence Time Limit: 1000MS Memory Limit: 10000K Total Submissions: 37614 Accepted: 15 ...
VIM 初步
按i前插入字符.a在光标后追加字符 ctrl+d将光标下称半个窗体.按Ctrl+u将光标上移半个窗体在输入模式下:ctrl+h删除字符,ctrl+u删除行.ctrl+w删除字命令模式下.x删除字符 ...
MEF初体验之九：部件生命周期
理解MEF容器中部件的生命周期及其含义是非常重要的.鉴于MEF重点在开放端应用程序,这将变得尤其重要的,一旦app ships和第三方扩展开始运行,作为应用程序的开发者将很好地控制这一系列的部件.生命 ...
CSDN个人空间能再烂吗？
CSDN空间你敢再烂么? 从CSDN博客跳转到CSDN个人空间的入口还算明显,可是想从个人空间跳转到博客,可真是众里寻他千百度.跳转接口怎么寻都寻不到.根本没有这个跳转的入口.唯一的途径仅仅能从写博文 ...
javascript基金会——鼠标事件，系统对话框，等等。
1.鼠标事件 (1).onclick:用户点击鼠标左键,并且当焦点处于button准时,按用户Enter关键,发生onclick事件 (2).ondblclick:当用户双击鼠标左键.发生ondblc ...
android file.createnewfile ioexception
近期在写项目的时候,文件有时候能创建成功有时候直接io异常,真是太扯淡.找了许久,最终找到原因 android 中创建文件,文件的名字中不能包括冒号啊这种特殊字符, 仅仅要你感觉有点特殊的字符最好都不 ...

crawler_基于块儿统计正文抽取_改进版

crawler_基于块儿统计正文抽取_改进版的更多相关文章

随机推荐

热门专题