HTML 转文本及HTML内容提取(C#)

// 1. Converting HTML directly to plain text
// Usage: run the HTML held in textBox1 through HtmlToText and show the
// resulting text in textBox2.
textBox2.Text = new HtmlToText().Convert(textBox1.Text);
// Implementation
/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// The converter is a single-pass scanner: tags listed in <c>_tags</c> are
/// replaced by a line break or tab, tags listed in <c>_ignoreTags</c> are
/// dropped together with their content, all other tags are removed, and
/// runs of whitespace are collapsed (except inside &lt;pre&gt;).
/// HTML entities are decoded at the very end.
/// An instance keeps parsing state in fields, so a single instance must not
/// run <see cref="Convert"/> from several threads at once.
/// </remarks>
class HtmlToText
{
    // Static data tables

    // Tag name (lower case, "/name" for end tags) -> text written in its place
    protected static Dictionary<string, string> _tags;

    // Tags whose entire inner content is discarded
    protected static HashSet<string> _ignoreTags;

    // Subset of _ignoreTags whose content is raw text rather than markup:
    // nothing inside them is a tag except the matching end tag.
    private static readonly HashSet<string> _rawTextTags;

    // Instance variables
    protected TextBuilder _text;
    protected string _html;
    protected int _pos;

    // Static constructor (one time only)
    static HtmlToText()
    {
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        _ignoreTags = new HashSet<string> { "script", "noscript", "style", "object" };
        _rawTextTags = new HashSet<string> { "script", "style" };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted. <c>null</c> is treated as empty.</param>
    /// <returns>Resulting plain text (empty string for null or empty input)</returns>
    public string Convert(string html)
    {
        // Nothing to scan; also avoids a NullReferenceException on null input
        if (String.IsNullOrEmpty(html))
            return String.Empty;

        // Initialize state variables
        _text = new TextBuilder();
        _html = html;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                    _text.Write(value);

                if (_ignoreTags.Contains(tag))
                    EatInnerContent(tag);
            }
            else if (Char.IsWhiteSpace(Peek()))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.Write(_text.Preformatted ? Peek() : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(Peek());
                MoveAhead();
            }
        }

        // Return result
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the tag at the current position
    /// and returns the tag name.
    /// </summary>
    /// <param name="selfClosing">Set to true if a '/' was seen after the tag name</param>
    /// <returns>
    /// Lower-cased tag name ("/name" for end tags), "!--" for an HTML comment,
    /// or an empty string if the current character is not '&lt;'.
    /// </returns>
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            // Comments are skipped as a unit. Parsing them like a tag would
            // stop at the first '>' and would treat quotes or apostrophes in
            // the comment text as attribute values.
            if (String.CompareOrdinal(_html, _pos, "<!--", 0, 4) == 0)
            {
                // Searching from the first '-' also ends "<!-->" and "<!--->"
                int end = _html.IndexOf("-->", _pos + 2, StringComparison.Ordinal);
                _pos = (end < 0) ? _html.Length : end + 3;
                return "!--";
            }

            MoveAhead();

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
                MoveAhead();
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
                Peek() != '/' && Peek() != '>')
                MoveAhead();

            // Invariant lower-casing: tag names are looked up in ASCII tables,
            // so the result must not depend on the current culture.
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                if (Peek() == '"' || Peek() == '\'')
                    EatQuotedValue();
                else
                {
                    if (Peek() == '/')
                        selfClosing = true;
                    MoveAhead();
                }
            }
            MoveAhead();
        }
        return tag;
    }

    /// <summary>
    /// Consumes the inner content of the given tag, up to and including its
    /// end tag (or to the end of the input if there is none).
    /// </summary>
    /// <param name="tag">Lower-cased name of the start tag that was just parsed</param>
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;
        bool rawText = _rawTextTags.Contains(tag);

        // Number of currently open elements named 'tag' (the one just parsed
        // counts as one). Only used for non-raw-text elements.
        int depth = 1;

        while (!EndOfText)
        {
            if (Peek() != '<')
            {
                MoveAhead();
            }
            else if (rawText)
            {
                // Inside script/style a '<' is ordinary text unless it
                // starts the matching end tag.
                if (IsAtEndTag(endTag))
                {
                    bool ignored;
                    ParseTag(out ignored);
                    return;
                }
                MoveAhead();
            }
            else
            {
                // Consume a tag and keep track of same-named nesting
                bool selfClosing;
                string inner = ParseTag(out selfClosing);
                if (inner == endTag)
                {
                    depth--;
                    if (depth == 0)
                        return;
                }
                else if (inner == tag && !selfClosing)
                {
                    depth++;
                }
            }
        }
    }

    // Returns true if the '<' at the current position starts the given end
    // tag ("/name"), compared case-insensitively and followed by a character
    // that terminates a tag name (or by the end of the input)
    private bool IsAtEndTag(string endTag)
    {
        int nameStart = _pos + 1;
        if (String.Compare(_html, nameStart, endTag, 0, endTag.Length,
            StringComparison.OrdinalIgnoreCase) != 0)
            return false;

        int after = nameStart + endTag.Length;
        if (after >= _html.Length)
            return true;

        char next = _html[after];
        return next == '>' || next == '/' || Char.IsWhiteSpace(next);
    }

    // Returns true if the current position is at the end of
    // the string
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    // Safely returns the character at the current position
    // ('\0' when positioned at the end of the string)
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    // Safely advances the current position to the next character
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    // Moves the current position to the next non-whitespace
    // character.
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
            MoveAhead();
    }

    // Moves the current position to the next non-whitespace
    // character or the start of the next line, whichever
    // comes first
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
                break;
        }
    }

    // Moves the current position past a quoted value. The value ends at the
    // matching quote or at a line break, whichever comes first.
    protected void EatQuotedValue()
    {
        char c = Peek();
        if (c == '"' || c == '\'')
        {
            // Opening quote
            MoveAhead();

            // Find end of value
            _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
            if (_pos < 0)
                _pos = _html.Length;
            else
                MoveAhead(); // Closing quote
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;        // Completed output
        private StringBuilder _currLine;    // Line currently being assembled
        private int _emptyLines;            // Consecutive empty lines flushed so far
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to
                    // preformatted mode
                    if (_currLine.Length > 0)
                        FlushCurrLine();
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">Text to write; null is ignored</param>
        public void Write(string s)
        {
            if (s == null)
                return;
            foreach (char c in s)
                Write(c);
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else
            {
                if (c == '\r')
                {
                    // Ignore carriage returns. We'll process
                    // '\n' if it comes next
                }
                else if (c == '\n')
                {
                    // Flush current line
                    FlushCurrLine();
                }
                else if (Char.IsWhiteSpace(c))
                {
                    // Write single space character
                    int len = _currLine.Length;
                    if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                        _currLine.Append(' ');
                }
                else
                {
                    // Add character to current line
                    _currLine.Append(c);
                }
            }
        }

        // Appends the current line to output buffer
        protected void FlushCurrLine()
        {
            // Get current line; after trimming, a line made up only of
            // whitespace is simply empty
            string line = _currLine.ToString().Trim();

            if (line.Length == 0)
            {
                // An empty line: never emitted at the very start of the
                // output, and at most one in a row elsewhere
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                    _text.AppendLine(line);
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
                FlushCurrLine();
            return _text.ToString();
        }
    }
}
//2、提取html的正文类
using System;
using System.Text;
namespace HtmlStrip
{
class MainClass
{
    /// <summary>
    /// Demo entry point: strips the tags from a small HTML sample while
    /// keeping the br tag, and prints the result to the console.
    /// </summary>
    /// <param name="args">Command-line arguments (unused)</param>
    public static void Main (string[] args)
    {
        // To process a file instead of the inline sample, load its text
        // first, for example:
        //   new System.IO.StreamReader ("/home/lx/test.html").ReadToEnd ()
        const string sample = "<div>abc</div><span>efg</span><br /><script>888</script>oo";

        HtmlParser parser = new HtmlParser (sample);

        // Tags named here are not filtered out of the output
        string[] preserved = { "br" };
        parser.KeepTag (preserved);

        string extracted = parser.Text ();
        Console.Write (extracted);
    }
}
/// <summary>
/// Extracts the text content of an HTML document: tags and comments are
/// removed, the content following script/style start tags is dropped, and
/// tags registered through <see cref="KeepTag"/> are copied to the output
/// unchanged.
/// </summary>
class HtmlParser
{
    private const string CommentOpen = "!--";   // what follows '<' at the start of a comment
    private const string CommentClose = "-->";

    private readonly string html;                         // HTML being analysed
    private StringBuilder result = new StringBuilder ();  // output produced so far
    private int seek;                                     // index of the next character to read
    private string[] keepTag;                             // names of tags copied to the output verbatim
    private bool _inTag;                                  // true while the read position is inside <...>
    private bool needContent = true;                      // whether text at the read position is emitted
    private string tagName = "";                          // name of the tag most recently entered
    // Tags whose content is normally unwanted
    private string[] specialTag = new string[] { "script", "style", "!--" };

    /// <summary>
    /// Whether the read position is inside angle brackets. Setting this to
    /// true (done when a '&lt;' has just been read) also reads the tag name
    /// that follows into the current tag name.
    /// </summary>
    public bool inTag {
        get { return _inTag; }
        set {
            _inTag = value;
            if (!value)
                return;
            tagName = "";
            // Bounded by the input length so that an unterminated tag at
            // the end of the document cannot read past the end.
            while (seek < html.Length) {
                char c = read ();
                if (c == '>') {
                    // Leave '>' unread so Text () sees it and closes the tag
                    seek -= 1;
                    break;
                }
                if (char.IsWhiteSpace (c)) {
                    // Whitespace before the name is skipped; after it, it ends the name
                    if (tagName.Length > 0)
                        break;
                } else {
                    tagName += c;
                }
            }
        }
    }

    /// <summary>
    /// Initializes the parser.
    /// </summary>
    /// <param name="html">
    /// HTML code to analyse; null is treated as an empty document
    /// </param>
    public HtmlParser (string html)
    {
        this.html = html ?? "";
        KeepTag (new string[] { });
    }

    /// <summary>
    /// Sets which tags must not be filtered out of the output.
    /// </summary>
    /// <param name="tags">
    /// Tag names without angle brackets, matched case-insensitively;
    /// null means no tag is kept
    /// </param>
    public void KeepTag (string[] tags)
    {
        keepTag = tags ?? new string[] { };
    }

    /// <summary>
    /// Processes the remaining input and returns the extracted text.
    /// </summary>
    /// <returns>
    /// The processed text
    /// </returns>
    public string Text ()
    {
        int tagStart = 0;   // index of the '<' that opened the current tag
        while (seek < html.Length) {
            char c = read ();
            if (c == '<') {
                tagStart = seek - 1;
                if (String.CompareOrdinal (html, seek, CommentOpen, 0, CommentOpen.Length) == 0) {
                    // Comments are skipped as a unit so that a '>' inside
                    // the comment text is not mistaken for the end of a tag
                    skipComment (tagStart);
                } else {
                    inTag = true;
                }
            } else if (c == '>' && inTag) {
                inTag = false;
                if (iskeepTag (tagName.Replace ("/", ""))) {
                    // Copy the whole tag, brackets included
                    result.Append (html, tagStart, seek - tagStart);
                } else {
                    // Text is suppressed after a special start tag until
                    // the next tag that is neither special nor kept
                    needContent = !isSpecialTag (tagName);
                }
            } else if (!inTag && needContent) {
                // Ordinary text, including a stray '>' outside any tag
                result.Append (c);
            }
        }
        return result.ToString ();
    }

    // Moves the read position past the comment whose '<' is at commentStart;
    // an unterminated comment extends to the end of the input. The comment
    // is copied to the output only if "!--" was registered as a kept tag.
    private void skipComment (int commentStart)
    {
        // Searching from the first '-' also ends "<!-->" and "<!--->"
        int close = html.IndexOf (CommentClose, seek + 1, StringComparison.Ordinal);
        seek = (close < 0) ? html.Length : close + CommentClose.Length;
        if (iskeepTag (CommentOpen))
            result.Append (html, commentStart, seek - commentStart);
    }

    // Returns true if the name is one of the special tags (case-insensitive)
    private bool isSpecialTag (string name)
    {
        foreach (string special in specialTag) {
            if (string.Equals (name, special, StringComparison.OrdinalIgnoreCase))
                return true;
        }
        return false;
    }

    /// <summary>
    /// Determines whether the given tag is to be kept in the output.
    /// </summary>
    /// <param name="tag">
    /// Tag name with any '/' removed
    /// </param>
    /// <returns>
    /// true if the name matches a kept tag, ignoring case
    /// </returns>
    private bool iskeepTag (string tag)
    {
        foreach (string ta in keepTag) {
            if (string.Equals (tag, ta, StringComparison.OrdinalIgnoreCase)) {
                return true;
            }
        }
        return false;
    }

    // Returns the character at the read position and advances past it.
    // Callers must check seek < html.Length first.
    private char read ()
    {
        return html[seek++];
    }
}
}

引文原址：http://blog.csdn.net/cjh200102/article/details/6824895

HTML 转文本及HTML内容提取(C#)的更多相关文章

python利用正则表达式提取文本中特定内容
正则表达式是一个特殊的字符序列,它能帮助你方便的检查一个字符串是否与某种模式匹配. Python 自1.5版本起增加了re 模块,它提供 Perl 风格的正则表达式模式. re 模块使 Python ...
关于MFC文本框输入内容的获取与设置文本框的内容
八月要开始做界面了<(￣︶￣)/,然而目前只会用MFC╮(╯▽╰)╭ 好吧,言归正传,设置好文本框后,要获取用户输入的内容,可以用: GetDlgItemText() ; 这个函数有两个参数,第 ...
jquery获取文本框的内容
使用jquery获取文本框的内容有以下几种: 1.根据ID取值(id属性): // javascript <script type="text/javascript"> ...
jQuery清除文本框，内容并设置不可用
JQuery清除文本框,内容并设置不可用如果是设置只读,则将disabled换成readonly function CleanText(textid) { $("#"+text ...
Python即时网络爬虫项目: 内容提取器的定义(Python2.7版本)
1. 项目背景在Python即时网络爬虫项目启动说明中我们讨论一个数字:程序员浪费在调测内容提取规则上的时间太多了(见上图),从而我们发起了这个项目,把程序员从繁琐的调测规则中解放出来,投入到更高端 ...
Python即时网络爬虫项目: 内容提取器的定义
1. 项目背景在python 即时网络爬虫项目启动说明中我们讨论一个数字:程序员浪费在调测内容提取规则上的时间,从而我们发起了这个项目,把程序员从繁琐的调测规则中解放出来,投入到更高端的数据处理工作 ...
API例子：用Java/JavaScript下载内容提取器
1,引言本文讲解怎样用Java和JavaScript使用 GooSeeker API 接口下载内容提取器,这是一个示例程序.什么是内容提取器?为什么用这种方式?源自Python即时网络爬虫开源项目: ...
.Net 文本框实现内容提示(仿Google、Baidu)
原文:.Net 文本框实现内容提示(仿Google.Baidu) 1.Demo下载: 文本框实现内容提示(仿Google.Baidu).rar 2.创建数据库.表(我用的sqlserver2008数据 ...
shell编程系列11--文本处理三剑客之sed利用sed删除文本中的内容
shell编程系列11--文本处理三剑客之sed利用sed删除文本中的内容删除命令对照表命令含义 1d 删除第一行内容 ,10d 删除1行到10行的内容 ,+5d 删除10行到16行的内容 /p ...

随机推荐

Oracle 错误码
Oracle作为一款比较优秀同时也比较难以掌握的大型数据库,在我们学习使用的过程中,不可避免的会遇到一些错误,为此 Oracle 给出了一套完备的错误消息提示机制我们可以根据Oracle给出的消息提 ...
GET——token
private function get_token(){ $appid="wx4dae5d61b7f9935c"; $appSecret="24a91315a1a62a ...
移动端web页面使用position:fixed问题总结
近期完成了一个新的项目(搜狐直播),其中又涉及到了 fixed(固定位置定位)的问题,在之前的文章<移动Web产品前端开发口诀——“快”>中已经阐述过我对 iScroll 的态度,所以在这 ...
php经典面试题
1. 用PHP打印出前一天的时间,打印格式是2007年5月10日 22:21:21 2. PHP代码如下:$a="hello"; $b=&$a;unset($b);$b=& ...
C++通过OCCI操作Oracle数据库详解
1.安装OCCI 如果oracle数据库默认没有安装OCCI,可以自己从官网上下载与自己数据库版本一致的API,其中包含四个软件包: oracle-instantclient-sqlplus-10.2 ...
Android 常用对话框Dialog封装
Android 6种常用对话框Dialog封装包括: 消息对话框.警示(含确认.取消)对话框.单选对话框. 复选对话框.列表对话框.自定义视图(含确认.取消)对话框分别如下图所示: ...
C51库函数积累
C51库函数积累: (1)_chkfloat_: 函数定义:unsigned char _chkfloat_ ( float val); /* number to check */ 函数功能:_chk ...
qt-solutions提供了8个开源项目
其实这是官方提供的源代码,至于为什么会另建项目,而没有整合到QT项目里去,我猜可能有2个原因: 1. 这几个项目本身不完善,并且也不是QT项目的核心,因此没有必要花精力去完善 2. 一定程度上可以维护 ...
activity5 流程入门
http://blog.csdn.net/yangyi22/article/details/9225849 谢谢原文作者提供!
学习下关于ViewStub实例的用法及带Drawable的TextView的妙用
在项目中,我们可能有多种数据来源比如: 里面有ListView也有当获得数据为空的时候显示的空信息.根据点击的项目还是差事不同,显示的空消息也不同.a.没有收藏的项目,b目前没有收藏的差事. 其实实现 ...

HTML 转文本及HTML内容提取(C#)

HTML 转文本及HTML内容提取(C#)的更多相关文章

随机推荐

热门专题