提取html的正文

1 using System;
2 using System.Text;
3 namespace HtmlStrip
4 {
/// <summary>
/// Console demo for <see cref="HtmlParser"/>: strips the markup from a sample
/// HTML fragment and prints the remaining text.
/// </summary>
class MainClass
{
    /// <summary>
    /// Program entry point.
    /// </summary>
    /// <param name="args">
    /// Command-line arguments (not used).
    /// </param>
    public static void Main (string[] args)
    {
        // Sample markup. To process a real document instead, load it from disk, e.g.
        //   new System.IO.StreamReader ("/home/lx/test.html").ReadToEnd ()
        const string html = "<div>abc</div><span>efg</span><br /><script>888</script>oo";

        // Tags listed here are copied to the output instead of being filtered out.
        string[] preserved = new string[] { "br" };

        HtmlParser parser = new HtmlParser (html);
        parser.KeepTag (preserved);

        string text = parser.Text ();
        Console.Write (text);
    }
}
/// <summary>
/// Minimal single-pass HTML-to-text extractor.
/// Markup is dropped and the text between tags is kept. Tags registered through
/// <see cref="KeepTag"/> are copied to the output verbatim, comments are removed,
/// and the bodies of script and style elements are discarded.
/// </summary>
class HtmlParser
{
    // Markup being scanned.
    private readonly string htmlcode;
    // Text produced so far.
    private readonly StringBuilder result = new StringBuilder ();
    // Index of the next character to read from htmlcode.
    private int seek;
    // Tag names whose markup must be preserved in the output.
    private string[] keepTag;
    // Backing field of the inTag property.
    private bool _inTag;
    // Index of the '<' of the tag Text() is currently inside, or -1 when it is outside any tag.
    private int tagStart = -1;
    // Name of the most recently opened tag (closing tags keep their leading '/').
    private string tagName = "";
    // Elements whose body is code or styling and therefore never part of the text.
    private static readonly string[] specialTag = new string[] { "script", "style" };

    /// <summary>
    /// Whether the read position is inside an angle-bracket pair.
    /// Setting it to true reads the tag name that starts at the current position
    /// into the parser state; setting it to false only clears the flag.
    /// </summary>
    public bool inTag {
        get { return _inTag; }
        set {
            _inTag = value;
            if (!value)
                return;
            readTagName ();
        }
    }

    /// <summary>
    /// Creates a parser for the given markup.
    /// </summary>
    /// <param name="html">
    /// The HTML source to analyse.
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="html"/> is null.
    /// </exception>
    public HtmlParser (string html)
    {
        if (html == null)
            throw new ArgumentNullException ("html");
        htmlcode = html;
        KeepTag (new string[] { });
    }

    /// <summary>
    /// Sets the tags that must not be filtered out.
    /// </summary>
    /// <param name="tags">
    /// Tag names without angle brackets, compared case-insensitively.
    /// Null is treated as an empty list. Use "!--" to preserve comments.
    /// </param>
    public void KeepTag (string[] tags)
    {
        keepTag = tags ?? new string[0];
    }

    /// <summary>
    /// Runs the extraction.
    /// </summary>
    /// <returns>
    /// The text of the document plus the markup of every kept tag. Calling the
    /// method again returns the same string without re-scanning the input.
    /// </returns>
    public string Text ()
    {
        while (seek < htmlcode.Length) {
            char c = htmlcode[seek++];
            if (c == '<') {
                tagStart = seek - 1;
                inTag = true;
                if (tagName.StartsWith ("!--", StringComparison.Ordinal))
                    skipComment ();
            } else if (c == '>' && tagStart >= 0) {
                inTag = false;
                closeTag ();
            } else if (tagStart < 0) {
                // Plain text, including a '>' that does not close any tag.
                result.Append (c);
            }
        }
        // A tag still open here was never terminated; its fragment is dropped.
        return result.ToString ();
    }

    /// <summary>
    /// Reads the tag name that starts at the current position. Leading white
    /// space is skipped; the name ends at the first white space that follows
    /// it, at '>' (which is left unread so that Text() sees the end of the
    /// tag) or at the end of the input.
    /// </summary>
    private void readTagName ()
    {
        StringBuilder name = new StringBuilder ();
        while (seek < htmlcode.Length) {
            char c = htmlcode[seek];
            if (c == '>') {
                _inTag = false;
                break;
            }
            seek++;
            if (!char.IsWhiteSpace (c)) {
                name.Append (c);
            } else if (name.Length > 0) {
                break;
            }
        }
        tagName = name.ToString ();
    }

    /// <summary>
    /// Skips the comment whose '&lt;' is at tagStart, up to and including its
    /// terminating "--&gt;". An unterminated comment consumes the rest of the
    /// input. The comment is copied to the output only when "!--" is a kept tag.
    /// </summary>
    private void skipComment ()
    {
        // Searching from the first '-' also accepts the degenerate "<!-->" form.
        int close = htmlcode.IndexOf ("-->", tagStart + 2, StringComparison.Ordinal);
        int end = close < 0 ? htmlcode.Length : close + 3;
        if (iskeepTag ("!--"))
            result.Append (htmlcode, tagStart, end - tagStart);
        seek = end;
        _inTag = false;
        tagStart = -1;
    }

    /// <summary>
    /// Handles the '&gt;' that ends the current tag: copies a kept tag to the
    /// output, or jumps over the body of a script/style element.
    /// </summary>
    private void closeTag ()
    {
        if (iskeepTag (tagName.Replace ("/", ""))) {
            // seek is just past '>', so this copies the tag from '<' to '>'.
            result.Append (htmlcode, tagStart, seek - tagStart);
        } else if (isSpecialTag (tagName)) {
            // The body is raw text: any '<' or '>' in it is not markup, so jump
            // straight to the matching closing tag (or drop the rest if it is missing).
            int close = htmlcode.IndexOf ("</" + tagName, seek, StringComparison.OrdinalIgnoreCase);
            seek = close < 0 ? htmlcode.Length : close;
        }
        tagStart = -1;
    }

    /// <summary>
    /// Tells whether a tag opens an element whose body must be discarded.
    /// </summary>
    /// <param name="tag">
    /// Tag name as read from the markup.
    /// </param>
    /// <returns>
    /// True for script and style, ignoring case.
    /// </returns>
    private static bool isSpecialTag (string tag)
    {
        foreach (string special in specialTag) {
            if (string.Equals (tag, special, StringComparison.OrdinalIgnoreCase))
                return true;
        }
        return false;
    }

    /// <summary>
    /// Tells whether a tag must be preserved in the output.
    /// </summary>
    /// <param name="tag">
    /// Tag name with any '/' already removed.
    /// </param>
    /// <returns>
    /// True when the name matches one of the kept tags, ignoring case.
    /// </returns>
    private bool iskeepTag (string tag)
    {
        foreach (string kept in keepTag) {
            if (string.Equals (tag, kept, StringComparison.OrdinalIgnoreCase))
                return true;
        }
        return false;
    }
}
156 }
157

提取html的正文的更多相关文章

c# 使用正则表达式提取章节小说正文全本篇
这一节主要内容是使用正则表达式提取网站的正文,主要面向于小说章节网站.其中涉及到一些其他知识点,比如异步读取.异步流写入等,代码中都会有详细的注解.现在流行的网络文学都是每日一更或几更,没有一个统一的 ...
提取HTML的正文类
本文转载:http://blog.csdn.net/cjh200102/article/details/6824895 //2.提取html的正文类 using System; using Syst ...
利用正则提取discuz的正文内容
源正文: [p=24, null, left][color=#000][font=宋体]近日,香港著名漫画家马荣成在香港举办的"[color=#ff660][url=http://cul.c ...
HTML 转文本及HTML内容提取(C#)
//1.HTML直接转文本 //使用方法 HtmlToText convert = new HtmlToText(); textBox2.Text = convert.Convert(textBox1 ...
Java 提取Word中的文本和图片
本文将介绍通过Java来提取或读取Word文档中文本和图片的方法.这里提取文本和图片包括同时提取文档正文当中以及页眉.页脚中的文本和图片. 使用工具:Free Spire.Doc for Java ...
Python网页正文转换语音文件的操作方法
天气真的是越来越冷啦,有时候我们想翻看网页新闻,但是又冷得不想把手拿出来,移动鼠标翻看.这时候,是不是特别想电脑像讲故事一样,给我们念出来呢?人生苦短,我有python啊,试试用 Python 来朗读 ...
UWP开发入门（二十三）——WebView
本篇讨论在UWP开发中使用WebView控件时常见的问题,以及一些小技巧. WebView是实际开发中常用的控件,很多大家抱怨的套网页的应用都是通过WebView来实现的.这里要澄清一个问题,套网页的 ...
JAVA爬虫 WebCollector
JAVA爬虫 WebCollector 爬虫简介: WebCollector是一个无须配置.便于二次开发的JAVA爬虫框架(内核),它提供精简的的API,只需少量代码即可实现一个功能强大的爬虫. 爬虫 ...
Python爬虫初学（二）—— 爬百度贴吧
Python爬虫初学(二)-- 爬百度贴吧昨天初步接触了爬虫,实现了爬取网络段子并逐条阅读等功能,详见Python爬虫初学(一). 今天准备对百度贴吧下手了,嘿嘿.依然是跟着这个博客学习的,这次仿照 ...

随机推荐

笔记-迎难而上之Java基础进阶6
import java.io.*; public class InputStreamDemo{ public static void main(String[] args) throws IOExce ...
SilverLight：布局（3）StackPanel 对象
ylbtech-SilverLight-Layout: 布局(3)StackPanel 对象 A, Nesting Layout Containers(内嵌布局容器) B, StackPanel(队列 ...
22. Spring Boot 拦截器HandlerInterceptor【从零开始学Spring Boot】
转:http://blog.csdn.net/linxingliang/article/details/52069495 上一篇对过滤器的定义做了说明,也比较简单.过滤器属于Servlet范畴的API ...
POJ2386 Lake Counting 【DFS】
Lake Counting Time Limit: 1000MS Memory Limit: 65536K Total Submissions: 20782 Accepted: 10473 D ...
WinDbg抓取dmp文件
应用程序发生异常时抓取dmp: adplus.vbs -crash -pn w3wp.exe -y srv*c:\symbols*http://msdl.microsoft.com/download/ ...
Android的logger机制分析
分析安卓的Logger机制一.概述 Logger机制是在Android系统中提供的一个轻量级的日志系统,这个日志系统是以驱动程序的形式在内核空间实现的,在用户空间分别提供了Java接口和C/C++接 ...
ffmpeg 内存池
ffmpeg 部分内存管理采用了内存池技术.基本的接口在libavutil目录下的buffer.c文件中实现: 1. av_buffer_pool_init 初始化内存池 2 av_buffer_ ...
mysql + php 中文乱码全是？解决方法
在my.ini文件中找到[client]和[mysqld]字段,在下面均加上default-character-set=utf8,保存并关闭,重启服务器在window下重启失败,这是因为你安装了高版 ...
Java泛型类型变量的限定
有时候,类和方法须要对类型变量加以约束.比方你有一个方法,你仅仅希望它接收某个特定类型及其子类型作为參数. 以下就举一个方法限定接收參数的类型的样例来说明怎样限定类型变量. 首先有几个简单的辅助类: ...
kubernetes调度之pod优先级和资源抢占
系列目录 Pod可以拥有优先级.优先意味着相对于其它pod某个pod更为重要.如果重要的pod不能被调度,则kubernetes调度器会优先于(驱离)低优先级的pod来让处于pending状态的高优先 ...

提取html的正文

提取html的正文的更多相关文章

随机推荐

热门专题