C#工具:分词辅助类
using System;
using System.Collections;
using System.IO;
using System.Text.RegularExpressions; namespace Common
{
/// <summary>
/// Helper list used by the segmenter. It stores arbitrary objects, remembers the
/// longest string representation added so far, and can order its entries from
/// longest to shortest.
/// </summary>
public class SegList
{
    /// <summary>
    /// Length of the longest <c>ToString()</c> result passed to <see cref="Add"/>.
    /// </summary>
    public int MaxLength;

    private ArrayList m_seg;

    /// <summary>
    /// Number of stored entries.
    /// </summary>
    public int Count
    {
        get
        {
            return m_seg.Count;
        }
    }

    /// <summary>
    /// Creates an empty list.
    /// </summary>
    public SegList()
    {
        m_seg = new ArrayList();
        MaxLength = 0;
    }

    /// <summary>
    /// Appends an entry and updates <see cref="MaxLength"/>.
    /// </summary>
    /// <param name="obj">Entry to append; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="obj"/> is null.</exception>
    public void Add(object obj)
    {
        // Reject null before storing it; previously the null was stored and the
        // length check then failed with a NullReferenceException.
        if (obj == null)
        {
            throw new ArgumentNullException("obj");
        }

        m_seg.Add(obj);
        int length = obj.ToString().Length;
        if (MaxLength < length)
        {
            MaxLength = length;
        }
    }

    /// <summary>
    /// Returns the entry at position <paramref name="i"/>, or null when the
    /// index is outside the list.
    /// </summary>
    public object GetElem(int i)
    {
        if (i >= 0 && i < this.Count)
        {
            return m_seg[i];
        }
        return null;
    }

    /// <summary>
    /// Replaces the entry at position <paramref name="i"/>.
    /// </summary>
    public void SetElem(int i, object obj)
    {
        m_seg[i] = obj;
    }

    /// <summary>
    /// Reports whether the list contains <paramref name="obj"/>.
    /// </summary>
    public bool Contains(object obj)
    {
        return m_seg.Contains(obj);
    }

    /// <summary>
    /// Sorts this list by entry length, longest first.
    /// </summary>
    public void Sort()
    {
        Sort(this);
    }

    /// <summary>
    /// Sorts <paramref name="list"/> in place by entry length, longest first.
    /// The placeholder entry "null" is treated as having length 0.
    /// </summary>
    /// <param name="list">List to sort; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="list"/> is null.</exception>
    public void Sort(SegList list)
    {
        if (list == null)
        {
            throw new ArgumentNullException("list");
        }

        // Selection sort: move the longest remaining entry to position i.
        for (int i = 0; i < list.Count - 1; ++i)
        {
            int max = i;
            for (int j = i + 1; j < list.Count; ++j)
            {
                if (SortLength(list.GetElem(j)) > SortLength(list.GetElem(max)))
                {
                    max = j;
                }
            }
            object o = list.GetElem(max);
            list.SetElem(max, list.GetElem(i));
            list.SetElem(i, o);
        }
    }

    /// <summary>
    /// Length used for ordering: 0 for a null entry or the "null" placeholder,
    /// otherwise the length of the entry's string representation.
    /// </summary>
    private static int SortLength(object elem)
    {
        if (elem == null)
        {
            return 0;
        }
        string text = elem.ToString();
        if (text == "null")
        {
            return 0;
        }
        return text.Length;
    }
}
/// <summary>
/// 分词类
/// </summary>
//----------------调用----------------------
//Segment seg = new Segment();
//seg.InitWordDics();
//seg.EnablePrefix = true;
//seg.Separator =" ";
//seg.SegmentText("字符串", false).Trim();
//-------------------------------------------
public class Segment
{
#region 私有字段
private string m_DicPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sDict.dic");
private string m_NoisePath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sNoise.dic");
private string m_NumberPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sNumber.dic");
private string m_WordPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sWord.dic");
private string m_PrefixPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sPrefix.dic");
private Hashtable htWords;
private ArrayList alNoise;
private ArrayList alNumber;
private ArrayList alWord;
private ArrayList alPrefix;
private double m_EventTime = ; /// <summary>
/// 分隔符
/// </summary>
private string m_Separator = " "; /// <summary>
/// 用于验证汉字的正则表达式
/// </summary>
private string strChinese = "[\u4e00-\u9fa5]";
#endregion #region 公有属性
/// <summary>
/// Path of the main dictionary file read by InitWordDics.
/// </summary>
public string DicPath
{
    get { return this.m_DicPath; }
    set { this.m_DicPath = value; }
}
/// <summary>
/// Stores a value in the ASP.NET application state under the given key.
/// </summary>
/// <param name="key">Key to store the value under.</param>
/// <param name="val">Value to cache; null is replaced by the string " ".</param>
private static void SetCache(string key, object val)
{
    if (val == null) val = " ";
    System.Web.HttpApplicationState application = System.Web.HttpContext.Current.Application;
    application.Lock();
    try
    {
        application.Set(key, val);
    }
    finally
    {
        // Release the application-state lock even when Set throws, so a failed
        // write cannot leave the lock held.
        application.UnLock();
    }
}
/// <summary>
/// Reads a value from the ASP.NET application state.
/// </summary>
/// <param name="key">Key the value was stored under.</param>
/// <returns>Whatever <c>Application.Get</c> returns for the key.</returns>
private static object GetCache(string key)
{
    object cached = System.Web.HttpContext.Current.Application.Get(key);
    return cached;
}
/// <summary>
/// Path of the noise-word list loaded by InitWordDics. The original author
/// marked this as "currently unused".
/// </summary>
public string NoisePath
{
    get { return this.m_NoisePath; }
    set { this.m_NoisePath = value; }
}
/// <summary>
/// Path of the digit dictionary file.
/// </summary>
public string NumberPath
{
    get { return this.m_NumberPath; }
    set { this.m_NumberPath = value; }
}
/// <summary>
/// Path of the letter dictionary file.
/// </summary>
public string WordPath
{
    get { return this.m_WordPath; }
    set { this.m_WordPath = value; }
}
/// <summary>
/// Path of the name-prefix dictionary file, used for correcting personal names.
/// </summary>
public string PrefixPath
{
    get { return this.m_PrefixPath; }
    set { this.m_PrefixPath = value; }
}
/// <summary>
/// Gets or sets whether name correction is enabled. Reading returns true when
/// the prefix list holds at least one entry. Setting true reloads the list from
/// <see cref="PrefixPath"/>; setting false replaces it with an empty list.
/// </summary>
public bool EnablePrefix
{
    get
    {
        // The list is null until InitWordDics or this setter has run; report
        // "disabled" in that case instead of throwing.
        return alPrefix != null && alPrefix.Count != 0;
    }
    set
    {
        if (value)
        {
            alPrefix = LoadWords(PrefixPath, alPrefix);
        }
        else
        {
            alPrefix = new ArrayList();
        }
    }
}
/// <summary>
/// Duration in milliseconds of the most recent load or segmentation call.
/// Segmenting a short string may report 0.
/// </summary>
public double EventTime
{
    get { return this.m_EventTime; }
}
/// <summary>
/// Separator placed between words; defaults to a single space. Assigning null
/// or an empty string is ignored and keeps the current separator.
/// </summary>
public string Separator
{
    get { return this.m_Separator; }
    set
    {
        if (string.IsNullOrEmpty(value))
        {
            return;
        }
        this.m_Separator = value;
    }
}
#endregion #region 构造方法
/// <summary>
/// Creates a segmenter that keeps the default dictionary paths. Nothing is
/// loaded here; call InitWordDics before segmenting.
/// </summary>
public Segment()
{
}
/// <summary>
/// Creates a segmenter with explicit dictionary paths and loads the dictionaries.
/// </summary>
/// <param name="p_DicPath">Path of the main dictionary file.</param>
/// <param name="p_NoisePath">Path of the noise-word list.</param>
/// <param name="p_NumberPath">Path of the digit dictionary file.</param>
/// <param name="p_WordPath">Path of the letter dictionary file.</param>
public Segment(string p_DicPath, string p_NoisePath, string p_NumberPath, string p_WordPath)
{
    // Each argument goes to its own field. Previously all four were written to
    // m_WordPath, so the first three paths were silently discarded.
    m_DicPath = p_DicPath;
    m_NoisePath = p_NoisePath;
    m_NumberPath = p_NumberPath;
    m_WordPath = p_WordPath;
    this.InitWordDics();
}
#endregion #region 公有方法
/// <summary>
/// Loads the main dictionary and the auxiliary word lists.
/// </summary>
/// <remarks>
/// The main dictionary is a two-level table: first character, then second
/// character, then a <see cref="SegList"/> of the remaining characters of every
/// word with that two-character start. A word of exactly two characters is
/// recorded with the placeholder entry "null". The table is built only when the
/// application-state entry "jcms_dict" is empty; otherwise the cached table is
/// reused. The elapsed time is stored in <see cref="EventTime"/>.
/// </remarks>
public void InitWordDics()
{
    DateTime start = DateTime.Now;
    if (GetCache("jcms_dict") == null)
    {
        Hashtable words = new Hashtable();
        // "using" closes the file even when a line fails to parse.
        using (StreamReader reader = new StreamReader(DicPath, System.Text.Encoding.UTF8))
        {
            string strline = reader.ReadLine();
            // Reading stops at end of file or at the first blank line.
            while (strline != null && strline.Trim() != "")
            {
                // Both lookup keys need a character each; a one-character line
                // cannot be indexed and is skipped instead of throwing.
                if (strline.Length >= 2)
                {
                    string strChar1 = strline.Substring(0, 1);
                    string strChar2 = strline.Substring(1, 1);

                    Hashtable father = (Hashtable)words[strChar1];
                    if (father == null)
                    {
                        father = new Hashtable();
                        words.Add(strChar1, father);
                    }

                    SegList list = (SegList)father[strChar2];
                    if (list == null)
                    {
                        list = new SegList();
                        father.Add(strChar2, list);
                    }

                    if (strline.Length > 2)
                    {
                        list.Add(strline.Substring(2));
                    }
                    else
                    {
                        list.Add("null");
                    }
                }
                strline = reader.ReadLine();
            }
        }
        SetCache("jcms_dict", words);
    }
    htWords = (Hashtable)GetCache("jcms_dict");

    alNoise = LoadWords(NoisePath, alNoise);
    alNumber = LoadWords(NumberPath, alNumber);
    alWord = LoadWords(WordPath, alWord);
    alPrefix = LoadWords(PrefixPath, alPrefix);

    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
}
/// <summary>
/// Reads a UTF-8 text file and returns its lines as a new ArrayList.
/// </summary>
/// <param name="strPath">Path of the file to read.</param>
/// <param name="list">Not read; a fresh list is always created. Kept so existing callers compile.</param>
/// <returns>A new list holding one string per line of the file, in file order.</returns>
public ArrayList LoadWords(string strPath, ArrayList list)
{
    ArrayList words = new ArrayList();
    // "using" disposes the reader on every path, including a failed read.
    using (StreamReader reader = new StreamReader(strPath, System.Text.Encoding.UTF8))
    {
        string line = reader.ReadLine();
        while (line != null)
        {
            words.Add(line);
            line = reader.ReadLine();
        }
    }
    return words;
}
/// <summary>
/// Writes every dictionary entry to the console, one per line, as first
/// character + second character + stored tail. Two-character words carry the
/// tail "null", which is printed as is.
/// </summary>
public void OutWords()
{
    // Nothing to print until the dictionary has been loaded.
    if (htWords == null) return;

    foreach (DictionaryEntry first in htWords)
    {
        foreach (DictionaryEntry second in (Hashtable)first.Value)
        {
            SegList tails = (SegList)second.Value;
            for (int i = 0; i < tails.Count; i++)
            {
                Console.WriteLine(first.Key.ToString() + second.Key.ToString() + tails.GetElem(i).ToString());
            }
        }
    }
}
/// <summary>
/// Writes each element of a list to the console, one per line.
/// </summary>
/// <param name="list">List to print; null prints nothing.</param>
public void OutArrayList(ArrayList list)
{
    if (list == null) return;
    for (int i = 0; i < list.Count; i++)
    {
        Console.WriteLine(list[i].ToString());
    }
}
/// <summary>
/// Segments a single line of text (line breaks are not supported) and returns
/// the pieces joined with the configured separator.
/// </summary>
/// <remarks>
/// NOTE(review): in this copy of the file most numeric literals are missing (for
/// example <c>int length = ;</c> and the empty <c>case :</c> labels), and several
/// <c>#region</c>/<c>#endregion</c> directives share a line with code, so the
/// method does not compile as shown. Restore both from the original source; the
/// missing values are deliberately not guessed here.
/// </remarks>
/// <param name="strText">Text to segment.</param>
/// <returns>The segmented text.</returns>
public string SegmentText(string strText)
{
// Append a "$" sentinel character; the return statement removes " $" again.
strText = (strText + "$").Trim();
// Without a loaded dictionary the input (sentinel included) is returned as is.
if (htWords == null) return strText;
if (strText.Length < ) return strText;
DateTime start = DateTime.Now;
// Index reached by the main loop; compared with the text length afterwards to
// decide whether the final character still has to be emitted.
int length = ;
// preFix / strPrefix hold the state of the name-handling branch further down.
int preFix = ;
// word / number: set by the letter and digit cases of the switch below.
bool word = false;
bool number = false;
// Output built so far.
string reText = "";
string strPrefix = "";
// Last character of reText, refreshed at the start of every iteration.
string strLastChar = "";
string strLastWords = Separator; for (int i = ; i < strText.Length - ; i++)
{
#region 对于每一个字的处理过程
// strChar1 is the current character and strChar2 the one after it; together
// they are the two keys of the two-level dictionary lookup.
string strChar1 = strText.Substring(i, );
string strChar2 = strText.Substring(i + , ).Trim();
// "yes" becomes true once the current character has been emitted or consumed.
bool yes;
SegList l;
Hashtable h; if (reText.Length > ) strLastChar = reText.Substring(reText.Length - ); if (strChar1 == " ")
{
if ((number || word) && strLastChar != Separator) reText += this.Separator;
yes = true;
}
else
yes = false; int CharType = GetCharType(strChar1);
switch (CharType)
{
case :
#region 如果是数字,如果数字的上一位是字母要和后面的数字分开
if (word)
{
reText += Separator;
}
word = false;
number = true;
strLastWords = "";
break;
#endregion
case :
case :
#region 如果是字母
if (number)
strLastWords = Separator;
else
strLastWords = ""; word = true;
number = false;
break;
#endregion
case :
case :
#region 第一级哈希表是否包含关键字,假如包含处理第二级哈希表
// Was the previous character a letter?
if (word) reText += Separator; #region 检测上一个是否是数字,这个过程是用于修正数字后的量词的
if (number && CharType != )
{
// The "n" entry of the dictionary is consulted for text that follows a number.
h = (Hashtable)htWords["n"];
if (h.ContainsKey(strChar1))
{
l = (SegList)h[strChar1];
if (l.Contains(strChar2))
{
reText += strChar1 + strChar2 + Separator;
yes = true;
i++;
}
else if (l.Contains("null"))
{
reText += strChar1 + Separator;
yes = true;
}
}
else
reText += Separator;
}
#endregion //非汉字数字的汉字
if (CharType == )
{
word = false;
number = false;
strLastWords = Separator;
}
else
{
word = false;
number = true;
strLastWords = "";
} // Fetch the second-level hash table
h = (Hashtable)htWords[strChar1]; // Does the second-level table contain the key?
if (h.ContainsKey(strChar2))
{
#region 第二级包含关键字
// Fetch the list object
l = (SegList)h[strChar2]; // Walk every entry to see whether it completes a word
for (int j = ; j < l.Count; j++)
{
bool have = false;
string strChar3 = l.GetElem(j).ToString(); // Check each candidate for a match; the length test guards the end of the text
if ((strChar3.Length + i + ) < strText.Length)
{
// Take as many characters as the candidate has, starting after the two key characters
string strChar = strText.Substring(i + , strChar3.Length).Trim();
if (strChar3 == strChar && !yes)
{
// A pending name prefix is flushed before the matched word is emitted.
if (strPrefix != "")
{
reText += strPrefix + Separator;
strPrefix = "";
preFix = ;
}
reText += strChar1 + strChar2 + strChar;
i += strChar3.Length + ;
have = true;
yes = true;
break;
}
}
else if ((strChar3.Length + i + ) == strText.Length)
{
// Same match as above, for a candidate that ends exactly at the end of the text.
string strChar = strText.Substring(i + ).Trim();
if (strChar3 == strChar && !yes)
{
if (strPrefix != "")
{
reText += strPrefix + Separator;
strPrefix = "";
preFix = ;
}
reText += strChar1 + strChar2 + strChar;
i += strChar3.Length + ;
have = true;
yes = true;
break;
}
} if (!have && j == l.Count - && l.Contains("null") && !yes)
{
// No longer candidate matched, but the "null" placeholder says the two key
// characters form a word on their own.
if (preFix == )
{
reText += strPrefix + strChar1 + strChar2;
strPrefix = "";
preFix = ;
}
else if (preFix > )
{
reText += strPrefix + strLastWords + strChar1 + strChar2;
strPrefix = "";
preFix = ;
}
else
{
// NOTE(review): both branches of this if/else append the same text.
if (CharType == ) reText += strChar1 + strChar2;
else reText += strChar1 + strChar2;
strLastWords = this.Separator;
number = false;
}
i++;
yes = true;
break;
}
else if (have)
{
break;
}
}
#endregion //如果没有匹配还可能有一种情况,这个词语只有两个字,以这两个字开头的词语不存在
if (!yes && l.Contains("null"))
{
if (preFix == )
{
reText += strPrefix + strChar1 + strChar2;
strPrefix = "";
preFix = ;
}
else if (preFix > )
{
reText += strPrefix + strLastWords + strChar1 + strChar2;
strPrefix = "";
preFix = ;
}
else
{
// NOTE(review): both branches of this if/else append the same text.
if (CharType == ) reText += strChar1 + strChar2;
else reText += strChar1 + strChar2;
strLastWords = this.Separator;
number = false;
}
i++;
yes = true;
}
if (reText.Length > ) strLastChar = reText.Substring(reText.Length - );
if (CharType == && GetCharType(strLastChar) == )
{
number = true;
}
else if (strLastChar != this.Separator) reText += this.Separator;
}
#endregion
break;
default:
#region 未知字符,可能是生僻字,也可能是标点符合之类
if (word && !yes)
{
reText += Separator;
}
else if (number && !yes)
{
reText += Separator;
}
number = false;
word = false;
strLastWords = this.Separator;
break;
#endregion
}
// A digit or letter that was not consumed above is copied through unchanged.
if (!yes && number || !yes && word)
{
reText += strChar1;
yes = true;
}
if (!yes)
{
#region 处理姓名问题
// Name handling: alPrefix holds the entries of the name-prefix dictionary.
if (preFix == )
{
if (alPrefix.Contains(strChar1 + strChar2))
{
i++;
strPrefix = strChar1 + strChar2;
preFix++;
}
else if (alPrefix.Contains(strChar1))
{
if (!number)
{
strPrefix = strChar1;
preFix++;
}
else
{
reText += strChar1 + strLastWords;
number = false;
word = false;
}
}
else
{
if (preFix == )
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = ;
}
else if (preFix > )
{
// strChinese is the regular expression this class uses to recognise a Chinese character.
if (Regex.IsMatch(strChar1, strChinese))
{
strPrefix += strChar1;
preFix++;
}
else
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = ;
}
}
else
{
reText += strChar1 + strLastWords;
number = false;
word = false;
}
}
}
else
{
if (preFix == )
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = ;
}
else if (preFix > )
{
if (Regex.IsMatch(strChar1, strChinese))
{
strPrefix += strChar1;
preFix++;
}
else
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = ;
}
}
else
{
reText += strChar1 + strLastWords;
number = false;
}
}
#endregion
}
length = i;
#endregion
} #region 最后防止最后一个字的丢失
// Emit the final character when the loop stopped before reaching it.
if (length < strText.Length - )
{
string strLastChar1 = strText.Substring(strText.Length - ).Trim();
string strLastChar2 = strText.Substring(strText.Length - ).Trim(); if (reText.Length > ) strLastChar = reText.Substring(reText.Length - );
if (preFix != )
{
reText += strPrefix + strLastChar1;
}
else
{
switch (GetCharType(strLastChar1))
{
case :
if (strLastChar1 != "." && strLastChar1 != ".")
reText += strLastChar1;
else
reText += Separator + strLastChar1;
break;
case :
case :
if (alWord.Contains(strLastChar2))
reText += strLastChar1;
break;
case :
case :
if ((number || word) && strLastChar != Separator)
reText += Separator + strLastChar1;
else
reText += strLastChar1;
break;
default:
if (strLastChar != Separator)
reText += Separator + strLastChar1;
else
reText += strLastChar1;
break;
}
}
if (reText.Length > ) strLastChar = (reText.Substring(reText.Length - ));
if (strLastChar != this.Separator) reText += this.Separator;
}
// NOTE(review): on the next line the declaration of "duration" follows #endregion
// on the same line, where the preprocessor treats it as directive text; it needs
// a line of its own.
#endregion TimeSpan duration = DateTime.Now - start;
m_EventTime = duration.TotalMilliseconds;
return reText.Replace(" $", ""); // Strip the " $" sentinel text from the result
} /// <summary>
/// Segments text, optionally treating it as several lines.
/// </summary>
/// <param name="strText">Text to segment.</param>
/// <param name="Enter">
/// true to split the text on '\n', segment each piece separately and end each
/// result with "\r\n"; false to behave exactly like SegmentText(string).
/// </param>
/// <returns>The segmented text.</returns>
public string SegmentText(string strText, bool Enter)
{
    if (!Enter)
    {
        return SegmentText(strText);
    }

    DateTime start = DateTime.Now;
    string[] strArr = strText.Split('\n');
    // A StringBuilder avoids re-copying the whole result for every line.
    System.Text.StringBuilder reText = new System.Text.StringBuilder();
    for (int i = 0; i < strArr.Length; i++)
    {
        reText.Append(SegmentText(strArr[i]));
        reText.Append("\r\n");
    }

    // Overwrites the per-line timing with the total for the whole call.
    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
    return reText.ToString();
}
#region 判断字符类型
/// <summary>
/// Classifies a one-character string: 0 unknown, 1 digit, 2 letter, 3 Chinese
/// character, 4 Chinese numeral.
/// </summary>
/// <remarks>
/// The result is a sum: 1 when the digit list contains the character (replaced
/// by 2 when the letter list contains it), plus 3 when the character is a
/// first-level key of the main dictionary. A letter that is also a dictionary
/// key therefore yields 5.
/// </remarks>
/// <param name="p_Char">Character to classify.</param>
/// <returns>The character type code described above.</returns>
private int GetCharType(string p_Char)
{
    int CharType = 0;
    if (alNumber.Contains(p_Char)) CharType = 1;
    if (alWord.Contains(p_Char)) CharType = 2;
    if (htWords.ContainsKey(p_Char)) CharType += 3;
    return CharType;
}
#endregion #region 对加载的词典排序并重新写入
/// <summary>
/// Sorts the loaded dictionary and writes it back to the dictionary file,
/// without reloading it afterwards.
/// </summary>
public void SortDic()
{
    const bool reloadAfterwards = false;
    SortDic(reloadAfterwards);
}
/// <summary>
/// Sorts every tail list of the loaded dictionary (longest first) and rewrites
/// the dictionary file from the in-memory table.
/// </summary>
/// <param name="Reload">true to call InitWordDics after the file has been written.</param>
/// <exception cref="InvalidOperationException">The dictionary has not been loaded.</exception>
public void SortDic(bool Reload)
{
    // Check before opening the file: opening it for writing truncates it, so an
    // unloaded dictionary used to wipe the file and then fail.
    if (htWords == null)
    {
        throw new InvalidOperationException("The dictionary has not been loaded; call InitWordDics first.");
    }

    DateTime start = DateTime.Now;
    using (StreamWriter sw = new StreamWriter(DicPath, false, System.Text.Encoding.UTF8))
    {
        foreach (DictionaryEntry first in htWords)
        {
            foreach (DictionaryEntry second in (Hashtable)first.Value)
            {
                SegList aa = (SegList)second.Value;
                aa.Sort();
                for (int i = 0; i < aa.Count; i++)
                {
                    string tail = aa.GetElem(i).ToString();
                    // The "null" placeholder marks a two-character word and is not written out.
                    if (tail == "null")
                        sw.WriteLine(first.Key.ToString() + second.Key.ToString());
                    else
                        sw.WriteLine(first.Key.ToString() + second.Key.ToString() + tail);
                }
            }
        }
    }

    if (Reload) InitWordDics();

    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
}
#endregion /// <summary>
/// Removes exact duplicate lines from the dictionary file. The original author
/// marked this as "currently unused".
/// </summary>
/// <remarks>
/// Reading stops at end of file or at the first blank line, and the file is
/// then rewritten with only the distinct lines read so far, in their original
/// order. The elapsed time is stored in <see cref="EventTime"/>.
/// </remarks>
/// <returns>Number of duplicate lines that were dropped.</returns>
public int Optimize()
{
    int l = 0;
    DateTime start = DateTime.Now;

    Hashtable htOptimize = new Hashtable();   // membership test
    ArrayList unique = new ArrayList();       // distinct lines in file order
    using (StreamReader reader = new StreamReader(DicPath, System.Text.Encoding.UTF8))
    {
        string strline = reader.ReadLine();
        while (strline != null && strline.Trim() != "")
        {
            if (!htOptimize.ContainsKey(strline))
            {
                htOptimize.Add(strline, null);
                unique.Add(strline);
            }
            else
            {
                l++;
            }
            // Advance to the next line; without this the loop never ended.
            strline = reader.ReadLine();
        }
    }
    Console.WriteLine("ready");

    using (StreamWriter sw = new StreamWriter(DicPath, false, System.Text.Encoding.UTF8))
    {
        foreach (object word in unique)
        {
            sw.WriteLine(word.ToString());
        }
    }

    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
    return l;
}
#endregion
}
}
SegList
C#工具:分词辅助类的更多相关文章
- [分词] C#SegList分词辅助类,帮助类 (转载)
点击下载 SegList.rar 主要功能如下最新的SegList分词辅助类,帮助类看下面代码吧 /// <summary> /// 类说明:SegList /// 编 码 人:苏飞 // ...
- java并发编程工具类辅助类:CountDownLatch、CyclicBarrier和 Semaphore
在java 1.5中,提供了一些非常有用的辅助类来帮助我们进行并发编程,比如CountDownLatch,CyclicBarrier和Semaphore,今天我们就来学习一下这三个辅助类的用法. 以下 ...
- 常用中文分词工具分词&词性标注简单应用(jieba、pyhanlp、pkuseg、foolnltk、thulac、snownlp、nlpir)
1.jieba分词&词性标注 import jieba import jieba.posseg as posseg txt1 =''' 文本一: 人民网华盛顿3月28日电(记者郑琪)据美国约翰 ...
- NLP自然语言处理中英文分词工具集锦与基本使用介绍
一.中文分词工具 (1)Jieba (2)snowNLP分词工具 (3)thulac分词工具 (4)pynlpir 分词工具 (5)StanfordCoreNLP分词工具 1.from stanfor ...
- 句法分析工具 LTP HanLP
参考:http://cslt.riit.tsinghua.edu.cn/mediawiki/images/e/e5/%E5%8F%A5%E6%B3%95%E5%B7%A5%E5%85%B7%E5%88 ...
- java中重要的多线程工具类
前言 之前学多线程的时候没有学习线程的同步工具类(辅助类).ps:当时觉得暂时用不上,认为是挺高深的知识点就没去管了.. 在前几天,朋友发了一篇比较好的Semaphore文章过来,然后在浏览博客的时候 ...
- 重写lucene.net的分词器支持3.0.3.0版本
lucene.net中每个分词器都是一个类,同时有一个辅助类,这个辅助类完成分词的大部分逻辑.分词类以Analyzer结尾,辅助类通常以Tokenizer结尾.分类词全部继承自Analyzer类,辅助 ...
- 第二百三十六节,Bootstrap辅组类和响应式工具
Bootstrap辅组类和响应式工具 学习要点: 1.辅组类 2.响应式工具 本节课我们主要学习一下 Bootstrap 的辅组类和响应式工具,辅助类提供了一组类来辅 组页面设计,而响应式工具则利用媒 ...
- .NET开发相关使用工具和框架
转自: http://www.cnblogs.com/NatureSex/archive/2011/04/21/2023265.html 开发类 visual_studio 2005-2010系列-- ...
随机推荐
- CSS3实例分享之多重背景的实现(Multiple backgrounds)
CSS3的诞生为我们解决了这一问题,在CSS3里,通过background-image或者background可以为一个容器设置多张背景图像,也就是说可以把不同背景图象只放到一个块元素里. 首先我们来 ...
- 前端教程(1)http协议的深刻理解
一 HTTP协议简介 作为学习前端开发的开始,我们必须搞明白以下几件事 1.什么是互联网 互联网=物理连接介质+互联网协议 2.互联网建立的目的? 数据传输打破地域限制,否则的话,我 ...
- 大数据技术之_19_Spark学习_03_Spark SQL 应用解析小结
========== Spark SQL ==========1.Spark SQL 是 Spark 的一个模块,可以和 RDD 进行混合编程.支持标准的数据源.可以集成和替代 Hive.可以提供 J ...
- 从壹开始微服务 [ DDD ] 之十一 ║ 基于源码分析,命令分发的过程(二)
缘起 哈喽小伙伴周三好,老张又来啦,DDD领域驱动设计的第二个D也快说完了,下一个系列我也在考虑之中,是 Id4 还是 Dockers 还没有想好,甚至昨天我还想,下一步是不是可以写一个简单的Angu ...
- 从0到1:使用Caliburn.Micro(WPF和MVVM)开发简单的计算器
从0到1:使用Caliburn.Micro(WPF和MVVM)开发简单的计算器 之前时间一直在使用Caliburn.Micro这种应用了MVVM模式的WPF框架做开发,是时候总结一下了. Calibu ...
- java游戏开发杂谈 - java编程怎么学
java语言包含很多的知识点,我们并不需要把java语言的知识点都学全了,才开始编程. 你只需要了解: 1,java的数据类型和变量定义 2,类和对象的初步印象. 3,if-else, wh ...
- DSAPI 网页获取本地程序登陆用户
这是一个非常简单的示例,在网页或其他平台获取程序中已经登陆的用户,当然也可以是其他信息. 源码 Imports DSAPI.网络.HTTP监听_DSWebAPI Public Class Form1 ...
- JS,JQ 格式化小数位数
在<script>中: $(function(){ var num=$(".price").length;/*获取应用了class="price"的 ...
- Android SQLite 数据库学习
SQLite 数据库简介 SQLite 是一个轻量级数据库,它是D. Richard Hipp建立的公有领域项目,在2000年发布了第一个版本.它的设计目标是嵌入式的,而且占用资源非常低,在内存中只需 ...
- gitbook 入门教程之环境要求
gitbook 是基于 node.js 的命令行工具,首先需要安装并配置好 node.js 环境,然后才能安装gitbook 相关工具. 由于安装工具全部都是国外网站,因此速度可能会很慢,也可能需要F ...