[分词] C#SegList分词辅助类,帮助类 (转载)
主要功能如下
最新的SegList分词辅助类,帮助类
看下面代码吧
/// <summary>
/// 类说明:SegList
/// 编 码 人:苏飞
/// 联系方式:361983679
/// 更新网站:http://www.sufeinet.com/thread-655-1-1.html
/// </summary>
using System;
using System.Collections;
using System.IO;
using System.Text.RegularExpressions; namespace DotNet.Utilities
{
/// <summary>
/// Word-segmentation helper: a thin wrapper around an <see cref="ArrayList"/> that
/// also tracks the length of the longest entry added through <see cref="Add"/>.
/// </summary>
/// <remarks>
/// The integer literals in this class were missing from the copy this was restored
/// from; they have been reinstated as 0 / 1, which is the only reading under which
/// the selection sort below is well-formed.
/// </remarks>
public class SegList
{
    /// <summary>
    /// Length of the longest <c>ToString()</c> value passed to <see cref="Add"/> so far.
    /// Not updated by <see cref="SetElem"/>.
    /// </summary>
    public int MaxLength;

    private ArrayList m_seg;

    /// <summary>Number of stored entries.</summary>
    public int Count
    {
        get
        {
            return m_seg.Count;
        }
    }

    /// <summary>Creates an empty list.</summary>
    public SegList()
    {
        m_seg = new ArrayList();
        MaxLength = 0;
    }

    /// <summary>Appends an entry and updates <see cref="MaxLength"/>.</summary>
    /// <param name="obj">Entry to append; must not be null because its <c>ToString()</c> is measured.</param>
    public void Add(object obj)
    {
        m_seg.Add(obj);
        int length = obj.ToString().Length;
        if (MaxLength < length)
        {
            MaxLength = length;
        }
    }

    /// <summary>Returns the entry at index <paramref name="i"/>.</summary>
    /// <param name="i">Zero-based index.</param>
    /// <returns>The entry, or null when <paramref name="i"/> is outside the list.</returns>
    public object GetElem(int i)
    {
        // Negative indexes used to reach the ArrayList indexer and throw; treat them
        // like any other out-of-range index.
        if (i >= 0 && i < this.Count)
        {
            return m_seg[i];
        }
        return null;
    }

    /// <summary>Overwrites the entry at index <paramref name="i"/>.</summary>
    /// <param name="i">Zero-based index of an existing entry.</param>
    /// <param name="obj">Replacement value.</param>
    public void SetElem(int i, object obj)
    {
        m_seg[i] = obj;
    }

    /// <summary>Reports whether <paramref name="obj"/> is stored in the list.</summary>
    public bool Contains(object obj)
    {
        return m_seg.Contains(obj);
    }

    /// <summary>
    /// Sorts this list by entry length, longest first.
    /// </summary>
    public void Sort()
    {
        Sort(this);
    }

    /// <summary>
    /// Sorts <paramref name="list"/> in place by entry length, longest first.
    /// The placeholder entry "null" is ranked as length 0, so it ends up last.
    /// </summary>
    /// <param name="list">List to reorder.</param>
    public void Sort(SegList list)
    {
        // Selection sort, kept deliberately: swapping in another algorithm would
        // change the relative order of equal-length entries.
        for (int i = 0; i < list.Count - 1; ++i)
        {
            int max = i;
            for (int j = i + 1; j < list.Count; ++j)
            {
                if (SortLength(list.GetElem(j)) > SortLength(list.GetElem(max)))
                {
                    max = j;
                }
            }
            object o = list.GetElem(max);
            list.SetElem(max, list.GetElem(i));
            list.SetElem(i, o);
        }
    }

    // Length used for ordering: the "null" placeholder counts as 0.
    private static int SortLength(object entry)
    {
        string text = entry.ToString();
        return text == "null" ? 0 : text.Length;
    }
}
/// <summary>
/// 分词类
/// </summary>
//----------------调用----------------------
//Segment seg = new Segment();
//seg.InitWordDics();
//seg.EnablePrefix = true;
//seg.Separator =" ";
//seg.SegmentText("字符串", false).Trim();
//-------------------------------------------
public class Segment
{
#region 私有字段
// Dictionary file locations. Each default is resolved through Server.MapPath on the
// current HTTP context while the instance is being constructed, so these
// initializers dereference System.Web.HttpContext.Current.
private string m_DicPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sDict.dic");
private string m_NoisePath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sNoise.dic");
private string m_NumberPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sNumber.dic");
private string m_WordPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sWord.dic");
private string m_PrefixPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sPrefix.dic");

// Two-level dictionary: first character -> Hashtable(second character -> SegList of word tails).
private Hashtable htWords;
// Lines of the noise, number, letter and name-prefix dictionary files.
private ArrayList alNoise;
private ArrayList alNumber;
private ArrayList alWord;
private ArrayList alPrefix;
// Duration of the last load/segment/sort action in milliseconds (initializer literal restored to 0).
private double m_EventTime = 0;

/// <summary>
/// Separator inserted between segmented words.
/// </summary>
private string m_Separator = " ";

/// <summary>
/// Regular expression used to test whether a character is a Chinese character.
/// </summary>
private string strChinese = "[\u4e00-\u9fa5]";
#endregion
#region 公有属性
/// <summary>
/// 基本词典路径
/// </summary>
/// <value>Path of the base dictionary file.</value>
public string DicPath
{
    get { return m_DicPath; }
    set { m_DicPath = value; }
}
/// <summary>
/// 数据缓存函数
/// </summary>
/// <param name="key">索引键</param>
/// <param name="val">缓存的数据</param>
/// <remarks>
/// A null <paramref name="val"/> is stored as the single-space string " ".
/// The value is written to the application state of the current HTTP context while
/// the application-state lock is held.
/// </remarks>
private static void SetCache(string key, object val)
{
    if (val == null) val = " ";
    System.Web.HttpApplicationState application = System.Web.HttpContext.Current.Application;
    application.Lock();
    try
    {
        application.Set(key, val);
    }
    finally
    {
        // Release the lock even when Set throws, so a failed write cannot leave it held.
        application.UnLock();
    }
}
/// <summary>
/// 读取缓存
/// </summary>
/// <param name="key">Key the value was stored under.</param>
/// <returns>Whatever the application state of the current HTTP context returns for <paramref name="key"/>.</returns>
private static object GetCache(string key)
{
    System.Web.HttpApplicationState application = System.Web.HttpContext.Current.Application;
    object cached = application.Get(key);
    return cached;
}
/// <summary>
/// 暂时无用
/// </summary>
/// <value>Path of the noise-word dictionary file.</value>
public string NoisePath
{
    get { return m_NoisePath; }
    set { m_NoisePath = value; }
}
/// <summary>
/// 数字词典路径
/// </summary>
/// <value>Path of the number dictionary file.</value>
public string NumberPath
{
    get { return m_NumberPath; }
    set { m_NumberPath = value; }
}
/// <summary>
/// 字母词典路径
/// </summary>
/// <value>Path of the letter dictionary file.</value>
public string WordPath
{
    get { return m_WordPath; }
    set { m_WordPath = value; }
}
/// <summary>
/// 姓名前缀字典 用于纠错姓名
/// </summary>
/// <value>Path of the name-prefix dictionary file.</value>
public string PrefixPath
{
    get { return m_PrefixPath; }
    set { m_PrefixPath = value; }
}
/// <summary>
/// 是否开启姓名纠错功能
/// </summary>
/// <value>
/// Getter: true when the name-prefix list currently holds at least one entry.
/// Setter: true reloads the list from <see cref="PrefixPath"/>; false replaces it
/// with an empty list.
/// </value>
public bool EnablePrefix
{
    get
    {
        // alPrefix is only assigned by InitWordDics or by this setter; report
        // "disabled" instead of throwing when neither has run yet.
        return alPrefix != null && alPrefix.Count != 0;
    }
    set
    {
        if (value)
        {
            alPrefix = LoadWords(PrefixPath, alPrefix);
        }
        else
        {
            alPrefix = new ArrayList();
        }
    }
}
/// <summary>
/// 用时:每次进行加载或分词动作后,该属性表示上一次动作所用的时间。
/// 已精确到毫秒,但分词操作在字符串较短时可能为0。
/// </summary>
/// <value>Read-only; the value last stored in m_EventTime, in milliseconds.</value>
public double EventTime
{
    get { return m_EventTime; }
}
/// <summary>
/// 分隔符,默认为空格
/// </summary>
/// <value>Current separator. Assigning null or an empty string is ignored.</value>
public string Separator
{
    get { return m_Separator; }
    set
    {
        // Keep the existing separator when the caller passes nothing usable.
        if (string.IsNullOrEmpty(value))
        {
            return;
        }
        m_Separator = value;
    }
}
#endregion
#region 构造方法
/// <summary>
/// 构造方法
/// </summary>
/// <remarks>Does no work of its own: the dictionaries are not loaded until InitWordDics is called.</remarks>
public Segment()
{
    // Intentionally empty.
}
/// <summary>
/// 构造方法
/// </summary>
/// <param name="p_DicPath">Path of the base dictionary file.</param>
/// <param name="p_NoisePath">Path of the noise-word dictionary file.</param>
/// <param name="p_NumberPath">Path of the number dictionary file.</param>
/// <param name="p_WordPath">Path of the letter dictionary file.</param>
/// <remarks>Stores the four paths and immediately loads the dictionaries via InitWordDics.</remarks>
public Segment(string p_DicPath, string p_NoisePath, string p_NumberPath, string p_WordPath)
{
    // Each argument goes to its own field. Previously all four were assigned to
    // m_WordPath, so only p_WordPath took effect and the other paths kept their defaults.
    m_DicPath = p_DicPath;
    m_NoisePath = p_NoisePath;
    m_NumberPath = p_NumberPath;
    m_WordPath = p_WordPath;
    this.InitWordDics();
}
#endregion
#region 公有方法
/// <summary>
/// 加载词列表
/// </summary>
/// <remarks>
/// Builds the two-level dictionary (first character -> second character -> SegList of
/// word tails) from <see cref="DicPath"/> unless one is already cached under the key
/// "jcms_dict", then loads the noise, number, letter and name-prefix lists and records
/// the elapsed time in <see cref="EventTime"/>.
/// Reading stops at the first blank line of the dictionary file. A two-character
/// dictionary line is stored with the placeholder tail "null".
/// The Substring offsets (0, 1, 2) were missing from the copy this was restored from
/// and have been reinstated to match how SortDic and OutWords reassemble entries as
/// first key + second key + tail.
/// </remarks>
public void InitWordDics()
{
    DateTime start = DateTime.Now;
    if (GetCache("jcms_dict") == null)
    {
        htWords = new Hashtable();
        // The using block closes the reader even when a malformed line throws.
        using (StreamReader reader = new StreamReader(DicPath, System.Text.Encoding.UTF8))
        {
            string strline = reader.ReadLine();
            while (strline != null && strline.Trim() != "")
            {
                string strChar1 = strline.Substring(0, 1);
                string strChar2 = strline.Substring(1, 1);

                // First level: one Hashtable per leading character.
                Hashtable father = (Hashtable)htWords[strChar1];
                if (father == null)
                {
                    father = new Hashtable();
                    htWords.Add(strChar1, father);
                }

                // Second level: one SegList of tails per second character.
                SegList list = (SegList)father[strChar2];
                if (list == null)
                {
                    list = new SegList();
                    father.Add(strChar2, list);
                }

                // A two-character word has no tail and is recorded as the "null" placeholder.
                if (strline.Length > 2)
                {
                    list.Add(strline.Substring(2));
                }
                else
                {
                    list.Add("null");
                }

                strline = reader.ReadLine();
            }
        }
        SetCache("jcms_dict", htWords);
    }
    htWords = (Hashtable)GetCache("jcms_dict");

    alNoise = LoadWords(NoisePath, alNoise);
    alNumber = LoadWords(NumberPath, alNumber);
    alWord = LoadWords(WordPath, alWord);
    alPrefix = LoadWords(PrefixPath, alPrefix);

    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
}
/// <summary>
/// 加载文本词组到ArrayList
/// </summary>
/// <param name="strPath">Path of a UTF-8 text file with one entry per line.</param>
/// <param name="list">Ignored: a new list is always created. Kept so existing callers still compile.</param>
/// <returns>A new list holding every line of the file in file order, blank lines included.</returns>
public ArrayList LoadWords(string strPath, ArrayList list)
{
    ArrayList words = new ArrayList();
    // The using block closes the reader on every path, including when a read throws.
    using (StreamReader reader = new StreamReader(strPath, System.Text.Encoding.UTF8))
    {
        string strline = reader.ReadLine();
        while (strline != null)
        {
            words.Add(strline);
            strline = reader.ReadLine();
        }
    }
    return words;
}
/// <summary>
/// 输出词列表
/// </summary>
/// <remarks>
/// Writes every dictionary entry to the console as first key + second key + tail,
/// one per line. The placeholder tail "null" is printed literally.
/// </remarks>
public void OutWords()
{
    foreach (DictionaryEntry first in htWords)
    {
        foreach (DictionaryEntry second in (Hashtable)first.Value)
        {
            SegList tails = (SegList)second.Value;
            // Loop start literal restored to 0.
            for (int i = 0; i < tails.Count; i++)
            {
                Console.WriteLine(first.Key.ToString() + second.Key.ToString() + tails.GetElem(i).ToString());
            }
        }
    }
}
/// <summary>
/// 输出ArrayList
/// </summary>
/// <param name="list">List to print; null is accepted and prints nothing.</param>
/// <remarks>Writes each element's <c>ToString()</c> to the console, one per line.</remarks>
public void OutArrayList(ArrayList list)
{
    if (list == null)
    {
        return;
    }
    // Loop start literal restored to 0.
    for (int i = 0; i < list.Count; i++)
    {
        Console.WriteLine(list[i].ToString());
    }
}
/// <summary>
/// 分词过程,不支持回车
/// </summary>
/// <param name="strText">要分词的文本</param>
/// <returns>分词后的文本</returns>
/// <remarks>
/// Appends a "$" sentinel to the text, then walks it one character at a time. Each
/// character is classified with GetCharType; for dictionary characters the current and
/// next character are looked up in the two-level htWords table and the stored tails are
/// compared against the following text. Output is accumulated in reText with Separator
/// between words; the elapsed time is meant to be stored in m_EventTime.
/// NOTE(review): every integer literal in this method is missing in this copy
/// (for example "strText.Length &lt; )", "case :", "Substring(i, )"), and several
/// preprocessor directives are fused onto code lines, so the method does not compile
/// as written. The missing values must be restored from the upstream source; the
/// hedged notes below record only what the surrounding code suggests.
/// </remarks>
public string SegmentText(string strText)
{
// Append the "$" sentinel; Trim() then strips whitespace at the start (the end is now "$").
strText = (strText + "$").Trim();
// Dictionary not loaded: the text is returned as-is, still carrying the "$" sentinel.
if (htWords == null) return strText;
// NOTE(review): minimum-length literal missing; too-short input is returned unsegmented (with "$").
if (strText.Length < ) return strText;
DateTime start = DateTime.Now;
// length: index reached by the main loop; preFix: count of characters collected in strPrefix.
// NOTE(review): both initializers are missing — presumably 0; confirm.
int length = ;
int preFix = ;
// word / number: set by the switch below when the character just handled was a letter / numeric.
bool word = false;
bool number = false;
string reText = "";
string strPrefix = "";
string strLastChar = "";
// NOTE(review): loop start and end-offset literals missing on the next line.
string strLastWords = Separator; for (int i = ; i < strText.Length - ; i++)
{
#region 对于每一个字的处理过程
// strChar1 / strChar2: the current and the following character.
// NOTE(review): the Substring offsets/lengths are missing — presumably (i, 1) and (i + 1, 1); confirm.
string strChar1 = strText.Substring(i, );
string strChar2 = strText.Substring(i + , ).Trim();
// yes: set once the current character has been emitted or consumed.
bool yes;
SegList l;
// A space in the input closes a pending number/letter run and is then treated as handled.
Hashtable h; if (reText.Length > ) strLastChar = reText.Substring(reText.Length - ); if (strChar1 == " ")
{
if ((number || word) && strLastChar != Separator) reText += this.Separator;
yes = true;
}
else
yes = false; int CharType = GetCharType(strChar1);
switch (CharType)
{
// NOTE(review): case label missing. GetCharType's summary lists 1 as "digit" — confirm.
case :
#region 如果是数字,如果数字的上一位是字母要和后面的数字分开
if (word)
{
reText += Separator;
}
word = false;
number = true;
strLastWords = "";
break;
#endregion
// NOTE(review): two case labels missing; this branch handles letters (see region label).
case :
case :
#region 如果是字母
if (number)
strLastWords = Separator;
else
strLastWords = ""; word = true;
number = false;
break;
#endregion
// NOTE(review): two case labels missing; this branch handles characters that are first-level dictionary keys.
case :
case :
#region 第一级哈希表是否包含关键字,假如包含处理第二级哈希表
// Was the previous character a letter?
// NOTE(review): the #region directive below is fused onto a code line and must be moved to its own line.
if (word) reText += Separator; #region 检测上一个是否是数字,这个过程是用于修正数字后的量词的
// After a number, the table stored under key "n" is consulted for the current character
// (per the region label, to fix up the measure word following a number).
// NOTE(review): comparison literal missing.
if (number && CharType != )
{
h = (Hashtable)htWords["n"];
if (h.ContainsKey(strChar1))
{
l = (SegList)h[strChar1];
if (l.Contains(strChar2))
{
reText += strChar1 + strChar2 + Separator;
yes = true;
i++;
}
else if (l.Contains("null"))
{
reText += strChar1 + Separator;
yes = true;
}
}
else
reText += Separator;
}
#endregion // Chinese character that is not a Chinese numeral
// NOTE(review): comparison literal missing.
if (CharType == )
{
word = false;
number = false;
strLastWords = Separator;
}
else
{
word = false;
number = true;
strLastWords = "";
} // Fetch the second-level hash table
h = (Hashtable)htWords[strChar1]; // Does the second-level hash table contain the key?
if (h.ContainsKey(strChar2))
{
#region 第二级包含关键字
// Fetch the list object (a SegList of word tails)
l = (SegList)h[strChar2]; // Walk every entry to see whether it completes a word
// NOTE(review): loop start literal missing.
for (int j = ; j < l.Count; j++)
{
bool have = false;
string strChar3 = l.GetElem(j).ToString(); // Test each candidate for a match, guarding against running past the end
// NOTE(review): the offsets added to i in this block are missing; the comment below mentions i+2.
if ((strChar3.Length + i + ) < strText.Length)
{
// Take m characters starting after i+2
string strChar = strText.Substring(i + , strChar3.Length).Trim();
if (strChar3 == strChar && !yes)
{
// A pending name prefix is flushed before the matched word.
if (strPrefix != "")
{
reText += strPrefix + Separator;
strPrefix = "";
preFix = ;
}
reText += strChar1 + strChar2 + strChar;
// Skip the characters consumed by the match.
i += strChar3.Length + ;
have = true;
yes = true;
break;
}
}
// Same check for a candidate that would end exactly at the end of the text.
else if ((strChar3.Length + i + ) == strText.Length)
{
string strChar = strText.Substring(i + ).Trim();
if (strChar3 == strChar && !yes)
{
if (strPrefix != "")
{
reText += strPrefix + Separator;
strPrefix = "";
preFix = ;
}
reText += strChar1 + strChar2 + strChar;
i += strChar3.Length + ;
have = true;
yes = true;
break;
}
// On the last candidate with no longer match: if the two-character word itself is in the
// dictionary (tail "null"), emit the two characters.
// NOTE(review): the literal after "l.Count -" is missing.
} if (!have && j == l.Count - && l.Contains("null") && !yes)
{
// NOTE(review): the preFix comparison/assignment literals in this if/else chain are missing.
if (preFix == )
{
reText += strPrefix + strChar1 + strChar2;
strPrefix = "";
preFix = ;
}
else if (preFix > )
{
reText += strPrefix + strLastWords + strChar1 + strChar2;
strPrefix = "";
preFix = ;
}
else
{
// Both branches of this if/else append the same text.
if (CharType == ) reText += strChar1 + strChar2;
else reText += strChar1 + strChar2;
strLastWords = this.Separator;
number = false;
}
i++;
yes = true;
break;
}
else if (have)
{
break;
}
}
#endregion // No match may also mean the word is exactly these two characters and no longer word starting with them exists
if (!yes && l.Contains("null"))
{
if (preFix == )
{
reText += strPrefix + strChar1 + strChar2;
strPrefix = "";
preFix = ;
}
else if (preFix > )
{
reText += strPrefix + strLastWords + strChar1 + strChar2;
strPrefix = "";
preFix = ;
}
else
{
// Both branches of this if/else append the same text.
if (CharType == ) reText += strChar1 + strChar2;
else reText += strChar1 + strChar2;
strLastWords = this.Separator;
number = false;
}
i++;
yes = true;
}
// Close the word with a separator unless the comparison below keeps a numeric run open.
// NOTE(review): the literals in the next two lines are missing.
if (reText.Length > ) strLastChar = reText.Substring(reText.Length - );
if (CharType == && GetCharType(strLastChar) == )
{
number = true;
}
else if (strLastChar != this.Separator) reText += this.Separator;
}
#endregion
break;
default:
#region 未知字符,可能是生僻字,也可能是标点符合之类
if (word && !yes)
{
reText += Separator;
}
else if (number && !yes)
{
reText += Separator;
}
number = false;
word = false;
strLastWords = this.Separator;
break;
#endregion
}
// Inside a number or letter run the character is appended as-is.
if (!yes && number || !yes && word)
{
reText += strChar1;
yes = true;
}
if (!yes)
{
#region 处理姓名问题
// Still unhandled: try the name-prefix list (alPrefix), collecting characters in strPrefix.
// NOTE(review): all preFix comparison/assignment literals in this region are missing.
if (preFix == )
{
if (alPrefix.Contains(strChar1 + strChar2))
{
i++;
strPrefix = strChar1 + strChar2;
preFix++;
}
else if (alPrefix.Contains(strChar1))
{
if (!number)
{
strPrefix = strChar1;
preFix++;
}
else
{
reText += strChar1 + strLastWords;
number = false;
word = false;
}
}
else
{
if (preFix == )
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = ;
}
else if (preFix > )
{
// strChinese is the regular expression matching a single Chinese character.
if (Regex.IsMatch(strChar1, strChinese))
{
strPrefix += strChar1;
preFix++;
}
else
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = ;
}
}
else
{
reText += strChar1 + strLastWords;
number = false;
word = false;
}
}
}
else
{
if (preFix == )
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = ;
}
else if (preFix > )
{
if (Regex.IsMatch(strChar1, strChinese))
{
strPrefix += strChar1;
preFix++;
}
else
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = ;
}
}
else
{
reText += strChar1 + strLastWords;
number = false;
}
}
#endregion
}
// Remember how far the loop got for the tail handling below.
length = i;
#endregion
// NOTE(review): the #region directive below is fused onto the closing brace and must be moved to its own line.
} #region 最后防止最后一个字的丢失
// Per the region label, this block keeps the final character from being lost.
// NOTE(review): the offset literals in the condition and the two Substring calls are missing.
if (length < strText.Length - )
{
string strLastChar1 = strText.Substring(strText.Length - ).Trim();
string strLastChar2 = strText.Substring(strText.Length - ).Trim(); if (reText.Length > ) strLastChar = reText.Substring(reText.Length - );
if (preFix != )
{
reText += strPrefix + strLastChar1;
}
else
{
// NOTE(review): case labels missing throughout this switch.
switch (GetCharType(strLastChar1))
{
case :
// NOTE(review): both operands below are the same "." in this copy; one was presumably a different
// character (e.g. a full-width dot) before extraction — confirm.
if (strLastChar1 != "." && strLastChar1 != ".")
reText += strLastChar1;
else
reText += Separator + strLastChar1;
break;
case :
case :
if (alWord.Contains(strLastChar2))
reText += strLastChar1;
break;
case :
case :
if ((number || word) && strLastChar != Separator)
reText += Separator + strLastChar1;
else
reText += strLastChar1;
break;
default:
if (strLastChar != Separator)
reText += Separator + strLastChar1;
else
reText += strLastChar1;
break;
}
}
if (reText.Length > ) strLastChar = (reText.Substring(reText.Length - ));
if (strLastChar != this.Separator) reText += this.Separator;
}
// NOTE(review): the "TimeSpan duration" declaration is fused onto the #endregion line below, where it
// is presumably read as trailing directive text rather than code; it needs a line of its own.
#endregion TimeSpan duration = DateTime.Now - start;
m_EventTime = duration.TotalMilliseconds;
// Removes every " $" (a space followed by the sentinel); the space is hard-coded here, not Separator.
return reText.Replace(" $", ""); // If this holds the single trailing character, remove it
} /// <summary>
/// 重载分词过程,支持回车
/// </summary>
/// <param name="strText">Text to segment.</param>
/// <param name="Enter">
/// true to split the text on line feeds and segment each line separately;
/// false to segment the whole text in one pass.
/// </param>
/// <returns>
/// With <paramref name="Enter"/> set, the segmented lines each followed by "\r\n";
/// otherwise the result of the single-argument overload.
/// </returns>
public string SegmentText(string strText, bool Enter)
{
    if (!Enter)
    {
        return SegmentText(strText);
    }

    DateTime start = DateTime.Now;
    string[] strArr = strText.Split('\n');
    System.Text.StringBuilder reText = new System.Text.StringBuilder();
    // Loop start literal restored to 0.
    for (int i = 0; i < strArr.Length; i++)
    {
        // Splitting CRLF text on '\n' leaves a '\r' at the end of each piece; strip it so it is
        // not segmented as an unknown character and then duplicated by the "\r\n" appended here.
        reText.Append(SegmentText(strArr[i].TrimEnd('\r')));
        reText.Append("\r\n");
    }

    // Overwrite the per-line timing with the total for the whole text.
    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
    return reText.ToString();
}
#region 判断字符类型
/// <summary>
/// 判断字符类型,0为未知,1为数字,2为字母,3为汉字,4为汉字数字
/// </summary>
/// <param name="p_Char">Single-character string to classify.</param>
/// <returns>
/// 0 unknown, 1 listed in the number dictionary, 2 listed in the letter dictionary,
/// plus 3 when the character is also a first-level key of the word dictionary
/// (so 3 = dictionary character only, 4 = number and dictionary character,
/// 5 = letter and dictionary character).
/// </returns>
/// <remarks>
/// The constants were missing from the copy this was restored from and have been
/// reinstated from the summary above (0 to 4) and the additive "+=" on the last test.
/// Requires the dictionaries to be loaded (InitWordDics) before it is called.
/// </remarks>
private int GetCharType(string p_Char)
{
    int CharType = 0;
    if (alNumber.Contains(p_Char)) CharType = 1;
    // The letter list wins over the number list when a character appears in both.
    if (alWord.Contains(p_Char)) CharType = 2;
    if (htWords.ContainsKey(p_Char)) CharType += 3;
    return CharType;
}
#endregion
#region 对加载的词典排序并重新写入
/// <summary>
/// 对加载的词典排序并重新写入
/// </summary>
/// <remarks>Shorthand for <c>SortDic(false)</c>: sorts and rewrites without reloading.</remarks>
public void SortDic()
{
    const bool reloadAfterwards = false;
    this.SortDic(reloadAfterwards);
}
/// <summary>
/// 对加载的词典排序并重新写入
/// </summary>
/// <param name="Reload">是否重新加载</param>
/// <remarks>
/// Sorts every SegList in the loaded dictionary (longest tail first) and overwrites the
/// file at <see cref="DicPath"/> with the result, one word per line. A tail equal to the
/// placeholder "null" is written as just the two key characters.
/// </remarks>
/// <exception cref="InvalidOperationException">The dictionary has not been loaded yet.</exception>
public void SortDic(bool Reload)
{
    DateTime start = DateTime.Now;

    // Check before opening the file: the writer truncates it, so failing afterwards
    // (as the unguarded enumeration used to) would wipe the dictionary on disk.
    if (htWords == null)
    {
        throw new InvalidOperationException("The dictionary is not loaded; call InitWordDics before SortDic.");
    }

    // The using block flushes and closes the writer even when writing throws.
    using (StreamWriter sw = new StreamWriter(DicPath, false, System.Text.Encoding.UTF8))
    {
        foreach (DictionaryEntry first in htWords)
        {
            foreach (DictionaryEntry second in (Hashtable)first.Value)
            {
                SegList tails = (SegList)second.Value;
                tails.Sort();
                string head = first.Key.ToString() + second.Key.ToString();
                // Loop start literal restored to 0.
                for (int i = 0; i < tails.Count; i++)
                {
                    string tail = tails.GetElem(i).ToString();
                    if (tail == "null")
                    {
                        sw.WriteLine(head);
                    }
                    else
                    {
                        sw.WriteLine(head + tail);
                    }
                }
            }
        }
    }

    if (Reload)
    {
        InitWordDics();
    }

    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
}
#endregion
/// <summary>
/// 删除完全相同的重复词条,暂时无用!
/// </summary>
/// <returns>相同词条个数</returns>
/// <remarks>
/// Reads the file at <see cref="DicPath"/> up to its first blank line, then rewrites it
/// with each distinct line once, in first-seen order. Anything after the first blank
/// line is not read and is therefore dropped by the rewrite. Prints "ready" to the
/// console between the read and the write.
/// </remarks>
public int Optimize()
{
    int l = 0;
    DateTime start = DateTime.Now;

    Hashtable htOptimize = new Hashtable();
    // Keeps first-seen order; enumerating the Hashtable would write the lines back in arbitrary order.
    ArrayList uniqueLines = new ArrayList();

    using (StreamReader reader = new StreamReader(DicPath, System.Text.Encoding.UTF8))
    {
        string strline = reader.ReadLine();
        while (strline != null && strline.Trim() != "")
        {
            if (!htOptimize.ContainsKey(strline))
            {
                htOptimize.Add(strline, null);
                uniqueLines.Add(strline);
            }
            else
            {
                l++;
            }
            // Advance to the next line. This read was missing, so the loop re-tested
            // the first line forever.
            strline = reader.ReadLine();
        }
    }

    Console.WriteLine("ready");

    using (StreamWriter sw = new StreamWriter(DicPath, false, System.Text.Encoding.UTF8))
    {
        foreach (string line in uniqueLines)
        {
            sw.WriteLine(line);
        }
    }

    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
    return l;
}
#endregion
}
}
[分词] C#SegList分词辅助类,帮助类 (转载)的更多相关文章
- ElasticSearch已经配置好ik分词和mmseg分词(转)
ElasticSearch是一个基于Lucene构建的开源,分布式,RESTful搜索引擎.设计用于云计算中,能够达到实时搜索,稳定,可靠,快速,安装使用方便.支持通过HTTP使用JSON进行数据索引 ...
- 为 Elasticsearch 添加中文分词,对比分词器效果
转自:http://keenwon.com/1404.html 为 Elasticsearch 添加中文分词,对比分词器效果 Posted in 后端 By KeenWon On 2014年12月12 ...
- python中文分词:结巴分词
中文分词是中文文本处理的一个基础性工作,结巴分词利用进行中文分词.其基本实现原理有三点: 基于Trie树结构实现高效的词图扫描,生成句子中汉字所有可能成词情况所构成的有向无环图(DAG) 采用了动态规 ...
- 1.添加索引和类型,同时设定edgengram分词和charsplit分词
1.添加索引和类型,同时设定edgengram分词和charsplit分词 curl -XPUT 'http://127.0.0.1:9200/userindex/' -d '{ "se ...
- 为Elasticsearch添加中文分词,对比分词器效果
http://keenwon.com/1404.html Elasticsearch中,内置了很多分词器(analyzers),例如standard (标准分词器).english(英文分词)和chi ...
- ES 09 - 定制Elasticsearch的分词器 (自定义分词策略)
目录 1 索引的分析 1.1 分析器的组成 1.2 倒排索引的核心原理-normalization 2 ES的默认分词器 3 修改分词器 4 定制分词器 4.1 向索引中添加自定义的分词器 4.2 测 ...
- Elasticsearch拼音分词和IK分词的安装及使用
一.Es插件配置及下载 1.IK分词器的下载安装 关于IK分词器的介绍不再多说,一言以蔽之,IK分词是目前使用非常广泛分词效果比较好的中文分词器.做ES开发的,中文分词十有八九使用的都是IK分词器. ...
- 和我一起打造个简单搜索之IK分词以及拼音分词
elasticsearch 官方默认的分词插件,对中文分词效果不理想,它是把中文词语分成了一个一个的汉字.所以我们引入 es 插件 es-ik.同时为了提升用户体验,引入 es-pinyin 插件.本 ...
- 盘古分词+一元/二元分词Lucene
本文参考自:https://blog.csdn.net/mss359681091/article/details/52078147 http://www.cnblogs.com/top5/archiv ...
随机推荐
- Learning WCF Chapter1 Exposing Multiple Service Endpoints
So far in this chapter,I have shown you different ways to create services,how to expose a service en ...
- Java NIO原理及实例
Java NIO是在jdk1.4开始使用的,它既可以说成“新I/O”,也可以说成非阻塞式I/O.下面是java NIO的工作原理: 1. 由一个专门的线程来处理所有的 IO 事件,并负责分发. 2. ...
- UITextView 动态高度计算(iOS7版)
NSDictionary *attrsDictionary = [NSDictionarydictionaryWithObject:[UIFontsystemFontOfSize:kCellConte ...
- How to Create a SharePoint 2010 Project Without SharePoint Server
转:http://community.bamboosolutions.com/blogs/sharepoint-2010/archive/2012/06/21/create-a-sharepoint- ...
- android 中对apache httpclient及httpurlconnection的选择
在官方blog中,android工程师谈到了如何去选择apache client和httpurlconnection的问题: 原文见http://android-developers.blogspot ...
- 统计难题 HDOJ--2222
Keywords Search Time Limit: 2000/1000 MS (Java/Others) Memory Limit: 65536/32768 K (Java/Others)T ...
- dive into python 笔记(七)locals与globals 字典格式化字符串 字符集
locals()与globals(): """ locals:局部命名空间 globals:全局命名空间 都是以dictionary的形式保存的,变量名是键,变量值是值 ...
- 笔记本CPU的型号和类型的区分方法
笔记本CPU的型号和类型的区分方法: 1.所有笔记本CPU型号后面默认为M,代表移动版. 2.如果M变为H,则代表高性能版本,时钟频率更高,性能强,但功耗更大一点,如I7 4500H. 3.如果M变为 ...
- hdoj 2816 I Love You Too
I Love You Too Time Limit: 2000/1000 MS (Java/Others) Memory Limit: 32768/32768 K (Java/Others)To ...
- radix树
今天在学Linux内核页高速缓存时,学到了一种新的数据结构radix树(基数),经过分析,感觉此数据结构有点鸡肋,有可能是我理解不到位吧. 先来张图,给大家以直观的印象 当有一个key-value型的 ...