点击下载 SegList.rar

主要功能如下
最新的SegList分词辅助类,帮助类
看下面代码吧

  1. /// <summary>
  2. /// 类说明:SegList
  3. /// 编 码 人:苏飞
  4. /// 联系方式:361983679
  5. /// 更新网站:http://www.sufeinet.com/thread-655-1-1.html
  6. /// </summary>
  7. using System;
  8. using System.Collections;
  9. using System.IO;
  10. using System.Text.RegularExpressions;
  11.  
  12. namespace DotNet.Utilities
  13. {
  /// <summary>
  /// Word-segmentation helper list: a thin wrapper around an ArrayList that
  /// remembers the length of the longest entry added and can order its
  /// entries longest-first.
  /// </summary>
  public class SegList
  {
      /// <summary>
      /// Length of the longest entry (measured on its ToString() form) passed to
      /// Add so far. SetElem does not update it.
      /// </summary>
      public int MaxLength;
      private ArrayList m_seg;

      /// <summary>Number of entries currently stored.</summary>
      public int Count
      {
          get { return m_seg.Count; }
      }

      /// <summary>Creates an empty list with MaxLength 0.</summary>
      public SegList()
      {
          m_seg = new ArrayList();
          MaxLength = 0;
      }

      /// <summary>
      /// Appends an entry and raises MaxLength if the entry's string form is longer.
      /// </summary>
      /// <exception cref="ArgumentNullException">obj is null.</exception>
      public void Add(object obj)
      {
          // Reject null before mutating the list; previously the entry was stored
          // and only then did obj.ToString() fail, leaving a null inside the list.
          if (obj == null) throw new ArgumentNullException("obj");

          m_seg.Add(obj);
          int len = obj.ToString().Length;
          if (MaxLength < len)
          {
              MaxLength = len;
          }
      }

      /// <summary>
      /// Returns the entry at index i, or null when i is outside the list
      /// (negative indexes included).
      /// </summary>
      public object GetElem(int i)
      {
          if (i >= 0 && i < this.Count)
              return m_seg[i];
          return null;
      }

      /// <summary>Replaces the entry at index i.</summary>
      public void SetElem(int i, object obj)
      {
          m_seg[i] = obj;
      }

      /// <summary>Returns true when the list contains obj.</summary>
      public bool Contains(object obj)
      {
          return m_seg.Contains(obj);
      }

      /// <summary>
      /// Sorts this list by entry length, longest first.
      /// </summary>
      public void Sort()
      {
          Sort(this);
      }

      /// <summary>
      /// Sorts the given list by entry length, longest first, using an in-place
      /// selection sort. The marker entry "null" counts as length 0, so it sinks
      /// to the end.
      /// </summary>
      /// <exception cref="ArgumentNullException">list is null.</exception>
      public void Sort(SegList list)
      {
          if (list == null) throw new ArgumentNullException("list");

          for (int i = 0; i < list.Count - 1; ++i)
          {
              // Find the longest remaining entry in [i, Count).
              int max = i;
              for (int j = i + 1; j < list.Count; ++j)
              {
                  if (SortLength(list.GetElem(j)) > SortLength(list.GetElem(max)))
                      max = j;
              }

              // Swap it into position i.
              object o = list.GetElem(max);
              list.SetElem(max, list.GetElem(i));
              list.SetElem(i, o);
          }
      }

      /// <summary>
      /// Length used for ordering: 0 for the "null" marker, otherwise the length
      /// of the entry's string form.
      /// </summary>
      private static int SortLength(object entry)
      {
          string s = entry.ToString();
          return s == "null" ? 0 : s.Length;
      }
  }
  106.  
  107. /// <summary>
  108. /// 分词类
  109. /// </summary>
  110. //----------------调用----------------------
  111. //Segment seg = new Segment();
  112. //seg.InitWordDics();
  113. //seg.EnablePrefix = true;
  114. //seg.Separator =" ";
  115. //seg.SegmentText("字符串", false).Trim();
  116. //-------------------------------------------
  117. public class Segment
  118. {
  #region Private fields
  // Default dictionary locations, resolved against the current web request.
  // These initializers dereference HttpContext.Current, so constructing a
  // Segment requires an active HttpContext.
  private string m_DicPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sDict.dic");
  private string m_NoisePath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sNoise.dic");
  private string m_NumberPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sNumber.dic");
  private string m_WordPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sWord.dic");
  private string m_PrefixPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sPrefix.dic");

  // Two-level dictionary index: first character -> (second character -> SegList of suffixes).
  private Hashtable htWords;
  private ArrayList alNoise;
  private ArrayList alNumber;
  private ArrayList alWord;
  private ArrayList alPrefix;

  // Duration of the last load/segment action, in milliseconds.
  private double m_EventTime = 0;

  /// <summary>
  /// Separator inserted between segmented words.
  /// </summary>
  private string m_Separator = " ";

  /// <summary>
  /// Regular expression used to recognise a Chinese character.
  /// </summary>
  private string strChinese = "[\u4e00-\u9fa5]";
  #endregion
  142.  
  143. #region 公有属性
  /// <summary>
  /// Path of the base dictionary file.
  /// </summary>
  public string DicPath
  {
      get { return m_DicPath; }
      set { m_DicPath = value; }
  }
  158.  
  /// <summary>
  /// Stores a value in the ASP.NET application state under the given key.
  /// A null value is stored as the string " ".
  /// </summary>
  /// <param name="key">Cache key.</param>
  /// <param name="val">Value to cache.</param>
  private static void SetCache(string key, object val)
  {
      if (val == null) val = " ";

      System.Web.HttpApplicationState app = System.Web.HttpContext.Current.Application;
      app.Lock();
      try
      {
          app.Set(key, val);
      }
      finally
      {
          // Always release the application lock, even if Set throws.
          app.UnLock();
      }
  }
  171.  
  /// <summary>
  /// Reads the value stored under the given key in the ASP.NET application state.
  /// </summary>
  private static object GetCache(string key)
  {
      object cached = System.Web.HttpContext.Current.Application.Get(key);
      return cached;
  }
  179.  
  /// <summary>
  /// Path of the noise-word dictionary (marked "currently unused" by the author).
  /// </summary>
  public string NoisePath
  {
      get { return m_NoisePath; }
      set { m_NoisePath = value; }
  }
  194.  
  /// <summary>
  /// Path of the number dictionary.
  /// </summary>
  public string NumberPath
  {
      get { return m_NumberPath; }
      set { m_NumberPath = value; }
  }
  209.  
  /// <summary>
  /// Path of the letter dictionary.
  /// </summary>
  public string WordPath
  {
      get { return m_WordPath; }
      set { m_WordPath = value; }
  }
  224.  
  /// <summary>
  /// Path of the name-prefix dictionary, used to correct personal names.
  /// </summary>
  public string PrefixPath
  {
      get { return m_PrefixPath; }
      set { m_PrefixPath = value; }
  }
  239.  
  /// <summary>
  /// Whether personal-name correction is enabled.
  /// Reading returns true when the prefix list is loaded and non-empty.
  /// Setting true reloads the prefix list from PrefixPath; setting false
  /// replaces it with an empty list.
  /// </summary>
  public bool EnablePrefix
  {
      get
      {
          // alPrefix stays null until InitWordDics or this setter has run;
          // report "disabled" instead of throwing in that state.
          return alPrefix != null && alPrefix.Count != 0;
      }
      set
      {
          if (value)
              alPrefix = LoadWords(PrefixPath, alPrefix);
          else
              alPrefix = new ArrayList();
      }
  }
  260.  
  /// <summary>
  /// Time taken by the most recent load or segmentation action, in milliseconds.
  /// May be 0 when segmenting short strings.
  /// </summary>
  public double EventTime
  {
      get { return m_EventTime; }
  }
  272.  
  /// <summary>
  /// Separator placed between words; a space by default.
  /// Assigning null or an empty string is ignored.
  /// </summary>
  public string Separator
  {
      get { return m_Separator; }
      set
      {
          if (string.IsNullOrEmpty(value))
          {
              return;
          }
          m_Separator = value;
      }
  }
  287. #endregion
  288.  
  289. #region 构造方法
  /// <summary>
  /// Creates a segmenter that uses the default dictionary paths.
  /// No dictionaries are loaded; call InitWordDics before segmenting.
  /// </summary>
  public Segment()
  { }

  /// <summary>
  /// Creates a segmenter with explicit dictionary paths and loads the
  /// dictionaries immediately via InitWordDics.
  /// </summary>
  /// <param name="p_DicPath">Path of the base dictionary.</param>
  /// <param name="p_NoisePath">Path of the noise-word dictionary.</param>
  /// <param name="p_NumberPath">Path of the number dictionary.</param>
  /// <param name="p_WordPath">Path of the letter dictionary.</param>
  public Segment(string p_DicPath, string p_NoisePath, string p_NumberPath, string p_WordPath)
  {
      // Each argument goes to its own field; previously all four were written
      // to m_WordPath, so only the last one took effect.
      m_DicPath = p_DicPath;
      m_NoisePath = p_NoisePath;
      m_NumberPath = p_NumberPath;
      m_WordPath = p_WordPath;
      this.InitWordDics();
  }
  307. #endregion
  308.  
  309. #region 公有方法
  /// <summary>
  /// Loads the word lists.
  /// The base dictionary is parsed into a two-level index (first character ->
  /// second character -> SegList of remaining suffixes) and cached in the
  /// application state under "jcms_dict"; later calls reuse the cached index.
  /// The noise, number, letter and prefix lists are reloaded on every call.
  /// The elapsed time is stored in EventTime.
  /// </summary>
  public void InitWordDics()
  {
      DateTime start = DateTime.Now;
      if (GetCache("jcms_dict") == null)
      {
          Hashtable words = new Hashtable();

          // "using" guarantees the reader is closed even if parsing throws.
          using (StreamReader reader = new StreamReader(DicPath, System.Text.Encoding.UTF8))
          {
              string strline = reader.ReadLine();

              // Reading stops at end of file or at the first blank line.
              while (strline != null && strline.Trim() != "")
              {
                  // Two characters are needed to form both index keys; shorter
                  // lines are skipped instead of throwing in Substring.
                  if (strline.Length >= 2)
                  {
                      string strChar1 = strline.Substring(0, 1);
                      string strChar2 = strline.Substring(1, 1);

                      Hashtable father = (Hashtable)words[strChar1];
                      if (father == null)
                      {
                          father = new Hashtable();
                          words.Add(strChar1, father);
                      }

                      SegList list = (SegList)father[strChar2];
                      if (list == null)
                      {
                          list = new SegList();
                          father.Add(strChar2, list);
                      }

                      // A two-character word has no suffix; it is recorded with
                      // the marker string "null".
                      if (strline.Length > 2)
                          list.Add(strline.Substring(2));
                      else
                          list.Add("null");
                  }
                  strline = reader.ReadLine();
              }
          }

          htWords = words;
          SetCache("jcms_dict", htWords);
      }
      htWords = (Hashtable)GetCache("jcms_dict");

      alNoise = LoadWords(NoisePath, alNoise);
      alNumber = LoadWords(NumberPath, alNumber);
      alWord = LoadWords(WordPath, alWord);
      alPrefix = LoadWords(PrefixPath, alPrefix);

      TimeSpan duration = DateTime.Now - start;
      m_EventTime = duration.TotalMilliseconds;
  }
  390.  
  /// <summary>
  /// Reads a UTF-8 text file into a new ArrayList, one entry per line.
  /// </summary>
  /// <param name="strPath">Path of the file to read.</param>
  /// <param name="list">Ignored; a new list is always created. Kept for
  /// compatibility with existing callers.</param>
  /// <returns>The lines of the file, in file order.</returns>
  public ArrayList LoadWords(string strPath, ArrayList list)
  {
      list = new ArrayList();

      // "using" closes the reader on every path, including when ReadLine throws.
      using (StreamReader reader = new StreamReader(strPath, System.Text.Encoding.UTF8))
      {
          string strline;
          while ((strline = reader.ReadLine()) != null)
          {
              list.Add(strline);
          }
      }
      return list;
  }
  412.  
  /// <summary>
  /// Writes every dictionary word to the console, one per line, as
  /// first character + second character + suffix entry.
  /// Does nothing when the dictionary has not been loaded.
  /// </summary>
  public void OutWords()
  {
      if (htWords == null) return;

      foreach (DictionaryEntry first in htWords)
      {
          foreach (DictionaryEntry second in (Hashtable)first.Value)
          {
              SegList suffixes = (SegList)second.Value;
              for (int i = 0; i < suffixes.Count; i++)
              {
                  Console.WriteLine(first.Key.ToString() + second.Key.ToString() + suffixes.GetElem(i).ToString());
              }
          }
      }
  }
  432.  
  /// <summary>
  /// Writes every entry of the list to the console, one per line.
  /// A null list is ignored.
  /// </summary>
  public void OutArrayList(ArrayList list)
  {
      if (list == null) return;
      for (int i = 0; i < list.Count; i++)
      {
          Console.WriteLine(list[i].ToString());
      }
  }
  444.  
  445. /// <summary>
  446. /// Segments one line of text; line breaks are not supported.
  /// A "$" sentinel is appended to the input before scanning, and " $" is removed
  /// from the result on the normal return path. The two early returns hand back
  /// the trimmed input with the sentinel still attached.
  /// The elapsed time of a full pass is stored in m_EventTime.
  /// NOTE(review): numeric literals are missing throughout this listing
  /// (e.g. "int length = ;", "case :"); they look stripped by the page export.
  /// Restore them from the original source before compiling.
  447. /// </summary>
  448. /// <param name="strText">Text to segment.</param>
  449. /// <returns>The segmented text, with words separated by Separator.</returns>
  450. public string SegmentText(string strText)
  451. {
  // Append the end-of-text sentinel, then trim surrounding whitespace.
  452. strText = (strText + "$").Trim();
  // Without a loaded dictionary the text is returned unsegmented.
  453. if (htWords == null) return strText;
  454. if (strText.Length < ) return strText;
  455. DateTime start = DateTime.Now;
  // length: last index handled by the main loop.
  456. int length = ;
  // preFix: counter for the name-prefix state; strPrefix holds the pending name text.
  457. int preFix = ;
  // word / number: whether the previous character was classified as a letter / a number.
  458. bool word = false;
  459. bool number = false;
  // reText: the output built so far.
  460. string reText = "";
  461. string strPrefix = "";
  462. string strLastChar = "";
  463. string strLastWords = Separator;
  464.  
  465. for (int i = ; i < strText.Length - ; i++)
  466. {
  467. #region 对于每一个字的处理过程
  // Region: processing of each character (strChar1) with its follower (strChar2).
  468. string strChar1 = strText.Substring(i, );
  469. string strChar2 = strText.Substring(i + , ).Trim();
  // yes: set once the current character has been emitted/handled this iteration.
  470. bool yes;
  471. SegList l;
  472. Hashtable h;
  473.  
  474. if (reText.Length > ) strLastChar = reText.Substring(reText.Length - );
  475.  
  // A space in the input closes a pending number/letter run.
  476. if (strChar1 == " ")
  477. {
  478. if ((number || word) && strLastChar != Separator) reText += this.Separator;
  479. yes = true;
  480. }
  481. else
  482. yes = false;
  483.  
  484. int CharType = GetCharType(strChar1);
  485. switch (CharType)
  486. {
  487. case :
  488. #region 如果是数字,如果数字的上一位是字母要和后面的数字分开
  // Region: digit; if the previous character was a letter, separate it from the digits that follow.
  489. if (word)
  490. {
  491. reText += Separator;
  492. }
  493. word = false;
  494. number = true;
  495. strLastWords = "";
  496. break;
  497. #endregion
  498. case :
  499. case :
  500. #region 如果是字母
  // Region: letter.
  501. if (number)
  502. strLastWords = Separator;
  503. else
  504. strLastWords = "";
  505.  
  506. word = true;
  507. number = false;
  508. break;
  509. #endregion
  510. case :
  511. case :
  512. #region 第一级哈希表是否包含关键字,假如包含处理第二级哈希表
  // Region: the first-level hashtable contains this character; go on to the second level.
  513. // Was the previous character a letter?
  514. if (word) reText += Separator;
  515.  
  516. #region 检测上一个是否是数字,这个过程是用于修正数字后的量词的
  // Region: if the previous character was a number, fix up a measure word
  // following it, using the dictionary entries stored under the key "n".
  517. if (number && CharType != )
  518. {
  519. h = (Hashtable)htWords["n"];
  520. if (h.ContainsKey(strChar1))
  521. {
  522. l = (SegList)h[strChar1];
  523. if (l.Contains(strChar2))
  524. {
  525. reText += strChar1 + strChar2 + Separator;
  526. yes = true;
  527. i++;
  528. }
  529. else if (l.Contains("null"))
  530. {
  531. reText += strChar1 + Separator;
  532. yes = true;
  533. }
  534. }
  535. else
  536. reText += Separator;
  537. }
  538. #endregion
  539.  
  540. // A Chinese character that is not a Chinese numeral
  541. if (CharType == )
  542. {
  543. word = false;
  544. number = false;
  545. strLastWords = Separator;
  546. }
  547. else
  548. {
  549. word = false;
  550. number = true;
  551. strLastWords = "";
  552. }
  553.  
  554. // Fetch the second-level hashtable
  555. h = (Hashtable)htWords[strChar1];
  556.  
  557. // Does the second-level hashtable contain the key?
  558. if (h.ContainsKey(strChar2))
  559. {
  560. #region 第二级包含关键字
  // Region: the second level contains the key.
  561. // Fetch the list object (a SegList of candidate suffixes)
  562. l = (SegList)h[strChar2];
  563.  
  564. // Walk every entry to see whether it completes a word
  565. for (int j = ; j < l.Count; j++)
  566. {
  567. bool have = false;
  568. string strChar3 = l.GetElem(j).ToString();
  569.  
  570. // Check each candidate for a match, with a length guard
  571. if ((strChar3.Length + i + ) < strText.Length)
  572. {
  573. // Take the next strChar3.Length characters after the two key characters
  574. string strChar = strText.Substring(i + , strChar3.Length).Trim();
  575. if (strChar3 == strChar && !yes)
  576. {
  // Flush any pending name prefix before emitting the matched word.
  577. if (strPrefix != "")
  578. {
  579. reText += strPrefix + Separator;
  580. strPrefix = "";
  581. preFix = ;
  582. }
  583. reText += strChar1 + strChar2 + strChar;
  584. i += strChar3.Length + ;
  585. have = true;
  586. yes = true;
  587. break;
  588. }
  589. }
  // Same check for a candidate that would end exactly at the end of the text.
  590. else if ((strChar3.Length + i + ) == strText.Length)
  591. {
  592. string strChar = strText.Substring(i + ).Trim();
  593. if (strChar3 == strChar && !yes)
  594. {
  595. if (strPrefix != "")
  596. {
  597. reText += strPrefix + Separator;
  598. strPrefix = "";
  599. preFix = ;
  600. }
  601. reText += strChar1 + strChar2 + strChar;
  602. i += strChar3.Length + ;
  603. have = true;
  604. yes = true;
  605. break;
  606. }
  607. }
  608.  
  // Last candidate reached without a match: if the list holds the "null"
  // marker, the two characters are emitted as a word on their own.
  609. if (!have && j == l.Count - && l.Contains("null") && !yes)
  610. {
  611. if (preFix == )
  612. {
  613. reText += strPrefix + strChar1 + strChar2;
  614. strPrefix = "";
  615. preFix = ;
  616. }
  617. else if (preFix > )
  618. {
  619. reText += strPrefix + strLastWords + strChar1 + strChar2;
  620. strPrefix = "";
  621. preFix = ;
  622. }
  623. else
  624. {
  // NOTE(review): both branches of this if/else append the same text.
  625. if (CharType == ) reText += strChar1 + strChar2;
  626. else reText += strChar1 + strChar2;
  627. strLastWords = this.Separator;
  628. number = false;
  629. }
  630. i++;
  631. yes = true;
  632. break;
  633. }
  634. else if (have)
  635. {
  636. break;
  637. }
  638. }
  639. #endregion
  640.  
  641. // If nothing matched there is one more case: the word is just these two characters and no longer word starts with them
  642. if (!yes && l.Contains("null"))
  643. {
  644. if (preFix == )
  645. {
  646. reText += strPrefix + strChar1 + strChar2;
  647. strPrefix = "";
  648. preFix = ;
  649. }
  650. else if (preFix > )
  651. {
  652. reText += strPrefix + strLastWords + strChar1 + strChar2;
  653. strPrefix = "";
  654. preFix = ;
  655. }
  656. else
  657. {
  // NOTE(review): both branches of this if/else append the same text.
  658. if (CharType == ) reText += strChar1 + strChar2;
  659. else reText += strChar1 + strChar2;
  660. strLastWords = this.Separator;
  661. number = false;
  662. }
  663. i++;
  664. yes = true;
  665. }
  // Re-read the last emitted character, then either keep a number run open
  // or close the word with a separator.
  666. if (reText.Length > ) strLastChar = reText.Substring(reText.Length - );
  667. if (CharType == && GetCharType(strLastChar) == )
  668. {
  669. number = true;
  670. }
  671. else if (strLastChar != this.Separator) reText += this.Separator;
  672. }
  673. #endregion
  674. break;
  675. default:
  676. #region 未知字符,可能是生僻字,也可能是标点符合之类
  // Region: unknown character; possibly a rare character or punctuation.
  677. if (word && !yes)
  678. {
  679. reText += Separator;
  680. }
  681. else if (number && !yes)
  682. {
  683. reText += Separator;
  684. }
  685. number = false;
  686. word = false;
  687. strLastWords = this.Separator;
  688. break;
  689. #endregion
  690. }
  // Inside a number or letter run the character is appended without a separator.
  691. if (!yes && number || !yes && word)
  692. {
  693. reText += strChar1;
  694. yes = true;
  695. }
  696. if (!yes)
  697. {
  698. #region 处理姓名问题
  // Region: personal-name handling, driven by the prefix list alPrefix.
  699. if (preFix == )
  700. {
  701. if (alPrefix.Contains(strChar1 + strChar2))
  702. {
  703. i++;
  704. strPrefix = strChar1 + strChar2;
  705. preFix++;
  706. }
  707. else if (alPrefix.Contains(strChar1))
  708. {
  709. if (!number)
  710. {
  711. strPrefix = strChar1;
  712. preFix++;
  713. }
  714. else
  715. {
  716. reText += strChar1 + strLastWords;
  717. number = false;
  718. word = false;
  719. }
  720. }
  721. else
  722. {
  723. if (preFix == )
  724. {
  725. reText += strPrefix + Separator + strChar1 + Separator;
  726. strPrefix = "";
  727. preFix = ;
  728. }
  729. else if (preFix > )
  730. {
  // Chinese characters keep extending the pending name; anything else flushes it.
  731. if (Regex.IsMatch(strChar1, strChinese))
  732. {
  733. strPrefix += strChar1;
  734. preFix++;
  735. }
  736. else
  737. {
  738. reText += strPrefix + Separator + strChar1 + Separator;
  739. strPrefix = "";
  740. preFix = ;
  741. }
  742. }
  743. else
  744. {
  745. reText += strChar1 + strLastWords;
  746. number = false;
  747. word = false;
  748. }
  749. }
  750. }
  751. else
  752. {
  753. if (preFix == )
  754. {
  755. reText += strPrefix + Separator + strChar1 + Separator;
  756. strPrefix = "";
  757. preFix = ;
  758. }
  759. else if (preFix > )
  760. {
  761. if (Regex.IsMatch(strChar1, strChinese))
  762. {
  763. strPrefix += strChar1;
  764. preFix++;
  765. }
  766. else
  767. {
  768. reText += strPrefix + Separator + strChar1 + Separator;
  769. strPrefix = "";
  770. preFix = ;
  771. }
  772. }
  773. else
  774. {
  775. reText += strChar1 + strLastWords;
  776. number = false;
  777. }
  778. }
  779. #endregion
  780. }
  781. length = i;
  782. #endregion
  783. }
  784.  
  785. #region 最后防止最后一个字的丢失
  // Region: make sure the final character is not lost.
  786. if (length < strText.Length - )
  787. {
  788. string strLastChar1 = strText.Substring(strText.Length - ).Trim();
  789. string strLastChar2 = strText.Substring(strText.Length - ).Trim();
  790.  
  791. if (reText.Length > ) strLastChar = reText.Substring(reText.Length - );
  792. if (preFix != )
  793. {
  794. reText += strPrefix + strLastChar1;
  795. }
  796. else
  797. {
  798. switch (GetCharType(strLastChar1))
  799. {
  800. case :
  // NOTE(review): both sides of this && compare against the same "." as listed;
  // one may originally have been a different character. Confirm against the original source.
  801. if (strLastChar1 != "." && strLastChar1 != ".")
  802. reText += strLastChar1;
  803. else
  804. reText += Separator + strLastChar1;
  805. break;
  806. case :
  807. case :
  808. if (alWord.Contains(strLastChar2))
  809. reText += strLastChar1;
  810. break;
  811. case :
  812. case :
  813. if ((number || word) && strLastChar != Separator)
  814. reText += Separator + strLastChar1;
  815. else
  816. reText += strLastChar1;
  817. break;
  818. default:
  819. if (strLastChar != Separator)
  820. reText += Separator + strLastChar1;
  821. else
  822. reText += strLastChar1;
  823. break;
  824. }
  825. }
  // Ensure the result ends with a separator.
  826. if (reText.Length > ) strLastChar = (reText.Substring(reText.Length - ));
  827. if (strLastChar != this.Separator) reText += this.Separator;
  828. }
  829. #endregion
  830.  
  831. TimeSpan duration = DateTime.Now - start;
  832. m_EventTime = duration.TotalMilliseconds;
  833. return reText.Replace(" $", ""); // remove the " $" left by the sentinel appended at the top
  834. }
  835.  
  /// <summary>
  /// Segmentation overload with optional multi-line support.
  /// </summary>
  /// <param name="strText">Text to segment.</param>
  /// <param name="Enter">When true, the text is split on '\n', each line is
  /// segmented separately and "\r\n" is appended after each one; when false
  /// the text is passed straight to SegmentText(string).</param>
  /// <returns>The segmented text.</returns>
  public string SegmentText(string strText, bool Enter)
  {
      if (!Enter)
      {
          return SegmentText(strText);
      }

      DateTime start = DateTime.Now;
      string[] strArr = strText.Split('\n');

      // StringBuilder avoids re-copying the accumulated output for every line.
      System.Text.StringBuilder reText = new System.Text.StringBuilder();
      for (int i = 0; i < strArr.Length; i++)
      {
          reText.Append(SegmentText(strArr[i])).Append("\r\n");
      }

      // Report the duration of the whole multi-line run, not just the last line.
      TimeSpan duration = DateTime.Now - start;
      m_EventTime = duration.TotalMilliseconds;
      return reText.ToString();
  }
  861.  
  862. #region 判断字符类型
  /// <summary>
  /// Classifies a single character: 0 = unknown, 1 = digit, 2 = letter,
  /// 3 = Chinese character, 4 = Chinese numeral.
  /// A character found in the base dictionary adds 3 to its digit/letter
  /// code, so a letter that is also a dictionary key yields 5.
  /// </summary>
  /// <param name="p_Char">The character to classify, as a one-character string.</param>
  private int GetCharType(string p_Char)
  {
      int CharType = 0;
      if (alNumber.Contains(p_Char)) CharType = 1;
      // The letter check runs second, so it wins if a character is in both lists.
      if (alWord.Contains(p_Char)) CharType = 2;
      if (htWords.ContainsKey(p_Char)) CharType += 3;
      return CharType;
  }
  874. #endregion
  875.  
  876. #region 对加载的词典排序并重新写入
  /// <summary>
  /// Sorts the loaded dictionary and writes it back to DicPath, without reloading.
  /// </summary>
  public void SortDic()
  {
      SortDic(false);
  }

  /// <summary>
  /// Sorts every suffix list of the loaded dictionary longest-first and
  /// overwrites the file at DicPath with the result.
  /// The elapsed time is stored in EventTime.
  /// </summary>
  /// <param name="Reload">When true, InitWordDics is called after writing.</param>
  /// <exception cref="InvalidOperationException">The dictionary has not been loaded.</exception>
  public void SortDic(bool Reload)
  {
      // Check before opening the writer: opening truncates the dictionary file,
      // so failing afterwards would leave it empty.
      if (htWords == null)
          throw new InvalidOperationException("Dictionary not loaded; call InitWordDics first.");

      DateTime start = DateTime.Now;

      // "using" flushes and closes the writer even if enumeration throws.
      using (StreamWriter sw = new StreamWriter(DicPath, false, System.Text.Encoding.UTF8))
      {
          foreach (DictionaryEntry first in htWords)
          {
              foreach (DictionaryEntry second in (Hashtable)first.Value)
              {
                  SegList suffixes = (SegList)second.Value;
                  suffixes.Sort();

                  string head = first.Key.ToString() + second.Key.ToString();
                  for (int i = 0; i < suffixes.Count; i++)
                  {
                      string tail = suffixes.GetElem(i).ToString();
                      // The "null" marker stands for the bare two-character word.
                      if (tail == "null")
                          sw.WriteLine(head);
                      else
                          sw.WriteLine(head + tail);
                  }
              }
          }
      }

      if (Reload) InitWordDics();

      TimeSpan duration = DateTime.Now - start;
      m_EventTime = duration.TotalMilliseconds;
  }
  918. #endregion
  919.  
  /// <summary>
  /// Removes exact duplicate lines from the dictionary file at DicPath and
  /// rewrites it (marked "currently unused" by the author).
  /// Reading stops at end of file or at the first blank line, so anything
  /// after a blank line is not written back.
  /// The elapsed time is stored in EventTime.
  /// </summary>
  /// <returns>The number of duplicate lines found.</returns>
  public int Optimize()
  {
      int l = 0;
      DateTime start = DateTime.Now;

      // htOptimize answers "seen before?"; ordered keeps first-seen file order,
      // so the rewritten file is not shuffled by hashtable enumeration.
      Hashtable htOptimize = new Hashtable();
      ArrayList ordered = new ArrayList();

      using (StreamReader reader = new StreamReader(DicPath, System.Text.Encoding.UTF8))
      {
          string strline = reader.ReadLine();
          while (strline != null && strline.Trim() != "")
          {
              if (!htOptimize.ContainsKey(strline))
              {
                  htOptimize.Add(strline, null);
                  ordered.Add(strline);
              }
              else
              {
                  l++;
              }
              // Advance to the next line; the loop previously never read again
              // and spun forever on the first line.
              strline = reader.ReadLine();
          }
      }
      Console.WriteLine("ready");

      using (StreamWriter sw = new StreamWriter(DicPath, false, System.Text.Encoding.UTF8))
      {
          foreach (string line in ordered)
          {
              sw.WriteLine(line);
          }
      }

      TimeSpan duration = DateTime.Now - start;
      m_EventTime = duration.TotalMilliseconds;
      return l;
  }
  958. #endregion
  959. }
  960. }

[分词] C#SegList分词辅助类,帮助类 (转载)的更多相关文章

  1. ElasticSearch已经配置好ik分词和mmseg分词(转)

    ElasticSearch是一个基于Lucene构建的开源,分布式,RESTful搜索引擎.设计用于云计算中,能够达到实时搜索,稳定,可靠,快速,安装使用方便.支持通过HTTP使用JSON进行数据索引 ...

  2. 为 Elasticsearch 添加中文分词,对比分词器效果

    转自:http://keenwon.com/1404.html 为 Elasticsearch 添加中文分词,对比分词器效果 Posted in 后端 By KeenWon On 2014年12月12 ...

  3. python中文分词:结巴分词

    中文分词是中文文本处理的一个基础性工作,结巴分词利用进行中文分词.其基本实现原理有三点: 基于Trie树结构实现高效的词图扫描,生成句子中汉字所有可能成词情况所构成的有向无环图(DAG) 采用了动态规 ...

  4. .添加索引和类型,同时设定edgengram分词和charsplit分词

    1.添加索引和类型,同时设定edgengram分词和charsplit分词 curl -XPUT 'http://127.0.0.1:9200/userindex/' -d '{   "se ...

  5. 为Elasticsearch添加中文分词,对比分词器效果

    http://keenwon.com/1404.html Elasticsearch中,内置了很多分词器(analyzers),例如standard (标准分词器).english(英文分词)和chi ...

  6. ES 09 - 定制Elasticsearch的分词器 (自定义分词策略)

    目录 1 索引的分析 1.1 分析器的组成 1.2 倒排索引的核心原理-normalization 2 ES的默认分词器 3 修改分词器 4 定制分词器 4.1 向索引中添加自定义的分词器 4.2 测 ...

  7. Elasticsearch拼音分词和IK分词的安装及使用

    一.Es插件配置及下载 1.IK分词器的下载安装 关于IK分词器的介绍不再多少,一言以蔽之,IK分词是目前使用非常广泛分词效果比较好的中文分词器.做ES开发的,中文分词十有八九使用的都是IK分词器. ...

  8. 和我一起打造个简单搜索之IK分词以及拼音分词

    elasticsearch 官方默认的分词插件,对中文分词效果不理想,它是把中文词语分成了一个一个的汉字.所以我们引入 es 插件 es-ik.同时为了提升用户体验,引入 es-pinyin 插件.本 ...

  9. 盘古分词+一元/二元分词Lucene

    本文参考自:https://blog.csdn.net/mss359681091/article/details/52078147 http://www.cnblogs.com/top5/archiv ...

随机推荐

  1. 尚未在 Web 服务器上注册 ASP.NET 4.0” 的解决办法

    http://www.sowsoy.com/topics-537.html win7,vs2010创建.NetFramework 4框架下的Asp.Net空网站.系统提示 “尚未在 Web 服务器上注 ...

  2. 如何在Azure环境里做好信息传递可扩展性经验分享

    作者 王枫 发布于2014年5月15日 综述 本文介绍建立一个在Azure上使用Azure服务总线, 高吞吐量短信平台的必要步骤.在这篇文章中提出的解决方案是在响应由客户的具体要求,建立一个基于Win ...

  3. CPU acceleration status:HAXM must be updated(version 1.1.1<6.0.1)

    终于上手as了,感觉很爽 但是感觉也特闹心啊 还好有stackoverflow(这特么才是一个神奇的网站好吗) 废话少说 记录一下: 前面历经的磨难暂时不说了,就这个CPU acceleration ...

  4. Delphi 预编译指令 的用法

    A.3 使用条件编译指令条件编译指令是非常重要的编译指令,他控制着在不同条件下(例如,不同的操作系统)产生不同的代码.条件编译指令是包含在注释括号之内的,如下表所示.                 ...

  5. Unity3d BTDF实时折射模拟有粗糙度的半透明物体

    折射的原理是运用BTDF的一个球形高斯近似 需要考虑折射光的来源,一般会想到用环境贴图(IBL)或者grab texture,但是折射光不全都来自一个平面,所以选择环境贴图来作为折射光.这个效果主要是 ...

  6. 关于 Unity NavMesh 数据的访问

    目前的工作需要加入自动寻路,后来决定使用 unity 自带的 NavMesh,但有个问题是这个寻路数据,服务器也是需要的,那么我就要把这个数据导出为服务器所用才行.      但 NaveMesh 暂 ...

  7. ubuntu中为hive配置远程MYSQL database

    一.安装mysql $ sudo apt-get install mysql-server 启动守护进程 $ sudo service mysql start 二.配置mysql服务与连接器 1.安装 ...

  8. HDOJ/HDU 1328 IBM Minus One(水题一个,试试手)

    Problem Description You may have heard of the book '2001 - A Space Odyssey' by Arthur C. Clarke, or ...

  9. Bzoj 1598: [Usaco2008 Mar]牛跑步 dijkstra,堆,K短路,A*

    1598: [Usaco2008 Mar]牛跑步 Time Limit: 10 Sec  Memory Limit: 162 MBSubmit: 427  Solved: 246[Submit][St ...

  10. Android SeekBar 和 draw9patch 的使用

    今天要使用一个SeekBar控件,其实我觉得Android默认样式已经很不错了,无奈设计不同意,而且SeekBar左右两边也有图片,默认样式和图片也确实不协调,因此这里使用图片自定义SeekBar样式 ...