C#脏字过滤算法

public class DirtyWordOper
    {
        private static Dictionary<string, object> hash = new Dictionary<string, object>();
        private static BitArray firstCharCheck = new BitArray(char.MaxValue);//把脏词的第一个字符记录下来
        private static BitArray allCharCheck = new BitArray(char.MaxValue);//把每一个个脏词的所有字符都记录下来
        private static int maxLength = 0;//
        private static bool onlyOne = true;

#region
        /// <summary>
        /// 返回替换后的字符串字符串的长度不变
        /// </summary>
        /// <param name="text"></param>
        /// <returns></returns>
        public string Replace(string text)
        {
            if (onlyOne)
            {
                Init();//初始化数据执行一次就不会执行了
                onlyOne = false;
            }
            if (!isDirtyword(text))
            {
                return text;
            }
            //获取替换操作表
            List<DetailRepModel> drlist = GetList(text);
            //执行替换操作
            return Replace2(text, drlist);
        }

/// <summary>
        /// 初始化用只执行一次
        /// </summary>
        /// <param name="text"></param>
        private static void Init()
        {
            string[] badwords = DirtyWordData.DirtyKeyword.Split('|');
            foreach (string bw in badwords)
            {
                string[] strarrtemp = bw.Split('&');
                string word = strarrtemp[0];
                word = word.Trim();//去掉数据中的空格及格式符号
                word = word.Replace("/r", "");
                word = word.Replace("/n", "");
                if (word == "")
                {
                    break;
                }
                if (!hash.ContainsKey(word))
                {
                    hash.Add(word, null);
                    maxLength = Math.Max(maxLength, word.Length);
                    firstCharCheck[word[0]] = true; 代码生成器

foreach (char c in word)
                    {
                        allCharCheck[c] = true;
                    }
                }
            }
        }
        /// <summary>
        /// 是否包含了脏词
        /// </summary>
        /// <param name="text"></param>
        /// <returns></returns>
        private static bool isDirtyword(string text)
        {
            int index = 0;
            //int offset = 0;
            while (index < text.Length)
            {
                //如果第一个字符都不符合
                if (!firstCharCheck[text[index]])
                {// 直接找到与脏词第一字符相同为止
                    while (index < text.Length - 1 && !firstCharCheck[text[++index]]) ;
                }
                for (int j = 1; j <= Math.Min(maxLength, text.Length - index); j++)
                {
                    if (!allCharCheck[text[index + j - 1]])
                    {
                        break;
                    }
                    string sub = text.Substring(index, j);
                    //判定脏字字典中是否包括了脏词
                    if (hash.ContainsKey(sub))
                    {
                        return true;//是
                    }
                }
                index++;
            }
            return false;//否
        }

/// <summary>
        /// 返回操作列表
        /// </summary>
        /// <param name="text"></param>
        /// <returns></returns>
        private static List<DetailRepModel> GetList(string text)
        {
            List<DetailRepModel> DetailList = new List<DetailRepModel>();
            int index = 0;
            while (index < text.Length)
            {
                if (!firstCharCheck[text[index]])
                {
                    while (index < text.Length - 1 && !firstCharCheck[text[++index]]) ;
                }
                DetailRepModel tempDetail = null;
                for (int j = 1; j <= Math.Min(maxLength, text.Length - index); j++)
                {
                    if (!allCharCheck[text[index + j - 1]])
                    {
                        if (tempDetail != null)
                        {//优先先字符串替换
                            index = index + tempDetail.number - 1;//索引要返回上一位，所以要减1
                            DetailList.Add(tempDetail);
                        }
                        break;
                    }
                    string sub = text.Substring(index, j);
                    if (hash.ContainsKey(sub))
                    {
                        tempDetail = new DetailRepModel();
                        tempDetail.index = index;
                        tempDetail.number = sub.Length;
                        tempDetail.content = sub;
                        //break;//进行下一次不然要出现， abc 其中ab 与a都关键字要生成两个操作
                    }
                    if (tempDetail != null)
                    {
                        if (j + 1 > Math.Min(maxLength, text.Length - index))
                        {//优先先字符串替换
                            DetailList.Add(tempDetail);
                            index = index + tempDetail.number - 1;//索引要返回上一位，所以要减1
                        }
                    }
                }
                index++;
            }
            return DetailList;
        }
        /// <summary>
        /// 传入字串和脏字替换操作表,
        /// </summary>
        /// <param name="text"></param>
        /// <param name="drlist"></param>
        /// <returns> 输出替换后的字串</returns>
        private static string Replace2(string text, List<DetailRepModel> drlist)
        {

if (drlist == null || drlist.Count == 0 || text == "")
            {
                return text;
            }
            foreach (DetailRepModel dr in drlist)
            {
                if (dr != null)
                {
                    string strtemp = text.Substring(dr.index, dr.number);
                    object ob = DirtyWordData.DirtyHT[(object)strtemp];
                    if (ob == null)
                    {
                        //记录错误
                        break;
                    }
                    // 这样替换有错误，
                    text = text.Substring(0, dr.index) + ob.ToString() + text.Substring(dr.index + dr.number);
                    //text = text.Replace(strtemp, ob.ToString());
                }
            }
            return text;
        }
        #endregion
    }

效果还行，不过我们老大给我说了个方法更NB，说比这种要快50倍；只是写起来有点麻烦

public interface IReplaceDW
    {
        string Replace(string s);
    }
    public class ReplaceDW
    {
        public static void AddToWords(DirtyChar parent, string s, string t)
        {
            DirtyChar dc = parent.Children.Find(o => o.Orienginal == s[0]);
            if (dc == null)
            {
                dc = new DirtyChar() { Orienginal = s[0], Children = new List<DirtyChar>(), Target = "" };
                parent.Children.Add(dc);
            }
            if (s.Length > 1)
            {//
                AddToWords(dc, s.Substring(1), t);
            }
            else
            {
                dc.Target = t;
            }
        }

public static string BuildChildren(DirtyChar dc, int deepLevel)
        {
            StringBuilder sb = new StringBuilder();
            string spaces = new string(' ', deepLevel + 4);

if (dc.Children.Count > 0)
            {
                sb.Append(@"
" + spaces + @"if (i + 1 == len){");
                sb.Append(@"
" + spaces + @"    sb.Append(""" + dc.Target + @""");
                ");
                sb.Append(@"
" + spaces + @"    i++;
" + spaces + @"    break;}");
                sb.Append(@"
" + spaces + @" switch (s[i + " + deepLevel.ToString() + @"])
" + spaces + @" {
");
                foreach (DirtyChar c in dc.Children)
                {
                    sb.Append(@"
" + spaces + @"  case '" + c.Orienginal + @"':
");
                    sb.Append(BuildChildren(c, deepLevel + 1));
                    sb.Append(@"
" + spaces + @"   break;");
                }


                sb.Append(@"
" + spaces + @" default:
" + spaces + @"    sb.Append(""" + dc.Target + @""");
" + spaces + @"    i++;
" + spaces + @"    break;
" + spaces + @" }
");
            }
            else
            {
                sb.Append(@"
" + spaces + @"  sb.Append(""" + dc.Target + @""");
");
                if (deepLevel == 1)
                {
                    sb.Append(@"
" + spaces + @"  i++;
");
                }
                else
                {
                    sb.Append(@"
" + spaces + @"  i += " + (deepLevel).ToString() + @";
");
                }
            }
            return sb.ToString();
        }

private IReplaceDW _r = null;
        private static bool isfirst = true;
        public string Replace(string s)
        {
            return _r.Replace(s);
        }
        private static List<KeyValuePair<string, string>> tmp = new List<KeyValuePair<string, string>>();
        public ReplaceDW()
        {
            if (isfirst)
            {
                List<KeyValuePair<string, string>> dict = new List<KeyValuePair<string, string>>();
                foreach (DictionaryEntry d in KeyWord.DirtyWordData.DirtyHT)
                {
                    dict.Add(new KeyValuePair<string, string>(d.Key.ToString(), d.Value.ToString()));
                }
                // 整理进 list
                //List<KeyValuePair<string, string>> tmp = new List<KeyValuePair<string, string>>();
                foreach (KeyValuePair<string, string> kv in dict)
                {
                    tmp.Add(kv);
                }
                // 倒排
                tmp.Sort((a, b) => { return b.Key.CompareTo(a.Key); });
                isfirst = false;
            }
            var compiler = new CSharpCodeProvider();
            var options = new CompilerParameters();

// set compile options
            options.CompilerOptions = "/o";
            options.GenerateExecutable = false;
            options.GenerateInMemory = true;
            options.ReferencedAssemblies.Add("System.dll");
            options.ReferencedAssemblies.Add(this.GetType().Assembly.Location);

// set the source code to compile
            DirtyChar words = new DirtyChar() { Children = new List<DirtyChar>() };
            //DirtyChar words2 = new DirtyChar();
            //words2.Children = new List<DirtyChar>();
            foreach (KeyValuePair<string, string> kv in tmp)
            {//构建字典表
                AddToWords(words, kv.Key, kv.Value);
            }

StringBuilder sb = new StringBuilder();
            sb.Append(@"
using System;
namespace KeyWord
{
public class ReplaceDW_ : IReplaceDW
{
    public string Replace( string s )
{
  int len = s.Length, i = 0;
        System.Text.StringBuilder sb = new System.Text.StringBuilder(len);
");
            sb.Append(@"
  while (i < len)
  {
   switch (s[i])
   {
");
            foreach (DirtyChar c in words.Children)
            {
                sb.Append(@"
    case '" + c.Orienginal + @"':
");
                sb.Append(BuildChildren(c, 1));
                sb.Append(@"
     break;");
            }
            sb.Append(@"
    default:
     sb.Append(s[i++]);
     break;
   }
  }
");
            sb.Append(@"
  return sb.ToString();

}
}
}");
            // compile the code, on-the-fly
            var result = compiler.CompileAssemblyFromSource(options, sb.ToString());

            foreach (var error in result.Errors)
            {
                // print errors
                ;
            }

// if compilation sucessed
            if ((!result.Errors.HasErrors) && (result.CompiledAssembly != null))
            {
                var type = result.CompiledAssembly.GetType("KeyWord.ReplaceDW_");
                try
                {
                    if (type != null)
                    {
                        this._r = Activator.CreateInstance(type) as IReplaceDW;
                    }
                    this.Replace("x"); //预热
                    this.Replace("x"); //预热
                }
                catch (Exception ex)
                {
                    Console.WriteLine(ex);
                }
            }
        }
    }

点击下载本例源码

C#脏字过滤算法的更多相关文章

SVD++：推荐系统的基于矩阵分解的协同过滤算法的提高
1.背景知识在讲SVD++之前,我还是想先回到基于物品相似的协同过滤算法.这个算法基本思想是找出一个用户有过正反馈的物品的相似的物品来给其作为推荐.其公式为:
GBDT(Gradient Boosting Decision Tree)算法&协同过滤算法
GBDT(Gradient Boosting Decision Tree)算法参考:http://blog.csdn.net/dark_scope/article/details/24863289 理 ...
Spark机器学习之协同过滤算法
Spark机器学习之协同过滤算法一).协同过滤 1.1 概念协同过滤是一种借助"集体计算"的途径.它利用大量已有的用户偏好来估计用户对其未接触过的物品的喜好程度.其内在思想是相 ...
Collaborative Filtering(协同过滤)算法详解
基本思想基于用户的协同过滤算法是通过用户的历史行为数据发现用户对商品或内容的喜欢(如商品购买,收藏,内容评论或分享),并对这些喜好进行度量和打分.根据不同用户对相同商品或内容的态度和偏好程度计算用户 ...
【机器学习笔记一】协同过滤算法 - ALS
参考资料 [1]<Spark MLlib 机器学习实践> [2]http://blog.csdn.net/u011239443/article/details/51752904 [3]线性 ...
吴恩达机器学习笔记58-协同过滤算法（Collaborative Filtering Algorithm）
在之前的基于内容的推荐系统中,对于每一部电影,我们都掌握了可用的特征,使用这些特征训练出了每一个用户的参数.相反地,如果我们拥有用户的参数,我们可以学习得出电影的特征. 但是如果我们既没有用户的参数, ...
Spark机器学习(11)：协同过滤算法
协同过滤(Collaborative Filtering,CF)算法是一种常用的推荐算法,它的思想就是找出相似的用户或产品,向用户推荐相似的物品,或者把物品推荐给相似的用户.怎样评价用户对商品的偏好? ...
亚马逊协同过滤算法 Collaborative filtering
这节课时郭强的三维课.他讲的是MAYA和max .自己对这个也不怎么的感兴趣.而且这个课感觉属于数字媒体.自己对游戏,动画,这些东西一点都不兴趣,比如大一的时候刚开学的时候,张瑞的数字媒体的导论课.还 ...
win7下使用Taste实现协同过滤算法
如果要实现Taste算法,必备的条件是: 1) JDK,使用1.6版本.需要说明一下,因为要基于Eclipse构建,所以在设置path的值之前要先定义JAVA_HOME变量. 2) Maven,使用2 ...

随机推荐

Python3常用学习网站总结（随时更新）
Python资源大全 http://python.jobbole.com/84464/ https://github.com/jobbole/awesome-python-cn scrapy: h ...
There is no getter for property named xxx' in 'class java.lang.xxx'
在xxxMapper.xml我们使用sql片段来提高sql代码的复用性,当时新手传入参数时常常出现这样的错误: There is no getter for property named xxx' i ...
zabbix灵活使用userparameters
userparameters介绍官网文献:https://www.zabbix.com/documentation/2.0/manual/config/items/userparameters 当我 ...
Hexo+Github搭建博客
要使用Hexo,需要在你的系统中支持Nodejs以及Git,如果还没有,那就开始安装吧! 安装Node.js 下载Node.js 参考地址:安装Node.js 安装Git 下载地址:http://gi ...
MySql入门(1)
MySql入门(1) 安装检查系统中是否已经安装了MySQL sudo netstat -tap | grep mysql 若没有显示已安装结果,则没有安装.否则表示已经安装. sudo apt-g ...
java学习笔记(一) 服务器的认识
RPC与RMI.SOAP的联系及区别 http://www.jb51.net/article/68971.htm 几大服务器的区别nginx/tomcat/ http://www.cnblogs.co ...
STM32F10XX存储器细节
>> STM32F10XX系统架构 >> 程序存储器.数据存储器.寄存器和输入输出端口被组织在同一个4GB的线性地址空间内. >> 数据字节以小端格式存放在存储 ...
接口测试思路，jmeter，接口测试流程
接口测试总结一:接口测试思想接口测试:通过向服务器端发送请求,获取响应与预期结果做对比的一种服务端黑盒测试过程. 解释:接口就是将浏览器,客户端,手机端,或者服务器调用另一个服务器的请求抽离出来测 ...
LINUX 笔记-iostat命令
显示CPU和I/O统计信息 iostat的不带参数的显示CPU和I/ O的所有分区的统计信息 -c Display the CPU utilization report. -d Display the ...
JavaScript--我发现，原来你是这样的JS（引用类型不简单，且听我娓娓道来）
一.介绍没错,这是第五篇,到了引用类型,这次要分成两次博文了,太多内容了,这是前篇,篇幅很长也很多代码,主要讲引用类型和常用的引用类型,代码试验过的,老铁没毛病. 坚持看坚持写,不容易不容易,希望大 ...

C#脏字过滤算法

C#脏字过滤算法的更多相关文章

随机推荐

热门专题