请教！！C#敏感字替换特定字符串

茗香淡然 2015-09-30 10:28:06

现在过滤敏感字的算法很多，五花八门，但大多不是把敏感字替换成‘*’，就是直接删掉了事；而我现在想要的是把敏感字替换成特定的字符串。
　　a.txt 中设定的关键字替换：

尼玛的｜{ 编者按：[你太坏了]}
我就太阳了｜{编者按：[不要这样子]}
F.U.C.K｜{[别骂人，这样真的不好。]}
去si吧｜{[你确定要这样吗？]}
你了不起｜{[好吧，败给你了。一会带你看星星！]}

　　现在我想把‘｜’前面定义为[敏感字]，然后替换成花括号‘{}’中的字符串。

　　有没有类似于二维数组这样的存储效果：
// Jagged array of rules: array[i][0] is the sensitive word and
// array[i][1] is the text that replaces it.
// (A bare `new string[][]` does not compile; the initializer below
// allocates both the outer array and every inner pair.)
string[][] array = new string[][]
{
    new string[] { "尼玛的", "编者按：[你太坏了]" },
    new string[] { "我就太阳了", "编者按：[不要这样子]" },
};

一碰到array[i][0]的敏感字，就替换为array[i][1]中的字符串。

http://www.cnblogs.com/yeerh/archive/2011/10/20/2219035.html 优化的算法



    /// <summary>
    /// Sensitive-word matcher that pre-filters candidates with per-character
    /// bit masks before confirming them against a hash set of registered words.
    /// Words may be at most 16 characters long because each mask is a
    /// 16-bit value with one bit per character position / word length.
    /// </summary>
    public class BadWordsFilter2
    {
        /// <summary>Longest word the 16-bit masks can describe.</summary>
        private const int MaxKeyLength = 16;

        /// <summary>All registered words; the final authority for a match.</summary>
        private readonly HashSet<string> hash = new HashSet<string>();

        /// <summary>fastCheck[c] has bit i set when some word has character c at position i.</summary>
        private readonly ushort[] fastCheck = new ushort[char.MaxValue + 1];

        /// <summary>startLength[c] has bit (len - 1) set when a word of length len starts with c.</summary>
        private readonly ushort[] startLength = new ushort[char.MaxValue + 1];

        /// <summary>endLength[c] has bit (len - 1) set when a word of length len ends with c.</summary>
        private readonly ushort[] endLength = new ushort[char.MaxValue + 1];

        /// <summary>Length of the longest registered word; bounds how far a scan looks ahead.</summary>
        private int maxWordLength = 0;

        /// <summary>Length of the shortest registered word (tracked, not consulted by the lookups).</summary>
        private int minWordLength = int.MaxValue;

        /// <summary>
        /// Registers a sensitive word.
        /// </summary>
        /// <param name="word">The word to detect; 1 to 16 characters.</param>
        /// <exception cref="ArgumentNullException"><paramref name="word"/> is null.</exception>
        /// <exception cref="ArgumentException"><paramref name="word"/> is empty or longer than 16 characters.</exception>
        public void AddKey(string word)
        {
            if (word == null)
            {
                throw new ArgumentNullException("word");
            }

            // Validate before touching any state so a rejected word cannot
            // corrupt maxWordLength / minWordLength.
            if (word.Length == 0)
            {
                throw new ArgumentException("word must not be empty");
            }

            if (word.Length > MaxKeyLength)
            {
                throw new ArgumentException("参数最大16个字符");
            }

            maxWordLength = Math.Max(maxWordLength, word.Length);
            minWordLength = Math.Min(minWordLength, word.Length);

            // Record every position (0-15) at which each character occurs.
            // The bit must be a ushort: a byte cast would drop positions 8-15
            // and make words longer than 8 characters undetectable.
            for (int i = 0; i < word.Length; i++)
            {
                fastCheck[word[i]] |= (ushort)(1 << i);
            }

            ushort lengthBit = (ushort)(1 << (word.Length - 1));

            // Word lengths that start with this character.
            startLength[word[0]] |= lengthBit;

            // Word lengths that end with this character.
            endLength[word[word.Length - 1]] |= lengthBit;

            hash.Add(word);
        }

        /// <summary>
        /// Reports whether <paramref name="text"/> contains any registered word.
        /// </summary>
        /// <param name="text">The text to scan.</param>
        /// <returns>true when at least one registered word occurs in the text.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        public bool HasBadWord(string text)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            for (int index = 0; index < text.Length; index++)
            {
                int length;
                if (TryMatchAt(text, index, out length))
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Returns the first registered word found in <paramref name="text"/>.
        /// </summary>
        /// <param name="text">The text to scan.</param>
        /// <returns>The matched word, or <see cref="string.Empty"/> when nothing matches.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
        public string FindOne(string text)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            for (int index = 0; index < text.Length; index++)
            {
                int length;
                if (TryMatchAt(text, index, out length))
                {
                    return text.Substring(index, length);
                }
            }

            return string.Empty;
        }

        /// <summary>
        /// Lazily enumerates every non-overlapping registered word in
        /// <paramref name="text"/>, scanning left to right.
        /// </summary>
        /// <param name="text">The text to scan.</param>
        /// <returns>The matched words in order of appearance.</returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="text"/> is null (raised when enumeration starts).
        /// </exception>
        public IEnumerable<string> FindAll(string text)
        {
            if (text == null)
            {
                throw new ArgumentNullException("text");
            }

            for (int index = 0; index < text.Length; index++)
            {
                int length;
                if (TryMatchAt(text, index, out length))
                {
                    yield return text.Substring(index, length);

                    // Resume after the matched word so matches never overlap
                    // (the loop increment supplies the final +1).
                    index += length - 1;
                }
            }
        }

        /// <summary>
        /// Tries to match a registered word that starts exactly at
        /// <paramref name="index"/>; the shortest such word wins.
        /// </summary>
        /// <param name="text">The text being scanned.</param>
        /// <param name="index">Start position inside <paramref name="text"/>.</param>
        /// <param name="length">Length of the matched word, or 0 when there is no match.</param>
        /// <returns>true when a registered word starts at <paramref name="index"/>.</returns>
        private bool TryMatchAt(string text, int index, out int length)
        {
            length = 0;

            int limit = Math.Min(maxWordLength + index, text.Length);
            char begin = text[index];
            int count = 0;

            for (int j = index; j < limit; j++)
            {
                char current = text[j];
                ushort positionBit = (ushort)(1 << count);

                // No registered word has this character at this offset, so
                // nothing starting at index can match. Only this start
                // position is abandoned: a word may still begin at any later
                // position, so the caller must not skip ahead.
                if ((fastCheck[current] & positionBit) == 0)
                {
                    return false;
                }

                ++count;

                // positionBit now doubles as the "length == count" bit.
                if ((startLength[begin] & positionBit) != 0
                    && (endLength[current] & positionBit) != 0
                    && hash.Contains(text.Substring(index, count)))
                {
                    length = count;
                    return true;
                }
            }

            return false;
        }
    }

...全文

442 9 打赏收藏转发到动态举报

写回复

用AI写文章

9 条回复

切换为时间正序

请发表友善的回复…

发表回复

茗香淡然 2015-10-08

打赏
举报

引用 8 楼 ajianchina 的回复:

犯不着这样吧，你是用户单次提交的时候进行处理，还是进行批量处理？文本内容一般多大？

文本内容长度不确定，不过应该不超过1.5万字吧。

茗香淡然 2015-09-30

打赏
举报

引用 6 楼 ajianchina 的回复:

你写这个类就是专门干替换的吗？

是的。

ajianchina 2015-09-30

打赏
举报

你写这个类就是专门干替换的吗？

茗香淡然 2015-09-30

打赏
举报

引用 4 楼 phommy 的回复:


using System;
using System.Linq;
using System.Text.RegularExpressions;

namespace ConsoleApplication1
{
    // Demo console program: replaces each listed sensitive word in a sample
    // sentence with the replacement text configured for that word.
    class Program
    {
        // Parses the inline rule list into a dictionary, builds one regex
        // alternation from its keys, and prints the sample input with every
        // match replaced by the dictionary value for the matched text.
        static void Main(string[] args)
        {
            // Rule list, one rule per line: the sensitive word, a full-width
            // '｜' separator, then the replacement text.
            var a_txt = @"尼玛的｜{ 编者按：[你太坏了]}
我就太阳了｜{编者按：[不要这样子]}
F.U.C.K｜{[别骂人，这样真的不好。]}
去si吧｜{[你确定要这样吗？]}
你了不起｜{[好吧，败给你了。一会带你看星星！]}";

            var input = "尼玛的的的的我我我我就太阳了了了你了不起起起";

            // NOTE(review): only "\r\n" is treated as a line break, so rule
            // text whose lines end in a bare "\n" would not be split here.
            var separator = new[]
                            {
                                "\r\n"
                            };
            // Key = text before '｜', value = text after it. The value is
            // stored as written, i.e. still wrapped in its '{' '}' braces.
            var dict =
                a_txt.Split(separator, StringSplitOptions.RemoveEmptyEntries).Select(s => s.Split('｜')).ToDictionary(
                    s => s[0],
                    s => s[1]);
            // Joins the keys into a regex alternation "key1|key2|...".
            // NOTE(review): the keys are not escaped, so regex metacharacters
            // inside a key (such as the '.' in "F.U.C.K") act as pattern
            // syntax rather than literal characters.
            // NOTE(review): dict.Keys is passed directly, which needs a
            // string.Join overload that accepts a sequence of strings;
            // presumably unavailable on older framework versions — confirm.
            var regex = string.Join("|", dict.Keys);

            // Each match is replaced by the dictionary value for the exact
            // matched text; the indexer throws if that text is not a key.
            var output = Regex.Replace(input, regex, m => dict[m.Value]);
            Console.WriteLine(output);
        }
    }
}

var regex = string.Join("|", dict.Keys); 这一行报错：无效参数。

phommy 2015-09-30

打赏
举报


using System;
using System.Linq;
using System.Text.RegularExpressions;

namespace ConsoleApplication1
{
    /// <summary>
    /// Demo console program: replaces every configured sensitive word in a
    /// piece of text with the replacement string configured for that word.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Rule list in the a.txt format, one rule per line:
        /// sensitive word, full-width '｜', then the replacement wrapped in '{' '}'.
        /// </summary>
        private const string RulesText = @"尼玛的｜{ 编者按：[你太坏了]}
我就太阳了｜{编者按：[不要这样子]}
F.U.C.K｜{[别骂人，这样真的不好。]}
去si吧｜{[你确定要这样吗？]}
你了不起｜{[好吧，败给你了。一会带你看星星！]}";

        /// <summary>Text that is filtered when no command-line argument is given.</summary>
        private const string SampleInput = "尼玛的的的的我我我我就太阳了了了你了不起起起";

        /// <summary>
        /// Prints the filtered text to standard output.
        /// </summary>
        /// <param name="args">
        /// Optional: args[0] is the text to filter. Without arguments the
        /// built-in sample sentence is used.
        /// </param>
        static void Main(string[] args)
        {
            var input = args != null && args.Length > 0 ? args[0] : SampleInput;
            Console.WriteLine(ReplaceSensitiveWords(input, RulesText));
        }

        /// <summary>
        /// Replaces every sensitive word listed in <paramref name="rulesText"/>
        /// that occurs in <paramref name="input"/>.
        /// </summary>
        /// <param name="input">The text to filter.</param>
        /// <param name="rulesText">Rules in the a.txt format, one per line.</param>
        /// <returns>The text with every listed word replaced.</returns>
        private static string ReplaceSensitiveWords(string input, string rulesText)
        {
            // Accept both Windows and Unix line endings: the line breaks inside
            // a verbatim string literal depend on how the source file was saved.
            var lineSeparators = new[] { "\r\n", "\n" };

            var dict = rulesText
                .Split(lineSeparators, StringSplitOptions.RemoveEmptyEntries)
                // Split on the first '｜' only, so a replacement may itself contain one.
                .Select(line => line.Split(new[] { '｜' }, 2))
                // Skip malformed lines (no separator or empty word) instead of crashing.
                .Where(parts => parts.Length == 2 && parts[0].Length > 0)
                .ToDictionary(parts => parts[0], parts => UnwrapReplacement(parts[1]));

            if (dict.Count == 0)
            {
                return input;
            }

            // Keys are literal words, not patterns: escape them so that e.g. the
            // dots in "F.U.C.K" do not match arbitrary characters (which would
            // also make the dictionary lookup below fail). Longer words go first
            // so a word is not shadowed by a shorter word that is its prefix.
            // ToArray() keeps string.Join usable on frameworks that only offer
            // the string[] overload.
            var pattern = string.Join(
                "|",
                dict.Keys
                    .OrderByDescending(key => key.Length)
                    .Select(key => Regex.Escape(key))
                    .ToArray());

            return Regex.Replace(input, pattern, m => dict[m.Value]);
        }

        /// <summary>
        /// Strips the surrounding '{' '}' delimiters (and the whitespace around
        /// them) from a replacement as written in the rule list.
        /// </summary>
        /// <param name="raw">The text after the '｜' separator.</param>
        /// <returns>The replacement text itself.</returns>
        private static string UnwrapReplacement(string raw)
        {
            var text = raw.Trim();
            if (text.Length >= 2 && text[0] == '{' && text[text.Length - 1] == '}')
            {
                text = text.Substring(1, text.Length - 2).Trim();
            }

            return text;
        }
    }
}

娃都会打酱油了 2015-09-30

打赏
举报

// Map each sensitive word to the text that should replace it.
var replacements = new Dictionary<string, string>
{
    { "尼玛的", "编者按：[你太坏了]" }
};

// Look up the replacement configured for one word.
string replacement = replacements["尼玛的"];
Console.WriteLine(replacement);
// To process every rule, enumerate the pairs: foreach (var kv in replacements)

茗香淡然 2015-09-30