C# 过滤字符串中的汉字,效率高点的

jt9079 2010-05-27 05:30:44
例如: string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";


有时候字符串可能会比较长,所以求效率比较好的。。
要求过滤之后:
string content = "13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n";
...全文
1058 29 打赏 收藏 转发到动态 举报
写回复
用AI写文章
29 条回复
切换为时间正序
请发表友善的回复…
发表回复
deng68 2013-02-01
  • 打赏
  • 举报
回复
高人不少 啊,学习了
thinkpad_one 2012-12-07
  • 打赏
  • 举报
回复
查资料路过,都是高人啊
jiang_chao 2012-06-27
  • 打赏
  • 举报
回复
csdn高手就是多啊
jt9079 2010-05-28
  • 打赏
  • 举报
回复
这么多牛人要帮忙,真是非常感谢啊。。
只是分太少了。。
有点不好意思。。
duanaowen 2010-05-28
  • 打赏
  • 举报
回复
就是用正则表达式最好的。
sanmi 2010-05-28
  • 打赏
  • 举报
回复
路过,高手真多。
leon9090 2010-05-28
  • 打赏
  • 举报
回复
受益匪浅!~
skep99 2010-05-28
  • 打赏
  • 举报
回复
感谢20楼wuyazhe的提醒,
用指针写了下,又快了一点点,
10000的数据量太少,已经出现0毫秒了,测试次数改成了100万

按0.7秒处理156字符100万次计算,每秒过滤字符超过2亿了,这个效率楼主应该能接受了

代码

// Pointer-based variant of the filter benchmark: copies every UTF-16 unit of
// `content` that lies outside U+4E00..U+9FA5 into a scratch buffer.
//
// Fixes over the previous version:
//  * The trailing '\0' was written unconditionally. When no character is
//    filtered out the write cursor sits one element past the end of `result`,
//    so that store corrupted memory (and dereferenced a null pointer for an
//    empty input). The kept length is now tracked instead; the buffer is
//    zero-initialised, so no terminator is needed.
//  * The string is pinned directly, which avoids the extra ToCharArray() copy.
unsafe static void kickoffChinese3()
{
    char[] result = new char[content.Length];
    int kept;

    fixed (char* pChars = content)
    fixed (char* pResult = result)
    {
        char* pSource = pChars;
        char* pDest = pResult;
        char* pLimit = pChars + content.Length;

        while (pSource < pLimit)
        {
            char current = *pSource++;
            if (current < 0x4E00 || current > 0x9FA5)
            {
                *pDest++ = current;
            }
        }

        // Number of characters actually written; never exceeds result.Length.
        kept = (int)(pDest - pResult);
    }

    // To materialise the filtered text use: new string(result, 0, kept)
}


测试结果,单位毫秒

regex foreach linq skep99 skep99-1 unsafe
6750 3687 5500 938 828 703
6766 3703 5437 938 828 703
6828 3797 5547 937 813 719
6796 3688 5406 891 828 703
6781 3703 5579 906 828 719
6828 3781 5734 938 828 703
6922 3719 5578 937 829 703
6734 3703 5500 938 828 719
6843 3750 5532 984 875 750
7000 3797 5562 969 828 735
6859 3672 5531 938 812 719
6875 3625 5484 922 813 734
7031 3735 5500 921 829 718
6875 3688 5500 937 813 719
6734 3672 5625 906 828 719
6781 3703 5547 906 813 719
6703 3609 5485 906 828 703
6766 3734 5625 922 828 719
6859 3719 5484 922 828 735
6750 3672 5468 922 844 703



messi 2010-05-27
  • 打赏
  • 举报
回复
学习啦
兔子-顾问 2010-05-27
  • 打赏
  • 举报
回复
要效率么。。
项目打开unsafe的选项。
在skep99基础上,再优化一次。省掉复制的时间。
// Benchmark driver: for 20 rounds, times 10000 calls of each of the five
// filter implementations with Environment.TickCount and prints one
// tab-separated row of elapsed milliseconds per round.
static void Main(string[] args)
{
    const int Rounds = 20;
    const int CallsPerRound = 10000;

    Console.WriteLine("regex\t\tforeach\t\tlinq\t\tskep99\t\tskep99-1");

    int round = 0;
    while (round < Rounds)
    {
        int started = Environment.TickCount;
        for (int n = 0; n < CallsPerRound; n++) { TestChineseRegex01(); }
        int regexMs = Environment.TickCount - started;

        started = Environment.TickCount;
        for (int n = 0; n < CallsPerRound; n++) { TestChineseForeach01(); }
        int foreachMs = Environment.TickCount - started;

        started = Environment.TickCount;
        for (int n = 0; n < CallsPerRound; n++) { TestChineseLinq01(); }
        int linqMs = Environment.TickCount - started;

        started = Environment.TickCount;
        for (int n = 0; n < CallsPerRound; n++) { kickoffChinese(); }
        int arrayMs = Environment.TickCount - started;

        started = Environment.TickCount;
        for (int n = 0; n < CallsPerRound; n++) { kickoffChinese1(); }
        int arrayNoCopyMs = Environment.TickCount - started;

        Console.WriteLine(string.Join("\t\t", new string[]
        {
            regexMs.ToString(),
            foreachMs.ToString(),
            linqMs.ToString(),
            arrayMs.ToString(),
            arrayNoCopyMs.ToString()
        }));

        round++;
    }

    // Keep the console window open until a key is pressed.
    Console.ReadKey();
}

// Pattern matching one or more consecutive characters in U+4E00..U+9FA5,
// constructed once with RegexOptions.Compiled and shared by every call.
static Regex reg = new Regex(@"[\u4e00-\u9fa5]+", RegexOptions.Compiled);
// Sample input used by all benchmark methods: phone-number lines mixed with Chinese text.
static readonly string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
// Reusable output buffer presized to the input length. Its initializer reads
// content.Length, so it must stay declared after `content`.
static StringBuilder builder = new StringBuilder(content.Length);

// LINQ variant: keeps every character of `content` that is not inside
// U+4E00..U+9FA5 and builds a new string from the survivors.
private static void TestChineseLinq01()
{
    var survivors = content.Where(ch => !((uint)ch >= 0x4e00 && (uint)ch <= 0x9fa5));
    char[] buffer = survivors.ToArray();
    string result = new string(buffer);
}

// Regex variant: replaces every match of the shared `reg` pattern in
// `content` with the empty string.
private static void TestChineseRegex01()
{
    string filtered = reg.Replace(content, "");
}

// Loop variant: resets the shared `builder`, then appends every character of
// `content` that falls outside U+4E00..U+9FA5.
private static void TestChineseForeach01()
{
    builder.Clear();

    foreach (char ch in content)
    {
        bool inFilteredRange = ch >= 0x4e00 && ch <= 0x9fa5;
        if (inFilteredRange)
        {
            continue;
        }

        builder.Append(ch);
    }
}

// Array variant: copies the characters of `content` outside U+4E00..U+9FA5
// into a scratch array, then trims it to the exact length in a second array.
static void kickoffChinese()
{
    char[] source = content.ToCharArray();
    char[] scratch = new char[source.Length];
    int kept = 0;

    foreach (char ch in source)
    {
        if (ch >= 0x4E00 && ch <= 0x9FA5)
        {
            continue;
        }

        scratch[kept] = ch;
        kept++;
    }

    // Trim the scratch buffer down to the characters actually kept.
    char[] dest = new char[kept];
    Array.Copy(scratch, 0, dest, 0, kept);
}

// Array variant without the final trimming copy: the characters of `content`
// outside U+4E00..U+9FA5 are compacted to the front of `result`, and `offset`
// holds how many were kept.
//
// The previous version finished with `result[offset + 1] = '\0'`. That index
// is off by one, and it is out of range (IndexOutOfRangeException) whenever
// fewer than two characters were filtered out. The write was also
// unnecessary: a new char[] is zero-initialised, so every slot from `offset`
// onwards already holds '\0'. It has been removed.
static void kickoffChinese1()
{
    char[] chars = content.ToCharArray();
    int offset = 0;
    char[] result = new char[chars.Length];

    for (int i = 0; i < chars.Length; i++)
    {
        if (chars[i] < 0x4E00 || chars[i] > 0x9FA5)
        {
            result[offset] = chars[i];
            offset++;
        }
    }

    // To materialise the filtered text without trailing NULs use:
    // new string(result, 0, offset)
}


结果:

regex foreach linq skep99 skep99-1
130 60 241 60 40
180 50 211 40 40
120 60 170 40 40
130 61 170 50 30
120 60 170 40 41
130 50 180 50 30
130 60 181 40 40
130 60 170 50 30
131 50 180 40 40
130 110 241 60 40
150 60 190 51 50
130 60 170 40 40
120 61 170 50 30
130 60 170 40 41
120 60 170 50 40
130 60 171 50 30
120 60 170 50 30
131 60 180 40 40
120 60 171 40 40
120 60 180 40 40
兔子-顾问 2010-05-27
  • 打赏
  • 举报
回复
好吧,按客客说的,我把regex声明放外面,似乎也改变不大。
// Benchmark driver: 20 rounds, each timing 10000 calls of the regex, foreach,
// LINQ and array implementations with Environment.TickCount, printing one
// tab-separated row of elapsed milliseconds per round.
static void Main(string[] args)
{
    Console.WriteLine("regex\t\tforeach\t\tlinq\t\tkickoffChinese");

    for (int round = 0; round < 20; round++)
    {
        int[] elapsed = new int[4];
        int mark;

        mark = Environment.TickCount;
        for (int call = 0; call < 10000; call++)
        {
            TestChineseRegex01();
        }
        elapsed[0] = Environment.TickCount - mark;

        mark = Environment.TickCount;
        for (int call = 0; call < 10000; call++)
        {
            TestChineseForeach01();
        }
        elapsed[1] = Environment.TickCount - mark;

        mark = Environment.TickCount;
        for (int call = 0; call < 10000; call++)
        {
            TestChineseLinq01();
        }
        elapsed[2] = Environment.TickCount - mark;

        mark = Environment.TickCount;
        for (int call = 0; call < 10000; call++)
        {
            kickoffChinese();
        }
        elapsed[3] = Environment.TickCount - mark;

        string row = elapsed[0].ToString()
            + "\t\t" + elapsed[1].ToString()
            + "\t\t" + elapsed[2].ToString()
            + "\t\t" + elapsed[3].ToString();
        Console.WriteLine(row);
    }

    // Keep the console window open until a key is pressed.
    Console.ReadKey();
}

// Pattern matching one or more consecutive characters in U+4E00..U+9FA5,
// constructed once with RegexOptions.Compiled so the timing loop does not
// rebuild it on every call.
static Regex reg = new Regex(@"[\u4e00-\u9fa5]+", RegexOptions.Compiled);
// Sample input shared by all benchmark methods: phone-number lines mixed with Chinese text.
static string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
// Reusable output buffer presized to the input length. Its initializer reads
// content.Length, so it must stay declared after `content`.
static StringBuilder builder = new StringBuilder(content.Length);

// LINQ variant written as a query expression: selects the characters of
// `content` outside U+4E00..U+9FA5 and builds a string from them.
private static void TestChineseLinq01()
{
    var keptChars =
        from ch in content
        where (uint)ch < 0x4e00 || (uint)ch > 0x9fa5
        select ch;

    string result = new string(keptChars.ToArray());
}

// Regex variant: strips every run matched by the shared, precompiled `reg`
// pattern from `content`.
private static void TestChineseRegex01()
{
    string replacement = string.Empty;
    string stripped = reg.Replace(content, replacement);
}

// Loop variant: clears the shared `builder` and appends each character of
// `content` that lies outside U+4E00..U+9FA5.
private static void TestChineseForeach01()
{
    builder.Clear();

    int index = 0;
    while (index < content.Length)
    {
        char current = content[index];
        if (!(current >= 0x4e00 && current <= 0x9fa5))
        {
            builder.Append(current);
        }

        index++;
    }
}

// Array variant: compacts the characters of `content` outside U+4E00..U+9FA5
// into a temporary array and then copies them into an exactly-sized array.
static void kickoffChinese()
{
    char[] input = content.ToCharArray();
    char[] compacted = new char[input.Length];
    int count = 0;

    for (int pos = 0; pos < input.Length; pos++)
    {
        char current = input[pos];
        bool inFilteredRange = current >= 0x4E00 && current <= 0x9FA5;
        if (!inFilteredRange)
        {
            compacted[count++] = current;
        }
    }

    // Exactly-sized copy of the kept characters.
    char[] dest = new char[count];
    Array.Copy(compacted, dest, count);
}

结果

regex foreach linq kickoffChinese
140 60 261 40
130 60 170 50
181 70 190 50
120 60 170 41
130 60 170 40
130 60 171 50
120 60 170 40
130 61 170 40
130 60 160 50
121 70 170 50
120 70 241 60
170 70 180 50
151 60 170 50
120 60 171 40
130 60 160 50
130 60 161 50
120 60 170 40
120 61 170 40
130 60 160 50
121 60 170 40


skep99确实基础扎实,多次都是能给出效率很高的解法。
xiongxyt2 2010-05-27
  • 打赏
  • 举报
回复
这么多高手过招,受益匪浅啊。
skep99 2010-05-27
  • 打赏
  • 举报
回复
按14楼的方法做了个比较

REGEX linq kickoffChinese
156 32 15
157 47 15
172 31 16
156 47 16
140 47 16
156 31 16
156 47 16
156 47 15
141 47 16
140 47 16
156 31 16
156 47 16
140 47 16
140 47 16
140 47 16
156 47 16
140 47 16
140 47 16
141 46 16
156 32 15
如此简单 2010-05-27
  • 打赏
  • 举报
回复
// Fragment from inside a method whose header is not shown here.
// Step 1: pick the control code according to which radio button is checked.
if(radioButton1.Checked){ control = form2.zhucex;}
else if(radioButton2.Checked){ control =form3.zhuces;}
else if(radioButton3.Checked){ control = warring;}
else if(radioButton4.Checked){ control =suggest;}
else if(radioButton5.Checked){ control =form4.mumawe;}
else if(radioButton6.Checked){ control =drop;}
// Step 2: "000000" is treated as "no target chosen"; warn and send nothing.
if (control =="000000")
{
MessageBox.Show("你没有输入任何控制目标!不发控制信号");
richTextBox1.AppendText("你没有输入任何控制目标!不发控制信号");
}
else if(control != "000000")
{
try
{
// Log the operation.
richTextBox1.AppendText (control + "正在试图控制,等待回应......" + "\r");
stream = client.GetStream();
if(stream.CanWrite )
{
// Send the control code as ASCII bytes, then start a thread running `receive`.
byte[] by = System.Text.Encoding.ASCII.GetBytes(control.ToCharArray ());
stream.Write(by,0,by.Length);
stream.Flush();
threadReceive =new Thread(new ThreadStart(receive));
threadReceive.Start();
}//endif
}//try
// NOTE(review): untyped catch swallows every exception and always reports
// "server not connected", whatever actually failed.
catch
{
richTextBox1.AppendText("服务器未连接1控制无效!" +"\r");
MessageBox.Show("服务器未连接1控制无效!" +"\r");
}
}//else if
}
-过客- 2010-05-27
  • 打赏
  • 举报
回复
无牙,效率不是这样比较的哈,因为正则不能这样用

如果你用循环对比的话,需要在循环体外声明正则,这样对正则才算公平

之所以认为foreach会比正则快,是因为用foreach时的优化空间比正则大一些
当然,同时还需要关注一下内存占用情况

不过话说回来,如果不是百万级的数据,一般是感觉不到处理效率差异的
兔子-顾问 2010-05-27
  • 打赏
  • 举报
回复
论坛眼神最差的算我了,仔细看才发现,居然2次对比的都是regex的。。。。
重贴测试代码

// Benchmark driver: 20 rounds, each timing 10000 calls of the regex, foreach
// and LINQ implementations with Environment.TickCount and printing the three
// elapsed-millisecond values separated by two tabs.
static void Main(string[] args)
{
    string separator = "\t\t";
    Console.WriteLine(string.Join(separator, new string[] { "regex", "foreach", "linq" }));

    int round = 0;
    do
    {
        int begin = Environment.TickCount;
        for (int k = 0; k < 10000; k++)
        {
            TestChineseRegex01();
        }
        int regexElapsed = Environment.TickCount - begin;

        begin = Environment.TickCount;
        for (int k = 0; k < 10000; k++)
        {
            TestChineseForeach01();
        }
        int foreachElapsed = Environment.TickCount - begin;

        begin = Environment.TickCount;
        for (int k = 0; k < 10000; k++)
        {
            TestChineseLinq01();
        }
        int linqElapsed = Environment.TickCount - begin;

        Console.WriteLine(regexElapsed.ToString() + separator + foreachElapsed.ToString() + separator + linqElapsed.ToString());
        round++;
    }
    while (round < 20);

    // Keep the console window open until a key is pressed.
    Console.ReadKey();
}

// LINQ variant with a local sample string: keeps the characters outside
// U+4E00..U+9FA5 and builds a new string from them.
private static void TestChineseLinq01()
{
    string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";

    char[] kept = content
        .Where(ch =>
        {
            uint code = (uint)ch;
            return code < 0x4e00 || code > 0x9fa5;
        })
        .ToArray();

    string result = new string(kept);
}

// Regex variant with a local sample string. The Regex object is constructed
// inside the method, so every call pays for construction as well as for the
// replacement itself.
private static void TestChineseRegex01()
{
    string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
    Regex chineseRun = new Regex(@"[\u4e00-\u9fa5]+");
    string stripped = chineseRun.Replace(content, "");
}

// Loop variant with a local sample string: appends every UTF-16 unit outside
// U+4E00..U+9FA5 to a StringBuilder presized to the input length.
//
// The previous version called char.ConvertToUtf32(content, i) for every
// index. That call throws ArgumentException when `i` lands on the second
// (low) half of a surrogate pair, so any input containing a supplementary
// character would crash the loop. The filtered range lies entirely in the
// Basic Multilingual Plane and surrogates (U+D800..U+DFFF) are outside it,
// so comparing the UTF-16 unit directly yields the same result for every
// input the old code accepted, without the exception or the per-character
// conversion cost.
private static void TestChineseForeach01()
{
    string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
    StringBuilder builder = new StringBuilder(content.Length);

    for (int i = 0; i < content.Length; i++)
    {
        char c = content[i];
        if (c < 0x4e00 || c > 0x9fa5)
        {
            builder.Append(c);
        }
    }
}


测试结果

regex foreach linq
203 47 94
171 31 94
156 47 78
171 47 78
156 47 62
172 31 78
156 47 78
156 47 78
156 47 78
156 31 78
218 47 78
172 31 78
171 47 78
156 47 78
172 46 78
156 47 78
172 31 78
156 47 78
171 32 78
171 31 78

如客客师傅说的,foreach 最快。
skep99 2010-05-27
  • 打赏
  • 举报
回复
我也贴一个,用数组可能会快点,100万次1.6秒 ,T7500的cpu


// Array variant with a local sample string: compacts the characters outside
// U+4E00..U+9FA5 into a scratch array, then copies them into an array of the
// exact kept length.
void kickoffChinese()
{
    string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";

    char[] input = content.ToCharArray();
    char[] scratch = new char[input.Length];
    int written = 0;

    foreach (char ch in input)
    {
        bool keep = !(ch >= 0x4E00 && ch <= 0x9FA5);
        if (keep)
        {
            scratch[written] = ch;
            written += 1;
        }
    }

    // Exactly-sized copy of the kept characters.
    char[] dest = new char[written];
    Array.Copy(scratch, 0, dest, 0, written);
}

兔子-顾问 2010-05-27
  • 打赏
  • 举报
回复
要说效率。嘿嘿。写了个更快的。
直接上对比代码和结果
// Benchmark driver: 20 rounds, each timing three batches of 10000 calls with
// Environment.TickCount and printing the elapsed milliseconds tab-separated.
// Note: the first AND second batches both run TestChineseRegex01; only the
// third batch (TestChineseLinq01) measures a different implementation.
static void Main(string[] args)
{
    const int CallsPerBatch = 10000;

    for (int round = 0; round < 20; round++)
    {
        int t0 = Environment.TickCount;
        for (int n = 0; n < CallsPerBatch; n++)
        {
            TestChineseRegex01();
        }
        int first = Environment.TickCount - t0;

        int t1 = Environment.TickCount;
        for (int n = 0; n < CallsPerBatch; n++)
        {
            TestChineseRegex01();
        }
        int second = Environment.TickCount - t1;

        int t2 = Environment.TickCount;
        for (int n = 0; n < CallsPerBatch; n++)
        {
            TestChineseLinq01();
        }
        int third = Environment.TickCount - t2;

        Console.WriteLine(string.Join("\t", new string[] { first.ToString(), second.ToString(), third.ToString() }));
    }

    // Keep the console window open until a key is pressed.
    Console.ReadKey();
}

// LINQ variant with a local sample string: filters out the characters in
// U+4E00..U+9FA5 and builds a string from the rest.
private static void TestChineseLinq01()
{
    string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";

    var outsideRange = content.Where(ch => (uint)ch > 0x9fa5 || (uint)ch < 0x4e00);
    char[] remaining = outsideRange.ToArray();
    string result = new string(remaining);
}

// Regex variant with a local sample string. A new Regex is built on every
// call, so the measured time includes construction of the regex.
private static void TestChineseRegex01()
{
    string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
    string result = new Regex(@"[\u4e00-\u9fa5]+").Replace(content, "");
}


对比结果

regex foreach linq
203 171 94
156 172 78
171 172 78
171 156 78
172 172 78
171 156 94
156 171 78
156 156 94
156 172 78
156 171 78
172 156 78
171 188 93
156 156 94
156 187 140
156 172 94
156 171 78
172 171 78
156 172 78
172 156 78
171 156 78
兔子-顾问 2010-05-27
  • 打赏
  • 举报
回复
可能我写的不好,20次对比,还是差不多的。正则还快一点,可能要考虑反复构建正则引擎时候,上一次没有释放带来的优化,不知道有没有。

// Benchmark driver: 20 rounds, each timing 10000 calls of the regex filter
// and 10000 calls of the loop filter with Environment.TickCount, printing
// "regexMs,loopMs" per round.
//
// The previous version called TestChineseRegex01 in BOTH timing loops, so the
// second column (reported as "foreach") was really a second regex run. The
// second loop now calls TestChinese, the loop-based implementation.
static void Main(string[] args)
{
    for (int x = 0; x < 20; x++)
    {
        int tick = Environment.TickCount;
        for (int i = 0; i < 10000; i++)
        {
            TestChineseRegex01();
        }
        tick = Environment.TickCount - tick;

        int tick1 = Environment.TickCount;
        for (int i = 0; i < 10000; i++)
        {
            TestChinese();
        }
        tick1 = Environment.TickCount - tick1;

        Console.WriteLine(tick.ToString() + "," + tick1.ToString());
    }

    // Keep the console window open until a key is pressed.
    Console.ReadKey();
}

// Regex variant with a local sample string. The Regex is created inside the
// method, so each call includes the cost of building it.
private static void TestChineseRegex01()
{
    const string pattern = @"[\u4e00-\u9fa5]+";
    string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";

    Regex matcher = new Regex(pattern);
    string result = matcher.Replace(content, "");
}

// Loop-based filter with a local sample string: appends every UTF-16 unit
// outside U+4E00..U+9FA5 to a StringBuilder presized to the input length.
//
// The previous version converted each index with char.ConvertToUtf32, which
// throws ArgumentException when the index points at the low half of a
// surrogate pair. Because the filtered range is entirely inside the Basic
// Multilingual Plane and surrogates fall outside it, a direct comparison of
// the UTF-16 unit gives the same result for every input the old code
// accepted, and it can no longer throw.
private static void TestChinese()
{
    string content = "13440900984\r\n13440900984\r\n你好\r\n13440900984\r\n你好\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984\r\n13440900984你好\r\n鸟\r\n13440900984\r\n13440900984\r\n";
    StringBuilder builder = new StringBuilder(content.Length);

    foreach (char c in content)
    {
        if (c < 0x4e00 || c > 0x9fa5)
        {
            builder.Append(c);
        }
    }
}



几次测试结果:
171,172
156,187
172,156
171,172
156,172
171,172
156,171
172,156
172,156
187,203
171,187
141,171
156,172
172,156
187,171
156,172
156,172
156,171
156,172
171,156
-------
165.3 170.85<-平均值
正则 165.3 ms
foreach 170.85 ms
wosizy 2010-05-27
  • 打赏
  • 举报
回复
先汉字判断
/**
 * Reports whether the first character of {@code word} is a Chinese character
 * in the range U+4E00..U+9FA5 (the same range as the filtering regex).
 *
 * The previous version encoded the first character with the platform default
 * charset and tested for a 2-byte result. That only works under GBK-like
 * encodings (a Han character is 3 bytes in UTF-8) and it threw on an empty
 * string. The check is now done on the code point itself, so it is
 * independent of the platform encoding.
 *
 * @param word text whose first character is examined; may be null or empty
 * @return 1 if the first character is in U+4E00..U+9FA5, otherwise 0
 *         (including for null or empty input)
 */
private static int isCharacter(String word){
    if (word == null || word.length() == 0) {
        return 0; // nothing to inspect
    }
    int first = word.codePointAt(0);
    if (first >= 0x4E00 && first <= 0x9FA5) {
        return 1; // Chinese character
    }
    return 0; // not a Chinese character
}

接着过滤汉字..
// C# identifiers are case-sensitive: the namespace, type and method must be
// spelled System.Text.RegularExpressions.Regex / Replace to compile.
System.Text.RegularExpressions.Regex regex = new System.Text.RegularExpressions.Regex("[\u4e00-\u9fa5]");
string replacedstring = regex.Replace(str, ""); // remove every character in U+4E00..U+9FA5 found in str




加载更多回复(9)

110,561

社区成员

发帖
与我相关
我的任务
社区描述
.NET技术 C#
社区管理员
  • C#
  • Web++
  • by_封爱
加入社区
  • 近7日
  • 近30日
  • 至今
社区公告

让您成为最强悍的C#开发者

试试用AI创作助手写篇文章吧