62,243
社区成员




LZ串有问题:
<divid="content">
要改为:
<div id="content">
div 和 id 之间至少要有一个空白字符,因为我的正则是:
<div\s+id="content">
\s+ 代表至少一个空白字符(空格、制表符、换行等)。
/// <summary>
/// Demo entry point: finds the div whose id is "content" in a hard-coded HTML
/// string and prints the text returned by <c>GetContent</c>.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // The sample deliberately contains "<divid=" (no whitespace); the pattern
    // below uses \s* so it still matches.
    string str = @"<html> <head> </head> <body> <divid=""content""> <div><div></div><div><div></div></div></div>
</div> <div> </div> </body> </html>";

    // [\s\S]* is greedy, so this first match runs to the LAST </div> in the
    // document; GetContent is then asked to cut it back.
    Match match = Regex.Match(str, @"<div\s*id=""content""\s*>(?<text>[\s\S]*)</div>");
    if (match.Success)
    {
        string result = GetContent(match.Value, match.Groups["text"].Value);
        Console.WriteLine(result);
    }
}
/// <summary>
/// Shrinks a greedy match of the "content" div until the plain
/// <c>&lt;div&gt;</c> / <c>&lt;/div&gt;</c> tags inside it look balanced.
/// </summary>
/// <param name="match">
/// The full matched text, starting at the opening content tag and ending with
/// a closing div tag.
/// </param>
/// <param name="content">The text captured between those two tags.</param>
/// <returns>
/// The shortened match. If trimming leaves text that no longer matches the
/// content-div pattern, the trimmed text is returned as is. If the marker to
/// trim at is not present in <paramref name="match"/>, the input is returned
/// unchanged.
/// </returns>
/// <remarks>
/// Only attribute-less <c>&lt;div&gt;</c> tags are counted, exactly as written
/// in the two literal searches below.
/// </remarks>
static string GetContent(string match, string content)
{
    const string ContentDivPattern = @"<div\s*id=""content""\s*>(?<text>[\s\S]*)</div>";

    int cut;
    if (Regex.Matches(content, "<div>").Count != Regex.Matches(content, "</div>").Count)
    {
        // Unequal tag counts: the greedy match swallowed a closing tag that
        // belongs to something else, so drop the final </div>.
        cut = match.LastIndexOf("</div>", StringComparison.Ordinal);
    }
    else if (content.LastIndexOf("<div>", StringComparison.Ordinal) > content.LastIndexOf("</div>", StringComparison.Ordinal))
    {
        // Counts agree but the content ends with an opening tag, so that
        // trailing <div> is outside the element we want.
        cut = match.LastIndexOf("<div>", StringComparison.Ordinal);
    }
    else
    {
        return match;
    }

    if (cut < 0)
    {
        // Nothing to trim at; return the input rather than throw from Substring.
        return match;
    }

    string sub = match.Substring(0, cut);
    Match rematch = Regex.Match(sub, ContentDivPattern);
    if (!rematch.Success)
    {
        return sub;
    }

    // Continue with the re-matched text, not with sub: sub can still carry
    // text after the last closing tag, which must not end up in the result.
    return GetContent(rematch.Value, rematch.Groups["text"].Value);
}
<div(?:\s+(?:""[^""]*""|'[^']*'|[^""'>])*)?>
上面这个式子保证了正确地排除引号中的HTML标记,如<div>之类,也就是说引号里的<div>不算<div>:
<div comment="this is <div> in commment">
using System;
using System.Text.RegularExpressions;
/// <summary>
/// Demo: prints every complete div element whose opening tag is exactly
/// <c>&lt;div id="content"&gt;</c>, including any nested div elements, from a
/// hard-coded HTML string.
/// </summary>
class Test
{
    /// <summary>Runs the pattern over the sample HTML and writes each match to the console.</summary>
    static void Main()
    {
        string html = @" <html> <head> </head> <body> <div id=""content""> <div> </div>
<div comment=""this is <div> in commment""></div>
</div> <div> </div> </body> </html>";

        // (?six): "." also matches newlines, matching ignores case, and
        // whitespace inside the pattern is ignored.
        // The named group "div" works as a stack: each nested opening tag
        // pushes a capture, each closing tag pops one, and (?(div)(?!)) forces
        // failure if anything is still on the stack. Quoted attribute values
        // are consumed whole, so a "<div>" inside quotes is not counted.
        string balancedDivPattern = @"(?six)<div\s+id=""content"">
(
(?>
(?!<div\b|</div>).
|
<div(?:\s+(?:""[^""]*""|'[^']*'|[^""'>])*)?>(?'div')
|
</div>(?'-div')
)*
(?(div)(?!))
)
</div>";

        for (Match hit = Regex.Match(html, balancedDivPattern); hit.Success; hit = hit.NextMatch())
        {
            Console.WriteLine(hit.Value);
        }
    }
}
/* 程序输出:
<div id="content"> <div> </div>
<div comment="this is <div> in commment"></div>
</div>
*/
using System.Xml;
using System;
/// <summary>
/// Demo: loads a well-formed fragment into an <see cref="XmlDocument"/> and
/// prints the inner XML of the first div element whose id attribute is "content".
/// </summary>
public class t
{
    /// <summary>Parses the sample markup, echoes it, then prints the content div's inner XML.</summary>
    /// <param name="args">Command-line arguments (not used).</param>
    public static void Main(string[] args)
    {
        string markup = "<div><div id=\"content\"><div><div></div></div></div><div></div></div>";

        XmlDocument document = new XmlDocument();
        document.LoadXml(markup);
        Console.WriteLine(document.InnerXml);

        foreach (XmlElement element in document.GetElementsByTagName("div"))
        {
            // GetAttribute returns an empty string when the attribute is
            // absent, so elements without an id are skipped here as well.
            if (element.GetAttribute("id") != "content")
            {
                continue;
            }

            Console.WriteLine(element.InnerXml);
            break;
        }
    }
}
using System;
using System.Text.RegularExpressions;
/// <summary>
/// Demo: reads HTML from standard input and prints every complete div element
/// whose opening tag is exactly <c>&lt;div id="content"&gt;</c>, including any
/// nested div elements.
/// </summary>
class Test
{
    /// <summary>Reads all of standard input, applies the pattern, and writes each match to the console.</summary>
    static void Main()
    {
        string input = Console.In.ReadToEnd();

        // (?six): "." also matches newlines, matching ignores case, and
        // whitespace inside the pattern is ignored.
        // The inner text is captured in the named group "MyCont". The group
        // "div" works as a stack: nested opening tags push, closing tags pop,
        // and (?(div)(?!)) rejects the match if the stack is not empty.
        string contentDivPattern = @"(?six)<div\s+id=""content"">
(?'MyCont'
(?>
(?!<div\b|</div>).
|
<div(?:\s+(?:""[^""]*""|'[^']*'|[^""'>])*)?>(?'div')
|
</div>(?'-div')
)*
(?(div)(?!))
)
</div>";

        MatchCollection hits = Regex.Matches(input, contentDivPattern);
        for (int i = 0; i < hits.Count; i++)
        {
            // Prints the whole element; only the full match is written,
            // not the "MyCont" group.
            Console.WriteLine(hits[i].Value);
        }
    }
}