110,567
社区成员
发帖
与我相关
我的任务
分享
/// <summary>
/// Small regex-based helper that downloads an HTML page, strips a few inline
/// tags and builds a tree of the <c>table</c> and <c>div</c> elements found in it.
/// It is not a general HTML parser: only <c>table</c> and <c>div</c> tags are recognised.
/// </summary>
public class SimpleHtmlParser
{
    // Matches one opening or closing <table ...> / <div ...> tag.
    // Group 1 captures table tags, group 2 captures div tags, so the kind of tag is
    // known from the group that matched rather than from searching the tag text.
    // The match consumes the tag (no look-ahead), so a tag-like fragment inside
    // another tag's attributes is never reported as a separate tag.
    // \b stops names such as <divider> or <tablet> from matching.
    private static readonly Regex TagRegex = new Regex(
        @"(</?table\b.*?>)|(</?div\b.*?>)",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Matches the whole <head>...</head> section, including line breaks inside it.
    private static readonly Regex HeadRegex = new Regex(
        @"<head(?=[\s>])[^>]*>.*?</head\s*>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // Matches opening/closing font, span and a tags only. The look-ahead requires
    // the tag name to end there, so <abbr>, <address>, <article>... are left alone.
    private static readonly Regex InlineTagRegex = new Regex(
        @"</?(?:font|span|a)(?=[\s/>])[^>]*>",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Parses the <c>table</c> and <c>div</c> tags of an HTML string into an element tree.
    /// </summary>
    /// <param name="s">The HTML text to parse.</param>
    /// <param name="elements">
    /// Receives every element whose closing tag was found, in the order the closing
    /// tags appear (inner elements therefore come before the elements that contain them).
    /// </param>
    /// <returns>
    /// A synthetic root element whose <c>Children</c> are the top-level elements.
    /// The root itself has no tag and its <c>OuterHtml</c>/<c>InnerHtml</c> are never set.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="s"/> is null.</exception>
    /// <remarks>
    /// A closing tag always closes the most recently opened element, whatever its tag
    /// name; mismatched nesting is not repaired. Elements that are never closed stay in
    /// the tree but are not added to <paramref name="elements"/> and keep unset
    /// <c>OuterHtml</c>/<c>InnerHtml</c>. Closing tags with no open element are ignored.
    /// </remarks>
    public static Element ParseHtml(string s, out List<Element> elements)
    {
        if (s == null)
        {
            throw new System.ArgumentNullException("s");
        }

        elements = new List<Element>();
        Stack<Element> openElements = new Stack<Element>();
        Element root = new Element();
        Element current = root;

        foreach (Match match in TagRegex.Matches(s))
        {
            string tag = match.Value;
            bool isTable = match.Groups[1].Success;

            // Every match starts with '<', so a closing tag has '/' as its second character.
            bool isClosing = tag.Length > 1 && tag[1] == '/';

            if (!isClosing)
            {
                // Self-closing tags (<div/>, <div />, <div class="x"/>) have no matching
                // closing tag; pushing them would unbalance the stack.
                if (tag.EndsWith("/>", System.StringComparison.Ordinal))
                {
                    continue;
                }

                // New tag.
                Element opened;
                if (isTable)
                {
                    opened = new TableElement();
                }
                else
                {
                    opened = new DivElement();
                }

                opened.StartTagIndex = match.Index;
                opened.StartTagLength = match.Length;
                opened.BegTag = tag;

                // Attach to the innermost element that is still open.
                opened.Parent = current;
                current.Children.Add(opened);
                current = opened;

                openElements.Push(opened);
            }
            else
            {
                // Stray closing tag with nothing open: ignore it instead of
                // letting Stack.Pop throw InvalidOperationException.
                if (openElements.Count == 0)
                {
                    continue;
                }

                // Closing tag: finish the most recently opened element.
                Element closed = openElements.Pop();
                closed.EndTag = tag;
                closed.EndIndex = match.Index;
                closed.EndTagLength = match.Length;
                current = closed.Parent;

                int innerStart = closed.StartTagIndex + closed.StartTagLength;
                closed.OuterHtml = s.Substring(
                    closed.StartTagIndex,
                    (closed.EndIndex - closed.StartTagIndex) + closed.EndTagLength);
                closed.InnerHtml = s.Substring(innerStart, closed.EndIndex - innerStart);

                elements.Add(closed);
            }
        }

        return root;
    }

    /// <summary>
    /// Removes markup that is of no use to the parser: the whole <c>head</c> section
    /// and the opening/closing <c>font</c>, <c>span</c> and <c>a</c> tags
    /// (the text between those inline tags is kept).
    /// </summary>
    /// <param name="s">The HTML text to clean.</param>
    /// <returns>The cleaned HTML text.</returns>
    public static string ReplaceFontSpan(string s)
    {
        s = HeadRegex.Replace(s, "");
        s = InlineTagRegex.Replace(s, "");
        return s;
    }

    /// <summary>
    /// Downloads the source of a web page with an HTTP GET request.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>
    /// The response body decoded with <c>System.Text.Encoding.Default</c>,
    /// or an empty string if anything goes wrong (best effort; errors are not reported).
    /// </returns>
    public static string DownLoadHtml(string url)
    {
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            // HTTP method names are case-sensitive; send the canonical upper-case token.
            request.Method = "GET";

            // Dispose the response, its stream and the reader so the connection is released.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream body = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(body, System.Text.Encoding.Default))
            {
                return reader.ReadToEnd();
            }
        }
        catch
        {
            // Deliberate best effort: any failure yields an empty page.
            return "";
        }
    }
}
/// <summary>
/// A tag found by <c>SimpleHtmlParser.ParseHtml</c>: stores the text and position of
/// its start and end tags in the parsed string and its place in the element tree.
/// </summary>
public class Element : StringElement
{
    /// <summary>Elements opened while this element was the innermost open one.</summary>
    public List<Element> Children = new List<Element>();

    /// <summary>The element this one was added to as a child; not set on the tree root.</summary>
    public Element Parent { get; set; }

    /// <summary>Text of the start tag.</summary>
    public string BegTag { get; set; }

    /// <summary>Index in the parsed string at which the start tag begins.</summary>
    public int StartTagIndex { get; set; }

    /// <summary>Length of the start tag in characters.</summary>
    public int StartTagLength { get; set; }

    /// <summary>Text of the end tag.</summary>
    public string EndTag { get; set; }

    /// <summary>Index in the parsed string at which the end tag begins.</summary>
    public int EndIndex { get; set; }

    /// <summary>Length of the end tag in characters.</summary>
    public int EndTagLength { get; set; }
}
/// <summary>Element type the parser creates for a <c>div</c> tag; adds no members of its own.</summary>
public class DivElement : Element { }
/// <summary>Element type the parser creates for a <c>table</c> tag; adds no members of its own.</summary>
public class TableElement : Element { }
/// <summary>
/// Element subtype with no members of its own. Presumably meant for <c>tr</c> tags;
/// nothing in the visible code creates it (NOTE(review): confirm whether it is still needed).
/// </summary>
public class TrElement : Element { }
/// <summary>
/// Holds the two text views of a piece of markup.
/// </summary>
public class StringElement
{
    /// <summary>The markup including its own start and end tags.</summary>
    public string OuterHtml { get; set; }

    /// <summary>The markup between the start and end tags.</summary>
    public string InnerHtml { get; set; }
}
/// <summary>
/// Form load handler: downloads a search result page, strips unneeded tags, parses it,
/// then shows every closed element that has no children followed by a walk of the whole tree.
/// </summary>
private void Form1_Load(object sender, EventArgs e)
{
    string pageUrl = "http://www.baidu.com/s?wd=惠阳妇科病医院&rsv_bp=0&rsv_spt=3&inputT=21000";

    // Download the page and remove the tags the parser does not need.
    string html = SimpleHtmlParser.ReplaceFontSpan(SimpleHtmlParser.DownLoadHtml(pageUrl));

    List<Element> closedElements;
    Element root = SimpleHtmlParser.ParseHtml(html, out closedElements);

    // Show the elements that contain no child elements.
    foreach (Element candidate in closedElements)
    {
        if (candidate.Children.Count == 0)
        {
            MessageBox.Show(candidate.OuterHtml);
        }
    }

    // Walk the element tree.
    List(root);
}
/// <summary>
/// Walks the element tree depth-first, showing each element's <c>OuterHtml</c> in a
/// message box after all of its children have been shown.
/// </summary>
/// <param name="e">Root of the subtree to show.</param>
public void List(Element e)
{
    // An element without children simply skips the loop.
    foreach (Element child in e.Children)
    {
        List(child);
    }

    MessageBox.Show(e.OuterHtml);
}