111,094
社区成员




/// <summary>
/// Click handler for the Analyze button. Reads the HTML exposed by
/// <c>WebBrowser.DocumentStream</c>, writes a copy of it to a text file for inspection,
/// then parses the HTML and lists every TABLE node (with its descendants) in <c>treeView1</c>.
/// </summary>
/// <param name="sender">Source of the event; not used.</param>
/// <param name="e">Event data; not used.</param>
private void Analyze_Click(object sender, EventArgs e)
{
    // Location of the debugging dump of the downloaded HTML.
    const string dumpPath = "E:\\test.txt";

    #region Fetch the page HTML
    string html = "";
    try
    {
        // Dispose the reader (and the stream it wraps) as soon as the text has been read.
        // NOTE(review): assumes DocumentStream hands out a stream this method may close - confirm.
        using (StreamReader reader = new StreamReader(WebBrowser.DocumentStream, Encoding.Default))
        {
            html = reader.ReadToEnd();
        }

        // append: false creates the file or overwrites an existing one. The previous code used
        // FileMode.CreateNew when the file already existed, which always throws IOException.
        using (StreamWriter writer = new StreamWriter(dumpPath, false))
        {
            writer.Write(html);
            writer.WriteLine("测试");
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the problem and carry on with whatever HTML was obtained
        // (possibly the empty string), so the tree view is still refreshed below.
        MessageBox.Show(ex.Message);
    }
    #endregion

    #region Analyze the HTML nodes
    // The Lexer tokenizes the raw HTML (per the original author's note, the actual lexical
    // analysis inside Lexer is done by its NextCode() method); the Parser builds nodes from it.
    Lexer lexer = new Lexer(html);
    Parser parser = new Parser(lexer);

    // Keep only TABLE nodes. Passing null instead of a filter would return every node.
    NodeList htmlNodes = parser.Parse(new TagNameFilter("TABLE"));

    // Rebuild the tree view from scratch under a single "root" node.
    this.treeView1.Nodes.Clear();
    this.treeView1.Nodes.Add("root");
    TreeNode treeRoot = this.treeView1.Nodes[0];
    for (int i = 0; i < htmlNodes.Count; i++)
    {
        // Each matched node is expanded recursively; its siblings are not followed here
        // because the loop itself visits every match.
        this.RecursionHtmlNode(treeRoot, htmlNodes[i], false);
    }
    #endregion
}
/// <summary>
/// Recursively copies an HTML node into the tree view.
/// A tag node becomes a new <see cref="TreeNode"/> under <paramref name="treeNode"/>, labelled
/// with the tag name plus its ID / HREF / TD attributes when present. Children are then
/// processed under that new node (or under <paramref name="treeNode"/> for non-tag nodes),
/// and the text of the first child is appended to <paramref name="treeNode"/>.
/// </summary>
/// <param name="treeNode">Tree node that receives the generated nodes; ignored when null.</param>
/// <param name="htmlNode">HTML node to convert; ignored when null.</param>
/// <param name="siblingRequired">
/// When true, every following sibling of <paramref name="htmlNode"/> is processed as well,
/// each one attached to <paramref name="treeNode"/>.
/// </param>
private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
{
    // Both a source node and a destination node are required.
    if (htmlNode == null)
    {
        return;
    }
    if (treeNode == null)
    {
        return;
    }

    // Children hang off the incoming parent unless this node is a tag and gets its own entry.
    TreeNode attachPoint = treeNode;

    ITag tag = htmlNode as ITag;
    if (tag != null)
    {
        string label = tag.TagName;
        if (tag.Attributes != null && tag.Attributes.Count > 0)
        {
            // Attribute key -> text that introduces its value in the label.
            string[,] shownAttributes =
            {
                { "ID", " { id=\"" },
                { "HREF", " { href=\"" },
                { "TD", " { td = \"" }
            };
            for (int row = 0; row < shownAttributes.GetLength(0); row++)
            {
                object value = tag.Attributes[shownAttributes[row, 0]];
                if (value != null)
                {
                    label = label + shownAttributes[row, 1] + value.ToString() + "\" }";
                }
            }
        }

        attachPoint = new TreeNode(label);
        treeNode.Nodes.Add(attachPoint);
    }

    // Descend into the children: the first child pulls in all of its siblings.
    bool hasChildren = htmlNode.Children != null && htmlNode.Children.Count > 0;
    if (hasChildren)
    {
        this.RecursionHtmlNode(attachPoint, htmlNode.FirstChild, true);

        // NOTE(review): the first child's text is attached to the incoming parent rather than
        // to the node created above - confirm this placement is intended.
        treeNode.Nodes.Add(new TreeNode(htmlNode.FirstChild.GetText()));
    }

    // Walk the remaining siblings; each call handles one sibling only (no further chaining).
    if (siblingRequired)
    {
        for (INode next = htmlNode.NextSibling; next != null; next = next.NextSibling)
        {
            this.RecursionHtmlNode(treeNode, next, false);
        }
    }
}
实际的运行情况如下: