C#下解析HTML的两种方法介绍

在搜索引擎的开发中，我们需要对html进行解析。本文介绍c#解析html的两种方法。
ad：
在搜索引擎的开发中，我们需要对网页的html内容进行检索，难免的就需要对html进行解析。拆分每一个节点并且获取节点间的内容。此文介绍两种c#解析html的方法。
c#解析html的第一种方法：
用System.Net.WebClient下载网页（web page），存到本地文件或者string中，再用正则表达式来分析。这个方法可以用在web crawler等需要分析很多网页的应用中。
估计这也是大家最直接，最容易想到的一个方法。
转自网上的一个实例：把网页中所有的href都抽取出来：
using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;

namespace HttpGet
{
    /// <summary>
    /// Console sample: downloads one web page and prints every href attribute
    /// that matches a regular expression.
    /// </summary>
    class Class1
    {
        /// <summary>
        /// Downloads the page at http://www.google.com, decodes the bytes as UTF-8,
        /// and writes each matching <c>href="..."</c> / <c>href='...'</c> attribute
        /// to the console, one per line.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        [STAThread]
        static void Main(string[] args)
        {
            byte[] page;

            // WebClient is IDisposable; release it as soon as the download is done.
            using (WebClient client = new WebClient())
            {
                page = client.DownloadData("http://www.google.com");
            }

            string content = Encoding.UTF8.GetString(page);

            // Matches href attributes whose value is an absolute (http://),
            // relative (./) or root-relative (/) path, optionally followed by a
            // trailing slash or a simple key=value query string.
            string pattern = "href=[\\\"\\\'](http:\\/\\/|\\.\\/|\\/)?\\w+(\\.\\w+)*(\\/\\w+(\\.\\w+)?)*(\\/|\\?\\w*=\\w*(&\\w*=\\w*)*)?[\\\"\\\']";
            Regex re = new Regex(pattern);

            foreach (Match match in re.Matches(content))
            {
                Console.Write(match.Value + "\r\n");
            }
        }
    }
}
c#解析html的第二种方法：
利用Winista.HtmlParser.Net解析html。这是.net平台下解析html的开源代码，网上有源码下载，百度一下就能搜到，这里就不提供了。并且有英文的帮助文档。找不到的留下邮箱。
个人认为这是.net平台下解析html不错的解决方案，基本上能够满足我们对html的解析工作。
自己做了个实例：
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Tags;
using Winista.Text.HtmlParser.Filters;

namespace HtmlParser
{
    /// <summary>
    /// Sample form: downloads the page selected in a combo box, shows its raw HTML
    /// in a text box, and displays the parsed node structure in a tree view.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the controls (cbUrl, txtHtmlWhole, treeView1) and the wiring of
    /// btnParser_Click live in the designer half of this partial class, which is not
    /// shown here. Their spelling is assumed; confirm against the designer file.
    /// </remarks>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
            AddUrl();
        }

        /// <summary>
        /// Downloads the selected URL into the text box, then parses the text box
        /// content and rebuilds the tree view from the resulting nodes.
        /// </summary>
        private void btnParser_Click(object sender, EventArgs e)
        {
            #region Fetch the page HTML
            try
            {
                txtHtmlWhole.Text = "";
                string url = cbUrl.SelectedItem.ToString().Trim();

                // WebClient is IDisposable; release it once the download finishes.
                using (System.Net.WebClient aWebClient = new System.Net.WebClient())
                {
                    aWebClient.Encoding = System.Text.Encoding.Default;
                    string html = aWebClient.DownloadString(url);
                    txtHtmlWhole.Text = html;
                }
            }
            catch (Exception ex)
            {
                // Any failure (including no item selected) is reported to the user;
                // parsing below then runs on whatever text the box holds.
                MessageBox.Show(ex.Message);
            }
            #endregion

            #region Parse the HTML into nodes
            Lexer lexer = new Lexer(this.txtHtmlWhole.Text);
            Parser parser = new Parser(lexer);
            NodeList htmlNodes = parser.Parse(null);

            this.treeView1.Nodes.Clear();
            this.treeView1.Nodes.Add("root");
            TreeNode treeRoot = this.treeView1.Nodes[0];

            for (int i = 0; i < htmlNodes.Count; i++)
            {
                this.RecursionHtmlNode(treeRoot, htmlNodes[i], false);
            }
            #endregion
        }

        /// <summary>
        /// Adds a tree node for <paramref name="htmlNode"/> under
        /// <paramref name="treeNode"/> and recurses into its children.
        /// </summary>
        /// <param name="treeNode">Tree node that receives the new entries.</param>
        /// <param name="htmlNode">Parsed HTML node to display.</param>
        /// <param name="siblingRequired">
        /// When true, the following siblings of <paramref name="htmlNode"/> are
        /// also added under <paramref name="treeNode"/>.
        /// </param>
        private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
        {
            if (htmlNode == null || treeNode == null)
            {
                return;
            }

            TreeNode current = treeNode;

            // Current node: only opening tags get their own tree entry.
            ITag tag = htmlNode as ITag;
            if (tag != null && !tag.IsEndTag())
            {
                string nodeString = tag.TagName;
                if (tag.Attributes != null && tag.Attributes.Count > 0)
                {
                    // NOTE(review): confirm the key casing Winista uses for attribute
                    // names; if it stores them upper-cased these lookups never match.
                    if (tag.Attributes["id"] != null)
                    {
                        nodeString = nodeString + " { id=\"" + tag.Attributes["id"].ToString() + "\" }";
                    }
                    if (tag.Attributes["href"] != null)
                    {
                        nodeString = nodeString + " { href=\"" + tag.Attributes["href"].ToString() + "\" }";
                    }
                }
                current = new TreeNode(nodeString);
                treeNode.Nodes.Add(current);
            }

            // Content between the tags: descend into the first child (which pulls in
            // its siblings), then add the first child's text next to the current node.
            if (htmlNode.Children != null && htmlNode.Children.Count > 0)
            {
                this.RecursionHtmlNode(current, htmlNode.FirstChild, true);
                TreeNode content = new TreeNode(htmlNode.FirstChild.GetText());
                treeNode.Nodes.Add(content);
            }

            // The sibling nodes.
            if (siblingRequired)
            {
                INode sibling = htmlNode.NextSibling;
                while (sibling != null)
                {
                    this.RecursionHtmlNode(treeNode, sibling, false);
                    sibling = sibling.NextSibling;
                }
            }
        }

        /// <summary>
        /// Fills the URL combo box with the sample sites.
        /// </summary>
        private void AddUrl()
        {
            cbUrl.Items.Add("http://www.hao123.com");
            cbUrl.Items.Add("http://www.sina.com");
            cbUrl.Items.Add("http://www.heuet.edu.cn");
        }
    }
}
运行效果：
实现起来很容易，结合winista.htmlparser源码很快就可以实现想要的效果。
小结：
简单介绍了两种c#解析html的方法，大家有什么其他好的方法还望指教。
更多c#下解析html的两种方法介绍。

C#下解析HTML的两种方法介绍

VIP推荐