介绍C#解析HTML的两种方法

在搜索引擎的开发中，我们需要对网页的Html内容进行检索，因此难免需要对Html进行解析：拆分每一个节点，并获取节点间的内容。本文介绍两种C#解析Html的方法。

创新互联专注于企业全网营销推广、网站重做改版、盘锦网站定制设计、自适应品牌网站建设、H5开发、电子商务商城网站建设、集团公司官网建设、外贸营销网站建设、高端网站制作、响应式网页设计等建站业务，价格优惠性价比高，为盘锦等各大城市提供网站开发制作服务。

C#解析Html的第一种方法：

用System.Net.WebClient下载Web Page存到本地文件或者String中，用正则表达式来分析。这个方法可以用在Web Crawler等需要分析很多Web Page的应用中。

估计这也是大家最直接，最容易想到的一个方法。

转自网上的一个实例：把所有的href都抽取出来：

 
 
 
 
  
  
  
  using System;    
  
  
  using System.Net;    
  
  
  using System.Text;    
  
  
  using System.Text.RegularExpressions;    
  
  
  namespace HttpGet
  {
      /// <summary>
      /// Console sample: downloads a web page and prints every href attribute
      /// that matches a regular expression.
      /// </summary>
      class Class1
      {
          /// <summary>
          /// Downloads http://www.google.com, decodes the response bytes as UTF-8 and
          /// writes each matched href attribute to the console, one per line.
          /// </summary>
          /// <param name="args">Command-line arguments; not used.</param>
          [STAThread]
          static void Main(string[] args)
          {
              byte[] page;

              // WebClient is disposable; release it as soon as the download has finished.
              using (WebClient client = new WebClient())
              {
                  page = client.DownloadData("http://www.google.com");
              }

              string content = System.Text.Encoding.UTF8.GetString(page);

              // Matches href="..." / href='...' whose value is an absolute http/https URL,
              // a "./" or "/" relative path, or a bare path, optionally followed by a
              // trailing slash or a simple key=value query string.
              string regex = "href=[\\\"\\\'](https?:\\/\\/|\\.\\/|\\/)?\\w+(\\.\\w+)*(\\/\\w+(\\.\\w+)?)*(\\/|\\?\\w*=\\w*(&\\w*=\\w*)*)?[\\\"\\\']";
              Regex re = new Regex(regex);

              foreach (Match match in re.Matches(content))
              {
                  Console.Write(match.Value + "\r\n");
              }
          }
      }
  }

一些爬虫的HTML解析中也是用的类似的方法。

C#解析Html的第二种方法：

利用Winista.Htmlparser.Net 解析Html。这是.NET平台下解析Html的开源代码，网上有源码下载，百度一下就能搜到，这里就不提供了。并且有英文的帮助文档。找不到的留下邮箱。

个人认为这是.net平台下解析html不错的解决方案，基本上能够满足我们对html的解析工作。

自己做了个实例：

 
 
 
 
  
  
  
  using System;    
  
  
  using System.Collections.Generic;    
  
  
  using System.ComponentModel;    
  
  
  using System.Data;    
  
  
  using System.Drawing;    
  
  
  using System.Linq;    
  
  
  using System.Text;    
  
  
  using System.Windows.Forms;    
  
  
  using Winista.Text.HtmlParser;    
  
  
  using Winista.Text.HtmlParser.Lex;    
  
  
  using Winista.Text.HtmlParser.Util;    
  
  
  using Winista.Text.HtmlParser.Tags;    
  
  
  using Winista.Text.HtmlParser.Filters;    
  
  
     
  
  
     
  
  
  namespace HTMLParser
  {
      /// <summary>
      /// Form that downloads the page chosen in the URL combo box, shows the raw HTML in a
      /// text box and renders the parsed node structure in a tree view.
      /// </summary>
      public partial class Form1 : Form
      {
          /// <summary>
          /// Initializes the designer-generated components and fills the URL combo box.
          /// </summary>
          public Form1()
          {
              InitializeComponent();
              AddUrl();
          }

          /// <summary>
          /// Click handler of the parse button: downloads the selected page, displays its
          /// HTML and rebuilds the tree view from the parsed nodes.
          /// </summary>
          /// <param name="sender">Event source; not used.</param>
          /// <param name="e">Event data; not used.</param>
          private void btnParser_Click(object sender, EventArgs e)
          {
              // Reset both views first so a failed download never leaves stale results on screen.
              txtHtmlWhole.Text = "";
              this.treeView1.Nodes.Clear();

              string html = DownloadSelectedPage();
              if (html == null)
              {
                  // Nothing was downloaded (the user has already been told why); skip parsing.
                  return;
              }

              txtHtmlWhole.Text = html;

              // Parse the displayed HTML and mirror the node list under a single "root" node.
              Lexer lexer = new Lexer(this.txtHtmlWhole.Text);
              Parser parser = new Parser(lexer);
              NodeList htmlNodes = parser.Parse(null);

              this.treeView1.Nodes.Add("root");
              TreeNode treeRoot = this.treeView1.Nodes[0];
              for (int i = 0; i < htmlNodes.Count; i++)
              {
                  this.RecursionHtmlNode(treeRoot, htmlNodes[i], false);
              }
          }

          /// <summary>
          /// Downloads the page whose URL is currently selected in the combo box.
          /// </summary>
          /// <returns>
          /// The downloaded HTML, or null when no URL is selected or the download failed.
          /// In both failure cases a message box has already been shown.
          /// </returns>
          private string DownloadSelectedPage()
          {
              object selected = CBUrl.SelectedItem;
              if (selected == null)
              {
                  // Without this guard the ToString() call below would throw a NullReferenceException.
                  MessageBox.Show("Please select a URL first.");
                  return null;
              }

              string url = selected.ToString().Trim();
              try
              {
                  // WebClient is disposable; the using statement releases it after the download.
                  using (System.Net.WebClient aWebClient = new System.Net.WebClient())
                  {
                      aWebClient.Encoding = System.Text.Encoding.Default;
                      return aWebClient.DownloadString(url);
                  }
              }
              catch (Exception ex)
              {
                  // Best-effort UI behaviour kept from the original: report any failure to the user.
                  MessageBox.Show(ex.Message);
                  return null;
              }
          }

          /// <summary>
          /// Adds a tree node for <paramref name="htmlNode"/> (when it is an opening tag) under
          /// <paramref name="treeNode"/>, then recurses into its children and, optionally, walks
          /// its following siblings.
          /// </summary>
          /// <param name="treeNode">Tree node that receives the entries created for this HTML node.</param>
          /// <param name="htmlNode">HTML node to render; the call is a no-op when it is null.</param>
          /// <param name="siblingRequired">
          /// When true, every following sibling of <paramref name="htmlNode"/> is also rendered
          /// under <paramref name="treeNode"/>.
          /// </param>
          private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
          {
              if (htmlNode == null || treeNode == null)
              {
                  return;
              }

              // Children attach to the tag's own tree node when one is created, otherwise to the parent.
              TreeNode current = treeNode;

              ITag tag = htmlNode as ITag;
              if (tag != null && !tag.IsEndTag())
              {
                  string nodeString = tag.TagName;
                  if (tag.Attributes != null && tag.Attributes.Count > 0)
                  {
                      object id = tag.Attributes["ID"];
                      if (id != null)
                      {
                          nodeString = nodeString + " { id=\"" + id.ToString() + "\" }";
                      }

                      object href = tag.Attributes["HREF"];
                      if (href != null)
                      {
                          nodeString = nodeString + " { href=\"" + href.ToString() + "\" }";
                      }
                  }

                  current = new TreeNode(nodeString);
                  treeNode.Nodes.Add(current);
              }

              // Content between the tags: render the children, then the first child's text.
              if (htmlNode.Children != null && htmlNode.Children.Count > 0)
              {
                  this.RecursionHtmlNode(current, htmlNode.FirstChild, true);

                  // NOTE(review): the text entry is added to treeNode (the parent), not to
                  // current, so it appears next to the tag rather than beneath it. Kept as in
                  // the original; confirm whether that placement is intended.
                  treeNode.Nodes.Add(new TreeNode(htmlNode.FirstChild.GetText()));
              }

              // Sibling nodes: each is rendered without walking its own siblings again,
              // because this loop already visits all of them.
              if (siblingRequired)
              {
                  INode sibling = htmlNode.NextSibling;
                  while (sibling != null)
                  {
                      this.RecursionHtmlNode(treeNode, sibling, false);
                      sibling = sibling.NextSibling;
                  }
              }
          }

          /// <summary>
          /// Fills the URL combo box with the sample sites.
          /// </summary>
          private void AddUrl()
          {
              CBUrl.Items.Add("http://www.hao123.com");
              CBUrl.Items.Add("http://www.sina.com");
              CBUrl.Items.Add("http://www.heuet.edu.cn");
          }
      }
  }

运行效果：

实现起来很容易，结合Winista.Htmlparser源码很快就可以实现想要的效果。

小结：

简单介绍了两种C#解析Html的方法，大家有什么其他好的方法还望指教。

本文题目：介绍C#解析HTML的两种方法
文章路径：http://www.mswzjz.com/qtweb/news20/185420.html

网站建设、网络推广公司-创新互联，是专注品牌与效果的网站制作，网络营销seo公司；服务项目有等

声明：本网站发布的内容（图片、视频和文字）以用户投稿、用户转载内容为主，如果涉及侵权请尽快告知，我们将会在第一时间删除。文章观点不代表本网站立场，如需处理请联系客服。电话：028-86922220；邮箱：631063699@qq.com。内容未经允许不得转载，或转载时需注明来源：创新互联

猜你还喜欢下面的内容