你的位置:首页 > 软件开发 > ASP.net > 正则表达式相关:C# 抓取网页类(获取网页中所有信息)

正则表达式相关:C# 抓取网页类(获取网页中所有信息)

发布时间:2016-04-29 22:00:11
类的代码: using System; using System.Data; using System.Configuration; using System.Net; using System.IO; using System.Text; ...

类的代码:

 using System;
using System.Data;
using System.Configuration;
using System.Net;
using System.IO;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
using System.Web.UI.MobileControls;

/// <summary>
/// Downloads a single web page and exposes its title, raw HTML, plain text
/// and hyperlinks. The download happens once, in the constructor; check
/// <see cref="IsGood"/> before relying on the other members.
/// </summary>
public class WebPage
{
    #region Private members

    // Responses that declare a Content-Length above this (4 MiB) are rejected.
    private const long MaxContentLength = 1 << 22;

    // Number of '?' / U+FFFD characters at which a decoded page is treated as
    // mis-decoded and the default-encoding text is used instead.
    private const int MisdecodeThreshold = 100;

    private static readonly Regex TitleRegex = new Regex(
        @"<title[^>]*>(?<title>[\s\S]*?)</title[^>]*>", RegexOptions.IgnoreCase);

    // Both link patterns expose the target as "url"; the anchor pattern also
    // exposes the visible text as "text". Group names are case-sensitive.
    private static readonly Regex AnchorRegex = new Regex(
        @"<a\shref\s*=""(?<url>[^""]*).*?>(?<text>[^<]*)</a>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);
    private static readonly Regex FrameRegex = new Regex(
        "<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s)])+)(\"|')?[^>]*>",
        RegexOptions.IgnoreCase);

    private static readonly Regex LinkTextNoiseRegex = new Regex(
        "(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", RegexOptions.IgnoreCase);

    private static readonly Regex ScriptRegex = new Regex(
        @"<script[^>]*>[\s\S]*?</script[^>]*>", RegexOptions.IgnoreCase);
    private static readonly Regex StyleRegex = new Regex(
        @"<style[^>]*>[\s\S]*?</style[^>]*>", RegexOptions.IgnoreCase);
    private static readonly Regex SelectRegex = new Regex(
        @"<select[^>]*>[\s\S]*?</select[^>]*>", RegexOptions.IgnoreCase);
    private static readonly Regex AnchorBlockRegex = new Regex(
        @"<a[^>]*>[\s\S]*?</a[^>]*>", RegexOptions.IgnoreCase);
    private static readonly Regex TagOrNbspRegex = new Regex(
        "(<[^>]+?>)|&nbsp;", RegexOptions.IgnoreCase);
    private static readonly Regex WhitespaceRunRegex = new Regex("(\\s)+");

    // Accepts charset=utf-8, charset="utf-8" and charset='utf-8'.
    private static readonly Regex MetaCharsetRegex = new Regex(
        @"charset\s*=\s*[""']?(?<cding>[\w\-]+)", RegexOptions.IgnoreCase);

    private Uri m_uri;              // final URL of the page (after redirects)
    private List<Link> m_links;     // links found on this page
    private bool m_linksParsed;     // true once m_html has been scanned for links
    private string m_title;         // page title, extracted lazily
    private string m_html;          // raw HTML
    private string m_outstr;        // plain text of the page, extracted lazily
    private bool m_good;            // whether the page was downloaded successfully
    private int m_pagesize;         // length of m_html in characters

    // One cookie container per host, shared by every WebPage instance.
    private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();

    #endregion

    #region Properties

    /// <summary>
    /// Absolute URL of this page; empty when the URL passed to the constructor could not be parsed.
    /// </summary>
    public string URL
    {
        get
        {
            return m_uri == null ? string.Empty : m_uri.AbsoluteUri;
        }
    }

    /// <summary>
    /// Trimmed content of the first &lt;title&gt; element, or an empty string when there is none.
    /// </summary>
    public string Title
    {
        get
        {
            if (m_title == "")
            {
                Match mc = TitleRegex.Match(m_html);
                if (mc.Success)
                {
                    m_title = mc.Groups["title"].Value.Trim();
                }
            }
            return m_title;
        }
    }

    /// <summary>
    /// Raw HTML of the page; never null.
    /// </summary>
    public string M_html
    {
        get
        {
            if (m_html == null)
            {
                m_html = "";
            }
            return m_html;
        }
    }

    /// <summary>
    /// All links (anchors and frames) found on this page.
    /// </summary>
    public List<Link> Links
    {
        get
        {
            return getLinks();
        }
    }

    /// <summary>
    /// Complete plain text of the page.
    /// </summary>
    public string Context
    {
        get
        {
            if (m_outstr == "")
            {
                getContext(Int16.MaxValue);
            }
            return m_outstr;
        }
    }

    /// <summary>
    /// Size of the page, measured as the character count of the decoded HTML.
    /// </summary>
    public int PageSize
    {
        get
        {
            return m_pagesize;
        }
    }

    /// <summary>
    /// Links on this page whose URL points at the same host as the page (http or https).
    /// </summary>
    public List<Link> InsiteLinks
    {
        get
        {
            if (m_uri == null)
            {
                return new List<Link>();
            }
            // The host is escaped so that '.' is literal, and the lookahead stops
            // "example.com" from also matching "example.com.other.org".
            string pattern = "^https?://" + Regex.Escape(m_uri.Host) + "(?=[:/?#]|$)";
            return getSpecialLinksByUrl(pattern, Int16.MaxValue);
        }
    }

    /// <summary>
    /// True when the page was downloaded and decoded successfully.
    /// </summary>
    public bool IsGood
    {
        get
        {
            return m_good;
        }
    }

    /// <summary>
    /// Host name of the site this page belongs to; empty when the URL could not be parsed.
    /// </summary>
    public string Host
    {
        get
        {
            return m_uri == null ? string.Empty : m_uri.Host;
        }
    }

    #endregion

    /// <summary>
    /// Scans the HTML for anchors and (i)frames and fills the link list on first use.
    /// </summary>
    /// <returns>The list of links found on the page.</returns>
    private List<Link> getLinks()
    {
        if (m_linksParsed)
        {
            return m_links;
        }
        m_linksParsed = true;

        Regex[] patterns = new Regex[] { AnchorRegex, FrameRegex };
        foreach (Regex pattern in patterns)
        {
            for (Match match = pattern.Match(m_html); match.Success; match = match.NextMatch())
            {
                try
                {
                    // Resolve relative targets against the page URL.
                    string url = HttpUtility.UrlDecode(new Uri(m_uri, match.Groups["url"].Value).AbsoluteUri);

                    // Only anchors carry visible text; the "text" group is absent
                    // (and therefore empty) for frames.
                    string text = LinkTextNoiseRegex.Replace(match.Groups["text"].Value, "");

                    Link link = new Link();
                    link.Text = text;
                    link.NavigateUrl = url;
                    m_links.Add(link);
                }
                catch (Exception ex)
                {
                    // A single malformed href must not abort the whole scan.
                    Console.WriteLine(ex.Message);
                }
            }
        }
        return m_links;
    }

    /// <summary>
    /// Extracts plain text from HTML and returns its first characters.
    /// The extracted text is cached, so <paramref name="instr"/> and
    /// <paramref name="withLink"/> only take effect on the first call.
    /// </summary>
    /// <param name="instr">HTML source.</param>
    /// <param name="firstN">Maximum number of characters to return.</param>
    /// <param name="withLink">Whether text inside anchors is kept.</param>
    /// <returns>Plain text, truncated to <paramref name="firstN"/> characters.</returns>
    private string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (m_outstr == "")
        {
            string text = instr ?? "";
            text = ScriptRegex.Replace(text, "");
            text = StyleRegex.Replace(text, "");
            text = SelectRegex.Replace(text, "");
            if (!withLink)
            {
                text = AnchorBlockRegex.Replace(text, "");
            }
            // Drop remaining tags and &nbsp; entities, then collapse whitespace runs.
            text = TagOrNbspRegex.Replace(text, "");
            m_outstr = WhitespaceRunRegex.Replace(text, " ");
        }

        if (firstN <= 0)
        {
            return string.Empty;
        }
        return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose URL or text matches <paramref name="pattern"/>.
    /// </summary>
    private List<Link> FilterLinks(string pattern, int count, bool matchUrl)
    {
        getLinks();

        // Built once per call rather than once per link.
        Regex filter = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        List<Link> matches = new List<Link>();
        foreach (Link link in m_links)
        {
            if (matches.Count >= count)
            {
                break;
            }
            string candidate = matchUrl ? link.NavigateUrl : link.Text;
            if (filter.IsMatch(candidate ?? ""))
            {
                matches.Add(link);
            }
        }
        return matches;
    }

    #region Public methods

    /// <summary>
    /// Returns the first characters of the page's plain text, link text included.
    /// </summary>
    /// <param name="firstN">Maximum number of characters to return.</param>
    /// <returns>Plain text, truncated to <paramref name="firstN"/> characters.</returns>
    public string getContext(int firstN)
    {
        return getFirstNchar(m_html, firstN, true);
    }

    /// <summary>
    /// Returns links of this page whose URL matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression applied to each link URL (case-insensitive).</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links, in document order.</returns>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        return FilterLinks(pattern, count, true);
    }

    /// <summary>
    /// Returns links of this page whose visible text matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression applied to each link text (case-insensitive).</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links, in document order.</returns>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        return FilterLinks(pattern, count, false);
    }

    /// <summary>
    /// Searches the page's plain text with a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression; its first capture group is the result.</param>
    /// <returns>Value of capture group 1 of the first match, or an empty string.</returns>
    public string getSpecialWords(string pattern)
    {
        if (m_outstr == "")
        {
            getContext(Int16.MaxValue);
        }
        Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase);
        Match mc = regex.Match(m_outstr);
        return mc.Success ? mc.Groups[1].Value : string.Empty;
    }

    #endregion

    #region Construction

    /// <summary>
    /// Returns the charset value of a Content-Type header, or null when the header has none.
    /// </summary>
    private static string ExtractCharset(string contentType)
    {
        const string marker = "charset=";
        int ix = contentType.IndexOf(marker, StringComparison.OrdinalIgnoreCase);
        if (ix == -1)
        {
            return null;
        }
        string value = contentType.Substring(ix + marker.Length);
        int end = value.IndexOf(';');
        if (end != -1)
        {
            value = value.Substring(0, end);
        }
        return value.Trim().Trim('"', '\'');
    }

    /// <summary>
    /// Looks up an encoding by name, falling back to <see cref="Encoding.Default"/> for unknown names.
    /// </summary>
    private static Encoding GetEncodingOrDefault(string name)
    {
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return Encoding.Default;
        }
    }

    /// <summary>
    /// Reads a stream to its end and returns the bytes.
    /// </summary>
    private static byte[] ReadAllBytes(Stream source)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = source.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>
    /// Decodes bytes to text; a StreamReader is used so a leading byte-order mark is honoured.
    /// </summary>
    private static string Decode(byte[] raw, Encoding encoding)
    {
        using (StreamReader reader = new StreamReader(new MemoryStream(raw), encoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Heuristic: true when the text holds at least <see cref="MisdecodeThreshold"/> replacement characters.
    /// </summary>
    private static bool LooksMisdecoded(string text)
    {
        int suspicious = 0;
        foreach (char c in text)
        {
            if (c == '?' || c == '\uFFFD')
            {
                suspicious++;
                if (suspicious >= MisdecodeThreshold)
                {
                    return true;
                }
            }
        }
        return false;
    }

    /// <summary>
    /// Downloads and decodes the page. Any failure leaves the instance in a
    /// usable, empty state with <see cref="IsGood"/> set to false.
    /// </summary>
    /// <param name="_url">Absolute URL of the page.</param>
    private void Init(string _url)
    {
        // Establish a safe state before anything can throw, so the properties
        // never observe null fields.
        m_links = new List<Link>();
        m_linksParsed = false;
        m_html = "";
        m_outstr = "";
        m_title = "";
        m_pagesize = 0;
        m_good = false;

        try
        {
            m_uri = new Uri(_url);

            // Known binary downloads are not fetched at all.
            if (_url.EndsWith(".rar", StringComparison.OrdinalIgnoreCase)
                || _url.EndsWith(".dat", StringComparison.OrdinalIgnoreCase)
                || _url.EndsWith(".msi", StringComparison.OrdinalIgnoreCase))
            {
                return;
            }

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
            rqst.KeepAlive = true;
            rqst.Timeout = 10000;

            lock (WebPage.webcookies)
            {
                CookieContainer jar;
                if (!WebPage.webcookies.TryGetValue(m_uri.Host, out jar))
                {
                    jar = new CookieContainer();
                    WebPage.webcookies[m_uri.Host] = jar;
                }
                rqst.CookieContainer = jar;
            }

            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contentType = rsps.ContentType ?? "";
                if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase)
                    || rsps.ContentLength > MaxContentLength)
                {
                    return;
                }

                byte[] raw;
                using (Stream sm = rsps.GetResponseStream())
                {
                    raw = ReadAllBytes(sm);
                }

                string html;
                string headerCharset = ExtractCharset(contentType);
                if (headerCharset != null)
                {
                    // The server declared a charset; trust it.
                    // Depending on the site, HttpUtility.HtmlDecode may be wanted here.
                    html = Decode(raw, GetEncodingOrDefault(headerCharset));
                }
                else
                {
                    // No charset in the header: decode with the default encoding just
                    // to locate a charset declaration in the markup, then decode the
                    // original bytes with the declared encoding.
                    string provisional = Decode(raw, Encoding.Default);
                    Match declared = MetaCharsetRegex.Match(provisional);
                    Encoding cding = declared.Success
                        ? GetEncodingOrDefault(declared.Groups["cding"].Value)
                        : Encoding.Default;
                    html = Decode(raw, cding);
                    if (LooksMisdecoded(html))
                    {
                        html = provisional;
                    }
                }

                m_html = html;
                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri;
                m_good = true;
            }
        }
        catch (Exception ex)
        {
            // Bad URL, network error, timeout, unsupported scheme, ...:
            // the page is reported as unusable instead of failing silently.
            m_good = false;
            Console.WriteLine(ex.Message);
        }
    }

    /// <summary>
    /// Creates the page object and downloads the page immediately.
    /// </summary>
    /// <param name="_url">Absolute URL; percent-escapes are unescaped before the request.</param>
    public WebPage(string _url)
    {
        try
        {
            if (_url != null)
            {
                _url = Uri.UnescapeDataString(_url);
            }
        }
        catch (Exception)
        {
            // Unescaping is best-effort; fall back to the URL as given.
        }
        Init(_url);
    }

    #endregion
}

 

海外公司注册、海外银行开户、跨境平台代入驻、VAT、EPR等知识和在线办理:https://www.xlkjsw.com

原标题:正则表达式相关:C# 抓取网页类(获取网页中所有信息)

关键词:C#

C#
*特别声明:以上内容来自于网络收集,著作权属原作者所有,如有侵权,请联系我们: admin#shaoqun.com (#换成@)。

可能感兴趣文章

我的浏览记录