你的位置:首页 > ASP.net教程

[ASP.net教程]dotNet使用HttpWebRequest模拟浏览器


在编写网络爬虫时,HttpWebRequest几乎可以完成绝大多数网站的抓取。为了更好地使用这一技术,我将常用的几个功能进行了封装,以方便调用。这个类已经在多个项目中得到使用,主要解决了Cookies相关的一些问题;如果有其它方面的问题可以提出来,我会进一步完善。

目前HttpHelper包含了以下几个方面:

  • GetHttpContent:通过Get或Post来获取网页的Html
  • SetCookie:根据response中头部的set-cookie对cookie进行设置,能识别httponly
  • GetAllCookies:将CookieContainer转换为键值对,方便存储和跨程序间调用
  • ConvertToCookieContainer:将键值对转换回CookieContainer供程序调用
  • BuildPostData:通过一个需要post的html构建出postdata

代码如下:

 1 using System; 2 using System.Collections.Generic; 3 using System.Collections.Specialized; 4 using System.IO; 5 using System.IO.Compression; 6 using System.Linq; 7 using System.Net; 8 using System.Net.Security; 9 using System.Security.Cryptography.X509Certificates; 10 using System.Text; 11 using System.Text.RegularExpressions; 12 using System.Collections; 13 using HtmlAgilityPack; 14  15 namespace TNIdea.Common.Helper 16 { 17   public class HttpHelper 18   { 19     public const string CharsetReg = @"(meta.*?charset=""?(?<Charset>[^\s""'>]+)""?)|("; 20  21     /// <summary> 22     /// 获取网页的内容 23     /// </summary> 24     /// <param name="url">Url</param> 25     /// <param name="postData">Post的信息</param> 26     /// <param name="cookies">Cookies</param> 27     /// <param name="userAgent">浏览器标识</param> 28     /// <param name="referer">来源页</param> 29     /// <param name="cookiesDomain">Cookies的Domian参数,配合cookies使用;为空则取url的Host</param> 30     /// <param name="encode">编码方式,用于解析html</param> 31     /// <returns></returns> 32     public static string GetHttpContent(string url, string postData = null, CookieContainer cookies = null, string userAgent = "", string referer = "", string cookiesDomain = "", Encoding encode = null) 33     { 34       try 35       { 36         HttpWebResponse httpResponse = null; 37         if (!string.IsNullOrWhiteSpace(postData)) 38           httpResponse = CreatePostHttpResponse(url, postData, cookies: cookies, userAgent: userAgent, referer: referer); 39         else 40           httpResponse = CreateGetHttpResponse(url, cookies: cookies, userAgent: userAgent, referer: referer); 41  42         #region 根据Html头判断 43         string Content = null; 44         //缓冲区长度 45         const int N_CacheLength = 10000; 46         //头部预读取缓冲区,字节形式 47         var bytes = new List<byte>(); 48         int count = 0; 49         //头部预读取缓冲区,字符串 50         String cache = string.Empty; 51  52         //创建流对象并解码 53         Stream ResponseStream; 54         switch 
(httpResponse.ContentEncoding.ToUpperInvariant()) 55         { 56           case "GZIP": 57             ResponseStream = new GZipStream( 58               httpResponse.GetResponseStream(), CompressionMode.Decompress); 59             break; 60           case "DEFLATE": 61             ResponseStream = new DeflateStream( 62               httpResponse.GetResponseStream(), CompressionMode.Decompress); 63             break; 64           default: 65             ResponseStream = httpResponse.GetResponseStream(); 66             break; 67         } 68  69         try 70         { 71           while ( 72             !(cache.EndsWith("</head>", StringComparison.OrdinalIgnoreCase) 73              || count >= N_CacheLength)) 74           { 75             var b = (byte)ResponseStream.ReadByte(); 76             if (b < 0) //end of stream 77             { 78               break; 79             } 80             bytes.Add(b); 81  82             count++; 83             cache += (char)b; 84           } 85  86  87           if (encode == null) 88           { 89             try 90             { 91               if (httpResponse.CharacterSet == "ISO-8859-1" || httpResponse.CharacterSet == "zh-cn") 92               { 93                 Match match = Regex.Match(cache, CharsetReg, RegexOptions.IgnoreCase | RegexOptions.Multiline); 94                 if (match.Success) 95                 { 96                   try 97                   { 98                     string charset = match.Groups["Charset"].Value; 99                     encode = Encoding.GetEncoding(charset);100                   }101                   catch { }102                 }103                 else104                   encode = Encoding.GetEncoding("GB2312");105               }106               else107                 encode = Encoding.GetEncoding(httpResponse.CharacterSet);108             }109             catch { }110           }111 112           //缓冲字节重新编码,然后再把流读完113           var Reader = new StreamReader(ResponseStream, 
encode);114           Content = encode.GetString(bytes.ToArray(), 0, count) + Reader.ReadToEnd();115           Reader.Close();116         }117         catch (Exception ex)118         {119           return ex.ToString();120         }121         finally122         {123           httpResponse.Close();124         }125         #endregion 根据Html头判断126 127         //获取返回的Cookies,支持httponly128         if (string.IsNullOrWhiteSpace(cookiesDomain))129           cookiesDomain = httpResponse.ResponseUri.Host;130 131         cookies = new CookieContainer();132         CookieCollection httpHeaderCookies = SetCookie(httpResponse, cookiesDomain);133         cookies.Add(httpHeaderCookies ?? httpResponse.Cookies);134 135         return Content;136       }137       catch138       {139         return string.Empty;140       }141     }142 143 144     /// <summary>145     /// 创建GET方式的HTTP请求 146     /// </summary>147     /// <param name="url"></param>148     /// <param name="timeout"></param>149     /// <param name="userAgent"></param>150     /// <param name="cookies"></param>151     /// <param name="referer"></param>152     /// <returns></returns>153     public static HttpWebResponse CreateGetHttpResponse(string url, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")154     {155       HttpWebRequest request = null;156       if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))157       {158         //对服务端证书进行有效性校验(非第三方权威机构颁发的证书,如自己生成的,不进行验证,这里返回true)159         ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);160         request = WebRequest.Create(url) as HttpWebRequest;161         //request.ProtocolVersion = HttpVersion.Version10;  //http版本,默认是1.1,这里设置为1.0162       }163       else164       {165         request = WebRequest.Create(url) as HttpWebRequest;166       }167 168       request.Referer = referer;169       request.Method = "GET";170 171       
//设置代理UserAgent和超时172       if (string.IsNullOrWhiteSpace(userAgent))173         userAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36";174 175       request.UserAgent = userAgent;176       request.Timeout = timeout;177       request.KeepAlive = true;178       request.AllowAutoRedirect = true;179 180       if (cookies == null)181         cookies = new CookieContainer();182       request.CookieContainer = cookies;183 184       return request.GetResponse() as HttpWebResponse;185     }186 187     /// <summary>188     /// 创建POST方式的HTTP请求189     /// </summary>190     /// <param name="url"></param>191     /// <param name="postData"></param>192     /// <param name="timeout"></param>193     /// <param name="userAgent"></param>194     /// <param name="cookies"></param>195     /// <param name="referer"></param>196     /// <returns></returns>197     public static HttpWebResponse CreatePostHttpResponse(string url, string postData, int timeout = 60000, string userAgent = "", CookieContainer cookies = null, string referer = "")198     {199       HttpWebRequest request = null;200       //如果是发送HTTPS请求 201       if (url.StartsWith("https", StringComparison.OrdinalIgnoreCase))202       {203         ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);204         request = WebRequest.Create(url) as HttpWebRequest;205         //request.ProtocolVersion = HttpVersion.Version10;206       }207       else208       {209         request = WebRequest.Create(url) as HttpWebRequest;210       }211       request.Referer = referer;212       request.Method = "POST";213       request.ContentType = "application/x-www-form-urlencoded";214 215       //设置代理UserAgent和超时216       if (string.IsNullOrWhiteSpace(userAgent))217         request.UserAgent = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36";218       
else219         request.UserAgent = userAgent;220       request.Timeout = timeout;221       request.KeepAlive = true;222       request.AllowAutoRedirect = true;223 224       if (cookies == null)225         cookies = new CookieContainer();226       request.CookieContainer = cookies;227 228       //发送POST数据 229       if (!string.IsNullOrWhiteSpace(postData))230       {231         byte[] data = Encoding.UTF8.GetBytes(postData);232         request.ContentLength = data.Length;233         using (Stream stream = request.GetRequestStream())234         {235           stream.Write(data, 0, data.Length);236         }237       }238       //string[] values = request.Headers.GetValues("Content-Type");239       return request.GetResponse() as HttpWebResponse;240     }241 242     /// <summary>243     /// 验证证书244     /// </summary>245     /// <param name="sender"></param>246     /// <param name="certificate"></param>247     /// <param name="chain"></param>248     /// <param name="errors"></param>249     /// <returns>是否验证通过</returns>250     private static bool CheckValidationResult(object sender, X509Certificate certificate, X509Chain chain, SslPolicyErrors errors)251     {252       if (errors == SslPolicyErrors.None)253         return true;254       return false;255     }256 257     /// <summary>258     /// 根据response中头部的set-cookie对request中的cookie进行设置259     /// </summary>260     /// <param name="setCookie">The set cookie.</param>261     /// <param name="defaultDomain">The default domain.</param>262     /// <returns></returns>263     private static CookieCollection SetCookie(HttpWebResponse response, string defaultDomain)264     {265       try266       {267         string[] setCookie = response.Headers.GetValues("Set-Cookie");268 269         // there is bug in it,the datetime in "set-cookie" will be sepreated in two pieces.270         List<string> a = new List<string>(setCookie);271         for (int i = setCookie.Length - 1; i > 0; i--)272         {273           if 
(a[i].Substring(a[i].Length - 3) == "GMT")274           {275             a[i - 1] = a[i - 1] + ", " + a[i];276             a.RemoveAt(i);277             i--;278           }279         }280         setCookie = a.ToArray<string>();281         CookieCollection cookies = new CookieCollection();282         foreach (string str in setCookie)283         {284           NameValueCollection hs = new NameValueCollection();285           foreach (string i in str.Split(';'))286           {287             int index = i.IndexOf("=");288             if (index > 0)289               hs.Add(i.Substring(0, index).Trim(), i.Substring(index + 1).Trim());290             else291               switch (i)292               {293                 case "HttpOnly":294                   hs.Add("HttpOnly", "True");295                   break;296                 case "Secure":297                   hs.Add("Secure", "True");298                   break;299               }300           }301           Cookie ck = new Cookie();302           foreach (string Key in hs.AllKeys)303           {304             switch (Key.ToLower().Trim())305             {306               case "path":307                 ck.Path = hs[Key];308                 break;309               case "expires":310                 ck.Expires = DateTime.Parse(hs[Key]);311                 break;312               case "domain":313                 ck.Domain = hs[Key];314                 break;315               case "httpOnly":316                 ck.HttpOnly = true;317                 break;318               case "secure":319                 ck.Secure = true;320                 break;321               default:322                 ck.Name = Key;323                 ck.Value = hs[Key];324                 break;325             }326           }327           if (ck.Domain == "") ck.Domain = defaultDomain;328           if (ck.Name != "") cookies.Add(ck);329         }330         return cookies;331       }332       catch333       {334         return null;335  
     }336     }337 338     /// <summary>339     /// 遍历CookieContainer340     /// </summary>341     /// <param name="cookieContainer"></param>342     /// <returns>List of cookie</returns>343     public static Dictionary<string, string> GetAllCookies(CookieContainer cookieContainer)344     {345       Dictionary<string, string> cookies = new Dictionary<string, string>();346 347       Hashtable table = (Hashtable)cookieContainer.GetType().InvokeMember("m_domainTable",348         System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.GetField |349         System.Reflection.BindingFlags.Instance, null, cookieContainer, new object[] { });350 351       foreach (string pathList in table.Keys)352       {353         StringBuilder _cookie = new StringBuilder();354         SortedList cookieColList = (SortedList)table[pathList].GetType().InvokeMember("m_list",355           System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.GetField356           | System.Reflection.BindingFlags.Instance, null, table[pathList], new object[] { });357         foreach (CookieCollection colCookies in cookieColList.Values)358           foreach (Cookie c in colCookies)359             _cookie.Append(c.Name + "=" + c.Value + ";");360 361         cookies.Add(pathList, _cookie.ToString().TrimEnd(';'));362       }363       return cookies;364     }365 366     /// <summary>367     /// convert cookies string to CookieContainer368     /// </summary>369     /// <param name="cookies"></param>370     /// <returns></returns>371     public static CookieContainer ConvertToCookieContainer(Dictionary<string, string> cookies)372     {373       CookieContainer cookieContainer = new CookieContainer();374 375       foreach (var cookie in cookies)376       {377         string[] strEachCookParts = cookie.Value.Split(';');378         int intEachCookPartsCount = strEachCookParts.Length;379 380         foreach (string strCNameAndCValue in strEachCookParts)381         {382           if 
(!string.IsNullOrEmpty(strCNameAndCValue))383           {384             Cookie cookTemp = new Cookie();385             int firstEqual = strCNameAndCValue.IndexOf("=");386             string firstName = strCNameAndCValue.Substring(0, firstEqual);387             string allValue = strCNameAndCValue.Substring(firstEqual + 1, strCNameAndCValue.Length - (firstEqual + 1));388             cookTemp.Name = firstName;389             cookTemp.Value = allValue;390             cookTemp.Path = "/";391             cookTemp.Domain = cookie.Key;392             cookieContainer.Add(cookTemp);393           }394         }395       }396       return cookieContainer;397     }398 399     public static string BuildPostData(string htmlContent)400     {401       HtmlDocument htmlDoc = new HtmlDocument();402       htmlDoc.LoadHtml(htmlContent);403       //Get the form node collection.404       HtmlNode htmlNode = htmlDoc.DocumentNode.SelectSingleNode("//form");405       HtmlNodeCollection htmlInputs = htmlNode.SelectNodes("//input");406 407       StringBuilder postData = new StringBuilder();408 409       foreach (HtmlNode input in htmlInputs)410       {411         if(input.Attributes["value"] != null)412           postData.Append(input.Attributes["name"].Value + "=" + input.Attributes["value"].Value + "&");413       }414       return postData.ToString().TrimEnd('&');415     }416   }417 }

部分网站需要登录的问题我已经着手通过另一个项目来解决(imitate-login),目前还有许多网页使用了JavaScript或各种基于JS的框架来对网页进行数据加载,如何来模拟执行JavaScript暂时还没找到比较优美的解决方案,如果大家有什么好的方案可以发给我,谢谢!

 未经授权,拒绝任何全文及摘要转载!