你的位置:首页 > ASP.net教程

[ASP.net教程]爬虫之博客园精华客户端


  在博客园学习知识是很方便的,但若做成客户端,自定义获取数据,那就更好啦!

  那么需求有哪些呢,第一,我只查看推荐数大于2的文章;第二,我想要只查看C#或者Java的文章;第三,我想要查看推荐数大于2的新闻;第四,我还想搜索文章,并且只搜索推荐数大于2的文章。

  先来预览一下成品吧

  其中列表里左边是推荐数,反正我是优先看推荐数多的,中间是标题,右边是日期,至于其他信息,额,我其实不太关心,点击一行后直接在浏览器打开。

  额,大体先这样吧,那么实现这些功能需要什么技能呢,首先我得准备一下通用类,大概需要web请求的帮助类、Gzip格式网页的压缩/解压帮助类、html字符串解析的帮助类。

  • web请求的帮助类:WebHelper
  /// <summary>
  /// Thin wrapper around <see cref="WebClient"/> that retries failed requests and
  /// returns null instead of throwing when a request cannot be completed.
  /// WebClient does not support concurrent operations on one instance, so use one
  /// WebHelper per thread.
  /// </summary>
  public class WebHelper
  {
    // Number of additional attempts after the first failure (3 attempts in total).
    private const int MaxRetries = 2;

    /// <summary>The underlying client; exposed so callers can tweak headers, proxy, etc.</summary>
    public readonly WebClient Web = new WebClient();

    /// <summary>Sets the text encoding used by the underlying client.</summary>
    public Encoding Encoding
    {
      set
      {
        Web.Encoding = value;
      }
    }

    /// <summary>Creates a helper whose client uses UTF-8.</summary>
    public WebHelper()
    {
      Web.Encoding = Encoding.UTF8;
    }

    /// <summary>Creates a helper whose client uses the given encoding.</summary>
    /// <param name="encoding">Encoding assigned to the underlying client.</param>
    public WebHelper(Encoding encoding)
    {
      Web.Encoding = encoding;
    }

    /// <summary>
    /// Downloads the resource at <paramref name="url"/> as a string.
    /// </summary>
    /// <param name="url">URL</param>
    /// <returns>
    /// The response text, or null when the failure is treated as permanent
    /// (see <see cref="IsPermanentFailure"/>) or all attempts failed.
    /// </returns>
    public string DownloadString(string url)
    {
      // The attempt counter is local, so a request that succeeds after a retry
      // cannot reduce the retry budget of later requests.
      for (int attempt = 0; ; attempt++)
      {
        try
        {
          return Web.DownloadString(url);
        }
        catch (WebException e)
        {
          if (IsPermanentFailure(e) || attempt >= MaxRetries)
          {
            return null;
          }
        }
      }
    }

    /// <summary>
    /// POSTs <paramref name="data"/> to <paramref name="address"/> as
    /// application/x-www-form-urlencoded content.
    /// </summary>
    /// <param name="address">Target address.</param>
    /// <param name="data">Form-encoded request body.</param>
    /// <returns>The response text, or null when all attempts failed.</returns>
    public string UploadString(string address, string data)
    {
      for (int attempt = 0; ; attempt++)
      {
        // Set (not Add) so a retry can never append a second Content-Type value.
        Web.Headers.Set("Content-Type", "application/x-www-form-urlencoded");
        try
        {
          return Web.UploadString(address, "POST", data);
        }
        catch
        {
          // Deliberate best-effort: any failure is retried, then reported as null.
          if (attempt >= MaxRetries)
          {
            return null;
          }
        }
      }
    }

    /// <summary>
    /// Downloads a gzip-compressed resource and decodes it to a string.
    /// </summary>
    /// <param name="url">URL</param>
    /// <param name="encoding">Encoding of the decompressed page.</param>
    /// <returns>
    /// The decoded text, or null when the failure is treated as permanent
    /// (see <see cref="IsPermanentFailure"/>) or all attempts failed.
    /// Only WebException is handled here; other exceptions propagate.
    /// </returns>
    public string DownloadGzipString(string url, Encoding encoding)
    {
      try
      {
        for (int attempt = 0; ; attempt++)
        {
          // Set (not Add) so a retry can never produce "gzip,gzip".
          Web.Headers.Set("Accept-Encoding", "gzip");
          try
          {
            return encoding.GetString(ZipHelper.GzipDecompress(Web.DownloadData(url)));
          }
          catch (WebException e)
          {
            if (IsPermanentFailure(e) || attempt >= MaxRetries)
            {
              return null;
            }
          }
        }
      }
      finally
      {
        // Do not leak the gzip request header into later non-gzip downloads.
        Web.Headers.Remove("Accept-Encoding");
      }
    }

    /// <summary>
    /// Returns true for failures that are not retried: a message mentioning "404",
    /// a connect failure, or a protocol error.
    /// </summary>
    private static bool IsPermanentFailure(WebException e)
    {
      return e.Message.Contains("404")
             || e.Status == WebExceptionStatus.ConnectFailure
             || e.Status == WebExceptionStatus.ProtocolError;
    }
  }

这里有三个方法,其中的DownloadString和UploadString和.net Framework的WebClient的方法用法一样,多了一个DownloadGzipString方法,这个方法用于get一个用Gzip压缩的页面,之所以重复写DownloadString和UploadString是因为我懒,有时候请求网页出现异常并不是该网页不能请求,多请求几次就能获取,这里自动尝试3次请求,3次请求过后依然失败则返回null。当然还有一种情况是需要用代理的,考虑到需要用代理的地方不多,并且代理的IP端口一般需要花钱来买,这里就不贴用代理来请求页面的代码了,之前买过两天耍过代理,我那时候的实现思路就是加一个ProxyPool代理池类,代理池从代理网站获取当前可用的代理,一般是一次获取十几个,然后放入代理池,请求需要代理的网站时就去代理池获取代理,WebClient.Proxy = new WebProxy(host, port);加了这个再去请求页面就可以了,当然代理不一定可靠,所以当失败后不要灰心,再用其他代理试试,总有一个成功的,当需要多线程请求网页时,就new多个WebHelper类,他们都会共用一个ProxyPool代理池的。

  • Gzip格式网页的压缩/解压帮助类ZipHelper
  /// <summary>
  /// Helpers for compressing and decompressing byte arrays in gzip format.
  /// </summary>
  public class ZipHelper
  {
    /// <summary>
    /// Compresses a byte array with gzip.
    /// </summary>
    /// <param name="cbytes">Raw data to compress.</param>
    /// <returns>The gzip-compressed bytes.</returns>
    public static byte[] GzipCompress(byte[] cbytes)
    {
      using (var packed = new MemoryStream())
      {
        // The gzip stream must be disposed before reading the buffer so that
        // everything it still holds is flushed into the memory stream.
        using (var zipper = new GZipStream(packed, CompressionMode.Compress))
        {
          zipper.Write(cbytes, 0, cbytes.Length);
        }

        return packed.ToArray();
      }
    }

    /// <summary>
    /// Decompresses gzip data.
    /// </summary>
    /// <param name="cbytes">Gzip-compressed data.</param>
    /// <returns>The decompressed bytes.</returns>
    public static byte[] GzipDecompress(byte[] cbytes)
    {
      using (var unpacked = new MemoryStream())
      using (var source = new MemoryStream(cbytes))
      using (var unzipper = new GZipStream(source, CompressionMode.Decompress))
      {
        // Reading from the gzip stream decompresses on the fly.
        unzipper.CopyTo(unpacked);
        return unpacked.ToArray();
      }
    }
  }

  • html字符串解析的帮助类StringHelper
  /// <summary>
  /// Lightweight helpers for cutting values out of HTML/text by marker strings.
  /// A null input string, or a start index outside the string, is treated as "no match".
  /// </summary>
  public class StringHelper
  {
    // Any "<...>" span; closing tags such as "</a>" are matched by the same pattern.
    private static readonly Regex HtmlTagRegex = new Regex(@"<[^>]+>");

    // A \uXXXX escape with exactly four hex digits (IgnoreCase also accepts "\U").
    private static readonly Regex UnicodeEscapeRegex =
      new Regex(@"\\u([0-9a-f]{4})", RegexOptions.Compiled | RegexOptions.IgnoreCase);

    /// <summary>
    /// Collects every value found between <paramref name="startStr"/> and <paramref name="endStr"/>.
    /// </summary>
    /// <param name="str">Text to scan.</param>
    /// <param name="startStr">Marker that opens a value.</param>
    /// <param name="endStr">Marker that closes a value.</param>
    /// <param name="remove">true to strip both markers from each value; false to keep them.</param>
    /// <returns>The values in order of appearance; empty when nothing matches.</returns>
    public static List<string> GetList(string str, string startStr, string endStr, bool remove = true)
    {
      var lst = new List<string>();
      int startIndex = 0;
      while (true)
      {
        int previous = startIndex;
        string v = GetVal(str, startStr, endStr, remove, ref startIndex);
        if (startIndex == -1)
        {
          break;
        }
        lst.Add(v);
        if (startIndex <= previous)
        {
          // Empty markers do not move the cursor; stop instead of looping forever.
          break;
        }
      }
      return lst;
    }

    /// <summary>
    /// Returns the first value between <paramref name="startStr"/> and <paramref name="endStr"/>
    /// at or after <paramref name="startIndex"/>, or an empty string when there is none.
    /// </summary>
    public static string GetVal(string str, string startStr, string endStr, bool remove = true, int startIndex = 0)
    {
      return GetVal(str, startStr, endStr, remove, ref startIndex);
    }

    // Core of the marker-pair search. On success startIndex is moved just past the
    // end marker; on failure it is set to -1 and an empty string is returned.
    private static string GetVal(string str, string startStr, string endStr, bool remove, ref int startIndex)
    {
      if (!CanSearch(str, startIndex))
      {
        startIndex = -1;
        return string.Empty;
      }
      // Ordinal for both markers: they are markup fragments, and a culture-sensitive
      // match may not line up with the startStr.Length arithmetic below.
      int istart = str.IndexOf(startStr, startIndex, StringComparison.Ordinal);
      if (istart == -1)
      {
        startIndex = -1;
        return string.Empty;
      }
      int iend = str.IndexOf(endStr, istart + startStr.Length, StringComparison.Ordinal);
      if (iend == -1)
      {
        startIndex = -1;
        return string.Empty;
      }
      startIndex = iend + endStr.Length;
      if (remove)
      {
        istart += startStr.Length;
        return str.Substring(istart, iend - istart);
      }
      return str.Substring(istart, startIndex - istart);
    }

    /// <summary>
    /// Collects every fixed-length value that follows <paramref name="startStr"/>.
    /// </summary>
    /// <param name="str">Text to scan.</param>
    /// <param name="startStr">Marker that precedes a value.</param>
    /// <param name="needLength">Number of characters to take after the marker.</param>
    /// <param name="remove">true to strip the marker from each value; false to keep it.</param>
    /// <returns>The values in order of appearance; empty when nothing matches.</returns>
    public static List<string> GetList(string str, string startStr, int needLength, bool remove = true)
    {
      var lst = new List<string>();
      int startIndex = 0;
      while (true)
      {
        int previous = startIndex;
        string v = GetVal(str, startStr, needLength, remove, ref startIndex);
        if (startIndex == -1)
        {
          break;
        }
        lst.Add(v);
        if (startIndex <= previous)
        {
          // No progress (e.g. empty marker with zero length); stop instead of looping forever.
          break;
        }
      }
      return lst;
    }

    /// <summary>
    /// Returns the <paramref name="needLength"/> characters following the first
    /// <paramref name="startStr"/> at or after <paramref name="startIndex"/>,
    /// or an empty string when the marker is missing or the text is too short.
    /// </summary>
    public static string GetVal(string str, string startStr, int needLength, bool remove = true, int startIndex = 0)
    {
      return GetVal(str, startStr, needLength, remove, ref startIndex);
    }

    /// <summary>
    /// Same as the fixed-length overload, but reports the resume position through
    /// <paramref name="startIndex"/>: just past the value on success, -1 on failure.
    /// </summary>
    public static string GetVal(string str, string startStr, int needLength, bool remove, ref int startIndex)
    {
      if (!CanSearch(str, startIndex))
      {
        startIndex = -1;
        return string.Empty;
      }
      int istart = str.IndexOf(startStr, startIndex, StringComparison.Ordinal);
      if (istart == -1)
      {
        startIndex = -1;
        return string.Empty;
      }
      startIndex = istart + startStr.Length + needLength;
      if (startIndex > str.Length)
      {
        startIndex = -1;
        return string.Empty;
      }
      return remove
        ? str.Substring(istart + startStr.Length, needLength)
        : str.Substring(istart, startStr.Length + needLength);
    }

    /// <summary>
    /// Returns the value of every href="..." attribute in the text.
    /// </summary>
    /// <param name="str">Text to scan.</param>
    public static List<string> GetUrls(string str)
    {
      return GetList(str, "href=\"", "\"");
    }

    /// <summary>
    /// Returns the value of the first href="..." attribute in the text, or an empty string.
    /// </summary>
    /// <param name="str">Text to scan.</param>
    public static string GetUrl(string str)
    {
      return GetVal(str, "href=\"", "\"");
    }

    /// <summary>
    /// Decodes the \uXXXX escapes found in <paramref name="str"/> and returns the decoded
    /// characters concatenated. Text outside the escapes is NOT copied to the result
    /// (unchanged from the original behavior).
    /// </summary>
    /// <param name="str">Text containing \uXXXX escapes.</param>
    /// <returns>The decoded characters; empty when the input is null or has no escapes.</returns>
    public static string ToGB2312(string str)
    {
      if (string.IsNullOrEmpty(str))
      {
        return string.Empty;
      }
      var decoded = new StringBuilder();
      foreach (Match m in UnicodeEscapeRegex.Matches(str))
      {
        // Append the UTF-16 code unit directly so that surrogate pairs written as two
        // escapes survive; decoding each half through Encoding.Unicode yields U+FFFD.
        decoded.Append((char) int.Parse(m.Groups[1].Value, NumberStyles.HexNumber, CultureInfo.InvariantCulture));
      }
      return decoded.ToString();
    }

    /// <summary>
    /// Removes every tag-like "&lt;...&gt;" span from the text.
    /// </summary>
    /// <param name="html">HTML fragment; null yields an empty string.</param>
    /// <returns>The text with tags removed.</returns>
    public static string RemoveHTMLTags(string html)
    {
      if (html == null)
      {
        return string.Empty;
      }
      return HtmlTagRegex.Replace(html, "");
    }

    // True when str can be searched from startIndex without IndexOf throwing.
    private static bool CanSearch(string str, int startIndex)
    {
      return str != null && startIndex >= 0 && startIndex <= str.Length;
    }
  }

View Code

这里主要包含了GetList、RemoveHTMLTags和GetVal方法,爬虫解析数据就靠他们了,具体的使用方法下面会有讲解。

 

  到这里通用类大体就介绍完了,现在开始实地施工。

  • 首先获取文章、获取新闻和查找文章的关键方法
    /// <summary>
    /// Requests one page of blog posts and lists those with more than 2 recommendations.
    /// </summary>
    /// <param name="pageIndex">Page number appended to the request body.</param>
    /// <returns>false when the request failed or the page contained no posts; otherwise true.</returns>
    private bool AddPost(int pageIndex)
    {
      var url = "https://www.cnblogs.com/mvc/AggSite/PostList.aspx";
      var html = _web.UploadString(url, GetUrl() + pageIndex);
      return AddAggSiteItems(html, false);
    }

    /// <summary>
    /// Requests one page of search results for the keyword in txtSearch and lists them.
    /// </summary>
    /// <param name="pageIndex">Page number of the search results.</param>
    /// <returns>false when the request failed or the page contained no results; otherwise true.</returns>
    private bool AddSearchPost(int pageIndex)
    {
      // Escape the keyword so spaces, '&', '#', non-ASCII text etc. cannot break the query string.
      var keyword = Uri.EscapeDataString(txtSearch.Text.Trim());
      var url = $"http://zzk.cnblogs.com/s/blogpost?Keywords={keyword}&pageindex={pageIndex}";
      var html = _web.DownloadGzipString(url, Encoding.UTF8);
      if (string.IsNullOrEmpty(html))
      {
        // The download helper reports failure as null.
        return false;
      }
      var posts = StringHelper.GetList(html, "\"searchItem", "\"searchItemInfo-comments");
      if (posts.Count == 0)
      {
        return false;
      }
      foreach (var item in posts)
      {
        // NOTE(review): unlike AddPost/AddNews, search results are not filtered by
        // recommendation count here - confirm whether that is intended.
        var diggnum = StringHelper.GetVal(item, ">推荐(", ")");
        var n = StringHelper.GetVal(item, "searchItemTitle\">", "</h3>");
        var title = StringHelper.RemoveHTMLTags(StringHelper.GetVal(n, "\">", "</a>"));
        var date = StringHelper.GetVal(item, "searchItemInfo-publishDate\">", "</span>");
        _urls.Add(StringHelper.GetUrl(n));
        lstPost.Items.Add($"{diggnum} {title} {date}");
      }
      return true;
    }

    /// <summary>
    /// Requests one page of news and lists the items with more than 2 recommendations.
    /// </summary>
    /// <param name="pageIndex">Page number appended to the request body.</param>
    /// <returns>false when the request failed or the page contained no items; otherwise true.</returns>
    private bool AddNews(int pageIndex)
    {
      var url = "https://www.cnblogs.com/mvc/AggSite/NewsList.aspx";
      // The request body is kept exactly as before (including the repeated ItemListActionName).
      var html = _web.UploadString(url, "CategoryId=-1&CategoryType=News&ItemListActionName=NewsList&ItemListActionName=NewsList&PageIndex=" + pageIndex);
      return AddAggSiteItems(html, true);
    }

    // Parses an aggregate-site list page (posts or news): every item with more than
    // 2 recommendations gets its link stored in _urls and a "count title time" row
    // added to lstPost. When completeSchemeLessLinks is true, links that do not
    // start with "http" are prefixed with "https:".
    private bool AddAggSiteItems(string html, bool completeSchemeLessLinks)
    {
      if (string.IsNullOrEmpty(html))
      {
        // The upload helper reports failure as null.
        return false;
      }
      var posts = StringHelper.GetList(html, "\"post_item", "\"article_comment");
      if (posts.Count == 0)
      {
        return false;
      }
      foreach (var item in posts)
      {
        var n = StringHelper.GetVal(item, "\"diggnum", "/span>");
        int diggnum;
        // Skip items whose count is missing or not numeric instead of throwing.
        if (!int.TryParse(StringHelper.GetVal(n, ">", "<"), out diggnum) || diggnum < 3)
        {
          continue;
        }
        var t = StringHelper.GetVal(item, "\"titlelnk", "/a>");
        var title = StringHelper.GetVal(t, ">", "<");
        var time = StringHelper.GetVal(item, "发布于 ", 16);
        var link = StringHelper.GetUrl(t);
        if (completeSchemeLessLinks && !link.StartsWith("http", StringComparison.OrdinalIgnoreCase))
        {
          link = "https:" + link;
        }
        _urls.Add(link);
        lstPost.Items.Add($"{diggnum} {title} {time}");
      }
      return true;
    }

  授人以鱼不如授人以渔,这些是怎么回事呢

在博客园首页按下F12,点击下一页,看看那些请求,瞄一瞄,就知道PostList.aspx是数据关键。里面的参数中CategoryId是分类ID;CategoryType是分类种类,按下面GetUrl的代码,它有SiteHome、TopSiteCategory和SiteCategory三个值:不选分类(首页)时是SiteHome,点击母分类时是TopSiteCategory,点击子分类时是SiteCategory;PageIndex是当前页,这个众所周知啦;ParentCategoryId是父分类的ID,只有点击子分类时需要把父分类的ID赋值到这个字段,其他情况为0。说了这么多,这个还只是获取文章的接口,另外两个查询文章的和获取新闻的也大同小异啦,大家自己研究。另外贴出的代码里有个GetUrl方法,这个就是为了赋值这些参数的,也贴出来吧

    // Parent category id for each cbbCate.SelectedIndex (0-9).
    private static readonly string[] ParentCategoryIds =
    {
      "108698", "2", "108701", "108703", "108704", "108705", "108709", "108712", "108724", "4"
    };

    // Child category ids: first index is cbbCate.SelectedIndex, second is cbbType.SelectedIndex.
    private static readonly string[][] ChildCategoryIds =
    {
      new[] { "18156", "108699", "108700", "108760", "108716", "108717", "108718", "108719", "108720", "108728", "108729", "108730", "108738", "108739", "108758" },
      new[] { "106876", "106880", "106882", "106877", "108696", "106894", "108735", "108746", "108748", "108751", "108752", "108753", "108742", "108754" },
      new[] { "106892", "108702", "106884", "108750" },
      new[] { "106883", "106893", "108731", "108737" },
      new[] { "78111", "50349", "106878", "108732", "108734", "108747", "108749", "3" },
      new[] { "108706", "108707", "108736", "108708", "106886" },
      new[] { "108710", "106891", "106889" },
      new[] { "108713", "108714", "108715", "108743", "108756", "106881" },
      new[] { "108721", "108725", "108726", "108755", "108757" },
      new[] { "807", "106879", "33909", "106885", "106895", "108759" }
    };

    /// <summary>
    /// Builds the request-body prefix for the post list from the selections in
    /// cbbCate and cbbType. The result ends with "PageIndex=" so the caller can
    /// append the page number.
    /// </summary>
    /// <returns>The form-encoded parameter string without the page number.</returns>
    private string GetUrl()
    {
      // Values used when cbbCate has no selection in the 0-9 range.
      string categoryId = "808";
      string categoryType = "SiteHome";
      string parentCategoryId = "0";

      int cateIndex = cbbCate.SelectedIndex;
      if (cateIndex >= 0 && cateIndex < ParentCategoryIds.Length)
      {
        string[] children = ChildCategoryIds[cateIndex];
        int typeIndex = cbbType.SelectedIndex;
        if (typeIndex >= 0 && typeIndex < children.Length)
        {
          // A known child category is selected: send it together with its parent.
          categoryId = children[typeIndex];
          categoryType = "SiteCategory";
          parentCategoryId = ParentCategoryIds[cateIndex];
        }
        else
        {
          // No known child selected: request the parent category itself,
          // leaving parentCategoryId at "0".
          categoryId = ParentCategoryIds[cateIndex];
          categoryType = "TopSiteCategory";
        }
      }

      return $"CategoryId={categoryId}&CategoryType={categoryType}&ParentCategoryId={parentCategoryId}&ItemListActionName=PostList&PageIndex=";
    }

View Code

  功能大体介绍完了,末了还有个小惊喜,就是提示框,怎么在Winform中弹出提示框,过段时间自动消失呢,像这样

其实这个不难,弄个定时器就好啦

但需要注意的是,怎么才能弹出提示在最顶层呢,不然看不到呢,其实把TopMost属性设为True就好了,另外ShowIcon、ShowInTaskbar、MaximizeBox和MinimizeBox也要设为false,StartPosition设为CenterScreen,这样才专业。

  由于刚弄成,难免会有疏忽和Bug,大家看到后请帮忙指正。附上代码:博客园精华客户端