你的位置:首页 > ASP.net教程

[ASP.net教程]C#写爬虫,版本V2.0


这个版本主要是以百度图片为对象,对其进行爬虫操作,实现了最基本的下载功能,但是缺陷非常多,日后还会对其进行改进。

打开百度图片,同时打开开发者工具,我们会发现,百度图片是通过如下的一段ajax来加载图片的。

http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1466428638972_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%94%90%E5%AB%A3&f=3&oq=tangyan&rsp=0

这里,我们只需了解:word参数后面跟的就是经过URL编码(UTF-8)的搜索关键字,例如上面的 %E5%94%90%E5%AB%A3 就是"唐嫣"。知道了这一点,事情就比较好办了:结合一部分V1.0的代码,很快就可以开发出来,原理和V1.0类似。

后台代码如下:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using Newtonsoft.Json.Linq;
using Newtonsoft.Json;
using System.Text.RegularExpressions;

namespace 针对百度图片的动态网页爬虫
{
    /// <summary>
    /// Main form of the crawler: searches Baidu Images for the keyword typed
    /// into <c>keyWords</c>, extracts every image URL found in the returned
    /// HTML and stores each downloaded image (re-encoded as JPEG) through
    /// <c>DataClasses1DataContext</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        // Search URL without the keyword; the URL-encoded keyword is appended to "word=".
        private const string SearchUrlPrefix = "http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1466307565574_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=";

        // Debug dump of the last fetched page, relative to the working directory.
        private const string PageDumpPath = "../../txt.txt";

        // Built once instead of on every getLinks call.
        private static readonly Regex LinkRegex = new Regex(
            @"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?",
            RegexOptions.IgnoreCase);

        // URLs already stored during this process lifetime, so the same image
        // is not downloaded and inserted twice.
        private static readonly HashSet<string> savedUrls = new HashSet<string>(StringComparer.Ordinal);

        // Number of images stored successfully since the application started.
        static int count = 0;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Click handler: fetches the search result page(s) for the keyword and
        /// downloads every image referenced in them, then reports the total.
        /// </summary>
        private void btnDo_Click(object sender, EventArgs e)
        {
            const int pageCount = 2;

            // The keyword is user input and may contain non-ASCII characters,
            // spaces, '&' or '#'; it must be escaped before going into the query string.
            string keyword = Uri.EscapeDataString(this.keyWords.Text);

            // NOTE(review): the request URL does not depend on the page index, so
            // every iteration fetches the same result page. The savedUrls set keeps
            // that from producing duplicate rows; real paging needs an offset
            // parameter - confirm which one the endpoint expects before adding it.
            for (int i = 0; i < pageCount; i++)
            {
                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(SearchUrlPrefix + keyword);
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    {
                        if (response.StatusCode != HttpStatusCode.OK)
                        {
                            MessageBox.Show("获取第" + (i + 1) + "页失败:" + response.StatusCode);
                            continue;
                        }

                        using (Stream stream = response.GetResponseStream())
                        {
                            // Download every image referenced by this page.
                            DownloadPage(stream);
                        }
                    }
                }
                catch (Exception ex)
                {
                    // UI boundary: report the failure instead of swallowing it,
                    // then carry on with the next page.
                    LogError("page " + (i + 1), ex);
                    MessageBox.Show("获取第" + (i + 1) + "页失败:" + ex.Message);
                }
            }

            MessageBox.Show("执行成功,共" + count.ToString() + "图片");
        }

        /// <summary>
        /// Extracts all http:// URLs from <paramref name="html"/> that look like
        /// image resources (see <see cref="isValiable"/>).
        /// </summary>
        /// <param name="html">Page source to scan; may be null or empty.</param>
        /// <returns>
        /// The matching URLs in document order. Never null and never contains
        /// null entries; empty when nothing matches.
        /// </returns>
        private static string[] getLinks(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return new string[0];
            }

            return LinkRegex.Matches(html)
                .Cast<Match>()
                .Select(m => m.Value)
                .Where(u => isValiable(u))
                .ToArray();
        }

        /// <summary>
        /// Reads one result page from <paramref name="stream"/>, dumps it to
        /// <c>txt.txt</c> for inspection, and saves every image it links to.
        /// A failure on a single image is logged and does not stop the rest.
        /// </summary>
        /// <param name="stream">Response stream of the search page; it is consumed and closed.</param>
        private void DownloadPage(Stream stream)
        {
            string html;
            using (StreamReader reader = new StreamReader(stream))
            {
                html = reader.ReadToEnd();
            }

            try
            {
                // WriteAllText truncates an existing file, so a shorter page never
                // leaves stale content from a previous run at the end of the dump.
                File.WriteAllText(PageDumpPath, html);
            }
            catch (IOException ex)
            {
                // The dump is only a debugging aid; do not abort the download for it.
                LogError("page dump", ex);
            }
            catch (UnauthorizedAccessException ex)
            {
                LogError("page dump", ex);
            }

            foreach (string link in getLinks(html))
            {
                if (savedUrls.Contains(link))
                {
                    continue;
                }

                try
                {
                    savePicture(link);
                    savedUrls.Add(link);
                    count++; // count only images that were actually stored
                }
                catch (Exception ex)
                {
                    if (!IsRecoverableDownloadError(ex))
                    {
                        throw;
                    }
                    LogError(link, ex);
                }
            }

            this.label2.Text = count.ToString();
        }

        /// <summary>
        /// Tells whether <paramref name="url"/> refers to a .jpg, .gif or .png
        /// resource (case-insensitive substring check).
        /// </summary>
        private static bool isValiable(string url)
        {
            if (string.IsNullOrEmpty(url))
            {
                return false;
            }

            return url.IndexOf(".jpg", StringComparison.OrdinalIgnoreCase) >= 0
                || url.IndexOf(".gif", StringComparison.OrdinalIgnoreCase) >= 0
                || url.IndexOf(".png", StringComparison.OrdinalIgnoreCase) >= 0;
        }

        /// <summary>
        /// Downloads the image at <paramref name="path"/>, re-encodes it as JPEG
        /// and inserts the bytes as a new <c>pictureUrl</c> row.
        /// Does nothing when the URL does not look like an image.
        /// </summary>
        /// <param name="path">Absolute image URL.</param>
        /// <exception cref="UriFormatException">The URL is malformed.</exception>
        /// <exception cref="WebException">The download failed.</exception>
        /// <exception cref="ArgumentException">The response body is not a decodable image.</exception>
        private static void savePicture(string path)
        {
            // Check before touching the network so non-image URLs cost nothing.
            if (!isValiable(path))
            {
                return;
            }

            HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(new Uri(path));
            // The Referer header is set to the Baidu Images site for every image request.
            webRequest.Referer = "http://image.baidu.com";

            using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
            using (Stream body = webResponse.GetResponseStream())
            using (Bitmap myImage = new Bitmap(body))
            using (MemoryStream ms = new MemoryStream())
            using (DataClasses1DataContext db = new DataClasses1DataContext())
            {
                myImage.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
                var p = new pictureUrl
                {
                    pictureUrl1 = ms.ToArray()
                };
                db.pictureUrl.InsertOnSubmit(p);
                db.SubmitChanges();
            }
        }

        // True for failures that concern one image only (bad URL, network error,
        // undecodable data); anything else, e.g. a database error, is left to the caller.
        private static bool IsRecoverableDownloadError(Exception ex)
        {
            return ex is WebException
                || ex is UriFormatException
                || ex is ArgumentException
                || ex is IOException
                || ex is System.Runtime.InteropServices.ExternalException;
        }

        // Writes a failure to the trace listeners so it is not lost silently.
        private static void LogError(string context, Exception ex)
        {
            System.Diagnostics.Trace.WriteLine("[crawler] " + context + ": " + ex);
        }
    }
}

演示效果:

这个程序只是解决了有无的问题,还存在许多缺陷,以后会继续改进。