你的位置:首页 > ASP.net教程

[ASP.net教程]C#写爬虫,版本V2.1


  这次是对2.0的小修补。2.0几乎没有交互,这次添加了进度条和文本框。另外,下载抓取到的链接时,最常出现的错误是 webResponse 错误(GetResponse 抛出 WebException)。

针对这种情况,加入了如下的 try/catch:

 // Keep a response object even when GetResponse() throws: on WebException, fall back to the response attached to the exception; later code inspects it before continuing.
 try        {          webResponse = (HttpWebResponse)webRequest.GetResponse();        }        catch(WebException ex)        {          webResponse = (HttpWebResponse)ex.Response;        }

捕获异常后,这里不做额外处理,只是取出异常中附带的响应;后续直接判断它是否为 null 以及 StatusCode 属性,来决定是否还要执行下面的程序。

另外一点变化是:以前是先把获取到的网页保存到文本文件中,这次改为:

WebRequest myRequest = WebRequest.Create("http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1466307565574_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=" + Uri.EscapeDataString(keyWord));      HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse();      if (myResponse.StatusCode == HttpStatusCode.OK)      {        Stream strm = myResponse.GetResponseStream();        StreamReader sr = new StreamReader(strm);        string line = sr.ReadToEnd();

即直接把整个页面内容读入一个 string 中。

最后一点是去掉了DownloadPage这个方法,如上,它的功能可以放入按钮的单击事件中实现,没有必要把一件事做两遍。

下面是前台页面:

后台代码:

using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace 百度图片爬虫V2._1
{
    /// <summary>
    /// Main window of the Baidu image crawler: searches Baidu Images for the keyword typed in
    /// <c>textBox1</c>, downloads every image link found in the result page on background threads,
    /// stores each image in the database and reports progress in <c>progressBar1</c> / <c>textBox2</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Signature of the background download worker (see <see cref="savePicture"/>).</summary>
        /// <param name="s">Absolute URL of the image to download.</param>
        /// <param name="i">Zero-based index of the link within the current search result.</param>
        public delegate void AsynFunction(string s, int i);

        /// <summary>Baidu image-search URL; the URL-escaped keyword is appended to it.</summary>
        private const string SearchUrlPrefix = "http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1466307565574_R&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=";

        /// <summary>Matches absolute http:// links; built once instead of on every call.</summary>
        private static readonly Regex LinkRegex =
            new Regex(@"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?", RegexOptions.IgnoreCase);

        /// <summary>Image extensions that mark a link as downloadable.</summary>
        private static readonly string[] ImageExtensions = { ".jpg", ".gif", ".png" };

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Extracts every http:// link from <paramref name="html"/> that looks like an image.
        /// </summary>
        /// <param name="html">Raw page text; null or empty yields no links.</param>
        /// <param name="counts">Receives the number of image links found.</param>
        /// <returns>The image links, in page order; the array holds exactly <paramref name="counts"/> items.</returns>
        private static string[] getLinks(string html, out int counts)
        {
            counts = 0;
            if (string.IsNullOrEmpty(html))
            {
                return new string[0];
            }

            MatchCollection matches = LinkRegex.Matches(html);
            string[] links = new string[matches.Count];
            int found = 0;
            foreach (Match match in matches)
            {
                if (isValiable(match.Value))
                {
                    links[found] = match.Value;
                    found++;
                }
            }

            // Drop the unused tail so callers never see null entries.
            Array.Resize(ref links, found);
            counts = found;
            return links;
        }

        /// <summary>
        /// Search button handler: fetches the result page for the keyword, then starts one
        /// background download per image link and advances the progress bar as each one finishes.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            string keyWord = this.textBox1.Text;
            string html;
            try
            {
                WebRequest myRequest = WebRequest.Create(SearchUrlPrefix + Uri.EscapeDataString(keyWord));
                using (HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse())
                {
                    if (myResponse.StatusCode != HttpStatusCode.OK)
                    {
                        return;
                    }

                    using (Stream strm = myResponse.GetResponseStream())
                    using (StreamReader sr = new StreamReader(strm))
                    {
                        html = sr.ReadToEnd();
                    }
                }
            }
            catch (WebException ex)
            {
                // A failed search (network down, HTTP error) is reported instead of
                // escaping from the event handler as an unhandled exception.
                this.textBox2.Text += Environment.NewLine + ex.Message;
                return;
            }

            int counts;
            string[] links = getLinks(html, out counts);
            this.progressBar1.Value = 0;
            this.progressBar1.Maximum = counts;

            for (int i = 0; i < counts; i++)
            {
                string link = links[i];
                AsynFunction fun = new AsynFunction(savePicture);
                fun.BeginInvoke(link, i, ar =>
                {
                    string status = "下载结束";
                    try
                    {
                        fun.EndInvoke(ar);
                    }
                    catch (Exception ex)
                    {
                        // EndInvoke rethrows whatever savePicture threw. This callback runs on a
                        // thread-pool thread, so anything left unhandled would terminate the
                        // process; report the failure in the log box instead.
                        status = link + " " + ex.Message;
                    }

                    runOnUi(() =>
                    {
                        // Progress counts finished downloads, one step per link, so it moves
                        // monotonically no matter in which order the workers complete.
                        if (this.progressBar1.Value < this.progressBar1.Maximum)
                        {
                            this.progressBar1.Value++;
                        }
                        this.textBox2.Text += Environment.NewLine + status;
                    });
                }, fun);
            }
        }

        /// <summary>
        /// Tells whether <paramref name="url"/> points at an image (contains .jpg, .gif or .png).
        /// The check ignores case, matching the case-insensitive link regex.
        /// </summary>
        private static bool isValiable(string url)
        {
            if (string.IsNullOrEmpty(url))
            {
                return false;
            }

            foreach (string extension in ImageExtensions)
            {
                if (url.IndexOf(extension, StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Downloads the image at <paramref name="path"/>, re-encodes it as JPEG and inserts it
        /// into the database. Called on a background thread; UI updates are marshalled to the UI thread.
        /// </summary>
        /// <param name="path">Absolute image URL; null, empty or non-image URLs are ignored.</param>
        /// <param name="i">Index of the link in the search result. Kept for compatibility with
        /// <see cref="AsynFunction"/>; progress is now advanced by the completion callback.</param>
        public void savePicture(string path, int i)
        {
            if (!isValiable(path))
            {
                return;
            }

            // Announce the download before it actually starts.
            runOnUi(() =>
            {
                this.textBox2.Text += path + "图片下载开始" + Environment.NewLine;
            });

            HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(new Uri(path));
            webRequest.Referer = "http://image.baidu.com";
            webRequest.Timeout = 30000; // connection timeout, in milliseconds
            webRequest.AllowAutoRedirect = true;
            webRequest.Headers.Set("Pragma", "no-cache");
            webRequest.UserAgent = "Mozilla-Firefox-Spider(Wenanry)";

            HttpWebResponse webResponse;
            try
            {
                webResponse = (HttpWebResponse)webRequest.GetResponse();
            }
            catch (WebException ex)
            {
                // Error statuses arrive as exceptions; the response (if any) rides along on it.
                webResponse = ex.Response as HttpWebResponse;
            }

            if (webResponse == null)
            {
                return;
            }

            // Dispose the response on every path, including non-OK statuses.
            using (webResponse)
            {
                if (webResponse.StatusCode != HttpStatusCode.OK)
                {
                    return;
                }

                // The Bitmap needs its source stream alive, so the stream is declared first
                // and therefore disposed last.
                using (Stream body = webResponse.GetResponseStream())
                using (Bitmap myImage = new Bitmap(body))
                using (MemoryStream ms = new MemoryStream())
                using (DataClasses1DataContext db = new DataClasses1DataContext())
                {
                    myImage.Save(ms, System.Drawing.Imaging.ImageFormat.Jpeg);
                    var p = new pictureUrl
                    {
                        pictureUrl1 = ms.ToArray()
                    };
                    db.pictureUrl.InsertOnSubmit(p);
                    db.SubmitChanges();
                }
            }
        }

        /// <summary>Exit button handler: closes the window.</summary>
        private void button2_Click(object sender, EventArgs e)
        {
            this.Close();
        }

        /// <summary>
        /// Queues <paramref name="action"/> on the UI thread. Does nothing when the form has
        /// already been closed, so late download callbacks cannot throw.
        /// </summary>
        private void runOnUi(Action action)
        {
            if (this.IsDisposed || !this.IsHandleCreated)
            {
                return;
            }

            try
            {
                this.BeginInvoke(action);
            }
            catch (InvalidOperationException)
            {
                // The form was closed between the check above and the call; nothing left to update.
            }
        }
    }
}