你的位置:首页 > ASP.net教程

[ASP.net教程]csharp:正则表达式采集网页数据


https://msdn.microsoft.com/zh-cn/library/system.text.regularexpressions.regex(v=vs.110).aspx

 // Area-code scraping demo: form event handlers plus the regex / HTML helper methods they rely on.

    /// <summary>
    /// Pattern for one normalized paragraph row of the stats.gov.cn listing.
    /// Captures the numeric code in group "code" and the area name in group "name".
    /// </summary>
    private const string StatsRowPattern = @"<p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>(?<code>\d+)<span>  </span></span><span style='line-height: 150%; font-family: 宋体; font-size: 12pt'>(?<name>\w*)</span></p>";

    /// <summary>
    /// Number of leading digits kept as the area code (the sample codes such as 110000 have six digits).
    /// </summary>
    private const int AreaCodeLength = 6;

    /// <summary>
    /// Matches one normalized table row; group "port" is the numeric code and group "proto" is the name.
    /// </summary>
    private static readonly Regex TableRowRegex = new Regex(
        @"<tr height='19' style='height:14.25pt;mso-height-source:userset;mso-height-alt:285;'>  <td class='xl67' height='19' style='height:14.25pt;'></td>  <td class='xl71' x:num>(?<port>\d+)</td>  <td class='xl71' x:str>(?<proto>\w+)</td>  <td class='xl67'></td>  <td class='xl70'></td>  <td class='xl70'></td>  <td class='xl70'></td>  <td colspan='3' style='mso-ignore:colspan;'></td>  </tr>",
        RegexOptions.Compiled);

    /// <summary>
    /// Matches one normalized paragraph row; group "port" is the numeric code and group "proto" is the name.
    /// </summary>
    private static readonly Regex ParagraphRowRegex = new Regex(
        @"<p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>(?<port>\d+)<span>  </span></span><span style='line-height: 150%; font-family: 宋体; font-size: 12pt'>(?<proto>\w*)</span></p>",
        RegexOptions.Compiled);

    /// <summary>
    /// Matches the scheme and the optional ":port" part at the start of a URL.
    /// </summary>
    private static readonly Regex ProtoPortRegex = new Regex(
        @"^(?<proto>\w+)://[^/]+?(?<port>:\d+)?/",
        RegexOptions.Compiled);

    /// <summary>
    /// Runs the two single-row extraction demos (showing each result in a message box) and then
    /// fills <c>dataGridView1</c> with the areas parsed from an embedded six-row HTML sample.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void StatsoneForm_Load(object sender, EventArgs e)
    {
        // Table-row sample.
        string s = @"<tr height='19' style='height:14.25pt;mso-height-source:userset;mso-height-alt:285;'>  <td class='xl67' height='19' style='height:14.25pt;'></td>  <td class='xl71' x:num>110000</td>  <td class='xl71' x:str>北京市</td>  <td class='xl67'></td>  <td class='xl70'></td>  <td class='xl70'></td>  <td class='xl70'></td>  <td colspan='3' style='mso-ignore:colspan;'></td>  </tr>";
        string f = ExtensionPost(s);
        MessageBox.Show(f);

        // Paragraph-row sample.
        string sb = @"<p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>110000<span>  </span></span><span style='line-height: 150%; font-family: 宋体; font-size: 12pt'>北京市</span></p>";
        string fb = ExtensionPostb(sb);
        MessageBox.Show(fb);

        // Six consecutive paragraph rows; each literal below is one row, concatenated without separators.
        string strhtml =
            @"<p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>110000<span>  </span></span><span style='line-height: 150%; font-family: 宋体; font-size: 12pt'>北京市</span></p>"
            + @"<p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>110100<span>  </span></span><span style='line-height: 150%; font-family: 宋体; font-size: 12pt'>市辖区</span></p>"
            + @"<p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>110101<span>  </span></span><span style='line-height: 150%; font-family: 宋体; font-size: 12pt'>东城区</span></p>"
            + @"<p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>110102<span>  </span></span><span style='line-height: 150%; font-family: 宋体; font-size: 12pt'>西城区</span></p>"
            + @"<p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>110105<span>  </span></span><span style='line-height: 150%; font-family: 宋体; font-size: 12pt'>朝阳区</span></p>"
            + @"<p class='msonormal' style='line-height: 150%'><span lang='en-us' style='line-height: 150%; font-family: 'times new roman', 'serif'; font-size: 12pt'>110106<span>  </span></span><span style='line-height: 150%; font-family: 宋体; font-size: 12pt'>丰台区</span></p>";

        IEnumerable<AreaHtmlValue> htmlValue = GetRegValue(StatsRowPattern, strhtml);
        this.dataGridView1.DataSource = BuildAreaList(htmlValue);
    }

    /// <summary>
    /// Projects extracted code/name pairs into <c>AreaInfo</c> rows for display.
    /// </summary>
    /// <param name="htmlValue">Values produced by <see cref="GetRegValue"/>.</param>
    /// <returns>One <c>AreaInfo</c> per extracted value, in the same order.</returns>
    private static List<AreaInfo> BuildAreaList(IEnumerable<AreaHtmlValue> htmlValue)
    {
        // A name clean-up step (replacing "自治区直辖县级行政区划" / "省直辖县级行政区划" with "县",
        // "市辖区" with "市", then dropping "县") was sketched for this projection but is not applied.
        return (from v in htmlValue
                select new AreaInfo
                {
                    // Keep at most AreaCodeLength digits; shorter codes pass through instead of
                    // making Substring throw.
                    AreaCode = v.Code.Length > AreaCodeLength ? v.Code.Substring(0, AreaCodeLength) : v.Code,
                    AreaName = v.Name,
                    AreaFullName = v.Name,
                    ParentAreaCode = "0",
                    ParentId = 0,
                    CreateTime = DateTime.Now,
                    AreaYear = 2015
                }).ToList();
    }

    /// <summary>
    /// Formats a match with the given replacement pattern, or returns an empty string when the
    /// match failed (Match.Result throws NotSupportedException on a failed match).
    /// </summary>
    /// <param name="match">Match to format.</param>
    /// <param name="replacement">Replacement pattern such as "${proto}${port}".</param>
    /// <returns>The expanded replacement, or <see cref="string.Empty"/> when there is no match.</returns>
    private static string ResultOrEmpty(Match match, string replacement)
    {
        return match.Success ? match.Result(replacement) : string.Empty;
    }

    /// <summary>
    /// Extracts the name and numeric code from one normalized table row, for example a row whose
    /// cells contain 110000 and 北京市.
    /// </summary>
    /// <param name="url">The HTML of the row (despite the parameter name, this is not a URL).</param>
    /// <returns>The name followed by the code, or an empty string when the row does not match.</returns>
    static String ExtensionPost(String url)
    {
        // A second row layout also appears in the table source, where the name cell is
        // "<span style='mso-spacerun:yes;'>    </span><font class='font3'>NAME</font>"
        // (e.g. 654326 / 吉木乃县). TableRowRegex does not cover that layout.
        return ResultOrEmpty(TableRowRegex.Match(url), "${proto}${port}");
    }

    /// <summary>
    /// Extracts the name and numeric code from one normalized paragraph row.
    /// </summary>
    /// <param name="url">The HTML of the row (despite the parameter name, this is not a URL).</param>
    /// <returns>The name followed by the code, or an empty string when the row does not match.</returns>
    static string ExtensionPostb(string url)
    {
        return ResultOrEmpty(ParagraphRowRegex.Match(url), "${proto}${port}");
    }

    /// <summary>
    /// Downloads the stats.gov.cn area-code page, shows the parsed areas in <c>dataGridView2</c>
    /// and the cleaned page source in <c>richTextBox2</c>.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    private void button1_Click(object sender, EventArgs e)
    {
        try
        {
            // Source 1: paragraph layout, raw form  <p ><span lang="EN-US" >110000<span>  </span></span><span > 北京市</span></p>
            string url = "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201504/t20150415_712722.html";

            // Source 2 (table layout), kept for reference:
            //   url:     http://files2.mca.gov.cn/www/201512/20151224151630189.htm
            //   pattern: <tr height='19' style='height:14.25pt;mso-height-source:userset;mso-height-alt:285;'><td class='xl67' height='19' style='height:14.25pt;'></td><td class='xl71' x:num>(?<code>\d+)</td><td class='xl71' x:str><span style='mso-spacerun:yes;'>    </span><font class='font3'>(?<name>\w*)</font></td><td class='xl67'></td><td class='xl70'></td><td class='xl70'></td><td class='xl70'></td><td colspan='3' style='mso-ignore:colspan;'></td></tr>
            // Village-level pages, kept for reference:
            //   pattern: <tr class='villagetr'><td>(?<code>\d{12})</td><td>(?<type>\d{3})</td><td>(?<name>\w*)</td></tr>

            // GetHtml already applies ReplaceEnter, RemoveStyle and RemoveScript, so one download
            // serves both the grid and the text box.
            string html = GetHtml(url);

            IEnumerable<AreaHtmlValue> htmlValue = GetRegValue(StatsRowPattern, html);
            this.dataGridView2.DataSource = BuildAreaList(htmlValue);
            this.richTextBox2.Text = html;
        }
        catch (Exception ex)
        {
            // Surface the failure instead of discarding it.
            MessageBox.Show(ex.Message);
        }
    }

    #region Web page source helpers

    /// <summary>
    /// Downloads the page at <paramref name="url"/>, then downloads every image referenced by a
    /// self-closing img tag into c:\g\, printing each image address. Waits for a key press at the end.
    /// </summary>
    /// <param name="url">Absolute address of the page to scan.</param>
    private static void updowndimg(string url)
    {
        using (WebClient client = new WebClient())
        {
            string html = client.DownloadString(url);
            Uri pageUri = new Uri(url);
            MatchCollection matches = Regex.Matches(html, "<img\\s*.*src=\"(.+?)\".*/>");
            foreach (Match match in matches)
            {
                string src = match.Groups[1].Value.Replace("\"", string.Empty);

                // Resolve relative image paths against the page address.
                Uri imageUri = new Uri(pageUri, src);
                string fileName = Path.GetFileName(imageUri.LocalPath);
                if (string.IsNullOrEmpty(fileName))
                {
                    // Nothing usable to name the local file after.
                    continue;
                }

                client.DownloadFile(imageUri, @"c:\g\" + fileName);
                Console.WriteLine(imageUri);
            }
        }

        Console.ReadKey();
    }

    /// <summary>
    /// Demo: finds words that start with "po" and end with "ion" in a sample sentence and prints them.
    /// </summary>
    static void Find_po()
    {
        string text = @" I can not find my position in Beijing ";
        string pattern = @"\bpo\S*ion\b";
        RegexOptions options = RegexOptions.IgnoreCase
            | RegexOptions.IgnorePatternWhitespace
            | RegexOptions.ExplicitCapture;
        MatchCollection matches = Regex.Matches(text, pattern, options);
        WriteMatches(text, matches);
    }

    /// <summary>
    /// Prints the original text, the match count, and each match with up to five characters of
    /// context on either side.
    /// </summary>
    /// <param name="text">The text that was searched.</param>
    /// <param name="matches">Matches found in <paramref name="text"/>.</param>
    static void WriteMatches(string text, MatchCollection matches)
    {
        Console.WriteLine("Original text was: \n\n" + text + "\n");
        Console.WriteLine("No. of matches: " + matches.Count);
        foreach (Match nextMatch in matches)
        {
            int index = nextMatch.Index;
            string result = nextMatch.Value;

            // Clamp the context window to the bounds of the text.
            int charsBefore = Math.Min(index, 5);
            int charsAfter = Math.Min(text.Length - index - result.Length, 5);
            int charsToDisplay = charsBefore + charsAfter + result.Length;

            Console.WriteLine("Index: {0}, \tString: {1}, \t{2}", index, result,
                text.Substring(index - charsBefore, charsToDisplay));
        }
    }

    /// <summary>
    /// Extracts the scheme and optional port from a URL; for example
    /// "http://www.yahoo.com.cn:8080/index.html" yields "http:8080".
    /// </summary>
    /// <param name="url">URL to inspect.</param>
    /// <returns>Scheme followed by ":port" when present, or an empty string when the URL does not match.</returns>
    String Extension(String url)
    {
        return ResultOrEmpty(ProtoPortRegex.Match(url), "${proto}${port}");
    }

    /// <summary>
    /// Downloads a remote page as UTF-8 text and normalizes it with
    /// <see cref="ReplaceEnter"/>, <see cref="RemoveStyle"/> and <see cref="RemoveScript"/>.
    /// </summary>
    /// <param name="url">Address of the page.</param>
    /// <returns>The normalized page source, or an empty string when the download fails.</returns>
    private static string GetHtml(string url)
    {
        try
        {
            WebRequest webRequest = WebRequest.Create(url);
            using (WebResponse webResponse = webRequest.GetResponse())
            using (Stream responseStream = webResponse.GetResponseStream())
            {
                if (responseStream == null)
                {
                    return string.Empty;
                }

                using (var respStreamReader = new StreamReader(responseStream, Encoding.UTF8))
                {
                    string page = respStreamReader.ReadToEnd();
                    return RemoveScript(RemoveStyle(ReplaceEnter(page)));
                }
            }
        }
        catch (Exception)
        {
            // Best effort: record the failing URL and fall through to the empty result.
            AreaLogHelper.WriteLogFile("【异常URL】" + url);
            Console.WriteLine("【异常URL】" + url);
        }

        return string.Empty;
    }

    /// <summary>
    /// Normalizes page source: removes line breaks, turns double quotes into single quotes,
    /// removes runs of whitespace and lower-cases the result.
    /// </summary>
    /// <param name="htmlCode">HTML source code.</param>
    /// <returns>The normalized source, or an empty string for null or empty input.</returns>
    private static string ReplaceEnter(string htmlCode)
    {
        if (string.IsNullOrEmpty(htmlCode))
        {
            return string.Empty;
        }

        // NOTE(review): the whitespace literals below are kept exactly as found. The last one
        // removes every single space, yet the row patterns used with GetHtml output contain
        // spaces - confirm these literals (possibly tabs or non-breaking spaces originally).
        return htmlCode.Replace("\r\n", "").Replace("\"", "'").Replace("\n", "").Replace("\r", "").Replace("   ", "").Replace("  ", "").Replace("  ", "").Replace("  ", "").Replace(" ", "")
            // Invariant lower-casing keeps tag names stable regardless of the current culture.
            .ToLowerInvariant();
    }

    #region private methods

    /// <summary>
    /// Removes HTML comments, including multi-line comments and comments containing hyphens.
    /// </summary>
    /// <param name="input">HTML source.</param>
    /// <returns>The source without comments.</returns>
    private static string RemoveComment(string input)
    {
        // Lazy match up to the first "-->" so hyphens inside the comment body are allowed.
        return Regex.Replace(input, @"<!--.*?-->", string.Empty, RegexOptions.Singleline);
    }

    /// <summary>
    /// Removes every style element together with its content.
    /// </summary>
    /// <param name="input">HTML source.</param>
    /// <returns>The source without style blocks.</returns>
    private static string RemoveStyle(string input)
    {
        return Regex.Replace(input, @"<style[^>]*?>.*?</style>", string.Empty,
            RegexOptions.IgnoreCase | RegexOptions.Singleline);
    }

    /// <summary>
    /// Removes every script and noscript element together with its content.
    /// </summary>
    /// <param name="input">HTML source.</param>
    /// <returns>The source without script and noscript blocks.</returns>
    private static string RemoveScript(string input)
    {
        const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;
        string result = Regex.Replace(input, @"<script[^>]*?>.*?</script>", string.Empty, options);
        return Regex.Replace(result, @"<noscript[^>]*?>.*?</noscript>", string.Empty, options);
    }

    /// <summary>
    /// Converts HTML to plain text: br tags become line breaks, all other tags are removed, and
    /// the common character entities are decoded.
    /// </summary>
    /// <param name="input">HTML source.</param>
    /// <returns>The text content of the source.</returns>
    private static string RemoveTags(string input)
    {
        string result = input.Replace("<br>", "\r\n");
        result = Regex.Replace(result, @"<[\s\S]*?>", string.Empty, RegexOptions.IgnoreCase);

        // Decode entities only after the tags are gone, so escaped text such as "&lt;b&gt;" is
        // kept as text rather than stripped as markup. "&amp;" goes last to avoid double decoding.
        result = result.Replace("&nbsp;", " ");
        result = result.Replace("&quot;", "\"");
        result = result.Replace("&lt;", "<");
        result = result.Replace("&gt;", ">");
        result = result.Replace("&amp;", "&");
        return result;
    }

    #endregion

    /// <summary>
    /// Runs a regular expression over page source and collects the "code" and "name" groups of
    /// every match.
    /// </summary>
    /// <param name="regexString">Regular expression defining the named groups "code" and "name".</param>
    /// <param name="remoteStr">HTML source to search.</param>
    /// <returns>One value per match, in match order; empty when the source is null or empty.</returns>
    private static IEnumerable<AreaHtmlValue> GetRegValue(string regexString, string remoteStr)
    {
        if (string.IsNullOrEmpty(remoteStr))
        {
            return new List<AreaHtmlValue>();
        }

        var reg = new Regex(regexString, RegexOptions.Compiled);
        MatchCollection mc = reg.Matches(remoteStr);
        return (from Match m in mc
                select new AreaHtmlValue
                {
                    Code = m.Groups["code"].Value,
                    Name = m.Groups["name"].Value
                    // Type is left unset; none of the active patterns define a "type" group.
                }).ToList();
    }

    /// <summary>
    /// One code/name pair extracted from the page source.
    /// </summary>
    private class AreaHtmlValue
    {
        /// <summary>Numeric area code as captured by the "code" group.</summary>
        public string Code { get; set; }

        /// <summary>Area name as captured by the "name" group.</summary>
        public string Name { get; set; }

        /// <summary>Value of a "type" group; not populated by <see cref="GetRegValue"/>.</summary>
        public string Type { get; set; }
    }

    #endregion
  }