星空网 > 软件开发 > ASP.net

asp.net 网页抓取内容

网页抓取代码

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;

namespace WSYL.Web.Common
{
    /// <summary>
    /// Downloads a web page and extracts the text of specific table cells from it.
    /// </summary>
    public static class GetSteamShipInfo
    {
        // Placeholder from the original article; replace with the real page address.
        // NOTE(review): steamshipname is presumably meant to be part of this URL - confirm with the caller.
        private const string UrlToCrawl = @"网址";

        // Charset used to decode the response body; a wrong charset yields garbled text.
        private const string HtmlCharset = "utf-8";

        // Upper bound for connecting and for each read, so a stalled page cannot hang the caller.
        // The value is a chosen default, not something the remote site dictates.
        private const int RequestTimeoutMilliseconds = 30000;

        // Matches the cells that carry the wanted values; group 1 is the cell text.
        private static readonly Regex CellRegex = new Regex(
            "<td align=\"left\" bgcolor=\"#EEEEEE\">([^<]*)</td>",
            RegexOptions.IgnoreCase | RegexOptions.Multiline);

        // Matches any HTML tag, including tags that span line breaks.
        private static readonly Regex TagRegex = new Regex("<(.|\n)+?>");

        // Matches runs of whitespace so they can be collapsed to a single space.
        private static readonly Regex WhitespaceRegex = new Regex(@"\s+");

        /// <summary>
        /// Fetches the configured page and returns the text of the first and/or second matching cell.
        /// </summary>
        /// <param name="steamshipname">
        /// Name to look up. Only validated here: a null or blank value short-circuits to <c>null</c>.
        /// </param>
        /// <param name="itype">
        /// 0 = text of the first matching cell; 1 = tag-stripped text of the second matching cell;
        /// 2 = both, joined with "/". Any other value returns the complete downloaded HTML.
        /// </param>
        /// <returns>
        /// <c>null</c> when <paramref name="steamshipname"/> is null or blank; an empty string when the
        /// first matching cell is missing or empty; otherwise the value selected by <paramref name="itype"/>.
        /// </returns>
        /// <exception cref="WebException">The request failed or timed out.</exception>
        public static string GetWebSite(string steamshipname, int itype)
        {
            if (steamshipname == null || steamshipname.Trim().Length == 0)
            {
                return null;
            }

            string respHtml = DownloadHtml(UrlToCrawl);
            return ExtractInfo(respHtml, itype);
        }

        /// <summary>
        /// Issues a GET request and returns the decoded response body, releasing the connection afterwards.
        /// </summary>
        private static string DownloadHtml(string url)
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            req.Method = "GET";
            req.Timeout = RequestTimeoutMilliseconds;
            req.ReadWriteTimeout = RequestTimeoutMilliseconds;

            Encoding htmlEncoding = Encoding.GetEncoding(HtmlCharset);

            // Dispose response, stream and reader so the underlying connection is returned to the pool.
            using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
            using (Stream body = resp.GetResponseStream())
            using (StreamReader sr = new StreamReader(body, htmlEncoding))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>
        /// Picks the requested cell value(s) out of the downloaded HTML.
        /// </summary>
        private static string ExtractInfo(string html, int itype)
        {
            Match first = CellRegex.Match(html);
            string firstValue = first.Groups[1].Value;

            // No match, or a match with an empty cell: nothing useful to return.
            if (firstValue.Length == 0)
            {
                return "";
            }

            switch (itype)
            {
                case 0:
                    return firstValue;
                case 1:
                    // NextMatch().Value is the whole <td>...</td> element, hence the tag stripping.
                    return StripHtml(first.NextMatch().Value);
                case 2:
                    return firstValue + "/" + StripHtml(first.NextMatch().Value);
                default:
                    // Unknown itype values fall through to the raw page.
                    return html;
            }
        }

        /// <summary>
        /// Removes HTML tags, escapes any remaining angle brackets, and collapses whitespace runs
        /// into single spaces. Some inputs are not fully cleaned in one pass, so callers that need
        /// a fully clean result may want to apply it twice.
        /// </summary>
        /// <param name="strHtml">HTML fragment to clean.</param>
        /// <returns>The trimmed plain-text content.</returns>
        private static string StripHtml(string strHtml)
        {
            string strOutput = TagRegex.Replace(strHtml, "");
            strOutput = strOutput.Replace("<", "&lt;");
            strOutput = strOutput.Replace(">", "&gt;");

            // Collapse every run of whitespace into one space.
            strOutput = WhitespaceRegex.Replace(strOutput, " ");
            return strOutput.Trim();
        }
    }
}

 




原标题:asp.net 网页抓取内容

关键词:ASP.NET

*特别声明:以上内容来自于网络收集,著作权属原作者所有,如有侵权,请联系我们: admin#shaoqun.com (#换成@)。

怎么赶fba跟卖:https://www.goluckyvip.com/tag/48506.html
怎么寄fba:https://www.goluckyvip.com/tag/48507.html
怎么寄东西到德国:https://www.goluckyvip.com/tag/48508.html
怎么加盟做跨境电商:https://www.goluckyvip.com/tag/48509.html
怎么加入跨境电商:https://www.goluckyvip.com/tag/48511.html
怎么建海外仓:https://www.goluckyvip.com/tag/48513.html
长治婚庆女司仪和主持人:https://www.vstour.cn/a/366176.html
北京丰台区水上乐园哪家好玩?:https://www.vstour.cn/a/366177.html
相关文章
我的浏览记录
最新相关资讯
海外公司注册 | 跨境电商服务平台 | 深圳旅行社 | 东南亚物流