星空网 > 软件开发 > Java

使用JAVA抓取网页数据

一、使用 HttpClient 抓取网页数据

public String getHtml(String htmlurl) throws IOException {		StringBuffer sb = new StringBuffer();		String acceptEncoding = "";		/* 1.生成 HttpClinet 对象并设置参数 */		HttpClient httpClient = new HttpClient();		GetMethod method = new GetMethod(htmlurl);		int statusCode;		try {			statusCode = httpClient.executeMethod(method);			// 判断访问的状态码			if (statusCode != HttpStatus.SC_OK) {				return null;			} else {				if (method.getResponseHeader("Content-Encoding") != null)					acceptEncoding = method.getResponseHeader(							"Content-Encoding").getValue();				if (acceptEncoding.toLowerCase().indexOf("gzip") > -1) {					// 建立gzip解压工作流					InputStream is;					is = method.getResponseBodyAsStream();					GZIPInputStream gzin = new GZIPInputStream(is);					InputStreamReader isr = new InputStreamReader(gzin, Charset.forName(CHARSET)); // 设置读取流的编码格式,自定义编码					java.io.BufferedReader br = new java.io.BufferedReader(isr);					String tempbf;					while ((tempbf = br.readLine()) != null) {						if(StringUtils.isNotBlank(tempbf)){							sb.append(tempbf);						}					}					isr.close();					gzin.close();					System.out.println(sb);				} else {					InputStreamReader isr;					isr = new InputStreamReader(							method.getResponseBodyAsStream(), CHARSET);					java.io.BufferedReader br = new java.io.BufferedReader(isr);					String tempbf;					while ((tempbf = br.readLine()) != null) {						if(StringUtils.isNotBlank(tempbf)){							sb.append(tempbf);						}					}					isr.close();				}			}		} catch (HttpException e) {			e.printStackTrace();		} catch (IOException e) {			e.printStackTrace();		}		method.abort();		method.releaseConnection();		return sb.toString();	}

 

二、使用 HttpPost 抓取网页数据

private static CloseableHttpClient httpClient;	private static BasicHttpContext httpContext;	private static BasicCookieStore cookieStore;	private static PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();	private static RequestConfig globalConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.BEST_MATCH).build();	private static RequestConfig localConfig = RequestConfig.copy(globalConfig).setCookieSpec(CookieSpecs.BROWSER_COMPATIBILITY).build();public String getHtml(String url){		HttpClientBuilder builder = HttpClients.custom();		cookieStore = new BasicCookieStore();		builder.setConnectionManager(cm);		builder.setDefaultCookieStore(cookieStore);		builder.setDefaultRequestConfig(globalConfig);		httpClient = builder.build();		httpContext = new BasicHttpContext();		httpContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore);		HttpPost httpPost = new HttpPost(url);		httpPost.setConfig(localConfig);		httpPost.setHeader("Accept", "text/html,application/xhtml+

  




原标题:使用JAVA抓取网页数据

关键词:JAVA

*特别声明:以上内容来自于网络收集,著作权属原作者所有,如有侵权,请联系我们: admin#shaoqun.com (#换成@)。

视频编辑软件:https://www.goluckyvip.com/tag/12568.html
航班影响:https://www.goluckyvip.com/tag/1257.html
全球物流排行:https://www.goluckyvip.com/tag/12570.html
原产地证明:https://www.goluckyvip.com/tag/12571.html
汽车关税:https://www.goluckyvip.com/tag/12572.html
印度反击美国:https://www.goluckyvip.com/tag/12573.html
小米SU7的不甘心全写进了 “周边”的商品链接里 :https://www.kjdsnews.com/a/1836549.html
10天涨粉300+万 “王婆说媒”凭什么一夜爆红? :https://www.kjdsnews.com/a/1836550.html
相关文章
我的浏览记录
最新相关资讯
海外公司注册 | 跨境电商服务平台 | 深圳旅行社 | 东南亚物流