你的位置:首页 > Java教程

[Java教程]使用 Java 抓取网页数据


一、使用 HttpClient 抓取网页数据

public String getHtml(String htmlurl) throws IOException {		StringBuffer sb = new StringBuffer();		String acceptEncoding = "";		/* 1.生成 HttpClinet 对象并设置参数 */		HttpClient httpClient = new HttpClient();		GetMethod method = new GetMethod(htmlurl);		int statusCode;		try {			statusCode = httpClient.executeMethod(method);			// 判断访问的状态码			if (statusCode != HttpStatus.SC_OK) {				return null;			} else {				if (method.getResponseHeader("Content-Encoding") != null)					acceptEncoding = method.getResponseHeader(							"Content-Encoding").getValue();				if (acceptEncoding.toLowerCase().indexOf("gzip") > -1) {					// 建立gzip解压工作流					InputStream is;					is = method.getResponseBodyAsStream();					GZIPInputStream gzin = new GZIPInputStream(is);					InputStreamReader isr = new InputStreamReader(gzin, Charset.forName(CHARSET)); // 设置读取流的编码格式,自定义编码					java.io.BufferedReader br = new java.io.BufferedReader(isr);					String tempbf;					while ((tempbf = br.readLine()) != null) {						if(StringUtils.isNotBlank(tempbf)){							sb.append(tempbf);						}					}					isr.close();					gzin.close();					System.out.println(sb);				} else {					InputStreamReader isr;					isr = new InputStreamReader(							method.getResponseBodyAsStream(), CHARSET);					java.io.BufferedReader br = new java.io.BufferedReader(isr);					String tempbf;					while ((tempbf = br.readLine()) != null) {						if(StringUtils.isNotBlank(tempbf)){							sb.append(tempbf);						}					}					isr.close();				}			}		} catch (HttpException e) {			e.printStackTrace();		} catch (IOException e) {			e.printStackTrace();		}		method.abort();		method.releaseConnection();		return sb.toString();	}

 

二、使用 HttpPost 抓取网页数据

private static CloseableHttpClient httpClient;	private static BasicHttpContext httpContext;	private static BasicCookieStore cookieStore;	private static PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();	private static RequestConfig globalConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.BEST_MATCH).build();	private static RequestConfig localConfig = RequestConfig.copy(globalConfig).setCookieSpec(CookieSpecs.BROWSER_COMPATIBILITY).build();public String getHtml(String url){		HttpClientBuilder builder = HttpClients.custom();		cookieStore = new BasicCookieStore();		builder.setConnectionManager(cm);		builder.setDefaultCookieStore(cookieStore);		builder.setDefaultRequestConfig(globalConfig);		httpClient = builder.build();		httpContext = new BasicHttpContext();		httpContext.setAttribute(ClientContext.COOKIE_STORE, cookieStore);		HttpPost httpPost = new HttpPost(url);		httpPost.setConfig(localConfig);		httpPost.setHeader("Accept", "text/html,application/xhtml+