星空网 > 软件开发 > Java

用java实现新浪爬虫,代码完整剖析(仅针对当前SinaSignOn有效)

先来看我们的web.xml配置:

<!DOCTYPE web-app PUBLIC
 "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN"
 "http://java.sun.com/dtd/web-app_2_3.dtd" >

<web-app>
  <display-name>MySinaSpider</display-name>
  <listener>
    <listener-class>main.java.sina.spider.StartSpiderLisenter</listener-class>
  </listener>
</web-app>

这样的配置当启动tomcat的时候,就会运行爬虫,然后再看我们的StartSpiderLisenter类,如下

 1 package main.java.sina.spider; 2  3 import javax.servlet.ServletContextEvent; 4 import javax.servlet.ServletContextListener; 5 import main.java.sina.bean.info.LoginInfo; 6 import main.java.sina.utils.Constant; 7  8 public class StartSpiderLisenter implements ServletContextListener{ 9 10   public void contextDestroyed(ServletContextEvent arg0) {11     12   }13 14   public void contextInitialized(ServletContextEvent arg0) {15     Constant.personalHomePage = "http://weibo.com/zhaoyao2012/home"; //填写你自己的新浪微博个人主页16     LoginInfo.username = "***"; //填写你的新浪微博用户名18     LoginInfo.password = "***"; //填写你的新浪微博密码19     Constant.enableProxy = false; //是否使用代理20     Spider.start();21   }22 23 }

很明显,StartSpiderLisenter类实现了ServletContextListener这个接口,因此必须实现它的两个方法:contextInitialized和contextDestroyed。它们分别在Web应用初始化和销毁的时候被容器调用。我们看到,在contextInitialized这个初始化上下文的方法中调用了Spider.start()方法。那么我们来看看Spider这个类,如下:

 1 package main.java.sina.spider; 2  3 import java.io.IOException; 4 import java.util.regex.Matcher; 5 import java.util.regex.Pattern; 7 import org.quartz.JobBuilder; 8 import org.quartz.JobDetail; 9 import org.quartz.Scheduler; 10 import org.quartz.SchedulerException; 11 import org.quartz.SchedulerFactory; 12 import org.quartz.SimpleScheduleBuilder; 13 import org.quartz.SimpleTrigger; 14 import org.quartz.TriggerBuilder; 15 import org.quartz.impl.StdSchedulerFactory; 17 import main.java.sina.bean.info.LoginInfo; 18 import main.java.sina.httpclient.LoginSina; 19 import main.java.sina.httpclient.SpiderSina; 20 import main.java.sina.job.KeywordSearchJob; 21 import main.java.sina.utils.Constant; 22 import main.java.sina.utils.HttpHelper; 23 import main.java.test.SpiderTest; 24  25 public class Spider { 26  27   public static void main(String[] args) { 28  29     Constant.personalHomePage = "****";   30     LoginInfo.username = "****"; 31     LoginInfo.password = "****"; 32     Constant.enableProxy = false; 33     Constant.hourbefore = 0; //这个参数用于设置时差 34     start(); 35      36   } 37   public static void start() { 38      39     final SchedulerFactory factory = new StdSchedulerFactory(); 40     try { 41       Scheduler scheduler = factory.getScheduler(); 42       JobDetail jobDetail = JobBuilder.newJob(KeywordSearchJob.class) 43           .withIdentity("keywordSearch", "weibo").build(); 44       SimpleTrigger trigger = TriggerBuilder.newTrigger() 45           .withIdentity("keywordSearch", "weibo") 46           .withSchedule(SimpleScheduleBuilder.repeatHourlyForever()) 47           .build(); 48       scheduler.scheduleJob(jobDetail, trigger); 49       scheduler.start(); 50     } catch (SchedulerException e) { 51       e.printStackTrace(); 52     } 53   } 54  55   public static SpiderSina createSpider() { 56     LoginSina ls = new LoginSina(LoginInfo.username, LoginInfo.password); 57     ls.dologinSina(); 58     ls.redirect(); 59     SpiderSina spider = new 
SpiderSina(ls); 60  61     return spider; 62   } 63  64   public static void sendMidsofDays(SpiderSina spider,String keyword, String fromdate, 65       String todate) { 66      67     try { 68       String midsString = ""; 69       for (int i = 1; i <= 50; i++) { 70         String htmlContent = spider 71             .search(keyword, i, fromdate, todate); 72         if (htmlContent.contains("noresult_support")) { 73           break; 74         } 75         System.out.println(i); 76         Pattern pattern = Pattern.compile("<div mid=\"([0-9]*)\""); 77  78         String start = "\"pid\":\"pl_weibo_direct\""; 79         try { 80           htmlContent = htmlContent.substring(htmlContent 81               .indexOf(start)); 82         } catch (Exception e) { 83           htmlContent = htmlContent.substring(1); 84         } 85         htmlContent = htmlContent.replace("\\\"", "\""); 86         htmlContent = htmlContent.replace("\\/", "/"); 87         Matcher matcher = pattern.matcher(htmlContent); 88         while (matcher.find()) { 89           System.out.println(matcher.group(1)); 90           midsString += matcher.group(1) + ","; 91         } 92         if (i == 37) { 93           try { 94             Thread.sleep(1000 * 60 * 30); 95           } catch (InterruptedException e) { 96             e.printStackTrace(); 97           } 98         } 99       }100       System.out.println(midsString);101       HttpHelper.getLiveData(midsString, Constant.CommentUrl);102     } catch (IOException e) {103       e.printStackTrace();104     }105 106   }107 }

我们在Spider.start()方法中,看到了作业KeywordSearchJob.class,那么我们来看看这个KeywordSearchJob类的实现,如下:

 1 package main.java.sina.job; 2  3 import org.quartz.Job; 4 import org.quartz.JobExecutionContext; 5 import org.quartz.JobExecutionException; 6 import main.java.sina.httpclient.SpiderSina; 7 import main.java.sina.spider.Spider; 8 import main.java.sina.utils.Constant; 9 import main.java.sina.utils.Utils;10 11 public class KeywordSearchJob implements Job {12 13   public void execute(JobExecutionContext arg0) throws JobExecutionException {14 15     Constant.enableProxy = false; //我的爬虫中没有使用代理,故值设为false.16     String keyword = "%25E5%25AE%2581%25E6%25B3%25A2%25E5%25A4%25A7%25E5%25AD%25A6";//被编码后的关键字17     String datehour = Utils.getDateOfSpecifiedPreHour(Constant.hourbefore);//这个工具类实现了时差格式的转换18     SpiderSina spider = Spider.createSpider();19     spider.forwardToWeiboPage();20     Spider.sendMidsofDays(spider,keyword,datehour,datehour);21   }22 23 }

接下来,我们看几个工具类的实现:首先来看下Utils.java这个类,如下:它实现了日期的格式的一些转换

 1 package main.java.sina.utils; 2  3 import java.io.BufferedReader; 4 import java.io.BufferedWriter; 5 import java.io.File; 6 import java.io.FileInputStream; 7 import java.io.FileNotFoundException; 8 import java.io.FileOutputStream; 9 import java.io.FileWriter; 10 import java.io.IOException; 11 import java.io.InputStream; 12 import java.io.InputStreamReader; 13 import java.io.StringReader; 14 import java.io.UnsupportedEncodingException; 15 import java.text.ParseException; 16 import java.text.SimpleDateFormat; 17 import java.util.Calendar; 18 import java.util.Date; 19 import java.util.Properties; 20  21 import org.htmlparser.Parser; 22 import org.htmlparser.lexer.Lexer; 23 import org.htmlparser.lexer.Page; 24 import org.htmlparser.util.DefaultParserFeedback; 25 // I/O操作类 26 public class Utils { 27    28   public static Date getDateFromString(String dtext,Date fileCreateDate) { 29     Date date=null; 30     int y,mm,se;  31     Calendar c = Calendar.getInstance();  32     c.setTime(fileCreateDate); 33     y = c.get(Calendar.YEAR); //年  34     //d = c.get(Calendar.DAY_OF_MONTH); //日 35     mm = c.get(Calendar.MINUTE); //分 36     se = c.get(Calendar.SECOND);//秒 37     if(dtext.contains("秒前")){ 38       int end=0; 39       for(int i=0;i<dtext.length();i++){ 40         if(dtext.charAt(i)>='0' && dtext.charAt(i)<='9'){ 41           end++; 42         }else{ 43           break; 44         } 45       } 46       dtext=dtext.substring(0,end); 47       int second=Integer.parseInt(dtext); 48       c.set(Calendar.SECOND, se-second); 49       date=c.getTime(); 50     } 51     else if(dtext.contains("分钟前")){ 52       int end=0; 53       for(int i=0;i<dtext.length();i++){ 54         if(dtext.charAt(i)>='0' && dtext.charAt(i)<='9'){ 55           end++; 56         }else{ 57           break; 58         } 59       } 60       dtext=dtext.substring(0,end); 61       int minute=Integer.parseInt(dtext); 62       c.set(Calendar.MINUTE, mm-minute); 63       date=c.getTime(); 64     }else 
if(dtext.contains("今天")){ 65       dtext=dtext.replace("今天 ", "").trim(); 66       String ss[]=dtext.split(":"); 67       if(ss!=null && ss.length==2){ 68         c.set(Calendar.HOUR_OF_DAY, Integer.parseInt(ss[0])); 69         c.set(Calendar.MINUTE, Integer.parseInt(ss[1])); 70         date=c.getTime(); 71        } 72     }else if(dtext.contains("月")){ 73       dtext=y+"年".concat(dtext); 74       SimpleDateFormat sf=new SimpleDateFormat("yyyy年MM月dd日 HH:mm"); 75       try { 76         date=sf.parse(dtext); 77       } catch (ParseException e) { 78         e.printStackTrace(); 79       } 80     }else if(dtext.contains("-")){ 81       SimpleDateFormat sf=new SimpleDateFormat("yyyy-MM-dd HH:mm"); 82       try { 83         date=sf.parse(dtext); 84       } catch (ParseException e) { 85         e.printStackTrace(); 86       } 87     } 88     return date; 89   } 90   public static void writeFileFromStream(String filename,InputStream in){ 91     if(filename==null || filename.trim().length()==0) 92       return; 93     File file=new File(filename); 94     if(!file.exists()){ 95       try { 96         file.createNewFile(); 97       } catch (IOException e) { 98         e.printStackTrace(); 99       }100     }101     FileOutputStream fou=null;102     try {103       fou = new FileOutputStream(file);104       byte []buffer=new byte[1024*4];105       int len=-1;106       while((len=in.read(buffer))!=-1){107         fou.write(buffer,0,len);108       }109     } catch (FileNotFoundException e) {110       e.printStackTrace();111     } catch (IOException e) {112       e.printStackTrace();113     }finally{114       if(in!=null)115         try {116           in.close();117         } catch (IOException e) {118           e.printStackTrace();119         }120       if(fou!=null)121         try {122           fou.close();123         } catch (IOException e) {124           e.printStackTrace();125         }126     }127   }  128   public static void writeFileFromString(String filename,String 
str){129     if(filename==null || filename.trim().length()==0)130       filename="tmp.txt";131     File file=new File(filename);132     if(!file.exists()){133       try {134         file.createNewFile();135       } catch (IOException e) {136         e.printStackTrace();137       }138     }139     BufferedWriter writer=null;140     BufferedReader reader=null;141     try {142       writer=new BufferedWriter(new FileWriter(file));143       reader=new BufferedReader(new StringReader(str));144       String tmp=null;145       StringBuffer buffer=new StringBuffer();146       while((tmp=reader.readLine())!=null)147         buffer.append(tmp+"\n");148       writer.write(buffer.toString());149       150     } catch (IOException e) {151       e.printStackTrace();152     }finally{153       try {154         reader.close();155         writer.close();156       } catch (IOException e) {157         e.printStackTrace();158       }159     }160     161   }162   163   164   165   public static String getStringFromStream(InputStream in) {166     BufferedReader reader=null;167     reader = new BufferedReader(new InputStreamReader(in));168     StringBuffer buffer=new StringBuffer();169     String str=null;170     try{171       while((str=reader.readLine())!=null){172         buffer.append(str+"\n");173       }  174       reader.close();175     }catch(Exception ex){176       ex.printStackTrace();177     }      178     try {179       return new String(buffer.toString().getBytes(),"utf-8");180     } catch (UnsupportedEncodingException e) {181       e.printStackTrace();182       return "error:"+e.getMessage();183     }184   }185  //得到数据库的配置信息186   public static Properties getDBconfig(){187     Properties properties=new Properties();188     InputStream in = null;189     try {190       in = new FileInputStream(new File("config/dbconfig.ini"));191       properties.load(in);192     } catch (FileNotFoundException e) {193       e.printStackTrace();194     } catch (IOException e) {195       
e.printStackTrace();196     }finally{197       if(in!=null)198         try {199           in.close();200         } catch (IOException e) {201           e.printStackTrace();202         }203     }204     return properties;205   }206   207   public static Parser createParser(String inputHTML) {208     Lexer mLexer = new Lexer(new Page(inputHTML));209     Parser parser = new Parser(mLexer, new DefaultParserFeedback(210         DefaultParserFeedback.QUIET));211     return parser;212   }213   214   public static String getDateOfSpecifiedPreHour(int hourNum){215     SimpleDateFormat sdFormat = new SimpleDateFormat("yyyy-MM-dd-HH");216     Date date = new Date();217     System.out.println("date -" +date + " " + hourNum);218     Calendar calendar = Calendar.getInstance();219     calendar.setTime(date);220     calendar.add(Calendar.HOUR_OF_DAY, -1 * hourNum);221     System.out.println("date2 -" +sdFormat.format(calendar.getTime()));222     return sdFormat.format(calendar.getTime());223   }  224 }

再来看一下ThreadPool.java这个类,如下:这是一个线程工具类,定义了线程的一些动作

 1 package main.java.sina.utils; 2  3 import java.util.List; 4 import java.util.concurrent.ExecutorService; 5 import java.util.concurrent.Executors; 6  7 /** 9  * 线程池工具类10 */11 public class ThreadPool {12   private ExecutorService service;13   private List<Thread> threadList;14 15   public ThreadPool(int limite, List<Thread> threadList) {16     this.service = Executors.newFixedThreadPool(limite);17     this.threadList = threadList;18   }19 20   public void execute() {21     if(threadList==null ||threadList.size()==0) return ;22     for (int index = 0; index < threadList.size(); index++) {23       Thread t=threadList.get(index);24       service.execute(t);25     }26   }27   public boolean isTerminated(){28     return service.isTerminated();29   }30   31   public void shutDown() {32     service.shutdown();33   }34 }

然后再看一下Constant.java这个常量类,如下:常量类把系统中用到的一些常量集中写在这里,以后项目维护需要更改的时候,方便统一修改

package main.java.sina.utils;/** * @ClassName: Constant  * */public class Constant {  public static boolean enableProxy = false;  public static String liveCommentUrl = "http://localhost:8080/social-hub-connector/loadingLiveData";  public static String CommentUrl = "http://localhost:8080/social-hub-connector/loadingData";  public static String personalHomePage = "******";  public static String weiboUsername = "*********";  public static String weiboPassword = "*********";  public static int hourbefore = 0;}

再来看一下Base64Encoder.java类,它用来对一些字段进行Base64编码,如下:

 1 package main.java.sina.utils; 2  3 /** 4  *  5 */ 6 public class Base64Encoder { 7   private static final char last2byte = (char) Integer.parseInt("00000011", 2); 8   private static final char last4byte = (char) Integer.parseInt("00001111", 2); 9   private static final char last6byte = (char) Integer.parseInt("00111111", 2);10   private static final char lead6byte = (char) Integer.parseInt("11111100", 2);11   private static final char lead4byte = (char) Integer.parseInt("11110000", 2);12   private static final char lead2byte = (char) Integer.parseInt("11000000", 2);13   private static final char[] encodeTable = new char[]{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'};14 15   public Base64Encoder() {16   }17   public static String encode(byte[] from) {18     StringBuffer to = new StringBuffer((int) (from.length * 1.34) + 3);19     int num = 0;20     char currentByte = 0;21     for (int i = 0; i < from.length; i++) {22       num = num % 8;23       while (num < 8) {24         switch (num) {25           case 0:26             currentByte = (char) (from[i] & lead6byte);27             currentByte = (char) (currentByte >>> 2);28             break;29           case 2:30             currentByte = (char) (from[i] & last6byte);31             break;32           case 4:33             currentByte = (char) (from[i] & last4byte);34             currentByte = (char) (currentByte << 2);35             if ((i + 1) < from.length) {36               currentByte |= (from[i + 1] & lead2byte) >>> 6;37             }38             break;39           case 6:40             currentByte = (char) (from[i] & last2byte);41             currentByte = (char) (currentByte << 4);42             if ((i + 1) < from.length) {43            
   currentByte |= (from[i + 1] & lead4byte) >>> 4;44             }45             break;46         }47         to.append(encodeTable[currentByte]);48         num += 6;49       }50     }51     if (to.length() % 4 != 0) {52       for (int i = 4 - to.length() % 4; i > 0; i--) {53         to.append("=");54       }55     }56     return to.toString();57   }58 }

接下来是EncodeSuAndSp.java这个类,其中的方法是针对新浪的一些特殊加密规则编写的,在拼接最终的URL的时候会用到,例如根据servertime和nonce两个参数对密码进行加密,生成一串加密后的字符串:

 1 package main.java.sina.utils; 2 import java.io.File; 3 import java.io.FileReader; 4  5 import javax.script.Invocable; 6 import javax.script.ScriptEngine; 7 import javax.script.ScriptEngineManager; 8  9 /**10  * 12 */13 public class EncodeSuAndSp {14   static ScriptEngineManager mgr = new ScriptEngineManager(); 15   static ScriptEngine engine = mgr.getEngineByExtension("js");16   static Invocable inv = (Invocable) engine;  17    18   public static String getEncryptedP(String password,String servertime,String nonce){19     String value1="";20     try { 21       engine.eval(new FileReader(new File("js/encrypt.js")));22       value1 = String.valueOf(inv.invokeFunction("hex_sha1",password));23       value1 = String.valueOf(inv.invokeFunction("hex_sha1",value1));24       value1 = String.valueOf(inv.invokeFunction("hex_sha1",value1+servertime+nonce));25     } catch (Exception e) {26       e.printStackTrace();27     }28     return value1;29   }30   31   32   public static String getEncodedUsername(String username){33     String value1="";34     try {35       engine.eval(new FileReader(new File("js/encrypt.js")));36       value1 = String.valueOf(inv.invokeFunction("encode",username));37       System.out.println(value1);38     } catch (Exception e) {39       e.printStackTrace();40     }41     return value1;42   }43 }

package main.java.sina.utils;import java.io.UnsupportedEncodingException;import java.net.URLDecoder;import java.net.URLEncoder;public class EncodeUtils {  public static final String encodeURL(String str,String enc) {    try {      return URLEncoder.encode(str, enc);    } catch (UnsupportedEncodingException e) {      throw new RuntimeException(e);    }  }  public static final String decodeURL(String str,String enc) {    try {      return URLDecoder.decode(str, enc);    } catch (UnsupportedEncodingException e) {      throw new RuntimeException(e);    }  }    public static String unicdoeToGB2312(String str) {    String res = null;    if(str==null ){      return "";    }    StringBuffer sb = new StringBuffer();    try {      while (str.length() > 0) {        if (str.startsWith("\\u")) {          int x = 0;          try{            x = Integer.parseInt(str.substring(2, 6), 16);          }catch(Exception ex){            x= 0;          }          sb.append((char) x);          str = str.substring(6);        } else {          sb.append(str.charAt(0));          str = str.substring(1);        }      }      res = sb.toString();    } catch (Exception e) {      e.printStackTrace(System.err);    }    res=res.replaceAll("\\\\r", "")      .replaceAll("\\\\n", "")      .replaceAll("\\\\t", "")      .replaceAll("&nbsp;", "")      .replaceAll("&gt", "")      .replaceAll("\\[", "\"")      .replaceAll("\\]", "\"");    return res;  }    public static String unicodeTogb2312(String str) {    String res = null;    StringBuffer sb = new StringBuffer();    try {      while (str.length() > 0) {        if (str.startsWith("\\u")) {          int x = Integer.parseInt(str.substring(2, 6), 16);          sb.append((char) x);          str = str.substring(6);        } else {          sb.append(str.charAt(0));          str = str.substring(1);        }      }      res = sb.toString();    } catch (Exception e) {      e.printStackTrace(System.err);    }    res=res.replaceAll("\\\\r", "")        
.replaceAll("\\\\t", "")        .replaceAll("&nbsp;", "")        .replaceAll("&gt", "")        .replaceAll("\\\\n", "");    return res;  }}

HttpUtils.java这个类很关键,它封装了doGet()和doPost()等HTTP请求方法,如下:

package main.java.sina.utils;import java.io.ByteArrayInputStream;import java.io.File;import java.io.FileInputStream;import java.io.IOException;import java.io.InputStream;import java.util.ArrayList;import java.util.List;import java.util.Map;import java.util.Random;import java.util.Set;import org.apache.http.Header;import org.apache.http.HttpEntity;import org.apache.http.HttpHost;import org.apache.http.HttpResponse;import org.apache.http.HttpVersion;import org.apache.http.NameValuePair;import org.apache.http.client.ClientProtocolException;import org.apache.http.client.HttpClient;import org.apache.http.client.entity.UrlEncodedFormEntity;import org.apache.http.client.methods.HttpGet;import org.apache.http.client.methods.HttpPost;import org.apache.http.client.methods.HttpUriRequest;import org.apache.http.conn.params.ConnRoutePNames;import org.apache.http.conn.params.ConnRouteParams;import org.apache.http.cookie.Cookie;import org.apache.http.entity.InputStreamEntity;import org.apache.http.impl.client.DefaultHttpClient;import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;import org.apache.http.impl.cookie.BasicClientCookie;import org.apache.http.message.BasicNameValuePair;import org.apache.http.params.BasicHttpParams;import org.apache.http.params.CoreProtocolPNames;import org.apache.http.params.HttpParams;import org.apache.http.params.HttpProtocolParams;import org.apache.http.protocol.BasicHttpContext;import org.apache.http.protocol.ExecutionContext;import org.apache.http.protocol.HTTP;import org.apache.http.protocol.HttpContext;/** * http操作相关的类 */public class HttpUtils {  /*   * params :   * url: 地址   * headers请求头部信息   * return : httpresponse响应   */  public static HttpResponse doGet(String url,Map<String,String> headers){    HttpClient client=createHttpClient();    HttpGet getMethod=new HttpGet(url);    HttpResponse response=null;        HttpContext httpContext = new BasicHttpContext();    try {      if(headers!=null && headers.keySet().size()>0){        
for(String key:headers.keySet()){          getMethod.addHeader(key, headers.get(key));        }      }        response=client.execute(getMethod);      HttpUriRequest realRequest = (HttpUriRequest)httpContext.getAttribute(ExecutionContext.HTTP_REQUEST);      System.out.println(realRequest.getURI());    } catch (ClientProtocolException e) {      e.printStackTrace();    } catch (IOException e) {      String msg=e.getMessage();      if(msg.contains("Truncated chunk")){        System.out.println(e.getMessage() +" 数据获取不完整,需要重新获取。");      }else{        System.out.println(e.getMessage() +" 连接被拒绝,需要降低爬取频率。");      }    } catch(Exception e){    }    System.out.println(response);    return response;      }    /*   * params :   * url: 地址   * headers:请求头部信息   * params:post的请求数据   * return : httpresponse响应   */    public static HttpResponse doPost(String url,Map<String,String> headers,Map<String,String> params){    HttpClient client=createHttpClient();    HttpPost postMethod=new HttpPost(url);    HttpResponse response=null;    try {      if(headers!=null && headers.keySet().size()>0){        for(String key:headers.keySet()){          postMethod.addHeader(key, headers.get(key));        }      }        List<NameValuePair> p=null;      if(params!=null && params.keySet().size()>0){        p=new ArrayList<NameValuePair>();        for(String key:params.keySet()){          p.add(new BasicNameValuePair(key,params.get(key)));        }      }      if(p!=null)        postMethod.setEntity(new UrlEncodedFormEntity(p,HTTP.UTF_8));      response=client.execute(postMethod);    } catch (ClientProtocolException e) {      e.printStackTrace();    } catch (IOException e) {      e.printStackTrace();    }     return response;        }    //上传一个文件  public static HttpResponse doPost(String url,Map<String,String> headers,String fileName){    HttpClient client=createHttpClient();    HttpPost postMethod=new HttpPost(url);    String boundary = "";    HttpResponse response=null;    try {      
if(headers!=null && headers.keySet().size()>0){        for(String key:headers.keySet()){          postMethod.addHeader(key, headers.get(key));          if(key.equals("Content-Type")){            String tmp=headers.get(key);            boundary=tmp.substring(tmp.indexOf("=")+1);          }        }      }        File file=new File(fileName);      InputStream in=new FileInputStream(file);            StringBuffer buffer=new StringBuffer();      buffer.append(boundary).append("\n")         .append("Content-Disposition: form-data; name=\"pic1\"; filename=\""+file.getName()).append("\"\n")         .append("Content-Type: image/pjpeg").append("\n")         .append("\n");            System.out.println(buffer.toString());            String tmpstr=Utils.getStringFromStream(in);      tmpstr=Base64Encoder.encode(tmpstr.getBytes());      buffer.append(tmpstr).append("\n");      buffer.append(boundary+"--").append("\n");            System.out.println(buffer.toString());            in=new ByteArrayInputStream(buffer.toString().getBytes());            InputStreamEntity ise=new InputStreamEntity(in,buffer.toString().getBytes().length);             postMethod.setEntity(ise);             response=client.execute(postMethod);    } catch (ClientProtocolException e) {      e.printStackTrace();    } catch (IOException e) {      e.printStackTrace();    }     return response;        }  /*   * params :   * httpresponse   * return : 响应的头部信息   */    public static List<Header> getReponseHeaders(HttpResponse response){    List<Header> headers=null;    Header[] hds=response.getAllHeaders();    if(hds!=null && hds.length>0){      headers=new ArrayList<Header>();      for(int i=0;i<hds.length;i++){        headers.add(hds[i]);      }    }        return headers;  }    /*   * params :   * headers:头部信息    * request:请求   */  public static void setHeaders(Map<String,String> headers,HttpUriRequest request){    if(headers!=null && headers.keySet().size()>0){      for(String key:headers.keySet()){        
request.addHeader(key, headers.get(key));      }    }  }    /*   * params :   * httpresponse   * return : 响应的cookies值   */    public static List<Cookie> getResponseCookies(HttpResponse response){    List<Cookie> cookies=null;    Header[] hds=response.getAllHeaders();    if(hds!=null && hds.length>0){      for(int i=0;i<hds.length;i++){        if(hds[i].getName().equalsIgnoreCase("Set-Cookie")){          if(cookies==null){            cookies=new ArrayList<Cookie>();          }                     String cookiestring[]=hds[i].getValue().split(";");          String ss[]=cookiestring[0].split("=",2);          String cookiename=ss[0];          String cookievalue=ss[1];          Cookie cookie=new BasicClientCookie(cookiename,cookievalue);          cookies.add(cookie);        }      }    }        return cookies;  }  /*   * params :   * cookies数组   * return : cookies数组组成的字符串   */  public static String setCookie2String(List<Cookie> cookies){    StringBuilder builder=null;     if(cookies!=null && cookies.size()>0){      builder=new StringBuilder();      for(int j=0;j<cookies.size();j++){        Cookie c=cookies.get(j);        builder.append(c.getName()+"="+c.getValue());        if(j!=cookies.size()-1)          builder.append("; ");       }      return builder.toString();    }        return null;  }    /*   * 从响应中得到输入流   */  public static InputStream getInputStreamFromResponse(HttpResponse response){    if(response==null){      return null;    }    HttpEntity entity=response.getEntity();    InputStream in=null;    try {      in = entity.getContent();    } catch (IllegalStateException e) {      e.printStackTrace();    } catch (IOException e) {      e.printStackTrace();    }    return in;  }    /*   * 从响应中得到字符串   */  public static String getStringFromResponse(HttpResponse response){    if(response==null){      return null;    }    InputStream in=getInputStreamFromResponse(response);    String responseText="";    if(in!=null){      responseText=Utils.getStringFromStream(in);    
}    return responseText;  }    /**   * 创建支持多线程并发连接的HTTPCLIENT   */  private final static HttpClient createHttpClient() {     String proxyHost = "web-proxy-sha.chn.hp.com";     int proxyPort = 8080;     HttpHost proxy = new HttpHost(proxyHost,proxyPort);    HttpParams params = new BasicHttpParams();    if(Constant.enableProxy){      params.setParameter(ConnRouteParams.DEFAULT_PROXY, proxy);    }    HttpProtocolParams.setVersion(params, HttpVersion.HTTP_1_1);    HttpProtocolParams.setContentCharset(params, "UTF-8");        ThreadSafeClientConnManager clientmanager = new ThreadSafeClientConnManager();    clientmanager.setMaxTotal(20);    HttpClient client = new DefaultHttpClient(clientmanager, params);        //定义了环形重定向,定向到相同的路径是否被允许.    client.getParams().setParameter("http.protocol.allow-circular-redirects", true);         //定义了重定向的最大数量    client.getParams().setParameter("http.protocol.max-redirects", 50);        //定义了重定向是否应该自动处理    client.getParams().setParameter("http.protocol.handle-redirects", false);    return client;  }    /**   *加入代理的功能   * @return HttpClient 对象   */  public static HttpClient getDefaultHttpClientByProxy() {    HttpClient httpclient =createHttpClient();    String filePath = "proxy.properties";    HttpHost proxy = null;    Map<String, String> map = ReadIni.getDbini(filePath);    if (map.size() == 0) {      throw new RuntimeException("无可用代理");    } else {      Set<String> set = map.keySet();      String[] array = (String[]) set.toArray(new String[set.size()]);      Random r = new Random();      int rnum = r.nextInt(array.length);      String ip = array[rnum];      String port = map.get(ip);      proxy = new HttpHost(ip, Integer.parseInt(port));    }    httpclient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,proxy);    httpclient.getParams().setParameter(CoreProtocolPNames.PROTOCOL_VERSION, HttpVersion.HTTP_1_1);    return httpclient;  }}

接下来看一个HttpHelper辅助类,如下:

package main.java.sina.utils;

import java.io.IOException;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.methods.PostMethod;

/**
 * @ClassName: HttpHelper
 *
 * Posts collected data to the connector service (Commons HttpClient 3.x).
 */
public class HttpHelper {

  /**
   * Posts {@code requestData} as the form field "mids" to {@code url}.
   *
   * @param requestData value of the "mids" field
   * @param url         endpoint to post to
   * @return the response body as text
   * @throws HttpException on a protocol error
   * @throws IOException   on a transport error
   */
  public static String getLiveData(String requestData, String url)
      throws HttpException, IOException {
    PostMethod postMethod = new PostMethod(url);
    postMethod.setParameter("mids", requestData);
    return executeAndRead(postMethod);
  }

  /**
   * Posts the fields "userid" and "hobbys" to the hobby-loading endpoint.
   *
   * @return the response body as text
   * @throws HttpException on a protocol error
   * @throws IOException   on a transport error
   */
  public static String getHobbyData(String userid, String hobbys)
      throws HttpException, IOException {
    PostMethod postMethod =
        new PostMethod("http://c0048925.itcs.hp.com:8080/connector/loadingHobby");
    postMethod.setParameter("userid", userid);
    postMethod.setParameter("hobbys", hobbys);
    return executeAndRead(postMethod);
  }

  /** Executes the request, prints and returns its body; the connection is always released. */
  private static String executeAndRead(PostMethod postMethod)
      throws HttpException, IOException {
    HttpClient httpClient = new HttpClient();
    try {
      httpClient.executeMethod(postMethod);
      String response = postMethod.getResponseBodyAsString();
      System.out.println(response);
      return response;
    } finally {
      // Release even when the request or the body read fails.
      postMethod.releaseConnection();
    }
  }
}

ReadIni.java类,在读文本文件中使用,如下:

package main.java.sina.utils; import java.io.BufferedReader;import java.io.FileInputStream;import java.io.FileNotFoundException;import java.io.InputStreamReader;import java.util.HashMap;import java.util.Map;public class ReadIni {  public static Map<String, String> getDbini(String file) {    Map<String, String> map = new HashMap<String, String>();    InputStreamReader isr = null;    try{      isr = new InputStreamReader(new FileInputStream(file));    } catch (FileNotFoundException e1) {      e1.printStackTrace();    }    BufferedReader br = new BufferedReader(isr);    String s = null;    try {      s = br.readLine();      while (s != null) {        if (s.trim().length() > 0) {          String[] s1 = getIni(s);          map.put(s1[0], s1[1]);          s = br.readLine();        }      }      br.close();      isr.close();    } catch (Exception e) {      e.printStackTrace();    }    return map;  }  public static String[] getIni(String str) {    String[] temp = str.split("=");    return temp;  }}

然后,我们跳转到登录sina,来看一下LoginSina这个类的实现:

package main.java.sina.httpclient;import java.io.IOException;import java.io.InputStream;import java.io.UnsupportedEncodingException;import java.math.BigInteger;import java.security.InvalidKeyException;import java.security.KeyFactory;import java.security.NoSuchAlgorithmException;import java.security.interfaces.RSAPublicKey;import java.security.spec.InvalidKeySpecException;import java.security.spec.RSAPublicKeySpec;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.Map;import java.util.Scanner;import javax.crypto.BadPaddingException;import javax.crypto.Cipher;import javax.crypto.IllegalBlockSizeException;import javax.crypto.NoSuchPaddingException;import org.apache.commons.codec.binary.Hex;import org.apache.commons.httpclient.params.HttpParams;import org.apache.http.HttpResponse;import org.apache.http.client.HttpClient;import org.apache.http.client.methods.HttpGet;import org.apache.http.cookie.Cookie;import org.springframework.core.io.ClassPathResource;import main.java.sina.json.msg.PreLoginResponseMessage;import main.java.sina.utils.Base64Encoder;import main.java.sina.utils.EncodeUtils;import main.java.sina.utils.HttpUtils;import main.java.sina.utils.JsonUtils;import main.java.sina.utils.Utils;public class LoginSina {  private String username;  private String password;  private String rsakv;  private String pubkey;    //servertime和nonce都是在登录时需要使用的,用于post信息的加密  private String servertime;//服务器的时间  private String nonce;//一次性字符串  private String userid;//用户微博ID  private String pcid;//若需要输入验证码时用到  private String userdomainname;//用于域名  private String door;//验证码    private Map<String,String> headers=null;    private List<Cookie> cookies=null;      public LoginSina(String username,String password){    this.username=username;    this.password=password;    init();  }    public Map<String,String> getHeaders(){    Map<String,String> hds=null;    if(headers!=null && headers.keySet().size()>0){      hds=new HashMap<String,String>();      
for(String key:headers.keySet()){        hds.put(key,headers.get(key));      }    }    return hds;  }    public List<Cookie> getCookies(){    List<Cookie> cc=null;    if(cookies!=null && cookies.size()>0){      cc=new ArrayList<Cookie>();      for(int i=0;i<cookies.size();i++){        cc.add(cookies.get(i));      }    }    return cc;  }  //登录微博  public String dologinSina(){    System.out.println("---do login, please hold on...---");    String url="http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)";//v1.3.17    Map<String,String> headers=new HashMap<String,String>();    Map<String,String> params=new HashMap<String,String>();        /*HTTP协议中的headers:http://www.cnblogs.com/yuzhongwusan/archive/2011/10/20/2218954.html     * */    headers.put("Accept", "text/html, application/xhtml+);    headers.put("Referer", "http://login.sina.com.cn/member/my.php?entry=sso");    headers.put("Accept-Language", "zh-cn");    headers.put("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; BOIE9;ZHCN");    headers.put("Host", "login.sina.com.cn");    headers.put("Connection", "Keep-Alive");    headers.put("Content-Type", "application/x-www-form-urlencoded");    headers.put("Cache-Control", "no-cache");    params.put("encoding", "UTF-8");    params.put("entry", "weibo");    params.put("from", "");    params.put("prelt", "112");    params.put("gateway", "1");    params.put("nonce", nonce);    params.put("pwencode", "rsa2");//wsse    params.put("returntype", "META");    params.put("pagerefer", "");    params.put("savestate", "7");      params.put("servertime", servertime);    params.put("rsakv", rsakv);    params.put("service", "miniblog");    params.put("sp", getEncryptedP());    params.put("ssosimplelogin", "1");    params.put("su", getEncodedU());    params.put("url", "http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack");    params.put("useticket", "1");    params.put("vsnf", "1");    HttpResponse 
response=HttpUtils.doPost(url, headers, params);    this.cookies=HttpUtils.getResponseCookies(response);    this.headers=headers;    String responseText=HttpUtils.getStringFromResponse(response);    try {      responseText=new String(responseText.getBytes(),"GBK");      if(!responseText.contains("retcode=0")){        downloadCheckImage();        this.nonce=getnonce();        Scanner s=new Scanner(System.in);        if(responseText.contains("retcode=4049"))          System.out.println("请输入验证码:");        else if(responseText.contains("retcode=2070")){          System.out.println("验证码不正确,请再次输入验证码:");        }        this.door=s.next();        dologinSina();      }    } catch (UnsupportedEncodingException e) {      e.printStackTrace();    }    System.out.println("Congratulations, you have login success!");    return responseText;  }  //登录后重定向  public String redirect(){    String cookieValue=HttpUtils.setCookie2String(this.cookies);    this.headers.clear();    this.headers.put("Accept", "image/gif, image/jpeg, image/pjpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*");    this.headers.put("Accept-Language", "zh-cn");    this.headers.put("Connection", "Keep-Alive");    this.headers.put("Host", "sina.com.cn");    this.headers.put("Referer", "http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)");    this.headers.put("User", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; QQDownload 691)");    this.headers.put("Cookie", cookieValue);    String ssosavestate=""; //SSO即Sina Sign-on,    String ticket = "";    for(Cookie c:this.cookies){      if(c.getName().equals("ALF")){        ssosavestate=c.getValue();      }else if(c.getName().equals("tgc")){        ticket=c.getValue();      }    }    String url="http://weibo.com/ajaxlogin.php?" 
+        "framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack&" +        "sudaref=weibo.com";    HttpResponse response=HttpUtils.doGet(url, this.headers);    response=HttpUtils.doGet(url, this.headers);      String responseText=HttpUtils.getStringFromResponse(response);    return responseText;  }  //生成一次性的字符串 6位 用于加密  private String getnonce() {    String x = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";    String str = "";    for (int i = 0; i < 6; i++) {      str += x.charAt((int)Math.ceil(Math.random() * 1000000) % x.length());    }    return str;  }  //初始化:得到服务区的时间servertime和一次性字符串nonce  private void init(){    String url=compositeUrl();    Map<String,String> headers=new HashMap<String,String>();    headers.put("Accept", "*/*");    headers.put("Referer", "http://weibo.com/");    headers.put("Accept-Language", "zh-cn");    headers.put("User-Agent", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; QQDownload 691)");    headers.put("Host", "login.sina.com.cn");    headers.put("Connection", "Keep-Alive");        HttpResponse response=HttpUtils.doGet(url, headers);    String responseText=HttpUtils.getStringFromResponse(response);    int begin=responseText.indexOf("{");    int end=responseText.lastIndexOf("}");    responseText=responseText.substring(begin,end+1);    PreLoginResponseMessage plrmsg =JsonUtils.jsontoPreLoginResponseMessage(responseText);    this.nonce=plrmsg.getNonce();    this.servertime=plrmsg.getServertime()+"";    this.pubkey=plrmsg.getPubkey();    this.rsakv=plrmsg.getRsakv();    this.pcid=plrmsg.getPcid();  }  //下载验证码  private void downloadCheckImage() {    if(pcid==null) return;    this.headers.remove("Content-Type");    try {      if(this.cookies != null){        this.cookies.clear();      }          } catch (Exception e) {      e.printStackTrace();    }    String cookieValue=HttpUtils.setCookie2String(this.cookies);    this.headers.put("Cookie", cookieValue);    String 
url="http://login.sina.com.cn/cgi/pin.php?r="+(long)(Math.random()*100000000)+"&s=0&p="+this.pcid;    HttpResponse response=HttpUtils.doGet(url, headers);    InputStream in=HttpUtils.getInputStreamFromResponse(response);    try {      //System.out.println(new ClassPathResource("checkImage.jpeg").getFile().getPath());      Utils.writeFileFromStream(new ClassPathResource("checkImage.jpeg").getFile().getPath(), in);    } catch (IOException e) {      e.printStackTrace();    }  }  //组合预登陆时的URL  private String compositeUrl(){    StringBuilder builder=new StringBuilder();    builder.append("http://login.sina.com.cn/sso/prelogin.php?")      .append("entry=weibo&callback=sinaSSOController.preloginCallBack&")      .append("su="+getEncodedU())      .append("&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.5)&_="+System.currentTimeMillis());    return builder.toString();  }  //对用户名进行编码  private String getEncodedU() {    if(username!=null && username.length()>0){      return Base64Encoder.encode(EncodeUtils.encodeURL(username,"utf-8").getBytes());    }    return "";  }  //对密码进行编码  private String getEncryptedP(){//    return EncodeSuAndSp.getEncryptedP(password, servertime, nonce);    String data=servertime+"\t"+nonce+"\n"+password;    String spT=rsaCrypt(pubkey, "10001", data);    return spT;  }    public static String rsaCrypt(String pubkey, String exponentHex, String pwd,String servertime,String nonce) {     String data=servertime+"\t"+nonce+"\n"+pwd;     return rsaCrypt(pubkey,exponentHex,data);  }  public static String rsaCrypt(String pubkey, String exponentHex, String messageg) {      KeyFactory factory=null;      try {        factory = KeyFactory.getInstance("RSA");      } catch (NoSuchAlgorithmException e1) {        return "";      }      BigInteger publicExponent = new BigInteger(pubkey, 16); /* public exponent */      BigInteger modulus = new BigInteger(exponentHex, 16); /* modulus */      RSAPublicKeySpec spec = new RSAPublicKeySpec(publicExponent, modulus);      
RSAPublicKey pub=null;      try {        pub = (RSAPublicKey) factory.generatePublic(spec);      } catch (InvalidKeySpecException e1) {        return "";      }      Cipher enc=null;      byte[] encryptedContentKey =null;      try {        enc = Cipher.getInstance("RSA");        enc.init(Cipher.ENCRYPT_MODE, pub);        encryptedContentKey = enc.doFinal(messageg.getBytes());      } catch (NoSuchAlgorithmException e1) {        System.out.println(e1.getMessage());        return "";      } catch (NoSuchPaddingException e1) {        System.out.println(e1.getMessage());        return "";      } catch (InvalidKeyException e1) {        System.out.println(e1.getMessage());        return "";      } catch (IllegalBlockSizeException e1) {        System.out.println(e1.getMessage());        return "";      } catch (BadPaddingException e1) {        System.out.println(e1.getMessage());        return "";      }       return new String(Hex.encodeHex(encryptedContentKey));  }  public void setUserid(String userid) {    this.userid = userid;  }  public String getUserid() {    return userid;  }  public void setUserdomainname(String userdomainname) {    this.userdomainname = userdomainname;  }  public String getUserdomainname() {    return userdomainname;  }}

SpiderSina类如下:

 1 package main.java.sina.httpclient; 2 import java.util.HashMap; 3 import java.util.List; 4 import java.util.Map; 5  6 import org.apache.http.HttpResponse; 7 import org.apache.http.cookie.Cookie; 8  9 import main.java.sina.utils.Constant; 10 import main.java.sina.utils.EncodeUtils; 11 import main.java.sina.utils.HttpUtils; 12 import main.java.sina.utils.Utils; 13  14 public class SpiderSina { 15   private LoginSina ls; 16   private Map<String,String> headers; 17   private final int ADDFOLLOWING =1; 18   private final int CANCELFOLLOWING =2; 19   public SpiderSina(LoginSina ls){ 20     this.ls=ls; 21     this.headers=new HashMap<String,String>(); 22     headers.put("Accept", "text/html, application/xhtml+); 23     headers.put("Accept-Language", "zh-cn"); 24     headers.put("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; BOIE9;ZHCN"); 25     headers.put("Connection", "Keep-Alive"); 26     headers.put("Cache-Control", "no-cache"); 27     String cookieValue=HttpUtils.setCookie2String(ls.getCookies()); 28     headers.put("Cookie", cookieValue); 29   } 36   public String getGroupCategory(){ 37     String url="http://q.weibo.com/"; 38     this.headers.put("Host", "q.weibo.com"); 39     HttpResponse response=HttpUtils.doGet(url, headers); 40     String responseText=HttpUtils.getStringFromResponse(response); 41     responseText=EncodeUtils.unicdoeToGB2312(responseText); 42     return responseText; 43   } 44   public String search(String keyword, int pageNo){ 47     String url="http://s.weibo.com/weibo/%25E5%25AE%2581%25E6%25B3%25A2%25E5%25A4%25A7%25E5%25AD%25A6&page="+pageNo; 48     String cookieValue = "SINAGLOBAL=8556698272004.724.1417744632425; un=shy_annan@126.com; myuid=5439352084; wvr=6; un=sm2014121904@126.com; _s_tentry=developer.51cto.com; SWB=usrmdinst_14; SUS=SID-5438576807-1419173757-GZ-lrze7-d8e1e3f082b428c12412c8ba30f0a6de; 
SUE=es%3D4cdfdd5d5f0f75141c092b32f89525a2%26ev%3Dv1%26es2%3D469e50c869315e57efeec3012c3bb6a8%26rs0%3DoWdG36CQ33LUEtKTvGn907Zy1mwFETvSVJsxeHEiaMPcKDB7pFxg596a2pLhFLJfQmswf4AvXYAkzTfemrYgWrz%252BQPustEA2wLNYufYpAZqFsGWanhTBq6elzB2yoZp41xcpy1WwXn1CuvzIzzEYpuILjHahkmJDQDQy6KaxlbA%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1419173757%26et%3D1419260157%26d%3Dc909%26i%3Da6de%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D27%26st%3D0%26uid%3D5438576807%26name%3Dsm2014121904%2540126.com%26nick%3DSocialMedia%25E5%259B%259B%25E5%25A8%2583%26fmp%3D%26lcp%3D; SUB=_2A255kq8tDeTxGeNK6FoU9yjEyzuIHXVa6DVlrDV8PUNbvtBeLW3TkW-bMoi0G_bBfpbS3TMqcXg6zDWFLA..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhGThsH46uNrx1VY0ApV0SR5JpX5KMt; ALF=1450709756; SSOLoginState=1419173757; WBStore=bc5ad8450c3f8a48|undefined; Apache=1027467835228.8901.1419173761694; ULV=1419173761704:6:6:1:1027467835228.8901.1419173761694:1418797827169; UOR=www.ilehao.com,widget.weibo.com,login.sina.com.cn; ULOGIN_IMG=14192385783486"; 49     headers.put("Accept", "text/html,application/xhtml+); 50     //headers.put("Accept-Encoding", "gzip, deflate, sdch"); 51     headers.put("Accept-Language", "zh-CN"); 52     headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"); 53     headers.put("Connection", "Keep-Alive"); 54     headers.put("Cache-Control", "max-age=0"); 55     headers.put("Referer", "http://login.sina.com.cn/sso/login.php?url=http%3A%2F%2Fs.weibo.com%2Fweibo%2F%2525E6%252583%2525A0%2525E6%252599%2525AE%26page%3D2&_rand=1419173756.6387&gateway=1&service=weibo&entry=miniblog&useticket=1&returntype=META"); 56     headers.put("Cookie", cookieValue); 57     this.headers.put("Host", "s.weibo.com"); 58     HttpResponse response=HttpUtils.doGet(url, headers); 59     String responseText=HttpUtils.getStringFromResponse(response); 60     responseText=EncodeUtils.unicdoeToGB2312(responseText); 61      62      63     return responseText; 64   } 65    66   public String 
searchCommentsByUid(String uid){ 67      68     String url="http://www.weibo.com/u/"+uid; 69     String cookieValue = "SINAGLOBAL=8556698272004.724.1417744632425; myuid=2035860051; wvr=6; YF-Ugrow-G0=ad06784f6deda07eea88e095402e4243; SSOLoginState=1423150079; YF-V5-G0=32eb5467e9bfc8b60c2d771056535ac5; _s_tentry=www.weibo.com; Apache=6264929557219.147.1423150103832; ULV=1423150103842:18:2:2:6264929557219.147.1423150103832:1422769721265; ULOGIN_IMG=1423233797946; YF-Page-G0=82cdcdfb16327a659fbb60cc9368fb19; SUS=SID-2035860051-1423286223-GZ-jdkh4-c8ea11de0a42151313986e52f9aa6017; SUE=es%3D8701ff5aca59244ff1ff263cf985bee6%26ev%3Dv1%26es2%3D7995c9eb7455697c09fac4f7486e14eb%26rs0%3DTyXXIRjcEw%252BeS5PaVSM%252FhQjc2JGhKBOe3uFTgShiIUAbPFI2eKtrgxM2wIi9A1xndiTFFM72zY%252FDKYFXONrgkao5cRo%252FHkydV%252FnaQjNmXoeESu5gi6Iq0aX883NhGR0utBVNZb5XaIG3X6HMMfBJC%252B7pnVHogEo8eD6cx8nzN5c%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1423286223%26et%3D1423372623%26d%3Dc909%26i%3D6017%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D0%26st%3D0%26uid%3D2035860051%26name%3Dshy_annan%2540126.com%26nick%3D%25E7%2594%25A8%25E6%2588%25B72035860051%26fmp%3D%26lcp%3D2013-08-18%252021%253A48%253A10; SUB=_2A2550e-fDeTxGeRO6FcZ9i7Mzj2IHXVap0ZXrDV8PUNbvtBuLWnTkW-gBGVORTA7J_lSZzAqzW6E50JjBQ..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh7oKNCGYcNnhlC6eqqQbbl5JpX5KMt; SUHB=0M20OGRPiOKzyc; ALF=1454822222; UOR=www.ilehao.com,widget.weibo.com,login.sina.com.cn"; 70     headers.put("Accept", "text/html,application/xhtml+); 71     headers.put("Accept-Language", "zh-CN"); 72     headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"); 73     headers.put("Connection", "Keep-Alive"); 74     headers.put("Cache-Control", "max-age=0"); 75     headers.put("Cookie", cookieValue); 76     this.headers.put("Host", "www.weibo.com"); 77     HttpResponse response=HttpUtils.doGet(url, headers); 78     String responseText=HttpUtils.getStringFromResponse(response); 79     
responseText=EncodeUtils.unicdoeToGB2312(responseText); 82     return responseText; 83   }   85 //爬虫根据关键字,查询时间断,和查询页数 来得到htmlContent 86 public String search(String keyword, int pageNo, String fromdate,String todate){ 87   StringBuffer stringBuffer = new StringBuffer(200);   93   stringBuffer.append("http://s.weibo.com/weibo/"+ keyword +"&page="); 94   stringBuffer.append(pageNo); 95   stringBuffer.append("&typeall=1&suball=1&timescope=custom:"); 96   stringBuffer.append(fromdate); 97   stringBuffer.append(":"); 98   stringBuffer.append(todate); 99   stringBuffer.append("&Refer=g");104   String url = stringBuffer.toString();105   String cookieValue = headers.get("Cookie");106   headers.put("Accept", "text/html,application/xhtml+);107   //headers.put("Accept-Encoding", "gzip, deflate, sdch");108   headers.put("Accept-Language", "zh-CN");109   headers.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");110   headers.put("Connection", "Keep-Alive");111   headers.put("Cache-Control", "max-age=0");112   headers.put("Referer", "http://s.weibo.com/weibo/%25E5%25AE%2581%25E6%25B3%25A2%25E5%25A4%25A7%25E5%25AD%25A6&typeall=1&suball=1&timescope=custom:"+fromdate+":"+todate+"&Refer=g");113   headers.put("Cookie", cookieValue);114   this.headers.put("Host", "s.weibo.com");115   HttpResponse response=HttpUtils.doGet(url, headers);116   String responseText=HttpUtils.getStringFromResponse(response);117   responseText=EncodeUtils.unicdoeToGB2312(responseText);118   119   System.out.println("************htmlContent start***********");120   System.out.println(responseText);121   System.out.println("************htmlContent end***********");125   return responseText;  127 } 129 public void forwardToWeiboPage(){130   String url = Constant.personalHomePage;131   headers.put("Accept", "text/html,application/xhtml+);133   headers.put("Accept-Language", "zh-CN");134   headers.put("User-Agent", "Mozilla/5.0 
(Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36");135   headers.put("Connection", "Keep-Alive");137   this.headers.put("Host", "s.weibo.com");138   HttpResponse response=HttpUtils.doGet(url, headers);139   String responseText=HttpUtils.getStringFromResponse(response);140   responseText=EncodeUtils.unicdoeToGB2312(responseText);141   List<Cookie> cookies = HttpUtils.getResponseCookies(response);142   String cookie = HttpUtils.setCookie2String(cookies);144   headers.put("Cookie", cookie);  146 }150   public String getGroupCategory(int id){151     String url="http://q.weibo.com/class/category/?id="+id;152     this.headers.put("Host", "q.weibo.com");154     HttpResponse response=HttpUtils.doGet(url, headers);155     String responseText=HttpUtils.getStringFromResponse(response);156     responseText=EncodeUtils.unicdoeToGB2312(responseText);157     return responseText;158   }169   //得到微群管理员ID信息,其实用户成员的第一页 HTML页面170   public String getGroupAdministrator(String groupid) {171     String url="http://q.weibo.com/"+groupid+"/members/all";172     this.headers.remove("Referer");173     this.headers.put("Host", "q.weibo.com");174     this.headers.remove("Content-Type");175     this.headers.remove("x-requested-with");176     HttpResponse response=HttpUtils.doGet(url, headers);177     String responseText=HttpUtils.getStringFromResponse(response);178     return responseText;179   }180   //根据微群号和页号得到群成员ID信息 -----JSON格式数据181   public String getGroupMembers(String groupid,int pagenumber){182     this.headers.put("Referer", "http://q.weibo.com/"+groupid+"/members/all");183     this.headers.put("Host", "q.weibo.com");184     this.headers.put("Content-Type", "application/x-www-form-urlencoded");185     this.headers.put("x-requested-with", "); 187     Map<String,String> params=new HashMap<String,String>();188     params.put("_t", "0");189     params.put("page", pagenumber+"");190     params.put("gid", groupid);191     
params.put("query","");192     params.put("tab", "all");193     params.put("vip", "1");194     String url="http://q.weibo.com/ajax/members/page";195     HttpResponse response=HttpUtils.doPost(url, headers, params);196     return HttpUtils.getStringFromResponse(response);197   }198   /*199    * 得到微群中微博信息 经过多次尝试成功200    * 每次获得50个微博记录,page是页号, count值50 可以在1-75之间,但是,每次开始的时候还是从50的倍数开始的201   */202   public String getGroupTopic(int page,int count,String gid){203     this.headers.put("Referer", "http://q.weibo.com/"+gid);204     this.headers.put("Host", "q.weibo.com");205     this.headers.put("Content-Type", "application/x-www-form-urlencoded");206     this.headers.put("x-requested-with", ");207     Integer pre_page=1;208     if(page==1){209       pre_page=2;210     }else{211       pre_page=page-1;212     }213     Map<String,String> params=new HashMap<String,String>();214     params.put("_k", System.currentTimeMillis()+"");215     params.put("_t", "0");216     params.put("count", count+"");217     //params.put("end_id", end_id);218     params.put("gid", gid);219     params.put("is_search","");220     params.put("key_word", "");221     params.put("me", "0");222     params.put("mids", "");223     params.put("new", "0");224     params.put("page", page+"");225     params.put("pagebar", "0"); 226     params.put("pre_page", pre_page+"");227     params.put("since_id", "0");228     params.put("uid", "0");229     230     String url="http://q.weibo.com/ajax/mblog/groupfeed";231     HttpResponse response=HttpUtils.doPost(url, headers, params);232     return HttpUtils.getStringFromResponse(response);233   }234   /*235    * 得到微群中微博信息数目236    * 这个信息中其实还包含了微群的所有的基本信息~~~~~~~~~~****** json格式的数据信息237   */238   public String getGroupMessageNumber(String gid){239     this.headers.put("Referer", "http://q.weibo.com/"+gid);240     this.headers.put("Host", "q.weibo.com");241     this.headers.put("Content-Type", "application/x-www-form-urlencoded");242     this.headers.put("x-requested-with", 
");243     String url="http://q.weibo.com/ajax/rightnav/groupprofile?gid="+gid+"&_t=0&__rnd="+System.currentTimeMillis();244     HttpResponse response=HttpUtils.doGet(url, headers);245     return HttpUtils.getStringFromResponse(response);246   }247   //得到微群的主页信息 HTML页码  主要是为了得到第一条微博记录的MID值248   public String getgroupMainPage(String groupid) {249     String url="http://q.weibo.com/"+groupid+"?topnav=1";250     this.headers.remove("Referer");251     this.headers.put("Host", "q.weibo.com");252     this.headers.remove("Content-Type");253     this.headers.remove("x-requested-with");254     255     HttpResponse response=HttpUtils.doGet(url, headers);256     String responseText=HttpUtils.getStringFromResponse(response);257     return responseText;258   }259   /*260    * 根据分类得到微群信息261    * categroyID :分类ID号262    * pagenumber:页号263    * sort:分类方式 1 按成员人数 2按 微群博数 3按创建时间分类264    * count:每页的记录数目265   */266   public String getGroupByCategroy(int categroyID,int pagenumber,int sort,int count){267     this.headers.put("Referer", "http://q.weibo.com/class/category/?id="+categroyID);268     this.headers.put("Host", "q.weibo.com");269     this.headers.put("Content-Type", "application/x-www-form-urlencoded");270     this.headers.put("x-requested-with", ");271     Map<String,String> params=new HashMap<String,String>();272     params.put("_t", "0");273     params.put("page", pagenumber+"");274     params.put("id", categroyID+"");275     params.put("sort",sort+"");276     params.put("count", count+"");277     278     String url="http://q.weibo.com/ajax/class/category";279     HttpResponse response=HttpUtils.doPost(url, headers,params);280     String responseText=HttpUtils.getStringFromResponse(response);281     responseText=EncodeUtils.unicdoeToGB2312(responseText);282     return responseText;283   }284   //得到表情列表信息285   public String getFaceList(){286     String url="http://weibo.com/aj/mblog/face?type=face&_t=0&__rnd="+System.currentTimeMillis();287     this.headers.put("Referer", 
"http://weibo.com/");288     this.headers.put("Host", "weibo.com");289     this.headers.put("Content-Type", "application/x-www-form-urlencoded");290     this.headers.put("x-requested-with", ");291     292     HttpResponse response=HttpUtils.doGet(url, headers);293     String responseText=HttpUtils.getStringFromResponse(response);294     System.out.println(responseText);295     Utils.writeFileFromString("tmpFile/faceList.txt", responseText);296     return responseText;297   }307   //用户基本信息     主要是将要解析用户主页下方经过编码后的内容308   public String getMemberInfo(String memberID){309     String url="http://weibo.com/"+memberID+"/info";310     this.headers.put("Host", "weibo.com");311     this.headers.put("Referer", "http://weibo.com/u/"+memberID);312     HttpResponse response=HttpUtils.doGet(url, headers);313     String responseText=HttpUtils.getStringFromResponse(response);314     return responseText;315   }316   //用户粉丝用户信息  html页面,每次20个317   public String getMemberFans(String memberID,int page){318     String url="http://weibo.com/"+memberID+"/fans?&uid=1689219395&tag=&page="+page;319     this.headers.put("Host", "weibo.com");320     this.headers.put("Referer", "http://weibo.com/"+memberID+"/fans");321     HttpResponse response=HttpUtils.doGet(url, headers);322     String responseText=HttpUtils.getStringFromResponse(response);323     return responseText;324   }325   //用户关注的用户信息   html页面326   public String getMemberFollowing(String memberID,int page){327     String url="http://weibo.com/"+memberID+"/follow?page="+page;328     this.headers.put("Host", "weibo.com");329     this.headers.put("Referer", "http://weibo.com/"+memberID+"/follow");330     HttpResponse response=HttpUtils.doGet(url, headers);331     String responseText=HttpUtils.getStringFromResponse(response);332     return responseText;333   }334   335   /*336    * @params 337    *  memberID:是用户ID338    *  max_id:每次AJAX获得数据时上面一次的最后一个ID值339    *  end_id:用户最新的一条微博的ID值340    *  k:一个随机数341    *  page:页号342    *  pre_page:前一页343 
   *  count:每次返回的数值 当max_id为null是 count=50 否则为15344    *   pagebar:ajax时,第一次为0,第二次为1345    *  注意:346    *  1 用此请求,每次获得的数据格式都一样,用同样的解析方法来进行解析。347    *  2 每次一页可以获得总共45条记录,需要三次请求。每次请求可获得15条记录。348    *  3 max_id可以不用到,直接等于 end_id就可以了.349    *  4 第一次请求时可以将end_id设置为NUll,即为第一次时翻页时的请求后边的滚动时必须有end_id参数,end_id为第一页的第一条ID即可。350   */351   //获得用户发布的微博信息  json格式的数据  352   public String getMemberReleaseTopic(String memberID,String end_id,Integer page,Integer pagebar){353     String url="";354     Integer pre_page=1;355     Integer count=0;356     String k=System.currentTimeMillis()+""+(int)(Math.random()*100000)%100;357     if(end_id==null){358       count=50;359       if(page==1){360         pre_page=2;361       }else{362         pre_page=page-1;363       }364       url="http://weibo.com/aj/mblog/mbloglist?" +365       "page="+page+"&count="+count+"&pre_page="+pre_page+"&" +366       "_k="+ k+"&uid="+memberID+367       "&_t=0&__rnd="+System.currentTimeMillis();368     }else{369       count=15;370       pre_page=page;371       url="http://weibo.com/aj/mblog/mbloglist?" 
+372       "page="+page+"&count="+count+"&max_id="+end_id+"&" +373       "pre_page="+pre_page+"&end_id="+end_id+"&" +374       "pagebar="+pagebar+"&_k="+k+"&" +375       "uid="+memberID+"&_t=0&__rnd="+System.currentTimeMillis();376     }377     String cookieValue = "SINAGLOBAL=8556698272004.724.1417744632425; un=sm2014121903@126.com; myuid=5439352084; YF-Ugrow-G0=4703aa1c27ac0c4bab8fc0fc5968141e; SSOLoginState=1421374583; wvr=6; YF-V5-G0=8c4aa275e8793f05bfb8641c780e617b; _s_tentry=login.sina.com.cn; Apache=2461283528245.9854.1421374588453; ULV=1421374588550:13:5:3:2461283528245.9854.1421374588453:1421210767499; UOR=www.ilehao.com,widget.weibo.com,login.sina.com.cn; SUS=SID-2035860051-1421462085-GZ-7jcgb-1539d643bae5195fb7f792b2ae77befb; SUE=es%3Df15e11ed09b6a0108a28adfa58609b78%26ev%3Dv1%26es2%3Da0f706efac5c89495062648a4de3e337%26rs0%3DZBxlOUv0mhmxyHfOVmZ3tH7tNvAp08BjPeLUJPdu9WzG38Dsm40px%252Bd9w21ycDpZQwBK3q0prFfNs%252F8ZuZSasa1eps%252FOGNxJ3CIHN8JN%252Fik6gVpIPgVeeRdalNWTIbth6hLa34uOp%252BXii%252Bxeib%252BvINsr%252FdOvQx6kjp6fsC44QXc%253D%26rv%3D0; SUP=cv%3D1%26bt%3D1421462085%26et%3D1421548485%26d%3Dc909%26i%3Dbefb%26us%3D1%26vf%3D0%26vt%3D0%26ac%3D2%26st%3D0%26uid%3D2035860051%26name%3Dshy_annan%2540126.com%26nick%3D%25E7%2594%25A8%25E6%2588%25B72035860051%26fmp%3D%26lcp%3D2013-08-18%252021%253A48%253A10; SUB=_2A255vboVDeTxGeRO6FcZ9i7Mzj2IHXVazdpdrDV8PUNbvtBuLVj-kW91jmbQSGo7Rn30RVvGP5KOgBgNgQ..; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9Wh7oKNCGYcNnhlC6eqqQbbl5JpX5KMt; ALF=1452998078; ULOGIN_IMG=14214638933178; YF-Page-G0=0acee381afd48776ab7a56bd67c2e7ac";378     headers.put("Cookie", cookieValue);379     this.headers.put("Referer", "http://weibo.com/u/"+memberID);380     this.headers.put("Host", "www.weibo.com");381     this.headers.put("Content-Type", "application/x-www-form-urlencoded");382     this.headers.put("x-requested-with", ");383     url = "http://weibo.com/u/"+memberID;384     HttpResponse response=HttpUtils.doGet(url, headers);385     
if(response==null){386       return "";387     }388     return HttpUtils.getStringFromResponse(response);389   }390   /*391    * ~~~~~~~~~~~~~~~~~~~~~获取用户的一些信息~~~end~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~392   */393   394   395   //**********************************************************************************396 397   /*398    * 名人堂与达人信息399   */400   public String getVerified(String url){401     this.headers.put("Host", "verified.weibo.com");402     this.headers.put("Referer", "http://plaza.weibo.com/?topnav=1&wvr=4");403     HttpResponse response=HttpUtils.doGet(url, headers);404     String responseText=HttpUtils.getStringFromResponse(response);405     return responseText;406   }407 408   public String getVerifiedMember(String path,Integer g_index){409     String url="http://verified.weibo.com/aj/getgrouplist?g_index="+g_index+410     "&path="+path+"&_t=0&__rnd="+System.currentTimeMillis();411     this.headers.put("Host", "verified.weibo.com");412     this.headers.put("Referer", path);413     this.headers.put("Content-Type", "application/x-www-form-urlencoded");414     this.headers.put("x-requested-with", ");415     HttpResponse response=HttpUtils.doGet(url, headers);416     String responseText=HttpUtils.getStringFromResponse(response);417   418     return responseText;419   }420   421   public String setArea(Integer provinceID){422     this.headers.put("Referer", "http://club.weibo.com/list");423     this.headers.put("Host", "club.weibo.com");424     this.headers.put("Content-Type", "application/x-www-form-urlencoded");425     this.headers.put("x-requested-with", ");426 427     Map<String,String> params=new HashMap<String,String>();428     429     params.put("_t", "0");430     params.put("city", "1000");431     params.put("prov", provinceID+"");432     433     String url="http://club.weibo.com/ajax_setArea.php";434     HttpResponse response=HttpUtils.doPost(url, headers, params);435     436     List<Cookie> cks=HttpUtils.getResponseCookies(response);437     
List<Cookie> cookies=ls.getCookies();438     cookies.addAll(cks);439     String cookieValue=HttpUtils.setCookie2String(cookies);440     this.headers.put("Cookie", cookieValue);441     442     return HttpUtils.getStringFromResponse(response);443   }444   445   public String getDaRen(Integer page){446     String op="ltime"; 447     String url="http://club.weibo.com/list?sex=3&op="+op+"&page="+page+"&";448     Integer pre_page=(page<=1? 2:page-1);449     this.headers.put("Host", "club.weibo.com");450     this.headers.put("Referer", "http://club.weibo.com/list?sex=3&op=ltime&page="+pre_page+"&");451     this.headers.remove("Content-Type");452     this.headers.remove("x-requested-with");453     454     HttpResponse response=HttpUtils.doGet(url, headers);455     if(response!= null){456       return HttpUtils.getStringFromResponse(response);457     }458     return "";459     460   }470   //发布一条文字微博471   public String releaseTopic(String content){472     this.headers.put("Referer", "http://weibo.com/");473     this.headers.put("Host", "weibo.com");474     this.headers.put("Content-Type", "application/x-www-form-urlencoded");475     this.headers.put("x-requested-with", ");476     Map<String,String> params=new HashMap<String,String>();477     params.put("_t", "0");478     params.put("location", "home");479     params.put("module", "stissue");480     params.put("pic_id", "");481     params.put("text", content);482     String url="http://weibo.com/aj/mblog/add?__rnd="+System.currentTimeMillis();483     HttpResponse response=HttpUtils.doPost(url, headers, params);484     return HttpUtils.getStringFromResponse(response);485   }519   //得到自己关注的成员520   public String getSelfFollowIngs(){521     return "";522   }523   //得到自己的粉丝524   public String getSelfFollowers(){525     return "";526   }527   //得到自己加入的微群528   public String getSelfJoinedGroups(){529     return "";530   }531   //得到自己的标签532   public String getSelfTags(){533     return "";534   }535   //得到自己发布的微博536   public String 
getSelfReleaseTopics(){537     return "";538   }539   //得到自己主页的微博540   public String getSelfPageTopics(){541     return "";542   }543   //关注一个人544   public String addFollowing(String memberid){545     return addorcancleFollowing(memberid,this.ADDFOLLOWING);546   }547   //取消关注一个人548   public String cancelFollowing(String memberid){549     return addorcancleFollowing(memberid,this.CANCELFOLLOWING);550   }551   private String addorcancleFollowing(String memberid,int option){552     String url="";553     switch(option){554       case ADDFOLLOWING:555         url="http://weibo.com/aj/f/followed?__rnd="+System.currentTimeMillis();556         break;557       case CANCELFOLLOWING:558         url="http://weibo.com/aj/f/unfollow?__rnd="+System.currentTimeMillis();559         break;560     }561     562     Map<String,String> params=new HashMap<String,String>();563 564     this.headers.put("Referer", "http://weibo.com/");565     this.headers.put("Host", "weibo.com");566     this.headers.put("Content-Type", "application/x-www-form-urlencoded");567     this.headers.put("Referer", "http://weibo.com/");568     this.headers.put("x-requested-with", ");569     570     params.put("_t", "0");571     params.put("f", "1");572     params.put("location", "profile");573     params.put("refer_flag", "");574     params.put("refer_sort", "profile");575     params.put("uid", memberid);576     577     HttpResponse response=HttpUtils.doPost(url, headers, params);578     return HttpUtils.getStringFromResponse(response);579   }584   /**585    * 得到的标签信息 调用一次10个 586    * @return587   */588   public String getTags(){589     String url="http://account.weibo.com/set/aj/tagsuggest?__rnd="+System.currentTimeMillis();  590     this.headers.put("Referer", "http://account.weibo.com/set/tag#");591     this.headers.put("Host", "account.weibo.com");592     HttpResponse response=HttpUtils.doGet(url, headers);593     return HttpUtils.getStringFromResponse(response);594   }595   596   /**597    * 得到微博热词信息598    * 
@param k :热词的门类599   */600   public String getHotWords(String k){601     String url="http://data.weibo.com/top/keyword?k="+k;  602     try{603       Integer.parseInt(k);604     }catch(Exception ex){605       url="http://data.weibo.com/top/keyword?t="+k;  606     }607     this.headers.put("Referer", "http://data.weibo.com/top/keyword");608     this.headers.put("Host", "data.weibo.com");609     HttpResponse response=HttpUtils.doGet(url, headers);610     return HttpUtils.getStringFromResponse(response);611   }612   613   /**614    * 得到微博热帖子615    * @param cat 表示热帖门类616    * @param page 表示页号617   */618   public String getHotWeibo(String cat,int page){619     String url="http://data.weibo.com/hot/ajax/catfeed?page="+page+"&cat="+cat+"&_t=0&__rnd="+System.currentTimeMillis();  620     this.headers.put("Referer", "http://data.weibo.com/hot/minibloghot");621     this.headers.put("Host", "data.weibo.com");622     HttpResponse response=HttpUtils.doGet(url, headers);623     return HttpUtils.getStringFromResponse(response);624   }625   626   /**627    * 按照分类获取 微博吧名字 第一步628   */629   public String getWeiBar(String ctgid,int p){630     String sort="post";631     String url="http://weiba.weibo.com/aj_f/CategoryList?sort="+sort+"&p="+p+"&ctgid="+ctgid+"&_t=0&__rnd="+System.currentTimeMillis();  632     this.headers.put("Referer", "http://weiba.weibo.com/ct/"+ctgid);633     this.headers.put("Host", "weiba.weibo.com");634     this.headers.put("Accept", "*/*");635     this.headers.put("Content-Type", "application/x-www-form-urlencoded");636     this.headers.put("X-Requested-With", ");637     HttpResponse response=HttpUtils.doGet(url, headers);638     return HttpUtils.getStringFromResponse(response);639   }640   /**641    * 根据微博吧 名称 ,得到该吧内的所有帖子标题 第二步642   */643   public String getWeiBarByWeibarName(String bid,int p){644     String url="http://weiba.weibo.com/aj_t/postlist?bid="+bid+"&p="+p+"&_t=all&__rnd="+System.currentTimeMillis();  645     this.headers.put("Referer", 
"http://weiba.weibo.com/");646     this.headers.put("Host", "weiba.weibo.com");647     this.headers.put("Accept", "*/*");648     this.headers.put("Content-Type", "application/x-www-form-urlencoded");649     this.headers.put("X-Requested-With", ");650     HttpResponse response=HttpUtils.doGet(url, headers);651     return HttpUtils.getStringFromResponse(response);652   }653   654   /**655    * 新浪微公益名单656    * type ="donate"657    * type="discuss"658   */659   public String getWeiGongYiMember(int page,int projectID,String type){660     String url="http://gongyi.weibo.com/aj_personal_helpdata?page="+page+"&type="+type+"&project_id="+projectID+"&_t=0&__rnd="+System.currentTimeMillis();  661     this.headers.put("Referer", "http://gongyi.weibo.com/"+projectID);662     this.headers.put("Host", "gongyi.weibo.com");663     this.headers.put("Accept", "*/*");664     this.headers.put("Content-Type", "application/x-www-form-urlencoded");665     this.headers.put("X-Requested-With", ");666     HttpResponse response=HttpUtils.doGet(url, headers);667     return HttpUtils.getStringFromResponse(response);668   }669 }

 




原标题:用java实现新浪爬虫,代码完整剖析(仅针对当前SinaSignOn有效)

关键词:JAVA

*特别声明:以上内容来自于网络收集,著作权属原作者所有,如有侵权,请联系我们: admin#shaoqun.com (#换成@)。
相关文章
我的浏览记录
最新相关资讯
海外公司注册 | 跨境电商服务平台 | 深圳旅行社 | 东南亚物流