你的位置:首页 > Java教程

[Java教程]crawler4j:轻量级多线程网络爬虫实例


crawler4j是Java实现的开源网络爬虫。提供了简单易用的接口,可以在几分钟内创建一个多线程网络爬虫。

下面实例结合jsoup(中文版API)和javacsv,爬取自如租房网(http://sh.ziroom.com/z/nl/)的租房信息。

1.maven导入相关包

<dependency>
  <groupId>edu.uci.ics</groupId>
  <artifactId>crawler4j</artifactId>
  <version>4.2</version>
</dependency>
<dependency>
  <groupId>org.jsoup</groupId>
  <artifactId>jsoup</artifactId>
  <version>1.8.3</version>
</dependency>
<dependency>
  <groupId>net.sourceforge.javacsv</groupId>
  <artifactId>javacsv</artifactId>
  <version>2.0</version>
</dependency>

2.创建自己的Crawler类 继承 WebCrawler 

 1 public class ZiroomCrawler extends WebCrawler { 2   /** 爬取数据保存文件路径 */ 3   private final static String CSV_PATH = "data/crawl/ziroom.csv"; 4   /** 爬取匹配原则 */ 5   private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g|ico" 6       + "|png|tiff?|mid|mp2|mp3|mp4" + "|wav|avi|mov|mpeg|ram|m4v|pdf" + "|rm|smil|wmv|swf|wma|zip|rar|gz))$"); 7   /** 爬取link文件路径 */ 8   private final static String LINK_PATH = "data/crawl/link.csv"; 9   private static final Logger logger = LoggerFactory.getLogger(ZiroomCrawler.class); 10  11   private final static String URL_PREFIX = "http://sh.ziroom.com/z/nl/"; 12  13   private final File csv; 14  15   private final File csv2; 16   private CsvWriter cw; 17   private CsvWriter cw2; 18  19   /** 20    * You should implement this function to specify whether the given url 21    * should be crawled or not (based on your crawling logic). 22   */ 23   CrawlStat myCrawlStat; 24  25   public ZiroomCrawler() throws IOException { 26     myCrawlStat = new CrawlStat(); 27     csv = new File(CSV_PATH); 28     csv2 = new File(LINK_PATH); 29     if (csv.isFile()) { 30       csv.delete(); 31     } 32     if (csv2.isFile()) { 33       csv2.delete(); 34     } 35     cw2 = new CsvWriter(new FileWriter(csv2, true), ','); 36     cw2.write("请求路径"); 37     cw2.endRecord(); 38     cw2.close(); 39     cw = new CsvWriter(new FileWriter(csv, true), ','); 40     cw.write("图片"); 41     cw.write("价格"); 42     cw.write("地址"); 43     cw.write("说明"); 44     cw.endRecord(); 45     cw.close(); 46   } 47  48   public void dumpMyData() { 49     final int id = getMyId(); 50     // You can configure the log to output to file 51     logger.info("Crawler {} > Processed Pages: {}", id, myCrawlStat.getTotalProcessedPages()); 52     logger.info("Crawler {} > Total Links Found: {}", id, myCrawlStat.getTotalLinks()); 53     logger.info("Crawler {} > Total Text Size: {}", id, myCrawlStat.getTotalTextSize()); 54   } 55  56   @Override 57   public Object 
getMyLocalData() { 58     return myCrawlStat; 59   } 60  61   @Override 62   public void onBeforeExit() { 63     dumpMyData(); 64   } 65  66   /* 67    * 这个方法决定了要抓取的URL及其内容,例子中只允许抓取“http://sh.ziroom.com/z/nl/”这个域的页面, 68    * 不允许.css、.js和多媒体等文件 69    * 70    * @see edu.uci.ics.crawler4j.crawler.WebCrawler#shouldVisit(edu.uci.ics. 71    * crawler4j.crawler.Page, edu.uci.ics.crawler4j.url.WebURL) 72   */ 73   @Override 74   public boolean shouldVisit(Page referringPage, WebURL url) { 75     final String href = url.getURL().toLowerCase(); 76  77     if (FILTERS.matcher(href).matches() || !href.startsWith(URL_PREFIX)) { 78       return false; 79     } 80     return true; 81   } 82  83   /* 84    * 当URL下载完成会调用这个方法。你可以轻松获取下载页面的url, 文本, 链接, html,和唯一id等内容。 85    * 86    * @see 87    * edu.uci.ics.crawler4j.crawler.WebCrawler#visit(edu.uci.ics.crawler4j. 88    * crawler.Page) 89   */ 90   @Override 91   public void visit(Page page) { 92     final String url = page.getWebURL().getURL(); 93     System.out.println("-----------爬取路径:" + url); 94     myCrawlStat.incProcessedPages(); 95     if (page.getParseData() instanceof HtmlParseData) { 96       final HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); 97       final Set<WebURL> links = htmlParseData.getOutgoingUrls(); 98       try { 99         linkToCsv(links);100       } catch (final IOException e2) {101         // TODO Auto-generated catch block102         e2.printStackTrace();103       }104       myCrawlStat.incTotalLinks(links.size());105       try {106         myCrawlStat.incTotalTextSize(htmlParseData.getText().getBytes("UTF-8").length);107       } catch (final UnsupportedEncodingException e1) {108         // TODO Auto-generated catch block109         e1.printStackTrace();110       }111       final String html = htmlParseData.getHtml();112 113       final Document doc = Jsoup.parse(html);114 115       final Elements contents = doc.select("li[class=clearfix]");116 117       for (final Element c : contents) 
{118         // 图片119         final String img = c.select(".img img").first().attr("src");120         System.out.println("图片:" + img);121 122         // 地址123         final Element txt = c.select("div[class=txt]").first();124         final String arr1 = txt.select("h3 a").first().text();125         final String arr2 = txt.select("h4 a").first().text();126         final String arr3 = txt.select("div[class=detail]").first().text();127 128         final String arr = arr1.concat(arr1 + ",").concat(arr2 + ",").concat(arr3);129         System.out.println("地址:" + arr);130         // 说明131         final String rank = txt.select("p").first().text();132         System.out.println("说明:" + rank);133 134         // 价格135         final String pirce = c.select("p[class=price]").first().text();136 137         try {138           cw = new CsvWriter(new FileWriter(csv, true), ',');139           cw.write(img);140           cw.write(pirce);141           cw.write(arr);142           cw.write(rank);143           cw.endRecord();144           cw.flush();145           cw.close();146         } catch (final IOException e) {147           e.printStackTrace();148         }149       }150     }151   }152 153   private void linkToCsv(Set<WebURL> links) throws IOException {154     cw2 = new CsvWriter(new FileWriter(csv2, true), ',');155     for (final WebURL webURL : links) {156       cw2.write(webURL.getURL());157     }158     cw2.flush();159     cw2.endRecord();160     cw2.close();161 162   }

View Code
/**
 * Mutable holder for three crawl counters: processed pages, links found, and text size.
 */
public class CrawlStat {
  private int processedPages;
  private long links;
  private long textSize;

  // --- processed pages ---

  /** Returns the number of processed pages. */
  public int getTotalProcessedPages() {
    return processedPages;
  }

  /** Overwrites the number of processed pages. */
  public void setTotalProcessedPages(int totalProcessedPages) {
    processedPages = totalProcessedPages;
  }

  /** Adds one to the number of processed pages. */
  public void incProcessedPages() {
    processedPages += 1;
  }

  // --- links ---

  /** Returns the number of links counted so far. */
  public long getTotalLinks() {
    return links;
  }

  /** Overwrites the link count. */
  public void setTotalLinks(long totalLinks) {
    links = totalLinks;
  }

  /** Adds {@code count} to the link count. */
  public void incTotalLinks(int count) {
    links = links + count;
  }

  // --- text size ---

  /** Returns the accumulated text size. */
  public long getTotalTextSize() {
    return textSize;
  }

  /** Overwrites the accumulated text size. */
  public void setTotalTextSize(long totalTextSize) {
    textSize = totalTextSize;
  }

  /** Adds {@code count} to the accumulated text size. */
  public void incTotalTextSize(int count) {
    textSize = textSize + count;
  }
}

View Code

 


3.编写运行脚本的类

public class ZiroomController {  public static void main(String[] args) {    System.out.println("-------begin:" + new Timestamp(System.currentTimeMillis()));    final String crawlStorageFolder = "data/crawl/root";    final int numberOfCrawlers = 7;    final CrawlConfig config = new CrawlConfig();    config.setCrawlStorageFolder(crawlStorageFolder);    config.setPolitenessDelay(1000);    config.setIncludeBinaryContentInCrawling(false);    config.setMaxPagesToFetch(50);    // config.setResumableCrawling(true);    /*     * Instantiate the controller for this crawl.     */    final PageFetcher pageFetcher = new PageFetcher(config);    final RobotstxtConfig robotstxtConfig = new RobotstxtConfig();    final RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);    CrawlController controller;    try {      controller = new CrawlController(config, pageFetcher, robotstxtServer);      /*       * For each crawl, you need to add some seed urls. These are the       * first URLs that are fetched and then the crawler starts following       * links which are found in these pages       */      controller.addSeed("http://sh.ziroom.com/z/nl/");      // controller.addSeed("http://www.ziroom.com/z/nl/z3-u2.html/");      // controller.addSeed("http://www.ics.uci.edu/~welling/");      // controller.addSeed("http://www.ics.uci.edu/");      /*       * Start the crawl. This is a blocking operation, meaning that your       * code will reach the line after this only when crawling is       * finished.       
*/      controller.start(ZiroomCrawler.class, numberOfCrawlers);      final List<Object> crawlersLocalData = controller.getCrawlersLocalData();      long totalLinks = 0;      long totalTextSize = 0;      int totalProcessedPages = 0;      for (final Object localData : crawlersLocalData) {        final CrawlStat stat = (CrawlStat) localData;        totalLinks += stat.getTotalLinks();        totalTextSize += stat.getTotalTextSize();        totalProcessedPages += stat.getTotalProcessedPages();      }      System.out.println("Aggregated Statistics:");      System.out.println("\tProcessed Pages: {}" + totalProcessedPages);      System.out.println("\tTotal Links found: {}" + totalLinks);      System.out.println("\tTotal Text Size: {}" + totalTextSize);    } catch (final Exception e) {      // TODO Auto-generated catch block      e.printStackTrace();    }  }}

View Code

刚开始写博客 仅供参考!请多指教!

开源地址: https://github.com/yasserg/crawler4j