你的位置:首页 > Java教程

[Java教程]lucene4.0 基于smb文件服务器的全文检索


使用lucene 4.0版本的全文检索

所需要的jar包 

网速太慢,下次有空再把jar传上来

1.FileIndex  建立索引,查询,删除,更新

package com.strongit.tool.retrieval;import java.io.File;import java.net.MalformedURLException;import java.util.ArrayList;import java.util.Date;import java.util.List;import jcifs.smb.SmbException;import jcifs.smb.SmbFile;import jcifs.smb.SmbFileFilter;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.LongField;import org.apache.lucene.document.TextField;import org.apache.lucene.document.Field.Store;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.Term;import org.apache.lucene.queryparser.classic.QueryParser;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.apache.lucene.util.Version;import com.strongit.util.BaseinfoConfigurer;public class FileIndex {		private static String INDEX_DIR = "D:\\index";	private static Analyzer analyzer = null;	private static Directory directory = null;	private static IndexWriter indexWriter = null;	private static String content = "";		 public static void main(String[] args) {	    try {//	    createIndex();//创建索引//	      search("测试");//	      insert();//新增索引,不删除之前的//	      delete("1470817624520");//	      update();	    } catch (Exception e) {	      e.printStackTrace();	    }	  }	/**	 * 删除索引	 * 	 * @param @param str 删除的关键字 建立索引时的id	 * @param @throws Exception	 * @author wusongxiao	 * @date 2016年8月10日	 */	public static void delete(String str) throws Exception {		Date date1 = new Date();		analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);		directory = FSDirectory.open(new File(INDEX_DIR));		IndexWriterConfig config = new IndexWriterConfig(				Version.LUCENE_CURRENT, analyzer);		indexWriter = new IndexWriter(directory, config);		// indexWriter.deleteDocuments(new Term("filename",str));		indexWriter.deleteDocuments(new Term("id", str)); // 建立索引时 给这个索引赋一个id		indexWriter.close();		Date date2 = new Date();		System.out.println("删除索引耗时:" + (date2.getTime() - date1.getTime())				+ "ms\n");	}	/**	 * 新增加索引,不覆盖之前的	 * 	 * @Description: TODO	 * @param @throws Exception	 * @return void	 * @throws	 * @author wusongxiao	 * @date 2016年8月10日	 */	public static void insert(List listname) throws Exception {//		String path = "smb://admini:2014wh@192.168.168.140/resource/Teaching/test001.txt";		for(int j =0;j<listname.size();j++){						String path=  listname.get(j); //文件地址			SmbFile folder = new SmbFile(path);			List<SmbFile> fileList = new ArrayList<SmbFile>();			fileList.add(folder);			for (SmbFile file : fileList) {				content = "";				// 获取文件后缀				String type = file.getName().substring(						file.getName().lastIndexOf(".") + 1);				if ("txt".equalsIgnoreCase(type)) {					content += ReadFile.readTxt(file.getPath(),"gb2312");				} else if ("doc".equalsIgnoreCase(type)) {					content += ReadFile.readWorddoc(file.getPath());				} else if ("xls".equalsIgnoreCase(type)) {					content += ReadFile.xls2String(file.getPath());				} else if ("xlsx".equalsIgnoreCase(type)) {					content += ReadFile.readExcel2007(file.getPath());				} else if ("ppt".equalsIgnoreCase(type)) {					content += ReadFile.readPowerPoint(file.getPath());				} else if ("pdf".equalsIgnoreCase(type)) {					content += ReadFile.readPdf(file.getPath());				}else if ("docx".equalsIgnoreCase(type)) {					content += ReadFile.readWorddocx(file.getPath());				}								//				System.out.println("name :" + file.getName());//名称//				System.out.println("path :" + file.getPath());//地址//               System.out.println("content :"+content);//content内容				try {					analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);					directory = FSDirectory.open(new File(INDEX_DIR));					File indexFile = new File(INDEX_DIR);					if (!indexFile.exists()) {						indexFile.mkdirs();					}					IndexWriterConfig config = new IndexWriterConfig(							Version.LUCENE_CURRENT, analyzer);					indexWriter = new IndexWriter(directory, config);			//		String ID = pathname[1].toString();//赋值一个唯一的ID,方便删除					Document document = new Document();					document.add(new TextField("filename", file.getName(),							Store.YES));					document.add(new TextField("content", content, Store.YES));					document.add(new TextField("path", file.getPath(), Store.YES));			//		document.add(new TextField("id", ID, Store.YES));					indexWriter.addDocument(document);					indexWriter.commit();					ReadFile.closeWriter(indexWriter);				} catch (Exception e) {					e.printStackTrace();				}				content = "";			}					}			}	/**	 * 查询索引	 * 	 * @Description: TODO	 * @param @param str 查询关键字	 * @param @throws Exception	 * @return void	 * @throws	 * @author wusongxiao	 * @date 2016年8月10日	 */	public static List search(String str) throws Exception {		directory = FSDirectory.open(new File(INDEX_DIR));		analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);		DirectoryReader ireader = DirectoryReader.open(directory);		IndexSearcher isearcher = new IndexSearcher(ireader);
QueryParser parser = new QueryParser(Version.LUCENE_30, "content", analyzer);//LUCENE_30 不分词查询,只搜关键词 Query query = parser.parse(str); List liatname = new ArrayList(); ScoreDoc[] hits = isearcher.search(query, null, 10000).scoreDocs; for (int i = 0; i < hits.length; i++) { Document hitDoc = isearcher.doc(hits[i].doc); System.out.println(hitDoc.get("filename"));// 文件名 liatname.add(hitDoc.get("filename")); // System.out.println(hitDoc.get("content"));//内容 } ireader.close(); directory.close(); return liatname; } /** * 更新索引 更新原来索引的内容---只是改变原来文件的索引 * * @Description: TODO * @param @throws Exception * @return void * @throws * @author wusongxiao * @date 2016年8月10日 */ public static void update() throws Exception { String path = "D:\\file\\file\\f1\\test2.txt"; SmbFile folder = new SmbFile(path); List<SmbFile> fileList = new ArrayList<SmbFile>(); fileList.add(folder); Date date1 = new Date(); for (SmbFile file : fileList) { content = ""; // 获取文件后缀 String type = file.getName().substring( file.getName().lastIndexOf(".") + 1); if ("txt".equalsIgnoreCase(type)) { content += ReadFile.readTxt(file.getPath(),"gb2312"); } else if ("doc".equalsIgnoreCase(type)) { content += ReadFile.readWorddoc(file.getPath()); } else if ("xls".equalsIgnoreCase(type)) { content += ReadFile.xls2String(file.getPath()); } else if ("xlsx".equalsIgnoreCase(type)) { content += ReadFile.readExcel2007(file.getPath()); } else if ("ppt".equalsIgnoreCase(type)) { content += ReadFile.readPowerPoint(file.getPath()); } else if ("pdf".equalsIgnoreCase(type)) { content += ReadFile.readPdf(file.getPath()); }else if ("docx".equalsIgnoreCase(type)) { content += ReadFile.readWorddocx(file.getPath()); } System.out.println("name :" + file.getName()); System.out.println("path :" + file.getPath()); // System.out.println("content :"+content);//content内容 System.out.println(); try { analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); directory = FSDirectory.open(new File(INDEX_DIR)); File indexFile = new File(INDEX_DIR); if (!indexFile.exists()) { indexFile.mkdirs(); } IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_CURRENT, analyzer); indexWriter = new IndexWriter(directory, config); Long time = date1.getTime(); String tt = time.toString(); Document document = new Document(); document.add(new TextField("filename", file.getName(), Store.YES)); document.add(new TextField("content", content, Store.YES)); document.add(new TextField("path", file.getPath(), Store.YES)); document.add(new TextField("id", tt, Store.YES)); indexWriter.updateDocument(new Term("filename", "text1"), document); indexWriter.close(); ReadFile.closeWriter(indexWriter); } catch (Exception e) { e.printStackTrace(); } content = ""; } Date date2 = new Date(); System.out.println("更新索引耗时:" + (date2.getTime() - date1.getTime()) + "ms\n"); } /** * 创建索引,删除之前的索引,更新全部文件的索引 * * @Description: TODO * @param @param path * @param @return * @param @throws Exception * @return boolean * @throws * @author wusongxiao * @date 2016年8月10日 */ public static boolean createIndex(String path) throws Exception { // public static boolean createIndex() throws Exception { Date date1 = new Date(); String username = (String) BaseinfoConfigurer .getContextProperty("username"); String possword = (String) BaseinfoConfigurer .getContextProperty("possword"); String fileServerIp = (String) BaseinfoConfigurer .getContextProperty("fileServerIp"); String sharedirectory = (String) BaseinfoConfigurer .getContextProperty("sharedirectory"); path = "smb" + "://" + username + ":" + possword + "@" + fileServerIp + "/" + sharedirectory + "/"; //删除之前索引 ReadFile.deleteDir(new File(INDEX_DIR+"\\")); // String path = "smb://admini:2014wh@192.168.168.140/resource/"; SmbFile folder = new SmbFile(path); SmbFile[] result = searchFile(folder);// 根目录下的所有文件夹文件 for (SmbFile file : result) { content = ""; // 获取文件后缀 String type = file.getName().substring( file.getName().lastIndexOf(".") + 1); if ("txt".equalsIgnoreCase(type)) { content += ReadFile.readTxt(file.getPath(),"gb2312"); } else if ("doc".equalsIgnoreCase(type)) { content += ReadFile.readWorddoc(file.getPath()); } else if ("xls".equalsIgnoreCase(type)) { content += ReadFile.xls2String(file.getPath()); } else if ("xlsx".equalsIgnoreCase(type)) { content += ReadFile.readExcel2007(file.getPath()); } else if ("ppt".equalsIgnoreCase(type)) { content += ReadFile.readPowerPoint(file.getPath()); } else if ("pdf".equalsIgnoreCase(type)) { content += ReadFile.readPdf(file.getPath()); }else if ("docx".equalsIgnoreCase(type)) { content += ReadFile.readWorddocx(file.getPath()); } System.out.println("name :" + file.getName()); System.out.println("path :" + file.getPath()); // System.out.println("content :"+content); System.out.println(); try { analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); directory = FSDirectory.open(new File(INDEX_DIR)); File indexFile = new File(INDEX_DIR); if (!indexFile.exists()) { indexFile.mkdirs(); } IndexWriterConfig config = new IndexWriterConfig( Version.LUCENE_CURRENT, analyzer); indexWriter = new IndexWriter(directory, config); Document document = new Document(); document.add(new TextField("filename", file.getName(), Store.YES)); document.add(new TextField("content", content, Store.YES)); document.add(new TextField("path", file.getPath(), Store.YES)); // document.add(new TextField("id", tt, Store.YES)); indexWriter.addDocument(document); indexWriter.commit(); ReadFile.closeWriter(indexWriter); } catch (Exception e) { e.printStackTrace(); } content = ""; } Date date2 = new Date(); System.out.println("创建索引-----耗时:" + (date2.getTime() - date1.getTime()) + "ms\n"); return true; } /** * 递归查找所有的文件 * * @Description: TODO * @param @param folder * @param @return * @return SmbFile[] * @throws * @author wusongxiao * @date 2016年8月10日 */ public static SmbFile[] searchFile(SmbFile folder) { SmbFile[] subFolders = null; try { subFolders = folder.listFiles(new SmbFileFilter() { // 运用内部匿名类获得文件 @Override public boolean accept(SmbFile pathname) {// 实现FileFilter类的accept方法 try { if (pathname.isDirectory() || (pathname.isFile())) {// 目录或文件包含关键字 return true; } } catch (SmbException e) { } return false; } }); } catch (SmbException e1) { e1.printStackTrace(); } List<SmbFile> result = new ArrayList<SmbFile>();// 声明一个集合 for (int i = 0; i < subFolders.length; i++) {// 循环显示文件夹或文件 try { if (subFolders[i].isFile()) {// 如果是文件则将文件添加到结果列表中 result.add(subFolders[i]); } else {// 如果是文件夹,则递归调用本方法,然后把所有的文件加到结果列表中 SmbFile[] foldResult = searchFile(subFolders[i]); for (int j = 0; j < foldResult.length; j++) {// 循环显示文件 String smname = foldResult[j].toString(); String txtname = smname.substring(smname .lastIndexOf("/") + 1);// 截取文件名 String txtName = txtname.substring(txtname .lastIndexOf("."));// 截取格式 if (".txt".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } else if (".ppt".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } else if (".doc".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } else if (".xls".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } else if (".xlsx".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } else if (".pdf".equals(txtName)) { result.add(foldResult[j]);// 文件保存到集合中 } } } } catch (SmbException e) { e.printStackTrace(); } } SmbFile files[] = new SmbFile[result.size()];// 声明文件数组,长度为集合的长度 result.toArray(files);// 集合数组化 return files; }}

  

 

2.读取文档的方法类 txt,xlsx,xls,ppt,pdf,doc, docx(不能读取图片)

package com.strongit.tool.retrieval;import java.io.BufferedReader;import java.io.File;import java.io.IOException;import java.io.InputStreamReader;import java.util.List; import jcifs.smb.SmbFileInputStream;import jxl.Cell;import jxl.Sheet;import jxl.Workbook;import org.apache.lucene.index.IndexWriter;import org.apache.pdfbox.pdfparser.PDFParser;import org.apache.pdfbox.util.PDFTextStripper;import org.apache.poi.hslf.HSLFSlideShow;import org.apache.poi.hslf.model.Slide;import org.apache.poi.hslf.model.TextRun;import org.apache.poi.hslf.usermodel.SlideShow;import org.apache.poi.hwpf.HWPFDocument;import org.apache.poi.hwpf.usermodel.Paragraph;import org.apache.poi.hwpf.usermodel.Range;import org.apache.poi.xssf.usermodel.XSSFCell;import org.apache.poi.xssf.usermodel.XSSFRow;import org.apache.poi.xssf.usermodel.XSSFSheet;import org.apache.poi.xssf.usermodel.XSSFWorkbook;import org.apache.poi.xwpf.usermodel.XWPFDocument;import org.apache.poi.xwpf.usermodel.XWPFParagraph;/** * 读取文档方法 * ClassName: ReadFile * @Description: TODO * @date 2016年8月10日 * @author wsx */public class ReadFile {		 private static ReadFile indexManager;		 /**   * 读取doc文件内容   * @param filepath 想要读取的文件地址   * @return 返回文件内容   */	 public static String readWorddoc(String filepath) {			StringBuffer content = new StringBuffer("");// 文档内容			try {				HWPFDocument doc = new HWPFDocument(new SmbFileInputStream(filepath));				Range range = doc.getRange();				int paragraphCount = range.numParagraphs();// 段落				for (int i = 0; i < paragraphCount; i++) {// 遍历段落读取数据					Paragraph pp = range.getParagraph(i);					content.append(pp.text());				}			} catch (Exception e) {				e.printStackTrace();			}			return content.toString().trim();		}	 /**	 * docx 格式建立索引,图片没有读到,只读取的数据 	 * @Description: TODO	 * @param @param filepath	 * @param @return  	 * @return String 	 * @date 2016年8月12日	 */	 public static String readWorddocx(String filepath) {			StringBuffer content = new StringBuffer("");// 文档内容			try { 				//   D://file//docx.docx   D://file//doc.doc//				filepath = "D://file//docx.docx";				SmbFileInputStream in = new SmbFileInputStream(filepath);//载入文档 				//word docx 图片不会被读取,只读取数据          XWPFDocument xwpf = new XWPFDocument(in);//得到word文档的信息        List<XWPFParagraph> listParagraphs = xwpf.getParagraphs();//得到段落信息              for(int i =0;i<listParagraphs.size();i++){      	 String cont = listParagraphs.get(i).getRuns().toString();      	 content.append(cont);       }        	    } catch (Exception e) { 	      e.printStackTrace(); 	    } 			return content.toString().trim();		}    /**   * 读取xls文件内容   * @param filepath 想要读取的文件对象   * @return 返回文件内容   */  public static String xls2String(String filepath){    String result = "";    try{      SmbFileInputStream fis = new SmbFileInputStream(filepath);        StringBuilder sb = new StringBuilder();        jxl.Workbook rwb = Workbook.getWorkbook(fis);        Sheet[] sheet = rwb.getSheets();        for (int i = 0; i < sheet.length; i++) {          Sheet rs = rwb.getSheet(i);          for (int j = 0; j < rs.getRows(); j++) {           Cell[] cells = rs.getRow(j);           for(int k=0;k<cells.length;k++)           sb.append(cells[k].getContents() + " ");          }        }        fis.close();        result += sb.toString();    }catch(Exception e){      e.printStackTrace();    }    return result;  }  /**   * PDF格式 文件创建索引   * @Description: TODO   * @param @param path   * @param @return   * @param @throws Exception    * @return String    * @date 2016年8月11日   */  public static String readPdf(String path) throws Exception {    StringBuffer content = new StringBuffer("");// 文档内容    SmbFileInputStream fis = new SmbFileInputStream(path);    PDFParser p = new PDFParser(fis);    p.parse();    PDFTextStripper ts = new PDFTextStripper();    content.append(ts.getText(p.getPDDocument()));    fis.close();    return content.toString().trim();  }    /**   * 读取xlsx格式的excel文档   * @param @param filepath   * @param @throws IOException    * @author wusongxiao   * @date 2016年8月10日   */  public static String readExcel2007(String filepath) throws IOException {//		System.out.println(filepath);		StringBuffer content = new StringBuffer();		// 构造 XSSFWorkbook 对象,strPath 传入文件路径 **** SmbFileInputStream SMB读取文件 ***		XSSFWorkbook xwb = new XSSFWorkbook(new SmbFileInputStream(filepath));		// 循环工作表Sheet		for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) {			XSSFSheet xSheet = xwb.getSheetAt(numSheet);			if (xSheet == null) {				continue;			}			// 循环行Row			for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) {				XSSFRow xRow = xSheet.getRow(rowNum);				if (xRow == null) {					continue;				}				// 循环列Cell				for (int cellNum = 0; cellNum <= xRow.getLastCellNum(); cellNum++) {					XSSFCell xCell = xRow.getCell(cellNum);					if (xCell == null) {						continue;					}					String s = null;					if (xCell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) {						content.append(xCell.getBooleanCellValue());					} else if (xCell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) {						content.append(xCell.getNumericCellValue());					} else {						content.append(xCell.getStringCellValue() + ""); //+ "\n"					}				}			}		}		return content.toString();	}  /**   * 读取txt文档   * @param @param filepath 地址   * @param @param charSet 编码格式   * @param @throws IOException    * @author wusongxiao   * @date 2016年8月10日   */  public static String readTxt(String filepath, String charSet)			throws IOException {		BufferedReader reader = new BufferedReader(new InputStreamReader(				new SmbFileInputStream(filepath), charSet)); //reader.readLine() 读取txt文本 String的		String line = new String();		String temp = new String();		while ((line = reader.readLine()) != null) {			temp += line;		}		reader.close();		return temp;	}  /**   * 读取ppt文件   * @Description: TODO   * @param @param filepath   * @param @return    * @return String    * @date 2016年8月10日   */  public static String readPowerPoint(String filepath) {		StringBuffer content = new StringBuffer("");		try {			SlideShow ss = new SlideShow(new HSLFSlideShow(new SmbFileInputStream(filepath)));// is			// 为文件的InputStream,建立SlideShow			Slide[] slides = ss.getSlides();// 获得每一张幻灯片			for (int i = 0; i < slides.length; i++) {				TextRun[] t = slides[i].getTextRuns();// 为了取得幻灯片的文字内容,建立TextRun				for (int j = 0; j < t.length; j++) {					content.append(t[j].getText());// 这里会将文字内容加到content中去				}			}		} catch (Exception ex) {			System.out.println(ex.toString());		}		return content.toString();	}    public static void closeWriter(IndexWriter indexWriter) throws Exception {    if (indexWriter != null) {      indexWriter.close();    }  }    /**   * 创建索引管理器   * @return 返回索引管理器对象   */  public ReadFile getManager(){    if(indexManager == null){      this.indexManager = new ReadFile();    }    return indexManager;  }  /**   * 删除目录下的所有索引   * @Description: TODO   * @param @param file   * @param @return    * @return boolean    * @throws   * @author wusongxiao   * @date 2016年8月10日   */  public static boolean deleteDir(File file){    if(file.isDirectory()){      File[] files = file.listFiles();      for(int i=0; i<files.length; i++){        deleteDir(files[i]);      }    }    file.delete();    return true;  }}

  

整个都是基于SMB 文件服务器的lucene4.0全文检索,如果是本地文件的话   只需要把所有的地址   类似 SmbFileInputStream  去掉 Smb   就可以了