你的位置:首页 > Java教程

[Java教程]java word文档 转 html文件


一、简介
  一般word文件后缀有doc、docx两种。docx是office word 2007以及以后版本文档的扩展名;doc是office word 2003文档保存的扩展名。对于这两种格式的word转换成html需要使用不同的方法。
对于docx格式的文档使用xdocreport进行转换。依赖如下:

<dependency>  <groupId>fr.opensagres.xdocreport</groupId>  <artifactId>fr.opensagres.xdocreport.document</artifactId>  <version>1.0.5</version></dependency><dependency>   <groupId>fr.opensagres.xdocreport</groupId>   <artifactId>org.apache.poi.xwpf.converter.xhtml</artifactId>   <version>1.0.5</version> </dependency>

对于doc格式的文档使用poi进行转换。依赖如下:

<dependency>  <groupId>org.apache.poi</groupId>  <artifactId>poi</artifactId>  <version>3.12</version></dependency><dependency>  <groupId>org.apache.poi</groupId>  <artifactId>poi-scratchpad</artifactId>  <version>3.12</version></dependency>

二、示例
  代码示例如下:

 1 package com.test.word; 2  3 import java.io.File; 4 import java.io.FileInputStream; 5 import java.io.FileNotFoundException; 6 import java.io.FileOutputStream; 7 import java.io.IOException; 8 import java.io.InputStream; 9 import java.io.OutputStream; 10  11 import javax. 12 import javax. 13 import javax. 14 import javax. 15 import javax. 16 import javax. 17 import javax. 18 import javax. 19  20 import org.apache.poi.hwpf.HWPFDocument; 21 import org.apache.poi.hwpf.converter.PicturesManager; 22 import org.apache.poi.hwpf.converter.WordToHtmlConverter; 23 import org.apache.poi.hwpf.usermodel.PictureType; 24 import org.apache.poi.xwpf.converter.core.FileImageExtractor; 25 import org.apache.poi.xwpf.converter.core.FileURIResolver; 26 import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter; 27 import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions; 28 import org.apache.poi.xwpf.usermodel.XWPFDocument; 29 import org.junit.Test; 30 import org.w3c.dom.Document; 31  32 /** 33  * word 转换成html 34 */ 35 public class WordToHtml { 36    37   /** 38    * 2007版本word转换成html 39    * @throws IOException 40   */ 41   @Test  42   public void Word2007ToHtml() throws IOException { 43     String filepath = "C:/test/"; 44     String fileName = "滕王阁序2007.docx"; 45     String htmlName = "滕王阁序2007.html"; 46     final String file = filepath + fileName; 47     File f = new File(file);  48     if (!f.exists()) {  49       System.out.println("Sorry File does not Exists!");  50     } else {  51       if (f.getName().endsWith(".docx") || f.getName().endsWith(".DOCX")) {  52           53         // 1) 加载word文档生成 XWPFDocument对象  54         InputStream in = new FileInputStream(f);  55         XWPFDocument document = new XWPFDocument(in);  56   57         // 2) 解析 XHTML配置 (这里设置IURIResolver来设置图片存放的目录)  58         File imageFolderFile = new File(filepath);  59         XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));  60         options.setExtractor(new 
FileImageExtractor(imageFolderFile));  61         options.setIgnoreStylesIfUnused(false);  62         options.setFragment(true);  63           64         // 3) 将 XWPFDocument转换成XHTML  65         OutputStream out = new FileOutputStream(new File(filepath + htmlName));  66         XHTMLConverter.getInstance().convert(document, out, options);  67          68         //也可以使用字符数组流获取解析的内容 69 //        ByteArrayOutputStream baos = new ByteArrayOutputStream();  70 //        XHTMLConverter.getInstance().convert(document, baos, options);  71 //        String content = baos.toString(); 72 //        System.out.println(content); 73 //         baos.close(); 74       } else {  75         System.out.println("Enter only MS Office 2007+ files");  76       }  77     }  78   }  79    80   /** 81    * /** 82    * 2003版本word转换成html 83    * @throws IOException 84    * @throws TransformerException 85    * @throws ParserConfigurationException 86   */ 87   @Test  88   public void Word2003ToHtml() throws IOException, TransformerException, ParserConfigurationException { 89     String filepath = "C:/test/"; 90     final String imagepath = "C:/test/image/"; 91     String fileName = "滕王阁序2003.doc"; 92     String htmlName = "滕王阁序2003.html"; 93     final String file = filepath + fileName; 94     InputStream input = new FileInputStream(new File(file)); 95     HWPFDocument wordDocument = new HWPFDocument(input); 96     WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument()); 97     //设置图片存放的位置 98     wordToHtmlConverter.setPicturesManager(new PicturesManager() { 99       public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) {100         File imgPath = new File(imagepath);101         if(!imgPath.exists()){//图片目录不存在则创建102           imgPath.mkdirs();103         }104         File file = new File(imagepath + suggestedName);105         try {106         
  OutputStream os = new FileOutputStream(file);107           os.write(content);108           os.close();109         } catch (FileNotFoundException e) {110           e.printStackTrace();111         } catch (IOException e) {112           e.printStackTrace();113         }114         return imagepath + suggestedName;115       }116     });117     118     //解析word文档119     wordToHtmlConverter.processDocument(wordDocument);120     Document htmlDocument = wordToHtmlConverter.getDocument();121     122     File htmlFile = new File(filepath + htmlName);123     OutputStream outStream = new FileOutputStream(htmlFile);124     125     //也可以使用字符数组流获取解析的内容126 //    ByteArrayOutputStream baos = new ByteArrayOutputStream(); 127 //    OutputStream outStream = new BufferedOutputStream(baos);128 129     DOMSource domSource = new DOMSource(htmlDocument);130     StreamResult streamResult = new StreamResult(outStream);131 132     TransformerFactory factory = TransformerFactory.newInstance();133     Transformer serializer = factory.newTransformer();134     serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");135     serializer.setOutputProperty(OutputKeys.INDENT, "yes");136     serializer.setOutputProperty(OutputKeys.METHOD, "html");137     138     serializer.transform(domSource, streamResult);139 140     //也可以使用字符数组流获取解析的内容141 //    String content = baos.toString();142 //    System.out.println(content);143 //    baos.close();144     outStream.close();145   }146 }

  运行生成的文件结果如下: