[整理]poi读取word 2007批注信息
本文非原创,只是整理了下代码。原代码出自:http://blog.chiefleo.me/archives/429 。原文如下:
普通的读取批注信息方法:
/**
 * Prints the id, text and author of every comment stored in a .docx file
 * to standard output.
 *
 * <p>Any failure while opening or reading the document is reported with
 * {@code printStackTrace()} and otherwise ignored, so this method never
 * throws a checked exception.
 *
 * @param fileName path of the .docx file to read
 */
public void readWordDocxComments(String fileName) {
    XWPFDocument document = null;
    try {
        document = new XWPFDocument(POIXMLDocument.openPackage(fileName));
        for (XWPFComment comment : document.getComments()) {
            System.out.println("Id= " + comment.getId());
            System.out.println("Text= " + comment.getText());
            System.out.println("Author= " + comment.getAuthor());
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (document != null) {
            // The package was only read, so release the underlying file
            // handle without writing anything back.
            document.getPackage().revert();
        }
    }
}
上面的方法不能获取批注对应的正文信息,修改后的代码如下:
import java.io.File;
import java.io.InputStream;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLDocumentPart;
import org.apache.poi.xwpf.usermodel.XWPFComment;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTComment;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CommentsDocument;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CommentsDocument.Factory;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
 * Reads the comments of a .docx file with Apache POI and resolves, for each
 * comment, the body text the comment is anchored to.
 *
 * <p>The anchored text is not exposed by {@code XWPFComment}; it is recovered
 * by walking the DOM of the main document part and collecting the text found
 * between matching {@code w:commentRangeStart} / {@code w:commentRangeEnd}
 * elements.
 *
 * <p>Call {@link #close()} when done to release the underlying file.
 */
public class POI_读取批注_S4_Test {
    /** The .docx file being read. */
    private final File file;
    /** Word document. */
    private XWPFDocument docx;
    /** Comments of the document, as returned by {@code XWPFDocument.getComments()}. */
    private XWPFComment[] comments;
    /** Anchored body text per comment: key = comment id, value = referenced text. */
    private Map<String, String> commentRefs;
    /** Date format used when printing comment dates. */
    private final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
    /** Set once {@link #close()} has released the document package. */
    private boolean closed;

    /**
     * Opens the given Word file and loads its comments and their anchored text.
     *
     * @param filePath path of the Word (.docx) file
     * @throws Exception if the file cannot be opened or parsed
     */
    public POI_读取批注_S4_Test(String filePath) throws Exception {
        file = new File(filePath);
        initAttributes();
    }

    /**
     * Initializes the member fields: opens the document, reads the comments
     * and fills the comment-id to anchored-text map.
     *
     * @throws Exception wrapping any failure, with the original exception as cause
     */
    private void initAttributes() throws Exception {
        try {
            docx = new XWPFDocument(POIXMLDocument.openPackage(file
                    .getCanonicalPath()));
            comments = docx.getComments();
            commentRefs = new HashMap<String, String>();
            // Ids still waiting for their anchored text. Traversal may stop as
            // soon as this set is empty, whatever order the ids appear in.
            Set<String> pendingIds = new HashSet<String>();
            for (XWPFComment comment : comments) {
                pendingIds.add(comment.getId());
            }
            fillCommentRef(docx.getDocument().getDomNode(),
                    new HashMap<String, StringBuilder>(), pendingIds,
                    commentRefs);
        } catch (Exception e) {
            // Do not leave the file handle open when construction fails.
            close();
            throw new Exception("Word文件格式错误" + "-" + e.getMessage(), e);
        }
    }

    /**
     * Releases the underlying document package without saving anything.
     * Calling this method more than once has no further effect.
     */
    public void close() {
        if (!closed && docx != null) {
            docx.getPackage().revert();
        }
        closed = true;
    }

    /**
     * Returns the comments of the document.
     *
     * @return the comment array held by this reader (not a copy)
     */
    public XWPFComment[] getComments() {
        return comments;
    }

    /**
     * Returns the anchored body text of each comment.
     *
     * @return map from comment id to referenced text (not a copy); comments
     *         without a comment range in the body have no entry
     */
    public Map<String, String> getCommentRefs() {
        return commentRefs;
    }

    /**
     * Returns the date formatter used for comment dates.
     *
     * @return the formatter instance held by this reader
     */
    public SimpleDateFormat getSdf() {
        return sdf;
    }

    /**
     * Returns the date of every comment, index-aligned with
     * {@link #getComments()}.
     *
     * <p>Reading the dates is best effort: if the comments part cannot be
     * parsed a warning is written to {@code System.err} and the dates read so
     * far are used.
     *
     * @return one entry per comment; an entry is {@code null} when the
     *         comment has no date or the date could not be read
     * @throws IllegalStateException if {@link #close()} was already called
     */
    public List<Date> getSubmitDateList() {
        if (closed) {
            throw new IllegalStateException("document already closed: " + file);
        }
        Map<String, Date> dateMap = new HashMap<String, Date>();
        try {
            for (POIXMLDocumentPart part : docx.getRelations()) {
                String relation = part.getPackageRelationship()
                        .getRelationshipType();
                if (XWPFRelation.COMMENT.getRelation().equals(relation)) {
                    readCommentDates(part, dateMap);
                }
            }
        } catch (Exception e) {
            // Best effort: keep whatever was read, but do not hide the failure.
            System.err.println("Could not read comment dates from " + file
                    + ": " + e);
        }
        List<Date> dateList = new ArrayList<Date>(comments.length);
        for (XWPFComment comment : comments) {
            dateList.add(dateMap.get(comment.getId()));
        }
        return dateList;
    }

    /**
     * Parses one comments part and records the date of each dated comment.
     *
     * @param part    the comments part of the document
     * @param dateMap target map, comment id to date
     * @throws Exception if the part cannot be read or parsed
     */
    private static void readCommentDates(POIXMLDocumentPart part,
            Map<String, Date> dateMap) throws Exception {
        InputStream in = part.getPackagePart().getInputStream();
        try {
            CommentsDocument cmntdoc = Factory.parse(in);
            for (CTComment ctcomment : cmntdoc.getComments().getCommentList()) {
                // A comment without a date must not abort reading the others.
                if (ctcomment.getId() != null && ctcomment.getDate() != null) {
                    dateMap.put(ctcomment.getId().toString(), ctcomment
                            .getDate().getTime());
                }
            }
        } finally {
            in.close();
        }
    }

    /**
     * Returns the author of every comment, index-aligned with
     * {@link #getComments()}.
     *
     * @return trimmed author names; an empty string when a comment has no author
     */
    public List<String> getSubmitterList() {
        List<String> list = new ArrayList<String>(comments.length);
        for (XWPFComment comment : comments) {
            String author = comment.getAuthor();
            list.add(author == null ? "" : author.trim());
        }
        return list;
    }

    /**
     * Walks the DOM depth-first and fills the comment-id to anchored-text map.
     *
     * <p>The anchored text lives in document.xml (rename xx.docx to xx.zip to
     * inspect it). Commented body text is stored as
     * {@code <w:commentRangeStart w:id="0"/> ... <w:t>text</w:t> ...
     * <w:commentRangeEnd w:id="0"/>}; a commented picture is stored as
     * {@code <w:commentRangeStart w:id="1"/> ... <wp:docPr id="1" name="xxx"
     * descr="yyy.png"/> ... <w:commentRangeEnd w:id="1"/>}.
     *
     * <ol>
     * <li>{@code w:commentRangeStart} opens a range for its id.</li>
     * <li>While at least one range is open, {@code w:t} text is appended as
     * is and {@code wp:docPr} is appended as {@code [name]}. Text is appended
     * to every open range, so overlapping ranges are handled.</li>
     * <li>{@code w:commentRangeEnd} closes the range with the same id and
     * stores its text in {@code map}. When the last pending comment id has
     * been stored, traversal stops early, which avoids scanning the rest of
     * a large document.</li>
     * </ol>
     *
     * @param node       WordprocessingML node to process
     * @param openRanges ranges currently open, comment id to collected text
     * @param pendingIds ids of comments whose anchored text is still missing
     * @param map        target map to fill, comment id to anchored text
     * @return {@code true} when every pending comment has been resolved and
     *         the traversal can stop
     * @throws IllegalArgumentException if any argument is {@code null}
     */
    private boolean fillCommentRef(Node node,
            Map<String, StringBuilder> openRanges, Set<String> pendingIds,
            Map<String, String> map) {
        if (!insureNotNull(node, openRanges, pendingIds, map)) {
            throw new IllegalArgumentException(this.getClass().getName()
                    + "fillCommentRef(" + node + "," + openRanges + ","
                    + pendingIds + "," + map + ")");
        }
        String nodeName = node.getNodeName();
        if ("w:t".equals(nodeName)) {
            // <w:t/> may be empty, in which case there is no text child.
            Node text = node.getFirstChild();
            if (text != null && text.getNodeValue() != null) {
                appendToOpenRanges(openRanges, text.getNodeValue());
            }
        } else if ("wp:docPr".equals(nodeName)) {
            appendToOpenRanges(openRanges, "[" + getAttribute(node, "name")
                    + "]");
        } else if ("w:commentRangeStart".equals(nodeName)) {
            String id = getAttribute(node, "w:id");
            if (id != null) {
                openRanges.put(id, new StringBuilder());
            }
        } else if ("w:commentRangeEnd".equals(nodeName)) {
            String id = getAttribute(node, "w:id");
            StringBuilder text = (id == null) ? null : openRanges.remove(id);
            if (text != null) {
                map.put(id, text.toString());
                // Only a real comment id can complete the scan; a range
                // without a matching comment must not stop it.
                if (pendingIds.remove(id) && pendingIds.isEmpty()) {
                    return true;
                }
            }
        }
        NodeList children = node.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            if (fillCommentRef(children.item(i), openRanges, pendingIds, map)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Appends a text fragment to every comment range that is currently open.
     *
     * @param openRanges ranges currently open, comment id to collected text
     * @param fragment   text to append
     */
    private static void appendToOpenRanges(
            Map<String, StringBuilder> openRanges, String fragment) {
        for (StringBuilder text : openRanges.values()) {
            text.append(fragment);
        }
    }

    /**
     * Reads an attribute value from a node.
     *
     * @param node    the current node
     * @param attName name of the attribute to read
     * @return the attribute value, or {@code null} if the node has no such attribute
     */
    private static String getAttribute(Node node, String attName) {
        if (!node.hasAttributes()) {
            return null;
        }
        Node attribute = node.getAttributes().getNamedItem(attName);
        return attribute == null ? null : attribute.getNodeValue();
    }

    /**
     * Checks that none of the given arguments is {@code null}.
     *
     * @param objects the values to check
     * @return {@code true} if every value is non-null, {@code false} otherwise
     */
    private static boolean insureNotNull(Object... objects) {
        for (Object object : objects) {
            if (object == null) {
                return false;
            }
        }
        return true;
    }

    /**
     * Prints id, author, date, text and anchored body text of every comment.
     *
     * @param args optional: {@code args[0]} is the path of the .docx file;
     *             when absent the original sample path is used
     * @throws Exception if the document cannot be opened or parsed
     */
    public static void main(String[] args) throws Exception {
        String path = (args != null && args.length > 0) ? args[0]
                : "f:/saveFile/temp/sys_comment_07.docx";
        POI_读取批注_S4_Test wh = new POI_读取批注_S4_Test(path);
        try {
            StringBuilder value = new StringBuilder();
            XWPFComment[] comments = wh.getComments();
            Map<String, String> commenRefMap = wh.getCommentRefs();
            List<Date> l = wh.getSubmitDateList();
            SimpleDateFormat sdf = wh.getSdf();
            for (int i = 0; i < comments.length; i++) {
                XWPFComment comment = comments[i];
                // A comment may have no date; print an empty field then.
                Date date = l.get(i);
                value.append("批注Id:").append(comment.getId()).append(", ")
                        .append("批注作者:").append(comment.getAuthor()).append(", ")
                        .append("批注日期:").append(date == null ? "" : sdf.format(date)).append(", ")
                        .append("批注内容:").append(comment.getText()).append(", ")
                        .append("批注引用正文:")
                        .append(commenRefMap.get(comment.getId()));
                value.append("\n");
            }
            System.out.println(value);
        } finally {
            wh.close();
        }
    }
}
结果为:
全文完。