81,092
社区成员
发帖
与我相关
我的任务
分享
1. public class WordExtractor {
2. public WordExtractor() {
3. }
4.
5. public String extractText(InputStream in) throws IOException {
6. ArrayList text = new ArrayList();
7. POIFSFileSystem fsys = new POIFSFileSystem(in);
8.
9. DocumentEntry headerProps = (DocumentEntry) fsys.getRoot().getEntry("WordDocument");
10. DocumentInputStream din = fsys.createDocumentInputStream("WordDocument");
11. byte[] header = new byte[headerProps.getSize()];
12.
13. din.read(header);
14. din.close();
15. // Prende le informazioni dall'header del documento
16. int info = LittleEndian.getShort(header, 0xa);
17.
18. boolean useTable1 = (info & 0x200) != 0;
19.
20. //boolean useTable1 = true;
21.
22. // Prende informazioni dalla piece table
23. int complexOffset = LittleEndian.getInt(header, 0x1a2);
24. //int complexOffset = LittleEndian.getInt(header);
25.
26. String tableName = null;
27. if (useTable1) {
28. tableName = "1Table";
29. } else {
30. tableName = "0Table";
31. }
32.
33. DocumentEntry table = (DocumentEntry) fsys.getRoot().getEntry(tableName);
34. byte[] tableStream = new byte[table.getSize()];
35.
36. din = fsys.createDocumentInputStream(tableName);
37.
38. din.read(tableStream);
39. din.close();
40.
41. din = null;
42. fsys = null;
43. table = null;
44. headerProps = null;
45.
46. int multiple = findText(tableStream, complexOffset, text);
47.
48. StringBuffer sb = new StringBuffer();
49. int size = text.size();
50. tableStream = null;
51.
52. for (int x = 0; x < size; x++) {
53.
54. WordTextPiece nextPiece = (WordTextPiece) text.get(x);
55. int start = nextPiece.getStart();
56. int length = nextPiece.getLength();
57.
58. boolean unicode = nextPiece.usesUnicode();
59. String toStr = null;
60. if (unicode) {
61. toStr = new String(header, start, length * multiple, "UTF-16LE");
62. } else {
63. toStr = new String(header, start, length, "ISO-8859-1");
64. }
65. sb.append(toStr).append(" ");
66.
67. }
68. return sb.toString();
69. }
70.
71. private static int findText(byte[] tableStream, int complexOffset, ArrayList text)
72. throws IOException {
73. //actual text
74. int pos = complexOffset;
75. int multiple = 2;
76. //skips through the prms before we reach the piece table. These contain data
77. //for actual fast saved files
78. while (tableStream[pos] == 1) {
79. pos++;
80. int skip = LittleEndian.getShort(tableStream, pos);
81. pos += 2 + skip;
82. }
83. if (tableStream[pos] != 2) {
84. throw new IOException("corrupted Word file");
85. } else {
86. //parse out the text pieces
87. int pieceTableSize = LittleEndian.getInt(tableStream, ++pos);
88. pos += 4;
89. int pieces = (pieceTableSize - 4) / 12;
90. for (int x = 0; x < pieces; x++) {
91. int filePos =
92. LittleEndian.getInt(tableStream, pos + ((pieces + 1) * 4) + (x *"/images/forum/smiles/icon_cool.gif"/> + 2);
93. boolean unicode = false;
94. if ((filePos & 0x40000000) == 0) {
95. unicode = true;
96. } else {
97. unicode = false;
98. multiple = 1;
99. filePos &= ~(0x40000000); //gives me FC in doc stream
100. filePos /= 2;
101. }
102. int totLength =
103. LittleEndian.getInt(tableStream, pos + (x + 1) * 4)
104. - LittleEndian.getInt(tableStream, pos + (x * 4));
105.
106. WordTextPiece piece = new WordTextPiece(filePos, totLength, unicode);
107. text.add(piece);
108.
109. }
110.
111. }
112. return multiple;
113. }
114. public static void main(String[] args){
115. WordExtractor w = new WordExtractor();
116. POIFSFileSystem ps = new POIFSFileSystem();
117. try{
118.
119. File file = new File("C:\\test.doc");
120.
121. InputStream in = new FileInputStream(file);
122. String s = w.extractText(in);
123. System.out.println(s);
124.
125.
126. }catch(Exception e){
127. e.printStackTrace();
128. }
129.
130. }
131.
132. }
133. class WordTextPiece {
134. private int _fcStart;
135. private boolean _usesUnicode;
136. private int _length;
137.
138. public WordTextPiece(int start, int length, boolean unicode) {
139. _usesUnicode = unicode;
140. _length = length;
141. _fcStart = start;
142. }
143. public boolean usesUnicode() {
144. return _usesUnicode;
145. }
146.
147. public int getStart() {
148. return _fcStart;
149. }
150. public int getLength() {
151. return _length;
152. }
153.
154. }
1. public boolean writeWordFile(String path, String content) {
2. boolean w = false;
3. try {
4.
5. // byte b[] = content.getBytes("ISO-8859-1");
6. byte b[] = content.getBytes();
7.
8. ByteArrayInputStream bais = new ByteArrayInputStream(b);
9.
10. POIFSFileSystem fs = new POIFSFileSystem();
11. DirectoryEntry directory = fs.getRoot();
12.
13. DocumentEntry de = directory.createDocument("WordDocument", bais);
14.
15. FileOutputStream ostream = new FileOutputStream(path);
16.
17. fs.writeFilesystem(ostream);
18.
19. bais.close();
20. ostream.close();
21.
22. } catch (IOException e) {
23. e.printStackTrace();
24. }
25.
26. return w;
27. }