PageRenderTime 4006ms CodeModel.GetById 27ms RepoModel.GetById 1ms app.codeStats 0ms

/ontopia-classify/src/main/java/net/ontopia/topicmaps/classify/OOXMLWordFormatModule.java

http://ontopia.googlecode.com/
Java | 46 lines | 33 code | 8 blank | 5 comment | 1 complexity | 50b856b5bf383b5de9fee620dfd4ba04 MD5 | raw file
Possible License(s): LGPL-2.1, Apache-2.0
  1. package net.ontopia.topicmaps.classify;
  2. import java.io.*;
  3. import java.util.*;
  4. import net.ontopia.xml.*;
  5. import net.ontopia.utils.*;
  6. import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
  7. import org.apache.poi.openxml4j.opc.OPCPackage;
  8. /**
  9. * INTERNAL: A format module for the OOXML WordProcessingML format.
  10. */
  11. public class OOXMLWordFormatModule implements FormatModuleIF {
  12. protected String[] extensions = new String[] {".docx"};
  13. // these are really magic bytes for all zip files...
  14. protected byte[] magicBytes = new byte[] {
  15. (byte) 0x50, (byte) 0x4B, (byte) 0x03, (byte) 0x04 };
  16. public boolean matchesContent(ClassifiableContentIF cc) {
  17. return false;
  18. }
  19. public boolean matchesIdentifier(ClassifiableContentIF cc) {
  20. boolean matches = FormatModule.matchesExtension(cc.getIdentifier(), extensions);
  21. if (!matches) return false;
  22. // name matches, then check office magic bytes
  23. return FormatModule.startsWith(cc.getContent(), magicBytes);
  24. }
  25. public void readContent(ClassifiableContentIF cc, TextHandlerIF handler) {
  26. try {
  27. OPCPackage opc = OPCPackage.open(new ByteArrayInputStream(cc.getContent()));
  28. XWPFWordExtractor extractor = new XWPFWordExtractor(opc);
  29. String s = extractor.getText();
  30. char[] c = s.toCharArray();
  31. handler.startRegion("document");
  32. handler.text(c, 0, c.length);
  33. handler.endRegion();
  34. } catch (Exception e) {
  35. throw new OntopiaRuntimeException(e);
  36. }
  37. }
  38. }