/test/core/com/itextpdf/text/pdf/parser/TextExtractionTest.java

https://github.com/virasak/iText-4.2.0 · Java · 253 lines · 156 code · 80 blank · 17 comment · 1 complexity · a888a420c03b3b21981695d2e8de33d0 MD5 · raw file

  1. /*
  2. * Created on Nov 5, 2009
  3. * (c) 2009 Trumpet, Inc.
  4. *
  5. */
  6. package com.itextpdf.text.pdf.parser;
  7. import java.awt.geom.AffineTransform;
  8. import java.io.ByteArrayOutputStream;
  9. import java.io.File;
  10. import java.io.FileOutputStream;
  11. import org.junit.After;
  12. import org.junit.Assert;
  13. import org.junit.Before;
  14. import org.junit.Test;
  15. import com.itextpdf.text.Document;
  16. import com.itextpdf.text.DocumentException;
  17. import com.itextpdf.text.PageSize;
  18. import com.itextpdf.text.pdf.BaseFont;
  19. import com.itextpdf.text.pdf.PdfContentByte;
  20. import com.itextpdf.text.pdf.PdfReader;
  21. import com.itextpdf.text.pdf.PdfWriter;
  22. /**
  23. * @author kevin
  24. */
  25. public class TextExtractionTest {
  26. String TEXT1 = "TEXT1 TEXT1";
  27. String TEXT2 = "TEXT2 TEXT2";
  28. @Before
  29. public void setUp() throws Exception {
  30. }
  31. @After
  32. public void tearDown() throws Exception {
  33. }
  34. @Test
  35. public void testCoLinnearText() throws Exception{
  36. byte[] bytes = createPdfWithRotatedText(TEXT1, TEXT2, 0, false, 0);
  37. PdfTextExtractor ex = new PdfTextExtractor(new PdfReader(bytes));
  38. Assert.assertEquals(TEXT1 + TEXT2, ex.getTextFromPage(1));
  39. }
  40. @Test
  41. public void testCoLinnearTextWithSpace() throws Exception{
  42. byte[] bytes = createPdfWithRotatedText(TEXT1, TEXT2, 0, false, 2);
  43. PdfTextExtractor ex = new PdfTextExtractor(new PdfReader(bytes));
  44. //saveBytesToFile(bytes, new File("c:/temp/test.pdf"));
  45. Assert.assertEquals(TEXT1 + " " + TEXT2, ex.getTextFromPage(1));
  46. }
  47. @Test
  48. public void testCoLinnearTextEndingWithSpaceCharacter() throws Exception{
  49. // in this case, we shouldn't be inserting an extra space
  50. TEXT1 = TEXT1 + " ";
  51. byte[] bytes = createPdfWithRotatedText(TEXT1, TEXT2, 0, false, 2);
  52. PdfTextExtractor ex = new PdfTextExtractor(new PdfReader(bytes));
  53. //saveBytesToFile(bytes, new File("c:/temp/test.pdf"));
  54. Assert.assertEquals(TEXT1 + TEXT2, ex.getTextFromPage(1));
  55. }
  56. @Test
  57. public void testUnRotatedText() throws Exception{
  58. byte[] bytes = createPdfWithRotatedText(TEXT1, TEXT2, 0, true, -20);
  59. PdfTextExtractor ex = new PdfTextExtractor(new PdfReader(bytes));
  60. Assert.assertEquals(TEXT1 + "\n" + TEXT2, ex.getTextFromPage(1));
  61. }
  62. @Test
  63. public void testRotatedText() throws Exception{
  64. byte[] bytes = createPdfWithRotatedText(TEXT1, TEXT2, -90, true, -20);
  65. PdfTextExtractor ex = new PdfTextExtractor(new PdfReader(bytes));
  66. Assert.assertEquals(TEXT1 + "\n" + TEXT2, ex.getTextFromPage(1));
  67. }
  68. @Test
  69. public void testPartiallyRotatedText() throws Exception{
  70. byte[] bytes = createPdfWithRotatedText(TEXT1, TEXT2, 33, true, -20);
  71. PdfTextExtractor ex = new PdfTextExtractor(new PdfReader(bytes));
  72. Assert.assertEquals(TEXT1 + "\n" + TEXT2, ex.getTextFromPage(1));
  73. }
  74. @Test
  75. public void testWordSpacingCausedByExplicitGlyphPositioning() throws Exception{
  76. byte[] bytes = createPdfWithArrayText(TEXT1, TEXT2, 250);
  77. PdfTextExtractor ex = new PdfTextExtractor(new PdfReader(bytes));
  78. Assert.assertEquals(TEXT1 + " " + TEXT2, ex.getTextFromPage(1));
  79. }
  80. @Test
  81. public void testWordSpacingCausedByExplicitGlyphPositioning2() throws Exception{
  82. byte[] bytes = createPdfWithArrayText("[(S)3.2(an)-255.0(D)13.0(i)8.3(e)-10.1(g)1.6(o)-247.5(C)2.4(h)5.8(ap)3.0(t)10.7(er)]TJ");
  83. PdfTextExtractor ex = new PdfTextExtractor(new PdfReader(bytes));
  84. Assert.assertEquals("San Diego Chapter", ex.getTextFromPage(1));
  85. }
  86. @Test
  87. public void testTrailingSpace() throws Exception{
  88. byte[] bytes = createPdfWithRotatedText(TEXT1 + " ", TEXT2, 0, false, 20);
  89. PdfTextExtractor ex = new PdfTextExtractor(new PdfReader(bytes));
  90. Assert.assertEquals(TEXT1 + " " + TEXT2, ex.getTextFromPage(1));
  91. }
  92. @Test
  93. public void testLeadingSpace() throws Exception{
  94. byte[] bytes = createPdfWithRotatedText(TEXT1, " " + TEXT2, 0, false, 20);
  95. PdfTextExtractor ex = new PdfTextExtractor(new PdfReader(bytes));
  96. Assert.assertEquals(TEXT1 + " " + TEXT2, ex.getTextFromPage(1));
  97. }
  98. private static byte[] createPdfWithArrayText(String directContentTj) throws Exception{
  99. final ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
  100. final Document document = new Document();
  101. PdfWriter writer = PdfWriter.getInstance(document, byteStream);
  102. document.setPageSize(PageSize.LETTER);
  103. document.open();
  104. PdfContentByte cb = writer.getDirectContent();
  105. BaseFont font = BaseFont.createFont();
  106. cb.transform(AffineTransform.getTranslateInstance(100, 500));
  107. cb.beginText();
  108. cb.setFontAndSize(font, 12);
  109. cb.getInternalBuffer().append(directContentTj + "\n");
  110. cb.endText();
  111. document.close();
  112. final byte[] pdfBytes = byteStream.toByteArray();
  113. return pdfBytes;
  114. }
  115. private static byte[] createPdfWithArrayText(String text1, String text2, int spaceInPoints) throws Exception{
  116. final ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
  117. final Document document = new Document();
  118. PdfWriter writer = PdfWriter.getInstance(document, byteStream);
  119. document.setPageSize(PageSize.LETTER);
  120. document.open();
  121. PdfContentByte cb = writer.getDirectContent();
  122. BaseFont font = BaseFont.createFont();
  123. cb.beginText();
  124. cb.setFontAndSize(font, 12);
  125. cb.getInternalBuffer().append("[(" + text1 + ")" + (-spaceInPoints) + "(" + text2 + ")]TJ\n");
  126. cb.endText();
  127. document.close();
  128. final byte[] pdfBytes = byteStream.toByteArray();
  129. return pdfBytes;
  130. }
  131. private static byte[] createPdfWithRotatedText(String text1, String text2, float rotation, boolean moveTextToNextLine, float moveTextDelta) throws Exception {
  132. final ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
  133. final Document document = new Document();
  134. PdfWriter writer = PdfWriter.getInstance(document, byteStream);
  135. document.setPageSize(PageSize.LETTER);
  136. document.open();
  137. PdfContentByte cb = writer.getDirectContent();
  138. BaseFont font = BaseFont.createFont();
  139. float x = document.getPageSize().getWidth()/2;
  140. float y = document.getPageSize().getHeight()/2;
  141. cb.transform(AffineTransform.getTranslateInstance(x, y));
  142. cb.moveTo(-10, 0);
  143. cb.lineTo(10, 0);
  144. cb.moveTo(0, -10);
  145. cb.lineTo(0, 10);
  146. cb.stroke();
  147. cb.beginText();
  148. cb.setFontAndSize(font, 12);
  149. cb.transform(AffineTransform.getRotateInstance(rotation/180f*Math.PI));
  150. cb.showText(text1);
  151. if (moveTextToNextLine)
  152. cb.moveText(0, moveTextDelta);
  153. else
  154. cb.transform(AffineTransform.getTranslateInstance(moveTextDelta, 0));
  155. cb.showText(text2);
  156. cb.endText();
  157. document.close();
  158. final byte[] pdfBytes = byteStream.toByteArray();
  159. return pdfBytes;
  160. }
  161. /**
  162. * Used for testing only if we need to open the PDF itself
  163. * @param bytes
  164. * @param file
  165. * @throws Exception
  166. */
  167. private void saveBytesToFile(byte[] bytes, File file) throws Exception{
  168. final FileOutputStream outputStream = new FileOutputStream(file);
  169. outputStream.write(bytes);
  170. outputStream.close();
  171. System.out.println("PDF dumped to " + file.getAbsolutePath());
  172. }
  173. }