PageRenderTime 57ms CodeModel.GetById 27ms RepoModel.GetById 0ms app.codeStats 0ms

/itextsharp-5.0.6/iTextSharp/text/pdf/parser/LocationTextExtractionStrategy.cs

#
C# | 275 lines | 104 code | 41 blank | 130 comment | 24 complexity | 3b91fb3a989875d8f4d1c1b71f7024cd MD5 | raw file
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Text;
  4. /*
  5. * $Id: SimpleTextExtractingPdfContentRenderListener.java 4115 2009-12-01 14:08:23Z blowagie $
  6. *
  7. * This file is part of the iText project.
  8. * Copyright (c) 1998-2009 1T3XT BVBA
  9. * Authors: Kevin Day, Bruno Lowagie, Paulo Soares, et al.
  10. *
  11. * This program is free software; you can redistribute it and/or modify
  12. * it under the terms of the GNU Affero General Public License version 3
  13. * as published by the Free Software Foundation with the addition of the
  14. * following permission added to Section 15 as permitted in Section 7(a):
  15. * FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 1T3XT,
  16. * 1T3XT DISCLAIMS THE WARRANTY OF NON INFRINGEMENT OF THIRD PARTY RIGHTS.
  17. *
  18. * This program is distributed in the hope that it will be useful, but
  19. * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  20. * or FITNESS FOR A PARTICULAR PURPOSE.
  21. * See the GNU Affero General Public License for more details.
  22. * You should have received a copy of the GNU Affero General Public License
  23. * along with this program; if not, see http://www.gnu.org/licenses or write to
  24. * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  25. * Boston, MA, 02110-1301 USA, or download the license from the following URL:
  26. * http://itextpdf.com/terms-of-use/
  27. *
  28. * The interactive user interfaces in modified source and object code versions
  29. * of this program must display Appropriate Legal Notices, as required under
  30. * Section 5 of the GNU Affero General Public License.
  31. *
  32. * In accordance with Section 7(b) of the GNU Affero General Public License,
  33. * you must retain the producer line in every PDF that is created or manipulated
  34. * using iText.
  35. *
  36. * You can be released from the requirements of the license by purchasing
  37. * a commercial license. Buying such a license is mandatory as soon as you
  38. * develop commercial activities involving the iText software without
  39. * disclosing the source code of your own applications.
  40. * These activities include: offering paid services to customers as an ASP,
  41. * serving PDFs on the fly in a web application, shipping iText with a closed
  42. * source product.
  43. *
  44. * For more information, please contact iText Software Corp. at this
  45. * address: sales@itextpdf.com
  46. */
  47. namespace iTextSharp.text.pdf.parser {
  /**
   * <b>Development preview</b> - this class (and all of the parser classes) is still under
   * heavy development, and both its behavior and its interface are subject to change.
   * <br/>
   * A text extraction renderer that keeps track of the relative position of text on the page.
   * The resultant text will be relatively consistent with the physical layout that most
   * PDF files have on screen.
   * <br/>
   * This renderer keeps track of the orientation and distance (both perpendicular
   * and parallel) to the unit vector of the orientation. Text is ordered by
   * orientation, then perpendicular, then parallel distance. Text with the same
   * perpendicular distance, but different parallel distance, is treated as being on
   * the same line.
   * <br/>
   * This renderer also uses a simple strategy based on the font metrics to determine if
   * a blank space should be inserted into the output.
   *
   * @since 5.0.2
   */
  public class LocationTextExtractionStrategy : ITextExtractionStrategy {

      /** set to true to dump every collected chunk to the console when the text is requested */
      public static bool DUMP_STATE = false;

      /** all text chunks rendered so far; sorted in place each time GetResultantText() runs */
      private readonly List<TextChunk> locationalResult = new List<TextChunk>();

      /**
       * Creates a new text extraction renderer.
       */
      public LocationTextExtractionStrategy() {
      }

      /**
       * no-op method - this renderer does not track text block boundaries
       * @see com.itextpdf.text.pdf.parser.RenderListener#beginTextBlock()
       */
      public void BeginTextBlock(){
      }

      /**
       * no-op method - this renderer does not track text block boundaries
       * @see com.itextpdf.text.pdf.parser.RenderListener#endTextBlock()
       */
      public void EndTextBlock(){
      }

      /**
       * Returns the result so far. The collected chunks are sorted (orientation, then
       * perpendicular distance, then parallel start) and concatenated; chunks on the same
       * line are joined, with a single blank inserted where the gap between them calls
       * for one, and a line feed is emitted whenever the line changes.
       * @return a String with the resulting text.
       */
      public String GetResultantText(){
          if (DUMP_STATE) {
              DumpState();
          }

          locationalResult.Sort();

          StringBuilder sb = new StringBuilder();
          TextChunk lastChunk = null;
          foreach (TextChunk chunk in locationalResult) {
              if (lastChunk == null){
                  // very first chunk: nothing to separate it from
                  sb.Append(chunk.text);
              } else if (chunk.SameLine(lastChunk)){
                  float dist = chunk.DistanceFromEndOf(lastChunk);
                  if (dist < -chunk.charSpaceWidth) {
                      // this chunk starts more than one space width before the end of the previous chunk
                      sb.Append(' ');
                  } else if (dist > chunk.charSpaceWidth/2.0f && !StartsWithSpace(chunk.text) && !EndsWithSpace(lastChunk.text)) {
                      // we only insert a blank space if the trailing character of the previous string
                      // wasn't a space, and the leading character of the current string isn't a space
                      sb.Append(' ');
                  }
                  sb.Append(chunk.text);
              } else {
                  sb.Append('\n');
                  sb.Append(chunk.text);
              }
              lastChunk = chunk;
          }
          return sb.ToString();
      }

      /**
       * @param str the string to inspect
       * @return true if the string is non-empty and its first character is a blank;
       * empty (or null) strings yield false instead of an out-of-range index
       */
      private static bool StartsWithSpace(String str){
          return !String.IsNullOrEmpty(str) && str[0] == ' ';
      }

      /**
       * @param str the string to inspect
       * @return true if the string is non-empty and its last character is a blank;
       * empty (or null) strings yield false instead of an out-of-range index
       */
      private static bool EndsWithSpace(String str){
          return !String.IsNullOrEmpty(str) && str[str.Length - 1] == ' ';
      }

      /** Used for debugging only: prints the diagnostics of every collected chunk to the console */
      private void DumpState(){
          foreach (TextChunk location in locationalResult) {
              location.PrintDiagnostics();
              Console.WriteLine();
          }
      }

      /**
       * Records a chunk for the rendered text, built from its text, the start and end
       * points of its baseline, and the width of a single space.
       * @see com.itextpdf.text.pdf.parser.RenderListener#renderText(com.itextpdf.text.pdf.parser.TextRenderInfo)
       */
      public void RenderText(TextRenderInfo renderInfo) {
          LineSegment segment = renderInfo.GetBaseline();
          TextChunk location = new TextChunk(renderInfo.GetText(), segment.GetStartPoint(), segment.GetEndPoint(), renderInfo.GetSingleSpaceWidth());
          locationalResult.Add(location);
      }

      /**
       * Represents a chunk of text, its orientation, and location relative to the orientation vector
       */
      private class TextChunk : IComparable<TextChunk>{
          /** the text of the chunk (never null; a null input is stored as the empty string) */
          internal readonly String text;
          /** the starting location of the chunk */
          internal readonly Vector startLocation;
          /** the ending location of the chunk */
          internal readonly Vector endLocation;
          /** unit vector in the orientation of the chunk */
          internal readonly Vector orientationVector;
          /** the orientation as a scalar for quick sorting (angle of the orientation vector times 1000, truncated) */
          internal readonly int orientationMagnitude;
          /** perpendicular distance to the orientation unit vector (i.e. the Y position in an unrotated coordinate system)
           * the value is truncated to an integer to handle the fuzziness of comparing floats */
          internal readonly int distPerpendicular;
          /** distance of the start of the chunk parallel to the orientation unit vector (i.e. the X position in an unrotated coordinate system) */
          internal readonly float distParallelStart;
          /** distance of the end of the chunk parallel to the orientation unit vector (i.e. the X position in an unrotated coordinate system) */
          internal readonly float distParallelEnd;
          /** the width of a single space character in the font of the chunk */
          internal readonly float charSpaceWidth;

          /**
           * Creates a chunk and pre-computes the values used for sorting and line detection.
           * @param str the text of the chunk (null is treated as empty)
           * @param startLocation the start of the chunk's baseline
           * @param endLocation the end of the chunk's baseline
           * @param charSpaceWidth the width of a single space character in the chunk's font
           */
          public TextChunk(String str, Vector startLocation, Vector endLocation, float charSpaceWidth) {
              this.text = str ?? "";
              this.startLocation = startLocation;
              this.endLocation = endLocation;
              this.charSpaceWidth = charSpaceWidth;

              Vector direction = endLocation.Subtract(startLocation);
              if (direction[Vector.I1] == 0 && direction[Vector.I2] == 0 && direction[Vector.I3] == 0) {
                  // zero-length baseline (start and end coincide): there is no direction to
                  // normalize - presumably Normalize() would divide by a zero length - so fall
                  // back to the orientation of unrotated text
                  direction = new Vector(1, 0, 0);
              }
              orientationVector = direction.Normalize();
              orientationMagnitude = (int)(Math.Atan2(orientationVector[Vector.I2], orientationVector[Vector.I1])*1000);

              // see http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
              // the two vectors we are crossing are in the same plane, so the result will be purely
              // in the z-axis (out of plane) direction, so we just take the I3 component of the result
              Vector origin = new Vector(0,0,1);
              distPerpendicular = (int)(startLocation.Subtract(origin)).Cross(orientationVector)[Vector.I3];

              distParallelStart = orientationVector.Dot(startLocation);
              distParallelEnd = orientationVector.Dot(endLocation);
          }

          /** Used for debugging only: prints the chunk's text, locations and sort keys to the console */
          public void PrintDiagnostics(){
              Console.WriteLine("Text (@" + startLocation + " -> " + endLocation + "): " + text);
              Console.WriteLine("orientationMagnitude: " + orientationMagnitude);
              Console.WriteLine("distPerpendicular: " + distPerpendicular);
              Console.WriteLine("distParallel: " + distParallelStart);
          }

          /**
           * @param a the location to compare to
           * @return true if this location is on the same line as the other, i.e. both
           * the orientation and the perpendicular distance match
           */
          public bool SameLine(TextChunk a){
              return orientationMagnitude == a.orientationMagnitude
                  && distPerpendicular == a.distPerpendicular;
          }

          /**
           * Computes the distance between the end of 'other' and the beginning of this chunk
           * in the direction of this chunk's orientation vector. Note that it's a bad idea
           * to call this for chunks that aren't on the same line and orientation, but we don't
           * explicitly check for that condition for performance reasons.
           * @param other the chunk that precedes this one
           * @return the distance from the end of 'other' to the beginning of this chunk
           * (negative when this chunk starts before 'other' ends)
           */
          public float DistanceFromEndOf(TextChunk other){
              return distParallelStart - other.distParallelEnd;
          }

          /**
           * Compares based on orientation, perpendicular distance, then parallel distance
           * @param rhs the chunk to compare to; any chunk sorts after null
           * @return a negative value, zero, or a positive value if this chunk sorts before,
           * together with, or after rhs
           */
          public int CompareTo(TextChunk rhs) {
              if (rhs == null) {
                  return 1; // by convention every instance is greater than null
              }
              if (Object.ReferenceEquals(this, rhs)) {
                  return 0;
              }

              int rslt = CompareInts(orientationMagnitude, rhs.orientationMagnitude);
              if (rslt != 0) {
                  return rslt;
              }

              rslt = CompareInts(distPerpendicular, rhs.distPerpendicular);
              if (rslt != 0) {
                  return rslt;
              }

              // The comparison must be consistent: if two chunks start at exactly the same
              // position, both a.CompareTo(b) and b.CompareTo(a) have to report 0. Answering
              // "greater" in both directions breaks the contract the sort relies on.
              // float.CompareTo gives a total, symmetric ordering.
              return distParallelStart.CompareTo(rhs.distParallelStart);
          }

          /**
           * @param int1 the first integer
           * @param int2 the second integer
           * @return -1, 0 or 1 if int1 is less than, equal to, or greater than int2
           */
          private static int CompareInts(int int1, int int2){
              if (int1 == int2) {
                  return 0;
              }
              return int1 < int2 ? -1 : 1;
          }
      }

      /**
       * no-op method - this renderer isn't interested in image events
       * @see com.itextpdf.text.pdf.parser.RenderListener#renderImage(com.itextpdf.text.pdf.parser.ImageRenderInfo)
       * @since 5.0.1
       */
      public void RenderImage(ImageRenderInfo renderInfo) {
          // do nothing
      }
  }
  234. }