/dotNET/pdfclown.samples.cli/src/org/pdfclown/samples/cli/BasicTextExtractionSample.cs

https://bitbucket.org/VahidN/pdfclown · C# · 79 lines · 56 code · 6 blank · 17 comment · 7 complexity · 2ea8880cb784f2dd6919b95b1d31e27e MD5 · raw file

  1. using org.pdfclown.documents;
  2. using org.pdfclown.documents.contents;
  3. using org.pdfclown.documents.contents.fonts;
  4. using org.pdfclown.documents.contents.objects;
  5. using org.pdfclown.files;
  6. using org.pdfclown.tools;
  7. using System;
  8. using System.Collections.Generic;
  namespace org.pdfclown.samples.cli
  {
    /**
      <summary>Sample showing the low-level approach to pulling text out of a PDF document:
      each page's content stream is walked object by object and every text-showing
      operation found is decoded and written to the console.</summary>
      <remarks>For richer information about the extracted text content, refer to the other
      available samples (<see cref="TextInfoExtractionSample"/>,
      <see cref="AdvancedTextExtractionSample"/>).</remarks>
    */
    public class BasicTextExtractionSample
      : Sample
    {
      /**
        <summary>Asks the user for a PDF file, then extracts the text of its pages one at a
        time, stopping as soon as the user declines to proceed to the next page.</summary>
      */
      public override void Run(
        )
      {
        // 1. Open the PDF file chosen by the user (released when the using block exits).
        using(File pdfFile = new File(PromptFileChoice("Please select a PDF file")))
        {
          // 2. Extract the text from the document pages.
          foreach(Page currentPage in pdfFile.Document.Pages)
          {
            if(PromptNextPage(currentPage, false))
            {
              // Wrap the page contents into a scanner and walk them.
              Extract(new ContentScanner(currentPage));
              continue;
            }

            // The user declined to go on: quit and leave the loop.
            Quit();
            break;
          }
        }
      }

      /**
        <summary>Walks a content level, printing every text chunk it meets.</summary>
        <remarks>Page contents are a sequence of content objects, possibly nested into
        multiple levels; nested levels are visited recursively.</remarks>
        <param name="level">Scanner positioned on the content level to walk;
        a null level is silently ignored.</param>
      */
      private void Extract(
        ContentScanner level
        )
      {
        if(level != null)
        {
          while(level.MoveNext())
          {
            ContentObject item = level.Current;

            // Single conversion instead of a type test followed by a cast.
            ShowText textOperation = item as ShowText;
            if(textOperation != null)
            {
              // Decode the raw text chunk through the font of the current graphics state.
              Console.WriteLine(level.State.Font.Decode(textOperation.Text));
              continue;
            }

            if(item is ContainerObject
              || item is Text)
            {
              // Descend into the nested content level.
              Extract(level.ChildLevel);
            }
          }
        }
      }
    }
  }