PageRenderTime 26ms CodeModel.GetById 19ms RepoModel.GetById 0ms app.codeStats 0ms

/PDF_extract.pl

https://bitbucket.org/d20pfsrd/extractifier
Perl | 148 lines | 100 code | 33 blank | 15 comment | 12 complexity | b12709920164ff255f5246c79dc65bf0 MD5 | raw file
  1. #!/usr/bin/perl
  2. # Author: VxP
  3. #
  4. # Additional Contributions: d20pfsrd community
  5. #
  6. # Description: creates HTML output from PDF input
  7. # tries to preserve paragraphs and formatting
  8. #
  9. # Usage: PDF_extract.pl [OPTIONS] <input_file.pdf>
  10. use strict;
  11. use warnings;
  12. use Getopt::Std;
  ### OPTIONS
  # Switches filled in by Getopt::Std::getopts:
  #   $opt_f, $opt_l - values given to -f / -l (first and last page number)
  #   $opt_h, $opt_s - flags for -h (help) and -s (simplified output)
  our ($opt_f, $opt_l, $opt_h, $opt_s);

  # This flag has to be set BEFORE getopts() runs: getopts() is what handles
  # --help/--version, and per the Getopt::Std documentation the flag decides
  # whether those messages go to STDOUT and end the program (true) or go to
  # STDERR with processing continuing (false, the default).
  $Getopt::Std::STANDARD_HELP_VERSION = 1;
  getopts('f:l:hs');
  # Print the usage text and terminate the program.
  #
  # The script calls this with no arguments when -h is given. Getopt::Std also
  # calls a sub of this name for --help and passes the output filehandle as
  # the first argument; that handle is honoured when present, otherwise the
  # text goes to STDOUT.
  #
  # Never returns: the program exits after printing.
  sub HELP_MESSAGE {
      my ($out) = @_;
      $out = \*STDOUT unless defined $out;

      # The heredoc terminator must match the closing line exactly, so the
      # text and its END marker are kept flush-left.
      print {$out} <<"END";
Usage: PDF_extract [OPTIONS] <INPUT_FILE>
The extractifier translates PDF input into HTML output using the pdftohtml
utility as a back end, and performs some janitorial purposes with the
resulting output.
-h display this help information and exit
-f P specify an (optional) starting page number in the PDF
# broken by for loop iterator
-l P specify an (optional) ending page number in the PDF
-s specify simplified output
# may be subject to change if it should be default
KNOWN BUGS: One of the current developers (as of this writing) will try to
ensure that known bugs are always tracked using the issue tracker at a public
BitBucket repository:
http://bitbucket.org/d20pfsrd/extractifier/issues
If you wish to report any bugs, you may use that issue tracker or (at this
time at least) the d20pfsrd-contributors Google group.
END
      exit;
  }
  # -h on the command line: show the usage text (HELP_MESSAGE exits).
  HELP_MESSAGE() if $opt_h;
  ### PDFTOHTML OPTIONS
  # Arguments are collected as a list so they can be handed to pdftohtml
  # without going through a shell. '-c' is added unless -s (simplified
  # output) was requested.
  my @pdftohtml_opts = ('-i', '-stdout');
  push @pdftohtml_opts, '-f', $opt_f if $opt_f;
  push @pdftohtml_opts, '-l', $opt_l if $opt_l;
  push @pdftohtml_opts, '-c' if !$opt_s;

  ### MAIN PROGRAM
  my $pdf = $ARGV[0];
  die "Usage: PDF_extract.pl [OPTIONS] <input_file.pdf>\n"
      unless defined $pdf && length $pdf;

  # List-form pipe open: the file name is passed as a single argument, so
  # spaces or shell metacharacters in it cannot alter the command.
  open my $pipe, '-|', 'pdftohtml', @pdftohtml_opts, $pdf
      or die "Cannot run pdftohtml: $!";
  my @data = <$pipe>;
  close $pipe
      or warn "pdftohtml reported a failure (exit status " . ($? >> 8) . ")\n";

  # Base name for every generated file. Only a trailing ".pdf" is removed; a
  # name without that suffix is left intact instead of losing its last four
  # characters.
  (my $name = $pdf) =~ s/\.pdf\z//i;

  # Cleaned-up <DIV> lines collected from all per-page files, in page order.
  my @textfile;

  # The number of lines pdftohtml printed is used as the number of per-page
  # files "<name>-<page>.html" to read.
  # NOTE(review): numbering is offset by -f on the assumption that pdftohtml
  # names page files by their absolute page number -- confirm against the
  # installed pdftohtml.
  my $first_page = $opt_f || 1;
  for my $n (0 .. $#data) {
      my $filename = $name . '-' . ($first_page + $n) . '.html';
      open my $page, '<', $filename or die "Cannot open $filename: $!";
      while (my $line = <$page>) {
          # Only lines that start a <DIV> carry page text.
          next unless $line =~ m/^<DIV/;

          # Skip lines whose content starts with a number followed by '<',
          # "paizo.com", "TM" or the registered-trademark sign. (No empty
          # alternative here: it would match, and thus drop, every line.)
          next if $line =~ m/>(?:\d+<|paizo\.com|TM|®)/;

          chomp $line;
          $line =~ s/^<DIV.*?>/<DIV>/;       # drop the DIV's attributes
          $line =~ s/<nobr><span.*?>//;      # drop the first opening wrapper
          $line =~ s/<\/span><\/nobr>//;     # ...and the first closing wrapper
          push @textfile, $line;
      }
      close $page;
      unlink $filename;                      # per-page file is no longer needed
  }
  # Second pass over the collected lines: strip italics, join fragments that
  # belong to the same paragraph, and turn <br> markers into plain text.
  for my $m (0 .. $#textfile) {
      $textfile[$m] =~ s/<\/?i>//g;

      # Line ends in "&nbsp;</DIV>" or contains a single word character
      # between tags: treat it as continued by the next line. The neighbour
      # is only touched when it exists, so the array is never extended past
      # its last element.
      if ($textfile[$m] =~ m/(?:&nbsp;<\/DIV>|>\w<)/) {
          $textfile[$m] =~ s/<\/DIV>//;
          $textfile[$m + 1] =~ s/<DIV>// if $m < $#textfile;
      }

      # Line starts with a lowercase letter, "&nbsp;" or punctuation: treat
      # it as a continuation of the previous line. Skipped for the first
      # line, where index -1 would wrap around to the LAST element. (No
      # empty alternative here: it would make every line a continuation.)
      if ($m > 0 && $textfile[$m] =~ m/<DIV>(?:[a-z]|&nbsp;|[,.;-])/) {
          $textfile[$m - 1] =~ s/<\/DIV>//;
          $textfile[$m] =~ s/<DIV>//;
      }

      $textfile[$m] =~ s/&nbsp;<br>/ /g;     # soft break after a space
      $textfile[$m] =~ s/-<br>/-/g;          # break after a hyphen
      # Every remaining <br> is removed. A later "<br> -> </DIV><DIV>"
      # substitution could never match after this one and has been dropped.
      $textfile[$m] =~ s/<br>//g;
      $textfile[$m] =~ s/<\/DIV>/<\/DIV>\n/g; # one paragraph per output line
  }
  # Assemble the final document in memory and write it to "<name>.html".
  # The header now opens <BODY> (the footer always closed it) and separates
  # the META attributes with a space.
  my $outfile  = $name . '.html';
  my $document = join '',
      '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"><HTML><HEAD><TITLE>',
      $name,
      '</TITLE><META http-equiv="Content-Type" content="text/html; charset=UTF-8"></HEAD><BODY>',
      @textfile,
      '</DIV></BODY></HTML>';

  # Collapse pairs of non-breaking spaces once, then turn what is left into
  # ordinary spaces. Neither pattern can span a newline, so doing this on the
  # whole document gives the same result as doing it line by line through a
  # temporary file.
  $document =~ s/&nbsp;&nbsp;/&nbsp;/g;
  $document =~ s/&nbsp;/ /g;

  open my $out, '>', $outfile or die "Cannot write $outfile: $!";
  print {$out} $document or die "Cannot write $outfile: $!";
  # Buffered write errors only surface at close, so check it.
  close $out or die "Cannot finish writing $outfile: $!";

  # Best-effort removal of two further files that may exist next to the
  # input; a missing file is not an error.
  unlink $name . '-outline.html', $name . '_ind.html';