PageRenderTime 66ms CodeModel.GetById 27ms RepoModel.GetById 1ms app.codeStats 0ms

/tools/docx2txt.pl

https://github.com/foswiki/StringifierContrib
Perl | 280 lines | 120 code | 58 blank | 102 comment | 11 complexity | 892f6f1edad8e61602abd5f81dad717c MD5 | raw file
  1. #!/usr/bin/env perl
  2. # docx2txt, a command-line utility to convert Docx documents to text format.
  3. # Copyright (C) 2008-2009 Sandeep Kumar
  4. # Copyright (C) 2009-2018 Foswiki Contributors
  5. #
  6. # This program is free software; you can redistribute it and/or modify
  7. # it under the terms of the GNU General Public License as published by
  8. # the Free Software Foundation; either version 3 of the License, or
  9. # (at your option) any later version.
  10. #
  11. # This program is distributed in the hope that it will be useful,
  12. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  14. # GNU General Public License for more details.
  15. #
  16. # You should have received a copy of the GNU General Public License
  17. # along with this program; if not, write to the Free Software
  18. # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. #
  20. # This script extracts text from document.xml contained inside .docx file.
  21. # Perl v5.8.2 was used for testing this script.
  22. #
  23. # Author : Sandeep Kumar (shimple0 -AT- Yahoo .DOT. COM)
  24. #
  25. # ChangeLog :
  26. #
  27. # 10/08/2008 - Initial version (v0.1)
  28. # 15/08/2008 - Script takes two arguments [second optional] now and can be
  29. # used independently to extract text from docx file. It accepts
  30. # docx file directly, instead of xml file.
  31. # 18/08/2008 - Added support for center and right justification of text that
  32. # fits in a line 80 characters wide (adjustable).
  33. # 03/09/2008 - Fixed the slip in usage message.
  34. # 12/09/2008 - Slightly changed the script invocation and argument handling
  35. # to incorporate some of the shell script functionality here.
  36. # Added support to handle embedded urls in docx document.
  37. # 23/09/2008 - Changed #! line to use /usr/bin/env - good suggestion from
  38. # Rene Maroufi (info>AT<maroufi>DOT<net) to reduce user work
  39. # during installation.
  40. # 31/08/2009 - Added support for handling more escape characters.
  41. # Using OS specific null device to redirect stderr.
  42. # Saving text file in binary mode.
  43. # 03/09/2009 - Updations based on feedback/suggestions from Sergei Kulakov
  44. # (sergei>AT<dewia>DOT<com).
  45. # - removal of non-document text in between TOC related tags.
  46. # - display of hyperlink alongside linked text user controlled.
  47. # - some character conversion updates
  48. # 05/09/2009 - Merged cjustify and rjustify into single subroutine justify.
  49. # Added more character conversions.
  50. # Organised conversion mappings in tabular form for speedup and
  51. # easy maintenance.
  52. # Tweaked code to reduce number of passes over document content.
  53. #
  54. #
  55. # Adjust the settings here.
  56. #
  57. use strict;
  58. use warnings;
  59. my $unzip = "/usr/bin/unzip"; # Windows path like "C:\\path\\to\\unzip.exe"
  60. my $nl = "\n"; # Alternative is "\r\n".
  61. my $lindent = " "; # Indent nested lists by "\t", " " etc.
  62. my $lwidth = 80; # Line width, used for short line justification.
  63. my $showHyperLink = "N"; # Show hyperlink alongside linked text.
  64. # ToDo: Better list handling. Currently assumed 8 level nesting.
  65. my @levchar = ('*', '+', 'o', '-', '**', '++', 'oo', '--');
  66. #
  67. # Character conversion tables
  68. #
  69. # Only amp, gt and lt are required for docx escapes, others are used for better
  70. # text experience.
  71. my %escChrs = ( amp => '&', gt => '>', lt => '<',
  72. acute => '\'', brvbar => '|', copy => '(C)', divide => '/',
  73. laquo => '<<', macr => '-', nbsp => ' ', raquo => '>>',
  74. reg => '(R)', shy => '-', times => 'x'
  75. );
  76. my %splchars = (
  77. "\xC2\xA0" => ' ', # <nbsp>
  78. "\xC2\xA6" => '|', # <brokenbar>
  79. "\xC2\xA9" => '(C)', # <copyright>
  80. "\xC2\xAB" => '<<', # <laquo>
  81. "\xC2\xAC" => '-', # <negate>
  82. "\xC2\xAE" => '(R)', # <regd>
  83. "\xC2\xB1" => '+-', # <plusminus>
  84. "\xC2\xBB" => '>>', # <raquo>
  85. # "\xC2\xA7" => '', # <section>
  86. # "\xC2\xB6" => '', # <para>
  87. "\xC3\x97" => 'x', # <mul>
  88. "\xC3\xB7" => '/', # <div>
  89. "\xE2\x80\x82" => ' ', # <enspc>
  90. "\xE2\x80\x83" => ' ', # <emspc>
  91. "\xE2\x80\x85" => ' ', # <qemsp>
  92. "\xE2\x80\x93" => ' - ', # <endash>
  93. "\xE2\x80\x94" => ' -- ', # <emdash>
  94. "\xE2\x80\x98" => '`', # <soq>
  95. "\xE2\x80\x99" => '\'', # <scq>
  96. "\xE2\x80\x9C" => '"', # <doq>
  97. "\xE2\x80\x9D" => '"', # <dcq>
  98. "\xE2\x80\xA2" => '::', # <diamond symbol>
  99. "\xE2\x80\xA6" => '...', # <ellipsis>
  100. "\xE2\x84\xA2" => '(TM)', # <trademark>
  101. "\xE2\x89\xA0" => '!=', # <neq>
  102. "\xE2\x89\xA4" => '<=', # <leq>
  103. "\xE2\x89\xA5" => '>=', # <geq>
  104. #
  105. # Currency symbols
  106. #
  107. "\xC2\xA2" => 'cent',
  108. "\xC2\xA3" => 'Pound',
  109. "\xC2\xA5" => 'Yen',
  110. "\xE2\x82\xAC" => 'Euro'
  111. );
  112. #
  113. # Check argument(s) sanity.
  114. #
  115. my $usage = <<USAGE;
  116. Usage: $0 <infile.docx> [outfile.txt|-]
  117. Use '-' as the outfile name to dump the text on STDOUT.
  118. Output is saved in infile.txt if second argument is omitted.
  119. USAGE
  120. die $usage if (@ARGV == 0 || @ARGV > 2);
  121. stat($ARGV[0]);
  122. die "Can't read docx file <$ARGV[0]>!\n" if ! (-f _ && -r _);
  123. die "<$ARGV[0]> does not seem to be docx file!\n" if -T _;
  124. #
  125. # Extract needed data from argument docx file.
  126. #
  127. my $nulldevice;
  128. if ($ENV{OS} && $ENV{OS} =~ /^Windows/) {
  129. $nulldevice = "nul";
  130. } else {
  131. $nulldevice = "/dev/null";
  132. }
  133. my $content = `$unzip -p '$ARGV[0]' word/document.xml 2>$nulldevice`;
  134. die "Failed to extract required information from <$ARGV[0]>!\n" if ! $content;
  135. #
  136. # Be ready for outputting the extracted text contents.
  137. #
  138. if (@ARGV == 1) {
  139. $ARGV[1] = $ARGV[0];
  140. $ARGV[1] .= ".txt" if !($ARGV[1] =~ s/\.docx$/\.txt/);
  141. }
  142. my $txtfile;
  143. open($txtfile, "> $ARGV[1]") || die "Can't create <$ARGV[1]> for output!\n";
  144. #
  145. # Gather information about header, footer, hyperlinks, images, footnotes etc.
  146. #
  147. $_ = `$unzip -p '$ARGV[0]' word/_rels/document.xml.rels 2>$nulldevice`;
  148. my %docurels;
  149. while (/<Relationship Id="(.*?)" Type=".*?\/([^\/]*?)" Target="(.*?)"( .*?)?\/>/g)
  150. {
  151. $docurels{"$2:$1"} = $3;
  152. }
  153. #
  154. # Subroutines for center and right justification of text in a line.
  155. #
  156. sub justify {
  157. my $len = length $_[1];
  158. if ($_[0] eq "center" && $len < ($lwidth - 1)) {
  159. return ' ' x (($lwidth - $len) / 2) . $_[1];
  160. } elsif ($_[0] eq "right" && $len < $lwidth) {
  161. return ' ' x ($lwidth - $len) . $_[1];
  162. } else {
  163. return $_[1];
  164. }
  165. }
  166. #
  167. # Subroutines for dealing with embedded links and images
  168. #
  169. sub hyperlink {
  170. return $_[1] . (lc $showHyperLink eq "y" ? " [HYPERLINK: $docurels{\"hyperlink:$_[0]\"}]" : "");
  171. }
  172. #
  173. # Text extraction starts.
  174. #
  175. my %tag2chr = (tab => "\t", noBreakHyphen => "-", softHyphen => " - ");
  176. $content =~ s/<?xml .*?\?>(\r)?\n//;
  177. $content =~ s{<w:p [^/>]+?/>|</w:p>|<w:br/>}|$nl|og;
  178. $content =~ s{<w:(tab|noBreakHyphen|softHyphen)/>}|$tag2chr{$1}|og;
  179. my $hr = '-' x 78 . $nl;
  180. $content =~ s|<w:pBdr>.*?</w:pBdr>|$hr|og;
  181. $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$lindent x $1 . "$levchar[$1] "|oge;
  182. #
  183. # Uncomment either of below two lines and comment above line, if dealing
  184. # with more than 8 level nested lists.
  185. #
  186. # $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|$lindent x $1 . '* '|oge;
  187. # $content =~ s|<w:numPr><w:ilvl w:val="([0-9]+)"/>|'*' x ($1+1) . ' '|oge;
  188. $content =~ s{<w:caps/>.*?(<w:t>|<w:t [^>]+>)(.*?)</w:t>}/uc $2/oge;
  189. $content =~ s{<w:pPr><w:jc w:val="([^"]*?)"/></w:pPr><w:r><w:t>(.*?)</w:t></w:r>}/justify($1,$2)/oge;
  190. $content =~ s{<w:hyperlink r:id="(.*?)".*?>(.*?)</w:hyperlink>}/hyperlink($1,$2)/oge;
  191. # Remove stuff between TOC related tags.
  192. if ($content =~ m|<w:pStyle w:val="TOCHeading"/>|) {
  193. $content =~ s|<w:instrText[^>]*>.*?</w:instrText>||og;
  194. }
  195. $content =~ s/<.*?>//og;
  196. #
  197. # Convert non-ASCII characters/character sequences to ASCII characters.
  198. #
  199. $content =~ s/(\xE2..|\xC2.|\xC3.)/($splchars{$1} ? $splchars{$1} : $1)/oge;
  200. #
  201. # Convert docx specific escape chars first.
  202. #
  203. $content =~ s/(&)(amp|gt|lt)(;)/$escChrs{lc $2}/iog;
  204. #
  205. # Another pass for a better text experience, after sequences like "&amp;laquo;"
  206. # are converted to "&laquo;".
  207. #
  208. $content =~ s/((&)([a-z]+)(;))/($escChrs{lc $3} ? $escChrs{lc $3} : $1)/ioge;
  209. #
  210. # Write the extracted and converted text contents to output.
  211. #
  212. binmode $txtfile; # Ensure no auto-conversion of '\n' to '\r\n' on Windows.
  213. print $txtfile $content;
  214. close $txtfile;