PageRenderTime 47ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 0ms

/linkifier

https://bitbucket.org/d20pfsrd/linkifier
Perl | 335 lines | 210 code | 71 blank | 54 comment | 18 complexity | c99bae95ba4dd31b6293aa6e4e448ffb MD5 | raw file
Possible License(s): AGPL-3.0
  1. #! /usr/bin/perl
  2. use strict;
  3. use autodie;
  4. use warnings;
  5. #########################
  6. ## DEBUGGING
  7. #########################
  8. # this is just to keep debugging from interfering with the actual script
  9. sub debug_setup {
  10. # only need this for debugging
  11. use Time::HiRes qw< gettimeofday tv_interval >;
  12. $main::t0 = [gettimeofday];
  13. # try to load Debuggit; use it if possible, otherwise just do enough to keep the compiler from
  14. # complaining
  15. if ( eval { require 'Debuggit.pm' } ) {
  16. # set debugging level here
  17. Debuggit->import(DEBUG => 1);
  18. }
  19. else {
  20. eval q{
  21. use constant DEBUG => 0;
  22. sub debuggit {}
  23. }
  24. }
  25. }
  26. sub elapsed {
  27. debuggit(1 => @_, tv_interval $main::t0);
  28. }
  29. BEGIN { debug_setup() }
  30. #########################
  31. ## MODULES
  32. #########################
  33. use File::Spec;
  34. use Getopt::Std;
  35. use Tie::IxHash;
  36. use File::Basename;
  37. use Net::Google::Spreadsheets;
  38. use Text::Balanced qw< gen_extract_tagged extract_multiple >;
  39. elapsed('done loading modules');
  40. #########################
  41. ## OPTIONS
  42. #########################
  43. our $ME = basename($0);
  44. our($opt_h, $opt_l, $opt_p, $opt_u);
  45. getopts('hlp:u:');
  46. $opt_p ||= 'XXX';
  47. $opt_u ||= 'XXX';
  48. my $preserve_links = $opt_l;
  49. $Getopt::Std::STANDARD_HELP_VERSION = 'true';
  50. my $help = qq{
  51. Usage: $ME [OPTIONS] <INPUT_FILE>
  52. Linkifier produces output that "linkifies" terms specified in a datafile,
  53. turning them into HTML links that lead to URL values provided in that
  54. datafile.
  55. -h display this help information and exit
  56. -l preserve links already extant in the file
  57. (this may be subject to change)
  58. -p PASS specify a password
  59. -u USER specify a username
  60. EXAMPLES:
  61. with neither username nor password preconfigured, specify both:
  62. $ME -u USER -p PASS inputfile.txt
  63. to save output in a file (using a redirect), with username and
  64. password preconfigured:
  65. $ME inputfile.txt > outputfile.txt
  66. the same thing, specifying username and password:
  67. $ME -u USER -p PASSWORD inputfile.txt > outputfile.txt
  68. CONFIGURATION: Within the linkifier program source code, there are two
  69. instances of the string 'XXX'. These instances can be replaced with a
  70. username and password for your Google account used to access a Google
  71. Spreadsheets datafile. This allows you to run the program without having
  72. to manually enter a username and password every time you execute the
  73. program. WARNING! Do NOT share a version of the program that contains
  74. your username and password with anyone! For developers using a shared
  75. version control system to work on this program, this means you should
  76. NEVER commit a version of the program that contains your username and
  77. password. Using the -u and -p options should always be the preferred
  78. means of specifying username and password.
  79. KNOWN BUGS: One of the current developers (as of this writing) will try
  80. to ensure that known bugs are always tracked using the issue tracker at
  81. a public BitBucket repository:
  82. http://bitbucket.org/d20pfsrd/linkifier/issues
  83. If you wish to report any bugs, you may use that issue tracker or (at
  84. this time at least) the d20pfsrd-contributors Google group.
  85. };
  86. if ($opt_h) {
  87. print $help;
  88. exit;
  89. }
  90. sub HELP_MESSAGE {
  91. print "Use the -h option for a complete help message.\n";
  92. exit;
  93. }
  94. elapsed('done checking options');
  95. #########################
  96. ## BEGIN MAIN PROGRAM
  97. #########################
  98. our @stuff;
  99. my $links = read_phrases();
  100. local $/ = undef;
  101. while ( <> ) {
  102. # disguise some stuff so it doesn't get replaced
  103. collapse_stuff($_);
  104. # modify it
  105. foreach my $phrase (keys %$links) {
  106. my $re = quotemeta($phrase); # JIC phrase includes regex metachars
  107. my $count = s{
  108. \b$re\b # find phrase, only as separate words
  109. }
  110. {
  111. scalar(
  112. # push the link (includes the original phrase) onto our href array
  113. (push @stuff, \$links->{$phrase}),
  114. # replace with a reference to the index in the href array
  115. "{X$#stuff}"
  116. )
  117. }xeg;
  118. debuggit(5 => $phrase, '=>', $count) if $phrase =~ /^E/;
  119. }
  120. elapsed('done transforming');
  121. expand_stuff($_); # ... put all the links back ...
  122. print;
  123. }
  124. elapsed('done with all files');
  125. #########################
  126. ## SUBS
  127. #########################
  128. use constant PHRASE_FILE => File::Spec->tmpdir . '/linkifier.phrases';
  129. sub read_phrases {
  130. # no clue why this is necessary
  131. no warnings 'once';
  132. my %links;
  133. if ( -e PHRASE_FILE ) {
  134. if (open(IN, '<', PHRASE_FILE)) {
  135. warn("reading phrases from ", PHRASE_FILE);
  136. tie %links, 'Tie::IxHash';
  137. while ( <IN> ) {
  138. chomp;
  139. my ($key, $value) = split("\t");
  140. $links{$key} = $value;
  141. }
  142. close(IN);
  143. elapsed('done reading');
  144. } else {
  145. die("phrase file exists but can't open it");
  146. }
  147. } else {
  148. # read Google spreadsheet contents containing link substituions
  149. my $google = Net::Google::Spreadsheets->new(
  150. password => $opt_p,
  151. username => $opt_u,
  152. );
  153. my $ssheet = $google->spreadsheet(
  154. { title => 'd20pfsrd-string-replacements' }
  155. );
  156. my $wsheet = $ssheet->worksheet();
  157. my %values = map { $_->content || () } $wsheet->cells();
  158. elapsed('done reading');
  159. # now turn that into a sorted hash
  160. # first sort criteria: length of phrase (longest first)
  161. # this guarantees we'll catch (e.g.) "disguise spell" before "disguise"
  162. # second sort criteria: alphabetic
  163. # probably not necessary, but I'm leaving it in for now
  164. tie %links, 'Tie::IxHash', map {
  165. $_ => $values{$_}
  166. } sort { length($b) <=> length($a) || $a cmp $b } keys %values;
  167. debuggit(4 => "links", DUMP => \%links);
  168. elapsed('done sorting');
  169. # now save for future use
  170. if (open(OUT, '>', PHRASE_FILE)) {
  171. print OUT join("\t", $_, $links{$_}), "\n" foreach keys %links;
  172. close(OUT);
  173. } else {
  174. die("can't create phrase file");
  175. }
  176. }
  177. return \%links;
  178. }
  179. my ($extract_hrefs, $extract_headers);
  180. sub collapse_stuff
  181. {
  182. # this will make this run much faster; see POD for Text::Balanced
  183. # need 3rd argument in each to preserve whitespace before tags
  184. #
  185. # we will build 2 extractors:
  186. # 1) one for href links, and their contents
  187. # 2) one for headers (<h1>, <h2>, etc) and their contents
  188. $extract_hrefs = gen_extract_tagged(
  189. qr/<a.+?href.+?>/i, undef, ''
  190. ) unless $extract_hrefs;
  191. $extract_headers = gen_extract_tagged(
  192. qr/<h\d.*?>/i, undef, ''
  193. ) unless $extract_headers;
  194. # this is an algorithm cribbed from Filter::Simple (by the excellent
  195. # Damian Conway)
  196. #
  197. # this is the second time I've stolen it, primarily because it worked
  198. # so well the first time
  199. #
  200. # this one will replace <a href> type tags with placeholders; each
  201. # placeholder will have a number in it that corresponds to an index
  202. # in an array
  203. #
  204. # the array helps us put everything back exactly as it was at the end
  205. # (see expand_stuff)
  206. #
  207. # the reason we do this is so that we can recognize a phrase that has
  208. # _already_ been linked, and make sure we don't try to link it again
  209. $_[0] = join(
  210. '', map { collapse() } extract_multiple($_[0],
  211. [ # munge the following things, so they don't get replaced:
  212. # <a> tags which are hrefs, and their contents
  213. { HREF => $extract_hrefs },
  214. # <h#> tags (headers), and their contents
  215. { H => $extract_headers },
  216. # feature names (phrases followed by (EX), (Sp), or (Su))
  217. { FNAME => qr/\w[\w\s]*?\((?:Ex|Sp|Su)\)/ },
  218. ]
  219. )
  220. );
  221. debuggit(4 => "after replacing links, HTML looks like:", $_[0]);
  222. debuggit(3 => "href array is", DUMP => \@stuff);
  223. elapsed('done collapsing');
  224. }
  225. sub collapse {
  226. my $type = ref $_;
  227. if (not $type) {
  228. # no type means this is not something we want to collapse,
  229. # so just throw it right back
  230. return $_;
  231. } elsif ($type eq 'HREF') {
  232. if ($preserve_links) {
  233. # we'll collapse it; this should match the outer else code
  234. push @stuff, $_;
  235. return "{X$#stuff}";
  236. } else {
  237. # instead of collapsing, we're going to strip the tags off and leave just the content
  238. $$_ =~ m@<.*?>(.*?)</.*?>@;
  239. return $1;
  240. }
  241. } else {
  242. # so we do want to collapse it, so push it on our list and return a
  243. # reference to where we stuck it
  244. push @stuff, $_;
  245. return "{X$#stuff}";
  246. }
  247. }
  248. sub expand_stuff {
  249. # now put the hrefs back:
  250. # just replace all our special references of the form {X#}
  251. # with the corresponding index in the hrefs array
  252. # don't forget that each element is actually a _ref_ to the link
  253. # which explains why we need to wrap it in ${ }
  254. $_[0] =~ s/{X(\d+)}/${$stuff[$1]}/g;
  255. elapsed('done expanding');
  256. }