/linkifier
Perl | 335 lines | 210 code | 71 blank | 54 comment | 18 complexity | c99bae95ba4dd31b6293aa6e4e448ffb MD5 | raw file
Possible License(s): AGPL-3.0
- #! /usr/bin/perl
- use strict;
- use autodie;
- use warnings;
- #########################
- ## DEBUGGING
- #########################
- # this is just to keep debugging from interfering with the actual script
- sub debug_setup {
- # only need this for debugging
- use Time::HiRes qw< gettimeofday tv_interval >;
- $main::t0 = [gettimeofday];
- # try to load Debuggit; use it if possible, otherwise just do enough to keep the compiler from
- # complaining
- if ( eval { require 'Debuggit.pm' } ) {
- # set debugging level here
- Debuggit->import(DEBUG => 1);
- }
- else {
- eval q{
- use constant DEBUG => 0;
- sub debuggit {}
- }
- }
- }
- sub elapsed {
- debuggit(1 => @_, tv_interval $main::t0);
- }
- BEGIN { debug_setup() }
- #########################
- ## MODULES
- #########################
- use File::Spec;
- use Getopt::Std;
- use Tie::IxHash;
- use File::Basename;
- use Net::Google::Spreadsheets;
- use Text::Balanced qw< gen_extract_tagged extract_multiple >;
- elapsed('done loading modules');
- #########################
- ## OPTIONS
- #########################
- our $ME = basename($0);
- our($opt_h, $opt_l, $opt_p, $opt_u);
- getopts('hlp:u:');
- $opt_p ||= 'XXX';
- $opt_u ||= 'XXX';
- my $preserve_links = $opt_l;
- $Getopt::Std::STANDARD_HELP_VERSION = 'true';
- my $help = qq{
- Usage: $ME [OPTIONS] <INPUT_FILE>
- Linkifier produces output that "linkifies" terms specified in a datafile,
- turning them into HTML links that lead to URL values provided in that
- datafile.
- -h display this help information and exit
- -l preserve links already extant in the file
- (this may be subject to change)
- -p PASS specify a password
- -u USER specify a username
- EXAMPLES:
- with neither username nor password preconfigured, specify both:
- $ME -u USER -p PASS inputfile.txt
- to save output in a file (using a redirect), with username and
- password preconfigured:
- $ME inputfile.txt > outputfile.txt
- the same thing, specifying username and password:
- $ME -u USER -p PASSWORD inputfile.txt > outputfile.txt
- CONFIGURATION: Within the linkifier program source code, there are two
- instances of the string 'XXX'. These instances can be replaced with a
- username and password for your Google account used to access a Google
- Spreadsheets datafile. This allows you to run the program without having
- to manually enter a username and password every time you execute the
- program. WARNING! Do NOT share a version of the program that contains
- your username and password with anyone! For developers using a shared
- version control system to work on this program, this means you should
- NEVER commit a version of the program that contains your username and
- password. Using the -u and -p options should always be the preferred
- means of specifying username and password.
- KNOWN BUGS: One of the current developers (as of this writing) will try
- to ensure that known bugs are always tracked using the issue tracker at
- a public BitBucket repository:
- http://bitbucket.org/d20pfsrd/linkifier/issues
- If you wish to report any bugs, you may use that issue tracker or (at
- this time at least) the d20pfsrd-contributors Google group.
- };
- if ($opt_h) {
- print $help;
- exit;
- }
- sub HELP_MESSAGE {
- print "Use the -h option for a complete help message.\n";
- exit;
- }
- elapsed('done checking options');
- #########################
- ## BEGIN MAIN PROGRAM
- #########################
- our @stuff;
- my $links = read_phrases();
- local $/ = undef;
- while ( <> ) {
- # disguise some stuff so it doesn't get replaced
- collapse_stuff($_);
- # modify it
- foreach my $phrase (keys %$links) {
- my $re = quotemeta($phrase); # JIC phrase includes regex metachars
- my $count = s{
- \b$re\b # find phrase, only as separate words
- }
- {
- scalar(
- # push the link (includes the original phrase) onto our href array
- (push @stuff, \$links->{$phrase}),
- # replace with a reference to the index in the href array
- "{X$#stuff}"
- )
- }xeg;
- debuggit(5 => $phrase, '=>', $count) if $phrase =~ /^E/;
- }
- elapsed('done transforming');
- expand_stuff($_); # ... put all the links back ...
- print;
- }
- elapsed('done with all files');
- #########################
- ## SUBS
- #########################
- use constant PHRASE_FILE => File::Spec->tmpdir . '/linkifier.phrases';
- sub read_phrases {
- # no clue why this is necessary
- no warnings 'once';
- my %links;
- if ( -e PHRASE_FILE ) {
- if (open(IN, '<', PHRASE_FILE)) {
- warn("reading phrases from ", PHRASE_FILE);
- tie %links, 'Tie::IxHash';
- while ( <IN> ) {
- chomp;
- my ($key, $value) = split("\t");
- $links{$key} = $value;
- }
- close(IN);
- elapsed('done reading');
- } else {
- die("phrase file exists but can't open it");
- }
- } else {
- # read Google spreadsheet contents containing link substituions
- my $google = Net::Google::Spreadsheets->new(
- password => $opt_p,
- username => $opt_u,
- );
- my $ssheet = $google->spreadsheet(
- { title => 'd20pfsrd-string-replacements' }
- );
- my $wsheet = $ssheet->worksheet();
- my %values = map { $_->content || () } $wsheet->cells();
- elapsed('done reading');
- # now turn that into a sorted hash
- # first sort criteria: length of phrase (longest first)
- # this guarantees we'll catch (e.g.) "disguise spell" before "disguise"
- # second sort criteria: alphabetic
- # probably not necessary, but I'm leaving it in for now
- tie %links, 'Tie::IxHash', map {
- $_ => $values{$_}
- } sort { length($b) <=> length($a) || $a cmp $b } keys %values;
- debuggit(4 => "links", DUMP => \%links);
- elapsed('done sorting');
- # now save for future use
- if (open(OUT, '>', PHRASE_FILE)) {
- print OUT join("\t", $_, $links{$_}), "\n" foreach keys %links;
- close(OUT);
- } else {
- die("can't create phrase file");
- }
- }
- return \%links;
- }
- my ($extract_hrefs, $extract_headers);
- sub collapse_stuff
- {
- # this will make this run much faster; see POD for Text::Balanced
- # need 3rd argument in each to preserve whitespace before tags
- #
- # we will build 2 extractors:
- # 1) one for href links, and their contents
- # 2) one for headers (<h1>, <h2>, etc) and their contents
- $extract_hrefs = gen_extract_tagged(
- qr/<a.+?href.+?>/i, undef, ''
- ) unless $extract_hrefs;
- $extract_headers = gen_extract_tagged(
- qr/<h\d.*?>/i, undef, ''
- ) unless $extract_headers;
- # this is an algorithm cribbed from Filter::Simple (by the excellent
- # Damian Conway)
- #
- # this is the second time I've stolen it, primarily because it worked
- # so well the first time
- #
- # this one will replace <a href> type tags with placeholders; each
- # placeholder will have a number in it that corresponds to an index
- # in an array
- #
- # the array helps us put everything back exactly as it was at the end
- # (see expand_stuff)
- #
- # the reason we do this is so that we can recognize a phrase that has
- # _already_ been linked, and make sure we don't try to link it again
- $_[0] = join(
- '', map { collapse() } extract_multiple($_[0],
- [ # munge the following things, so they don't get replaced:
- # <a> tags which are hrefs, and their contents
- { HREF => $extract_hrefs },
- # <h#> tags (headers), and their contents
- { H => $extract_headers },
- # feature names (phrases followed by (EX), (Sp), or (Su))
- { FNAME => qr/\w[\w\s]*?\((?:Ex|Sp|Su)\)/ },
- ]
- )
- );
- debuggit(4 => "after replacing links, HTML looks like:", $_[0]);
- debuggit(3 => "href array is", DUMP => \@stuff);
- elapsed('done collapsing');
- }
- sub collapse {
- my $type = ref $_;
- if (not $type) {
- # no type means this is not something we want to collapse,
- # so just throw it right back
- return $_;
- } elsif ($type eq 'HREF') {
- if ($preserve_links) {
- # we'll collapse it; this should match the outer else code
- push @stuff, $_;
- return "{X$#stuff}";
- } else {
- # instead of collapsing, we're going to strip the tags off and leave just the content
- $$_ =~ m@<.*?>(.*?)</.*?>@;
- return $1;
- }
- } else {
- # so we do want to collapse it, so push it on our list and return a
- # reference to where we stuck it
- push @stuff, $_;
- return "{X$#stuff}";
- }
- }
- sub expand_stuff {
- # now put the hrefs back:
- # just replace all our special references of the form {X#}
- # with the corresponding index in the hrefs array
- # don't forget that each element is actually a _ref_ to the link
- # which explains why we need to wrap it in ${ }
- $_[0] =~ s/{X(\d+)}/${$stuff[$1]}/g;
- elapsed('done expanding');
- }