linkifier | searchcode

/linkifier

https://bitbucket.org/d20pfsrd/linkifier
Perl | 335 lines | 210 code | 71 blank | 54 comment | 18 complexity | c99bae95ba4dd31b6293aa6e4e448ffb MD5 | raw file
Possible License(s): AGPL-3.0

#! /usr/bin/perl

use strict;
use autodie;
use warnings;


#########################
## DEBUGGING
#########################

# this is just to keep debugging from interfering with the actual script
sub debug_setup {
  # only need this for debugging
  use Time::HiRes qw< gettimeofday tv_interval >;
  $main::t0 = [gettimeofday];

  # try to load Debuggit; use it if possible, otherwise just do enough to keep the compiler from
  # complaining

  if ( eval { require 'Debuggit.pm' } ) {
    # set debugging level here
    Debuggit->import(DEBUG => 1);
  }

  else {
    eval q{
      use constant DEBUG => 0;
      sub debuggit {}
    }
  }
}

sub elapsed {
  debuggit(1 => @_, tv_interval $main::t0);
}

BEGIN { debug_setup() }


#########################
## MODULES
#########################

use File::Spec;
use Getopt::Std;
use Tie::IxHash;
use File::Basename;
use Net::Google::Spreadsheets;
use Text::Balanced qw< gen_extract_tagged extract_multiple >;

elapsed('done loading modules');


#########################
## OPTIONS
#########################

our $ME = basename($0);

our($opt_h, $opt_l, $opt_p, $opt_u);
getopts('hlp:u:');

$opt_p ||= 'XXX';
$opt_u ||= 'XXX';

my $preserve_links = $opt_l;

$Getopt::Std::STANDARD_HELP_VERSION = 'true';

my $help = qq{
  Usage: $ME [OPTIONS] <INPUT_FILE>

  Linkifier produces output that "linkifies" terms specified in a datafile,
  turning them into HTML links that lead to URL values provided in that
  datafile.

      -h            display this help information and exit

      -l            preserve links already extant in the file
                    (this may be subject to change)

      -p PASS       specify a password

      -u USER       specify a username

  EXAMPLES:

      with neither username nor password preconfigured, specify both:

          $ME -u USER -p PASS inputfile.txt

      to save output in a file (using a redirect), with username and
      password preconfigured:

          $ME inputfile.txt > outputfile.txt

      the same thing, specifying username and password:

          $ME -u USER -p PASSWORD inputfile.txt > outputfile.txt

  CONFIGURATION:  Within the linkifier program source code, there are two
  instances of the string 'XXX'.  These instances can be replaced with a
  username and password for your Google account used to access a Google
  Spreadsheets datafile.  This allows you to run the program without having
  to manually enter a username and password every time you execute the
  program.  WARNING!  Do NOT share a version of the program that contains
  your username and password with anyone!  For developers using a shared
  version control system to work on this program, this means you should
  NEVER commit a version of the program that contains your username and
  password.  Using the -u and -p options should always be the preferred
  means of specifying username and password.

  KNOWN BUGS:  One of the current developers (as of this writing) will try
  to ensure that known bugs are always tracked using the issue tracker at
  a public BitBucket repository:

      http://bitbucket.org/d20pfsrd/linkifier/issues

  If you wish to report any bugs, you may use that issue tracker or (at
  this time at least) the d20pfsrd-contributors Google group.

};

if ($opt_h) {
  print $help;
  exit;
}

sub HELP_MESSAGE {
  print "Use the -h option for a complete help message.\n";
  exit;
}

elapsed('done checking options');


#########################
## BEGIN MAIN PROGRAM
#########################


our @stuff;
my $links = read_phrases();

local $/ = undef;
while ( <> ) {
  # disguise some stuff so it doesn't get replaced
  collapse_stuff($_);

  # modify it
  foreach my $phrase (keys %$links) {
    my $re = quotemeta($phrase);        # JIC phrase includes regex metachars
    my $count = s{
      \b$re\b                           # find phrase, only as separate words
    }

    {
      scalar(
        # push the link (includes the original phrase) onto our href array
        (push @stuff, \$links->{$phrase}),

        # replace with a reference to the index in the href array
        "{X$#stuff}"
      )
    }xeg;

    debuggit(5 => $phrase, '=>', $count) if $phrase =~ /^E/;
  }

  elapsed('done transforming');

  expand_stuff($_);								# ... put all the links back ...

  print;
}

elapsed('done with all files');


#########################
## SUBS
#########################

use constant PHRASE_FILE => File::Spec->tmpdir . '/linkifier.phrases';

sub read_phrases {
  # no clue why this is necessary
  no warnings 'once';

  my %links;

  if ( -e PHRASE_FILE ) {
    if (open(IN, '<', PHRASE_FILE)) {
      warn("reading phrases from ", PHRASE_FILE);
      tie %links, 'Tie::IxHash';

      while ( <IN> ) {
        chomp;
        my ($key, $value) = split("\t");
        $links{$key} = $value;
      }

      close(IN);
      elapsed('done reading');
    } else {
      die("phrase file exists but can't open it");
    }
  } else {
    # read Google spreadsheet contents containing link substituions
    my $google = Net::Google::Spreadsheets->new(
      password => $opt_p,
      username => $opt_u,
    );

    my $ssheet = $google->spreadsheet(
      { title => 'd20pfsrd-string-replacements' }
    );

    my $wsheet = $ssheet->worksheet();
    my %values = map { $_->content || () } $wsheet->cells();
    elapsed('done reading');

    # now turn that into a sorted hash
    # first sort criteria: length of phrase (longest first)
    # this guarantees we'll catch (e.g.) "disguise spell" before "disguise"
    # second sort criteria: alphabetic
    #   probably not necessary, but I'm leaving it in for now
    tie %links, 'Tie::IxHash', map {
      $_ => $values{$_}
    } sort { length($b) <=> length($a) || $a cmp $b } keys %values;

    debuggit(4 => "links", DUMP => \%links);
    elapsed('done sorting');

    # now save for future use
    if (open(OUT, '>', PHRASE_FILE)) {
      print OUT join("\t", $_, $links{$_}), "\n" foreach keys %links;
      close(OUT);
    } else {
      die("can't create phrase file");
    }
  }

  return \%links;
}


my ($extract_hrefs, $extract_headers);
sub collapse_stuff
{
  # this will make this run much faster; see POD for Text::Balanced
  # need 3rd argument in each to preserve whitespace before tags
  #
  # we will build 2 extractors:
  # 1) one for href links, and their contents
  # 2) one for headers (<h1>, <h2>, etc) and their contents
  $extract_hrefs = gen_extract_tagged(
    qr/<a.+?href.+?>/i, undef, ''
  ) unless $extract_hrefs;

  $extract_headers = gen_extract_tagged(
    qr/<h\d.*?>/i, undef, ''
  ) unless $extract_headers;

  # this is an algorithm cribbed from Filter::Simple (by the excellent
  # Damian Conway)
  #
  # this is the second time I've stolen it, primarily because it worked
  # so well the first time
  #
  # this one will replace <a href> type tags with placeholders; each
  # placeholder will have a number in it that corresponds to an index
  # in an array
  #
  # the array helps us put everything back exactly as it was at the end
  # (see expand_stuff)
  #
  # the reason we do this is so that we can recognize a phrase that has
  # _already_ been linked, and make sure we don't try to link it again

  $_[0] = join(
    '', map { collapse() } extract_multiple($_[0],
      [   # munge the following things, so they don't get replaced:

          # <a> tags which are hrefs, and their contents
          { HREF => $extract_hrefs },

          # <h#> tags (headers), and their contents
          { H => $extract_headers },

          # feature names (phrases followed by (EX), (Sp), or (Su))
          { FNAME => qr/\w[\w\s]*?\((?:Ex|Sp|Su)\)/ },
      ]
    )
  );

  debuggit(4 => "after replacing links, HTML looks like:", $_[0]);
  debuggit(3 => "href array is", DUMP => \@stuff);
  elapsed('done collapsing');
}

sub collapse {
  my $type = ref $_;
  if (not $type) {
    # no type means this is not something we want to collapse,
    # so just throw it right back
    return $_;
  } elsif ($type eq 'HREF') {
    if ($preserve_links) {
      # we'll collapse it; this should match the outer else code
      push @stuff, $_;
      return "{X$#stuff}";
    } else {
      # instead of collapsing, we're going to strip the tags off and leave just the content
      $$_ =~ m@<.*?>(.*?)</.*?>@;
      return $1;
    }
  } else {
    # so we do want to collapse it, so push it on our list and return a
    # reference to where we stuck it
    push @stuff, $_;
    return "{X$#stuff}";
  }
}

sub expand_stuff {
  # now put the hrefs back:
  # just replace all our special references of the form {X#}
  # with the corresponding index in the hrefs array
  # don't forget that each element is actually a _ref_ to the link
  # which explains why we need to wrap it in ${ }
  $_[0] =~ s/{X(\d+)}/${$stuff[$1]}/g;
  elapsed('done expanding');
}