unigrabber.pl - This script is free software by the terms o…

/unigrabber.pl

https://bitbucket.org/naegele/unigrabber · Perl · 115 lines · 83 code · 12 blank · 20 comment · 6 complexity · 35f7e4f02aa0d1395e064cde72bb8cb3 MD5 · raw file

#!/usr/bin/perl

# This script is free software by the terms of the GPLv3

# Contact:
#   Daniel Nägele -  daniel.n.wb@gmail.com
# Thanks to okbr and FaKeller (bitbucket usernames) for the help with the stylesheet

# Follow the comments here; also make sure the directory
# /mirror/password (whatever you choose) is created before running the script.
# Besides a running perl interpreter, this script needs the perl module
# LWP::Simple and the linux utility tar (exchangeable).

use LWP::Simple;

# File locations, relative to the directory the script is started from:
#   $url_file  - input list that is read line by line on every pass
#   $html_file - generated page linking to the originals and the mirror
($url_file, $html_file) = ('./urls.txt', './index.html');

# Advance to the next numbered file of the current collection.
#
# Works on package globals shared with the main loop:
#   reads   @url, $i   - $url[$i] is the URL template containing <NUM>
#   updates $nr        - running file number, incremented on every call
#   sets    $full_url  - the template with <NUM> replaced by $nr
#   sets    $filename  - last path component of $full_url
#
# A template may write the placeholder as 0<NUM> to get zero-padded single
# digits (01..09); from 10 onwards that literal 0 is dropped.
#
# Argument: none in normal use. Any defined argument marks the call as the
# one-off retry issued below and suppresses a further retry.
sub prepnext
{
    # Capture the flag with defined() rather than comparing against undef
    # numerically, which only worked by accident and warns under -w.
    my $is_retry = defined $_[0];

    $nr++;
    $full_url = $url[$i];
    if ($nr > 9)
    {
        $full_url =~ s/0<NUM>/<NUM>/g;
    }
    $full_url =~ s/<NUM>/$nr/g;
    $filename = (split m{/}, $full_url)[-1];

    # Tolerate exactly one missing number in a series: if this URL is not
    # reachable, try the following number once. The retry flag is checked
    # first so the retry pass does not issue a HEAD request it would ignore.
    if (not $is_retry and not head($full_url))
    {
        prepnext(1);
    }
    return;
}

# Main loop: on every pass re-read the URL list, mirror all reachable files,
# regenerate the HTML index, rebuild the tar archive, then sleep for 8 hours.
#
# The URL file is read as pairs of lines: a collection name (also used as the
# directory name below mirror/ and as the section heading) followed by a URL
# template that prepnext() expands via its <NUM> placeholder.
#
# @url, $i, $nr, $full_url and $filename stay package globals on purpose:
# prepnext() reads and writes them.
while (1)
{
    open(my $url_fh, '<', $url_file)
        or die "Cannot read $url_file: $!";
    chomp(@url = <$url_fh>); # read the url file by line
    close($url_fh);

    open(my $html, '>', $html_file)
        or die "Cannot write $html_file: $!";
    # insert custom html title/stylesheet below
    print {$html} "
        <html>\n
        <head>\n
          <title>PDF-Links Informatik SS12</title>\n
          <link rel='stylesheet' type='text/css' href='style.css'/>\n
        </head>\n
        <body>\n
        <div id='content'>\n";

    # loop over each (name, url template) pair; $i indexes the template
    for ($i = 1; $i <= $#url; $i = $i + 2)
    {
        # this code assembles the path of the copies on your server
        my $path = "mirror/" . $url[$i - 1];
        unless (-d $path)
        {
            # Not fatal: the links to the original files are still useful.
            mkdir $path
                or warn "Cannot create directory $path: $!";
        }
        # header for a collection of files
        print {$html} "<h3>" . $url[$i - 1] . "</h3>\n";

        $nr = 0;
        prepnext();

        # loop over the files at the url
        while (head($full_url)) # checks availability
        {
            my $mirror_file = $path . "/" . $filename;
            my $is_tex      = $filename =~ /tex$/;

            # TeX sources are fetched on every pass; everything else only if
            # the mirrored copy does not exist yet. The existence check must
            # look inside the mirror directory, not the working directory.
            if ($is_tex or not -e $mirror_file)
            {
                getstore($full_url, $mirror_file);
            }

            # prints the links to the output file, especially 'Blatt'
            if ($is_tex)
            {
                # Swap only the extension, not an earlier "tex" in the name.
                (my $pdf_name = $filename) =~ s/tex$/pdf/;

                print {$html} "<p><a href='" . $full_url .  "'>TexFile</a>\n";
                print {$html} "<a class='alt' id='alt' href='./" . $mirror_file .
                              "'>(mirror)</a></p>\n";
                print {$html} "<p><a href='./" . $path . "/" .
                              $pdf_name . "'>PDF File (aktuell)</a></p>\n";
            }
            else
            {
                print {$html} "<p><a href='" . $full_url .  "'>Blatt " . $nr .  "</a>\n";
                print {$html} "<a class='alt' id='alt' href='./" . $mirror_file .
                              "'>(mirror)</a></p>\n";
            }
            prepnext();
        }
    }
    # again, custom html, styles and the footer
    print {$html} "
        <div style='clear: both;'></div>\n
        </div>\n
        <div id='footer'>\n
        <div class='grid_half lft'>\n
            <a href='./mirror/mirror.tar'>Archiv</a><br>\n
            <a href='https://bitbucket.org/naegele/unigrabber'>Source</a>
        </div>\n
        <div class='grid_half rgt'>\n
        Erstellt: " . scalar localtime() .  "</div>\n</div>\n </body>\n</html>";
    # Buffered write errors only surface on close.
    close($html)
        or warn "Cannot finish writing $html_file: $!";

    # external commands (list form, no shell involved): the archive ...
    system('tar', '-cf', 'mirror/mirror.tar', './mirror');
    # ... and the PDF built from the mirrored scribe notes
    system('rubber', '-d', '--into', 'mirror/DSA TexFile/',
           'mirror/DSA TexFile/scribe_notes.tex');
    print "Updated. \n";
    # ugly method below, cronjob recommended
    sleep(28800); # = 8h
    # Drop the TeX source before the next pass.
    # NOTE(review): presumably so the next pass starts from a fresh download - confirm.
    unlink 'mirror/DSA TexFile/scribe_notes.tex';
}