PageRenderTime 47ms CodeModel.GetById 16ms RepoModel.GetById 0ms app.codeStats 1ms

/uplug-main/lib/Uplug.pm

https://bitbucket.org/tiedemann/uplug
Perl | 549 lines | 329 code | 139 blank | 81 comment | 47 complexity | 707d2007fde4ad3e2ef87adfc4425ec3 MD5 | raw file
Possible License(s): GPL-3.0, LGPL-2.1, BSD-3-Clause
  1. #-*-perl-*-
  2. #---------------------------------------------------------------------------
  3. # Copyright (C) 2004-2012 Joerg Tiedemann
  4. #
  5. # This program is free software; you can redistribute it and/or modify
  6. # it under the terms of the GNU General Public License as published by
  7. # the Free Software Foundation; either version 2 of the License, or
  8. # (at your option) any later version.
  9. #
  10. # This program is distributed in the hope that it will be useful,
  11. # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. # GNU General Public License for more details.
  14. #
  15. # You should have received a copy of the GNU General Public License
  16. # along with this program; if not, write to the Free Software
  17. # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. #---------------------------------------------------------------------------
  19. =head1 NAME
  20. Uplug - a toolbox for processing (parallel) text corpora
  21. =head1 SYNOPSIS
  22. $module = 'pre/basic';
  23. %args = ( '-in' => $input_file_name,
  24. '-ci' => $input_char_encoding );
  25. my $uplug=Uplug->new($module, %args); # create a new uplug module
  26. $uplug->load(); # load it
  27. $uplug->run(); # and run it
  28. =head1 DESCRIPTION
  29. This library provides the main methods for loading Uplug modules and running them. Configuration files describe the module and its parameters (see L<Uplug::Config>). Each module may contain a number of sub-modules. Each of them can usually calls the uplug scripts provided in the package.
  30. =head1 USAGE
  31. More information on how to use the Uplug toolkit with the provided modules can be found here:
  32. L<uplug>
  33. Add-ons and language-specific modules can be downloaded from the Uplug project website at bitbucket: L<https://bitbucket.org/tiedemann/uplug>
  34. =cut
  35. package Uplug;
  36. require 5.005;
  37. use strict;
  38. use IO::File;
  39. use File::Temp qw/ :POSIX /;
  40. use Uplug::Config;
  41. use File::Basename;
  42. use FindBin qw($Bin);
  43. use vars qw($VERSION $AUTHOR $DEBUG);
  44. use vars qw(@TempFiles);
  45. $VERSION = '0.3.10';
  46. $AUTHOR = 'Joerg Tiedemann';
  47. $DEBUG = 0;
  48. #-----------------------------------------------------------------------
  49. BEGIN{
  50. setpgrp(0,0); # become leader of the process group
  51. $SIG{HUP}=sub{die "# Uplug.pm: hangup";};
  52. }
  53. END{
  54. local $SIG{HUP}='IGNORE'; # ignore HANGUP signal for right now
  55. kill ('HUP',-$$); # kill child processes before you die
  56. }
  57. #-----------------------------------------------------------------------
  58. =head1 Class methods
  59. =head2 Constructor
  60. $uplug = new Uplug ( $module, %args )
  61. Construct a new Uplug object for the given Uplug module ($module refers to a configuration file). Module arguments are specified in C<%args> and depend on the module. For more information about specific Uplug modules, use the Uplug startup script:
  62. uplug -h module-name
  63. =cut
  64. sub new{
  65. my $class=shift;
  66. my $configfile=shift;
  67. my $self={};
  68. bless $self,$class;
  69. $self->{CONFIGFILE} = $configfile;
  70. $self->{CONFIG} = &ReadConfig($configfile,@_);
  71. mkdir 'data',0755 if (! -d 'data');
  72. mkdir 'data/runtime',0755 if (! -d 'data/runtime');
  73. $self->{RUNTIMEDIR} = 'data/runtime/'.$$;
  74. mkdir $self->{RUNTIMEDIR},0755 if (! -d $self->{RUNTIMEDIR});
  75. return $self;
  76. }
  77. ##---------------------------------------------------------------------
  78. ## DESTROY: clean up! remove all temporary files and directories!
  79. sub DESTROY{
  80. my $self=shift;
  81. if ($DEBUG){exit;}
  82. unlink $self->{MODULE};
  83. if (ref($self->{TEMPFILES}) eq 'ARRAY'){
  84. unlink @{$self->{TEMPFILES}};
  85. }
  86. rmdir $self->{RUNTIMEDIR};
  87. }
  88. =head2 C<load>
  89. $uplug->load()
  90. Load the module given in the constructor and all its sub-modules. This also creates temporary configuration files with adjusted parameters in C<data/runtime>.
  91. =cut
  92. ##---------------------------------------------------------------------
  93. ## load module configurations
  94. ## * create runtime config files the module and all submodules
  95. sub load{
  96. my $self=shift;
  97. my $count=1;
  98. my $runtime = $self->{RUNTIMEDIR}.'/';
  99. $runtime .= basename($self->{CONFIGFILE});
  100. while (-e $runtime.$count){$count++;}
  101. $self->{MODULE} = $runtime.$count;
  102. push(@{$self->{TEMPFILES}},$self->{MODULE});
  103. &WriteConfig($self->{MODULE},$self->{CONFIG});
  104. $self->loadSubMods();
  105. $self->data($self->output()); # my own data is available
  106. }
  107. =head2 C<run>
  108. $uplug->run()
  109. Run all commands specified in all sub-modules. Pipeline commands will be constructed according to the sequence of sub-modules and the specifications in the Uplug configuration files. The will be simply executed as external system calls.
  110. =cut
  111. ##---------------------------------------------------------------------
  112. ## run the Uplug module (and all its submodules)
  113. ## * get the system command
  114. ## * split it up into separate system calls
  115. ## * run the system calls and print elapsed time/call
  116. sub run{
  117. my $self=shift;
  118. my $cmd=$self->command();
  119. my @seq=split(/;/,$cmd); # split command sequence
  120. my $start=time();
  121. for (@seq){
  122. my $time=time();
  123. print STDERR "$_\n---------------------------------------------\n";
  124. if (my $sig=system ($_)){
  125. print STDERR "# Uplug.pm: Got signal $? from child process:\n";
  126. print STDERR "# $_\n";
  127. return 0;
  128. }
  129. $time=time()-$time;
  130. my ($sec,$min,$hour,$mday,$mon,$year)=gmtime($time);
  131. printf STDERR
  132. " processing time: %2d:%2d:%2d:%2d:%2d:%2d\n",
  133. $year-70,$mon,$mday-1,$hour,$min,$sec;
  134. }
  135. $start=time()-$start;
  136. my ($sec,$min,$hour,$mday,$mon,$year)=gmtime($start);
  137. printf STDERR
  138. " total processing time: %2d:%2d:%2d:%2d:%2d:%2d\n",
  139. $year-70,$mon,$mday-1,$hour,$min,$sec;
  140. }
  141. =head1 Class-internal methods
  142. =head2 C<loadSubMods>
  143. Load all sub-modules and adjust input and output according to the configuration files and the current pipe-line. Output streams will be used as input streams with the same name for the next sub-module. This method tries to find possible pipelines for combining commands.
  144. =cut
  145. ##---------------------------------------------------------------------
  146. ## create config files for all sub-modules
  147. ## * modify input/output according to the data in the module sequence
  148. ## * check if I can use pipes (stdout -> stdin)
  149. ## * expand loops
  150. sub loadSubMods{
  151. my $self=shift;
  152. my $submod=&GetParam($self->{CONFIG},'module','submodules');
  153. my $loop=&GetParam($self->{CONFIG},'module','loop');
  154. my ($loopstart,$loopend)=split(/:/,$loop);
  155. my $iter=&GetParam($self->{CONFIG},'module','iterations');
  156. if (ref($submod) eq 'ARRAY'){
  157. $self->{SUBMOD}=[]; # initialize sub-module array
  158. my $count=1; # iteration counter
  159. my $input=$self->input; # my input will be
  160. my $data=$self->data($input); # the initial data collection
  161. my $stdout; # is defined if previous module produces STDOUT
  162. my $i=0; # sub-module number
  163. my $n=0; # module number in the sequence
  164. while ($i<@$submod){
  165. if ((defined $iter) and ($count>$iter)){last;}
  166. my ($conf,@par)=split(/\s+/,$submod->[$i]);
  167. $i++ && next unless (-e &FindConfig($conf)); # skip modules without config
  168. $self->{SUBMOD}->[$n]=Uplug->new($conf,@par); # check also params
  169. $self->{SUBMOD}->[$n]->input($data); # change input
  170. ## check if stdout in last module but no stdin now
  171. ## --> if yes: broken pipe!
  172. my $broken=0;
  173. my $stdin=$self->{SUBMOD}->[$n]->stdin();
  174. if ($stdout and (not $stdin)){
  175. $broken = 1;
  176. }
  177. ## otherwise if STDIN and STDOUT:
  178. ## check if any output file is in use
  179. ## if yes --> broken pipe
  180. elsif ($stdin and $stdout){
  181. my $out=$self->{SUBMOD}->[$n]->output();
  182. if (ref($out) eq 'HASH'){
  183. for (keys %$out){
  184. if ((exists $out->{file}) and
  185. $self->FileInUse($out->{file})){
  186. $broken=1;
  187. last;
  188. }
  189. }
  190. }
  191. }
  192. ## if pipe is broken:
  193. ## * save to temp file if no file given
  194. ## * delete 'stdout' flag from config file
  195. if ($broken){
  196. if (not &GetParam($self->{SUBMOD}->[$n-1]->{CONFIG},
  197. 'output',$stdout,'file')){
  198. my $tmpfile=$self->NewTempFile();
  199. &SetParam($self->{SUBMOD}->[$n-1]->{CONFIG},
  200. $tmpfile,'output',$stdout,'file');
  201. &SetParam($data,$tmpfile,'output',$stdout,'file');
  202. $self->{SUBMOD}->[$n-1]->load();
  203. }
  204. &SetParam($self->{SUBMOD}->[$n-1]->{CONFIG},
  205. undef,'module','stdout');
  206. }
  207. ## change input data according to available data-spec
  208. ## load the current module
  209. $self->{SUBMOD}->[$n]->load(); # load module
  210. $stdout=$self->{SUBMOD}->[$n]->stdout();
  211. my $new=$self->{SUBMOD}->[$n]->data(); # get new output
  212. $data=$self->data($new); # set new data
  213. ## jump back to the loop start
  214. ## (if a loop is defiend)
  215. if ((defined $loopend) and ($i==$loopend)){
  216. $count++;
  217. $i=$loopstart-1;
  218. }
  219. $i++;$n++;
  220. }
  221. # if there is at least one submodule:
  222. # my output should be the one produced by the last submodule
  223. if (@$submod){
  224. my $output=$self->output;
  225. $self->{SUBMOD}->[-1]->output($output);
  226. my $data=$self->data($output);
  227. }
  228. $self->data($data);
  229. }
  230. }
  231. =head2 C<command>
  232. $cmd = $uplug->command()
  233. Return a sequence of system commands for the entire pipeline. Commands are separated by ';'. System command may include several pipelines through STDIN/STDOUT.
  234. =cut
  235. ##---------------------------------------------------------------------
  236. ## return the system command to be called for this Uplug module
  237. ## (including all sub-modules, pipes, ...)
  238. sub command{
  239. my $self=shift;
  240. my $stdout=shift;
  241. if (ref($self->{SUBMOD}) eq 'ARRAY'){
  242. my $cmd;
  243. my $loop=&GetParam($self->{CONFIG},'module','loop');
  244. my ($loopstart,$loopend)=split(/:/,$loop);
  245. my $iter=&GetParam($self->{CONFIG},'module','iterations');
  246. my $count=0;
  247. for my $s (@{$self->{SUBMOD}}){
  248. my $c=$s->command($cmd,$stdout);
  249. my $stdin=$s->stdin();
  250. if ($stdout and $stdin){
  251. $cmd.=' | '.$c;
  252. }
  253. elsif ($cmd){$cmd.=';'.$c;}
  254. else{$cmd=$c;}
  255. $stdout=$s->stdout;
  256. }
  257. return $cmd;
  258. }
  259. my $bin=&GetParam($self->{CONFIG},'module','location');
  260. my $cmd=&GetParam($self->{CONFIG},'module','program');
  261. if (-f $bin.'/'.$cmd){$cmd=$bin.'/'.$cmd;}
  262. # if (-f $Bin.'/'.$cmd){$cmd=$Bin.'/'.$cmd;}
  263. $cmd.=' -i '.$self->{MODULE};
  264. if ($DEBUG){
  265. $cmd='perl -d:DProf '.$cmd;
  266. }
  267. return $cmd;
  268. }
  269. =head2 C<input>
  270. Change the input settings in a particular configuration.
  271. =cut
  272. ##---------------------------------------------------------------------
  273. ## change input settings in the module configuraton
  274. ## (only for the ones that exist already)
  275. ## and write changes to the physical config file
  276. #
  277. # we make exception: the attribute 'root' should NOT be overwritten!
  278. # (this is necessary to not change the root tag in pipelines!)
  279. # TODO: this is not a nice solution! Is there a more general solution?!
  280. sub input{
  281. my $self=shift;
  282. my ($input)=@_;
  283. if (ref($input) eq 'HASH'){
  284. foreach (keys %$input){
  285. if (&GetParam($self->{CONFIG},'input',$_)){
  286. ## don't change the root attribute if it exists
  287. ## (this is quite a hack - awful!!!)
  288. if ((exists $self->{CONFIG}->{input}->{$_}->{root}) &&
  289. (exists $input->{$_}->{root})){
  290. $input->{$_}->{root} = $self->{CONFIG}->{input}->{$_}->{root};
  291. }
  292. &SetParam($self->{CONFIG},$input->{$_},'input',$_);
  293. }
  294. $self->{DATA}->{$_}=$input->{$_};
  295. }
  296. if (exists $self->{MODULE}){
  297. &WriteConfig($self->{MODULE},$self->{CONFIG});
  298. }
  299. }
  300. return &GetParam($self->{CONFIG},'input');
  301. }
  302. =head2 C<output>
  303. Change the output settings in a particular configuration.
  304. =cut
  305. ##---------------------------------------------------------------------
  306. ## change output settings in the module configuraton
  307. ## (only for the ones that exist already)
  308. ## and write changes to the physical config file
  309. sub output{
  310. my $self=shift;
  311. my ($output)=@_;
  312. if (ref($output) eq 'HASH'){
  313. foreach (keys %$output){
  314. if (&GetParam($self->{CONFIG},'output',$_)){
  315. &SetParam($self->{CONFIG},$output->{$_},'output',$_);
  316. }
  317. $self->{DATA}->{$_}=$output->{$_};
  318. }
  319. $self->load();
  320. }
  321. return &GetParam($self->{CONFIG},'output');
  322. }
  323. =head2 C<data>
  324. Set/return data files available in the current pipeline.
  325. =cut
  326. ##---------------------------------------------------------------------
  327. ## set/return available data
  328. ## (here we store al kinds of data available in the module sequence)
  329. sub data{
  330. my $self=shift;
  331. my ($data)=@_;
  332. if (ref($data) eq 'HASH'){
  333. foreach (keys %$data){
  334. $self->{DATA}->{$_}=$data->{$_};
  335. }
  336. }
  337. if (ref($self->{DATA}) eq 'HASH'){ # save open files
  338. for my $d (keys %{$self->{DATA}}){ # (to check pipe-conflicts)
  339. if (exists $self->{DATA}->{$d}->{file}){
  340. $self->{FILES}->{$self->{DATA}->{$d}->{file}}=1;
  341. }
  342. }
  343. }
  344. return $self->{DATA};
  345. }
  346. =head2 C<stdin>
  347. Check whether their is an input stream that can read from STDIN.
  348. =cut
  349. ##---------------------------------------------------------------------
  350. # stdin: returns input name if there is one that reads from stdin
  351. # (looks at {module => {stdin => '...'}}
  352. # and the definition of the input stream (check 'file' attr))
  353. # returns undef if no input defined that reads from STDIN
  354. sub stdin{
  355. my $self=shift;
  356. my $in=&GetParam($self->{CONFIG},'module','stdin');
  357. if (&GetParam($self->{CONFIG},'input',$in)){
  358. if (not &GetParam($self->{CONFIG},'input',$in,'file')){
  359. return $in;
  360. }
  361. }
  362. return undef;
  363. }
  364. =head2 C<stdout>
  365. Check whether their is an output stream that can write to STDOUT.
  366. =cut
  367. ##---------------------------------------------------------------------
  368. # stdout: same as stdin but for STDOUT
  369. sub stdout{
  370. my $self=shift;
  371. my $out=&GetParam($self->{CONFIG},'module','stdout');
  372. if (&GetParam($self->{CONFIG},'output',$out)){
  373. if (not &GetParam($self->{CONFIG},'output',$out,'file')){
  374. return $out;
  375. }
  376. }
  377. return undef;
  378. }
  379. =head2 FileInUse
  380. Checks whether a particular file is already in use in the current pipeline (avoids writing over files that a command still reads from).
  381. =cut
  382. sub FileInUse{
  383. my $self=shift;
  384. return $self->{FILES}->{$_[0]};
  385. }
  386. =head2 C<NewTempFile>
  387. Return a new temporary file (in data/runtime).
  388. =cut
  389. ##---------------------------------------------------------------------
  390. ## return a temporary file name (and touch it)
  391. #
  392. # TODO: use File::Temp instead
  393. sub NewTempFile{
  394. my $self=shift;
  395. my $count=0;
  396. my $temp = $self->{RUNTIMEDIR}.'/.temp';
  397. while (-e $temp.$count){
  398. $count++;
  399. }
  400. $temp.=$count;
  401. open F,">$temp";close F;
  402. push (@{$self->{TEMPFILES}},$temp);
  403. return $temp;
  404. }
  405. 1;
  406. __END__
  407. =head1 SEE ALSO
  408. L<uplug>, L<Uplug::Config>, L<Uplug::IO::Any>
  409. For the latest sources, language packs, additional modules and tools: Please, have a look at the project website at L<https://bitbucket.org/tiedemann/uplug>
  410. =cut