PageRenderTime 52ms CodeModel.GetById 24ms RepoModel.GetById 0ms app.codeStats 1ms

/tags/rel-0-2/tools/extractdata.pl

#
Perl | 478 lines | 365 code | 56 blank | 57 comment | 31 complexity | 4ebd477cefc9d43a98d9cb856e35e1ba MD5 | raw file
Possible License(s): GPL-2.0, LGPL-3.0, LGPL-2.1, GPL-3.0, CC-BY-SA-3.0
  1. #!/usr/bin/perl -w
  2. # $Revision: 1.11 $
  3. # the produced freedict-database.xml has the following schema:
  4. #
  5. # document element: FreeDictDatabase
  6. # attributes: none
  7. # children: dictionary*
  8. #
  9. # element: dictionary
  10. # children: release*
  11. # attributes:
  12. # @name language-combination, eg. eng-deu
  13. # @edition taken from TEI header, will be used as release version
  14. # @headwords `wc -l dictd-formatted-db.index`
  15. # @date last change of TEI file
  16. # @status contents of status note in TEI header, if available
  17. # @sourceURL URL in sourceDesc in TEI header (upstream project)
  18. # @notes unused
  19. # @HEADorRelease in CVS, unused
  20. # @maintainerName Maintainer name (without email) from
  21. # /TEI.2/fileDesc/titleStmt/respStmt/name[../resp='Maintainer']
  22. # @maintainerEmail Email address of Maintainer from same place
  23. # @unsupported space separated list of platforms, eg. "evolutionary bedic"
  24. #
  25. # element: release
  26. # children: none
  27. # attributes:
  28. # @platform allowed values: dict-tgz, dict-tbz2, mobi,
  29. # bedic, deb, rpm, gem, src, evolutionary
  30. # @version version of the dictionary this is a release of
  31. # @URL URL where this release can be downloaded
  32. # (additional click may be required by SourceForge)
  33. # @size size of this release in bytes
  34. # @date when this release was made, eg. 2004-12-25
  35. use FindBin;
  36. use Getopt::Std;
  37. use XML::DOM;
  38. use File::stat;
  39. use strict;
  40. our($opt_v, $opt_h, $opt_a, $opt_d, $opt_f, $opt_r, $opt_l);
  41. getopts('vhald:fr:');
  42. sub printd
  43. {
  44. return if !$opt_v;
  45. print @_;
  46. }
  47. my $FREEDICTDIR = $ENV{'FREEDICTDIR'} || "$FindBin::Bin/..";
  48. printd "Using FREEDICTDIR=$FREEDICTDIR\n";
  49. my $dbfile = "$FREEDICTDIR/freedict-database.xml";
  50. if($opt_h)
  51. {
  52. print <<EOT;
  53. $0 [options] (-a | -d <la1-la2>) [-r [<file>]]
  54. Gather metadata from TEI files in FreeDict file tree
  55. and save it in the XML file $dbfile.
  56. The location is taken from the environment variable
  57. FREEDICTDIR or, if that is not set, the parent directory
  58. of the script is taken, assuming the script resides
  59. in the tools subdirectory of the FreeDict file tree.
  60. Options:
  61. -h help & exit
  62. -v verbose
  63. -a extract metadata from all available databases
  64. -d extract data only from database la1-la2
  65. -f force update of extracted data from TEI file,
  66. even if its modification time is less than the last update
  67. -l leave $dbfile untouched
  68. -r extract released packages from a SourceForge file release
  69. HTML page. Uses STDIN if '-' given as filename.
  70. For FreeDict download:
  71. http://sourceforge.net/project/showfiles.php?group_id=1419
  72. EOT
  73. exit;
  74. }
  75. sub contains_dictionary
  76. {
  77. my($doc, $entry) = @_;
  78. my $nodes = $doc->getElementsByTagName("dictionary");
  79. my $n = $nodes->getLength;
  80. for(my $i = 0; $i < $n; $i++)
  81. {
  82. my $node = $nodes->item($i);
  83. my $name = $node->getAttributeNode("name");
  84. next unless $name;
  85. return $node if($name->getValue eq $entry);
  86. }
  87. return undef;
  88. }
  89. sub fdict_extract_metadata
  90. {
  91. my($dirname, $entry, $doc) = @_;
  92. printd " Getting metadata from dictionary in '$dirname/$entry'\n";
  93. my $docel = $doc->getDocumentElement();
  94. # find old dictionary element -> update
  95. my $d = contains_dictionary($doc, $entry);
  96. # else create new dictionary element
  97. if(!defined $d)
  98. {
  99. printd " Dictionary not found in database. Inserting it.\n";
  100. $docel->appendChild( $doc->createTextNode(" ") );
  101. $d = $doc->createElement('dictionary');
  102. $docel->appendChild($d);
  103. $docel->appendChild( $doc->createTextNode("\n") );
  104. $d->setAttribute('name', $entry);
  105. }
  106. ###################################################################
  107. my($headwords, $edition, $date, $status, $sourceURL, $maintainerName,
  108. $maintainerEmail, $unsupported);
  109. my $indexfile = "$dirname/$entry/$entry.index";
  110. if(!-r $indexfile)
  111. {
  112. system "cd $dirname/$entry && make $entry.index"
  113. or print STDERR " ERROR: Failed to remake $entry.index\n";
  114. }
  115. if(-r $indexfile)
  116. {
  117. my @a = split ' ', `wc -l "$indexfile"`;
  118. $headwords = (shift @a) - 8;# substract /00-?database.*/ entries
  119. printd " $headwords headwords\n";
  120. }
  121. else
  122. {
  123. print STDERR " Where is file '$indexfile'?\n";
  124. $headwords = "ERROR: Could not find $indexfile";
  125. }
  126. $d->setAttribute('headwords', $headwords);
  127. ###################################################################
  128. my $teifile = "$dirname/$entry/$entry.tei";
  129. if(!-r $teifile)
  130. {
  131. system "cd $dirname/$entry && make $teifile"
  132. or print STDERR " ERROR: Failed to remake $teifile\n";
  133. }
  134. if(-r $teifile)
  135. {
  136. my $s = stat $teifile;
  137. my @ss = localtime($s->mtime);
  138. $date = sprintf("%4d-%02d-%02d", $ss[5]+1900, $ss[4]+1, $ss[3]);
  139. if($date le $d->getAttribute('date') and !$opt_f)
  140. {
  141. printd " Skipping time consuming extraction steps for update (try -f).\n";
  142. return;
  143. }
  144. ###################################################################
  145. #$edition = `sabcmd xsl/getedition.xsl "$teifile"`;
  146. # the --no-print-directory switch is required if extractdata is
  147. # run from inside a Makefile
  148. $edition = `cd $dirname/$entry;make --no-print-directory version`;
  149. ###################################################################
  150. #$status = `sabcmd xsl/getstatus.xsl "$teifile"`;
  151. $status = `cd $dirname/$entry;make --no-print-directory status`;
  152. $status = 'unknown' if(!$status);
  153. ###################################################################
  154. #$sourceURL = `sabcmd xsl/getsourceurl.xsl "$teifile"`;
  155. $sourceURL = `cd $dirname/$entry;make --no-print-directory sourceURL`;
  156. ###################################################################
  157. #my $maintainer = `sabcmd xsl/getmaintainer.xsl "$teifile"`;
  158. my $maintainer = `cd $dirname/$entry;make --no-print-directory maintainer`;
  159. if($maintainer =~ /^([^<]+)\s<(.*)>$/)
  160. {
  161. $maintainerName = $1;
  162. $maintainerEmail = $2;
  163. #printd " Extracted maintainer: name='$maintainerName' email='$maintainerEmail'\n";
  164. }
  165. else
  166. {
  167. printd " Could not extract maintainer name or email from:\n" .
  168. "\t$maintainer\n";
  169. }
  170. ###################################################################
  171. $unsupported = `cd $dirname/$entry && make --no-print-directory print-unsupported`;
  172. printd " Failed to get info on unsupported platforms: $! $?\n" if(!defined $unsupported);
  173. ###################################################################
  174. }
  175. else
  176. {
  177. $edition = "ERROR: $teifile not readable";
  178. $date = $edition;
  179. $status = $edition;
  180. $sourceURL = $edition;
  181. }
  182. $d->setAttribute('edition', $edition);
  183. $d->setAttribute('date', $date);
  184. $d->setAttribute('status', $status);
  185. $d->setAttribute('sourceURL', $sourceURL);
  186. $d->setAttribute('maintainerName', $maintainerName);
  187. $d->setAttribute('maintainerEmail', $maintainerEmail);
  188. if(defined $unsupported && $unsupported =~ /[^\s]/)
  189. {
  190. $d->setAttribute('unsupported', $unsupported);
  191. }
  192. else
  193. {
  194. $d->removeAttribute('unsupported');
  195. }
  196. }
  197. sub fdict_extract_all_metadata
  198. {
  199. my($dirname, $doc) = @_;
  200. my($dir, $entry);
  201. printd "Getting metadata of all databases\n";
  202. opendir $dir, $dirname;
  203. while($entry = readdir($dir))
  204. {
  205. next if(! -d $dirname.'/'.$entry);
  206. next if($entry !~ '^(\p{IsAlpha}{3})-(\p{IsAlpha}{3})$');
  207. fdict_extract_metadata($dirname, $entry, $doc);
  208. }
  209. }
  210. ##################################################################
  211. sub fdict_extract_releases
  212. {
  213. my $doc = shift;
  214. my $docel = $doc->getDocumentElement();
  215. my $file = *STDIN;
  216. if($opt_r ne '-')
  217. {
  218. if(!open($file,'<', $opt_r))
  219. {
  220. print "Cannot read file '$opt_r'\n";
  221. exit;
  222. };
  223. };
  224. my @lines = <$file>;
  225. chomp foreach(@lines);
  226. my $line = join '', @lines;
  227. # tackle it with regexps
  228. my($packages, $filename, $size, $downloads, $URL);
  229. my @packs = split /<tr class="package">/, $line;
  230. shift @packs;# throw away garbage before first package
  231. # for all packages
  232. foreach(@packs)
  233. {
  234. $packages++;# counts packages
  235. $line = $_;
  236. warn " cannot find release number"
  237. if($line !~ /id="pkg\d+_\d+rel\d+_\d+">([\d\.]+)<\/a>/cg);
  238. my $release_version = $1;
  239. warn " cannot find release date"
  240. if($line !~ /otes<\/a>\] \(([\d\- :]+)\) <\/small>/cg);
  241. my $release_date = $1;
  242. printd "\n package $packages: release_number: '$release_version' " .
  243. "release_date: '$release_date'\n";
  244. # for all files of a release
  245. while($line =~ /<a href="(http:\/\/prdownloads\.sourceforge\.net\/freedict\/[^\?]{5,50}\?download)">([^<]{5,50})<\/a>/cg)
  246. {
  247. #printd "1: $1 2: $2"\n";
  248. #warn "cannot find filename" if($line !~ /\?download">([^<]*)<\/a><\/td>/cg);
  249. $filename = $2;
  250. $URL = $1;
  251. $size = -1;
  252. warn " cannot find size"
  253. if($line !~ /<td (class="even")?>(\d+)<\/td>/cg) or
  254. $size = $2;
  255. $downloads = -1;
  256. warn " cannot find downloads"
  257. if($line !~ /\">(\d*)<\/a><\/td>/cg);
  258. $downloads = $1;
  259. printd "\tfilename: $filename size: $size\n";
  260. ################################################################
  261. # find old dictionary element -> update
  262. my $name;
  263. if($filename =~ /^freedict-/) { $name = substr($filename, 9,7) }
  264. else { $name = substr($filename,0,7); }
  265. if($name !~ /^\w{3}-\w{3}$/)
  266. {
  267. printd "Invalid dictionary name '$name'. Skipping release.\n";
  268. next;
  269. }
  270. my $d = contains_dictionary($doc, $name);
  271. if(!$d)
  272. {
  273. print " Dictionary '$name' not in our database. Skipping release.\n";
  274. next;
  275. }
  276. # find platform by extracting it from filename
  277. # allowed values: dict-tgz, dict-tbz2, mobi, bedic, deb, rpm, gem, src
  278. my($platform, $fileversion, $sfn, $ssfn);
  279. # cut prefix "freedict-" if available
  280. if($filename =~ /^freedict-/) { $sfn = substr($filename, 9); }
  281. else { $sfn = $filename; }
  282. # cut language combination
  283. $ssfn = substr($sfn, 7);
  284. # cut a minus sign. if available
  285. if($ssfn =~ /^-/) { $ssfn = substr($ssfn, 1); }
  286. if($ssfn =~ /^\.tar\.gz/)
  287. { $platform = 'dict-tgz'; }
  288. elsif($ssfn =~ /^\d{1,3}\.\d{1,3}(\.\d{1,3})?\.tar\.gz/)
  289. { $platform = 'dict-tgz'; }
  290. elsif($ssfn =~ /^\.tar\.bz2/)
  291. { $platform = 'dict-tbz2'; }
  292. elsif($ssfn =~ /^\d{1,3}\.\d{1,3}(\.\d{1,3})?\.tar\.bz2/)
  293. { $platform = 'dict-tbz2'; }
  294. elsif($ssfn =~ /\.dic\.dz/)
  295. # eg. freedict-kha-deu-0.0.1.dic.dz
  296. { $platform = 'bedic'; }
  297. elsif($ssfn =~ /\.ipk/)
  298. # eg. freedict-kha-deu-0.0.1.ipk
  299. { $platform = 'zbedic'; }
  300. elsif($ssfn =~ /\.evolutionary\.zip/)
  301. # eg. freedict-afr-eng-0.1.evolutionary.zip
  302. { $platform = 'evolutionary'; }
  303. elsif($ssfn =~ /\d{1,3}\.\d{1,3}(\.\d{1,3})?\.src(\.tar)?\.bz2/)
  304. { $platform = 'src'; }
  305. elsif($ssfn =~ /^\d{1,3}\.\d{1,3}(\.\d{1,3})?-(\w+)\.noarch\.rpm/)
  306. # eg. freedict-kha-deu-0.0.1-1.noarch.rpm
  307. { $platform = 'rpm'; }
  308. elsif($ssfn =~ /^\d{1,3}\.\d{1,3}(\.\d{1,3})?-(\w+)\.[\w\.]+/)
  309. { $platform = $2; }
  310. else
  311. {
  312. print "Cannot make sense of filename '$filename'. Skip.\n";
  313. next;
  314. }
  315. # find old release element
  316. my $r;
  317. for my $kid ($d->getElementsByTagName('release'))
  318. {
  319. if($kid->getAttribute('platform') eq $platform)
  320. {
  321. $r = $kid; last;# found
  322. }
  323. }
  324. # create new release element if no previous found
  325. if(!$r)
  326. {
  327. print "+\tRelease not found in database. Inserting it.\n";
  328. $d->appendChild( $doc->createTextNode("\n") ) if( ! @{ ($d->getChildNodes) } );
  329. $d->appendChild( $doc->createTextNode(" ") );
  330. $r = $doc->createElement('release');
  331. $d->appendChild($r);
  332. $d->appendChild( $doc->createTextNode("\n") );
  333. $r->setAttribute('platform', $platform);
  334. }
  335. # if $r refers to an older release than available in the database,
  336. # don't update the database
  337. # $release_version = "0.0.1" if($release_version eq "");
  338. # next if($r->getAttribute('version') ge $release_version);
  339. printd "+\tUpdating release for $platform platform. Old: '" .
  340. $r->getAttribute('version') . "' New: '$release_version'\n";
  341. $r->setAttribute('version', $release_version);
  342. $r->setAttribute('URL', $URL);
  343. $r->setAttribute('size', $size);
  344. $r->setAttribute('date', substr($release_date,0,10));
  345. } # while
  346. } # while
  347. }
  348. ##################################################################
  349. if($opt_d && $opt_a)
  350. {
  351. print STDERR "Only one of -d and -a may be given at the same time.\n";
  352. exit;
  353. }
  354. if(!$opt_d && !$opt_a && !$opt_r)
  355. {
  356. print STDERR "One of -h, -d, -a or -r must be given.\n";
  357. exit;
  358. }
  359. my $parser = new XML::DOM::Parser;
  360. my $doc;
  361. if(-s $dbfile)
  362. {
  363. $doc = $parser->parsefile ($dbfile);
  364. printd "Successfully read $dbfile.\n";
  365. my $nodes = $doc->getElementsByTagName("dictionary");
  366. my $n = $nodes->getLength;
  367. printd "$n dictionary/-ies in my database.\n";
  368. }
  369. else
  370. {
  371. printd "Creating new database.\n";
  372. $doc = new XML::DOM::Document;
  373. $doc->appendChild( $doc->createElement('FreeDictDatabase') );
  374. }
  375. fdict_extract_metadata($FREEDICTDIR, $opt_d, $doc) if $opt_d;
  376. fdict_extract_all_metadata($FREEDICTDIR, $doc) if $opt_a;
  377. fdict_extract_releases($doc) if $opt_r;
  378. if($opt_l)
  379. {
  380. printd "Leaving $dbfile untouched.\n";
  381. exit(0);
  382. }
  383. # Write out freedict-database.xml
  384. `cp $dbfile $dbfile.bak` if(-s $dbfile);
  385. printd "Writing $dbfile\n";
  386. $SIG{INT} = 'IGNORE';
  387. $doc->printToFile ($dbfile);
  388. $SIG{INT} = 'DEFAULT';