/scripts/Bio-DB-GFF/bp_bulk_load_gff.pl

http://github.com/bioperl/bioperl-live · Perl

#!/usr/bin/perl

use strict;
use warnings;
# use lib './blib/lib';
use DBI;
use IO::File;
use File::Spec;
use Getopt::Long;
use Bio::DB::GFF;
use Bio::DB::GFF::Util::Binning 'bin';

use constant MYSQL => 'mysql';

use constant FDATA                 => 'fdata';
use constant FTYPE                 => 'ftype';
use constant FGROUP                => 'fgroup';
use constant FDNA                  => 'fdna';
use constant FATTRIBUTE            => 'fattribute';
use constant FATTRIBUTE_TO_FEATURE => 'fattribute_to_feature';

=head1 NAME

bp_bulk_load_gff.pl - Bulk-load a Bio::DB::GFF database from GFF files.

=head1 SYNOPSIS

  % bp_bulk_load_gff.pl -d testdb dna1.fa dna2.fa features1.gff features2.gff ...

=head1 DESCRIPTION

This script loads a Bio::DB::GFF database with the features contained
in a list of GFF files and/or FASTA sequence files. You must use the
exact variant of GFF described in L<Bio::DB::GFF>. Various
command-line options allow you to control which database to load and
whether to allow an existing database to be overwritten.

This script differs from bp_load_gff.pl in that it is hard-coded to use
MySQL and cannot perform incremental loads. See L<bp_load_gff.pl> for an
incremental loader that works with all databases supported by
Bio::DB::GFF, and L<bp_fast_load_gff.pl> for a MySQL loader that supports
fast incremental loads.

=head2 NOTES

If the filename is given as "-" then the input is taken from standard
input. Compressed files (.gz, .Z, .bz2) are automatically
uncompressed.

FASTA format files are distinguished from GFF files by their filename
extensions. Files ending in .fa, .fasta, .fast, .seq, .dna and their
uppercase variants are treated as FASTA files. Everything else is
treated as a GFF file. If you wish to load FASTA files from STDIN,
use the -f command-line switch with an argument of '-', as in

  gunzip -c my_data.fa.gz | bp_bulk_load_gff.pl -d test -f -

The nature of the bulk load requires that the database be on the local
machine and that the indicated user have the "file" privilege to load
the tables and have enough room in /usr/tmp (or whatever is specified
by the $TMPDIR environment variable) to hold the tables transiently.

Local data may now be uploaded to a remote server via the --local option
with the database host specified in the dsn, e.g. dbi:mysql:test:db_host

The adaptor used is dbi::mysqlopt. There is currently no way to
change this.

About maxfeature: the default value is 100,000,000 bases. If you have
features that are close to or greater than 100Mb in length, then the
value of maxfeature should be increased to 1,000,000,000. This value
must be a power of 10.
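
For example, to raise the limit when loading a hypothetical database
named "testdb" (the file names here are just placeholders):

  % bp_bulk_load_gff.pl -d testdb --maxfeature 1000000000 genome.fa features.gff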

Note that Windows users must use the --create option.

If the list of GFF or fasta files exceeds the kernel limit for the
maximum number of command-line arguments, use the
--long_list /path/to/files option.
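
For instance, assuming the GFF and FASTA files have been collected into
a single directory (the path below is only an illustration):

  % bp_bulk_load_gff.pl -d testdb --long_list /data/gff_files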

=head1 COMMAND-LINE OPTIONS

Command-line options can be abbreviated to single-letter options.
e.g. -d instead of --database.

   --database <dsn>      Database name (default dbi:mysql:test)
   --adaptor             Adaptor name (default mysql)
   --create              Reinitialize/create data tables without asking
   --user                Username to log in as
   --fasta               File or directory containing fasta files to load
   --long_list           Directory containing a very large number of
                         GFF and/or FASTA files
   --password            Password to use for authentication
                         (Does not work with Postgres, password must be
                         supplied interactively or be left empty for
                         ident authentication)
   --maxbin              Set the value of the maximum bin size
   --local               Flag to indicate that the data source is local
   --maxfeature          Set the value of the maximum feature size (power of 10)
   --group               A list of one or more tag names (comma or space separated)
                         to be used for grouping in the 9th column.
   --gff3_munge          Activate GFF3 name munging (see Bio::DB::GFF)
   --summary             Generate summary statistics for drawing coverage histograms.
                         This can be run on a previously loaded database or during
                         the load.
   --Temporary           Location of a writable scratch directory
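
As an illustration (the database and file names below are made up), the
following reinitializes a database, loads a directory of FASTA files
alongside two GFF files, and groups features on the Gene and Transcript tags:

  % bp_bulk_load_gff.pl --database dbi:mysql:testdb --create \
        --fasta /data/fasta --group Gene,Transcript features1.gff features2.gff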

=head1 SEE ALSO

L<Bio::DB::GFF>, L<bp_fast_load_gff.pl>, L<bp_load_gff.pl>

=head1 AUTHOR

Lincoln Stein, lstein@cshl.org

Copyright (c) 2002 Cold Spring Harbor Laboratory

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself. See DISCLAIMER.txt for
disclaimers of warranty.

=cut
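
# The "faux" adaptor subclasses below override insert_sequence() so that,
# instead of writing DNA into the database directly, each segment is printed
# as a tab-delimited row to the currently selected filehandle (the fdna
# scratch file selected later in the script) for bulk loading.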
package Bio::DB::GFF::Adaptor::fauxmysql;

use Bio::DB::GFF::Adaptor::dbi::mysqlopt;
use vars '@ISA';
@ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlopt';

sub insert_sequence {
    my $self = shift;
    my ($id,$offset,$seq) = @_;
    print join("\t",$id,$offset,$seq),"\n";
}

package Bio::DB::GFF::Adaptor::fauxmysqlcmap;

use Bio::DB::GFF::Adaptor::dbi::mysqlcmap;
use vars '@ISA';
@ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlcmap';

sub insert_sequence {
    my $self = shift;
    my ($id,$offset,$seq) = @_;
    print join("\t",$id,$offset,$seq),"\n";
}

package Bio::DB::GFF::Adaptor::fauxpg;

use Bio::DB::GFF::Adaptor::dbi::pg;
use vars '@ISA';
@ISA = 'Bio::DB::GFF::Adaptor::dbi::pg';
# these two subs are to separate the table creation from the
# index creation
sub do_initialize {
    my $self  = shift;
    my $erase = shift;
    $self->drop_all if $erase;

    my $dbh    = $self->features_db;
    my $schema = $self->schema;
    foreach my $table_name ($self->tables) {
        my $create_table_stmt = $schema->{$table_name}{table};
        $dbh->do($create_table_stmt) || warn $dbh->errstr;
        # $self->create_other_schema_objects(\%{$schema->{$table_name}});
    }
    1;
}

sub _create_indexes_etc {
    my $self   = shift;
    my $dbh    = $self->features_db;
    my $schema = $self->schema;
    foreach my $table_name ($self->tables) {
        $self->create_other_schema_objects(\%{$schema->{$table_name}});
    }
}

sub insert_sequence {
    my $self = shift;
    my ($id,$offset,$seq) = @_;
    print "$id\t$offset\t$seq\n";
}
package main;

eval "use Time::HiRes"; undef $@;
my $timer = defined &Time::HiRes::time;

my $bWINDOWS = 0;    # Boolean: is this a MSWindows operating system?
if ($^O =~ /MSWin32/i) {
    $bWINDOWS = 1;
}
my ($DSN,$ADAPTOR,$FORCE,$USER,$PASSWORD,$FASTA,$LOCAL,$MAX_BIN,$GROUP_TAG,$LONG_LIST,$MUNGE,$TMPDIR);

GetOptions ('database:s'          => \$DSN,
            'adaptor:s'           => \$ADAPTOR,
            'create'              => \$FORCE,
            'user:s'              => \$USER,
            'password:s'          => \$PASSWORD,
            'fasta:s'             => \$FASTA,
            'local'               => \$LOCAL,
            'maxbin|maxfeature:s' => \$MAX_BIN,
            'group:s'             => \$GROUP_TAG,
            'long_list:s'         => \$LONG_LIST,
            'gff3_munge'          => \$MUNGE,
            'Temporary:s'         => \$TMPDIR,
           ) or (system('pod2text', $0), exit -1);
# If called as pg_bulk_load_gff.pl, behave as that script did.
if ($0 =~ /pg_bulk_load_gff\.pl/) {
    $ADAPTOR ||= 'Pg';
    $DSN     ||= 'test';
}

$DSN     ||= 'dbi:mysql:test';
$MAX_BIN ||= 1_000_000_000;    # to accommodate human-sized chromosomes

if ($bWINDOWS && not $FORCE) {
    die "Note that Windows users must use the --create option.\n";
}

unless ($FORCE) {
    die "This will delete all existing data in database $DSN. If you want to do this, rerun with the --create option.\n"
        if $bWINDOWS;
    # on Windows the die above fires first, so the TTY prompt below is never reached
    open (TTY,"/dev/tty") or die "/dev/tty: $!\n";
    print STDERR "This operation will delete all existing data in database $DSN. Continue? ";
    my $f = <TTY>;
    die "Aborted\n" unless $f =~ /^[yY]/;
    close TTY;
}
# postgres DBD::Pg allows 'database', but also 'dbname' and 'db',
# and it must be Pg (not pg)
$DSN =~ s/pg:database=/Pg:/i;
$DSN =~ s/pg:dbname=/Pg:/i;
$DSN =~ s/pg:db=/Pg:/i;

# leave these lines for mysql
$DSN =~ s/database=//i;
$DSN =~ s/;host=/:/i;    # cater for dsn in the form of "dbi:mysql:database=$dbname;host=$host"

my ($DBI,$DBD,$DBNAME,$HOST) = split /:/,$DSN;
$DBNAME = $DSN unless $DSN =~ /:/;
$ADAPTOR ||= $DBD;
$ADAPTOR ||= 'mysql';

if ($DBD eq 'Pg') {
    # rebuild DSN; DBD::Pg requires the full dbname=<name> format
    $DSN = "dbi:Pg:dbname=$DBNAME";
    if ($HOST) { $DSN .= ";host=$HOST"; }
}
my ($use_mysql,$use_mysqlcmap,$use_pg) = (0,0,0);
if ( $ADAPTOR eq 'mysqlcmap' ) {
    $use_mysqlcmap = 1;
}
elsif ( $ADAPTOR =~ /^mysql/ ) {
    $use_mysql = 1;
}
elsif ( $ADAPTOR eq "Pg" ) {
    $use_pg = 1;
}
else {
    die "$ADAPTOR is not an acceptable database adaptor.";
}
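
# Build both the Bio::DB::GFF authentication arguments (@auth) and the
# command-line authentication string ($AUTH) later passed to the
# mysql/psql client programs.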
my (@auth,$AUTH);
if (defined $USER) {
    push @auth,(-user=>$USER);
    if ( $use_mysql or $use_mysqlcmap ) {
        $AUTH .= " -u$USER";
    }
    elsif ( $use_pg ) {
        $AUTH .= " -U $USER ";
    }
}
if (defined $PASSWORD) {
    push @auth,(-pass=>$PASSWORD);
    if ( $use_mysql or $use_mysqlcmap ) {
        $AUTH .= " -p$PASSWORD";
    }
    # elsif ( $use_pg ) {
    #     $AUTH .= " -W $PASSWORD ";
    # }
}

if (defined $HOST) {
    $AUTH .= " -h$HOST";
}
if (defined $DBNAME) {
    if ( $use_mysql or $use_mysqlcmap ) {
        $AUTH .= " -D$DBNAME ";
    }
}
if (defined $LOCAL) {
    $LOCAL = 'local';
    $AUTH .= ' --local-infile=1';
} else {
    $LOCAL = '';
}
my $faux_adaptor;
if ( $use_mysqlcmap ) {
    $faux_adaptor = "fauxmysqlcmap";
}
elsif ( $use_mysql ) {
    $faux_adaptor = "fauxmysql";
}
elsif ( $use_pg ) {
    $faux_adaptor = "fauxpg";
}

my $db = Bio::DB::GFF->new(-adaptor=>$faux_adaptor,-dsn => $DSN,@auth)
    or die "Can't open database: ",Bio::DB::GFF->error,"\n";

$db->gff3_name_munging(1) if $MUNGE;

$MAX_BIN ? $db->initialize(-erase=>1,-MAX_BIN=>$MAX_BIN) : $db->initialize(1);
$MAX_BIN ||= $db->meta('max_bin') || 100_000_000;
# deal with really long lists of files
if ($LONG_LIST) {
    -d $LONG_LIST or die "The --long_list argument must be a directory\n";
    opendir GFFDIR,$LONG_LIST or die "Could not open $LONG_LIST for reading: $!";
    @ARGV = map { "$LONG_LIST/$_" } readdir GFFDIR;
    closedir GFFDIR;

    if (defined $FASTA && -d $FASTA) {
        opendir FASTA,$FASTA or die "Could not open $FASTA for reading: $!";
        push @ARGV, map { "$FASTA/$_" } readdir FASTA;
        closedir FASTA;
    }
    elsif (defined $FASTA && -f $FASTA) {
        push @ARGV, $FASTA;
    }
}
foreach (@ARGV) {
    $_ = "gunzip -c $_ |"     if /\.gz$/;
    $_ = "uncompress -c $_ |" if /\.Z$/;
    $_ = "bunzip2 -c $_ |"    if /\.bz2$/;
}

my (@gff,@fasta);
foreach (@ARGV) {
    if (/\.(fa|fasta|dna|seq|fast)(?:$|\.)/i) {
        push @fasta,$_;
    } else {
        push @gff,$_;
    }
}
@ARGV = @gff;
push @fasta,$FASTA if defined $FASTA;
# drop everything that was there before
my %FH;
my $tmpdir = $TMPDIR || File::Spec->tmpdir() || '/tmp';
$tmpdir =~ s!\\!\\\\!g if $bWINDOWS;    # eliminates backslash mis-interpretation
-d $tmpdir or die <<END;
I could not find a suitable temporary directory to write scratch files into ($tmpdir by default).
Please select a directory and indicate its location by setting the TMP environment variable, or
by using the --Temporary switch.
END

my @fasta_files_to_be_unlinked;

my @files = (FDATA,FTYPE,FGROUP,FDNA,FATTRIBUTE,FATTRIBUTE_TO_FEATURE);
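
# Open one tab-delimited scratch file per table in the temporary directory;
# these files are bulk-loaded at the end via LOAD DATA INFILE (MySQL) or
# psql/COPY (Postgres).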
foreach (@files) {
    $FH{$_} = IO::File->new(">$tmpdir/$_.$$") or die $_,": $!";
    $FH{$_}->autoflush;
}

if ( $use_pg ) {
    $FH{FDATA()                }->print("COPY fdata (fid, fref, fstart, fstop, fbin, ftypeid, fscore, fstrand, fphase, gid, ftarget_start, ftarget_stop) FROM stdin;\n");
    $FH{FTYPE()                }->print("COPY ftype (ftypeid, fmethod, fsource) FROM stdin;\n");
    $FH{FGROUP()               }->print("COPY fgroup (gid, gclass, gname) FROM stdin;\n");
    $FH{FATTRIBUTE()           }->print("COPY fattribute (fattribute_id, fattribute_name) FROM stdin;\n");
    $FH{FATTRIBUTE_TO_FEATURE()}->print("COPY fattribute_to_feature (fid, fattribute_id, fattribute_value) FROM stdin;\n");
}
my $FID         = 1;
my $GID         = 1;
my $FTYPEID     = 1;
my $ATTRIBUTEID = 1;
my %GROUPID     = ();
my %FTYPEID     = ();
my %ATTRIBUTEID = ();
my %DONE        = ();
my $FEATURES    = 0;

my %tmpfiles;    # keep track of temporary fasta files
my $count;
my $fasta_sequence_id;
my $gff3;
my $current_file;    # used to reset GFF3 flag in mix of GFF and GFF3 files

$db->preferred_groups(split (/[,\s]+/,$GROUP_TAG)) if defined $GROUP_TAG;
# "my $x = ... if ..." has unspecified behavior, so assign conditionally instead
my $last  = $timer ? Time::HiRes::time() : undef;
my $start = $last;

# avoid hanging on standalone --fasta load
if (!@ARGV) {
    $FH{NULL} = IO::File->new(">$tmpdir/null");
    push @ARGV, "$tmpdir/null";
}
my $cmap_db;
if ($use_mysqlcmap) {
    my $options = {
        AutoCommit       => 1,
        FetchHashKeyName => 'NAME_lc',
        LongReadLen      => 3000,
        LongTruncOk      => 1,
        RaiseError       => 1,
    };
    $cmap_db = DBI->connect( $DSN, $USER, $PASSWORD, $options );
}

# Only load CMap::Utils if using cmap
unless (!$use_mysqlcmap or
        eval {
            require Bio::GMOD::CMap::Utils;
            Bio::GMOD::CMap::Utils->import('next_number');
            1;
        }
       ) {
    print STDERR "Error loading Bio::GMOD::CMap::Utils\n";
}
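
# Main parse loop: read GFF (and embedded FASTA) from the files on @ARGV,
# assign integer IDs to groups, types and attributes, and stream one
# tab-delimited row per feature into the per-table scratch files opened above.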
while (<>) {
    $current_file ||= $ARGV;

    # reset GFF3 flag if new filehandle
    unless ($current_file eq $ARGV) {
        undef $gff3;
        $current_file = $ARGV;
    }

    chomp;
    my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group);

    # close sequence filehandle if required
    if ( /^\#|\s+|^$|^>|\t/ && defined $FH{FASTA}) {
        $FH{FASTA}->close;
        delete $FH{FASTA};
    }

    # print to fasta file if the handle is open
    if ( defined $FH{FASTA} ) {
        $FH{FASTA}->print("$_\n");
        next;
    }
    elsif (/^>(\S+)/) {    # uh oh, sequence coming
        $FH{FASTA} = IO::File->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n";
        $FH{FASTA}->print("$_\n");
        print STDERR "Preparing embedded sequence $1\n";
        push @fasta, "$tmpdir/$1\.fa";
        push @fasta_files_to_be_unlinked, "$tmpdir/$1\.fa";
        $tmpfiles{"$tmpdir/$1\.fa"}++;
        next;
    }
    elsif (/^\#\#\s*gff-version\s+(\d+)/) {
        $gff3 = ($1 >= 3);
        $db->print_gff3_warning() if $gff3;
        next;
    }
    elsif (/^\#\#\s*group-tags\s+(.+)/) {
        $db->preferred_groups(split(/\s+/,$1));
        next;
    }
    elsif (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) {    # header line
        ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) =
            ($1,'reference','Component',$2,$3,'.','.','.',$gff3 ? "ID=Sequence:$1" : qq(Sequence "$1"));
    }
    elsif (/^\#/) {
        next;
    }
    else {
        ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
    }

    if ( not defined($ref) or length($ref) == 0 ) {
        warn "\$ref is null. source = $source, method = $method, group = $group\n";
        next;
    }

    $FEATURES++;
    my $size = $stop - $start + 1;
    warn "Feature $group ($size) is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n" if $size > $MAX_BIN;

    $source = '\N' unless defined $source;
    $score  = '\N' if $score  eq '.';
    $strand = '\N' if $strand eq '.';
    $phase  = '\N' if $phase  eq '.';

    my ($group_class,$group_name,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3);

    # GFF2/3 transition
    $group_class = [$group_class] unless ref $group_class;
    $group_name  = [$group_name]  unless ref $group_name;

    for (my $i=0; $i < @$group_name; $i++) {
        $group_class->[$i] ||= '\N';
        $group_name->[$i]  ||= '\N';
        $target_start      ||= '\N';
        $target_stop       ||= '\N';
        $method            ||= '\N';
        $source            ||= '\N';

        my $fid     = $FID++;
        my $gid     = $GROUPID{lc join('',$group_class->[$i],$group_name->[$i])} ||= $GID++;
        my $ftypeid = $FTYPEID{lc join('',$source,$method)}                      ||= $FTYPEID++;
        my $bin     = bin($start,$stop,$db->min_bin);

        $FH{ FDATA() }->print( join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n" );

        if ($use_mysqlcmap) {
            my $feature_id = next_number(
                db         => $cmap_db,
                table_name => 'cmap_feature',
                id_field   => 'feature_id',
            ) or die 'No feature id';
            my $direction = $strand eq '-' ? -1 : 1;
            $FH{ FGROUP() }->print(
                join("\t",$feature_id,$feature_id,'NULL',0,$group_name->[$i],0,0,'NULL',1,$direction,$group_class->[$i],)
                ,"\n"
            ) unless $DONE{"G$gid"}++;
        }
        else {
            $FH{ FGROUP() }->print( join("\t",$gid,$group_class->[$i],$group_name->[$i]),"\n" ) unless $DONE{"G$gid"}++;
        }

        $FH{ FTYPE() }->print( join("\t",$ftypeid,$method,$source),"\n" ) unless $DONE{"T$ftypeid"}++;

        foreach (@$attributes) {
            my ($key,$value) = @$_;
            my $attributeid = $ATTRIBUTEID{$key} ||= $ATTRIBUTEID++;
            $FH{ FATTRIBUTE() }->print( join("\t",$attributeid,$key),"\n" ) unless $DONE{"A$attributeid"}++;
            $FH{ FATTRIBUTE_TO_FEATURE() }->print( join("\t",$fid,$attributeid,$value),"\n" );
        }
        if ( $fid % 1000 == 0 ) {
            my $now     = $timer ? Time::HiRes::time() : undef;
            my $elapsed = $timer ? sprintf(" in %5.2fs",$now - $last) : '';
            $last = $now;
            print STDERR "$fid features parsed$elapsed...";
            print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
        }
    }
}
$FH{FASTA}->close if exists $FH{FASTA};

for my $file (@fasta) {
    warn "Preparing DNA file $file....\n";
    if ($use_pg) {
        $FH{FDNA() }->print("COPY fdna (fref, foffset, fdna) FROM stdin;\n");
    }
    my $old = select($FH{FDNA()});
    $db->load_fasta($file) or warn "Couldn't load fasta file $file: $!";
    if ($use_pg) {
        $FH{FDNA() }->print("\\.\n\n");
    }
    warn "done...\n";
    select $old;
    unlink $file if $tmpfiles{$file};
}
if ($use_pg) {
    $FH{FDATA()                }->print("\\.\n\n");
    $FH{FTYPE()                }->print("\\.\n\n");
    $FH{FGROUP()               }->print("\\.\n\n");
    $FH{FATTRIBUTE()           }->print("\\.\n\n");
    $FH{FATTRIBUTE_TO_FEATURE()}->print("\\.\n\n");
}

$_->close foreach values %FH;
printf STDERR "Total parse time %5.2fs\n",(Time::HiRes::time() - $start) if $timer;

warn "Loading feature data and analyzing tables. You may see RDBMS messages here...\n";

if ($use_pg) {
    warn "Loading feature data. You may see Postgres comments...\n";
    foreach (@files) {
        my $file = "$tmpdir/$_.$$";
        $AUTH ? system("psql $AUTH -f $file $DBNAME")
              : system('psql','-f', $file, $DBNAME);
        unlink $file;
    }
    warn "Updating sequences ...\n";
    $db->update_sequences();
    warn "Creating indexes ...\n";
    $db->_create_indexes_etc();
    warn "done...\n";
}
elsif ( $use_mysql or $use_mysqlcmap ) {
    $start = time();
    my $success = 1;
    my $TERMINATEDBY = $bWINDOWS ? q( LINES TERMINATED BY '\r\n') : '';
    for my $f (@files) {
        my $table = function_to_table($f,$ADAPTOR);
        my $sql = join ('; ',
                        "lock tables $table write",
                        "delete from $table",
                        "load data $LOCAL infile '$tmpdir/$f.$$' replace into table $table $TERMINATEDBY",
                        "unlock tables");
        my $command = MYSQL . qq[$AUTH -s -e "$sql"];
        $command =~ s/\n/ /g;
        $success &&= system($command) == 0;
        unlink "$tmpdir/$f.$$";
    }
    printf STDERR "Total load time %5.2fs\n",(time() - $start) if $timer;
    print STDERR "done...\n";

    print STDERR "Analyzing/optimizing tables. You will see database messages...\n";
    $start = time();
    my $sql = '';
    for my $f (@files) {
        my $table = function_to_table($f,$ADAPTOR);
        $sql .= "analyze table $table;";
    }
    my $command = MYSQL . qq[$AUTH -N -s -e "$sql"];
    $success &&= system($command) == 0;
    printf STDERR "Optimization time %5.2fs\n",(time() - $start);

    if ($success) {
        print "$FEATURES features successfully loaded\n";
    } else {
        print "FAILURE: Please see standard error for details\n";
        exit -1;
    }
}
foreach (@fasta_files_to_be_unlinked) {
    unlink $_;    # entries already hold the full "$tmpdir/<name>.fa" path
}
warn "Building summary statistics for coverage histograms...\n";
my (@args,$AUTH);
if (defined $USER) {
    push @args,(-user=>$USER);
    $AUTH .= " -u$USER";
}
if (defined $PASSWORD) {
    push @args,(-pass=>$PASSWORD);
    $AUTH .= " -p$PASSWORD";
}
push @args,(-preferred_groups=>[split(/[,\s]+/,$GROUP_TAG)]) if defined $GROUP_TAG;
my $db = Bio::DB::GFF->new(-adaptor=>"dbi::$ADAPTOR",-dsn => $DSN,@args)
    or die "Can't open database: ",Bio::DB::GFF->error,"\n";
$db->build_summary_statistics;

exit 0;
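
# Map a scratch-file basename (fdata, ftype, fgroup, ...) to the database
# table it should be loaded into; the cmap adaptor stores groups in
# cmap_feature instead of fgroup.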
sub function_to_table {
    my $function = shift;
    my $adaptor  = shift;

    if ($function eq 'fdata') {
        return 'fdata';
    }
    elsif ($function eq 'ftype') {
        return 'ftype';
    }
    elsif ($function eq 'fgroup') {
        return 'cmap_feature' if ($adaptor eq 'mysqlcmap');
        return 'fgroup';
    }
    elsif ($function eq 'fdna') {
        return 'fdna';
    }
    elsif ($function eq 'fattribute') {
        return 'fattribute';
    }
    elsif ($function eq 'fattribute_to_feature') {
        return 'fattribute_to_feature';
    }
    return '';
}

__END__