PageRenderTime 60ms CodeModel.GetById 23ms RepoModel.GetById 0ms app.codeStats 0ms

/scripts/tr_import_members_from_fasta.pl

https://github.com/jestill/iplant-treerec
Perl | 958 lines | 743 code | 108 blank | 107 comment | 29 complexity | 223f53728488365e71ca9aa72e120a40 MD5 | raw file
  1. #!/usr/bin/perl -w
  2. #-----------------------------------------------------------+
  3. # |
  4. # tr_import_members_from_fasta.pl |
  5. # |
  6. #-----------------------------------------------------------+
  7. # |
  8. # AUTHOR: James C. Estill |
  9. # CONTACT: JamesEstill_@_gmail.com |
  10. # STARTED: 09/28/2010 |
  11. # UPDATED: 04/12/2011 |
  12. # VERSION: $Rev: 613 $ |
  13. # |
  14. # DESCRIPTION: |
  15. # Load the members table with information from a FASTA |
  16. # file. If the fasta files are tagged by the |
  17. # LocusID_SourceTaxa system, these data can be used to |
  18. # update the following fields: |
  19. # * member_id |
  20. # * stable_id -- as the internal project locus id |
  21. # * taxon_id --- as the genbank id |
  22. # When given a directory, will load all of the fasta |
  23. # files in the directory and when given a single fasta |
  24. # file will only load the fasta file. |
  25. # |
  26. # USAGE: |
  27. # tr_import_members_from_fasta.pl -i infile.fasta |
  28. # tr_import_members_from_fasta.pl -i indir/ |
  29. # |
  30. # LICENSE: |
  31. # Simplified BSD License |
  32. # http://tinyurl.com/iplant-tr-license |
  33. #-----------------------------------------------------------+
  34. # To test import:
  35. # ./tr_import_members_from_fasta.pl -i sandbox/ --driver mysql --dbname tr_test --host localhost --verbose
  36. #
  37. # For the full import
  38. # ./tr_import_members_from_fasta.pl ../bowers_clusters/clusters/aa_seqs/ --driver mysql --dbname tr_test --host localhost --verbose
  39. #
  40. #-----------------------------+
  41. # INCLUDES |
  42. #-----------------------------+
  43. use strict;
  44. use DBI;
  45. use Getopt::Long;
  46. use Bio::SeqIO; # Read and write seq files in different formats
  47. use Cwd; # Get the current working directory
  48. use File::Basename; # Use this to extract base name from file path
  49. # The following needed for printing help
  50. use Pod::Select; # Print subsections of POD documentation
  51. use Pod::Text; # Print POD doc as formatted text file
  52. use IO::Scalar; # For print_help subfunction
  53. use IO::Pipe; # Pipe for STDIN, STDOUT for POD docs
  54. use File::Spec; # Convert a relative path to an abosolute path
  55. #-----------------------------+
  56. # VARIABLES |
  57. #-----------------------------+
  58. my ($VERSION) = q$Rev: 613 $ =~ /(\d+)/;
  59. # Get command-line arguments, or die with a usage statement
  60. my $in_path; # Input path can be a directory or file
  61. # or STDIN
  62. my $in_format = "fasta"; # The expected format of input
  63. my $seq_count = 0;
  64. # DATABASE VARS
  65. my $db; # Database name (ie. iplant_tr)
  66. my $host; # Database host (ie. localhost)
  67. my $driver; # Database driver (ie. mysql)
  68. my $statement; # Database statement
  69. my $sth; # Database statement handle
  70. # OPTIONS SET IN USER ENVIRONMENT
  71. my $usrname = $ENV{TR_USERNAME}; # User name to connect to database
  72. my $pass = $ENV{TR_PASSWORD}; # Password to connect to database
  73. my $dsn = $ENV{TR_DSN}; # DSN for database connection
  74. # BOOLEANS
  75. my $quiet = 0;
  76. my $verbose = 0;
  77. my $show_help = 0;
  78. my $show_usage = 0;
  79. my $show_man = 0;
  80. my $show_version = 0;
  81. my $do_test = 0; # Run the program in test mode
  82. #-----------------------------+
  83. # COMMAND LINE OPTIONS |
  84. #-----------------------------+
  85. my $ok = GetOptions(# REQUIRED OPTIONS
  86. "i|infile|indir=s" => \$in_path,
  87. "f|format=s" => \$in_format,
  88. # "s|species=s" => \$species_tree_name,
  89. # "c|cluster=s" => \$cluster_set_name,
  90. # DSN
  91. "d|dsn=s" => \$dsn,
  92. # ALTERNATIVE TO --dsn
  93. "driver=s" => \$driver,
  94. "dbname=s" => \$db,
  95. "host=s" => \$host,
  96. # THE FOLLOWING CAN BE DEFINED IN ENV
  97. "u|dbuser=s" => \$usrname,
  98. "p|dbpass=s" => \$pass,
  99. "q|quiet" => \$quiet,
  100. "verbose" => \$verbose,
  101. # ADDITIONAL INFORMATION
  102. "usage" => \$show_usage,
  103. "test" => \$do_test,
  104. "version" => \$show_version,
  105. "man" => \$show_man,
  106. "h|help" => \$show_help,);
  107. #-----------------------------+
  108. # SHOW REQUESTED HELP |
  109. #-----------------------------+
  110. if ( ($show_usage) ) {
  111. # print_help ("usage", File::Spec->rel2abs($0) );
  112. print_help ("usage", $0 );
  113. }
  114. if ( ($show_help) || (!$ok) ) {
  115. # print_help ("help", File::Spec->rel2abs($0) );
  116. print_help ("help", $0 );
  117. }
  118. if ($show_man) {
  119. # User perldoc to generate the man documentation.
  120. system ("perldoc $0");
  121. exit($ok ? 0 : 2);
  122. }
  123. if ($show_version) {
  124. print "\n$0:\n".
  125. "Version: $VERSION\n\n";
  126. exit;
  127. }
  128. #-----------------------------------------------------------+
  129. # DATABASE CONNECTION |
  130. #-----------------------------------------------------------+
  131. if ( ($db) && ($host) && ($driver) ) {
  132. # Set default values if none given at command line
  133. $db = "iplant_tr" unless $db;
  134. $host = "localhost" unless $host;
  135. $driver = "mysql" unless $driver;
  136. $dsn = "DBI:$driver:database=$db;host=$host";
  137. }
  138. elsif ($dsn) {
  139. # We need to parse the database name, driver etc from the dsn string
  140. # in the form of DBI:$driver:database=$db;host=$host
  141. # Other dsn strings will not be parsed properly
  142. # Split commands are often faster then regular expressions
  143. # However, a regexp may offer a more stable parse then splits do
  144. my ($cruft, $prefix, $suffix, $predb, $prehost);
  145. ($prefix, $driver, $suffix) = split(/:/,$dsn);
  146. ($predb, $prehost) = split(/;/, $suffix);
  147. ($cruft, $db) = split(/=/,$predb);
  148. ($cruft, $host) = split(/=/,$prehost);
  149. # Print for debug
  150. print STDERR "\tPRE:\t$prefix\n" if $verbose;
  151. print STDERR "\tDRIVER:\t$driver\n" if $verbose;
  152. print STDERR "\tSUF:\t$suffix\n" if $verbose;
  153. print STDERR "\tDB:\t$db\n" if $verbose;
  154. print STDERR "\tHOST:\t$host\n" if $verbose;
  155. }
  156. else {
  157. # The variables to create a dsn have not been passed
  158. print STDERR "ERROR: A valid dsn can not be created\n";
  159. # print STDERR "No database specified" if (!$db);
  160. # print STDERR "No host specified" if (!$host);
  161. # print STDERR "No driver specified" if (!$driver);
  162. exit;
  163. }
  164. #-----------------------------+
  165. # GET DB PASSWORD |
  166. #-----------------------------+
  167. unless ($pass) {
  168. print "\nEnter password for the user $usrname\n";
  169. system('stty', '-echo') == 0 or die "can't turn off echo: $?";
  170. $pass = <STDIN>;
  171. system('stty', 'echo') == 0 or die "can't turn on echo: $?";
  172. chomp $pass;
  173. }
  174. #-----------------------------+
  175. # CONNECT TO THE DATABASE |
  176. #-----------------------------+
  177. # Commented out while I work on fetching tree structure
  178. my $dbh = &connect_to_db($dsn, $usrname, $pass);
  179. #-----------------------------------------------------------+
  180. # LOAD THE ARRAY OF FILE PATHS |
  181. #-----------------------------------------------------------+
  182. my @input_files;
  183. if ($in_path) {
  184. if (-f $in_path) {
  185. print STDERR "Input path is a file\n"
  186. if $verbose;
  187. push (@input_files, $in_path);
  188. }
  189. elsif (-d $in_path) {
  190. # NOTE: If other input formats are added, change the following to always
  191. # default to fasta format. Current here to allow for
  192. # input from other types of data.
  193. print STDERR "Input path is a directory\n"
  194. if $verbose;
  195. # GET THE DIRECTORY VAR
  196. my $in_dir = $in_path;
  197. # Add slash to indir if needed
  198. unless ($in_dir =~ /\/$/ ) {
  199. $in_dir = $in_dir."/";
  200. }
  201. # LOAD FILES IN THE INTPUT DIRECTORY
  202. # First load to tmp array so that indir can be prefixed to inpath
  203. my @tmp_file_paths;
  204. if ($in_format =~ "fasta") {
  205. opendir( DIR, $in_dir ) ||
  206. die "Can't open directory:\n$in_dir";
  207. @tmp_file_paths = grep /\.fasta$|\.fa$/, readdir DIR ;
  208. closedir( DIR );
  209. }
  210. # Append directory to path of
  211. foreach my $tmp_file_path (@tmp_file_paths ) {
  212. push (@input_files, $in_dir.$tmp_file_path);
  213. }
  214. # If no files found matching expected extensions, may want to
  215. # just push all files in the directory
  216. } else {
  217. print STDERR "Input path is not a valid directory or file:\n";
  218. die;
  219. }
  220. } else {
  221. print STDERR "\a";
  222. print STDERR "WARNING: A input directory or file has not been specified\n";
  223. }
  224. #-----------------------------------------------------------+
  225. # PROCESS EACH FASTA FILE |
  226. #-----------------------------------------------------------+
  227. # TO DO: SET FAMILY NAME WHEN IMPORTING THESE
  228. foreach my $in_file (@input_files) {
  229. print STDERR "Processing $in_file\n"
  230. if $verbose;
  231. $seq_count++;
  232. #-----------------------------+
  233. # ADD DATA TO THE FAMILY |
  234. # TABLE |
  235. #-----------------------------+
  236. #////////////////////////////////////////////
  237. # ASSUMES THAT THE FAMLIY NAME DOES NOT
  238. # ALREADY EXIST IN THE DATATABASE
  239. #////////////////////////////////////////////
  240. # Get filename from file name
  241. my @suffix_list = (".fasta",
  242. "_AA.fa",
  243. ".fna",
  244. ".faa");
  245. my $family_base_name = basename($in_file,@suffix_list);
  246. # The following var can be used to further parse the above name if needed
  247. # but the suffix list should usually do all of the work
  248. my $family_stable_id = $family_base_name;
  249. print STDERR "Family:\t".$family_base_name."\n"
  250. if $verbose;
  251. my $statement = "INSERT INTO family".
  252. " (stable_id) ".
  253. " VALUES".
  254. " ('".$family_stable_id."')";
  255. print STDERR "\tSQL: $statement\n"
  256. if $verbose;
  257. my $insert_famid_sth = &prepare_sth($dbh,$statement);
  258. &execute_sth($insert_famid_sth);
  259. my $family_id = &last_insert_id($dbh,"family", $driver);
  260. print STDERR "\tFamily ID:\t$family_id\n"
  261. if $verbose;
  262. my $inseq = Bio::SeqIO->new('-file' => "<$in_file",
  263. '-format' => $in_format);
  264. # Get some information about the sequence
  265. while (my $seqin = $inseq->next_seq) {
  266. my $species_name;
  267. my $locus_id;
  268. ($locus_id, $species_name) = split (/\_/,$seqin->primary_id());
  269. my $genbank_id = &taxon_2_gbid( $species_name );
  270. print STDERR "\t".$seqin->primary_id()."\n"
  271. if $verbose;
  272. # print STDERR "\t\tLocus: ".$locus_id."\n"
  273. # if $verbose;
  274. # print STDERR "\t\tSpecies: ".$species_name."\n"
  275. # if $verbose;
  276. # print STDERR "\t\tGB-ID: ".$genbank_id."\n"
  277. # if $verbose;
  278. # we can set the species object
  279. # print STDERR "\t".$seqin->species;
  280. # Currently will assume that the stable_id does not
  281. # already exist in the database, otherwise would
  282. #-----------------------------+
  283. # CHECK IF MEMBER IS ALREADY |
  284. # IN THE DATABASE |
  285. #-----------------------------+
  286. my $member_id = stable_id_2_member_id ($locus_id);
  287. #-----------------------------+
  288. # LOAD INFORMATION TO MEMBERS |
  289. # TABLE IF NOT ALREADY |
  290. # PRESENT |
  291. #-----------------------------+
  292. unless ($member_id) {
  293. my $statement = "INSERT INTO member".
  294. " ( stable_id,".
  295. " display_label,".
  296. " taxon_id)".
  297. " VALUES (".
  298. " '".$locus_id."',".
  299. " '".$locus_id."',".
  300. " '".$genbank_id."')";
  301. print STDERR "\t\tSQL: $statement\n"
  302. if $verbose;
  303. my $insert_member_sth = &prepare_sth($dbh,$statement);
  304. &execute_sth($insert_member_sth);
  305. # the following should work to set the member_id
  306. $member_id = stable_id_2_member_id ($locus_id);
  307. }
  308. else {
  309. print STDERR "\t\tMember id is ".$member_id."\n"
  310. if $verbose;
  311. }
  312. #-----------------------------+
  313. # ADD FAMILY MEMMERS TO THE |
  314. # FAMILY_MEMBER TABLE |
  315. #-----------------------------+
  316. #///////////////////////////////////////////////////////////
  317. # TO DO: CHECK THAT MEMBER FAMILY PAIR NOT ALREADY PRESENT
  318. #///////////////////////////////////////////////////////////
  319. my $statement = "INSERT INTO family_member ".
  320. " ( family_id, member_id )".
  321. " VALUES (".
  322. " '".$family_id."',".
  323. " '".$member_id."')";
  324. print STDERR "\t\tSQL: $statement \n"
  325. if $verbose;
  326. my $insert_fam_member_sth = &prepare_sth($dbh,$statement);
  327. &execute_sth($insert_fam_member_sth);
  328. #-----------------------------+
  329. # ADD SEQUENCE DATA TO THE |
  330. # SEQUENCE TABLE |
  331. #-----------------------------+
  332. #///////////////////////////////////////////////////////////
  333. # TO DO: First check that a sequence record does not exist
  334. # for this member_id record
  335. #///////////////////////////////////////////////////////////
  336. my $insert_seq_sql = "INSERT INTO sequence".
  337. " ( length,".
  338. " sequence )".
  339. " VALUES (".
  340. " '".$seqin->length()."',".
  341. " '".$seqin->seq()."')";
  342. print STDERR "\t\tSQL: ".$insert_seq_sql."\n"
  343. if $verbose;
  344. my $insert_seq_sth = &prepare_sth($dbh,$insert_seq_sql);
  345. &execute_sth($insert_seq_sth);
  346. #-----------------------------+
  347. # ADD THIS SEQ ID TO THE |
  348. # MEMBER TABLE |
  349. #-----------------------------+
  350. # Get the squence id
  351. # This will be used to update the squence id in the member table
  352. my $seq_db_id = &last_insert_id($dbh,"sequence", $driver);
  353. my $update_member_table_sql = "UPDATE member".
  354. " SET sequence_id = '".$seq_db_id."'".
  355. " WHERE member_id = '".$member_id."'";
  356. print STDERR "\t\tSQL: $update_member_table_sql\n"
  357. if $verbose;
  358. my $update_member_sth = &prepare_sth($dbh,$update_member_table_sql);
  359. &execute_sth( $update_member_sth );
  360. }
  361. # Get information about the sequence
  362. my $seq_id
  363. }
  364. exit;
  365. #-----------------------------------------------------------+
  366. # SUBFUNCTIONS
  367. #-----------------------------------------------------------+
  368. sub print_help {
  369. my ($help_msg, $podfile) = @_;
  370. # help_msg is the type of help msg to use (ie. help vs. usage)
  371. print "\n";
  372. #-----------------------------+
  373. # PIPE WITHIN PERL |
  374. #-----------------------------+
  375. #my $podfile = $0;
  376. my $scalar = '';
  377. tie *STDOUT, 'IO::Scalar', \$scalar;
  378. if ($help_msg =~ "usage") {
  379. podselect({-sections => ["SYNOPSIS|MORE"]}, $0);
  380. }
  381. else {
  382. podselect({-sections => ["SYNOPSIS|ARGUMENTS|OPTIONS|MORE"]}, $0);
  383. }
  384. untie *STDOUT;
  385. # now $scalar contains the pod from $podfile you can see this below
  386. #print $scalar;
  387. my $pipe = IO::Pipe->new()
  388. or die "failed to create pipe: $!";
  389. my ($pid,$fd);
  390. if ( $pid = fork() ) { #parent
  391. open(TMPSTDIN, "<&STDIN")
  392. or die "failed to dup stdin to tmp: $!";
  393. $pipe->reader();
  394. $fd = $pipe->fileno;
  395. open(STDIN, "<&=$fd")
  396. or die "failed to dup \$fd to STDIN: $!";
  397. my $pod_txt = Pod::Text->new (sentence => 0, width => 78);
  398. $pod_txt->parse_from_filehandle;
  399. # END AT WORK HERE
  400. open(STDIN, "<&TMPSTDIN")
  401. or die "failed to restore dup'ed stdin: $!";
  402. }
  403. else { #child
  404. $pipe->writer();
  405. $pipe->print($scalar);
  406. $pipe->close();
  407. exit 0;
  408. }
  409. $pipe->close();
  410. close TMPSTDIN;
  411. print "\n";
  412. exit 0;
  413. }
  414. sub taxon_2_gbid {
  415. # Convert taxa name to a valid genbank identfier integer
  416. # returns 0 when name not found
  417. my $in_taxon = shift;
  418. my %taxa =
  419. ("cucumber" => "3659",
  420. "Arabidopsis" => "3702",
  421. "grape" => "29760",
  422. "poplar" => "3694",
  423. "soybean" => "3847",
  424. "papaya" => "3649"
  425. );
  426. my $gb_id;
  427. $gb_id = $taxa{"$in_taxon"} ||
  428. "unknown";
  429. return $gb_id;
  430. }
  431. sub connect_to_db {
  432. my ($cstr) = @_;
  433. return connect_to_mysql(@_) if $cstr =~ /:mysql:/i;
  434. return connect_to_pg(@_) if $cstr =~ /:pg:/i;
  435. die "can't understand driver in connection string: $cstr\n";
  436. }
  437. sub connect_to_pg {
  438. my ($cstr, $user, $pass) = @_;
  439. my $dbh = DBI->connect($cstr, $user, $pass,
  440. {PrintError => 0,
  441. RaiseError => 1,
  442. AutoCommit => 0});
  443. $dbh || &error("DBI connect failed : ",$dbh->errstr);
  444. return($dbh);
  445. } # End of ConnectToPG subfunction
  446. sub connect_to_mysql {
  447. my ($cstr, $user, $pass) = @_;
  448. my $dbh = DBI->connect($cstr,
  449. $user,
  450. $pass,
  451. {PrintError => 0,
  452. RaiseError => 1,
  453. AutoCommit => 0});
  454. $dbh || &error("DBI connect failed : ",$dbh->errstr);
  455. return($dbh);
  456. }
  457. sub prepare_sth {
  458. my $dbh = shift;
  459. # my ($dbh) = @_;
  460. my $sth = $dbh->prepare(@_);
  461. die "failed to prepare statement '$_[0]': ".$dbh->errstr."\n" unless $sth;
  462. return $sth;
  463. }
  464. sub execute_sth {
  465. # I would like to return the statement string here to figure
  466. # out where problems are.
  467. # Takes a statement handle
  468. my $sth = shift;
  469. my $rv = $sth->execute(@_);
  470. unless ($rv) {
  471. $dbh->disconnect();
  472. die "failed to execute statement: ".$sth->errstr."\n"
  473. }
  474. return $rv;
  475. } # End of execute_sth subfunction
  476. sub last_insert_id {
  477. #my ($dbh,$table_name,$driver) = @_;
  478. # The use of last_insert_id assumes that the no one
  479. # is interleaving nodes while you are working with the db
  480. my $dbh = shift;
  481. my $table_name = shift;
  482. my $driver = shift;
  483. # The following replace by sending driver info to the sufunction
  484. #my $driver = $dbh->get_info(SQL_DBMS_NAME);
  485. if (lc($driver) eq 'mysql') {
  486. return $dbh->{'mysql_insertid'};
  487. }
  488. elsif ((lc($driver) eq 'pg') || ($driver eq 'PostgreSQL')) {
  489. my $sql = "SELECT currval('${table_name}_pk_seq')";
  490. my $stmt = $dbh->prepare_cached($sql);
  491. my $rv = $stmt->execute;
  492. die "failed to retrieve last ID generated\n" unless $rv;
  493. my $row = $stmt->fetchrow_arrayref;
  494. $stmt->finish;
  495. return $row->[0];
  496. }
  497. else {
  498. die "don't know what to do with driver $driver\n";
  499. }
  500. } # End of last_insert_id subfunction
  501. # The following pulled directly from the DBI module
  502. # this is an attempt to see if I can get the DSNs to parse
  503. # for some reason, this is returning the driver information in the
  504. # place of scheme
  505. sub parse_dsn {
  506. my ($dsn) = @_;
  507. $dsn =~ s/^(dbi):(\w*?)(?:\((.*?)\))?://i or return;
  508. my ($scheme, $driver, $attr, $attr_hash) = (lc($1), $2, $3);
  509. $driver ||= $ENV{DBI_DRIVER} || '';
  510. $attr_hash = { split /\s*=>?\s*|\s*,\s*/, $attr, -1 } if $attr;
  511. return ($scheme, $driver, $attr, $attr_hash, $dsn);
  512. }
  513. sub stable_id_2_member_id {
  514. # Get the member_id of a locus in the database given
  515. # its stable_id
  516. my $stable_id_search = shift;
  517. my $member_id_result;
  518. my $search_sql = "SELECT member_id FROM member".
  519. " WHERE stable_id = '".$stable_id_search."'";
  520. print STDERR "\t\tSQL:".$search_sql."\n"
  521. if $verbose;
  522. my $sth = $dbh->prepare($search_sql);
  523. $sth->execute();
  524. while (my $row = $sth->fetchrow_arrayref) {
  525. $member_id_result = @$row[0];
  526. }
  527. unless ($member_id_result) {
  528. $member_id_result = 0;
  529. }
  530. return $member_id_result;
  531. }
  532. __END__
  533. =head1 NAME
  534. tr_import_members_from_fasta.pl - Import members from fasta file
  535. =head1 VERSION
  536. This documentation refers to program version $Rev: 613 $
  537. =head1 SYNOPSIS
  538. =head2 Usage
  539. tr_import_members_from_fasta.pl -u UserName -p dbPass -i infile.fasta
  540. -d 'DBI:mysql:database=biosql;host=localhost'
  541. =head2 Required Arguments
  542. The following options may also be specified in the
  543. user environment.
  544. --dsn # The DSN string the database to connect to
  545. # Must conform to:
  546. # 'DBI:mysql:database=trdb;host=localhost'
  547. --dbuser # User name for db connection
  548. --dbpass # Password for db connection
  549. ALTERNATIVE TO --dsn:
  550. --driver # mysql
  551. --dbname # Name of database to use
  552. --host # optional: host to connect with
  553. =head1 DESCRIPTION
  554. This program populates tables in the tree reconciliation database using
  555. input from a FASTA formatted file.
  556. The following tables are populated with this script:
  557. =over
  558. =item family
  559. =item family_member
  560. =item sequence
  561. =item member
  562. =back
  563. =head1 REQUIRED ARGUMENTS
  564. =over
  565. =item -d, --dsn
  566. The DSN of the database to connect to; default is the value in the
  567. environment variable TR_DSN. If TR_DSN has not been defined and
  568. the string is not passed to the command line, the dsn will be
  569. constructed from --driver, --dbname, --host
  570. DSN must be in the form:
  571. DBI:mysql:database=biosql;host=localhost
  572. =item -u, --dbuser
  573. The user name to connect with; default is the value in the environment
  574. variable TR_USERNAME.
  575. This user must have permission to create databases.
  576. =item -p, --dbpass
  577. The password to connect with; default is the value in the environment
  578. variable TR_PASSWORD. If this is not provided at the command line
  579. the user is prompted.
  580. =back
  581. =head2 Alternative to --dsn
  582. An alternative to passing the full dsn at the command line is to
  583. provide the components separately.
  584. =over 2
  585. =item --host
  586. The database host to connect to; default is localhost.
  587. =item --dbname
  588. The database name to connect to; default is biosql.
  589. =item --driver
  590. The database driver to connect with; default is mysql.
  591. Options other then mysql are currently not supported.
  592. =back
  593. =head2 OPTIONS
  594. =over 2
  595. =item --verbose
  596. Execute the program in verbose mode.
  597. =back
  598. =head2 Additional Information
  599. =over 2
  600. =item --version
  601. Show the program version.
  602. =item --usage
  603. Show program usage statement.
  604. =item --help
  605. Show a short help message.
  606. =item --man
  607. Show the full program manual.
  608. =back
  609. =head1 EXAMPLES
  610. Example use
  611. ./tr_import_members_from_fasta.pl -i indir/ --driver mysql --dbname tr_test --host localhost --verbose
  612. =head1 DIAGNOSTICS
  613. Error messages generated by this program and possible solutions are listed
  614. below.
  615. =over
  616. =item ERROR: Error msg
  617. =back
  618. =head1 CONFIGURATION AND ENVIRONMENT
  619. Many of the options passed at the command line can be set as
  620. options in the user's environment.
  621. =over 2
  622. =item TR_USERNAME
  623. User name to connect to the database.
  624. =item TR_PASSWORD
  625. Password for the database connection
  626. =item TR_DBNAME
  627. Database name.
  628. =item TR_HOST
  629. Host for the database connection.
  630. =item TR_DSN
  631. Full database DSN for connecting to a tree reconciliation database.
  632. =back
  633. For example, in the bash shell this would be done by editing your .bashrc file
  634. to contain :
  635. export TR_USERNAME=yourname
  636. export TR_PASSWORD=yourpassword
  637. export TR_DBNAME=your_database_name
  638. export TR_HOST=localhost
  639. Alternatively, the database name and host can be specified in a
  640. DSN similar to the following format.
  641. export TR_DSN='DBI:mysql:database=biosql;host=localhost'
  642. =head1 DEPENDENCIES
  643. =head2 Perl Modules
  644. The tr_import_members_from_fasta.pl program is dependent on the
  645. following Perl modules.
  646. =over
  647. =item DBI - L<http://dbi.perl.org>
  648. The PERL Database Interface (DBI) module allows for connections
  649. to multiple databases. This implementation of the tree reconciliation
  650. database is limited to MySQL.
  651. =item DBD:MySQL -
  652. L<http://search.cpan.org/~capttofu/DBD-mysql-4.005/lib/DBD/mysql.pm>
  653. MySQL database driver for DBI module.
  654. =item Getopt::Long - L<http://perldoc.perl.org/Getopt/Long.html>
  655. The Getopt module allows for the passing of command line options
  656. to perl scripts.
  657. =item Bio::SeqIO - L<http://www.bioperl.org>
  658. The Bio::SeqIO module is part of the bioperl package.
  659. =back
  660. =head2 Software
  661. The MySQL RDBMS is also required for the reconciliation database.
  662. =head1 BUGS AND LIMITATIONS
  663. =head2 Bugs
  664. Please report bugs to:
  665. http://pods.iplantcollaborative.org/jira
  666. =head2 Limitations
  667. The following limitations are known:
  668. =over
  669. =item *
  670. Taxon IDs are currently not resolved against the database. Currently
  671. a kludge is in place that uses the NCBI taxon IDs for a small
  672. subset of potential species.
  673. =item *
  674. This program expects specific fasta file headers.
  675. The fasta file headers must be set as:
  676. >locusName_Species
  677. where locusName is the name of the locus/gene and Species
  678. is the name of the species.
  679. Other headers will not work properly.
  680. For example, the following file is in the proper format:
  681. >POPTR-0010s18130.1_poplar
  682. MFHTKKPSTMNSHDRPMCVQGDSGLVLTTDPKPRLRWTVELHERFVDAVTQLGGPDKATP
  683. KTIMRVMGVKGLTLYHLKSHLQKFRLGKQPHKDFNDHSIKDASALDLQRSAASSSGMMSR
  684. SMNEMQMEVQRRLHEQLEVQRHLQLRTEAQGKYIQSLLEKACQTLAGDQNLASGSYKGMG
  685. NQGIPGMGAMKEFGTLNFPAFQDLNIYGGDQLDLQHNMDRPSLDGFMPNNDNICLGKKRP
  686. SPYDGSGKSPLIWPDDLRLQDLGSGPACLEPQDDPFKGDQIQMAPPSMDRGTDLDSISDM
  687. YEIKPALQGDALDEKKFEASAKLKRPSPRRSPLAAERMSPMINTGAMPQGRNSPFG
  688. >POPTR-0008s08130.1_poplar
  689. MFHTKKPSTMNSHDRPMCVQDSGLVLTTDPKPRLRWTVELHERFVDAVAQLGGPDKATPK
  690. TIMRVMGVKGLTLYHLKSHLQKFRLGKQLHKEFNDHSIKDASALDLQRSAASSSGMISRS
  691. MNDNSHMIYAIRMQMEVQRRLHEQLEVQRHLQLRTEAQGKYIQSLLEKACQTLAGDQDLA
  692. SGSYKGIGNQGVPDMGAMKDFGPLNFPPFQDLNIYGSGQLDLLHNMDRPSLDGFMSNNHD
  693. DICLGKKRTNPYAGSGKSPLIWSDDLRLQDLGSGLSCLGPQDDPLKGDQIQIAPPLMDSG
  694. TDLDSLSGLYGTKPVHQGDALDEKKLEASAKTERPSPRRAPLAADRMSPMINTGVMPQGR
  695. NSPFG
  696. =item *
  697. Currently only stable with the MySQL Database driver.
  698. =item *
  699. DSN string must currently be in the form:
  700. DBI:mysql:database=DBNAME;host=DBHOST
  701. such as:
  702. DBI:mysql:database=reconciliation_db;host=localhost
  703. =back
  704. =head1 BUGS AND LIMITATIONS
  705. =head2 Bugs
  706. Please report bugs to:
  707. http://pods.iplantcollaborative.org/jira
  708. =head2 Limitations
  709. Currently the tree_reconciliation database is limited to the MySQL RDBMS.
  710. =head1 SEE ALSO
  711. The program tr_import_members_from_fasta.pl is a component of the iPlant Tree
  712. Reconciliation suite of utilities. Additional information is available
  713. at:
  714. L<https://pods.iplantcollaborative.org/wiki/display/iptol/1.0+Architecture>
  715. =head1 LICENSE
  716. Simplified BSD License
  717. http://tinyurl.com/iplant-tr-license
  718. =head1
  719. =head1 AUTHORS
  720. James C. Estill E<lt>JamesEstill at gmail.comE<gt>
  721. =head1 HISTORY
  722. Started: 09/28/2010
  723. Updated: 04/12/2011
  724. =cut