/tools/fasta_tools/fasta_to_tabular.py

https://bitbucket.org/cistrome/cistrome-harvard/ · Python · 57 lines · 41 code · 4 blank · 12 comment · 14 complexity · 1895f935c76c44d433cc9f53e142ae80 MD5 · raw file

  1. #!/usr/bin/env python
  2. # This code exists in 2 places: ~/datatypes/converters and ~/tools/fasta_tools
  3. """
  4. Input: fasta (input file), tabular (output file), int (truncation of id), int (columns from description)
  5. Output: tabular
  6. format convert: fasta to tabular
  7. """
  8. import sys, os
  9. def stop_err( msg ):
  10. sys.stderr.write( msg )
  11. sys.exit()
  12. def __main__():
  13. if len(sys.argv) != 5:
  14. stop_err("Wrong number of argument. Expect four (fasta, tabular, truncation, columns)")
  15. infile = sys.argv[1]
  16. outfile = sys.argv[2]
  17. keep_first = int( sys.argv[3] )
  18. descr_split = int( sys.argv[4] )
  19. fasta_title = fasta_seq = ''
  20. if keep_first == 0:
  21. keep_first = None
  22. elif descr_split == 1:
  23. #Added one for the ">" character
  24. #(which is removed if using descr_split > 1)
  25. keep_first += 1
  26. if descr_split < 1:
  27. stop_err("Bad description split value (should be 1 or more)")
  28. out = open( outfile, 'w' )
  29. for i, line in enumerate( open( infile ) ):
  30. line = line.rstrip( '\r\n' )
  31. if not line or line.startswith( '#' ):
  32. continue
  33. if line.startswith( '>' ):
  34. #Don't want any existing tabs to trigger extra columns:
  35. line = line.replace('\t', ' ')
  36. if i > 0:
  37. out.write('\n')
  38. if descr_split == 1:
  39. out.write(line[1:keep_first])
  40. else:
  41. words = line[1:].split(None, descr_split-1)
  42. #apply any truncation to first word (the id)
  43. words[0] = words[0][0:keep_first]
  44. #pad with empty columns if required
  45. words += [""]*(descr_split-len(words))
  46. out.write("\t".join(words))
  47. out.write('\t')
  48. else:
  49. out.write(line)
  50. if i > 0:
  51. out.write('\n')
  52. out.close()
  53. if __name__ == "__main__" : __main__()