PageRenderTime 55ms CodeModel.GetById 21ms RepoModel.GetById 1ms app.codeStats 0ms

/test/unit/bio/db/test_fastq.rb

https://github.com/nmb/bioruby
Ruby | 865 lines | 735 code | 111 blank | 19 comment | 4 complexity | b6c5c1832233804ca4e184e397fb9381 MD5 | raw file
Possible License(s): GPL-2.0, LGPL-2.1
  1. #
  2. # test/unit/bio/db/test_fastq.rb - Unit test for Bio::Fastq
  3. #
  4. # Copyright:: Copyright (C) 2009
  5. # Naohisa Goto <ng@bioruby.org>
  6. # License:: The Ruby License
  7. #
  8. # $Id:$
  9. #
  10. # loading helper routine for testing bioruby
  11. require 'pathname'
  12. load Pathname.new(File.join(File.dirname(__FILE__), ['..'] * 3,
  13. 'bioruby_test_helper.rb')).cleanpath.to_s
  14. # libraries needed for the tests
  15. require 'test/unit'
  16. require 'bio/io/flatfile'
  17. require 'bio/db/fastq'
  18. module Bio
  19. module TestFastq
  # Normalized path (as a String) of the 'fastq' directory located under
  # BioRubyTestDataPath; all fixture files used below are read from here.
  TestFastqDataDir =
    Pathname.new(File.join(BioRubyTestDataPath, 'fastq')).cleanpath.to_s
  22. # A module providing methods to compare float arrays
  23. module FloatArrayComparison
  24. private
  # Asserts that two arrays of floats have the same size and that every pair
  # of corresponding elements agrees within a relative tolerance.
  #
  # expected:: array of reference values
  # actual::   array of values under test
  # arg::      optional extra arguments (e.g. a failure message) forwarded to
  #            every underlying assertion
  #
  # The allowed difference for an element e is e.abs * Float::EPSILON * 1024,
  # so an expected value of exactly 0.0 has to be matched exactly.
  def float_array_equivalent?(expected, actual, *arg)
    assert_equal(expected.size, actual.size, *arg)
    tolerance = Float::EPSILON * 1024
    expected.each_with_index do |e, i|
      # *arg is forwarded here as well so that a caller-supplied message is
      # not lost when an individual element differs.
      assert_in_delta(e, actual[i], e.abs * tolerance, *arg)
    end
  end
  35. end #module FloatArrayComparison
  36. # Tests using 'longreads_original_sanger.fastq'
  37. class TestFastq_longreads_original_sanger < Test::Unit::TestCase
  38. include FloatArrayComparison
  39. SEQS =
  40. [
  41. 'tcagTTAAGATGGGATAATATCCTCAGATTGCGTGATGAACTTTGTTCTGGTGGAGGAGA
  42. AGGAAGTGCATTCGACGTATGCCCGTTTGTCGATATTTGtatttaaagtaatccgtcaca
  43. aatcagtgacataaatattatttagatttcgggagcaactttatttattccacaagcagg
  44. tttaaattttaaatttaaattattgcagaagactttaaattaacctcgttgtcggagtca
  45. tttgttcggttattggtcgaaagtaaccncgggaagtgccgaaaactaacaaacaaaaga
  46. agatagtgaaattttaattaaaanaaatagccaaacgtaactaactaaaacggacccgtc
  47. gaggaactgccaacggacgacacagggagtagnnn',
  48. 'tcagCCAGCAATTCCGACTTAATTGTTCTTCTTCCATCATTCATCTCGACTAACAGTTCT
  49. ACGATTAATGAGTTTGGCtttaatttgttgttcattattgtcacaattacactactgaga
  50. ctgccaaggcacncagggataggnn',
  51. 'tcagTTTTCTTAAATTACTTGAATCTGTTGAAGTGGATGTCCACTTTTGTATGCCAAATA
  52. TGCCCAGCGTATACGATCTTGGCCACATCTCCACATAATCATCAGTCGGATGCAAAAAGC
  53. GATTAAACTAAAAATGAATGCGTTTTTAGATGAGTAAATAGGTAATACTTTGTTTAAATA
  54. ATAAATGTCACAAACAGAACGCGGATTACAGTACCTGAAAATAGTTGTACTGTATCTGTG
  55. CCGGCACTTCCTCGGCCCTGAGAAGTTGTCCCGTTGTTTCCATTCGCACCATCCAATGGC
  56. CAAAGTTTGCGAAGAATCTGTTCCGTTCCATTACCAATTGTTTTTCCATGctgagactgc
  57. caaggcacacaggggataggnn',
  58. 'tcagTTTTTGGAGAATTCCGTCAGGGACGGCATGGCATATTTGTGGGTTCGGCACGGCGT
  59. CCTGGCCAAGAAGAAGAAGACGAATTAGCCCGTTAATTTAATGACACCTTCCCCAATTTT
  60. GCAGCAATGATTGGTTCATTCTTGGCGGTGCGTTTTTGTGCTTCGTCGAATTGTTGGCCA
  61. TTTTGGTCCACCGGCCATCATCTTTACGCTATCCGACTGATTGGAAATCACCGCCTAGCA
  62. TTTTGCCGAAGATTGTTGCGTTGTACGGCCATGTGCTGATTGTTTACATTGGCATTCTTG
  63. GCAATTTGTCCTTGGTCGGCTTTGACGGCAAATTTGCGGTGTTAAGTctgagactgccaa
  64. ggcacacagggggatagggnn',
  65. 'tcagTTGACCGGCGTTGTGTAACAATAATTCATTATTCTGAGACGATGCCAATGTAATCG
  66. ACGGTTTATGCCCAATTATTCCCATCTATGCTTAACTGATCAAATACTATTTGCATTACG
  67. TCACGAAATTGCGCGAACACCGCCGGCCGACAATAATTTATACCGGACATACCGGAGTTG
  68. ATGGTAATCGGTAAAGAGTTTTATTTAATTATntattatcnctattaattattgttanca
  69. acaatgtgcacgctntgccgcccgccgccgccgtgtcggtaggaccccggacggacccgg
  70. acccggttcgggtacccgttttcgggttcccggaaccgtttttcgggtacccggtttttt
  71. cggggggccccccggtaaaaaaccggggaaccccctaaaacgggtaaacgtaccgtaagg
  72. gaccccctaaacgggggccccgaaaaaccgggacccaaaccggggggaaacggttaaagg
  73. ggggggaagtaggngnnnnnnnnnnnn',
  74. 'tcagTTATTGCAGTCGTTCCGCGCCATCGCCGGTAACCGTCCGCGTGTTATTCTGTGTAT
  75. CGGCCAACCTTCGTATAACTTCGTATAATGTATGCTATACGAAGTTATTACGATCTATAC
  76. CGGCGAAACTCAGCCGAAAGGTCTCGCGGTAGAGCCTATGAGCTGCCCGACCGATGCATT
  77. TAAATTTCCGGGGATCGtcgctgatctgagactgccaaaggcacactagggggataggnn
  78. nnnnnnnnnnnnnnnnnn',
  79. 'tcagGTTTTAAATCGCTTTCCAAGGAATTTGAGTCTAAATCCGGTGGATCCCATCAGTAC
  80. AAATGCGGCGACAAGGCCGTGAAAACACTGCTTAATTCTTTGCACTTTTTGGCCACCTTT
  81. TTGGAAATGTTGTTTTGTGTTCTCAAAATTTTCCATCTCAGAACAAACATTCCATCGGGC
  82. TGATGTTGTGGCTTTTGGCGCGCGAAGTGCTGCTACTGCGCGGCAAAATCAGTCGCCAGA
  83. CCGGTTTTGTTGTGGACGACAAAGTGATCATGCCTGACTTGTACTTCTACCGCGATCCGC
  84. AAGCGCGAATTGGTCACATAGTTATAGAATTTTTGAGCCTTTTTCTTGACATAAAAAGTG
  85. TGGTTTTAAAAATTTCCTGGCAGGACCCACGCCAACGTTCAGGAATAATATCTTTTAAAA
  86. AGctgagactgccaaggcacacaggggataggn',
  87. 'tcagTTTAATTTGGTGCTTCCTTTCAATTCCTTAGTTTAAACTTGGCACTGAAGTCTCGC
  88. ATTTATAACTAGAGCCCGGATTTTAGAGGCTAAAAAGTTTTCCAGATTTCAAAATTTATT
  89. TCGAAACTATTTTTCTGATTGTGATGTGACGGATTTCTAAATTAAATCGAAATGATGTGT
  90. ATTGAACTTAACAAGTGATTTTTATCAGATTTTGTCAATGAATAAATTTTAATTTAAATC
  91. TCTTTCTAACACTTTCATGATTAAAATCTAACAAAGCGCGACCAGTATGTGAGAAGAGCA
  92. AAAACAACAAAAAGTGCTAGCACTAAAGAAGGTTCGAACCCAACACATAACGTAAGAGTT
  93. ACCGGGAAGAAAACCACTctgagactgccaaggcacacagggggataggnn',
  94. 'tcagTTTTCAAATTTTCCGAAATTTGCTGTTTGGTAGAAGGCAAATTATTTGATTGAATT
  95. TTGTATTTATTTAAAACAATTTATTTTAAAATAATAATTTTCCATTGACTTTTTACATTT
  96. AATTGATTTTATTATGCATTTTATATTTGTTTTCTAAATATTCGTTTGCAAACTCACGTT
  97. GAAATTGTATTAAACTCGAAATTAGAGTTTTTGAAATTAATTTTTATGTAGCATAATATT
  98. TTAAACATATTGGAATTTTATAAAACATTATATTTTTctgagactgccaaggcacacagg
  99. gggataggn',
  100. 'tcagTTTTGATCTTTTAATAATGAATTTTAATGTGTTAAAATGATTGCATTGATGGCATA
  101. ACCGCATTTAAATTAATTACATGAAGTGTAAGTATGAAATTTTCCTTTCCAAATTGCAAA
  102. AACTAAAATTTAAAATTTATCGTAAAAATTAACATATATTTTAAACGATTTTAAGAAACA
  103. TTTGTAAATTATATTTTTGTGAAGCGTTCAAACAAAAATAAACAATAAAATATTTTTCTA
  104. TTTAATAGCAAAACATTTGACGATGAAAAGGAAAATGCGGGTTTGAAAATGGGCTTTGCC
  105. ATGCTATTTTCATAATAACATATTTTTATTATGAATAATAAATTTACATACAATATATAC
  106. AGTCTTAAATTTATTCATAATATTTTTGAGAATctgagactgccaaggcacacaggggat
  107. aggn'
  108. ].collect { |x| x.gsub(/\s/, '').freeze }.freeze
  109. IDLINES =
  110. [
  111. 'FSRRS4401BE7HA [length=395] [gc=36.46] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=95]',
  112. 'FSRRS4401BRRTC [length=145] [gc=38.62] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=74]',
  113. 'FSRRS4401B64ST [length=382] [gc=40.58] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=346]',
  114. 'FSRRS4401EJ0YH [length=381] [gc=48.29] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=343]',
  115. 'FSRRS4401BK0IB [length=507] [gc=49.31] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=208]',
  116. 'FSRRS4401ARCCB [length=258] [gc=46.90] [flows=800] [phred_min=0] [phred_max=38] [trimmed_length=193]',
  117. 'FSRRS4401CM938 [length=453] [gc=44.15] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=418]',
  118. 'FSRRS4401EQLIK [length=411] [gc=34.31] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=374]',
  119. 'FSRRS4401AOV6A [length=309] [gc=22.98] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=273]',
  120. 'FSRRS4401EG0ZW [length=424] [gc=23.82] [flows=800] [phred_min=0] [phred_max=40] [trimmed_length=389]',
  121. ].collect { |x| x.freeze }.freeze
  122. ENTRY_IDS = [ 'FSRRS4401BE7HA',
  123. 'FSRRS4401BRRTC',
  124. 'FSRRS4401B64ST',
  125. 'FSRRS4401EJ0YH',
  126. 'FSRRS4401BK0IB',
  127. 'FSRRS4401ARCCB',
  128. 'FSRRS4401CM938',
  129. 'FSRRS4401EQLIK',
  130. 'FSRRS4401AOV6A',
  131. 'FSRRS4401EG0ZW'
  132. ].collect { |x| x.freeze }.freeze
  133. QUALITY_STRINGS =
  134. [ <<'_0_', <<'_1_', <<'_2_', <<'_3_', <<'_4_', <<'_5_', <<'_6_', <<'_7_', <<'_8_', <<'_9_' ].collect { |x| x.delete("\r\n").freeze }.freeze
  135. FFFDDDDDDDA666?688FFHGGIIIIIIIIIIIIIIIII
  136. IHHHIIIIIIIIIGHGFFFFF====DFFFFFFFFFFFFFF
  137. D???:3104/76=:5...4.3,,,366////4<ABBAAA=
  138. CCFDDDDDDDD:666CDFFFF=<ABA=;:333111<===9
  139. 9;B889FFFFFFDDBDBDDD=8844231..,,,-,,,,,,
  140. ,,1133..---17111,,,,,22555131121.--.,333
  141. 11,.,,3--,,.,,--,3511123..--!,,,,--,----
  142. 9,,,,8=,,-,,,-,,,,---26:9:5-..1,,,,11//,
  143. ,,,!,,1917--,,,,-3.,--,,17,,,,---+11113.
  144. 030000,,,044400036;96662.//;7><;!!!
  145. _0_
  146. FFFFFFFFFDDDDFFFFGFDDDDBAAAAA=<4444@@B=5
  147. 55:BBBBB@@?8:8<?<89898<84442;==3,,,514,,
  148. ,11,,,.,,21777555513,..--1115758.//34488
  149. ><<;;;;9944/!/4,,,57855!!
  150. _1_
  151. IIIICCCCI??666IIIIIIIIIIIIIIIIIIIIIIIIII
  152. IIII6666IAIIIII???IIIICCCIIIIIIIIIIIIIII
  153. IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII66333EI
  154. CE::338=/----,8=>>??:2-////7>CEEIEIHHHII
  155. IIIIIIIE;;9911199B???IBCHIIIIIIHHHIIHHHI
  156. IIIIIIIIIIIIIIIIIBBCCIIIIIIIIIIIIIIIIIII
  157. IIIIIIIIIIIIIIIGGGIIIIIIIIID?===DIIIHHHI
  158. IIIIIIIIHHHIIIIIIIIIIHHHIHHHIIIIIIIIIIII
  159. IIIIIIIIII?>;9988==5----.@@AEGIIIIIIIIIH
  160. H????EIIIFF999;EIIBB!!
  161. _2_
  162. IIII?????IIIIIIIIIIIIIIHHHIIIIIIIIIIIIIH
  163. HHIIHHHIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
  164. IIIIIIIIHHHIIIIIHHHIIIIIIIIIIIAAAAII>>>>
  165. IIIIIIIIIIIIIIIIIIIIIIIIIIEEIEE;33333D7I
  166. IIIIIIIIIIIIIIIIIIIICC@@HHIIIIIIIIIIIIII
  167. IIHHHIIIIIIIIIIIIIIIIIIIHHHIIIIIIIIIIIII
  168. BBBBIHCDCHIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
  169. IIHHHIIIHHCCDIIIIIIHHHIICCCH=CCIIIIIIIII
  170. GGGIIIIIIHHHHHHIIIIIIIIIIIIIIIHHHIIHHE??
  171. >>?EFEE?/////;:80--!!
  172. _3_
  173. FFFA@@FFFFFFFFFFHHB:::@BFFFFGGHIHIIIIIII
  174. IIIIIIIIIIIIIIIIFFFFFFFFF?=BA@11188011<<
  175. 88;?AABDDC???DDAAAADA666D?DDD=====AA>?>>
  176. <<<=<11188<<???AA?9555=ABBB@@?=>>?@@1114
  177. 2::DDA???DFFFFFFFFFFFFFBAAAA<<0000.22=//
  178. //8,--111111!23--/24!37:6666<;822/..4!46
  179. 521177553.-.23!231121112,,-,,211==5-----
  180. -,12,,,,,,-,,,-1,,,,-,,155--,,,,13111.,,
  181. ,,,,,,++111..11..1,,,,,,,,,+3,,,,,--22--
  182. ---//----55//**/--22--**,,,,**,,,,,,.1.,
  183. *,,,,***,,,,,,,,,,,,,,,,,,,,,,,),,-,,,,,
  184. ,),,,,,**//.),,,///,,,,,,,,,,,.))33---,,
  185. ,,,,,,,,(0,,,!.!!!!!!!!!!!!
  186. _4_
  187. FFF<8::@DFFFFFFFGGFDCAAAAAB@@000046<;663
  188. 22366762243348<<=??4445::>ABAAA@<<==B=:5
  189. 55:BBD??=BDDDDFFFCCCCCCCFFCDDDFFFFFDBAA=
  190. =88880004><<<99688;889<889?BBBBA=???DDBB
  191. B@@??88889---237771,,,,,,,,--1152<<00158
  192. A@><<<<<43277711,,,--37===75,----34666!!
  193. !!!!!!!!!!!!!!!!!!
  194. _5_
  195. IIIIICC>>666IIIICCCIIIIIIIIHHHIIIIIG666I
  196. IIIIIIIIIHHHIIIIIIIICCCIIIIIIIIIIIIIIIII
  197. I@@@@IIIIIIIIIIIIIHHHIIII???=;IIEEI::///
  198. //7544:?IBB72244E8EECEBC=@@@@@@@HHIIIIII
  199. IIIIBBBIIIIIIIIIHHHIIIIIIIIIIIIICCCCIIII
  200. IIIIIIIIIIIIIIIIIIIIIIII6666DEIIHEB??D@7
  201. 77772222D89EEIIIIIIIHHHIIIIIIIIHHHIIIIII
  202. IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHHHIIIIII
  203. IIIIIIIII==?==IIIII???=;I63DDD82--,,,38=
  204. =::----,,---+++33066;@6380008/:889<:BGII
  205. IIIIIIIFE<?F5500-----5:;;;:>?@C<<7999EEE
  206. EEE@@@@EEEEE!
  207. _6_
  208. III?666??HHHIIIIIIIIIGGGIIIIIIIIIIIGGGHH
  209. HIIIIIIIIIIIIIIIIIIIIGGGIIIIIIIIIIHHHIII
  210. @@@@IIIIEIE111100----22?=8---:-------,,,
  211. ,33---5:3,----:1BBEEEHIIIIIIIIIIIB??A122
  212. 000...:?=024GIIIIIIIIIIIIIIIIIIECCHHB=//
  213. -,,21??<5-002=6FBB?:9<=11/4444//-//77??G
  214. EIEEHIACCIIIHHHIIIIIIICCCAIIIHHHHHHIIIII
  215. IIIIIIIIIIIIIIIIIEE1//--822;----.777@EII
  216. IIII???IIIIIIIIIIIHHHIIIIIIIIIIIIIIIIIII
  217. I994227775555AE;IEEEEEIIIII??9755>@==:3,
  218. ,,,,33336!!
  219. _7_
  220. IIIICCCCI;;;CCCCIII???HHHIIIIHHHIIIIIIII
  221. IIHHHIIIHHHIIIIIII@@@@IFICCCICAA;;;;ED?B
  222. @@D66445555<<<GII>>AAIIIIIIII;;;::III???
  223. CCCIII;;;;IFFIIIIICCCBIBIEEDC4444?4BBBE?
  224. EIIICHHII;;;HIIIIIIHH;;;HHIIIII;;;IIIIHH
  225. HIIIIII>>??>IEEBGG::1111/46FBFBB?=;=A?97
  226. 771119:EAAADDBD7777=/111122DA@@B68;;;I8H
  227. HIIIII;;;;?>IECCCB/////;745=!
  228. _8_
  229. IIA94445EEII===>IIIIIIIIICCCCIIHIIICC;;;
  230. ;IIIIIIIIIIIIIIIIIIIIIIIIIF;;666DDIIIIII
  231. IIIIIIIIIIIIIEE94442244@@666CC<<BDDA=---
  232. --2<,,,,659//00===8CIII;>>==HH;;IIIIIICC
  233. @@???III@@@@IC?666HIDDCI?B??CC<EE11111B4
  234. BDDCB;=@B777>////-=323?423,,,/=1,,,,-:4E
  235. ;??EIIIIICCCCI>;;;IIIIIII<<@@?=////7=A99
  236. 988<<4455IEEEIIIIIIIIIIIII<999HIIIIIIIII
  237. II?????IIIIIIIIIIICAC;55539EIIIIIIIIIIII
  238. IIIIHH999HHHIA=AEEFF@=.....AD@@@DDEEEEFI
  239. II;;;977FFCCC@24449?FDD!
  240. _9_
  # Expected numeric quality scores for every record: the byte value of each
  # character of the quality string minus 33.
  QUALITY_SCORES = QUALITY_STRINGS.collect do |str|
    str.unpack('C*').collect { |byte| byte - 33 }.freeze
  end.freeze
  # Expected error probabilities for every record, computed from each
  # score q as 10 ** (-q / 10.0).
  ERROR_PROBABILITIES = QUALITY_SCORES.collect do |scores|
    scores.collect { |q| 10 ** (- q / 10.0) }.freeze
  end.freeze
  # Opens the fixture 'longreads_original_sanger.fastq' as a Bio::FlatFile
  # of Bio::Fastq entries before each test and keeps it in @ff.
  def setup
    fn = File.join(TestFastqDataDir, 'longreads_original_sanger.fastq')
    @ff = Bio::FlatFile.open(Bio::Fastq, fn)
  end

  # Closes the flat file opened by setup, so that one open file handle is
  # not left behind by every test method of this class.
  def teardown
    # @ff is nil when setup itself failed before the assignment.
    @ff.close if @ff
  end
  # Every entry of the fixture must pass validate_format, and the flat file
  # must be at end of file once all entries were read.
  def test_validate_format
    @ff.each { |entry| assert(entry.validate_format) }
    assert(@ff.eof?)
  end
  # validate_format, when given an array, must return a true value for each
  # entry and must leave that array empty.
  def test_validate_format_with_array
    @ff.each do |entry|
      collected = []
      assert(entry.validate_format(collected))
      assert(collected.empty?)
    end
  end
  # The definition of each entry must equal the matching element of
  # IDLINES, and all elements of IDLINES must be consumed.
  def test_definition
    pending = IDLINES.dup
    @ff.each { |entry| assert_equal(pending.shift, entry.definition) }
    assert(pending.empty?)
  end
  # The entry_id of each entry must equal the matching element of
  # ENTRY_IDS, and all elements of ENTRY_IDS must be consumed.
  def test_entry_id
    pending = ENTRY_IDS.dup
    @ff.each { |entry| assert_equal(pending.shift, entry.entry_id) }
    assert(pending.empty?)
  end
  # The raw sequence string of each entry must equal the matching element
  # of SEQS, and all elements of SEQS must be consumed.
  def test_sequence_string
    pending = SEQS.dup
    @ff.each do |entry|
      wanted = pending.shift
      assert_equal(wanted, entry.sequence_string)
    end
    assert(pending.empty?)
  end
  # seq of each entry must equal a Bio::Sequence::Generic built from the
  # matching element of SEQS; all expected sequences must be consumed.
  def test_seq
    pending = SEQS.collect { |str| Bio::Sequence::Generic.new(str) }
    @ff.each do |entry|
      wanted = pending.shift
      assert_equal(wanted, entry.seq)
    end
    assert(pending.empty?)
  end
  # naseq of each entry must equal a Bio::Sequence::NA built from the
  # matching element of SEQS; all expected sequences must be consumed.
  def test_naseq
    pending = SEQS.collect { |str| Bio::Sequence::NA.new(str) }
    @ff.each do |entry|
      wanted = pending.shift
      assert_equal(wanted, entry.naseq)
    end
    assert(pending.empty?)
  end
  # nalen of each entry must equal the length of a Bio::Sequence::NA built
  # from the matching element of SEQS; all lengths must be consumed.
  def test_nalen
    pending = SEQS.collect { |str| Bio::Sequence::NA.new(str).length }
    @ff.each do |entry|
      wanted = pending.shift
      assert_equal(wanted, entry.nalen)
    end
    assert(pending.empty?)
  end
  # quality_string of each entry must equal the matching element of
  # QUALITY_STRINGS, and all elements must be consumed.
  def test_quality_string
    pending = QUALITY_STRINGS.dup
    @ff.each { |entry| assert_equal(pending.shift, entry.quality_string) }
    assert(pending.empty?)
  end
  # quality_scores of each entry must equal the matching element of
  # QUALITY_SCORES, and all elements must be consumed.
  def test_quality_scores
    pending = QUALITY_SCORES.dup
    @ff.each { |entry| assert_equal(pending.shift, entry.quality_scores) }
    assert(pending.empty?)
  end
  # error_probabilities of each entry must match the corresponding element
  # of ERROR_PROBABILITIES within the tolerance applied by
  # float_array_equivalent?; all expected arrays must be consumed.
  def test_error_probabilities
    pending = ERROR_PROBABILITIES.dup
    @ff.each do |entry|
      wanted = pending.shift
      float_array_equivalent?(wanted, entry.error_probabilities)
    end
    assert(pending.empty?)
  end
  # Converts every entry with to_biosequence (which must not raise) and
  # compares sequence, definition, entry id, quality score type, quality
  # scores and error probabilities of the result with the expected values
  # at the same index.
  def test_to_biosequence
    @ff.each_with_index do |entry, idx|
      bioseq = nil
      assert_nothing_raised { bioseq = entry.to_biosequence }
      expected_seq = Bio::Sequence::Generic.new(SEQS[idx])
      assert_equal(expected_seq, bioseq.seq)
      assert_equal(IDLINES[idx], bioseq.definition)
      assert_equal(ENTRY_IDS[idx], bioseq.entry_id)
      assert_equal(:phred, bioseq.quality_score_type)
      assert_equal(QUALITY_SCORES[idx], bioseq.quality_scores)
      expected_probs = ERROR_PROBABILITIES[idx]
      float_array_equivalent?(expected_probs, bioseq.error_probabilities)
    end
  end
  # Round trip: every entry is converted with to_biosequence, written back
  # with output(:fastq_sanger) and must reproduce the raw text of the entry;
  # the text is then parsed again and compared field by field with the
  # original entry.
  def test_roundtrip
    # Plain each: the entry index was never used by this test.
    @ff.each do |e|
      # Raw text of the entry that was just read from the flat file.
      str_orig = @ff.entry_raw
      s = e.to_biosequence
      str = s.output(:fastq_sanger,
                     { :repeat_title => true, :width => 80 })
      assert_equal(str_orig, str)
      e2 = Bio::Fastq.new(str)
      assert_equal(e.sequence_string, e2.sequence_string)
      assert_equal(e.quality_string, e2.quality_string)
      assert_equal(e.definition, e2.definition)
      assert_equal(e.quality_scores, e2.quality_scores)
      float_array_equivalent?(e.error_probabilities,
                              e2.error_probabilities)
    end
  end
  361. end #class TestFastq_longreads_original_sanger
  362. # common methods to read *_full_range_as_*.fastq and test quality scores
  363. # and error probabilities
  364. module TestFastq_full_range
  365. include FloatArrayComparison
  366. private
  # Reads all Bio::Fastq entries of a fixture file and sets the given
  # format on each of them.
  #
  # fn::     file name, joined onto TestFastqDataDir
  # format:: value assigned to the format attribute of every entry
  # Returns the array of entries.
  def read_file(fn, format)
    path = File.join(TestFastqDataDir, fn)
    Bio::FlatFile.open(Bio::Fastq, path) { |ff| ff.to_a }.each do |entry|
      entry.format = format
    end
  end
  # Identity conversion: returns the scores of the given range unchanged,
  # as an array.
  def scores_through(range)
    range.entries
  end
  # Converts PHRED scores to Solexa scores.
  #
  # Each score q becomes round(10 * log10(10 ** (q / 10.0) - 1)), limited
  # to the interval -5..62. When 10 ** (q / 10.0) - 1 is not positive (so
  # no logarithm can be taken) the lower limit -5 is used.
  #
  # range:: collection of PHRED scores
  # Returns an array with one converted score per input score.
  def scores_phred2solexa(range)
    lower = -5
    upper = 62
    range.collect do |q|
      odds = 10 ** (q / 10.0) - 1
      next lower if odds <= 0
      solexa = (10 * Math.log10(odds)).round
      next lower if solexa < lower
      next upper if solexa > upper
      solexa
    end
  end
  # Limits PHRED scores to the interval 0..62: values below 0 become 0,
  # values above 62 become 62, everything else is kept.
  #
  # range:: collection of PHRED scores
  # Returns an array with one score per input score.
  def scores_phred2illumina(range)
    lower = 0
    upper = 62
    range.collect do |q|
      next lower if q < lower
      next upper if q > upper
      q
    end
  end
  # Limits PHRED scores to the interval 0..93: values below 0 become 0,
  # values above 93 become 93, everything else is kept.
  #
  # range:: collection of PHRED scores
  # Returns an array with one score per input score.
  def scores_phred2sanger(range)
    lower = 0
    upper = 93
    range.collect do |q|
      next lower if q < lower
      next upper if q > upper
      q
    end
  end
  # Converts Solexa scores to PHRED scores: each score q becomes
  # round(10 * log10(10 ** (q / 10.0) + 1)).
  #
  # range:: collection of Solexa scores
  # Returns an array with one converted score per input score.
  def scores_solexa2phred(range)
    range.collect do |q|
      (10 * Math.log10(10 ** (q / 10.0) + 1)).round
    end
  end
  # Converts Solexa scores to PHRED scores with scores_solexa2phred and
  # then limits them with scores_phred2sanger.
  def scores_solexa2sanger(range)
    phred = scores_solexa2phred(range)
    scores_phred2sanger(phred)
  end
  # Converts Solexa scores to PHRED scores with scores_solexa2phred and
  # then limits them with scores_phred2illumina.
  def scores_solexa2illumina(range)
    phred = scores_solexa2phred(range)
    scores_phred2illumina(phred)
  end
  # Reads the fixture and checks that its first entry carries the given
  # quality scores and its second entry carries them in reverse order.
  #
  # scores::   expected quality scores of the first entry
  # filename:: fixture file name, passed to read_file
  # format::   format set on the entries by read_file
  def common_test_quality_scores(scores, filename, format)
    forward, backward = read_file(filename, format)
    assert_equal(scores, forward.quality_scores)
    assert_equal(scores.reverse, backward.quality_scores)
  end
  # Reads the fixture and checks, using float_array_equivalent?, that its
  # first entry carries the given error probabilities and its second entry
  # carries them in reverse order.
  #
  # probabilities:: expected error probabilities of the first entry
  # filename::      fixture file name, passed to read_file
  # format::        format set on the entries by read_file
  def common_test_error_probabilities(probabilities, filename, format)
    forward, backward = read_file(filename, format)
    float_array_equivalent?(probabilities, forward.error_probabilities)
    float_array_equivalent?(probabilities.reverse,
                            backward.error_probabilities)
  end
  # Reads the fixture and checks that validate_format returns a true value
  # for its first two entries.
  #
  # filename:: fixture file name, passed to read_file
  # format::   format set on the entries by read_file
  def common_test_validate_format(filename, format)
    first, second = read_file(filename, format)
    assert(first.validate_format)
    assert(second.validate_format)
  end
  # Maps each score q to the probability 10 ** (-q / 10.0).
  #
  # scores:: collection of quality scores
  # Returns an array of probabilities.
  def phred_q2p(scores)
    scores.collect do |q|
      exponent = -q / 10.0
      10 ** exponent
    end
  end
  # Maps each score q to the probability t / (1.0 + t), where
  # t = 10 ** (-q / 10.0).
  #
  # scores:: collection of quality scores
  # Returns an array of probabilities.
  def solexa_q2p(scores)
    scores.collect do |q|
      ratio = 10 ** (-q / 10.0)
      ratio / (1.0 + ratio)
    end
  end
  463. public
  # Runs common_test_validate_format on the three fixture files named by
  # the including class, each with its matching format name.
  def test_validate_format
    klass = self.class
    [ [ klass::FILENAME_AS_SANGER,   'fastq-sanger'   ],
      [ klass::FILENAME_AS_SOLEXA,   'fastq-solexa'   ],
      [ klass::FILENAME_AS_ILLUMINA, 'fastq-illumina' ]
    ].each do |filename, format|
      common_test_validate_format(filename, format)
    end
  end
  # Quality scores read from FILENAME_AS_SANGER as 'fastq-sanger' must
  # equal RANGE converted with scores_to_sanger.
  def test_quality_scores_as_sanger
    klass = self.class
    expected = scores_to_sanger(klass::RANGE)
    common_test_quality_scores(expected, klass::FILENAME_AS_SANGER,
                               'fastq-sanger')
  end
  # Error probabilities read from FILENAME_AS_SANGER as 'fastq-sanger'
  # must equal phred_q2p applied to RANGE converted with scores_to_sanger.
  def test_error_probabilities_as_sanger
    klass = self.class
    expected = phred_q2p(scores_to_sanger(klass::RANGE))
    common_test_error_probabilities(expected, klass::FILENAME_AS_SANGER,
                                    'fastq-sanger')
  end
  # Quality scores read from FILENAME_AS_SOLEXA as 'fastq-solexa' must
  # equal RANGE converted with scores_to_solexa.
  def test_quality_scores_as_solexa
    klass = self.class
    expected = scores_to_solexa(klass::RANGE)
    common_test_quality_scores(expected, klass::FILENAME_AS_SOLEXA,
                               'fastq-solexa')
  end
  # Error probabilities read from FILENAME_AS_SOLEXA as 'fastq-solexa'
  # must equal solexa_q2p applied to RANGE converted with scores_to_solexa.
  def test_error_probabilities_as_solexa
    klass = self.class
    expected = solexa_q2p(scores_to_solexa(klass::RANGE))
    common_test_error_probabilities(expected, klass::FILENAME_AS_SOLEXA,
                                    'fastq-solexa')
  end
  # Quality scores read from FILENAME_AS_ILLUMINA as 'fastq-illumina' must
  # equal RANGE converted with scores_to_illumina.
  def test_quality_scores_as_illumina
    klass = self.class
    expected = scores_to_illumina(klass::RANGE)
    common_test_quality_scores(expected, klass::FILENAME_AS_ILLUMINA,
                               'fastq-illumina')
  end
  # Error probabilities read from FILENAME_AS_ILLUMINA as 'fastq-illumina'
  # must equal phred_q2p applied to RANGE converted with
  # scores_to_illumina.
  def test_error_probabilities_as_illumina
    klass = self.class
    expected = phred_q2p(scores_to_illumina(klass::RANGE))
    common_test_error_probabilities(expected, klass::FILENAME_AS_ILLUMINA,
                                    'fastq-illumina')
  end
  511. end #module TestFastq_full_range
  # Runs the TestFastq_full_range tests on the sanger_full_range_as_*
  # fixtures. The source scores are 0..93; they are used unchanged for the
  # sanger file and converted from PHRED for the solexa and illumina files.
  class TestFastq_sanger_full_range < Test::Unit::TestCase
    include TestFastq_full_range

    RANGE = 0..93

    FILENAME_AS_SANGER   = 'sanger_full_range_as_sanger.fastq'
    FILENAME_AS_SOLEXA   = 'sanger_full_range_as_solexa.fastq'
    FILENAME_AS_ILLUMINA = 'sanger_full_range_as_illumina.fastq'

    alias_method :scores_to_sanger,   :scores_through
    alias_method :scores_to_solexa,   :scores_phred2solexa
    alias_method :scores_to_illumina, :scores_phred2illumina
  end #class TestFastq_sanger_full_range
  # Runs the TestFastq_full_range tests on the solexa_full_range_as_*
  # fixtures. The source scores are -5..62; they are used unchanged for the
  # solexa file and converted from Solexa for the sanger and illumina files.
  class TestFastq_solexa_full_range < Test::Unit::TestCase
    include TestFastq_full_range

    RANGE = (-5)..62

    FILENAME_AS_SANGER   = 'solexa_full_range_as_sanger.fastq'
    FILENAME_AS_SOLEXA   = 'solexa_full_range_as_solexa.fastq'
    FILENAME_AS_ILLUMINA = 'solexa_full_range_as_illumina.fastq'

    alias_method :scores_to_sanger,   :scores_solexa2sanger
    alias_method :scores_to_solexa,   :scores_through
    alias_method :scores_to_illumina, :scores_solexa2illumina
  end #class TestFastq_solexa_full_range
  # Runs the TestFastq_full_range tests on the illumina_full_range_as_*
  # fixtures. The source scores are 0..62; they are used unchanged for the
  # illumina file and converted from PHRED for the sanger and solexa files.
  class TestFastq_illumina_full_range < Test::Unit::TestCase
    include TestFastq_full_range

    RANGE = 0..62

    FILENAME_AS_SANGER   = 'illumina_full_range_as_sanger.fastq'
    FILENAME_AS_SOLEXA   = 'illumina_full_range_as_solexa.fastq'
    FILENAME_AS_ILLUMINA = 'illumina_full_range_as_illumina.fastq'

    alias_method :scores_to_sanger,   :scores_phred2sanger
    alias_method :scores_to_solexa,   :scores_phred2solexa
    alias_method :scores_to_illumina, :scores_through
  end #class TestFastq_illumina_full_range
  542. # common methods for testing error_*.fastq
  # Shared behaviour of the tests that read the error_*.fastq fixtures.
  #
  # An including test case is expected to override these constants:
  # FILENAME::  fixture file name, joined onto TestFastqDataDir
  # PRE_SKIP::  number of entries read and discarded before the checked one
  # POST_SKIP:: number of entries read and discarded after the checked one
  # ERRORS::    error objects expected from validate_format, in order
  module TestFastq_error
    FILENAME = nil
    PRE_SKIP = 2
    POST_SKIP = 2
    ERRORS = []

    # Opens the fixture, skips PRE_SKIP entries, checks the next entry with
    # do_test_validate_format, skips POST_SKIP entries and finally expects
    # the flat file to be at end of file.
    def test_validate_format
      klass = self.class
      path = File.join(TestFastqDataDir, klass::FILENAME)
      Bio::FlatFile.open(Bio::Fastq, path) do |ff|
        klass::PRE_SKIP.times { ff.next_entry }
        do_test_validate_format(ff)
        klass::POST_SKIP.times { ff.next_entry }
        assert(ff.eof?)
      end
    end

    private

    # Reads one entry from ff and expects validate_format to return false
    # and to report exactly the errors listed in ERRORS: same number, and
    # for each position a compatible class and an equal message.
    def do_test_validate_format(ff)
      entry = ff.next_entry
      found = []
      assert_equal(false, entry.validate_format(found))
      wanted = self.class::ERRORS
      assert_equal(wanted.size, found.size)
      wanted.zip(found).each do |ex, obj|
        assert_kind_of(ex.class, obj)
        assert_equal(ex.message, obj.message)
      end
    end
  end #module TestFastq_error
  # error_diff_ids.fastq: after 2 skipped entries the next entry must fail
  # validation with a single Diff_ids error; 2 more entries follow.
  class TestFastq_error_diff_ids < Test::Unit::TestCase
    include TestFastq_error

    FILENAME  = 'error_diff_ids.fastq'
    PRE_SKIP  = 2
    POST_SKIP = 2
    ERRORS    = [ Bio::Fastq::Error::Diff_ids.new ]
  end #class TestFastq_error_diff_ids
  # error_double_qual.fastq: after 2 skipped entries the next entry must
  # fail validation with a single Long_qual error; 2 more entries follow.
  class TestFastq_error_double_qual < Test::Unit::TestCase
    include TestFastq_error

    FILENAME  = 'error_double_qual.fastq'
    PRE_SKIP  = 2
    POST_SKIP = 2
    ERRORS    = [ Bio::Fastq::Error::Long_qual.new ]
  end #class TestFastq_error_double_qual
  # error_double_seq.fastq: after 3 skipped entries the next entry must
  # fail validation with a single Long_qual error; it is the last entry.
  class TestFastq_error_double_seq < Test::Unit::TestCase
    include TestFastq_error

    FILENAME  = 'error_double_seq.fastq'
    PRE_SKIP  = 3
    POST_SKIP = 0
    ERRORS    = [ Bio::Fastq::Error::Long_qual.new ]
  end #class TestFastq_error_double_seq
  # error_long_qual.fastq: after 3 skipped entries the next entry must
  # fail validation with a single Long_qual error; 1 more entry follows.
  class TestFastq_error_long_qual < Test::Unit::TestCase
    include TestFastq_error

    FILENAME  = 'error_long_qual.fastq'
    PRE_SKIP  = 3
    POST_SKIP = 1
    ERRORS    = [ Bio::Fastq::Error::Long_qual.new ]
  end #class TestFastq_error_long_qual
  # error_no_qual.fastq: no entries are skipped; the file is expected to
  # yield three entries, each reporting exactly one validation error.
  class TestFastq_error_no_qual < Test::Unit::TestCase
    include TestFastq_error

    FILENAME  = 'error_no_qual.fastq'
    PRE_SKIP  = 0
    POST_SKIP = 0

    private

    # Reads three entries from ff: the first two must each report one
    # Long_qual error and the third one Short_qual error.
    def do_test_validate_format(ff)
      error = Bio::Fastq::Error
      [ error::Long_qual, error::Long_qual, error::Short_qual ].each do |klass|
        entry = ff.next_entry
        found = []
        entry.validate_format(found)
        assert_equal(1, found.size)
        assert_kind_of(klass, found[0])
      end
    end
  end #class TestFastq_error_no_qual
  # error_qual_del.fastq: after 3 skipped entries the next entry must fail
  # validation with a single Qual_char.new(12) error; 1 more entry follows.
  class TestFastq_error_qual_del < Test::Unit::TestCase
    include TestFastq_error

    FILENAME  = 'error_qual_del.fastq'
    PRE_SKIP  = 3
    POST_SKIP = 1
    ERRORS    = [ Bio::Fastq::Error::Qual_char.new(12) ]
  end #class TestFastq_error_qual_del
  # error_qual_escape.fastq: after 4 skipped entries the next entry must
  # fail validation with a single Qual_char.new(7) error; it is the last
  # entry.
  class TestFastq_error_qual_escape < Test::Unit::TestCase
    include TestFastq_error

    FILENAME  = 'error_qual_escape.fastq'
    PRE_SKIP  = 4
    POST_SKIP = 0
    ERRORS    = [ Bio::Fastq::Error::Qual_char.new(7) ]
  end #class TestFastq_error_qual_escape
  # error_qual_null.fastq: the first entry must fail validation with a
  # single Qual_char.new(3) error; 4 more entries follow.
  class TestFastq_error_qual_null < Test::Unit::TestCase
    include TestFastq_error

    FILENAME  = 'error_qual_null.fastq'
    PRE_SKIP  = 0
    POST_SKIP = 4
    ERRORS    = [ Bio::Fastq::Error::Qual_char.new(3) ]
  end #class TestFastq_error_qual_null
  # error_qual_space.fastq: after 3 skipped entries the next entry must
  # fail validation with a single Qual_char.new(18) error; 1 more entry
  # follows.
  class TestFastq_error_qual_space < Test::Unit::TestCase
    include TestFastq_error

    FILENAME  = 'error_qual_space.fastq'
    PRE_SKIP  = 3
    POST_SKIP = 1
    ERRORS    = [ Bio::Fastq::Error::Qual_char.new(18) ]
  end #class TestFastq_error_qual_space
  # error_qual_tab.fastq: after 4 skipped entries the next entry must fail
  # validation with a single Qual_char.new(10) error; it is the last entry.
  class TestFastq_error_qual_tab < Test::Unit::TestCase
    include TestFastq_error

    FILENAME  = 'error_qual_tab.fastq'
    PRE_SKIP  = 4
    POST_SKIP = 0
    ERRORS    = [ Bio::Fastq::Error::Qual_char.new(10) ]
  end #class TestFastq_error_qual_tab
  # error_qual_unit_sep.fastq: after 2 skipped entries the next entry must
  # fail validation with a single Qual_char.new(5) error; 2 more entries
  # follow.
  class TestFastq_error_qual_unit_sep < Test::Unit::TestCase
    include TestFastq_error

    FILENAME  = 'error_qual_unit_sep.fastq'
    PRE_SKIP  = 2
    POST_SKIP = 2
    ERRORS    = [ Bio::Fastq::Error::Qual_char.new(5) ]
  end #class TestFastq_error_qual_unit_sep
  # error_qual_vtab.fastq: the first entry must fail validation with a
  # single Qual_char.new(10) error; 4 more entries follow.
  class TestFastq_error_qual_vtab < Test::Unit::TestCase
    include TestFastq_error

    FILENAME  = 'error_qual_vtab.fastq'
    PRE_SKIP  = 0
    POST_SKIP = 4
    ERRORS    = [ Bio::Fastq::Error::Qual_char.new(10) ]
  end #class TestFastq_error_qual_vtab
  # error_short_qual.fastq: after 2 skipped entries the next entry must
  # fail validation with a single error. The expected error object is a
  # Long_qual instance (as listed in ERRORS); 1 more entry follows.
  class TestFastq_error_short_qual < Test::Unit::TestCase
    include TestFastq_error

    FILENAME  = 'error_short_qual.fastq'
    PRE_SKIP  = 2
    POST_SKIP = 1
    ERRORS    = [ Bio::Fastq::Error::Long_qual.new ]
  end #class TestFastq_error_short_qual
  # error_spaces.fastq: no entries are skipped; each of five entries is
  # expected to report the same four errors (two Seq_char followed by two
  # Qual_char, at positions 9 and 20).
  class TestFastq_error_spaces < Test::Unit::TestCase
    include TestFastq_error

    FILENAME  = 'error_spaces.fastq'
    PRE_SKIP  = 0
    POST_SKIP = 0
    ERRORS    = [ Bio::Fastq::Error::Seq_char.new(9),
                  Bio::Fastq::Error::Seq_char.new(20),
                  Bio::Fastq::Error::Qual_char.new(9),
                  Bio::Fastq::Error::Qual_char.new(20)
                ]

    private

    # Reads five entries from ff; each must report four errors whose
    # classes and messages match ERRORS position by position.
    def do_test_validate_format(ff)
      5.times do
        entry = ff.next_entry
        found = []
        entry.validate_format(found)
        assert_equal(4, found.size)
        self.class::ERRORS.zip(found).each do |ex, obj|
          assert_kind_of(ex.class, obj)
          assert_equal(ex.message, obj.message)
        end
      end
    end
  end #class TestFastq_error_spaces
  # Same checks as TestFastq_error_spaces, run on error_tabs.fastq.
  class TestFastq_error_tabs < TestFastq_error_spaces
    FILENAME = 'error_tabs.fastq'
  end #class TestFastq_error_tabs
  # error_trunc_at_plus.fastq: after 4 skipped entries the next entry must
  # fail validation with a single No_qual error; it is the last entry.
  class TestFastq_error_trunc_at_plus < Test::Unit::TestCase
    include TestFastq_error

    FILENAME  = 'error_trunc_at_plus.fastq'
    PRE_SKIP  = 4
    POST_SKIP = 0
    ERRORS    = [ Bio::Fastq::Error::No_qual.new ]
  end #class TestFastq_error_trunc_at_plus
  # Same checks as TestFastq_error_trunc_at_plus, run on
  # error_trunc_at_qual.fastq.
  class TestFastq_error_trunc_at_qual < TestFastq_error_trunc_at_plus
    FILENAME = 'error_trunc_at_qual.fastq'
  end #class TestFastq_error_trunc_at_qual
  # error_trunc_at_seq.fastq: after 4 skipped entries the next entry must
  # fail validation with a single No_qual error; it is the last entry.
  class TestFastq_error_trunc_at_seq < Test::Unit::TestCase
    include TestFastq_error

    FILENAME  = 'error_trunc_at_seq.fastq'
    PRE_SKIP  = 4
    POST_SKIP = 0
    ERRORS    = [ Bio::Fastq::Error::No_qual.new ]
  end #class TestFastq_error_trunc_at_seq
  723. # Unit tests for Bio::Fastq#mask.
  # Unit tests for Bio::Fastq#mask, run against the first entry of
  # 'wrapping_original_sanger.fastq'.
  class TestFastq_mask < Test::Unit::TestCase
    # Reads the first entry of the fixture and sets its format to
    # :fastq_sanger.
    def setup
      path = File.join(TestFastqDataDir, 'wrapping_original_sanger.fastq')
      @entry = Bio::FlatFile.open(Bio::Fastq, path) { |ff| ff.next_entry }
      @entry.format = :fastq_sanger
    end

    # mask(60) is expected to give a sequence of 135 'n' characters.
    def test_mask_60
      masked = @entry.mask(60).seq
      assert_equal('n' * 135, masked)
    end

    # mask(20) with the default mask character.
    def test_mask_20
      wanted = "GAAnTTnCAGGnCCACCTTTnnnnnGATAGAATAATGGAGAAnnTTAAAnGCTGTACATATACCAATGAACAATAAnTCAATACATAAAnnnGGAGAAGTnGGAACCGAAnGGnTTnGAnTTCAAnCCnTTnCGn"
      masked = @entry.mask(20).seq
      assert_equal(wanted, masked)
    end

    # mask(20, 'x'): 'x' is used as the mask string.
    def test_mask_20_with_x
      wanted = "GAAxTTxCAGGxCCACCTTTxxxxxGATAGAATAATGGAGAAxxTTAAAxGCTGTACATATACCAATGAACAATAAxTCAATACATAAAxxxGGAGAAGTxGGAACCGAAxGGxTTxGAxTTCAAxCCxTTxCGx"
      masked = @entry.mask(20, 'x').seq
      assert_equal(wanted, masked)
    end

    # mask(20, ''): an empty mask string.
    def test_mask_20_with_empty_string
      wanted = "GAATTCAGGCCACCTTTGATAGAATAATGGAGAATTAAAGCTGTACATATACCAATGAACAATAATCAATACATAAAGGAGAAGTGGAACCGAAGGTTGATTCAACCTTCG"
      masked = @entry.mask(20, '').seq
      assert_equal(wanted, masked)
    end

    # mask(20, '-*-'): a mask string longer than one character.
    def test_mask_20_with_longer_string
      wanted = "GAA-*-TT-*-CAGG-*-CCACCTTT-*--*--*--*--*-GATAGAATAATGGAGAA-*--*-TTAAA-*-GCTGTACATATACCAATGAACAATAA-*-TCAATACATAAA-*--*--*-GGAGAAGT-*-GGAACCGAA-*-GG-*-TT-*-GA-*-TTCAA-*-CC-*-TT-*-CG-*-"
      masked = @entry.mask(20, '-*-').seq
      assert_equal(wanted, masked)
    end
  end #class TestFastq_mask
  753. end #module TestFastq
  754. end #module Bio