PageRenderTime 59ms CodeModel.GetById 31ms RepoModel.GetById 0ms app.codeStats 0ms

/nextgen/bcbio/solexa/samplesheet.py

https://github.com/kdaily/bcbb
Python | 108 lines | 81 code | 10 blank | 17 comment | 17 complexity | dc9df36e9383d9ee9a7eb36514ebcbfe MD5 | raw file
  1. """Converts Illumina SampleSheet CSV files to the run_info.yaml input file.
  2. This allows running the analysis pipeline without Galaxy, using CSV input
  3. files from Illumina SampleSheet or Genesifter.
  4. """
  5. import os
  6. import sys
  7. import csv
  8. import itertools
  9. import difflib
  10. import glob
  11. import yaml
  12. from bcbio.solexa.flowcell import (get_flowcell_info)
  13. from bcbio import utils
  14. def _organize_lanes(info_iter, barcode_ids):
  15. """Organize flat lane information into nested YAML structure.
  16. """
  17. all_lanes = []
  18. for (fcid, lane, sampleref), info in itertools.groupby(info_iter, lambda x: (x[0], x[1], x[1])):
  19. info = list(info)
  20. cur_lane = dict(flowcell_id=fcid, lane=lane, genome_build=info[0][3], analysis="Standard")
  21. if not _has_barcode(info):
  22. cur_lane["description"] = info[0][1]
  23. else: # barcoded sample
  24. cur_lane["description"] = "Barcoded lane %s" % lane
  25. multiplex = []
  26. for (_, _, sample_id, _, bc_seq) in info:
  27. bc_type, bc_id = barcode_ids[bc_seq]
  28. multiplex.append(dict(barcode_type=bc_type,
  29. barcode_id=bc_id,
  30. sequence=bc_seq,
  31. name=sample_id))
  32. cur_lane["multiplex"] = multiplex
  33. all_lanes.append(cur_lane)
  34. return all_lanes
  35. def _has_barcode(sample):
  36. if sample[0][4]:
  37. return True
  38. def _generate_barcode_ids(info_iter):
  39. """Create unique barcode IDs assigned to sequences
  40. """
  41. bc_type = "SampleSheet"
  42. barcodes = list(set([x[-1] for x in info_iter]))
  43. barcodes.sort()
  44. barcode_ids = {}
  45. for i, bc in enumerate(barcodes):
  46. barcode_ids[bc] = (bc_type, i+1)
  47. return barcode_ids
  48. def _read_input_csv(in_file):
  49. """Parse useful details from SampleSheet CSV file.
  50. """
  51. with open(in_file, "rU") as in_handle:
  52. reader = csv.reader(in_handle)
  53. reader.next() # header
  54. for line in reader:
  55. if line: # empty lines
  56. (fc_id, lane, sample_id, genome, barcode) = line[:5]
  57. yield fc_id, lane, sample_id, genome, barcode
  58. def _get_flowcell_id(in_file, require_single=True):
  59. """Retrieve the unique flowcell id represented in the SampleSheet.
  60. """
  61. fc_ids = set([x[0] for x in _read_input_csv(in_file)])
  62. if require_single and len(fc_ids) > 1:
  63. raise ValueError("There are several FCIDs in the same samplesheet file: %s" % in_file)
  64. else:
  65. return fc_ids
  66. def csv2yaml(in_file, out_file=None):
  67. """Convert a CSV SampleSheet to YAML run_info format.
  68. """
  69. if out_file is None:
  70. out_file = "%s.yaml" % os.path.splitext(in_file)[0]
  71. barcode_ids = _generate_barcode_ids(_read_input_csv(in_file))
  72. lanes = _organize_lanes(_read_input_csv(in_file), barcode_ids)
  73. with open(out_file, "w") as out_handle:
  74. out_handle.write(yaml.dump(lanes, default_flow_style=False))
  75. return out_file
  76. def run_has_samplesheet(fc_dir, config, require_single=True):
  77. """Checks if there's a suitable SampleSheet.csv present for the run
  78. """
  79. fc_name, _ = get_flowcell_info(fc_dir)
  80. sheet_dirs = config.get("samplesheet_directories", [])
  81. fcid_sheet = {}
  82. for ss_dir in (s for s in sheet_dirs if os.path.exists(s)):
  83. with utils.chdir(ss_dir):
  84. for ss in glob.glob("*.csv"):
  85. fc_ids = _get_flowcell_id(ss, require_single)
  86. for fcid in fc_ids:
  87. if fcid:
  88. fcid_sheet[fcid] = os.path.join(ss_dir, ss)
  89. # difflib handles human errors while entering data on the SampleSheet.
  90. # Only one best candidate is returned (if any). 0.85 cutoff allows for
  91. # maximum of 2 mismatches in fcid
  92. potential_fcids = difflib.get_close_matches(fc_name, fcid_sheet.keys(), 1, 0.85)
  93. if len(potential_fcids) > 0 and fcid_sheet.has_key(potential_fcids[0]):
  94. return fcid_sheet[potential_fcids[0]]
  95. else:
  96. return None