PageRenderTime 52ms CodeModel.GetById 25ms RepoModel.GetById 1ms app.codeStats 0ms

/InSilicoTyping.cs

https://bitbucket.org/peterk87/mist
C# | 336 lines | 261 code | 35 blank | 40 comment | 43 complexity | a7fd5c465c1eab4a62720ad9d03108e9 MD5 | raw file
  1. using System;
  2. using System.Collections.Generic;
  3. using System.IO;
  4. namespace MIST
  5. {
  6. [Serializable]
  7. public class InSilicoTyping
  8. {
  9. private readonly Dictionary<Marker, FileInfo> _markerTestInfoFile = new Dictionary<Marker, FileInfo>();
  10. private readonly Dictionary<Marker, string> _markerTestName = new Dictionary<Marker, string>();
  11. /// <summary>List of markers to perform in silico typing analysis with.</summary>
  12. private readonly List<Marker> _markers = new List<Marker>();
  13. private readonly Dictionary<string, List<Marker>> _testNameMarker = new Dictionary<string, List<Marker>>();
  14. private readonly Dictionary<string, FileInfo> _testNameTestInfoFile = new Dictionary<string, FileInfo>();
  15. /// <summary>List of allele multifasta file folders.</summary>
  16. private List<DirectoryInfo> _allelesFolders;
  17. /// <summary>Dictionary to link up genome multifasta file with ContigCollection object for that genome.</summary>
  18. private Dictionary<string, ContigCollection> _multifastaFileDict = new Dictionary<string, ContigCollection>();
  19. /// <summary>List of ContigCollection objects for all genomes to perform in silico typing analysis on.</summary>
  20. private List<ContigCollection> _multifastaFiles = new List<ContigCollection>();
  21. /// <summary>Temporary data directory.</summary>
  22. private DirectoryInfo _tempDir;
  23. /// <summary>List of typing test information files. Format may change in the future to JSON or XML. </summary>
  24. private List<FileInfo> _testInfoFiles;
  25. //Constructor
  26. /// <summary>Start a new in silico analysis with a list of test info files.</summary>
  27. /// <param name="testInfoFiles">Files containing in silico typing test information</param>
  28. /// <param name="allelesFolders">Directories in which to find alleles for various typing tests.</param>
  29. /// <param name="tempDir">Directory in which to save temporary data.</param>
  30. public InSilicoTyping(List<FileInfo> testInfoFiles, List<DirectoryInfo> allelesFolders, DirectoryInfo tempDir)
  31. {
  32. // cannot perform in silico typing without test information files
  33. if (testInfoFiles == null) return;
  34. _testInfoFiles = testInfoFiles;
  35. _allelesFolders = allelesFolders;
  36. _tempDir = tempDir;
  37. ReadTestInfoFiles();
  38. }
  39. public List<DirectoryInfo> AllelesFolders { get { return _allelesFolders; } set { _allelesFolders = value; } }
  40. public DirectoryInfo TempDir { get { return _tempDir; } set { _tempDir = value; } }
  41. public Dictionary<string, ContigCollection> MultifastaFileDict { get { return _multifastaFileDict; } set { _multifastaFileDict = value; } }
  42. public List<ContigCollection> MultifastaFiles { get { return _multifastaFiles; } set { _multifastaFiles = value; } }
  43. public List<FileInfo> TestInfoFiles { get { return _testInfoFiles; } set { _testInfoFiles = value; } }
  44. public List<ContigCollection> MultiFastaFiles { get { return _multifastaFiles; } }
  45. public Dictionary<string, FileInfo> TestNameTestInfoFile { get { return _testNameTestInfoFile; } }
  46. public List<Marker> Markers { get { return _markers; } }
  47. public Dictionary<Marker, string> MarkerTestName { get { return _markerTestName; } }
  48. public Dictionary<string, List<Marker>> TestNameMarker { get { return _testNameMarker; } }
  49. public Dictionary<Marker, FileInfo> MarkerTestInfoFile { get { return _markerTestInfoFile; } }
  50. public void AddGenomeFilesToAnalysis(List<string> inputGenomes)
  51. {
  52. foreach (string file in inputGenomes)
  53. {
  54. if (!_multifastaFileDict.ContainsKey(file))
  55. {
  56. _multifastaFileDict.Add(file, null);
  57. }
  58. }
  59. }
  60. /// <summary>Read a multifasta file containing the contigs for a genome.
  61. /// Add the multifasta contigcollection to the list of contigcollections.
  62. /// Get the marker matches associated with the genome and the selected tests.</summary>
  63. /// <param name="filename">Multifasta file path to read from.</param>
  64. public void ReadAndGetMarkerMatchesForGenome(string filename)
  65. {
  66. ContigCollection cc = _multifastaFileDict[filename];
  67. bool newFile = false;
  68. if (cc == null)
  69. {
  70. cc = new ContigCollection(this, filename);
  71. _multifastaFileDict[filename] = cc;
  72. newFile = true;
  73. }
  74. cc.GetMarkerMatchesSingleCore();
  75. if (newFile)
  76. _multifastaFiles.Add(cc);
  77. }
  78. /// <summary>Read typing test info from files.</summary>
  79. private void ReadTestInfoFiles()
  80. {
  81. foreach (FileInfo filename in _testInfoFiles)
  82. {
  83. ReadTestInfoFile(filename);
  84. }
  85. }
  86. /// <summary>Read typing test info from file.</summary>
  87. /// <param name="testFileInfo">Test info file.</param>
  88. private void ReadTestInfoFile(FileInfo testFileInfo)
  89. {
  90. if (testFileInfo.DirectoryName == null) return;
  91. try
  92. {
  93. using (var sr = new StreamReader(testFileInfo.FullName))
  94. {
  95. // All test types || binary/allelic|| binary || allelic only || repeat only
  96. //marker Name || Test Name || Test Type || Forward Primer || Reverse Primer || Amplicon Size || Range Factor || Allelic Database Filename || Repeat Size
  97. // 0 || 1 || 2 || 3 || 4 || 5 || 6 || 7 || 8
  98. sr.ReadLine(); //skip first line
  99. while (!sr.EndOfStream)
  100. {
  101. string line = sr.ReadLine();
  102. if (line == null) continue;
  103. string[] split = line.Split('\t');
  104. if (split.Length < 8)
  105. throw new ArgumentOutOfRangeException(
  106. string.Format(
  107. "Line '{0}' in test info file '{1}' does not contain 8 elements delimited by tabs. Contains {2} elements instead",
  108. line,
  109. testFileInfo.Name,
  110. split.Length));
  111. string alleleFilename = split[7];
  112. string alleleFullFilename = "";
  113. if (alleleFilename != "")
  114. {
  115. if (_allelesFolders.Count == 0)
  116. {
  117. DirectoryInfo dir = testFileInfo.Directory;
  118. if (dir == null)
  119. throw new DirectoryNotFoundException(string.Format("Alleles directory not found for test info file '{0}'", testFileInfo.Name));
  120. FileInfo[] searchResults = dir.GetFiles(alleleFilename, SearchOption.AllDirectories);
  121. if (searchResults.Length > 0)
  122. alleleFullFilename = searchResults[0].FullName;
  123. }
  124. else
  125. {
  126. // try to find allele multifasta file in one of the allele folders the user specified
  127. foreach (DirectoryInfo allelesFolder in _allelesFolders)
  128. {
  129. FileInfo[] searchResults = allelesFolder.GetFiles(alleleFilename, SearchOption.AllDirectories);
  130. if (searchResults.Length <= 0)
  131. continue;
  132. alleleFullFilename = searchResults[0].FullName;
  133. break;
  134. }
  135. }
  136. }
  137. if (alleleFullFilename == "" && alleleFilename != "")
  138. throw new FileNotFoundException(string.Format("Allele file, '{0}' not found for test info file '{1}'",
  139. alleleFilename,
  140. testFileInfo.Name));
  141. //variables for trying to parse numbers from strings
  142. int i, j;
  143. double d;
  144. string markerName = split[0];
  145. string testName = split[1];
  146. var testType = (TestType) (int.Parse(split[2]));
  147. string fprimer = split[3];
  148. string rprimer = split[4];
  149. int ampliconSize = int.TryParse(split[5], out i) ? i : -1;
  150. double ampliconRange = double.TryParse(split[6], out d) ? d : -1;
  151. int repeatSize = int.TryParse(split[8], out j) ? j : -1;
  152. var marker = new Marker(markerName, testName, testType, fprimer, rprimer, ampliconSize, ampliconRange, alleleFullFilename, repeatSize);
  153. _markerTestInfoFile.Add(marker, testFileInfo);
  154. _markerTestName.Add(marker, testName);
  155. if (!_testNameMarker.ContainsKey(testName))
  156. {
  157. _testNameMarker.Add(testName, new List<Marker> {marker});
  158. }
  159. else
  160. {
  161. _testNameMarker[testName].Add(marker);
  162. }
  163. if (!_testNameTestInfoFile.ContainsKey(testName))
  164. {
  165. _testNameTestInfoFile.Add(testName, testFileInfo);
  166. }
  167. else
  168. {
  169. if (_testNameTestInfoFile[testName] != testFileInfo)
  170. {
  171. throw new Exception(string.Format("Duplicate tests ({0}) found in two different test info files:\n{1}\n{2}",
  172. testName,
  173. _testNameTestInfoFile[testName].FullName,
  174. testFileInfo.FullName));
  175. }
  176. }
  177. _markers.Add(marker);
  178. }
  179. }
  180. }
  181. catch (Exception ex)
  182. {
  183. Console.Error.WriteLine(ex.Message);
  184. Console.Error.WriteLine(ex.StackTrace);
  185. }
  186. }
  187. /// <summary>Write the marker match results to a file.</summary>
  188. /// <param name="filename">Save filename.</param>
  189. /// <param name="verbose">Verbose or sparse results file.</param>
  190. /// <returns>Exception if error encountered.</returns>
  191. public void WriteResults(string filename, bool verbose)
  192. {
  193. var typingResults = new TypingResultsCollection(this, verbose);
  194. using (var sw = new StreamWriter(filename))
  195. {
  196. var headers = new List<string> {"Sample"}; //headers in the first line of the file
  197. //get all of the headers for each marker for each test including extra info each test
  198. foreach (var pair in typingResults.TestMarkerDict)
  199. {
  200. string testName = pair.Key;
  201. var markers = new List<Marker>(pair.Value);
  202. markers.Sort();
  203. foreach (Marker marker in markers)
  204. {
  205. headers.Add(marker.Name);
  206. }
  207. if (typingResults.TestMetadataFile[testName] == null)
  208. continue;
  209. List<string> extraInfoHeaders = typingResults.TestMetadataFile[testName].GetExtraInfoHeaders();
  210. foreach (string extraInfoHeader in extraInfoHeaders)
  211. {
  212. headers.Add(extraInfoHeader);
  213. }
  214. }
  215. sw.WriteLine(string.Join("\t", headers));
  216. //write each line of in silico typing data
  217. foreach (TypingResults typingResult in typingResults.Results)
  218. {
  219. ContigCollection cc = typingResult.ContigCollection;
  220. var line = new List<string> {cc.Name};
  221. foreach (var pair in typingResults.TestMarkerDict)
  222. {
  223. string testName = pair.Key;
  224. var markers = new List<Marker>(pair.Value);
  225. //sort the Marker by Name
  226. markers.Sort();
  227. var markerMatches = new List<MarkerMatch>();
  228. foreach (Marker marker in markers)
  229. {
  230. //for the current marker, get the marker match data
  231. MarkerMatch markerMatch = cc.MarkerMatchesDict[marker];
  232. markerMatches.Add(markerMatch);
  233. line.Add(GetMarkerMatchData(marker, markerMatch, verbose));
  234. }
  235. //check if there is extra info for the current test
  236. if (typingResults.TestMetadataFile[testName] == null)
  237. continue;
  238. //get the extra info
  239. List<string> list = typingResults.TestMetadataFile[testName].GetExtraInfo(markerMatches, verbose);
  240. foreach (string extraInfo in list)
  241. {
  242. //add it to the line
  243. line.Add(extraInfo);
  244. }
  245. }
  246. sw.WriteLine(string.Join("\t", line));
  247. }
  248. }
  249. }
  250. /// <summary>Get the marker match data (verbose or sparse) for a particular marker match.</summary>
  251. /// <param name="marker">marker.</param>
  252. /// <param name="mm">marker match.</param>
  253. /// <param name="verbose">Verbose or sparse data to be returned.</param>
  254. /// <returns>marker match data string.</returns>
  255. private static string GetMarkerMatchData(Marker marker, MarkerMatch mm, bool verbose)
  256. {
  257. switch (marker.TypingTest)
  258. {
  259. case TestType.AmpliconProbe:
  260. case TestType.OligoProbe:
  261. case TestType.PCR:
  262. if (mm != null)
  263. {
  264. return mm.MarkerCall == "Present" ? "1" : "0";
  265. }
  266. return ("0");
  267. case TestType.Allelic:
  268. if (mm != null)
  269. {
  270. if (verbose)
  271. return (mm.CorrectMarkerMatch
  272. ? mm.MarkerCall
  273. : mm.AlleleMatch + "; " + mm.Mismatches + " mismatches");
  274. return mm.CorrectMarkerMatch ? mm.MarkerCall : "";
  275. }
  276. return "";
  277. case TestType.Repeat:
  278. if (mm != null)
  279. {
  280. return mm.MarkerCall;
  281. }
  282. return "";
  283. case TestType.SNP:
  284. if (mm != null)
  285. {
  286. return mm.MarkerCall;
  287. }
  288. return "";
  289. }
  290. return null;
  291. }
  292. #region ExtraInfo
  293. private List<ExtraTestInfo> _extraInfo = new List<ExtraTestInfo>();
  294. public List<ExtraTestInfo> ExtraInfo { get { return _extraInfo; } set { _extraInfo = value; } }
  295. public void AddExtraInfo(ExtraTestInfo extraInfo)
  296. {
  297. _extraInfo.Add(extraInfo);
  298. }
  299. #endregion
  300. }
  301. }