/InSilicoTyping.cs
C# | 336 lines | 261 code | 35 blank | 40 comment | 43 complexity | a7fd5c465c1eab4a62720ad9d03108e9 MD5 | raw file
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
namespace MIST
{
    /// <summary>
    /// Holds the state of one in silico typing analysis: the markers read from the typing test
    /// info files, the genome multifasta files registered for analysis, and the lookups that tie
    /// markers, test names and test info files together.
    /// </summary>
    [Serializable]
    public class InSilicoTyping
    {
        /// <summary>Minimum number of tab-delimited columns a test info line must contain (columns 0-7).</summary>
        private const int MinimumColumnCount = 8;

        /// <summary>Index of the optional Repeat Size column in a test info line.</summary>
        private const int RepeatSizeColumnIndex = 8;

        /// <summary>Test info file that each marker was read from.</summary>
        private readonly Dictionary<Marker, FileInfo> _markerTestInfoFile = new Dictionary<Marker, FileInfo>();

        /// <summary>Test name that each marker belongs to.</summary>
        private readonly Dictionary<Marker, string> _markerTestName = new Dictionary<Marker, string>();

        /// <summary>List of markers to perform in silico typing analysis with.</summary>
        private readonly List<Marker> _markers = new List<Marker>();

        /// <summary>Markers grouped by the name of the test they belong to.</summary>
        private readonly Dictionary<string, List<Marker>> _testNameMarker = new Dictionary<string, List<Marker>>();

        /// <summary>Test info file in which each test name was first seen.</summary>
        private readonly Dictionary<string, FileInfo> _testNameTestInfoFile = new Dictionary<string, FileInfo>();

        /// <summary>List of allele multifasta file folders.</summary>
        private List<DirectoryInfo> _allelesFolders;

        /// <summary>Dictionary to link up genome multifasta file with ContigCollection object for that genome.</summary>
        private Dictionary<string, ContigCollection> _multifastaFileDict = new Dictionary<string, ContigCollection>();

        /// <summary>List of ContigCollection objects for all genomes to perform in silico typing analysis on.</summary>
        private List<ContigCollection> _multifastaFiles = new List<ContigCollection>();

        /// <summary>Temporary data directory.</summary>
        private DirectoryInfo _tempDir;

        /// <summary>List of typing test information files. Format may change in the future to JSON or XML.</summary>
        private List<FileInfo> _testInfoFiles;

        /// <summary>Start a new in silico analysis with a list of test info files.</summary>
        /// <remarks>
        /// When <paramref name="testInfoFiles"/> is null the constructor returns immediately and
        /// none of the arguments are stored.
        /// </remarks>
        /// <param name="testInfoFiles">Files containing in silico typing test information.</param>
        /// <param name="allelesFolders">Directories in which to find alleles for various typing tests.
        /// Null or empty means allele files are searched for under each test info file's own directory.</param>
        /// <param name="tempDir">Directory in which to save temporary data.</param>
        public InSilicoTyping(List<FileInfo> testInfoFiles, List<DirectoryInfo> allelesFolders, DirectoryInfo tempDir)
        {
            // cannot perform in silico typing without test information files
            if (testInfoFiles == null)
            {
                return;
            }

            _testInfoFiles = testInfoFiles;
            _allelesFolders = allelesFolders;
            _tempDir = tempDir;
            ReadTestInfoFiles();
        }

        /// <summary>Folders searched for allele multifasta files.</summary>
        public List<DirectoryInfo> AllelesFolders { get { return _allelesFolders; } set { _allelesFolders = value; } }

        /// <summary>Temporary data directory.</summary>
        public DirectoryInfo TempDir { get { return _tempDir; } set { _tempDir = value; } }

        /// <summary>Genome multifasta file path mapped to its ContigCollection (null until the genome has been read).</summary>
        public Dictionary<string, ContigCollection> MultifastaFileDict { get { return _multifastaFileDict; } set { _multifastaFileDict = value; } }

        /// <summary>ContigCollection objects of the genomes added by <see cref="ReadAndGetMarkerMatchesForGenome"/>.</summary>
        public List<ContigCollection> MultifastaFiles { get { return _multifastaFiles; } set { _multifastaFiles = value; } }

        /// <summary>Typing test information files supplied to the constructor.</summary>
        public List<FileInfo> TestInfoFiles { get { return _testInfoFiles; } set { _testInfoFiles = value; } }

        /// <summary>Read-only alias of <see cref="MultifastaFiles"/>; both expose the same list.</summary>
        public List<ContigCollection> MultiFastaFiles { get { return _multifastaFiles; } }

        /// <summary>Test info file in which each test name was first seen.</summary>
        public Dictionary<string, FileInfo> TestNameTestInfoFile { get { return _testNameTestInfoFile; } }

        /// <summary>All markers read from the test info files, in the order they were read.</summary>
        public List<Marker> Markers { get { return _markers; } }

        /// <summary>Test name that each marker belongs to.</summary>
        public Dictionary<Marker, string> MarkerTestName { get { return _markerTestName; } }

        /// <summary>Markers grouped by test name.</summary>
        public Dictionary<string, List<Marker>> TestNameMarker { get { return _testNameMarker; } }

        /// <summary>Test info file that each marker was read from.</summary>
        public Dictionary<Marker, FileInfo> MarkerTestInfoFile { get { return _markerTestInfoFile; } }

        /// <summary>Register genome multifasta files for analysis without reading them yet.</summary>
        /// <param name="inputGenomes">Genome multifasta file paths. Paths that are already registered are left untouched.</param>
        /// <exception cref="ArgumentNullException"><paramref name="inputGenomes"/> is null.</exception>
        public void AddGenomeFilesToAnalysis(List<string> inputGenomes)
        {
            if (inputGenomes == null)
            {
                throw new ArgumentNullException("inputGenomes");
            }

            foreach (string file in inputGenomes)
            {
                if (!_multifastaFileDict.ContainsKey(file))
                {
                    // null marks "registered but not read yet"
                    _multifastaFileDict.Add(file, null);
                }
            }
        }

        /// <summary>Read a multifasta file containing the contigs for a genome.
        /// Add the multifasta contigcollection to the list of contigcollections.
        /// Get the marker matches associated with the genome and the selected tests.</summary>
        /// <remarks>
        /// A file that was not registered through <see cref="AddGenomeFilesToAnalysis"/> is registered on demand.
        /// A genome that already has a ContigCollection is re-analysed but not added to the list again.
        /// </remarks>
        /// <param name="filename">Multifasta file path to read from.</param>
        public void ReadAndGetMarkerMatchesForGenome(string filename)
        {
            ContigCollection cc;
            // TryGetValue leaves cc null for an unknown file, which is handled like a registered-but-unread one.
            _multifastaFileDict.TryGetValue(filename, out cc);

            bool newFile = cc == null;
            if (newFile)
            {
                cc = new ContigCollection(this, filename);
                _multifastaFileDict[filename] = cc;
            }

            cc.GetMarkerMatchesSingleCore();

            // Only list the genome once its marker matches have been computed without an exception.
            if (newFile)
            {
                _multifastaFiles.Add(cc);
            }
        }

        /// <summary>Read typing test info from files.</summary>
        private void ReadTestInfoFiles()
        {
            foreach (FileInfo testInfoFile in _testInfoFiles)
            {
                ReadTestInfoFile(testInfoFile);
            }
        }

        /// <summary>Read typing test info from file.</summary>
        /// <remarks>
        /// The file is tab-delimited with one header line that is skipped. Column layout:
        /// 0 Marker Name, 1 Test Name, 2 Test Type (all test types); 3 Forward Primer, 4 Reverse Primer
        /// (binary/allelic); 5 Amplicon Size, 6 Range Factor (binary); 7 Allelic Database Filename
        /// (allelic only); 8 Repeat Size (repeat only, optional).
        /// Reading is best-effort: any exception is written to standard error and the remainder of the
        /// file is skipped, while markers registered before the failure are kept.
        /// </remarks>
        /// <param name="testFileInfo">Test info file.</param>
        private void ReadTestInfoFile(FileInfo testFileInfo)
        {
            if (testFileInfo == null || testFileInfo.DirectoryName == null)
            {
                return;
            }

            try
            {
                using (var reader = new StreamReader(testFileInfo.FullName))
                {
                    reader.ReadLine(); //skip first line

                    string line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        // An empty line carries no marker; skipping it avoids aborting the rest of the file.
                        if (line.Length == 0)
                        {
                            continue;
                        }

                        AddMarkerFromLine(line, testFileInfo);
                    }
                }
            }
            catch (Exception ex)
            {
                Console.Error.WriteLine(ex.Message);
                Console.Error.WriteLine(ex.StackTrace);
            }
        }

        /// <summary>Parse one data line of a test info file into a marker and register it in all lookups.</summary>
        /// <param name="line">Tab-delimited line from the test info file.</param>
        /// <param name="testFileInfo">Test info file the line was read from.</param>
        /// <exception cref="FormatException">The line has fewer than 8 columns, or the Test Type column is not an integer.</exception>
        /// <exception cref="InvalidOperationException">The test name was already read from a different test info file.</exception>
        private void AddMarkerFromLine(string line, FileInfo testFileInfo)
        {
            string[] columns = line.Split('\t');
            if (columns.Length < MinimumColumnCount)
            {
                // FormatException(message): ArgumentOutOfRangeException(string) would treat the text as a parameter name.
                throw new FormatException(
                    string.Format(
                        "Line '{0}' in test info file '{1}' does not contain 8 elements delimited by tabs. Contains {2} elements instead",
                        line,
                        testFileInfo.Name,
                        columns.Length));
            }

            string alleleFullFilename = FindAlleleFile(columns[7], testFileInfo);

            string markerName = columns[0];
            string testName = columns[1];
            // The data file is machine-readable, so numbers are parsed culture-invariantly.
            var testType = (TestType) int.Parse(columns[2], NumberStyles.Integer, CultureInfo.InvariantCulture);
            string fprimer = columns[3];
            string rprimer = columns[4];

            // Numeric columns that are empty or unparsable fall back to -1.
            int parsedAmpliconSize;
            int ampliconSize = int.TryParse(columns[5], NumberStyles.Integer, CultureInfo.InvariantCulture, out parsedAmpliconSize)
                ? parsedAmpliconSize
                : -1;

            double parsedAmpliconRange;
            double ampliconRange = double.TryParse(columns[6],
                                                   NumberStyles.Float | NumberStyles.AllowThousands,
                                                   CultureInfo.InvariantCulture,
                                                   out parsedAmpliconRange)
                ? parsedAmpliconRange
                : -1;

            // The Repeat Size column is optional; only 8 columns are guaranteed at this point.
            int repeatSize = -1;
            int parsedRepeatSize;
            if (columns.Length > RepeatSizeColumnIndex &&
                int.TryParse(columns[RepeatSizeColumnIndex], NumberStyles.Integer, CultureInfo.InvariantCulture, out parsedRepeatSize))
            {
                repeatSize = parsedRepeatSize;
            }

            // Reject a test name that belongs to another file before touching any lookup,
            // so a failure cannot leave the marker half-registered.
            FileInfo registeredFile;
            bool testAlreadyRegistered = _testNameTestInfoFile.TryGetValue(testName, out registeredFile);
            // Reference comparison: the same FileInfo instance is used for every line of one file.
            if (testAlreadyRegistered && registeredFile != testFileInfo)
            {
                throw new InvalidOperationException(string.Format("Duplicate tests ({0}) found in two different test info files:\n{1}\n{2}",
                                                                  testName,
                                                                  registeredFile.FullName,
                                                                  testFileInfo.FullName));
            }

            var marker = new Marker(markerName, testName, testType, fprimer, rprimer, ampliconSize, ampliconRange, alleleFullFilename, repeatSize);

            _markerTestInfoFile.Add(marker, testFileInfo);
            _markerTestName.Add(marker, testName);

            List<Marker> testMarkers;
            if (_testNameMarker.TryGetValue(testName, out testMarkers))
            {
                testMarkers.Add(marker);
            }
            else
            {
                _testNameMarker.Add(testName, new List<Marker> {marker});
            }

            if (!testAlreadyRegistered)
            {
                _testNameTestInfoFile.Add(testName, testFileInfo);
            }

            _markers.Add(marker);
        }

        /// <summary>Locate the allele multifasta file named in a test info line.</summary>
        /// <remarks>
        /// When no alleles folders were specified, the directory of the test info file is searched
        /// recursively. Otherwise the specified folders are searched recursively in order and the
        /// first hit wins.
        /// </remarks>
        /// <param name="alleleFilename">Allele file name from the test info line; may be empty.</param>
        /// <param name="testFileInfo">Test info file the name was read from.</param>
        /// <returns>Full path of the allele file, or an empty string when <paramref name="alleleFilename"/> is empty.</returns>
        /// <exception cref="DirectoryNotFoundException">The test info file has no directory to search in.</exception>
        /// <exception cref="FileNotFoundException">The allele file was not found in any searched folder.</exception>
        private string FindAlleleFile(string alleleFilename, FileInfo testFileInfo)
        {
            if (alleleFilename == "")
            {
                return "";
            }

            if (_allelesFolders == null || _allelesFolders.Count == 0)
            {
                DirectoryInfo dir = testFileInfo.Directory;
                if (dir == null)
                {
                    throw new DirectoryNotFoundException(string.Format("Alleles directory not found for test info file '{0}'", testFileInfo.Name));
                }

                FileInfo[] searchResults = dir.GetFiles(alleleFilename, SearchOption.AllDirectories);
                if (searchResults.Length > 0)
                {
                    return searchResults[0].FullName;
                }
            }
            else
            {
                // try to find allele multifasta file in one of the allele folders the user specified
                foreach (DirectoryInfo allelesFolder in _allelesFolders)
                {
                    FileInfo[] searchResults = allelesFolder.GetFiles(alleleFilename, SearchOption.AllDirectories);
                    if (searchResults.Length > 0)
                    {
                        return searchResults[0].FullName;
                    }
                }
            }

            throw new FileNotFoundException(string.Format("Allele file, '{0}' not found for test info file '{1}'",
                                                          alleleFilename,
                                                          testFileInfo.Name));
        }

        /// <summary>Write the marker match results to a file.</summary>
        /// <remarks>
        /// The output is tab-delimited: a header line ("Sample", then for every test its marker names
        /// in sorted order followed by that test's extra info headers, if any) and one line per typing result.
        /// </remarks>
        /// <param name="filename">Save filename.</param>
        /// <param name="verbose">Verbose or sparse results file.</param>
        public void WriteResults(string filename, bool verbose)
        {
            var typingResults = new TypingResultsCollection(this, verbose);
            using (var writer = new StreamWriter(filename))
            {
                // Sort the markers of every test once; the same order is used for the header and every data line.
                var sortedTests = new List<KeyValuePair<string, List<Marker>>>();
                foreach (var pair in typingResults.TestMarkerDict)
                {
                    var markers = new List<Marker>(pair.Value);
                    markers.Sort();
                    sortedTests.Add(new KeyValuePair<string, List<Marker>>(pair.Key, markers));
                }

                var headers = new List<string> {"Sample"}; //headers in the first line of the file
                //get all of the headers for each marker for each test including extra info each test
                foreach (var test in sortedTests)
                {
                    foreach (Marker marker in test.Value)
                    {
                        headers.Add(marker.Name);
                    }

                    var metadata = typingResults.TestMetadataFile[test.Key];
                    if (metadata == null)
                    {
                        continue;
                    }

                    List<string> extraInfoHeaders = metadata.GetExtraInfoHeaders();
                    foreach (string extraInfoHeader in extraInfoHeaders)
                    {
                        headers.Add(extraInfoHeader);
                    }
                }
                writer.WriteLine(string.Join("\t", headers));

                //write each line of in silico typing data
                foreach (TypingResults typingResult in typingResults.Results)
                {
                    ContigCollection cc = typingResult.ContigCollection;
                    var line = new List<string> {cc.Name};
                    foreach (var test in sortedTests)
                    {
                        var markerMatches = new List<MarkerMatch>();
                        foreach (Marker marker in test.Value)
                        {
                            //for the current marker, get the marker match data
                            MarkerMatch markerMatch = cc.MarkerMatchesDict[marker];
                            markerMatches.Add(markerMatch);
                            line.Add(GetMarkerMatchData(marker, markerMatch, verbose));
                        }

                        //check if there is extra info for the current test
                        var metadata = typingResults.TestMetadataFile[test.Key];
                        if (metadata == null)
                        {
                            continue;
                        }

                        //get the extra info and add it to the line
                        List<string> extraInfoValues = metadata.GetExtraInfo(markerMatches, verbose);
                        foreach (string extraInfo in extraInfoValues)
                        {
                            line.Add(extraInfo);
                        }
                    }
                    writer.WriteLine(string.Join("\t", line));
                }
            }
        }

        /// <summary>Get the marker match data (verbose or sparse) for a particular marker match.</summary>
        /// <param name="marker">marker.</param>
        /// <param name="mm">marker match; may be null.</param>
        /// <param name="verbose">Verbose or sparse data to be returned. Only affects allelic markers without a correct match.</param>
        /// <returns>
        /// Probe/PCR markers: "1" when the marker call is "Present", otherwise "0".
        /// Allelic markers: the marker call for a correct match; otherwise the allele match and mismatch
        /// count when verbose, or an empty string when sparse.
        /// Repeat and SNP markers: the marker call.
        /// A null marker match gives "0" for probe/PCR markers and an empty string for the other listed types.
        /// Null for any other test type.
        /// </returns>
        private static string GetMarkerMatchData(Marker marker, MarkerMatch mm, bool verbose)
        {
            switch (marker.TypingTest)
            {
                case TestType.AmpliconProbe:
                case TestType.OligoProbe:
                case TestType.PCR:
                    return mm != null && mm.MarkerCall == "Present" ? "1" : "0";

                case TestType.Allelic:
                    if (mm == null)
                    {
                        return "";
                    }
                    if (mm.CorrectMarkerMatch)
                    {
                        return mm.MarkerCall;
                    }
                    return verbose
                        ? mm.AlleleMatch + "; " + mm.Mismatches + " mismatches"
                        : "";

                case TestType.Repeat:
                case TestType.SNP:
                    return mm != null ? mm.MarkerCall : "";

                default:
                    return null;
            }
        }

        #region ExtraInfo

        /// <summary>Extra test info objects attached to this analysis.</summary>
        private List<ExtraTestInfo> _extraInfo = new List<ExtraTestInfo>();

        /// <summary>Extra test info objects attached to this analysis.</summary>
        public List<ExtraTestInfo> ExtraInfo { get { return _extraInfo; } set { _extraInfo = value; } }

        /// <summary>Append an extra test info object to <see cref="ExtraInfo"/>.</summary>
        /// <param name="extraInfo">Extra test info to add.</param>
        public void AddExtraInfo(ExtraTestInfo extraInfo)
        {
            _extraInfo.Add(extraInfo);
        }

        #endregion
    }
}