/BlastProcess.cs
C# | 170 lines | 129 code | 14 blank | 27 comment | 12 complexity | c2809de4c2e9570055e058bc397c35c5 MD5 | raw file
- using System;
- using System.Collections.Generic;
- using System.Diagnostics;
- using System.IO;
- using System.Text.RegularExpressions;
- namespace MIST
- {
/// <summary>
/// Runs nucleotide BLAST (blastn) for one query against one subject database and exposes the parsed
/// tabular results. All of the work happens in the constructor; read <see cref="BlastOutputs"/> afterwards.
/// </summary>
internal class BlastProcess
{
    /// <summary>Number of tab-separated columns requested through the -outfmt argument passed to blastn.</summary>
    private const int ExpectedColumnCount = 14;

    /// <summary>Regex to find an error or exception in the process error output.</summary>
    private static readonly Regex ErrorRegex = new Regex(@"[Ee]rror|[Ee]xception", RegexOptions.Compiled);

    /// <summary>Standard error text captured from the most recent BLAST run.</summary>
    private string _error = "";

    /// <summary>Parsed BLAST output.</summary>
    private readonly List<BlastOutput> _blastOutputs = new List<BlastOutput>();

    /// <summary>Object to prevent cross thread interference.</summary>
    private readonly object _thisLock = new object();

    /// <summary>Parsed BLAST output.</summary>
    public List<BlastOutput> BlastOutputs
    {
        get { return _blastOutputs; }
    }

    /// <summary>
    /// Run nucleotide BLAST using a query string (standard input) and subject database in a specified working
    /// directory, then parse the results. The run is repeated while an error is detected, up to
    /// Misc.BlastErrorRetryLimit attempts in total.
    /// </summary>
    /// <param name="workingDir">Working directory for running BLAST.</param>
    /// <param name="query">Query multifasta string (standard input).</param>
    /// <param name="subject">Subject filename; spaces are replaced with underscores to form the database name.</param>
    /// <param name="testType">Test type used for determining how many results BLAST should return.</param>
    public BlastProcess(DirectoryInfo workingDir, string query, string subject, TestType testType)
    {
        // The output file gets a random name; nothing outside this constructor needs to find it.
        var outputFilename = Path.GetRandomFileName();

        // Run blastn with the formatted BLAST database as the subject; the query is supplied on standard input.
        var startInfo = new ProcessStartInfo(@"blastn",
            String.Format(
                @" -db ""{0}"" -outfmt ""6 qseqid sseqid pident length qstart qend sstart send qlen bitscore gaps sseq qseq mismatch"" {1} -out {2}",
                subject.Replace(' ', '_'),
                GetTaskArguments(testType),
                outputFilename))
        {
            WorkingDirectory = workingDir.FullName,
            RedirectStandardError = true,
            RedirectStandardInput = true,
            UseShellExecute = false,
            CreateNoWindow = true
        };
        var outputFilePath = Path.Combine(workingDir.FullName, outputFilename);

        bool errorCaught;
        int attempts = 0;
        do
        {
            _error = "";
            RunProcess(startInfo, query);

            // Check if an error was produced while running BLAST.
            if (ErrorRegex.IsMatch(_error))
            {
                Console.Error.WriteLine("BLAST error:{0}", _error);
                errorCaught = true;
            }
            else
            {
                errorCaught = ParseBlastOutput(outputFilePath);
                if (errorCaught)
                {
                    Console.Error.WriteLine(
                        "Error parsing BLAST output:\n File: '{0}'\n Subject Genome: '{1}'\n Query: '{2}'\n TestType: '{3}'",
                        outputFilePath,
                        subject,
                        query,
                        testType);
                }
            }

            // If an error was produced then rerun BLAST until it goes away or the retry limit is reached.
            attempts++;
        } while (errorCaught && attempts < Misc.BlastErrorRetryLimit);

        // The results are in memory now, so the temporary file is no longer needed. It is kept when the
        // last attempt failed because the message logged above points at it.
        if (!errorCaught)
        {
            TryDeleteFile(outputFilePath);
        }
    }

    /// <summary>Pick the extra blastn command line arguments for a test type.</summary>
    /// <param name="testType">Test type being run.</param>
    /// <returns>Additional arguments, or an empty string when the test type needs none.</returns>
    private static string GetTaskArguments(TestType testType)
    {
        switch (testType)
        {
            case TestType.OligoProbe:
            case TestType.SNP:
                return "-num_alignments 1 -task blastn";
            case TestType.PCR:
            case TestType.Repeat:
                return string.Format("-word_size {0}", Misc.BlastWordSize);
            default:
                // AmpliconProbe, Allelic and any other test type run with no extra arguments.
                return "";
        }
    }

    /// <summary>Delete a file on a best-effort basis; a failure to delete is ignored.</summary>
    /// <param name="path">Path of the file to remove.</param>
    private static void TryDeleteFile(string path)
    {
        try
        {
            File.Delete(path);
        }
        catch (IOException)
        {
            // File is still in use; leaving a temporary file behind is harmless.
        }
        catch (UnauthorizedAccessException)
        {
            // No permission to delete; leaving a temporary file behind is harmless.
        }
    }

    /// <summary>
    /// Run the process with the supplied start info, feed it the input on standard input and capture
    /// everything it writes to standard error into <see cref="_error"/>.
    /// </summary>
    /// <param name="startInfo">Start info; standard input and standard error must be redirected.</param>
    /// <param name="input">Text written to the standard input of the process.</param>
    private void RunProcess(ProcessStartInfo startInfo, string input)
    {
        lock (_thisLock)
        {
            var errorText = new System.Text.StringBuilder();
            Process p = null;
            try
            {
                // Keep trying until a new process has actually been started, releasing failed attempts.
                do
                {
                    if (p != null)
                    {
                        p.Dispose();
                    }
                    p = new Process { StartInfo = startInfo };
                    p.ErrorDataReceived += (sender, e) =>
                    {
                        // A null Data value marks the end of the stream.
                        if (e.Data != null)
                        {
                            errorText.AppendLine(e.Data);
                        }
                    };
                } while (!p.Start());

                // Drain standard error asynchronously. Reading it only after WaitForExit can deadlock
                // once the child fills the pipe buffer and blocks waiting for it to be read.
                p.BeginErrorReadLine();

                // Provide the query input, then close the stream so the process sees end of input.
                p.StandardInput.Write(input);
                p.StandardInput.Close();

                // The parameterless overload also waits for the asynchronous error handlers to finish.
                p.WaitForExit();
                _error = errorText.ToString();
            }
            finally
            {
                if (p != null)
                {
                    p.Dispose();
                }
            }
        }
    }

    /// <summary>Read the tab-separated BLAST output file into <see cref="_blastOutputs"/>, replacing earlier results.</summary>
    /// <param name="outputFilename">Path of the BLAST output file.</param>
    /// <returns>
    /// True when an error occurred: a line had too few columns, a line could not be parsed, or the file
    /// contained no result lines at all. False when at least one result was read and every line parsed.
    /// </returns>
    private bool ParseBlastOutput(string outputFilename)
    {
        _blastOutputs.Clear();
        bool blastOutputAdded = false;
        using (var reader = new StreamReader(outputFilename))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                // Skip blank lines.
                if (string.IsNullOrEmpty(line))
                {
                    continue;
                }

                // One column per field requested in the -outfmt argument.
                string[] columns = line.Split('\t');
                if (columns.Length < ExpectedColumnCount)
                {
                    Console.Error.WriteLine(
                        "BLAST result incomplete. Only {0} of {1} expected columns of data.",
                        columns.Length,
                        ExpectedColumnCount);
                    Console.Error.WriteLine(line);
                    return true;
                }

                // A non-null return value from ParseBlastResult is treated as a parse failure.
                var blastOutput = new BlastOutput();
                var parseError = blastOutput.ParseBlastResult(columns);
                if (parseError != null)
                {
                    Console.Error.WriteLine("BLAST result parsing error:\n{0}", line);
                    return true;
                }

                _blastOutputs.Add(blastOutput);
                blastOutputAdded = true;
            }
        }

        // No results at all is reported as an error.
        return !blastOutputAdded;
    }
}
- }