/Misc.cs
C# | 315 lines | 258 code | 26 blank | 31 comment | 15 complexity | 91c914f5ef717a806ceea7cfad3b76e9 MD5 | raw file
- using System;
- using System.Collections.Generic;
- using System.Diagnostics;
- using System.IO;
- using System.Text.RegularExpressions;
- namespace MIST
- {
- public static class Misc
- {
/// <summary>Lock object; <see cref="GetAmplicon"/> holds it for the whole of its body.</summary>
private static readonly object ThisLock = new object();

/// <summary>Pre-compiled pattern matching one or more consecutive digits.</summary>
public static readonly Regex NumberRegex = new Regex(@"\d+", RegexOptions.Compiled);

/// <summary>
/// BLAST word size. The earlier note on this member cites 11 as the default, but no
/// initializer is set in this class, so the property reads 0 until a caller assigns it.
/// </summary>
public static int BlastWordSize { get; set; }

/// <summary>Temporary directory setting; null until assigned. Not read inside this class.</summary>
public static string TempDir { get; set; }

/// <summary>Core-count setting; presumably a parallelism limit (confirm against callers). Not read inside this class.</summary>
public static int Cores { get; set; }

/// <summary>Retry-limit setting; presumably bounds BLAST re-runs after errors (confirm against callers). Not read inside this class.</summary>
public static int BlastErrorRetryLimit { get; set; }

/// <summary>Alignment length coverage setting; units and valid range are not established in this class.</summary>
public static double AlignmentLengthCoverage { get; set; }
/// <summary>Report whether a sequence contains any character other than upper-case A, C, G or T.</summary>
/// <param name="sequence">Sequence to inspect. The check is case-sensitive, so lower-case bases count as degenerate.</param>
/// <returns>True as soon as a character outside A/C/G/T is met; false otherwise, including for an empty string.</returns>
public static bool IsDegenSequence(string sequence)
{
    for (int position = 0; position < sequence.Length; position++)
    {
        char nucleotide = sequence[position];
        bool isPlainBase = nucleotide == 'A'
                           || nucleotide == 'C'
                           || nucleotide == 'G'
                           || nucleotide == 'T';
        if (!isPlainBase)
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Recursively expand a degenerate nucleotide sequence (IUPAC codes) into every concrete
/// A/C/G/T sequence it can stand for, appending each result to <paramref name="list"/>.
/// </summary>
/// <param name="array">Scratch buffer that receives the base chosen for each position; every
/// result is built from the whole buffer, so it is expected to be as long as <paramref name="seq"/>.</param>
/// <param name="seq">Degenerate nucleotide sequence (upper-case codes).</param>
/// <param name="index">Position in <paramref name="seq"/> to expand next; pass 0 to start.</param>
/// <param name="list">Collector for the expanded sequences.</param>
/// <remarks>
/// A character that is not one of the handled upper-case codes has no candidate bases, so the
/// branch that reaches it contributes nothing to <paramref name="list"/>.
/// </remarks>
public static void ExpandDegenSequence(char[] array, string seq, int index, List<string> list)
{
    if (index == seq.Length)
    {
        // Every position has been fixed: snapshot the buffer as one finished sequence.
        list.Add(new string(array));
        return;
    }

    // Candidate bases for the current symbol, in the order they are tried.
    string candidates;
    switch (seq[index])
    {
        case 'A': candidates = "A"; break;
        case 'T': candidates = "T"; break;
        case 'G': candidates = "G"; break;
        case 'C': candidates = "C"; break;
        case 'R': candidates = "AG"; break;   // A or G
        case 'Y': candidates = "CT"; break;   // C or T
        case 'M': candidates = "AC"; break;   // A or C
        case 'S': candidates = "GC"; break;   // G or C
        case 'W': candidates = "AT"; break;   // A or T
        case 'K': candidates = "GT"; break;   // G or T
        case 'V': candidates = "ACG"; break;  // not T
        case 'D': candidates = "AGT"; break;  // not C
        case 'H': candidates = "ACT"; break;  // not G
        case 'B': candidates = "CGT"; break;  // not A
        case 'N': candidates = "ACGT"; break; // any base
        default: candidates = ""; break;      // unrecognised symbol: nothing to try
    }

    for (int choice = 0; choice < candidates.Length; choice++)
    {
        array[index] = candidates[choice];
        ExpandDegenSequence(array, seq, index + 1, list);
    }
}
/// <summary>
/// Count the positions at which two strings hold different characters. Only the overlapping
/// prefix is compared: when the lengths differ, the extra tail of the longer string is ignored.
/// </summary>
/// <param name="s1">First string.</param>
/// <param name="s2">Second string.</param>
/// <returns>Number of mismatching positions within the first min(length) characters.</returns>
public static int GetDifferences(string s1, string s2)
{
    int comparedLength = Math.Min(s1.Length, s2.Length);
    int mismatches = 0;
    int position = 0;
    while (position < comparedLength)
    {
        if (s1[position] != s2[position])
        {
            mismatches++;
        }
        position++;
    }

    return mismatches;
}
/// <summary>
/// Read a multifasta file and write a copy into <paramref name="tmpDir"/> in which each record's
/// header line is ">" followed by the contig's Index value. The output keeps the input file
/// name with spaces replaced by underscores.
/// </summary>
/// <param name="multifasta">Multifasta file to read.</param>
/// <param name="tmpDir">Directory that receives the rewritten file.</param>
/// <param name="fastaEntries">Receives the number of contigs read; this count includes
/// contigs with an empty sequence even though those are not written out.</param>
/// <returns>A <see cref="FileInfo"/> for the file that was written.</returns>
public static FileInfo WriteTempMultifastaFile(FileInfo multifasta, DirectoryInfo tmpDir, out int fastaEntries)
{
    var collection = new ContigCollection(multifasta.FullName);
    collection.Read();
    fastaEntries = collection.Contigs.Count;

    string safeName = multifasta.Name.Replace(' ', '_');
    string outputPath = Path.Combine(tmpDir.FullName, safeName);

    using (var writer = new StreamWriter(outputPath))
    {
        foreach (Contig entry in collection.Contigs)
        {
            // Records without sequence data are left out of the copy.
            if (entry.Sequence.Length <= 0)
            {
                continue;
            }

            writer.Write(">");
            writer.WriteLine(entry.Index);
            writer.WriteLine(entry.Sequence);
        }
    }

    return new FileInfo(outputPath);
}
-
/// <summary>
/// Widen a subject range by the part of the query that lies outside the aligned query range,
/// and shift both subject indices down by one (1-based to 0-based array indexing).
/// </summary>
/// <param name="length">Total length the query range is measured against; the uncovered
/// tail is computed as <c>length - queryEndIndex</c>.</param>
/// <param name="queryEndIndex">End of the aligned query range. Read only; never modified here.</param>
/// <param name="queryStartIndex">Start of the aligned query range (1-based: the uncovered head
/// is <c>queryStartIndex - 1</c>). Read only; never modified here.</param>
/// <param name="reverseComplement">When true, the uncovered query head extends the subject end
/// and the uncovered query tail extends the subject start; when false, head extends start and
/// tail extends end.</param>
/// <param name="subjectEndIndex">Subject end index; updated in place.</param>
/// <param name="subjectStartIndex">Subject start index; updated in place.</param>
public static void AdjustSubjectIndices(int length, ref int queryEndIndex, ref int queryStartIndex, bool reverseComplement, ref int subjectEndIndex, ref int subjectStartIndex)
{
    // Capture both gaps before touching any subject index.
    int missingAtQueryEnd = length - queryEndIndex;
    int missingAtQueryStart = queryStartIndex - 1;

    if (!reverseComplement)
    {
        subjectStartIndex = (subjectStartIndex - missingAtQueryStart) - 1;
        subjectEndIndex = (subjectEndIndex + missingAtQueryEnd) - 1;
        return;
    }

    // Reverse-complement: the query head maps onto the subject end and vice versa.
    subjectEndIndex = (subjectEndIndex - 1) + missingAtQueryStart;
    subjectStartIndex = (subjectStartIndex - 1) - missingAtQueryEnd;
}
/// <summary>
/// Extract the amplicon spanning <paramref name="startIndex"/>..<paramref name="endIndex"/>
/// (0-based, inclusive) from a contig, clipping the range to the contig when it runs past
/// either end. The result is upper-cased, or reverse-complemented when requested.
/// </summary>
/// <param name="startIndex">First position of the amplicon; a negative value is clipped to 0.</param>
/// <param name="endIndex">Last position of the amplicon; a value at or beyond the contig length
/// is clipped to the last base.</param>
/// <param name="reverseComplement">When true the extracted sequence is reverse-complemented.</param>
/// <param name="c">Contig supplying the sequence.</param>
/// <param name="amplicon">Receives the extracted sequence. Left unchanged if extraction fails.</param>
/// <returns>
/// True when the requested range had to be clipped to the contig, or when extraction failed
/// with an exception; false when the full requested range was available.
/// </returns>
public static bool GetAmplicon(int startIndex, int endIndex, bool reverseComplement, Contig c, ref string amplicon)
{
    lock (ThisLock)
    {
        try
        {
            string sequence = c.Sequence;
            int first = startIndex;
            int last = endIndex;
            bool isContigTruncation = false;

            if (first < 0)
            {
                // The amplicon starts before the contig does.
                first = 0;
                isContigTruncation = true;
            }

            if (last >= sequence.Length)
            {
                // The amplicon ends past the contig: keep the longest available stretch.
                // Using >= also covers endIndex == Length, which previously asked Substring
                // for one character too many and failed.
                last = sequence.Length - 1;
                isContigTruncation = true;
            }

            // An inverted or out-of-range request still throws here and is reported below.
            string extracted = sequence.Substring(first, last - first + 1);
            amplicon = reverseComplement ? ReverseComplement(extracted) : extracted.ToUpper();

            if (isContigTruncation)
            {
                Console.Error.WriteLine(string.Format("Amplicon contig truncation detected with contig {0} of multifasta {1}", c.Header, c.MultifastaFile.Name));
            }
            return isContigTruncation;
        }
        catch (Exception ex)
        {
            // Callers rely on this method not throwing; surface the cause instead of hiding it.
            Console.Error.WriteLine("Amplicon extraction failed: " + ex.Message);
            return true;
        }
    }
}
/// <summary>
/// Run <c>makeblastdb -in "&lt;file&gt;" -dbtype nucl</c> in <paramref name="workingDir"/> and
/// block until it exits. The tool's stdout and stderr are read and discarded.
/// </summary>
/// <param name="workingDir">Directory the tool is started in; the input file name is resolved
/// relative to it.</param>
/// <param name="fiMultifasta">Genome multifasta file. Only its name is used, with spaces
/// replaced by underscores, so a file of that name is expected to exist in
/// <paramref name="workingDir"/>.</param>
public static void MakeBlastDB(DirectoryInfo workingDir, FileInfo fiMultifasta)
{
    var startInfo = new ProcessStartInfo(@"makeblastdb",
                                         String.Format(@"-in ""{0}"" -dbtype nucl",
                                                       fiMultifasta.Name.Replace(' ', '_')))
    {
        WorkingDirectory = workingDir.FullName,
        RedirectStandardError = true,
        RedirectStandardOutput = true,
        UseShellExecute = false,
        CreateNoWindow = true
    };

    while (true)
    {
        // Dispose every Process instance, including ones that failed to start.
        using (var process = new Process { StartInfo = startInfo })
        {
            if (!process.Start())
            {
                // No new process was started; try again with a fresh instance.
                continue;
            }

            // Both streams are redirected, so they must be drained: otherwise the child
            // blocks once a pipe buffer fills and WaitForExit never returns. Asynchronous
            // reads with no handlers attached simply consume and drop the output.
            process.BeginOutputReadLine();
            process.BeginErrorReadLine();

            process.WaitForExit();
            return;
        }
    }
}
/// <summary>Return the reverse complement of a nucleotide sequence, in upper case.</summary>
/// <param name="str">Nucleotide sequence.</param>
/// <returns>The sequence reversed, upper-cased, and with A/T and C/G swapped; any other
/// character is kept as it is (after upper-casing).</returns>
public static string ReverseComplement(string str)
{
    return Complement(Reverse(str));
}

/// <summary>Return the characters of a string in reverse order.</summary>
private static string Reverse(string str)
{
    int length = str.Length;
    var reversed = new char[length];
    for (int source = 0; source < length; source++)
    {
        reversed[length - 1 - source] = str[source];
    }

    return new string(reversed);
}

/// <summary>Upper-case a sequence and swap each base for its complement (A/T, C/G).</summary>
private static string Complement(string str)
{
    string upper = str.ToUpper();
    var complemented = new char[upper.Length];
    for (int position = 0; position < upper.Length; position++)
    {
        char nucleotide = upper[position];
        if (nucleotide == 'A')
        {
            complemented[position] = 'T';
        }
        else if (nucleotide == 'T')
        {
            complemented[position] = 'A';
        }
        else if (nucleotide == 'G')
        {
            complemented[position] = 'C';
        }
        else if (nucleotide == 'C')
        {
            complemented[position] = 'G';
        }
        else
        {
            // Anything that is not A/C/G/T passes through unchanged.
            complemented[position] = nucleotide;
        }
    }

    return new string(complemented);
}
- }
- }