PageRenderTime 26ms CodeModel.GetById 22ms RepoModel.GetById 1ms app.codeStats 0ms

/main.cpp

https://github.com/albertwcheng/bedSeq
C++ | 369 lines | 258 code | 79 blank | 32 comment | 62 complexity | 6eb0c94122be9d66c0d511d582aa0484 MD5 | raw file
  1. /***************************************************************************
  2. Copyright 2010 Wu Albert Cheng <albertwcheng@gmail.com>
  3. Permission is hereby granted, free of charge, to any person obtaining a copy
  4. of this software and associated documentation files (the "Software"), to deal
  5. in the Software without restriction, including without limitation the rights
  6. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  7. copies of the Software, and to permit persons to whom the Software is
  8. furnished to do so, subject to the following conditions:
  9. The above copyright notice and this permission notice shall be included in
  10. all copies or substantial portions of the Software.
  11. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  12. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  13. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  14. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  15. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  16. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  17. THE SOFTWARE.
  18. *******************************************************************************/
  19. #include <iostream>
  20. #include <stdio.h>
  21. #include <stdlib.h>
  22. #include <map>
  23. #include "StringUtil.h"
  24. #include "RandomAccessFile.h"
  25. #include <vector>
  26. #include "Nucleic.h"
  27. #include <deque>
  28. using namespace std;
  29. #define BEDLINE_BUFFER_SIZE 1024*1024
  30. #define BED_FORMAT 1
  31. #define EBED_FORMAT 2
  32. #define FORWARD '+'
  33. #define REVERSE '-'
  34. int printUsageAndExit(const char*programName)
  35. {
  36. cerr<<"Usage: "<<programName<<" <seqDir> <bedfilename> <bedfiletype:bed|ebed> [options] > <ofilename>"<<endl;
  37. cerr<<"Description: \tappend sequence to bed files (bed format for the feature coordinate, ebed format for feature block coordinates)"<<endl;
  38. cerr<<"\t\tseqDir contains per chromosome <chr>.seq which is one line raw sequence without header. convert fa to seq file using convertFaToPureSeq.py"<<endl;
  39. cerr<<"Options:"<<endl;
  40. cerr<<"--output-fasta\toutput fasta file instead of appending seq to bed file"<<endl;
  41. cerr<<"--use-coord-as-name\tuse coordinate as name of sequence instead of the name field of bed file. Only valid when --output-fasta option is specified"<<endl;
  42. cerr<<"--use-block-coord-as-name\tuse block coordinate as name. Only when --output-fasta is specified and format is ebed"<<endl;
  43. cerr<<"--print-OK\tBy default, only error messages are printed to stderr. specifying --print-OK ask to print OK to stderr even when succesful to allow for efficient handling of piping from a master program"<<endl;
  44. cerr<<"--seq-block-sep c, separate blocked sequences by character c"<<endl;
  45. return EXIT_FAILURE;
  46. }
  47. inline string getSeqFromRaf(RandomAccessFile* raf,int start0, int end1,char strand)
  48. {
  49. string forward=raf->get(start0,end1);
  50. if(strand==FORWARD)
  51. return forward;
  52. return reverse_complement(forward);
  53. }
  54. int main(int argc, const char**argv){
  55. char *bedLineBuffer =new char[BEDLINE_BUFFER_SIZE];
  56. bool outputFasta=false;
  57. bool useCoordAsName=false;
  58. bool useBlockCoordAsName=false;
  59. bool printOK=false;
  60. const char* programName=argv[0];
  61. string seqblockSep="";
  62. if(argc<4){
  63. return printUsageAndExit(programName);
  64. }
  65. for(int ai=4;ai<argc;ai++)
  66. {
  67. if(!strcmp(argv[ai],"--output-fasta"))
  68. outputFasta=true;
  69. else if(!strcmp(argv[ai],"--use-coord-as-name"))
  70. useCoordAsName=true;
  71. else if(!strcmp(argv[ai],"--use-block-coord-as-name"))
  72. useBlockCoordAsName=true;
  73. else if(!strcmp(argv[ai],"--print-OK"))
  74. printOK=true;
  75. else if(!strcmp(argv[ai],"--seq-block-sep"))
  76. {
  77. ai++; //consume next
  78. if(ai>=argc){
  79. cerr<<"--seq-block-sep requires one argument"<<endl;
  80. printUsageAndExit(programName);
  81. }
  82. seqblockSep=argv[ai];
  83. }
  84. else {
  85. cerr<<"unknown option "<<argv[ai]<<endl;
  86. return printUsageAndExit(programName);
  87. }
  88. }
  89. string seqDir=argv[1];
  90. string bedfilename=argv[2];
  91. string bedfiletype=argv[3];
  92. int bedformat=0;
  93. map<string,RandomAccessFile*> rafStreams;
  94. if(bedfiletype=="ebed")
  95. bedformat=EBED_FORMAT;
  96. else if (bedfiletype=="bed")
  97. bedformat=BED_FORMAT;
  98. else
  99. {
  100. cerr<<"Unknown bed format:"<<bedfiletype<<".abort"<<endl;
  101. return printUsageAndExit(programName);
  102. }
  103. if(useBlockCoordAsName && useCoordAsName ){
  104. cerr<<"both --use-block-coord-as-name and --use-coord-as-name are specified. These two options are mutually exclusive"<<endl;
  105. return printUsageAndExit(programName);
  106. }
  107. if( (useBlockCoordAsName || useCoordAsName ) && ( !outputFasta) ){
  108. cerr<<"--use-block-coord-as-name and --use-coord-as-name are valid only when --output-fasta is specified"<<endl;
  109. return printUsageAndExit(programName);
  110. }
  111. if( useBlockCoordAsName && bedformat!=EBED_FORMAT){
  112. cerr<<"--use-block-coord-as-name is only valid when format is ebed"<<endl;
  113. return printUsageAndExit(programName);
  114. }
  115. //now open and read bed file
  116. ifstream bedfil(bedfilename.c_str());
  117. strcpy(bedLineBuffer,"\0");
  118. vector<string> fields;
  119. vector<string> blockSizes;
  120. vector<string> blockStarts;
  121. int lino=0;
  122. string thisChr="";
  123. RandomAccessFile* thisRaf=NULL;
  124. while(bedfil.good())
  125. {
  126. if(!bedfil.good())
  127. break;
  128. lino++;
  129. bedfil.getline(bedLineBuffer,BEDLINE_BUFFER_SIZE);
  130. StringUtil::split(bedLineBuffer,"\t",fields);
  131. if(strlen(bedLineBuffer)==0)
  132. continue;
  133. if(fields.size()<3)
  134. {
  135. cerr<<"Ignored: invalid number of fields for line "<<lino<<" ["<<bedLineBuffer<<"]"<<endl;
  136. continue;
  137. }
  138. string chrom(fields[0]);
  139. if (chrom!=thisChr)
  140. {
  141. //switch raf!!
  142. //
  143. //
  144. //cerr<<"switch raf thisChr="<<thisChr<<" to "<<chrom<<" line="<<bedLineBuffer<<endl;
  145. map<string,RandomAccessFile*>::iterator rafI=rafStreams.find(chrom);
  146. if( rafI==rafStreams.end())
  147. {
  148. thisRaf=new RandomAccessFile(seqDir+"/"+chrom+".seq");
  149. if (!thisRaf->good() ) //invalid
  150. {
  151. delete thisRaf;
  152. thisRaf=NULL;
  153. continue;
  154. }
  155. //thisChr=chrom; //save time
  156. //register
  157. rafStreams.insert(map<string,RandomAccessFile*>::value_type(chrom,thisRaf));
  158. }
  159. else {
  160. thisRaf=rafI->second;
  161. }
  162. //bug fix 10/2/2013 move the thisChr=chrom from above to here
  163. thisChr=chrom;
  164. }
  165. if(!thisRaf)
  166. continue;
  167. int featureStart0=StringUtil::atoi(fields[1]);
  168. int featureEnd1=StringUtil::atoi(fields[2]);
  169. char strand=FORWARD;
  170. if (fields.size()>=6) //get strand info
  171. {
  172. if(fields[5]=="-")
  173. strand=REVERSE;
  174. else
  175. strand=FORWARD;
  176. }
  177. if(bedformat==BED_FORMAT)
  178. {
  179. //simple!
  180. string seq=getSeqFromRaf(thisRaf,featureStart0,featureEnd1,strand);
  181. if(outputFasta)
  182. {
  183. string name;
  184. if(useCoordAsName || fields.size()<4)
  185. { name=chrom+":"+StringUtil::str(featureStart0+1)+"-"+StringUtil::str(featureEnd1);
  186. }
  187. else {
  188. name=fields[3];
  189. }
  190. cout<<">"<<name;
  191. cout<<endl;
  192. cout<<seq;
  193. cout<<endl;
  194. if(printOK)
  195. cerr<<"OK"<<endl;
  196. }
  197. else {
  198. cout<<bedLineBuffer;
  199. cout<<"\t";
  200. cout<<seq;
  201. cout<<endl;
  202. if(printOK)
  203. cerr<<"OK"<<endl;
  204. }
  205. }
  206. else {
  207. //this is ebed format
  208. if(fields.size()<12)
  209. {
  210. cerr<<"Ignored: invalid number of fields for line "<<lino<<" "<<bedLineBuffer<<endl;
  211. continue;
  212. }
  213. int blockCount=StringUtil::atoi(fields[9]);
  214. string seq="";
  215. StringUtil::split(fields[10],",",blockSizes);
  216. StringUtil::split(fields[11],",",blockStarts);
  217. if (blockSizes.size()<blockCount || blockStarts.size()<blockCount)
  218. {
  219. cerr<<"Ignored: invalid number of blockStarts and blockEnds given the blockCount "<<lino<<" "<<bedLineBuffer<<endl;
  220. continue;
  221. }
  222. for (int blockI=0;blockI<blockCount;blockI++)
  223. {
  224. int blockRelStart=StringUtil::atoi(blockStarts[blockI]);
  225. int blockSize=StringUtil::atoi(blockSizes[blockI]);
  226. int getStart0=featureStart0+blockRelStart;
  227. int getEnd1=getStart0+blockSize;
  228. if(blockI>0){
  229. seq+=seqblockSep;
  230. }
  231. seq+=getSeqFromRaf(thisRaf,getStart0,getEnd1,FORWARD); //always get forward first then convert at end
  232. }
  233. if(outputFasta){
  234. string name;
  235. if(useCoordAsName)
  236. { name=chrom+":"+StringUtil::str(featureStart0+1)+"-"+StringUtil::str(featureEnd1);
  237. }
  238. else if(useBlockCoordAsName){
  239. deque<string> coordblocks;
  240. for (int blockI=0;blockI<blockCount;blockI++)
  241. {
  242. int blockRelStart=StringUtil::atoi(blockStarts[blockI]);
  243. int blockSize=StringUtil::atoi(blockSizes[blockI]);
  244. int getStart0=featureStart0+blockRelStart;
  245. int getEnd1=getStart0+blockSize;
  246. string coordstring=chrom+":"+StringUtil::str(getStart0+1)+"-"+StringUtil::str(getEnd1);
  247. if(strand==FORWARD)
  248. coordblocks.push_back(coordstring);
  249. else {
  250. coordblocks.push_front(coordstring);
  251. }
  252. }
  253. name=StringUtil::join< deque<string>, deque<string>::const_iterator >(coordblocks,",");
  254. }
  255. else {
  256. name=fields[3];
  257. }
  258. cout<<">"<<name;
  259. cout<<endl;
  260. cout<<seq;
  261. cout<<endl;
  262. if(printOK)
  263. cerr<<"OK"<<endl;
  264. }else{
  265. cout<<bedLineBuffer;
  266. cout<<"\t";
  267. if(strand==FORWARD)
  268. cout<<seq;
  269. else {
  270. cout<<reverse_complement(seq);
  271. }
  272. cout<<endl;
  273. if(printOK)
  274. cerr<<"OK"<<endl;
  275. }
  276. }
  277. }
  278. bedfil.close();
  279. //close all rafs
  280. for(map<string,RandomAccessFile*>::iterator i=rafStreams.begin();i!=rafStreams.end();i++)
  281. {
  282. i->second->close();
  283. delete i->second;
  284. }
  285. delete[] bedLineBuffer;
  286. return EXIT_SUCCESS;
  287. }