/updatepreOTToLnewCoL.py
Python | 434 lines | 412 code | 6 blank | 16 comment | 18 complexity | 6d739099000775544d6ba8dca60ae428 MD5 | raw file
1########################################################################################## 2# Goal: to make a file from the CoL database that includes just the taxonomy 3# reforming the taxonomy list to mirror our ideas of the tree (i.e. changing 'Animalia' 4#to 'Eukaryots, Opisthokonta, Metazoa' and renaming all the 'Chromista' and 'Protozoa' 5# taxa. 6# Second method, extract the species not already in OTToL 7# 8# 9########################################################################################## 10import datetime 11import os, re 12numlist = [] 13########################################################################################## 14 15def findlastnum(filename, dbname): 16 17 infile = open(filename,'r').readlines() 18 for line in infile: 19 db = line.split('\t')[2] 20 if re.search(dbname,db): 21 numlist.append(line.split('\t')[0]) 22 23 numlist.sort() 24 #print numlist 25 return int(numlist.pop()) + 1 26########################################################################################## 27 28def parsetextfromCoL(): 29 infile = open('col/taxa.txt','r') #taxa.txt is file downloaded from CoL with all taxa 30 outfile1 = open('acceptedNames','a') 31 outfile2 = open('provisionallyAcceptedNames','a') 32 count = 0 33 pcount = 0 34 for line in infile: 35 if not re.search('infraspecies',line): 36 if re.search('accepted',line) and re.search('species',line): 37 if not re.search('provisionally',line): 38 outfile1.write(line) 39 count = count + 1 40 else: 41 outfile2.write(line) 42 pcount = pcount + 1 43 #print 'accepted names: '+ str(count) 44 #print 'provisionally accepted names: '+ str(pcount) 45########################################################################################## 46 47def renameCoL(f): 48 outfile = open('taxa_' + f,'w') 49 errorout = open('viruses_and_other_prblems','a') 50 infile = open(f,'r').readlines() 51 newtaxlist = [] 52 for line in infile: 53 taxalist = line.split('\t\t')[2] 54 #print taxalist 55 sp = taxalist.split()[1].strip(',') 56 57 if taxalist.split('\t')[1] == 'Archaea': 58 newline = taxalist.split('\t')[1:] 59 elif taxalist.split('\t')[1] == 'Bacteria': 60 newline = taxalist.split('\t')[1:] 61 elif taxalist.split('\t')[1] == 'Animalia': 62 newline = re.sub('Animalia','Eukaryota\tOpisthokonta\tMetazoa',taxalist).split('\t')[1:] 63 elif taxalist.split('\t')[1] == 'Fungi': 64 newline = re.sub('Fungi','Eukaryota\tOpisthokonta\tFungi',taxalist).split('\t')[1:] 65 elif taxalist.split('\t')[1] == 'Plantae': 66 newline = re.sub('Plantae','Eukaryota\tPlantae',taxalist).split('\t')[1:] 67 elif taxalist.split('\t')[1] == 'Protozoa': 68 newline = renameProt(taxalist) #.split('\t')[1:] 69 elif taxalist.split('\t')[1] == 'Chromista': 70 newline = renameProt(taxalist) #.split('\t')[1:] 71 else: 72 if taxalist.split('\t')[1] == 'Viruses' or re.search('viruses',taxalist): 73 newline = 'Virus' 74 else: 75 errorout = open('viruses_and_other_prblems','a') 76 errorout.write('problem with 1 ' + line) 77 errorout.close() 78 newline = '' 79 if newline != '' or newline != 'Virus': 80 try: 81 check(newline) 82 except: 83 print line, newline 84 85 print 'problem with check(newline) ' + str(printlist) 86 errorout = open('viruses_and_other_prblems','a') 87 errorout.write('problem with check(newline) ' + str(printlist) + '\n') 88 errorout.close() 89 try: 90 printlist = newline + [sp] 91 try: 92 assert re.search('[A-Z]',printlist[-2][0]) and re.search('[a-z]',printlist[-1][0]) 93 for item in printlist: 94 if re.search(',',item): 95 print str(printlist) 96 errorout = open('viruses_and_other_prblems','a') 97 errorout.write('problem with str(printlist) ' + str(printlist) + '\n') 98 errorout.close() 99 else: 100 outfile.write(item + '\t') 101 102 outfile.write('\n') 103 except: 104 errorout = open('viruses_and_other_prblems','a') 105 errorout.write('problem with genera and species ' + str(line)) 106 errorout.close() 107 except: 108 errorout = open('viruses_and_other_prblems','a') 109 errorout.write('problem with printlist ' + str(line)) 110 errorout.close() 111 112 113########################################################################################## 114 115def renameProt(taxlist): #hash table built from our understanding of the tree 116 myhash = {} 117 taxalist = taxlist.split('\t')[1:] #get rid of preceding genus species 118 119 myhash['Protozoa','Acritarcha','Not assigned'] = ['Eukaryota','EE','Acritarcha','Not assigned'] 120 myhash['Protozoa','Apicomplexa','Conoidasida'] = ['Eukaryota','SAR','Alveolata','Apicomplexa','Conoidasida'] 121 myhash['Protozoa','Apicomplexa','Not assigned'] = ['Eukaryota','SAR','Alveolata','Apicomplexa','Not assigned'] 122 myhash['Protozoa','Cercozoa','Chlorarachniophyceae'] = ['Eukaryota','SAR','Rhizaria','Cercozoa','Chlorarachniophyceae'] 123 myhash['Protozoa','Cercozoa','Phytomyxea'] = ['Eukaryota','SAR','Rhizaria','Cercozoa','Phytomyxea'] 124 myhash['Protozoa','Choanozoa','Mesomycetozoea'] = ['Eukaryota','Opisthokonta','Choanozoa','Mesomycetozoea'] 125 myhash['Protozoa','Choanozoa','Not assigned'] = ['Eukaryota','Opisthokonta','Choanozoa','Not assigned'] 126 myhash['Protozoa','Ciliophora','Ciliatea'] = ['Eukaryota','SAR','Alveolata','Ciliophora','Ciliatea'] 127 myhash['Protozoa','Dinophyta','Dinophyceae'] = ['Eukaryota','SAR','Alveolata','Dinophyta','Dinophyceae'] 128 myhash['Protozoa','Dinophyta','Ebriophyceae'] = ['Eukaryota','SAR','Alveolata','Dinophyta','Not assigned'] 129 myhash['Protozoa','Dinophyta','Not assigned'] = ['Eukaryota','Excavata','Euglenozoa','Euglenida'] 130 myhash['Protozoa','Euglenozoa','Euglenida'] = ['Eukaryota','Excavata','Euglenozoa','Euglenida'] 131 myhash['Protozoa','Euglenozoa','Trypanosomatidae'] = ['Eukaryota','Excavata','Euglenozoa','Trypanosomatidae'] 132 myhash['Protozoa','Flagellata','Not assigned'] = ['Eukaryota','Excavata','Flagellata','Not assigned'] 133 myhash['Protozoa','Mycetozoa','Acrasiomycetes'] = ['Eukaryota','Amoebozoa','Mycetozoa','Acrasiomycetes'] 134 myhash['Protozoa','Mycetozoa','Dictyosteliomycetes'] = ['Eukaryota','Amoebozoa','Mycetozoa','Dictyosteliomycetes'] 135 myhash['Protozoa','Mycetozoa','Myxomycetes'] = ['Eukaryota','Amoebozoa','Mycetozoa','Myxomycetes'] 136 myhash['Protozoa','Mycetozoa','Protosteliomycetes'] = ['Eukaryota','Amoebozoa','Mycetozoa','Protosteliomycetes'] 137 myhash['Protozoa','Mycetozoa','Not assigned'] = ['Eukaryota','Amoebozoa','Mycetozoa'] 138 myhash['Protozoa','Not assigned','Acantharia'] = ['Eukaryota','SAR','Rhizaria','Acantharia'] 139 myhash['Protozoa','Not assigned','Filosia'] = ['Eukaryota','SAR','Rhizaria','Filosia'] 140 myhash['Protozoa','Not assigned','Granuloreticulosea'] = ['Eukaryota','SAR','Rhizaria','Granuloreticulosea'] 141 myhash['Protozoa','Not assigned','Haplosporea'] = ['Eukaryota','SAR','Rhizaria','Haplosporea'] 142 myhash['Protozoa','Not assigned','Heliozoa','Actinophryida'] = ['Eukaryota','SAR','Stramenopile','Actinophryidae'] 143 myhash['Protozoa','Not assigned','Heliozoa','Centrohelida'] = ['Eukaryota','EE','Centrohelida'] 144 myhash['Protozoa','Not assigned','Heliozoa','Desmothothoracida'] = ['Eukaryota','SAR','Rhizaria','Desmothoracida'] 145 myhash['Protozoa','Not assigned','Heliozoa','Desmothoracida'] = ['Eukaryota','SAR','Rhizaria','Desmothoracida'] 146 myhash['Protozoa','Not assigned','Lobosa'] = ['Eukaryota','Amoebozoa','Lobosa'] 147 myhash['Protozoa','Not assigned','Not assigned','Jakobaceae'] = ['Eukaryota','Excavata','Jakobid'] 148 myhash['Protozoa','Not assigned','Sporozoa'] = ['Eukaryota','SAR','Alveolata','Apicomplexa'] 149 myhash['Protozoa','Parabasalia','Not assigned'] = ['Eukaryota','Excavata','Parabasalia'] 150 myhash['Protozoa','Percolozoa','Heterolobosea'] = ['Eukaryota','Excavata','Heterolobosea'] 151 myhash['Protozoa','Sarcomastigophora','Phytomastigophora'] = ['Eukaryota','SAR','Alveolata','Dinoflagellate'] 152 myhash['Protozoa','Sarcomastigophora','Zoomastigophora','Diplomonadida'] = ['Eukaryota','Excavata','Fornicata'] 153 myhash['Protozoa','Sarcomastigophora','Zoomastigophora','Trichomonadida'] = ['Eukaryota','Excavata','Parabasalia'] 154 myhash['Protozoa','Xenophyophora','Psamminida'] = ['Eukaryota','SAR','Rhizaria','Xenophyophora','Psamminida'] 155 myhash['Protozoa','Xenophyophora','Stannomida'] = ['Eukaryota','SAR','Rhizaria','Xenophyophora','Stannomida'] 156 myhash['Protozoa','Sarcomastigophora','Polycystina'] = ['Eukaryota','SAR','Rhizaria','Radiolaria'] 157 myhash['Protozoa','Myzozoa','Perkinsea'] = ['Eukaryota','EE','Perkinsus'] 158 myhash['Chromista','Cryptophyta','Cryptophyceae'] = ['Eukaryota','EE','Cryptophyta','Cryptophyceae'] 159 myhash['Chromista','Haptophyta','Not assigned'] = ['Eukaryota','EE','Haptophyta','Not assigned'] 160 myhash['Chromista','Haptophyta','Prymnesiophyceae'] = ['Eukaryota','EE','Haptophyta','Prymnesiophyceae'] 161 myhash['Chromista','Hyphochytriomycota','Hyphochytriomycetes'] = ['Eukaryota','SAR','Stramenopile','Hyphochytriomycetes'] 162 myhash['Chromista','Labyrinthista','Labyrinthulea'] = ['Eukaryota','SAR','Stramenopile','Labyrinthulea'] 163 myhash['Protozoa','Labyrinthista','Labyrinthulea'] = ['Eukaryota','SAR','Stramenopile','Labyrinthulea'] 164 myhash['Chromista','Ochrophyta','Bodonophyceae'] = ['Eukaryota','SAR','Stramenopile','Bodonophyceae'] 165 myhash['Chromista','Ochrophyta','Chrysophyceae'] = ['Eukaryota','SAR','Stramenopile','Chrysophyceae'] 166 myhash['Chromista','Ochrophyta','Coscinodiscophyceae'] = ['Eukaryota','SAR','Stramenopile','Coscinodiscophyceae'] 167 myhash['Chromista','Ochrophyta','Craspedophyceae'] = ['Eukaryota','SAR','Stramenopile','Craspedophyceae'] 168 myhash['Chromista','Ochrophyta','Dictyochophyceae'] = ['Eukaryota','SAR','Stramenopile','Dictyochophyceae'] 169 myhash['Chromista','Ochrophyta','Eustigmatophyceae'] = ['Eukaryota','SAR','Stramenopile','Eustigmatophyceae'] 170 myhash['Chromista','Ochrophyta','Fragilariophyceae'] = ['Eukaryota','SAR','Stramenopile','Fragilariophyceae'] 171 myhash['Chromista','Ochrophyta','Hexamitophyceae'] = ['Eukaryota','SAR','Stramenopile','Hexamitophyceae'] 172 myhash['Chromista','Ochrophyta','Phaeophyceae'] = ['Eukaryota','SAR','Stramenopile','Phaeophyceae'] 173 myhash['Chromista','Ochrophyta','Raphidophyceae'] = ['Eukaryota','SAR','Stramenopile','Raphidophyceae'] 174 myhash['Chromista','Ochrophyta','Synurophyceae'] = ['Eukaryota','SAR','Stramenopile','Synurophyceae'] 175 myhash['Chromista','Ochrophyta','Xanthophyceae'] = ['Eukaryota','SAR','Stramenopile','Xanthophyceae'] 176 myhash['Chromista','Oomycota','Not assigned'] = ['Eukaryota','SAR','Stramenopile','Not assigned'] 177 myhash['Chromista','Oomycota','Oomycetes'] = ['Eukaryota','SAR','Stramenopile','Oomycetes'] 178 myhash['Chromista','Sagenista','Bicosoecophyceae'] = ['Eukaryota','SAR','Stramenopile','Bicosoecophyceae'] 179 myhash['Chromista','Ochrophyta','Pelagophyceae'] = ['Eukaryota','SAR','Stramenopile','Pelagophyceae'] 180 myhash['Chromista','Ochrophyta','Bolidophyceae'] = ['Eukaryota','SAR','Stramenopile','Bolidophyceae'] 181 myhash['Chromista','Ochrophyta','Pinguiophyceae'] = ['Eukaryota','SAR','Stramenopile','Pinguiophyceae'] 182 myhash['Chromista','Not assigned','Schizocladiophyceae'] = ['Eukaryota','SAR','Stramenopile','Schizocladiophyceae'] 183 myhash['Chromista','Ochrophyta','Pinguiophyceae'] = ['Eukaryota','SAR','Stramenopile','Pinguiophyceae'] 184 myhash['Chromista','Not assigned','Developayella'] = ['Eukaryota','SAR','Stramenopile','Developayella'] 185 186 for key in myhash.keys(): 187 newtaxlist = myhash[key] 188 keylist = key 189 if all(x in taxalist for x in keylist): 190 for y in keylist: 191 taxalist.remove(y) #remove old list 192 for item in taxalist: 193 newtaxlist.append(item) 194 #print newtaxlist 195 return newtaxlist 196 197########################################################################################## 198def mark_notinOTToL(OTToL): 199 i = 0 200 infile = open('allCoLrenamed_taxa','r').readlines() 201 infile2 = open(OTToL,'r').readlines() 202 outfile = open('new_sp_2add2OTToL','a') 203 taxonDict = {} 204 for line in infile2: 205 taxid = line.split('\t')[0] 206 parid = line.split('\t')[1] 207 taxon = line.split('\t')[3] 208 taxonDict[taxon] = line 209 210 211 212 for line in infile: 213 try: 214 assert re.search('[A-Z]',line.split()[-2][0]) and re.search('[a-z]',line.split()[-1][0]) 215 gensp = line.split()[-2] + ' ' + line.split()[-1] 216 except: 217 outfile2 = open('error_entering','a') 218 outfile2.write(line) 219 outfile2.close() 220 gensp = '' 221 try: 222 outfile2 = open('already_in','a') 223 outfile2.write(taxonDict[gensp]) 224 outfile2.close() 225 except: 226 if gensp != '': 227 outfile.write(line) 228 229 outfile.close() 230########################################################################################## 231 232def check(line): 233 234 #error = open('errorlog','a') 235 #infile = open('taxa_' + f,'r') 236 #for line in infile: 237 if line == 'Virus': 238 return 239 if line[0] not in ['Eukaryota','Bacteria','Archaea']: 240 error = open('errorlog','a') 241 error.write(line) 242 error.close() 243 if line[0] == 'Eukaryota': 244 if line[1] not in ['Opisthokonta','Plantae','SAR','Amoebozoa','Excavata','EE']: 245 error = open('errorlog','a') 246 error.write(line) 247 error.close() 248 249 if line[0] == 'Bacteria': 250 if line[1] not in ['Actinobacteria','Cyanobacteria','Acidobacteria','Aquificae','Bacteroidetes','Chlamydiae','Chlorobi','Chloroflexi','Chrysiogenetes','Deferribacteres','Deinococcus-thermus','Dictyoglomi','Fibrobacteres','Firmicutes','Fusobacteria','Gemmatimonadetes','Lentisphaerae','Nitrospira','Planctomycetes','Proteobacteria','Spirochaetes','Thermodesulfobacteria','Thermomicrobia','Thermotogae','Verrucomicrobia','Flavobacteria','Sphingobacteria','Ochrophyta','Deinococci','Bacilli','Clostridia','Mollicutes','Bacteria','Alphaproteobacteria','Betaproteobacteria','Deltaproteobacteria','Epsilonproteobacteria','Gammaproteobacteria','Verrucomicrobiae']: 251 error = open('errorlog','a') 252 error.write(line) 253 error.close() 254 255 if line[0] == 'Archaea': 256 if line[1] not in ['Crenarchaeota','Euryarchaeota']: 257 error = open('errorlog','a') 258 error.write(line) 259 error.close() 260########################################################################################## 261 262def cleanup(toKEEP): 263 for f in os.listdir(os.curdir): 264 if f not in toKEEP: 265 os.system('mv ' + f + ' col_extra_files...check_and_discard') 266 267########################################################################################## 268def mergesp(date, OTToL,file,newtxid): 269 infile1 = open(OTToL,'r').readlines() 270 infile2 = open(file,'r').readlines() 271 outfile = open('to_merge','w') 272 outfile2 = open('genera_2add2OTToL','a') 273 genusDict = {} 274 275 for line in infile1: 276 #try: 277 if line.split('\t')[-2] == 'genus' or line.split('\t')[-2] == 'gen.': 278 genus = line.split('\t')[3].split()[0] 279 txid = line.split('\t')[0] 280 genusDict[genus] = txid 281 #except: 282 # print line 283 for line in infile2: 284 if re.search('Not assigned',line): 285 unassignedout = open('Not_assigned','a') 286 unassignedout.write(line) 287 unassignedout.close() 288 else: 289 genus2add = line.split()[-2] 290 species2add = line.split()[-1] 291 292 if genus2add in genusDict.keys(): 293 newparid = genusDict[genus2add] 294 newtxid = newtxid + 1 295 today = datetime.datetime.now().strftime('%m_%d_%y') 296 297 newline = str(newtxid) + '\t' + str(newparid) + '\tCoL_' + date + '\t' + genus2add + ' ' + species2add + '\t\tspecies\t' + today + '\n' 298 299 outfile.write(newline) 300 else: 301 outfile2.write(line) 302 outfile.close() 303 outfile2.close() 304 return newtxid 305########################################################################################## 306def mergegen(date, OTToL,file,newtxid): 307 infile1 = open(OTToL,'r').readlines() 308 infile2 = open(file,'r').readlines() 309 outfile = open('to_merge','a') 310 outfile2 = open('stillNotinOTToL','a') 311 familyDict = {} 312 addedList= [] 313 for line in infile1: 314 #try: 315 if line.split('\t')[-2] != 'species' and line.split('\t')[-2] != 'subspecies' and line.split('\t')[-2] != 'genus': 316 try: 317 family = line.split('\t')[3].split()[0] 318 except: 319 family = line.split('\t')[3] 320 txid = line.split('\t')[0] 321 familyDict[family] = txid 322 #except: 323 # print line 324 for line in infile2: 325 326 genus2add = line.split()[-2] 327 fam2add = line.split()[-3] 328 329 if genus2add.strip() not in addedList: 330 addedList.append(genus2add.strip()) 331 if fam2add in familyDict.keys(): 332 newparid = familyDict[fam2add] 333 newtxid = newtxid + 1 334 today = datetime.datetime.now().strftime('%m_%d_%y') 335 336 newline = str(newtxid) + '\t' + str(newparid) + '\tCoL_' + date + '\t' + genus2add + '\t\tgenus\t' + today + '\n' 337 338 outfile.write(newline) 339 else: 340 outfile2.write(line) 341 outfile.close() 342 outfile2.close() 343 add(OTToL,'to_merge') 344 os.system('cat ' + OTToL + ' toAdd_' + OTToL + '+to_merge > ' + OTToL + 'genadded') #FIXED THIS TO LOOK FOR DUPLICATES/HOMONYMS 345 os.system('cp to_merge to_merge_1') 346 os.system('mv ' + file + ' ' + file + '_1') 347 mergesp(date, OTToL + 'genadded',file + '_1',newtxid+1 ) 348 349########################################################################################## 350def add(file,file2): 351 infile1 = open(file,'r').readlines() 352 infile2 = open(file2,'r').readlines() 353 outfile = open('toAdd_' + file + '+' + file2,'w') 354 outfile2 = open('dups_2check_before_adding2_' + file + '+' + file2,'w') 355 outfile3 = open('homonyms_2check_before_adding2_' + file + '+' + file2,'w') 356 taxlist = [] 357 homlist = [] 358 taxidlist = [] 359 taxlist2 = [] 360 for line in infile1: 361 tax = line.split('\t')[3] 362 taxid = line.split('\t')[0] 363 db = line.split('\t')[2] 364 taxlist.append(tax) 365 taxidlist.append(taxid) 366 if re.search('_hom',db): 367 homlist.append(tax) 368 #print 'taxlist made' 369 370 for line in infile2: 371 tax = line.split('\t')[3] 372 taxid = line.split('\t')[0] 373 parid = line.split('\t')[1] 374 if tax not in taxlist2: 375 taxlist2.append(tax) 376 if tax not in taxlist: 377 if checkID(taxid,parid,taxidlist) == True: 378 outfile.write(line) 379 elif checkID(taxid,parid,taxidlist) == 'parent': 380 errorlog = open('adderrorlog','a') 381 errorlog.write('No Parent: ' + line) 382 errorlog.close() 383 elif checkID(taxid,parid,taxidlist) == 'taxon': 384 errorlog = open('adderrorlog','a') 385 errorlog.write('TaxID exists: ' + line) 386 errorlog.close() 387 else: 388 if tax in homlist: 389 outfile3.write(line) 390 else: 391 outfile2.write(line) 392 else: 393 errorlog = open('adderrorlog','a') 394 errorlog.write('Trying to add twice: ' + line) 395 errorlog.close() 396 397def checkID(taxid,parid,taxidlist): 398 if taxid in taxidlist: 399 return 'taxon' 400 elif parid not in taxidlist: 401 return 'parent' 402 else: 403 return True 404########################################################################################## 405 406 407def main(): 408# OTToL = raw_input('What is the latest OTToL? ') 409# date = raw_input('What is the download date for the latest OTToL? ') 410 date = 'today' 411 OTToL = 'OTToL080912v2' 412 parsetextfromCoL() 413 for f in ['acceptedNames','provisionallyAcceptedNames']: 414 renameCoL(f) 415 check(f) 416 os.system('cat taxa_provisionallyAcceptedNames taxa_acceptedNames > allCoLrenamed_taxa') 417 mark_notinOTToL(OTToL) 418 os.system('mkdir col_extra_files...check_and_discard') 419 toKEEP = [OTToL,'new_sp_2add2OTToL','col','col_extra_files...check_and_discard','updateOTToLnewCoL.py'] 420 cleanup(toKEEP) 421 422 ############################## 423 nextnum = findlastnum(OTToL, 'CoL') 424 nextnum = mergesp(date, OTToL,'new_sp_2add2OTToL',nextnum) 425 426 mergegen(date, OTToL,'genera_2add2OTToL',nextnum) 427 428 add(OTToL + 'genadded','to_merge') 429 os.system('cat ' + OTToL + 'genadded toAdd_' + OTToL + 'genadded+to_merge > ' + OTToL + 'final') 430 toKEEP = [OTToL + 'final','col','col_extra_files...check_and_discard','updateOTToLnewCoL.py','updateOTToLnewCoL_README'] 431 cleanup(toKEEP) 432main() 433 434