PageRenderTime 56ms CodeModel.GetById 22ms app.highlight 29ms RepoModel.GetById 1ms app.codeStats 0ms

/updatepreOTToLnewCoL.py

https://bitbucket.org/mtholder/ottol
Python | 434 lines | 412 code | 6 blank | 16 comment | 18 complexity | 6d739099000775544d6ba8dca60ae428 MD5 | raw file
  1##########################################################################################
  2# Goal: to make a file from the CoL database that includes just the taxonomy
  3# reforming the taxonomy list to mirror our ideas of the tree (i.e. changing 'Animalia' 
  4#to 'Eukaryots, Opisthokonta, Metazoa' and renaming all the 'Chromista' and 'Protozoa'
  5# taxa.
  6# Second method, extract the species not already in OTToL 
  7# 
  8#
  9##########################################################################################
 10import datetime
 11import os, re
 12numlist = []
 13##########################################################################################
 14
 15def findlastnum(filename, dbname):
 16
 17	infile = open(filename,'r').readlines()
 18	for line in infile:
 19		db = line.split('\t')[2]
 20		if re.search(dbname,db):
 21			numlist.append(line.split('\t')[0])
 22		
 23	numlist.sort()
 24	#print numlist
 25	return int(numlist.pop()) + 1
 26##########################################################################################
 27	
 28def parsetextfromCoL():
 29	infile = open('col/taxa.txt','r') #taxa.txt is file downloaded from CoL with all taxa
 30	outfile1 = open('acceptedNames','a')
 31	outfile2 = open('provisionallyAcceptedNames','a')
 32	count = 0
 33	pcount = 0
 34	for line in infile:
 35		if not re.search('infraspecies',line):
 36			if re.search('accepted',line) and re.search('species',line):
 37				if not re.search('provisionally',line):
 38					outfile1.write(line)
 39					count = count + 1
 40				else:
 41					outfile2.write(line)
 42					pcount = pcount + 1
 43	#print 'accepted names: '+ str(count)
 44	#print 'provisionally accepted names: '+ str(pcount)
 45##########################################################################################
 46
 47def renameCoL(f):
 48	outfile = open('taxa_' + f,'w')
 49	errorout = open('viruses_and_other_prblems','a')
 50	infile = open(f,'r').readlines()
 51	newtaxlist = []
 52	for line in infile:
 53		taxalist = line.split('\t\t')[2]
 54		#print taxalist
 55		sp = taxalist.split()[1].strip(',') 
 56		
 57		if taxalist.split('\t')[1] == 'Archaea':
 58			newline = taxalist.split('\t')[1:]
 59		elif taxalist.split('\t')[1] == 'Bacteria':
 60			newline = taxalist.split('\t')[1:]
 61		elif taxalist.split('\t')[1] == 'Animalia':
 62			newline = re.sub('Animalia','Eukaryota\tOpisthokonta\tMetazoa',taxalist).split('\t')[1:] 
 63		elif taxalist.split('\t')[1] == 'Fungi':
 64			newline = re.sub('Fungi','Eukaryota\tOpisthokonta\tFungi',taxalist).split('\t')[1:] 
 65		elif taxalist.split('\t')[1] == 'Plantae':
 66			newline = re.sub('Plantae','Eukaryota\tPlantae',taxalist).split('\t')[1:] 
 67		elif taxalist.split('\t')[1] == 'Protozoa':
 68			newline = renameProt(taxalist) #.split('\t')[1:] 
 69		elif taxalist.split('\t')[1] == 'Chromista':
 70			newline = renameProt(taxalist) #.split('\t')[1:] 
 71		else:
 72			if taxalist.split('\t')[1] == 'Viruses' or re.search('viruses',taxalist):
 73				newline = 'Virus'
 74			else:
 75				errorout = open('viruses_and_other_prblems','a')
 76				errorout.write('problem with 1 ' + line)
 77				errorout.close()
 78				newline = ''
 79		if newline != '' or newline != 'Virus':
 80			try:
 81				check(newline)
 82			except:
 83				print line, newline
 84
 85				print 'problem with check(newline) ' + str(printlist)
 86				errorout = open('viruses_and_other_prblems','a')
 87				errorout.write('problem with check(newline) ' + str(printlist) + '\n')
 88				errorout.close()
 89			try:
 90				printlist = newline  + [sp]
 91				try:
 92					assert re.search('[A-Z]',printlist[-2][0]) and re.search('[a-z]',printlist[-1][0])
 93					for item in printlist:	
 94						if re.search(',',item):
 95							print str(printlist)
 96							errorout = open('viruses_and_other_prblems','a')
 97							errorout.write('problem with str(printlist) ' + str(printlist) + '\n')
 98							errorout.close()					
 99						else:
100							outfile.write(item + '\t')
101			
102					outfile.write('\n')
103				except:
104					errorout = open('viruses_and_other_prblems','a')
105					errorout.write('problem with genera and species ' + str(line))
106					errorout.close()					
107			except:
108				errorout = open('viruses_and_other_prblems','a')
109				errorout.write('problem with printlist ' + str(line))
110				errorout.close()
111				
112				
113##########################################################################################
114				
115def renameProt(taxlist): #hash table built from our understanding of the tree
116	myhash = {}
117	taxalist = taxlist.split('\t')[1:] #get rid of preceding genus species
118
119	myhash['Protozoa','Acritarcha','Not assigned'] = ['Eukaryota','EE','Acritarcha','Not assigned']
120	myhash['Protozoa','Apicomplexa','Conoidasida'] = ['Eukaryota','SAR','Alveolata','Apicomplexa','Conoidasida']
121	myhash['Protozoa','Apicomplexa','Not assigned'] = ['Eukaryota','SAR','Alveolata','Apicomplexa','Not assigned']
122	myhash['Protozoa','Cercozoa','Chlorarachniophyceae'] = ['Eukaryota','SAR','Rhizaria','Cercozoa','Chlorarachniophyceae']
123	myhash['Protozoa','Cercozoa','Phytomyxea'] = ['Eukaryota','SAR','Rhizaria','Cercozoa','Phytomyxea']
124	myhash['Protozoa','Choanozoa','Mesomycetozoea'] = ['Eukaryota','Opisthokonta','Choanozoa','Mesomycetozoea']
125	myhash['Protozoa','Choanozoa','Not assigned'] = ['Eukaryota','Opisthokonta','Choanozoa','Not assigned']
126	myhash['Protozoa','Ciliophora','Ciliatea'] = ['Eukaryota','SAR','Alveolata','Ciliophora','Ciliatea']
127	myhash['Protozoa','Dinophyta','Dinophyceae'] = ['Eukaryota','SAR','Alveolata','Dinophyta','Dinophyceae']
128	myhash['Protozoa','Dinophyta','Ebriophyceae'] = ['Eukaryota','SAR','Alveolata','Dinophyta','Not assigned']
129	myhash['Protozoa','Dinophyta','Not assigned'] = ['Eukaryota','Excavata','Euglenozoa','Euglenida']
130	myhash['Protozoa','Euglenozoa','Euglenida'] = ['Eukaryota','Excavata','Euglenozoa','Euglenida']
131	myhash['Protozoa','Euglenozoa','Trypanosomatidae'] = ['Eukaryota','Excavata','Euglenozoa','Trypanosomatidae']
132	myhash['Protozoa','Flagellata','Not assigned'] = ['Eukaryota','Excavata','Flagellata','Not assigned']
133	myhash['Protozoa','Mycetozoa','Acrasiomycetes'] = ['Eukaryota','Amoebozoa','Mycetozoa','Acrasiomycetes']
134	myhash['Protozoa','Mycetozoa','Dictyosteliomycetes'] = ['Eukaryota','Amoebozoa','Mycetozoa','Dictyosteliomycetes']
135	myhash['Protozoa','Mycetozoa','Myxomycetes'] = ['Eukaryota','Amoebozoa','Mycetozoa','Myxomycetes']
136	myhash['Protozoa','Mycetozoa','Protosteliomycetes'] = ['Eukaryota','Amoebozoa','Mycetozoa','Protosteliomycetes']
137	myhash['Protozoa','Mycetozoa','Not assigned'] = ['Eukaryota','Amoebozoa','Mycetozoa']
138	myhash['Protozoa','Not assigned','Acantharia'] = ['Eukaryota','SAR','Rhizaria','Acantharia']
139	myhash['Protozoa','Not assigned','Filosia'] = ['Eukaryota','SAR','Rhizaria','Filosia']
140	myhash['Protozoa','Not assigned','Granuloreticulosea'] = ['Eukaryota','SAR','Rhizaria','Granuloreticulosea']
141	myhash['Protozoa','Not assigned','Haplosporea'] = ['Eukaryota','SAR','Rhizaria','Haplosporea']
142	myhash['Protozoa','Not assigned','Heliozoa','Actinophryida'] = ['Eukaryota','SAR','Stramenopile','Actinophryidae']
143	myhash['Protozoa','Not assigned','Heliozoa','Centrohelida'] = ['Eukaryota','EE','Centrohelida']
144	myhash['Protozoa','Not assigned','Heliozoa','Desmothothoracida'] = ['Eukaryota','SAR','Rhizaria','Desmothoracida']
145	myhash['Protozoa','Not assigned','Heliozoa','Desmothoracida'] = ['Eukaryota','SAR','Rhizaria','Desmothoracida']
146	myhash['Protozoa','Not assigned','Lobosa'] = ['Eukaryota','Amoebozoa','Lobosa']
147	myhash['Protozoa','Not assigned','Not assigned','Jakobaceae'] = ['Eukaryota','Excavata','Jakobid']
148	myhash['Protozoa','Not assigned','Sporozoa'] = ['Eukaryota','SAR','Alveolata','Apicomplexa']
149	myhash['Protozoa','Parabasalia','Not assigned'] = ['Eukaryota','Excavata','Parabasalia']
150	myhash['Protozoa','Percolozoa','Heterolobosea'] = ['Eukaryota','Excavata','Heterolobosea']
151	myhash['Protozoa','Sarcomastigophora','Phytomastigophora'] = ['Eukaryota','SAR','Alveolata','Dinoflagellate']
152	myhash['Protozoa','Sarcomastigophora','Zoomastigophora','Diplomonadida'] = ['Eukaryota','Excavata','Fornicata']
153	myhash['Protozoa','Sarcomastigophora','Zoomastigophora','Trichomonadida'] = ['Eukaryota','Excavata','Parabasalia']
154	myhash['Protozoa','Xenophyophora','Psamminida'] = ['Eukaryota','SAR','Rhizaria','Xenophyophora','Psamminida']
155	myhash['Protozoa','Xenophyophora','Stannomida'] = ['Eukaryota','SAR','Rhizaria','Xenophyophora','Stannomida']
156	myhash['Protozoa','Sarcomastigophora','Polycystina'] = ['Eukaryota','SAR','Rhizaria','Radiolaria']
157	myhash['Protozoa','Myzozoa','Perkinsea'] = ['Eukaryota','EE','Perkinsus']
158	myhash['Chromista','Cryptophyta','Cryptophyceae'] = ['Eukaryota','EE','Cryptophyta','Cryptophyceae']
159	myhash['Chromista','Haptophyta','Not assigned'] = ['Eukaryota','EE','Haptophyta','Not assigned']
160	myhash['Chromista','Haptophyta','Prymnesiophyceae'] = ['Eukaryota','EE','Haptophyta','Prymnesiophyceae']
161	myhash['Chromista','Hyphochytriomycota','Hyphochytriomycetes'] = ['Eukaryota','SAR','Stramenopile','Hyphochytriomycetes']
162	myhash['Chromista','Labyrinthista','Labyrinthulea'] = ['Eukaryota','SAR','Stramenopile','Labyrinthulea']
163	myhash['Protozoa','Labyrinthista','Labyrinthulea'] = ['Eukaryota','SAR','Stramenopile','Labyrinthulea']
164	myhash['Chromista','Ochrophyta','Bodonophyceae'] = ['Eukaryota','SAR','Stramenopile','Bodonophyceae']
165	myhash['Chromista','Ochrophyta','Chrysophyceae'] = ['Eukaryota','SAR','Stramenopile','Chrysophyceae']
166	myhash['Chromista','Ochrophyta','Coscinodiscophyceae'] = ['Eukaryota','SAR','Stramenopile','Coscinodiscophyceae']
167	myhash['Chromista','Ochrophyta','Craspedophyceae'] = ['Eukaryota','SAR','Stramenopile','Craspedophyceae']
168	myhash['Chromista','Ochrophyta','Dictyochophyceae'] = ['Eukaryota','SAR','Stramenopile','Dictyochophyceae']
169	myhash['Chromista','Ochrophyta','Eustigmatophyceae'] = ['Eukaryota','SAR','Stramenopile','Eustigmatophyceae']
170	myhash['Chromista','Ochrophyta','Fragilariophyceae'] = ['Eukaryota','SAR','Stramenopile','Fragilariophyceae']
171	myhash['Chromista','Ochrophyta','Hexamitophyceae'] = ['Eukaryota','SAR','Stramenopile','Hexamitophyceae']
172	myhash['Chromista','Ochrophyta','Phaeophyceae'] = ['Eukaryota','SAR','Stramenopile','Phaeophyceae']
173	myhash['Chromista','Ochrophyta','Raphidophyceae'] = ['Eukaryota','SAR','Stramenopile','Raphidophyceae']
174	myhash['Chromista','Ochrophyta','Synurophyceae'] = ['Eukaryota','SAR','Stramenopile','Synurophyceae']
175	myhash['Chromista','Ochrophyta','Xanthophyceae'] = ['Eukaryota','SAR','Stramenopile','Xanthophyceae']
176	myhash['Chromista','Oomycota','Not assigned'] = ['Eukaryota','SAR','Stramenopile','Not assigned']
177	myhash['Chromista','Oomycota','Oomycetes'] = ['Eukaryota','SAR','Stramenopile','Oomycetes']
178	myhash['Chromista','Sagenista','Bicosoecophyceae'] = ['Eukaryota','SAR','Stramenopile','Bicosoecophyceae']
179	myhash['Chromista','Ochrophyta','Pelagophyceae'] = ['Eukaryota','SAR','Stramenopile','Pelagophyceae']
180	myhash['Chromista','Ochrophyta','Bolidophyceae'] = ['Eukaryota','SAR','Stramenopile','Bolidophyceae']
181	myhash['Chromista','Ochrophyta','Pinguiophyceae'] = ['Eukaryota','SAR','Stramenopile','Pinguiophyceae']
182	myhash['Chromista','Not assigned','Schizocladiophyceae'] = ['Eukaryota','SAR','Stramenopile','Schizocladiophyceae']
183	myhash['Chromista','Ochrophyta','Pinguiophyceae'] = ['Eukaryota','SAR','Stramenopile','Pinguiophyceae']
184	myhash['Chromista','Not assigned','Developayella'] = ['Eukaryota','SAR','Stramenopile','Developayella']
185
186	for key in myhash.keys():
187		newtaxlist = myhash[key]
188		keylist = key
189		if all(x in taxalist for x in keylist):
190			for y in keylist:
191				taxalist.remove(y) #remove old list
192			for item in taxalist:
193				newtaxlist.append(item)
194			#print newtaxlist
195			return newtaxlist
196			
197##########################################################################################
198def mark_notinOTToL(OTToL):
199	i = 0
200	infile = open('allCoLrenamed_taxa','r').readlines()
201	infile2 = open(OTToL,'r').readlines()
202	outfile = open('new_sp_2add2OTToL','a')
203	taxonDict = {}
204	for line in infile2:
205		taxid = line.split('\t')[0]
206		parid = line.split('\t')[1]
207		taxon = line.split('\t')[3]
208		taxonDict[taxon] = line 
209	
210	
211	
212	for line in infile:
213		try:
214			assert re.search('[A-Z]',line.split()[-2][0]) and re.search('[a-z]',line.split()[-1][0])
215			gensp = line.split()[-2] + ' ' + line.split()[-1]
216		except:
217			outfile2 = open('error_entering','a')
218			outfile2.write(line)
219			outfile2.close()
220			gensp = ''
221		try:
222			outfile2 = open('already_in','a')
223			outfile2.write(taxonDict[gensp])
224			outfile2.close()			
225		except:
226			if gensp != '':
227				outfile.write(line)
228	
229	outfile.close()
230##########################################################################################
231
232def check(line):
233	
234	#error = open('errorlog','a')
235	#infile = open('taxa_' + f,'r')
236	#for line in infile:
237	if line == 'Virus':
238		return
239	if line[0] not in ['Eukaryota','Bacteria','Archaea']:
240		error = open('errorlog','a')
241		error.write(line)
242		error.close()
243	if line[0] == 'Eukaryota':
244		if line[1] not in ['Opisthokonta','Plantae','SAR','Amoebozoa','Excavata','EE']:
245			error = open('errorlog','a')
246			error.write(line)
247			error.close()
248
249	if line[0] == 'Bacteria':
250		if line[1] not in ['Actinobacteria','Cyanobacteria','Acidobacteria','Aquificae','Bacteroidetes','Chlamydiae','Chlorobi','Chloroflexi','Chrysiogenetes','Deferribacteres','Deinococcus-thermus','Dictyoglomi','Fibrobacteres','Firmicutes','Fusobacteria','Gemmatimonadetes','Lentisphaerae','Nitrospira','Planctomycetes','Proteobacteria','Spirochaetes','Thermodesulfobacteria','Thermomicrobia','Thermotogae','Verrucomicrobia','Flavobacteria','Sphingobacteria','Ochrophyta','Deinococci','Bacilli','Clostridia','Mollicutes','Bacteria','Alphaproteobacteria','Betaproteobacteria','Deltaproteobacteria','Epsilonproteobacteria','Gammaproteobacteria','Verrucomicrobiae']:
251			error = open('errorlog','a')
252			error.write(line)
253			error.close()
254
255	if line[0] == 'Archaea':
256		if line[1] not in ['Crenarchaeota','Euryarchaeota']:
257			error = open('errorlog','a')
258			error.write(line)
259			error.close()
260##########################################################################################
261
262def cleanup(toKEEP):
263	for f in os.listdir(os.curdir):
264		if f not in toKEEP:
265			os.system('mv ' + f + ' col_extra_files...check_and_discard')
266			
267##########################################################################################
268def mergesp(date, OTToL,file,newtxid):
269	infile1 = open(OTToL,'r').readlines()
270	infile2 = open(file,'r').readlines()
271	outfile = open('to_merge','w')
272	outfile2 = open('genera_2add2OTToL','a')
273	genusDict = {}
274	
275	for line in infile1:
276		#try:
277		if line.split('\t')[-2] == 'genus' or line.split('\t')[-2] == 'gen.':
278			genus = line.split('\t')[3].split()[0]
279			txid = line.split('\t')[0]
280			genusDict[genus] = txid
281		#except:
282		#	print line
283	for line in infile2:
284		if re.search('Not assigned',line):
285			unassignedout = open('Not_assigned','a')
286			unassignedout.write(line)
287			unassignedout.close()
288		else:
289			genus2add = line.split()[-2]
290			species2add = line.split()[-1]
291	
292			if genus2add in genusDict.keys():
293				newparid = genusDict[genus2add]
294				newtxid = newtxid + 1
295				today = datetime.datetime.now().strftime('%m_%d_%y')
296
297				newline = str(newtxid) + '\t' + str(newparid) + '\tCoL_' + date + '\t' + genus2add + ' ' + species2add + '\t\tspecies\t' + today + '\n'
298			
299				outfile.write(newline)
300			else:
301				outfile2.write(line)	
302	outfile.close()
303	outfile2.close()
304	return newtxid
305##########################################################################################
306def mergegen(date, OTToL,file,newtxid):
307	infile1 = open(OTToL,'r').readlines()
308	infile2 = open(file,'r').readlines()
309	outfile = open('to_merge','a')
310	outfile2 = open('stillNotinOTToL','a')
311	familyDict = {}
312	addedList= []	
313	for line in infile1:
314		#try:
315		if line.split('\t')[-2] != 'species' and line.split('\t')[-2] != 'subspecies' and line.split('\t')[-2] != 'genus':
316			try:
317				family = line.split('\t')[3].split()[0]
318			except:
319				family = line.split('\t')[3]
320			txid = line.split('\t')[0]
321			familyDict[family] = txid
322		#except:
323		#	print line
324	for line in infile2:
325			
326		genus2add = line.split()[-2]
327		fam2add = line.split()[-3]
328		
329		if genus2add.strip() not in addedList:
330			addedList.append(genus2add.strip())
331			if fam2add in familyDict.keys():
332				newparid = familyDict[fam2add]
333				newtxid = newtxid + 1
334				today = datetime.datetime.now().strftime('%m_%d_%y')
335
336				newline = str(newtxid) + '\t' + str(newparid) + '\tCoL_' + date + '\t' + genus2add +  '\t\tgenus\t' + today + '\n'
337			
338				outfile.write(newline)
339			else:
340				outfile2.write(line)	
341	outfile.close()
342	outfile2.close()
343	add(OTToL,'to_merge')
344	os.system('cat ' +  OTToL + ' toAdd_' + OTToL + '+to_merge > ' + OTToL + 'genadded')  #FIXED THIS TO LOOK FOR DUPLICATES/HOMONYMS
345	os.system('cp to_merge to_merge_1')
346	os.system('mv ' + file  + ' ' + file + '_1')
347	mergesp(date, OTToL + 'genadded',file + '_1',newtxid+1 )
348
349##########################################################################################
350def add(file,file2):
351	infile1 = open(file,'r').readlines()
352	infile2 = open(file2,'r').readlines()
353	outfile = open('toAdd_' + file + '+' + file2,'w')
354	outfile2 = open('dups_2check_before_adding2_' + file + '+' + file2,'w')
355	outfile3 = open('homonyms_2check_before_adding2_' + file + '+' + file2,'w')
356	taxlist = []
357	homlist = []
358	taxidlist = []
359	taxlist2 = []
360	for line in infile1:
361		tax = line.split('\t')[3]
362		taxid = line.split('\t')[0]
363		db = line.split('\t')[2]
364		taxlist.append(tax)
365		taxidlist.append(taxid)
366		if re.search('_hom',db):
367			homlist.append(tax)
368	#print 'taxlist made'
369	
370	for line in infile2:
371		tax = line.split('\t')[3]
372		taxid = line.split('\t')[0]
373		parid = line.split('\t')[1]
374		if tax not in taxlist2:
375			taxlist2.append(tax)
376			if tax not in taxlist:
377				if checkID(taxid,parid,taxidlist) == True:
378					outfile.write(line)
379				elif checkID(taxid,parid,taxidlist) == 'parent':
380					errorlog = open('adderrorlog','a')
381					errorlog.write('No Parent: ' + line)
382					errorlog.close()
383				elif checkID(taxid,parid,taxidlist) == 'taxon':
384					errorlog = open('adderrorlog','a')
385					errorlog.write('TaxID exists: ' + line)
386					errorlog.close()
387			else:
388				if tax in homlist:
389					outfile3.write(line)
390				else:
391					outfile2.write(line)
392		else:
393			errorlog = open('adderrorlog','a')
394			errorlog.write('Trying to add twice: ' + line)
395			errorlog.close()			
396
397def checkID(taxid,parid,taxidlist):
398	if taxid in taxidlist:
399		return 'taxon'
400	elif parid not in taxidlist:
401		return 'parent'
402	else:
403		return True
404##########################################################################################
405
406
407def main():
408#	OTToL = raw_input('What is the latest OTToL? ')
409#	date = raw_input('What is the download date for the latest OTToL? ')
410	date = 'today'
411	OTToL = 'OTToL080912v2'
412	parsetextfromCoL()
413	for f in ['acceptedNames','provisionallyAcceptedNames']:
414		renameCoL(f)
415		check(f)
416	os.system('cat taxa_provisionallyAcceptedNames taxa_acceptedNames > allCoLrenamed_taxa')
417	mark_notinOTToL(OTToL)
418	os.system('mkdir col_extra_files...check_and_discard')
419	toKEEP = [OTToL,'new_sp_2add2OTToL','col','col_extra_files...check_and_discard','updateOTToLnewCoL.py']
420	cleanup(toKEEP)
421
422	##############################
423	nextnum = findlastnum(OTToL, 'CoL')
424	nextnum = mergesp(date, OTToL,'new_sp_2add2OTToL',nextnum)
425	
426	mergegen(date, OTToL,'genera_2add2OTToL',nextnum)
427	
428	add(OTToL + 'genadded','to_merge')
429	os.system('cat ' + OTToL + 'genadded toAdd_' + OTToL + 'genadded+to_merge   > ' + OTToL + 'final') 
430	toKEEP = [OTToL + 'final','col','col_extra_files...check_and_discard','updateOTToLnewCoL.py','updateOTToLnewCoL_README']
431	cleanup(toKEEP)
432main()
433
434