/bioutil/codon.go
Go | 393 lines | 368 code | 15 blank | 10 comment | 30 complexity | 60279b6189acded9f11e4f278c911e66 MD5 | raw file
- package bioutil
- // Codon tables.
- // These tables are based on parsing the NCBI files:
- // ftp://ftp.ncbi.nih.gov/entrez/misc/data/gc.prt.
- import (
- "bufio"
- "io"
- "regexp"
- "strings"
- )
// CodonTable is one genetic code (codon translation) table in the form
// published by NCBI in gc.prt.
type CodonTable struct {
	Id          string            // Table id: the digits of the table's "id" field.
	Name        string            // First "name" entry of the table.
	AltName     string            // Second "name" entry; empty when there is only one.
	Table       map[string]string // Codon -> one-letter residue taken from ncbieaa ("*" marks a stop).
	BackTable   map[string]string // Residue -> codon pattern; the key ">" holds the start-codon pattern.
	StopCodons  []string          // Codons whose ncbieaa letter is '*', in table order.
	StartCodons []string          // Codons whose sncbieaa letter is 'M', in table order.
}

// Patterns recognising the pieces of a gc.prt table block. They are compiled
// once at package initialisation instead of on every ParseGC call.
var (
	gcStartRe    = regexp.MustCompile(`^\s{`)
	gcEndRe      = regexp.MustCompile(`^\s}`)
	gcIDRe       = regexp.MustCompile(`id\s(\d+)`)
	gcNameRe     = regexp.MustCompile(`name\s+\"(.+)\"`)
	gcNameOpenRe = regexp.MustCompile(`name\s+"`)
	gcAasRe      = regexp.MustCompile(`\sncbieaa\s+"(.+)"`)
	gcStartsRe   = regexp.MustCompile(`\ssncbieaa\s+"(.+)"`)
	gcBaseRe     = regexp.MustCompile(`Base\d\s+(\w+)`)
)

// gcFormatError describes a table block that cannot be turned into a
// CodonTable because a required field is missing or too short.
type gcFormatError string

// Error implements the error interface.
func (e gcFormatError) Error() string { return string(e) }

// gcRecord accumulates the raw fields of the table block being read.
type gcRecord struct {
	id      string   // digits captured from the "id" line (last match wins)
	aas     string   // ncbieaa string: one residue letter per codon
	starts  string   // sncbieaa string: 'M' marks a start codon
	names   []string // captured "name" values, in file order
	bases   []string // captured "Base" rows, in file order
	pending string   // a name line whose closing quote has not been seen yet
}

// feed extracts whatever fields the given line of a table block carries.
func (rec *gcRecord) feed(line string) {
	text := strings.TrimRight(line, "\r\n")
	if rec.pending != "" {
		// Only text that can belong to the open quote is glued on; a line
		// holding complete quoted fields means the name was malformed, so
		// the fragment is dropped and the line is parsed on its own.
		if q := strings.Count(text, `"`); q == 0 || q%2 == 1 {
			text = rec.pending + " " + strings.TrimSpace(text)
		}
		rec.pending = ""
	}
	// gc.prt wraps long names over two lines; hold the first half back until
	// the closing quote arrives.
	if gcNameOpenRe.MatchString(text) && strings.Count(text, `"`)%2 == 1 {
		rec.pending = strings.TrimRight(text, " \t")
		return
	}
	if m := gcIDRe.FindStringSubmatch(text); m != nil {
		rec.id = m[1]
	}
	if m := gcNameRe.FindStringSubmatch(text); m != nil {
		rec.names = append(rec.names, m[1])
	}
	if m := gcAasRe.FindStringSubmatch(text); m != nil {
		rec.aas = m[1]
	}
	if m := gcStartsRe.FindStringSubmatch(text); m != nil {
		rec.starts = m[1]
	}
	if m := gcBaseRe.FindStringSubmatch(text); m != nil {
		rec.bases = append(rec.bases, m[1])
	}
}

// table validates the collected fields and builds the CodonTable.
func (rec *gcRecord) table() (CodonTable, error) {
	label := `genetic code table "` + rec.id + `": `
	if len(rec.names) == 0 {
		return CodonTable{}, gcFormatError(label + "no name entry")
	}
	if len(rec.bases) < 3 {
		return CodonTable{}, gcFormatError(label + "fewer than three Base lines")
	}
	n := len(rec.aas)
	for _, row := range []string{rec.starts, rec.bases[0], rec.bases[1], rec.bases[2]} {
		if len(row) < n {
			return CodonTable{}, gcFormatError(label + "sncbieaa or Base line shorter than ncbieaa")
		}
	}

	ct := CodonTable{
		Id:          rec.id,
		Name:        rec.names[0],
		Table:       make(map[string]string, n),
		StopCodons:  []string{},
		StartCodons: []string{},
	}
	if len(rec.names) > 1 {
		ct.AltName = rec.names[1]
	}

	// Column i of the three Base rows spells the codon translated by ncbieaa[i].
	byResidue := make(map[string][]string)
	for i := 0; i < n; i++ {
		codon := string([]byte{rec.bases[0][i], rec.bases[1][i], rec.bases[2][i]})
		aa := rec.aas[i : i+1]
		ct.Table[codon] = aa
		byResidue[aa] = append(byResidue[aa], codon)
		if aa == "*" {
			ct.StopCodons = append(ct.StopCodons, codon)
		}
		if rec.starts[i] == 'M' {
			ct.StartCodons = append(ct.StartCodons, codon)
			byResidue[">"] = append(byResidue[">"], codon)
		}
	}

	ct.BackTable = make(map[string]string, len(byResidue))
	for aa, codons := range byResidue {
		ct.BackTable[aa] = gcCodonPattern(codons)
	}
	return ct, nil
}

// gcCodonPattern summarises a set of codons position by position: a single
// base is written as is, four distinct bases as ".", and anything else as
// "[x|y]". Bases are listed in the order they first occur in codons, so the
// same input always yields the same pattern.
func gcCodonPattern(codons []string) string {
	pattern := ""
	for pos := 0; pos < 3; pos++ {
		seen := "" // distinct bases at this position, first-seen order
		for _, c := range codons {
			if strings.IndexByte(seen, c[pos]) < 0 {
				seen += c[pos : pos+1]
			}
		}
		switch len(seen) {
		case 1:
			pattern += seen
		case 4:
			pattern += "."
		default:
			pattern += "[" + strings.Join(strings.Split(seen, ""), "|") + "]"
		}
	}
	return pattern
}

// ParseGC reads genetic code tables in the format of NCBI's gc.prt from r and
// returns them keyed by table id.
//
// A table block starts at a line matching `^\s{` and ends at a line matching
// `^\s}`; the name, id, ncbieaa, sncbieaa and Base lines in between are
// collected. A name wrapped over several lines is joined with single spaces.
// If the input ends inside a block, the fields read so far are used.
//
// A non-nil error is returned when reading from r fails (io.EOF is not an
// error) or when a block has no name, fewer than three Base lines, or an
// sncbieaa/Base row shorter than ncbieaa. In that case tables holds the
// tables that were completed before the failure.
func ParseGC(r io.Reader) (tables map[string]CodonTable, err error) {
	tables = make(map[string]CodonTable)
	br := bufio.NewReader(r)

	var rec *gcRecord // non-nil while inside a table block
	for {
		line, rerr := br.ReadString('\n')
		if rerr != nil && rerr != io.EOF {
			return tables, rerr
		}
		switch {
		case rec == nil:
			if gcStartRe.MatchString(line) {
				rec = &gcRecord{}
			}
		case gcEndRe.MatchString(line):
			ct, terr := rec.table()
			if terr != nil {
				return tables, terr
			}
			tables[ct.Id] = ct
			rec = nil
		default:
			rec.feed(line)
		}
		if rerr == io.EOF {
			break
		}
	}

	// The input ended before the closing brace of the last block.
	if rec != nil {
		ct, terr := rec.table()
		if terr != nil {
			return tables, terr
		}
		tables[ct.Id] = ct
	}
	return tables, nil
}
- // BuildinCodonTables returns the 18 genetic code tables of NCBI gc.prt version 3.9, obtained by running ParseGC over the text embedded below on every call (a ParseGC error is discarded).
- // Prefer ParseGC on a current copy of gc.prt when up-to-date codon tables are needed.
- func BuildinCodonTables() map[string]CodonTable {
- gcContent := `--**************************************************************************
- -- This is the NCBI genetic code table
- -- Initial base data set from Andrzej Elzanowski while at PIR International
- -- Addition of Eubacterial and Alternative Yeast by J.Ostell at NCBI
- -- Base 1-3 of each codon have been added as comments to facilitate
- -- readability at the suggestion of Peter Rice, EMBL
- -- Later additions by Taxonomy Group staff at NCBI
- --
- -- Version 3.9
- -- Code 14 differs from code 9 only by translating UAA to Tyr rather than
- -- STOP. A recent study (Telford et al, 2000) has found no evidence that
- -- the codon UAA codes for Tyr in the flatworms, but other opinions exist.
- -- There are very few GenBank records that are translated with code 14,
- -- but a test translation shows that retranslating these records with code
- -- 9 can cause premature terminations. Therefore, GenBank will maintain
- -- code 14 until further information becomes available.
- --
- -- Version 3.8
- -- Added GTG start to Echinoderm mitochondrial code, code 9
- --
- -- Version 3.7
- -- Added code 23 Thraustochytrium mitochondrial code
- -- formerly OGMP code 93
- -- submitted by Gertraude Berger, Ph.D.
- --
- -- Version 3.6
- -- Added code 22 TAG-Leu, TCA-stop
- -- found in mitochondrial DNA of Scenedesmus obliquus
- -- submitted by Gertraude Berger, Ph.D.
- -- Organelle Genome Megasequencing Program, Univ Montreal
- --
- -- Version 3.5
- -- Added code 21, Trematode Mitochondrial
- -- (as deduced from: Garey & Wolstenholme,1989; Ohama et al, 1990)
- -- Added code 16, Chlorophycean Mitochondrial
- -- (TAG can translated to Leucine instaed to STOP in chlorophyceans
- -- and fungi)
- --
- -- Version 3.4
- -- Added CTG,TTG as allowed alternate start codons in Standard code.
- -- Prats et al. 1989, Hann et al. 1992
- --
- -- Version 3.3 - 10/13/95
- -- Added alternate intiation codon ATC to code 5
- -- based on complete mitochondrial genome of honeybee
- -- Crozier and Crozier (1993)
- --
- -- Version 3.2 - 6/24/95
- -- Code Comments
- -- 10 Alternative Ciliate Macronuclear renamed to Euplotid Macro...
- -- 15 Bleharisma Macro.. code added
- -- 5 Invertebrate Mito.. GTG allowed as alternate initiator
- -- 11 Eubacterial renamed to Bacterial as most alternate starts
- -- have been found in Achea
- --
- --
- -- Version 3.1 - 1995
- -- Updated as per Andrzej Elzanowski at NCBI
- -- Complete documentation in NCBI toolkit documentation
- -- Note: 2 genetic codes have been deleted
- --
- -- Old id Use id - Notes
- --
- -- id 7 id 4 - Kinetoplast code now merged in code id 4
- -- id 8 id 1 - all plant chloroplast differences due to RNA edit
- --
- --*************************************************************************
- Genetic-code-table ::= {
- {
- name "Standard" ,
- name "SGC0" ,
- id 1 ,
- ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
- sncbieaa "---M---------------M---------------M----------------------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- },
- {
- name "Vertebrate Mitochondrial" ,
- name "SGC1" ,
- id 2 ,
- ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",
- sncbieaa "--------------------------------MMMM---------------M------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- },
- {
- name "Yeast Mitochondrial" ,
- name "SGC2" ,
- id 3 ,
- ncbieaa "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
- sncbieaa "----------------------------------MM----------------------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- },
- {
- name "Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate
- Mitochondrial; Mycoplasma; Spiroplasma" ,
- name "SGC3" ,
- id 4 ,
- ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
- sncbieaa "--MM---------------M------------MMMM---------------M------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- },
- {
- name "Invertebrate Mitochondrial" ,
- name "SGC4" ,
- id 5 ,
- ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG",
- sncbieaa "---M----------------------------MMMM---------------M------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- },
- {
- name "Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear" ,
- name "SGC5" ,
- id 6 ,
- ncbieaa "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
- sncbieaa "-----------------------------------M----------------------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- },
- {
- name "Echinoderm Mitochondrial; Flatworm Mitochondrial" ,
- name "SGC8" ,
- id 9 ,
- ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
- sncbieaa "-----------------------------------M---------------M------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- },
- {
- name "Euplotid Nuclear" ,
- name "SGC9" ,
- id 10 ,
- ncbieaa "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
- sncbieaa "-----------------------------------M----------------------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- },
- {
- name "Bacterial, Archaeal and Plant Plastid" ,
- id 11 ,
- ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
- sncbieaa "---M---------------M------------MMMM---------------M------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- },
- {
- name "Alternative Yeast Nuclear" ,
- id 12 ,
- ncbieaa "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
- sncbieaa "-------------------M---------------M----------------------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- },
- {
- name "Ascidian Mitochondrial" ,
- id 13 ,
- ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG",
- sncbieaa "---M------------------------------MM---------------M------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- },
- {
- name "Alternative Flatworm Mitochondrial" ,
- id 14 ,
- ncbieaa "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
- sncbieaa "-----------------------------------M----------------------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- } ,
- {
- name "Blepharisma Macronuclear" ,
- id 15 ,
- ncbieaa "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
- sncbieaa "-----------------------------------M----------------------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- } ,
- {
- name "Chlorophycean Mitochondrial" ,
- id 16 ,
- ncbieaa "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
- sncbieaa "-----------------------------------M----------------------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- } ,
- {
- name "Trematode Mitochondrial" ,
- id 21 ,
- ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
- sncbieaa "-----------------------------------M---------------M------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- } ,
- {
- name "Scenedesmus obliquus Mitochondrial" ,
- id 22 ,
- ncbieaa "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
- sncbieaa "-----------------------------------M----------------------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- } ,
- {
- name "Thraustochytrium Mitochondrial" ,
- id 23 ,
- ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
- sncbieaa "--------------------------------M--M---------------M------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- } ,
- {
- name "Pterobranchia Mitochondrial" ,
- id 24 ,
- ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG",
- sncbieaa "---M---------------M---------------M---------------M------------"
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
- -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
- -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
- }
- }`
- tables, _ := ParseGC(strings.NewReader(gcContent))
- return tables
- }