/data/expression/extract_gpl5175_info.py
https://gitlab.com/kmeyer/cns-count-analyses · Python · 70 lines · 55 code · 10 blank · 5 comment · 22 complexity · 904f48988732618ecdf77396dd22fa67 MD5 · raw file
- #!/usr/bin/env python3
- """Split gene information in GPL5175 NetAffx file.
- This spreads gene assignments across multiple lines.
- """
- import re
- from collections import namedtuple
# One " // "-separated entry of the mRNA assignment column (index 10).
MAssignment = namedtuple(
    "MAssignment",
    [
        "accid", "source", "description",
        "chr", "score", "coverage",
        "direct_probes", "possible_probes",
        "xhyb",
    ],
)

# Marker text carried by subfields that were cut short in the input table.
_TRUNCATION_MARKER = "[WARNING: THIS FIELD TRUNCATED]"
# Extracts the Ensembl gene id embedded in an assignment description.
_ENSG_PATTERN = re.compile(r"gene:(ENSG[R]{0,1}\d+)")
# Only accessions starting with one of these prefixes are reported.
_REPORTED_PREFIXES = ("NM_", "ENST")


def get_gene_info(line):
    """Yield one record per reported transcript assignment of a table row.

    The row is split on tabs.  Nothing is yielded unless the last column is
    ``"main"`` and neither column 6 (RANGE_START) nor column 10 (mRNA
    assignment) is ``"---"``.

    Column 9 (presumably the gene assignment column -- confirm against the
    table header) is read as ``" /// "``-separated entries whose first two
    ``" // "``-separated parts map an accession to a gene symbol.  Column 10
    is read as ``" /// "``-separated entries of nine ``" // "``-separated
    parts (see ``MAssignment``).  Entries that are too short *and* carry the
    truncation marker are skipped in both columns.

    Args:
        line: One raw, tab-separated data line of the table.

    Yields:
        Tuples of strings ``(affyid, symbol, ensg, accid, source, score,
        coverage, possible_probes, direct_probes, tot_probes)`` for every
        assignment whose accession starts with ``NM_`` or ``ENST``.
        ``symbol`` and ``ensg`` are ``""`` when unknown.

    Raises:
        ValueError: A column-9 entry has fewer than two parts and is not
            marked as truncated.
        TypeError: A column-10 entry does not have exactly nine parts and is
            not marked as truncated.
        IndexError: A ``"main"`` row has too few columns.
    """
    fields = line.strip().split("\t")
    if fields[-1] != "main":
        return
    if fields[6] == "---":  # RANGE_START
        return
    if fields[10] == "---":  # mRNA assignment
        return

    affyid = fields[0]
    tot_probes = fields[8]

    # Accession -> gene symbol lookup built from column 9.
    symbols = {}
    if fields[9] != "---":
        for subfield in fields[9].split(" /// "):
            parts = subfield.split(" // ")
            if len(parts) < 2:
                # Same policy as for the mRNA assignments below: a fragment
                # left over from truncation is dropped, anything else that
                # is malformed is still an error.
                if _TRUNCATION_MARKER in subfield:
                    continue
                raise ValueError(
                    "malformed gene assignment entry: %r" % subfield)
            symbols[parts[0].strip()] = parts[1].strip()

    # Parse every assignment before yielding anything, so that a malformed
    # entry raises before any record of this row has been produced.
    assignments = []
    for subfield in fields[10].split(" /// "):
        try:
            ma = MAssignment(*[part.strip() for part in subfield.split(" // ")])
        except TypeError:
            if _TRUNCATION_MARKER in subfield:
                continue
            raise
        assignments.append(ma)

    for ma in assignments:
        if not ma.accid.startswith(_REPORTED_PREFIXES):
            continue
        symbol = symbols.get(ma.accid, "")
        ensg_match = _ENSG_PATTERN.search(ma.description)
        ensg = ensg_match.group(1) if ensg_match else ""
        yield (affyid, symbol, ensg,
               ma.accid, ma.source, ma.score, ma.coverage,
               ma.possible_probes, ma.direct_probes, tot_probes)
if __name__ == "__main__":
    # Imported here so this script block is self-contained.
    import csv

    info_table = "GPL5175-3188.txt"
    outfile = "gpl5175-geneinfo.csv"
    headers = ["affyid", "symbol", "ensg",
               "id", "id_type", "score", "coverage",
               "pos_probes", "dir_probes", "tot_probes"]

    # newline="" hands line-ending control to the csv writer, which is told
    # to end rows with "\n".
    with open(info_table) as ifh, open(outfile, "w", newline="") as ofh:
        # csv.writer quotes values containing commas or quotes, which a
        # plain ",".join() would silently turn into a malformed row.
        writer = csv.writer(ofh, lineterminator="\n")
        writer.writerow(headers)

        # Skip the leading "#" comment lines.  The first line that is not a
        # comment is consumed here as well and never parsed (presumably the
        # column header row -- confirm against the input file).
        for line in ifh:
            if not line.startswith("#"):
                break

        # Continue on the same file iterator: every remaining line is a
        # data row that may expand to zero or more output records.
        for line in ifh:
            writer.writerows(get_gene_info(line))