/data/expression/extract_gpl5175_info.py

https://gitlab.com/kmeyer/cns-count-analyses · Python · 70 lines · 55 code · 10 blank · 5 comment · 22 complexity · 904f48988732618ecdf77396dd22fa67 MD5 · raw file

  1. #!/usr/bin/env python3
  2. """Split gene information in GPL5175 NetAffx file.
  3. This spreads gene assignments across multiple lines.
  4. """
  5. import re
  6. from collections import namedtuple
  7. MAssignment = namedtuple("MAssignment",
  8. ["accid", "source", "description",
  9. "chr", "score", "coverage",
  10. "direct_probes", "possible_probes",
  11. "xhyb"])
  12. def get_gene_info(line):
  13. fields = line.strip().split("\t")
  14. if fields[-1] != "main":
  15. return
  16. if fields[6] == "---": # RANGE_START
  17. return
  18. if fields[10] == "---": # mRNA assignment
  19. return
  20. ensg_pat = re.compile(r"gene:(ENSG[R]{0,1}\d+)")
  21. affyid = fields[0]
  22. tot_probes = fields[8]
  23. symbols = {}
  24. if fields[9] != "---":
  25. for subfield in fields[9].split(" /// "):
  26. mid, symbol = subfield.split(" // ")[:2]
  27. symbols[mid.strip()] = symbol.strip()
  28. assignments = []
  29. for subfield in fields[10].split(" /// "):
  30. try:
  31. ma = MAssignment(*[i.strip() for i in subfield.split(" // ")])
  32. except TypeError:
  33. if "[WARNING: THIS FIELD TRUNCATED]" in subfield:
  34. continue
  35. raise
  36. assignments.append(ma)
  37. for ma in assignments:
  38. if ma.accid.startswith("NM_") or ma.accid.startswith("ENST"):
  39. symbol = symbols.get(ma.accid, "")
  40. ensg_match = ensg_pat.search(ma.description)
  41. ensg = ensg_match.group(1) if ensg_match else ""
  42. yield (affyid, symbol, ensg,
  43. ma.accid, ma.source, ma.score, ma.coverage,
  44. ma.possible_probes, ma.direct_probes, tot_probes)
  45. if __name__ == "__main__":
  46. info_table = "GPL5175-3188.txt"
  47. outfile = "gpl5175-geneinfo.csv"
  48. with open(info_table) as ifh:
  49. with open(outfile, "w") as ofh:
  50. headers = ["affyid", "symbol", "ensg",
  51. "id", "id_type", "score", "coverage",
  52. "pos_probes", "dir_probes", "tot_probes"]
  53. ofh.write(",".join(headers) + "\n")
  54. for line in ifh:
  55. if not line.startswith("#"):
  56. break
  57. for line in ifh:
  58. for infos in get_gene_info(line):
  59. ofh.write(",".join(infos) + "\n")