
/src/biogps/apps/dataset/geo/ParseGDSFiles.py

https://bitbucket.org/Apoc/biogps_core
Possible License(s): LGPL-2.1, Apache-2.0
#! /home/imacleod/venv/datasets/bin/python
###################################################
# NCBI GEO GDS dataset parser. Creates .csv file  #
# in preparation for uploading the dataset to the #
# service layer.                                  #
###################################################
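#
# A GDS SOFT file is plain text; abridged, the parts this script reads
# look roughly like (example values only):
#
#   ^DATASET = GDS1234
#   !dataset_title = Example experiment title
#   !dataset_pubmed_id = 12345678
#   !dataset_platform = GPL570
#   !dataset_table_begin
#   ID_REF      IDENTIFIER  GSM10001    GSM10002    ...
#   1007_s_at   DDR1        1234.5      2345.6      ...
#   !dataset_table_end
#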
from ast import literal_eval
from os import path
import csv
import glob
import psycopg2
import sys
import urllib2

sys.path.append('/home/imacleod/datachart_sl')
from service_layer.dataset.ds_loading.dataset_loader import cap_first, gen_ds_id

conn = psycopg2.connect("dbname=biogps_datasets_dev user=postgres "
                        "host=localhost port=5432")
cur = conn.cursor()
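
# NOTE: the connection is opened at import time; cur is only used by the
# commented-out INSERT statements inside parse_GDS_files().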


def parse_GDS_files():
    soft_files = glob.glob('/storage/sulab/GEO_Data/GDS/test_copies/*.soft')
    out_path = '/storage/sulab/GEO_Data/GDS/test_copies/csv/'
    test_platforms = ['GPL8300', 'GPL570', 'GPL96', 'GPL571', 'GPL97',
                      'GPL339', 'GPL340', 'GPL9523', 'GPL1261', 'GPL8321',
                      'GPL5766', 'GPL5759', 'GPL8492']
    # Files already uploaded
    uploaded_files = set()
    csv_reader = csv.reader(open('/storage/sulab/GEO_Data/GDS/test_copies/csv/'
                                 'uploaded_files.csv', 'r'))
    for row in csv_reader:
        for item in row:
            uploaded_files.add(item)
    for soft_file in soft_files:
        filename = path.splitext(path.basename(soft_file))[0]
        print '\n%s' % filename
        if filename not in uploaded_files:
            with open(soft_file, 'r') as s_file:
                # first_data_column starts as None (not False/0) so that a
                # sample column at index 0 is recorded correctly below
                first_data_column, in_data = None, False
                col_mask, factors, line_errors, title_list = [], [], [], []
                current_line, dataset_id, valid_length = 0, 0, 0
                dataset_name, geo_id, platform, pubmed_id, summary = \
                    '', '', '', '', ''
                display_params = {'Aggregate': '', 'Color': '', 'Sort': ''}
                # For use with matrix
                rep_dict = dict()
                data_list = list()
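                # Single pass over the file: metadata lines are matched by
                # prefix, and in_data tracks whether we are between the
                # !dataset_table_begin and !dataset_table_end markers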
                for line in s_file:
                    # GDS files use a space after the key, before the equal
                    # sign. Example: !dataset_platform = GPL570
                    if line.startswith('!dataset_platform '):
                        platform = line.rstrip('\n').split('=')[1].lstrip()
                        if platform in test_platforms:
                            print 'Platform: %s' % platform
                            dataset_id = gen_ds_id()
                            print 'Dataset_id: %s' % dataset_id
                            continue
                        else:
                            # Platform we're not interested in; go to next file
                            break
                    if line.startswith('!dataset_title'):
                        dataset_name = line.split('=')[1].strip()
                    if line.startswith('^DATASET'):
                        geo_id = line.split('=')[1].strip()
                    if line.startswith('!dataset_pubmed_id'):
                        pubmed_id = line.split('=')[1].strip()
                    if line.startswith('!dataset_description'):
                        summary = line.split('=')[1].strip()
                    if line.startswith('!dataset_table_begin'):
                        in_data = True
                    elif line.startswith('!dataset_table_end'):
                        in_data = False
                    elif in_data:
                        # If the line is blank there is no data; go to next
                        # file (a line read from a file always contains at
                        # least '\n', so test the stripped line)
                        if not line.strip():
                            break
                        # Column headers
                        if line.startswith('ID_REF'):
                            line = line.split()
                            # Remove Identifier column
                            line.pop(1)
                            # Count number of elements per line
                            valid_length = len(line)
                            # Set first data column, titles for samples
                            for i, j in enumerate(line[1:]):
                                if j.startswith('GSM') and \
                                        first_data_column is None:
                                    first_data_column = i
                                title_list.append(j)
                        else:
                            # Data line - remove extra spaces, split on tabs
                            row_data = list()
                            line = line.replace(': ', ':').split('\t')
                            # Remove Identifier column
                            line.pop(1)
                            if len(line) != valid_length:
                                if len(line) < valid_length:
                                    line_errors.append([filename, 'Line ' +
                                                        str(current_line) +
                                                        ': missing values'])
                                elif len(line) > valid_length:
                                    line_errors.append([filename, 'Line ' +
                                                        str(current_line) +
                                                        ': too many values'])
                            for i, j in enumerate(line):
                                if i == 0:
                                    reporter = j.strip()
                                    # Create both keys and values out of
                                    # reporters and their positions for
                                    # fast lookups, adjusting for headers
                                    rep_dict[reporter] = current_line
                                    rep_dict[current_line] = reporter
                                elif i >= first_data_column:
                                    # Round numeric values to three digits.
                                    # Let this fail! Don't wrap in try/
                                    # except. We need to know about
                                    # missing data.
                                    row_data.append(round(float(j), 3))
                            # Keep the parsed row (data_list accumulates the
                            # data matrix declared above)
                            data_list.append(row_data)
                            # Insert data
                            #cur.execute("insert into dataset_data values (%d, '%s', '%s');" % (dataset_id, reporter, row_data))
                            current_line += 1
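                # InSilicoDB's getcurations endpoint is assumed (judging from
                # the parsing below) to return a literal shaped like:
                #   {'curations': [{'curator': 'GEO', 'curid': 42, ...}, ...]}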
                # Lookup the GEO curation ID in order to get sample titles
                cur_id = 0
                u = urllib2.urlopen("http://insilico.ulb.ac.be/Publicutilities/getcurations?gse=%s" % geo_id)
                if u.getcode() == 200:
                    cur_data = literal_eval(u.read())
                    for i in cur_data['curations']:
                        if i['curator'] == 'GEO':
                            cur_id = i['curid']
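                # getannotations is assumed (judging from the loop below) to
                # return a mapping of sample IDs to annotation dicts, e.g.:
                #   {'GSM10001': {'title': "'control rep 1'", ...}, ...}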
                sample_titles = dict()
                if cur_id:
                    u = urllib2.urlopen("http://insilico.ulb.ac.be/Publicutilities/getannotations?gse=%s&gpl=%s&id=%s" % (geo_id, platform, cur_id))
                    if u.getcode() == 200:
                        titles_data = literal_eval(u.read())
                        for k, v in titles_data.iteritems():
                            sample_titles[k] = v['title'].strip("'")
                factors_dict = dict()
                # Request factors metadata from InSilicoDB
                u = urllib2.urlopen("http://insilico.ulb.ac.be/Publicutilities/getpreferedannotation?gse=%s&gpl=%s" % (geo_id, platform))
                if u.getcode() == 200:
                    # Parse the response into a dict keyed by sample ID
                    factors_data = literal_eval(u.read())
                    for i in title_list:
                        factors_dict[i] = factors_data[i]
                        if sample_titles:
                            factors_dict[i]['title'] = sample_titles[i]
                    for i in title_list:
                        factors.append({i: factors_dict[i]})
                else:
                    print '\nError retrieving json factors from InSilicoDB ' \
                          'for file %s!' % filename
                metadata = {"ID": dataset_id, "Name": dataset_name,
                            "Owner": "GEO", "GEO_ID": geo_id,
                            "PubMed_ID": pubmed_id, "Summary": summary,
                            "Col_Mask": col_mask, "Factors": factors,
                            "Display_Params": display_params}
                # Insert dataset metadata and matrix in DB
                #cur.execute(query_string)
                # A plain truth test suffices here; the old try/except
                # AttributeError could never fire on a list
                if line_errors:
                    print 'Errors:'
                    print str(line_errors)
        else:
            print 'Already uploaded!'
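

# Usage sketch (assumes the hard-coded paths and the biogps_datasets_dev
# database exist); progress is printed for each .soft file:
#   $ /home/imacleod/venv/datasets/bin/python ParseGDSFiles.py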
if __name__ == "__main__":
    parse_GDS_files()