PageRenderTime 51ms CodeModel.GetById 24ms RepoModel.GetById 1ms app.codeStats 0ms

/scripts/l1l2_run.py

https://bitbucket.org/slipguru/l1l2signature
Python | 283 lines | 182 code | 63 blank | 38 comment | 26 complexity | cd727e042417a8a1d08fc28296e1b091 MD5 | raw file
Possible License(s): BSD-3-Clause
  1. #!/usr/bin/python -u
  2. '''
  3. #!/usr/bin/env python -u
  4. '''
  5. # -*- coding: utf-8 -*-
  6. import os
  7. import imp
  8. import shutil
  9. import cPickle as pkl
  10. import random
  11. import numpy as np
  12. import pplus
  13. import l1l2py
  14. from l1l2signature import internals as l1l2_core
  15. from l1l2signature import utils as l1l2_utils
  16. def main(config_path, custom_name = None):
  17. # Configuration File
  18. config_dir = os.path.dirname(config_path)
  19. config = imp.load_source('config', config_path)
  20. # Data paths
  21. data_path = os.path.join(config_dir, config.data_matrix)
  22. labels_path = os.path.join(config_dir, config.labels)
  23. # Result dir initialization
  24. result_path = os.path.join(config_dir, config.result_path) #result base dir
  25. # result_dir = l1l2_core.result_dir_init(result_path, config_path, data_path, labels_path)
  26. result_dir = l1l2_core.result_dir_init(result_path, config_path, data_path, labels_path, custom_name)
  27. splits_dir = os.path.join(result_dir, 'splitsIO')
  28. try:
  29. is_permutation_test = config.is_permutation_test
  30. except:
  31. is_permutation_test = False
  32. if is_permutation_test:
  33. # print("IS PERMUTATION JOB")
  34. target_job = modelselection_perm_job
  35. else:
  36. # print("IS NORMAL JOB")
  37. target_job = modelselection_job
  38. print #--------------------------------------------------------------------
  39. print 'Reading data... '
  40. br = l1l2_utils.BioDataReader(data_path, labels_path,
  41. config.sample_remover,
  42. config.variable_remover,
  43. config.delimiter,
  44. config.samples_on,
  45. config.positive_label)
  46. data = br.data
  47. labels = br.labels
  48. print(' * Data shape:', data.shape)
  49. print(' * Labels shape:', labels.shape)
  50. print #--------------------------------------------------------------------
  51. print('Generating parameters ranges...')
  52. if is_permutation_test:
  53. labels_perm = labels.copy()
  54. np.random.shuffle(labels_perm)
  55. rs = l1l2_utils.RangesScaler(data, labels_perm, config.data_normalizer,
  56. config.labels_normalizer)
  57. else:
  58. rs = l1l2_utils.RangesScaler(data, labels, config.data_normalizer,
  59. config.labels_normalizer)
  60. tau_range = rs.tau_range(config.tau_range)
  61. mu_range = rs.mu_range(config.mu_range)
  62. lambda_range = np.sort(config.lambda_range)
  63. for name, range in (('tau', tau_range), ('mu', mu_range),
  64. ('lambda', lambda_range)):
  65. print(' * %2d values of %6s from %8.3f to %8.3f' % (len(range), name,
  66. range[0],
  67. range[-1]))
  68. print #-------------------------------------------------------------------
  69. print('Setting parallel infrastructure...')
  70. try:
  71. workers_servers = config.workers_servers
  72. except:
  73. workers_servers = None
  74. pc = pplus.PPlusConnection(debug=config.debug, workers_servers = workers_servers)
  75. print(' * Experiment id: %s' % pc.id)
  76. ### Truncate emergency log file
  77. # with open('/tmp/emergency_log.txt', 'w') as lf:
  78. # lf.write("Starting experiment id: {}\n".format(pc.id))
  79. # Puts file on the disk
  80. pc.put('DATAFILE', data_path)
  81. pc.put('LABELFILE', labels_path)
  82. print #--------------------------------------------------------------------
  83. print('Distributing external splits...')
  84. ext_k = config.external_k or len(labels)
  85. int_k = config.internal_k or (ext_k - 1)
  86. print('(# jobs = %d)' % ext_k)
  87. ext_cv_sets = config.cv_splitting(labels, ext_k)
  88. sparse, regularized, return_predictions = (True, False, True)
  89. if is_permutation_test:
  90. random_seeds = list()
  91. random.seed()
  92. for i in np.arange(ext_k):
  93. random_seeds.append(random.randint(0,1234567890))
  94. for i, (train_idxs, test_idxs) in enumerate(ext_cv_sets):
  95. input_key = 'in_split%d' % i #adds split ID to filename
  96. output_key = 'out_split%d' % i
  97. input_args=(config.sample_remover,
  98. config.variable_remover,
  99. config.delimiter,
  100. config.samples_on,
  101. config.positive_label,
  102. train_idxs, test_idxs,
  103. config.cv_splitting,
  104. mu_range, tau_range, lambda_range,
  105. int_k, config.cv_error, config.error,
  106. config.data_normalizer, config.labels_normalizer,
  107. sparse, regularized, return_predictions)
  108. # Saving args
  109. input_path = os.path.join(splits_dir, input_key)
  110. with open(input_path, 'w') as inputfile:
  111. pkl.dump(input_args, inputfile, pkl.HIGHEST_PROTOCOL)
  112. pc.put(input_key, input_path)
  113. random_seed = random_seeds[i]
  114. print(' * Split {} submitted, random seed = {}!'.format(i+1, random_seed))
  115. pc.submit(target_job,
  116. args=(input_key, output_key, random_seed),
  117. depfuncs=(l1l2_utils.BioDataReader,
  118. l1l2_utils.L1L2SignatureException,
  119. l1l2_utils._check_unique_labels),
  120. modules=('numpy as np', # needed by BioReader
  121. 'l1l2py', 'cPickle as pkl'))
  122. print #--------------------------------------------------------------------
  123. print('Execution...')
  124. result_keys = pc.collect()
  125. if result_keys:
  126. print(' * Collected %d jobs on %d' % (len(result_keys), ext_k))
  127. print(' * Saving results data...')
  128. for output_key in result_keys:
  129. shutil.copy(pc.get_path(output_key), splits_dir)
  130. print('done')
  131. else:
  132. print(' * Error, no results collected!')
  133. print #END ----------------------------------------------------------------
  134. # PPlus Job function ----------------------------------------------------------
  135. def modelselection_job(pc, input_key, output_key, random_seed):
  136. # Importing inputs
  137. with open(pc.get_path(input_key)) as inputfile:
  138. tmp = pkl.load(inputfile)
  139. (sample_remover, variable_remover, delimiter, samples_on, positive_label,
  140. train_idxs, test_idxs, cv_splitting) = tmp[:8]
  141. args = tmp[8:]
  142. # Reading data
  143. br = BioDataReader(pc.get_path('DATAFILE'), pc.get_path('LABELFILE'),
  144. sample_remover, variable_remover, delimiter, samples_on,
  145. positive_label)
  146. data = br.data
  147. labels = br.labels
  148. # Calculating split submatrix
  149. #Xtr, Ytr = data[train_idxs,:], labels[train_idxs,:]
  150. #Xts, Yts = data[test_idxs, :], labels[test_idxs, :]
  151. Xtr, Ytr = data[train_idxs,:], labels[train_idxs]
  152. Xts, Yts = data[test_idxs, :], labels[test_idxs]
  153. # Parameters
  154. args = list(args) # writeble
  155. args[3] = cv_splitting(Ytr, args[3]) # args[3]=k -> splits
  156. # Execution
  157. result = l1l2py.model_selection(Xtr, Ytr, Xts, Yts, *args)
  158. # Saving outputs
  159. with pc.write_remotely(output_key) as resultfile:
  160. pkl.dump(result, resultfile, pkl.HIGHEST_PROTOCOL)
  161. return output_key
  162. # PPlus Job function ----------------------------------------------------------
  163. def modelselection_perm_job(pc, input_key, output_key, random_seed = None):
  164. # Importing inputs
  165. with open(pc.get_path(input_key)) as inputfile:
  166. tmp = pkl.load(inputfile)
  167. (sample_remover, variable_remover, delimiter, samples_on, positive_label,
  168. train_idxs, test_idxs, cv_splitting) = tmp[:8]
  169. args = tmp[8:]
  170. # Reading data
  171. br = BioDataReader(pc.get_path('DATAFILE'), pc.get_path('LABELFILE'),
  172. sample_remover, variable_remover, delimiter, samples_on,
  173. positive_label)
  174. data = br.data
  175. labels = br.labels
  176. # Calculating split submatrix
  177. #Xtr, Ytr = data[train_idxs,:], labels[train_idxs,:]
  178. #Xts, Yts = data[test_idxs, :], labels[test_idxs, :]
  179. Xtr, Ytr = data[train_idxs,:], labels[train_idxs]
  180. Xts, Yts = data[test_idxs, :], labels[test_idxs]
  181. # Parameters
  182. args = list(args) # writeble
  183. args[3] = cv_splitting(Ytr, args[3]) # args[3]=k -> splits
  184. ### control the random seed as well
  185. args.append(random_seed)
  186. args.append(input_key)
  187. # setattr(l1l2py, 'pc_id', input_key)
  188. # Execution
  189. result = l1l2py.model_selection_perm(Xtr, Ytr, Xts, Yts, *args)
  190. # Saving outputs
  191. with pc.write_remotely(output_key) as resultfile:
  192. pkl.dump(result, resultfile, pkl.HIGHEST_PROTOCOL)
  193. return output_key
  194. # Script entry ----------------------------------------------------------------
  195. if __name__ == '__main__':
  196. from optparse import OptionParser
  197. from l1l2signature import __version__
  198. usage = "usage: %prog [-c] configuration-file.py"
  199. parser = OptionParser(usage=usage, version='%prog ' + __version__)
  200. parser.add_option("-c", "--create", dest="create",
  201. action="store_true",
  202. help="create config file", default=False)
  203. parser.add_option("-n", "--custom_name", dest="custom_name", type = "string",
  204. action="store",
  205. help="custom name for experiment results folder", default=None)
  206. (options, args) = parser.parse_args()
  207. # if len(args) != 1:
  208. if len(args) < 1:
  209. parser.error('incorrect number of arguments')
  210. config_file_path = args[0]
  211. if options.create:
  212. from l1l2signature import config
  213. std_config_path = config.__file__
  214. if std_config_path.endswith('.pyc'):
  215. std_config_path = std_config_path[:-1]
  216. if os.path.exists(config_file_path):
  217. parser.error('config file already exists')
  218. shutil.copy(std_config_path, config_file_path)
  219. else:
  220. main(os.path.abspath(config_file_path), options.custom_name)