
/LSA/estim_mp_sparse_mat.py

https://gitlab.com/kyrgyzov/lsa_slurm
#!/usr/bin/env python
import sys, getopt
import glob, os, time
import numpy as np
from lsa_job import *
from job_vars import JobVars
from scipy.sparse import dok_matrix  # used below; import was missing (it may also arrive via the star import above)
import yaml
from collections import defaultdict
from scipy import linalg
from sklearn.datasets import make_sparse_spd_matrix
from sklearn.covariance import GraphLassoCV, ledoit_wolf  # renamed GraphicalLassoCV in scikit-learn >= 0.20
import matplotlib.pyplot as plt
import networkx as nx
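
# mat2file writes a labeled matrix to CSV: luC is the top-left corner label,
# rNames the column headers, lNames the row labels (one CSV row per entry).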
def mat2file(X, lNames, rNames, luC, p2save):
    with open(p2save, 'w') as wFile:
        head = luC + ','
        head += ','.join(map(str, rNames))
        wFile.write(head + '\n')
        for i in range(len(lNames)):
            line = lNames[i] + ','
            line += ','.join(map(str, X[i, :]))
            wFile.write(line + '\n')
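
# A hypothetical CSV produced by mat2file (names and values illustrative only):
#   smpNm\taxID,561,562,570
#   sampleA,12,0,3
#   sampleB,0,7,1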

help_message = 'usage example: python estim_mp_sparse_mat.py -p /project/home/'

if __name__ == "__main__":
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hp:')
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(help_message)
            sys.exit()
        elif opt == '-p':
            pDir = arg
    # NOTE: this hardcoded path overrides whatever was passed with -p
    pDir = '/env/cns/proj/ADAM/LLDeep/PhyloMatrices/'
    print(pDir)
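
    # Pick the taxonomic rank whose per-sample dictionaries will be
    # aggregated; the directory layout is assumed to be <pDir>/dicts/<rName>/.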
    rName = 'genus'  # default rank; in the original every assignment was commented out, leaving rName undefined
    # rName = 'species'
    # rName = 'strain'
    # rName = 'class'
    # rName = 'family'
    # rName = 'order'
    # rName = 'phylum'
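
    # Load every per-sample dictionary (<sample>.dc) for this rank and fill a
    # sparse samples-by-columns matrix S; load_data with fl='nbr' presumably
    # returns a stored integer (here, the total number of columns).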
    LS = glob.glob(pDir + 'dicts/' + rName + '/*.dc')
    print(len(LS))
    nbr_C = load_data(pDir + 'dicts/' + rName + '/nbr_cols.txt', fl='nbr')
    print(nbr_C)
    S = dok_matrix((len(LS), nbr_C), dtype=float)
    sNames = []
    for i, fl in enumerate(LS):
        tD = load_data(fl, fl='list', dump='pk')
        sNames.append(fl[fl.rfind('/') + 1:fl.rfind('.')])
        for k, v in tD.items():
            S[i, k] = v
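
    # Count nonzero entries per column (pS0) and per row (pS1); only the
    # column counts are used below, to find columns that are occupied at all.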
    pS0 = np.zeros(nbr_C, dtype=np.uint64)
    pS1 = np.zeros(len(LS), dtype=np.uint64)
    for ij, v in S.items():
        pS0[ij[1]] += 1
        pS1[ij[0]] += 1
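
    # Densify: keep only the occupied columns (nzi), then drop any all-zero
    # rows/columns that remain after the restriction.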
    nzi = np.nonzero(pS0)[0]
    X = np.zeros((len(LS), nzi.size), dtype=float)
    for ij, v in S.items():
        j = np.where(nzi == ij[1])[0]
        X[ij[0], j] = v
    X = X[np.sum(X, 1) != 0, :]
    X = X[:, np.sum(X, 0) != 0]
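
    # Keep iX as the unscaled (relative-abundance) copy; scale X by each
    # sample's merged read count and divide by 200 (presumably the read
    # length) to obtain integer counts. NOTE: this loop assumes no rows were
    # dropped above, otherwise row i of X would no longer match sNames[i].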
    iX = np.copy(X)
    for i in range(len(LS)):
        nbrr = load_data(pDir + sNames[i] + '/merged_reads.nbr', fl='nbr')
        print(i, sNames[i], nbrr)
        X[i, :] *= nbrr
    print(X)
    X /= 200
    X = X.astype(int)
    print(X)
    print(X.shape)
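
    # Write both matrices to CSV: the scaled integer counts and the original
    # unscaled values. NOTE: rNames is taken from nzi before the second
    # column filter above, so the headers assume that filter removed nothing.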
    luC = 'smpNm\\taxID'
    rNames = nzi[:]
    p2save = pDir + 'dicts/' + rName + '/' + 'mat_counts.csv'
    print(p2save)
    mat2file(X, sNames, rNames, luC, p2save)
    p2save = pDir + 'dicts/' + rName + '/' + 'mat_percents.csv'
    print(p2save)
    mat2file(iX, sNames, rNames, luC, p2save)