# /LSA/estim_mp_sparse_mat.py
# (scrape metadata: Python | 110 lines | 77 code | 25 blank | 8 comment | d95180da2a050c4fcd18ec6784386357 MD5)
#!/usr/bin/env python
import getopt
import glob
import os
import sys
import time
from collections import defaultdict

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import yaml
from scipy import linalg
from scipy.sparse import dok_matrix
from sklearn.covariance import GraphLassoCV, ledoit_wolf
from sklearn.datasets import make_sparse_spd_matrix

from job_vars import JobVars
from lsa_job import *
def mat2file(X, lNames, rNames, luC, p2save):
    """Dump matrix X to p2save as CSV.

    The header row is luC (corner label) followed by the column names
    rNames; each subsequent row is lNames[idx] followed by the values of
    X[idx, :]. Values and column names are stringified with str().
    """
    with open(p2save, 'w') as out:
        cols = ','.join(map(str, rNames))
        out.write('%s,%s\n' % (luC, cols))
        for idx, rowName in enumerate(lNames):
            vals = ','.join(map(str, X[idx, :]))
            out.write('%s,%s\n' % (rowName, vals))
help_message = 'usage example: python kmer_cluster_merge.py -r genus -p /project/home/'
if __name__ == "__main__":
    # Defaults; both are overridable on the command line.  Previously the
    # hard-coded pDir overwrote the -p argument and rName was never set
    # (all candidate assignments were commented out -> NameError).
    pDir = '/env/cns/proj/ADAM/LLDeep/PhyloMatrices/'
    rName = 'genus'  # taxonomic rank: genus/species/strain/class/family/order/phylum

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hp:r:')
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(help_message)
            sys.exit()
        elif opt == '-p':
            pDir = arg
        elif opt == '-r':
            rName = arg
    print(pDir)

    # One .dc dict per sample: {column index: value}.
    LS = glob.glob(pDir + 'dicts/' + rName + '/*.dc')
    print(len(LS))
    nbr_C = load_data(pDir + 'dicts/' + rName + '/nbr_cols.txt', fl='nbr')
    print(nbr_C)

    # Build a sparse samples-by-columns matrix from the per-sample dicts.
    S = dok_matrix((len(LS), nbr_C), dtype=float)
    sNames = []
    for i, fl in enumerate(LS):
        tD = load_data(fl, fl='list', dump='pk')
        sNames.append(fl[fl.rfind('/') + 1:fl.rfind('.')])
        for k, v in tD.items():
            S[i, k] = v

    # Nonzero-entry counts per column (pS0) and per row (pS1).
    pS0 = np.zeros(nbr_C, dtype=np.uint64)
    pS1 = np.zeros(len(LS), dtype=np.uint64)
    for ij, v in S.items():
        pS0[ij[1]] += 1
        pS1[ij[0]] += 1

    # Densify, keeping only columns that appear in at least one sample.
    # col_of gives O(1) column remapping (was an O(n) np.where per entry).
    nzi = np.nonzero(pS0)[0]
    col_of = {c: j for j, c in enumerate(nzi)}
    X = np.zeros((len(LS), nzi.size), dtype=float)
    for ij, v in S.items():
        X[ij[0], col_of[ij[1]]] = v

    # Drop all-zero rows/columns, filtering the row and column labels with
    # the same masks (previously sNames/nzi were left unfiltered, which
    # misaligned -- or crashed -- the CSV output whenever a row/col dropped).
    row_keep = np.sum(X, 1) != 0
    X = X[row_keep, :]
    sNames = [n for n, keep in zip(sNames, row_keep) if keep]
    col_keep = np.sum(X, 0) != 0
    X = X[:, col_keep]
    nzi = nzi[col_keep]
    iX = np.copy(X)  # snapshot of the unscaled (percent-like) values

    # Scale each row by that sample's merged read count, then divide by 200
    # (presumably a read-length normalizer -- TODO confirm) to get counts.
    for i in range(len(sNames)):
        nbrr = load_data(pDir + sNames[i] + '/merged_reads.nbr', fl='nbr')
        print('%s %s %s' % (i, sNames[i], nbrr))
        X[i, :] *= nbrr
    print(X)
    X /= 200
    X = X.astype(int)
    print(X)
    print(X.shape)

    luC = 'smpNm\\taxID'
    rNames = nzi[:]
    p2save = pDir + 'dicts/' + rName + '/' + 'mat_counts.csv'
    print(p2save)
    mat2file(X, sNames, rNames, luC, p2save)

    p2save = pDir + 'dicts/' + rName + '/' + 'mat_percents.csv'
    print(p2save)
    mat2file(iX, sNames, rNames, luC, p2save)