
/analyzer/featureExtractor.py

https://github.com/kaeaura/churn_prediction_proj
# Jing-Kai Lou (kaeaura@gmail.com)
# Sun Feb 26 15:17:41 CST 2012
import os
import re
import sys
import time
import getopt
import cPickle
import networkx as nx
from db import LiteDB
from scipy import average, median, array, log10, sqrt
from scipy.optimize import leastsq
from collections import Counter, defaultdict
from itertools import ifilter
__author__ = "Jing-Kai Lou (kaeaura@gmail.com)"
powerlaw = lambda x, amp, index: amp * (x**index)
def strength_centrality(G, weight, k = None, mode = 'out'):
    '''
    Return the degree centrality according to the weight on the edges.
    Arguments:
    --
    * weight: str. the name of the edge attribute used for the weight values
    * k: the node list. Defaults to G.nodes() if k is omitted
    * mode: str. should be "in", "out" or "all", which stand for the in_degree, out_degree, and degree respectively
    '''
    if k is None:
        k = G.nodes()
    n_weight_dict = dict()
    for node in k:
        if mode == 'out':
            n_weight_dict[node] = sum(map(lambda x: x[2][weight], G.out_edges(node, data = True)))
        elif mode == 'in':
            n_weight_dict[node] = sum(map(lambda x: x[2][weight], G.in_edges(node, data = True)))
        else:
            n_weight_dict[node] = sum(map(lambda x: x[2][weight], G.out_edges(node, data = True) + G.in_edges(node, data = True)))
    return(n_weight_dict)
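# Usage sketch (illustrative only; assumes the networkx 1.x API used throughout this file):
#   g = nx.DiGraph()
#   g.add_edge(1, 2, weight = 3)
#   g.add_edge(2, 1, weight = 1)
#   strength_centrality(g, weight = 'weight', mode = 'out')   # -> {1: 3, 2: 1}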
def average_degree(g):
    if g.order():
        d = g.degree().values()
        return(float(sum(d)) / g.order())
    else:
        return(0)
def average_in_degree(g):
    if g.order():
        d = g.in_degree().values()
        return(float(sum(d)) / g.order())
    else:
        return(0)
def average_out_degree(g):
    if g.order():
        d = g.out_degree().values()
        return(float(sum(d)) / g.order())
    else:
        return(0)
def degcor(g):
    """
    Calculate the Pearson correlation between the in-degree and out-degree of nodes
    in the given graph g.
    Parameters:
    -----------
    g: NetworkX DiGraph
    Returns:
    -------
    degree correlation, float
    """
    assert(g.is_directed())
    from scipy.stats import pearsonr
    x, y = list(), list()
    for n in g.nodes_iter():
        x.append(g.out_degree(n))
        y.append(g.in_degree(n))
    # pearsonr returns a (correlation, p-value) tuple; return only the
    # correlation so the result matches the documented float
    return(pearsonr(x, y)[0])
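# Usage sketch (illustrative): in this toy chain, nodes that send many arcs
# receive few, so in- and out-degree are perfectly anti-correlated.
#   g = nx.DiGraph([(1, 2), (1, 3), (2, 3)])
#   degcor(g)   # -> -1.0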
def to_undirected(g, is_bi = False):
    """
    Remove the self-loops from a given graph and make it undirected.
    Parameters:
    -----------
    g: NetworkX Graph, NetworkX DiGraph
    is_bi: bool, (default = False)
        If is_bi is true, then only bi-directional ties are kept.
        That means all single-directional arcs between nodes are removed.
    Returns:
    -------
    NetworkX Graph
    """
    # note: the is_bi filtering described above is not implemented here;
    # every arc becomes an edge
    g.remove_edges_from(g.selfloop_edges())
    return(nx.Graph(g))
def mean_clustering(g, normalized = False):
    """
    Calculate the clustering coefficient of a graph. If the given graph is directed,
    it is converted to undirected automatically, which means any arc becomes an edge.
    Parameters:
    -----------
    g: NetworkX Graph, NetworkX DiGraph
    normalized: bool, optional, (default = False)
    Returns:
    -------
    float, (normalized) clustering coefficient
    """
    # first, remove the self edges
    g = to_undirected(g)
    c = nx.average_clustering(g)
    # edge density of g; guard against graphs with fewer than two nodes
    rc = float(g.size() * 2) / (g.order() * (g.order() - 1)) if g.order() > 1 else 0
    if normalized:
        return(c / rc if rc > 0 else 0)
    else:
        return(c)
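# Usage sketch (illustrative): a triangle is fully clustered.
#   g = nx.Graph([(1, 2), (2, 3), (3, 1)])
#   mean_clustering(g)   # -> 1.0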
def randomly_clustering(g, tries = 10):
    """
    Compare the average clustering coefficient of g with that of random graphs h
    sharing the identical degree sequence. This function returns the comparison ratio.
    Parameters:
    -----------
    g: NetworkX Graph, NetworkX DiGraph
    tries: int, optional, (default = 10)
        number of tries (compared graphs)
    See also:
    ---------
    mean_clustering
    Returns:
    --------
    float, the ratio of avg clustering coefficients, avg_cc(g) / mean(avg_cc(h))
    """
    g = to_undirected(g)
    d = g.degree().values()
    c = mean_clustering(g, normalized = False)
    p = list()
    for t in xrange(tries):
        ng = nx.configuration_model(d, create_using = nx.Graph())
        p.append(mean_clustering(ng))
        del ng
    return(c / average(p))
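# Usage sketch (illustrative): ratios well above 1 indicate more clustering than
# the degree sequence alone would produce. Results vary from run to run because
# the configuration-model graphs are random.
#   randomly_clustering(nx.karate_club_graph(), tries = 10)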
def reinforce(g, weight = 'weight'):
    w = nx.get_edge_attributes(g, weight)
    if len(w):
        # an edge counts as "reinforced" when its weight value
        # (or its collection of weight entries) exceeds one
        reinforced = filter(lambda x: len(x) > 1 if hasattr(x, '__iter__') else x > 1, w.values())
        return(len(reinforced) / float(len(w)))
    else:
        return(0)
def reciprocity(g, return_cor = True):
    """
    Calculate the reciprocity of a directed graph. If return_cor is False,
    it uses the traditional way of quantifying reciprocity r: the ratio of
    the number of links pointing in both directions, L_r, to the total number of links L.
    Nevertheless, reciprocity r must be compared with the value r_rand expected
    in a random graph of exactly the same size and order; otherwise it has only a relative
    meaning and does not carry complete information by itself. To avoid that problem,
    this function can instead return a newer definition of reciprocity, rho: the correlation
    between the entries of the adjacency matrix of the directed graph.
    Parameters:
    -----------
    g: NetworkX DiGraph
    return_cor: bool, optional, (default = True)
        If true, return the correlation reciprocity rho
    Returns:
    -------
    reciprocity: float
    References:
    -----------
    [1] D. Garlaschelli and M. I. Loffredo,
    'Patterns of link reciprocity in directed networks,'
    arXiv.org, vol. cond-mat.dis-nn. 22-Apr.-2004.
    """
    if g:
        assert(g.is_directed())
        # first, remove the self-loops and duplicated edges;
        # in matrix terms, remove the diagonal elements and cap each entry at 1
        n = g.order()
        L = set(g.edges()).difference(set(g.selfloop_edges()))
        # collapse the directed edges into unordered node pairs
        L_unorder = map(lambda x: '-'.join(map(str, list(set(x)))), L)
        L_cnt = Counter()
        for l in iter(L_unorder):
            L_cnt[l] += 1
        L_r = filter(lambda x: L_cnt[x] > 1, L_cnt.iterkeys())
        r = 2 * float(len(L_r)) / len(L_unorder)
        a = float(len(L)) / (n * (n - 1))
        rho = ((r - a) / (1 - a))
        return(rho if return_cor else r)
    else:
        return(None)
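# Usage sketch (illustrative): two of the three arcs below are reciprocated.
#   g = nx.DiGraph([(1, 2), (2, 1), (2, 3)])
#   reciprocity(g, return_cor = False)   # -> 2/3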
def powerlaw_fit(xdata, ydata, err = 0.1):
    yerr = err * ydata
    logx = log10(xdata)
    logy = log10(ydata)
    logyerr = yerr / ydata
    # fit a straight line in log-log space: log10(y) = p[0] + p[1] * log10(x)
    fitfunc = lambda p, x: p[0] + p[1] * x
    errfunc = lambda p, x, y, err: (y - fitfunc(p, x)) / err
    pinit = [2.0, -2.0]
    out = leastsq(errfunc, pinit, args = (logx, logy, logyerr), full_output = 1)
    pfinal = out[0]
    covar = out[1]
    index = pfinal[1]
    amp = 10 ** pfinal[0]
    # the index is p[1], so its variance sits at covar[1][1]
    # (covar[0][0] belongs to the amplitude term)
    indexErr = sqrt(covar[1][1])
    return(amp, index, indexErr)
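# Usage sketch (illustrative): recover a known exponent from synthetic data.
#   x = array([1.0, 2.0, 4.0, 8.0, 16.0])
#   y = 5.0 * x ** -2.0
#   powerlaw_fit(x, y)   # -> (amp ~ 5.0, index ~ -2.0, small indexErr)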
def get_resilience_fraction(g, steps = 1000, mode = 'in'):
    """Remove nodes in decreasing degree order, step by step, and record the
    relative size of the largest weakly connected component after each step."""
    n = float(g.order())
    h = g.copy()
    step_width = max(1, n / steps)
    if mode == 'in':
        sorted_nodes = sorted(g.in_degree().items(), key = lambda x: x[1])
    else:
        sorted_nodes = sorted(g.out_degree().items(), key = lambda x: x[1])
    sorted_node_index = map(lambda n: n[0], sorted_nodes)
    fraction = [1]
    while len(sorted_node_index):
        rip = []
        while len(rip) < step_width:
            if len(sorted_node_index):
                rip.append(sorted_node_index.pop())
            else:
                break
        h.remove_nodes_from(rip)
        ws = nx.weakly_connected_components(h)
        w = len(ws[0]) if ws else 0
        fraction.append(w / n)
    return(dict(zip(range(len(fraction)), fraction)))
def get_degree_rate_distribution(g):
    """CDF of the in-degree/out-degree ratio over nodes with positive out-degree."""
    i = g.in_degree()
    o = g.out_degree()
    r = [i[n] / float(o[n]) for n in g.nodes() if o[n]]
    r_size = float(len(r))
    from numpy import linspace, concatenate
    x_step = concatenate((linspace(0.001, 1, num = 200), linspace(1, 10, num = 100), linspace(10, 1000, 100)))
    xdata, ydata = list(), list()
    xdata = list(x_step)
    ydata = [len(filter(lambda x: x <= sp, r)) / r_size for sp in x_step]
    return(xdata, ydata)
def get_link_weight_distribution(g, weight = 'weight'):
    """Complementary CDF of the edge weights: for each weight w,
    the fraction of edges with weight >= w."""
    w = nx.get_edge_attributes(g, weight).values()
    wc = Counter()
    for ww in iter(w):
        wc[ww] += 1
    wc = dict(wc)
    wSum = sum(wc.values())
    xdata = wc.keys()
    xdata.sort()
    xdata = array(xdata)
    ydata = list()
    for k in xdata:
        ydata.append(sum([wc[kk] for kk in xdata[xdata >= k]]))
    ydata = array(map(lambda x: float(x) / wSum, ydata))
    return(xdata, ydata)
def get_degree_overlap_ratio(g):
    order = g.order()
    i = g.in_degree()
    o = g.out_degree()
    get_index = lambda x: x[0]
    sorted_i_index = map(get_index, sorted(i.items(), key = lambda x: x[1], reverse = True))
    sorted_o_index = map(get_index, sorted(o.items(), key = lambda x: x[1], reverse = True))
    from numpy import linspace
    top_nums = map(int, linspace(0, order, num = 101))
    def intersect_fraction(x, y):
        return(len(set(x).intersection(set(y))) / float(len(x)) if len(x) else 0)
    overlaps = [intersect_fraction(sorted_i_index[:top_num], sorted_o_index[:top_num]) for top_num in top_nums]
    overlaps = dict(zip(range(len(overlaps)), overlaps))
    return(overlaps)
def get_degree_distribution(g, mode = 'both', is_CDF = True):
    """
    The discrete degree distribution. Similar to a histogram, it shows the possible degrees k
    and the ratio of nodes with degree at least k (or exactly k) in graph g.
    Parameters:
    -----------
    g: NetworkX Graph
    mode: str ('in', 'out', 'both'), (default = 'both')
    is_CDF: bool, (default = True)
        if True, return the ratio values as a CDF, else return the ratio values as a PDF
    Returns:
    --------
    xdata, ydata, a 2-tuple of arrays, (degree k, P(k))
    """
    if mode == 'both':
        dg = g.degree().values()
    elif mode == 'in':
        dg = g.in_degree().values()
    elif mode == 'out':
        dg = g.out_degree().values()
    else:
        return(0)
    c = Counter()
    for d in iter(dg):
        c[d] += 1
    d = dict(c)
    if 0 in d:
        del d[0]
    dSum = sum(d.values())
    dKeys = d.keys()
    dKeys.sort()
    xdata = array(dKeys)
    ylist = list()
    if is_CDF:
        for k in xdata:
            ylist.append(sum([d[kk] for kk in xdata[xdata >= k]]))
    else:
        for k in xdata:
            ylist.append(sum([d[kk] for kk in xdata[xdata == k]]))
    ydata = array(map(lambda x: float(x) / dSum, ylist))
    return(xdata, ydata)
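# Usage sketch (illustrative): a star graph has many degree-1 leaves and one hub.
#   g = nx.star_graph(4)                        # hub plus 4 leaves
#   get_degree_distribution(g, is_CDF = False)  # -> (array([1, 4]), array([0.8, 0.2]))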
def get_cluster_distribution(g, method = 'average'):
    """
    The clustering coefficient distribution grouped by degree. Similar to a histogram, it shows
    the possible degrees k and the average/median clustering coefficient of nodes with degree k in graph g.
    Parameters:
    -----------
    g: NetworkX Graph
    method: str, ('average', 'median'), (default = 'average')
    Returns:
    --------
    xdata, ydata, a 2-tuple of arrays, (k, avg_cc(V_k)), where V_k are the nodes with degree k
    """
    g = to_undirected(g)
    k = nx.clustering(g)
    d = g.degree()
    ck = defaultdict(list)
    for n in g.nodes_iter():
        ck[d[n]].append(k[n])
    xdata, ydata = list(), list()
    if method == 'average':
        for x, y in ifilter(lambda x: x[0] > 1 and average(x[1]) > 0, ck.iteritems()):
            xdata.append(x)
            ydata.append(average(y))
    elif method == 'median':
        for x, y in ifilter(lambda x: x[0] > 1 and median(x[1]) > 0, ck.iteritems()):
            xdata.append(x)
            ydata.append(median(y))
    else:
        raise NameError("method should be 'average' or 'median'")
    xdata = array(xdata)
    ydata = array(ydata)
    return(xdata, ydata)
def get_degree_correlation(g, method = 'average', mode = 'both'):
    """
    The average neighbor degree/in-degree/out-degree distribution grouped by degree. Similar to
    a histogram, it shows the possible degrees k and the average/median neighbor degree of nodes
    with degree k in graph g.
    Parameters:
    -----------
    g: NetworkX Graph
    mode: str, ('in', 'out', 'both'), (default = 'both')
    method: str, ('average', 'median'), (default = 'average')
    Returns:
    --------
    xdata, ydata, a 2-tuple of arrays, (k, <Knn>(k)), where <Knn>(k) denotes the average/median neighbor degree
    """
    # could be re-implemented with nx.average_degree_connectivity
    if mode == 'both':
        d = g.degree()
        k = nx.average_neighbor_degree(g)
    elif mode == 'in':
        d = g.in_degree()
        k = nx.average_neighbor_degree(g, source = 'in', target = 'in')
    elif mode == 'out':
        d = g.out_degree()
        k = nx.average_neighbor_degree(g, source = 'out', target = 'out')
    else:
        raise NameError("mode must be 'in', 'out', or 'both'")
    ck = defaultdict(list)
    # group the nodes by degree
    for n in g.nodes_iter():
        ck[d[n]].append(k[n])
    xdata, ydata = list(), list()
    if method == 'average':
        for x, y in ifilter(lambda x: x[0] > 0 and average(x[1]) > 0, ck.iteritems()):
            xdata.append(x)
            ydata.append(average(y))
    elif method == 'median':
        for x, y in ifilter(lambda x: x[0] > 0 and median(x[1]) > 0, ck.iteritems()):
            xdata.append(x)
            ydata.append(median(y))
    else:
        raise NameError("method must be 'average' or 'median'")
    xdata = array(xdata)
    ydata = array(ydata)
    return(xdata, ydata)
def effect_diameter(graph):
    """
    Get the diameter of the largest (weakly) connected component.
    Parameters:
    -----------
    graph: NetworkX Graph, NetworkX DiGraph
    Returns:
    --------
    int, the diameter of the largest connected component
    """
    if graph.is_directed():
        graph = to_undirected(graph)
    return(nx.diameter(nx.connected_component_subgraphs(graph)[0]))
def hop_counts(graph, cutoff = None, samples = 100000):
    """
    Calculate the length (in number of hops) between node pairs using Dijkstra's algorithm.
    The input graph is automatically turned into an undirected simple graph without loops.
    Note that the function only considers the largest connected component if the graph
    is disconnected. To reduce the computational complexity, sampling is used by default;
    the argument 'samples' controls the number of samples.
    Parameters:
    -----------
    graph: NetworkX Graph, NetworkX DiGraph
    cutoff: integer or float, optional
        Depth at which to stop the search. Only paths of length <= cutoff are counted.
    samples: int
        The number of sampled node pairs.
        If samples equals 0, then all pairs of nodes are included.
    Returns:
    --------
    a dictionary, keyed by hop count, of the number of paths with that many hops
    """
    graph = to_undirected(graph)
    cnt = Counter()
    if samples > 0:
        def gen_pairs(p, n):
            """
            generate the sampled node pairs;
            duplicated pairs are possible, but self-loops are not
            """
            import random
            random.seed()
            pairs = [random.sample(p, 2) for s in xrange(n)]
            return(pairs)
        pairs = gen_pairs(nx.connected_components(graph)[0], samples)
        pair_lens = map(lambda x: nx.bidirectional_dijkstra(graph, x[0], x[1], weight = None)[0], pairs)
        if cutoff:
            pair_lens = filter(lambda x: x <= cutoff, pair_lens)
        for pl in iter(pair_lens):
            cnt[pl] += 1
    else:
        plDict = nx.all_pairs_dijkstra_path_length(graph, cutoff = cutoff)
        for p in plDict.itervalues():
            for d in p.itervalues():
                cnt[d] += 1
    return(dict(cnt))
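# Usage sketch (illustrative; assumes the dict-returning networkx 1.x API):
#   g = nx.path_graph(4)        # 0 - 1 - 2 - 3
#   hop_counts(g, samples = 0)  # -> {0: 4, 1: 6, 2: 4, 3: 2}
#                               #    (ordered pairs, self distances included)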
def dist_pack(graph, **kwargs):
    """
    Pack the distance-measurement properties of the given graph.
    Parameters:
    -----------
    graph: NetworkX Graph, NetworkX DiGraph
    arbitrary args: **kwargs
    Returns:
    --------
    a dictionary, keyed by property names, of property values.
    """
    t = dict()
    for k in kwargs:
        t[k] = kwargs[k]
    t['diameter'] = effect_diameter(graph)
    t['hop_counts'] = hop_counts(graph, samples = 10000)
    return(t)
def degree_seq_pack(graph, **kwargs):
    t = dict()
    for k in kwargs:
        t[k] = kwargs[k]
    if graph.is_directed():
        t['inDegSeq'] = graph.in_degree().values()
        t['outDegSeq'] = graph.out_degree().values()
    t['degSeq'] = graph.degree().values()
    return(t)
def degree_pack(graph, **kwargs):
    t = dict()
    for k in kwargs:
        t[k] = kwargs[k]
    t['order'] = graph.order()
    t['size'] = graph.size()
    t['degree'] = average_degree(graph)
    if graph.is_directed():
        t['degcor'] = degcor(graph)
        xdata, ydata = get_degree_distribution(graph, mode = 'in')
        t['inDegDistr_x'] = xdata
        t['inDegDistr_y'] = ydata
        t['inDegDistr_fit'] = powerlaw_fit(xdata, ydata)
        xdata, ydata = get_degree_distribution(graph, mode = 'out')
        t['outDegDistr_x'] = xdata
        t['outDegDistr_y'] = ydata
        t['outDegDistr_fit'] = powerlaw_fit(xdata, ydata)
        xdata, ydata = get_degree_rate_distribution(graph)
        t['degRateDistr_x'] = xdata
        t['degRateDistr_y'] = ydata
        t['degOverlapRatio'] = get_degree_overlap_ratio(graph)
    if nx.get_edge_attributes(graph, 'weight'):
        xdata, ydata = get_link_weight_distribution(graph)
        t['linkWeightDistr_x'] = xdata
        t['linkWeightDistr_y'] = ydata
    xdata, ydata = get_degree_distribution(graph)
    t['degDistr_x'] = xdata
    t['degDistr_y'] = ydata
    t['degDistr_fit'] = powerlaw_fit(xdata, ydata)
    return(t)
def knn_pack(graph, **kwargs):
    t = dict()
    for k in kwargs:
        t[k] = kwargs[k]
    t['asr'] = nx.degree_assortativity_coefficient(graph)
    t['weighted_asr'] = nx.degree_assortativity_coefficient(graph, weight = 'weight')
    if graph.is_directed():
        t['knn'] = nx.average_degree_connectivity(graph, source = 'out', target = 'in')
        if len(nx.get_edge_attributes(graph, 'weight')):
            t['weighted_knn'] = nx.average_degree_connectivity(graph, source = 'out', target = 'in', weight = 'weight')
    else:
        t['knn'] = nx.average_degree_connectivity(graph)
        if len(nx.get_edge_attributes(graph, 'weight')):
            t['weighted_knn'] = nx.average_degree_connectivity(graph, weight = 'weight')
    return(t)
def easy_pack(graph, **kwargs):
    """
    Pack the topological properties of a given graph.
    Parameters:
    -----------
    graph: NetworkX Graph, NetworkX DiGraph
    arbitrary args: **kwargs
        set prop_name = prop_value
    Returns:
    --------
    a dictionary of topological properties, keyed by property names
    """
    t = dict()
    for k in kwargs:
        t[k] = kwargs[k]
    # resolve the features
    t['order'] = graph.order()
    t['size'] = graph.size()
    # t['degree'] = average_degree(graph)
    # t['asr'] = nx.degree_assortativity_coefficient(graph)
    t['recp'] = reciprocity(graph, return_cor = False)
    t['rho'] = reciprocity(graph, return_cor = True)
    t['reinf'] = reinforce(graph)
    # if graph.is_directed():
    #     t['degcor'] = degcor(graph)
    #     # in_degree
    #     xdata, ydata = get_degree_distribution(graph, mode = 'in')
    #     t['inDegDistr_x'] = xdata
    #     t['inDegDistr_y'] = ydata
    #     t['inDegDistr_fit'] = powerlaw_fit(xdata, ydata)
    #
    #     # out_degree
    #     xdata, ydata = get_degree_distribution(graph, mode = 'out')
    #     t['outDegDistr_x'] = xdata
    #     t['outDegDistr_y'] = ydata
    #     t['outDegDistr_fit'] = powerlaw_fit(xdata, ydata)
    return(t)
def frac_pack(graph, **kwargs):
    t = dict()
    for k in kwargs:
        t[k] = kwargs[k]
    t['inFrac_dict'] = get_resilience_fraction(graph, mode = 'in')
    t['outFrac_dict'] = get_resilience_fraction(graph, mode = 'out')
    return(t)
def pack(graph, **kwargs):
    """
    Pack the topological properties of the given graph.
    Parameters:
    -----------
    graph: NetworkX Graph, NetworkX DiGraph
    arbitrary args: **kwargs
        set prop_name = prop_value
    Returns:
    --------
    a dictionary of topological property values keyed by property names
    """
    t = dict()
    # add meta labels
    for k in kwargs:
        t[k] = kwargs[k]
    # resolve the features
    t['order'] = graph.order()
    t['size'] = graph.size()
    t['degree'] = average_degree(graph)
    t['asr'] = nx.degree_assortativity_coefficient(graph)
    t['recp'] = reciprocity(graph, return_cor = False)
    t['rho'] = reciprocity(graph, return_cor = True)
    t['reinf'] = reinforce(graph)
    if graph.is_directed():
        t['degcor'] = degcor(graph)
        # in_degree
        xdata, ydata = get_degree_distribution(graph, mode = 'in')
        t['inDegDistr_x'] = xdata
        t['inDegDistr_y'] = ydata
        t['inDegDistr_fit'] = powerlaw_fit(xdata, ydata)
        # out_degree
        xdata, ydata = get_degree_distribution(graph, mode = 'out')
        t['outDegDistr_x'] = xdata
        t['outDegDistr_y'] = ydata
        t['outDegDistr_fit'] = powerlaw_fit(xdata, ydata)
        # in_knn
        xdata, ydata = get_degree_correlation(graph, method = 'average', mode = 'in')
        t['inKnnDistr_avg_x'] = xdata
        t['inKnnDistr_avg_y'] = ydata
        t['inKnnDistr_avg_fit'] = powerlaw_fit(xdata, ydata)
        xdata, ydata = get_degree_correlation(graph, method = 'median', mode = 'in')
        t['inKnnDistr_median_x'] = xdata
        t['inKnnDistr_median_y'] = ydata
        t['inKnnDistr_median_fit'] = powerlaw_fit(xdata, ydata)
        # out_knn
        xdata, ydata = get_degree_correlation(graph, method = 'average', mode = 'out')
        t['outKnnDistr_avg_x'] = xdata
        t['outKnnDistr_avg_y'] = ydata
        t['outKnnDistr_avg_fit'] = powerlaw_fit(xdata, ydata)
        xdata, ydata = get_degree_correlation(graph, method = 'median', mode = 'out')
        t['outKnnDistr_median_x'] = xdata
        t['outKnnDistr_median_y'] = ydata
        t['outKnnDistr_median_fit'] = powerlaw_fit(xdata, ydata)
    # degree
    xdata, ydata = get_degree_distribution(graph)
    t['degDistr_x'] = xdata
    t['degDistr_y'] = ydata
    t['degDistr_fit'] = powerlaw_fit(xdata, ydata)
    # clustering
    t['clustering'] = mean_clustering(graph, normalized = False)
    t['clustering_over_random'] = mean_clustering(graph, normalized = True)
    t['clustering_over_config'] = randomly_clustering(graph, tries = 100)
    xdata, ydata = get_cluster_distribution(graph, method = 'average')
    t['ccDistr_avg_x'] = xdata
    t['ccDistr_avg_y'] = ydata
    t['ccDistr_avg_fit'] = powerlaw_fit(xdata, ydata)
    xdata, ydata = get_cluster_distribution(graph, method = 'median')
    t['ccDistr_median_x'] = xdata
    t['ccDistr_median_y'] = ydata
    t['ccDistr_median_fit'] = powerlaw_fit(xdata, ydata)
    # knn
    xdata, ydata = get_degree_correlation(graph, method = 'average')
    t['knnDistr_avg_x'] = xdata
    t['knnDistr_avg_y'] = ydata
    t['knnDistr_avg_fit'] = powerlaw_fit(xdata, ydata)
    xdata, ydata = get_degree_correlation(graph, method = 'median')
    t['knnDistr_median_x'] = xdata
    t['knnDistr_median_y'] = ydata
    t['knnDistr_median_fit'] = powerlaw_fit(xdata, ydata)
    return(t)
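# Usage sketch (illustrative): extract the full feature bundle for one directed graph
# and attach a meta label. reciprocity() asserts directedness, so pack() expects a
# DiGraph; randomly_clustering(tries = 100) may take a moment on larger graphs.
#   g = nx.gnp_random_graph(100, 0.05, directed = True)
#   features = pack(g, realm = 'demo')
#   features['order'], features['degDistr_fit']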
class DiNet(nx.DiGraph):
    def __init__(self, graph, name = 'envolop'):
        """Wrap an existing graph as a DiNet."""
        nx.DiGraph.__init__(self, data = graph, name = name)
    def _itemToTuple(self, item):
        """Flatten an ((i, j), y) attribute item into an (i, j, y) tuple,
        collapsing collections to their length."""
        (i, j), y = item
        y = len(y) if hasattr(y, '__len__') else y
        return((i, j, y))
    def extract_edges(self, edge_attr):
        """Extract the subgraph induced by edges carrying the attribute edge_attr."""
        exEdges = nx.get_edge_attributes(self, edge_attr)
        sg = nx.DiGraph(data = exEdges.keys(), name = edge_attr)
        if len(exEdges):
            nx.set_edge_attributes(sg, edge_attr, exEdges)
        return(sg)
    def extract_multiple_edges(self, *edge_attrs):
        sgs = []
        for edge_attr in edge_attrs:
            sgs.append(self.extract_edges(edge_attr))
        return(sgs)
    def overlap(self, edge_attrA = 'weight', edge_attrB = 'friend'):
        """Edges carrying both attributes."""
        # the edge attributes live on the graph itself
        setA = set(nx.get_edge_attributes(self, edge_attrA).keys())
        setB = set(nx.get_edge_attributes(self, edge_attrB).keys())
        return(setA.intersection(setB))
    def utility(self, edge_attrA = 'weight', edge_attrB = 'friend'):
        w = nx.get_edge_attributes(self, edge_attrA)
        totalW = sum(map(lambda x: len(x), w.itervalues()))
        ut = float(0)
        for interEdge in self.overlap():
            ut += len(w[interEdge])
        return(ut / totalW)
class Net(nx.Graph):
    def __init__(self, graph, name = 'envolop'):
        """Wrap an existing graph as a Net."""
        nx.Graph.__init__(self, data = graph, name = name)
    def _itemToTuple(self, item):
        """Flatten an ((i, j), y) attribute item into an (i, j, y) tuple,
        collapsing collections to their length."""
        (i, j), y = item
        y = len(y) if hasattr(y, '__len__') else y
        return((i, j, y))
    def extract_edges(self, edge_attr):
        """Extract the subgraph induced by edges carrying the attribute edge_attr."""
        exEdges = nx.get_edge_attributes(self, edge_attr)
        sg = nx.Graph(data = exEdges.keys(), name = edge_attr)
        if len(exEdges):
            nx.set_edge_attributes(sg, edge_attr, exEdges)
        return(sg)
    def extract_multiple_edges(self, *edge_attrs):
        sgs = []
        for edge_attr in edge_attrs:
            sgs.append(self.extract_edges(edge_attr))
        return(sgs)
    def overlap(self, edge_attrA = 'weight', edge_attrB = 'friend'):
        """Edges carrying both attributes."""
        # the edge attributes live on the graph itself
        setA = set(nx.get_edge_attributes(self, edge_attrA).keys())
        setB = set(nx.get_edge_attributes(self, edge_attrB).keys())
        return(setA.intersection(setB))
    def utility(self, edge_attrA = 'weight', edge_attrB = 'friend'):
        w = nx.get_edge_attributes(self, edge_attrA)
        totalW = sum(map(lambda x: len(x), w.itervalues()))
        ut = float(0)
        for interEdge in self.overlap():
            ut += len(w[interEdge])
        return(ut / totalW)
def main(argv):
    """Command-line entry point: parse options, load graphs, and pack their features into a LiteDB file."""
    inputFile = list()
    inputDir = None
    inputFileType = 'gpickle'
    outputFile = None
    dataName = None
    enableVerbose = False
    heteNames = None
    heteNameSep = ","
    enablePlot = False
    show_fit = False
    metalabels = dict()
    forceSave = False
    asDirected = False
    enable_appendant = False
    enable_easyPack = False
    packType = 'normal'
    ofs = ","
    def usage():
        print ("----------------------------------------")
        print ("read the graphs (in gpickle, edgelist, or cpickle format),")
        print ("extract the graph topological features, and then pack the features into a .db file")
        print
        print ("-h, --help: print this usage")
        print ("-i ...: read inputFile (as a gpickle file)")
        print ("-I ...: read inputDir (directory)")
        print ("control file type:")
        print ("\t(default: searching gpickle files)")
        print ("\t-e: set inputFile type as edgelist")
        print ("\t-p: set inputFile type as cpickle (for the temporal series file)")
        print ("-d: force the input graph to be directed (only valid while reading edgelist files)")
        print ("-o: outputFile (default *.db file)")
        print ("-r: dataName (saved as the db key)")
        print ("-v: enable verbose")
        print ("-N [typeName1,typeName2,...]: if the input graph contains different edge types, split them into subgraphs")
        print ("-c [attribute]=[value]: dynamically add an additional attribute to the output db file")
        print ("-a: if the output file already exists, then append to the previous result. If identical keys are detected, then update the values")
        print (" : update means: older attributes will be updated, new attributes will be created")
        print ("-f: if the output file already exists, then replace the previous result")
        print ("-T: pack type [easy, normal, degree_seq, degree, dist, frac, knn]")
        print ("----------------------------------------")
    try:
        opts, args = getopt.getopt(argv, "hi:I:epo:r:vMN:c:afT:d", ["help"])
    except getopt.GetoptError, err:
        print ("The given argv is incorrect")
        usage()
        print (err)
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-i"):
            inputFile.append(arg)
        elif opt in ("-I"):
            inputDir = arg
        elif opt in ("-e"):
            inputFileType = 'edgelist'
        elif opt in ("-p"):
            inputFileType = 'cpickle'
        elif opt in ("-o"):
            outputFile = arg
        elif opt in ("-r"):
            dataName = arg
        elif opt in ("-v"):
            enableVerbose = True
        elif opt in ("-N"):
            heteNames = arg.split(heteNameSep)
        elif opt in ("-c"):
            k, v = arg.split("=")
            metalabels[k] = v
        elif opt in ("-a"):
            enable_appendant = True
        elif opt in ("-f"):
            forceSave = True
        elif opt in ("-T"):
            packType = arg
        elif opt in ("-d"):
            asDirected = True
    inputFileTypeDict = { "gpickle": ".*\.gpickle$", "edgelist": ".*\.txt$", "cpickle": ".*\.cpickle$" }
    if inputDir:
        assert(os.path.exists(inputDir))
        pattern = re.compile(inputFileTypeDict[inputFileType])
        filelist = filter(lambda x: pattern.match(x) is not None, os.listdir(inputDir))
        inputFile.extend([os.path.join(inputDir, f) for f in filelist])
        print (inputFile)
    if enableVerbose:
        print ("inputFile: %s" % inputFile)
        print ("outputFile: %s" % outputFile)
        print ("dataName: %s" % dataName)
        print ("enableVerbose: %s" % enableVerbose)
        print ("heteNames: %s" % heteNames)
        print ("asDirected: %s" % asDirected)
    if outputFile is not None:
        outputDir = os.path.dirname(outputFile)
        if outputDir and not (os.path.exists(outputDir)):
            os.makedirs(outputDir)
    else:
        print ("There is no output")
        sys.exit(2)
    db = LiteDB()
    if os.path.exists(outputFile) and enable_appendant:
        db.load(outputFile)
    for f in inputFile:
        print ("processing %s" % f)
        if not os.path.exists(f):
            continue
        if inputFileType == 'gpickle':      # a graph
            g = nx.read_gpickle(f)
        elif inputFileType == 'edgelist':   # an edgelist
            g = nx.read_edgelist(f, nodetype = int, create_using = nx.DiGraph()) if asDirected else nx.read_edgelist(f, nodetype = int, create_using = nx.Graph())
        elif inputFileType == 'cpickle':    # a list of graphs
            g = cPickle.load(open(f, "rb"))
        graphs = list()
        if heteNames:
            for gg in g:
                net = DiNet(gg) if gg.is_directed() else Net(gg)
                graphs.extend(net.extract_multiple_edges(*heteNames))
        else:
            if inputFileType in ('gpickle', 'edgelist'):
                graphs.append(g)
            elif inputFileType == 'cpickle':
                graphs.extend(g)
            else:
                print ('empty file')
                sys.exit()
        for graph in iter(graphs):
            autoName = re.sub(".%s" % inputFileType, "", os.path.basename(f))
            fillinName = autoName if dataName is None else dataName
            graph_key = "_".join([fillinName, str(graph), 'd' if graph.is_directed() else 'u'])
            if graph_key in db and not forceSave:
                if packType == 'easy':
                    print ('updating in easy-pack mode')
                    db[graph_key].update(easy_pack(graph, **metalabels))
                elif packType == 'degree_seq':
                    print ('updating in degree-seq-pack mode')
                    db[graph_key].update(degree_seq_pack(graph, **metalabels))
                elif packType == 'degree':
                    print ('updating in degree-pack mode')
                    db[graph_key].update(degree_pack(graph, **metalabels))
                elif packType == 'dist':
                    print ('updating in dist-pack mode')
                    db[graph_key].update(dist_pack(graph, **metalabels))
                elif packType == 'frac':
                    print ('updating in frac-pack mode')
                    db[graph_key].update(frac_pack(graph, **metalabels))
                elif packType == 'knn':
                    print ('updating in knn-pack mode')
                    db[graph_key].update(knn_pack(graph, **metalabels))
                else:
                    print ('updating in normal-pack mode')
                    db[graph_key].update(pack(graph, **metalabels))
            else:
                if packType == 'easy':
                    print ("writing in easy-pack mode")
                    db[graph_key] = easy_pack(graph, **metalabels)
                elif packType == 'degree_seq':
                    print ('writing in degree-seq-pack mode')
                    db[graph_key] = degree_seq_pack(graph, **metalabels)
                elif packType == 'degree':
                    print ('writing in degree-pack mode')
                    db[graph_key] = degree_pack(graph, **metalabels)
                elif packType == 'dist':
                    print ("writing in dist-pack mode")
                    db[graph_key] = dist_pack(graph, **metalabels)
                elif packType == 'frac':
                    print ("writing in frac-pack mode")
                    db[graph_key] = frac_pack(graph, **metalabels)
                elif packType == 'knn':
                    print ("writing in knn-pack mode")
                    db[graph_key] = knn_pack(graph, **metalabels)
                else:
                    print ("writing in normal-pack mode")
                    db[graph_key] = pack(graph, **metalabels)
    db.save(outputFile)
if __name__ == "__main__":
    main(sys.argv[1:])
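# Example invocation (illustrative; the file names are hypothetical):
#   python featureExtractor.py -i my_graph.gpickle -o features.db -r my_realm -T easy -c month=1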