PageRenderTime 30ms CodeModel.GetById 20ms RepoModel.GetById 0ms app.codeStats 0ms

/src/python/pants/backend/jvm/tasks/jvm_dependency_usage.py

https://gitlab.com/Ivy001/pants
Python | 435 lines | 424 code | 5 blank | 6 comment | 5 complexity | 2077e6596164f94122a1f19df96303be MD5 | raw file
  1. # coding=utf-8
  2. # Copyright 2015 Pants project contributors (see CONTRIBUTORS.md).
  3. # Licensed under the Apache License, Version 2.0 (see LICENSE).
  4. from __future__ import (absolute_import, division, generators, nested_scopes, print_function,
  5. unicode_literals, with_statement)
  6. import json
  7. import os
  8. import sys
  9. from collections import defaultdict, namedtuple
  10. from pants.backend.jvm.targets.jar_library import JarLibrary
  11. from pants.backend.jvm.tasks.jvm_dependency_analyzer import JvmDependencyAnalyzer
  12. from pants.base.build_environment import get_buildroot
  13. from pants.build_graph.aliased_target import AliasTarget
  14. from pants.build_graph.resources import Resources
  15. from pants.build_graph.target import Target
  16. from pants.build_graph.target_scopes import Scopes
  17. from pants.task.task import Task
  18. from pants.util.fileutil import create_size_estimators
  19. class JvmDependencyUsage(Task):
  20. """Determines the dependency usage ratios of targets.
  21. Analyzes the relationship between the products a target T produces vs. the products
  22. which T's dependents actually require (this is done by observing analysis files).
  23. If the ratio of required products to available products is low, then this is a sign
  24. that target T isn't factored well.
  25. A graph is formed from these results, where each node of the graph is a target, and
  26. each edge is a product usage ratio between a target and its dependency. The nodes
  27. also contain additional information to guide refactoring -- for example, the estimated
  28. job size of each target, which indicates the impact a poorly factored target has on
  29. the build times. (see DependencyUsageGraph->to_json)
  30. The graph is either summarized for local analysis or outputted as a JSON file for
  31. aggregation and analysis on a larger scale.
  32. """
  33. size_estimators = create_size_estimators()
  34. @classmethod
  35. def register_options(cls, register):
  36. super(JvmDependencyUsage, cls).register_options(register)
  37. register('--internal-only', default=False, type=bool, fingerprint=True,
  38. help='Specifies that only internal dependencies should be included in the graph '
  39. 'output (no external jars).')
  40. register('--summary', default=True, type=bool,
  41. help='When set, outputs a summary of the "worst" dependencies; otherwise, '
  42. 'outputs a JSON report.')
  43. register('--size-estimator',
  44. choices=list(cls.size_estimators.keys()), default='filesize', fingerprint=True,
  45. help='The method of target size estimation.')
  46. register('--transitive', default=True, type=bool,
  47. help='Score all targets in the build graph transitively.')
  48. register('--output-file', type=str,
  49. help='Output destination. When unset, outputs to <stdout>.')
  50. register('--use-cached', type=bool,
  51. help='Use cached dependency data to compute analysis result. '
  52. 'When set, skips `resolve` and `compile` steps. '
  53. 'Useful for computing analysis for a lot of targets, but '
  54. 'result can differ from direct execution because cached information '
  55. 'doesn\'t depend on 3rdparty libraries versions.')
  56. @classmethod
  57. def prepare(cls, options, round_manager):
  58. super(JvmDependencyUsage, cls).prepare(options, round_manager)
  59. if not options.use_cached:
  60. round_manager.require_data('classes_by_source')
  61. round_manager.require_data('runtime_classpath')
  62. round_manager.require_data('product_deps_by_src')
  63. else:
  64. # We want to have synthetic targets in build graph to deserialize nodes properly.
  65. round_manager.require_data('java')
  66. round_manager.require_data('scala')
  67. round_manager.require_data('deferred_sources')
  68. @classmethod
  69. def skip(cls, options):
  70. """This task is always explicitly requested."""
  71. return False
  72. def execute(self):
  73. graph = self.create_dep_usage_graph(self.context.targets() if self.get_options().transitive
  74. else self.context.target_roots)
  75. output_file = self.get_options().output_file
  76. if output_file:
  77. self.context.log.info('Writing dependency usage to {}'.format(output_file))
  78. with open(output_file, 'w') as fh:
  79. self._render(graph, fh)
  80. else:
  81. sys.stdout.write(b'\n')
  82. self._render(graph, sys.stdout)
  83. @classmethod
  84. def implementation_version(cls):
  85. return super(JvmDependencyUsage, cls).implementation_version() + [('JvmDependencyUsage', 7)]
  86. def _render(self, graph, fh):
  87. chunks = graph.to_summary() if self.get_options().summary else graph.to_json()
  88. for chunk in chunks:
  89. fh.write(chunk)
  90. fh.flush()
  91. def _dep_type(self, target, dep, declared_deps, eligible_unused_deps, is_used):
  92. """Returns a tuple of a 'declared'/'undeclared' boolean, and 'used'/'unused' boolean.
  93. These values are related, because some declared deps are not eligible to be considered unused.
  94. :param target: The source target.
  95. :param dep: The dependency to compute a type for.
  96. :param declared_deps: The declared dependencies of the target.
  97. :param eligible_unused_deps: The declared dependencies of the target that are eligible
  98. to be considered unused; this is generally only 'DEFAULT' scoped dependencies.
  99. :param is_used: True if the dep was actually used at compile time.
  100. """
  101. if target == dep:
  102. return True, True
  103. return (dep in declared_deps), (is_used or dep not in eligible_unused_deps)
  104. def _select(self, target):
  105. if self.get_options().internal_only and isinstance(target, JarLibrary):
  106. return False
  107. elif isinstance(target, Resources) or type(target) in (AliasTarget, Target):
  108. return False
  109. else:
  110. return True
  111. def create_dep_usage_graph(self, targets):
  112. """Creates a graph of concrete targets, with their sum of products and dependencies.
  113. Synthetic targets contribute products and dependencies to their concrete target.
  114. """
  115. with self.invalidated(targets,
  116. invalidate_dependents=True) as invalidation_check:
  117. target_to_vts = {}
  118. for vts in invalidation_check.all_vts:
  119. target_to_vts[vts.target] = vts
  120. if not self.get_options().use_cached:
  121. node_creator = self.calculating_node_creator(
  122. self.context.products.get_data('classes_by_source'),
  123. self.context.products.get_data('runtime_classpath'),
  124. self.context.products.get_data('product_deps_by_src'),
  125. target_to_vts)
  126. else:
  127. node_creator = self.cached_node_creator(target_to_vts)
  128. return DependencyUsageGraph(self.create_dep_usage_nodes(targets, node_creator),
  129. self.size_estimators[self.get_options().size_estimator])
  130. def calculating_node_creator(self, classes_by_source, runtime_classpath, product_deps_by_src,
  131. target_to_vts):
  132. """Strategy directly computes dependency graph node based on
  133. `classes_by_source`, `runtime_classpath`, `product_deps_by_src` parameters and
  134. stores the result to the build cache.
  135. """
  136. analyzer = JvmDependencyAnalyzer(get_buildroot(), runtime_classpath, product_deps_by_src)
  137. targets = self.context.targets()
  138. targets_by_file = analyzer.targets_by_file(targets)
  139. transitive_deps_by_target = analyzer.compute_transitive_deps_by_target(targets)
  140. def creator(target):
  141. transitive_deps = set(transitive_deps_by_target.get(target))
  142. node = self.create_dep_usage_node(target,
  143. analyzer,
  144. classes_by_source,
  145. targets_by_file,
  146. transitive_deps)
  147. vt = target_to_vts[target]
  148. with open(self.nodes_json(vt.results_dir), mode='w') as fp:
  149. json.dump(node.to_cacheable_dict(), fp, indent=2, sort_keys=True)
  150. vt.update()
  151. return node
  152. return creator
  153. def cached_node_creator(self, target_to_vts):
  154. """Strategy restores dependency graph node from the build cache.
  155. """
  156. def creator(target):
  157. vt = target_to_vts[target]
  158. if vt.valid and os.path.exists(self.nodes_json(vt.results_dir)):
  159. try:
  160. with open(self.nodes_json(vt.results_dir)) as fp:
  161. return Node.from_cacheable_dict(json.load(fp),
  162. lambda spec: self.context.resolve(spec).__iter__().next())
  163. except Exception:
  164. self.context.log.warn("Can't deserialize json for target {}".format(target))
  165. return Node(target.concrete_derived_from)
  166. else:
  167. self.context.log.warn("No cache entry for {}".format(target))
  168. return Node(target.concrete_derived_from)
  169. return creator
  170. def nodes_json(self, target_results_dir):
  171. return os.path.join(target_results_dir, 'node.json')
  172. def create_dep_usage_nodes(self, targets, node_creator):
  173. nodes = {}
  174. for target in targets:
  175. if not self._select(target):
  176. continue
  177. # Create or extend a Node for the concrete version of this target.
  178. concrete_target = target.concrete_derived_from
  179. node = node_creator(target)
  180. if concrete_target in nodes:
  181. nodes[concrete_target].combine(node)
  182. else:
  183. nodes[concrete_target] = node
  184. # Prune any Nodes with 0 products.
  185. for concrete_target, node in nodes.items()[:]:
  186. if node.products_total == 0:
  187. nodes.pop(concrete_target)
  188. return nodes
  189. def cache_target_dirs(self):
  190. return True
  191. def create_dep_usage_node(self, target, analyzer, classes_by_source, targets_by_file, transitive_deps):
  192. buildroot = analyzer.buildroot
  193. product_deps_by_src = analyzer.product_deps_by_src
  194. declared_deps_with_aliases = set(analyzer.resolve_aliases(target))
  195. eligible_unused_deps = set(d for d, _ in analyzer.resolve_aliases(target, scope=Scopes.DEFAULT))
  196. concrete_target = target.concrete_derived_from
  197. declared_deps = [resolved for resolved, _ in declared_deps_with_aliases]
  198. products_total = analyzer.count_products(target)
  199. node = Node(concrete_target)
  200. node.add_derivation(target, products_total)
  201. def _construct_edge(dep_tgt, products_used):
  202. is_declared, is_used = self._dep_type(target,
  203. dep_tgt,
  204. declared_deps,
  205. eligible_unused_deps,
  206. len(products_used) > 0)
  207. return Edge(is_declared=is_declared, is_used=is_used, products_used=products_used)
  208. # Record declared Edges, initially all as "unused" or "declared".
  209. for dep_tgt, aliased_from in declared_deps_with_aliases:
  210. derived_from = dep_tgt.concrete_derived_from
  211. if self._select(derived_from):
  212. node.add_edge(_construct_edge(dep_tgt, products_used=set()), derived_from, aliased_from)
  213. # Record the used products and undeclared Edges for this target. Note that some of
  214. # these may be self edges, which are considered later.
  215. target_product_deps_by_src = product_deps_by_src.get(target, {})
  216. for src in target.sources_relative_to_buildroot():
  217. for product_dep in target_product_deps_by_src.get(os.path.join(buildroot, src), []):
  218. for dep_tgt in targets_by_file.get(product_dep, []):
  219. derived_from = dep_tgt.concrete_derived_from
  220. if not self._select(derived_from):
  221. continue
  222. # Create edge only for those direct or transitive dependencies in order to
  223. # disqualify irrelevant targets that happen to share some file in sources,
  224. # not uncommon when globs especially rglobs is used.
  225. if not derived_from in transitive_deps:
  226. continue
  227. node.add_edge(_construct_edge(dep_tgt, products_used={product_dep}), derived_from)
  228. return node
  229. class Node(object):
  230. def __init__(self, concrete_target):
  231. self.concrete_target = concrete_target
  232. self.products_total = 0
  233. self.derivations = set()
  234. # Dict mapping concrete dependency targets to an Edge object.
  235. self.dep_edges = defaultdict(Edge)
  236. # Dict mapping concrete dependency targets to where they are aliased from.
  237. self.dep_aliases = defaultdict(set)
  238. def add_derivation(self, derived_target, derived_products):
  239. self.derivations.add(derived_target)
  240. self.products_total += derived_products
  241. def add_edge(self, edge, dest, dest_aliased_from=None):
  242. self.dep_edges[dest] += edge
  243. if dest_aliased_from:
  244. self.dep_aliases[dest].add(dest_aliased_from)
  245. def combine(self, other_node):
  246. assert other_node.concrete_target == self.concrete_target
  247. self.products_total += other_node.products_total
  248. self.derivations.update(other_node.derivations)
  249. self.dep_edges.update(other_node.dep_edges)
  250. self.dep_aliases.update(other_node.dep_aliases)
  251. def to_cacheable_dict(self):
  252. edges = {}
  253. for dest in self.dep_edges:
  254. edges[dest.address.spec] = {
  255. 'products_used': list(self.dep_edges[dest].products_used),
  256. 'is_declared': self.dep_edges[dest].is_declared,
  257. 'is_used': self.dep_edges[dest].is_used,
  258. }
  259. aliases = {}
  260. for dep, dep_aliases in self.dep_aliases.items():
  261. aliases[dep.address.spec] = [alias.address.spec for alias in dep_aliases]
  262. return {
  263. 'target': self.concrete_target.address.spec,
  264. 'products_total': self.products_total,
  265. 'derivations': [derivation.address.spec for derivation in self.derivations],
  266. 'dep_edges': edges,
  267. 'aliases': aliases,
  268. }
  269. @staticmethod
  270. def from_cacheable_dict(cached_dict, target_resolve_func):
  271. res = Node(target_resolve_func(cached_dict['target']))
  272. res.products_total = cached_dict['products_total']
  273. res.derivations.update([target_resolve_func(spec) for spec in cached_dict['derivations']])
  274. for edge in cached_dict['dep_edges']:
  275. res.dep_edges[target_resolve_func(edge)] = Edge(
  276. is_declared=cached_dict['dep_edges'][edge]['is_declared'],
  277. is_used=cached_dict['dep_edges'][edge]['is_used'],
  278. products_used=set(cached_dict['dep_edges'][edge]['products_used']))
  279. for dep in cached_dict['aliases']:
  280. for alias in cached_dict['aliases'][dep]:
  281. res.dep_aliases[target_resolve_func(dep)].add(target_resolve_func(alias))
  282. return res
  283. class Edge(object):
  284. """Record a set of used products, and a boolean indicating that a depedency edge was declared."""
  285. def __init__(self, is_declared=False, is_used=False, products_used=None):
  286. self.products_used = products_used or set()
  287. self.is_declared = is_declared
  288. self.is_used = is_used
  289. def __iadd__(self, that):
  290. self.products_used |= that.products_used
  291. self.is_declared |= that.is_declared
  292. self.is_used |= that.is_used
  293. return self
  294. class DependencyUsageGraph(object):
  295. def __init__(self, nodes, size_estimator):
  296. self._nodes = nodes
  297. self._size_estimator = size_estimator
  298. self._cost_cache = {}
  299. self._trans_cost_cache = {}
  300. def _cost(self, target):
  301. if target not in self._cost_cache:
  302. self._cost_cache[target] = self._size_estimator(target.sources_relative_to_buildroot())
  303. return self._cost_cache[target]
  304. def _trans_cost(self, target):
  305. if target not in self._trans_cost_cache:
  306. dep_sum = sum(self._trans_cost(dep) for dep in target.dependencies)
  307. self._trans_cost_cache[target] = self._cost(target) + dep_sum
  308. return self._trans_cost_cache[target]
  309. def _edge_type(self, target, edge, dep):
  310. if target == dep:
  311. return 'self'
  312. elif edge.is_declared and edge.is_used:
  313. return 'declared'
  314. elif edge.is_declared and not edge.is_used:
  315. return 'unused'
  316. else:
  317. return 'undeclared'
  318. def _used_ratio(self, dep_tgt, edge):
  319. if edge.products_used:
  320. # If products were recorded as used, generate a legitimate ratio.
  321. dep_tgt_products_total = self._nodes[dep_tgt].products_total if dep_tgt in self._nodes else 1
  322. return len(edge.products_used) / max(dep_tgt_products_total, 1)
  323. elif edge.is_used:
  324. # Else, the dep might not be in the default scope, and must considered to be used.
  325. return 1.0
  326. else:
  327. # Otherwise, definitely not used.
  328. return 0.0
  329. def to_summary(self):
  330. """Outputs summarized dependencies ordered by a combination of max usage and cost."""
  331. # Aggregate inbound edges by their maximum product usage ratio.
  332. max_target_usage = defaultdict(lambda: 0.0)
  333. for target, node in self._nodes.items():
  334. for dep_target, edge in node.dep_edges.items():
  335. if target == dep_target:
  336. continue
  337. used_ratio = self._used_ratio(dep_target, edge)
  338. max_target_usage[dep_target] = max(max_target_usage[dep_target], used_ratio)
  339. # Calculate a score for each.
  340. Score = namedtuple('Score', ('badness', 'max_usage', 'cost_transitive', 'target'))
  341. scores = []
  342. for target, max_usage in max_target_usage.items():
  343. cost_transitive = self._trans_cost(target)
  344. score = int(cost_transitive / (max_usage if max_usage > 0.0 else 1.0))
  345. scores.append(Score(score, max_usage, cost_transitive, target.address.spec))
  346. # Output in order by score.
  347. yield '[\n'
  348. first = True
  349. for score in sorted(scores, key=lambda s: s.badness):
  350. yield '{} {}'.format('' if first else ',\n', json.dumps(score._asdict()))
  351. first = False
  352. yield '\n]\n'
  353. def to_json(self):
  354. """Outputs the entire graph."""
  355. res_dict = {}
  356. def gen_dep_edge(node, edge, dep_tgt, aliases):
  357. return {
  358. 'target': dep_tgt.address.spec,
  359. 'dependency_type': self._edge_type(node.concrete_target, edge, dep_tgt),
  360. 'products_used': len(edge.products_used),
  361. 'products_used_ratio': self._used_ratio(dep_tgt, edge),
  362. 'aliases': [alias.address.spec for alias in aliases],
  363. }
  364. for node in self._nodes.values():
  365. res_dict[node.concrete_target.address.spec] = {
  366. 'cost': self._cost(node.concrete_target),
  367. 'cost_transitive': self._trans_cost(node.concrete_target),
  368. 'products_total': node.products_total,
  369. 'dependencies': [gen_dep_edge(node, edge, dep_tgt, node.dep_aliases.get(dep_tgt, {}))
  370. for dep_tgt, edge in node.dep_edges.items()],
  371. }
  372. yield json.dumps(res_dict, indent=2, sort_keys=True)