jvm_dependency_usage.py

/src/python/pants/backend/jvm/tasks/jvm_dependency_usage.py

https://gitlab.com/Ivy001/pants
Python | 435 lines | 424 code | 5 blank | 6 comment | 5 complexity | 2077e6596164f94122a1f19df96303be MD5 | raw file

# coding=utf-8
# Copyright 2015 Pants project contributors (see CONTRIBUTORS.md).
# Licensed under the Apache License, Version 2.0 (see LICENSE).

from __future__ import (absolute_import, division, generators, nested_scopes, print_function,
                        unicode_literals, with_statement)

import json
import os
import sys
from collections import defaultdict, namedtuple

from pants.backend.jvm.targets.jar_library import JarLibrary
from pants.backend.jvm.tasks.jvm_dependency_analyzer import JvmDependencyAnalyzer
from pants.base.build_environment import get_buildroot
from pants.build_graph.aliased_target import AliasTarget
from pants.build_graph.resources import Resources
from pants.build_graph.target import Target
from pants.build_graph.target_scopes import Scopes
from pants.task.task import Task
from pants.util.fileutil import create_size_estimators


class JvmDependencyUsage(Task):
  """Determines the dependency usage ratios of targets.

  Analyzes the relationship between the products a target T produces vs. the products
  which T's dependents actually require (this is done by observing analysis files).
  If the ratio of required products to available products is low, then this is a sign
  that target T isn't factored well.

  A graph is formed from these results, where each node of the graph is a target, and
  each edge is a product usage ratio between a target and its dependency. The nodes
  also contain additional information to guide refactoring -- for example, the estimated
  job size of each target, which indicates the impact a poorly factored target has on
  the build times. (see DependencyUsageGraph->to_json)

  The graph is either summarized for local analysis or outputted as a JSON file for
  aggregation and analysis on a larger scale.
  """

  # Maps an estimator name to a size estimation callable. The keys are the valid choices of
  # the `--size-estimator` option; the selected callable is handed to DependencyUsageGraph.
  size_estimators = create_size_estimators()

  @classmethod
  def register_options(cls, register):
    """Registers the command line options of this task."""
    super(JvmDependencyUsage, cls).register_options(register)
    register('--internal-only', default=False, type=bool, fingerprint=True,
             help='Specifies that only internal dependencies should be included in the graph '
                  'output (no external jars).')
    register('--summary', default=True, type=bool,
             help='When set, outputs a summary of the "worst" dependencies; otherwise, '
                  'outputs a JSON report.')
    register('--size-estimator',
             choices=list(cls.size_estimators.keys()), default='filesize', fingerprint=True,
             help='The method of target size estimation.')
    register('--transitive', default=True, type=bool,
             help='Score all targets in the build graph transitively.')
    register('--output-file', type=str,
             help='Output destination. When unset, outputs to <stdout>.')
    register('--use-cached', type=bool,
             help='Use cached dependency data to compute analysis result. '
                  'When set, skips `resolve` and `compile` steps. '
                  'Useful for computing analysis for a lot of targets, but '
                  'result can differ from direct execution because cached information '
                  'doesn\'t depend on 3rdparty libraries versions.')

  @classmethod
  def prepare(cls, options, round_manager):
    """Declares the product data this task requires.

    Without `--use-cached` the 'classes_by_source', 'runtime_classpath' and
    'product_deps_by_src' products are required. With `--use-cached` only the 'java', 'scala'
    and 'deferred_sources' products are required.
    """
    super(JvmDependencyUsage, cls).prepare(options, round_manager)
    if not options.use_cached:
      round_manager.require_data('classes_by_source')
      round_manager.require_data('runtime_classpath')
      round_manager.require_data('product_deps_by_src')
    else:
      # We want to have synthetic targets in build graph to deserialize nodes properly.
      round_manager.require_data('java')
      round_manager.require_data('scala')
      round_manager.require_data('deferred_sources')

  @classmethod
  def skip(cls, options):
    """This task is always explicitly requested."""
    return False

  def execute(self):
    """Builds the usage graph and renders it to `--output-file`, or to stdout when unset.

    With `--transitive` all context targets are analyzed; otherwise only the target roots.
    """
    options = self.get_options()
    targets = self.context.targets() if options.transitive else self.context.target_roots
    graph = self.create_dep_usage_graph(targets)
    output_file = options.output_file
    if output_file:
      self.context.log.info('Writing dependency usage to {}'.format(output_file))
      with open(output_file, 'w') as fh:
        self._render(graph, fh)
    else:
      # Written as text rather than bytes: `_render` writes text chunks to this same stream,
      # and a bytes literal is rejected by a Python 3 text-mode stdout.
      sys.stdout.write('\n')
      self._render(graph, sys.stdout)

  @classmethod
  def implementation_version(cls):
    """Returns the parent implementation version extended with this task's own version."""
    return super(JvmDependencyUsage, cls).implementation_version() + [('JvmDependencyUsage', 7)]

  def _render(self, graph, fh):
    """Writes the summary (when `--summary` is set) or the JSON report to `fh`, then flushes.

    :param graph: The DependencyUsageGraph to render.
    :param fh: A writable file-like object.
    """
    chunks = graph.to_summary() if self.get_options().summary else graph.to_json()
    for chunk in chunks:
      fh.write(chunk)
    fh.flush()

  def _dep_type(self, target, dep, declared_deps, eligible_unused_deps, is_used):
    """Returns a tuple of a 'declared'/'undeclared' boolean, and 'used'/'unused' boolean.

    These values are related, because some declared deps are not eligible to be considered unused.

    :param target: The source target.
    :param dep: The dependency to compute a type for.
    :param declared_deps: The declared dependencies of the target.
    :param eligible_unused_deps: The declared dependencies of the target that are eligible
      to be considered unused; this is generally only 'DEFAULT' scoped dependencies.
    :param is_used: True if the dep was actually used at compile time.
    """
    if target == dep:
      # A self edge is always reported as both declared and used.
      return True, True
    is_declared = dep in declared_deps
    # A dep that is not eligible to be unused is reported as used even without used products.
    return is_declared, (is_used or dep not in eligible_unused_deps)

  def _select(self, target):
    """Returns True if `target` should be represented in the usage graph.

    JarLibrary targets are excluded when `--internal-only` is set; Resources targets and
    plain AliasTarget/Target instances are always excluded.
    """
    if self.get_options().internal_only and isinstance(target, JarLibrary):
      return False
    # NB: `type(...)` rather than `isinstance` on purpose: only instances whose exact type is
    # AliasTarget or Target are excluded, instances of their subclasses are still selected.
    if isinstance(target, Resources) or type(target) in (AliasTarget, Target):
      return False
    return True

  def create_dep_usage_graph(self, targets):
    """Creates a graph of concrete targets, with their sum of products and dependencies.

    Synthetic targets contribute products and dependencies to their concrete target.

    :param targets: The targets to create graph nodes for.
    :returns: A DependencyUsageGraph using the size estimator chosen by `--size-estimator`.
    """
    with self.invalidated(targets,
                          invalidate_dependents=True) as invalidation_check:
      target_to_vts = {vts.target: vts for vts in invalidation_check.all_vts}

      if self.get_options().use_cached:
        node_creator = self.cached_node_creator(target_to_vts)
      else:
        products = self.context.products
        node_creator = self.calculating_node_creator(
          products.get_data('classes_by_source'),
          products.get_data('runtime_classpath'),
          products.get_data('product_deps_by_src'),
          target_to_vts)

      size_estimator = self.size_estimators[self.get_options().size_estimator]
      return DependencyUsageGraph(self.create_dep_usage_nodes(targets, node_creator),
                                  size_estimator)

  def calculating_node_creator(self, classes_by_source, runtime_classpath, product_deps_by_src,
                               target_to_vts):
    """Strategy directly computes dependency graph node based on
    `classes_by_source`, `runtime_classpath`, `product_deps_by_src` parameters and
    stores the result to the build cache.

    :param target_to_vts: Dict mapping each target to its versioned target.
    :returns: A function mapping a target to its freshly computed Node.
    """
    analyzer = JvmDependencyAnalyzer(get_buildroot(), runtime_classpath, product_deps_by_src)
    # These lookups are computed once over all context targets and shared by every call
    # of the returned creator.
    targets = self.context.targets()
    targets_by_file = analyzer.targets_by_file(targets)
    transitive_deps_by_target = analyzer.compute_transitive_deps_by_target(targets)

    def creator(target):
      transitive_deps = set(transitive_deps_by_target.get(target))
      node = self.create_dep_usage_node(target,
                                        analyzer,
                                        classes_by_source,
                                        targets_by_file,
                                        transitive_deps)
      # Persist the node into the target's results dir so that a later `--use-cached`
      # run can restore it.
      vt = target_to_vts[target]
      with open(self.nodes_json(vt.results_dir), mode='w') as fp:
        json.dump(node.to_cacheable_dict(), fp, indent=2, sort_keys=True)
      vt.update()
      return node

    return creator

  def cached_node_creator(self, target_to_vts):
    """Strategy restores dependency graph node from the build cache.

    When a target has no valid cache entry, or the entry cannot be deserialized, a warning
    is logged and an empty Node for the target's concrete target is returned instead.

    :param target_to_vts: Dict mapping each target to its versioned target.
    :returns: A function mapping a target to its restored (or empty) Node.
    """
    def resolve_spec(spec):
      # `next(iter(...))` instead of `.__iter__().next()`: the `.next()` method only exists
      # on Python 2 iterators, and on Python 3 the resulting AttributeError would be swallowed
      # by the broad `except` below, silently turning every cached node into an empty one.
      return next(iter(self.context.resolve(spec)))

    def creator(target):
      vt = target_to_vts[target]
      # The results dir is only consulted for valid versioned targets.
      if vt.valid:
        node_path = self.nodes_json(vt.results_dir)
        if os.path.exists(node_path):
          try:
            with open(node_path) as fp:
              return Node.from_cacheable_dict(json.load(fp), resolve_spec)
          except Exception:
            # Deliberately best-effort: a broken cache entry degrades to an empty node.
            self.context.log.warn("Can't deserialize json for target {}".format(target))
            return Node(target.concrete_derived_from)
      self.context.log.warn("No cache entry for {}".format(target))
      return Node(target.concrete_derived_from)

    return creator

  def nodes_json(self, target_results_dir):
    """Returns the path of the serialized node file inside the given results dir."""
    return os.path.join(target_results_dir, 'node.json')

  def create_dep_usage_nodes(self, targets, node_creator):
    """Creates one Node per concrete target, merging the nodes of its derived targets.

    :param targets: The targets to create nodes for; unselected targets are skipped.
    :param node_creator: A function mapping a target to a Node.
    :returns: Dict mapping concrete targets to Nodes, without Nodes that have 0 products.
    """
    nodes = {}
    for target in targets:
      if not self._select(target):
        continue
      # Create or extend a Node for the concrete version of this target.
      concrete_target = target.concrete_derived_from
      node = node_creator(target)
      if concrete_target in nodes:
        nodes[concrete_target].combine(node)
      else:
        nodes[concrete_target] = node

    # Prune any Nodes with 0 products. Building a new dict avoids mutating `nodes` while
    # iterating it and does not rely on `dict.items()` returning a list (Python 2 only).
    return {concrete_target: node
            for concrete_target, node in nodes.items()
            if node.products_total != 0}

  # NOTE(review): defined as a plain method here; if the base Task exposes `cache_target_dirs`
  # as a property, this override presumably needs the same decorator -- confirm.
  def cache_target_dirs(self):
    return True

  def create_dep_usage_node(self, target, analyzer, classes_by_source, targets_by_file,
                            transitive_deps):
    """Computes the Node of a single target, including the edges to its dependencies.

    :param target: The target to create a Node for.
    :param analyzer: The JvmDependencyAnalyzer providing the buildroot, the product deps by
      source, alias resolution and product counting.
    :param classes_by_source: Not used by this method.
    :param targets_by_file: Dict mapping a product file to the targets it belongs to.
    :param transitive_deps: Container of the targets `target` depends on; an edge for a used
      product is only recorded when the owning concrete target is in this container.
    :returns: The Node for the concrete target that `target` is derived from.
    """
    buildroot = analyzer.buildroot
    product_deps_by_src = analyzer.product_deps_by_src
    declared_deps_with_aliases = set(analyzer.resolve_aliases(target))
    eligible_unused_deps = {dep for dep, _ in analyzer.resolve_aliases(target,
                                                                      scope=Scopes.DEFAULT)}
    concrete_target = target.concrete_derived_from
    # A set, because it is only used for membership tests in `_dep_type`.
    declared_deps = {resolved for resolved, _ in declared_deps_with_aliases}
    products_total = analyzer.count_products(target)
    node = Node(concrete_target)
    node.add_derivation(target, products_total)

    def _construct_edge(dep_tgt, products_used):
      is_declared, is_used = self._dep_type(target,
                                            dep_tgt,
                                            declared_deps,
                                            eligible_unused_deps,
                                            len(products_used) > 0)
      return Edge(is_declared=is_declared, is_used=is_used, products_used=products_used)

    # Record declared Edges, initially all as "unused" or "declared".
    for dep_tgt, aliased_from in declared_deps_with_aliases:
      derived_from = dep_tgt.concrete_derived_from
      if self._select(derived_from):
        node.add_edge(_construct_edge(dep_tgt, products_used=set()), derived_from, aliased_from)

    # Record the used products and undeclared Edges for this target. Note that some of
    # these may be self edges, which are considered later.
    target_product_deps_by_src = product_deps_by_src.get(target, {})
    for src in target.sources_relative_to_buildroot():
      for product_dep in target_product_deps_by_src.get(os.path.join(buildroot, src), []):
        for dep_tgt in targets_by_file.get(product_dep, []):
          derived_from = dep_tgt.concrete_derived_from
          if not self._select(derived_from):
            continue
          # Create edge only for those direct or transitive dependencies in order to
          # disqualify irrelevant targets that happen to share some file in sources,
          # not uncommon when globs especially rglobs is used.
          if derived_from not in transitive_deps:
            continue
          node.add_edge(_construct_edge(dep_tgt, products_used={product_dep}), derived_from)

    return node


class Node(object):
  """A graph node for one concrete target, with the edges to its dependencies.

  A node accumulates the products of every target derived from the concrete target, and
  can be serialized to and restored from a JSON-compatible dict.
  """

  def __init__(self, concrete_target):
    self.concrete_target = concrete_target
    # Sum of the products of all derivations added to this node.
    self.products_total = 0
    # The targets that contributed products to this node.
    self.derivations = set()
    # Dict mapping concrete dependency targets to an Edge object.
    self.dep_edges = defaultdict(Edge)
    # Dict mapping concrete dependency targets to where they are aliased from.
    self.dep_aliases = defaultdict(set)

  def add_derivation(self, derived_target, derived_products):
    """Records a contributing target and adds its product count to the total."""
    self.derivations.add(derived_target)
    self.products_total += derived_products

  def add_edge(self, edge, dest, dest_aliased_from=None):
    """Merges `edge` into the edge towards `dest`, optionally recording an alias.

    :param edge: The edge to merge (via `+=`) into the existing edge for `dest`.
    :param dest: The concrete dependency target the edge points to.
    :param dest_aliased_from: The target `dest` was aliased from, if any.
    """
    self.dep_edges[dest] += edge
    if dest_aliased_from:
      self.dep_aliases[dest].add(dest_aliased_from)

  def combine(self, other_node):
    """Merges the products, derivations, edges and aliases of `other_node` into this node.

    Both nodes must describe the same concrete target. `other_node` is left unmodified.
    """
    assert other_node.concrete_target == self.concrete_target
    self.products_total += other_node.products_total
    self.derivations.update(other_node.derivations)
    # Merge per dependency rather than `dict.update`: update() would replace the Edge / alias
    # set already recorded here for a shared dependency, dropping its used products and flags.
    for dest, edge in other_node.dep_edges.items():
      self.dep_edges[dest] += edge
    for dest, aliases in other_node.dep_aliases.items():
      self.dep_aliases[dest] |= aliases

  def to_cacheable_dict(self):
    """Returns a JSON-serializable dict of this node, with targets replaced by their specs."""
    edges = {}
    for dest, edge in self.dep_edges.items():
      edges[dest.address.spec] = {
        'products_used': list(edge.products_used),
        'is_declared': edge.is_declared,
        'is_used': edge.is_used,
      }

    aliases = {}
    for dep, dep_aliases in self.dep_aliases.items():
      aliases[dep.address.spec] = [alias.address.spec for alias in dep_aliases]

    return {
      'target': self.concrete_target.address.spec,
      'products_total': self.products_total,
      'derivations': [derivation.address.spec for derivation in self.derivations],
      'dep_edges': edges,
      'aliases': aliases,
    }

  @staticmethod
  def from_cacheable_dict(cached_dict, target_resolve_func):
    """Restores a Node from a dict with the layout produced by `to_cacheable_dict`.

    :param cached_dict: The deserialized dict.
    :param target_resolve_func: A function mapping an address spec to a target.
    :returns: The restored Node.
    """
    res = Node(target_resolve_func(cached_dict['target']))
    res.products_total = cached_dict['products_total']
    res.derivations.update(target_resolve_func(spec) for spec in cached_dict['derivations'])
    for dep_spec, cached_edge in cached_dict['dep_edges'].items():
      res.dep_edges[target_resolve_func(dep_spec)] = Edge(
        is_declared=cached_edge['is_declared'],
        is_used=cached_edge['is_used'],
        products_used=set(cached_edge['products_used']))
    for dep_spec, alias_specs in cached_dict['aliases'].items():
      # No entry is created for a dependency without aliases.
      if alias_specs:
        res.dep_aliases[target_resolve_func(dep_spec)].update(
          target_resolve_func(alias_spec) for alias_spec in alias_specs)
    return res


class Edge(object):
  """A dependency edge: the set of used products plus 'declared' and 'used' flags.

  Edges are merged in place with `+=`, which unions the used products and ORs both flags.
  """

  def __init__(self, is_declared=False, is_used=False, products_used=None):
    self.is_declared = is_declared
    self.is_used = is_used
    # Any falsy value (None, or an empty collection) is replaced by a fresh empty set.
    self.products_used = products_used if products_used else set()

  def __iadd__(self, that):
    self.is_declared = self.is_declared | that.is_declared
    self.is_used = self.is_used | that.is_used
    self.products_used |= that.products_used
    return self


class DependencyUsageGraph(object):
  """Renders dependency usage nodes either as a summary or as a full JSON report.

  :param nodes: Dict mapping concrete targets to their Node.
  :param size_estimator: Callable computing a cost from the sources of a target.
  """

  def __init__(self, nodes, size_estimator):
    self._nodes = nodes
    self._size_estimator = size_estimator
    # Memoized results of `_cost` and `_trans_cost`, keyed by target.
    self._cost_cache = {}
    self._trans_cost_cache = {}

  def _cost(self, target):
    """Returns the (memoized) estimated size of the sources of `target`."""
    cache = self._cost_cache
    if target in cache:
      return cache[target]
    cost = self._size_estimator(target.sources_relative_to_buildroot())
    cache[target] = cost
    return cost

  def _trans_cost(self, target):
    """Returns the (memoized) cost of `target` plus the transitive cost of each dependency."""
    cache = self._trans_cost_cache
    if target in cache:
      return cache[target]
    deps_cost = 0
    for dep in target.dependencies:
      deps_cost += self._trans_cost(dep)
    total = self._cost(target) + deps_cost
    cache[target] = total
    return total

  def _edge_type(self, target, edge, dep):
    """Classifies an edge as 'self', 'declared', 'unused' or 'undeclared'."""
    if target == dep:
      return 'self'
    if not edge.is_declared:
      return 'undeclared'
    return 'declared' if edge.is_used else 'unused'

  def _used_ratio(self, dep_tgt, edge):
    """Returns the ratio of the products of `dep_tgt` that are used through `edge`."""
    used_products = edge.products_used
    if used_products:
      # If products were recorded as used, generate a legitimate ratio.
      if dep_tgt in self._nodes:
        available = self._nodes[dep_tgt].products_total
      else:
        available = 1
      return len(used_products) / max(available, 1)
    # Without recorded products, a used dep (it might not be in the default scope) must be
    # considered to be fully used; otherwise it is definitely not used.
    return 1.0 if edge.is_used else 0.0

  def to_summary(self):
    """Outputs summarized dependencies ordered by a combination of max usage and cost."""

    # Aggregate inbound edges by their maximum product usage ratio; self edges are ignored.
    max_target_usage = defaultdict(float)
    for target, node in self._nodes.items():
      for dep_target, edge in node.dep_edges.items():
        if target != dep_target:
          used_ratio = self._used_ratio(dep_target, edge)
          max_target_usage[dep_target] = max(max_target_usage[dep_target], used_ratio)

    # Calculate a score for each.
    Score = namedtuple('Score', ('badness', 'max_usage', 'cost_transitive', 'target'))

    def score_of(target, max_usage):
      cost_transitive = self._trans_cost(target)
      # A usage of zero leaves the cost undivided.
      divisor = max_usage if max_usage > 0.0 else 1.0
      return Score(int(cost_transitive / divisor), max_usage, cost_transitive,
                   target.address.spec)

    scores = [score_of(target, max_usage) for target, max_usage in max_target_usage.items()]

    # Output in order by score, as a JSON list with one entry per chunk.
    yield '[\n'
    for index, score in enumerate(sorted(scores, key=lambda s: s.badness)):
      separator = ',\n' if index else ''
      yield '{}  {}'.format(separator, json.dumps(score._asdict()))
    yield '\n]\n'

  def to_json(self):
    """Outputs the entire graph."""

    def describe_edge(node, edge, dep_tgt, aliases):
      return {
        'target': dep_tgt.address.spec,
        'dependency_type': self._edge_type(node.concrete_target, edge, dep_tgt),
        'products_used': len(edge.products_used),
        'products_used_ratio': self._used_ratio(dep_tgt, edge),
        'aliases': [alias.address.spec for alias in aliases],
      }

    report = {}
    for node in self._nodes.values():
      concrete_target = node.concrete_target
      cost = self._cost(concrete_target)
      cost_transitive = self._trans_cost(concrete_target)
      dependencies = []
      for dep_tgt, edge in node.dep_edges.items():
        aliases = node.dep_aliases.get(dep_tgt, {})
        dependencies.append(describe_edge(node, edge, dep_tgt, aliases))
      report[concrete_target.address.spec] = {
        'cost': cost,
        'cost_transitive': cost_transitive,
        'products_total': node.products_total,
        'dependencies': dependencies,
      }
    yield json.dumps(report, indent=2, sort_keys=True)