/fix_includes.py
Python | 1561 lines | 1371 code | 44 blank | 146 comment | 34 complexity | f7f006b0350e365fa636d054a706846a MD5 | raw file
Possible License(s): JSON
Large files files are truncated, but you can click here to view the full file
- #!/usr/bin/python
- ##===--- fix_includes.py - rewrite source files based on iwyu output ------===##
- #
- # The LLVM Compiler Infrastructure
- #
- # This file is distributed under the University of Illinois Open Source
- # License. See LICENSE.TXT for details.
- #
- ##===----------------------------------------------------------------------===##
- """Update files with the 'correct' #include and forward-declare lines.
- Given the output of include_what_you_use on stdin -- when run at the
- (default) --v=1 verbosity level or higher -- modify the files
- mentioned in the output, removing their old #include lines and
- replacing them with the lines given by the include_what_you_use
- script.
- We only edit files that are writeable (presumably open for p4 edit),
- unless the user supplies a command to make files writeable via the
- --checkout_command flag (eg '--checkout_command="p4 edit"').
- This script runs in four stages. In the first, it groups physical
- lines together to form 'move spans'. A 'move span' is the atomic unit
- for moving or deleting code. A move span is either a) an #include
- line, along with any comment lines immediately preceding it; b) a
- forward-declare line -- or more if it's a multi-line forward declare
- -- along with preceding comments; c) any other single line. Example:
- // I really am glad I'm forward-declaring this class!
- // If I didn't, I'd have to #include the entire world.
- template<typename A, typename B, typename C, typename D>
- class MyClass;
- Then, it groups move spans together into 'reorder spans'. These are
- spans of code that consist entirely of #includes and forward-declares,
- maybe separated by blank lines and comments. We assume that we can
- arbitrarily reorder #includes and forward-declares within a reorder
- span, without affecting correctness. Things like #ifdefs, #defines,
- namespace declarations, static variable declarations, class
- definitions, etc -- just about anything -- break up reorder spans.
- In stage 3 it deletes all #include and forward-declare lines that iwyu
- says to delete. iwyu includes line numbers for deletion, making this
- part easy. If this step results in "empty" #ifdefs or namespaces
- (#ifdefs or namespaces with no code inside them), we delete those as
- well. We recalculate the reorder spans, which may have gotten bigger
- due to the deleted code.
- In stage 4 it adds new iwyu-dictated #includes and forward-declares
- after the last existing #includes and forward-declares. Then it
- reorders the #includes and forward-declares to match the order
- specified by iwyu. It follows iwyu's instructions as much as
- possible, modulo the constraint that an #include or forward-declare
- cannot leave its current reorder span.
- All this moving messes up the blank lines, which we then need to fix
- up. Then we're done!
- """
# Author tag for the original upstream script.
__author__ = 'csilvers@google.com (Craig Silverstein)'
- import difflib
- import optparse
- import os
- import pipes # For (undocumented) pipes.quote
- import re
- import sys
- import subprocess
# Help text shown by optparse; optparse substitutes the program name for %prog.
_USAGE = """\
%prog [options] [filename] ... < <output from include-what-you-use script>
OR %prog -s [other options] <filename> ...
%prog reads the output from the include-what-you-use
script on stdin -- run with --v=1 (default) verbose or above -- and,
unless --sort_only or --dry_run is specified,
modifies the files mentioned in the output, removing their old
#include lines and replacing them with the lines given by the
include_what_you_use script. It also sorts the #include and
forward-declare lines.
Only writable files (those opened for p4 edit) are modified (unless
--checkout_command is specified). All files mentioned in the
include-what-you-use script are modified, unless filenames are
specified on the commandline, in which case only those files are
modified.
The exit code is the number of files that were modified (or that would
be modified if --dry_run was specified) unless that number exceeds 100,
in which case 100 is returned.
"""
# Matches a trailing C++-style comment (used to strip comments off a line).
_COMMENT_RE = re.compile(r'\s*//.*')

# These are the types of lines a file can have.  These are matched
# using re.match(), so don't need a leading ^.
_C_COMMENT_START_RE = re.compile(r'\s*/\*')
_C_COMMENT_END_RE = re.compile(r'.*\*/\s*(.*)$')
_COMMENT_LINE_RE = re.compile(r'\s*//')
_BLANK_LINE_RE = re.compile(r'\s*$')
_IF_RE = re.compile(r'\s*#\s*if')              # matches #if/#ifdef/#ifndef
_ELSE_RE = re.compile(r'\s*#\s*(else|elif)\b')  # matches #else/#elif
_ENDIF_RE = re.compile(r'\s*#\s*endif\b')
# This is used to delete 'empty' namespaces after fwd-decls are removed.
# Some third-party libraries use macros to start/end namespaces.
_NAMESPACE_START_RE = re.compile(r'\s*(namespace\b[^{]*{\s*)+(//.*)?$|'
                                 r'\s*(U_NAMESPACE_BEGIN)|'
                                 r'\s*(HASH_NAMESPACE_DECLARATION_START)')
_NAMESPACE_END_RE = re.compile(r'\s*(})|'
                               r'\s*(U_NAMESPACE_END)|'
                               r'\s*(HASH_NAMESPACE_DECLARATION_END)')
# The group (in parens) holds the unique 'key' identifying this #include.
_INCLUDE_RE = re.compile(r'\s*#\s*include\s+([<"][^"">]+[>"])')
# We don't need this to actually match forward-declare lines (we get
# that information from the iwyu input), but we do need an RE here to
# serve as an index to _LINE_TYPES.  So we use an RE that never matches.
_FORWARD_DECLARE_RE = re.compile(r'$.FORWARD_DECLARE_RE')
# Likewise, used to mark an '#ifdef' line of a header guard, or other
# #ifdef that covers an entire file.  Assigned in
# _MarkHeaderGuardIfPresent(), never matched directly.
_HEADER_GUARD_RE = re.compile(r'$.HEADER_GUARD_RE')
# Marks the '#define' line that comes after a header guard.  Since we
# know the previous line was a header-guard line, we're not that picky
# about this one.
_HEADER_GUARD_DEFINE_RE = re.compile(r'\s*#\s*define\s+')
# We annotate every line in the source file by the re it matches, or None.
# Note that not all of the above RE's are represented here; for instance,
# we fold _C_COMMENT_START_RE and _C_COMMENT_END_RE into _COMMENT_LINE_RE.
_LINE_TYPES = [_COMMENT_LINE_RE, _BLANK_LINE_RE,
               _NAMESPACE_START_RE, _NAMESPACE_END_RE,
               _IF_RE, _ELSE_RE, _ENDIF_RE,
               _INCLUDE_RE, _FORWARD_DECLARE_RE,
               _HEADER_GUARD_RE, _HEADER_GUARD_DEFINE_RE,
              ]
# A regexp matching #include lines that should be a barrier for
# sorting -- that is, we should never reorganize the code so an
# #include that used to come before this line now comes after, or vice
# versa.  This can be used for 'fragile' #includes that require other
# #includes to happen before them to function properly.
# (Note that the barrier has no effect on where new #includes are
# added; it just affects the reordering of existing #includes.)
_BARRIER_INCLUDES = re.compile(r'^\s*#\s*include\s+(<linux/)')
- def _MayBeHeaderFile(filename):
- """Tries to figure out if filename is a C++ header file. Defaults to yes."""
- # Header files have all sorts of extensions: .h, .hpp, .hxx, or no
- # extension at all. So we say everything is a header file unless it
- # has a known extension that's not.
- extension = os.path.splitext(filename)[1]
- return extension not in ('.c', '.cc', '.cxx', '.cpp', '.C', '.CC')
class FixIncludesError(Exception):
  """Raised on any unrecoverable problem parsing or applying iwyu output."""
  pass
class IWYUOutputRecord(object):
  """Information that the iwyu output file has about one source file."""

  def __init__(self, filename):
    self.filename = filename

    # Integer line numbers that iwyu says to delete.
    self.lines_to_delete = set()

    # Integer line numbers of #includes that iwyu annotated with a
    # line number.  Usually not an exhaustive list, which is fine: we
    # use it only to sanity-check our own analysis (each such line
    # must look like an #include to us too, or the iwyu data is likely
    # stale and we complain).  More entries are welcome, not required.
    self.some_include_lines = set()

    # Integer [start_line, end_line) pairs, one per forward-declare
    # iwyu saw in the source.  (iwyu won't report forward-declares
    # hidden inside '#if 0' or similar.)
    self.seen_forward_declare_lines = set()

    # Every line from iwyu's 'add' section.
    self.includes_and_forward_declares_to_add = set()

    # Maps an include filename (including the ""s or <>s) to the full
    # line iwyu printed for it, comments and all.  Holds both 'to-add'
    # and 'to-keep' #includes.  When flags.comments is False the
    # comments are stripped before the line is stored here.
    self.full_include_lines = {}

  def Merge(self, other):
    """Merges other with this one.  They must share a filename.

    Intended for when the iwyu input contains two records for the same
    file; we fold them into one.  We stay conservative: lines to add
    are unioned, lines to delete are intersected.

    Arguments:
      other: an IWYUOutputRecord to merge into this one.
        It must have the same value for filename that self does.
    """
    assert self.filename == other.filename, "Can't merge distinct files"
    self.lines_to_delete.intersection_update(other.lines_to_delete)
    self.some_include_lines.update(other.some_include_lines)
    self.seen_forward_declare_lines.update(other.seen_forward_declare_lines)
    self.includes_and_forward_declares_to_add.update(
        other.includes_and_forward_declares_to_add)
    self.full_include_lines.update(other.full_include_lines)

  def HasContentfulChanges(self):
    """Returns true iff this record has at least one add or delete."""
    return (self.includes_and_forward_declares_to_add or
            self.lines_to_delete)

  def __str__(self):
    return ('--- iwyu record ---\n FILENAME: %s\n LINES TO DELETE: %s\n'
            ' (SOME) INCLUDE LINES: %s\n (SOME) FWD-DECL LINES: %s\n'
            ' TO ADD: %s\n ALL INCLUDES: %s\n---\n'
            % (self.filename, self.lines_to_delete,
               self.some_include_lines, self.seen_forward_declare_lines,
               self.includes_and_forward_declares_to_add,
               self.full_include_lines))
class IWYUOutputParser(object):
  """Parses the lines in iwyu output corresponding to one source file."""

  # iwyu adds this comment to some lines to map them to the source file.
  _LINE_NUMBERS_COMMENT_RE = re.compile(r'\s*// lines ([0-9]+)-([0-9]+)')

  # The output of include-what-you-use has sections that indicate what
  # #includes and forward-declares should be added to the output file,
  # what should be removed, and what the end result is.  The first line
  # of each section also has the filename.
  _ADD_SECTION_RE = re.compile(r'^(.*) should add these lines:$')
  _REMOVE_SECTION_RE = re.compile(r'^(.*) should remove these lines:$')
  _TOTAL_SECTION_RE = re.compile(r'^The full include-list for ([^:]*):$')
  _SECTION_END_RE = re.compile(r'^---$')

  # Alternately, if a file does not need any iwyu modifications (though
  # it still may need its #includes sorted), iwyu will emit this:
  _NO_EDITS_RE = re.compile(r'^\((.*) has correct #includes/fwd-decls\)$')

  # Human-readable names for the section REs, used in error messages.
  _RE_TO_NAME = {_ADD_SECTION_RE: 'add',
                 _REMOVE_SECTION_RE: 'remove',
                 _TOTAL_SECTION_RE: 'total',
                 _SECTION_END_RE: 'end',
                 _NO_EDITS_RE: 'no_edits',
                }

  # A small state-transition machine.  key==None indicates the start
  # state.  value==None means that the key is an end state (that is,
  # its presence indicates the record is finished).
  _EXPECTED_NEXT_RE = {
      None: frozenset([_ADD_SECTION_RE, _NO_EDITS_RE]),
      _ADD_SECTION_RE: frozenset([_REMOVE_SECTION_RE]),
      _REMOVE_SECTION_RE: frozenset([_TOTAL_SECTION_RE]),
      _TOTAL_SECTION_RE: frozenset([_SECTION_END_RE]),
      _SECTION_END_RE: None,
      _NO_EDITS_RE: None,
  }

  def __init__(self):
    # This is set to one of the 'section' REs above.  None is the start-state.
    self.current_section = None
    self.filename = '<unknown file>'
    self.lines_by_section = {}    # key is a section RE, value is a list of lines

  def _ProcessOneLine(self, line):
    """Reads one line of input, updates self, and returns False at EORecord.

    If the line matches one of the hard-coded section names, updates
    self.filename and self.current_section.  Otherwise, the line is
    taken to be a member of the currently active section, and is added
    to self.lines_by_section.

    Arguments:
      line: one line from the iwyu input file.

    Returns:
      False if the line is the end-of-record marker, True otherwise.

    Raises:
      FixIncludesError: if there is an out-of-order section or
        mismatched filename.
    """
    line = line.rstrip()    # don't worry about line endings
    if not line:            # just ignore blank lines
      return True
    for (section_re, section_name) in self._RE_TO_NAME.iteritems():
      m = section_re.search(line)
      if m:
        # Check or set the filename (if the re has a group, it's for filename).
        if section_re.groups >= 1:
          this_filename = m.group(1)
          if (self.current_section is not None and
              this_filename != self.filename):
            raise FixIncludesError('"%s" section for %s comes after "%s" for %s'
                                   % (section_name, this_filename,
                                      self._RE_TO_NAME[self.current_section],
                                      self.filename))
          self.filename = this_filename
        # Check and set the new section we're entering.
        if section_re not in self._EXPECTED_NEXT_RE[self.current_section]:
          if self.current_section is None:
            raise FixIncludesError('%s: "%s" section unexpectedly comes first'
                                   % (self.filename, section_name))
          else:
            raise FixIncludesError('%s: "%s" section unexpectedly follows "%s"'
                                   % (self.filename, section_name,
                                      self._RE_TO_NAME[self.current_section]))
        self.current_section = section_re
        # We're done parsing this record if this section has nothing after it.
        return self._EXPECTED_NEXT_RE[self.current_section] is not None
    # We're not starting a new section, so just add to the current section.
    # We ignore lines before section-start, they're probably things like
    # compiler messages ("Compiling file foo").
    if self.current_section is not None:
      self.lines_by_section.setdefault(self.current_section, []).append(line)
    return True

  def ParseOneRecord(self, iwyu_output, flags):
    """Given a file object with output from an iwyu run, return per file info.

    For each source file that iwyu_output mentions (because iwyu was run on
    it), we return a structure holding the information in IWYUOutputRecord:
    1) What file these changes apply to
    2) What line numbers hold includes/fwd-declares to remove
    3) What includes/fwd-declares to add
    4) Ordering information for includes and fwd-declares

    Arguments:
      iwyu_output: a File object returning lines from an iwyu run
      flags: commandline flags, as parsed by optparse.  We use
         flags.comments, which controls whether we output comments
         generated by iwyu.

    Returns:
      An IWYUOutputRecord object, or None at EOF.

    Raises:
      FixIncludesError: for malformed-looking lines in the iwyu output.
    """
    for line in iwyu_output:
      if not self._ProcessOneLine(line):   # returns False at end-of-record
        break
    else:    # for/else: the for loop ran off the end of the input
      return None    # at EOF

    # Now set up all the fields in an IWYUOutputRecord.
    # IWYUOutputRecord.filename
    retval = IWYUOutputRecord(self.filename)

    # IWYUOutputRecord.lines_to_delete
    for line in self.lines_by_section.get(self._REMOVE_SECTION_RE, []):
      m = self._LINE_NUMBERS_COMMENT_RE.search(line)
      if not m:
        raise FixIncludesError('line "%s" (for %s) has no line number'
                               % (line, self.filename))
      # The RE is of the form [start_line, end_line], inclusive.
      for line_number in xrange(int(m.group(1)), int(m.group(2)) + 1):
        retval.lines_to_delete.add(line_number)

    # IWYUOutputRecord.some_include_lines
    for line in (self.lines_by_section.get(self._REMOVE_SECTION_RE, []) +
                 self.lines_by_section.get(self._TOTAL_SECTION_RE, [])):
      if not _INCLUDE_RE.match(line):
        continue
      m = self._LINE_NUMBERS_COMMENT_RE.search(line)
      if not m:
        continue   # not all #include lines have line numbers, but some do
      for line_number in xrange(int(m.group(1)), int(m.group(2)) + 1):
        retval.some_include_lines.add(line_number)

    # IWYUOutputRecord.seen_forward_declare_lines
    for line in (self.lines_by_section.get(self._REMOVE_SECTION_RE, []) +
                 self.lines_by_section.get(self._TOTAL_SECTION_RE, [])):
      # Everything that's not an #include is a forward-declare.
      if line.startswith('- '):    # the 'remove' lines all start with '- '.
        line = line[len('- '):]
      if _INCLUDE_RE.match(line):
        continue
      m = self._LINE_NUMBERS_COMMENT_RE.search(line)
      if m:
        # Stored as a half-open [start_line, end_line) span.
        retval.seen_forward_declare_lines.add((int(m.group(1)),
                                               int(m.group(2))+1))

    # IWYUOutputRecord.includes_and_forward_declares_to_add
    for line in self.lines_by_section.get(self._ADD_SECTION_RE, []):
      line = _COMMENT_RE.sub('', line)
      retval.includes_and_forward_declares_to_add.add(line)

    # IWYUOutputRecord.full_include_lines
    for line in self.lines_by_section.get(self._TOTAL_SECTION_RE, []):
      m = _INCLUDE_RE.match(line)
      if m:
        if not flags.comments:
          line = _COMMENT_RE.sub('', line)  # pretend there were no comments
        else:
          # Just remove '// line XX': that's iwyu metadata, not a real comment
          line = self._LINE_NUMBERS_COMMENT_RE.sub('', line)
        retval.full_include_lines[m.group(1)] = line

    return retval
class LineInfo(object):
  """Information about a single line of a source file."""

  def __init__(self, line):
    """Initializes the content of the line, but no ancillary fields."""
    # The raw text of this line in the input file.
    self.line = line
    # One of the regular-expression objects in _LINE_TYPES, or None
    # when no regexp in _LINE_TYPES matches this line.
    self.type = None
    # True when no earlier line in the file shares this line's type.
    self.is_first_line_of_this_type = False
    # True when this line should be dropped from the output (for
    # instance, because iwyu says to delete it).  Initially only the
    # 'dummy' line 0 -- whose text is None -- is marked deleted.
    self.deleted = self.line is None
    # For #include and forward-declare lines, [begin, end) pairs
    # naming the spans this line belongs to: the move span (the
    # #include or forward-declare plus its preceding comments) and the
    # reorder span (a contiguous block of move spans, joined only by
    # blank lines and comments).  Arbitrary for all other lines.
    self.move_span = None
    self.reorder_span = None
    # The line's 'key': for an #include, the included filename
    # (including the ""s or <>s); for a forward-declare, the
    # class/struct name; None for everything else.
    self.key = None

  def __str__(self):
    line = ('XX-%s-XX' % self.line) if self.deleted else ('>>>%s<<<' % self.line)
    type_id = None if self.type is None else _LINE_TYPES.index(self.type)
    return ('%s\n -- type: %s (key: %s). move_span: %s. reorder_span: %s'
            % (line, type_id, self.key, self.move_span, self.reorder_span))
- def _ReadFile(filename):
- """Read from filename and return a list of file lines."""
- try:
- return open(filename).read().splitlines()
- except (IOError, OSError), why:
- print "Skipping '%s': %s" % (filename, why)
- return None
- def _ReadWriteableFile(filename, ignore_writeable):
- """Read from filename and return a list of file lines.
- Given a filename, if the file is found and is writable, read
- the file contents and return it as a list of lines (newlines
- removed). If the file is not found or is not writable, or if
- there is another IO error, return None.
- Arguments:
- filename: the name of the file to read.
- ignore_writeable: if True, don't check whether the file is writeable;
- return the contents anyway.
- Returns:
- A list of lines (without trailing newline) from filename, or None
- if the file is not writable, or cannot be read.
- """
- if os.access(filename, os.W_OK) or ignore_writeable:
- return _ReadFile(filename)
- return None
- def _WriteFileContentsToFileObject(f, file_lines):
- """Write the given file-lines to the file."""
- f.write('\n'.join(file_lines))
- f.write('\n')
- def _WriteFileContents(filename, file_lines):
- """Write the given file-lines to the file."""
- try:
- f = open(filename, 'w')
- try:
- _WriteFileContentsToFileObject(f, file_lines)
- finally:
- f.close()
- except (IOError, OSError), why:
- print "Error writing '%s': %s" % (filename, why)
- def _CreateCommandLine(command, args):
- """Join the command with the args in a shell-quoted way."""
- ret = '%s %s' % (command, ' '.join(map(pipes.quote, args)))
- print 'Running:', ret
- return ret
def _GetCommandOutputLines(command, args):
  """Return an iterable over the output lines of the given shell command."""
  # _CreateCommandLine shell-quotes args, so shell=True is safe here.
  child = subprocess.Popen(_CreateCommandLine(command, args),
                           shell=True, stdout=subprocess.PIPE)
  return child.stdout
- def _RunCommand(command, args):
- """Run the given shell command."""
- for line in _GetCommandOutputLines(command, args):
- print line,
- def _GetCommandOutputWithInput(command, stdin_text):
- """Return the output of the given command fed the stdin_text."""
- print command
- proc = subprocess.Popen(command,
- stdin=subprocess.PIPE,
- stdout=subprocess.PIPE,
- shell=True)
- return proc.communicate(input=stdin_text)[0]
- def PrintFileDiff(old_file_contents, new_file_contents):
- """Print a unified diff between files, specified as lists of lines."""
- diff = difflib.unified_diff(old_file_contents, new_file_contents)
- # skip the '--- <filename>/+++ <filename>' lines at the start
- try:
- diff.next()
- diff.next()
- print '\n'.join(diff)
- except StopIteration:
- pass
def _MarkHeaderGuardIfPresent(file_lines):
  """If any line in file_lines is a header-guard, mark it in file_lines.

  We define a header-guard as follows: an #ifdef where there is
  nothing contentful before or after the #ifdef.  Also, the #ifdef
  should have no #elif in it (though we don't currently test that).
  This catches the common case of an 'ifdef guard' in .h file, such
  as '#ifndef FOO_H\n#define FOO_H\n...contents...\n#endif', but it
  can also catch other whole-program #ifdefs, such as
  '#ifdef __linux\n...\n#endif'.  The issue here is that if an #ifdef
  encloses the entire file, then we are willing to put new
  #includes/fwd-declares inside the #ifdef (which normally we
  wouldn't do).  So we want to mark such #ifdefs with a special label.

  If we find such an #ifdef line -- and a single file can have at most
  one -- we change its type to a special type for header guards.

  Arguments:
    file_lines: an array of LineInfo objects with .type filled in.
  """
  # Pass over blank lines or comments at the top of the file.
  i = 0
  for i in xrange(len(file_lines)):
    if (not file_lines[i].deleted and
        file_lines[i].type not in [_COMMENT_LINE_RE, _BLANK_LINE_RE]):
      break
  else:   # for/else: got to EOF without finding any non-blank/comment lines
    return

  # This next line is the candidate header guard-line.
  ifdef_start = i
  if file_lines[ifdef_start].type != _IF_RE:
    # Not a header guard, just return without doing anything.
    return

  # Find the end of this ifdef, to see if it's really a header guard.
  ifdef_depth = 0
  for ifdef_end in xrange(ifdef_start, len(file_lines)):
    if file_lines[ifdef_end].deleted:
      continue
    if file_lines[ifdef_end].type == _IF_RE:
      ifdef_depth += 1
    elif file_lines[ifdef_end].type == _ENDIF_RE:
      ifdef_depth -= 1
      if ifdef_depth == 0:   # The end of our #ifdef!
        break
  else:   # for/else
    # Weird: never found a close to this #ifdef.  (This used to
    # 'return False', inconsistent with every other exit path; the
    # return value is never used, so just return None like the rest.)
    return

  # Finally, all the lines after the end of the ifdef must be blank or comments.
  for i in xrange(ifdef_end + 1, len(file_lines)):
    if (not file_lines[i].deleted and
        file_lines[i].type not in [_COMMENT_LINE_RE, _BLANK_LINE_RE]):
      return

  # We passed the gauntlet!
  file_lines[ifdef_start].type = _HEADER_GUARD_RE

  # And the line after the header guard #ifdef is the '#define' (usually).
  if _HEADER_GUARD_DEFINE_RE.match(file_lines[ifdef_start + 1].line):
    file_lines[ifdef_start+1].type = _HEADER_GUARD_DEFINE_RE
def _CalculateLineTypesAndKeys(file_lines, iwyu_record):
  """Fills file_line's type and key fields, where the 'type' is a regexp object.

  We match each line (line_info.line) against every regexp in
  _LINE_TYPES, and assign the first that matches, or None if none
  does.  We also use iwyu_record's some_include_lines and
  seen_forward_declare_lines to identify those lines.  In fact,
  that's the only data source we use for forward-declare lines.

  Sets file_line.type and file_line.is_first_line_of_this_type for
  each file_line in file_lines.

  Arguments:
    file_lines: an array of LineInfo objects with .line fields filled in.
    iwyu_record: the IWYUOutputRecord struct for this source file.

  Raises:
    FixIncludesError: if iwyu_record's line-number information is
      inconsistent with what we see in the file.  (For instance,
      it says line 12 is an #include, but we say it's a blank line,
      or the file only has 11 lines.)
  """
  seen_types = set()
  in_c_style_comment = False
  for line_info in file_lines:
    if line_info.line is None:    # the dummy line 0 has no type
      line_info.type = None
    elif _C_COMMENT_START_RE.match(line_info.line):
      # Note: _C_COMMENT_START_RE only matches a comment at the start
      # of a line.  Comments in the middle of a line are ignored.
      # This can cause problems with multi-line comments that start
      # in the middle of the line, but that's hopefully quite rare.
      # TODO(csilvers): check for that case.
      m = _C_COMMENT_END_RE.match(line_info.line)
      if not m:                # comment continues onto future lines
        line_info.type = _COMMENT_LINE_RE
        in_c_style_comment = True
      elif not m.group(1):     # comment extends across entire line (only)
        line_info.type = _COMMENT_LINE_RE
      else:                    # comment takes only part of line, treat as content
        # TODO(csilvers): this mis-diagnoses lines like '/*comment*/class Foo;'
        line_info.type = None
    elif in_c_style_comment and _C_COMMENT_END_RE.match(line_info.line):
      line_info.type = _COMMENT_LINE_RE
      in_c_style_comment = False
    elif in_c_style_comment:
      line_info.type = _COMMENT_LINE_RE
    else:
      for type_re in _LINE_TYPES:
        # header-guard-define-re has a two-part decision criterion: it
        # matches the RE, *and* it comes after a header guard line.
        # That's too complex to figure out now, so we skip over it now
        # and fix it up later in _MarkHeaderGuardIfPresent().
        if type_re in (_HEADER_GUARD_DEFINE_RE,):
          continue
        m = type_re.match(line_info.line)
        if m:
          line_info.type = type_re
          if type_re == _INCLUDE_RE:
            line_info.key = m.group(1)    # get the 'key' for the #include.
          break
      else:    # for/else
        line_info.type = None    # means we didn't match any re
    line_info.is_first_line_of_this_type = (line_info.type not in seen_types)
    seen_types.add(line_info.type)

  # Now double-check against iwyu that we got all the #include lines right.
  for line_number in iwyu_record.some_include_lines:
    if file_lines[line_number].type != _INCLUDE_RE:
      raise FixIncludesError('iwyu line number %s:%d (%s) is not an #include'
                             % (iwyu_record.filename, line_number,
                                file_lines[line_number].line))

  # We depend entirely on the iwyu_record for the forward-declare lines.
  for (start_line, end_line) in iwyu_record.seen_forward_declare_lines:
    for line_number in xrange(start_line, end_line):
      if line_number >= len(file_lines):
        raise FixIncludesError('iwyu line number %s:%d is past file-end'
                               % (iwyu_record.filename, line_number))
      file_lines[line_number].type = _FORWARD_DECLARE_RE

  # While we're at it, let's do a bit more sanity checking on iwyu_record.
  for line_number in iwyu_record.lines_to_delete:
    if line_number >= len(file_lines):
      raise FixIncludesError('iwyu line number %s:%d is past file-end'
                             % (iwyu_record.filename, line_number))
    elif file_lines[line_number].type not in (_INCLUDE_RE,
                                              _FORWARD_DECLARE_RE):
      raise FixIncludesError('iwyu line number %s:%d (%s) is not'
                             ' an #include or forward declare'
                             % (iwyu_record.filename, line_number,
                                file_lines[line_number].line))

  # Check if this file has a header guard, which for our purposes is
  # an #ifdef (or #if) that covers an entire source file.  Usually
  # this will be a standard .h header-guard, but it could be something
  # like '#if __linux/#endif'.  The point here is that if an #ifdef
  # encloses the entire file, then we are willing to put new
  # #includes/fwd-declares inside the #ifdef (which normally we
  # wouldn't do).  So we mark such #ifdefs with a special label.
  _MarkHeaderGuardIfPresent(file_lines)
def _PreviousNondeletedLine(file_lines, line_number):
  """Returns the line number of the previous not-deleted line, or None."""
  candidate = line_number - 1
  while candidate >= 0:
    if not file_lines[candidate].deleted:
      return candidate
    candidate -= 1
  return None
def _NextNondeletedLine(file_lines, line_number):
  """Returns the line number of the next not-deleted line, or None."""
  candidate = line_number + 1
  num_lines = len(file_lines)
  while candidate < num_lines:
    if not file_lines[candidate].deleted:
      return candidate
    candidate += 1
  return None
- def _LineNumberStartingPrecedingComments(file_lines, line_number):
- """Returns the line-number for the comment-lines preceding the given linenum.
- Looking at file_lines, look at the lines immediately preceding the
- given line-number. If they're comment lines, return the first line
- of the comment lines preceding the given line. Otherwise, return
- the given line number.
- As a special case, if the comments go all the way up to the first
- line of the file (line 1), we assume they're comment lines, which
- are special -- they're not associated with any source code line --
- and we return line_number in that case.
- Arguments:
- file_lines: an array of LineInfo objects, with .type fields filled in.
- line_number: an index into file_lines.
- Returns:
- The first line number of the preceding comments, or line_number
- if there are no preceding comments or they appear to be a
- top-of-file copyright notice.
- """
- retval = line_number
- while retval > 0 and file_lines[retval - 1].type == _COMMENT_LINE_RE:
- retval -= 1
- if retval <= 1: # top-of-line comments
- retval = line_number # so ignore all the comment lines
- return retval
def _CalculateMoveSpans(file_lines, forward_declare_spans):
  """Fills each input_line's move_span field.

  A 'move span' is a range of lines (from file_lines) that includes
  an #include or forward-declare, and all the comments preceding it.
  It is the unit we would move if we decided to move (or delete) this
  #include or forward-declare.

  For lines of type _INCLUDE_RE or _FORWARD_DECLARE_RE, the move span
  is set to the tuple [start_of_span, end_of_span).  All other lines
  have the move span kept at None.

  Arguments:
    file_lines: an array of LineInfo objects, with .type fields filled in.
    forward_declare_spans: a set of line-number pairs
       [start_line, end_line), each representing a single namespace.
       In practice this comes from iwyu_record.seen_forward_declare_lines.
  """
  # #includes first: each one's span starts at its preceding comments.
  for (line_number, line_info) in enumerate(file_lines):
    if line_info.type == _INCLUDE_RE:
      span_begin = _LineNumberStartingPrecedingComments(file_lines, line_number)
      include_span = (span_begin, line_number + 1)
      for i in xrange(span_begin, line_number + 1):
        file_lines[i].move_span = include_span
  # Forward-declares next: their spans are handed to us by iwyu, but
  # we still widen them to cover preceding comments.
  for (span_begin, span_end) in forward_declare_spans:
    span_begin = _LineNumberStartingPrecedingComments(file_lines, span_begin)
    fwd_decl_span = (span_begin, span_end)
    for i in xrange(span_begin, span_end):
      file_lines[i].move_span = fwd_decl_span
def _ContainsBarrierInclude(file_lines, line_range):
  """Returns true iff some line in [line_range[0], line_range[1]) is BARRIER.

  Arguments:
    file_lines: an array of LineInfo objects.
    line_range: a (start_line, end_line) pair; end_line is exclusive.
  """
  # The deprecated apply() builtin (apply(xrange, line_range)) is
  # replaced by unpacking the pair directly.
  (start_line, end_line) = line_range
  for line_number in xrange(start_line, end_line):
    if (not file_lines[line_number].deleted and
        _BARRIER_INCLUDES.search(file_lines[line_number].line)):
      return True
  return False
def _LinesAreAllBlank(file_lines, start_line, end_line):
  """Returns true iff every line in [start_line, end_line) is deleted/blank."""
  # A line 'counts' as blank if it has been deleted or was classified as
  # a blank line; any other line makes the whole range non-blank.
  return all(file_lines[i].deleted or file_lines[i].type == _BLANK_LINE_RE
             for i in range(start_line, end_line))
def _CalculateReorderSpans(file_lines):
  """Fills each input_line's reorder_span field.

  A 'reorder span' is a maximal run of lines containing nothing but
  #includes, forward-declares, and the blank lines and comments that
  accompany them: no functions, no variable assignments, no macro
  #defines, no other 'real code'.  Within such a span we are free to
  rearrange #includes and forward-declares at will.

  Since move spans already obey the 'no actual code' rule, a reorder
  span is simply the union of consecutive move spans that are separated
  only by blank lines.  There is one exception: a move span matching
  the _BARRIER_INCLUDES regexp acts as a fence -- nothing may be
  reordered from one side of it to the other (used for #includes that
  must stay after other #includes to work).  Such a move span is placed
  in a reorder span all by itself.

  Lines of type _INCLUDE_RE or _FORWARD_DECLARE_RE get reorder_span set
  to the covering [start_of_span, end_of_span) tuple; all other lines
  end up with an arbitrary value there.

  Arguments:
    file_lines: an array of LineInfo objects with .type and .move_span
      fields filled in.
  """
  # Move spans never overlap, so sorting the de-duplicated set yields
  # them in file order.
  unique_spans = sorted(set(s.move_span for s in file_lines
                            if s.move_span is not None))
  num_spans = len(unique_spans)
  index = 0
  while index < num_spans:
    span_start = unique_spans[index][0]
    # A barrier include always stands alone.  Any other move span
    # absorbs each following move span that is reachable across blank
    # lines only and is itself not a barrier.
    if not _ContainsBarrierInclude(file_lines, unique_spans[index]):
      while index + 1 < num_spans:
        gap_begin = unique_spans[index][1]
        gap_end = unique_spans[index + 1][0]
        if (_LinesAreAllBlank(file_lines, gap_begin, gap_end)
            and not _ContainsBarrierInclude(file_lines,
                                            unique_spans[index + 1])):
          index += 1
        else:
          break
    span_end = unique_spans[index][1]
    # Stamp the full extent onto every line the reorder span covers.
    for line_number in range(span_start, span_end):
      file_lines[line_number].reorder_span = (span_start, span_end)
    index += 1
def ParseOneFile(f, iwyu_record):
  """Given a file object, read and classify the lines of the file.

  For each file that iwyu_output mentions, we return a list of LineInfo
  objects, which is a parsed version of each line, including not only
  its content but its 'type', its 'key', etc.

  Arguments:
    f: an iterable object returning lines from a file.
    iwyu_record: the IWYUOutputRecord struct for this source file.

  Returns:
    An array of LineInfo objects.  Element 0 is always a dummy, so the
    file's first line lives at retval[1] -- matching iwyu's 1-based
    line numbering.
  """
  # The leading LineInfo(None) is the 1-based-numbering placeholder.
  file_lines = [LineInfo(None)]
  file_lines.extend(LineInfo(line) for line in f)
  # Classify each line, then compute the move and reorder spans that
  # later passes use to shuffle #includes and forward-declares around.
  _CalculateLineTypesAndKeys(file_lines, iwyu_record)
  _CalculateMoveSpans(file_lines, iwyu_record.seen_forward_declare_lines)
  _CalculateReorderSpans(file_lines)
  return file_lines
def _DeleteEmptyNamespaces(file_lines):
  """Delete namespaces with nothing in them.

  Empty namespaces could be caused by transformations that removed
  forward-declarations:
        namespace foo {
        class Myclass;
        }
     ->
        namespace foo {
        }
  We want to get rid of the 'empty' namespace in this case.

  This routine 'deletes' lines by setting their 'deleted' field to True.

  Arguments:
    file_lines: an array of LineInfo objects with .type fields filled in.

  Returns:
    The number of namespaces deleted.
  """
  num_namespaces_deleted = 0
  start_line = 0
  # Outer loop: scan for a namespace-start line, then walk forward to
  # decide whether everything up to the matching close is ignorable.
  while start_line < len(file_lines):
    line_info = file_lines[start_line]
    if line_info.deleted or line_info.type != _NAMESPACE_START_RE:
      start_line += 1
      continue
    # Because multiple namespaces can be on one line
    # ("namespace foo { namespace bar { ..."), we need to count.
    # We use the max because line may have 0 '{'s if it's a macro.
    # TODO(csilvers): ignore { in comments.
    namespace_depth = max(line_info.line.count('{'), 1)
    end_line = start_line + 1
    while end_line < len(file_lines):
      line_info = file_lines[end_line]
      if line_info.deleted:
        end_line += 1
      elif line_info.type in (_COMMENT_LINE_RE, _BLANK_LINE_RE):
        end_line += 1                # ignore blank lines
      elif line_info.type == _NAMESPACE_START_RE:     # nested namespace
        namespace_depth += max(line_info.line.count('{'), 1)
        end_line += 1
      elif line_info.type == _NAMESPACE_END_RE:
        namespace_depth -= max(line_info.line.count('}'), 1)
        end_line += 1
        if namespace_depth <= 0:
          # Everything from start_line to here was deleted/blank/comment,
          # so the whole namespace region can go.
          # Delete any comments preceding this namespace as well.
          start_line = _LineNumberStartingPrecedingComments(file_lines,
                                                            start_line)
          # And also blank lines.
          while (start_line > 0 and
                 file_lines[start_line-1].type == _BLANK_LINE_RE):
            start_line -= 1
          for line_number in xrange(start_line, end_line):
            file_lines[line_number].deleted = True
          num_namespaces_deleted += 1
          break
      else:   # bail: we're at a line indicating this isn't an empty namespace
        end_line = start_line + 1  # rewind to try again with nested namespaces
        break
    # Resume the outer scan right after the region we just examined.
    # After a bail, end_line was rewound to start_line + 1, so any
    # nested namespace-start lines get their own chance.
    start_line = end_line
  return num_namespaces_deleted
def _DeleteEmptyIfdefs(file_lines):
  """Deletes ifdefs with nothing in them.

  This could be caused by transformations that removed #includes:
        #ifdef OS_WINDOWS
        # include <windows.h>
        #endif
     ->
        #ifdef OS_WINDOWS
        #endif
  We want to get rid of the 'empty' #ifdef in this case.
  We also handle 'empty' #ifdefs with #else, if both sides of
  the #else are empty.  We also handle #ifndef and #if.

  This routine 'deletes' lines by setting their 'deleted' field to True.

  Arguments:
    file_lines: an array of LineInfo objects with .type fields filled in.

  Returns:
    The number of ifdefs deleted.
  """
  num_ifdefs_deleted = 0
  start_line = 0
  # Scan for an #if/#ifdef (including a header guard), then walk forward
  # to see whether only ignorable lines separate it from its #endif.
  while start_line < len(file_lines):
    if file_lines[start_line].type not in (_IF_RE, _HEADER_GUARD_RE):
      start_line += 1
      continue
    end_line = start_line + 1
    while end_line < len(file_lines):
      line_info = file_lines[end_line]
      if line_info.deleted:
        end_line += 1
      elif line_info.type in (_ELSE_RE, _COMMENT_LINE_RE, _BLANK_LINE_RE):
        end_line += 1                # ignore blank lines and bare #else
      elif line_info.type == _ENDIF_RE:
        end_line += 1
        # Only deleted/blank/comment/#else lines were seen between the
        # #if and this #endif, so the whole conditional block can go.
        # Delete any comments preceding this #ifdef as well.
        start_line = _LineNumberStartingPrecedingComments(file_lines,
                                                          start_line)
        # And also blank lines.
        while (start_line > 0 and
               file_lines[start_line-1].type == _BLANK_LINE_RE):
          start_line -= 1
        for line_number in xrange(start_line, end_line):
          file_lines[line_number].deleted = True
        num_ifdefs_deleted += 1
        break
      else:   # bail: we're at a line indicating this isn't an empty ifdef
        end_line = start_line + 1  # rewind to try again with nested #ifdefs
        break
    # Resume the outer scan right after the region we just examined.
    start_line = end_line
  return num_ifdefs_deleted
def _DeleteDuplicateLines(file_lines, line_ranges):
  """Goes through all lines in line_ranges, and if any are dups, deletes them.

  For all lines in line_ranges, if any is the same as a previously
  seen line, set its deleted bit to True.  The purpose of line_ranges
  is to avoid lines in #ifdefs and namespaces, that may be identical
  syntactically but have different semantics.  Ideally, line_ranges
  should include only 'top-level' lines.

  We ignore lines that consist only of comments (or are blank).  We
  ignore end-of-line comments when comparing lines for equality.
  NOTE: Because our comment-finding RE is primitive, it's best if
  line_ranges covers only #include and forward-declare lines.  In
  particular, it should not cover lines that may have C literal
  strings in them.

  Arguments:
    file_lines: an array of LineInfo objects.
    line_ranges: a list of [start_line, end_line) pairs.
  """
  seen_lines = set()
  for line_range in line_ranges:
    # range(*line_range) replaces the original apply(xrange, line_range):
    # apply() has been deprecated since Python 2.3 and was removed in
    # Python 3; argument unpacking is the direct equivalent.
    for line_number in range(*line_range):
      if file_lines[line_number].type in (_BLANK_LINE_RE, _COMMENT_LINE_RE):
        continue
      # Strip the trailing comment so '#include <x.h>  // for X'
      # compares equal to a bare '#include <x.h>'.
      uncommented_line = _COMMENT_RE.sub('', file_lines[line_number].line)
      if uncommented_line in seen_lines:
        file_lines[line_number].deleted = True
      elif not file_lines[line_number].deleted:
        # Only a still-live line may serve as the 'original' that later
        # duplicates are compared against.
        seen_lines.add(uncommented_line)
def _DeleteExtraneousBlankLines(file_lines, line_range):
  """Deletes extraneous blank lines caused by line deletion.

  Here's an example file:
     class Foo { ... };

     class Bar;

     class Baz { ... }

  If we delete the "class Bar;" line, we also want to delete one of
  the blank lines around it, otherwise we leave two blank lines
  between Foo and Baz which looks bad.  The idea is that if we have
  whitespace on both sides of a deleted span of code, the whitespace
  on one of the sides is 'extraneous'.  In this case, we should delete
  not only 'class Bar;' but also the whitespace line below it.  That
  leaves one blank line between Foo and Bar, like people would expect.

  We're careful to only delete the minimum of the number of blank
  lines that show up on either side.  If 'class Bar' had one blank
  line before it, and one hundred after it, we'd only delete one blank
  line when we delete 'class Bar'.  This matches users' expectations.

  The situation can get tricky when two deleted spans touch (we might
  think it's safe to delete the whitespace between them when it's
  not).  To be safe, we only do this check when an entire reorder-span
  has been deleted.  So we check the given line_range, and only do
  blank-line deletion if every line in the range is deleted.

  Arguments:
    file_lines: an array of LineInfo objects, with .type filled in.
    line_range: a range [start_line, end_line).  It should correspond
      to a reorder-span.
  """
  # First make sure the entire span is deleted; if any line survived,
  # the surrounding blank lines still separate real content.
  # range(*line_range) replaces the original apply(xrange, line_range):
  # apply() has been deprecated since Python 2.3 and was removed in
  # Python 3; argument unpacking is the direct equivalent.
  for line_number in range(*line_range):
    if not file_lines[line_number].deleted:
      return
  before_line = _PreviousNondeletedLine(file_lines, line_range[0])
  after_line = _NextNondeletedLine(file_lines, line_range[1] - 1)
  while (before_line and file_lines[before_line].type == _BLANK_LINE_RE and
         after_line and file_lines[after_line].type == _BLANK_LINE_RE):
    # OK, we've got whitespace on both sides of a deleted span.  We
    # only want to keep whitespace on one side, so delete on the other.
    file_lines[after_line].deleted = True
    before_line = _PreviousNondeletedLine(file_lines, before_line)
    after_line = _NextNondeletedLine(file_lines, after_line)
def _ShouldInsertBlankLine(decorated_move_span, next_decorated_move_span,
                           file_lines, flags):
  """Returns true iff we should insert a blank line between the two spans.

  Given two decorated move-spans, of the form
     (reorder_range, kind, noncomment_lines, all_lines)
  returns true if we should insert a blank line between them.  We
  always put a blank line when transitioning from an #include to a
  forward-declare and back.  When the appropriate commandline flag is
  set, we also put a blank line between the 'main' includes (foo.h)
  and the C/C++ system includes, and another between the system
  includes and the rest of the Google includes.

  If the two move spans are in different reorder_ranges, that means
  the first move_span is at the end of a reorder range.  In that case,
  a different rule for blank lines applies: if the next line is
  contentful (eg 'static int x = 5;'), or a namespace start, we want
  to insert a blank line to separate the move-span from the next
  block.  When figuring out if the next line is contentful, we skip
  over comments.

  Arguments:
    decorated_move_span: a decorated_move_span we may want to put a blank
      line after.
    next_decorated_move_span: the next decorated_move_span, which may
      be a sentinel decorated_move_span at end-of-file.
    file_lines: an array of LineInfo objects with .deleted filled in.
    flags: commandline flags, as parsed by optparse.  We use
      flags.blank_lines, which controls whether we put blank
      lines between different 'kinds' of #includes.

  Returns:
    true if we should insert a blank line after decorated_move_span.
    NOTE(review): the early return below yields the raw short-circuit
    value (which may be None/0 rather than a bool); callers presumably
    only test truthiness -- confirm before tightening to bool().
  """
  # First handle the 'at the end of a reorder range' case.
  if decorated_move_span[0] != next_decorated_move_span[0]:
    next_line = _NextNondeletedLine(file_lines, decorated_move_span[0][1] - 1)
    # Skip over comments to figure out if the next line is contentful.
    # (next_line is falsy when no non-deleted line follows -- the 'and'
    # chains below guard on that before indexing file_lines.)
    while (next_line and next_line < len(file_lines) and
           file_lines[next_line].type == _COMMENT_LINE_RE):
      next_line += 1
    # A type of None marks a line our classifier didn't recognize,
    # i.e. 'real code', which should be set off by a blank line.
    return (next_line and next_line < len(file_lines) and
            file_lines[next_line].type in (_NAMESPACE_START_RE, None))
  # We never insert a blank line between two spans of the same kind.
  # Nor do we ever insert a blank line at EOF.
  (this_kind, next_kind) = (decorated_move_span[1], next_decorated_move_span[1])
  if this_kind == next_kind or next_kind == _EOF_KIND:
    return False
  # We also never insert a blank line between C and C++-style #includes,
  # no matter what the flag value.
  if (this_kind in [_C_SYSTEM_INCLUDE_KIND, _CXX_SYSTEM_INCLUDE_KIND] and
      next_kind in [_C_SYSTEM_INCLUDE_KIND, _CXX_SYSTEM_INCLUDE_KIND]):
    return False
  # Handle the case we're going from an include to fwd declare or
  # back.  If we get here, we can't both be fwd-declares, so it
  # suffices to check if either of us is.
  if this_kind == _FORWARD_DECLARE_KIND or next_kind == _FORWARD_DECLARE_KIND:
    return True
  # Now, depending on the flag, we insert a blank line whenever the
  # kind changes (we handled the one case where a changing kind
  # doesn't introduce a blank line, above).
  if flags.blank_lines:
    return this_kind != next_kind
  return False
- def _GetToplevelReorderSpans(file_lines):
- """Returns a sorted list of all reorder_spans not inside an #ifdef/namespace.
- This routine looks at all the reorder_spans in file_lines, ignores
- reorder spans inside #ifdefs and namespaces -- except for the 'header
- guard' ifdef that encapsulates an entire .h file -- and returns the
- rest in sorted order.
- Arguments:
- file_lines: an array of LineInfo objects with .type and
- .reorder_span filled in.
- Returns:
- A list of [start_line, end_line) reorder_spans.
- """
- in_ifdef = [False] * len(file_lines) # lines insid…
Large files files are truncated, but you can click here to view the full file