/util/find_copyrights.py
Python | 273 lines | 253 code | 19 blank | 1 comment | 48 complexity | 002a84b5022f94579098e69b5b62f16b MD5 | raw file
Possible License(s): BSD-3-Clause, LGPL-2.1
- #!/usr/bin/env python
- import os
- import re
- import sys
- from file_types import lang_type, find_files
# Patterns used to recognise comment syntax while scanning for the leading
# copyright block of a file.  All are raw strings so that the regex escapes
# (e.g. "\*") are not interpreted as (invalid) Python string escapes.
mode_line = re.compile(r'(-\*- *mode:.* *-\*-)')  # editor "-*- mode: x -*-" line
shell_comment = re.compile(r'^\s*#')
lisp_comment = re.compile(r';')
cpp_comment = re.compile(r'//')
c_comment_start = re.compile(r'/\*')
c_comment_end = re.compile(r'\*/')


def _line_comment_blocks(lines, comment_re, skip_shebang):
    """Yield (first, last) index pairs for leading line-comment blocks.

    A line belongs to a block when comment_re matches it.  The first line is
    skipped when it is a mode line or, if skip_shebang is true, starts with
    '#!'.  Scanning stops at the first non-blank, non-comment line that is
    seen while no block is open.
    """
    start = None
    for i, line in enumerate(lines):
        if i == 0:
            if (skip_shebang and line.startswith('#!')) or \
                    mode_line.search(line):
                continue

        if comment_re.search(line):
            if start is None:
                start = i
        elif start is None:
            if line.strip():
                return
        else:
            # First non-comment line after a block closes that block.
            yield start, i - 1
            start = None

    # A block that runs up to the end of the file has no following line to
    # close it, so report it here instead of dropping it.
    if start is not None:
        yield start, len(lines) - 1


def _c_comment_blocks(lines):
    """Yield (first, last) index pairs for leading C and C++ comment blocks.

    Runs of '//' lines form one block (split again whenever a line's text
    starts with "Copyright"); '/* ... */' comments form one block each.
    Raises AssertionError (via the asserts below) when a line outside a C
    comment contains both '//' and '/*'.
    """
    start = None
    mode = None  # None, 'C' (inside /* */) or 'CPP' (inside a run of //)
    for i, line in enumerate(lines):
        if i == 0 and mode_line.search(line):
            continue

        if mode == 'C':
            assert start is not None, 'on line %d' % (i + 1)
            if c_comment_end.search(line):
                yield start, i
                mode = None
            continue

        cpp_match = cpp_comment.search(line)
        c_match = c_comment_start.search(line)

        if cpp_match:
            assert not c_match, 'on line %d' % (i + 1)
            # Code in front of the '//' means the header comments are over.
            if line[:cpp_match.start()].strip():
                return

            if mode is None:
                mode = 'CPP'
                start = i
            else:
                text = line[cpp_match.end():].lstrip()
                if text.startswith("Copyright"):
                    # A new copyright statement starts a new block.
                    yield start, i - 1
                    start = i
            continue
        elif mode == 'CPP':
            assert start is not None, 'on line %d' % (i + 1)
            if not line.strip():
                continue

            yield start, i - 1
            mode = None
            if not c_match:
                return

        if c_match:
            assert mode is None, 'on line %d' % (i + 1)
            start = i
            # Search past the opener so that "/*/" is not mistaken for a
            # complete comment.  A comment closed on its own line is a
            # finished block; otherwise keep reading until "*/" shows up.
            if c_comment_end.search(line, c_match.end()):
                yield start, i
            else:
                mode = 'C'
            continue

        if mode is None and line.strip():
            return

    # A run of '//' lines that reaches the end of the file is still a block.
    if mode == 'CPP':
        yield start, len(lines) - 1


def find_copyright_block(lines, lang_type):
    """Generate the comment blocks found at the top of a source file.

    lines     -- list of the file's lines (line endings are not required)
    lang_type -- language name selecting the comment syntax

    Yields (first, last) tuples of inclusive indices into lines, one per
    comment block, stopping once real code is reached.

    Raises AttributeError for an unknown lang_type.  Because this is a
    generator, the error is raised when iteration starts, not at call time.
    """
    if lang_type in ('python', 'make', 'shell', 'perl', 'scons'):
        blocks = _line_comment_blocks(lines, shell_comment, True)
    elif lang_type in ('lisp', ):
        blocks = _line_comment_blocks(lines, lisp_comment, False)
    elif lang_type in ('C', 'C++', 'swig', 'isa', 'asm', 'slicc',
                       'lex', 'yacc'):
        blocks = _c_comment_blocks(lines)
    else:
        raise AttributeError("Could not handle language %s" % lang_type)

    for block in blocks:
        yield block
# A "YYYY-YYYY" year range, with optional whitespace around the dash.
date_range_re = re.compile(r'([0-9]{4})\s*-\s*([0-9]{4})')


def process_dates(dates):
    """Expand a comma separated list of years and year ranges.

    dates -- string such as "1999-2001, 2005"

    Returns a set of ints containing every individual year, with ranges
    expanded inclusively.  Fields that are neither a range nor an integer
    (including empty fields left by stray commas) are ignored.
    """
    years = set()
    for field in dates.split(','):
        field = field.strip()
        match = date_range_re.match(field)
        if match:
            first, last = (int(year) for year in match.groups())
            years.update(range(first, last + 1))
            continue

        try:
            years.add(int(field))
        except ValueError:
            # Deliberately best effort: the text comes from a loosely
            # matched copyright line, so junk fields are simply skipped.
            pass

    return years
# "Copyright (c) <years> <owner>".  The character classes spell out
# "A-Za-z": the range "A-z" would also accept the punctuation that sits
# between 'Z' and 'a' in ASCII ([ \ ] ^ _ `).
copyright_re = \
    re.compile(r'Copyright (\([cC]\)) ([-, 0-9]+)[\s*#/]*([A-Za-z,. -]+)',
               re.DOTALL)
# First author line ("Authors: Name") and the name-only lines that follow it.
authors_re = re.compile(r'^[\s*#/]*Authors:\s*([A-Za-z .]+)\s*$')
more_authors_re = re.compile(r'^[\s*#/]*([A-Za-z .]+)\s*$')

# Every owner string seen by get_data(), across all files.
all_owners = set()


def _trim_block_end(lines, first, end):
    """Return the adjusted end index once the author list has stopped.

    Scans lines[first..end]: a blank line becomes the new end; a '//' line
    that still carries text ends the block on the line before it.  If
    neither is found the original end is kept.
    """
    for j in range(first, end + 1):
        text = lines[j].strip()
        if not text:
            return j
        if text.startswith('//') and text[2:].lstrip():
            return j - 1
    return end


def get_data(lang_type, lines):
    """Extract copyright information from the header comments of a file.

    lang_type -- language name passed on to find_copyright_block()
    lines     -- list of the file's lines

    Returns a list of (owner, dates, authors, start, end) tuples, one per
    comment block that contains a copyright statement: owner is the holder
    string, dates a set of years, authors a list of names, and start/end
    the inclusive line indices of the block.  Each owner is also added to
    the module level all_owners set.
    """
    data = []
    for start, end in find_copyright_block(lines, lang_type):
        joined = ''.join(lines[start:end + 1])
        match = copyright_re.search(joined)
        if not match:
            continue

        # Group 1 is the "(c)" marker, which is not needed.
        dates, owner = (text.strip() for text in match.group(2, 3))

        all_owners.add(owner)
        try:
            dates = process_dates(dates)
        except Exception:
            # Show what was being parsed before passing the error on.
            print(dates)
            print(owner)
            raise

        authors = []
        for i in range(start, end + 1):
            line = lines[i]
            if not authors:
                # Still looking for the "Authors:" line.
                match = authors_re.search(line)
                if match:
                    authors.append(match.group(1).strip())
                continue

            match = more_authors_re.search(line)
            if not match:
                # Author list is over; pull the block end in if possible.
                end = _trim_block_end(lines, i, end)
                break
            authors.append(match.group(1).strip())

        data.append((owner, dates, authors, start, end))

    return data
def datestr(dates):
    """Format a collection of years as a compact string.

    dates -- iterable of int years, in any order

    Returns a comma separated string in ascending order in which runs of
    consecutive years are collapsed to "first-last", e.g. "1999-2001,2005".
    Duplicate years are counted once and an empty collection gives ''.
    """
    years = sorted(set(dates))
    if not years:
        return ''

    output = []

    def add_output(first, second):
        if first == second:
            output.append('%d' % (first))
        else:
            output.append('%d-%d' % (first, second))

    first = second = years[0]
    for year in years[1:]:
        if year == second + 1:
            # Still inside the current run of consecutive years.
            second = year
        else:
            add_output(first, second)
            first = second = year

    add_output(first, second)

    return ','.join(output)
# Help text; the single %s is filled in with the program name.
usage_str = """usage:
    %s [-v] <directory>"""


def usage(exitcode):
    """Print the usage message and optionally exit.

    exitcode -- process exit status passed to sys.exit(); when None the
                function returns after printing instead of exiting.
    """
    # Single-argument call form behaves the same on Python 2 and Python 3.
    print(usage_str % sys.argv[0])
    if exitcode is not None:
        sys.exit(exitcode)
# Command line entry point: collect copyright owners and years from the
# given files/directories and print one summary line per owner.
if __name__ == '__main__':
    import getopt

    show_counts = False
    # NOTE(review): values given with -i are collected here, but nothing in
    # this script consults the set afterwards -- confirm the intended use.
    ignore = set()
    verbose = False
    try:
        opts, args = getopt.getopt(sys.argv[1:], "ci:v")
    except getopt.GetoptError:
        usage(1)

    for o, a in opts:
        if o == '-c':
            show_counts = True
        elif o == '-i':
            ignore.add(a)
        elif o == '-v':
            verbose = True

    # Build the list of (filename, language) pairs to scan.
    files = []
    for base in args:
        if os.path.isfile(base):
            files.append((base, lang_type(base)))
        elif os.path.isdir(base):
            files += find_files(base)
        else:
            raise AttributeError("can't access '%s'" % base)

    copyrights = {}  # owner -> set of years
    counts = {}      # owner -> number of copyright blocks seen

    for filename, _lang in files:
        # Close the handle promptly; a large tree means many files.
        with open(filename, 'r') as f:
            lines = f.readlines()
        if not lines:
            continue

        lines = [line.rstrip('\r\n') for line in lines]

        # The language is re-derived with the first line available as a hint.
        lt = lang_type(filename, lines[0])
        try:
            data = get_data(lt, lines)
        except Exception as e:
            # Best effort: a file that cannot be parsed is skipped, and
            # only reported when -v was given.
            if verbose:
                if len(e.args) == 1:
                    e.args = ('%s (%s)' % (e, filename), )
                print("could not parse %s: %s" % (filename, e))
            continue

        for owner, dates, authors, start, end in data:
            copyrights.setdefault(owner, set()).update(dates)
            counts[owner] = counts.get(owner, 0) + 1

    info = [(counts[o], d, o) for o, d in copyrights.items()]

    # Order by count, then owner name.  Sorting the raw tuples would fall
    # back to comparing the year sets on ties, and set comparison is a
    # subset test, not a total order.
    info.sort(key=lambda item: (item[0], item[2]), reverse=True)
    for count, dates, owner in info:
        if show_counts:
            owner = '%s (%s files)' % (owner, count)
        print('Copyright (c) %s %s' % (datestr(dates), owner))