jitfuzz.py - This Python script is a fuzz tester for the by…

/Misc/jitfuzz.py

http://unladen-swallow.googlecode.com/ · Python · 198 lines · 155 code · 6 blank · 37 comment · 5 complexity · a0ecae3371d166c46d81e05432ac281f MD5 · raw file


#!/usr/bin/env python

"""Fuzz tester for the bytecode -> LLVM IR compiler.

The fuzzer implements two strategies for attacking the compiler:
  - Generate a random code object.
  - Take a known-good code object and change one byte to a random replacement.

Neither of these is terribly sophisticated, but combined were sufficient to
expose multiple fatal errors in the compiler. Strategies that were tried, but
failed to find further problems:
  - Very large code objects with thousands of opcodes.
  - Take a known-good code object and shuffle the opcodes without fixing jump
    targets.
  - Take a known-good code object and shuffle the opcodes, fixing jump targets.
  - Take a known-good code object and replace opcodes with other valid opcodes
    (jump opcodes replace other jump opcodes, nullary opcodes replace other
    nullary opcodes, etc).

The code objects produced by these strategies would either be caught by the
JIT's bytecode validator or would be compiled successfully. The experience was
that the compiler has no trouble with syntactically-correct bytecode, even if
the semantics are invalid.

The fuzzer has yet to generate bytecode that causes problems for LLVM; all
errors so far have been in the bytecode -> LLVM IR frontend.

Example:
  /unladen/swallow/python jitfuzz.py --random_seed=12345678
"""

# Python imports
import opcode
import optparse
import random
import sys
import traceback
import types


def find_code_objects(*modules):
    """Find most code objects in the given modules.

    Visits each module's top-level functions and the functions defined
    directly in the body of each top-level class. Nested functions, inherited
    methods and functions hidden behind other descriptors are not visited.

    Args:
        *modules: module objects to scan.

    Yields:
        The code object of every function found.
    """
    # Python 2 old-style classes are instances of types.ClassType rather than
    # type; fall back to plain `type` where ClassType does not exist.
    class_types = (type, getattr(types, "ClassType", type))
    for module in modules:
        for val in vars(module).values():
            if isinstance(val, types.FunctionType):
                yield val.__code__
            if isinstance(val, class_types):
                for x in vars(val).values():
                    # A class __dict__ stores its methods as plain functions;
                    # method objects are only created on attribute access, so
                    # testing for MethodType alone would never match.
                    if isinstance(x, (types.FunctionType, types.MethodType)):
                        yield x.__code__


# These are known-good code objects for us to screw with. They are collected
# once, at import time, from the traceback, optparse and random modules.
CODE_OBJS = list(find_code_objects(traceback, optparse, random))

# The order of this list must match the order of parameters to types.CodeType().
# Each entry is the name of a code object attribute without its "co_" prefix.
CODE_ATTRS = ["argcount", "nlocals", "stacksize", "flags", "code",
              "consts", "names", "varnames", "filename", "name",
              "firstlineno", "lnotab", "freevars", "cellvars"]


def stderr(message, *args):
    """Write a %-formatted message, followed by a newline, to sys.stderr.

    Args:
        message: format string using % placeholders.
        *args: values substituted into message.
    """
    formatted = message % args
    sys.stderr.write("%s\n" % (formatted,))


def init_random_seed(random_seed):
    """Seed the global random number generator and return the seed used.

    Args:
        random_seed: integer seed, or -1 to have a seed picked at random.

    Returns:
        The seed that was actually passed to random.seed().
    """
    seed = random_seed
    if seed == -1:
        # No seed was requested: draw one and hand it back to the caller.
        seed = int(random.random() * 1e9)
    random.seed(seed)
    return seed


def clone_code_object(code_obj, **changes):
    """Build a new code object from an existing one, overriding some fields.

    Example:
        clone_code_object(code, code=new_bytecode, flags=new_flags)

    Args:
        code_obj: the code object whose attributes provide the defaults.
        **changes: overrides keyed by names from CODE_ATTRS (no "co_"
          prefix); keys that are not in CODE_ATTRS are ignored.

    Returns:
        A new code object.
    """
    def pick(attr):
        # An explicit override wins; otherwise read co_<attr> from code_obj.
        if attr in changes:
            return changes[attr]
        return getattr(code_obj, "co_" + attr)

    return types.CodeType(*[pick(attr) for attr in CODE_ATTRS])


def random_int(lower=0, upper=10):
    """Return a random integer N with lower <= N <= upper (both inclusive)."""
    # randrange() excludes its stop value, hence the + 1.
    return random.randrange(lower, upper + 1)


def random_char(lower=1, upper=255):
    """Return a one-character string with ordinal in [lower, upper]."""
    ordinal = random.randint(lower, upper)
    return chr(ordinal)


def random_string(length=None):
    """Return a string of the given length, or of a random length.

    Args:
        length: number of characters wanted; when None, a length between 0
          and 5000 is drawn with random_int().

    Returns:
        A string made only of the letter "a".
    """
    size = random_int(upper=5000) if length is None else length
    # Not random, but nothing looks at the contents of the strings.
    return "a" * size


def random_list(func, length=None):
    """Build a list by calling func() once per element.

    Args:
        func: zero-argument callable producing one element per call.
        length: number of elements wanted; when None, a length between 0 and
          500 is drawn with random_int().

    Returns:
        A list holding the results of the func() calls, in call order.
    """
    if length is None:
        length = random_int(upper=500)
    # range() instead of xrange(): the result is the same, building the index
    # list is harmless at the sizes used in this file, and range() is
    # available on both Python 2 and Python 3.
    return [func() for _ in range(length)]


def random_object():
    """Return one object picked at random from a small fixed menu.

    The menu mixes several kinds of value: None, a bool, a float, a function
    (random_list itself), a byte string, a unicode string, a tuple and a
    fresh empty list.
    """
    candidates = [None, True, 3e8, random_list,
                  "foo", u"bar", (9,), []]
    return random.choice(candidates)


def random_code_object():
    """Build a code object whose fields are filled with random values.

    Half of the time the varnames tuple is given exactly nlocals entries;
    otherwise its length is drawn independently of nlocals.

    Returns:
        A new types.CodeType instance.
    """
    # Keep the order of the random draws below unchanged: it determines what
    # a given seed generates.
    consistent_locals = random.random() < 0.5

    n_args = random_int()
    n_locals = random_int(upper=100)
    stack_depth = random_int(upper=10000)
    co_flags = random_int(upper=1024)
    bytecode = random_string()
    consts = tuple(random_list(random_object))
    global_names = tuple(random_list(random_string))
    func_name = random_string()
    first_line = random_int(lower=-1000, upper=1000)
    free_names = tuple(random_list(random_string))
    cell_names = tuple(random_list(random_string))
    # A length of None lets random_list() pick its own random length.
    local_names = tuple(random_list(
        random_string, n_locals if consistent_locals else None))

    return types.CodeType(
        n_args, n_locals, stack_depth, co_flags, bytecode,
        consts, global_names, local_names, "attack-jit.py", func_name,
        first_line, "", free_names, cell_names)


def permute_code_object(baseline):
    """Clone a code object, replacing one random byte of its bytecode.

    Args:
        baseline: code object to start from; it is not modified.

    Returns:
        A clone of baseline carrying the modified bytecode.
    """
    original = baseline.co_code
    # The replacement is drawn before the position; swapping the two draws
    # would change what a given seed produces.
    replacement = random_char()
    position = random.randint(0, len(original) - 1)
    mutated = original[:position] + replacement + original[position + 1:]
    return clone_code_object(baseline, code=mutated)


def generate_code():
    """Endlessly yield code objects for the fuzzer to try.

    Each object is, with equal probability, either built from scratch by
    random_code_object() or a one-byte mutation of a randomly chosen entry of
    CODE_OBJS.
    """
    while True:
        if random.random() >= 0.5:
            yield permute_code_object(random.choice(CODE_OBJS))
            continue
        yield random_code_object()


def attack_jit():
    """Feed generated code objects to the JIT forever, reporting progress.

    Each code object gets co_use_jit set and then co_optimization set to 2.
    If the latter assignment raises, the traceback is printed and the object
    counts as rejected; otherwise it counts as valid. A summary is printed
    every 100 code objects. This function never returns normally.
    """
    # Track how many code objects are approved by the validator. If too many
    # are being rejected by the validator, we're not stressing LLVM enough.
    valid = 0
    rejected = 0
    for i, code in enumerate(generate_code()):
        code.co_use_jit = True
        try:
            code.co_optimization = 2
        except Exception:
            # Deliberately not a bare "except:": KeyboardInterrupt and
            # SystemExit must still be able to stop this endless loop instead
            # of being counted as a rejected code object.
            traceback.print_exc()
            rejected += 1
        else:
            valid += 1
        # i is zero-based, so the first report appears right after the first
        # code object has been tried.
        if i % 100 == 0:
            report = [
                "",
                "### %d attacks successfully repulsed" % i,
                "### Validated: %d; rejected: %d" % (valid, rejected),
                "",
            ]
            sys.stdout.write("\n".join(report) + "\n")


def main(argv):
    """Parse the command line, seed the RNG and start the fuzzer.

    Args:
        argv: full argument list, as in sys.argv. Only the -r/--random_seed
          option is used; positional arguments are ignored.
    """
    option_parser = optparse.OptionParser()
    option_parser.add_option("-r", "--random_seed", type="int", default=-1,
                             help="Random seed")
    parsed = option_parser.parse_args(argv)
    options = parsed[0]

    # Report the seed so that a run can be repeated with --random_seed.
    seed = init_random_seed(options.random_seed)
    stderr("Using random seed: %s", seed)
    attack_jit()


# Start the fuzzer only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main(sys.argv)

Summary ✨

This Python script is a fuzz tester for the bytecode -> LLVM IR compiler, designed to expose fatal errors in the compiler by generating and testing random code objects. It uses two strategies: generating new code objects randomly or modifying existing ones, and tracks the number of valid and rejected code objects.

Tech Fingerprint

Alerts (16)

'isinstance(' Overuse may indicate design issues; consider polymorphism
45 47 49
'list(' Avoid unnecessary list conversions; use generators where possible
54 152
'def' Ensure functions have docstrings for documentation
62 66 97 101 105 112 118 123 166 186
'except:' Avoid catching all exceptions; specify exception types to catch only expected errors
176