/pypy/jit/backend/x86/assembler.py
Python | 2580 lines | 1964 code | 229 blank | 387 comment | 411 complexity | a0ae27c3539b9fe6f39c3dbbf3975e23 MD5 | raw file
Large files files are truncated, but you can click here to view the full file
import sys, os
from pypy.jit.backend.llsupport import symbolic
from pypy.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
from pypy.jit.metainterp.history import Const, Box, BoxInt, ConstInt
from pypy.jit.metainterp.history import AbstractFailDescr, INT, REF, FLOAT
from pypy.jit.metainterp.history import JitCellToken
from pypy.rpython.lltypesystem import lltype, rffi, rstr, llmemory
from pypy.rpython.lltypesystem.lloperation import llop
from pypy.rpython.annlowlevel import llhelper
from pypy.rlib.jit import AsmInfo
from pypy.jit.backend.model import CompiledLoopToken
from pypy.jit.backend.x86.regalloc import (RegAlloc, get_ebp_ofs, _get_scale,
    gpr_reg_mgr_cls, _valid_addressing_size)
from pypy.jit.backend.x86.arch import (FRAME_FIXED_SIZE, FORCE_INDEX_OFS, WORD,
                                       IS_X86_32, IS_X86_64)
from pypy.jit.backend.x86.regloc import (eax, ecx, edx, ebx,
                                         esp, ebp, esi, edi,
                                         xmm0, xmm1, xmm2, xmm3,
                                         xmm4, xmm5, xmm6, xmm7,
                                         r8, r9, r10, r11,
                                         r12, r13, r14, r15,
                                         X86_64_SCRATCH_REG,
                                         X86_64_XMM_SCRATCH_REG,
                                         RegLoc, StackLoc, ConstFloatLoc,
                                         ImmedLoc, AddressLoc, imm,
                                         imm0, imm1, FloatImmedLoc, heap)
from pypy.rlib.objectmodel import we_are_translated, specialize
from pypy.jit.backend.x86 import rx86, regloc, codebuf
from pypy.jit.metainterp.resoperation import rop, ResOperation
from pypy.jit.backend.x86.support import values_array
from pypy.jit.backend.x86 import support
from pypy.rlib.debug import (debug_print, debug_start, debug_stop,
                             have_debug_prints)
from pypy.rlib import rgc
from pypy.rlib.clibffi import FFI_DEFAULT_ABI
from pypy.jit.backend.x86.jump import remap_frame_layout
from pypy.jit.codewriter.effectinfo import EffectInfo
from pypy.jit.codewriter import longlong
from pypy.rlib.rarithmetic import intmask
from pypy.rlib.objectmodel import compute_unique_id
# darwin requires the stack to be 16 bytes aligned on calls. Same for gcc 4.5.0,
# better safe than sorry
CALL_ALIGN = 16 // WORD

def align_stack_words(words):
    """Round 'words' up to the next multiple of CALL_ALIGN stack words.

    CALL_ALIGN is a power of two, so floor-modulo rounding is exactly
    equivalent to the usual mask-based trick.
    """
    remainder = words % CALL_ALIGN
    if remainder:
        words += CALL_ALIGN - remainder
    return words
class GuardToken(object):
    """Bookkeeping record for one guard whose recovery stub still has to
    be emitted at the end of the loop/bridge being assembled.

    faildescr -- the AbstractFailDescr attached to the guard operation
    failargs  -- the boxes listed as the guard's failargs
    fail_locs -- the locations (regs/stack) holding those boxes
    exc       -- whether an exception must be saved on failure
    is_guard_not_invalidated -- True for GUARD_NOT_INVALIDATED, which
                emits no inline jump and is patched lazily instead
    """

    def __init__(self, faildescr, failargs, fail_locs, exc,
                 is_guard_not_invalidated):
        self.is_guard_not_invalidated = is_guard_not_invalidated
        self.exc = exc
        self.fail_locs = fail_locs
        self.failargs = failargs
        self.faildescr = faildescr
# Raw structure used by the optional '--jit debug' counters: one is
# allocated per loop/bridge/label and incremented from generated code
# (see _register_counter() and _append_debugging_code()).
DEBUG_COUNTER = lltype.Struct('DEBUG_COUNTER', ('i', lltype.Signed),
                              ('type', lltype.Char), # 'b'ridge, 'l'abel or
                                                     # 'e'ntry point
                              ('number', lltype.Signed))
- class Assembler386(object):
- _regalloc = None
- _output_loop_log = None
    def __init__(self, cpu, translate_support_code=False,
                 failargs_limit=1000):
        """Create the assembler for 'cpu'.

        'failargs_limit' bounds how many guard-failure values fit in the
        fail_boxes_* arrays.  Most helper addresses start at 0 and are
        filled in later by setup_once().
        """
        self.cpu = cpu
        self.verbose = False
        self.rtyper = cpu.rtyper
        # arrays through which guard failures pass values back to the
        # interpreter, one per kind
        self.fail_boxes_int = values_array(lltype.Signed, failargs_limit)
        self.fail_boxes_ptr = values_array(llmemory.GCREF, failargs_limit)
        self.fail_boxes_float = values_array(longlong.FLOATSTORAGE,
                                             failargs_limit)
        self.fail_ebp = 0
        self.loop_run_counters = []
        # addresses of shared helper stubs, built by setup_once()
        self.float_const_neg_addr = 0
        self.float_const_abs_addr = 0
        self.malloc_slowpath1 = 0
        self.malloc_slowpath2 = 0
        self.memcpy_addr = 0
        self.setup_failure_recovery()
        self._debug = False
        self.debug_counter_descr = cpu.fielddescrof(DEBUG_COUNTER, 'i')
        self.fail_boxes_count = 0
        self.datablockwrapper = None
        self.stack_check_slowpath = 0
        self.propagate_exception_path = 0
        self.gcrootmap_retaddr_forced = 0
        # initialize the per-compilation attributes to None
        self.teardown()
    def leave_jitted_hook(self):
        """Called when execution leaves jitted code: the fail_boxes_ptr
        array was written by assembler without GC write barriers, so tell
        the GC it may now contain young pointers."""
        ptrs = self.fail_boxes_ptr.ar
        llop.gc_assume_young_pointers(lltype.Void,
                                      llmemory.cast_ptr_to_adr(ptrs))
    def set_debug(self, v):
        # enable/disable the per-loop run counters reported by finish_once()
        self._debug = v
    def setup_once(self):
        """One-time initialization: build every shared assembler stub
        (failure recovery, float constants, malloc/stack-check slow paths,
        GIL helpers) before the first loop is compiled."""
        # the address of the function called by 'new'
        gc_ll_descr = self.cpu.gc_ll_descr
        gc_ll_descr.initialize()
        self.memcpy_addr = self.cpu.cast_ptr_to_int(support.memcpy_fn)
        self._build_failure_recovery(False)
        self._build_failure_recovery(True)
        if self.cpu.supports_floats:
            self._build_failure_recovery(False, withfloats=True)
            self._build_failure_recovery(True, withfloats=True)
            support.ensure_sse2_floats()
            self._build_float_constants()
        self._build_propagate_exception_path()
        if gc_ll_descr.get_malloc_slowpath_addr is not None:
            self._build_malloc_slowpath()
        self._build_stack_check_slowpath()
        if gc_ll_descr.gcrootmap:
            self._build_release_gil(gc_ll_descr.gcrootmap)
        # decide once whether the counters are active, based on whether
        # the 'jit-backend-counts' debug channel is enabled
        debug_start('jit-backend-counts')
        self.set_debug(have_debug_prints())
        debug_stop('jit-backend-counts')
    def setup(self, looptoken):
        """Per-compilation initialization: open a fresh machine-code
        buffer and reset the pending-guard bookkeeping for 'looptoken'."""
        assert self.memcpy_addr != 0, "setup_once() not called?"
        self.current_clt = looptoken.compiled_loop_token
        self.pending_guard_tokens = []
        if WORD == 8:
            # 64-bit only: MemoryError trampolines patched after writing
            self.pending_memoryerror_trampoline_from = []
            self.error_trampoline_64 = 0
        self.mc = codebuf.MachineCodeBlockWrapper()
        #assert self.datablockwrapper is None --- but obscure case
        # possible, e.g. getting MemoryError and continuing
        allblocks = self.get_asmmemmgr_blocks(looptoken)
        self.datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr,
                                                        allblocks)
        self.target_tokens_currently_compiling = {}
- def teardown(self):
- self.pending_guard_tokens = None
- if WORD == 8:
- self.pending_memoryerror_trampoline_from = None
- self.mc = None
- self.current_clt = None
- def finish_once(self):
- if self._debug:
- debug_start('jit-backend-counts')
- for i in range(len(self.loop_run_counters)):
- struct = self.loop_run_counters[i]
- if struct.type == 'l':
- prefix = 'TargetToken(%d)' % struct.number
- elif struct.type == 'b':
- prefix = 'bridge ' + str(struct.number)
- else:
- prefix = 'entry ' + str(struct.number)
- debug_print(prefix + ':' + str(struct.i))
- debug_stop('jit-backend-counts')
    def _build_float_constants(self):
        """Allocate a 16-byte-aligned data block holding the two bitmask
        constants used for float NEG and ABS (sign-bit flip / clear),
        and remember their addresses."""
        datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr, [])
        float_constants = datablockwrapper.malloc_aligned(32, alignment=16)
        datablockwrapper.done()
        addr = rffi.cast(rffi.CArrayPtr(lltype.Char), float_constants)
        qword_padding = '\x00\x00\x00\x00\x00\x00\x00\x00'
        # 0x8000000000000000 (little-endian), XOR-mask flipping the sign bit
        neg_const = '\x00\x00\x00\x00\x00\x00\x00\x80'
        # 0x7FFFFFFFFFFFFFFF (little-endian), AND-mask clearing the sign bit
        abs_const = '\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F'
        data = neg_const + qword_padding + abs_const + qword_padding
        for i in range(len(data)):
            addr[i] = data[i]
        self.float_const_neg_addr = float_constants
        self.float_const_abs_addr = float_constants + 16
    def _build_malloc_slowpath(self):
        """Build the helper(s) called when a nursery allocation overflows.

        On entry, EAX holds nursery_free and EDX the requested end; the
        helper computes the size (EDX-EAX), calls the GC's slow-path
        malloc, and returns the new object in EAX with EDX reloaded from
        nursery_free.  Stores the helper addresses in malloc_slowpath1/2.
        """
        # With asmgcc, we need two helpers, so that we can write two CALL
        # instructions in assembler, with a mark_gc_roots in between.
        # With shadowstack, this is not needed, so we produce a single helper.
        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        shadow_stack = (gcrootmap is not None and gcrootmap.is_shadow_stack)
        #
        # ---------- first helper for the slow path of malloc ----------
        mc = codebuf.MachineCodeBlockWrapper()
        if self.cpu.supports_floats:          # save the XMM registers in
            for i in range(self.cpu.NUM_REGS):# the *caller* frame, from esp+8
                mc.MOVSD_sx((WORD*2)+8*i, i)
        mc.SUB_rr(edx.value, eax.value)       # compute the size we want
        addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
        #
        # The registers to save in the copy area: with shadowstack, most
        # registers need to be saved.  With asmgcc, the callee-saved registers
        # don't need to.
        save_in_copy_area = gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items()
        if not shadow_stack:
            save_in_copy_area = [(reg, ofs) for (reg, ofs) in save_in_copy_area
                   if reg not in gpr_reg_mgr_cls.REGLOC_TO_GCROOTMAP_REG_INDEX]
        #
        for reg, ofs in save_in_copy_area:
            mc.MOV_br(ofs, reg.value)
        #
        if shadow_stack:
            # ---- shadowstack ----
            mc.SUB_ri(esp.value, 16 - WORD)      # stack alignment of 16 bytes
            if IS_X86_32:
                mc.MOV_sr(0, edx.value)          # push argument
            elif IS_X86_64:
                mc.MOV_rr(edi.value, edx.value)
            mc.CALL(imm(addr))
            mc.ADD_ri(esp.value, 16 - WORD)
        else:
            # ---- asmgcc ----
            if IS_X86_32:
                mc.MOV_sr(WORD, edx.value)       # save it as the new argument
            elif IS_X86_64:
                # rdi can be clobbered: its content was saved in the
                # copy area of the stack
                mc.MOV_rr(edi.value, edx.value)
            mc.JMP(imm(addr))                    # tail call to the real malloc
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.malloc_slowpath1 = rawstart
        # ---------- second helper for the slow path of malloc ----------
        mc = codebuf.MachineCodeBlockWrapper()
        #
        for reg, ofs in save_in_copy_area:
            mc.MOV_rb(reg.value, ofs)
            assert reg is not eax and reg is not edx
        #
        if self.cpu.supports_floats:          # restore the XMM registers
            for i in range(self.cpu.NUM_REGS):# from where they were saved
                mc.MOVSD_xs(i, (WORD*2)+8*i)
        #
        # Note: we check this after the code above, just because the code
        # above is more than 127 bytes on 64-bits...
        mc.TEST_rr(eax.value, eax.value)      # EAX == 0 means malloc failed
        mc.J_il8(rx86.Conditions['Z'], 0) # patched later
        jz_location = mc.get_relative_pos()
        #
        nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
        mc.MOV(edx, heap(nursery_free_adr))   # load this in EDX
        mc.RET()
        #
        # If the slowpath malloc failed, we raise a MemoryError that
        # always interrupts the current loop, as a "good enough"
        # approximation.  Also note that we didn't RET from this helper;
        # but the code we jump to will actually restore the stack
        # position based on EBP, which will get us out of here for free.
        offset = mc.get_relative_pos() - jz_location
        assert 0 < offset <= 127
        mc.overwrite(jz_location-1, chr(offset))
        mc.JMP(imm(self.propagate_exception_path))
        #
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.malloc_slowpath2 = rawstart
    def _build_propagate_exception_path(self):
        """Build the stub jumped to when an exception must abort the loop:
        it saves the exception (defaulting to MemoryError), puts the
        special 'propagate_exception_v' value in EAX and returns through
        the standard footer."""
        if self.cpu.propagate_exception_v < 0:
            return      # not supported (for tests, or non-translated)
        #
        self.mc = codebuf.MachineCodeBlockWrapper()
        # call on_leave_jitted_save_exc()
        addr = self.cpu.get_on_leave_jitted_int(save_exception=True,
                                                default_to_memoryerror=True)
        self.mc.CALL(imm(addr))
        self.mc.MOV_ri(eax.value, self.cpu.propagate_exception_v)
        self._call_footer()
        rawstart = self.mc.materialize(self.cpu.asmmemmgr, [])
        self.propagate_exception_path = rawstart
        self.mc = None
    def _build_stack_check_slowpath(self):
        """Build the stub called by _call_header_with_stack_check() when
        the remaining C stack looks too small.  It calls the RPython
        stack-check function; if that set an exception, the stub unwinds
        directly to the caller's caller with propagate_exception_v in EAX.
        """
        _, _, slowpathaddr = self.cpu.insert_stack_check()
        if slowpathaddr == 0 or self.cpu.propagate_exception_v < 0:
            return      # no stack check (for tests, or non-translated)
        #
        # make a "function" that is called immediately at the start of
        # an assembler function.  In particular, the stack looks like:
        #
        #    |  ...                  |    <-- aligned to a multiple of 16
        #    |  retaddr of caller    |
        #    |  my own retaddr       |    <-- esp
        #    +-----------------------+
        #
        mc = codebuf.MachineCodeBlockWrapper()
        #
        stack_size = WORD
        if IS_X86_64:
            # on the x86_64, we have to save all the registers that may
            # have been used to pass arguments
            stack_size += 6*WORD + 8*8
            for reg in [edi, esi, edx, ecx, r8, r9]:
                mc.PUSH_r(reg.value)
            mc.SUB_ri(esp.value, 8*8)
            for i in range(8):
                mc.MOVSD_sx(8*i, i)     # xmm0 to xmm7
        #
        if IS_X86_32:
            stack_size += 2*WORD
            mc.PUSH_r(eax.value)        # alignment
            mc.PUSH_r(esp.value)        # pass the current esp as argument
        elif IS_X86_64:
            mc.MOV_rr(edi.value, esp.value)
        #
        # esp is now aligned to a multiple of 16 again
        mc.CALL(imm(slowpathaddr))
        #
        mc.MOV(eax, heap(self.cpu.pos_exception()))
        mc.TEST_rr(eax.value, eax.value)
        mc.J_il8(rx86.Conditions['NZ'], 0)    # patched below: skip if exc set
        jnz_location = mc.get_relative_pos()
        #
        # no exception: restore everything and return normally
        if IS_X86_32:
            mc.ADD_ri(esp.value, 2*WORD)    # cancel the two PUSHes above
        elif IS_X86_64:
            # restore the registers
            for i in range(7, -1, -1):
                mc.MOVSD_xs(i, 8*i)
            mc.ADD_ri(esp.value, 8*8)
            for reg in [r9, r8, ecx, edx, esi, edi]:
                mc.POP_r(reg.value)
        #
        mc.RET()
        #
        # patch the JNZ above
        offset = mc.get_relative_pos() - jnz_location
        assert 0 < offset <= 127
        mc.overwrite(jnz_location-1, chr(offset))
        # call on_leave_jitted_save_exc()
        addr = self.cpu.get_on_leave_jitted_int(save_exception=True)
        mc.CALL(imm(addr))
        #
        mc.MOV_ri(eax.value, self.cpu.propagate_exception_v)
        #
        # footer -- note the ADD, which skips the return address of this
        # function, and will instead return to the caller's caller.  Note
        # also that we completely ignore the saved arguments, because we
        # are interrupting the function.
        mc.ADD_ri(esp.value, stack_size)
        mc.RET()
        #
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.stack_check_slowpath = rawstart
    @staticmethod
    @rgc.no_collect
    def _release_gil_asmgcc(css):
        """Link the callshape structure 'css' into the asmgcc root-walking
        chain, then release the GIL.  Must not trigger a collection."""
        # similar to trackgcroot.py:pypy_asm_stackwalk, first part
        from pypy.rpython.memory.gctransform import asmgcroot
        new = rffi.cast(asmgcroot.ASM_FRAMEDATA_HEAD_PTR, css)
        next = asmgcroot.gcrootanchor.next
        new.next = next
        new.prev = asmgcroot.gcrootanchor
        asmgcroot.gcrootanchor.next = new
        next.prev = new
        # and now release the GIL
        before = rffi.aroundstate.before
        if before:
            before()
    @staticmethod
    @rgc.no_collect
    def _reacquire_gil_asmgcc(css):
        """Reacquire the GIL, then unlink 'css' from the asmgcc
        root-walking chain (reverse of _release_gil_asmgcc)."""
        # first reacquire the GIL
        after = rffi.aroundstate.after
        if after:
            after()
        # similar to trackgcroot.py:pypy_asm_stackwalk, second part
        from pypy.rpython.memory.gctransform import asmgcroot
        old = rffi.cast(asmgcroot.ASM_FRAMEDATA_HEAD_PTR, css)
        prev = old.prev
        next = old.next
        prev.next = next
        next.prev = prev
    @staticmethod
    @rgc.no_collect
    def _release_gil_shadowstack():
        # shadowstack needs no root-chain bookkeeping: just release the GIL
        before = rffi.aroundstate.before
        if before:
            before()
    @staticmethod
    @rgc.no_collect
    def _reacquire_gil_shadowstack():
        # shadowstack counterpart of _release_gil_shadowstack()
        after = rffi.aroundstate.after
        if after:
            after()
    # low-level function types for the GIL helpers above
    _NOARG_FUNC = lltype.Ptr(lltype.FuncType([], lltype.Void))
    _CLOSESTACK_FUNC = lltype.Ptr(lltype.FuncType([rffi.LONGP],
                                                  lltype.Void))

    def _build_release_gil(self, gcrootmap):
        """Pick the release/reacquire GIL helper pair matching the GC root
        strategy and store their raw addresses for use in call sites."""
        if gcrootmap.is_shadow_stack:
            releasegil_func = llhelper(self._NOARG_FUNC,
                                       self._release_gil_shadowstack)
            reacqgil_func = llhelper(self._NOARG_FUNC,
                                     self._reacquire_gil_shadowstack)
        else:
            releasegil_func = llhelper(self._CLOSESTACK_FUNC,
                                       self._release_gil_asmgcc)
            reacqgil_func = llhelper(self._CLOSESTACK_FUNC,
                                     self._reacquire_gil_asmgcc)
        self.releasegil_addr = self.cpu.cast_ptr_to_int(releasegil_func)
        self.reacqgil_addr = self.cpu.cast_ptr_to_int(reacqgil_func)
    def assemble_loop(self, loopname, inputargs, operations, looptoken, log):
        '''adds the following attributes to looptoken:
               _x86_function_addr   (address of the generated func, as an int)
               _x86_loop_code       (debug: addr of the start of the ResOps)
               _x86_fullsize        (debug: full size including failure)
               _x86_debug_checksum

        Returns an AsmInfo describing where the code was placed.
        '''
        # XXX this function is too longish and contains some code
        # duplication with assemble_bridge().  Also, we should think
        # about not storing on 'self' attributes that will live only
        # for the duration of compiling one loop or a one bridge.
        clt = CompiledLoopToken(self.cpu, looptoken.number)
        clt.allgcrefs = []
        looptoken.compiled_loop_token = clt
        if not we_are_translated():
            # Arguments should be unique
            assert len(set(inputargs)) == len(inputargs)
        self.setup(looptoken)
        if log:
            operations = self._inject_debugging_code(looptoken, operations,
                                                     'e', looptoken.number)
        regalloc = RegAlloc(self, self.cpu.translate_support_code)
        #
        self._call_header_with_stack_check()
        # LEA whose 32-bit offset is patched once frame_depth is known
        stackadjustpos = self._patchable_stackadjust()
        clt._debug_nbargs = len(inputargs)
        operations = regalloc.prepare_loop(inputargs, operations,
                                           looptoken, clt.allgcrefs)
        looppos = self.mc.get_relative_pos()
        looptoken._x86_loop_code = looppos
        clt.frame_depth = -1     # temporarily
        frame_depth = self._assemble(regalloc, operations)
        clt.frame_depth = frame_depth
        #
        size_excluding_failure_stuff = self.mc.get_relative_pos()
        self.write_pending_failure_recoveries()
        full_size = self.mc.get_relative_pos()
        #
        rawstart = self.materialize_loop(looptoken)
        debug_start("jit-backend-addr")
        debug_print("Loop %d (%s) has address %x to %x (bootstrap %x)" % (
            looptoken.number, loopname,
            rawstart + looppos,
            rawstart + size_excluding_failure_stuff,
            rawstart))
        debug_stop("jit-backend-addr")
        # now that the final address is known, patch the stack-adjust LEA
        # and make the guards jump to their recovery stubs
        self._patch_stackadjust(rawstart + stackadjustpos, frame_depth)
        self.patch_pending_failure_recoveries(rawstart)
        #
        ops_offset = self.mc.ops_offset
        if not we_are_translated():
            # used only by looptoken.dump() -- useful in tests
            looptoken._x86_rawstart = rawstart
            looptoken._x86_fullsize = full_size
            looptoken._x86_ops_offset = ops_offset
        looptoken._x86_function_addr = rawstart
        self.fixup_target_tokens(rawstart)
        self.teardown()
        # oprofile support
        if self.cpu.profile_agent is not None:
            name = "Loop # %s: %s" % (looptoken.number, loopname)
            self.cpu.profile_agent.native_code_written(name,
                                                       rawstart, full_size)
        return AsmInfo(ops_offset, rawstart + looppos,
                       size_excluding_failure_stuff - looppos)
    def assemble_bridge(self, faildescr, inputargs, operations,
                        original_loop_token, log):
        """Assemble a bridge out of the guard described by 'faildescr' and
        patch that guard's jump to enter it.  Returns an AsmInfo."""
        if not we_are_translated():
            # Arguments should be unique
            assert len(set(inputargs)) == len(inputargs)
        descr_number = self.cpu.get_fail_descr_number(faildescr)
        failure_recovery = self._find_failure_recovery_bytecode(faildescr)
        self.setup(original_loop_token)
        if log:
            operations = self._inject_debugging_code(faildescr, operations,
                                                     'b', descr_number)
        # recover the fail locations of the guard from its recovery bytecode
        arglocs = self.rebuild_faillocs_from_descr(failure_recovery)
        if not we_are_translated():
            assert ([loc.assembler() for loc in arglocs] ==
                    [loc.assembler() for loc in faildescr._x86_debug_faillocs])
        regalloc = RegAlloc(self, self.cpu.translate_support_code)
        startpos = self.mc.get_relative_pos()
        operations = regalloc.prepare_bridge(inputargs, arglocs,
                                             operations,
                                             self.current_clt.allgcrefs)
        stackadjustpos = self._patchable_stackadjust()
        frame_depth = self._assemble(regalloc, operations)
        codeendpos = self.mc.get_relative_pos()
        self.write_pending_failure_recoveries()
        fullsize = self.mc.get_relative_pos()
        #
        rawstart = self.materialize_loop(original_loop_token)
        debug_start("jit-backend-addr")
        debug_print("bridge out of Guard %d has address %x to %x" %
                    (descr_number, rawstart, rawstart + codeendpos))
        debug_stop("jit-backend-addr")
        self._patch_stackadjust(rawstart + stackadjustpos, frame_depth)
        self.patch_pending_failure_recoveries(rawstart)
        if not we_are_translated():
            # for the benefit of tests
            faildescr._x86_bridge_frame_depth = frame_depth
        # patch the jump from original guard
        self.patch_jump_for_descr(faildescr, rawstart)
        ops_offset = self.mc.ops_offset
        self.fixup_target_tokens(rawstart)
        # the loop's frame must be deep enough for the bridge too
        self.current_clt.frame_depth = max(self.current_clt.frame_depth, frame_depth)
        self.teardown()
        # oprofile support
        if self.cpu.profile_agent is not None:
            name = "Bridge # %s" % (descr_number,)
            self.cpu.profile_agent.native_code_written(name,
                                                       rawstart, fullsize)
        return AsmInfo(ops_offset, startpos + rawstart, codeendpos - startpos)
    def write_pending_failure_recoveries(self):
        """Emit, at the end of self.mc, one recovery stub per pending
        guard (plus, on 64-bit, the shared MemoryError trampoline if any
        allocation site needs it)."""
        # for each pending guard, generate the code of the recovery stub
        # at the end of self.mc.
        for tok in self.pending_guard_tokens:
            tok.pos_recovery_stub = self.generate_quick_failure(tok)
        if WORD == 8 and len(self.pending_memoryerror_trampoline_from) > 0:
            self.error_trampoline_64 = self.generate_propagate_error_64()
    def patch_pending_failure_recoveries(self, rawstart):
        """Once the code is at its final address 'rawstart', make every
        guard's JMP/Jcond point at its recovery stub, and record the
        GUARD_NOT_INVALIDATED patch points on the compiled-loop token."""
        # after we wrote the assembler to raw memory, set up
        # tok.faildescr._x86_adr_jump_offset to contain the raw address of
        # the 4-byte target field in the JMP/Jcond instruction, and patch
        # the field in question to point (initially) to the recovery stub
        clt = self.current_clt
        for tok in self.pending_guard_tokens:
            addr = rawstart + tok.pos_jump_offset
            tok.faildescr._x86_adr_jump_offset = addr
            relative_target = tok.pos_recovery_stub - (tok.pos_jump_offset + 4)
            assert rx86.fits_in_32bits(relative_target)
            #
            if not tok.is_guard_not_invalidated:
                mc = codebuf.MachineCodeBlockWrapper()
                mc.writeimm32(relative_target)
                mc.copy_to_raw_memory(addr)
            else:
                # GUARD_NOT_INVALIDATED, record an entry in
                # clt.invalidate_positions of the form:
                #     (addr-in-the-code-of-the-not-yet-written-jump-target,
                #      relative-target-to-use)
                relpos = tok.pos_jump_offset
                clt.invalidate_positions.append((rawstart + relpos,
                                                 relative_target))
                # General idea: Although no code was generated by this
                # guard, the code might be patched with a "JMP rel32" to
                # the guard recovery code.  This recovery code is
                # already generated, and looks like the recovery code
                # for any guard, even if at first it has no jump to it.
                # So we may later write 5 bytes overriding the existing
                # instructions; this works because a CALL instruction
                # would also take at least 5 bytes.  If it could take
                # less, we would run into the issue that overwriting the
                # 5 bytes here might get a few nonsense bytes at the
                # return address of the following CALL.
        if WORD == 8:
            # patch every "JZ <MemoryError trampoline>" emitted by the
            # allocation fast paths to point to the shared trampoline
            for pos_after_jz in self.pending_memoryerror_trampoline_from:
                assert self.error_trampoline_64 != 0     # only if non-empty
                mc = codebuf.MachineCodeBlockWrapper()
                mc.writeimm32(self.error_trampoline_64 - pos_after_jz)
                mc.copy_to_raw_memory(rawstart + pos_after_jz - 4)
- def get_asmmemmgr_blocks(self, looptoken):
- clt = looptoken.compiled_loop_token
- if clt.asmmemmgr_blocks is None:
- clt.asmmemmgr_blocks = []
- return clt.asmmemmgr_blocks
    def materialize_loop(self, looptoken):
        """Copy the code in self.mc to executable raw memory owned by
        'looptoken' and return its start address."""
        self.datablockwrapper.done()      # finish using cpu.asmmemmgr
        self.datablockwrapper = None
        allblocks = self.get_asmmemmgr_blocks(looptoken)
        return self.mc.materialize(self.cpu.asmmemmgr, allblocks,
                                   self.cpu.gc_ll_descr.gcrootmap)
    def _register_counter(self, tp, number, token):
        """Allocate a raw DEBUG_COUNTER struct of type 'tp' ('e'ntry,
        'b'ridge or 'l'abel) and keep it alive in loop_run_counters.

        For labels, 'token' is the TargetToken and its unique id is
        stored as the number; otherwise 'number' is used directly.
        """
        # YYY very minor leak -- we need the counters to stay alive
        # forever, just because we want to report them at the end
        # of the process
        struct = lltype.malloc(DEBUG_COUNTER, flavor='raw',
                               track_allocation=False)
        struct.i = 0
        struct.type = tp
        if tp == 'b' or tp == 'e':
            struct.number = number
        else:
            assert token
            struct.number = compute_unique_id(token)
        self.loop_run_counters.append(struct)
        return struct
    def _find_failure_recovery_bytecode(self, faildescr):
        """Return the raw address of the failure-recovery bytecode of the
        guard 'faildescr', by following its still-patched JMP/Jcond into
        the recovery stub and skipping the CALL instruction there.

        Raises BridgeAlreadyCompiled if the guard was already patched.
        """
        adr_jump_offset = faildescr._x86_adr_jump_offset
        if adr_jump_offset == 0:
            # This case should be prevented by the logic in compile.py:
            # look for CNT_BUSY_FLAG, which disables tracing from a guard
            # when another tracing from the same guard is already in progress.
            raise BridgeAlreadyCompiled
        # follow the JMP/Jcond
        p = rffi.cast(rffi.INTP, adr_jump_offset)
        adr_target = adr_jump_offset + 4 + rffi.cast(lltype.Signed, p[0])
        # skip the CALL
        if WORD == 4:
            adr_target += 5     # CALL imm
        else:
            adr_target += 13    # MOV r11, imm-as-8-bytes; CALL *r11 xxxxxxxxxx
        return adr_target
    def patch_jump_for_descr(self, faildescr, adr_new_target):
        """Redirect the guard 'faildescr' to jump to 'adr_new_target'
        (the freshly compiled bridge) and mark the guard as patched."""
        adr_jump_offset = faildescr._x86_adr_jump_offset
        assert adr_jump_offset != 0
        offset = adr_new_target - (adr_jump_offset + 4)
        # If the new target fits within a rel32 of the jump, just patch
        # that.  If not, leave the original rel32 to the recovery stub in
        # place, but clobber the recovery stub with a jump to the real
        # target.
        mc = codebuf.MachineCodeBlockWrapper()
        if rx86.fits_in_32bits(offset):
            mc.writeimm32(offset)
            mc.copy_to_raw_memory(adr_jump_offset)
        else:
            # "mov r11, addr; jmp r11" is up to 13 bytes, which fits in there
            # because we always write "mov r11, imm-as-8-bytes; call *r11" in
            # the first place.
            mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
            mc.JMP_r(X86_64_SCRATCH_REG.value)
            p = rffi.cast(rffi.INTP, adr_jump_offset)
            adr_target = adr_jump_offset + 4 + rffi.cast(lltype.Signed, p[0])
            mc.copy_to_raw_memory(adr_target)
        faildescr._x86_adr_jump_offset = 0    # means "patched"
- def fixup_target_tokens(self, rawstart):
- for targettoken in self.target_tokens_currently_compiling:
- targettoken._x86_loop_code += rawstart
- self.target_tokens_currently_compiling = None
    def _append_debugging_code(self, operations, tp, number, token):
        """Append to 'operations' a load/increment/store sequence on a
        fresh DEBUG_COUNTER, so the generated code counts its executions."""
        counter = self._register_counter(tp, number, token)
        c_adr = ConstInt(rffi.cast(lltype.Signed, counter))
        box = BoxInt()
        box2 = BoxInt()
        ops = [ResOperation(rop.GETFIELD_RAW, [c_adr],
                            box, descr=self.debug_counter_descr),
               ResOperation(rop.INT_ADD, [box, ConstInt(1)], box2),
               ResOperation(rop.SETFIELD_RAW, [c_adr, box2],
                            None, descr=self.debug_counter_descr)]
        operations.extend(ops)
    @specialize.argtype(1)
    def _inject_debugging_code(self, looptoken, operations, tp, number):
        """If debugging is enabled, return a copy of 'operations' with a
        counter-increment prepended and one inserted after each LABEL;
        also store a checksum of the opnums on 'looptoken'."""
        if self._debug:
            # before doing anything, let's increase a counter
            s = 0
            for op in operations:
                s += op.getopnum()
            looptoken._x86_debug_checksum = s
            newoperations = []
            self._append_debugging_code(newoperations, tp, number,
                                        None)
            for op in operations:
                newoperations.append(op)
                if op.getopnum() == rop.LABEL:
                    self._append_debugging_code(newoperations, 'l', number,
                                                op.getdescr())
            operations = newoperations
        return operations
    def _assemble(self, regalloc, operations):
        """Drive the register allocator over 'operations' (which emits the
        machine code) and return the required frame depth in words,
        widened to the depth of the jump target if the loop ends in a
        JUMP to another compiled loop."""
        self._regalloc = regalloc
        regalloc.compute_hint_frame_locations(operations)
        regalloc.walk_operations(operations)
        if we_are_translated() or self.cpu.dont_keepalive_stuff:
            self._regalloc = None   # else keep it around for debugging
        frame_depth = regalloc.get_final_frame_depth()
        jump_target_descr = regalloc.jump_target_descr
        if jump_target_descr is not None:
            target_frame_depth = jump_target_descr._x86_clt.frame_depth
            frame_depth = max(frame_depth, target_frame_depth)
        return frame_depth
    def _patchable_stackadjust(self):
        """Emit 'LEA esp, [ebp+0]' with a 32-bit displacement and return
        the position of that displacement, to be fixed up later by
        _patch_stackadjust() once the frame depth is known."""
        # stack adjustment LEA
        self.mc.LEA32_rb(esp.value, 0)
        return self.mc.get_relative_pos() - 4
    def _patch_stackadjust(self, adr_lea, allocated_depth):
        """Overwrite, at raw address 'adr_lea', the 32-bit displacement of
        the LEA emitted by _patchable_stackadjust()."""
        # patch stack adjustment LEA
        mc = codebuf.MachineCodeBlockWrapper()
        # Compute the correct offset for the instruction LEA ESP, [EBP-4*words]
        mc.writeimm32(self._get_offset_of_ebp_from_esp(allocated_depth))
        mc.copy_to_raw_memory(adr_lea)
- def _get_offset_of_ebp_from_esp(self, allocated_depth):
- # Given that [EBP] is where we saved EBP, i.e. in the last word
- # of our fixed frame, then the 'words' value is:
- words = (FRAME_FIXED_SIZE - 1) + allocated_depth
- # align, e.g. for Mac OS X
- aligned_words = align_stack_words(words+2)-2 # 2 = EIP+EBP
- return -WORD * aligned_words
    def _call_header(self):
        """Emit the standard function prologue: push EBP, set up the frame
        pointer, save the callee-saved registers, and (with shadowstack)
        push a frame marker on the GC root stack."""
        # NB. the shape of the frame is hard-coded in get_basic_shape() too.
        # Also, make sure this is consistent with FRAME_FIXED_SIZE.
        self.mc.PUSH_r(ebp.value)
        self.mc.MOV_rr(ebp.value, esp.value)
        for loc in self.cpu.CALLEE_SAVE_REGISTERS:
            self.mc.PUSH_r(loc.value)
        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        if gcrootmap and gcrootmap.is_shadow_stack:
            self._call_header_shadowstack(gcrootmap)
    def _call_header_with_stack_check(self):
        """Emit the prologue, preceded (when available) by a C-stack
        overflow check that calls stack_check_slowpath if the remaining
        stack is smaller than the configured length."""
        if self.stack_check_slowpath == 0:
            pass                # no stack check (e.g. not translated)
        else:
            endaddr, lengthaddr, _ = self.cpu.insert_stack_check()
            self.mc.MOV(eax, heap(endaddr))             # MOV eax, [start]
            self.mc.SUB(eax, esp)                       # SUB eax, current
            self.mc.CMP(eax, heap(lengthaddr))          # CMP eax, [length]
            self.mc.J_il8(rx86.Conditions['BE'], 0)     # JBE .skip
            jb_location = self.mc.get_relative_pos()
            self.mc.CALL(imm(self.stack_check_slowpath))# CALL slowpath
            # patch the JB above                        # .skip:
            offset = self.mc.get_relative_pos() - jb_location
            assert 0 < offset <= 127
            self.mc.overwrite(jb_location-1, chr(offset))
            #
        self._call_header()
    def _call_footer(self):
        """Emit the standard epilogue: point ESP back at the saved
        callee-saved registers, pop the shadowstack frame if any, restore
        the registers and EBP, and return."""
        self.mc.LEA_rb(esp.value, -len(self.cpu.CALLEE_SAVE_REGISTERS) * WORD)
        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        if gcrootmap and gcrootmap.is_shadow_stack:
            self._call_footer_shadowstack(gcrootmap)
        for i in range(len(self.cpu.CALLEE_SAVE_REGISTERS)-1, -1, -1):
            self.mc.POP_r(self.cpu.CALLEE_SAVE_REGISTERS[i].value)
        self.mc.POP_r(ebp.value)
        self.mc.RET()
    def _call_header_shadowstack(self, gcrootmap):
        """Push our frame on the GC shadow stack (clobbers eax, ebx, and
        r13 when the stack-top address does not fit in 32 bits)."""
        # we need to put two words into the shadowstack: the MARKER_FRAME
        # and the address of the frame (ebp, actually)
        rst = gcrootmap.get_root_stack_top_addr()
        if rx86.fits_in_32bits(rst):
            self.mc.MOV_rj(eax.value, rst)            # MOV eax, [rootstacktop]
        else:
            self.mc.MOV_ri(r13.value, rst)            # MOV r13, rootstacktop
            self.mc.MOV_rm(eax.value, (r13.value, 0)) # MOV eax, [r13]
        #
        MARKER = gcrootmap.MARKER_FRAME
        self.mc.LEA_rm(ebx.value, (eax.value, 2*WORD)) # LEA ebx, [eax+2*WORD]
        self.mc.MOV_mi((eax.value, WORD), MARKER)      # MOV [eax+WORD], MARKER
        self.mc.MOV_mr((eax.value, 0), ebp.value)      # MOV [eax], ebp
        #
        if rx86.fits_in_32bits(rst):
            self.mc.MOV_jr(rst, ebx.value)            # MOV [rootstacktop], ebx
        else:
            self.mc.MOV_mr((r13.value, 0), ebx.value) # MOV [r13], ebx
    def _call_footer_shadowstack(self, gcrootmap):
        """Pop the two words pushed by _call_header_shadowstack() from the
        GC shadow stack (clobbers ebx in the 64-bit-address case)."""
        rst = gcrootmap.get_root_stack_top_addr()
        if rx86.fits_in_32bits(rst):
            self.mc.SUB_ji8(rst, 2*WORD)       # SUB [rootstacktop], 2*WORD
        else:
            self.mc.MOV_ri(ebx.value, rst)           # MOV ebx, rootstacktop
            self.mc.SUB_mi8((ebx.value, 0), 2*WORD)  # SUB [ebx], 2*WORD
    def redirect_call_assembler(self, oldlooptoken, newlooptoken):
        """Make every future entry into 'oldlooptoken' run the code of
        'newlooptoken', by overwriting the old function's first bytes
        with a JMP to the new one."""
        # some minimal sanity checking
        old_nbargs = oldlooptoken.compiled_loop_token._debug_nbargs
        new_nbargs = newlooptoken.compiled_loop_token._debug_nbargs
        assert old_nbargs == new_nbargs
        # we overwrite the instructions at the old _x86_direct_bootstrap_code
        # to start with a JMP to the new _x86_direct_bootstrap_code.
        # Ideally we should rather patch all existing CALLs, but well.
        oldadr = oldlooptoken._x86_function_addr
        target = newlooptoken._x86_function_addr
        mc = codebuf.MachineCodeBlockWrapper()
        mc.JMP(imm(target))
        if WORD == 4:         # keep in sync with prepare_loop()
            assert mc.get_relative_pos() == 5
        else:
            assert mc.get_relative_pos() <= 13
        mc.copy_to_raw_memory(oldadr)
- def dump(self, text):
- if not self.verbose:
- return
- _prev = Box._extended_display
- try:
- Box._extended_display = False
- pos = self.mc.get_relative_pos()
- print >> sys.stderr, ' 0x%x %s' % (pos, text)
- finally:
- Box._extended_display = _prev
- # ------------------------------------------------------------
- def mov(self, from_loc, to_loc):
- if (isinstance(from_loc, RegLoc) and from_loc.is_xmm) or (isinstance(to_loc, RegLoc) and to_loc.is_xmm):
- self.mc.MOVSD(to_loc, from_loc)
- else:
- assert to_loc is not ebp
- self.mc.MOV(to_loc, from_loc)
- regalloc_mov = mov # legacy interface
    def regalloc_push(self, loc):
        """Push the value at 'loc' on the machine stack: 8 bytes via SUB +
        MOVSD for an xmm register, two word PUSHes for a double-wide stack
        slot on 32-bit, a plain PUSH otherwise."""
        if isinstance(loc, RegLoc) and loc.is_xmm:
            self.mc.SUB_ri(esp.value, 8)   # = size of doubles
            self.mc.MOVSD_sx(0, loc.value)
        elif WORD == 4 and isinstance(loc, StackLoc) and loc.get_width() == 8:
            # XXX evil trick
            self.mc.PUSH_b(loc.value + 4)
            self.mc.PUSH_b(loc.value)
        else:
            self.mc.PUSH(loc)
    def regalloc_pop(self, loc):
        """Pop the top of the machine stack into 'loc'; exact mirror of
        regalloc_push() (note the reversed word order for the 32-bit
        double-wide stack-slot case)."""
        if isinstance(loc, RegLoc) and loc.is_xmm:
            self.mc.MOVSD_xs(loc.value, 0)
            self.mc.ADD_ri(esp.value, 8)   # = size of doubles
        elif WORD == 4 and isinstance(loc, StackLoc) and loc.get_width() == 8:
            # XXX evil trick
            self.mc.POP_b(loc.value)
            self.mc.POP_b(loc.value + 4)
        else:
            self.mc.POP(loc)
    def regalloc_immedmem2mem(self, from_loc, to_loc):
        """Copy a float constant stored in memory directly into a stack
        slot, as two 32-bit immediate stores."""
        # move a ConstFloatLoc directly to a StackLoc, as two MOVs
        # (even on x86-64, because the immediates are encoded as 32 bits)
        assert isinstance(from_loc, ConstFloatLoc)
        assert isinstance(to_loc, StackLoc)
        low_part  = rffi.cast(rffi.CArrayPtr(rffi.INT), from_loc.value)[0]
        high_part = rffi.cast(rffi.CArrayPtr(rffi.INT), from_loc.value)[1]
        low_part  = intmask(low_part)
        high_part = intmask(high_part)
        self.mc.MOV32_bi(to_loc.value,     low_part)
        self.mc.MOV32_bi(to_loc.value + 4, high_part)
    def regalloc_perform(self, op, arglocs, resloc):
        # dispatch on the opnum to the matching genop_* function
        # (genop_list is the module-level dispatch table)
        genop_list[op.getopnum()](self, op, arglocs, resloc)
    def regalloc_perform_discard(self, op, arglocs):
        # like regalloc_perform(), for operations producing no result
        genop_discard_list[op.getopnum()](self, op, arglocs)
    def regalloc_perform_llong(self, op, arglocs, resloc):
        # longlong operations dispatch on the call's oopspecindex
        effectinfo = op.getdescr().get_extra_info()
        oopspecindex = effectinfo.oopspecindex
        genop_llong_list[oopspecindex](self, op, arglocs, resloc)
def regalloc_perform_math(self, op, arglocs, resloc):
    # math operations arrive as calls; dispatch on the oopspec index
    # stored in the call descr's effect info
    descr = op.getdescr()
    oopspecindex = descr.get_extra_info().oopspecindex
    genop_math_list[oopspecindex](self, op, arglocs, resloc)
def regalloc_perform_with_guard(self, op, guard_op, faillocs,
                                arglocs, resloc):
    """Emit 'op' fused with its following guard 'guard_op'.

    A guard token describing the recovery path is built first; the
    genop_guard_* implementation is then expected to append it to
    self.pending_guard_tokens (checked when untranslated).
    """
    faildescr = guard_op.getdescr()
    assert isinstance(faildescr, AbstractFailDescr)
    guard_opnum = guard_op.getopnum()
    guard_token = self.implement_guard_recovery(guard_opnum,
                                                faildescr,
                                                guard_op.getfailargs(),
                                                faillocs)
    # op is None when emitting a stand-alone guard (see
    # regalloc_perform_guard): dispatch on the guard itself then
    if op is None:
        dispatch_opnum = guard_opnum
    else:
        dispatch_opnum = op.getopnum()
    genop_guard_list[dispatch_opnum](self, op, guard_op, guard_token,
                                     arglocs, resloc)
    if not we_are_translated():
        # must be added by the genop_guard_list[]() implementation
        assert guard_token is self.pending_guard_tokens[-1]
def regalloc_perform_guard(self, guard_op, faillocs, arglocs, resloc):
    # a stand-alone guard is emitted as a guarded "no operation"
    self.regalloc_perform_with_guard(None, guard_op, faillocs,
                                     arglocs, resloc)
def load_effective_addr(self, sizereg, baseofs, scale, result, frm=imm0):
    # result = frm + (sizereg << scale) + baseofs, in a single LEA
    addr = addr_add(frm, sizereg, baseofs, scale)
    self.mc.LEA(result, addr)
def _unaryop(asmop):
    """Return a genop emitting the one-operand instruction 'asmop'."""
    def genop_unary(self, op, arglocs, resloc):
        # look up e.g. self.mc.NEG and apply it to the single operand
        emit = getattr(self.mc, asmop)
        emit(arglocs[0])
    return genop_unary
def _binaryop(asmop, can_swap=False):
    """Return a genop emitting the two-operand instruction 'asmop'.

    NOTE(review): 'can_swap' is not used here — presumably the
    register allocator consumes the hint when placing the operands;
    confirm before relying on it.
    """
    def genop_binary(self, op, arglocs, result_loc):
        emit = getattr(self.mc, asmop)
        emit(arglocs[0], arglocs[1])
    return genop_binary
def _cmpop(cond, rev_cond):
    """Return a genop for an integer comparison producing 0 or 1.

    'cond' and 'rev_cond' are x86 condition names; the reversed one
    is used when the constant argument ended up first, since CMP then
    receives the operands swapped.
    """
    def genop_cmp(self, op, arglocs, result_loc):
        rl = result_loc.lowest8bits()
        if isinstance(op.getarg(0), Const):
            # constant first operand: compare swapped, test reversed cond
            lhs, rhs, setcond = arglocs[1], arglocs[0], rev_cond
        else:
            lhs, rhs, setcond = arglocs[0], arglocs[1], cond
        self.mc.CMP(lhs, rhs)
        self.mc.SET_ir(rx86.Conditions[setcond], rl.value)
        # zero-extend the 8-bit flag byte into the full result register
        self.mc.MOVZX8_rr(result_loc.value, rl.value)
    return genop_cmp
def _cmpop_float(cond, rev_cond, is_ne=False):
    """Return a genop for a float comparison producing 0 or 1.

    UCOMISD raises the parity flag on unordered (NaN) operands, so a
    second SETcc on P/NP is combined with the main result: every
    comparison involving a NaN must come out false, except "!="
    (is_ne) which must come out true.
    """
    def genop_cmp(self, op, arglocs, result_loc):
        if isinstance(arglocs[0], RegLoc):
            first, second, checkcond = arglocs[0], arglocs[1], cond
        else:
            # first operand not in a register: compare the other way
            # around and test the reversed condition
            first, second, checkcond = arglocs[1], arglocs[0], rev_cond
        self.mc.UCOMISD(first, second)
        tmp1 = result_loc.lowest8bits()
        if IS_X86_32:
            tmp2 = result_loc.higher8bits()
        elif IS_X86_64:
            tmp2 = X86_64_SCRATCH_REG.lowest8bits()
        self.mc.SET_ir(rx86.Conditions[checkcond], tmp1.value)
        if is_ne:
            # NaN != x is true: fold the parity bit in with OR
            self.mc.SET_ir(rx86.Conditions['P'], tmp2.value)
            self.mc.OR8_rr(tmp1.value, tmp2.value)
        else:
            # NaN comparisons are false: mask with "no parity"
            self.mc.SET_ir(rx86.Conditions['NP'], tmp2.value)
            self.mc.AND8_rr(tmp1.value, tmp2.value)
        self.mc.MOVZX8_rr(result_loc.value, tmp1.value)
    return genop_cmp
def _cmpop_guard(cond, rev_cond, false_cond, false_rev_cond):
    """Return a genop for an integer comparison fused with a guard.

    The reversed condition pair is used when the constant operand is
    first (CMP sees the operands swapped); GUARD_FALSE jumps to the
    failure path on the condition itself, GUARD_TRUE on its negation.
    """
    def genop_cmp_guard(self, op, guard_op, guard_token, arglocs, result_loc):
        guard_opnum = guard_op.getopnum()
        if isinstance(op.getarg(0), Const):
            self.mc.CMP(arglocs[1], arglocs[0])
            use_cond, use_false_cond = rev_cond, false_rev_cond
        else:
            self.mc.CMP(arglocs[0], arglocs[1])
            use_cond, use_false_cond = cond, false_cond
        if guard_opnum == rop.GUARD_FALSE:
            self.implement_guard(guard_token, use_cond)
        else:
            self.implement_guard(guard_token, use_false_cond)
    return genop_cmp_guard
def _cmpop_guard_float(cond, rev_cond, false_cond, false_rev_cond):
    # Build a genop for a float comparison fused with a guard.
    # UCOMISD raises the parity flag on unordered (NaN) operands.
    # Conditions whose name contains 'A' (the CF-based 'above' family)
    # presumably already give the wanted answer when PF is set; for the
    # others an explicit jump on P is needed — TODO confirm the exact
    # flag reasoning against the rx86 condition table.
    need_direct_jp = 'A' not in cond
    need_rev_jp = 'A' not in rev_cond
    def genop_cmp_guard_float(self, op, guard_op, guard_token, arglocs,
                              result_loc):
        guard_opnum = guard_op.getopnum()
        if isinstance(arglocs[0], RegLoc):
            self.mc.UCOMISD(arglocs[0], arglocs[1])
            checkcond = cond
            checkfalsecond = false_cond
            need_jp = need_direct_jp
        else:
            # first operand not in a register: compare swapped and use
            # the reversed condition names
            self.mc.UCOMISD(arglocs[1], arglocs[0])
            checkcond = rev_cond
            checkfalsecond = false_rev_cond
            need_jp = need_rev_jp
        if guard_opnum == rop.GUARD_FALSE:
            if need_jp:
                # unordered => guard passes: hop over the next 6 bytes
                # (hard-coded length of the code emitted by
                # implement_guard(cond) — keep in sync)
                self.mc.J_il8(rx86.Conditions['P'], 6)
            self.implement_guard(guard_token, checkcond)
        else:
            if need_jp:
                # unordered => guard fails: skip the 2-byte Jcond below
                # and fall into the unconditional guard jump
                self.mc.J_il8(rx86.Conditions['P'], 2)
                # condition holds => guard passes: hop over the 5 bytes
                # emitted by implement_guard() with no condition
                # (hard-coded lengths — keep in sync)
                self.mc.J_il8(rx86.Conditions[checkcond], 5)
                self.implement_guard(guard_token)
            else:
                self.implement_guard(guard_token, checkfalsecond)
    return genop_cmp_guard_float
def _emit_call(self, force_index, x, arglocs, start=0, tmp=eax,
               argtypes=None, callconv=FFI_DEFAULT_ABI):
    """Copy the arguments to the outgoing stack area and CALL 'x'.

    32-bit convention: every argument is passed on the stack.  Two
    passes are made over arglocs[start:]: arguments already held in
    registers are stored first, then everything else goes through
    'tmp' (or xmm0 for 8-byte values), which may clobber registers.
    Each pass advances the stack offset by the width of *every*
    argument so both passes agree on the layout.  'x' is the location
    of the callee.
    """
    if IS_X86_64:
        return self._emit_call_64(force_index, x, arglocs, start, argtypes)
    n = len(arglocs)
    # pass 1: arguments currently sitting in registers
    ofs = 0
    for i in range(start, n):
        loc = arglocs[i]
        if isinstance(loc, RegLoc):
            if loc.is_xmm:
                self.mc.MOVSD_sx(ofs, loc.value)
            else:
                self.mc.MOV_sr(ofs, loc.value)
        ofs += loc.get_width()
    # pass 2: arguments in stack slots or immediates
    ofs = 0
    for i in range(start, n):
        loc = arglocs[i]
        if not isinstance(loc, RegLoc):
            if loc.get_width() == 8:
                self.mc.MOVSD(xmm0, loc)
                self.mc.MOVSD_sx(ofs, xmm0.value)
            else:
                self.mc.MOV(tmp, loc)
                self.mc.MOV_sr(ofs, tmp.value)
        ofs += loc.get_width()
    self.mc.CALL(x)
    self.mark_gc_roots(force_index)
    #
    if callconv != FFI_DEFAULT_ABI:
        # non-default convention: the callee popped its own arguments
        self._fix_stdcall(callconv, ofs)
    #
    self._regalloc.needed_extra_stack_locations(ofs // WORD)
def _fix_stdcall(self, callconv, p):
    """Undo the argument pop performed by a stdcall callee.

    A stdcall function removes its own arguments ('p' bytes) from the
    stack on return; it's a bit stupid, but we simply cancel that by
    subtracting 'p' from ESP again, restoring the usual frame layout.
    """
    from pypy.rlib.clibffi import FFI_STDCALL
    assert callconv == FFI_STDCALL
    self.mc.SUB_ri(esp.value, p)
def _emit_call_64(self, force_index, x, arglocs, start, argtypes):
    # Emit a call following the x86-64 convention used here: integer
    # arguments go to edi, esi, edx, ecx, r8, r9 (in that order, per
    # the reversed 'unused_gpr' list below) and float arguments to
    # xmm0..xmm7; the overflow goes on the stack.  'x' is the location
    # of the callee; argtypes[i] == 'S' marks a single-float argument.
    src_locs = []
    dst_locs = []
    xmm_src_locs = []
    xmm_dst_locs = []
    pass_on_stack = []
    singlefloats = None
    # In reverse order for use with pop()
    unused_gpr = [r9, r8, ecx, edx, esi, edi]
    unused_xmm = [xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0]
    # classify every argument: xmm register, gpr register, or stack
    for i in range(start, len(arglocs)):
        loc = arglocs[i]
        # XXX: Should be much simplier to tell whether a location is a
        # float! It's so ugly because we have to "guard" the access to
        # .type with isinstance, since not all AssemblerLocation classes
        # are "typed"
        if ((isinstance(loc, RegLoc) and loc.is_xmm) or
            (isinstance(loc, StackLoc) and loc.type == FLOAT) or
            (isinstance(loc, ConstFloatLoc))):
            if len(unused_xmm) > 0:
                xmm_src_locs.append(loc)
                xmm_dst_locs.append(unused_xmm.pop())
            else:
                pass_on_stack.append(loc)
        elif (argtypes is not None and argtypes[i-start] == 'S' and
              len(unused_xmm) > 0):
            # Singlefloat argument
            if singlefloats is None: singlefloats = []
            singlefloats.append((loc, unused_xmm.pop()))
        else:
            if len(unused_gpr) > 0:
                src_locs.append(loc)
                dst_locs.append(unused_gpr.pop())
            else:
                pass_on_stack.append(loc)
    # Emit instructions to pass the stack arguments
    # XXX: Would be nice to let remap_frame_layout take care of this, but
    # we'd need to create something like StackLoc, but relative to esp,
    # and I don't know if it's worth it.
    for i in range(len(pass_on_stack)):
        loc = pass_on_stack[i]
        if not isinstance(loc, RegLoc):
            # memory/immediate source: go through a scratch register
            if isinstance(loc, StackLoc) and loc.type == FLOAT:
                self.mc.MOVSD(X86_64_XMM_SCRATCH_REG, loc)
                self.mc.MOVSD_sx(i*WORD, X86_64_XMM_SCRATCH_REG.value)
            else:
                self.mc.MOV(X86_64_SCRATCH_REG, loc)
                self.mc.MOV_sr(i*WORD, X86_64_SCRATCH_REG.value)
        else:
            # It's a register
            if loc.is_xmm:
                self.mc.MOVSD_sx(i*WORD, loc.value)
            else:
                self.mc.MOV_sr(i*WORD, loc.value)
    # Handle register arguments: first remap the xmm arguments
    remap_frame_layout(self, xmm_src_locs, xmm_dst_locs,
                       X86_64_XMM_SCRATCH_REG)
    # Load the singlefloat arguments from main regs or stack to xmm regs
    if singlefloats is not None:
        for src, dst in singlefloats:
            self.mc.MOVD(dst, src)
    # Finally remap the arguments in the main regs
    # If x is a register and is in dst_locs, then oups, it needs to
    # be moved away (to r10, which no argument ever uses):
    if x in dst_locs:
        src_locs.append(x)
        dst_locs.append(r10)
        x = r10
    remap_frame_layout(self, src_locs, dst_locs, X86_64_SCRATCH_REG)
    self.mc.CALL(x)
    self.mark_gc_roots(force_index)
    self._regalloc.needed_extra_stack_locations(len(pass_on_stack))
def call(self, addr, args, res):
    """Emit a call to the fixed address 'addr' with 'args'.

    'res' must be eax: callers of this helper always read the result
    from there.
    """
    force_index = self.write_new_force_index()
    self._emit_call(force_index, imm(addr), args)
    assert res is eax
- def write_new_force_index(self):
- # for shadowstack only: get a new, unused force_index number and
- # write it to FORCE_INDEX_OFS. Used to record the call shape
- # (i.e. where the GC pointers are in the stack) around a CALL
- # instruction that doesn't already have a force_index.
- gcrootmap = self.cpu.gc_ll_descr.gcrootmap
- if gcrootmap and gcroot…
Large files files are truncated, but you can click here to view the full file