/rpython/jit/backend/x86/assembler.py
Python | 2710 lines | 2063 code | 231 blank | 416 comment | 433 complexity | d917c7c1cc9ac1c5ddc2d035f28cb416 MD5 | raw file
Large files are truncated, but you can click here to view the full file
- import sys, os
- from rpython.jit.backend.llsupport import symbolic, jitframe
- from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
- from rpython.jit.metainterp.history import Const, Box, BoxInt, ConstInt
- from rpython.jit.metainterp.history import AbstractFailDescr, INT, REF, FLOAT
- from rpython.jit.metainterp.history import JitCellToken
- from rpython.rtyper.lltypesystem import lltype, rffi, rstr, llmemory
- from rpython.rtyper.lltypesystem.lloperation import llop
- from rpython.rtyper.annlowlevel import llhelper
- from rpython.rlib.jit import AsmInfo
- from rpython.rlib import longlong2float
- from rpython.jit.backend.model import CompiledLoopToken
- from rpython.jit.backend.x86.regalloc import (RegAlloc, get_ebp_ofs, _get_scale,
- gpr_reg_mgr_cls, xmm_reg_mgr_cls, _valid_addressing_size)
- from rpython.jit.backend.x86.arch import (FRAME_FIXED_SIZE, FORCE_INDEX_OFS, WORD,
- IS_X86_32, IS_X86_64)
- from rpython.jit.backend.x86.regloc import (eax, ecx, edx, ebx,
- esp, ebp, esi, edi,
- xmm0, xmm1, xmm2, xmm3,
- xmm4, xmm5, xmm6, xmm7,
- r8, r9, r10, r11,
- r12, r13, r14, r15,
- X86_64_SCRATCH_REG,
- X86_64_XMM_SCRATCH_REG,
- RegLoc, StackLoc, ConstFloatLoc,
- ImmedLoc, AddressLoc, imm,
- imm0, imm1, FloatImmedLoc)
- from rpython.rlib.objectmodel import we_are_translated, specialize
- from rpython.jit.backend.x86 import rx86, regloc, codebuf
- from rpython.jit.metainterp.resoperation import rop, ResOperation
- from rpython.jit.backend.x86 import support
- from rpython.rlib.debug import (debug_print, debug_start, debug_stop,
- have_debug_prints, fatalerror)
- from rpython.rlib import rgc
- from rpython.rlib.clibffi import FFI_DEFAULT_ABI
- from rpython.jit.backend.x86.jump import remap_frame_layout
- from rpython.jit.codewriter.effectinfo import EffectInfo
- from rpython.jit.codewriter import longlong
- from rpython.rlib.rarithmetic import intmask
- from rpython.rlib.objectmodel import compute_unique_id
# darwin requires the stack to be 16 bytes aligned on calls. Same for gcc 4.5.0,
# better safe than sorry
CALL_ALIGN = 16 // WORD

def align_stack_words(words):
    """Round 'words' up to the next multiple of CALL_ALIGN words."""
    mask = CALL_ALIGN - 1
    return (words + mask) & ~mask
class GuardToken(object):
    """Bookkeeping for one emitted guard: the fail descr, the failure
    arguments and their locations, plus flags selecting which kind of
    recovery stub must be generated for it."""

    def __init__(self, faildescr, failargs, fail_locs, exc,
                 is_guard_not_invalidated, is_guard_not_forced):
        self.is_guard_not_forced = is_guard_not_forced
        self.is_guard_not_invalidated = is_guard_not_invalidated
        self.exc = exc
        self.fail_locs = fail_locs
        self.failargs = failargs
        self.faildescr = faildescr
# Raw struct used to count how often a loop/bridge/label is entered when
# debugging is enabled; 'i' is the counter itself (see finish_once()).
DEBUG_COUNTER = lltype.Struct('DEBUG_COUNTER', ('i', lltype.Signed),
                              ('type', lltype.Char), # 'b'ridge, 'l'abel or
                                                     # 'e'ntry point
                              ('number', lltype.Signed))
class Assembler386(object):
    """Generates x86/x86-64 machine code for loops and bridges compiled
    by the JIT (see assemble_loop() and assemble_bridge())."""
    # current RegAlloc while assembling, kept for debugging when not translated
    _regalloc = None
    _output_loop_log = None
    def __init__(self, cpu, translate_support_code=False):
        # 'cpu' is the backend CPU object; the addresses of the various
        # out-of-line helpers below start at 0 and are filled in later
        # by setup_once().
        self.cpu = cpu
        self.verbose = False
        self.rtyper = cpu.rtyper
        self.loop_run_counters = []
        self.float_const_neg_addr = 0
        self.float_const_abs_addr = 0
        self.malloc_slowpath1 = 0
        self.malloc_slowpath2 = 0
        # four write-barrier helpers, indexed by [withcards + 2*withfloats]
        self.wb_slowpath = [0, 0, 0, 0]
        self.memcpy_addr = 0
        self.setup_failure_recovery()
        self._debug = False
        self.debug_counter_descr = cpu.fielddescrof(DEBUG_COUNTER, 'i')
        self.datablockwrapper = None
        self.stack_check_slowpath = 0
        self.propagate_exception_path = 0
        self.gcrootmap_retaddr_forced = 0
        # teardown() initializes the per-compilation fields to None
        self.teardown()
        self.force_token_to_dead_frame = {}    # XXX temporary hack
- def set_debug(self, v):
- r = self._debug
- self._debug = v
- return r
    def setup_once(self):
        """One-time initialization: build all out-of-line helper code
        (failure recovery, write barriers, malloc/stack-check slow paths)
        and decide whether debug counting is enabled."""
        # the address of the function called by 'new'
        gc_ll_descr = self.cpu.gc_ll_descr
        gc_ll_descr.initialize()
        self.memcpy_addr = self.cpu.cast_ptr_to_int(support.memcpy_fn)
        self._build_failure_recovery(False)
        self._build_failure_recovery(True)
        self._build_wb_slowpath(False)
        self._build_wb_slowpath(True)
        if self.cpu.supports_floats:
            # build the float-supporting variants of the helpers too
            self._build_failure_recovery(False, withfloats=True)
            self._build_failure_recovery(True, withfloats=True)
            self._build_wb_slowpath(False, withfloats=True)
            self._build_wb_slowpath(True, withfloats=True)
            support.ensure_sse2_floats()
            self._build_float_constants()
        self._build_propagate_exception_path()
        if gc_ll_descr.get_malloc_slowpath_addr is not None:
            self._build_malloc_slowpath()
        self._build_stack_check_slowpath()
        if gc_ll_descr.gcrootmap:
            self._build_release_gil(gc_ll_descr.gcrootmap)
        if not self._debug:
            # if self._debug is already set it means that someone called
            # set_debug by hand before initializing the assembler. Leave it
            # as it is
            debug_start('jit-backend-counts')
            self.set_debug(have_debug_prints())
            debug_stop('jit-backend-counts')
    def setup(self, looptoken):
        """Per-compilation initialization: allocate a fresh machine-code
        buffer and data block for compiling 'looptoken'."""
        assert self.memcpy_addr != 0, "setup_once() not called?"
        self.current_clt = looptoken.compiled_loop_token
        self.pending_guard_tokens = []
        if WORD == 8:
            # 64-bit only: trampolines for out-of-range MemoryError jumps
            self.pending_memoryerror_trampoline_from = []
            self.error_trampoline_64 = 0
        self.mc = codebuf.MachineCodeBlockWrapper()
        #assert self.datablockwrapper is None --- but obscure case
        # possible, e.g. getting MemoryError and continuing
        allblocks = self.get_asmmemmgr_blocks(looptoken)
        self.datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr,
                                                        allblocks)
        self.target_tokens_currently_compiling = {}
- def teardown(self):
- self.pending_guard_tokens = None
- if WORD == 8:
- self.pending_memoryerror_trampoline_from = None
- self.mc = None
- self.current_clt = None
- def finish_once(self):
- if self._debug:
- debug_start('jit-backend-counts')
- for i in range(len(self.loop_run_counters)):
- struct = self.loop_run_counters[i]
- if struct.type == 'l':
- prefix = 'TargetToken(%d)' % struct.number
- elif struct.type == 'b':
- prefix = 'bridge ' + str(struct.number)
- else:
- prefix = 'entry ' + str(struct.number)
- debug_print(prefix + ':' + str(struct.i))
- debug_stop('jit-backend-counts')
    def _build_float_constants(self):
        """Allocate a 16-byte-aligned data block holding the bit masks
        used for float NEG (sign-bit flip) and ABS (sign-bit clear), and
        remember their addresses."""
        datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr, [])
        float_constants = datablockwrapper.malloc_aligned(32, alignment=16)
        datablockwrapper.done()
        addr = rffi.cast(rffi.CArrayPtr(lltype.Char), float_constants)
        qword_padding = '\x00\x00\x00\x00\x00\x00\x00\x00'
        # 0x8000000000000000
        neg_const = '\x00\x00\x00\x00\x00\x00\x00\x80'
        # 0x7FFFFFFFFFFFFFFF
        abs_const = '\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F'
        data = neg_const + qword_padding + abs_const + qword_padding
        # copy the 32 bytes into the raw block, one char at a time
        for i in range(len(data)):
            addr[i] = data[i]
        self.float_const_neg_addr = float_constants
        self.float_const_abs_addr = float_constants + 16
    def _build_malloc_slowpath(self):
        """Emit the two out-of-line helpers for the slow path of 'new':
        the first saves registers and calls (or tail-calls) the GC's
        slow-path malloc; the second restores registers and checks for
        a MemoryError result (eax == 0)."""
        # With asmgcc, we need two helpers, so that we can write two CALL
        # instructions in assembler, with a mark_gc_roots in between.
        # With shadowstack, this is not needed, so we produce a single helper.
        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        shadow_stack = (gcrootmap is not None and gcrootmap.is_shadow_stack)
        #
        # ---------- first helper for the slow path of malloc ----------
        mc = codebuf.MachineCodeBlockWrapper()
        if self.cpu.supports_floats:          # save the XMM registers in
            for i in range(self.cpu.NUM_REGS):# the *caller* frame, from esp+8
                mc.MOVSD_sx((WORD*2)+8*i, i)
        mc.SUB_rr(edx.value, eax.value)       # compute the size we want
        addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
        #
        # The registers to save in the copy area: with shadowstack, most
        # registers need to be saved.  With asmgcc, the callee-saved registers
        # don't need to.
        save_in_copy_area = gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items()
        if not shadow_stack:
            save_in_copy_area = [(reg, ofs) for (reg, ofs) in save_in_copy_area
                  if reg not in gpr_reg_mgr_cls.REGLOC_TO_GCROOTMAP_REG_INDEX]
        #
        for reg, ofs in save_in_copy_area:
            mc.MOV_br(ofs, reg.value)
        #
        if shadow_stack:
            # ---- shadowstack ----
            mc.SUB_ri(esp.value, 16 - WORD)      # stack alignment of 16 bytes
            if IS_X86_32:
                mc.MOV_sr(0, edx.value)          # push argument
            elif IS_X86_64:
                mc.MOV_rr(edi.value, edx.value)
            mc.CALL(imm(addr))
            mc.ADD_ri(esp.value, 16 - WORD)
        else:
            # ---- asmgcc ----
            if IS_X86_32:
                mc.MOV_sr(WORD, edx.value)       # save it as the new argument
            elif IS_X86_64:
                # rdi can be clobbered: its content was saved in the
                # copy area of the stack
                mc.MOV_rr(edi.value, edx.value)
            mc.JMP(imm(addr))                    # tail call to the real malloc
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.malloc_slowpath1 = rawstart
        # ---------- second helper for the slow path of malloc ----------
        mc = codebuf.MachineCodeBlockWrapper()
        #
        for reg, ofs in save_in_copy_area:
            mc.MOV_rb(reg.value, ofs)
            assert reg is not eax and reg is not edx
        #
        if self.cpu.supports_floats:          # restore the XMM registers
            for i in range(self.cpu.NUM_REGS):# from where they were saved
                mc.MOVSD_xs(i, (WORD*2)+8*i)
        #
        # Note: we check this after the code above, just because the code
        # above is more than 127 bytes on 64-bits...
        mc.TEST_rr(eax.value, eax.value)
        mc.J_il8(rx86.Conditions['Z'], 0) # patched later
        jz_location = mc.get_relative_pos()
        #
        nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
        mc.MOV(edx, heap(nursery_free_adr))   # load this in EDX
        mc.RET()
        #
        # If the slowpath malloc failed, we raise a MemoryError that
        # always interrupts the current loop, as a "good enough"
        # approximation.  Also note that we didn't RET from this helper;
        # but the code we jump to will actually restore the stack
        # position based on EBP, which will get us out of here for free.
        offset = mc.get_relative_pos() - jz_location
        assert 0 < offset <= 127
        # back-patch the 8-bit displacement of the JZ emitted above
        mc.overwrite(jz_location-1, chr(offset))
        mc.JMP(imm(self.propagate_exception_path))
        #
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.malloc_slowpath2 = rawstart
    def _build_propagate_exception_path(self):
        """Emit the out-of-line code path that calls the CPU's
        propagate-exception helper and then returns from the JITted
        function via the common footer."""
        if self.cpu.propagate_exception_v < 0:
            return      # not supported (for tests, or non-translated)
        #
        self.mc = codebuf.MachineCodeBlockWrapper()
        #
        # Call the helper, which will return a dead frame object with
        # the correct exception set, or MemoryError by default
        addr = rffi.cast(lltype.Signed, self.cpu.get_propagate_exception())
        self.mc.CALL(imm(addr))
        #
        self._call_footer()
        rawstart = self.mc.materialize(self.cpu.asmmemmgr, [])
        self.propagate_exception_path = rawstart
        self.mc = None
    def _build_stack_check_slowpath(self):
        """Emit the helper called at the start of JITted functions when
        the stack-depth check fails: it calls 'slowpathaddr' and, if an
        exception was set, propagates it to the caller's caller."""
        _, _, slowpathaddr = self.cpu.insert_stack_check()
        if slowpathaddr == 0 or self.cpu.propagate_exception_v < 0:
            return      # no stack check (for tests, or non-translated)
        #
        # make a "function" that is called immediately at the start of
        # an assembler function.  In particular, the stack looks like:
        #
        #    |  ...                  |    <-- aligned to a multiple of 16
        #    |  retaddr of caller    |
        #    |  my own retaddr       |    <-- esp
        #    +-----------------------+
        #
        mc = codebuf.MachineCodeBlockWrapper()
        #
        stack_size = WORD
        if IS_X86_64:
            # on the x86_64, we have to save all the registers that may
            # have been used to pass arguments
            stack_size += 6*WORD + 8*8
            for reg in [edi, esi, edx, ecx, r8, r9]:
                mc.PUSH_r(reg.value)
            mc.SUB_ri(esp.value, 8*8)
            for i in range(8):
                mc.MOVSD_sx(8*i, i)     # xmm0 to xmm7
        #
        if IS_X86_32:
            stack_size += 2*WORD
            mc.PUSH_r(eax.value)        # alignment
            mc.PUSH_r(esp.value)
        elif IS_X86_64:
            mc.MOV_rr(edi.value, esp.value)
        #
        # esp is now aligned to a multiple of 16 again
        mc.CALL(imm(slowpathaddr))
        #
        # check whether the slow path set an exception
        mc.MOV(eax, heap(self.cpu.pos_exception()))
        mc.TEST_rr(eax.value, eax.value)
        mc.J_il8(rx86.Conditions['NZ'], 0)
        jnz_location = mc.get_relative_pos()
        #
        # no exception: restore and return normally
        if IS_X86_32:
            mc.ADD_ri(esp.value, 2*WORD)    # cancel the two PUSHes above
        elif IS_X86_64:
            # restore the registers
            for i in range(7, -1, -1):
                mc.MOVSD_xs(i, 8*i)
            mc.ADD_ri(esp.value, 8*8)
            for reg in [r9, r8, ecx, edx, esi, edi]:
                mc.POP_r(reg.value)
        #
        mc.RET()
        #
        # patch the JNZ above
        offset = mc.get_relative_pos() - jnz_location
        assert 0 < offset <= 127
        mc.overwrite(jnz_location-1, chr(offset))
        #
        # Call the helper, which will return a dead frame object with
        # the correct exception set, or MemoryError by default
        addr = rffi.cast(lltype.Signed, self.cpu.get_propagate_exception())
        mc.CALL(imm(addr))
        #
        # footer -- note the ADD, which skips the return address of this
        # function, and will instead return to the caller's caller.  Note
        # also that we completely ignore the saved arguments, because we
        # are interrupting the function.
        mc.ADD_ri(esp.value, stack_size)
        mc.RET()
        #
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.stack_check_slowpath = rawstart
    def _build_wb_slowpath(self, withcards, withfloats=False):
        """Emit one of the four out-of-line write-barrier helpers and
        store its address in self.wb_slowpath[withcards + 2*withfloats]."""
        descr = self.cpu.gc_ll_descr.write_barrier_descr
        if descr is None:
            return
        if not withcards:
            func = descr.get_write_barrier_fn(self.cpu)
        else:
            if descr.jit_wb_cards_set == 0:
                return
            func = descr.get_write_barrier_from_array_fn(self.cpu)
            if func == 0:
                return
        #
        # This builds a helper function called from the slow path of
        # write barriers.  It must save all registers, and optionally
        # all XMM registers.  It takes a single argument just pushed
        # on the stack even on X86_64.  It must restore stack alignment
        # accordingly.
        mc = codebuf.MachineCodeBlockWrapper()
        #
        frame_size = (1 +     # my argument, considered part of my frame
                      1 +     # my return address
                      len(gpr_reg_mgr_cls.save_around_call_regs))
        if withfloats:
            frame_size += 16     # X86_32: 16 words for 8 registers;
                                 # X86_64: just 16 registers
        if IS_X86_32:
            frame_size += 1      # argument to pass to the call
        #
        # align to a multiple of 16 bytes
        frame_size = (frame_size + (CALL_ALIGN-1)) & ~(CALL_ALIGN-1)
        #
        correct_esp_by = (frame_size - 2) * WORD
        mc.SUB_ri(esp.value, correct_esp_by)
        #
        # save all caller-saved GPRs (and optionally XMMs) into the frame
        ofs = correct_esp_by
        if withfloats:
            for reg in xmm_reg_mgr_cls.save_around_call_regs:
                ofs -= 8
                mc.MOVSD_sx(ofs, reg.value)
        for reg in gpr_reg_mgr_cls.save_around_call_regs:
            ofs -= WORD
            mc.MOV_sr(ofs, reg.value)
        #
        # pass the object (pushed by the caller) as the single argument
        if IS_X86_32:
            mc.MOV_rs(eax.value, (frame_size - 1) * WORD)
            mc.MOV_sr(0, eax.value)
        elif IS_X86_64:
            mc.MOV_rs(edi.value, (frame_size - 1) * WORD)
        mc.CALL(imm(func))
        #
        if withcards:
            # A final TEST8 before the RET, for the caller.  Careful to
            # not follow this instruction with another one that changes
            # the status of the CPU flags!
            mc.MOV_rs(eax.value, (frame_size - 1) * WORD)
            mc.TEST8(addr_add_const(eax, descr.jit_wb_if_flag_byteofs),
                     imm(-0x80))
        #
        # restore the saved registers in the same layout as above
        ofs = correct_esp_by
        if withfloats:
            for reg in xmm_reg_mgr_cls.save_around_call_regs:
                ofs -= 8
                mc.MOVSD_xs(reg.value, ofs)
        for reg in gpr_reg_mgr_cls.save_around_call_regs:
            ofs -= WORD
            mc.MOV_rs(reg.value, ofs)
        #
        # ADD esp, correct_esp_by --- but cannot use ADD, because
        # of its effects on the CPU flags
        mc.LEA_rs(esp.value, correct_esp_by)
        mc.RET16_i(WORD)
        #
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.wb_slowpath[withcards + 2 * withfloats] = rawstart
    @staticmethod
    @rgc.no_collect
    def _release_gil_asmgcc(css):
        # Link the 'css' frame-data block into the global gcrootanchor
        # chain, then release the GIL via the 'before' hook.
        # similar to trackgcroot.py:pypy_asm_stackwalk, first part
        from rpython.rtyper.memory.gctransform import asmgcroot
        new = rffi.cast(asmgcroot.ASM_FRAMEDATA_HEAD_PTR, css)
        next = asmgcroot.gcrootanchor.next
        new.next = next
        new.prev = asmgcroot.gcrootanchor
        asmgcroot.gcrootanchor.next = new
        next.prev = new
        # and now release the GIL
        before = rffi.aroundstate.before
        if before:
            before()
    @staticmethod
    @rgc.no_collect
    def _reacquire_gil_asmgcc(css):
        # Reacquire the GIL via the 'after' hook, then unlink the 'css'
        # frame-data block from the global gcrootanchor chain.
        # first reacquire the GIL
        after = rffi.aroundstate.after
        if after:
            after()
        # similar to trackgcroot.py:pypy_asm_stackwalk, second part
        from rpython.rtyper.memory.gctransform import asmgcroot
        old = rffi.cast(asmgcroot.ASM_FRAMEDATA_HEAD_PTR, css)
        prev = old.prev
        next = old.next
        prev.next = next
        next.prev = prev
- @staticmethod
- @rgc.no_collect
- def _release_gil_shadowstack():
- before = rffi.aroundstate.before
- if before:
- before()
- @staticmethod
- @rgc.no_collect
- def _reacquire_gil_shadowstack():
- after = rffi.aroundstate.after
- if after:
- after()
    # low-level function-pointer types for the GIL helpers above
    _NOARG_FUNC = lltype.Ptr(lltype.FuncType([], lltype.Void))
    _CLOSESTACK_FUNC = lltype.Ptr(lltype.FuncType([rffi.LONGP],
                                                  lltype.Void))
- def _build_release_gil(self, gcrootmap):
- if gcrootmap.is_shadow_stack:
- releasegil_func = llhelper(self._NOARG_FUNC,
- self._release_gil_shadowstack)
- reacqgil_func = llhelper(self._NOARG_FUNC,
- self._reacquire_gil_shadowstack)
- else:
- releasegil_func = llhelper(self._CLOSESTACK_FUNC,
- self._release_gil_asmgcc)
- reacqgil_func = llhelper(self._CLOSESTACK_FUNC,
- self._reacquire_gil_asmgcc)
- self.releasegil_addr = self.cpu.cast_ptr_to_int(releasegil_func)
- self.reacqgil_addr = self.cpu.cast_ptr_to_int(reacqgil_func)
    def assemble_loop(self, loopname, inputargs, operations, looptoken, log):
        '''Assemble the given loop and return an AsmInfo.

        Adds the following attributes to looptoken:
               _x86_function_addr   (address of the generated func, as an int)
               _x86_loop_code       (debug: addr of the start of the ResOps)
               _x86_fullsize        (debug: full size including failure)
               _x86_debug_checksum
        '''
        # XXX this function is too longish and contains some code
        # duplication with assemble_bridge().  Also, we should think
        # about not storing on 'self' attributes that will live only
        # for the duration of compiling one loop or a one bridge.
        clt = CompiledLoopToken(self.cpu, looptoken.number)
        clt.allgcrefs = []
        looptoken.compiled_loop_token = clt
        if not we_are_translated():
            # Arguments should be unique
            assert len(set(inputargs)) == len(inputargs)
        self.setup(looptoken)
        if log:
            operations = self._inject_debugging_code(looptoken, operations,
                                                     'e', looptoken.number)
        regalloc = RegAlloc(self, self.cpu.translate_support_code)
        #
        # function prologue, followed by a placeholder LEA patched below
        self._call_header_with_stack_check()
        stackadjustpos = self._patchable_stackadjust()
        clt._debug_nbargs = len(inputargs)
        operations = regalloc.prepare_loop(inputargs, operations,
                                           looptoken, clt.allgcrefs)
        looppos = self.mc.get_relative_pos()
        looptoken._x86_loop_code = looppos
        clt.frame_depth = -1     # temporarily
        frame_depth = self._assemble(regalloc, operations)
        clt.frame_depth = frame_depth
        #
        size_excluding_failure_stuff = self.mc.get_relative_pos()
        self.write_pending_failure_recoveries()
        full_size = self.mc.get_relative_pos()
        #
        rawstart = self.materialize_loop(looptoken)
        debug_start("jit-backend-addr")
        debug_print("Loop %d (%s) has address %x to %x (bootstrap %x)" % (
            looptoken.number, loopname,
            rawstart + looppos,
            rawstart + size_excluding_failure_stuff,
            rawstart))
        debug_stop("jit-backend-addr")
        # now that frame_depth is known, fix the stack-adjusting LEA
        self._patch_stackadjust(rawstart + stackadjustpos, frame_depth)
        self.patch_pending_failure_recoveries(rawstart)
        #
        ops_offset = self.mc.ops_offset
        if not we_are_translated():
            # used only by looptoken.dump() -- useful in tests
            looptoken._x86_rawstart = rawstart
            looptoken._x86_fullsize = full_size
            looptoken._x86_ops_offset = ops_offset
        looptoken._x86_function_addr = rawstart
        self.fixup_target_tokens(rawstart)
        self.teardown()
        # oprofile support
        if self.cpu.profile_agent is not None:
            name = "Loop # %s: %s" % (looptoken.number, loopname)
            self.cpu.profile_agent.native_code_written(name,
                                                       rawstart, full_size)
        return AsmInfo(ops_offset, rawstart + looppos,
                       size_excluding_failure_stuff - looppos)
    def assemble_bridge(self, faildescr, inputargs, operations,
                        original_loop_token, log):
        """Assemble a bridge starting from the guard 'faildescr', patch
        the guard's jump to point to it, and return an AsmInfo."""
        if not we_are_translated():
            # Arguments should be unique
            assert len(set(inputargs)) == len(inputargs)
        descr_number = self.cpu.get_fail_descr_number(faildescr)
        failure_recovery = self._find_failure_recovery_bytecode(faildescr)
        self.setup(original_loop_token)
        if log:
            operations = self._inject_debugging_code(faildescr, operations,
                                                     'b', descr_number)
        # reconstruct where the guard left its failargs
        arglocs = self.rebuild_faillocs_from_descr(failure_recovery)
        if not we_are_translated():
            assert ([loc.assembler() for loc in arglocs] ==
                    [loc.assembler() for loc in faildescr._x86_debug_faillocs])
        regalloc = RegAlloc(self, self.cpu.translate_support_code)
        startpos = self.mc.get_relative_pos()
        operations = regalloc.prepare_bridge(inputargs, arglocs,
                                             operations,
                                             self.current_clt.allgcrefs)
        stackadjustpos = self._patchable_stackadjust()
        frame_depth = self._assemble(regalloc, operations)
        codeendpos = self.mc.get_relative_pos()
        self.write_pending_failure_recoveries()
        fullsize = self.mc.get_relative_pos()
        #
        rawstart = self.materialize_loop(original_loop_token)
        debug_start("jit-backend-addr")
        debug_print("bridge out of Guard %d has address %x to %x" %
                    (descr_number, rawstart, rawstart + codeendpos))
        debug_stop("jit-backend-addr")
        self._patch_stackadjust(rawstart + stackadjustpos, frame_depth)
        self.patch_pending_failure_recoveries(rawstart)
        if not we_are_translated():
            # for the benefit of tests
            faildescr._x86_bridge_frame_depth = frame_depth
        # patch the jump from original guard
        self.patch_jump_for_descr(faildescr, rawstart)
        ops_offset = self.mc.ops_offset
        self.fixup_target_tokens(rawstart)
        self.current_clt.frame_depth = max(self.current_clt.frame_depth, frame_depth)
        self.teardown()
        # oprofile support
        if self.cpu.profile_agent is not None:
            name = "Bridge # %s" % (descr_number,)
            self.cpu.profile_agent.native_code_written(name,
                                                       rawstart, fullsize)
        return AsmInfo(ops_offset, startpos + rawstart, codeendpos - startpos)
    def write_pending_failure_recoveries(self):
        # for each pending guard, generate the code of the recovery stub
        # at the end of self.mc.
        for tok in self.pending_guard_tokens:
            tok.pos_recovery_stub = self.generate_quick_failure(tok)
        # on 64-bit, also emit the shared MemoryError trampoline if any
        # JZ from the malloc fast path needs it
        if WORD == 8 and len(self.pending_memoryerror_trampoline_from) > 0:
            self.error_trampoline_64 = self.generate_propagate_error_64()
    def patch_pending_failure_recoveries(self, rawstart):
        # after we wrote the assembler to raw memory, set up
        # tok.faildescr._x86_adr_jump_offset to contain the raw address of
        # the 4-byte target field in the JMP/Jcond instruction, and patch
        # the field in question to point (initially) to the recovery stub
        clt = self.current_clt
        for tok in self.pending_guard_tokens:
            addr = rawstart + tok.pos_jump_offset
            tok.faildescr._x86_adr_jump_offset = addr
            relative_target = tok.pos_recovery_stub - (tok.pos_jump_offset + 4)
            assert rx86.fits_in_32bits(relative_target)
            #
            if not tok.is_guard_not_invalidated:
                # write the rel32 displacement into the emitted JMP/Jcond
                mc = codebuf.MachineCodeBlockWrapper()
                mc.writeimm32(relative_target)
                mc.copy_to_raw_memory(addr)
            else:
                # GUARD_NOT_INVALIDATED, record an entry in
                # clt.invalidate_positions of the form:
                #     (addr-in-the-code-of-the-not-yet-written-jump-target,
                #      relative-target-to-use)
                relpos = tok.pos_jump_offset
                clt.invalidate_positions.append((rawstart + relpos,
                                                 relative_target))
                # General idea: Although no code was generated by this
                # guard, the code might be patched with a "JMP rel32" to
                # the guard recovery code.  This recovery code is
                # already generated, and looks like the recovery code
                # for any guard, even if at first it has no jump to it.
                # So we may later write 5 bytes overriding the existing
                # instructions; this works because a CALL instruction
                # would also take at least 5 bytes.  If it could take
                # less, we would run into the issue that overwriting the
                # 5 bytes here might get a few nonsense bytes at the
                # return address of the following CALL.
        if WORD == 8:
            # patch each pending JZ so it targets the shared 64-bit
            # MemoryError trampoline written at the end of the code
            for pos_after_jz in self.pending_memoryerror_trampoline_from:
                assert self.error_trampoline_64 != 0     # only if non-empty
                mc = codebuf.MachineCodeBlockWrapper()
                mc.writeimm32(self.error_trampoline_64 - pos_after_jz)
                mc.copy_to_raw_memory(rawstart + pos_after_jz - 4)
- def get_asmmemmgr_blocks(self, looptoken):
- clt = looptoken.compiled_loop_token
- if clt.asmmemmgr_blocks is None:
- clt.asmmemmgr_blocks = []
- return clt.asmmemmgr_blocks
- def materialize_loop(self, looptoken):
- self.datablockwrapper.done() # finish using cpu.asmmemmgr
- self.datablockwrapper = None
- allblocks = self.get_asmmemmgr_blocks(looptoken)
- return self.mc.materialize(self.cpu.asmmemmgr, allblocks,
- self.cpu.gc_ll_descr.gcrootmap)
- def _register_counter(self, tp, number, token):
- # YYY very minor leak -- we need the counters to stay alive
- # forever, just because we want to report them at the end
- # of the process
- struct = lltype.malloc(DEBUG_COUNTER, flavor='raw',
- track_allocation=False)
- struct.i = 0
- struct.type = tp
- if tp == 'b' or tp == 'e':
- struct.number = number
- else:
- assert token
- struct.number = compute_unique_id(token)
- self.loop_run_counters.append(struct)
- return struct
    def _find_failure_recovery_bytecode(self, faildescr):
        """Follow the guard's patched JMP/Jcond to its recovery stub and
        return the address of the failure-recovery bytecode that follows
        the stub's CALL instruction."""
        adr_jump_offset = faildescr._x86_adr_jump_offset
        if adr_jump_offset == 0:
            # This case should be prevented by the logic in compile.py:
            # look for CNT_BUSY_FLAG, which disables tracing from a guard
            # when another tracing from the same guard is already in progress.
            raise BridgeAlreadyCompiled
        # follow the JMP/Jcond
        p = rffi.cast(rffi.INTP, adr_jump_offset)
        adr_target = adr_jump_offset + 4 + rffi.cast(lltype.Signed, p[0])
        # skip the CALL
        if WORD == 4:
            adr_target += 5     # CALL imm
        else:
            adr_target += 13    # MOV r11, imm-as-8-bytes; CALL *r11 xxxxxxxxxx
        return adr_target
    def patch_jump_for_descr(self, faildescr, adr_new_target):
        """Redirect the guard identified by 'faildescr' so that failing
        it jumps straight to 'adr_new_target' (the freshly compiled
        bridge) instead of the recovery stub."""
        adr_jump_offset = faildescr._x86_adr_jump_offset
        assert adr_jump_offset != 0
        offset = adr_new_target - (adr_jump_offset + 4)
        # If the new target fits within a rel32 of the jump, just patch
        # that. Otherwise, leave the original rel32 to the recovery stub in
        # place, but clobber the recovery stub with a jump to the real
        # target.
        mc = codebuf.MachineCodeBlockWrapper()
        if rx86.fits_in_32bits(offset):
            mc.writeimm32(offset)
            mc.copy_to_raw_memory(adr_jump_offset)
        else:
            # "mov r11, addr; jmp r11" is up to 13 bytes, which fits in there
            # because we always write "mov r11, imm-as-8-bytes; call *r11" in
            # the first place.
            mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
            mc.JMP_r(X86_64_SCRATCH_REG.value)
            p = rffi.cast(rffi.INTP, adr_jump_offset)
            adr_target = adr_jump_offset + 4 + rffi.cast(lltype.Signed, p[0])
            mc.copy_to_raw_memory(adr_target)
        faildescr._x86_adr_jump_offset = 0    # means "patched"
- def fixup_target_tokens(self, rawstart):
- for targettoken in self.target_tokens_currently_compiling:
- targettoken._x86_loop_code += rawstart
- self.target_tokens_currently_compiling = None
- def _append_debugging_code(self, operations, tp, number, token):
- counter = self._register_counter(tp, number, token)
- c_adr = ConstInt(rffi.cast(lltype.Signed, counter))
- box = BoxInt()
- box2 = BoxInt()
- ops = [ResOperation(rop.GETFIELD_RAW, [c_adr],
- box, descr=self.debug_counter_descr),
- ResOperation(rop.INT_ADD, [box, ConstInt(1)], box2),
- ResOperation(rop.SETFIELD_RAW, [c_adr, box2],
- None, descr=self.debug_counter_descr)]
- operations.extend(ops)
    @specialize.argtype(1)
    def _inject_debugging_code(self, looptoken, operations, tp, number):
        """If debugging is on, return a copy of 'operations' with
        counter-increment ops inserted at the entry and after each LABEL;
        otherwise return 'operations' unchanged."""
        if self._debug:
            # record a checksum of the operation list, for tests
            s = 0
            for op in operations:
                s += op.getopnum()
            looptoken._x86_debug_checksum = s

            newoperations = []
            self._append_debugging_code(newoperations, tp, number,
                                        None)
            for op in operations:
                newoperations.append(op)
                if op.getopnum() == rop.LABEL:
                    # each label gets its own counter, keyed by its descr
                    self._append_debugging_code(newoperations, 'l', number,
                                                op.getdescr())
            operations = newoperations
        return operations
    def _assemble(self, regalloc, operations):
        """Run the register allocator over 'operations' (which emits the
        machine code) and return the required frame depth in words."""
        self._regalloc = regalloc
        regalloc.compute_hint_frame_locations(operations)
        regalloc.walk_operations(operations)
        if we_are_translated() or self.cpu.dont_keepalive_stuff:
            self._regalloc = None   # else keep it around for debugging
        frame_depth = regalloc.get_final_frame_depth()
        jump_target_descr = regalloc.jump_target_descr
        if jump_target_descr is not None:
            # jumping to another loop: our frame must be at least as deep
            target_frame_depth = jump_target_descr._x86_clt.frame_depth
            frame_depth = max(frame_depth, target_frame_depth)
        return frame_depth
    def _patchable_stackadjust(self):
        # stack adjustment LEA; emitted with a 0 displacement, patched
        # later by _patch_stackadjust() once frame_depth is known
        self.mc.LEA32_rb(esp.value, 0)
        # return the position of the 4-byte displacement to patch
        return self.mc.get_relative_pos() - 4
    def _patch_stackadjust(self, adr_lea, allocated_depth):
        # patch stack adjustment LEA
        mc = codebuf.MachineCodeBlockWrapper()
        # Compute the correct offset for the instruction LEA ESP, [EBP-4*words]
        mc.writeimm32(self._get_offset_of_ebp_from_esp(allocated_depth))
        mc.copy_to_raw_memory(adr_lea)
- def _get_offset_of_ebp_from_esp(self, allocated_depth):
- # Given that [EBP] is where we saved EBP, i.e. in the last word
- # of our fixed frame, then the 'words' value is:
- words = (FRAME_FIXED_SIZE - 1) + allocated_depth
- # align, e.g. for Mac OS X
- aligned_words = align_stack_words(words+2)-2 # 2 = EIP+EBP
- return -WORD * aligned_words
    def _call_header(self):
        # NB. the shape of the frame is hard-coded in get_basic_shape() too.
        # Also, make sure this is consistent with FRAME_FIXED_SIZE.
        self.mc.PUSH_r(ebp.value)
        self.mc.MOV_rr(ebp.value, esp.value)
        # save all callee-saved registers
        for loc in self.cpu.CALLEE_SAVE_REGISTERS:
            self.mc.PUSH_r(loc.value)

        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        if gcrootmap and gcrootmap.is_shadow_stack:
            self._call_header_shadowstack(gcrootmap)
    def _call_header_with_stack_check(self):
        """Emit the stack-depth check (calling the slow path if the stack
        is too deep) followed by the regular function prologue."""
        if self.stack_check_slowpath == 0:
            pass                # no stack check (e.g. not translated)
        else:
            endaddr, lengthaddr, _ = self.cpu.insert_stack_check()
            self.mc.MOV(eax, heap(endaddr))             # MOV eax, [start]
            self.mc.SUB(eax, esp)                       # SUB eax, current
            self.mc.CMP(eax, heap(lengthaddr))          # CMP eax, [length]
            self.mc.J_il8(rx86.Conditions['BE'], 0)     # JBE .skip
            jb_location = self.mc.get_relative_pos()
            self.mc.CALL(imm(self.stack_check_slowpath))# CALL slowpath
            # patch the JB above                        # .skip:
            offset = self.mc.get_relative_pos() - jb_location
            assert 0 < offset <= 127
            self.mc.overwrite(jb_location-1, chr(offset))
            #
        self._call_header()
    def _call_footer(self):
        """Emit the function epilogue: unwind the shadowstack entry (if
        any), restore callee-saved registers and EBP, then RET."""
        # point ESP just below the saved callee-save registers
        self.mc.LEA_rb(esp.value, -len(self.cpu.CALLEE_SAVE_REGISTERS) * WORD)

        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        if gcrootmap and gcrootmap.is_shadow_stack:
            self._call_footer_shadowstack(gcrootmap)

        # pop them back in reverse order of the pushes in _call_header()
        for i in range(len(self.cpu.CALLEE_SAVE_REGISTERS)-1, -1, -1):
            self.mc.POP_r(self.cpu.CALLEE_SAVE_REGISTERS[i].value)

        self.mc.POP_r(ebp.value)
        self.mc.RET()
    def _call_header_shadowstack(self, gcrootmap):
        # we need to put two words into the shadowstack: the MARKER_FRAME
        # and the address of the frame (ebp, actually)
        rst = gcrootmap.get_root_stack_top_addr()
        if rx86.fits_in_32bits(rst):
            self.mc.MOV_rj(eax.value, rst)            # MOV eax, [rootstacktop]
        else:
            # 64-bit address: go through r13
            self.mc.MOV_ri(r13.value, rst)            # MOV r13, rootstacktop
            self.mc.MOV_rm(eax.value, (r13.value, 0)) # MOV eax, [r13]
        #
        MARKER = gcrootmap.MARKER_FRAME
        self.mc.LEA_rm(ebx.value, (eax.value, 2*WORD))  # LEA ebx, [eax+2*WORD]
        self.mc.MOV_mi((eax.value, WORD), MARKER)       # MOV [eax+WORD], MARKER
        self.mc.MOV_mr((eax.value, 0), ebp.value)       # MOV [eax], ebp
        #
        # store back the new (incremented) root stack top
        if rx86.fits_in_32bits(rst):
            self.mc.MOV_jr(rst, ebx.value)            # MOV [rootstacktop], ebx
        else:
            self.mc.MOV_mr((r13.value, 0), ebx.value) # MOV [r13], ebx
    def _call_footer_shadowstack(self, gcrootmap):
        # pop the two words pushed by _call_header_shadowstack()
        rst = gcrootmap.get_root_stack_top_addr()
        if rx86.fits_in_32bits(rst):
            self.mc.SUB_ji8(rst, 2*WORD)       # SUB [rootstacktop], 2*WORD
        else:
            self.mc.MOV_ri(ebx.value, rst)           # MOV ebx, rootstacktop
            self.mc.SUB_mi8((ebx.value, 0), 2*WORD)  # SUB [ebx], 2*WORD
    def redirect_call_assembler(self, oldlooptoken, newlooptoken):
        """Make future calls to 'oldlooptoken' run the code of
        'newlooptoken' by overwriting the old entry with a JMP."""
        # some minimal sanity checking
        old_nbargs = oldlooptoken.compiled_loop_token._debug_nbargs
        new_nbargs = newlooptoken.compiled_loop_token._debug_nbargs
        assert old_nbargs == new_nbargs
        # we overwrite the instructions at the old _x86_direct_bootstrap_code
        # to start with a JMP to the new _x86_direct_bootstrap_code.
        # Ideally we should rather patch all existing CALLs, but well.
        oldadr = oldlooptoken._x86_function_addr
        target = newlooptoken._x86_function_addr
        mc = codebuf.MachineCodeBlockWrapper()
        mc.JMP(imm(target))
        if WORD == 4:         # keep in sync with prepare_loop()
            assert mc.get_relative_pos() == 5
        else:
            assert mc.get_relative_pos() <= 13
        mc.copy_to_raw_memory(oldadr)
def dump(self, text):
    """Debugging helper: if self.verbose, write the current offset in the
    code buffer together with 'text' to stderr."""
    if not self.verbose:
        return
    saved_display = Box._extended_display
    try:
        # temporarily disable the extended display of Boxes; restored below
        Box._extended_display = False
        pos = self.mc.get_relative_pos()
        sys.stderr.write(' 0x%x %s\n' % (pos, text))
    finally:
        Box._extended_display = saved_display
- # ------------------------------------------------------------
def mov(self, from_loc, to_loc):
    """Move a value between two locations; use MOVSD whenever either side
    is an xmm register, plain MOV otherwise."""
    src_is_xmm = isinstance(from_loc, RegLoc) and from_loc.is_xmm
    dst_is_xmm = isinstance(to_loc, RegLoc) and to_loc.is_xmm
    if src_is_xmm or dst_is_xmm:
        self.mc.MOVSD(to_loc, from_loc)
    else:
        # ebp is the frame pointer and must never be overwritten here
        assert to_loc is not ebp
        self.mc.MOV(to_loc, from_loc)
regalloc_mov = mov # legacy interface
def regalloc_push(self, loc):
    # Push the value at 'loc' onto the machine stack.
    if isinstance(loc, RegLoc) and loc.is_xmm:
        # xmm registers cannot be PUSHed directly: make room for a
        # double and store it with MOVSD
        self.mc.SUB_ri(esp.value, 8)   # = size of doubles
        self.mc.MOVSD_sx(0, loc.value)
    elif WORD == 4 and isinstance(loc, StackLoc) and loc.get_width() == 8:
        # XXX evil trick
        # on 32-bit, push a 64-bit stack slot as two 32-bit halves
        # (high half first, so it ends up at the higher address)
        self.mc.PUSH_b(loc.value + 4)
        self.mc.PUSH_b(loc.value)
    else:
        self.mc.PUSH(loc)
def regalloc_pop(self, loc):
    # Pop the top of the machine stack into 'loc'.  Exact mirror of
    # regalloc_push().
    if isinstance(loc, RegLoc) and loc.is_xmm:
        # xmm registers cannot be POPped directly: MOVSD out of the
        # stack top, then release the 8 bytes
        self.mc.MOVSD_xs(loc.value, 0)
        self.mc.ADD_ri(esp.value, 8)   # = size of doubles
    elif WORD == 4 and isinstance(loc, StackLoc) and loc.get_width() == 8:
        # XXX evil trick
        # on 32-bit, pop a 64-bit stack slot as two 32-bit halves
        # (low half first: reverse order of the pushes)
        self.mc.POP_b(loc.value)
        self.mc.POP_b(loc.value + 4)
    else:
        self.mc.POP(loc)
def regalloc_immedmem2mem(self, from_loc, to_loc):
    # move a ConstFloatLoc directly to a StackLoc, as two MOVs
    # (even on x86-64, because the immediates are encoded as 32 bits)
    assert isinstance(from_loc, ConstFloatLoc)
    assert isinstance(to_loc, StackLoc)
    # read the two 32-bit halves of the double stored at from_loc.value
    low_part = rffi.cast(rffi.CArrayPtr(rffi.INT), from_loc.value)[0]
    high_part = rffi.cast(rffi.CArrayPtr(rffi.INT), from_loc.value)[1]
    # force both halves to regular (sign-extended) machine ints so they
    # are valid 32-bit immediates
    low_part = intmask(low_part)
    high_part = intmask(high_part)
    self.mc.MOV32_bi(to_loc.value, low_part)
    self.mc.MOV32_bi(to_loc.value + 4, high_part)
def regalloc_perform(self, op, arglocs, resloc):
    """Emit the machine code for a plain (non-guard) operation."""
    handler = genop_list[op.getopnum()]
    handler(self, op, arglocs, resloc)
def regalloc_perform_discard(self, op, arglocs):
    """Emit the machine code for an operation that produces no result."""
    handler = genop_discard_list[op.getopnum()]
    handler(self, op, arglocs)
def regalloc_perform_llong(self, op, arglocs, resloc):
    """Emit the machine code for a longlong operation, dispatched on the
    oopspec index found in the call descr's extra info."""
    extrainfo = op.getdescr().get_extra_info()
    handler = genop_llong_list[extrainfo.oopspecindex]
    handler(self, op, arglocs, resloc)
def regalloc_perform_math(self, op, arglocs, resloc):
    """Emit the machine code for a math operation, dispatched on the
    oopspec index found in the call descr's extra info."""
    extrainfo = op.getdescr().get_extra_info()
    handler = genop_math_list[extrainfo.oopspecindex]
    handler(self, op, arglocs, resloc)
def regalloc_perform_with_guard(self, op, guard_op, faillocs,
                                arglocs, resloc):
    # Emit the machine code for 'op' merged with its following guard
    # 'guard_op'.  If op is None, only the guard itself is emitted
    # (see regalloc_perform_guard()).
    faildescr = guard_op.getdescr()
    assert isinstance(faildescr, AbstractFailDescr)
    failargs = guard_op.getfailargs()
    guard_opnum = guard_op.getopnum()
    # create the guard_token describing the recovery path; the chosen
    # genop_guard_ implementation is responsible for registering it
    guard_token = self.implement_guard_recovery(guard_opnum,
                                                faildescr, failargs,
                                                faillocs)
    if op is None:
        # stand-alone guard: dispatch on the guard's own opnum
        dispatch_opnum = guard_opnum
    else:
        dispatch_opnum = op.getopnum()
    genop_guard_list[dispatch_opnum](self, op, guard_op, guard_token,
                                     arglocs, resloc)
    if not we_are_translated():
        # must be added by the genop_guard_list[]()
        assert guard_token is self.pending_guard_tokens[-1]
def regalloc_perform_guard(self, guard_op, faillocs, arglocs, resloc):
    """Emit a stand-alone guard, i.e. one not merged with a preceding
    operation (op=None)."""
    self.regalloc_perform_with_guard(None, guard_op, faillocs,
                                     arglocs, resloc)
def load_effective_addr(self, sizereg, baseofs, scale, result, frm=imm0):
    """Emit a LEA computing the effective address built by addr_add()
    from 'frm', the scaled index 'sizereg' and offset 'baseofs'."""
    effective = addr_add(frm, sizereg, baseofs, scale)
    self.mc.LEA(result, effective)
def _unaryop(asmop):
    """Build a genop function emitting the one-operand instruction
    named 'asmop' on the first (and only) argument location."""
    def genop_unary(self, op, arglocs, resloc):
        emit = getattr(self.mc, asmop)
        emit(arglocs[0])
    return genop_unary
def _binaryop(asmop, can_swap=False):
    """Build a genop function emitting the two-operand instruction
    named 'asmop'.  'can_swap' is accepted from the call sites but is
    not used by the emitter itself."""
    def genop_binary(self, op, arglocs, result_loc):
        emit = getattr(self.mc, asmop)
        emit(arglocs[0], arglocs[1])
    return genop_binary
def _binaryop_or_lea(asmop, is_add):
    """Build a genop that emits the instruction 'asmop' when the result
    goes back into the first operand, and a single LEA when the result
    register differs (leaving the source register untouched)."""
    def genop_binary_or_lea(self, op, arglocs, result_loc):
        base_loc = arglocs[0]
        delta_loc = arglocs[1]
        if result_loc is base_loc:
            # in-place: a regular ADD or SUB
            getattr(self.mc, asmop)(base_loc, delta_loc)
            return
        # different result register: LEA result, [base +/- immediate]
        assert isinstance(base_loc, RegLoc)
        assert isinstance(delta_loc, ImmedLoc)
        assert isinstance(result_loc, RegLoc)
        offset = delta_loc.value
        if not is_add:    # subtraction
            offset = -offset
        self.mc.LEA_rm(result_loc.value, (base_loc.value, offset))
    return genop_binary_or_lea
def _cmpop(cond, rev_cond):
    """Build a genop for an integer comparison producing a 0/1 result.

    When the first argument of the operation is a Const, the register
    allocator has put the operands in swapped order in 'arglocs', so the
    reversed condition code is checked instead."""
    def genop_cmp(self, op, arglocs, result_loc):
        res8 = result_loc.lowest8bits()
        if isinstance(op.getarg(0), Const):
            self.mc.CMP(arglocs[1], arglocs[0])
            condname = rev_cond
        else:
            self.mc.CMP(arglocs[0], arglocs[1])
            condname = cond
        self.mc.SET_ir(rx86.Conditions[condname], res8.value)
        # zero-extend the 8-bit flag into the full result register
        self.mc.MOVZX8_rr(result_loc.value, res8.value)
    return genop_cmp
def _cmpop_float(cond, rev_cond, is_ne=False):
    # Build a genop for a float comparison producing a 0/1 int result.
    # If arglocs[0] is not a register, the operands arrive swapped from
    # the register allocator and the reversed condition is checked.
    def genop_cmp(self, op, arglocs, result_loc):
        if isinstance(arglocs[0], RegLoc):
            self.mc.UCOMISD(arglocs[0], arglocs[1])
            checkcond = cond
        else:
            self.mc.UCOMISD(arglocs[1], arglocs[0])
            checkcond = rev_cond
        tmp1 = result_loc.lowest8bits()
        if IS_X86_32:
            tmp2 = result_loc.higher8bits()
        elif IS_X86_64:
            # presumably the second 8-bit part is not addressable on the
            # REX-encoded registers, so use the scratch register instead
            # — TODO confirm against regloc
            tmp2 = X86_64_SCRATCH_REG.lowest8bits()
        self.mc.SET_ir(rx86.Conditions[checkcond], tmp1.value)
        # UCOMISD sets the parity flag on an unordered result (a NaN
        # operand): 'float_ne' must answer True in that case, all other
        # comparisons must answer False
        if is_ne:
            self.mc.SET_ir(rx86.Conditions['P'], tmp2.value)
            self.mc.OR8_rr(tmp1.value, tmp2.value)
        else:
            self.mc.SET_ir(rx86.Conditions['NP'], tmp2.value)
            self.mc.AND8_rr(tmp1.value, tmp2.value)
        # zero-extend the combined 8-bit flag into the result register
        self.mc.MOVZX8_rr(result_loc.value, tmp1.value)
    return genop_cmp
def _cmpop_guard(cond, rev_cond, false_cond, false_rev_cond):
    """Build a genop for an integer comparison merged with its guard.

    If the first argument of the operation is a Const the operands were
    swapped by the register allocator, so the reversed condition codes
    are used."""
    def genop_cmp_guard(self, op, guard_op, guard_token, arglocs, result_loc):
        if isinstance(op.getarg(0), Const):
            self.mc.CMP(arglocs[1], arglocs[0])
            use_cond = rev_cond
            use_false_cond = false_rev_cond
        else:
            self.mc.CMP(arglocs[0], arglocs[1])
            use_cond = cond
            use_false_cond = false_cond
        if guard_op.getopnum() == rop.GUARD_FALSE:
            # guard fails (jumps to the recovery stub) when the
            # condition holds
            self.implement_guard(guard_token, use_cond)
        else:
            # GUARD_TRUE: fail when the condition does not hold
            self.implement_guard(guard_token, use_false_cond)
    return genop_cmp_guard
def _cmpop_guard_float(cond, rev_cond, false_cond, false_rev_cond):
    # Build a genop for a float comparison merged with its guard.
    # UCOMISD sets the parity flag on an unordered result (NaN operand);
    # an extra JP test is emitted for condition codes that would
    # otherwise be confused by it — presumably the 'A'-based conditions
    # already treat the parity case correctly.
    need_direct_jp = 'A' not in cond
    need_rev_jp = 'A' not in rev_cond
    def genop_cmp_guard_float(self, op, guard_op, guard_token, arglocs,
                              result_loc):
        guard_opnum = guard_op.getopnum()
        if isinstance(arglocs[0], RegLoc):
            self.mc.UCOMISD(arglocs[0], arglocs[1])
            checkcond = cond
            checkfalsecond = false_cond
            need_jp = need_direct_jp
        else:
            # operands swapped by the regalloc: use reversed conditions
            self.mc.UCOMISD(arglocs[1], arglocs[0])
            checkcond = rev_cond
            checkfalsecond = false_rev_cond
            need_jp = need_rev_jp
        # NOTE(review): the 8-bit jump displacements below (2, 5, 6)
        # assume fixed encoded sizes for the skipped instructions —
        # keep in sync with implement_guard
        if guard_opnum == rop.GUARD_FALSE:
            if need_jp:
                # unordered (NaN): skip over the guard jump
                self.mc.J_il8(rx86.Conditions['P'], 6)
            self.implement_guard(guard_token, checkcond)
        else:
            if need_jp:
                # unordered (NaN): jump over the next J_il8, i.e. fail
                self.mc.J_il8(rx86.Conditions['P'], 2)
                # condition holds: skip over the unconditional guard jump
                self.mc.J_il8(rx86.Conditions[checkcond], 5)
                self.implement_guard(guard_token)
            else:
                self.implement_guard(guard_token, checkfalsecond)
    return genop_cmp_guard_float
def _emit_call(self, force_index, x, arglocs, start=0, tmp=eax,
               argtypes=None, callconv=FFI_DEFAULT_ABI):
    # Emit a CALL to the location 'x', passing 'arglocs[start:]' on the
    # machine stack.  'tmp' is a scratch register for non-register
    # arguments.  On 64-bit, delegate to the register-based convention.
    if IS_X86_64:
        return self._emit_call_64(force_index, x, arglocs, start, argtypes)
    # first pass (32-bit): store the arguments that already live in
    # registers into their outgoing stack slots; 'p' tracks the byte
    # offset of each argument
    p = 0
    n = len(arglocs)
    for i in range(start, n):
        loc = arglocs[i]
        if isinstance(loc, RegLoc):
            if loc.is_xmm:
                self.mc.MOVSD_sx(p, loc.value)
            else:
                self.mc.MOV_sr(p, loc.value)
        p += loc.get_width()
    # second pass: the remaining arguments, moved through xmm0 (floats)
    # or 'tmp' (everything else)
    p = 0
    for i in range(start, n):
        loc = arglocs[i]
        if not isinstance(loc, RegLoc):
            if loc.get_width() == 8:
                self.mc.MOVSD(xmm0, loc)
                self.mc.MOVSD_sx(p, xmm0.value)
            else:
                self.mc.MOV(tmp, loc)
                self.mc.MOV_sr(p, tmp.value)
        p += loc.get_width()
    # x is a location
    self.mc.CALL(x)
    self.mark_gc_roots(force_index)
    #
    if callconv != FFI_DEFAULT_ABI:
        # callee-pops conventions (stdcall) need esp re-adjusted
        self._fix_stdcall(callconv, p)
    #
    # 'p' bytes of stack were used for outgoing arguments: make sure the
    # register allocator reserves at least that much frame space
    self._regalloc.needed_extra_stack_locations(p//WORD)
def _fix_stdcall(self, callconv, p):
    # After a stdcall (callee-pops) call: the callee removed the 'p'
    # argument bytes from the stack, but our frame layout expects esp to
    # stay fixed across the call.
    from rpython.rlib.clibffi import FFI_STDCALL
    assert callconv == FFI_STDCALL
    # it's a bit stupid, but we're just going to cancel the fact that
    # the called function just added 'p' to ESP, by subtracting it again.
    self.mc.SUB_ri(esp.value, p)
- def _emit_call_64(self, force_in…
Large files files are truncated, but you can click here to view the full file