
/pypy/jit/backend/x86/assembler.py

http://github.com/pypy/pypy


import sys, os
from pypy.jit.backend.llsupport import symbolic
from pypy.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
from pypy.jit.metainterp.history import Const, Box, BoxInt, ConstInt
from pypy.jit.metainterp.history import AbstractFailDescr, INT, REF, FLOAT
from pypy.jit.metainterp.history import JitCellToken
from pypy.rpython.lltypesystem import lltype, rffi, rstr, llmemory
from pypy.rpython.lltypesystem.lloperation import llop
from pypy.rpython.annlowlevel import llhelper
from pypy.rlib.jit import AsmInfo
from pypy.jit.backend.model import CompiledLoopToken
from pypy.jit.backend.x86.regalloc import (RegAlloc, get_ebp_ofs, _get_scale,
                                           gpr_reg_mgr_cls, _valid_addressing_size)
from pypy.jit.backend.x86.arch import (FRAME_FIXED_SIZE, FORCE_INDEX_OFS, WORD,
                                       IS_X86_32, IS_X86_64)
from pypy.jit.backend.x86.regloc import (eax, ecx, edx, ebx,
                                         esp, ebp, esi, edi,
                                         xmm0, xmm1, xmm2, xmm3,
                                         xmm4, xmm5, xmm6, xmm7,
                                         r8, r9, r10, r11,
                                         r12, r13, r14, r15,
                                         X86_64_SCRATCH_REG,
                                         X86_64_XMM_SCRATCH_REG,
                                         RegLoc, StackLoc, ConstFloatLoc,
                                         ImmedLoc, AddressLoc, imm,
                                         imm0, imm1, FloatImmedLoc)
from pypy.rlib.objectmodel import we_are_translated, specialize
from pypy.jit.backend.x86 import rx86, regloc, codebuf
from pypy.jit.metainterp.resoperation import rop, ResOperation
from pypy.jit.backend.x86.support import values_array
from pypy.jit.backend.x86 import support
from pypy.rlib.debug import (debug_print, debug_start, debug_stop,
                             have_debug_prints)
from pypy.rlib import rgc
from pypy.rlib.clibffi import FFI_DEFAULT_ABI
from pypy.jit.backend.x86.jump import remap_frame_layout
from pypy.jit.codewriter.effectinfo import EffectInfo
from pypy.jit.codewriter import longlong
from pypy.rlib.rarithmetic import intmask
from pypy.rlib.objectmodel import compute_unique_id

# darwin requires the stack to be 16 bytes aligned on calls. Same for gcc 4.5.0,
# better safe than sorry
CALL_ALIGN = 16 // WORD

def align_stack_words(words):
    return (words + CALL_ALIGN - 1) & ~(CALL_ALIGN-1)


class GuardToken(object):
    def __init__(self, faildescr, failargs, fail_locs, exc,
                 is_guard_not_invalidated):
        self.faildescr = faildescr
        self.failargs = failargs
        self.fail_locs = fail_locs
        self.exc = exc
        self.is_guard_not_invalidated = is_guard_not_invalidated

DEBUG_COUNTER = lltype.Struct('DEBUG_COUNTER', ('i', lltype.Signed),
                              ('type', lltype.Char), # 'b'ridge, 'l'abel or
                                                     # 'e'ntry point
                              ('number', lltype.Signed))


class Assembler386(object):
    _regalloc = None
    _output_loop_log = None

    def __init__(self, cpu, translate_support_code=False,
                 failargs_limit=1000):
        self.cpu = cpu
        self.verbose = False
        self.rtyper = cpu.rtyper
        self.fail_boxes_int = values_array(lltype.Signed, failargs_limit)
        self.fail_boxes_ptr = values_array(llmemory.GCREF, failargs_limit)
        self.fail_boxes_float = values_array(longlong.FLOATSTORAGE,
                                             failargs_limit)
        self.fail_ebp = 0
        self.loop_run_counters = []
        self.float_const_neg_addr = 0
        self.float_const_abs_addr = 0
        self.malloc_slowpath1 = 0
        self.malloc_slowpath2 = 0
        self.memcpy_addr = 0
        self.setup_failure_recovery()
        self._debug = False
        self.debug_counter_descr = cpu.fielddescrof(DEBUG_COUNTER, 'i')
        self.fail_boxes_count = 0
        self.datablockwrapper = None
        self.stack_check_slowpath = 0
        self.propagate_exception_path = 0
        self.gcrootmap_retaddr_forced = 0
        self.teardown()
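
    # Hook called when execution leaves jitted code: the fail_boxes_ptr
    # array was filled directly from assembler, so tell the GC that this
    # raw array may now contain pointers to young objects.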
    def leave_jitted_hook(self):
        ptrs = self.fail_boxes_ptr.ar
        llop.gc_assume_young_pointers(lltype.Void,
                                      llmemory.cast_ptr_to_adr(ptrs))

    def set_debug(self, v):
        self._debug = v

    def setup_once(self):
        # the address of the function called by 'new'
        gc_ll_descr = self.cpu.gc_ll_descr
        gc_ll_descr.initialize()
        self.memcpy_addr = self.cpu.cast_ptr_to_int(support.memcpy_fn)
        self._build_failure_recovery(False)
        self._build_failure_recovery(True)
        if self.cpu.supports_floats:
            self._build_failure_recovery(False, withfloats=True)
            self._build_failure_recovery(True, withfloats=True)
            support.ensure_sse2_floats()
            self._build_float_constants()
        self._build_propagate_exception_path()
        if gc_ll_descr.get_malloc_slowpath_addr is not None:
            self._build_malloc_slowpath()
        self._build_stack_check_slowpath()
        if gc_ll_descr.gcrootmap:
            self._build_release_gil(gc_ll_descr.gcrootmap)
        debug_start('jit-backend-counts')
        self.set_debug(have_debug_prints())
        debug_stop('jit-backend-counts')

    def setup(self, looptoken):
        assert self.memcpy_addr != 0, "setup_once() not called?"
        self.current_clt = looptoken.compiled_loop_token
        self.pending_guard_tokens = []
        if WORD == 8:
            self.pending_memoryerror_trampoline_from = []
            self.error_trampoline_64 = 0
        self.mc = codebuf.MachineCodeBlockWrapper()
        #assert self.datablockwrapper is None --- but obscure case
        # possible, e.g. getting MemoryError and continuing
        allblocks = self.get_asmmemmgr_blocks(looptoken)
        self.datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr,
                                                        allblocks)
        self.target_tokens_currently_compiling = {}

    def teardown(self):
        self.pending_guard_tokens = None
        if WORD == 8:
            self.pending_memoryerror_trampoline_from = None
        self.mc = None
        self.current_clt = None

    def finish_once(self):
        if self._debug:
            debug_start('jit-backend-counts')
            for i in range(len(self.loop_run_counters)):
                struct = self.loop_run_counters[i]
                if struct.type == 'l':
                    prefix = 'TargetToken(%d)' % struct.number
                elif struct.type == 'b':
                    prefix = 'bridge ' + str(struct.number)
                else:
                    prefix = 'entry ' + str(struct.number)
                debug_print(prefix + ':' + str(struct.i))
            debug_stop('jit-backend-counts')
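
    # Build a 16-byte-aligned data block holding the two bit masks used by
    # float negation and absolute value: a quadword with only the sign bit
    # set, and a quadword with every bit except the sign bit set.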
    def _build_float_constants(self):
        datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr, [])
        float_constants = datablockwrapper.malloc_aligned(32, alignment=16)
        datablockwrapper.done()
        addr = rffi.cast(rffi.CArrayPtr(lltype.Char), float_constants)
        qword_padding = '\x00\x00\x00\x00\x00\x00\x00\x00'
        # 0x8000000000000000
        neg_const = '\x00\x00\x00\x00\x00\x00\x00\x80'
        # 0x7FFFFFFFFFFFFFFF
        abs_const = '\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F'
        data = neg_const + qword_padding + abs_const + qword_padding
        for i in range(len(data)):
            addr[i] = data[i]
        self.float_const_neg_addr = float_constants
        self.float_const_abs_addr = float_constants + 16

    def _build_malloc_slowpath(self):
        # With asmgcc, we need two helpers, so that we can write two CALL
        # instructions in assembler, with a mark_gc_roots in between.
        # With shadowstack, this is not needed, so we produce a single helper.
        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        shadow_stack = (gcrootmap is not None and gcrootmap.is_shadow_stack)
        #
        # ---------- first helper for the slow path of malloc ----------
        mc = codebuf.MachineCodeBlockWrapper()
        if self.cpu.supports_floats:          # save the XMM registers in
            for i in range(self.cpu.NUM_REGS):# the *caller* frame, from esp+8
                mc.MOVSD_sx((WORD*2)+8*i, i)
        mc.SUB_rr(edx.value, eax.value)       # compute the size we want
        addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
        #
        # The registers to save in the copy area: with shadowstack, most
        # registers need to be saved.  With asmgcc, the callee-saved registers
        # don't need to.
        save_in_copy_area = gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items()
        if not shadow_stack:
            save_in_copy_area = [(reg, ofs) for (reg, ofs) in save_in_copy_area
                   if reg not in gpr_reg_mgr_cls.REGLOC_TO_GCROOTMAP_REG_INDEX]
        #
        for reg, ofs in save_in_copy_area:
            mc.MOV_br(ofs, reg.value)
        #
        if shadow_stack:
            # ---- shadowstack ----
            mc.SUB_ri(esp.value, 16 - WORD)      # stack alignment of 16 bytes
            if IS_X86_32:
                mc.MOV_sr(0, edx.value)          # push argument
            elif IS_X86_64:
                mc.MOV_rr(edi.value, edx.value)
            mc.CALL(imm(addr))
            mc.ADD_ri(esp.value, 16 - WORD)
        else:
            # ---- asmgcc ----
            if IS_X86_32:
                mc.MOV_sr(WORD, edx.value)       # save it as the new argument
            elif IS_X86_64:
                # rdi can be clobbered: its content was saved in the
                # copy area of the stack
                mc.MOV_rr(edi.value, edx.value)
            mc.JMP(imm(addr))                    # tail call to the real malloc
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.malloc_slowpath1 = rawstart
        # ---------- second helper for the slow path of malloc ----------
        mc = codebuf.MachineCodeBlockWrapper()
        #
        for reg, ofs in save_in_copy_area:
            mc.MOV_rb(reg.value, ofs)
            assert reg is not eax and reg is not edx
        #
        if self.cpu.supports_floats:          # restore the XMM registers
            for i in range(self.cpu.NUM_REGS):# from where they were saved
                mc.MOVSD_xs(i, (WORD*2)+8*i)
        #
        # Note: we check this after the code above, just because the code
        # above is more than 127 bytes on 64-bits...
        mc.TEST_rr(eax.value, eax.value)
        mc.J_il8(rx86.Conditions['Z'], 0) # patched later
        jz_location = mc.get_relative_pos()
        #
        nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
        mc.MOV(edx, heap(nursery_free_adr))   # load this in EDX
        mc.RET()
        #
        # If the slowpath malloc failed, we raise a MemoryError that
        # always interrupts the current loop, as a "good enough"
        # approximation.  Also note that we didn't RET from this helper;
        # but the code we jump to will actually restore the stack
        # position based on EBP, which will get us out of here for free.
        offset = mc.get_relative_pos() - jz_location
        assert 0 < offset <= 127
        mc.overwrite(jz_location-1, chr(offset))
        mc.JMP(imm(self.propagate_exception_path))
        #
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.malloc_slowpath2 = rawstart
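
    # Out-of-line path used to propagate a MemoryError: save the exception,
    # put propagate_exception_v into eax and return to the caller of the
    # jitted function through the standard footer.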
    def _build_propagate_exception_path(self):
        if self.cpu.propagate_exception_v < 0:
            return      # not supported (for tests, or non-translated)
        #
        self.mc = codebuf.MachineCodeBlockWrapper()
        # call on_leave_jitted_save_exc()
        addr = self.cpu.get_on_leave_jitted_int(save_exception=True,
                                                default_to_memoryerror=True)
        self.mc.CALL(imm(addr))
        self.mc.MOV_ri(eax.value, self.cpu.propagate_exception_v)
        self._call_footer()
        rawstart = self.mc.materialize(self.cpu.asmmemmgr, [])
        self.propagate_exception_path = rawstart
        self.mc = None

    def _build_stack_check_slowpath(self):
        _, _, slowpathaddr = self.cpu.insert_stack_check()
        if slowpathaddr == 0 or self.cpu.propagate_exception_v < 0:
            return      # no stack check (for tests, or non-translated)
        #
        # make a "function" that is called immediately at the start of
        # an assembler function.  In particular, the stack looks like:
        #
        #    |  ...                 |    <-- aligned to a multiple of 16
        #    |  retaddr of caller   |
        #    |  my own retaddr      |    <-- esp
        #    +----------------------+
        #
        mc = codebuf.MachineCodeBlockWrapper()
        #
        stack_size = WORD
        if IS_X86_64:
            # on the x86_64, we have to save all the registers that may
            # have been used to pass arguments
            stack_size += 6*WORD + 8*8
            for reg in [edi, esi, edx, ecx, r8, r9]:
                mc.PUSH_r(reg.value)
            mc.SUB_ri(esp.value, 8*8)
            for i in range(8):
                mc.MOVSD_sx(8*i, i)     # xmm0 to xmm7
        #
        if IS_X86_32:
            stack_size += 2*WORD
            mc.PUSH_r(eax.value)        # alignment
            mc.PUSH_r(esp.value)
        elif IS_X86_64:
            mc.MOV_rr(edi.value, esp.value)
        #
        # esp is now aligned to a multiple of 16 again
        mc.CALL(imm(slowpathaddr))
        #
        mc.MOV(eax, heap(self.cpu.pos_exception()))
        mc.TEST_rr(eax.value, eax.value)
        mc.J_il8(rx86.Conditions['NZ'], 0)
        jnz_location = mc.get_relative_pos()
        #
        if IS_X86_32:
            mc.ADD_ri(esp.value, 2*WORD)    # cancel the two PUSHes above
        elif IS_X86_64:
            # restore the registers
            for i in range(7, -1, -1):
                mc.MOVSD_xs(i, 8*i)
            mc.ADD_ri(esp.value, 8*8)
            for reg in [r9, r8, ecx, edx, esi, edi]:
                mc.POP_r(reg.value)
        #
        mc.RET()
        #
        # patch the JNZ above
        offset = mc.get_relative_pos() - jnz_location
        assert 0 < offset <= 127
        mc.overwrite(jnz_location-1, chr(offset))
        # call on_leave_jitted_save_exc()
        addr = self.cpu.get_on_leave_jitted_int(save_exception=True)
        mc.CALL(imm(addr))
        #
        mc.MOV_ri(eax.value, self.cpu.propagate_exception_v)
        #
        # footer -- note the ADD, which skips the return address of this
        # function, and will instead return to the caller's caller.  Note
        # also that we completely ignore the saved arguments, because we
        # are interrupting the function.
        mc.ADD_ri(esp.value, stack_size)
        mc.RET()
        #
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.stack_check_slowpath = rawstart
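
    # Static helpers used around calls that release the GIL.  The asmgcc
    # variants additionally link/unlink the frame data block ('css') into
    # the chain of frames rooted at asmgcroot.gcrootanchor, so that the
    # GC's stack walker can still find the frame while the GIL is released.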
    @staticmethod
    @rgc.no_collect
    def _release_gil_asmgcc(css):
        # similar to trackgcroot.py:pypy_asm_stackwalk, first part
        from pypy.rpython.memory.gctransform import asmgcroot
        new = rffi.cast(asmgcroot.ASM_FRAMEDATA_HEAD_PTR, css)
        next = asmgcroot.gcrootanchor.next
        new.next = next
        new.prev = asmgcroot.gcrootanchor
        asmgcroot.gcrootanchor.next = new
        next.prev = new
        # and now release the GIL
        before = rffi.aroundstate.before
        if before:
            before()

    @staticmethod
    @rgc.no_collect
    def _reacquire_gil_asmgcc(css):
        # first reacquire the GIL
        after = rffi.aroundstate.after
        if after:
            after()
        # similar to trackgcroot.py:pypy_asm_stackwalk, second part
        from pypy.rpython.memory.gctransform import asmgcroot
        old = rffi.cast(asmgcroot.ASM_FRAMEDATA_HEAD_PTR, css)
        prev = old.prev
        next = old.next
        prev.next = next
        next.prev = prev

    @staticmethod
    @rgc.no_collect
    def _release_gil_shadowstack():
        before = rffi.aroundstate.before
        if before:
            before()

    @staticmethod
    @rgc.no_collect
    def _reacquire_gil_shadowstack():
        after = rffi.aroundstate.after
        if after:
            after()

    _NOARG_FUNC = lltype.Ptr(lltype.FuncType([], lltype.Void))
    _CLOSESTACK_FUNC = lltype.Ptr(lltype.FuncType([rffi.LONGP],
                                                  lltype.Void))

    def _build_release_gil(self, gcrootmap):
        if gcrootmap.is_shadow_stack:
            releasegil_func = llhelper(self._NOARG_FUNC,
                                       self._release_gil_shadowstack)
            reacqgil_func = llhelper(self._NOARG_FUNC,
                                     self._reacquire_gil_shadowstack)
        else:
            releasegil_func = llhelper(self._CLOSESTACK_FUNC,
                                       self._release_gil_asmgcc)
            reacqgil_func = llhelper(self._CLOSESTACK_FUNC,
                                     self._reacquire_gil_asmgcc)
        self.releasegil_addr = self.cpu.cast_ptr_to_int(releasegil_func)
        self.reacqgil_addr = self.cpu.cast_ptr_to_int(reacqgil_func)

    def assemble_loop(self, loopname, inputargs, operations, looptoken, log):
        '''adds the following attributes to looptoken:
               _x86_function_addr   (address of the generated func, as an int)
               _x86_loop_code       (debug: addr of the start of the ResOps)
               _x86_fullsize        (debug: full size including failure)
               _x86_debug_checksum
        '''
        # XXX this function is too longish and contains some code
        # duplication with assemble_bridge().  Also, we should think
        # about not storing on 'self' attributes that will live only
        # for the duration of compiling one loop or one bridge.
        clt = CompiledLoopToken(self.cpu, looptoken.number)
        clt.allgcrefs = []
        looptoken.compiled_loop_token = clt
        if not we_are_translated():
            # Arguments should be unique
            assert len(set(inputargs)) == len(inputargs)
        self.setup(looptoken)
        if log:
            operations = self._inject_debugging_code(looptoken, operations,
                                                     'e', looptoken.number)
        regalloc = RegAlloc(self, self.cpu.translate_support_code)
        #
        self._call_header_with_stack_check()
        stackadjustpos = self._patchable_stackadjust()
        clt._debug_nbargs = len(inputargs)
        operations = regalloc.prepare_loop(inputargs, operations,
                                           looptoken, clt.allgcrefs)
        looppos = self.mc.get_relative_pos()
        looptoken._x86_loop_code = looppos
        clt.frame_depth = -1     # temporarily
        frame_depth = self._assemble(regalloc, operations)
        clt.frame_depth = frame_depth
        #
        size_excluding_failure_stuff = self.mc.get_relative_pos()
        self.write_pending_failure_recoveries()
        full_size = self.mc.get_relative_pos()
        #
        rawstart = self.materialize_loop(looptoken)
        debug_start("jit-backend-addr")
        debug_print("Loop %d (%s) has address %x to %x (bootstrap %x)" % (
            looptoken.number, loopname,
            rawstart + looppos,
            rawstart + size_excluding_failure_stuff,
            rawstart))
        debug_stop("jit-backend-addr")
        self._patch_stackadjust(rawstart + stackadjustpos, frame_depth)
        self.patch_pending_failure_recoveries(rawstart)
        #
        ops_offset = self.mc.ops_offset
        if not we_are_translated():
            # used only by looptoken.dump() -- useful in tests
            looptoken._x86_rawstart = rawstart
            looptoken._x86_fullsize = full_size
            looptoken._x86_ops_offset = ops_offset
        looptoken._x86_function_addr = rawstart
        self.fixup_target_tokens(rawstart)
        self.teardown()
        # oprofile support
        if self.cpu.profile_agent is not None:
            name = "Loop # %s: %s" % (looptoken.number, loopname)
            self.cpu.profile_agent.native_code_written(name,
                                                       rawstart, full_size)
        return AsmInfo(ops_offset, rawstart + looppos,
                       size_excluding_failure_stuff - looppos)
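
    # Like assemble_loop(), but compiles a bridge that continues from the
    # guard described by 'faildescr'; once the code is materialized, the
    # guard's jump is patched to point to it (see patch_jump_for_descr()).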
    def assemble_bridge(self, faildescr, inputargs, operations,
                        original_loop_token, log):
        if not we_are_translated():
            # Arguments should be unique
            assert len(set(inputargs)) == len(inputargs)
        descr_number = self.cpu.get_fail_descr_number(faildescr)
        failure_recovery = self._find_failure_recovery_bytecode(faildescr)
        self.setup(original_loop_token)
        if log:
            operations = self._inject_debugging_code(faildescr, operations,
                                                     'b', descr_number)
        arglocs = self.rebuild_faillocs_from_descr(failure_recovery)
        if not we_are_translated():
            assert ([loc.assembler() for loc in arglocs] ==
                    [loc.assembler() for loc in faildescr._x86_debug_faillocs])
        regalloc = RegAlloc(self, self.cpu.translate_support_code)
        startpos = self.mc.get_relative_pos()
        operations = regalloc.prepare_bridge(inputargs, arglocs,
                                             operations,
                                             self.current_clt.allgcrefs)
        stackadjustpos = self._patchable_stackadjust()
        frame_depth = self._assemble(regalloc, operations)
        codeendpos = self.mc.get_relative_pos()
        self.write_pending_failure_recoveries()
        fullsize = self.mc.get_relative_pos()
        #
        rawstart = self.materialize_loop(original_loop_token)
        debug_start("jit-backend-addr")
        debug_print("bridge out of Guard %d has address %x to %x" %
                    (descr_number, rawstart, rawstart + codeendpos))
        debug_stop("jit-backend-addr")
        self._patch_stackadjust(rawstart + stackadjustpos, frame_depth)
        self.patch_pending_failure_recoveries(rawstart)
        if not we_are_translated():
            # for the benefit of tests
            faildescr._x86_bridge_frame_depth = frame_depth
        # patch the jump from original guard
        self.patch_jump_for_descr(faildescr, rawstart)
        ops_offset = self.mc.ops_offset
        self.fixup_target_tokens(rawstart)
        self.current_clt.frame_depth = max(self.current_clt.frame_depth, frame_depth)
        self.teardown()
        # oprofile support
        if self.cpu.profile_agent is not None:
            name = "Bridge # %s" % (descr_number,)
            self.cpu.profile_agent.native_code_written(name,
                                                       rawstart, fullsize)
        return AsmInfo(ops_offset, startpos + rawstart, codeendpos - startpos)

    def write_pending_failure_recoveries(self):
        # for each pending guard, generate the code of the recovery stub
        # at the end of self.mc.
        for tok in self.pending_guard_tokens:
            tok.pos_recovery_stub = self.generate_quick_failure(tok)
        if WORD == 8 and len(self.pending_memoryerror_trampoline_from) > 0:
            self.error_trampoline_64 = self.generate_propagate_error_64()

    def patch_pending_failure_recoveries(self, rawstart):
        # after we wrote the assembler to raw memory, set up
        # tok.faildescr._x86_adr_jump_offset to contain the raw address of
        # the 4-byte target field in the JMP/Jcond instruction, and patch
        # the field in question to point (initially) to the recovery stub
        clt = self.current_clt
        for tok in self.pending_guard_tokens:
            addr = rawstart + tok.pos_jump_offset
            tok.faildescr._x86_adr_jump_offset = addr
            relative_target = tok.pos_recovery_stub - (tok.pos_jump_offset + 4)
            assert rx86.fits_in_32bits(relative_target)
            #
            if not tok.is_guard_not_invalidated:
                mc = codebuf.MachineCodeBlockWrapper()
                mc.writeimm32(relative_target)
                mc.copy_to_raw_memory(addr)
            else:
                # GUARD_NOT_INVALIDATED, record an entry in
                # clt.invalidate_positions of the form:
                #     (addr-in-the-code-of-the-not-yet-written-jump-target,
                #      relative-target-to-use)
                relpos = tok.pos_jump_offset
                clt.invalidate_positions.append((rawstart + relpos,
                                                 relative_target))
                # General idea: Although no code was generated by this
                # guard, the code might be patched with a "JMP rel32" to
                # the guard recovery code.  This recovery code is
                # already generated, and looks like the recovery code
                # for any guard, even if at first it has no jump to it.
                # So we may later write 5 bytes overriding the existing
                # instructions; this works because a CALL instruction
                # would also take at least 5 bytes.  If it could take
                # less, we would run into the issue that overwriting the
                # 5 bytes here might get a few nonsense bytes at the
                # return address of the following CALL.
        if WORD == 8:
            for pos_after_jz in self.pending_memoryerror_trampoline_from:
                assert self.error_trampoline_64 != 0     # only if non-empty
                mc = codebuf.MachineCodeBlockWrapper()
                mc.writeimm32(self.error_trampoline_64 - pos_after_jz)
                mc.copy_to_raw_memory(rawstart + pos_after_jz - 4)

    def get_asmmemmgr_blocks(self, looptoken):
        clt = looptoken.compiled_loop_token
        if clt.asmmemmgr_blocks is None:
            clt.asmmemmgr_blocks = []
        return clt.asmmemmgr_blocks

    def materialize_loop(self, looptoken):
        self.datablockwrapper.done()      # finish using cpu.asmmemmgr
        self.datablockwrapper = None
        allblocks = self.get_asmmemmgr_blocks(looptoken)
        return self.mc.materialize(self.cpu.asmmemmgr, allblocks,
                                   self.cpu.gc_ll_descr.gcrootmap)

    def _register_counter(self, tp, number, token):
        # YYY very minor leak -- we need the counters to stay alive
        # forever, just because we want to report them at the end
        # of the process
        struct = lltype.malloc(DEBUG_COUNTER, flavor='raw',
                               track_allocation=False)
        struct.i = 0
        struct.type = tp
        if tp == 'b' or tp == 'e':
            struct.number = number
        else:
            assert token
            struct.number = compute_unique_id(token)
        self.loop_run_counters.append(struct)
        return struct
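
    # Recover the address of the failure-recovery code attached to a guard:
    # follow the rel32 target written in its JMP/Jcond instruction, then
    # skip the CALL at the beginning of the recovery stub.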
    def _find_failure_recovery_bytecode(self, faildescr):
        adr_jump_offset = faildescr._x86_adr_jump_offset
        if adr_jump_offset == 0:
            # This case should be prevented by the logic in compile.py:
            # look for CNT_BUSY_FLAG, which disables tracing from a guard
            # when another tracing from the same guard is already in progress.
            raise BridgeAlreadyCompiled
        # follow the JMP/Jcond
        p = rffi.cast(rffi.INTP, adr_jump_offset)
        adr_target = adr_jump_offset + 4 + rffi.cast(lltype.Signed, p[0])
        # skip the CALL
        if WORD == 4:
            adr_target += 5     # CALL imm
        else:
            adr_target += 13    # MOV r11, imm-as-8-bytes; CALL *r11 xxxxxxxxxx
        return adr_target

    def patch_jump_for_descr(self, faildescr, adr_new_target):
        adr_jump_offset = faildescr._x86_adr_jump_offset
        assert adr_jump_offset != 0
        offset = adr_new_target - (adr_jump_offset + 4)
        # If the new target fits within a rel32 of the jump, just patch
        # that. Otherwise, leave the original rel32 to the recovery stub in
        # place, but clobber the recovery stub with a jump to the real
        # target.
        mc = codebuf.MachineCodeBlockWrapper()
        if rx86.fits_in_32bits(offset):
            mc.writeimm32(offset)
            mc.copy_to_raw_memory(adr_jump_offset)
        else:
            # "mov r11, addr; jmp r11" is up to 13 bytes, which fits in there
            # because we always write "mov r11, imm-as-8-bytes; call *r11" in
            # the first place.
            mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
            mc.JMP_r(X86_64_SCRATCH_REG.value)
            p = rffi.cast(rffi.INTP, adr_jump_offset)
            adr_target = adr_jump_offset + 4 + rffi.cast(lltype.Signed, p[0])
            mc.copy_to_raw_memory(adr_target)
        faildescr._x86_adr_jump_offset = 0    # means "patched"
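
    # Once the code has been materialized at 'rawstart', convert the
    # relative positions recorded in the TargetTokens being compiled into
    # absolute addresses.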
    def fixup_target_tokens(self, rawstart):
        for targettoken in self.target_tokens_currently_compiling:
            targettoken._x86_loop_code += rawstart
        self.target_tokens_currently_compiling = None

    def _append_debugging_code(self, operations, tp, number, token):
        counter = self._register_counter(tp, number, token)
        c_adr = ConstInt(rffi.cast(lltype.Signed, counter))
        box = BoxInt()
        box2 = BoxInt()
        ops = [ResOperation(rop.GETFIELD_RAW, [c_adr],
                            box, descr=self.debug_counter_descr),
               ResOperation(rop.INT_ADD, [box, ConstInt(1)], box2),
               ResOperation(rop.SETFIELD_RAW, [c_adr, box2],
                            None, descr=self.debug_counter_descr)]
        operations.extend(ops)

    @specialize.argtype(1)
    def _inject_debugging_code(self, looptoken, operations, tp, number):
        if self._debug:
            # before doing anything, let's increase a counter
            s = 0
            for op in operations:
                s += op.getopnum()
            looptoken._x86_debug_checksum = s
            newoperations = []
            self._append_debugging_code(newoperations, tp, number,
                                        None)
            for op in operations:
                newoperations.append(op)
                if op.getopnum() == rop.LABEL:
                    self._append_debugging_code(newoperations, 'l', number,
                                                op.getdescr())
            operations = newoperations
        return operations

    def _assemble(self, regalloc, operations):
        self._regalloc = regalloc
        regalloc.compute_hint_frame_locations(operations)
        regalloc.walk_operations(operations)
        if we_are_translated() or self.cpu.dont_keepalive_stuff:
            self._regalloc = None   # else keep it around for debugging
        frame_depth = regalloc.get_final_frame_depth()
        jump_target_descr = regalloc.jump_target_descr
        if jump_target_descr is not None:
            target_frame_depth = jump_target_descr._x86_clt.frame_depth
            frame_depth = max(frame_depth, target_frame_depth)
        return frame_depth

    def _patchable_stackadjust(self):
        # stack adjustment LEA
        self.mc.LEA32_rb(esp.value, 0)
        return self.mc.get_relative_pos() - 4

    def _patch_stackadjust(self, adr_lea, allocated_depth):
        # patch stack adjustment LEA
        mc = codebuf.MachineCodeBlockWrapper()
        # Compute the correct offset for the instruction LEA ESP, [EBP-4*words]
        mc.writeimm32(self._get_offset_of_ebp_from_esp(allocated_depth))
        mc.copy_to_raw_memory(adr_lea)

    def _get_offset_of_ebp_from_esp(self, allocated_depth):
        # Given that [EBP] is where we saved EBP, i.e. in the last word
        # of our fixed frame, then the 'words' value is:
        words = (FRAME_FIXED_SIZE - 1) + allocated_depth
        # align, e.g. for Mac OS X
        aligned_words = align_stack_words(words+2)-2 # 2 = EIP+EBP
        return -WORD * aligned_words
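
    # Prologue and epilogue of the generated functions: save/restore ebp and
    # the callee-saved registers, optionally run the stack check first, and
    # (with a shadowstack GC) push/pop the frame marker on the GC root stack.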
    def _call_header(self):
        # NB. the shape of the frame is hard-coded in get_basic_shape() too.
        # Also, make sure this is consistent with FRAME_FIXED_SIZE.
        self.mc.PUSH_r(ebp.value)
        self.mc.MOV_rr(ebp.value, esp.value)
        for loc in self.cpu.CALLEE_SAVE_REGISTERS:
            self.mc.PUSH_r(loc.value)
        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        if gcrootmap and gcrootmap.is_shadow_stack:
            self._call_header_shadowstack(gcrootmap)

    def _call_header_with_stack_check(self):
        if self.stack_check_slowpath == 0:
            pass                # no stack check (e.g. not translated)
        else:
            endaddr, lengthaddr, _ = self.cpu.insert_stack_check()
            self.mc.MOV(eax, heap(endaddr))             # MOV eax, [start]
            self.mc.SUB(eax, esp)                       # SUB eax, current
            self.mc.CMP(eax, heap(lengthaddr))          # CMP eax, [length]
            self.mc.J_il8(rx86.Conditions['BE'], 0)     # JBE .skip
            jb_location = self.mc.get_relative_pos()
            self.mc.CALL(imm(self.stack_check_slowpath))# CALL slowpath
            # patch the JB above                        # .skip:
            offset = self.mc.get_relative_pos() - jb_location
            assert 0 < offset <= 127
            self.mc.overwrite(jb_location-1, chr(offset))
            #
        self._call_header()

    def _call_footer(self):
        self.mc.LEA_rb(esp.value, -len(self.cpu.CALLEE_SAVE_REGISTERS) * WORD)
        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        if gcrootmap and gcrootmap.is_shadow_stack:
            self._call_footer_shadowstack(gcrootmap)
        for i in range(len(self.cpu.CALLEE_SAVE_REGISTERS)-1, -1, -1):
            self.mc.POP_r(self.cpu.CALLEE_SAVE_REGISTERS[i].value)
        self.mc.POP_r(ebp.value)
        self.mc.RET()

    def _call_header_shadowstack(self, gcrootmap):
        # we need to put two words into the shadowstack: the MARKER_FRAME
        # and the address of the frame (ebp, actually)
        rst = gcrootmap.get_root_stack_top_addr()
        if rx86.fits_in_32bits(rst):
            self.mc.MOV_rj(eax.value, rst)              # MOV eax, [rootstacktop]
        else:
            self.mc.MOV_ri(r13.value, rst)              # MOV r13, rootstacktop
            self.mc.MOV_rm(eax.value, (r13.value, 0))   # MOV eax, [r13]
        #
        MARKER = gcrootmap.MARKER_FRAME
        self.mc.LEA_rm(ebx.value, (eax.value, 2*WORD))  # LEA ebx, [eax+2*WORD]
        self.mc.MOV_mi((eax.value, WORD), MARKER)       # MOV [eax+WORD], MARKER
        self.mc.MOV_mr((eax.value, 0), ebp.value)       # MOV [eax], ebp
        #
        if rx86.fits_in_32bits(rst):
            self.mc.MOV_jr(rst, ebx.value)              # MOV [rootstacktop], ebx
        else:
            self.mc.MOV_mr((r13.value, 0), ebx.value)   # MOV [r13], ebx

    def _call_footer_shadowstack(self, gcrootmap):
        rst = gcrootmap.get_root_stack_top_addr()
        if rx86.fits_in_32bits(rst):
            self.mc.SUB_ji8(rst, 2*WORD)                # SUB [rootstacktop], 2*WORD
        else:
            self.mc.MOV_ri(ebx.value, rst)              # MOV ebx, rootstacktop
            self.mc.SUB_mi8((ebx.value, 0), 2*WORD)     # SUB [ebx], 2*WORD

    def redirect_call_assembler(self, oldlooptoken, newlooptoken):
        # some minimal sanity checking
        old_nbargs = oldlooptoken.compiled_loop_token._debug_nbargs
        new_nbargs = newlooptoken.compiled_loop_token._debug_nbargs
        assert old_nbargs == new_nbargs
        # we overwrite the instructions at the old _x86_direct_bootstrap_code
        # to start with a JMP to the new _x86_direct_bootstrap_code.
        # Ideally we should rather patch all existing CALLs, but well.
        oldadr = oldlooptoken._x86_function_addr
        target = newlooptoken._x86_function_addr
        mc = codebuf.MachineCodeBlockWrapper()
        mc.JMP(imm(target))
        if WORD == 4:         # keep in sync with prepare_loop()
            assert mc.get_relative_pos() == 5
        else:
            assert mc.get_relative_pos() <= 13
        mc.copy_to_raw_memory(oldadr)

    def dump(self, text):
        if not self.verbose:
            return
        _prev = Box._extended_display
        try:
            Box._extended_display = False
            pos = self.mc.get_relative_pos()
            print >> sys.stderr, ' 0x%x  %s' % (pos, text)
        finally:
            Box._extended_display = _prev

    # ------------------------------------------------------------
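
    # Callbacks used by the register allocator to move values between
    # locations (registers, stack slots, immediates).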
    def mov(self, from_loc, to_loc):
        if (isinstance(from_loc, RegLoc) and from_loc.is_xmm) or (isinstance(to_loc, RegLoc) and to_loc.is_xmm):
            self.mc.MOVSD(to_loc, from_loc)
        else:
            assert to_loc is not ebp
            self.mc.MOV(to_loc, from_loc)

    regalloc_mov = mov # legacy interface

    def regalloc_push(self, loc):
        if isinstance(loc, RegLoc) and loc.is_xmm:
            self.mc.SUB_ri(esp.value, 8)   # = size of doubles
            self.mc.MOVSD_sx(0, loc.value)
        elif WORD == 4 and isinstance(loc, StackLoc) and loc.get_width() == 8:
            # XXX evil trick
            self.mc.PUSH_b(loc.value + 4)
            self.mc.PUSH_b(loc.value)
        else:
            self.mc.PUSH(loc)

    def regalloc_pop(self, loc):
        if isinstance(loc, RegLoc) and loc.is_xmm:
            self.mc.MOVSD_xs(loc.value, 0)
            self.mc.ADD_ri(esp.value, 8)   # = size of doubles
        elif WORD == 4 and isinstance(loc, StackLoc) and loc.get_width() == 8:
            # XXX evil trick
            self.mc.POP_b(loc.value)
            self.mc.POP_b(loc.value + 4)
        else:
            self.mc.POP(loc)

    def regalloc_immedmem2mem(self, from_loc, to_loc):
        # move a ConstFloatLoc directly to a StackLoc, as two MOVs
        # (even on x86-64, because the immediates are encoded as 32 bits)
        assert isinstance(from_loc, ConstFloatLoc)
        assert isinstance(to_loc, StackLoc)
        low_part = rffi.cast(rffi.CArrayPtr(rffi.INT), from_loc.value)[0]
        high_part = rffi.cast(rffi.CArrayPtr(rffi.INT), from_loc.value)[1]
        low_part = intmask(low_part)
        high_part = intmask(high_part)
        self.mc.MOV32_bi(to_loc.value, low_part)
        self.mc.MOV32_bi(to_loc.value + 4, high_part)
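
    # Dispatch a single ResOperation to its genop_* implementation, indexed
    # by opnum (or, for the long-long and math helpers, by the call's
    # oopspecindex).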
    def regalloc_perform(self, op, arglocs, resloc):
        genop_list[op.getopnum()](self, op, arglocs, resloc)

    def regalloc_perform_discard(self, op, arglocs):
        genop_discard_list[op.getopnum()](self, op, arglocs)

    def regalloc_perform_llong(self, op, arglocs, resloc):
        effectinfo = op.getdescr().get_extra_info()
        oopspecindex = effectinfo.oopspecindex
        genop_llong_list[oopspecindex](self, op, arglocs, resloc)

    def regalloc_perform_math(self, op, arglocs, resloc):
        effectinfo = op.getdescr().get_extra_info()
        oopspecindex = effectinfo.oopspecindex
        genop_math_list[oopspecindex](self, op, arglocs, resloc)

    def regalloc_perform_with_guard(self, op, guard_op, faillocs,
                                    arglocs, resloc):
        faildescr = guard_op.getdescr()
        assert isinstance(faildescr, AbstractFailDescr)
        failargs = guard_op.getfailargs()
        guard_opnum = guard_op.getopnum()
        guard_token = self.implement_guard_recovery(guard_opnum,
                                                    faildescr, failargs,
                                                    faillocs)
        if op is None:
            dispatch_opnum = guard_opnum
        else:
            dispatch_opnum = op.getopnum()
        genop_guard_list[dispatch_opnum](self, op, guard_op, guard_token,
                                         arglocs, resloc)
        if not we_are_translated():
            # must be added by the genop_guard_list[]()
            assert guard_token is self.pending_guard_tokens[-1]

    def regalloc_perform_guard(self, guard_op, faillocs, arglocs, resloc):
        self.regalloc_perform_with_guard(None, guard_op, faillocs, arglocs,
                                         resloc)

    def load_effective_addr(self, sizereg, baseofs, scale, result, frm=imm0):
        self.mc.LEA(result, addr_add(frm, sizereg, baseofs, scale))
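
    # Factories that build genop_* implementations for whole families of
    # operations: unary ops, binary ops, and integer/float comparisons
    # (with and without an attached guard).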
    def _unaryop(asmop):
        def genop_unary(self, op, arglocs, resloc):
            getattr(self.mc, asmop)(arglocs[0])
        return genop_unary

    def _binaryop(asmop, can_swap=False):
        def genop_binary(self, op, arglocs, result_loc):
            getattr(self.mc, asmop)(arglocs[0], arglocs[1])
        return genop_binary

    def _cmpop(cond, rev_cond):
        def genop_cmp(self, op, arglocs, result_loc):
            rl = result_loc.lowest8bits()
            if isinstance(op.getarg(0), Const):
                self.mc.CMP(arglocs[1], arglocs[0])
                self.mc.SET_ir(rx86.Conditions[rev_cond], rl.value)
            else:
                self.mc.CMP(arglocs[0], arglocs[1])
                self.mc.SET_ir(rx86.Conditions[cond], rl.value)
            self.mc.MOVZX8_rr(result_loc.value, rl.value)
        return genop_cmp

    def _cmpop_float(cond, rev_cond, is_ne=False):
        def genop_cmp(self, op, arglocs, result_loc):
            if isinstance(arglocs[0], RegLoc):
                self.mc.UCOMISD(arglocs[0], arglocs[1])
                checkcond = cond
            else:
                self.mc.UCOMISD(arglocs[1], arglocs[0])
                checkcond = rev_cond
            tmp1 = result_loc.lowest8bits()
            if IS_X86_32:
                tmp2 = result_loc.higher8bits()
            elif IS_X86_64:
                tmp2 = X86_64_SCRATCH_REG.lowest8bits()
            self.mc.SET_ir(rx86.Conditions[checkcond], tmp1.value)
            if is_ne:
                self.mc.SET_ir(rx86.Conditions['P'], tmp2.value)
                self.mc.OR8_rr(tmp1.value, tmp2.value)
            else:
                self.mc.SET_ir(rx86.Conditions['NP'], tmp2.value)
                self.mc.AND8_rr(tmp1.value, tmp2.value)
            self.mc.MOVZX8_rr(result_loc.value, tmp1.value)
        return genop_cmp

    def _cmpop_guard(cond, rev_cond, false_cond, false_rev_cond):
        def genop_cmp_guard(self, op, guard_op, guard_token, arglocs, result_loc):
            guard_opnum = guard_op.getopnum()
            if isinstance(op.getarg(0), Const):
                self.mc.CMP(arglocs[1], arglocs[0])
                if guard_opnum == rop.GUARD_FALSE:
                    self.implement_guard(guard_token, rev_cond)
                else:
                    self.implement_guard(guard_token, false_rev_cond)
            else:
                self.mc.CMP(arglocs[0], arglocs[1])
                if guard_opnum == rop.GUARD_FALSE:
                    self.implement_guard(guard_token, cond)
                else:
                    self.implement_guard(guard_token, false_cond)
        return genop_cmp_guard

    def _cmpop_guard_float(cond, rev_cond, false_cond, false_rev_cond):
        need_direct_jp = 'A' not in cond
        need_rev_jp = 'A' not in rev_cond
        def genop_cmp_guard_float(self, op, guard_op, guard_token, arglocs,
                                  result_loc):
            guard_opnum = guard_op.getopnum()
            if isinstance(arglocs[0], RegLoc):
                self.mc.UCOMISD(arglocs[0], arglocs[1])
                checkcond = cond
                checkfalsecond = false_cond
                need_jp = need_direct_jp
            else:
                self.mc.UCOMISD(arglocs[1], arglocs[0])
                checkcond = rev_cond
                checkfalsecond = false_rev_cond
                need_jp = need_rev_jp
            if guard_opnum == rop.GUARD_FALSE:
                if need_jp:
                    self.mc.J_il8(rx86.Conditions['P'], 6)
                self.implement_guard(guard_token, checkcond)
            else:
                if need_jp:
                    self.mc.J_il8(rx86.Conditions['P'], 2)
                    self.mc.J_il8(rx86.Conditions[checkcond], 5)
                    self.implement_guard(guard_token)
                else:
                    self.implement_guard(guard_token, checkfalsecond)
        return genop_cmp_guard_float
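
    # Emit a CALL: place the arguments according to the calling convention
    # (pushed on the stack on x86-32, in registers first on x86-64), emit
    # the CALL itself, and record the GC roots live across it.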
    def _emit_call(self, force_index, x, arglocs, start=0, tmp=eax,
                   argtypes=None, callconv=FFI_DEFAULT_ABI):
        if IS_X86_64:
            return self._emit_call_64(force_index, x, arglocs, start, argtypes)
        p = 0
        n = len(arglocs)
        for i in range(start, n):
            loc = arglocs[i]
            if isinstance(loc, RegLoc):
                if loc.is_xmm:
                    self.mc.MOVSD_sx(p, loc.value)
                else:
                    self.mc.MOV_sr(p, loc.value)
            p += loc.get_width()
        p = 0
        for i in range(start, n):
            loc = arglocs[i]
            if not isinstance(loc, RegLoc):
                if loc.get_width() == 8:
                    self.mc.MOVSD(xmm0, loc)
                    self.mc.MOVSD_sx(p, xmm0.value)
                else:
                    self.mc.MOV(tmp, loc)
                    self.mc.MOV_sr(p, tmp.value)
            p += loc.get_width()
        # x is a location
        self.mc.CALL(x)
        self.mark_gc_roots(force_index)
        #
        if callconv != FFI_DEFAULT_ABI:
            self._fix_stdcall(callconv, p)
        #
        self._regalloc.needed_extra_stack_locations(p//WORD)

    def _fix_stdcall(self, callconv, p):
        from pypy.rlib.clibffi import FFI_STDCALL
        assert callconv == FFI_STDCALL
        # it's a bit stupid, but we're just going to cancel the fact that
        # the called function just added 'p' to ESP, by subtracting it again.
        self.mc.SUB_ri(esp.value, p)
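
    # x86-64 (System V) argument passing: the first integer arguments go in
    # edi, esi, edx, ecx, r8, r9 and the first float arguments in xmm0-xmm7;
    # anything left over is passed on the stack.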
    def _emit_call_64(self, force_index, x, arglocs, start, argtypes):
        src_locs = []
        dst_locs = []
        xmm_src_locs = []
        xmm_dst_locs = []
        pass_on_stack = []
        singlefloats = None
        # In reverse order for use with pop()
        unused_gpr = [r9, r8, ecx, edx, esi, edi]
        unused_xmm = [xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0]
        for i in range(start, len(arglocs)):
            loc = arglocs[i]
            # XXX: Should be much simpler to tell whether a location is a
            # float! It's so ugly because we have to "guard" the access to
            # .type with isinstance, since not all AssemblerLocation classes
            # are "typed"
            if ((isinstance(loc, RegLoc) and loc.is_xmm) or
                (isinstance(loc, StackLoc) and loc.type == FLOAT) or
                (isinstance(loc, ConstFloatLoc))):
                if len(unused_xmm) > 0:
                    xmm_src_locs.append(loc)
                    xmm_dst_locs.append(unused_xmm.pop())
                else:
                    pass_on_stack.append(loc)
            elif (argtypes is not None and argtypes[i-start] == 'S' and
                  len(unused_xmm) > 0):
                # Singlefloat argument
                if singlefloats is None: singlefloats = []
                singlefloats.append((loc, unused_xmm.pop()))
            else:
                if len(unused_gpr) > 0:
                    src_locs.append(loc)
                    dst_locs.append(unused_gpr.pop())
                else:
                    pass_on_stack.append(loc)
        # Emit instructions to pass the stack arguments
        # XXX: Would be nice to let remap_frame_layout take care of this, but
        # we'd need to create something like StackLoc, but relative to esp,
        # and I don't know if it's worth it.
        for i in range(len(pass_on_stack)):
            loc = pass_on_stack[i]
            if not isinstance(loc, RegLoc):
                if isinstance(loc, StackLoc) and loc.type == FLOAT:
                    self.mc.MOVSD(X86_64_XMM_SCRATCH_REG, loc)
                    self.mc.MOVSD_sx(i*WORD, X86_64_XMM_SCRATCH_REG.value)
                else:
                    self.mc.MOV(X86_64_SCRATCH_REG, loc)
                    self.mc.MOV_sr(i*WORD, X86_64_SCRATCH_REG.value)
            else:
                # It's a register
                if loc.is_xmm:
                    self.mc.MOVSD_sx(i*WORD, loc.value)
                else:
                    self.mc.MOV_sr(i*WORD, loc.value)
        # Handle register arguments: first remap the xmm arguments
        remap_frame_layout(self, xmm_src_locs, xmm_dst_locs,
                           X86_64_XMM_SCRATCH_REG)
        # Load the singlefloat arguments from main regs or stack to xmm regs
        if singlefloats is not None:
            for src, dst in singlefloats:
                self.mc.MOVD(dst, src)
        # Finally remap the arguments in the main regs
        # If x is a register and is in dst_locs, then oups, it needs to
        # be moved away:
        if x in dst_locs:
            src_locs.append(x)
            dst_locs.append(r10)
            x = r10
        remap_frame_layout(self, src_locs, dst_locs, X86_64_SCRATCH_REG)
        self.mc.CALL(x)
        self.mark_gc_roots(force_index)
        self._regalloc.needed_extra_stack_locations(len(pass_on_stack))

    def call(self, addr, args, res):
        force_index = self.write_new_force_index()
        self._emit_call(force_index, imm(addr), args)
        assert res is eax

    def write_new_force_index(self):
        # for shadowstack only: get a new, unused force_index number and
        # write it to FORCE_INDEX_OFS.  Used to record the call shape
        # (i.e. where the GC pointers are in the stack) around a CALL
        # instruction that doesn't already have a force_index.
        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        if gcrootmap and gcroot

Large files are truncated; the rest of this file is not shown.