
/pypy/jit/backend/x86/assembler.py

https://bitbucket.org/pypy/pypy/
Python | 1499 lines | 1158 code | 142 blank | 199 comment | 220 complexity | MD5: eec55b0a0cc0a1a9af373b7cfa665558
Possible License(s): AGPL-3.0, BSD-3-Clause, Apache-2.0

Large files are truncated; see the repository linked above for the full file.

import sys, os
from pypy.jit.backend.llsupport import symbolic
from pypy.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
from pypy.jit.metainterp.history import Const, Box, BoxInt, ConstInt
from pypy.jit.metainterp.history import AbstractFailDescr, INT, REF, FLOAT
from pypy.jit.metainterp.history import JitCellToken
from pypy.rpython.lltypesystem import lltype, rffi, rstr, llmemory
from pypy.rpython.lltypesystem.lloperation import llop
from pypy.rpython.annlowlevel import llhelper
from pypy.rlib.jit import AsmInfo
from pypy.jit.backend.model import CompiledLoopToken
from pypy.jit.backend.x86.regalloc import (RegAlloc, get_ebp_ofs, _get_scale,
                                           gpr_reg_mgr_cls,
                                           _valid_addressing_size)
from pypy.jit.backend.x86.arch import (FRAME_FIXED_SIZE, FORCE_INDEX_OFS, WORD,
                                       IS_X86_32, IS_X86_64)
from pypy.jit.backend.x86.regloc import (eax, ecx, edx, ebx,
                                         esp, ebp, esi, edi,
                                         xmm0, xmm1, xmm2, xmm3,
                                         xmm4, xmm5, xmm6, xmm7,
                                         r8, r9, r10, r11,
                                         r12, r13, r14, r15,
                                         X86_64_SCRATCH_REG,
                                         X86_64_XMM_SCRATCH_REG,
                                         RegLoc, StackLoc, ConstFloatLoc,
                                         ImmedLoc, AddressLoc, imm,
                                         imm0, imm1, FloatImmedLoc)
from pypy.rlib.objectmodel import we_are_translated, specialize
from pypy.jit.backend.x86 import rx86, regloc, codebuf
from pypy.jit.metainterp.resoperation import rop, ResOperation
from pypy.jit.backend.x86.support import values_array
from pypy.jit.backend.x86 import support
from pypy.rlib.debug import (debug_print, debug_start, debug_stop,
                             have_debug_prints)
from pypy.rlib import rgc
from pypy.rlib.clibffi import FFI_DEFAULT_ABI
from pypy.jit.backend.x86.jump import remap_frame_layout
from pypy.jit.codewriter.effectinfo import EffectInfo
from pypy.jit.codewriter import longlong
from pypy.rlib.rarithmetic import intmask
from pypy.rlib.objectmodel import compute_unique_id

# darwin requires the stack to be 16 bytes aligned on calls. Same for gcc 4.5.0,
# better safe than sorry
CALL_ALIGN = 16 // WORD


def align_stack_words(words):
    return (words + CALL_ALIGN - 1) & ~(CALL_ALIGN-1)
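
# For instance (illustrative values): with WORD == 4, CALL_ALIGN == 4, so
#     align_stack_words(5) == 8      # frame rounded up to 8 words = 32 bytes
#     align_stack_words(8) == 8      # already aligned, unchanged
# and with WORD == 8, CALL_ALIGN == 2, so align_stack_words(5) == 6 words
# = 48 bytes.  Either way the resulting frame size is a multiple of 16
# bytes, which is what Darwin and recent gcc expect at CALL instructions.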

class GuardToken(object):
    def __init__(self, faildescr, failargs, fail_locs, exc,
                 is_guard_not_invalidated):
        self.faildescr = faildescr
        self.failargs = failargs
        self.fail_locs = fail_locs
        self.exc = exc
        self.is_guard_not_invalidated = is_guard_not_invalidated

DEBUG_COUNTER = lltype.Struct('DEBUG_COUNTER', ('i', lltype.Signed),
                              ('type', lltype.Char),  # 'b'ridge, 'l'abel or
                                                      # 'e'ntry point
                              ('number', lltype.Signed))


class Assembler386(object):
    _regalloc = None
    _output_loop_log = None

    def __init__(self, cpu, translate_support_code=False,
                 failargs_limit=1000):
        self.cpu = cpu
        self.verbose = False
        self.rtyper = cpu.rtyper
        self.fail_boxes_int = values_array(lltype.Signed, failargs_limit)
        self.fail_boxes_ptr = values_array(llmemory.GCREF, failargs_limit)
        self.fail_boxes_float = values_array(longlong.FLOATSTORAGE,
                                             failargs_limit)
        self.fail_ebp = 0
        self.loop_run_counters = []
        self.float_const_neg_addr = 0
        self.float_const_abs_addr = 0
        self.malloc_slowpath1 = 0
        self.malloc_slowpath2 = 0
        self.memcpy_addr = 0
        self.setup_failure_recovery()
        self._debug = False
        self.debug_counter_descr = cpu.fielddescrof(DEBUG_COUNTER, 'i')
        self.fail_boxes_count = 0
        self._current_depths_cache = (0, 0)
        self.datablockwrapper = None
        self.stack_check_slowpath = 0
        self.propagate_exception_path = 0
        self.gcrootmap_retaddr_forced = 0
        self.teardown()

    def leave_jitted_hook(self):
        ptrs = self.fail_boxes_ptr.ar
        llop.gc_assume_young_pointers(lltype.Void,
                                      llmemory.cast_ptr_to_adr(ptrs))

    def set_debug(self, v):
        self._debug = v

    def setup_once(self):
        # the address of the function called by 'new'
        gc_ll_descr = self.cpu.gc_ll_descr
        gc_ll_descr.initialize()
        self.memcpy_addr = self.cpu.cast_ptr_to_int(support.memcpy_fn)
        self._build_failure_recovery(False)
        self._build_failure_recovery(True)
        if self.cpu.supports_floats:
            self._build_failure_recovery(False, withfloats=True)
            self._build_failure_recovery(True, withfloats=True)
            support.ensure_sse2_floats()
            self._build_float_constants()
        self._build_propagate_exception_path()
        if gc_ll_descr.get_malloc_slowpath_addr is not None:
            self._build_malloc_slowpath()
        self._build_stack_check_slowpath()
        if gc_ll_descr.gcrootmap:
            self._build_release_gil(gc_ll_descr.gcrootmap)
        debug_start('jit-backend-counts')
        self.set_debug(have_debug_prints())
        debug_stop('jit-backend-counts')

    def setup(self, looptoken):
        assert self.memcpy_addr != 0, "setup_once() not called?"
        self.current_clt = looptoken.compiled_loop_token
        self.pending_guard_tokens = []
        if WORD == 8:
            self.pending_memoryerror_trampoline_from = []
            self.error_trampoline_64 = 0
        self.mc = codebuf.MachineCodeBlockWrapper()
        #assert self.datablockwrapper is None --- but obscure case
        # possible, e.g. getting MemoryError and continuing
        allblocks = self.get_asmmemmgr_blocks(looptoken)
        self.datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr,
                                                        allblocks)
        self.target_tokens_currently_compiling = {}

    def teardown(self):
        self.pending_guard_tokens = None
        if WORD == 8:
            self.pending_memoryerror_trampoline_from = None
        self.mc = None
        self.current_clt = None

    def finish_once(self):
        if self._debug:
            debug_start('jit-backend-counts')
            for i in range(len(self.loop_run_counters)):
                struct = self.loop_run_counters[i]
                if struct.type == 'l':
                    prefix = 'TargetToken(%d)' % struct.number
                elif struct.type == 'b':
                    prefix = 'bridge ' + str(struct.number)
                else:
                    prefix = 'entry ' + str(struct.number)
                debug_print(prefix + ':' + str(struct.i))
            debug_stop('jit-backend-counts')

    def _build_float_constants(self):
        datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr, [])
        float_constants = datablockwrapper.malloc_aligned(32, alignment=16)
        datablockwrapper.done()
        addr = rffi.cast(rffi.CArrayPtr(lltype.Char), float_constants)
        qword_padding = '\x00\x00\x00\x00\x00\x00\x00\x00'
        # 0x8000000000000000
        neg_const = '\x00\x00\x00\x00\x00\x00\x00\x80'
        # 0x7FFFFFFFFFFFFFFF
        abs_const = '\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F'
        data = neg_const + qword_padding + abs_const + qword_padding
        for i in range(len(data)):
            addr[i] = data[i]
        self.float_const_neg_addr = float_constants
        self.float_const_abs_addr = float_constants + 16
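
    # Note: the two 16-byte-aligned constants built above are the standard
    # SSE2 bit masks for float negation and absolute value: XORPD with the
    # sign-bit mask at float_const_neg_addr flips the sign of a double, and
    # ANDPD with the mask at float_const_abs_addr clears it.  For example,
    # XORing 0x8000000000000000 into the bit pattern of -1.5 yields +1.5.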

    def _build_malloc_slowpath(self):
        # With asmgcc, we need two helpers, so that we can write two CALL
        # instructions in assembler, with a mark_gc_roots in between.
        # With shadowstack, this is not needed, so we produce a single helper.
        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        shadow_stack = (gcrootmap is not None and gcrootmap.is_shadow_stack)
        #
        # ---------- first helper for the slow path of malloc ----------
        mc = codebuf.MachineCodeBlockWrapper()
        if self.cpu.supports_floats:           # save the XMM registers in
            for i in range(self.cpu.NUM_REGS): # the *caller* frame, from esp+8
                mc.MOVSD_sx((WORD*2)+8*i, i)
        mc.SUB_rr(edx.value, eax.value)        # compute the size we want
        addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
        #
        # The registers to save in the copy area: with shadowstack, most
        # registers need to be saved.  With asmgcc, the callee-saved registers
        # don't need to.
        save_in_copy_area = gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items()
        if not shadow_stack:
            save_in_copy_area = [(reg, ofs) for (reg, ofs) in save_in_copy_area
                    if reg not in gpr_reg_mgr_cls.REGLOC_TO_GCROOTMAP_REG_INDEX]
        #
        for reg, ofs in save_in_copy_area:
            mc.MOV_br(ofs, reg.value)
        #
        if shadow_stack:
            # ---- shadowstack ----
            mc.SUB_ri(esp.value, 16 - WORD)     # stack alignment of 16 bytes
            if IS_X86_32:
                mc.MOV_sr(0, edx.value)         # push argument
            elif IS_X86_64:
                mc.MOV_rr(edi.value, edx.value)
            mc.CALL(imm(addr))
            mc.ADD_ri(esp.value, 16 - WORD)
        else:
            # ---- asmgcc ----
            if IS_X86_32:
                mc.MOV_sr(WORD, edx.value)      # save it as the new argument
            elif IS_X86_64:
                # rdi can be clobbered: its content was saved in the
                # copy area of the stack
                mc.MOV_rr(edi.value, edx.value)
            mc.JMP(imm(addr))                   # tail call to the real malloc
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.malloc_slowpath1 = rawstart

        # ---------- second helper for the slow path of malloc ----------
        mc = codebuf.MachineCodeBlockWrapper()
        #
        for reg, ofs in save_in_copy_area:
            mc.MOV_rb(reg.value, ofs)
            assert reg is not eax and reg is not edx
        #
        if self.cpu.supports_floats:           # restore the XMM registers
            for i in range(self.cpu.NUM_REGS): # from where they were saved
                mc.MOVSD_xs(i, (WORD*2)+8*i)
        #
        # Note: we check this after the code above, just because the code
        # above is more than 127 bytes on 64-bits...
        mc.TEST_rr(eax.value, eax.value)
        mc.J_il8(rx86.Conditions['Z'], 0)  # patched later
        jz_location = mc.get_relative_pos()
        #
        nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
        mc.MOV(edx, heap(nursery_free_adr))    # load this in EDX
        mc.RET()
        #
        # If the slowpath malloc failed, we raise a MemoryError that
        # always interrupts the current loop, as a "good enough"
        # approximation.  Also note that we didn't RET from this helper;
        # but the code we jump to will actually restore the stack
        # position based on EBP, which will get us out of here for free.
        offset = mc.get_relative_pos() - jz_location
        assert 0 < offset <= 127
        mc.overwrite(jz_location-1, chr(offset))
        mc.JMP(imm(self.propagate_exception_path))
        #
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.malloc_slowpath2 = rawstart
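
    # Calling convention of these two helpers (as implied by the code above,
    # presumably matching the inlined allocation fast path elsewhere in this
    # file): on entry EAX holds the old nursery_free pointer and EDX the
    # requested end, so "SUB edx, eax" recovers the allocation size; the GC's
    # slowpath malloc returns the new object in EAX, and the second helper
    # reloads nursery_free into EDX before returning, or jumps to the
    # MemoryError path if EAX is zero.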

    def _build_propagate_exception_path(self):
        if self.cpu.propagate_exception_v < 0:
            return      # not supported (for tests, or non-translated)
        #
        self.mc = codebuf.MachineCodeBlockWrapper()
        # call on_leave_jitted_save_exc()
        addr = self.cpu.get_on_leave_jitted_int(save_exception=True,
                                                default_to_memoryerror=True)
        self.mc.CALL(imm(addr))
        self.mc.MOV_ri(eax.value, self.cpu.propagate_exception_v)
        self._call_footer()
        rawstart = self.mc.materialize(self.cpu.asmmemmgr, [])
        self.propagate_exception_path = rawstart
        self.mc = None

    def _build_stack_check_slowpath(self):
        _, _, slowpathaddr = self.cpu.insert_stack_check()
        if slowpathaddr == 0 or self.cpu.propagate_exception_v < 0:
            return      # no stack check (for tests, or non-translated)
        #
        # make a "function" that is called immediately at the start of
        # an assembler function.  In particular, the stack looks like:
        #
        #    |  ...                 |    <-- aligned to a multiple of 16
        #    |  retaddr of caller   |
        #    |  my own retaddr      |    <-- esp
        #    +----------------------+
        #
        mc = codebuf.MachineCodeBlockWrapper()
        #
        stack_size = WORD
        if IS_X86_64:
            # on the x86_64, we have to save all the registers that may
            # have been used to pass arguments
            stack_size += 6*WORD + 8*8
            for reg in [edi, esi, edx, ecx, r8, r9]:
                mc.PUSH_r(reg.value)
            mc.SUB_ri(esp.value, 8*8)
            for i in range(8):
                mc.MOVSD_sx(8*i, i)     # xmm0 to xmm7
        #
        if IS_X86_32:
            stack_size += 2*WORD
            mc.PUSH_r(eax.value)        # alignment
            mc.PUSH_r(esp.value)
        elif IS_X86_64:
            mc.MOV_rr(edi.value, esp.value)
        #
        # esp is now aligned to a multiple of 16 again
        mc.CALL(imm(slowpathaddr))
        #
        mc.MOV(eax, heap(self.cpu.pos_exception()))
        mc.TEST_rr(eax.value, eax.value)
        mc.J_il8(rx86.Conditions['NZ'], 0)
        jnz_location = mc.get_relative_pos()
        #
        if IS_X86_32:
            mc.ADD_ri(esp.value, 2*WORD)    # cancel the two PUSHes above
        elif IS_X86_64:
            # restore the registers
            for i in range(7, -1, -1):
                mc.MOVSD_xs(i, 8*i)
            mc.ADD_ri(esp.value, 8*8)
            for reg in [r9, r8, ecx, edx, esi, edi]:
                mc.POP_r(reg.value)
        #
        mc.RET()
        #
        # patch the JNZ above
        offset = mc.get_relative_pos() - jnz_location
        assert 0 < offset <= 127
        mc.overwrite(jnz_location-1, chr(offset))
        # call on_leave_jitted_save_exc()
        addr = self.cpu.get_on_leave_jitted_int(save_exception=True)
        mc.CALL(imm(addr))
        #
        mc.MOV_ri(eax.value, self.cpu.propagate_exception_v)
        #
        # footer -- note the ADD, which skips the return address of this
        # function, and will instead return to the caller's caller.  Note
        # also that we completely ignore the saved arguments, because we
        # are interrupting the function.
        mc.ADD_ri(esp.value, stack_size)
        mc.RET()
        #
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.stack_check_slowpath = rawstart

    @staticmethod
    @rgc.no_collect
    def _release_gil_asmgcc(css):
        # similar to trackgcroot.py:pypy_asm_stackwalk, first part
        from pypy.rpython.memory.gctransform import asmgcroot
        new = rffi.cast(asmgcroot.ASM_FRAMEDATA_HEAD_PTR, css)
        next = asmgcroot.gcrootanchor.next
        new.next = next
        new.prev = asmgcroot.gcrootanchor
        asmgcroot.gcrootanchor.next = new
        next.prev = new
        # and now release the GIL
        before = rffi.aroundstate.before
        if before:
            before()

    @staticmethod
    @rgc.no_collect
    def _reacquire_gil_asmgcc(css):
        # first reacquire the GIL
        after = rffi.aroundstate.after
        if after:
            after()
        # similar to trackgcroot.py:pypy_asm_stackwalk, second part
        from pypy.rpython.memory.gctransform import asmgcroot
        old = rffi.cast(asmgcroot.ASM_FRAMEDATA_HEAD_PTR, css)
        prev = old.prev
        next = old.next
        prev.next = next
        next.prev = prev

    @staticmethod
    @rgc.no_collect
    def _release_gil_shadowstack():
        before = rffi.aroundstate.before
        if before:
            before()

    @staticmethod
    @rgc.no_collect
    def _reacquire_gil_shadowstack():
        after = rffi.aroundstate.after
        if after:
            after()
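
    # The four helpers above are meant to bracket an external call made with
    # the GIL released: the "release" helper runs rffi.aroundstate.before()
    # just before the call (and, in the asmgcc variant, additionally links
    # the css frame into the gcrootanchor chain so the GC can still walk this
    # stack), while the matching "reacquire" helper undoes both steps right
    # after the call returns.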

    _NOARG_FUNC = lltype.Ptr(lltype.FuncType([], lltype.Void))
    _CLOSESTACK_FUNC = lltype.Ptr(lltype.FuncType([rffi.LONGP],
                                                  lltype.Void))

    def _build_release_gil(self, gcrootmap):
        if gcrootmap.is_shadow_stack:
            releasegil_func = llhelper(self._NOARG_FUNC,
                                       self._release_gil_shadowstack)
            reacqgil_func = llhelper(self._NOARG_FUNC,
                                     self._reacquire_gil_shadowstack)
        else:
            releasegil_func = llhelper(self._CLOSESTACK_FUNC,
                                       self._release_gil_asmgcc)
            reacqgil_func = llhelper(self._CLOSESTACK_FUNC,
                                     self._reacquire_gil_asmgcc)
        self.releasegil_addr = self.cpu.cast_ptr_to_int(releasegil_func)
        self.reacqgil_addr = self.cpu.cast_ptr_to_int(reacqgil_func)

    def assemble_loop(self, loopname, inputargs, operations, looptoken, log):
        '''adds the following attributes to looptoken:
               _x86_function_addr   (address of the generated func, as an int)
               _x86_loop_code       (debug: addr of the start of the ResOps)
               _x86_fullsize        (debug: full size including failure)
               _x86_debug_checksum
        '''
        # XXX this function is too longish and contains some code
        # duplication with assemble_bridge().  Also, we should think
        # about not storing on 'self' attributes that will live only
        # for the duration of compiling one loop or one bridge.
        clt = CompiledLoopToken(self.cpu, looptoken.number)
        clt.allgcrefs = []
        looptoken.compiled_loop_token = clt
        if not we_are_translated():
            # Arguments should be unique
            assert len(set(inputargs)) == len(inputargs)

        self.setup(looptoken)
        if log:
            operations = self._inject_debugging_code(looptoken, operations,
                                                     'e', looptoken.number)

        regalloc = RegAlloc(self, self.cpu.translate_support_code)
        #
        self._call_header_with_stack_check()
        stackadjustpos = self._patchable_stackadjust()
        clt._debug_nbargs = len(inputargs)
        operations = regalloc.prepare_loop(inputargs, operations,
                                           looptoken, clt.allgcrefs)
        looppos = self.mc.get_relative_pos()
        looptoken._x86_loop_code = looppos
        clt.frame_depth = -1     # temporarily
        clt.param_depth = -1     # temporarily
        frame_depth, param_depth = self._assemble(regalloc, operations)
        clt.frame_depth = frame_depth
        clt.param_depth = param_depth
        #
        size_excluding_failure_stuff = self.mc.get_relative_pos()
        self.write_pending_failure_recoveries()
        full_size = self.mc.get_relative_pos()
        #
        rawstart = self.materialize_loop(looptoken)
        debug_start("jit-backend-addr")
        debug_print("Loop %d (%s) has address %x to %x (bootstrap %x)" % (
            looptoken.number, loopname,
            rawstart + looppos,
            rawstart + size_excluding_failure_stuff,
            rawstart))
        debug_stop("jit-backend-addr")
        self._patch_stackadjust(rawstart + stackadjustpos,
                                frame_depth + param_depth)
        self.patch_pending_failure_recoveries(rawstart)
        #
        ops_offset = self.mc.ops_offset
        if not we_are_translated():
            # used only by looptoken.dump() -- useful in tests
            looptoken._x86_rawstart = rawstart
            looptoken._x86_fullsize = full_size
            looptoken._x86_ops_offset = ops_offset
        looptoken._x86_function_addr = rawstart

        self.fixup_target_tokens(rawstart)
        self.teardown()
        # oprofile support
        if self.cpu.profile_agent is not None:
            name = "Loop # %s: %s" % (looptoken.number, loopname)
            self.cpu.profile_agent.native_code_written(name,
                                                       rawstart, full_size)
        return AsmInfo(ops_offset, rawstart + looppos,
                       size_excluding_failure_stuff - looppos)

    def assemble_bridge(self, faildescr, inputargs, operations,
                        original_loop_token, log):
        if not we_are_translated():
            # Arguments should be unique
            assert len(set(inputargs)) == len(inputargs)

        descr_number = self.cpu.get_fail_descr_number(faildescr)
        failure_recovery = self._find_failure_recovery_bytecode(faildescr)

        self.setup(original_loop_token)
        if log:
            operations = self._inject_debugging_code(faildescr, operations,
                                                     'b', descr_number)

        arglocs = self.rebuild_faillocs_from_descr(failure_recovery)
        if not we_are_translated():
            assert ([loc.assembler() for loc in arglocs] ==
                    [loc.assembler() for loc in faildescr._x86_debug_faillocs])
        regalloc = RegAlloc(self, self.cpu.translate_support_code)
        fail_depths = faildescr._x86_current_depths
        startpos = self.mc.get_relative_pos()
        operations = regalloc.prepare_bridge(fail_depths, inputargs, arglocs,
                                             operations,
                                             self.current_clt.allgcrefs)

        stackadjustpos = self._patchable_stackadjust()
        frame_depth, param_depth = self._assemble(regalloc, operations)
        codeendpos = self.mc.get_relative_pos()
        self.write_pending_failure_recoveries()
        fullsize = self.mc.get_relative_pos()
        #
        rawstart = self.materialize_loop(original_loop_token)

        debug_start("jit-backend-addr")
        debug_print("bridge out of Guard %d has address %x to %x" %
                    (descr_number, rawstart, rawstart + codeendpos))
        debug_stop("jit-backend-addr")
        self._patch_stackadjust(rawstart + stackadjustpos,
                                frame_depth + param_depth)
        self.patch_pending_failure_recoveries(rawstart)
        if not we_are_translated():
            # for the benefit of tests
            faildescr._x86_bridge_frame_depth = frame_depth
            faildescr._x86_bridge_param_depth = param_depth
        # patch the jump from original guard
        self.patch_jump_for_descr(faildescr, rawstart)
        ops_offset = self.mc.ops_offset
        self.fixup_target_tokens(rawstart)
        self.current_clt.frame_depth = max(self.current_clt.frame_depth,
                                           frame_depth)
        self.current_clt.param_depth = max(self.current_clt.param_depth,
                                           param_depth)
        self.teardown()
        # oprofile support
        if self.cpu.profile_agent is not None:
            name = "Bridge # %s" % (descr_number,)
            self.cpu.profile_agent.native_code_written(name,
                                                       rawstart, fullsize)
        return AsmInfo(ops_offset, startpos + rawstart, codeendpos - startpos)

    def write_pending_failure_recoveries(self):
        # for each pending guard, generate the code of the recovery stub
        # at the end of self.mc.
        for tok in self.pending_guard_tokens:
            tok.pos_recovery_stub = self.generate_quick_failure(tok)
        if WORD == 8 and len(self.pending_memoryerror_trampoline_from) > 0:
            self.error_trampoline_64 = self.generate_propagate_error_64()

    def patch_pending_failure_recoveries(self, rawstart):
        # after we wrote the assembler to raw memory, set up
        # tok.faildescr._x86_adr_jump_offset to contain the raw address of
        # the 4-byte target field in the JMP/Jcond instruction, and patch
        # the field in question to point (initially) to the recovery stub
        clt = self.current_clt
        for tok in self.pending_guard_tokens:
            addr = rawstart + tok.pos_jump_offset
            tok.faildescr._x86_adr_jump_offset = addr
            relative_target = tok.pos_recovery_stub - (tok.pos_jump_offset + 4)
            assert rx86.fits_in_32bits(relative_target)
            #
            if not tok.is_guard_not_invalidated:
                mc = codebuf.MachineCodeBlockWrapper()
                mc.writeimm32(relative_target)
                mc.copy_to_raw_memory(addr)
            else:
                # GUARD_NOT_INVALIDATED, record an entry in
                # clt.invalidate_positions of the form:
                #     (addr-in-the-code-of-the-not-yet-written-jump-target,
                #      relative-target-to-use)
                relpos = tok.pos_jump_offset
                clt.invalidate_positions.append((rawstart + relpos,
                                                 relative_target))
                # General idea: Although no code was generated by this
                # guard, the code might be patched with a "JMP rel32" to
                # the guard recovery code.  This recovery code is
                # already generated, and looks like the recovery code
                # for any guard, even if at first it has no jump to it.
                # So we may later write 5 bytes overriding the existing
                # instructions; this works because a CALL instruction
                # would also take at least 5 bytes.  If it could take
                # less, we would run into the issue that overwriting the
                # 5 bytes here might get a few nonsense bytes at the
                # return address of the following CALL.
        if WORD == 8:
            for pos_after_jz in self.pending_memoryerror_trampoline_from:
                assert self.error_trampoline_64 != 0     # only if non-empty
                mc = codebuf.MachineCodeBlockWrapper()
                mc.writeimm32(self.error_trampoline_64 - pos_after_jz)
                mc.copy_to_raw_memory(rawstart + pos_after_jz - 4)

    def get_asmmemmgr_blocks(self, looptoken):
        clt = looptoken.compiled_loop_token
        if clt.asmmemmgr_blocks is None:
            clt.asmmemmgr_blocks = []
        return clt.asmmemmgr_blocks

    def materialize_loop(self, looptoken):
        self.datablockwrapper.done()      # finish using cpu.asmmemmgr
        self.datablockwrapper = None
        allblocks = self.get_asmmemmgr_blocks(looptoken)
        return self.mc.materialize(self.cpu.asmmemmgr, allblocks,
                                   self.cpu.gc_ll_descr.gcrootmap)

    def _register_counter(self, tp, number, token):
        # YYY very minor leak -- we need the counters to stay alive
        # forever, just because we want to report them at the end
        # of the process
        struct = lltype.malloc(DEBUG_COUNTER, flavor='raw',
                               track_allocation=False)
        struct.i = 0
        struct.type = tp
        if tp == 'b' or tp == 'e':
            struct.number = number
        else:
            assert token
            struct.number = compute_unique_id(token)
        self.loop_run_counters.append(struct)
        return struct

    def _find_failure_recovery_bytecode(self, faildescr):
        adr_jump_offset = faildescr._x86_adr_jump_offset
        if adr_jump_offset == 0:
            # This case should be prevented by the logic in compile.py:
            # look for CNT_BUSY_FLAG, which disables tracing from a guard
            # when another tracing from the same guard is already in progress.
            raise BridgeAlreadyCompiled
        # follow the JMP/Jcond
        p = rffi.cast(rffi.INTP, adr_jump_offset)
        adr_target = adr_jump_offset + 4 + rffi.cast(lltype.Signed, p[0])
        # skip the CALL
        if WORD == 4:
            adr_target += 5     # CALL imm
        else:
            adr_target += 13    # MOV r11, imm-as-8-bytes; CALL *r11
        return adr_target

    def patch_jump_for_descr(self, faildescr, adr_new_target):
        adr_jump_offset = faildescr._x86_adr_jump_offset
        assert adr_jump_offset != 0
        offset = adr_new_target - (adr_jump_offset + 4)
        # If the new target fits within a rel32 of the jump, just patch
        # that.  Otherwise, leave the original rel32 to the recovery stub in
        # place, but clobber the recovery stub with a jump to the real
        # target.
        mc = codebuf.MachineCodeBlockWrapper()
        if rx86.fits_in_32bits(offset):
            mc.writeimm32(offset)
            mc.copy_to_raw_memory(adr_jump_offset)
        else:
            # "mov r11, addr; jmp r11" is up to 13 bytes, which fits in there
            # because we always write "mov r11, imm-as-8-bytes; call *r11" in
            # the first place.
            mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
            mc.JMP_r(X86_64_SCRATCH_REG.value)
            p = rffi.cast(rffi.INTP, adr_jump_offset)
            adr_target = adr_jump_offset + 4 + rffi.cast(lltype.Signed, p[0])
            mc.copy_to_raw_memory(adr_target)
        faildescr._x86_adr_jump_offset = 0    # means "patched"

    def fixup_target_tokens(self, rawstart):
        for targettoken in self.target_tokens_currently_compiling:
            targettoken._x86_loop_code += rawstart
        self.target_tokens_currently_compiling = None

    def _append_debugging_code(self, operations, tp, number, token):
        counter = self._register_counter(tp, number, token)
        c_adr = ConstInt(rffi.cast(lltype.Signed, counter))
        box = BoxInt()
        box2 = BoxInt()
        ops = [ResOperation(rop.GETFIELD_RAW, [c_adr],
                            box, descr=self.debug_counter_descr),
               ResOperation(rop.INT_ADD, [box, ConstInt(1)], box2),
               ResOperation(rop.SETFIELD_RAW, [c_adr, box2],
                            None, descr=self.debug_counter_descr)]
        operations.extend(ops)
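
    # The three injected resoperations amount to "counter.i += 1", roughly:
    #     i1 = getfield_raw(ConstInt(addr_of_counter), descr=<'i' field>)
    #     i2 = int_add(i1, 1)
    #     setfield_raw(ConstInt(addr_of_counter), i2, descr=<'i' field>)
    # so every execution of the instrumented path bumps its DEBUG_COUNTER.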

    @specialize.argtype(1)
    def _inject_debugging_code(self, looptoken, operations, tp, number):
        if self._debug:
            # before doing anything, let's increase a counter
            s = 0
            for op in operations:
                s += op.getopnum()
            looptoken._x86_debug_checksum = s

            newoperations = []
            self._append_debugging_code(newoperations, tp, number,
                                        None)
            for op in operations:
                newoperations.append(op)
                if op.getopnum() == rop.LABEL:
                    self._append_debugging_code(newoperations, 'l', number,
                                                op.getdescr())
            operations = newoperations
        return operations

    def _assemble(self, regalloc, operations):
        self._regalloc = regalloc
        regalloc.compute_hint_frame_locations(operations)
        regalloc.walk_operations(operations)
        if we_are_translated() or self.cpu.dont_keepalive_stuff:
            self._regalloc = None   # else keep it around for debugging
        frame_depth = regalloc.fm.get_frame_depth()
        param_depth = regalloc.param_depth
        jump_target_descr = regalloc.jump_target_descr
        if jump_target_descr is not None:
            target_frame_depth = jump_target_descr._x86_clt.frame_depth
            target_param_depth = jump_target_descr._x86_clt.param_depth
            frame_depth = max(frame_depth, target_frame_depth)
            param_depth = max(param_depth, target_param_depth)
        return frame_depth, param_depth

    def _patchable_stackadjust(self):
        # stack adjustment LEA
        self.mc.LEA32_rb(esp.value, 0)
        return self.mc.get_relative_pos() - 4

    def _patch_stackadjust(self, adr_lea, allocated_depth):
        # patch stack adjustment LEA
        mc = codebuf.MachineCodeBlockWrapper()
        # Compute the correct offset for the instruction LEA ESP, [EBP-4*words]
        mc.writeimm32(self._get_offset_of_ebp_from_esp(allocated_depth))
        mc.copy_to_raw_memory(adr_lea)

    def _get_offset_of_ebp_from_esp(self, allocated_depth):
        # Given that [EBP] is where we saved EBP, i.e. in the last word
        # of our fixed frame, then the 'words' value is:
        words = (FRAME_FIXED_SIZE - 1) + allocated_depth
        # align, e.g. for Mac OS X
        aligned_words = align_stack_words(words+2)-2  # 2 = EIP+EBP
        return -WORD * aligned_words
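
    # Worked example with made-up numbers (the real FRAME_FIXED_SIZE comes
    # from arch.py): if FRAME_FIXED_SIZE were 7 and allocated_depth were 10,
    # then words = 6 + 10 = 16; on 64-bit (WORD == 8, CALL_ALIGN == 2),
    # align_stack_words(16 + 2) - 2 is still 16, so the patched instruction
    # becomes "LEA ESP, [EBP - 128]" and ESP stays 16-byte aligned at every
    # CALL emitted later in the loop.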

    def _call_header(self):
        # NB. the shape of the frame is hard-coded in get_basic_shape() too.
        # Also, make sure this is consistent with FRAME_FIXED_SIZE.
        self.mc.PUSH_r(ebp.value)
        self.mc.MOV_rr(ebp.value, esp.value)
        for loc in self.cpu.CALLEE_SAVE_REGISTERS:
            self.mc.PUSH_r(loc.value)

        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        if gcrootmap and gcrootmap.is_shadow_stack:
            self._call_header_shadowstack(gcrootmap)

    def _call_header_with_stack_check(self):
        if self.stack_check_slowpath == 0:
            pass                # no stack check (e.g. not translated)
        else:
            endaddr, lengthaddr, _ = self.cpu.insert_stack_check()
            self.mc.MOV(eax, heap(endaddr))              # MOV eax, [start]
            self.mc.SUB(eax, esp)                        # SUB eax, current
            self.mc.CMP(eax, heap(lengthaddr))           # CMP eax, [length]
            self.mc.J_il8(rx86.Conditions['BE'], 0)      # JBE .skip
            jb_location = self.mc.get_relative_pos()
            self.mc.CALL(imm(self.stack_check_slowpath)) # CALL slowpath
            # patch the JB above                         # .skip:
            offset = self.mc.get_relative_pos() - jb_location
            assert 0 < offset <= 127
            self.mc.overwrite(jb_location-1, chr(offset))
            #
        self._call_header()

    def _call_footer(self):
        self.mc.LEA_rb(esp.value, -len(self.cpu.CALLEE_SAVE_REGISTERS) * WORD)

        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        if gcrootmap and gcrootmap.is_shadow_stack:
            self._call_footer_shadowstack(gcrootmap)

        for i in range(len(self.cpu.CALLEE_SAVE_REGISTERS)-1, -1, -1):
            self.mc.POP_r(self.cpu.CALLEE_SAVE_REGISTERS[i].value)
        self.mc.POP_r(ebp.value)
        self.mc.RET()

    def _call_header_shadowstack(self, gcrootmap):
        # we need to put two words into the shadowstack: the MARKER_FRAME
        # and the address of the frame (ebp, actually)
        rst = gcrootmap.get_root_stack_top_addr()
        if rx86.fits_in_32bits(rst):
            self.mc.MOV_rj(eax.value, rst)             # MOV eax, [rootstacktop]
        else:
            self.mc.MOV_ri(r13.value, rst)             # MOV r13, rootstacktop
            self.mc.MOV_rm(eax.value, (r13.value, 0))  # MOV eax, [r13]
        #
        MARKER = gcrootmap.MARKER_FRAME
        self.mc.LEA_rm(ebx.value, (eax.value, 2*WORD)) # LEA ebx, [eax+2*WORD]
        self.mc.MOV_mi((eax.value, WORD), MARKER)      # MOV [eax+WORD], MARKER
        self.mc.MOV_mr((eax.value, 0), ebp.value)      # MOV [eax], ebp
        #
        if rx86.fits_in_32bits(rst):
            self.mc.MOV_jr(rst, ebx.value)             # MOV [rootstacktop], ebx
        else:
            self.mc.MOV_mr((r13.value, 0), ebx.value)  # MOV [r13], ebx

    def _call_footer_shadowstack(self, gcrootmap):
        rst = gcrootmap.get_root_stack_top_addr()
        if rx86.fits_in_32bits(rst):
            self.mc.SUB_ji8(rst, 2*WORD)               # SUB [rootstacktop], 2*WORD
        else:
            self.mc.MOV_ri(ebx.value, rst)             # MOV ebx, rootstacktop
            self.mc.SUB_mi8((ebx.value, 0), 2*WORD)    # SUB [ebx], 2*WORD

    def redirect_call_assembler(self, oldlooptoken, newlooptoken):
        # some minimal sanity checking
        old_nbargs = oldlooptoken.compiled_loop_token._debug_nbargs
        new_nbargs = newlooptoken.compiled_loop_token._debug_nbargs
        assert old_nbargs == new_nbargs
        # we overwrite the instructions at the old _x86_direct_bootstrap_code
        # to start with a JMP to the new _x86_direct_bootstrap_code.
        # Ideally we should rather patch all existing CALLs, but well.
        oldadr = oldlooptoken._x86_function_addr
        target = newlooptoken._x86_function_addr
        mc = codebuf.MachineCodeBlockWrapper()
        mc.JMP(imm(target))
        if WORD == 4:         # keep in sync with prepare_loop()
            assert mc.get_relative_pos() == 5
        else:
            assert mc.get_relative_pos() <= 13
        mc.copy_to_raw_memory(oldadr)

    def dump(self, text):
        if not self.verbose:
            return
        _prev = Box._extended_display
        try:
            Box._extended_display = False
            pos = self.mc.get_relative_pos()
            print >> sys.stderr, ' 0x%x  %s' % (pos, text)
        finally:
            Box._extended_display = _prev

    # ------------------------------------------------------------

    def mov(self, from_loc, to_loc):
        if ((isinstance(from_loc, RegLoc) and from_loc.is_xmm) or
            (isinstance(to_loc, RegLoc) and to_loc.is_xmm)):
            self.mc.MOVSD(to_loc, from_loc)
        else:
            assert to_loc is not ebp
            self.mc.MOV(to_loc, from_loc)

    regalloc_mov = mov  # legacy interface

    def regalloc_push(self, loc):
        if isinstance(loc, RegLoc) and loc.is_xmm:
            self.mc.SUB_ri(esp.value, 8)   # = size of doubles
            self.mc.MOVSD_sx(0, loc.value)
        elif WORD == 4 and isinstance(loc, StackLoc) and loc.get_width() == 8:
            # XXX evil trick
            self.mc.PUSH_b(get_ebp_ofs(loc.position))
            self.mc.PUSH_b(get_ebp_ofs(loc.position + 1))
        else:
            self.mc.PUSH(loc)

    def regalloc_pop(self, loc):
        if isinstance(loc, RegLoc) and loc.is_xmm:
            self.mc.MOVSD_xs(loc.value, 0)
            self.mc.ADD_ri(esp.value, 8)   # = size of doubles
        elif WORD == 4 and isinstance(loc, StackLoc) and loc.get_width() == 8:
            # XXX evil trick
            self.mc.POP_b(get_ebp_ofs(loc.position + 1))
            self.mc.POP_b(get_ebp_ofs(loc.position))
        else:
            self.mc.POP(loc)

    def regalloc_immedmem2mem(self, from_loc, to_loc):
        # move a ConstFloatLoc directly to a StackLoc, as two MOVs
        # (even on x86-64, because the immediates are encoded as 32 bits)
        assert isinstance(from_loc, ConstFloatLoc)
        assert isinstance(to_loc, StackLoc)
        low_part = rffi.cast(rffi.CArrayPtr(rffi.INT), from_loc.value)[0]
        high_part = rffi.cast(rffi.CArrayPtr(rffi.INT), from_loc.value)[1]
        low_part = intmask(low_part)
        high_part = intmask(high_part)
        self.mc.MOV32_bi(to_loc.value, low_part)
        self.mc.MOV32_bi(to_loc.value + 4, high_part)
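
    # The split above works because a 64-bit float constant is just 8 bytes
    # in memory: the two 32-bit halves read from from_loc.value are written
    # back with two MOV32 instructions at the stack slot's offset and at
    # offset + 4, reassembling the identical bit pattern without touching
    # an XMM register.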

    def regalloc_perform(self, op, arglocs, resloc):
        genop_list[op.getopnum()](self, op, arglocs, resloc)

    def regalloc_perform_discard(self, op, arglocs):
        genop_discard_list[op.getopnum()](self, op, arglocs)

    def regalloc_perform_llong(self, op, arglocs, resloc):
        effectinfo = op.getdescr().get_extra_info()
        oopspecindex = effectinfo.oopspecindex
        genop_llong_list[oopspecindex](self, op, arglocs, resloc)

    def regalloc_perform_math(self, op, arglocs, resloc):
        effectinfo = op.getdescr().get_extra_info()
        oopspecindex = effectinfo.oopspecindex
        genop_math_list[oopspecindex](self, op, arglocs, resloc)

    def regalloc_perform_with_guard(self, op, guard_op, faillocs,
                                    arglocs, resloc, current_depths):
        faildescr = guard_op.getdescr()
        assert isinstance(faildescr, AbstractFailDescr)
        faildescr._x86_current_depths = current_depths
        failargs = guard_op.getfailargs()
        guard_opnum = guard_op.getopnum()
        guard_token = self.implement_guard_recovery(guard_opnum,
                                                    faildescr, failargs,
                                                    faillocs)
        if op is None:
            dispatch_opnum = guard_opnum
        else:
            dispatch_opnum = op.getopnum()
        genop_guard_list[dispatch_opnum](self, op, guard_op, guard_token,
                                         arglocs, resloc)
        if not we_are_translated():
            # must be added by the genop_guard_list[]()
            assert guard_token is self.pending_guard_tokens[-1]

    def regalloc_perform_guard(self, guard_op, faillocs, arglocs, resloc,
                               current_depths):
        self.regalloc_perform_with_guard(None, guard_op, faillocs, arglocs,
                                         resloc, current_depths)

    def load_effective_addr(self, sizereg, baseofs, scale, result, frm=imm0):
        self.mc.LEA(result, addr_add(frm, sizereg, baseofs, scale))

    def _unaryop(asmop):
        def genop_unary(self, op, arglocs, resloc):
            getattr(self.mc, asmop)(arglocs[0])
        return genop_unary

    def _binaryop(asmop, can_swap=False):
        def genop_binary(self, op, arglocs, result_loc):
            getattr(self.mc, asmop)(arglocs[0], arglocs[1])
        return genop_binary

    def _cmpop(cond, rev_cond):
        def genop_cmp(self, op, arglocs, result_loc):
            rl = result_loc.lowest8bits()
            if isinstance(op.getarg(0), Const):
                self.mc.CMP(arglocs[1], arglocs[0])
                self.mc.SET_ir(rx86.Conditions[rev_cond], rl.value)
            else:
                self.mc.CMP(arglocs[0], arglocs[1])
                self.mc.SET_ir(rx86.Conditions[cond], rl.value)
            self.mc.MOVZX8_rr(result_loc.value, rl.value)
        return genop_cmp

    def _cmpop_float(cond, rev_cond, is_ne=False):
        def genop_cmp(self, op, arglocs, result_loc):
            if isinstance(arglocs[0], RegLoc):
                self.mc.UCOMISD(arglocs[0], arglocs[1])
                checkcond = cond
            else:
                self.mc.UCOMISD(arglocs[1], arglocs[0])
                checkcond = rev_cond

            tmp1 = result_loc.lowest8bits()
            if IS_X86_32:
                tmp2 = result_loc.higher8bits()
            elif IS_X86_64:
                tmp2 = X86_64_SCRATCH_REG.lowest8bits()

            self.mc.SET_ir(rx86.Conditions[checkcond], tmp1.value)
            if is_ne:
                self.mc.SET_ir(rx86.Conditions['P'], tmp2.value)
                self.mc.OR8_rr(tmp1.value, tmp2.value)
            else:
                self.mc.SET_ir(rx86.Conditions['NP'], tmp2.value)
                self.mc.AND8_rr(tmp1.value, tmp2.value)
            self.mc.MOVZX8_rr(result_loc.value, tmp1.value)
        return genop_cmp

    def _cmpop_guard(cond, rev_cond, false_cond, false_rev_cond):
        def genop_cmp_guard(self, op, guard_op, guard_token, arglocs,
                            result_loc):
            guard_opnum = guard_op.getopnum()
            if isinstance(op.getarg(0), Const):
                self.mc.CMP(arglocs[1], arglocs[0])
                if guard_opnum == rop.GUARD_FALSE:
                    self.implement_guard(guard_token, rev_cond)
                else:
                    self.implement_guard(guard_token, false_rev_cond)
            else:
                self.mc.CMP(arglocs[0], arglocs[1])
                if guard_opnum == rop.GUARD_FALSE:
                    self.implement_guard(guard_token, cond)
                else:
                    self.implement_guard(guard_token, false_cond)
        return genop_cmp_guard

    def _cmpop_guard_float(cond, rev_cond, false_cond, false_rev_cond):
        need_direct_jp = 'A' not in cond
        need_rev_jp = 'A' not in rev_cond
        def genop_cmp_guard_float(self, op, guard_op, guard_token, arglocs,
                                  result_loc):
            guard_opnum = guard_op.getopnum()
            if isinstance(arglocs[0], RegLoc):
                self.mc.UCOMISD(arglocs[0], arglocs[1])
                checkcond = cond
                checkfalsecond = false_cond
                need_jp = need_direct_jp
            else:
                self.mc.UCOMISD(arglocs[1], arglocs[0])
                checkcond = rev_cond
                checkfalsecond = false_rev_cond
                need_jp = need_rev_jp
            if guard_opnum == rop.GUARD_FALSE:
                if need_jp:
                    self.mc.J_il8(rx86.Conditions['P'], 6)
                self.implement_guard(guard_token, checkcond)
            else:
                if need_jp:
                    self.mc.J_il8(rx86.Conditions['P'], 2)
                    self.mc.J_il8(rx86.Conditions[checkcond], 5)
                    self.implement_guard(guard_token)
                else:
                    self.implement_guard(guard_token, checkfalsecond)
        return genop_cmp_guard_float

    def _emit_call(self, force_index, x, arglocs, start=0, tmp=eax,
                   argtypes=None, callconv=FFI_DEFAULT_ABI):
        if IS_X86_64:
            return self._emit_call_64(force_index, x, arglocs, start, argtypes)

        p = 0
        n = len(arglocs)
        for i in range(start, n):
            loc = arglocs[i]
            if isinstance(loc, RegLoc):
                if loc.is_xmm:
                    self.mc.MOVSD_sx(p, loc.value)
                else:
                    self.mc.MOV_sr(p, loc.value)
            p += loc.get_width()
        p = 0
        for i in range(start, n):
            loc = arglocs[i]
            if not isinstance(loc, RegLoc):
                if loc.get_width() == 8:
                    self.mc.MOVSD(xmm0, loc)
                    self.mc.MOVSD_sx(p, xmm0.value)
                else:
                    self.mc.MOV(tmp, loc)
                    self.mc.MOV_sr(p, tmp.value)
            p += loc.get_width()
        self._regalloc.reserve_param(p//WORD)
        # x is a location
        self.mc.CALL(x)
        self.mark_gc_roots(force_index)
        #
        if callconv != FFI_DEFAULT_ABI:
            self._fix_stdcall(callconv, p)

    def _fix_stdcall(self, callconv, p):
        from pypy.rlib.clibffi import FFI_STDCALL
        assert callconv == FFI_STDCALL
        # it's a bit stupid, but we're just going to cancel the fact that
        # the called function just added 'p' to ESP, by subtracting it again.
        self.mc.SUB_ri(esp.value, p)
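
    # Example: a stdcall callee with two 4-byte arguments pops 8 bytes off
    # the stack when it returns, so after the CALL the SUB_ri above moves
    # ESP back down by p == 8 bytes; the caller-side cleanup that the
    # default cdecl convention expects then stays balanced.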

    def _emit_call_64(self, force_index, x, arglocs, start, argtypes):
        src_locs = []
        dst_locs = []
        xmm_src_locs = []
        xmm_dst_locs = []
        pass_on_stack = []
        singlefloats = None

        # In reverse order for use with pop()
        unused_gpr = [r9, r8, ecx, edx, esi, edi]
        unused_xmm = [xmm7, xmm6, xmm5, xmm4, xmm3, xmm2, xmm1, xmm0]

        for i in range(start, len(arglocs)):
            loc = arglocs[i]
            # XXX: Should be much simpler to tell whether a location is a
            # float!  It's so ugly because we have to "guard" the access to
            # .type with isinstance, since not all AssemblerLocation classes
            # are "typed"
            if ((isinstance(loc, RegLoc) and loc.is_xmm) or
                (isinstance(loc, StackLoc) and loc.type == FLOAT) or
                (isinstance(loc, ConstFloatLoc))):
                if len(unused_xmm) > 0:
                    xmm_src_locs.append(loc)
                    xmm_dst_locs.append(unused_xmm.pop())
                else:
                    pass_on_stack.append(loc)
            elif (argtypes is not None and argtypes[i-start] == 'S' and
                  len(unused_xmm) > 0):
                # Singlefloat argument
                if singlefloats is None:
                    singlefloats = []
                singlefloats.append((loc, unused_xmm.pop()))
            else:
                if len(unused_gpr) > 0:
                    src_locs.append(loc)
                    dst_locs.append(unused_gpr.pop())
                else:
                    pass_on_stack.append(loc)

        # Emit instructions to pass the stack arguments
        # XXX: Would be nice to let remap_frame_layout take care of this, but
        # we'd need to create something like StackLoc, but relative to esp,
        # and I don't know if it's worth it.
        for i in range(len(pass_on_stack)):
            loc = pass_on_stack[i]
            if not isinstance(loc, RegLoc):
                if isinstance(loc, StackLoc) and loc.type == FLOAT:
                    self.mc.MOVSD(X86_64_XMM_SCRATCH_REG, loc)
                    self.mc.MOVSD_sx(i*WORD, X86_64_XMM_SCRATCH_REG.value)
                else:
                    self.mc.MOV(X86_64_SCRATCH_REG, loc)
                    self.mc.MOV_sr(i*WORD, X86_64_SCRATCH_REG.value)
            else:
                # It's a register
                if loc.is_xmm:
                    self.mc.MOVSD_sx(i*WORD, loc.value)
                else:
                    self.mc.MOV_sr(i*WORD, loc.value)

        # Handle register arguments: first remap the xmm arguments
        remap_frame_layout(self, xmm_src_locs, xmm_dst_locs,
                           X86_64_XMM_SCRATCH_REG)
        # Load the singlefloat arguments from main regs or stack to xmm regs
        if singlefloats is not None:
            for src, dst in singlefloats:
                self.mc.MOVD(dst, src)
        # Finally remap the arguments in the main regs
        # If x is a register and is in dst_locs, then oops, it needs to
        # be moved away:
        if x in dst_locs:
            src_locs

(The listing is truncated here; see the repository linked above for the full file.)