
/rpython/jit/backend/x86/assembler.py

https://bitbucket.org/pjenvey/pypy-mq


import sys
import os
import py
from rpython.jit.backend.llsupport import symbolic, jitframe, rewrite
from rpython.jit.backend.llsupport.assembler import (GuardToken, BaseAssembler,
                                                     debug_bridge)
from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
from rpython.jit.backend.llsupport.gcmap import allocate_gcmap
from rpython.jit.metainterp.history import (Const, VOID, ConstInt)
from rpython.jit.metainterp.history import AbstractFailDescr, INT, REF, FLOAT
from rpython.jit.metainterp.compile import ResumeGuardDescr
from rpython.rlib.rjitlog import rjitlog as jl
from rpython.rtyper.lltypesystem import lltype, rffi, rstr, llmemory
from rpython.rtyper.lltypesystem.lloperation import llop
from rpython.rtyper.annlowlevel import cast_instance_to_gcref
from rpython.rtyper import rclass
from rpython.rlib.jit import AsmInfo
from rpython.jit.backend.model import CompiledLoopToken
from rpython.jit.backend.x86.regalloc import (RegAlloc, get_ebp_ofs,
                                              gpr_reg_mgr_cls, xmm_reg_mgr_cls)
from rpython.jit.backend.llsupport.regalloc import (get_scale,
                                                    valid_addressing_size)
from rpython.jit.backend.x86.arch import (FRAME_FIXED_SIZE, WORD, IS_X86_64,
                                          JITFRAME_FIXED_SIZE, IS_X86_32,
                                          PASS_ON_MY_FRAME, THREADLOCAL_OFS,
                                          DEFAULT_FRAME_BYTES)
from rpython.jit.backend.x86.regloc import (eax, ecx, edx, ebx, esp, ebp, esi,
    xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, r8, r9, r10, r11, edi,
    r12, r13, r14, r15, X86_64_SCRATCH_REG, X86_64_XMM_SCRATCH_REG,
    RegLoc, FrameLoc, ConstFloatLoc, ImmedLoc, AddressLoc, imm,
    imm0, imm1, FloatImmedLoc, RawEbpLoc, RawEspLoc)
from rpython.rlib.objectmodel import we_are_translated
from rpython.jit.backend.x86 import rx86, codebuf, callbuilder
from rpython.jit.backend.x86.vector_ext import VectorAssemblerMixin
from rpython.jit.backend.x86.callbuilder import follow_jump
from rpython.jit.metainterp.resoperation import rop
from rpython.jit.backend.x86 import support
from rpython.rlib.debug import debug_print, debug_start, debug_stop
from rpython.rlib import rgc
from rpython.jit.codewriter.effectinfo import EffectInfo
from rpython.jit.codewriter import longlong
from rpython.rlib.rarithmetic import intmask, r_uint
from rpython.rlib.objectmodel import compute_unique_id


class Assembler386(BaseAssembler, VectorAssemblerMixin):
    _regalloc = None
    _output_loop_log = None
    _second_tmp_reg = ecx

    DEBUG_FRAME_DEPTH = False

    def __init__(self, cpu, translate_support_code=False):
        BaseAssembler.__init__(self, cpu, translate_support_code)
        self.verbose = False
        self.loop_run_counters = []
        self.float_const_neg_addr = 0
        self.float_const_abs_addr = 0
        self.single_float_const_neg_addr = 0
        self.single_float_const_abs_addr = 0
        self.expand_byte_mask_addr = 0
        self.malloc_slowpath = 0
        self.malloc_slowpath_varsize = 0
        self.wb_slowpath = [0, 0, 0, 0, 0]
        self.setup_failure_recovery()
        self.datablockwrapper = None
        self.stack_check_slowpath = 0
        self.propagate_exception_path = 0
        self.teardown()

    def setup_once(self):
        BaseAssembler.setup_once(self)
        if self.cpu.supports_floats:
            support.ensure_sse2_floats()
            self._build_float_constants()

    def setup(self, looptoken):
        BaseAssembler.setup(self, looptoken)
        assert self.memcpy_addr != 0, "setup_once() not called?"
        self.current_clt = looptoken.compiled_loop_token
        self.pending_guard_tokens = []
        if WORD == 8:
            self.pending_memoryerror_trampoline_from = []
            self.error_trampoline_64 = 0
        self.mc = codebuf.MachineCodeBlockWrapper()
        #assert self.datablockwrapper is None --- but obscure case
        # possible, e.g. getting MemoryError and continuing
        allblocks = self.get_asmmemmgr_blocks(looptoken)
        self.datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr,
                                                        allblocks)
        self.target_tokens_currently_compiling = {}
        self.frame_depth_to_patch = []

    def teardown(self):
        self.pending_guard_tokens = None
        if WORD == 8:
            self.pending_memoryerror_trampoline_from = None
        self.mc = None
        self.current_clt = None

    def _build_float_constants(self):
        # 0x80000000000000008000000000000000
        neg_const = '\x00\x00\x00\x00\x00\x00\x00\x80\x00\x00\x00\x00\x00\x00\x00\x80'
        # 0x7FFFFFFFFFFFFFFF7FFFFFFFFFFFFFFF
        abs_const = '\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F'
        # 0x7FFFFFFF7FFFFFFF7FFFFFFF7FFFFFFF
        single_abs_const = '\xFF\xFF\xFF\x7F\xFF\xFF\xFF\x7F\xFF\xFF\xFF\x7F\xFF\xFF\xFF\x7F'
        # 0x80000000800000008000000080000000
        single_neg_const = '\x00\x00\x00\x80\x00\x00\x00\x80\x00\x00\x00\x80\x00\x00\x00\x80'
        zero_const = '\x00' * 16
        #
        data = neg_const + abs_const + \
               single_neg_const + single_abs_const + \
               zero_const
        datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr, [])
        float_constants = datablockwrapper.malloc_aligned(len(data), alignment=16)
        datablockwrapper.done()
        addr = rffi.cast(rffi.CArrayPtr(lltype.Char), float_constants)
        for i in range(len(data)):
            addr[i] = data[i]
        self.float_const_neg_addr = float_constants
        self.float_const_abs_addr = float_constants + 16
        self.single_float_const_neg_addr = float_constants + 32
        self.single_float_const_abs_addr = float_constants + 48
        self.expand_byte_mask_addr = float_constants + 64
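
    # Annotation (not in the upstream file): a sketch of how the constants
    # built above are consumed.  FLOAT_NEG is implemented as an XOR with the
    # packed sign-bit mask and FLOAT_ABS as an AND with its complement,
    # roughly (exact emitter spellings are an assumption here; the real
    # genop_float_neg/genop_float_abs live further down in the full file):
    #     self.mc.XORPD(loc, heap(self.float_const_neg_addr))  # flip sign bit
    #     self.mc.ANDPD(loc, heap(self.float_const_abs_addr))  # clear sign bit
    # The single-precision masks play the same role for 32-bit floats.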

    def set_extra_stack_depth(self, mc, value):
        if self._is_asmgcc():
            extra_ofs = self.cpu.get_ofs_of_frame_field('jf_extra_stack_depth')
            mc.MOV_bi(extra_ofs, value)

    def build_frame_realloc_slowpath(self):
        mc = codebuf.MachineCodeBlockWrapper()
        self._push_all_regs_to_frame(mc, [], self.cpu.supports_floats)
        # the caller already did push_gcmap(store=True)
        if IS_X86_64:
            mc.MOV_rs(esi.value, WORD*2)
            # push first arg
            mc.MOV_rr(edi.value, ebp.value)
            align = callbuilder.align_stack_words(1)
            mc.SUB_ri(esp.value, (align - 1) * WORD)
        else:
            align = callbuilder.align_stack_words(3)
            mc.MOV_rs(eax.value, WORD * 2)
            mc.SUB_ri(esp.value, (align - 1) * WORD)
            mc.MOV_sr(WORD, eax.value)
            mc.MOV_sr(0, ebp.value)
        # align
        self.set_extra_stack_depth(mc, align * WORD)
        self._store_and_reset_exception(mc, None, ebx, ecx)
        mc.CALL(imm(self.cpu.realloc_frame))
        mc.MOV_rr(ebp.value, eax.value)
        self._restore_exception(mc, None, ebx, ecx)
        mc.ADD_ri(esp.value, (align - 1) * WORD)
        self.set_extra_stack_depth(mc, 0)
        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        if gcrootmap and gcrootmap.is_shadow_stack:
            self._load_shadowstack_top_in_ebx(mc, gcrootmap)
            mc.MOV_mr((ebx.value, -WORD), eax.value)
        self.pop_gcmap(mc)   # cancel the push_gcmap(store=True) in the caller
        self._pop_all_regs_from_frame(mc, [], self.cpu.supports_floats)
        mc.RET()
        self._frame_realloc_slowpath = mc.materialize(self.cpu, [])

    def _build_cond_call_slowpath(self, supports_floats, callee_only):
        """ This builds a general call slowpath, for whatever call happens to
        come.
        """
        mc = codebuf.MachineCodeBlockWrapper()
        # copy registers to the frame, with the exception of the
        # 'cond_call_register_arguments' and eax, because these have already
        # been saved by the caller.  Note that this is not symmetrical:
        # these 5 registers are saved by the caller but 4 of them are
        # restored here at the end of this function.
        self._push_all_regs_to_frame(mc, cond_call_register_arguments + [eax],
                                     supports_floats, callee_only)
        # the caller already did push_gcmap(store=True)
        if IS_X86_64:
            mc.SUB(esp, imm(WORD))     # alignment
            self.set_extra_stack_depth(mc, 2 * WORD)
            # the arguments are already in the correct registers
        else:
            # we want space for 4 arguments + call + alignment
            mc.SUB(esp, imm(WORD * 7))
            self.set_extra_stack_depth(mc, 8 * WORD)
            # store the arguments at the correct place in the stack
            for i in range(4):
                mc.MOV_sr(i * WORD, cond_call_register_arguments[i].value)
        mc.CALL(eax)
        self._reload_frame_if_necessary(mc)
        if IS_X86_64:
            mc.ADD(esp, imm(WORD))
        else:
            mc.ADD(esp, imm(WORD * 7))
        self.set_extra_stack_depth(mc, 0)
        self.pop_gcmap(mc)   # cancel the push_gcmap(store=True) in the caller
        self._pop_all_regs_from_frame(mc, [eax], supports_floats, callee_only)
        mc.RET()
        return mc.materialize(self.cpu, [])

    def _build_malloc_slowpath(self, kind):
        """ While arriving on slowpath, we have a gcpattern on stack 0.
        The arguments are passed in ecx and edx, as follows:
        kind == 'fixed': nursery_head in ecx and the size in (edx - ecx).
        kind == 'str/unicode': length of the string to allocate in edx.
        kind == 'var': length to allocate in edx, tid in ecx,
                       and itemsize in the stack 1 (position esp+WORD).
        This function must preserve all registers apart from ecx and edx.
        """
        assert kind in ['fixed', 'str', 'unicode', 'var']
        mc = codebuf.MachineCodeBlockWrapper()
        self._push_all_regs_to_frame(mc, [ecx, edx], self.cpu.supports_floats)
        # the caller already did push_gcmap(store=True)
        #
        if kind == 'fixed':
            addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
        elif kind == 'str':
            addr = self.cpu.gc_ll_descr.get_malloc_fn_addr('malloc_str')
        elif kind == 'unicode':
            addr = self.cpu.gc_ll_descr.get_malloc_fn_addr('malloc_unicode')
        else:
            addr = self.cpu.gc_ll_descr.get_malloc_slowpath_array_addr()
        mc.SUB_ri(esp.value, 16 - WORD)  # restore 16-byte alignment
        # magically, the above is enough on X86_32 to reserve 3 stack places
        if kind == 'fixed':
            mc.SUB_rr(edx.value, ecx.value)     # compute the size we want
            if IS_X86_32:
                mc.MOV_sr(0, edx.value)         # store the length
                if hasattr(self.cpu.gc_ll_descr, 'passes_frame'):
                    mc.MOV_sr(WORD, ebp.value)  # for tests only
            else:
                mc.MOV_rr(edi.value, edx.value)   # length argument
                if hasattr(self.cpu.gc_ll_descr, 'passes_frame'):
                    mc.MOV_rr(esi.value, ebp.value)   # for tests only
        elif kind == 'str' or kind == 'unicode':
            if IS_X86_32:
                # stack layout: [---][---][---][ret].. with 3 free stack places
                mc.MOV_sr(0, edx.value)     # store the length
            elif IS_X86_64:
                mc.MOV_rr(edi.value, edx.value)   # length argument
        else:
            if IS_X86_32:
                # stack layout: [---][---][---][ret][gcmap][itemsize]...
                mc.MOV_sr(WORD * 2, edx.value)      # store the length
                mc.MOV_sr(WORD * 1, ecx.value)      # store the tid
                mc.MOV_rs(edx.value, WORD * 5)      # load the itemsize
                mc.MOV_sr(WORD * 0, edx.value)      # store the itemsize
            else:
                # stack layout: [---][ret][gcmap][itemsize]...
                # length: already in edx
                mc.MOV_rr(esi.value, ecx.value)     # tid
                mc.MOV_rs(edi.value, WORD * 3)      # load the itemsize
        self.set_extra_stack_depth(mc, 16)
        mc.CALL(imm(follow_jump(addr)))
        self._reload_frame_if_necessary(mc)
        mc.ADD_ri(esp.value, 16 - WORD)
        self.set_extra_stack_depth(mc, 0)
        #
        mc.TEST_rr(eax.value, eax.value)
        mc.J_il(rx86.Conditions['Z'], 0xfffff)    # patched later
        jz_location = mc.get_relative_pos()
        mc.MOV_rr(ecx.value, eax.value)
        #
        nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
        self._pop_all_regs_from_frame(mc, [ecx, edx], self.cpu.supports_floats)
        mc.MOV(edx, heap(nursery_free_adr))   # load this in EDX
        self.pop_gcmap(mc)   # push_gcmap(store=True) done by the caller
        mc.RET()
        #
        # If the slowpath malloc failed, we raise a MemoryError that
        # always interrupts the current loop, as a "good enough"
        # approximation.  We have to adjust the esp a little, to point to
        # the correct "ret" arg
        offset = mc.get_relative_pos() - jz_location
        mc.overwrite32(jz_location-4, offset)
        # From now on this function is basically "merged" with
        # its caller and so contains DEFAULT_FRAME_BYTES bytes
        # plus my own return address, which we'll ignore next
        mc.force_frame_size(DEFAULT_FRAME_BYTES + WORD)
        mc.ADD_ri(esp.value, WORD)
        mc.JMP(imm(self.propagate_exception_path))
        #
        rawstart = mc.materialize(self.cpu, [])
        return rawstart
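
    # Annotation (not in the upstream file): the inline fast path that falls
    # back to this slowpath is emitted by the nursery-allocation code further
    # down in the full file (truncated here).  For kind == 'fixed' it looks
    # roughly like this sketch:
    #     mc.MOV(ecx, heap(nursery_free_adr))      # ecx = nursery_free
    #     mc.LEA_rm(edx.value, (ecx.value, size))  # edx = ecx + size
    #     mc.CMP(edx, heap(nursery_top_adr))       # enough room in nursery?
    #     mc.J_il8(rx86.Conditions['NA'], 0)       # if yes, skip the call
    #     mc.CALL(imm(self.malloc_slowpath))       # else: slowpath above
    #     mc.MOV(heap(nursery_free_adr), edx)      # publish new nursery_free
    # which matches the contract in the docstring: nursery_head in ecx and
    # the requested size in (edx - ecx).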

    def _build_propagate_exception_path(self):
        self.mc = codebuf.MachineCodeBlockWrapper()
        self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
        #
        # read and reset the current exception
        self._store_and_reset_exception(self.mc, eax)
        ofs = self.cpu.get_ofs_of_frame_field('jf_guard_exc')
        self.mc.MOV_br(ofs, eax.value)
        propagate_exception_descr = rffi.cast(lltype.Signed,
                cast_instance_to_gcref(self.cpu.propagate_exception_descr))
        ofs = self.cpu.get_ofs_of_frame_field('jf_descr')
        self.mc.MOV(RawEbpLoc(ofs), imm(propagate_exception_descr))
        #
        self._call_footer()
        rawstart = self.mc.materialize(self.cpu, [])
        self.propagate_exception_path = rawstart
        self.mc = None

    def _build_stack_check_slowpath(self):
        _, _, slowpathaddr = self.cpu.insert_stack_check()
        if slowpathaddr == 0 or not self.cpu.propagate_exception_descr:
            return      # no stack check (for tests, or non-translated)
        #
        # make a regular function that is called from a point near the start
        # of an assembler function (after it adjusts the stack and saves
        # registers).
        mc = codebuf.MachineCodeBlockWrapper()
        #
        if IS_X86_64:
            mc.MOV_rr(edi.value, esp.value)
            mc.SUB_ri(esp.value, WORD)      # alignment
        #
        if IS_X86_32:
            mc.SUB_ri(esp.value, 2*WORD)    # alignment
            mc.PUSH_r(esp.value)
        #
        # esp is now aligned to a multiple of 16 again
        mc.CALL(imm(follow_jump(slowpathaddr)))
        #
        if IS_X86_32:
            mc.ADD_ri(esp.value, 3*WORD)    # alignment
        else:
            mc.ADD_ri(esp.value, WORD)
        #
        mc.MOV(eax, heap(self.cpu.pos_exception()))
        mc.TEST_rr(eax.value, eax.value)
        mc.J_il8(rx86.Conditions['NZ'], 0)
        jnz_location = mc.get_relative_pos()
        #
        mc.RET()
        #
        # patch the JNZ above
        offset = mc.get_relative_pos() - jnz_location
        assert 0 < offset <= 127
        mc.overwrite(jnz_location-1, chr(offset))
        # From now on this function is basically "merged" with
        # its caller and so contains DEFAULT_FRAME_BYTES bytes
        # plus my own return address, which we'll ignore next
        mc.force_frame_size(DEFAULT_FRAME_BYTES + WORD)
        mc.ADD_ri(esp.value, WORD)
        mc.JMP(imm(self.propagate_exception_path))
        #
        rawstart = mc.materialize(self.cpu, [])
        self.stack_check_slowpath = rawstart
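
    # Annotation (not in the upstream file): a worked example of the
    # alignment arithmetic above.  On x86-64, the CALL into this helper
    # pushed an 8-byte return address, so on entry esp % 16 == 8; the single
    # `SUB esp, WORD` restores esp % 16 == 0, which the System V ABI
    # requires at the next CALL.  On x86-32 the return address is 4 bytes,
    # `SUB esp, 2*WORD` reserves 8 more, and PUSH_r adds 4, for 16 in total;
    # the matching `ADD esp, 3*WORD` afterwards undoes the 12 bytes that
    # this function itself reserved.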

    def _build_wb_slowpath(self, withcards, withfloats=False, for_frame=False):
        descr = self.cpu.gc_ll_descr.write_barrier_descr
        exc0, exc1 = None, None
        if descr is None:
            return
        if not withcards:
            func = descr.get_write_barrier_fn(self.cpu)
        else:
            if descr.jit_wb_cards_set == 0:
                return
            func = descr.get_write_barrier_from_array_fn(self.cpu)
            if func == 0:
                return
        #
        # This builds a helper function called from the slow path of
        # write barriers.  It must save all registers, and optionally
        # all XMM registers.  It takes a single argument just pushed
        # on the stack even on X86_64.  It must restore stack alignment
        # accordingly.
        mc = codebuf.MachineCodeBlockWrapper()
        #
        if not for_frame:
            self._push_all_regs_to_frame(mc, [], withfloats, callee_only=True)
            if IS_X86_32:
                # we have 2 extra words on stack for retval and we pass 1 extra
                # arg, so we need to subtract 2 words
                mc.SUB_ri(esp.value, 2 * WORD)
                mc.MOV_rs(eax.value, 3 * WORD)    # 2 + 1
                mc.MOV_sr(0, eax.value)
            else:
                mc.MOV_rs(edi.value, WORD)
        else:
            # NOTE: don't save registers on the jitframe here!
            # It might override already-saved values that will be
            # restored later...
            #
            # This 'for_frame' version is called after a CALL.  It does not
            # need to save many registers: the registers that are anyway
            # destroyed by the call can be ignored (volatiles), and the
            # non-volatile registers won't be changed here.  It only needs
            # to save eax, maybe edx, and xmm0 (possible results of the call)
            # and two more non-volatile registers (used to store the RPython
            # exception that occurred in the CALL, if any).
            assert not withcards
            # we have one word to align
            mc.SUB_ri(esp.value, 7 * WORD)    # align and reserve some space
            mc.MOV_sr(WORD, eax.value)        # save for later
            if self.cpu.supports_floats:
                mc.MOVSD_sx(2 * WORD, xmm0.value)   # 32-bit: also 3 * WORD
            if IS_X86_32:
                mc.MOV_sr(4 * WORD, edx.value)
                mc.MOV_sr(0, ebp.value)
                exc0, exc1 = esi, edi
            else:
                mc.MOV_rr(edi.value, ebp.value)
                exc0, exc1 = ebx, r12
            mc.MOV(RawEspLoc(WORD * 5, REF), exc0)
            mc.MOV(RawEspLoc(WORD * 6, INT), exc1)
            # note that it's safe to store the exception in register,
            # since the call to write barrier can't collect
            # (and this is assumed a bit left and right here, like lack
            # of _reload_frame_if_necessary)
            self._store_and_reset_exception(mc, exc0, exc1)
        mc.CALL(imm(func))
        #
        if withcards:
            # A final TEST8 before the RET, for the caller.  Careful to
            # not follow this instruction with another one that changes
            # the status of the CPU flags!
            if IS_X86_32:
                mc.MOV_rs(eax.value, 3*WORD)
            else:
                mc.MOV_rs(eax.value, WORD)
            mc.TEST8(addr_add_const(eax, descr.jit_wb_if_flag_byteofs),
                     imm(-0x80))
        #
        if not for_frame:
            if IS_X86_32:
                # ADD touches CPU flags
                mc.LEA_rs(esp.value, 2 * WORD)
            self._pop_all_regs_from_frame(mc, [], withfloats, callee_only=True)
            mc.RET16_i(WORD)
            # Note that wb_slowpath[0..3] end with a RET16_i, which must be
            # taken care of in the caller by stack_frame_size_delta(-WORD)
        else:
            if IS_X86_32:
                mc.MOV_rs(edx.value, 4 * WORD)
            if self.cpu.supports_floats:
                mc.MOVSD_xs(xmm0.value, 2 * WORD)
            mc.MOV_rs(eax.value, WORD)    # restore
            self._restore_exception(mc, exc0, exc1)
            mc.MOV(exc0, RawEspLoc(WORD * 5, REF))
            mc.MOV(exc1, RawEspLoc(WORD * 6, INT))
            mc.LEA_rs(esp.value, 7 * WORD)
            mc.RET()
        rawstart = mc.materialize(self.cpu, [])
        if for_frame:
            self.wb_slowpath[4] = rawstart
        else:
            self.wb_slowpath[withcards + 2 * withfloats] = rawstart
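
    # Annotation (not in the upstream file): callers pick a variant with the
    # same indexing used above, roughly:
    #     helper = self.wb_slowpath[withcards + 2 * withfloats]
    #     mc.PUSH(loc_base)        # single stack argument, even on X86_64
    #     mc.CALL(imm(helper))     # variants 0..3 return with RET16_i(WORD),
    #                              # hence stack_frame_size_delta(-WORD)
    # and wb_slowpath[4] is the special 'for_frame' version used right after
    # a CALL that may have triggered a write barrier on the jitframe itself.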

    @rgc.no_release_gil
    def assemble_loop(self, jd_id, unique_id, logger, loopname, inputargs,
                      operations, looptoken, log):
        '''adds the following attributes to looptoken:
               _ll_function_addr   (address of the generated func, as an int)
               _ll_loop_code       (debug: addr of the start of the ResOps)
               _x86_fullsize       (debug: full size including failure)
        '''
        # XXX this function is too longish and contains some code
        # duplication with assemble_bridge().  Also, we should think
        # about not storing on 'self' attributes that will live only
        # for the duration of compiling one loop or one bridge.
        clt = CompiledLoopToken(self.cpu, looptoken.number)
        looptoken.compiled_loop_token = clt
        clt._debug_nbargs = len(inputargs)
        if not we_are_translated():
            # Arguments should be unique
            assert len(set(inputargs)) == len(inputargs)

        self.setup(looptoken)
        if self.cpu.HAS_CODEMAP:
            self.codemap_builder.enter_portal_frame(jd_id, unique_id,
                                                    self.mc.get_relative_pos())
        frame_info = self.datablockwrapper.malloc_aligned(
            jitframe.JITFRAMEINFO_SIZE, alignment=WORD)
        clt.frame_info = rffi.cast(jitframe.JITFRAMEINFOPTR, frame_info)
        clt.frame_info.clear()    # for now

        if log:
            number = looptoken.number
            operations = self._inject_debugging_code(looptoken, operations,
                                                     'e', number)

        regalloc = RegAlloc(self, self.cpu.translate_support_code)
        #
        allgcrefs = []
        operations = regalloc.prepare_loop(inputargs, operations,
                                           looptoken, allgcrefs)
        self.reserve_gcref_table(allgcrefs)
        functionpos = self.mc.get_relative_pos()
        self._call_header_with_stack_check()
        self._check_frame_depth_debug(self.mc)
        looppos = self.mc.get_relative_pos()
        frame_depth_no_fixed_size = self._assemble(regalloc, inputargs,
                                                   operations)
        self.update_frame_depth(frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE)
        #
        size_excluding_failure_stuff = self.mc.get_relative_pos()
        self.write_pending_failure_recoveries(regalloc)
        full_size = self.mc.get_relative_pos()
        #
        rawstart = self.materialize_loop(looptoken)
        self.patch_gcref_table(looptoken, rawstart)
        self.patch_stack_checks(frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE,
                                rawstart)
        looptoken._ll_loop_code = looppos + rawstart
        debug_start("jit-backend-addr")
        debug_print("Loop %d (%s) has address 0x%x to 0x%x (bootstrap 0x%x)" % (
            looptoken.number, loopname,
            r_uint(rawstart + looppos),
            r_uint(rawstart + size_excluding_failure_stuff),
            r_uint(rawstart + functionpos)))
        debug_print("       gc table: 0x%x" % r_uint(self.gc_table_addr))
        debug_print("       function: 0x%x" % r_uint(rawstart + functionpos))
        debug_print("         resops: 0x%x" % r_uint(rawstart + looppos))
        debug_print("       failures: 0x%x" % r_uint(rawstart +
                                                     size_excluding_failure_stuff))
        debug_print("            end: 0x%x" % r_uint(rawstart + full_size))
        debug_stop("jit-backend-addr")
        self.patch_pending_failure_recoveries(rawstart)
        #
        ops_offset = self.mc.ops_offset
        if not we_are_translated():
            # used only by looptoken.dump() -- useful in tests
            looptoken._x86_rawstart = rawstart
            looptoken._x86_fullsize = full_size
            looptoken._x86_ops_offset = ops_offset
        looptoken._ll_function_addr = rawstart + functionpos
        if logger:
            log = logger.log_trace(jl.MARK_TRACE_ASM, None, self.mc)
            log.write(inputargs, operations, ops_offset=ops_offset)
            # legacy
            if logger.logger_ops:
                logger.logger_ops.log_loop(inputargs, operations, 0,
                                           "rewritten", name=loopname,
                                           ops_offset=ops_offset)

        self.fixup_target_tokens(rawstart)
        self.teardown()
        # oprofile support
        if self.cpu.profile_agent is not None:
            name = "Loop # %s: %s" % (looptoken.number, loopname)
            self.cpu.profile_agent.native_code_written(name,
                                                       rawstart, full_size)
        return AsmInfo(ops_offset, rawstart + looppos,
                       size_excluding_failure_stuff - looppos, rawstart)

    @rgc.no_release_gil
    def assemble_bridge(self, faildescr, inputargs, operations,
                        original_loop_token, log, logger):
        if not we_are_translated():
            # Arguments should be unique
            assert len(set(inputargs)) == len(inputargs)

        self.setup(original_loop_token)
        if self.cpu.HAS_CODEMAP:
            self.codemap_builder.inherit_code_from_position(
                faildescr.adr_jump_offset)
        self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
        descr_number = compute_unique_id(faildescr)
        if log:
            operations = self._inject_debugging_code(faildescr, operations,
                                                     'b', descr_number)
        arglocs = self.rebuild_faillocs_from_descr(faildescr, inputargs)
        regalloc = RegAlloc(self, self.cpu.translate_support_code)
        allgcrefs = []
        operations = regalloc.prepare_bridge(inputargs, arglocs,
                                             operations,
                                             allgcrefs,
                                             self.current_clt.frame_info)
        self.reserve_gcref_table(allgcrefs)
        startpos = self.mc.get_relative_pos()
        self._check_frame_depth(self.mc, regalloc.get_gcmap())
        bridgestartpos = self.mc.get_relative_pos()
        self._update_at_exit(arglocs, inputargs, faildescr, regalloc)
        frame_depth_no_fixed_size = self._assemble(regalloc, inputargs, operations)
        codeendpos = self.mc.get_relative_pos()
        self.write_pending_failure_recoveries(regalloc)
        fullsize = self.mc.get_relative_pos()
        #
        rawstart = self.materialize_loop(original_loop_token)
        self.patch_gcref_table(original_loop_token, rawstart)
        self.patch_stack_checks(frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE,
                                rawstart)
        debug_start("jit-backend-addr")
        debug_print("bridge out of Guard 0x%x has address 0x%x to 0x%x" %
                    (r_uint(descr_number), r_uint(rawstart + startpos),
                     r_uint(rawstart + codeendpos)))
        debug_print("       gc table: 0x%x" % r_uint(self.gc_table_addr))
        debug_print("    jump target: 0x%x" % r_uint(rawstart + startpos))
        debug_print("         resops: 0x%x" % r_uint(rawstart + bridgestartpos))
        debug_print("       failures: 0x%x" % r_uint(rawstart + codeendpos))
        debug_print("            end: 0x%x" % r_uint(rawstart + fullsize))
        debug_stop("jit-backend-addr")
        self.patch_pending_failure_recoveries(rawstart)
        # patch the jump from original guard
        self.patch_jump_for_descr(faildescr, rawstart + startpos)
        ops_offset = self.mc.ops_offset
        frame_depth = max(self.current_clt.frame_info.jfi_frame_depth,
                          frame_depth_no_fixed_size + JITFRAME_FIXED_SIZE)
        if logger:
            log = logger.log_trace(jl.MARK_TRACE_ASM, None, self.mc)
            log.write(inputargs, operations, ops_offset)
            # log that the already written bridge is stitched to a descr!
            logger.log_patch_guard(descr_number, rawstart)
            # legacy
            if logger.logger_ops:
                logger.logger_ops.log_bridge(inputargs, operations, "rewritten",
                                             faildescr, ops_offset=ops_offset)

        self.fixup_target_tokens(rawstart)
        self.update_frame_depth(frame_depth)
        self.teardown()
        # oprofile support
        if self.cpu.profile_agent is not None:
            name = "Bridge # %s" % (descr_number,)
            self.cpu.profile_agent.native_code_written(name,
                                                       rawstart, fullsize)
        return AsmInfo(ops_offset, startpos + rawstart, codeendpos - startpos,
                       rawstart + bridgestartpos)

    def stitch_bridge(self, faildescr, target):
        """ Stitching means that one can enter a bridge with a completely
        different register allocation.  This needs remapping, which is done
        here for both normal registers and accumulation registers.
        Why?  Because this only generates a very small chunk of memory,
        instead of duplicating the loop assembler for each faildescr!
        """
        asminfo, bridge_faildescr, version, looptoken = target
        assert isinstance(bridge_faildescr, ResumeGuardDescr)
        assert isinstance(faildescr, ResumeGuardDescr)
        assert asminfo.rawstart != 0
        self.mc = codebuf.MachineCodeBlockWrapper()
        allblocks = self.get_asmmemmgr_blocks(looptoken)
        self.datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr,
                                                        allblocks)
        frame_info = self.datablockwrapper.malloc_aligned(
            jitframe.JITFRAMEINFO_SIZE, alignment=WORD)
        self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
        # if accumulation is saved at the guard, we need to update it here!
        guard_locs = self.rebuild_faillocs_from_descr(faildescr,
                                                      version.inputargs)
        bridge_locs = self.rebuild_faillocs_from_descr(bridge_faildescr,
                                                       version.inputargs)
        #import pdb; pdb.set_trace()
        guard_accum_info = faildescr.rd_vector_info
        # O(n^2), but usually you only have at most 1 fail argument
        while guard_accum_info:
            bridge_accum_info = bridge_faildescr.rd_vector_info
            while bridge_accum_info:
                if bridge_accum_info.failargs_pos == guard_accum_info.failargs_pos:
                    # the mapping might be wrong!
                    if bridge_accum_info.location is not guard_accum_info.location:
                        self.mov(guard_accum_info.location,
                                 bridge_accum_info.location)
                bridge_accum_info = bridge_accum_info.next()
            guard_accum_info = guard_accum_info.next()

        # register mapping is most likely NOT valid, thus remap it in this
        # short piece of assembler
        assert len(guard_locs) == len(bridge_locs)
        for i, gloc in enumerate(guard_locs):
            bloc = bridge_locs[i]
            bstack = bloc.location_code() == 'b'
            gstack = gloc.location_code() == 'b'
            if bstack and gstack:
                pass
            elif gloc is not bloc:
                self.mov(gloc, bloc)
        offset = self.mc.get_relative_pos()
        self.mc.JMP_l(0)
        self.mc.writeimm32(0)
        self.mc.force_frame_size(DEFAULT_FRAME_BYTES)
        rawstart = self.materialize_loop(looptoken)
        # update the jump (above) to the real trace
        self._patch_jump_to(rawstart + offset, asminfo.rawstart)
        # update the guard to jump right to this custom piece of assembler
        self.patch_jump_for_descr(faildescr, rawstart)

    def _patch_jump_to(self, adr_jump_offset, adr_new_target):
        assert adr_jump_offset != 0
        offset = adr_new_target - (adr_jump_offset + 5)
        mc = codebuf.MachineCodeBlockWrapper()
        mc.force_frame_size(DEFAULT_FRAME_BYTES)
        if rx86.fits_in_32bits(offset):
            mc.JMP_l(offset)
        else:
            mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
            mc.JMP_r(X86_64_SCRATCH_REG.value)
        mc.copy_to_raw_memory(adr_jump_offset)

    def reserve_gcref_table(self, allgcrefs):
        gcref_table_size = len(allgcrefs) * WORD
        if IS_X86_64:
            # align to a multiple of 16 and reserve space at the beginning
            # of the machine code for the gc table.  This lets us write
            # machine code with relative addressing (%rip - constant).
            gcref_table_size = (gcref_table_size + 15) & ~15
            mc = self.mc
            assert mc.get_relative_pos() == 0
            for i in range(gcref_table_size):
                mc.writechar('\x00')
        elif IS_X86_32:
            # allocate the gc table right now.  This lets us write
            # machine code with absolute 32-bit addressing.
            self.gc_table_addr = self.datablockwrapper.malloc_aligned(
                gcref_table_size, alignment=WORD)
        #
        self.setup_gcrefs_list(allgcrefs)

    def patch_gcref_table(self, looptoken, rawstart):
        if IS_X86_64:
            # the gc table is at the start of the machine code
            self.gc_table_addr = rawstart
        elif IS_X86_32:
            # the gc table was already allocated by reserve_gcref_table()
            rawstart = self.gc_table_addr
        #
        tracer = self.cpu.gc_ll_descr.make_gcref_tracer(rawstart,
                                                        self._allgcrefs)
        gcreftracers = self.get_asmmemmgr_gcreftracers(looptoken)
        gcreftracers.append(tracer)    # keepalive
        self.teardown_gcrefs_list()

    def write_pending_failure_recoveries(self, regalloc):
        # for each pending guard, generate the code of the recovery stub
        # at the end of self.mc.
        for tok in self.pending_guard_tokens:
            descr = tok.faildescr
            if descr.loop_version():
                startpos = self.mc.get_relative_pos()
                self.store_info_on_descr(startpos, tok)
            else:
                tok.pos_recovery_stub = self.generate_quick_failure(tok, regalloc)
        if WORD == 8 and len(self.pending_memoryerror_trampoline_from) > 0:
            self.error_trampoline_64 = self.generate_propagate_error_64()

    def patch_pending_failure_recoveries(self, rawstart):
        # after we wrote the assembler to raw memory, set up
        # tok.faildescr.adr_jump_offset to contain the raw address of
        # the 4-byte target field in the JMP/Jcond instruction, and patch
        # the field in question to point (initially) to the recovery stub
        clt = self.current_clt
        for tok in self.pending_guard_tokens:
            addr = rawstart + tok.pos_jump_offset
            tok.faildescr.adr_jump_offset = addr
            descr = tok.faildescr
            if descr.loop_version():
                continue    # patch them later
            relative_target = tok.pos_recovery_stub - (tok.pos_jump_offset + 4)
            assert rx86.fits_in_32bits(relative_target)
            #
            if not tok.guard_not_invalidated():
                mc = codebuf.MachineCodeBlockWrapper()
                mc.writeimm32(relative_target)
                mc.copy_to_raw_memory(addr)
            else:
                # GUARD_NOT_INVALIDATED, record an entry in
                # clt.invalidate_positions of the form:
                #     (addr-in-the-code-of-the-not-yet-written-jump-target,
                #      relative-target-to-use)
                relpos = tok.pos_jump_offset
                clt.invalidate_positions.append((rawstart + relpos,
                                                 relative_target))
                # General idea: Although no code was generated by this
                # guard, the code might be patched with a "JMP rel32" to
                # the guard recovery code.  This recovery code is
                # already generated, and looks like the recovery code
                # for any guard, even if at first it has no jump to it.
                # So we may later write 5 bytes overriding the existing
                # instructions; this works because a CALL instruction
                # would also take at least 5 bytes.  If it could take
                # less, we would run into the issue that overwriting the
                # 5 bytes here might get a few nonsense bytes at the
                # return address of the following CALL.
        if WORD == 8:
            for pos_after_jz in self.pending_memoryerror_trampoline_from:
                assert self.error_trampoline_64 != 0    # only if non-empty
                mc = codebuf.MachineCodeBlockWrapper()
                mc.writeimm32(self.error_trampoline_64 - pos_after_jz)
                mc.copy_to_raw_memory(rawstart + pos_after_jz - 4)

    def update_frame_depth(self, frame_depth):
        baseofs = self.cpu.get_baseofs_of_frame_field()
        self.current_clt.frame_info.update_frame_depth(baseofs, frame_depth)

    def patch_stack_checks(self, framedepth, rawstart):
        for ofs in self.frame_depth_to_patch:
            self._patch_frame_depth(ofs + rawstart, framedepth)

    def _check_frame_depth(self, mc, gcmap):
        """ Check if the frame is of enough depth to follow this bridge.
        Otherwise reallocate the frame in a helper.  There are other
        potential solutions to that, but this one does not sound too bad.
        """
        descrs = self.cpu.gc_ll_descr.getframedescrs(self.cpu)
        ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr)
        mc.CMP_bi(ofs, 0xffffff)     # force writing 32 bit
        stack_check_cmp_ofs = mc.get_relative_pos() - 4
        mc.J_il8(rx86.Conditions['GE'], 0)
        jg_location = mc.get_relative_pos()
        mc.MOV_si(WORD, 0xffffff)    # force writing 32 bit
        ofs2 = mc.get_relative_pos() - 4
        self.push_gcmap(mc, gcmap, store=True)
        mc.CALL(imm(self._frame_realloc_slowpath))
        # patch the JGE above
        offset = mc.get_relative_pos() - jg_location
        assert 0 < offset <= 127
        mc.overwrite(jg_location-1, chr(offset))
        self.frame_depth_to_patch.append(stack_check_cmp_ofs)
        self.frame_depth_to_patch.append(ofs2)
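
    # Annotation (not in the upstream file): the 0xffffff values above are
    # placeholders whose offsets are remembered in frame_depth_to_patch.
    # Once the final frame depth is known, patch_stack_checks() overwrites
    # each 4-byte immediate in place via _patch_frame_depth() (defined just
    # below), i.e. essentially:
    #     mc = codebuf.MachineCodeBlockWrapper()
    #     mc.writeimm32(allocated_depth)
    #     mc.copy_to_raw_memory(adr)    # adr = rawstart + saved offset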

    def _check_frame_depth_debug(self, mc):
        """ Double-check the frame depth.  It prints the error (and
        potentially segfaults later).
        """
        if not self.DEBUG_FRAME_DEPTH:
            return
        descrs = self.cpu.gc_ll_descr.getframedescrs(self.cpu)
        ofs = self.cpu.unpack_fielddescr(descrs.arraydescr.lendescr)
        mc.CMP_bi(ofs, 0xffffff)
        stack_check_cmp_ofs = mc.get_relative_pos() - 4
        mc.J_il8(rx86.Conditions['GE'], 0)
        jg_location = mc.get_relative_pos()
        mc.MOV_rr(edi.value, ebp.value)
        mc.MOV_ri(esi.value, 0xffffff)
        ofs2 = mc.get_relative_pos() - 4
        mc.CALL(imm(self.cpu.realloc_frame_crash))
        # patch the JGE above
        offset = mc.get_relative_pos() - jg_location
        assert 0 < offset <= 127
        mc.overwrite(jg_location-1, chr(offset))
        self.frame_depth_to_patch.append(stack_check_cmp_ofs)
        self.frame_depth_to_patch.append(ofs2)

    def _patch_frame_depth(self, adr, allocated_depth):
        mc = codebuf.MachineCodeBlockWrapper()
        mc.writeimm32(allocated_depth)
        mc.copy_to_raw_memory(adr)

    def materialize_loop(self, looptoken):
        self.datablockwrapper.done()    # finish using cpu.asmmemmgr
        self.datablockwrapper = None
        allblocks = self.get_asmmemmgr_blocks(looptoken)
        size = self.mc.get_relative_pos()
        res = self.mc.materialize(self.cpu, allblocks,
                                  self.cpu.gc_ll_descr.gcrootmap)
        if self.cpu.HAS_CODEMAP:
            self.cpu.codemap.register_codemap(
                self.codemap_builder.get_final_bytecode(res, size))
        return res

    def patch_jump_for_descr(self, faildescr, adr_new_target):
        adr_jump_offset = faildescr.adr_jump_offset
        assert adr_jump_offset != 0
        offset = adr_new_target - (adr_jump_offset + 4)
        # If the new target fits within a rel32 of the jump, just patch
        # that.  Otherwise, leave the original rel32 to the recovery stub in
        # place, but clobber the recovery stub with a jump to the real
        # target.
        mc = codebuf.MachineCodeBlockWrapper()
        mc.force_frame_size(DEFAULT_FRAME_BYTES)
        if rx86.fits_in_32bits(offset):
            mc.writeimm32(offset)
            mc.copy_to_raw_memory(adr_jump_offset)
        else:
            # "mov r11, addr; jmp r11" is up to 13 bytes, which fits in there
            # because we always write "mov r11, imm-as-8-bytes; call *r11" in
            # the first place.
            mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
            mc.JMP_r(X86_64_SCRATCH_REG.value)
            p = rffi.cast(rffi.INTP, adr_jump_offset)
            adr_target = adr_jump_offset + 4 + rffi.cast(lltype.Signed, p[0])
            mc.copy_to_raw_memory(adr_target)
        faildescr.adr_jump_offset = 0    # means "patched"
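
    # Annotation (not in the upstream file): a worked example of the rel32
    # arithmetic above.  'adr_jump_offset' is the address of the 4-byte
    # displacement field itself, and the CPU resolves a rel32 jump as
    # (address of the next instruction) + rel32, i.e.
    #     target = adr_jump_offset + 4 + rel32
    # so to land on adr_new_target we store
    #     rel32 = adr_new_target - (adr_jump_offset + 4)
    # Reading p[0] back in the else-branch recovers, by the same formula,
    # the address of the existing recovery stub that gets clobbered.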

    def fixup_target_tokens(self, rawstart):
        for targettoken in self.target_tokens_currently_compiling:
            targettoken._ll_loop_code += rawstart
        self.target_tokens_currently_compiling = None

    def _assemble(self, regalloc, inputargs, operations):
        self._regalloc = regalloc
        self.guard_success_cc = rx86.cond_none
        regalloc.compute_hint_frame_locations(operations)
        regalloc.walk_operations(inputargs, operations)
        assert self.guard_success_cc == rx86.cond_none
        if we_are_translated() or self.cpu.dont_keepalive_stuff:
            self._regalloc = None    # else keep it around for debugging
        frame_depth = regalloc.get_final_frame_depth()
        jump_target_descr = regalloc.jump_target_descr
        if jump_target_descr is not None:
            tgt_depth = jump_target_descr._x86_clt.frame_info.jfi_frame_depth
            target_frame_depth = tgt_depth - JITFRAME_FIXED_SIZE
            frame_depth = max(frame_depth, target_frame_depth)
        return frame_depth

    def _call_header_vmprof(self):
        from rpython.rlib.rvmprof.rvmprof import cintf, VMPROF_JITTED_TAG

        # tloc = address of pypy_threadlocal_s
        if IS_X86_32:
            # Can't use esi here, its old value is not saved yet.
            # But we can use eax and ecx.
            self.mc.MOV_rs(edx.value, THREADLOCAL_OFS)
            tloc = edx
            old = ecx
        else:
            # The thread-local value is already in esi.
            # We should avoid if possible to use ecx or edx because they
            # would be used to pass arguments #3 and #4 (even though, so
            # far, the assembler only receives two arguments).
            tloc = esi
            old = r11
        # eax = address in the stack of a 3-words struct vmprof_stack_s
        self.mc.LEA_rs(eax.value, (FRAME_FIXED_SIZE - 4) * WORD)
        # old = current value of vmprof_tl_stack
        offset = cintf.vmprof_tl_stack.getoffset()
        self.mc.MOV_rm(old.value, (tloc.value, offset))
        # eax->next = old
        self.mc.MOV_mr((eax.value, 0), old.value)
        # eax->value = my esp
        self.mc.MOV_mr((eax.value, WORD), esp.value)
        # eax->kind = VMPROF_JITTED_TAG
        self.mc.MOV_mi((eax.value, WORD * 2), VMPROF_JITTED_TAG)
        # save in vmprof_tl_stack the new eax
        self.mc.MOV_mr((tloc.value, offset), eax.value)

    def _call_footer_vmprof(self):
        from rpython.rlib.rvmprof.rvmprof import cintf
        # edx = address of pypy_threadlocal_s
        self.mc.MOV_rs(edx.value, THREADLOCAL_OFS)
        self.mc.AND_ri(edx.value, ~1)
        # eax = (our local vmprof_tl_stack).next
        self.mc.MOV_rs(eax.value, (FRAME_FIXED_SIZE - 4 + 0) * WORD)
        # save in vmprof_tl_stack the value eax
        offset = cintf.vmprof_tl_stack.getoffset()
        self.mc.MOV_mr((edx.value, offset), eax.value)

    def _call_header(self):
        self.mc.SUB_ri(esp.value, FRAME_FIXED_SIZE * WORD)
        self.mc.MOV_sr(PASS_ON_MY_FRAME * WORD, ebp.value)
        if IS_X86_64:
            self.mc.MOV_sr(THREADLOCAL_OFS, esi.value)
        if self.cpu.translate_support_code:
            self._call_header_vmprof()    # on X86_64, this uses esi
        if IS_X86_64:
            self.mc.MOV_rr(ebp.value, edi.value)
        else:
            self.mc.MOV_rs(ebp.value, (FRAME_FIXED_SIZE + 1) * WORD)

        for i, loc in enumerate(self.cpu.CALLEE_SAVE_REGISTERS):
            self.mc.MOV_sr((PASS_ON_MY_FRAME + i + 1) * WORD, loc.value)

        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        if gcrootmap and gcrootmap.is_shadow_stack:
            self._call_header_shadowstack(gcrootmap)
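
    # Annotation (not in the upstream file): a rough sketch of the frame
    # laid out by _call_header() above, with offsets from the new esp
    # (FRAME_FIXED_SIZE words below the return address); see arch.py for
    # the authoritative constants:
    #     words 0 .. PASS_ON_MY_FRAME-1 : scratch space left for callees
    #     word  PASS_ON_MY_FRAME        : caller's ebp (ebp is then
    #                                     repointed at the jitframe object)
    #     following words               : CALLEE_SAVE_REGISTERS[i], plus the
    #                                     saved thread-local (THREADLOCAL_OFS)
    #                                     and the 3-word vmprof_stack_s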

    def _call_header_with_stack_check(self):
        self._call_header()
        if self.stack_check_slowpath == 0:
            pass        # no stack check (e.g. not translated)
        else:
            endaddr, lengthaddr, _ = self.cpu.insert_stack_check()
            self.mc.MOV(eax, heap(endaddr))              # MOV eax, [start]
            self.mc.SUB(eax, esp)                        # SUB eax, current
            self.mc.CMP(eax, heap(lengthaddr))           # CMP eax, [length]
            self.mc.J_il8(rx86.Conditions['BE'], 0)      # JBE .skip
            jb_location = self.mc.get_relative_pos()
            self.mc.CALL(imm(self.stack_check_slowpath)) # CALL slowpath
            # patch the JBE above                        # .skip:
            offset = self.mc.get_relative_pos() - jb_location
            assert 0 < offset <= 127
            self.mc.overwrite(jb_location-1, chr(offset))
            #

    def _call_footer(self):
        # the return value is the jitframe
        if self.cpu.translate_support_code:
            self._call_footer_vmprof()
        self.mc.MOV_rr(eax.value, ebp.value)

        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        if gcrootmap and gcrootmap.is_shadow_stack:
            self._call_footer_shadowstack(gcrootmap)

        for i in range(len(self.cpu.CALLEE_SAVE_REGISTERS)-1, -1, -1):
            self.mc.MOV_rs(self.cpu.CALLEE_SAVE_REGISTERS[i].value,
                           (i + 1 + PASS_ON_MY_FRAME) * WORD)

        self.mc.MOV_rs(ebp.value, PASS_ON_MY_FRAME * WORD)
        self.mc.ADD_ri(esp.value, FRAME_FIXED_SIZE * WORD)
        self.mc.RET()

    def _load_shadowstack_top_in_ebx(self, mc, gcrootmap):
        """Loads the shadowstack top in ebx, and returns an integer
        that gives the address of the stack top.  If this integer doesn't
        fit in 32 bits, it will be loaded in r11.
        """
        rst = gcrootmap.get_root_stack_top_addr()
        if rx86.fits_in_32bits(rst):
            mc.MOV_rj(ebx.value, rst)                # MOV ebx, [rootstacktop]
        else:
            mc.MOV_ri(X86_64_SCRATCH_REG.value, rst) # MOV r11, rootstacktop
            mc.MOV_rm(ebx.value, (X86_64_SCRATCH_REG.value, 0))
                                                     # MOV ebx, [r11]
        #
        return rst

    def _call_header_shadowstack(self, gcrootmap):
        rst = self._load_shadowstack_top_in_ebx(self.mc, gcrootmap)
        self.mc.MOV_mr((ebx.value, 0), ebp.value)    # MOV [ebx], ebp
        self.mc.ADD_ri(ebx.value, WORD)
        if rx86.fits_in_32bits(rst):
            self.mc.MOV_jr(rst, ebx.value)           # MOV [rootstacktop], ebx
        else:
            # The integer 'rst' doesn't fit in 32 bits, so we know that
            # _load_shadowstack_top_in_ebx() above loaded it in r11.
            # Reuse it.  Be careful not to overwrite r11 in the middle!
            self.mc.MOV_mr((X86_64_SCRATCH_REG.value, 0),
                           ebx.value)                # MOV [r11], ebx

    def _call_footer_shadowstack(self, gcrootmap):
        rst = gcrootmap.get_root_stack_top_addr()
        if rx86.fits_in_32bits(rst):
            self.mc.SUB_ji8(rst, WORD)               # SUB [rootstacktop], WORD
        else:
            self.mc.MOV_ri(ebx.value, rst)           # MOV ebx, rootstacktop
            self.mc.SUB_mi8((ebx.value, 0), WORD)    # SUB [ebx], WORD

    def redirect_call_assembler(self, oldlooptoken, newlooptoken):
        # some minimal sanity checking
        old_nbargs = oldlooptoken.compiled_loop_token._debug_nbargs
        new_nbargs = newlooptoken.compiled_loop_token._debug_nbargs
        assert old_nbargs == new_nbargs
        # we overwrite the instructions at the old _ll_function_addr
        # to start with a JMP to the new _ll_function_addr.
        # Ideally we should rather patch all existing CALLs, but well.
        oldadr = oldlooptoken._ll_function_addr
        target = newlooptoken._ll_function_addr
        # copy frame-info data
        baseofs = self.cpu.get_baseofs_of_frame_field()
        newlooptoken.compiled_loop_token.update_frame_info(
            oldlooptoken.compiled_loop_token, baseofs)
        mc = codebuf.MachineCodeBlockWrapper()
        mc.JMP(imm(follow_jump(target)))
        if WORD == 4:         # keep in sync with prepare_loop()
            assert mc.get_relative_pos() == 5
        else:
            assert mc.get_relative_pos() <= 13

(Listing truncated here: only the beginning of the 2678-line file is shown. The full source is available at the repository linked above.)