
/rpython/jit/backend/x86/assembler.py

https://bitbucket.org/bwesterb/pypy
Python | 2710 lines | 2063 code | 231 blank | 416 comment | 433 complexity | d917c7c1cc9ac1c5ddc2d035f28cb416 MD5

Large files are truncated; this listing shows only the beginning of the file.

import sys, os
from rpython.jit.backend.llsupport import symbolic, jitframe
from rpython.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
from rpython.jit.metainterp.history import Const, Box, BoxInt, ConstInt
from rpython.jit.metainterp.history import AbstractFailDescr, INT, REF, FLOAT
from rpython.jit.metainterp.history import JitCellToken
from rpython.rtyper.lltypesystem import lltype, rffi, rstr, llmemory
from rpython.rtyper.lltypesystem.lloperation import llop
from rpython.rtyper.annlowlevel import llhelper
from rpython.rlib.jit import AsmInfo
from rpython.rlib import longlong2float
from rpython.jit.backend.model import CompiledLoopToken
from rpython.jit.backend.x86.regalloc import (RegAlloc, get_ebp_ofs, _get_scale,
    gpr_reg_mgr_cls, xmm_reg_mgr_cls, _valid_addressing_size)
from rpython.jit.backend.x86.arch import (FRAME_FIXED_SIZE, FORCE_INDEX_OFS, WORD,
    IS_X86_32, IS_X86_64)
from rpython.jit.backend.x86.regloc import (eax, ecx, edx, ebx,
                                            esp, ebp, esi, edi,
                                            xmm0, xmm1, xmm2, xmm3,
                                            xmm4, xmm5, xmm6, xmm7,
                                            r8, r9, r10, r11,
                                            r12, r13, r14, r15,
                                            X86_64_SCRATCH_REG,
                                            X86_64_XMM_SCRATCH_REG,
                                            RegLoc, StackLoc, ConstFloatLoc,
                                            ImmedLoc, AddressLoc, imm,
                                            imm0, imm1, FloatImmedLoc)
from rpython.rlib.objectmodel import we_are_translated, specialize
from rpython.jit.backend.x86 import rx86, regloc, codebuf
from rpython.jit.metainterp.resoperation import rop, ResOperation
from rpython.jit.backend.x86 import support
from rpython.rlib.debug import (debug_print, debug_start, debug_stop,
                                have_debug_prints, fatalerror)
from rpython.rlib import rgc
from rpython.rlib.clibffi import FFI_DEFAULT_ABI
from rpython.jit.backend.x86.jump import remap_frame_layout
from rpython.jit.codewriter.effectinfo import EffectInfo
from rpython.jit.codewriter import longlong
from rpython.rlib.rarithmetic import intmask
from rpython.rlib.objectmodel import compute_unique_id

# darwin requires the stack to be 16 bytes aligned on calls. Same for gcc 4.5.0,
# better safe than sorry
CALL_ALIGN = 16 // WORD


def align_stack_words(words):
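    # e.g. on x86-64 (WORD == 8, CALL_ALIGN == 2): align_stack_words(5) == 6,
    # so the resulting frame size stays a multiple of 16 bytes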
    return (words + CALL_ALIGN - 1) & ~(CALL_ALIGN-1)


class GuardToken(object):
    def __init__(self, faildescr, failargs, fail_locs, exc,
                 is_guard_not_invalidated, is_guard_not_forced):
        self.faildescr = faildescr
        self.failargs = failargs
        self.fail_locs = fail_locs
        self.exc = exc
        self.is_guard_not_invalidated = is_guard_not_invalidated
        self.is_guard_not_forced = is_guard_not_forced


DEBUG_COUNTER = lltype.Struct('DEBUG_COUNTER', ('i', lltype.Signed),
                              ('type', lltype.Char),  # 'b'ridge, 'l'abel or
                                                      # 'e'ntry point
                              ('number', lltype.Signed))
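# One DEBUG_COUNTER is allocated (raw, never freed) per entry point, label or
# bridge when counting is enabled; see _register_counter() and
# _append_debugging_code() below, which emit the operations that increment
# the 'i' field at run time.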


class Assembler386(object):
    _regalloc = None
    _output_loop_log = None

    def __init__(self, cpu, translate_support_code=False):
        self.cpu = cpu
        self.verbose = False
        self.rtyper = cpu.rtyper
        self.loop_run_counters = []
        self.float_const_neg_addr = 0
        self.float_const_abs_addr = 0
        self.malloc_slowpath1 = 0
        self.malloc_slowpath2 = 0
        self.wb_slowpath = [0, 0, 0, 0]
        self.memcpy_addr = 0
        self.setup_failure_recovery()
        self._debug = False
        self.debug_counter_descr = cpu.fielddescrof(DEBUG_COUNTER, 'i')
        self.datablockwrapper = None
        self.stack_check_slowpath = 0
        self.propagate_exception_path = 0
        self.gcrootmap_retaddr_forced = 0
        self.teardown()
        self.force_token_to_dead_frame = {}    # XXX temporary hack

    def set_debug(self, v):
        r = self._debug
        self._debug = v
        return r

    def setup_once(self):
        # the address of the function called by 'new'
        gc_ll_descr = self.cpu.gc_ll_descr
        gc_ll_descr.initialize()
        self.memcpy_addr = self.cpu.cast_ptr_to_int(support.memcpy_fn)
        self._build_failure_recovery(False)
        self._build_failure_recovery(True)
        self._build_wb_slowpath(False)
        self._build_wb_slowpath(True)
        if self.cpu.supports_floats:
            self._build_failure_recovery(False, withfloats=True)
            self._build_failure_recovery(True, withfloats=True)
            self._build_wb_slowpath(False, withfloats=True)
            self._build_wb_slowpath(True, withfloats=True)
            support.ensure_sse2_floats()
            self._build_float_constants()
        self._build_propagate_exception_path()
        if gc_ll_descr.get_malloc_slowpath_addr is not None:
            self._build_malloc_slowpath()
        self._build_stack_check_slowpath()
        if gc_ll_descr.gcrootmap:
            self._build_release_gil(gc_ll_descr.gcrootmap)
        if not self._debug:
            # if self._debug is already set it means that someone called
            # set_debug by hand before initializing the assembler. Leave it
            # as it is
            debug_start('jit-backend-counts')
            self.set_debug(have_debug_prints())
            debug_stop('jit-backend-counts')

    def setup(self, looptoken):
        assert self.memcpy_addr != 0, "setup_once() not called?"
        self.current_clt = looptoken.compiled_loop_token
        self.pending_guard_tokens = []
        if WORD == 8:
            self.pending_memoryerror_trampoline_from = []
            self.error_trampoline_64 = 0
        self.mc = codebuf.MachineCodeBlockWrapper()
        #assert self.datablockwrapper is None --- but obscure case
        # possible, e.g. getting MemoryError and continuing
        allblocks = self.get_asmmemmgr_blocks(looptoken)
        self.datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr,
                                                        allblocks)
        self.target_tokens_currently_compiling = {}

    def teardown(self):
        self.pending_guard_tokens = None
        if WORD == 8:
            self.pending_memoryerror_trampoline_from = None
        self.mc = None
        self.current_clt = None

    def finish_once(self):
        if self._debug:
            debug_start('jit-backend-counts')
            for i in range(len(self.loop_run_counters)):
                struct = self.loop_run_counters[i]
                if struct.type == 'l':
                    prefix = 'TargetToken(%d)' % struct.number
                elif struct.type == 'b':
                    prefix = 'bridge ' + str(struct.number)
                else:
                    prefix = 'entry ' + str(struct.number)
                debug_print(prefix + ':' + str(struct.i))
            debug_stop('jit-backend-counts')

    def _build_float_constants(self):
        datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr, [])
        float_constants = datablockwrapper.malloc_aligned(32, alignment=16)
        datablockwrapper.done()
        addr = rffi.cast(rffi.CArrayPtr(lltype.Char), float_constants)
        qword_padding = '\x00\x00\x00\x00\x00\x00\x00\x00'
        # 0x8000000000000000
        neg_const = '\x00\x00\x00\x00\x00\x00\x00\x80'
        # 0x7FFFFFFFFFFFFFFF
        abs_const = '\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F'
        data = neg_const + qword_padding + abs_const + qword_padding
        for i in range(len(data)):
            addr[i] = data[i]
        self.float_const_neg_addr = float_constants
        self.float_const_abs_addr = float_constants + 16

    def _build_malloc_slowpath(self):
        # With asmgcc, we need two helpers, so that we can write two CALL
        # instructions in assembler, with a mark_gc_roots in between.
        # With shadowstack, this is not needed, so we produce a single helper.
        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        shadow_stack = (gcrootmap is not None and gcrootmap.is_shadow_stack)
        #
        # ---------- first helper for the slow path of malloc ----------
        mc = codebuf.MachineCodeBlockWrapper()
        if self.cpu.supports_floats:           # save the XMM registers in
            for i in range(self.cpu.NUM_REGS): # the *caller* frame, from esp+8
                mc.MOVSD_sx((WORD*2)+8*i, i)
        mc.SUB_rr(edx.value, eax.value)        # compute the size we want
        addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
        #
        # The registers to save in the copy area: with shadowstack, most
        # registers need to be saved.  With asmgcc, the callee-saved registers
        # don't need to.
        save_in_copy_area = gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items()
        if not shadow_stack:
            save_in_copy_area = [(reg, ofs) for (reg, ofs) in save_in_copy_area
                    if reg not in gpr_reg_mgr_cls.REGLOC_TO_GCROOTMAP_REG_INDEX]
        #
        for reg, ofs in save_in_copy_area:
            mc.MOV_br(ofs, reg.value)
        #
        if shadow_stack:
            # ---- shadowstack ----
            mc.SUB_ri(esp.value, 16 - WORD)      # stack alignment of 16 bytes
            if IS_X86_32:
                mc.MOV_sr(0, edx.value)          # push argument
            elif IS_X86_64:
                mc.MOV_rr(edi.value, edx.value)
            mc.CALL(imm(addr))
            mc.ADD_ri(esp.value, 16 - WORD)
        else:
            # ---- asmgcc ----
            if IS_X86_32:
                mc.MOV_sr(WORD, edx.value)       # save it as the new argument
            elif IS_X86_64:
                # rdi can be clobbered: its content was saved in the
                # copy area of the stack
                mc.MOV_rr(edi.value, edx.value)
            mc.JMP(imm(addr))                    # tail call to the real malloc
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.malloc_slowpath1 = rawstart
        # ---------- second helper for the slow path of malloc ----------
        mc = codebuf.MachineCodeBlockWrapper()
        #
        for reg, ofs in save_in_copy_area:
            mc.MOV_rb(reg.value, ofs)
            assert reg is not eax and reg is not edx
        #
        if self.cpu.supports_floats:           # restore the XMM registers
            for i in range(self.cpu.NUM_REGS): # from where they were saved
                mc.MOVSD_xs(i, (WORD*2)+8*i)
        #
        # Note: we check this after the code above, just because the code
        # above is more than 127 bytes on 64-bits...
        mc.TEST_rr(eax.value, eax.value)
        mc.J_il8(rx86.Conditions['Z'], 0)      # patched later
        jz_location = mc.get_relative_pos()
        #
        nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
        mc.MOV(edx, heap(nursery_free_adr))    # load this in EDX
        mc.RET()
        #
        # If the slowpath malloc failed, we raise a MemoryError that
        # always interrupts the current loop, as a "good enough"
        # approximation.  Also note that we didn't RET from this helper;
        # but the code we jump to will actually restore the stack
        # position based on EBP, which will get us out of here for free.
        offset = mc.get_relative_pos() - jz_location
        assert 0 < offset <= 127
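        # the J_il8 above was emitted with a zero displacement; its last byte
        # (at jz_location-1) is the 8-bit displacement, patched here so that
        # the jump lands on the JMP to propagate_exception_path below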
        mc.overwrite(jz_location-1, chr(offset))
        mc.JMP(imm(self.propagate_exception_path))
        #
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.malloc_slowpath2 = rawstart

    def _build_propagate_exception_path(self):
        if self.cpu.propagate_exception_v < 0:
            return      # not supported (for tests, or non-translated)
        #
        self.mc = codebuf.MachineCodeBlockWrapper()
        #
        # Call the helper, which will return a dead frame object with
        # the correct exception set, or MemoryError by default
        addr = rffi.cast(lltype.Signed, self.cpu.get_propagate_exception())
        self.mc.CALL(imm(addr))
        #
        self._call_footer()
        rawstart = self.mc.materialize(self.cpu.asmmemmgr, [])
        self.propagate_exception_path = rawstart
        self.mc = None

    def _build_stack_check_slowpath(self):
        _, _, slowpathaddr = self.cpu.insert_stack_check()
        if slowpathaddr == 0 or self.cpu.propagate_exception_v < 0:
            return      # no stack check (for tests, or non-translated)
        #
        # make a "function" that is called immediately at the start of
        # an assembler function.  In particular, the stack looks like:
        #
        #   |  ...                |    <-- aligned to a multiple of 16
        #   |  retaddr of caller  |
        #   |  my own retaddr     |    <-- esp
        #   +---------------------+
        #
        mc = codebuf.MachineCodeBlockWrapper()
        #
        stack_size = WORD
        if IS_X86_64:
            # on the x86_64, we have to save all the registers that may
            # have been used to pass arguments
            stack_size += 6*WORD + 8*8
            for reg in [edi, esi, edx, ecx, r8, r9]:
                mc.PUSH_r(reg.value)
            mc.SUB_ri(esp.value, 8*8)
            for i in range(8):
                mc.MOVSD_sx(8*i, i)     # xmm0 to xmm7
        #
        if IS_X86_32:
            stack_size += 2*WORD
            mc.PUSH_r(eax.value)        # alignment
            mc.PUSH_r(esp.value)
        elif IS_X86_64:
            mc.MOV_rr(edi.value, esp.value)
        #
        # esp is now aligned to a multiple of 16 again
        mc.CALL(imm(slowpathaddr))
        #
        mc.MOV(eax, heap(self.cpu.pos_exception()))
        mc.TEST_rr(eax.value, eax.value)
        mc.J_il8(rx86.Conditions['NZ'], 0)
        jnz_location = mc.get_relative_pos()
        #
        if IS_X86_32:
            mc.ADD_ri(esp.value, 2*WORD)    # cancel the two PUSHes above
        elif IS_X86_64:
            # restore the registers
            for i in range(7, -1, -1):
                mc.MOVSD_xs(i, 8*i)
            mc.ADD_ri(esp.value, 8*8)
            for reg in [r9, r8, ecx, edx, esi, edi]:
                mc.POP_r(reg.value)
        #
        mc.RET()
        #
        # patch the JNZ above
        offset = mc.get_relative_pos() - jnz_location
        assert 0 < offset <= 127
        mc.overwrite(jnz_location-1, chr(offset))
        #
        # Call the helper, which will return a dead frame object with
        # the correct exception set, or MemoryError by default
        addr = rffi.cast(lltype.Signed, self.cpu.get_propagate_exception())
        mc.CALL(imm(addr))
        #
        # footer -- note the ADD, which skips the return address of this
        # function, and will instead return to the caller's caller.  Note
        # also that we completely ignore the saved arguments, because we
        # are interrupting the function.
        mc.ADD_ri(esp.value, stack_size)
        mc.RET()
        #
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.stack_check_slowpath = rawstart

    def _build_wb_slowpath(self, withcards, withfloats=False):
        descr = self.cpu.gc_ll_descr.write_barrier_descr
        if descr is None:
            return
        if not withcards:
            func = descr.get_write_barrier_fn(self.cpu)
        else:
            if descr.jit_wb_cards_set == 0:
                return
            func = descr.get_write_barrier_from_array_fn(self.cpu)
            if func == 0:
                return
        #
        # This builds a helper function called from the slow path of
        # write barriers.  It must save all registers, and optionally
        # all XMM registers.  It takes a single argument just pushed
        # on the stack even on X86_64.  It must restore stack alignment
        # accordingly.
        mc = codebuf.MachineCodeBlockWrapper()
        #
        frame_size = (1 +   # my argument, considered part of my frame
                      1 +   # my return address
                      len(gpr_reg_mgr_cls.save_around_call_regs))
        if withfloats:
            frame_size += 16     # X86_32: 16 words for 8 registers;
                                 # X86_64: just 16 registers
        if IS_X86_32:
            frame_size += 1      # argument to pass to the call
        #
        # align to a multiple of 16 bytes
        frame_size = (frame_size + (CALL_ALIGN-1)) & ~(CALL_ALIGN-1)
        #
        correct_esp_by = (frame_size - 2) * WORD
        mc.SUB_ri(esp.value, correct_esp_by)
        #
        ofs = correct_esp_by
        if withfloats:
            for reg in xmm_reg_mgr_cls.save_around_call_regs:
                ofs -= 8
                mc.MOVSD_sx(ofs, reg.value)
        for reg in gpr_reg_mgr_cls.save_around_call_regs:
            ofs -= WORD
            mc.MOV_sr(ofs, reg.value)
        #
        if IS_X86_32:
            mc.MOV_rs(eax.value, (frame_size - 1) * WORD)
            mc.MOV_sr(0, eax.value)
        elif IS_X86_64:
            mc.MOV_rs(edi.value, (frame_size - 1) * WORD)
        mc.CALL(imm(func))
        #
        if withcards:
            # A final TEST8 before the RET, for the caller.  Careful to
            # not follow this instruction with another one that changes
            # the status of the CPU flags!
            mc.MOV_rs(eax.value, (frame_size - 1) * WORD)
            mc.TEST8(addr_add_const(eax, descr.jit_wb_if_flag_byteofs),
                     imm(-0x80))
        #
        ofs = correct_esp_by
        if withfloats:
            for reg in xmm_reg_mgr_cls.save_around_call_regs:
                ofs -= 8
                mc.MOVSD_xs(reg.value, ofs)
        for reg in gpr_reg_mgr_cls.save_around_call_regs:
            ofs -= WORD
            mc.MOV_rs(reg.value, ofs)
        #
        # ADD esp, correct_esp_by --- but cannot use ADD, because
        # of its effects on the CPU flags
        mc.LEA_rs(esp.value, correct_esp_by)
        mc.RET16_i(WORD)
        #
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.wb_slowpath[withcards + 2 * withfloats] = rawstart
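        # the four slots of self.wb_slowpath are indexed by
        # withcards + 2 * withfloats: 0 = plain, 1 = with cards,
        # 2 = with floats, 3 = with cards and floats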

    @staticmethod
    @rgc.no_collect
    def _release_gil_asmgcc(css):
        # similar to trackgcroot.py:pypy_asm_stackwalk, first part
        from rpython.rtyper.memory.gctransform import asmgcroot
        new = rffi.cast(asmgcroot.ASM_FRAMEDATA_HEAD_PTR, css)
        next = asmgcroot.gcrootanchor.next
        new.next = next
        new.prev = asmgcroot.gcrootanchor
        asmgcroot.gcrootanchor.next = new
        next.prev = new
        # and now release the GIL
        before = rffi.aroundstate.before
        if before:
            before()

    @staticmethod
    @rgc.no_collect
    def _reacquire_gil_asmgcc(css):
        # first reacquire the GIL
        after = rffi.aroundstate.after
        if after:
            after()
        # similar to trackgcroot.py:pypy_asm_stackwalk, second part
        from rpython.rtyper.memory.gctransform import asmgcroot
        old = rffi.cast(asmgcroot.ASM_FRAMEDATA_HEAD_PTR, css)
        prev = old.prev
        next = old.next
        prev.next = next
        next.prev = prev

    @staticmethod
    @rgc.no_collect
    def _release_gil_shadowstack():
        before = rffi.aroundstate.before
        if before:
            before()

    @staticmethod
    @rgc.no_collect
    def _reacquire_gil_shadowstack():
        after = rffi.aroundstate.after
        if after:
            after()

    _NOARG_FUNC = lltype.Ptr(lltype.FuncType([], lltype.Void))
    _CLOSESTACK_FUNC = lltype.Ptr(lltype.FuncType([rffi.LONGP],
                                                  lltype.Void))

    def _build_release_gil(self, gcrootmap):
        if gcrootmap.is_shadow_stack:
            releasegil_func = llhelper(self._NOARG_FUNC,
                                       self._release_gil_shadowstack)
            reacqgil_func = llhelper(self._NOARG_FUNC,
                                     self._reacquire_gil_shadowstack)
        else:
            releasegil_func = llhelper(self._CLOSESTACK_FUNC,
                                       self._release_gil_asmgcc)
            reacqgil_func = llhelper(self._CLOSESTACK_FUNC,
                                     self._reacquire_gil_asmgcc)
        self.releasegil_addr = self.cpu.cast_ptr_to_int(releasegil_func)
        self.reacqgil_addr = self.cpu.cast_ptr_to_int(reacqgil_func)

    def assemble_loop(self, loopname, inputargs, operations, looptoken, log):
        '''adds the following attributes to looptoken:
               _x86_function_addr   (address of the generated func, as an int)
               _x86_loop_code       (debug: addr of the start of the ResOps)
               _x86_fullsize        (debug: full size including failure)
               _x86_debug_checksum
        '''
        # XXX this function is too longish and contains some code
        # duplication with assemble_bridge().  Also, we should think
        # about not storing on 'self' attributes that will live only
        # for the duration of compiling one loop or one bridge.
        clt = CompiledLoopToken(self.cpu, looptoken.number)
        clt.allgcrefs = []
        looptoken.compiled_loop_token = clt
        if not we_are_translated():
            # Arguments should be unique
            assert len(set(inputargs)) == len(inputargs)
        self.setup(looptoken)
        if log:
            operations = self._inject_debugging_code(looptoken, operations,
                                                     'e', looptoken.number)
        regalloc = RegAlloc(self, self.cpu.translate_support_code)
        #
        self._call_header_with_stack_check()
        stackadjustpos = self._patchable_stackadjust()
        clt._debug_nbargs = len(inputargs)
        operations = regalloc.prepare_loop(inputargs, operations,
                                           looptoken, clt.allgcrefs)
        looppos = self.mc.get_relative_pos()
        looptoken._x86_loop_code = looppos
        clt.frame_depth = -1     # temporarily
        frame_depth = self._assemble(regalloc, operations)
        clt.frame_depth = frame_depth
        #
        size_excluding_failure_stuff = self.mc.get_relative_pos()
        self.write_pending_failure_recoveries()
        full_size = self.mc.get_relative_pos()
        #
        rawstart = self.materialize_loop(looptoken)
        debug_start("jit-backend-addr")
        debug_print("Loop %d (%s) has address %x to %x (bootstrap %x)" % (
            looptoken.number, loopname,
            rawstart + looppos,
            rawstart + size_excluding_failure_stuff,
            rawstart))
        debug_stop("jit-backend-addr")
        self._patch_stackadjust(rawstart + stackadjustpos, frame_depth)
        self.patch_pending_failure_recoveries(rawstart)
        #
        ops_offset = self.mc.ops_offset
        if not we_are_translated():
            # used only by looptoken.dump() -- useful in tests
            looptoken._x86_rawstart = rawstart
            looptoken._x86_fullsize = full_size
            looptoken._x86_ops_offset = ops_offset
        looptoken._x86_function_addr = rawstart
        self.fixup_target_tokens(rawstart)
        self.teardown()
        # oprofile support
        if self.cpu.profile_agent is not None:
            name = "Loop # %s: %s" % (looptoken.number, loopname)
            self.cpu.profile_agent.native_code_written(name,
                                                       rawstart, full_size)
        return AsmInfo(ops_offset, rawstart + looppos,
                       size_excluding_failure_stuff - looppos)

    def assemble_bridge(self, faildescr, inputargs, operations,
                        original_loop_token, log):
        if not we_are_translated():
            # Arguments should be unique
            assert len(set(inputargs)) == len(inputargs)
        descr_number = self.cpu.get_fail_descr_number(faildescr)
        failure_recovery = self._find_failure_recovery_bytecode(faildescr)
        self.setup(original_loop_token)
        if log:
            operations = self._inject_debugging_code(faildescr, operations,
                                                     'b', descr_number)
        arglocs = self.rebuild_faillocs_from_descr(failure_recovery)
        if not we_are_translated():
            assert ([loc.assembler() for loc in arglocs] ==
                    [loc.assembler() for loc in faildescr._x86_debug_faillocs])
        regalloc = RegAlloc(self, self.cpu.translate_support_code)
        startpos = self.mc.get_relative_pos()
        operations = regalloc.prepare_bridge(inputargs, arglocs,
                                             operations,
                                             self.current_clt.allgcrefs)
        stackadjustpos = self._patchable_stackadjust()
        frame_depth = self._assemble(regalloc, operations)
        codeendpos = self.mc.get_relative_pos()
        self.write_pending_failure_recoveries()
        fullsize = self.mc.get_relative_pos()
        #
        rawstart = self.materialize_loop(original_loop_token)
        debug_start("jit-backend-addr")
        debug_print("bridge out of Guard %d has address %x to %x" %
                    (descr_number, rawstart, rawstart + codeendpos))
        debug_stop("jit-backend-addr")
        self._patch_stackadjust(rawstart + stackadjustpos, frame_depth)
        self.patch_pending_failure_recoveries(rawstart)
        if not we_are_translated():
            # for the benefit of tests
            faildescr._x86_bridge_frame_depth = frame_depth
        # patch the jump from original guard
        self.patch_jump_for_descr(faildescr, rawstart)
        ops_offset = self.mc.ops_offset
        self.fixup_target_tokens(rawstart)
        self.current_clt.frame_depth = max(self.current_clt.frame_depth, frame_depth)
        self.teardown()
        # oprofile support
        if self.cpu.profile_agent is not None:
            name = "Bridge # %s" % (descr_number,)
            self.cpu.profile_agent.native_code_written(name,
                                                       rawstart, fullsize)
        return AsmInfo(ops_offset, startpos + rawstart, codeendpos - startpos)

    def write_pending_failure_recoveries(self):
        # for each pending guard, generate the code of the recovery stub
        # at the end of self.mc.
        for tok in self.pending_guard_tokens:
            tok.pos_recovery_stub = self.generate_quick_failure(tok)
        if WORD == 8 and len(self.pending_memoryerror_trampoline_from) > 0:
            self.error_trampoline_64 = self.generate_propagate_error_64()

    def patch_pending_failure_recoveries(self, rawstart):
        # after we wrote the assembler to raw memory, set up
        # tok.faildescr._x86_adr_jump_offset to contain the raw address of
        # the 4-byte target field in the JMP/Jcond instruction, and patch
        # the field in question to point (initially) to the recovery stub
        clt = self.current_clt
        for tok in self.pending_guard_tokens:
            addr = rawstart + tok.pos_jump_offset
            tok.faildescr._x86_adr_jump_offset = addr
            relative_target = tok.pos_recovery_stub - (tok.pos_jump_offset + 4)
            assert rx86.fits_in_32bits(relative_target)
            #
            if not tok.is_guard_not_invalidated:
                mc = codebuf.MachineCodeBlockWrapper()
                mc.writeimm32(relative_target)
                mc.copy_to_raw_memory(addr)
            else:
                # GUARD_NOT_INVALIDATED, record an entry in
                # clt.invalidate_positions of the form:
                #     (addr-in-the-code-of-the-not-yet-written-jump-target,
                #      relative-target-to-use)
                relpos = tok.pos_jump_offset
                clt.invalidate_positions.append((rawstart + relpos,
                                                 relative_target))
                # General idea: Although no code was generated by this
                # guard, the code might be patched with a "JMP rel32" to
                # the guard recovery code.  This recovery code is
                # already generated, and looks like the recovery code
                # for any guard, even if at first it has no jump to it.
                # So we may later write 5 bytes overriding the existing
                # instructions; this works because a CALL instruction
                # would also take at least 5 bytes.  If it could take
                # less, we would run into the issue that overwriting the
                # 5 bytes here might get a few nonsense bytes at the
                # return address of the following CALL.
        if WORD == 8:
            for pos_after_jz in self.pending_memoryerror_trampoline_from:
                assert self.error_trampoline_64 != 0     # only if non-empty
                mc = codebuf.MachineCodeBlockWrapper()
                mc.writeimm32(self.error_trampoline_64 - pos_after_jz)
                mc.copy_to_raw_memory(rawstart + pos_after_jz - 4)

    def get_asmmemmgr_blocks(self, looptoken):
        clt = looptoken.compiled_loop_token
        if clt.asmmemmgr_blocks is None:
            clt.asmmemmgr_blocks = []
        return clt.asmmemmgr_blocks

    def materialize_loop(self, looptoken):
        self.datablockwrapper.done()      # finish using cpu.asmmemmgr
        self.datablockwrapper = None
        allblocks = self.get_asmmemmgr_blocks(looptoken)
        return self.mc.materialize(self.cpu.asmmemmgr, allblocks,
                                   self.cpu.gc_ll_descr.gcrootmap)

    def _register_counter(self, tp, number, token):
        # YYY very minor leak -- we need the counters to stay alive
        # forever, just because we want to report them at the end
        # of the process
        struct = lltype.malloc(DEBUG_COUNTER, flavor='raw',
                               track_allocation=False)
        struct.i = 0
        struct.type = tp
        if tp == 'b' or tp == 'e':
            struct.number = number
        else:
            assert token
            struct.number = compute_unique_id(token)
        self.loop_run_counters.append(struct)
        return struct

    def _find_failure_recovery_bytecode(self, faildescr):
        adr_jump_offset = faildescr._x86_adr_jump_offset
        if adr_jump_offset == 0:
            # This case should be prevented by the logic in compile.py:
            # look for CNT_BUSY_FLAG, which disables tracing from a guard
            # when another tracing from the same guard is already in progress.
            raise BridgeAlreadyCompiled
        # follow the JMP/Jcond
        p = rffi.cast(rffi.INTP, adr_jump_offset)
        adr_target = adr_jump_offset + 4 + rffi.cast(lltype.Signed, p[0])
        # skip the CALL
        if WORD == 4:
            adr_target += 5     # CALL imm
        else:
            adr_target += 13    # MOV r11, imm-as-8-bytes; CALL *r11 xxxxxxxxxx
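            # (on 64-bit this is 10 bytes for the MOV r11, imm64 followed by
            # 3 bytes for the indirect CALL *r11)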
        return adr_target

    def patch_jump_for_descr(self, faildescr, adr_new_target):
        adr_jump_offset = faildescr._x86_adr_jump_offset
        assert adr_jump_offset != 0
        offset = adr_new_target - (adr_jump_offset + 4)
        # If the new target fits within a rel32 of the jump, just patch
        # that.  Otherwise, leave the original rel32 to the recovery stub in
        # place, but clobber the recovery stub with a jump to the real
        # target.
        mc = codebuf.MachineCodeBlockWrapper()
        if rx86.fits_in_32bits(offset):
            mc.writeimm32(offset)
            mc.copy_to_raw_memory(adr_jump_offset)
        else:
            # "mov r11, addr; jmp r11" is up to 13 bytes, which fits in there
            # because we always write "mov r11, imm-as-8-bytes; call *r11" in
            # the first place.
            mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
            mc.JMP_r(X86_64_SCRATCH_REG.value)
            p = rffi.cast(rffi.INTP, adr_jump_offset)
            adr_target = adr_jump_offset + 4 + rffi.cast(lltype.Signed, p[0])
            mc.copy_to_raw_memory(adr_target)
        faildescr._x86_adr_jump_offset = 0    # means "patched"

    def fixup_target_tokens(self, rawstart):
        for targettoken in self.target_tokens_currently_compiling:
            targettoken._x86_loop_code += rawstart
        self.target_tokens_currently_compiling = None

    def _append_debugging_code(self, operations, tp, number, token):
        counter = self._register_counter(tp, number, token)
        c_adr = ConstInt(rffi.cast(lltype.Signed, counter))
        box = BoxInt()
        box2 = BoxInt()
        ops = [ResOperation(rop.GETFIELD_RAW, [c_adr],
                            box, descr=self.debug_counter_descr),
               ResOperation(rop.INT_ADD, [box, ConstInt(1)], box2),
               ResOperation(rop.SETFIELD_RAW, [c_adr, box2],
                            None, descr=self.debug_counter_descr)]
        operations.extend(ops)

    @specialize.argtype(1)
    def _inject_debugging_code(self, looptoken, operations, tp, number):
        if self._debug:
            s = 0
            for op in operations:
                s += op.getopnum()
            looptoken._x86_debug_checksum = s
            newoperations = []
            self._append_debugging_code(newoperations, tp, number,
                                        None)
            for op in operations:
                newoperations.append(op)
                if op.getopnum() == rop.LABEL:
                    self._append_debugging_code(newoperations, 'l', number,
                                                op.getdescr())
            operations = newoperations
        return operations

    def _assemble(self, regalloc, operations):
        self._regalloc = regalloc
        regalloc.compute_hint_frame_locations(operations)
        regalloc.walk_operations(operations)
        if we_are_translated() or self.cpu.dont_keepalive_stuff:
            self._regalloc = None   # else keep it around for debugging
        frame_depth = regalloc.get_final_frame_depth()
        jump_target_descr = regalloc.jump_target_descr
        if jump_target_descr is not None:
            target_frame_depth = jump_target_descr._x86_clt.frame_depth
            frame_depth = max(frame_depth, target_frame_depth)
        return frame_depth

    def _patchable_stackadjust(self):
        # stack adjustment LEA
        self.mc.LEA32_rb(esp.value, 0)
        return self.mc.get_relative_pos() - 4

    def _patch_stackadjust(self, adr_lea, allocated_depth):
        # patch stack adjustment LEA
        mc = codebuf.MachineCodeBlockWrapper()
        # Compute the correct offset for the instruction LEA ESP, [EBP-4*words]
        mc.writeimm32(self._get_offset_of_ebp_from_esp(allocated_depth))
        mc.copy_to_raw_memory(adr_lea)

    def _get_offset_of_ebp_from_esp(self, allocated_depth):
        # Given that [EBP] is where we saved EBP, i.e. in the last word
        # of our fixed frame, then the 'words' value is:
        words = (FRAME_FIXED_SIZE - 1) + allocated_depth
        # align, e.g. for Mac OS X
        aligned_words = align_stack_words(words+2)-2      # 2 = EIP+EBP
        return -WORD * aligned_words

    def _call_header(self):
        # NB. the shape of the frame is hard-coded in get_basic_shape() too.
        # Also, make sure this is consistent with FRAME_FIXED_SIZE.
        self.mc.PUSH_r(ebp.value)
        self.mc.MOV_rr(ebp.value, esp.value)
        for loc in self.cpu.CALLEE_SAVE_REGISTERS:
            self.mc.PUSH_r(loc.value)
        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        if gcrootmap and gcrootmap.is_shadow_stack:
            self._call_header_shadowstack(gcrootmap)

    def _call_header_with_stack_check(self):
        if self.stack_check_slowpath == 0:
            pass                      # no stack check (e.g. not translated)
        else:
            endaddr, lengthaddr, _ = self.cpu.insert_stack_check()
            self.mc.MOV(eax, heap(endaddr))              # MOV eax, [start]
            self.mc.SUB(eax, esp)                        # SUB eax, current
            self.mc.CMP(eax, heap(lengthaddr))           # CMP eax, [length]
            self.mc.J_il8(rx86.Conditions['BE'], 0)      # JBE .skip
            jb_location = self.mc.get_relative_pos()
            self.mc.CALL(imm(self.stack_check_slowpath)) # CALL slowpath
            # patch the JB above                         # .skip:
            offset = self.mc.get_relative_pos() - jb_location
            assert 0 < offset <= 127
            self.mc.overwrite(jb_location-1, chr(offset))
            #
        self._call_header()

    def _call_footer(self):
        self.mc.LEA_rb(esp.value, -len(self.cpu.CALLEE_SAVE_REGISTERS) * WORD)
        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        if gcrootmap and gcrootmap.is_shadow_stack:
            self._call_footer_shadowstack(gcrootmap)
        for i in range(len(self.cpu.CALLEE_SAVE_REGISTERS)-1, -1, -1):
            self.mc.POP_r(self.cpu.CALLEE_SAVE_REGISTERS[i].value)
        self.mc.POP_r(ebp.value)
        self.mc.RET()

    def _call_header_shadowstack(self, gcrootmap):
        # we need to put two words into the shadowstack: the MARKER_FRAME
        # and the address of the frame (ebp, actually)
        rst = gcrootmap.get_root_stack_top_addr()
        if rx86.fits_in_32bits(rst):
            self.mc.MOV_rj(eax.value, rst)             # MOV eax, [rootstacktop]
        else:
            self.mc.MOV_ri(r13.value, rst)             # MOV r13, rootstacktop
            self.mc.MOV_rm(eax.value, (r13.value, 0))  # MOV eax, [r13]
        #
        MARKER = gcrootmap.MARKER_FRAME
        self.mc.LEA_rm(ebx.value, (eax.value, 2*WORD)) # LEA ebx, [eax+2*WORD]
        self.mc.MOV_mi((eax.value, WORD), MARKER)      # MOV [eax+WORD], MARKER
        self.mc.MOV_mr((eax.value, 0), ebp.value)      # MOV [eax], ebp
        #
        if rx86.fits_in_32bits(rst):
            self.mc.MOV_jr(rst, ebx.value)             # MOV [rootstacktop], ebx
        else:
            self.mc.MOV_mr((r13.value, 0), ebx.value)  # MOV [r13], ebx

    def _call_footer_shadowstack(self, gcrootmap):
        rst = gcrootmap.get_root_stack_top_addr()
        if rx86.fits_in_32bits(rst):
            self.mc.SUB_ji8(rst, 2*WORD)              # SUB [rootstacktop], 2*WORD
        else:
            self.mc.MOV_ri(ebx.value, rst)            # MOV ebx, rootstacktop
            self.mc.SUB_mi8((ebx.value, 0), 2*WORD)   # SUB [ebx], 2*WORD

    def redirect_call_assembler(self, oldlooptoken, newlooptoken):
        # some minimal sanity checking
        old_nbargs = oldlooptoken.compiled_loop_token._debug_nbargs
        new_nbargs = newlooptoken.compiled_loop_token._debug_nbargs
        assert old_nbargs == new_nbargs
        # we overwrite the instructions at the old _x86_direct_bootstrap_code
        # to start with a JMP to the new _x86_direct_bootstrap_code.
        # Ideally we should rather patch all existing CALLs, but well.
        oldadr = oldlooptoken._x86_function_addr
        target = newlooptoken._x86_function_addr
        mc = codebuf.MachineCodeBlockWrapper()
        mc.JMP(imm(target))
        if WORD == 4:         # keep in sync with prepare_loop()
            assert mc.get_relative_pos() == 5
        else:
            assert mc.get_relative_pos() <= 13
        mc.copy_to_raw_memory(oldadr)

    def dump(self, text):
        if not self.verbose:
            return
        _prev = Box._extended_display
        try:
            Box._extended_display = False
            pos = self.mc.get_relative_pos()
            print >> sys.stderr, ' 0x%x  %s' % (pos, text)
        finally:
            Box._extended_display = _prev

    # ------------------------------------------------------------

    def mov(self, from_loc, to_loc):
        if (isinstance(from_loc, RegLoc) and from_loc.is_xmm) or (isinstance(to_loc, RegLoc) and to_loc.is_xmm):
            self.mc.MOVSD(to_loc, from_loc)
        else:
            assert to_loc is not ebp
            self.mc.MOV(to_loc, from_loc)

    regalloc_mov = mov # legacy interface

    def regalloc_push(self, loc):
        if isinstance(loc, RegLoc) and loc.is_xmm:
            self.mc.SUB_ri(esp.value, 8)   # = size of doubles
            self.mc.MOVSD_sx(0, loc.value)
        elif WORD == 4 and isinstance(loc, StackLoc) and loc.get_width() == 8:
            # XXX evil trick
            self.mc.PUSH_b(loc.value + 4)
            self.mc.PUSH_b(loc.value)
        else:
            self.mc.PUSH(loc)

    def regalloc_pop(self, loc):
        if isinstance(loc, RegLoc) and loc.is_xmm:
            self.mc.MOVSD_xs(loc.value, 0)
            self.mc.ADD_ri(esp.value, 8)   # = size of doubles
        elif WORD == 4 and isinstance(loc, StackLoc) and loc.get_width() == 8:
            # XXX evil trick
            self.mc.POP_b(loc.value)
            self.mc.POP_b(loc.value + 4)
        else:
            self.mc.POP(loc)

    def regalloc_immedmem2mem(self, from_loc, to_loc):
        # move a ConstFloatLoc directly to a StackLoc, as two MOVs
        # (even on x86-64, because the immediates are encoded as 32 bits)
        assert isinstance(from_loc, ConstFloatLoc)
        assert isinstance(to_loc, StackLoc)
        low_part = rffi.cast(rffi.CArrayPtr(rffi.INT), from_loc.value)[0]
        high_part = rffi.cast(rffi.CArrayPtr(rffi.INT), from_loc.value)[1]
        low_part = intmask(low_part)
        high_part = intmask(high_part)
        self.mc.MOV32_bi(to_loc.value, low_part)
        self.mc.MOV32_bi(to_loc.value + 4, high_part)

    def regalloc_perform(self, op, arglocs, resloc):
        genop_list[op.getopnum()](self, op, arglocs, resloc)

    def regalloc_perform_discard(self, op, arglocs):
        genop_discard_list[op.getopnum()](self, op, arglocs)

    def regalloc_perform_llong(self, op, arglocs, resloc):
        effectinfo = op.getdescr().get_extra_info()
        oopspecindex = effectinfo.oopspecindex
        genop_llong_list[oopspecindex](self, op, arglocs, resloc)

    def regalloc_perform_math(self, op, arglocs, resloc):
        effectinfo = op.getdescr().get_extra_info()
        oopspecindex = effectinfo.oopspecindex
        genop_math_list[oopspecindex](self, op, arglocs, resloc)

    def regalloc_perform_with_guard(self, op, guard_op, faillocs,
                                    arglocs, resloc):
        faildescr = guard_op.getdescr()
        assert isinstance(faildescr, AbstractFailDescr)
        failargs = guard_op.getfailargs()
        guard_opnum = guard_op.getopnum()
        guard_token = self.implement_guard_recovery(guard_opnum,
                                                    faildescr, failargs,
                                                    faillocs)
        if op is None:
            dispatch_opnum = guard_opnum
        else:
            dispatch_opnum = op.getopnum()
        genop_guard_list[dispatch_opnum](self, op, guard_op, guard_token,
                                         arglocs, resloc)
        if not we_are_translated():
            # must be added by the genop_guard_list[]()
            assert guard_token is self.pending_guard_tokens[-1]

    def regalloc_perform_guard(self, guard_op, faillocs, arglocs, resloc):
        self.regalloc_perform_with_guard(None, guard_op, faillocs, arglocs,
                                         resloc)

    def load_effective_addr(self, sizereg, baseofs, scale, result, frm=imm0):
        self.mc.LEA(result, addr_add(frm, sizereg, baseofs, scale))

    def _unaryop(asmop):
        def genop_unary(self, op, arglocs, resloc):
            getattr(self.mc, asmop)(arglocs[0])
        return genop_unary

    def _binaryop(asmop, can_swap=False):
        def genop_binary(self, op, arglocs, result_loc):
            getattr(self.mc, asmop)(arglocs[0], arglocs[1])
        return genop_binary

    def _binaryop_or_lea(asmop, is_add):
        def genop_binary_or_lea(self, op, arglocs, result_loc):
            # use a regular ADD or SUB if result_loc is arglocs[0],
            # and a LEA only if different.
            if result_loc is arglocs[0]:
                getattr(self.mc, asmop)(arglocs[0], arglocs[1])
            else:
                loc = arglocs[0]
                argloc = arglocs[1]
                assert isinstance(loc, RegLoc)
                assert isinstance(argloc, ImmedLoc)
                assert isinstance(result_loc, RegLoc)
                delta = argloc.value
                if not is_add:    # subtraction
                    delta = -delta
                self.mc.LEA_rm(result_loc.value, (loc.value, delta))
        return genop_binary_or_lea

    def _cmpop(cond, rev_cond):
        def genop_cmp(self, op, arglocs, result_loc):
            rl = result_loc.lowest8bits()
            if isinstance(op.getarg(0), Const):
                self.mc.CMP(arglocs[1], arglocs[0])
                self.mc.SET_ir(rx86.Conditions[rev_cond], rl.value)
            else:
                self.mc.CMP(arglocs[0], arglocs[1])
                self.mc.SET_ir(rx86.Conditions[cond], rl.value)
            self.mc.MOVZX8_rr(result_loc.value, rl.value)
        return genop_cmp

    def _cmpop_float(cond, rev_cond, is_ne=False):
        def genop_cmp(self, op, arglocs, result_loc):
            if isinstance(arglocs[0], RegLoc):
                self.mc.UCOMISD(arglocs[0], arglocs[1])
                checkcond = cond
            else:
                self.mc.UCOMISD(arglocs[1], arglocs[0])
                checkcond = rev_cond
            tmp1 = result_loc.lowest8bits()
            if IS_X86_32:
                tmp2 = result_loc.higher8bits()
            elif IS_X86_64:
                tmp2 = X86_64_SCRATCH_REG.lowest8bits()
            self.mc.SET_ir(rx86.Conditions[checkcond], tmp1.value)
            if is_ne:
                self.mc.SET_ir(rx86.Conditions['P'], tmp2.value)
                self.mc.OR8_rr(tmp1.value, tmp2.value)
            else:
                self.mc.SET_ir(rx86.Conditions['NP'], tmp2.value)
                self.mc.AND8_rr(tmp1.value, tmp2.value)
            self.mc.MOVZX8_rr(result_loc.value, tmp1.value)
        return genop_cmp

    def _cmpop_guard(cond, rev_cond, false_cond, false_rev_cond):
        def genop_cmp_guard(self, op, guard_op, guard_token, arglocs, result_loc):
            guard_opnum = guard_op.getopnum()
            if isinstance(op.getarg(0), Const):
                self.mc.CMP(arglocs[1], arglocs[0])
                if guard_opnum == rop.GUARD_FALSE:
                    self.implement_guard(guard_token, rev_cond)
                else:
                    self.implement_guard(guard_token, false_rev_cond)
            else:
                self.mc.CMP(arglocs[0], arglocs[1])
                if guard_opnum == rop.GUARD_FALSE:
                    self.implement_guard(guard_token, cond)
                else:
                    self.implement_guard(guard_token, false_cond)
        return genop_cmp_guard

    def _cmpop_guard_float(cond, rev_cond, false_cond, false_rev_cond):
        need_direct_jp = 'A' not in cond
        need_rev_jp = 'A' not in rev_cond
        def genop_cmp_guard_float(self, op, guard_op, guard_token, arglocs,
                                  result_loc):
            guard_opnum = guard_op.getopnum()
            if isinstance(arglocs[0], RegLoc):
                self.mc.UCOMISD(arglocs[0], arglocs[1])
                checkcond = cond
                checkfalsecond = false_cond
                need_jp = need_direct_jp
            else:
                self.mc.UCOMISD(arglocs[1], arglocs[0])
                checkcond = rev_cond
                checkfalsecond = false_rev_cond
                need_jp = need_rev_jp
            if guard_opnum == rop.GUARD_FALSE:
                if need_jp:
                    self.mc.J_il8(rx86.Conditions['P'], 6)
                self.implement_guard(guard_token, checkcond)
            else:
                if need_jp:
                    self.mc.J_il8(rx86.Conditions['P'], 2)
                    self.mc.J_il8(rx86.Conditions[checkcond], 5)
                    self.implement_guard(guard_token)
                else:
                    self.implement_guard(guard_token, checkfalsecond)
        return genop_cmp_guard_float

    def _emit_call(self, force_index, x, arglocs, start=0, tmp=eax,
                   argtypes=None, callconv=FFI_DEFAULT_ABI):
        if IS_X86_64:
            return self._emit_call_64(force_index, x, arglocs, start, argtypes)
        p = 0
        n = len(arglocs)
        for i in range(start, n):
            loc = arglocs[i]
            if isinstance(loc, RegLoc):
                if loc.is_xmm:
                    self.mc.MOVSD_sx(p, loc.value)
                else:
                    self.mc.MOV_sr(p, loc.value)
            p += loc.get_width()
        p = 0
        for i in range(start, n):
            loc = arglocs[i]
            if not isinstance(loc, RegLoc):
                if loc.get_width() == 8:
                    self.mc.MOVSD(xmm0, loc)
                    self.mc.MOVSD_sx(p, xmm0.value)
                else:
                    self.mc.MOV(tmp, loc)
                    self.mc.MOV_sr(p, tmp.value)
            p += loc.get_width()
        # x is a location
        self.mc.CALL(x)
        self.mark_gc_roots(force_index)
        #
        if callconv != FFI_DEFAULT_ABI:
            self._fix_stdcall(callconv, p)
        #
        self._regalloc.needed_extra_stack_locations(p//WORD)

    def _fix_stdcall(self, callconv, p):
        from rpython.rlib.clibffi import FFI_STDCALL
        assert callconv == FFI_STDCALL
        # it's a bit stupid, but we're just going to cancel the fact that
        # the called function just added 'p' to ESP, by subtracting it again.
        self.mc.SUB_ri(esp.value, p)

    def _emit_call_64(self, force_in
