
/pypy/jit/backend/x86/assembler.py

https://bitbucket.org/quangquach/pypy
Python | 2688 lines | 2046 code | 233 blank | 409 comment | 429 complexity | 8bcc51c3877913614e950524926e696e MD5

Large files are truncated; this listing ends partway through the file.

import sys, os
from pypy.jit.backend.llsupport import symbolic
from pypy.jit.backend.llsupport.asmmemmgr import MachineDataBlockWrapper
from pypy.jit.metainterp.history import Const, Box, BoxInt, ConstInt
from pypy.jit.metainterp.history import AbstractFailDescr, INT, REF, FLOAT
from pypy.jit.metainterp.history import JitCellToken
from pypy.rpython.lltypesystem import lltype, rffi, rstr, llmemory
from pypy.rpython.lltypesystem.lloperation import llop
from pypy.rpython.annlowlevel import llhelper
from pypy.rlib.jit import AsmInfo
from pypy.jit.backend.model import CompiledLoopToken
from pypy.jit.backend.x86.regalloc import (RegAlloc, get_ebp_ofs, _get_scale,
    gpr_reg_mgr_cls, xmm_reg_mgr_cls, _valid_addressing_size)
from pypy.jit.backend.x86.arch import (FRAME_FIXED_SIZE, FORCE_INDEX_OFS, WORD,
                                       IS_X86_32, IS_X86_64)
from pypy.jit.backend.x86.regloc import (eax, ecx, edx, ebx,
                                         esp, ebp, esi, edi,
                                         xmm0, xmm1, xmm2, xmm3,
                                         xmm4, xmm5, xmm6, xmm7,
                                         r8, r9, r10, r11,
                                         r12, r13, r14, r15,
                                         X86_64_SCRATCH_REG,
                                         X86_64_XMM_SCRATCH_REG,
                                         RegLoc, StackLoc, ConstFloatLoc,
                                         ImmedLoc, AddressLoc, imm,
                                         imm0, imm1, FloatImmedLoc)
from pypy.rlib.objectmodel import we_are_translated, specialize
from pypy.jit.backend.x86 import rx86, regloc, codebuf
from pypy.jit.metainterp.resoperation import rop, ResOperation
from pypy.jit.backend.x86.support import values_array
from pypy.jit.backend.x86 import support
from pypy.rlib.debug import (debug_print, debug_start, debug_stop,
                             have_debug_prints)
from pypy.rlib import rgc
from pypy.rlib.clibffi import FFI_DEFAULT_ABI
from pypy.jit.backend.x86.jump import remap_frame_layout
from pypy.jit.codewriter.effectinfo import EffectInfo
from pypy.jit.codewriter import longlong
from pypy.rlib.rarithmetic import intmask
from pypy.rlib.objectmodel import compute_unique_id

# darwin requires the stack to be 16 bytes aligned on calls.  Same for gcc
# 4.5.0, better safe than sorry
CALL_ALIGN = 16 // WORD

def align_stack_words(words):
    return (words + CALL_ALIGN - 1) & ~(CALL_ALIGN-1)
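
# For example, on x86-32 WORD == 4 so CALL_ALIGN == 4 and
# align_stack_words(5) == 8; on x86-64 WORD == 8 so CALL_ALIGN == 2 and
# align_stack_words(5) == 6.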

class GuardToken(object):
    def __init__(self, faildescr, failargs, fail_locs, exc,
                 is_guard_not_invalidated):
        self.faildescr = faildescr
        self.failargs = failargs
        self.fail_locs = fail_locs
        self.exc = exc
        self.is_guard_not_invalidated = is_guard_not_invalidated

DEBUG_COUNTER = lltype.Struct('DEBUG_COUNTER', ('i', lltype.Signed),
                              ('type', lltype.Char),  # 'b'ridge, 'l'abel or
                                                      # 'e'ntry point
                              ('number', lltype.Signed))
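
# One DEBUG_COUNTER is allocated (raw, never freed) per entry point, label or
# bridge by _register_counter() below; the operations injected by
# _append_debugging_code() increment its 'i' field at run time, and
# finish_once() prints the totals under the 'jit-backend-counts' debug section.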

class Assembler386(object):
    _regalloc = None
    _output_loop_log = None

    def __init__(self, cpu, translate_support_code=False,
                 failargs_limit=1000):
        self.cpu = cpu
        self.verbose = False
        self.rtyper = cpu.rtyper
        self.fail_boxes_int = values_array(lltype.Signed, failargs_limit)
        self.fail_boxes_ptr = values_array(llmemory.GCREF, failargs_limit)
        self.fail_boxes_float = values_array(longlong.FLOATSTORAGE,
                                             failargs_limit)
        self.fail_ebp = 0
        self.loop_run_counters = []
        self.float_const_neg_addr = 0
        self.float_const_abs_addr = 0
        self.malloc_slowpath1 = 0
        self.malloc_slowpath2 = 0
        self.wb_slowpath = [0, 0, 0, 0]
        self.memcpy_addr = 0
        self.setup_failure_recovery()
        self._debug = False
        self.debug_counter_descr = cpu.fielddescrof(DEBUG_COUNTER, 'i')
        self.fail_boxes_count = 0
        self.datablockwrapper = None
        self.stack_check_slowpath = 0
        self.propagate_exception_path = 0
        self.gcrootmap_retaddr_forced = 0
        self.teardown()

    def leave_jitted_hook(self):
        ptrs = self.fail_boxes_ptr.ar
        llop.gc_assume_young_pointers(lltype.Void,
                                      llmemory.cast_ptr_to_adr(ptrs))

    def set_debug(self, v):
        r = self._debug
        self._debug = v
        return r

    def setup_once(self):
        # the address of the function called by 'new'
        gc_ll_descr = self.cpu.gc_ll_descr
        gc_ll_descr.initialize()
        self.memcpy_addr = self.cpu.cast_ptr_to_int(support.memcpy_fn)
        self._build_failure_recovery(False)
        self._build_failure_recovery(True)
        self._build_wb_slowpath(False)
        self._build_wb_slowpath(True)
        if self.cpu.supports_floats:
            self._build_failure_recovery(False, withfloats=True)
            self._build_failure_recovery(True, withfloats=True)
            self._build_wb_slowpath(False, withfloats=True)
            self._build_wb_slowpath(True, withfloats=True)
            support.ensure_sse2_floats()
            self._build_float_constants()
        self._build_propagate_exception_path()
        if gc_ll_descr.get_malloc_slowpath_addr is not None:
            self._build_malloc_slowpath()
        self._build_stack_check_slowpath()
        if gc_ll_descr.gcrootmap:
            self._build_release_gil(gc_ll_descr.gcrootmap)
        if not self._debug:
            # if self._debug is already set it means that someone called
            # set_debug by hand before initializing the assembler.  Leave it
            # as it is
            debug_start('jit-backend-counts')
            self.set_debug(have_debug_prints())
            debug_stop('jit-backend-counts')

    def setup(self, looptoken):
        assert self.memcpy_addr != 0, "setup_once() not called?"
        self.current_clt = looptoken.compiled_loop_token
        self.pending_guard_tokens = []
        if WORD == 8:
            self.pending_memoryerror_trampoline_from = []
            self.error_trampoline_64 = 0
        self.mc = codebuf.MachineCodeBlockWrapper()
        #assert self.datablockwrapper is None --- but obscure case
        # possible, e.g. getting MemoryError and continuing
        allblocks = self.get_asmmemmgr_blocks(looptoken)
        self.datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr,
                                                        allblocks)
        self.target_tokens_currently_compiling = {}

    def teardown(self):
        self.pending_guard_tokens = None
        if WORD == 8:
            self.pending_memoryerror_trampoline_from = None
        self.mc = None
        self.current_clt = None

    def finish_once(self):
        if self._debug:
            debug_start('jit-backend-counts')
            for i in range(len(self.loop_run_counters)):
                struct = self.loop_run_counters[i]
                if struct.type == 'l':
                    prefix = 'TargetToken(%d)' % struct.number
                elif struct.type == 'b':
                    prefix = 'bridge ' + str(struct.number)
                else:
                    prefix = 'entry ' + str(struct.number)
                debug_print(prefix + ':' + str(struct.i))
            debug_stop('jit-backend-counts')

    def _build_float_constants(self):
        datablockwrapper = MachineDataBlockWrapper(self.cpu.asmmemmgr, [])
        float_constants = datablockwrapper.malloc_aligned(32, alignment=16)
        datablockwrapper.done()
        addr = rffi.cast(rffi.CArrayPtr(lltype.Char), float_constants)
        qword_padding = '\x00\x00\x00\x00\x00\x00\x00\x00'
        # 0x8000000000000000
        neg_const = '\x00\x00\x00\x00\x00\x00\x00\x80'
        # 0x7FFFFFFFFFFFFFFF
        abs_const = '\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F'
        data = neg_const + qword_padding + abs_const + qword_padding
        for i in range(len(data)):
            addr[i] = data[i]
        self.float_const_neg_addr = float_constants
        self.float_const_abs_addr = float_constants + 16
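
    # Layout of the 32-byte block (16-byte aligned for SSE): the sign-bit mask
    # 0x8000000000000000 at offset 0 and the 0x7FFFFFFFFFFFFFFF mask at offset
    # 16, each padded to 16 bytes.  Presumably these serve as memory operands
    # for XORPD/ANDPD when emitting FLOAT_NEG and FLOAT_ABS later in the
    # (truncated) file.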

    def _build_malloc_slowpath(self):
        # With asmgcc, we need two helpers, so that we can write two CALL
        # instructions in assembler, with a mark_gc_roots in between.
        # With shadowstack, this is not needed, so we produce a single helper.
        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        shadow_stack = (gcrootmap is not None and gcrootmap.is_shadow_stack)
        #
        # ---------- first helper for the slow path of malloc ----------
        mc = codebuf.MachineCodeBlockWrapper()
        if self.cpu.supports_floats:          # save the XMM registers in
            for i in range(self.cpu.NUM_REGS):# the *caller* frame, from esp+8
                mc.MOVSD_sx((WORD*2)+8*i, i)
        mc.SUB_rr(edx.value, eax.value)       # compute the size we want
        addr = self.cpu.gc_ll_descr.get_malloc_slowpath_addr()
        #
        # The registers to save in the copy area: with shadowstack, most
        # registers need to be saved.  With asmgcc, the callee-saved registers
        # don't need to.
        save_in_copy_area = gpr_reg_mgr_cls.REGLOC_TO_COPY_AREA_OFS.items()
        if not shadow_stack:
            save_in_copy_area = [(reg, ofs) for (reg, ofs) in save_in_copy_area
                   if reg not in gpr_reg_mgr_cls.REGLOC_TO_GCROOTMAP_REG_INDEX]
        #
        for reg, ofs in save_in_copy_area:
            mc.MOV_br(ofs, reg.value)
        #
        if shadow_stack:
            # ---- shadowstack ----
            mc.SUB_ri(esp.value, 16 - WORD)      # stack alignment of 16 bytes
            if IS_X86_32:
                mc.MOV_sr(0, edx.value)          # push argument
            elif IS_X86_64:
                mc.MOV_rr(edi.value, edx.value)
            mc.CALL(imm(addr))
            mc.ADD_ri(esp.value, 16 - WORD)
        else:
            # ---- asmgcc ----
            if IS_X86_32:
                mc.MOV_sr(WORD, edx.value)       # save it as the new argument
            elif IS_X86_64:
                # rdi can be clobbered: its content was saved in the
                # copy area of the stack
                mc.MOV_rr(edi.value, edx.value)
            mc.JMP(imm(addr))                    # tail call to the real malloc
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.malloc_slowpath1 = rawstart

        # ---------- second helper for the slow path of malloc ----------
        mc = codebuf.MachineCodeBlockWrapper()
        #
        for reg, ofs in save_in_copy_area:
            mc.MOV_rb(reg.value, ofs)
            assert reg is not eax and reg is not edx
        #
        if self.cpu.supports_floats:          # restore the XMM registers
            for i in range(self.cpu.NUM_REGS):# from where they were saved
                mc.MOVSD_xs(i, (WORD*2)+8*i)
        #
        # Note: we check this after the code above, just because the code
        # above is more than 127 bytes on 64-bits...
        mc.TEST_rr(eax.value, eax.value)
        mc.J_il8(rx86.Conditions['Z'], 0)     # patched later
        jz_location = mc.get_relative_pos()
        #
        nursery_free_adr = self.cpu.gc_ll_descr.get_nursery_free_addr()
        mc.MOV(edx, heap(nursery_free_adr))   # load this in EDX
        mc.RET()
        #
        # If the slowpath malloc failed, we raise a MemoryError that
        # always interrupts the current loop, as a "good enough"
        # approximation.  Also note that we didn't RET from this helper;
        # but the code we jump to will actually restore the stack
        # position based on EBP, which will get us out of here for free.
        offset = mc.get_relative_pos() - jz_location
        assert 0 < offset <= 127
        mc.overwrite(jz_location-1, chr(offset))
        mc.JMP(imm(self.propagate_exception_path))
        #
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.malloc_slowpath2 = rawstart
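
    # Note on the calling convention of these helpers (as it appears from the
    # code above): on the non-error path the caller receives the allocation
    # result in EAX, left there by the GC's malloc function, and the refreshed
    # nursery-free pointer in EDX, reloaded just before the RET; EAX == 0
    # means the allocation failed and we jump to propagate_exception_path.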

    def _build_propagate_exception_path(self):
        if self.cpu.propagate_exception_v < 0:
            return      # not supported (for tests, or non-translated)
        #
        self.mc = codebuf.MachineCodeBlockWrapper()
        # call on_leave_jitted_save_exc()
        addr = self.cpu.get_on_leave_jitted_int(save_exception=True,
                                                default_to_memoryerror=True)
        self.mc.CALL(imm(addr))
        self.mc.MOV_ri(eax.value, self.cpu.propagate_exception_v)
        self._call_footer()
        rawstart = self.mc.materialize(self.cpu.asmmemmgr, [])
        self.propagate_exception_path = rawstart
        self.mc = None

    def _build_stack_check_slowpath(self):
        _, _, slowpathaddr = self.cpu.insert_stack_check()
        if slowpathaddr == 0 or self.cpu.propagate_exception_v < 0:
            return      # no stack check (for tests, or non-translated)
        #
        # make a "function" that is called immediately at the start of
        # an assembler function.  In particular, the stack looks like:
        #
        #     |  ...                 |    <-- aligned to a multiple of 16
        #     |  retaddr of caller   |
        #     |  my own retaddr      |    <-- esp
        #     +----------------------+
        #
        mc = codebuf.MachineCodeBlockWrapper()
        #
        stack_size = WORD
        if IS_X86_64:
            # on the x86_64, we have to save all the registers that may
            # have been used to pass arguments
            stack_size += 6*WORD + 8*8
            for reg in [edi, esi, edx, ecx, r8, r9]:
                mc.PUSH_r(reg.value)
            mc.SUB_ri(esp.value, 8*8)
            for i in range(8):
                mc.MOVSD_sx(8*i, i)     # xmm0 to xmm7
        #
        if IS_X86_32:
            stack_size += 2*WORD
            mc.PUSH_r(eax.value)        # alignment
            mc.PUSH_r(esp.value)
        elif IS_X86_64:
            mc.MOV_rr(edi.value, esp.value)
        #
        # esp is now aligned to a multiple of 16 again
        mc.CALL(imm(slowpathaddr))
        #
        mc.MOV(eax, heap(self.cpu.pos_exception()))
        mc.TEST_rr(eax.value, eax.value)
        mc.J_il8(rx86.Conditions['NZ'], 0)
        jnz_location = mc.get_relative_pos()
        #
        if IS_X86_32:
            mc.ADD_ri(esp.value, 2*WORD)    # cancel the two PUSHes above
        elif IS_X86_64:
            # restore the registers
            for i in range(7, -1, -1):
                mc.MOVSD_xs(i, 8*i)
            mc.ADD_ri(esp.value, 8*8)
            for reg in [r9, r8, ecx, edx, esi, edi]:
                mc.POP_r(reg.value)
        #
        mc.RET()
        #
        # patch the JNZ above
        offset = mc.get_relative_pos() - jnz_location
        assert 0 < offset <= 127
        mc.overwrite(jnz_location-1, chr(offset))
        # call on_leave_jitted_save_exc()
        addr = self.cpu.get_on_leave_jitted_int(save_exception=True)
        mc.CALL(imm(addr))
        #
        mc.MOV_ri(eax.value, self.cpu.propagate_exception_v)
        #
        # footer -- note the ADD, which skips the return address of this
        # function, and will instead return to the caller's caller.  Note
        # also that we completely ignore the saved arguments, because we
        # are interrupting the function.
        mc.ADD_ri(esp.value, stack_size)
        mc.RET()
        #
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.stack_check_slowpath = rawstart

    def _build_wb_slowpath(self, withcards, withfloats=False):
        descr = self.cpu.gc_ll_descr.write_barrier_descr
        if descr is None:
            return
        if not withcards:
            func = descr.get_write_barrier_fn(self.cpu)
        else:
            if descr.jit_wb_cards_set == 0:
                return
            func = descr.get_write_barrier_from_array_fn(self.cpu)
            if func == 0:
                return
        #
        # This builds a helper function called from the slow path of
        # write barriers.  It must save all registers, and optionally
        # all XMM registers.  It takes a single argument just pushed
        # on the stack even on X86_64.  It must restore stack alignment
        # accordingly.
        mc = codebuf.MachineCodeBlockWrapper()
        #
        frame_size = (1 +     # my argument, considered part of my frame
                      1 +     # my return address
                      len(gpr_reg_mgr_cls.save_around_call_regs))
        if withfloats:
            frame_size += 16     # X86_32: 16 words for 8 registers;
                                 # X86_64: just 16 registers
        if IS_X86_32:
            frame_size += 1      # argument to pass to the call
        #
        # align to a multiple of 16 bytes
        frame_size = (frame_size + (CALL_ALIGN-1)) & ~(CALL_ALIGN-1)
        #
        correct_esp_by = (frame_size - 2) * WORD
        mc.SUB_ri(esp.value, correct_esp_by)
        #
        ofs = correct_esp_by
        if withfloats:
            for reg in xmm_reg_mgr_cls.save_around_call_regs:
                ofs -= 8
                mc.MOVSD_sx(ofs, reg.value)
        for reg in gpr_reg_mgr_cls.save_around_call_regs:
            ofs -= WORD
            mc.MOV_sr(ofs, reg.value)
        #
        if IS_X86_32:
            mc.MOV_rs(eax.value, (frame_size - 1) * WORD)
            mc.MOV_sr(0, eax.value)
        elif IS_X86_64:
            mc.MOV_rs(edi.value, (frame_size - 1) * WORD)
        mc.CALL(imm(func))
        #
        if withcards:
            # A final TEST8 before the RET, for the caller.  Careful to
            # not follow this instruction with another one that changes
            # the status of the CPU flags!
            mc.MOV_rs(eax.value, (frame_size - 1) * WORD)
            mc.TEST8(addr_add_const(eax, descr.jit_wb_if_flag_byteofs),
                     imm(-0x80))
        #
        ofs = correct_esp_by
        if withfloats:
            for reg in xmm_reg_mgr_cls.save_around_call_regs:
                ofs -= 8
                mc.MOVSD_xs(reg.value, ofs)
        for reg in gpr_reg_mgr_cls.save_around_call_regs:
            ofs -= WORD
            mc.MOV_rs(reg.value, ofs)
        #
        # ADD esp, correct_esp_by --- but cannot use ADD, because
        # of its effects on the CPU flags
        mc.LEA_rs(esp.value, correct_esp_by)
        mc.RET16_i(WORD)
        #
        rawstart = mc.materialize(self.cpu.asmmemmgr, [])
        self.wb_slowpath[withcards + 2 * withfloats] = rawstart
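
    # The four slowpath variants are indexed by 'withcards + 2 * withfloats':
    # 0 = plain, 1 = with cards, 2 = with floats, 3 = cards + floats.  The
    # RET16_i(WORD) above also pops the single stack argument that the caller
    # pushed, as described in the comment at the top of this helper.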

    @staticmethod
    @rgc.no_collect
    def _release_gil_asmgcc(css):
        # similar to trackgcroot.py:pypy_asm_stackwalk, first part
        from pypy.rpython.memory.gctransform import asmgcroot
        new = rffi.cast(asmgcroot.ASM_FRAMEDATA_HEAD_PTR, css)
        next = asmgcroot.gcrootanchor.next
        new.next = next
        new.prev = asmgcroot.gcrootanchor
        asmgcroot.gcrootanchor.next = new
        next.prev = new
        # and now release the GIL
        before = rffi.aroundstate.before
        if before:
            before()

    @staticmethod
    @rgc.no_collect
    def _reacquire_gil_asmgcc(css):
        # first reacquire the GIL
        after = rffi.aroundstate.after
        if after:
            after()
        # similar to trackgcroot.py:pypy_asm_stackwalk, second part
        from pypy.rpython.memory.gctransform import asmgcroot
        old = rffi.cast(asmgcroot.ASM_FRAMEDATA_HEAD_PTR, css)
        prev = old.prev
        next = old.next
        prev.next = next
        next.prev = prev

    @staticmethod
    @rgc.no_collect
    def _release_gil_shadowstack():
        before = rffi.aroundstate.before
        if before:
            before()

    @staticmethod
    @rgc.no_collect
    def _reacquire_gil_shadowstack():
        after = rffi.aroundstate.after
        if after:
            after()

    _NOARG_FUNC = lltype.Ptr(lltype.FuncType([], lltype.Void))
    _CLOSESTACK_FUNC = lltype.Ptr(lltype.FuncType([rffi.LONGP],
                                                  lltype.Void))

    def _build_release_gil(self, gcrootmap):
        if gcrootmap.is_shadow_stack:
            releasegil_func = llhelper(self._NOARG_FUNC,
                                       self._release_gil_shadowstack)
            reacqgil_func = llhelper(self._NOARG_FUNC,
                                     self._reacquire_gil_shadowstack)
        else:
            releasegil_func = llhelper(self._CLOSESTACK_FUNC,
                                       self._release_gil_asmgcc)
            reacqgil_func = llhelper(self._CLOSESTACK_FUNC,
                                     self._reacquire_gil_asmgcc)
        self.releasegil_addr = self.cpu.cast_ptr_to_int(releasegil_func)
        self.reacqgil_addr = self.cpu.cast_ptr_to_int(reacqgil_func)

    def assemble_loop(self, loopname, inputargs, operations, looptoken, log):
        '''adds the following attributes to looptoken:
               _x86_function_addr   (address of the generated func, as an int)
               _x86_loop_code       (debug: addr of the start of the ResOps)
               _x86_fullsize        (debug: full size including failure)
               _x86_debug_checksum
        '''
        # XXX this function is too longish and contains some code
        # duplication with assemble_bridge().  Also, we should think
        # about not storing on 'self' attributes that will live only
        # for the duration of compiling one loop or one bridge.
        clt = CompiledLoopToken(self.cpu, looptoken.number)
        clt.allgcrefs = []
        looptoken.compiled_loop_token = clt
        if not we_are_translated():
            # Arguments should be unique
            assert len(set(inputargs)) == len(inputargs)
        self.setup(looptoken)
        if log:
            operations = self._inject_debugging_code(looptoken, operations,
                                                     'e', looptoken.number)
        regalloc = RegAlloc(self, self.cpu.translate_support_code)
        #
        self._call_header_with_stack_check()
        stackadjustpos = self._patchable_stackadjust()
        clt._debug_nbargs = len(inputargs)
        operations = regalloc.prepare_loop(inputargs, operations,
                                           looptoken, clt.allgcrefs)
        looppos = self.mc.get_relative_pos()
        looptoken._x86_loop_code = looppos
        clt.frame_depth = -1     # temporarily
        frame_depth = self._assemble(regalloc, operations)
        clt.frame_depth = frame_depth
        #
        size_excluding_failure_stuff = self.mc.get_relative_pos()
        self.write_pending_failure_recoveries()
        full_size = self.mc.get_relative_pos()
        #
        rawstart = self.materialize_loop(looptoken)
        debug_start("jit-backend-addr")
        debug_print("Loop %d (%s) has address %x to %x (bootstrap %x)" % (
            looptoken.number, loopname,
            rawstart + looppos,
            rawstart + size_excluding_failure_stuff,
            rawstart))
        debug_stop("jit-backend-addr")
        self._patch_stackadjust(rawstart + stackadjustpos, frame_depth)
        self.patch_pending_failure_recoveries(rawstart)
        #
        ops_offset = self.mc.ops_offset
        if not we_are_translated():
            # used only by looptoken.dump() -- useful in tests
            looptoken._x86_rawstart = rawstart
            looptoken._x86_fullsize = full_size
            looptoken._x86_ops_offset = ops_offset
        looptoken._x86_function_addr = rawstart
        self.fixup_target_tokens(rawstart)
        self.teardown()
        # oprofile support
        if self.cpu.profile_agent is not None:
            name = "Loop # %s: %s" % (looptoken.number, loopname)
            self.cpu.profile_agent.native_code_written(name,
                                                       rawstart, full_size)
        return AsmInfo(ops_offset, rawstart + looppos,
                       size_excluding_failure_stuff - looppos)

    def assemble_bridge(self, faildescr, inputargs, operations,
                        original_loop_token, log):
        if not we_are_translated():
            # Arguments should be unique
            assert len(set(inputargs)) == len(inputargs)
        descr_number = self.cpu.get_fail_descr_number(faildescr)
        failure_recovery = self._find_failure_recovery_bytecode(faildescr)
        self.setup(original_loop_token)
        if log:
            operations = self._inject_debugging_code(faildescr, operations,
                                                     'b', descr_number)
        arglocs = self.rebuild_faillocs_from_descr(failure_recovery)
        if not we_are_translated():
            assert ([loc.assembler() for loc in arglocs] ==
                    [loc.assembler() for loc in faildescr._x86_debug_faillocs])
        regalloc = RegAlloc(self, self.cpu.translate_support_code)
        startpos = self.mc.get_relative_pos()
        operations = regalloc.prepare_bridge(inputargs, arglocs,
                                             operations,
                                             self.current_clt.allgcrefs)
        stackadjustpos = self._patchable_stackadjust()
        frame_depth = self._assemble(regalloc, operations)
        codeendpos = self.mc.get_relative_pos()
        self.write_pending_failure_recoveries()
        fullsize = self.mc.get_relative_pos()
        #
        rawstart = self.materialize_loop(original_loop_token)
        debug_start("jit-backend-addr")
        debug_print("bridge out of Guard %d has address %x to %x" %
                    (descr_number, rawstart, rawstart + codeendpos))
        debug_stop("jit-backend-addr")
        self._patch_stackadjust(rawstart + stackadjustpos, frame_depth)
        self.patch_pending_failure_recoveries(rawstart)
        if not we_are_translated():
            # for the benefit of tests
            faildescr._x86_bridge_frame_depth = frame_depth
        # patch the jump from original guard
        self.patch_jump_for_descr(faildescr, rawstart)
        ops_offset = self.mc.ops_offset
        self.fixup_target_tokens(rawstart)
        self.current_clt.frame_depth = max(self.current_clt.frame_depth, frame_depth)
        self.teardown()
        # oprofile support
        if self.cpu.profile_agent is not None:
            name = "Bridge # %s" % (descr_number,)
            self.cpu.profile_agent.native_code_written(name,
                                                       rawstart, fullsize)
        return AsmInfo(ops_offset, startpos + rawstart, codeendpos - startpos)

    def write_pending_failure_recoveries(self):
        # for each pending guard, generate the code of the recovery stub
        # at the end of self.mc.
        for tok in self.pending_guard_tokens:
            tok.pos_recovery_stub = self.generate_quick_failure(tok)
        if WORD == 8 and len(self.pending_memoryerror_trampoline_from) > 0:
            self.error_trampoline_64 = self.generate_propagate_error_64()

    def patch_pending_failure_recoveries(self, rawstart):
        # after we wrote the assembler to raw memory, set up
        # tok.faildescr._x86_adr_jump_offset to contain the raw address of
        # the 4-byte target field in the JMP/Jcond instruction, and patch
        # the field in question to point (initially) to the recovery stub
        clt = self.current_clt
        for tok in self.pending_guard_tokens:
            addr = rawstart + tok.pos_jump_offset
            tok.faildescr._x86_adr_jump_offset = addr
            relative_target = tok.pos_recovery_stub - (tok.pos_jump_offset + 4)
            assert rx86.fits_in_32bits(relative_target)
            #
            if not tok.is_guard_not_invalidated:
                mc = codebuf.MachineCodeBlockWrapper()
                mc.writeimm32(relative_target)
                mc.copy_to_raw_memory(addr)
            else:
                # GUARD_NOT_INVALIDATED, record an entry in
                # clt.invalidate_positions of the form:
                #     (addr-in-the-code-of-the-not-yet-written-jump-target,
                #      relative-target-to-use)
                relpos = tok.pos_jump_offset
                clt.invalidate_positions.append((rawstart + relpos,
                                                 relative_target))
                # General idea: Although no code was generated by this
                # guard, the code might be patched with a "JMP rel32" to
                # the guard recovery code.  This recovery code is
                # already generated, and looks like the recovery code
                # for any guard, even if at first it has no jump to it.
                # So we may later write 5 bytes overriding the existing
                # instructions; this works because a CALL instruction
                # would also take at least 5 bytes.  If it could take
                # less, we would run into the issue that overwriting the
                # 5 bytes here might get a few nonsense bytes at the
                # return address of the following CALL.
        if WORD == 8:
            for pos_after_jz in self.pending_memoryerror_trampoline_from:
                assert self.error_trampoline_64 != 0     # only if non-empty
                mc = codebuf.MachineCodeBlockWrapper()
                mc.writeimm32(self.error_trampoline_64 - pos_after_jz)
                mc.copy_to_raw_memory(rawstart + pos_after_jz - 4)

    def get_asmmemmgr_blocks(self, looptoken):
        clt = looptoken.compiled_loop_token
        if clt.asmmemmgr_blocks is None:
            clt.asmmemmgr_blocks = []
        return clt.asmmemmgr_blocks

    def materialize_loop(self, looptoken):
        self.datablockwrapper.done()      # finish using cpu.asmmemmgr
        self.datablockwrapper = None
        allblocks = self.get_asmmemmgr_blocks(looptoken)
        return self.mc.materialize(self.cpu.asmmemmgr, allblocks,
                                   self.cpu.gc_ll_descr.gcrootmap)

    def _register_counter(self, tp, number, token):
        # YYY very minor leak -- we need the counters to stay alive
        # forever, just because we want to report them at the end
        # of the process
        struct = lltype.malloc(DEBUG_COUNTER, flavor='raw',
                               track_allocation=False)
        struct.i = 0
        struct.type = tp
        if tp == 'b' or tp == 'e':
            struct.number = number
        else:
            assert token
            struct.number = compute_unique_id(token)
        self.loop_run_counters.append(struct)
        return struct

    def _find_failure_recovery_bytecode(self, faildescr):
        adr_jump_offset = faildescr._x86_adr_jump_offset
        if adr_jump_offset == 0:
            # This case should be prevented by the logic in compile.py:
            # look for CNT_BUSY_FLAG, which disables tracing from a guard
            # when another tracing from the same guard is already in progress.
            raise BridgeAlreadyCompiled
        # follow the JMP/Jcond
        p = rffi.cast(rffi.INTP, adr_jump_offset)
        adr_target = adr_jump_offset + 4 + rffi.cast(lltype.Signed, p[0])
        # skip the CALL
        if WORD == 4:
            adr_target += 5     # CALL imm
        else:
            adr_target += 13    # MOV r11, imm-as-8-bytes; CALL *r11 xxxxxxxxxx
        return adr_target
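
    # (On 64-bit the 13 bytes are the 10-byte "MOV r11, imm64" followed by the
    # 3-byte "CALL *r11" -- these sizes come from the x86-64 encoding, not
    # from this file -- which is why the stub is always emitted in that fixed
    # form; see the comment in patch_jump_for_descr() below.)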

    def patch_jump_for_descr(self, faildescr, adr_new_target):
        adr_jump_offset = faildescr._x86_adr_jump_offset
        assert adr_jump_offset != 0
        offset = adr_new_target - (adr_jump_offset + 4)
        # If the new target fits within a rel32 of the jump, just patch
        # that.  Otherwise, leave the original rel32 to the recovery stub in
        # place, but clobber the recovery stub with a jump to the real
        # target.
        mc = codebuf.MachineCodeBlockWrapper()
        if rx86.fits_in_32bits(offset):
            mc.writeimm32(offset)
            mc.copy_to_raw_memory(adr_jump_offset)
        else:
            # "mov r11, addr; jmp r11" is up to 13 bytes, which fits in there
            # because we always write "mov r11, imm-as-8-bytes; call *r11" in
            # the first place.
            mc.MOV_ri(X86_64_SCRATCH_REG.value, adr_new_target)
            mc.JMP_r(X86_64_SCRATCH_REG.value)
            p = rffi.cast(rffi.INTP, adr_jump_offset)
            adr_target = adr_jump_offset + 4 + rffi.cast(lltype.Signed, p[0])
            mc.copy_to_raw_memory(adr_target)
        faildescr._x86_adr_jump_offset = 0    # means "patched"

    def fixup_target_tokens(self, rawstart):
        for targettoken in self.target_tokens_currently_compiling:
            targettoken._x86_loop_code += rawstart
        self.target_tokens_currently_compiling = None

    def _append_debugging_code(self, operations, tp, number, token):
        counter = self._register_counter(tp, number, token)
        c_adr = ConstInt(rffi.cast(lltype.Signed, counter))
        box = BoxInt()
        box2 = BoxInt()
        ops = [ResOperation(rop.GETFIELD_RAW, [c_adr],
                            box, descr=self.debug_counter_descr),
               ResOperation(rop.INT_ADD, [box, ConstInt(1)], box2),
               ResOperation(rop.SETFIELD_RAW, [c_adr, box2],
                            None, descr=self.debug_counter_descr)]
        operations.extend(ops)

    @specialize.argtype(1)
    def _inject_debugging_code(self, looptoken, operations, tp, number):
        if self._debug:
            s = 0
            for op in operations:
                s += op.getopnum()
            looptoken._x86_debug_checksum = s
            newoperations = []
            self._append_debugging_code(newoperations, tp, number,
                                        None)
            for op in operations:
                newoperations.append(op)
                if op.getopnum() == rop.LABEL:
                    self._append_debugging_code(newoperations, 'l', number,
                                                op.getdescr())
            operations = newoperations
        return operations

    def _assemble(self, regalloc, operations):
        self._regalloc = regalloc
        regalloc.compute_hint_frame_locations(operations)
        regalloc.walk_operations(operations)
        if we_are_translated() or self.cpu.dont_keepalive_stuff:
            self._regalloc = None   # else keep it around for debugging
        frame_depth = regalloc.get_final_frame_depth()
        jump_target_descr = regalloc.jump_target_descr
        if jump_target_descr is not None:
            target_frame_depth = jump_target_descr._x86_clt.frame_depth
            frame_depth = max(frame_depth, target_frame_depth)
        return frame_depth

    def _patchable_stackadjust(self):
        # stack adjustment LEA
        self.mc.LEA32_rb(esp.value, 0)
        return self.mc.get_relative_pos() - 4

    def _patch_stackadjust(self, adr_lea, allocated_depth):
        # patch stack adjustment LEA
        mc = codebuf.MachineCodeBlockWrapper()
        # Compute the correct offset for the instruction LEA ESP, [EBP-4*words]
        mc.writeimm32(self._get_offset_of_ebp_from_esp(allocated_depth))
        mc.copy_to_raw_memory(adr_lea)

    def _get_offset_of_ebp_from_esp(self, allocated_depth):
        # Given that [EBP] is where we saved EBP, i.e. in the last word
        # of our fixed frame, then the 'words' value is:
        words = (FRAME_FIXED_SIZE - 1) + allocated_depth
        # align, e.g. for Mac OS X
        aligned_words = align_stack_words(words+2)-2     # 2 = EIP+EBP
        return -WORD * aligned_words

    def _call_header(self):
        # NB. the shape of the frame is hard-coded in get_basic_shape() too.
        # Also, make sure this is consistent with FRAME_FIXED_SIZE.
        self.mc.PUSH_r(ebp.value)
        self.mc.MOV_rr(ebp.value, esp.value)
        for loc in self.cpu.CALLEE_SAVE_REGISTERS:
            self.mc.PUSH_r(loc.value)
        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        if gcrootmap and gcrootmap.is_shadow_stack:
            self._call_header_shadowstack(gcrootmap)

    def _call_header_with_stack_check(self):
        if self.stack_check_slowpath == 0:
            pass                # no stack check (e.g. not translated)
        else:
            endaddr, lengthaddr, _ = self.cpu.insert_stack_check()
            self.mc.MOV(eax, heap(endaddr))             # MOV eax, [start]
            self.mc.SUB(eax, esp)                       # SUB eax, current
            self.mc.CMP(eax, heap(lengthaddr))          # CMP eax, [length]
            self.mc.J_il8(rx86.Conditions['BE'], 0)     # JBE .skip
            jb_location = self.mc.get_relative_pos()
            self.mc.CALL(imm(self.stack_check_slowpath))# CALL slowpath
            # patch the JB above                        # .skip:
            offset = self.mc.get_relative_pos() - jb_location
            assert 0 < offset <= 127
            self.mc.overwrite(jb_location-1, chr(offset))
        #
        self._call_header()

    def _call_footer(self):
        self.mc.LEA_rb(esp.value, -len(self.cpu.CALLEE_SAVE_REGISTERS) * WORD)
        gcrootmap = self.cpu.gc_ll_descr.gcrootmap
        if gcrootmap and gcrootmap.is_shadow_stack:
            self._call_footer_shadowstack(gcrootmap)
        for i in range(len(self.cpu.CALLEE_SAVE_REGISTERS)-1, -1, -1):
            self.mc.POP_r(self.cpu.CALLEE_SAVE_REGISTERS[i].value)
        self.mc.POP_r(ebp.value)
        self.mc.RET()

    def _call_header_shadowstack(self, gcrootmap):
        # we need to put two words into the shadowstack: the MARKER_FRAME
        # and the address of the frame (ebp, actually)
        rst = gcrootmap.get_root_stack_top_addr()
        if rx86.fits_in_32bits(rst):
            self.mc.MOV_rj(eax.value, rst)             # MOV eax, [rootstacktop]
        else:
            self.mc.MOV_ri(r13.value, rst)             # MOV r13, rootstacktop
            self.mc.MOV_rm(eax.value, (r13.value, 0))  # MOV eax, [r13]
        #
        MARKER = gcrootmap.MARKER_FRAME
        self.mc.LEA_rm(ebx.value, (eax.value, 2*WORD)) # LEA ebx, [eax+2*WORD]
        self.mc.MOV_mi((eax.value, WORD), MARKER)      # MOV [eax+WORD], MARKER
        self.mc.MOV_mr((eax.value, 0), ebp.value)      # MOV [eax], ebp
        #
        if rx86.fits_in_32bits(rst):
            self.mc.MOV_jr(rst, ebx.value)             # MOV [rootstacktop], ebx
        else:
            self.mc.MOV_mr((r13.value, 0), ebx.value)  # MOV [r13], ebx

    def _call_footer_shadowstack(self, gcrootmap):
        rst = gcrootmap.get_root_stack_top_addr()
        if rx86.fits_in_32bits(rst):
            self.mc.SUB_ji8(rst, 2*WORD)               # SUB [rootstacktop], 2*WORD
        else:
            self.mc.MOV_ri(ebx.value, rst)             # MOV ebx, rootstacktop
            self.mc.SUB_mi8((ebx.value, 0), 2*WORD)    # SUB [ebx], 2*WORD
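
    # The shadowstack header/footer pair above keeps the root stack balanced:
    # the header writes ebp at [top] and MARKER_FRAME at [top+WORD], then
    # advances rootstacktop by 2*WORD (via the LEA into ebx), and the footer
    # undoes that with the matching SUB of 2*WORD.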

    def redirect_call_assembler(self, oldlooptoken, newlooptoken):
        # some minimal sanity checking
        old_nbargs = oldlooptoken.compiled_loop_token._debug_nbargs
        new_nbargs = newlooptoken.compiled_loop_token._debug_nbargs
        assert old_nbargs == new_nbargs
        # we overwrite the instructions at the old _x86_direct_bootstrap_code
        # to start with a JMP to the new _x86_direct_bootstrap_code.
        # Ideally we should rather patch all existing CALLs, but well.
        oldadr = oldlooptoken._x86_function_addr
        target = newlooptoken._x86_function_addr
        mc = codebuf.MachineCodeBlockWrapper()
        mc.JMP(imm(target))
        if WORD == 4:         # keep in sync with prepare_loop()
            assert mc.get_relative_pos() == 5
        else:
            assert mc.get_relative_pos() <= 13
        mc.copy_to_raw_memory(oldadr)

    def dump(self, text):
        if not self.verbose:
            return
        _prev = Box._extended_display
        try:
            Box._extended_display = False
            pos = self.mc.get_relative_pos()
            print >> sys.stderr, ' 0x%x  %s' % (pos, text)
        finally:
            Box._extended_display = _prev

    # ------------------------------------------------------------

    def mov(self, from_loc, to_loc):
        if (isinstance(from_loc, RegLoc) and from_loc.is_xmm) or (isinstance(to_loc, RegLoc) and to_loc.is_xmm):
            self.mc.MOVSD(to_loc, from_loc)
        else:
            assert to_loc is not ebp
            self.mc.MOV(to_loc, from_loc)

    regalloc_mov = mov      # legacy interface

    def regalloc_push(self, loc):
        if isinstance(loc, RegLoc) and loc.is_xmm:
            self.mc.SUB_ri(esp.value, 8)   # = size of doubles
            self.mc.MOVSD_sx(0, loc.value)
        elif WORD == 4 and isinstance(loc, StackLoc) and loc.get_width() == 8:
            # XXX evil trick
            self.mc.PUSH_b(loc.value + 4)
            self.mc.PUSH_b(loc.value)
        else:
            self.mc.PUSH(loc)

    def regalloc_pop(self, loc):
        if isinstance(loc, RegLoc) and loc.is_xmm:
            self.mc.MOVSD_xs(loc.value, 0)
            self.mc.ADD_ri(esp.value, 8)   # = size of doubles
        elif WORD == 4 and isinstance(loc, StackLoc) and loc.get_width() == 8:
            # XXX evil trick
            self.mc.POP_b(loc.value)
            self.mc.POP_b(loc.value + 4)
        else:
            self.mc.POP(loc)

    def regalloc_immedmem2mem(self, from_loc, to_loc):
        # move a ConstFloatLoc directly to a StackLoc, as two MOVs
        # (even on x86-64, because the immediates are encoded as 32 bits)
        assert isinstance(from_loc, ConstFloatLoc)
        assert isinstance(to_loc, StackLoc)
        low_part = rffi.cast(rffi.CArrayPtr(rffi.INT), from_loc.value)[0]
        high_part = rffi.cast(rffi.CArrayPtr(rffi.INT), from_loc.value)[1]
        low_part = intmask(low_part)
        high_part = intmask(high_part)
        self.mc.MOV32_bi(to_loc.value, low_part)
        self.mc.MOV32_bi(to_loc.value + 4, high_part)

    def regalloc_perform(self, op, arglocs, resloc):
        genop_list[op.getopnum()](self, op, arglocs, resloc)

    def regalloc_perform_discard(self, op, arglocs):
        genop_discard_list[op.getopnum()](self, op, arglocs)

    def regalloc_perform_llong(self, op, arglocs, resloc):
        effectinfo = op.getdescr().get_extra_info()
        oopspecindex = effectinfo.oopspecindex
        genop_llong_list[oopspecindex](self, op, arglocs, resloc)

    def regalloc_perform_math(self, op, arglocs, resloc):
        effectinfo = op.getdescr().get_extra_info()
        oopspecindex = effectinfo.oopspecindex
        genop_math_list[oopspecindex](self, op, arglocs, resloc)

    def regalloc_perform_with_guard(self, op, guard_op, faillocs,
                                    arglocs, resloc):
        faildescr = guard_op.getdescr()
        assert isinstance(faildescr, AbstractFailDescr)
        failargs = guard_op.getfailargs()
        guard_opnum = guard_op.getopnum()
        guard_token = self.implement_guard_recovery(guard_opnum,
                                                    faildescr, failargs,
                                                    faillocs)
        if op is None:
            dispatch_opnum = guard_opnum
        else:
            dispatch_opnum = op.getopnum()
        genop_guard_list[dispatch_opnum](self, op, guard_op, guard_token,
                                         arglocs, resloc)
        if not we_are_translated():
            # must be added by the genop_guard_list[]()
            assert guard_token is self.pending_guard_tokens[-1]

    def regalloc_perform_guard(self, guard_op, faillocs, arglocs, resloc):
        self.regalloc_perform_with_guard(None, guard_op, faillocs, arglocs,
                                         resloc)

    def load_effective_addr(self, sizereg, baseofs, scale, result, frm=imm0):
        self.mc.LEA(result, addr_add(frm, sizereg, baseofs, scale))

    def _unaryop(asmop):
        def genop_unary(self, op, arglocs, resloc):
            getattr(self.mc, asmop)(arglocs[0])
        return genop_unary

    def _binaryop(asmop, can_swap=False):
        def genop_binary(self, op, arglocs, result_loc):
            getattr(self.mc, asmop)(arglocs[0], arglocs[1])
        return genop_binary

    def _binaryop_or_lea(asmop, is_add):
        def genop_binary_or_lea(self, op, arglocs, result_loc):
            # use a regular ADD or SUB if result_loc is arglocs[0],
            # and a LEA only if different.
            if result_loc is arglocs[0]:
                getattr(self.mc, asmop)(arglocs[0], arglocs[1])
            else:
                loc = arglocs[0]
                argloc = arglocs[1]
                assert isinstance(loc, RegLoc)
                assert isinstance(argloc, ImmedLoc)
                assert isinstance(result_loc, RegLoc)
                delta = argloc.value
                if not is_add:    # subtraction
                    delta = -delta
                self.mc.LEA_rm(result_loc.value, (loc.value, delta))
        return genop_binary_or_lea
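
    # In the comparison helpers below, when the first argument of the
    # operation is a Const the emitted CMP has its operands swapped (x86 CMP
    # cannot take an immediate as its first operand), so the reversed
    # condition 'rev_cond' is the one that must be tested.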

    def _cmpop(cond, rev_cond):
        def genop_cmp(self, op, arglocs, result_loc):
            rl = result_loc.lowest8bits()
            if isinstance(op.getarg(0), Const):
                self.mc.CMP(arglocs[1], arglocs[0])
                self.mc.SET_ir(rx86.Conditions[rev_cond], rl.value)
            else:
                self.mc.CMP(arglocs[0], arglocs[1])
                self.mc.SET_ir(rx86.Conditions[cond], rl.value)
            self.mc.MOVZX8_rr(result_loc.value, rl.value)
        return genop_cmp

    def _cmpop_float(cond, rev_cond, is_ne=False):
        def genop_cmp(self, op, arglocs, result_loc):
            if isinstance(arglocs[0], RegLoc):
                self.mc.UCOMISD(arglocs[0], arglocs[1])
                checkcond = cond
            else:
                self.mc.UCOMISD(arglocs[1], arglocs[0])
                checkcond = rev_cond
            tmp1 = result_loc.lowest8bits()
            if IS_X86_32:
                tmp2 = result_loc.higher8bits()
            elif IS_X86_64:
                tmp2 = X86_64_SCRATCH_REG.lowest8bits()
            self.mc.SET_ir(rx86.Conditions[checkcond], tmp1.value)
            if is_ne:
                self.mc.SET_ir(rx86.Conditions['P'], tmp2.value)
                self.mc.OR8_rr(tmp1.value, tmp2.value)
            else:
                self.mc.SET_ir(rx86.Conditions['NP'], tmp2.value)
                self.mc.AND8_rr(tmp1.value, tmp2.value)
            self.mc.MOVZX8_rr(result_loc.value, tmp1.value)
        return genop_cmp

    def _cmpop_guard(cond, rev_cond, false_cond, false_rev_cond):
        def genop_cmp_guard(self, op, guard_op, guard_token, arglocs, result_loc):
            guard_opnum = guard_op.getopnum()
            if isinstance(op.getarg(0), Const):
                self.mc.CMP(arglocs[1], arglocs[0])
                if guard_opnum == rop.GUARD_FALSE:
                    self.implement_guard(guard_token, rev_cond)
                else:
                    self.implement_guard(guard_token, false_rev_cond)
            else:
                self.mc.CMP(arglocs[0], arglocs[1])
                if guard_opnum == rop.GUARD_FALSE:
                    self.implement_guard(guard_token, cond)
                else:
                    self.implement_guard(guard_token, false_cond)
        return genop_cmp_guard

    def _cmpop_guard_float(cond, rev_cond, false_cond, false_rev_cond):
        need_direct_jp = 'A' not in cond
        need_rev_jp = 'A' not in rev_cond
        def genop_cmp_guard_float(self, op, guard_op, guard_token, arglocs,
                                  result_loc):
            guard_opnum = guard_op.getopnum()
            if isinstance(arglocs[0], RegLoc):
                self.mc.UCOMISD(arglocs[0], arglocs[1])
                checkcond = cond
                checkfalsecond = false_cond
                need_jp = need_direct_jp
            else:
                self.mc.UCOMISD(arglocs[1], arglocs[0])
                checkcond = rev_cond
                checkfalsecond = false_rev_cond
                need_jp = need_rev_jp
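            # The hard-coded 8-bit jump offsets below skip over the code that
            # implement_guard() emits next: a rel32 Jcond is 6 bytes, a short
            # Jcond is 2 bytes and a rel32 JMP is 5 bytes (these sizes are
            # assumptions about the emitted instructions, not taken from this
            # file).  The extra JP handles the 'unordered' (NaN) case.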
            if guard_opnum == rop.GUARD_FALSE:
                if need_jp:
                    self.mc.J_il8(rx86.Conditions['P'], 6)
                self.implement_guard(guard_token, checkcond)
            else:
                if need_jp:
                    self.mc.J_il8(rx86.Conditions['P'], 2)
                    self.mc.J_il8(rx86.Conditions[checkcond], 5)
                    self.implement_guard(guard_token)
                else:
                    self.implement_guard(guard_token, checkfalsecond)
        return genop_cmp_guard_float

    def _emit_call(self, force_index, x, arglocs, start=0, tmp=eax,
                   argtypes=None, callconv=FFI_DEFAULT_ABI):
        if IS_X86_64:
            return self._emit_call_64(force_index, x, arglocs, start, argtypes)
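        # On 32-bit the outgoing arguments are copied in two passes: first the
        # values already in registers are stored into the outgoing stack area,
        # then the remaining locations are copied via xmm0 or the scratch
        # register 'tmp' -- presumably so that loading through 'tmp' cannot
        # clobber an argument that is still sitting in a register.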
        p = 0
        n = len(arglocs)
        for i in range(start, n):
            loc = arglocs[i]
            if isinstance(loc, RegLoc):
                if loc.is_xmm:
                    self.mc.MOVSD_sx(p, loc.value)
                else:
                    self.mc.MOV_sr(p, loc.value)
            p += loc.get_width()
        p = 0
        for i in range(start, n):
            loc = arglocs[i]
            if not isinstance(loc, RegLoc):
                if loc.get_width() == 8:
                    self.mc.MOVSD(xmm0, loc)
                    self.mc.MOVSD_sx(p, xmm0.value)
                else:
                    self.mc.MOV(tmp, loc)
                    self.mc.MOV_sr(p, tmp.value)
            p += loc.get_width()
        # x is a location
        self.mc.CALL(x)
        self.mark_gc_roots(force_index)
        #
        if callconv != FFI_DEFAULT_ABI:
            self._fix_stdcall(callconv, p)
        #
        self._regalloc.needed_extra_stack_locations(p//WORD)

    def _fix_stdcal
