/* Modules/_ctypes/libffi/src/x86/unix64.S
   (mirrored from the Unladen Swallow tree, unladen-swallow.googlecode.com) */
/* -----------------------------------------------------------------------
   unix64.S - Copyright (c) 2002  Bo Thorsen <bo@suse.de>
	      Copyright (c) 2008  Red Hat, Inc

   x86-64 Foreign Function Interface

   Permission is hereby granted, free of charge, to any person obtaining
   a copy of this software and associated documentation files (the
   ``Software''), to deal in the Software without restriction, including
   without limitation the rights to use, copy, modify, merge, publish,
   distribute, sublicense, and/or sell copies of the Software, and to
   permit persons to whom the Software is furnished to do so, subject to
   the following conditions:

   The above copyright notice and this permission notice shall be included
   in all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
   WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
   DEALINGS IN THE SOFTWARE.
   ----------------------------------------------------------------------- */
  23. #ifdef __x86_64__
  24. #define LIBFFI_ASM
  25. #include <fficonfig.h>
  26. #include <ffi.h>
  27. .text
  28. /* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
  29. void *raddr, void (*fnaddr)(void));
  30. Bit o trickiness here -- ARGS+BYTES is the base of the stack frame
  31. for this function. This has been allocated by ffi_call. We also
  32. deallocate some of the stack that has been alloca'd. */
  33. .align 2
  34. .globl ffi_call_unix64
  35. .type ffi_call_unix64,@function
  36. ffi_call_unix64:
  37. .LUW0:
  38. movq (%rsp), %r10 /* Load return address. */
  39. leaq (%rdi, %rsi), %rax /* Find local stack base. */
  40. movq %rdx, (%rax) /* Save flags. */
  41. movq %rcx, 8(%rax) /* Save raddr. */
  42. movq %rbp, 16(%rax) /* Save old frame pointer. */
  43. movq %r10, 24(%rax) /* Relocate return address. */
  44. movq %rax, %rbp /* Finalize local stack frame. */
  45. .LUW1:
  46. movq %rdi, %r10 /* Save a copy of the register area. */
  47. movq %r8, %r11 /* Save a copy of the target fn. */
  48. movl %r9d, %eax /* Set number of SSE registers. */
  49. /* Load up all argument registers. */
  50. movq (%r10), %rdi
  51. movq 8(%r10), %rsi
  52. movq 16(%r10), %rdx
  53. movq 24(%r10), %rcx
  54. movq 32(%r10), %r8
  55. movq 40(%r10), %r9
  56. testl %eax, %eax
  57. jnz .Lload_sse
  58. .Lret_from_load_sse:
  59. /* Deallocate the reg arg area. */
  60. leaq 176(%r10), %rsp
  61. /* Call the user function. */
  62. call *%r11
  63. /* Deallocate stack arg area; local stack frame in redzone. */
  64. leaq 24(%rbp), %rsp
  65. movq 0(%rbp), %rcx /* Reload flags. */
  66. movq 8(%rbp), %rdi /* Reload raddr. */
  67. movq 16(%rbp), %rbp /* Reload old frame pointer. */
  68. .LUW2:
  69. /* The first byte of the flags contains the FFI_TYPE. */
  70. movzbl %cl, %r10d
  71. leaq .Lstore_table(%rip), %r11
  72. movslq (%r11, %r10, 4), %r10
  73. addq %r11, %r10
  74. jmp *%r10
  75. .section .rodata
  76. .Lstore_table:
  77. .long .Lst_void-.Lstore_table /* FFI_TYPE_VOID */
  78. .long .Lst_sint32-.Lstore_table /* FFI_TYPE_INT */
  79. .long .Lst_float-.Lstore_table /* FFI_TYPE_FLOAT */
  80. .long .Lst_double-.Lstore_table /* FFI_TYPE_DOUBLE */
  81. .long .Lst_ldouble-.Lstore_table /* FFI_TYPE_LONGDOUBLE */
  82. .long .Lst_uint8-.Lstore_table /* FFI_TYPE_UINT8 */
  83. .long .Lst_sint8-.Lstore_table /* FFI_TYPE_SINT8 */
  84. .long .Lst_uint16-.Lstore_table /* FFI_TYPE_UINT16 */
  85. .long .Lst_sint16-.Lstore_table /* FFI_TYPE_SINT16 */
  86. .long .Lst_uint32-.Lstore_table /* FFI_TYPE_UINT32 */
  87. .long .Lst_sint32-.Lstore_table /* FFI_TYPE_SINT32 */
  88. .long .Lst_int64-.Lstore_table /* FFI_TYPE_UINT64 */
  89. .long .Lst_int64-.Lstore_table /* FFI_TYPE_SINT64 */
  90. .long .Lst_struct-.Lstore_table /* FFI_TYPE_STRUCT */
  91. .long .Lst_int64-.Lstore_table /* FFI_TYPE_POINTER */
  92. .text
  93. .align 2
  94. .Lst_void:
  95. ret
  96. .align 2
  97. .Lst_uint8:
  98. movzbq %al, %rax
  99. movq %rax, (%rdi)
  100. ret
  101. .align 2
  102. .Lst_sint8:
  103. movsbq %al, %rax
  104. movq %rax, (%rdi)
  105. ret
  106. .align 2
  107. .Lst_uint16:
  108. movzwq %ax, %rax
  109. movq %rax, (%rdi)
  110. .align 2
  111. .Lst_sint16:
  112. movswq %ax, %rax
  113. movq %rax, (%rdi)
  114. ret
  115. .align 2
  116. .Lst_uint32:
  117. movl %eax, %eax
  118. movq %rax, (%rdi)
  119. .align 2
  120. .Lst_sint32:
  121. cltq
  122. movq %rax, (%rdi)
  123. ret
  124. .align 2
  125. .Lst_int64:
  126. movq %rax, (%rdi)
  127. ret
  128. .align 2
  129. .Lst_float:
  130. movss %xmm0, (%rdi)
  131. ret
  132. .align 2
  133. .Lst_double:
  134. movsd %xmm0, (%rdi)
  135. ret
  136. .Lst_ldouble:
  137. fstpt (%rdi)
  138. ret
  139. .align 2
  140. .Lst_struct:
  141. leaq -20(%rsp), %rsi /* Scratch area in redzone. */
  142. /* We have to locate the values now, and since we don't want to
  143. write too much data into the user's return value, we spill the
  144. value to a 16 byte scratch area first. Bits 8, 9, and 10
  145. control where the values are located. Only one of the three
  146. bits will be set; see ffi_prep_cif_machdep for the pattern. */
  147. movd %xmm0, %r10
  148. movd %xmm1, %r11
  149. testl $0x100, %ecx
  150. cmovnz %rax, %rdx
  151. cmovnz %r10, %rax
  152. testl $0x200, %ecx
  153. cmovnz %r10, %rdx
  154. testl $0x400, %ecx
  155. cmovnz %r10, %rax
  156. cmovnz %r11, %rdx
  157. movq %rax, (%rsi)
  158. movq %rdx, 8(%rsi)
  159. /* Bits 12-31 contain the true size of the structure. Copy from
  160. the scratch area to the true destination. */
  161. shrl $12, %ecx
  162. rep movsb
  163. ret
  164. /* Many times we can avoid loading any SSE registers at all.
  165. It's not worth an indirect jump to load the exact set of
  166. SSE registers needed; zero or all is a good compromise. */
  167. .align 2
  168. .LUW3:
  169. .Lload_sse:
  170. movdqa 48(%r10), %xmm0
  171. movdqa 64(%r10), %xmm1
  172. movdqa 80(%r10), %xmm2
  173. movdqa 96(%r10), %xmm3
  174. movdqa 112(%r10), %xmm4
  175. movdqa 128(%r10), %xmm5
  176. movdqa 144(%r10), %xmm6
  177. movdqa 160(%r10), %xmm7
  178. jmp .Lret_from_load_sse
  179. .LUW4:
  180. .size ffi_call_unix64,.-ffi_call_unix64
  181. .align 2
  182. .globl ffi_closure_unix64
  183. .type ffi_closure_unix64,@function
  184. ffi_closure_unix64:
  185. .LUW5:
  186. /* The carry flag is set by the trampoline iff SSE registers
  187. are used. Don't clobber it before the branch instruction. */
  188. leaq -200(%rsp), %rsp
  189. .LUW6:
  190. movq %rdi, (%rsp)
  191. movq %rsi, 8(%rsp)
  192. movq %rdx, 16(%rsp)
  193. movq %rcx, 24(%rsp)
  194. movq %r8, 32(%rsp)
  195. movq %r9, 40(%rsp)
  196. jc .Lsave_sse
  197. .Lret_from_save_sse:
  198. movq %r10, %rdi
  199. leaq 176(%rsp), %rsi
  200. movq %rsp, %rdx
  201. leaq 208(%rsp), %rcx
  202. call ffi_closure_unix64_inner@PLT
  203. /* Deallocate stack frame early; return value is now in redzone. */
  204. addq $200, %rsp
  205. .LUW7:
  206. /* The first byte of the return value contains the FFI_TYPE. */
  207. movzbl %al, %r10d
  208. leaq .Lload_table(%rip), %r11
  209. movslq (%r11, %r10, 4), %r10
  210. addq %r11, %r10
  211. jmp *%r10
  212. .section .rodata
  213. .Lload_table:
  214. .long .Lld_void-.Lload_table /* FFI_TYPE_VOID */
  215. .long .Lld_int32-.Lload_table /* FFI_TYPE_INT */
  216. .long .Lld_float-.Lload_table /* FFI_TYPE_FLOAT */
  217. .long .Lld_double-.Lload_table /* FFI_TYPE_DOUBLE */
  218. .long .Lld_ldouble-.Lload_table /* FFI_TYPE_LONGDOUBLE */
  219. .long .Lld_int8-.Lload_table /* FFI_TYPE_UINT8 */
  220. .long .Lld_int8-.Lload_table /* FFI_TYPE_SINT8 */
  221. .long .Lld_int16-.Lload_table /* FFI_TYPE_UINT16 */
  222. .long .Lld_int16-.Lload_table /* FFI_TYPE_SINT16 */
  223. .long .Lld_int32-.Lload_table /* FFI_TYPE_UINT32 */
  224. .long .Lld_int32-.Lload_table /* FFI_TYPE_SINT32 */
  225. .long .Lld_int64-.Lload_table /* FFI_TYPE_UINT64 */
  226. .long .Lld_int64-.Lload_table /* FFI_TYPE_SINT64 */
  227. .long .Lld_struct-.Lload_table /* FFI_TYPE_STRUCT */
  228. .long .Lld_int64-.Lload_table /* FFI_TYPE_POINTER */
  229. .text
  230. .align 2
  231. .Lld_void:
  232. ret
  233. .align 2
  234. .Lld_int8:
  235. movzbl -24(%rsp), %eax
  236. ret
  237. .align 2
  238. .Lld_int16:
  239. movzwl -24(%rsp), %eax
  240. ret
  241. .align 2
  242. .Lld_int32:
  243. movl -24(%rsp), %eax
  244. ret
  245. .align 2
  246. .Lld_int64:
  247. movq -24(%rsp), %rax
  248. ret
  249. .align 2
  250. .Lld_float:
  251. movss -24(%rsp), %xmm0
  252. ret
  253. .align 2
  254. .Lld_double:
  255. movsd -24(%rsp), %xmm0
  256. ret
  257. .align 2
  258. .Lld_ldouble:
  259. fldt -24(%rsp)
  260. ret
  261. .align 2
  262. .Lld_struct:
  263. /* There are four possibilities here, %rax/%rdx, %xmm0/%rax,
  264. %rax/%xmm0, %xmm0/%xmm1. We collapse two by always loading
  265. both rdx and xmm1 with the second word. For the remaining,
  266. bit 8 set means xmm0 gets the second word, and bit 9 means
  267. that rax gets the second word. */
  268. movq -24(%rsp), %rcx
  269. movq -16(%rsp), %rdx
  270. movq -16(%rsp), %xmm1
  271. testl $0x100, %eax
  272. cmovnz %rdx, %rcx
  273. movd %rcx, %xmm0
  274. testl $0x200, %eax
  275. movq -24(%rsp), %rax
  276. cmovnz %rdx, %rax
  277. ret
  278. /* See the comment above .Lload_sse; the same logic applies here. */
  279. .align 2
  280. .LUW8:
  281. .Lsave_sse:
  282. movdqa %xmm0, 48(%rsp)
  283. movdqa %xmm1, 64(%rsp)
  284. movdqa %xmm2, 80(%rsp)
  285. movdqa %xmm3, 96(%rsp)
  286. movdqa %xmm4, 112(%rsp)
  287. movdqa %xmm5, 128(%rsp)
  288. movdqa %xmm6, 144(%rsp)
  289. movdqa %xmm7, 160(%rsp)
  290. jmp .Lret_from_save_sse
  291. .LUW9:
  292. .size ffi_closure_unix64,.-ffi_closure_unix64
  293. .section .eh_frame,"a",@progbits
  294. .Lframe1:
  295. .long .LECIE1-.LSCIE1 /* CIE Length */
  296. .LSCIE1:
  297. .long 0 /* CIE Identifier Tag */
  298. .byte 1 /* CIE Version */
  299. .ascii "zR\0" /* CIE Augmentation */
  300. .uleb128 1 /* CIE Code Alignment Factor */
  301. .sleb128 -8 /* CIE Data Alignment Factor */
  302. .byte 0x10 /* CIE RA Column */
  303. .uleb128 1 /* Augmentation size */
  304. .byte 0x1b /* FDE Encoding (pcrel sdata4) */
  305. .byte 0xc /* DW_CFA_def_cfa, %rsp offset 8 */
  306. .uleb128 7
  307. .uleb128 8
  308. .byte 0x80+16 /* DW_CFA_offset, %rip offset 1*-8 */
  309. .uleb128 1
  310. .align 8
  311. .LECIE1:
  312. .LSFDE1:
  313. .long .LEFDE1-.LASFDE1 /* FDE Length */
  314. .LASFDE1:
  315. .long .LASFDE1-.Lframe1 /* FDE CIE offset */
  316. .long .LUW0-. /* FDE initial location */
  317. .long .LUW4-.LUW0 /* FDE address range */
  318. .uleb128 0x0 /* Augmentation size */
  319. .byte 0x4 /* DW_CFA_advance_loc4 */
  320. .long .LUW1-.LUW0
  321. /* New stack frame based off rbp. This is a itty bit of unwind
  322. trickery in that the CFA *has* changed. There is no easy way
  323. to describe it correctly on entry to the function. Fortunately,
  324. it doesn't matter too much since at all points we can correctly
  325. unwind back to ffi_call. Note that the location to which we
  326. moved the return address is (the new) CFA-8, so from the
  327. perspective of the unwind info, it hasn't moved. */
  328. .byte 0xc /* DW_CFA_def_cfa, %rbp offset 32 */
  329. .uleb128 6
  330. .uleb128 32
  331. .byte 0x80+6 /* DW_CFA_offset, %rbp offset 2*-8 */
  332. .uleb128 2
  333. .byte 0xa /* DW_CFA_remember_state */
  334. .byte 0x4 /* DW_CFA_advance_loc4 */
  335. .long .LUW2-.LUW1
  336. .byte 0xc /* DW_CFA_def_cfa, %rsp offset 8 */
  337. .uleb128 7
  338. .uleb128 8
  339. .byte 0xc0+6 /* DW_CFA_restore, %rbp */
  340. .byte 0x4 /* DW_CFA_advance_loc4 */
  341. .long .LUW3-.LUW2
  342. .byte 0xb /* DW_CFA_restore_state */
  343. .align 8
  344. .LEFDE1:
  345. .LSFDE3:
  346. .long .LEFDE3-.LASFDE3 /* FDE Length */
  347. .LASFDE3:
  348. .long .LASFDE3-.Lframe1 /* FDE CIE offset */
  349. .long .LUW5-. /* FDE initial location */
  350. .long .LUW9-.LUW5 /* FDE address range */
  351. .uleb128 0x0 /* Augmentation size */
  352. .byte 0x4 /* DW_CFA_advance_loc4 */
  353. .long .LUW6-.LUW5
  354. .byte 0xe /* DW_CFA_def_cfa_offset */
  355. .uleb128 208
  356. .byte 0xa /* DW_CFA_remember_state */
  357. .byte 0x4 /* DW_CFA_advance_loc4 */
  358. .long .LUW7-.LUW6
  359. .byte 0xe /* DW_CFA_def_cfa_offset */
  360. .uleb128 8
  361. .byte 0x4 /* DW_CFA_advance_loc4 */
  362. .long .LUW8-.LUW7
  363. .byte 0xb /* DW_CFA_restore_state */
  364. .align 8
  365. .LEFDE3:
  366. #endif /* __x86_64__ */
  367. #if defined __ELF__ && defined __linux__
  368. .section .note.GNU-stack,"",@progbits
  369. #endif