
/gmp/mpn/sparc32/v9/mul_1.asm

https://bitbucket.org/pizzafactory/blackfin-toolchain
dnl  SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C Algorithm: We use two floating-point multiplies per limb product, with the
C invariant v operand split into two 16-bit pieces, and the u operand split
C into 32-bit pieces.  We convert the two 48-bit products and transfer them to
C the integer unit.
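C
C Spelled out in C, the per-limb step amounts to the following sketch
C (illustration only, not part of the build; assumes 32-bit limbs and a
C 64-bit uint64_t, with names of our own choosing):
C
C   uint64_t p16 = (uint64_t) u[i] * (v >> 16);      /* via fmuld + fdtox */
C   uint64_t p0  = (uint64_t) u[i] * (v & 0xffff);   /* via fmuld + fdtox */
C   uint64_t p   = p0 + (p16 << 16) + cy;
C   rp[i] = (uint32_t) p;
C   cy    = p >> 32;
C
C Each sub-product is at most 48 bits wide and hence exact in a double's
C 53-bit mantissa, which is what makes the floating-point multiplies safe.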
C                      cycles/limb
C UltraSPARC 1&2:          6.5
C UltraSPARC 3:             ?

C Possible optimizations:
C   1. Combine 32-bit memory operations into 64-bit operations.  Since we're
C      memory bandwidth limited, this could save 1.5 cycles/limb.
C   2. Unroll the inner loop.  Since we already use alternate temporary areas,
C      it is very straightforward to unroll, using an exit branch midways.
C      Unrolling would allow deeper scheduling which could improve speed for L2
C      cache case.
C   3. For mpn_mul_1: Use more alternating temp areas.  The std'es and ldx'es
C      aren't sufficiently apart-scheduled with just two temp areas.
C   4. Specialize for particular v values.  If its upper 16 bits are zero, we
C      could save many operations.

C INPUT PARAMETERS
C rp    i0
C up    i1
C n     i2
C v     i3
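C
C For reference, the C-level entry point this implements is
C
C   mp_limb_t mpn_mul_1 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v)
C
C which returns the carry-out limb (the most significant limb of the n+1 limb
C product); here that value is handed back in %o0.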
define(`FSIZE',224)

ASM_START()
PROLOGUE(mpn_mul_1)
        add     %sp, -FSIZE, %sp
        sethi   %hi(0xffff), %g1
        srl     %o3, 16, %g2
        or      %g1, %lo(0xffff), %g1
        and     %o3, %g1, %g1
        stx     %g1, [%sp+104]
        stx     %g2, [%sp+112]
        ldd     [%sp+104], %f6
        ldd     [%sp+112], %f8
        fxtod   %f6, %f6
        fxtod   %f8, %f8
        ld      [%sp+104], %f10         C zero f10
        mov     0, %g3                  C cy = 0

define(`fanop', `fitod %f18, %f0')      C A quasi nop running in the FA pipe

        add     %sp, 160, %o5           C point in scratch area
        and     %o5, -32, %o5           C align at 0 (mod 32) in scratch area

        subcc   %o2, 1, %o2
        ld      [%o1], %f11             C read up[i]
        add     %o1, 4, %o1             C up++
        bne,pt  %icc, .L_two_or_more
        fxtod   %f10, %f2

        fmuld   %f2, %f8, %f16
        fmuld   %f2, %f6, %f4
        fdtox   %f16, %f14
        fdtox   %f4, %f12
        std     %f14, [%o5+16]
        std     %f12, [%o5+24]
        ldx     [%o5+16], %g2           C p16
        ldx     [%o5+24], %g1           C p0
        b       .L1
        add     %o0, -16, %o0

        .align  16
.L_two_or_more:
        subcc   %o2, 1, %o2
        ld      [%o1], %f11             C read up[i]
        fmuld   %f2, %f8, %f16
        fmuld   %f2, %f6, %f4
        add     %o1, 4, %o1             C up++
        bne,pt  %icc, .L_three_or_more
        fxtod   %f10, %f2

        fdtox   %f16, %f14
        fdtox   %f4, %f12
        std     %f14, [%o5+16]
        fmuld   %f2, %f8, %f16
        std     %f12, [%o5+24]
        fmuld   %f2, %f6, %f4
        fdtox   %f16, %f14
        fdtox   %f4, %f12
        std     %f14, [%o5+0]
        std     %f12, [%o5+8]
        ldx     [%o5+16], %g2           C p16
        ldx     [%o5+24], %g1           C p0
        b       .L2
        add     %o0, -12, %o0

        .align  16
.L_three_or_more:
        subcc   %o2, 1, %o2
        ld      [%o1], %f11             C read up[i]
        fdtox   %f16, %f14
        fdtox   %f4, %f12
        std     %f14, [%o5+16]
        fmuld   %f2, %f8, %f16
        std     %f12, [%o5+24]
        fmuld   %f2, %f6, %f4
        add     %o1, 4, %o1             C up++
        bne,pt  %icc, .L_four_or_more
        fxtod   %f10, %f2

        fdtox   %f16, %f14
        fdtox   %f4, %f12
        std     %f14, [%o5+0]
        fmuld   %f2, %f8, %f16
        std     %f12, [%o5+8]
        fmuld   %f2, %f6, %f4
        fdtox   %f16, %f14
        ldx     [%o5+16], %g2           C p16
        fdtox   %f4, %f12
        ldx     [%o5+24], %g1           C p0
        std     %f14, [%o5+16]
        std     %f12, [%o5+24]
        b       .L3
        add     %o0, -8, %o0

        .align  16
.L_four_or_more:
        subcc   %o2, 1, %o2
        ld      [%o1], %f11             C read up[i]
        fdtox   %f16, %f14
        fdtox   %f4, %f12
        std     %f14, [%o5+0]
        fmuld   %f2, %f8, %f16
        std     %f12, [%o5+8]
        fmuld   %f2, %f6, %f4
        add     %o1, 4, %o1             C up++
        bne,pt  %icc, .L_five_or_more
        fxtod   %f10, %f2

        fdtox   %f16, %f14
        ldx     [%o5+16], %g2           C p16
        fdtox   %f4, %f12
        ldx     [%o5+24], %g1           C p0
        std     %f14, [%o5+16]
        fmuld   %f2, %f8, %f16
        std     %f12, [%o5+24]
        fmuld   %f2, %f6, %f4
        add     %o1, 4, %o1             C up++
        b       .L4
        add     %o0, -4, %o0

        .align  16
.L_five_or_more:
        subcc   %o2, 1, %o2
        ld      [%o1], %f11             C read up[i]
        fdtox   %f16, %f14
        ldx     [%o5+16], %g2           C p16
        fdtox   %f4, %f12
        ldx     [%o5+24], %g1           C p0
        std     %f14, [%o5+16]
        fmuld   %f2, %f8, %f16
        std     %f12, [%o5+24]
        fmuld   %f2, %f6, %f4
        add     %o1, 4, %o1             C up++
        bne,pt  %icc, .Loop
        fxtod   %f10, %f2
        b,a     .L5

C BEGIN MAIN LOOP
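C The -- N markers below appear to label the intended issue groups (roughly
C one group per cycle) of the software-pipelined loop; fanop fills an
C otherwise empty FA-pipe slot.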
        .align  16
C --  0
.Loop:  nop
        subcc   %o2, 1, %o2
        ld      [%o1], %f11             C read up[i]
        fdtox   %f16, %f14
C --  1
        sllx    %g2, 16, %g4            C (p16 << 16)
        add     %o0, 4, %o0             C rp++
        ldx     [%o5+0], %g2            C p16
        fdtox   %f4, %f12
C --  2
        nop
        add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
        ldx     [%o5+8], %g1            C p0
        fanop
C --  3
        nop
        add     %g3, %g4, %g4           C p += cy
        std     %f14, [%o5+0]
        fmuld   %f2, %f8, %f16
C --  4
        srlx    %g4, 32, %g3            C new cy
        add     %o1, 4, %o1             C up++
        std     %f12, [%o5+8]
        fmuld   %f2, %f6, %f4
C --  5
        xor     %o5, 16, %o5            C alternate scratch variables
        stw     %g4, [%o0-4]
        bne,pt  %icc, .Loop
        fxtod   %f10, %f2
C END MAIN LOOP

.L5:    fdtox   %f16, %f14
        sllx    %g2, 16, %g4            C (p16 << 16)
        ldx     [%o5+0], %g2            C p16
        fdtox   %f4, %f12
        add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
        ldx     [%o5+8], %g1            C p0
        add     %g4, %g3, %g4           C p += cy
        std     %f14, [%o5+0]
        fmuld   %f2, %f8, %f16
        std     %f12, [%o5+8]
        fmuld   %f2, %f6, %f4
        xor     %o5, 16, %o5
        stw     %g4, [%o0+0]
        srlx    %g4, 32, %g3            C new cy

.L4:    fdtox   %f16, %f14
        sllx    %g2, 16, %g4            C (p16 << 16)
        ldx     [%o5+0], %g2            C p16
        fdtox   %f4, %f12
        add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
        ldx     [%o5+8], %g1            C p0
        add     %g3, %g4, %g4           C p += cy
        std     %f14, [%o5+0]
        std     %f12, [%o5+8]
        xor     %o5, 16, %o5
        stw     %g4, [%o0+4]
        srlx    %g4, 32, %g3            C new cy

.L3:    sllx    %g2, 16, %g4            C (p16 << 16)
        ldx     [%o5+0], %g2            C p16
        add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
        ldx     [%o5+8], %g1            C p0
        add     %g3, %g4, %g4           C p += cy
        xor     %o5, 16, %o5
        stw     %g4, [%o0+8]
        srlx    %g4, 32, %g3            C new cy

.L2:    sllx    %g2, 16, %g4            C (p16 << 16)
        ldx     [%o5+0], %g2            C p16
        add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
        ldx     [%o5+8], %g1            C p0
        add     %g3, %g4, %g4           C p += cy
        stw     %g4, [%o0+12]
        srlx    %g4, 32, %g3            C new cy

.L1:    sllx    %g2, 16, %g4            C (p16 << 16)
        add     %g1, %g4, %g4           C p = p0 + (p16 << 16)
        add     %g3, %g4, %g4           C p += cy
        stw     %g4, [%o0+16]
        srlx    %g4, 32, %g3            C new cy

        mov     %g3, %o0
        retl
        sub     %sp, -FSIZE, %sp
EPILOGUE(mpn_mul_1)
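
As a quick usage sketch (not part of this file), the routine is reached through GMP's public mpn interface. The snippet below assumes gmp.h is installed and the program is linked against libgmp, e.g. cc demo.c -lgmp (demo.c being an illustrative file name):

    #include <stdio.h>
    #include <gmp.h>

    int
    main (void)
    {
      mp_limb_t up[3] = { 123456789, 987654321, 5 };  /* least significant limb first */
      mp_limb_t rp[3];
      mp_limb_t cy;

      /* rp[0..2] = up[0..2] * 1000003; the carry-out limb is returned.  */
      cy = mpn_mul_1 (rp, up, 3, (mp_limb_t) 1000003);

      printf ("carry limb = %lu\n", (unsigned long) cy);
      return 0;
    }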