/media/libvpx/vp8/common/arm/armv6/idct_v6.asm

http://github.com/zpao/v8monkey · Assembly · 345 lines · 243 code · 16 blank · 86 comment · 0 complexity · 6ed074f17982af8c0586a26c9ba09910 MD5 · raw file

  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. ; r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r14
  11. EXPORT |vp8_short_idct4x4llm_1_v6|
  12. EXPORT |vp8_short_idct4x4llm_v6|
  13. EXPORT |vp8_short_idct4x4llm_v6_scott|
  14. EXPORT |vp8_short_idct4x4llm_v6_dual|
  15. AREA |.text|, CODE, READONLY
  16. ;********************************************************************************
  17. ;* void short_idct4x4llm_1_v6(INT16 * input, INT16 * output, INT32 pitch)
  18. ;* r0 INT16 * input
  19. ;* r1 INT16 * output
  20. ;* r2 INT32 pitch
  21. ;* bench: 3/5
  22. ;********************************************************************************
  23. |vp8_short_idct4x4llm_1_v6| PROC ; cycles in out pit
  24. ;
  25. ldrsh r0, [r0] ; load input[0] 1, r0 un 2
  26. add r0, r0, #4 ; 1 +4
  27. stmdb sp!, {r4, r5, lr} ; make room for wide writes 1 backup
  28. mov r0, r0, asr #3 ; (input[0] + 4) >> 3 1, r0 req`d ^1 >> 3
  29. pkhbt r4, r0, r0, lsl #16 ; pack r0 into r4 1, r0 req`d ^1 pack
  30. mov r5, r4 ; expand expand
  31. strd r4, [r1], r2 ; *output = r0, post inc 1
  32. strd r4, [r1], r2 ; 1
  33. strd r4, [r1], r2 ; 1
  34. strd r4, [r1] ; 1
  35. ;
  36. ldmia sp!, {r4, r5, pc} ; replace vars, return restore
  37. ENDP ; |vp8_short_idct4x4llm_1_v6|
  38. ;********************************************************************************
  39. ;********************************************************************************
  40. ;********************************************************************************
  41. ;********************************************************************************
  42. ;* void short_idct4x4llm_v6(INT16 * input, INT16 * output, INT32 pitch)
  43. ;* r0 INT16 * input
  44. ;* r1 INT16 * output
  45. ;* r2 INT32 pitch
  46. ;* bench:
  47. ;********************************************************************************
  48. |vp8_short_idct4x4llm_v6| PROC ; cycles in out pit
  49. ;
  50. stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
  51. ;
  52. mov r4, #0x00004E00 ; 1 cst
  53. orr r4, r4, #0x0000007B ; cospi8sqrt2minus1
  54. mov r5, #0x00008A00 ; 1 cst
  55. orr r5, r5, #0x0000008C ; sinpi8sqrt2
  56. ;
  57. mov r6, #4 ; i=4 1 i
  58. loop1 ;
  59. ldrsh r12, [r0, #8] ; input[4] 1, r12 unavail 2 [4]
  60. ldrsh r3, [r0, #24] ; input[12] 1, r3 unavail 2 [12]
  61. ldrsh r8, [r0, #16] ; input[8] 1, r8 unavail 2 [8]
  62. ldrsh r7, [r0], #0x2 ; input[0] 1, r7 unavail 2 ++ [0]
  63. smulwb r10, r5, r12 ; ([4] * sinpi8sqrt2) >> 16 1, r10 un 2, r12/r5 ^1 t1
  64. smulwb r11, r4, r3 ; ([12] * cospi8sqrt2minus1) >> 16 1, r11 un 2, r3/r4 ^1 t2
  65. add r9, r7, r8 ; a1 = [0] + [8] 1 a1
  66. sub r7, r7, r8 ; b1 = [0] - [8] 1 b1
  67. add r11, r3, r11 ; temp2 1
  68. rsb r11, r11, r10 ; c1 = temp1 - temp2 1 c1
  69. smulwb r3, r5, r3 ; ([12] * sinpi8sqrt2) >> 16 1, r3 un 2, r3/r5 ^ 1 t2
  70. smulwb r10, r4, r12 ; ([4] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r12/r4 ^1 t1
  71. add r8, r7, r11 ; b1 + c1 1 b+c
  72. strh r8, [r1, r2] ; out[pitch] = b1+c1 1
  73. sub r7, r7, r11 ; b1 - c1 1 b-c
  74. add r10, r12, r10 ; temp1 1
  75. add r3, r10, r3 ; d1 = temp1 + temp2 1 d1
  76. add r10, r9, r3 ; a1 + d1 1 a+d
  77. sub r3, r9, r3 ; a1 - d1 1 a-d
  78. add r8, r2, r2 ; pitch * 2 1 p*2
  79. strh r7, [r1, r8] ; out[pitch*2] = b1-c1 1
  80. add r7, r2, r2, lsl #1 ; pitch * 3 1 p*3
  81. strh r3, [r1, r7] ; out[pitch*3] = a1-d1 1
  82. subs r6, r6, #1 ; i-- 1 --
  83. strh r10, [r1], #0x2 ; out[0] = a1+d1 1 ++
  84. bne loop1 ; if i>0, continue
  85. ;
  86. sub r1, r1, #8 ; set up out for next loop 1 -4
  87. ; for this iteration, input=prev output
  88. mov r6, #4 ; i=4 1 i
  89. ; b returnfull
  90. loop2 ;
  91. ldrsh r11, [r1, #2] ; input[1] 1, r11 un 2 [1]
  92. ldrsh r8, [r1, #6] ; input[3] 1, r8 un 2 [3]
  93. ldrsh r3, [r1, #4] ; input[2] 1, r3 un 2 [2]
  94. ldrsh r0, [r1] ; input[0] 1, r0 un 2 [0]
  95. smulwb r9, r5, r11 ; ([1] * sinpi8sqrt2) >> 16 1, r9 un 2, r5/r11 ^1 t1
  96. smulwb r10, r4, r8 ; ([3] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r8 ^1 t2
  97. add r7, r0, r3 ; a1 = [0] + [2] 1 a1
  98. sub r0, r0, r3 ; b1 = [0] - [2] 1 b1
  99. add r10, r8, r10 ; temp2 1
  100. rsb r9, r10, r9 ; c1 = temp1 - temp2 1 c1
  101. smulwb r8, r5, r8 ; ([3] * sinpi8sqrt2) >> 16 1, r8 un 2, r5/r8 ^1 t2
  102. smulwb r10, r4, r11 ; ([1] * cospi8sqrt2minus1) >> 16 1, r10 un 2, r4/r11 ^1 t1
  103. add r3, r0, r9 ; b1+c1 1 b+c
  104. add r3, r3, #4 ; b1+c1+4 1 +4
  105. add r10, r11, r10 ; temp1 1
  106. mov r3, r3, asr #3 ; b1+c1+4 >> 3 1, r3 ^1 >>3
  107. strh r3, [r1, #2] ; out[1] = b1+c1 1
  108. add r10, r10, r8 ; d1 = temp1 + temp2 1 d1
  109. add r3, r7, r10 ; a1+d1 1 a+d
  110. add r3, r3, #4 ; a1+d1+4 1 +4
  111. sub r7, r7, r10 ; a1-d1 1 a-d
  112. add r7, r7, #4 ; a1-d1+4 1 +4
  113. mov r3, r3, asr #3 ; a1+d1+4 >> 3 1, r3 ^1 >>3
  114. mov r7, r7, asr #3 ; a1-d1+4 >> 3 1, r7 ^1 >>3
  115. strh r7, [r1, #6] ; out[3] = a1-d1 1
  116. sub r0, r0, r9 ; b1-c1 1 b-c
  117. add r0, r0, #4 ; b1-c1+4 1 +4
  118. subs r6, r6, #1 ; i-- 1 --
  119. mov r0, r0, asr #3 ; b1-c1+4 >> 3 1, r0 ^1 >>3
  120. strh r0, [r1, #4] ; out[2] = b1-c1 1
  121. strh r3, [r1], r2 ; out[0] = a1+d1 1
  122. ; add r1, r1, r2 ; out += pitch 1 ++
  123. bne loop2 ; if i>0, continue
  124. returnfull ;
  125. ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
  126. ENDP
  127. ;********************************************************************************
  128. ;********************************************************************************
  129. ;********************************************************************************
  130. ;********************************************************************************
  131. ;* void short_idct4x4llm_v6_scott(INT16 * input, INT16 * output, INT32 pitch)
  132. ;* r0 INT16 * input
  133. ;* r1 INT16 * output
  134. ;* r2 INT32 pitch
  135. ;* bench:
  136. ;********************************************************************************
  137. |vp8_short_idct4x4llm_v6_scott| PROC ; cycles in out pit
  138. ; mov r0, #0 ;
  139. ; ldr r0, [r0] ;
  140. stmdb sp!, {r4 - r11, lr} ; backup registers 1 backup
  141. ;
  142. mov r3, #0x00004E00 ; cos
  143. orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
  144. mov r4, #0x00008A00 ; sin
  145. orr r4, r4, #0x0000008C ; sinpi8sqrt2
  146. ;
  147. mov r5, #0x2 ; i i
  148. ;
  149. short_idct4x4llm_v6_scott_loop1 ;
  150. ldr r10, [r0, #(4*2)] ; i5 | i4 5,4
  151. ldr r11, [r0, #(12*2)] ; i13 | i12 13,12
  152. ;
  153. smulwb r6, r4, r10 ; ((ip[4] * sinpi8sqrt2) >> 16) lt1
  154. smulwb r7, r3, r11 ; ((ip[12] * cospi8sqrt2minus1) >> 16) lt2
  155. ;
  156. smulwb r12, r3, r10 ; ((ip[4] * cospi8sqrt2misu1) >> 16) l2t2
  157. smulwb r14, r4, r11 ; ((ip[12] * sinpi8sqrt2) >> 16) l2t1
  158. ;
  159. add r6, r6, r7 ; partial c1 lt1-lt2
  160. add r12, r12, r14 ; partial d1 l2t2+l2t1
  161. ;
  162. smulwt r14, r4, r10 ; ((ip[5] * sinpi8sqrt2) >> 16) ht1
  163. smulwt r7, r3, r11 ; ((ip[13] * cospi8sqrt2minus1) >> 16) ht2
  164. ;
  165. smulwt r8, r3, r10 ; ((ip[5] * cospi8sqrt2minus1) >> 16) h2t1
  166. smulwt r9, r4, r11 ; ((ip[13] * sinpi8sqrt2) >> 16) h2t2
  167. ;
  168. add r7, r14, r7 ; partial c1_2 ht1+ht2
  169. sub r8, r8, r9 ; partial d1_2 h2t1-h2t2
  170. ;
  171. pkhbt r6, r6, r7, lsl #16 ; partial c1_2 | partial c1_1 pack
  172. pkhbt r12, r12, r8, lsl #16 ; partial d1_2 | partial d1_1 pack
  173. ;
  174. usub16 r6, r6, r10 ; c1_2 | c1_1 c
  175. uadd16 r12, r12, r11 ; d1_2 | d1_1 d
  176. ;
  177. ldr r10, [r0, #0] ; i1 | i0 1,0
  178. ldr r11, [r0, #(8*2)] ; i9 | i10 9,10
  179. ;
  180. ;;;;;; add r0, r0, #0x4 ; +4
  181. ;;;;;; add r1, r1, #0x4 ; +4
  182. ;
  183. uadd16 r8, r10, r11 ; i1 + i9 | i0 + i8 aka a1 a
  184. usub16 r9, r10, r11 ; i1 - i9 | i0 - i8 aka b1 b
  185. ;
  186. uadd16 r7, r8, r12 ; a1 + d1 pair a+d
  187. usub16 r14, r8, r12 ; a1 - d1 pair a-d
  188. ;
  189. str r7, [r1] ; op[0] = a1 + d1
  190. str r14, [r1, r2] ; op[pitch*3] = a1 - d1
  191. ;
  192. add r0, r0, #0x4 ; op[pitch] = b1 + c1 ++
  193. add r1, r1, #0x4 ; op[pitch*2] = b1 - c1 ++
  194. ;
  195. subs r5, r5, #0x1 ; --
  196. bne short_idct4x4llm_v6_scott_loop1 ;
  197. ;
  198. sub r1, r1, #16 ; reset output ptr
  199. mov r5, #0x4 ;
  200. mov r0, r1 ; input = output
  201. ;
  202. short_idct4x4llm_v6_scott_loop2 ;
  203. ;
  204. subs r5, r5, #0x1 ;
  205. bne short_idct4x4llm_v6_scott_loop2 ;
  206. ;
  207. ldmia sp!, {r4 - r11, pc} ;
  208. ENDP ;
  209. ;
  210. ;********************************************************************************
  211. ;********************************************************************************
  212. ;********************************************************************************
  213. ;********************************************************************************
  214. ;* void short_idct4x4llm_v6_dual(INT16 * input, INT16 * output, INT32 pitch)
  215. ;* r0 INT16 * input
  216. ;* r1 INT16 * output
  217. ;* r2 INT32 pitch
  218. ;* bench:
  219. ;********************************************************************************
  220. |vp8_short_idct4x4llm_v6_dual| PROC ; cycles in out pit
  221. ;
  222. stmdb sp!, {r4-r11, lr} ; backup registers 1 backup
  223. mov r3, #0x00004E00 ; cos
  224. orr r3, r3, #0x0000007B ; cospi8sqrt2minus1
  225. mov r4, #0x00008A00 ; sin
  226. orr r4, r4, #0x0000008C ; sinpi8sqrt2
  227. mov r5, #0x2 ; i=2 i
  228. loop1_dual
  229. ldr r6, [r0, #(4*2)] ; i5 | i4 5|4
  230. ldr r12, [r0, #(12*2)] ; i13 | i12 13|12
  231. ldr r14, [r0, #(8*2)] ; i9 | i8 9|8
  232. smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
  233. smulwb r7, r3, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 4c
  234. smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
  235. smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 4s
  236. pkhbt r7, r7, r9, lsl #16 ; 5c | 4c
  237. smulwt r11, r3, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 13c
  238. pkhbt r8, r8, r10, lsl #16 ; 5s | 4s
  239. uadd16 r6, r6, r7 ; 5c+5 | 4c+4
  240. smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 13s
  241. smulwb r9, r3, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 12c
  242. smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 12s
  243. subs r5, r5, #0x1 ; i-- --
  244. pkhbt r9, r9, r11, lsl #16 ; 13c | 12c
  245. ldr r11, [r0], #0x4 ; i1 | i0 ++ 1|0
  246. pkhbt r10, r10, r7, lsl #16 ; 13s | 12s
  247. uadd16 r7, r12, r9 ; 13c+13 | 12c+12
  248. usub16 r7, r8, r7 ; c c
  249. uadd16 r6, r6, r10 ; d d
  250. uadd16 r10, r11, r14 ; a a
  251. usub16 r8, r11, r14 ; b b
  252. uadd16 r9, r10, r6 ; a+d a+d
  253. usub16 r10, r10, r6 ; a-d a-d
  254. uadd16 r6, r8, r7 ; b+c b+c
  255. usub16 r7, r8, r7 ; b-c b-c
  256. str r6, [r1, r2] ; o5 | o4
  257. add r6, r2, r2 ; pitch * 2 p2
  258. str r7, [r1, r6] ; o9 | o8
  259. add r6, r6, r2 ; pitch * 3 p3
  260. str r10, [r1, r6] ; o13 | o12
  261. str r9, [r1], #0x4 ; o1 | o0 ++
  262. bne loop1_dual ;
  263. mov r5, #0x2 ; i=2 i
  264. sub r0, r1, #8 ; reset input/output i/o
  265. loop2_dual
  266. ldr r6, [r0, r2] ; i5 | i4 5|4
  267. ldr r1, [r0] ; i1 | i0 1|0
  268. ldr r12, [r0, #0x4] ; i3 | i2 3|2
  269. add r14, r2, #0x4 ; pitch + 2 p+2
  270. ldr r14, [r0, r14] ; i7 | i6 7|6
  271. smulwt r9, r3, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 5c
  272. smulwt r7, r3, r1 ; (ip[1] * cospi8sqrt2minus1) >> 16 1c
  273. smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 5s
  274. smulwt r8, r4, r1 ; (ip[1] * sinpi8sqrt2) >> 16 1s
  275. pkhbt r11, r6, r1, lsl #16 ; i0 | i4 0|4
  276. pkhbt r7, r9, r7, lsl #16 ; 1c | 5c
  277. pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 © tc1
  278. pkhtb r1, r1, r6, asr #16 ; i1 | i5 1|5
  279. uadd16 r1, r7, r1 ; 1c+1 | 5c+5 = temp2 (d) td2
  280. pkhbt r9, r14, r12, lsl #16 ; i2 | i6 2|6
  281. uadd16 r10, r11, r9 ; a a
  282. usub16 r9, r11, r9 ; b b
  283. pkhtb r6, r12, r14, asr #16 ; i3 | i7 3|7
  284. subs r5, r5, #0x1 ; i-- --
  285. smulwt r7, r3, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 3c
  286. smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 3s
  287. smulwb r12, r3, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 7c
  288. smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 7s
  289. pkhbt r7, r12, r7, lsl #16 ; 3c | 7c
  290. pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 (d) td1
  291. uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 (c) tc2
  292. usub16 r12, r8, r6 ; c (o1 | o5) c
  293. uadd16 r6, r11, r1 ; d (o3 | o7) d
  294. uadd16 r7, r10, r6 ; a+d a+d
  295. mov r8, #0x4 ; set up 4's 4
  296. orr r8, r8, #0x40000 ; 4|4
  297. usub16 r6, r10, r6 ; a-d a-d
  298. uadd16 r6, r6, r8 ; a-d+4 3|7
  299. uadd16 r7, r7, r8 ; a+d+4 0|4
  300. uadd16 r10, r9, r12 ; b+c b+c
  301. usub16 r1, r9, r12 ; b-c b-c
  302. uadd16 r10, r10, r8 ; b+c+4 1|5
  303. uadd16 r1, r1, r8 ; b-c+4 2|6
  304. mov r8, r10, asr #19 ; o1 >> 3
  305. strh r8, [r0, #2] ; o1
  306. mov r8, r1, asr #19 ; o2 >> 3
  307. strh r8, [r0, #4] ; o2
  308. mov r8, r6, asr #19 ; o3 >> 3
  309. strh r8, [r0, #6] ; o3
  310. mov r8, r7, asr #19 ; o0 >> 3
  311. strh r8, [r0], r2 ; o0 +p
  312. sxth r10, r10 ;
  313. mov r8, r10, asr #3 ; o5 >> 3
  314. strh r8, [r0, #2] ; o5
  315. sxth r1, r1 ;
  316. mov r8, r1, asr #3 ; o6 >> 3
  317. strh r8, [r0, #4] ; o6
  318. sxth r6, r6 ;
  319. mov r8, r6, asr #3 ; o7 >> 3
  320. strh r8, [r0, #6] ; o7
  321. sxth r7, r7 ;
  322. mov r8, r7, asr #3 ; o4 >> 3
  323. strh r8, [r0], r2 ; o4 +p
  324. ;;;;; subs r5, r5, #0x1 ; i-- --
  325. bne loop2_dual ;
  326. ;
  327. ldmia sp!, {r4 - r11, pc} ; replace vars, return restore
  328. ENDP
  329. END