/media/libvpx/vp8/encoder/arm/neon/fastfdct8x4_neon.asm

http://github.com/zpao/v8monkey · Assembly · 177 lines · 121 code · 27 blank · 29 comment · 0 complexity · ca6498f6323cdf89b24511ee86f01cdd MD5 · raw file

  1. ;
  2. ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
  3. ;
  4. ; Use of this source code is governed by a BSD-style license
  5. ; that can be found in the LICENSE file in the root of the source
  6. ; tree. An additional intellectual property rights grant can be found
  7. ; in the file PATENTS. All contributing project authors may
  8. ; be found in the AUTHORS file in the root of the source tree.
  9. ;
  10. EXPORT |vp8_fast_fdct8x4_neon| ; make the routine's entry symbol visible to the linker
  11. ARM ; assemble what follows as ARM (A32) instructions
  12. REQUIRE8 ; declare that this code requires 8-byte stack alignment
  13. PRESERVE8 ; declare that this code preserves 8-byte stack alignment
  14. AREA ||.text||, CODE, READONLY, ALIGN=2 ; read-only code section; ALIGN=2 requests 2^2 = 4-byte alignment
  15. ;void vp8_fast_fdct8x4_neon(short *input, short *output, int pitch);
  16. ;NOTE:
  17. ;The input is *src_diff. src_diff is calculated as:
  18. ;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in Subtract* function)
  19. ;In which *src_ptr and *pred_ptr both are unsigned char.
  20. ;Therefore, *src_diff should be in the range of [-255, 255].
  21. ;CAUTION:
  22. ;The input values of 25th block are set in vp8_build_dcblock function, which are out of [-255, 255].
  23. ;But, VP8 encoder only uses vp8_short_fdct4x4_c for 25th block, not vp8_fast_fdct4x4_c. That makes
  24. ;it ok for assuming *input in [-255, 255] in vp8_fast_fdct4x4_c, but not ok in vp8_short_fdct4x4_c.
  ;-----------------------------------------------------------------------
  ; void vp8_fast_fdct8x4_neon(short *input, short *output, int pitch);
  ;
  ; Forward DCT of two horizontally adjacent 4x4 blocks of 16-bit
  ; residuals, processed side by side (low d-register of each q register
  ; = left block, high d-register = right block).
  ;
  ; In:    r0 = input   four rows of eight 16-bit values are read
  ;        r1 = output  32 16-bit results are written contiguously:
  ;                     the 16 coefficients of the left block first,
  ;                     then the 16 coefficients of the right block
  ;        r2 = pitch   added to r0 after every row load, i.e. a stride
  ;                     in bytes
  ; Out:   nothing in registers
  ; Clobb: r0, r1, r12, d0, q1-q3, q8-q15
  ;        q4-q7 (d8-d15) are used as scratch but saved and restored
  ;        here, because AAPCS makes d8-d15 callee-saved.
  ; Stack: 64 bytes (vpush of d8-d15); sp stays 8-byte aligned.
  ;
  ; Multiplier lanes (see the ffdct8_coeff table):
  ;        d0[0] = 0xEC83 (x_c1), d0[1] = 0xB505 (x_c2), d0[2] = 0x61F8 (x_c3)
  ; vqdmulh.s16 yields (2*a*b)>>16 and the vshr #1 that follows turns it
  ; into (a*b)>>16. As signed 16-bit lanes x_c1 and x_c2 are x - 65536, so
  ; the product comes out as ((v*x)>>16) - v; that is why v is added back
  ; afterwards ("((v * x_c)>>16) + v" in the comments below).
  ;-----------------------------------------------------------------------
|vp8_fast_fdct8x4_neon| PROC
    vpush           {d8-d15}                ;q4-q7 are written below; preserve the callee-saved d8-d15

    vld1.16         {q1}, [r0], r2          ;load input row 0, then step r0 by pitch
    ldr             r12, _ffdct8_coeff_     ;r12 = address of the multiplier table
    vld1.16         {q2}, [r0], r2          ;row 1
    vld1.16         {q3}, [r0], r2          ;row 2
    vld1.16         {d0}, [r12]             ;d0 = multipliers
    vld1.16         {q4}, [r0], r2          ;row 3

    ;First for-loop
    ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[1], d6=ip[2], d8=ip[3]
    ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[1], d7=ip[2], d9=ip[3]
    vtrn.32         d2, d6
    vtrn.32         d3, d7
    vtrn.32         d4, d8
    vtrn.32         d5, d9
    vtrn.16         d2, d4
    vtrn.16         d3, d5
    vtrn.16         d6, d8
    vtrn.16         d7, d9

    ;left block -> q5/q6, right block -> q11/q12
    vadd.s16        d10, d2, d8             ;ip[0]+ip[3]
    vadd.s16        d11, d4, d6             ;ip[1]+ip[2]
    vsub.s16        d12, d4, d6             ;ip[1]-ip[2]
    vsub.s16        d13, d2, d8             ;ip[0]-ip[3]
    vadd.s16        d22, d3, d9             ;ip[0]+ip[3]
    vadd.s16        d23, d5, d7             ;ip[1]+ip[2]
    vsub.s16        d24, d5, d7             ;ip[1]-ip[2]
    vsub.s16        d25, d3, d9             ;ip[0]-ip[3]

    vshl.i16        q5, q5, #1              ;left:  a1, b1 (sums doubled)
    vshl.i16        q6, q6, #1              ;left:  c1, d1 (differences doubled)
    vshl.i16        q1, q11, #1             ;right: a1, b1 -> d2, d3
    vshl.i16        q2, q12, #1             ;right: c1, d1 -> d4, d5

    vadd.s16        d14, d10, d11           ;temp1 = a1 + b1
    vsub.s16        d15, d10, d11           ;temp2 = a1 - b1
    vadd.s16        d24, d2, d3             ;right: temp1 = a1 + b1
    vsub.s16        d25, d2, d3             ;right: temp2 = a1 - b1

    vqdmulh.s16     q8, q7, d0[1]           ;left:  temp1, temp2 times x_c2
    vqdmulh.s16     q13, q12, d0[1]         ;right: temp1, temp2 times x_c2
    vqdmulh.s16     q10, q6, d0[0]          ;left:  c1, d1 times x_c1
    vqdmulh.s16     q15, q2, d0[0]          ;right: c1, d1 times x_c1
    vqdmulh.s16     q9, q6, d0[2]           ;left:  c1, d1 times x_c3
    vqdmulh.s16     q14, q2, d0[2]          ;right: c1, d1 times x_c3

    vshr.s16        q8, q8, #1              ;undo the doubling done by vqdmulh
    vshr.s16        q13, q13, #1
    vshr.s16        q10, q10, #1
    vshr.s16        q15, q15, #1
    vshr.s16        q9, q9, #1              ;d18:temp1 = ( c1 * x_c3)>>16; d19:temp1 = (d1 * x_c3)>>16
    vshr.s16        q14, q14, #1            ;d28:temp1 = ( c1 * x_c3)>>16; d29:temp1 = (d1 * x_c3)>>16

    vadd.s16        q10, q6, q10            ;d20:temp2 = ((c1 * x_c1)>>16) + c1; d21:temp2 = ((d1 * x_c1)>>16) + d1
    vadd.s16        q15, q2, q15            ;d30:temp2 = ((c1 * x_c1)>>16) + c1; d31:temp2 = ((d1 * x_c1)>>16) + d1

    vadd.s16        d2, d14, d16            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d3, d24, d26            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d6, d15, d17            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d7, d25, d27            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d4, d18, d21            ;op[1] = temp1 + temp2 (plain add; original note: a saturating "q" form is not necessary)
    vadd.s16        d5, d28, d31            ;op[1] = temp1 + temp2 (plain add; original note: a saturating "q" form is not necessary)
    vsub.s16        d8, d19, d20            ;op[3] = temp1 - temp2
    vsub.s16        d9, d29, d30            ;op[3] = temp1 - temp2

    ;Second for-loop
    ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[4], d6=ip[8], d8=ip[12]
    ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[4], d7=ip[8], d9=ip[12]
    vtrn.32         d2, d6
    vtrn.32         d3, d7
    vtrn.32         d4, d8
    vtrn.32         d5, d9
    vtrn.16         d2, d4
    vtrn.16         d3, d5
    vtrn.16         d6, d8
    vtrn.16         d7, d9

    ;no doubling in this pass; a1/b1/c1/d1 below are the C-style variable names
    vadd.s16        d10, d2, d8             ;a1 = ip[0]+ip[12]
    vadd.s16        d11, d4, d6             ;b1 = ip[4]+ip[8]
    vsub.s16        d12, d4, d6             ;c1 = ip[4]-ip[8]
    vsub.s16        d13, d2, d8             ;d1 = ip[0]-ip[12]
    vadd.s16        d2, d3, d9              ;right: a1 = ip[0]+ip[12]
    vadd.s16        d4, d5, d7              ;right: b1 = ip[4]+ip[8]
    vsub.s16        d24, d5, d7             ;right: c1 = ip[4]-ip[8]
    vsub.s16        d25, d3, d9             ;right: d1 = ip[0]-ip[12]

    vadd.s16        d14, d10, d11           ;temp1 = a1 + b1
    vsub.s16        d15, d10, d11           ;temp2 = a1 - b1
    vadd.s16        d22, d2, d4             ;right: temp1 = a1 + b1
    vsub.s16        d23, d2, d4             ;right: temp2 = a1 - b1

    vqdmulh.s16     q8, q7, d0[1]           ;left:  temp1, temp2 times x_c2
    vqdmulh.s16     q13, q11, d0[1]         ;right: temp1, temp2 times x_c2
    vqdmulh.s16     q10, q6, d0[0]          ;left:  c1, d1 times x_c1
    vqdmulh.s16     q15, q12, d0[0]         ;right: c1, d1 times x_c1
    vqdmulh.s16     q9, q6, d0[2]           ;left:  c1, d1 times x_c3
    vqdmulh.s16     q14, q12, d0[2]         ;right: c1, d1 times x_c3

    vshr.s16        q8, q8, #1              ;undo the doubling done by vqdmulh
    vshr.s16        q13, q13, #1
    vshr.s16        q10, q10, #1
    vshr.s16        q15, q15, #1
    vshr.s16        q9, q9, #1              ;d18:temp1 = ( c1 * x_c3)>>16; d19:temp1 = (d1 * x_c3)>>16
    vshr.s16        q14, q14, #1            ;d28:temp1 = ( c1 * x_c3)>>16; d29:temp1 = (d1 * x_c3)>>16

    vadd.s16        q10, q6, q10            ;d20:temp2 = ((c1 * x_c1)>>16) + c1; d21:temp2 = ((d1 * x_c1)>>16) + d1
    vadd.s16        q15, q12, q15           ;d30:temp2 = ((c1 * x_c1)>>16) + c1; d31:temp2 = ((d1 * x_c1)>>16) + d1

    ;results are placed so that q1,q2 = left block and q3,q4 = right block
    vadd.s16        d2, d14, d16            ;a2 = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d6, d22, d26            ;a2 = ((temp1 * x_c2 )>>16) + temp1
    vadd.s16        d4, d15, d17            ;c2 = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d8, d23, d27            ;c2 = ((temp2 * x_c2 )>>16) + temp2
    vadd.s16        d3, d18, d21            ;b2 = temp1 + temp2 (plain add; original note: a saturating "q" form is not necessary)
    vadd.s16        d7, d28, d31            ;b2 = temp1 + temp2 (plain add; original note: a saturating "q" form is not necessary)
    vsub.s16        d5, d19, d20            ;d2 = temp1 - temp2
    vsub.s16        d9, d29, d30            ;d2 = temp1 - temp2

    ;final scaling: (x + (x < 0)) >> 1, i.e. halve while rounding toward zero
    vclt.s16        q5, q1, #0              ;lane = -1 (all ones) where the value is negative, else 0
    vclt.s16        q6, q2, #0
    vclt.s16        q7, q3, #0
    vclt.s16        q8, q4, #0

    vsub.s16        q1, q1, q5              ;subtracting -1 adds 1 to the negative lanes only
    vsub.s16        q2, q2, q6
    vsub.s16        q3, q3, q7
    vsub.s16        q4, q4, q8

    vshr.s16        q1, q1, #1
    vshr.s16        q2, q2, #1
    vshr.s16        q3, q3, #1
    vshr.s16        q4, q4, #1

    vst1.16         {q1, q2}, [r1]!         ;left block, 16 coefficients; r1 advances past them
    vst1.16         {q3, q4}, [r1]          ;right block, 16 coefficients

    vpop            {d8-d15}                ;restore the caller's d8-d15
    bx              lr

    ENDP
  142. ;-----------------
  143. _ffdct8_coeff_ ; literal word read by "ldr r12, _ffdct8_coeff_"; it holds the address of the table below
  144. DCD ffdct8_coeff
  145. ffdct8_coeff ; 8 bytes of 16-bit multipliers, loaded into d0 by "vld1.16 {d0}, [r12]"
  146. ; 60547 = 0xEC83 -> d0[0] (low halfword of the first word, assuming little-endian data layout)
  147. ; 46341 = 0xB505 -> d0[1] (high halfword of the first word)
  148. ; 25080 = 0x61F8 -> d0[2]; the last halfword (d0[3]) is zero and is not referenced by the routine
  149. DCD 0xB505EC83, 0x000061F8
  150. END ; end of the assembly source