/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s

https://bitbucket.org/aways/android_frameworks_av · Assembly · 480 lines · 219 code · 169 blank · 92 comment · 0 complexity · a895c6da856e2c9b3cffe107e3db27b8 MD5 · raw file

  1. ;//
  2. ;// (c) Copyright 2007 ARM Limited. All Rights Reserved.
  3. ;//
  4. ;// Description:
  5. ;// H.264 inverse quantize and transform module
  6. ;//
  7. ;//
  8. ;// Include standard headers
  9. INCLUDE omxtypes_s.h
  10. INCLUDE armCOMM_s.h
  11. ;// Import symbols required from other files
  12. ;// (For example tables)
  13. IMPORT armVCM4P10_UnpackBlock4x4
  14. IMPORT armVCM4P10_TransformResidual4x4
  15. IMPORT armVCM4P10_QPDivTable
  16. IMPORT armVCM4P10_VMatrixU16
  17. IMPORT armVCM4P10_QPModuloTable
  18. M_VARIANTS ARM1136JS, ARM1136JS_U
  19. ;// Set debugging level
  20. ;//DEBUG_ON SETL {TRUE}
  21. ;// Static Function: armVCM4P10_DequantLumaAC4x4 - multiplies each of the 16 halfwords at pSrcDst, in place, by a multiplier read from armVCM4P10_VMatrixU16 and pre-shifted left by 'shift'
  22. ;// Guarding implementation by the processor name: this copy is assembled only when ARM1136JS is set; it fetches the three multipliers with halfword loads
  23. IF ARM1136JS
  24. ;// Input Registers
  25. pSrcDst RN 0 ;// r0: base address of the 8 words (16 halfwords) that are loaded, scaled and stored back
  26. QP RN 1 ;// r1: byte index into armVCM4P10_QPModuloTable and armVCM4P10_QPDivTable
  27. ;// Output Registers: none are declared; the results are written back through pSrcDst
  28. ;// Local Scratch Registers. Several names below share one physical register, so an alias is only valid once the previous owner of that register has had its last use.
  29. pQPdiv RN 4 ;// r4: address of armVCM4P10_QPDivTable (register reused as rowLuma23)
  30. pQPmod RN 5 ;// r5: address of armVCM4P10_QPModuloTable (register reused as SrcDst00)
  31. pVRow RN 2 ;// r2: address of armVCM4P10_VMatrixU16, advanced to the selected row (register reused as temp1)
  32. QPmod RN 6 ;// r6: byte offset of the selected row inside armVCM4P10_VMatrixU16 (register reused as SrcDst02)
  33. shift RN 3 ;// r3: left-shift amount applied to the multipliers (register reused as temp2)
  34. rowLuma01 RN 1 ;// r1: multipliers a (bottom halfword) and b (top halfword); overwrites QP
  35. rowLuma23 RN 4 ;// r4: multiplier c in the bottom halfword
  36. SrcDst00 RN 5 ;// r5: 1st word of the block (pSrcDst[0], pSrcDst[1])
  37. SrcDst02 RN 6 ;// r6: 2nd word of the block (pSrcDst[2], pSrcDst[3])
  38. SrcDst10 RN 7 ;// r7: 3rd word of the block (pSrcDst[4], pSrcDst[5])
  39. SrcDst12 RN 8 ;// r8: 4th word of the block (pSrcDst[6], pSrcDst[7])
  40. SrcDst20 RN 9 ;// r9: 5th word of the block (pSrcDst[8], pSrcDst[9])
  41. SrcDst22 RN 10 ;// r10: 6th word of the block (pSrcDst[10], pSrcDst[11])
  42. SrcDst30 RN 11 ;// r11: 7th word of the block (pSrcDst[12], pSrcDst[13])
  43. SrcDst32 RN 12 ;// r12: 8th word of the block (pSrcDst[14], pSrcDst[15])
  44. temp1 RN 2 ;// r2: product waiting to be packed into a top halfword
  45. temp2 RN 3 ;// r3: product waiting to be packed into a top halfword
  46. temp3 RN 14 ;// r14 (lr) used as scratch; NOTE(review): presumably saved/restored by M_START/M_END - confirm in armCOMM_s.h
  47. ;// Allocate stack memory required by the function (nothing is allocated in this function)
  48. ;// Write function header
  49. M_START armVCM4P10_DequantLumaAC4x4,r11 ;// NOTE(review): the second argument presumably names the highest callee-saved register to preserve - confirm in armCOMM_s.h
  50. LDR pQPmod,=armVCM4P10_QPModuloTable ;// Address of the table indexed by QP that yields the row offset
  51. LDR pQPdiv,=armVCM4P10_QPDivTable ;// Address of the table indexed by QP that yields the shift
  52. LDR pVRow,=armVCM4P10_VMatrixU16 ;// Address of the halfword multiplier table
  53. LDRSB QPmod,[pQPmod,QP] ;// QPmod = (QP%6) * 6, used below as a byte offset into armVCM4P10_VMatrixU16
  54. LDRSB shift,[pQPdiv,QP] ;// Shift = QP / 6
  55. LDRH rowLuma01,[pVRow,QPmod]! ;// rowLuma01 = [00|0a]; the writeback leaves pVRow pointing at the selected row
  56. LDRH temp3,[pVRow,#2] ;// temp3 = [00|0b]
  57. LDRH rowLuma23,[pVRow,#4] ;// rowLuma23 = [00|0c]
  58. ORR rowLuma01,rowLuma01,temp3,LSL #16 ;// rowLuma01 = [0b|0a]
  59. ;// Load all the 16 'src' values
  60. LDMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32} ;// r5-r12 = the 8 words at pSrcDst; no writeback, so pSrcDst still addresses the block for the final store
  61. ;//*********************************************************************************************
  62. ;//
  63. ;// 'Shift' ranges between [0,8]
  64. ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
  65. ;//
  66. ;//*********************************************************************************************
  67. LSL rowLuma01,rowLuma01,shift ;// Both packed multipliers a and b are shifted by this one instruction
  68. LSL rowLuma23,rowLuma23,shift ;// Last use of 'shift'; r3 is free to become temp2 from here on
  69. ;//**********************************************************************************************
  70. ;//
  71. ;// The idea is to unroll the loop completely
  72. ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
  73. ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
  74. ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
  75. ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
  76. ;// We first calculate (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift), which fits into 16 bits (above)
  77. ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
  78. ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
  79. ;//
  80. ;// We then pack each pair of 16-bit multiplication results into a word and store all eight words in one go
  81. ;//
  82. ;//**********************************************************************************************
  83. ;// Row 1 (multipliers used: c for the top halfwords, a for the bottom halfwords)
  84. SMULTB temp1,SrcDst00,rowLuma23 ;// pSrcDst[1] * (pVRow[2]<<Shift)
  85. SMULBB SrcDst00,SrcDst00,rowLuma01 ;// pSrcDst[0] * (pVRow[0]<<Shift)
  86. SMULTB temp2,SrcDst02,rowLuma23 ;// pSrcDst[3] * (pVRow[2]<<Shift)
  87. SMULBB SrcDst02,SrcDst02,rowLuma01 ;// pSrcDst[2] * (pVRow[0]<<Shift)
  88. PKHBT SrcDst00,SrcDst00,temp1,LSL #16 ;// Pack the first two product values (low 16 bits of each product are kept)
  89. ;// Row 2 (multipliers used: b for the top halfwords, c for the bottom halfwords)
  90. SMULTT temp1,SrcDst10,rowLuma01 ;// pSrcDst[5] * (pVRow[1]<<Shift)
  91. SMULBB SrcDst10,SrcDst10,rowLuma23 ;// pSrcDst[4] * (pVRow[2]<<Shift)
  92. PKHBT SrcDst02,SrcDst02,temp2,LSL #16 ;// Pack the next two product values
  93. SMULTT temp2,SrcDst12,rowLuma01 ;// pSrcDst[7] * (pVRow[1]<<Shift)
  94. SMULBB SrcDst12,SrcDst12,rowLuma23 ;// pSrcDst[6] * (pVRow[2]<<Shift)
  95. PKHBT SrcDst10,SrcDst10,temp1,LSL #16 ;// Pack the next two product values
  96. ;// Row 3 (same multiplier pattern as row 1)
  97. SMULTB temp1,SrcDst20,rowLuma23 ;// pSrcDst[9] * (pVRow[2]<<Shift)
  98. SMULBB SrcDst20,SrcDst20,rowLuma01 ;// pSrcDst[8] * (pVRow[0]<<Shift)
  99. PKHBT SrcDst12,SrcDst12,temp2,LSL #16 ;// Pack the next two product values
  100. SMULTB temp2,SrcDst22,rowLuma23 ;// pSrcDst[11] * (pVRow[2]<<Shift)
  101. SMULBB SrcDst22,SrcDst22,rowLuma01 ;// pSrcDst[10] * (pVRow[0]<<Shift)
  102. PKHBT SrcDst20,SrcDst20,temp1,LSL #16 ;// Pack the next two product values
  103. ;// Row 4 (same multiplier pattern as row 2; temp3 is needed because temp2 still holds a row 3 product)
  104. SMULTT temp1,SrcDst30,rowLuma01 ;// pSrcDst[13] * (pVRow[1]<<Shift)
  105. SMULBB SrcDst30,SrcDst30,rowLuma23 ;// pSrcDst[12] * (pVRow[2]<<Shift)
  106. SMULTT temp3,SrcDst32,rowLuma01 ;// pSrcDst[15] * (pVRow[1]<<Shift)
  107. SMULBB SrcDst32,SrcDst32,rowLuma23 ;// pSrcDst[14] * (pVRow[2]<<Shift)
  108. PKHBT SrcDst22,SrcDst22,temp2,LSL #16 ;// Pack the remaining product values
  109. PKHBT SrcDst30,SrcDst30,temp1,LSL #16 ;// Pack the products for pSrcDst[12] and pSrcDst[13]
  110. PKHBT SrcDst32,SrcDst32,temp3,LSL #16 ;// Pack the products for pSrcDst[14] and pSrcDst[15]
  111. STMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32} ;// Write all 16 scaled values back over the input block
  112. ;// Set return value (no return value is set by this function)
  113. ;// Write function tail
  114. M_END
  115. ENDIF ;//ARM1136JS
  116. ;// Guarding implementation by the processor name: this copy of armVCM4P10_DequantLumaAC4x4 is assembled only when ARM1136JS_U is set; it fetches the multipliers with two word loads instead of three halfword loads
  117. IF ARM1136JS_U
  118. ;// Input Registers
  119. pSrcDst RN 0 ;// r0: base address of the 8 words (16 halfwords) that are loaded, scaled and stored back
  120. QP RN 1 ;// r1: byte index into armVCM4P10_QPModuloTable and armVCM4P10_QPDivTable
  121. ;// Output Registers: none are declared; the results are written back through pSrcDst
  122. ;// Local Scratch Registers. Several names below share one physical register, so an alias is only valid once the previous owner of that register has had its last use.
  123. pQPdiv RN 4 ;// r4: address of armVCM4P10_QPDivTable (register reused as rowLuma23)
  124. pQPmod RN 5 ;// r5: address of armVCM4P10_QPModuloTable (register reused as SrcDst00)
  125. pVRow RN 2 ;// r2: address of armVCM4P10_VMatrixU16, advanced to the selected row (register reused as temp1)
  126. QPmod RN 6 ;// r6: byte offset of the selected row inside armVCM4P10_VMatrixU16 (register reused as SrcDst02)
  127. shift RN 3 ;// r3: left-shift amount applied to the multipliers (register reused as temp2)
  128. rowLuma01 RN 1 ;// r1: multipliers a (bottom halfword) and b (top halfword); overwrites QP
  129. rowLuma23 RN 4 ;// r4: multiplier c in the bottom halfword; the top halfword is loaded but never read by the multiplies
  130. SrcDst00 RN 5 ;// r5: 1st word of the block (pSrcDst[0], pSrcDst[1])
  131. SrcDst02 RN 6 ;// r6: 2nd word of the block (pSrcDst[2], pSrcDst[3])
  132. SrcDst10 RN 7 ;// r7: 3rd word of the block (pSrcDst[4], pSrcDst[5])
  133. SrcDst12 RN 8 ;// r8: 4th word of the block (pSrcDst[6], pSrcDst[7])
  134. SrcDst20 RN 9 ;// r9: 5th word of the block (pSrcDst[8], pSrcDst[9])
  135. SrcDst22 RN 10 ;// r10: 6th word of the block (pSrcDst[10], pSrcDst[11])
  136. SrcDst30 RN 11 ;// r11: 7th word of the block (pSrcDst[12], pSrcDst[13])
  137. SrcDst32 RN 12 ;// r12: 8th word of the block (pSrcDst[14], pSrcDst[15])
  138. temp1 RN 2 ;// r2: product waiting to be packed into a top halfword
  139. temp2 RN 3 ;// r3: product waiting to be packed into a top halfword
  140. temp3 RN 14 ;// r14 (lr) used as scratch; NOTE(review): presumably saved/restored by M_START/M_END - confirm in armCOMM_s.h
  141. ;// Allocate stack memory required by the function (nothing is allocated in this function)
  142. ;// Write function header
  143. M_START armVCM4P10_DequantLumaAC4x4,r11 ;// NOTE(review): the second argument presumably names the highest callee-saved register to preserve - confirm in armCOMM_s.h
  144. LDR pQPmod,=armVCM4P10_QPModuloTable ;// Address of the table indexed by QP that yields the row offset
  145. LDR pQPdiv,=armVCM4P10_QPDivTable ;// Address of the table indexed by QP that yields the shift
  146. LDR pVRow,=armVCM4P10_VMatrixU16 ;// Address of the halfword multiplier table
  147. LDRSB QPmod,[pQPmod,QP] ;// QPmod = (QP%6) * 6, used below as a byte offset into armVCM4P10_VMatrixU16
  148. LDRSB shift,[pQPdiv,QP] ;// Shift = QP / 6
  149. LDR rowLuma01,[pVRow,QPmod]! ;// rowLuma01 = [0b|0a]; word load at an offset that is a multiple of 6 - NOTE(review): presumably relies on unaligned word loads, hence the _U variant - confirm
  150. LDR rowLuma23,[pVRow,#4] ;// rowLuma23 = [0d|0c]; only c is consumed by the multiplies below
  151. ;// Load all the 16 'src' values
  152. LDMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32} ;// r5-r12 = the 8 words at pSrcDst; no writeback, so pSrcDst still addresses the block for the final store
  153. ;//*********************************************************************************************
  154. ;//
  155. ;// 'Shift' ranges between [0,8]
  156. ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
  157. ;//
  158. ;//*********************************************************************************************
  159. LSL rowLuma01,rowLuma01,shift ;// Both packed multipliers a and b are shifted by this one instruction
  160. LSL rowLuma23,rowLuma23,shift ;// Last use of 'shift'; r3 is free to become temp2 from here on
  161. ;//**********************************************************************************************
  162. ;//
  163. ;// The idea is to unroll the loop completely
  164. ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
  165. ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
  166. ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
  167. ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
  168. ;// We first calculate (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift), which fits into 16 bits (above)
  169. ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
  170. ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
  171. ;//
  172. ;// We then pack each pair of 16-bit multiplication results into a word and store all eight words in one go
  173. ;//
  174. ;//**********************************************************************************************
  175. ;// Row 1 (multipliers used: c for the top halfwords, a for the bottom halfwords)
  176. SMULTB temp1,SrcDst00,rowLuma23 ;// pSrcDst[1] * (pVRow[2]<<Shift)
  177. SMULBB SrcDst00,SrcDst00,rowLuma01 ;// pSrcDst[0] * (pVRow[0]<<Shift)
  178. SMULTB temp2,SrcDst02,rowLuma23 ;// pSrcDst[3] * (pVRow[2]<<Shift)
  179. SMULBB SrcDst02,SrcDst02,rowLuma01 ;// pSrcDst[2] * (pVRow[0]<<Shift)
  180. PKHBT SrcDst00,SrcDst00,temp1,LSL #16 ;// Pack the first two product values (low 16 bits of each product are kept)
  181. ;// Row 2 (multipliers used: b for the top halfwords, c for the bottom halfwords)
  182. SMULTT temp1,SrcDst10,rowLuma01 ;// pSrcDst[5] * (pVRow[1]<<Shift)
  183. SMULBB SrcDst10,SrcDst10,rowLuma23 ;// pSrcDst[4] * (pVRow[2]<<Shift)
  184. PKHBT SrcDst02,SrcDst02,temp2,LSL #16 ;// Pack the next two product values
  185. SMULTT temp2,SrcDst12,rowLuma01 ;// pSrcDst[7] * (pVRow[1]<<Shift)
  186. SMULBB SrcDst12,SrcDst12,rowLuma23 ;// pSrcDst[6] * (pVRow[2]<<Shift)
  187. PKHBT SrcDst10,SrcDst10,temp1,LSL #16 ;// Pack the next two product values
  188. ;// Row 3 (same multiplier pattern as row 1)
  189. SMULTB temp1,SrcDst20,rowLuma23 ;// pSrcDst[9] * (pVRow[2]<<Shift)
  190. SMULBB SrcDst20,SrcDst20,rowLuma01 ;// pSrcDst[8] * (pVRow[0]<<Shift)
  191. PKHBT SrcDst12,SrcDst12,temp2,LSL #16 ;// Pack the next two product values
  192. SMULTB temp2,SrcDst22,rowLuma23 ;// pSrcDst[11] * (pVRow[2]<<Shift)
  193. SMULBB SrcDst22,SrcDst22,rowLuma01 ;// pSrcDst[10] * (pVRow[0]<<Shift)
  194. PKHBT SrcDst20,SrcDst20,temp1,LSL #16 ;// Pack the next two product values
  195. ;// Row 4 (same multiplier pattern as row 2; temp3 is needed because temp2 still holds a row 3 product)
  196. SMULTT temp1,SrcDst30,rowLuma01 ;// pSrcDst[13] * (pVRow[1]<<Shift)
  197. SMULBB SrcDst30,SrcDst30,rowLuma23 ;// pSrcDst[12] * (pVRow[2]<<Shift)
  198. SMULTT temp3,SrcDst32,rowLuma01 ;// pSrcDst[15] * (pVRow[1]<<Shift)
  199. SMULBB SrcDst32,SrcDst32,rowLuma23 ;// pSrcDst[14] * (pVRow[2]<<Shift)
  200. PKHBT SrcDst22,SrcDst22,temp2,LSL #16 ;// Pack the remaining product values
  201. PKHBT SrcDst30,SrcDst30,temp1,LSL #16 ;// Pack the products for pSrcDst[12] and pSrcDst[13]
  202. PKHBT SrcDst32,SrcDst32,temp3,LSL #16 ;// Pack the products for pSrcDst[14] and pSrcDst[15]
  203. STMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32} ;// Write all 16 scaled values back over the input block
  204. ;// Set return value (no return value is set by this function)
  205. ;// Write function tail
  206. M_END
  207. ENDIF ;//ARM1136JS_U
  208. ;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd - builds a 4x4 residual block of halfwords in a stack buffer (unpack + dequantize + transform when AC != 0, otherwise a replicated rounded DC value), adds it to four rows of prediction bytes, clips to 0..255 and stores four rows of 4 bytes at pDst
  209. ;// Guarding implementation by the processor name: assembled only when ARM1136JS is set
  210. IF ARM1136JS
  211. ;// Input Registers
  212. ppSrc RN 0 ;// r0: handed on unchanged as the first argument of armVCM4P10_UnpackBlock4x4
  213. pPred RN 1 ;// r1: address of the prediction block; one word (4 bytes) is read per row
  214. pDC RN 2 ;// r2: address of a signed halfword DC value; compared against 0 only on the AC != 0 path
  215. pDst RN 3 ;// r3: address of the output block; one word (4 bytes) is stored per row
  216. ;// Output Registers
  217. result RN 0 ;// r0: set to OMX_Sts_NoErr before the function tail
  218. ;// Local Scratch Registers. Several names below share one physical register, so an alias is only valid once the previous owner of that register has had its last use.
  219. pDelta RN 4 ;// r4: address of pBuffer, the residual block; post-incremented while the rows are read in the loop
  220. pDeltaTmp RN 6 ;// r6: NOTE(review): declared but not referenced anywhere in this function
  221. AC RN 5 ;// Load from stack: zero selects the DC-only path
  222. pPredTemp RN 7 ;// r7: copy of pPred that survives the helper calls; advanced by predstep per row
  223. pDCTemp RN 8 ;// r8: copy of pDC that survives the helper calls
  224. pDstTemp RN 9 ;// r9: copy of pDst that survives the helper calls; advanced by dstStep per row
  225. pDeltaArg1 RN 1 ;// r1: second argument register for the helper calls
  226. pDeltaArg0 RN 0 ;// r0: first argument register for the helper calls
  227. QP RN 1 ;// Load from stack: second argument of armVCM4P10_DequantLumaAC4x4
  228. DCval RN 10 ;// r10: DC value; on the DC-only path it is rounded and duplicated into both halfwords
  229. DCvalCopy RN 11 ;// r11: copy of DCval so that the r10/r11 pair can be stored by STRD
  230. predstep RN 1 ;// r1: byte distance between prediction rows, loaded from the stack
  231. dstStep RN 10 ;// r10: byte distance between output rows, loaded from the stack (register shared with DCval)
  232. ycounter RN 0 ;// r0: rows still to be processed
  233. PredVal1 RN 3 ;// r3: prediction bytes 0 and 2 zero-extended to halfwords
  234. PredVal2 RN 5 ;// r5: prediction bytes 1 and 3 zero-extended to halfwords
  235. DeltaVal1 RN 2 ;// r2: residuals for columns 0 and 2
  236. DeltaVal2 RN 11 ;// r11: second residual word of the row on load, then residuals for columns 1 and 3
  237. PredVal RN 8 ;// r8: the four prediction bytes of the current row (register shared with pDCTemp, which is not used inside the loop)
  238. tmpDeltaVal RN 6 ;// r6: first residual word of the row (columns 0 and 1)
  239. sum1 RN 12 ;// r12: clipped sums for columns 0 and 2, later the packed output word
  240. sum2 RN 14 ;// r14 (lr) used as scratch: clipped sums for columns 1 and 3
  241. ;// Allocate stack memory required by the function: 32 bytes = 16 halfwords; NOTE(review): M_ALLOC8 presumably gives the 8-byte alignment that the STRD stores below need - confirm in armCOMM_s.h
  242. M_ALLOC8 pBuffer, 32
  243. ;// Write function header
  244. M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11
  245. ;// Define stack arguments (declared in the order they are expected on the stack)
  246. M_ARG predStepOnStack, 4
  247. M_ARG dstStepOnStack,4
  248. M_ARG QPOnStack, 4
  249. M_ARG ACOnStack,4
  250. M_ADR pDelta,pBuffer ;// pDelta = address of the local residual buffer
  251. M_LDR AC,ACOnStack ;// AC flag decides between the full path and the DC-only path
  252. ;// Save registers r1,r2,r3 before function call (the helper calls below may overwrite them)
  253. MOV pPredTemp,pPred
  254. MOV pDCTemp,pDC
  255. MOV pDstTemp,pDst
  256. CMP AC,#0
  257. BEQ DCcase ;// AC == 0: skip unpack, dequantization and transform
  258. MOV pDeltaArg1,pDelta ;// Set up r1 for armVCM4P10_UnpackBlock4x4 (r0 still holds ppSrc)
  259. BL armVCM4P10_UnpackBlock4x4
  260. M_LDR QP,QPOnStack ;// Set up r1 for DequantLumaAC4x4
  261. MOV pDeltaArg0,pDelta ;// Set up r0 for DequantLumaAC4x4
  262. BL armVCM4P10_DequantLumaAC4x4
  263. CMP pDCTemp,#0 ;// Is a DC pointer supplied? The flags stay valid up to STRHNE because the MOVs in between do not set them
  264. LDRSHNE DCval,[pDCTemp] ;// If pDC != 0: read the DC value
  265. MOV pDeltaArg0,pDelta ;// Set up r0 for armVCM4P10_TransformResidual4x4
  266. MOV pDeltaArg1,pDelta ;// Set up r1 for armVCM4P10_TransformResidual4x4
  267. STRHNE DCval,[pDelta] ;// If pDC != 0: overwrite the first halfword of the residual buffer with the DC value
  268. BL armVCM4P10_TransformResidual4x4 ;// Both arguments are pDelta, i.e. the buffer is passed as source and destination
  269. B OutDCcase
  270. DCcase ;// DC-only path: fill the whole residual buffer with one rounded value
  271. LDRSH DCval,[pDCTemp] ;// NOTE(review): no NULL test here, unlike the AC path - this path assumes pDC is valid whenever AC == 0; confirm against callers
  272. ADD DCval,DCval,#32 ;// Rounding offset for the shift by 6 below
  273. ASR DCval,DCval,#6 ;// DCval = (DC + 32) >> 6, arithmetic shift
  274. PKHBT DCval,DCval,DCval,LSL #16 ;// Duplicating the Lower halfword
  275. MOV DCvalCopy, DCval ;// Needed for STRD, which stores the register pair r10/r11
  276. STRD DCval, [pDelta, #0] ;// pDelta[0] = pDelta[1] = pDelta[2] = pDelta[3] = DCval
  277. STRD DCval, [pDelta, #8] ;// pDelta[4] = pDelta[5] = pDelta[6] = pDelta[7] = DCval
  278. STRD DCval, [pDelta, #16] ;// pDelta[8] = pDelta[9] = pDelta[10] = pDelta[11] = DCval
  279. STRD DCval, [pDelta, #24] ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval
  280. OutDCcase ;// Both paths join here with the residual block complete in pBuffer
  281. M_LDR predstep,predStepOnStack
  282. M_LDR dstStep,dstStepOnStack
  283. LDMIA pDelta!,{tmpDeltaVal,DeltaVal2} ;// Pre load the two residual words (four halfwords) of the first row
  284. MOV ycounter,#4 ;// Counter for the PredPlusDeltaLoop
  285. LDR PredVal,[pPredTemp] ;// Pre load the four prediction bytes of the first row
  286. PredPlusDeltaLoop ;// One iteration per row; the loads for the next row are issued inside the current iteration
  287. SUBS ycounter,ycounter,#1 ;// Sets the flags tested by LDRGT, LDMGTIA and BGT below
  288. ADD pPredTemp,pPredTemp,predstep ;// Increment pPred ptr
  289. PKHBT DeltaVal1,tmpDeltaVal,DeltaVal2,LSL #16 ;// Deltaval1 = [C A]
  290. PKHTB DeltaVal2,DeltaVal2,tmpDeltaVal,ASR #16 ;// DeltaVal2 = [D B]
  291. UXTB16 PredVal1,PredVal ;// PredVal1 = [0c0a]
  292. UXTB16 PredVal2,PredVal,ROR #8 ;// PredVal2 = [0d0b]
  293. LDRGT PredVal,[pPredTemp] ;// Pre load the next row's prediction bytes (skipped after the last row)
  294. QADD16 sum2,DeltaVal2,PredVal2 ;// Add and saturate to 16 bits
  295. QADD16 sum1,DeltaVal1,PredVal1 ;// Same for columns 0 and 2
  296. USAT16 sum2,#8,sum2 ;// armClip(0,255,sum2)
  297. USAT16 sum1,#8,sum1 ;// armClip(0,255,sum1)
  298. LDMGTIA pDelta!,{tmpDeltaVal,DeltaVal2} ;// Pre load the next row's residuals (skipped after the last row)
  299. ORR sum1,sum1,sum2,LSL #8 ;// sum1 = [dcba]
  300. STR sum1,[pDstTemp] ;// Store the four output bytes of this row
  301. ADD pDstTemp,pDstTemp,dstStep ;// Increment pDst ptr
  302. BGT PredPlusDeltaLoop ;// Repeat while rows remain
  303. ;// Set return value
  304. MOV result,#OMX_Sts_NoErr
  305. End ;// NOTE(review): this label is not the target of any branch in this function
  306. ;// Write function tail
  307. M_END
  308. ENDIF ;//ARM1136JS
  309. ;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
  310. ;// Guarding implementation by the processor name
  311. END