/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
https://bitbucket.org/aways/android_frameworks_av · Assembly · 480 lines · 219 code · 169 blank · 92 comment · 0 complexity · a895c6da856e2c9b3cffe107e3db27b8 MD5 · raw file
- ;//
- ;// (c) Copyright 2007 ARM Limited. All Rights Reserved.
- ;//
- ;// Description:
- ;// H.264 inverse quantize and transform module
- ;//
- ;//
-
- ;// Include standard headers
- INCLUDE omxtypes_s.h
- INCLUDE armCOMM_s.h
-
- ;// Import symbols required from other files
- ;// (For example tables)
-
- IMPORT armVCM4P10_UnpackBlock4x4
- IMPORT armVCM4P10_TransformResidual4x4
- IMPORT armVCM4P10_QPDivTable
- IMPORT armVCM4P10_VMatrixU16
- IMPORT armVCM4P10_QPModuloTable
-
- M_VARIANTS ARM1136JS, ARM1136JS_U
-
- ;// Set debugging level
- ;//DEBUG_ON SETL {TRUE}
- ;// Static Function: armVCM4P10_DequantLumaAC4x4
- ;// Guarding implementation by the processor name (this copy is assembled when ARM1136JS is set)
- ;// Purpose: multiply each of the 16 halfwords at pSrcDst by a per-position scale factor (one of three halfwords read from armVCM4P10_VMatrixU16, shifted left by armVCM4P10_QPDivTable[QP]) and write the low 16 bits of every product back in place.
- IF ARM1136JS
-
- ;//Input Registers
- pSrcDst RN 0 ;// r0: pointer to the 16 halfwords; they are loaded, scaled and stored back to the same address
- QP RN 1 ;// r1: byte index into armVCM4P10_QPModuloTable and armVCM4P10_QPDivTable
- ;//Output Registers (none: r0 is not written, the result goes back through pSrcDst)
- ;//Local Scratch Registers (several names share one physical register; each alias is first written only after the last use of the name it replaces)
- pQPdiv RN 4
- pQPmod RN 5
- pVRow RN 2
- QPmod RN 6
- shift RN 3
- rowLuma01 RN 1 ;// takes over r1 once QP has been used for both table lookups
- rowLuma23 RN 4 ;// takes over r4 once pQPdiv has been used
- SrcDst00 RN 5 ;// r5-r12: the eight words (two halfwords each) of the block
- SrcDst02 RN 6
- SrcDst10 RN 7
- SrcDst12 RN 8
- SrcDst20 RN 9
- SrcDst22 RN 10
- SrcDst30 RN 11
- SrcDst32 RN 12
- temp1 RN 2 ;// takes over r2 once pVRow is no longer needed
- temp2 RN 3 ;// takes over r3 once shift is no longer needed
- temp3 RN 14 ;// r14 is the link register; NOTE(review): presumably saved/restored by M_START/M_END - confirm in armCOMM_s.h
-
-
- ;// Allocate stack memory required by the function (nothing is allocated here)
-
- ;// Write function header (NOTE(review): the r11 argument presumably asks the macro to preserve registers up to r11 - confirm in armCOMM_s.h)
- M_START armVCM4P10_DequantLumaAC4x4,r11
-
- LDR pQPmod,=armVCM4P10_QPModuloTable
- LDR pQPdiv,=armVCM4P10_QPDivTable
- LDR pVRow,=armVCM4P10_VMatrixU16
-
- LDRSB QPmod,[pQPmod,QP] ;// (QP%6) * 6, used below as a byte offset into the V matrix
- LDRSB shift,[pQPdiv,QP] ;// Shift = QP / 6
-
- LDRH rowLuma01,[pVRow,QPmod]! ;// rowLuma01 = [00|0a]; writeback leaves pVRow at the selected entry (QP in r1 is dead from here on)
- LDRH temp3,[pVRow,#2] ;// temp3 = [00|0b]
- LDRH rowLuma23,[pVRow,#4] ;// rowLuma23 = [00|0c]
- ORR rowLuma01,rowLuma01,temp3,LSL #16 ;// rowLuma01 = [0b|0a]
-
- ;// Load all the 16 'src' values (overwrites r5/r6, i.e. pQPmod and QPmod, which are no longer needed)
- LDMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
-
-
- ;//*********************************************************************************************
- ;//
- ;// 'Shift' ranges between [0,8]
- ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
- ;// (this relies on the shifted low halfword not spilling into the high halfword)
- ;//*********************************************************************************************
-
- LSL rowLuma01,rowLuma01,shift
- LSL rowLuma23,rowLuma23,shift
-
-
- ;//**********************************************************************************************
- ;//
- ;// The idea is to unroll the Loop completely
- ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
- ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
- ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
- ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
- ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]]) << Shift which fits into 16 bits (above)
- ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
- ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
- ;//
- ;// We then pack the two 16 bit multiplication result into a word and store at one go
- ;// (PKHBT keeps only the low 16 bits of each 32-bit product)
- ;//**********************************************************************************************
-
-
- ;// Row 1
-
-
- SMULTB temp1,SrcDst00,rowLuma23 ;// pSrcDst[1] * (pVRow[2]<<Shift)
- SMULBB SrcDst00,SrcDst00,rowLuma01 ;// pSrcDst[0] * (pVRow[0]<<Shift)
-
- SMULTB temp2,SrcDst02,rowLuma23 ;// pSrcDst[3] * (pVRow[2]<<Shift)
- SMULBB SrcDst02,SrcDst02,rowLuma01 ;// pSrcDst[2] * (pVRow[0]<<Shift)
-
- PKHBT SrcDst00,SrcDst00,temp1,LSL #16 ;// Pack the first two product values
-
-
- ;// Row 2
- SMULTT temp1,SrcDst10,rowLuma01 ;// pSrcDst[5] * (pVRow[1]<<Shift)
- SMULBB SrcDst10,SrcDst10,rowLuma23 ;// pSrcDst[4] * (pVRow[2]<<Shift)
-
- PKHBT SrcDst02,SrcDst02,temp2,LSL #16 ;// Pack the next two product values
- SMULTT temp2,SrcDst12,rowLuma01 ;// pSrcDst[7] * (pVRow[1]<<Shift)
- SMULBB SrcDst12,SrcDst12,rowLuma23 ;// pSrcDst[6] * (pVRow[2]<<Shift)
-
- PKHBT SrcDst10,SrcDst10,temp1,LSL #16 ;// Pack the next two product values
-
-
- ;// Row 3
-
- SMULTB temp1,SrcDst20,rowLuma23 ;// pSrcDst[9] * (pVRow[2]<<Shift)
- SMULBB SrcDst20,SrcDst20,rowLuma01 ;// pSrcDst[8] * (pVRow[0]<<Shift)
-
- PKHBT SrcDst12,SrcDst12,temp2,LSL #16 ;// Pack the next two product values
- SMULTB temp2,SrcDst22,rowLuma23 ;// pSrcDst[11] * (pVRow[2]<<Shift)
- SMULBB SrcDst22,SrcDst22,rowLuma01 ;// pSrcDst[10] * (pVRow[0]<<Shift)
-
- PKHBT SrcDst20,SrcDst20,temp1,LSL #16 ;// Pack the next two product values
-
-
-
- ;// Row 4 (temp3 is used for the last product because temp2 still holds the unpacked pSrcDst[11] product)
-
- SMULTT temp1,SrcDst30,rowLuma01 ;// pSrcDst[13] * (pVRow[1]<<Shift)
- SMULBB SrcDst30,SrcDst30,rowLuma23 ;// pSrcDst[12] * (pVRow[2]<<Shift)
-
- SMULTT temp3,SrcDst32,rowLuma01 ;// pSrcDst[15] * (pVRow[1]<<Shift)
- SMULBB SrcDst32,SrcDst32,rowLuma23 ;// pSrcDst[14] * (pVRow[2]<<Shift)
-
- PKHBT SrcDst22,SrcDst22,temp2,LSL #16 ;// Pack the remaining product values
- PKHBT SrcDst30,SrcDst30,temp1,LSL #16
- PKHBT SrcDst32,SrcDst32,temp3,LSL #16
-
-
- STMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32} ;// write all 16 scaled halfwords back over the input
-
-
- ;// Set return value (none is set by this routine)
-
-
-
- ;// Write function tail
- M_END
-
- ENDIF ;//ARM1136JS
-
- ;// Guarding implementation by the processor name (this copy of armVCM4P10_DequantLumaAC4x4 is assembled when ARM1136JS_U is set)
- ;// Purpose: multiply each of the 16 halfwords at pSrcDst by a per-position scale factor (one of three halfwords read from armVCM4P10_VMatrixU16, shifted left by armVCM4P10_QPDivTable[QP]) and write the low 16 bits of every product back in place. The scale factors are fetched here with two word loads.
- IF ARM1136JS_U
-
- ;//Input Registers
- pSrcDst RN 0 ;// r0: pointer to the 16 halfwords; they are loaded, scaled and stored back to the same address
- QP RN 1 ;// r1: byte index into armVCM4P10_QPModuloTable and armVCM4P10_QPDivTable
- ;//Output Registers (none: r0 is not written, the result goes back through pSrcDst)
- ;//Local Scratch Registers (several names share one physical register; each alias is first written only after the last use of the name it replaces)
- pQPdiv RN 4
- pQPmod RN 5
- pVRow RN 2
- QPmod RN 6
- shift RN 3
- rowLuma01 RN 1 ;// takes over r1 once QP has been used for both table lookups
- rowLuma23 RN 4 ;// takes over r4 once pQPdiv has been used
- SrcDst00 RN 5 ;// r5-r12: the eight words (two halfwords each) of the block
- SrcDst02 RN 6
- SrcDst10 RN 7
- SrcDst12 RN 8
- SrcDst20 RN 9
- SrcDst22 RN 10
- SrcDst30 RN 11
- SrcDst32 RN 12
- temp1 RN 2 ;// takes over r2 once pVRow is no longer needed
- temp2 RN 3 ;// takes over r3 once shift is no longer needed
- temp3 RN 14 ;// r14 is the link register; NOTE(review): presumably saved/restored by M_START/M_END - confirm in armCOMM_s.h
-
-
- ;// Allocate stack memory required by the function (nothing is allocated here)
-
- ;// Write function header (NOTE(review): the r11 argument presumably asks the macro to preserve registers up to r11 - confirm in armCOMM_s.h)
- M_START armVCM4P10_DequantLumaAC4x4,r11
-
- LDR pQPmod,=armVCM4P10_QPModuloTable
- LDR pQPdiv,=armVCM4P10_QPDivTable
- LDR pVRow,=armVCM4P10_VMatrixU16
-
- LDRSB QPmod,[pQPmod,QP] ;// (QP%6) * 6, used below as a byte offset into the V matrix
- LDRSB shift,[pQPdiv,QP] ;// Shift = QP / 6
-
- LDR rowLuma01,[pVRow,QPmod]! ;// rowLuma01 = [0b|0a]; NOTE(review): with an offset of (QP%6)*6 this word load may be only halfword aligned - presumably this variant assumes unaligned access support, confirm
- LDR rowLuma23,[pVRow,#4] ;// rowLuma23 = [0d|0c]; only the bottom halfword (0c) is used by the multiplies below
- ;// Load all the 16 'src' values (overwrites r5/r6, i.e. pQPmod and QPmod, which are no longer needed)
- LDMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
-
-
- ;//*********************************************************************************************
- ;//
- ;// 'Shift' ranges between [0,8]
- ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
- ;// (this relies on the shifted low halfword not spilling into the high halfword)
- ;//*********************************************************************************************
-
- LSL rowLuma01,rowLuma01,shift
- LSL rowLuma23,rowLuma23,shift
-
-
- ;//**********************************************************************************************
- ;//
- ;// The idea is to unroll the Loop completely
- ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
- ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
- ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
- ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
- ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]]) << Shift which fits into 16 bits (above)
- ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
- ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
- ;//
- ;// We then pack the two 16 bit multiplication result into a word and store at one go
- ;// (PKHBT keeps only the low 16 bits of each 32-bit product)
- ;//**********************************************************************************************
-
-
- ;// Row 1
-
-
- SMULTB temp1,SrcDst00,rowLuma23 ;// pSrcDst[1] * (pVRow[2]<<Shift)
- SMULBB SrcDst00,SrcDst00,rowLuma01 ;// pSrcDst[0] * (pVRow[0]<<Shift)
-
- SMULTB temp2,SrcDst02,rowLuma23 ;// pSrcDst[3] * (pVRow[2]<<Shift)
- SMULBB SrcDst02,SrcDst02,rowLuma01 ;// pSrcDst[2] * (pVRow[0]<<Shift)
-
- PKHBT SrcDst00,SrcDst00,temp1,LSL #16 ;// Pack the first two product values
-
-
- ;// Row 2
- SMULTT temp1,SrcDst10,rowLuma01 ;// pSrcDst[5] * (pVRow[1]<<Shift)
- SMULBB SrcDst10,SrcDst10,rowLuma23 ;// pSrcDst[4] * (pVRow[2]<<Shift)
-
- PKHBT SrcDst02,SrcDst02,temp2,LSL #16 ;// Pack the next two product values
- SMULTT temp2,SrcDst12,rowLuma01 ;// pSrcDst[7] * (pVRow[1]<<Shift)
- SMULBB SrcDst12,SrcDst12,rowLuma23 ;// pSrcDst[6] * (pVRow[2]<<Shift)
-
- PKHBT SrcDst10,SrcDst10,temp1,LSL #16 ;// Pack the next two product values
-
-
- ;// Row 3
-
- SMULTB temp1,SrcDst20,rowLuma23 ;// pSrcDst[9] * (pVRow[2]<<Shift)
- SMULBB SrcDst20,SrcDst20,rowLuma01 ;// pSrcDst[8] * (pVRow[0]<<Shift)
-
- PKHBT SrcDst12,SrcDst12,temp2,LSL #16 ;// Pack the next two product values
- SMULTB temp2,SrcDst22,rowLuma23 ;// pSrcDst[11] * (pVRow[2]<<Shift)
- SMULBB SrcDst22,SrcDst22,rowLuma01 ;// pSrcDst[10] * (pVRow[0]<<Shift)
-
- PKHBT SrcDst20,SrcDst20,temp1,LSL #16 ;// Pack the next two product values
-
-
-
- ;// Row 4 (temp3 is used for the last product because temp2 still holds the unpacked pSrcDst[11] product)
-
- SMULTT temp1,SrcDst30,rowLuma01 ;// pSrcDst[13] * (pVRow[1]<<Shift)
- SMULBB SrcDst30,SrcDst30,rowLuma23 ;// pSrcDst[12] * (pVRow[2]<<Shift)
-
- SMULTT temp3,SrcDst32,rowLuma01 ;// pSrcDst[15] * (pVRow[1]<<Shift)
- SMULBB SrcDst32,SrcDst32,rowLuma23 ;// pSrcDst[14] * (pVRow[2]<<Shift)
-
- PKHBT SrcDst22,SrcDst22,temp2,LSL #16 ;// Pack the remaining product values
- PKHBT SrcDst30,SrcDst30,temp1,LSL #16
- PKHBT SrcDst32,SrcDst32,temp3,LSL #16
-
-
- STMIA pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32} ;// write all 16 scaled halfwords back over the input
-
-
- ;// Set return value (none is set by this routine)
-
-
-
- ;// Write function tail
- M_END
-
- ENDIF ;//ARM1136JS_U
- ;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
- ;// Purpose: build a 4x4 block of 16-bit residuals in a 32-byte stack buffer, add it to the 4x4 block of prediction bytes at pPred, clamp every sum to [0,255] and store the 16 result bytes at pDst. Always returns OMX_Sts_NoErr.
- ;// Guarding implementation by the processor name (assembled when ARM1136JS is set)
- ;// Residual source: AC != 0 -> armVCM4P10_UnpackBlock4x4, armVCM4P10_DequantLumaAC4x4, optional overwrite of element 0 with *pDC, armVCM4P10_TransformResidual4x4; AC == 0 -> every element is (*pDC + 32) >> 6.
- IF ARM1136JS
-
- ;//Input Registers
- ppSrc RN 0 ;// r0: handed on unchanged as the first argument of armVCM4P10_UnpackBlock4x4
- pPred RN 1 ;// r1: prediction block, 4 bytes per row, rows predStep bytes apart
- pDC RN 2 ;// r2: pointer to a signed 16-bit DC value; NULL is tested for only on the AC != 0 path
- pDst RN 3 ;// r3: destination block, 4 bytes per row, rows dstStep bytes apart
-
- ;//Output Registers
- result RN 0 ;// r0 on return: OMX_Sts_NoErr
- ;//Local Scratch Registers (many names share one physical register; see the notes on the individual lines)
- pDelta RN 4 ;// address of the stack buffer holding the 16 residual halfwords
- pDeltaTmp RN 6 ;// declared but not referenced in this function
- AC RN 5 ;//Load from stack
- pPredTemp RN 7 ;// copy of pPred kept across the BLs, later the running prediction row pointer
- pDCTemp RN 8 ;// copy of pDC kept across the BLs
- pDstTemp RN 9 ;// copy of pDst kept across the BLs, later the running destination row pointer
- pDeltaArg1 RN 1 ;// r1 used as second call argument
- pDeltaArg0 RN 0 ;// r0 used as first call argument
- QP RN 1 ;//Load from stack
- DCval RN 10
- DCvalCopy RN 11 ;// must be DCval+1 so that STRD stores the pair r10,r11
- predstep RN 1
- dstStep RN 10 ;// takes over r10 after the last use of DCval
- ycounter RN 0
- PredVal1 RN 3
- PredVal2 RN 5
- DeltaVal1 RN 2
- DeltaVal2 RN 11
- PredVal RN 8 ;// takes over r8 after the last use of pDCTemp
- tmpDeltaVal RN 6
- sum1 RN 12
- sum2 RN 14 ;// r14 is the link register; NOTE(review): presumably saved/restored by M_START/M_END - confirm in armCOMM_s.h
-
-
-
- ;// Allocate stack memory required by the function: 32 bytes = 16 residual halfwords (NOTE(review): M_ALLOC8 presumably gives 8-byte alignment, which the STRDs below would rely on - confirm)
- M_ALLOC8 pBuffer, 32
-
- ;// Write function header
- M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11
-
- ;// Define stack arguments (declared in the order predStep, dstStep, QP, AC; 4 bytes each)
- M_ARG predStepOnStack, 4
- M_ARG dstStepOnStack,4
- M_ARG QPOnStack, 4
- M_ARG ACOnStack,4
-
-
- M_ADR pDelta,pBuffer ;// pDelta = address of the stack buffer
- M_LDR AC,ACOnStack
-
-
- ;// Save registers r1,r2,r3 before function call (r1 is reused for call arguments below; r7-r9 are assumed to survive the BLs)
- MOV pPredTemp,pPred
- MOV pDCTemp,pDC
- MOV pDstTemp,pDst
-
- CMP AC,#0
- BEQ DCcase ;// AC == 0: skip unpack/dequant/transform and build the residual from *pDC alone
- MOV pDeltaArg1,pDelta ;// Set up r1 for armVCM4P10_UnpackBlock4x4
-
- BL armVCM4P10_UnpackBlock4x4 ;// r0 still holds ppSrc at this point
-
- M_LDR QP,QPOnStack ;// Set up r1 for DequantLumaAC4x4
- MOV pDeltaArg0,pDelta ;// Set up r0 for DequantLumaAC4x4
- BL armVCM4P10_DequantLumaAC4x4
-
-
- CMP pDCTemp,#0 ;// pDC == NULL ?
- LDRSHNE DCval,[pDCTemp] ;// if not NULL, fetch *pDC ...
- MOV pDeltaArg0,pDelta ;// Set up r0 for armVCM4P10_TransformResidual4x4
- MOV pDeltaArg1,pDelta ;// Set up r1 for armVCM4P10_TransformResidual4x4
- STRHNE DCval,[pDelta] ;// ... and overwrite element 0 of the dequantized block (the MOVs above leave the CMP flags intact)
-
- BL armVCM4P10_TransformResidual4x4
- B OutDCcase
-
- DCcase
- LDRSH DCval,[pDCTemp] ;// pDC is dereferenced without a NULL check on this path
- ADD DCval,DCval,#32 ;// DCval = (*pDC + 32) >> 6 ...
- ASR DCval,DCval,#6 ;// ... using an arithmetic shift
- PKHBT DCval,DCval,DCval,LSL #16 ;// Duplicating the Lower halfword
- MOV DCvalCopy, DCval ;// Needed for STRD
- STRD DCval, [pDelta, #0] ;// pDelta[0] = pDelta[1] = pDelta[2] = pDelta[3] = DCval
- STRD DCval, [pDelta, #8] ;// pDelta[4] = pDelta[5] = pDelta[6] = pDelta[7] = DCval
- STRD DCval, [pDelta, #16] ;// pDelta[8] = pDelta[9] = pDelta[10] = pDelta[11] = DCval
- STRD DCval, [pDelta, #24] ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval
-
-
- OutDCcase
- M_LDR predstep,predStepOnStack ;// r1 is free again: predStep from the stack
- M_LDR dstStep,dstStepOnStack ;// r10 is free again: dstStep from the stack
-
- LDMIA pDelta!,{tmpDeltaVal,DeltaVal2} ;// Pre load residual row 0: tmpDeltaVal = [B A], DeltaVal2 = [D C]
- MOV ycounter,#4 ;// Counter for the PredPlusDeltaLoop
- LDR PredVal,[pPredTemp] ;// Pre load the 4 prediction bytes of row 0 as one word
- PredPlusDeltaLoop
-
-
- SUBS ycounter,ycounter,#1 ;// these flags drive LDRGT, LDMGTIA and BGT below; nothing in between updates the flags GT tests
- ADD pPredTemp,pPredTemp,predstep ;// Increment pPred ptr
-
- PKHBT DeltaVal1,tmpDeltaVal,DeltaVal2,LSL #16 ;// Deltaval1 = [C A]
- PKHTB DeltaVal2,DeltaVal2,tmpDeltaVal,ASR #16 ;// DeltaVal2 = [D B]
-
- UXTB16 PredVal1,PredVal ;// PredVal1 = [0c0a]
- UXTB16 PredVal2,PredVal,ROR #8 ;// PredVal2 = [0d0b]
-
- LDRGT PredVal,[pPredTemp] ;// Pre load the next prediction row (skipped after the last row)
-
- QADD16 sum2,DeltaVal2,PredVal2 ;// Add and saturate to 16 bits
- QADD16 sum1,DeltaVal1,PredVal1
-
- USAT16 sum2,#8,sum2 ;// armClip(0,255,sum2)
- USAT16 sum1,#8,sum1
-
- LDMGTIA pDelta!,{tmpDeltaVal,DeltaVal2} ;// Pre load the next residual row (skipped after the last row)
-
- ORR sum1,sum1,sum2,LSL #8 ;// sum1 = [dcba]
- STR sum1,[pDstTemp] ;// store the 4 result bytes of this row as one word
-
- ADD pDstTemp,pDstTemp,dstStep ;// Increment pDst ptr
- BGT PredPlusDeltaLoop
-
-
- ;// Set return value
- MOV result,#OMX_Sts_NoErr
-
- End
-
- ;// Write function tail
-
- M_END
-
- ENDIF ;//ARM1136JS
-
-
- ;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
-
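- ;// Guarding implementation by the processor name
- ;// NOTE(review): no code follows this header before END; presumably a placeholder for an ARM1136JS_U copy of the function - confirm.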
-
-
-
-
- END