;// File: omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
;//
;// Path: /media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
;//
;// Origin: https://bitbucket.org/aways/android_frameworks_av · Assembly · 480 lines · 219 code · 169 blank · 92 comment · 0 complexity · a895c6da856e2c9b3cffe107e3db27b8 MD5 · raw file

;//
;// (c) Copyright 2007 ARM Limited. All Rights Reserved.
;//
;// Description:
;// H.264 inverse quantize and transform module
;// 
;// 

        

;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h
        
;// Import symbols required from other files
;// (For example tables)
    
        IMPORT armVCM4P10_UnpackBlock4x4
        IMPORT armVCM4P10_TransformResidual4x4
        IMPORT armVCM4P10_QPDivTable
        IMPORT armVCM4P10_VMatrixU16
        IMPORT armVCM4P10_QPModuloTable 
        
    M_VARIANTS ARM1136JS, ARM1136JS_U
        
;// Set debugging level        
;//DEBUG_ON    SETL {TRUE}


;// Static Function: armVCM4P10_DequantLumaAC4x4

;// Guarding implementation by the processor name
    
    IF  ARM1136JS 
    
;//-----------------------------------------------------------------------
;// armVCM4P10_DequantLumaAC4x4  (ARM1136JS variant)
;//
;// Scales the sixteen 16-bit values at pSrcDst in place. Each value is
;// multiplied by one of three factors (a, b, c) read from
;// armVCM4P10_VMatrixU16 and pre-shifted left by 'shift'; the low 16 bits
;// of each product are packed two per word and written back.
;//
;// In:   r0 = pSrcDst  (read with LDMIA and written with STMIA as 8 words)
;//       r1 = QP       (used as a byte index into armVCM4P10_QPModuloTable
;//                      and armVCM4P10_QPDivTable)
;// Out:  the 16 halfwords at pSrcDst are overwritten
;// Uses: r1-r12 and r14 as scratch. M_START is given r11; presumably it
;//       saves r4-r11 and the return address (confirm in armCOMM_s.h).
;//
;// Factor used per element (a = pVRow[0], b = pVRow[1], c = pVRow[2]):
;//       rows 1 and 3 :  a c a c
;//       rows 2 and 4 :  c b c b
;//
;// This variant fetches the factors with halfword loads (LDRH) only.
;//-----------------------------------------------------------------------

;//Input Registers
pSrcDst       RN  0
QP            RN  1


;//Output Registers


;//Local Scratch Registers
;// Several names below share one physical register. Each sharing is valid
;// only because the earlier value is dead before the later one is written,
;// so the instruction order in the body must be kept.
pQPdiv          RN  4
pQPmod          RN  5
pVRow           RN  2
QPmod           RN  6
shift           RN  3
rowLuma01       RN  1               ;// shares r1 with QP
rowLuma23       RN  4               ;// shares r4 with pQPdiv

SrcDst00        RN  5               ;// shares r5 with pQPmod
SrcDst02        RN  6               ;// shares r6 with QPmod
SrcDst10        RN  7
SrcDst12        RN  8
SrcDst20        RN  9
SrcDst22        RN  10
SrcDst30        RN  11
SrcDst32        RN  12

temp1           RN  2               ;// shares r2 with pVRow
temp2           RN  3               ;// shares r3 with shift
temp3           RN  14              ;// r14 (link register) used as scratch
    
    
        ;// Allocate stack memory required by the function
        
        ;// Write function header
        M_START armVCM4P10_DequantLumaAC4x4,r11
         
        ;// Addresses of the three lookup tables
        LDR    pQPmod,=armVCM4P10_QPModuloTable
        LDR    pQPdiv,=armVCM4P10_QPDivTable        
        LDR    pVRow,=armVCM4P10_VMatrixU16
         
        ;// Both lookups must happen before r1 (QP) is overwritten by rowLuma01
        LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6 : byte offset of the factor row
        LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6
                
        ;// Writeback (!) leaves pVRow pointing at the selected row, so the
        ;// next two loads can use small immediate offsets.
        LDRH    rowLuma01,[pVRow,QPmod]!             ;// rowLuma01 = [00|0a]
        LDRH    temp3,[pVRow,#2]                     ;// temp3     = [00|0b]   
        LDRH    rowLuma23,[pVRow,#4]                 ;// rowLuma23 = [00|0c] 
        ORR     rowLuma01,rowLuma01,temp3,LSL #16    ;// rowLuma01 = [0b|0a]   
        
        ;// Load all the 16 'src' values (two halfwords per register).
        ;// This overwrites r5/r6, which held pQPmod/QPmod until here.
        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
        
        
        ;//*********************************************************************************************
        ;//
        ;// 'Shift' ranges between [0,8] 
        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
        ;// (the shifted low halfword is stated not to spill into the high halfword)
        ;//
        ;//*********************************************************************************************
        
        LSL    rowLuma01,rowLuma01,shift
        LSL    rowLuma23,rowLuma23,shift
        
        
        ;//**********************************************************************************************
        ;//
        ;// The idea is to unroll the loop completely
        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16' 
        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
        ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift, which fits into 16 bits (above)
        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
        ;// 
        ;// We then pack the two 16-bit multiplication results into a word and store them in one go
        ;//
        ;// In each pair below the odd (top-halfword) element is multiplied first
        ;// into a temp, because the second multiply overwrites the source register.
        ;//
        ;//**********************************************************************************************
        
        
        ;// Row 1
        
        
        SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift) 
        SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)  
        
        SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift) 
        SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)
        
        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values
        
                
        ;// Row 2
        SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
        SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)
        
        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
        SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
        SMULBB  SrcDst12,SrcDst12,rowLuma23                    ;// pSrcDst[6] * (pVRow[2]<<Shift)
        
        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values
        
               
        ;// Row 3    
        
        SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)         
        SMULBB  SrcDst20,SrcDst20,rowLuma01                    ;// pSrcDst[8] * (pVRow[0]<<Shift)  
       
        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16               ;// Pack the next two product values
        SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift) 
        SMULBB  SrcDst22,SrcDst22,rowLuma01                    ;// pSrcDst[10] * (pVRow[0]<<Shift)
                                                            
        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values
        
        
                        
        ;// Row 4   
        ;// temp2 still holds the pSrcDst[11] product at this point, so the
        ;// last odd element goes into temp3 instead.
        
        SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
        SMULBB  SrcDst30,SrcDst30,rowLuma23                    ;// pSrcDst[12] * (pVRow[2]<<Shift)
        
        SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
        SMULBB  SrcDst32,SrcDst32,rowLuma23                    ;// pSrcDst[14] * (pVRow[2]<<Shift)
       
        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16
        
        
        ;// Write all 16 results back over the inputs
        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
        
        
        ;// Set return value
        ;// (none is set here; the result is the updated memory at pSrcDst)
          
           
      
        ;// Write function tail
        M_END
        
    ENDIF                                                    ;//ARM1136JS        
 

;// Guarding implementation by the processor name
    
    IF  ARM1136JS_U
    
;//-----------------------------------------------------------------------
;// armVCM4P10_DequantLumaAC4x4  (ARM1136JS_U variant)
;//
;// Scales the sixteen 16-bit values at pSrcDst in place. Each value is
;// multiplied by one of three factors (a, b, c) read from
;// armVCM4P10_VMatrixU16 and pre-shifted left by 'shift'; the low 16 bits
;// of each product are packed two per word and written back.
;//
;// In:   r0 = pSrcDst  (read with LDMIA and written with STMIA as 8 words)
;//       r1 = QP       (used as a byte index into armVCM4P10_QPModuloTable
;//                      and armVCM4P10_QPDivTable)
;// Out:  the 16 halfwords at pSrcDst are overwritten
;// Uses: r1-r12 and r14 as scratch. M_START is given r11; presumably it
;//       saves r4-r11 and the return address (confirm in armCOMM_s.h).
;//
;// Factor used per element (a = pVRow[0], b = pVRow[1], c = pVRow[2]):
;//       rows 1 and 3 :  a c a c
;//       rows 2 and 4 :  c b c b
;//
;// This variant fetches a and b together with one word load. The row offset
;// is (QP%6) * 6, so that word load is not always word aligned; presumably
;// that is why it is restricted to the _U build (confirm).
;//-----------------------------------------------------------------------

;//Input Registers
pSrcDst       RN  0
QP            RN  1


;//Output Registers


;//Local Scratch Registers
;// Several names below share one physical register. Each sharing is valid
;// only because the earlier value is dead before the later one is written,
;// so the instruction order in the body must be kept.
pQPdiv          RN  4
pQPmod          RN  5
pVRow           RN  2
QPmod           RN  6
shift           RN  3
rowLuma01       RN  1               ;// shares r1 with QP
rowLuma23       RN  4               ;// shares r4 with pQPdiv

SrcDst00        RN  5               ;// shares r5 with pQPmod
SrcDst02        RN  6               ;// shares r6 with QPmod
SrcDst10        RN  7
SrcDst12        RN  8
SrcDst20        RN  9
SrcDst22        RN  10
SrcDst30        RN  11
SrcDst32        RN  12

temp1           RN  2               ;// shares r2 with pVRow
temp2           RN  3               ;// shares r3 with shift
temp3           RN  14              ;// r14 (link register) used as scratch
    
    
        ;// Allocate stack memory required by the function
        
        ;// Write function header
        M_START armVCM4P10_DequantLumaAC4x4,r11
         
        ;// Addresses of the three lookup tables
        LDR    pQPmod,=armVCM4P10_QPModuloTable
        LDR    pQPdiv,=armVCM4P10_QPDivTable        
        LDR    pVRow,=armVCM4P10_VMatrixU16
         
        ;// Both lookups must happen before r1 (QP) is overwritten by rowLuma01
        LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6 : byte offset of the factor row
        LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6
                
        ;// Writeback (!) leaves pVRow pointing at the selected row.
        LDR    rowLuma01,[pVRow,QPmod]!             ;// rowLuma01 = [0b|0a]

        ;// Only the bottom halfword of rowLuma23 is ever used as a multiplier
        ;// (every SMULTB/SMULBB below that reads rowLuma23 takes its bottom half),
        ;// so fetch just pVRow[2] with a halfword load, as the ARM1136JS variant
        ;// does. A word load here would also fetch the two bytes that follow
        ;// pVRow[2], which are never used and are not part of the selected row.
        LDRH   rowLuma23,[pVRow,#4]                 ;// rowLuma23 = [00|0c]

        ;// Load all the 16 'src' values (two halfwords per register).
        ;// This overwrites r5/r6, which held pQPmod/QPmod until here.
        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
        
        
        ;//*********************************************************************************************
        ;//
        ;// 'Shift' ranges between [0,8] 
        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
        ;// (the shifted low halfword is stated not to spill into the high halfword)
        ;//
        ;//*********************************************************************************************
        
        LSL    rowLuma01,rowLuma01,shift
        LSL    rowLuma23,rowLuma23,shift
        
        
        ;//**********************************************************************************************
        ;//
        ;// The idea is to unroll the loop completely
        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16' 
        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
        ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift, which fits into 16 bits (above)
        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
        ;// 
        ;// We then pack the two 16-bit multiplication results into a word and store them in one go
        ;//
        ;// In each pair below the odd (top-halfword) element is multiplied first
        ;// into a temp, because the second multiply overwrites the source register.
        ;//
        ;//**********************************************************************************************
        
        
        ;// Row 1
        
        
        SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift) 
        SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)  
        
        SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift) 
        SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)
        
        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values
        
                
        ;// Row 2
        SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
        SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)
        
        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
        SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
        SMULBB  SrcDst12,SrcDst12,rowLuma23                    ;// pSrcDst[6] * (pVRow[2]<<Shift)
        
        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values
        
               
        ;// Row 3    
        
        SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)         
        SMULBB  SrcDst20,SrcDst20,rowLuma01                    ;// pSrcDst[8] * (pVRow[0]<<Shift)  
       
        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16               ;// Pack the next two product values
        SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift) 
        SMULBB  SrcDst22,SrcDst22,rowLuma01                    ;// pSrcDst[10] * (pVRow[0]<<Shift)
                                                            
        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values
        
        
                        
        ;// Row 4   
        ;// temp2 still holds the pSrcDst[11] product at this point, so the
        ;// last odd element goes into temp3 instead.
        
        SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
        SMULBB  SrcDst30,SrcDst30,rowLuma23                    ;// pSrcDst[12] * (pVRow[2]<<Shift)
        
        SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
        SMULBB  SrcDst32,SrcDst32,rowLuma23                    ;// pSrcDst[14] * (pVRow[2]<<Shift)
       
        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16
        
        
        ;// Write all 16 results back over the inputs
        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
        
        
        ;// Set return value
        ;// (none is set here; the result is the updated memory at pSrcDst)
          
           
      
        ;// Write function tail
        M_END
        
    ENDIF                                                    ;//ARM1136JS_U        





;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd            
    
;// Guarding implementation by the processor name
    
    IF  ARM1136JS
    
;//-----------------------------------------------------------------------
;// omxVCM4P10_DequantTransformResidualFromPairAndAdd  (ARM1136JS)
;//
;// Builds a 4x4 block of 16-bit residuals in a 32-byte stack buffer, adds
;// it to a 4x4 block of prediction bytes, clips each sum to 8 unsigned
;// bits and stores the result one 4-byte row at a time.
;//
;// In:   r0 = ppSrc    (passed straight through to armVCM4P10_UnpackBlock4x4)
;//       r1 = pPred    (one word read per row, advanced by predStep)
;//       r2 = pDC      (pointer to a signed 16-bit value)
;//       r3 = pDst     (one word written per row, advanced by dstStep)
;//       stack arguments, in M_ARG order: predStep, dstStep, QP, AC
;// Out:  r0 = OMX_Sts_NoErr (the only value this code returns)
;//
;// AC != 0 : UnpackBlock4x4 -> DequantLumaAC4x4 -> if pDC != 0, element 0
;//           of the buffer is replaced by *pDC -> TransformResidual4x4
;// AC == 0 : every element of the buffer is set to (*pDC + 32) >> 6
;//
;// NOTE(review): no argument validation is performed here. In particular
;// the AC == 0 path loads from pDC without testing it for 0, unlike the
;// AC != 0 path. Confirm that callers never pass AC == 0 with pDC == 0.
;//-----------------------------------------------------------------------

;//Input Registers
ppSrc       RN  0
pPred       RN  1
pDC         RN  2
pDst        RN  3
   

;//Output Registers
result      RN  0

;//Local Scratch Registers
;// Many names below share a physical register with an earlier one; each is
;// only written after the earlier value is no longer needed.
pDelta      RN  4
pDeltaTmp   RN  6                   ;// not referenced in the code below
AC          RN  5                   ;//Load from stack
pPredTemp   RN  7
pDCTemp     RN  8
pDstTemp    RN  9
pDeltaArg1  RN  1
pDeltaArg0  RN  0
QP          RN  1                   ;//Load from stack
DCval       RN  10  
DCvalCopy   RN  11                  ;// DCval/DCvalCopy form the r10/r11 pair stored by STRD
predstep    RN  1
dstStep     RN  10                  ;// shares r10 with DCval
ycounter    RN  0
PredVal1    RN  3
PredVal2    RN  5                   ;// shares r5 with AC
DeltaVal1   RN  2
DeltaVal2   RN  11                  ;// shares r11 with DCvalCopy
PredVal     RN  8                   ;// shares r8 with pDCTemp
tmpDeltaVal RN  6
sum1        RN  12
sum2        RN  14                  ;// r14 (link register) used as scratch
    
    
           
    ;// Allocate stack memory required by the function
    ;// 32 bytes = 16 halfword residuals. The '8' in M_ALLOC8 presumably requests
    ;// 8-byte alignment, which the STRD stores below rely on (confirm in armCOMM_s.h).
        M_ALLOC8 pBuffer, 32
               

    ;// Write function header
        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11
        
        ;// Define stack arguments
        M_ARG   predStepOnStack, 4
        M_ARG   dstStepOnStack,4
        M_ARG   QPOnStack, 4
        M_ARG   ACOnStack,4
  
        
        M_ADR   pDelta,pBuffer                              ;// pDelta = address of the stack buffer
        M_LDR   AC,ACOnStack                                ;// AC flag from the caller's stack
        
         
        ;// Save registers r1,r2,r3 before function call    
        ;// (r1-r3 are reused as call arguments and scratch further down)
        MOV     pPredTemp,pPred
        MOV     pDCTemp,pDC
        MOV     pDstTemp,pDst
        
        CMP     AC,#0
        BEQ     DCcase                                      ;// AC == 0: fill the buffer from *pDC only
        MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_UnpackBlock4x4
    
        ;// r0 still holds ppSrc here
        BL      armVCM4P10_UnpackBlock4x4
    
        M_LDR   QP,QPOnStack                                ;// Set up r1 for DequantLumaAC4x4
        MOV     pDeltaArg0,pDelta                           ;// Set up r0 for DequantLumaAC4x4

        BL      armVCM4P10_DequantLumaAC4x4
        
        
        ;// If a DC pointer was supplied, element 0 of the buffer is replaced by *pDC.
        ;// The flags set by CMP are still valid at STRHNE because the two MOVs in
        ;// between do not set flags.
        CMP     pDCTemp,#0
        LDRSHNE DCval,[pDCTemp]
        MOV     pDeltaArg0,pDelta                           ;// Set up r0 for armVCM4P10_TransformResidual4x4
        MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_TransformResidual4x4
        STRHNE  DCval,[pDelta]
        
        ;// Called with r0 == r1 == pDelta
        BL      armVCM4P10_TransformResidual4x4
        B       OutDCcase 
        

DCcase
        ;// pDCTemp is dereferenced unconditionally on this path
        LDRSH   DCval,[pDCTemp] 
        ADD     DCval,DCval,#32                             ;// add 32, then arithmetic shift right by 6
        ASR     DCval,DCval,#6
        PKHBT   DCval,DCval,DCval,LSL #16                  ;// Duplicating the Lower halfword
        MOV     DCvalCopy, DCval                           ;// Needed for STRD
        STRD    DCval, [pDelta, #0]                        ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3] = DCval
        STRD    DCval, [pDelta, #8]                        ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7] = DCval
        STRD    DCval, [pDelta, #16]                       ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
        STRD    DCval, [pDelta, #24]                       ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval
        
               
OutDCcase      
        ;// From here r1 and r10 hold the two strides (r10 no longer holds DCval)
        M_LDR   predstep,predStepOnStack
        M_LDR   dstStep,dstStepOnStack
        
        LDMIA   pDelta!,{tmpDeltaVal,DeltaVal2}             ;// Pre load
        MOV     ycounter,#4                                 ;// Counter for the PredPlusDeltaLoop
        LDR     PredVal,[pPredTemp]                         ;// Pre load

;// One iteration per row: 4 residuals (two words from the buffer) are added
;// to 4 prediction bytes (one word). The flags set by SUBS are consumed by
;// LDRGT, LDMGTIA and BGT; no instruction in between has an S suffix.
PredPlusDeltaLoop
        
       
        SUBS    ycounter,ycounter,#1
        ADD     pPredTemp,pPredTemp,predstep                ;// Increment pPred ptr
        
        ;// Regroup the residuals as even/odd pairs to match the UXTB16 byte split
        PKHBT   DeltaVal1,tmpDeltaVal,DeltaVal2,LSL #16     ;// Deltaval1 = [C A]   
        PKHTB   DeltaVal2,DeltaVal2,tmpDeltaVal,ASR #16     ;// DeltaVal2 = [D B]
        
        UXTB16  PredVal1,PredVal                            ;// PredVal1 = [0c0a]
        UXTB16  PredVal2,PredVal,ROR #8                     ;// PredVal2 = [0d0b]
        
        ;// Skipped on the last row, so no prediction word is read beyond row 4
        LDRGT   PredVal,[pPredTemp]                         ;// Pre load
        
        QADD16  sum2,DeltaVal2,PredVal2                     ;// Add and saturate to 16 bits
        QADD16  sum1,DeltaVal1,PredVal1
        
        USAT16  sum2,#8,sum2                                ;// armClip(0,255,sum2)
        USAT16  sum1,#8,sum1
        
        ;// Skipped on the last row, so the buffer is not read past its 32 bytes
        LDMGTIA   pDelta!,{tmpDeltaVal,DeltaVal2}           ;// Pre load
          
        ORR     sum1,sum1,sum2,LSL #8                       ;// sum1 = [dcba]
        STR     sum1,[pDstTemp]
        
        ADD     pDstTemp,pDstTemp,dstStep                   ;// Increment pDst ptr
        BGT     PredPlusDeltaLoop  
        
        
        ;// Set return value
        MOV     result,#OMX_Sts_NoErr
        
;// This label is not referenced anywhere in this file
End                

        
        ;// Write function tail
        
        M_END
        
    ENDIF                                                    ;//ARM1136JS   
    
    
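;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd            
    
;// Guarding implementation by the processor name
;// NOTE(review): no guarded body follows this heading. The only implementation
;// of this function in the file is the ARM1136JS one above, so an ARM1136JS_U
;// build gets no definition from here - confirm whether that is intended.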
    
    
         
            
    END