PageRenderTime 21ms CodeModel.GetById 15ms app.highlight 4ms RepoModel.GetById 1ms app.codeStats 0ms

/media/libvpx/vp8/encoder/arm/armv6/vp8_variance8x8_armv6.asm

http://github.com/zpao/v8monkey
Assembly | 101 lines | 55 code | 23 blank | 23 comment | 0 complexity | b4d46b33e1a06a552ff29963590ead42 MD5 | raw file
  1;
  2;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
  3;
  4;  Use of this source code is governed by a BSD-style license
  5;  that can be found in the LICENSE file in the root of the source
  6;  tree. An additional intellectual property rights grant can be found
  7;  in the file PATENTS.  All contributing project authors may
  8;  be found in the AUTHORS file in the root of the source tree.
  9;
 10
 11
 12    EXPORT  |vp8_variance8x8_armv6|
 13
 14    ARM
 15
 16    AREA ||.text||, CODE, READONLY, ALIGN=2
 17
 18; r0    unsigned char *src_ptr
 19; r1    int source_stride
 20; r2    unsigned char *ref_ptr
 21; r3    int  recon_stride
 22; stack unsigned int *sse
 23|vp8_variance8x8_armv6| PROC
   ; Walks 8 rows of 8 bytes each from src_ptr and ref_ptr and accumulates
   ; two values: the signed sum of the per-byte differences (src - ref)
   ; and the sum of the squared differences. The sum of squares is stored
   ; through the sse pointer, and the routine returns
   ;     r0 = sse - ((sum * sum) >> 6)
   ; where the shift by 6 divides by the 64 bytes processed.
   ;
   ; Register use inside the routine:
   ;   r4      running signed sum of differences
   ;   r5      running sum of squared differences
   ;   r12     rows remaining
   ;   lr      constant zero operand for sel / usad8
   ;   r6-r10  per-group temporaries
   ;
   ; NOTE(review): pixels are fetched with word-sized ldr straight from
   ; src_ptr / ref_ptr and nothing here aligns those pointers; presumably
   ; the target permits unaligned word loads -- confirm against callers.
 24
 25    push    {r4-r10, lr}        ; save 8 registers (32 bytes); lr is reused as scratch below
 26
 27    pld     [r0, r1, lsl #0]    ; prefetch src at r0 + r1 (one row ahead)
 28    pld     [r2, r3, lsl #0]    ; prefetch ref at r2 + r3 (one row ahead)
 29
 30    mov     r12, #8             ; set loop counter to 8 (=block height)
 31    mov     r4, #0              ; initialize sum = 0
 32    mov     r5, #0              ; initialize sse = 0
 33
 34loop
   ; One iteration handles one row as two groups of 4 bytes (offsets 0
   ; and 4). For each group usub8 is executed in both directions; sel
   ; then keeps only the bytes whose subtraction did not borrow (GE flag
   ; set) and substitutes zero elsewhere. That yields one word holding
   ; the magnitudes where src > ref and one holding the magnitudes where
   ; ref > src; bytes that are equal come out as zero in both.
 35    ; 1st 4 pixels
 36    ldr     r6, [r0, #0x0]      ; load 4 src pixels
 37    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
 38
 39    mov     lr, #0              ; constant zero (rewritten every row; nothing else in the loop writes lr)
 40
 41    usub8   r8, r6, r7          ; per-byte src - ref, sets the GE flags used by sel
 42    pld     [r0, r1, lsl #1]    ; prefetch src at r0 + 2 * r1 (two rows ahead)
 43    sel     r10, r8, lr         ; select bytes with positive difference
 44    usub8   r9, r7, r6          ; per-byte ref - src, sets the GE flags used by sel
 45    pld     [r2, r3, lsl #1]    ; prefetch ref at r2 + 2 * r3 (two rows ahead)
 46    sel     r8, r9, lr          ; select bytes with negative difference (as magnitudes)
 47
 48    ; calculate partial sums
 49    usad8   r6, r10, lr         ; calculate sum of positive differences
 50    usad8   r7, r8, lr          ; calculate sum of negative differences
 51    orr     r8, r8, r10         ; at most one side is nonzero per byte, so this is the absolute difference of all 4 pixels
 52    ; calculate total sum
 53    add    r4, r4, r6           ; add positive differences to sum
 54    sub    r4, r4, r7           ; subtract negative differences from sum
 55
 56    ; calculate sse
 57    uxtb16  r7, r8              ; bytes 0 and 2 (two pixels) to halfwords
 58    uxtb16  r10, r8, ror #8     ; bytes 1 and 3 (the other two pixels) to halfwords
 59    smlad   r5, r7, r7, r5      ; sse += square of each halfword in r7 (1)
 60
 61    ; 2nd 4 pixels
   ; The loads for the second group are issued before the second smlad
   ; of the first group; r10 still holds the first group's bytes 1 and 3.
 62    ldr     r6, [r0, #0x4]      ; load 4 src pixels
 63    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
 64    smlad   r5, r10, r10, r5    ; sse += square of each halfword in r10 (2)
 65
 66    usub8   r8, r6, r7          ; per-byte src - ref, sets the GE flags used by sel
 67    add     r0, r0, r1          ; set src_ptr to next row (plain add, flags untouched)
 68    sel     r10, r8, lr         ; select bytes with positive difference
 69    usub8   r9, r7, r6          ; per-byte ref - src, sets the GE flags used by sel
 70    add     r2, r2, r3          ; set ref_ptr to next row (plain add, flags untouched)
 71    sel     r8, r9, lr          ; select bytes with negative difference (as magnitudes)
 72
 73    ; calculate partial sums
 74    usad8   r6, r10, lr         ; calculate sum of positive differences
 75    usad8   r7, r8, lr          ; calculate sum of negative differences
 76    orr     r8, r8, r10         ; at most one side is nonzero per byte, so this is the absolute difference of all 4 pixels
 77
 78    ; calculate total sum
 79    add     r4, r4, r6          ; add positive differences to sum
 80    sub     r4, r4, r7          ; subtract negative differences from sum
 81
 82    ; calculate sse
 83    uxtb16  r7, r8              ; bytes 0 and 2 (two pixels) to halfwords
 84    uxtb16  r10, r8, ror #8     ; bytes 1 and 3 (the other two pixels) to halfwords
 85    smlad   r5, r7, r7, r5      ; sse += square of each halfword in r7 (1)
 86    subs    r12, r12, #1        ; next row; sets Z for the bne below
 87    smlad   r5, r10, r10, r5    ; sse += square of each halfword in r10 (2); does not alter Z
 88
 89    bne     loop                ; repeat until all 8 rows are done
 90
 91    ; return stuff
   ; The sse pointer is the first stack argument; it sits 32 bytes above
   ; sp because of the 8 registers pushed on entry.
   ; The magnitude of sum is at most 64 * 255, so sum * sum fits in 31
   ; bits and the arithmetic shift below never sees a negative value.
 92    ldr     r8, [sp, #32]       ; get address of sse
 93    mul     r1, r4, r4          ; sum * sum
 94    str     r5, [r8]            ; store sse
 95    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))
 96
 97    pop     {r4-r10, pc}        ; restore saved registers and return (saved lr goes to pc)
 98
 99    ENDP
100
101    END