media/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,138 @@
     1.4 +;
     1.5 +;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +
    1.15 +    EXPORT  |vp8_mse16x16_armv6|            ; make the routine's symbol visible to other objects
    1.16 +
    1.17 +    ARM                                     ; following instructions are ARM (not Thumb) encodings
    1.18 +
    1.19 +    AREA ||.text||, CODE, READONLY, ALIGN=2 ; read-only code area, aligned to 2^2 = 4 bytes
    1.20 +
    1.21 +; r0    unsigned char *src_ptr
    1.22 +; r1    int source_stride
    1.23 +; r2    unsigned char *ref_ptr
    1.24 +; r3    int  recon_stride
    1.25 +; stack unsigned int *sse
    1.26 +;
    1.27 +;note: Based on vp8_variance16x16_armv6. The sum of differences is never used
    1.28 +;      in this function, so that part of the calculation can be removed.
    1.29 +
    1.30 +|vp8_mse16x16_armv6| PROC
    1.31 +; Accumulates the squared difference of every src/ref pixel pair over 16
    1.32 +; rows of 16 pixels, stores the total to *sse and also returns it in r0.
    1.33 +;
    1.34 +; Register use:
    1.35 +;   r4     sse accumulator
    1.36 +;   r5-r9  scratch: loaded pixel words and per-byte differences
    1.37 +;   r12    rows left to process
    1.38 +;   lr     constant zero operand for sel (loop-invariant, set once)
    1.39 +;   sse pointer is read from [sp, #28]: the 7 pushed registers take 28 bytes
    1.40 +
    1.41 +    push    {r4-r9, lr}
    1.42 +
    1.43 +    pld     [r0, r1, lsl #0]
    1.44 +    pld     [r2, r3, lsl #0]
    1.45 +
    1.46 +    mov     r12, #16            ; set loop counter to 16 (=block height)
    1.47 +    mov     r4, #0              ; initialize sse = 0
    1.48 +    mov     lr, #0              ; constant zero, never modified inside the loop
    1.49 +
    1.50 +loop
    1.51 +    ; 1st 4 pixels
    1.52 +    ldr     r5, [r0, #0x0]      ; load 4 src pixels
    1.53 +    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
    1.54 +
    1.55 +    usub8   r8, r5, r6          ; per-byte src - ref, sets GE where src >= ref
    1.56 +    pld     [r0, r1, lsl #1]
    1.57 +    sel     r7, r8, lr          ; select bytes with positive difference
    1.58 +    usub8   r9, r6, r5          ; calculate difference with reversed operands
    1.59 +    pld     [r2, r3, lsl #1]
    1.60 +    sel     r8, r9, lr          ; select bytes with negative difference
    1.61 +
    1.62 +    ; merge both selections: each byte lane is non-zero in at most one of them
    1.63 +    orr     r8, r8, r7          ; differences of all 4 pixels
    1.64 +
    1.65 +    ldr     r5, [r0, #0x4]      ; load 4 src pixels
    1.66 +
    1.67 +    ; calculate sse
    1.68 +    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    1.69 +    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    1.70 +    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
    1.71 +
    1.72 +    ; 2nd 4 pixels
    1.73 +    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
    1.74 +    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
    1.75 +
    1.76 +    usub8   r8, r5, r6          ; per-byte src - ref, sets GE where src >= ref
    1.77 +    sel     r7, r8, lr          ; select bytes with positive difference
    1.78 +    usub8   r9, r6, r5          ; calculate difference with reversed operands
    1.79 +    sel     r8, r9, lr          ; select bytes with negative difference
    1.80 +
    1.81 +    ; merge both selections: each byte lane is non-zero in at most one of them
    1.82 +    orr     r8, r8, r7          ; differences of all 4 pixels
    1.83 +    ldr     r5, [r0, #0x8]      ; load 4 src pixels
    1.84 +    ; calculate sse
    1.85 +    uxtb16  r6, r8              ; byte (two pixels) to halfwords
    1.86 +    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
    1.87 +    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
    1.88 +
    1.89 +    ; 3rd 4 pixels
    1.90 +    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
    1.91 +    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
    1.92 +
    1.93 +    usub8   r8, r5, r6          ; per-byte src - ref, sets GE where src >= ref
    1.94 +    sel     r7, r8, lr          ; select bytes with positive difference
    1.95 +    usub8   r9, r6, r5          ; calculate difference with reversed operands
    1.96 +    sel     r8, r9, lr          ; select bytes with negative difference
    1.97 +
    1.98 +    ; merge both selections: each byte lane is non-zero in at most one of them
    1.99 +    orr     r8, r8, r7          ; differences of all 4 pixels
   1.100 +
   1.101 +    ldr     r5, [r0, #0xc]      ; load 4 src pixels
   1.102 +
   1.103 +    ; calculate sse
   1.104 +    uxtb16  r6, r8              ; byte (two pixels) to halfwords
   1.105 +    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
   1.106 +    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
   1.107 +
   1.108 +    ; 4th 4 pixels
   1.109 +    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
   1.110 +    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
   1.111 +
   1.112 +    usub8   r8, r5, r6          ; per-byte src - ref, sets GE where src >= ref
   1.113 +    add     r0, r0, r1          ; set src_ptr to next row
   1.114 +    sel     r7, r8, lr          ; select bytes with positive difference
   1.115 +    usub8   r9, r6, r5          ; calculate difference with reversed operands
   1.116 +    add     r2, r2, r3          ; set ref_ptr to next row
   1.117 +    sel     r8, r9, lr          ; select bytes with negative difference
   1.118 +
   1.119 +    ; merge both selections: each byte lane is non-zero in at most one of them
   1.120 +    orr     r8, r8, r7          ; differences of all 4 pixels
   1.121 +
   1.122 +    subs    r12, r12, #1        ; next row (flags survive until bne: nothing below sets them)
   1.123 +
   1.124 +    ; calculate sse
   1.125 +    uxtb16  r6, r8              ; byte (two pixels) to halfwords
   1.126 +    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
   1.127 +    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
   1.128 +    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
   1.129 +
   1.130 +    bne     loop
   1.131 +
   1.132 +    ; return stuff
   1.133 +    ldr     r1, [sp, #28]       ; get address of sse
   1.134 +    mov     r0, r4              ; return sse
   1.135 +    str     r4, [r1]            ; store sse
   1.136 +
   1.137 +    pop     {r4-r9, pc}
   1.138 +
   1.139 +    ENDP
   1.140 +
   1.141 +    END

mercurial