media/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm

changeset 6474c204b198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,212 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_short_walsh4x4_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
+; r0    short *input,
+; r1    short *output,
+; r2    int pitch
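+;
+; The routine computes the 4x4 Walsh-Hadamard transform used for VP8's
+; second order (Y2) block of DC coefficients.  Pass 1 works on rows as
+; they are loaded: qadd16/qsub16 form the butterflies and smuad/smusd
+; against the 0x00040004 constant apply the <<2 scaling, with an extra
+; +1 on the first element of each row when a1 is non-zero (addne).
+; Pass 2 combines the row results column by column and applies the
+; +1-if-negative, +3, >>3 rounding before each halfword store.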
+|vp8_short_walsh4x4_armv6| PROC
+
+    stmdb       sp!, {r4 - r11, lr}
+
+    ldrd        r4, r5, [r0], r2
+    ldr         lr, c00040004
+    ldrd        r6, r7, [r0], r2
+
+    ; 0-3
+    qadd16      r3, r4, r5          ; [d1|a1] [1+3   |   0+2]
+    qsub16      r4, r4, r5          ; [c1|b1] [1-3   |   0-2]
+
+    ldrd        r8, r9, [r0], r2
+    ; 4-7
+    qadd16      r5, r6, r7          ; [d1|a1] [5+7   |   4+6]
+    qsub16      r6, r6, r7          ; [c1|b1] [5-7   |   4-6]
+
+    ldrd        r10, r11, [r0]
+    ; 8-11
+    qadd16      r7, r8, r9          ; [d1|a1] [9+11  |  8+10]
+    qsub16      r8, r8, r9          ; [c1|b1] [9-11  |  8-10]
+
+    ; 12-15
+    qadd16      r9, r10, r11        ; [d1|a1] [13+15 | 12+14]
+    qsub16      r10, r10, r11       ; [c1|b1] [13-15 | 12-14]
+
+
+    lsls        r2, r3, #16
+    smuad       r11, r3, lr         ; A0 = a1<<2 + d1<<2
+    addne       r11, r11, #1        ; A0 += (a1!=0)
+
+    lsls        r2, r7, #16
+    smuad       r12, r7, lr         ; C0 = a1<<2 + d1<<2
+    addne       r12, r12, #1        ; C0 += (a1!=0)
+
+    add         r0, r11, r12        ; a1_0 = A0 + C0
+    sub         r11, r11, r12       ; b1_0 = A0 - C0
+
+    lsls        r2, r5, #16
+    smuad       r12, r5, lr         ; B0 = a1<<2 + d1<<2
+    addne       r12, r12, #1        ; B0 += (a1!=0)
+
+    lsls        r2, r9, #16
+    smuad       r2, r9, lr          ; D0 = a1<<2 + d1<<2
+    addne       r2, r2, #1          ; D0 += (a1!=0)
+
+    add         lr, r12, r2         ; d1_0 = B0 + D0
+    sub         r12, r12, r2        ; c1_0 = B0 - D0
+
+    ; op[0,4,8,12]
+    adds        r2, r0, lr          ; a2 = a1_0 + d1_0
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r0, r0, lr          ; d2 = a1_0 - d1_0
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1]            ; op[0]
+
+    addmi       r0, r0, #1          ; += d2 < 0
+    add         r0, r0, #3          ; += 3
+    ldr         lr, c00040004
+    mov         r0, r0, asr #3      ; >> 3
+    strh        r0, [r1, #24]       ; op[12]
+
+    adds        r2, r11, r12        ; b2 = b1_0 + c1_0
+    addmi       r2, r2, #1          ; += b2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r0, r11, r12        ; c2 = b1_0 - c1_0
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #8]        ; op[4]
+
+    addmi       r0, r0, #1          ; += c2 < 0
+    add         r0, r0, #3          ; += 3
+    smusd       r3, r3, lr          ; A3 = a1<<2 - d1<<2
+    smusd       r7, r7, lr          ; C3 = a1<<2 - d1<<2
+    mov         r0, r0, asr #3      ; >> 3
+    strh        r0, [r1, #16]       ; op[8]
+
+
+    ; op[3,7,11,15]
+    add         r0, r3, r7          ; a1_3 = A3 + C3
+    sub         r3, r3, r7          ; b1_3 = A3 - C3
+
+    smusd       r5, r5, lr          ; B3 = a1<<2 - d1<<2
+    smusd       r9, r9, lr          ; D3 = a1<<2 - d1<<2
+    add         r7, r5, r9          ; d1_3 = B3 + D3
+    sub         r5, r5, r9          ; c1_3 = B3 - D3
+
+    adds        r2, r0, r7          ; a2 = a1_3 + d1_3
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r3, r5          ; b2 = b1_3 + c1_3
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #6]        ; op[3]
+
+    addmi       r9, r9, #1          ; += b2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r3, r5          ; c2 = b1_3 - c1_3
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #14]       ; op[7]
+
+    addmi       r2, r2, #1          ; += c2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r0, r7          ; d2 = a1_3 - d1_3
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #22]       ; op[11]
+
+    addmi       r9, r9, #1          ; += d2 < 0
+    add         r9, r9, #3          ; += 3
+    smuad       r3, r4, lr          ; A1 = b1<<2 + c1<<2
+    smuad       r5, r8, lr          ; C1 = b1<<2 + c1<<2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #30]       ; op[15]
   1.138 +
   1.139 +    ; op[1,5,9,13]
   1.140 +    add         r0, r3, r5          ; a1_1 = A1 + C1
   1.141 +    sub         r3, r3, r5          ; b1_1 = A1 - C1
   1.142 +
   1.143 +    smuad       r7, r6, lr          ; B1 = b1<<2 + c1<<2
   1.144 +    smuad       r9, r10, lr         ; D1 = b1<<2 + c1<<2
   1.145 +    add         r5, r7, r9          ; d1_1 = B1 + D1
   1.146 +    sub         r7, r7, r9          ; c1_1 = B1 - D1
   1.147 +
+    adds        r2, r0, r5          ; a2 = a1_1 + d1_1
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r3, r7          ; b2 = b1_1 + c1_1
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #2]        ; op[1]
+
+    addmi       r9, r9, #1          ; += b2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r3, r7          ; c2 = b1_1 - c1_1
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #10]       ; op[5]
+
+    addmi       r2, r2, #1          ; += c2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r0, r5          ; d2 = a1_1 - d1_1
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #18]       ; op[9]
+
+    addmi       r9, r9, #1          ; += d2 < 0
+    add         r9, r9, #3          ; += 3
+    smusd       r4, r4, lr          ; A2 = b1<<2 - c1<<2
+    smusd       r8, r8, lr          ; C2 = b1<<2 - c1<<2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #26]       ; op[13]
   1.173 +
   1.174 +
   1.175 +    ; op[2,6,10,14]
   1.176 +    add         r11, r4, r8         ; a1_2 = A2 + C2
   1.177 +    sub         r12, r4, r8         ; b1_2 = A2 - C2
   1.178 +
   1.179 +    smusd       r6, r6, lr          ; B2 = b1<<2 - c1<<2
   1.180 +    smusd       r10, r10, lr        ; D2 = b1<<2 - c1<<2
   1.181 +    add         r4, r6, r10         ; d1_2 = B2 + D2
   1.182 +    sub         r8, r6, r10         ; c1_2 = B2 - D2
   1.183 +
+    adds        r2, r11, r4         ; a2 = a1_2 + d1_2
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r12, r8         ; b2 = b1_2 + c1_2
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #4]        ; op[2]
+
+    addmi       r9, r9, #1          ; += b2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r12, r8         ; c2 = b1_2 - c1_2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #12]       ; op[6]
+
+    addmi       r2, r2, #1          ; += c2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r11, r4         ; d2 = a1_2 - d1_2
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #20]       ; op[10]
+
+    addmi       r9, r9, #1          ; += d2 < 0
+    add         r9, r9, #3          ; += 3
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #28]       ; op[14]
+
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_short_walsh4x4_armv6|
+
+c00040004
+    DCD         0x00040004
+
+    END
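
For reference, the computation the assembly performs can be sketched in scalar C. The sketch below is reconstructed from the routine's own comments rather than taken from the upstream vp8_short_walsh4x4_c source; the function name walsh4x4_ref is illustrative, and pitch is assumed to be the input row stride in bytes, matching the ldrd ..., [r0], r2 addressing above.

    #include <stdint.h>

    /* Illustrative reference sketch of the transform implemented above.
     * pitch is the input row stride in bytes; samples are 16-bit. */
    static void walsh4x4_ref(const int16_t *input, int16_t *output, int pitch)
    {
        int tmp[16];

        /* Pass 1: rows.  Mirrors the qadd16/qsub16 butterflies and the
         * smuad/smusd multiplies by 0x00040004 (a <<2 scale). */
        for (int i = 0; i < 4; i++) {
            const int16_t *ip =
                (const int16_t *)((const char *)input + i * pitch);
            int a1 = (ip[0] + ip[2]) << 2;
            int d1 = (ip[1] + ip[3]) << 2;
            int c1 = (ip[1] - ip[3]) << 2;
            int b1 = (ip[0] - ip[2]) << 2;

            tmp[i * 4 + 0] = a1 + d1 + (a1 != 0);   /* the addne #1 term */
            tmp[i * 4 + 1] = b1 + c1;
            tmp[i * 4 + 2] = b1 - c1;
            tmp[i * 4 + 3] = a1 - d1;
        }

        /* Pass 2: columns, with the +1-if-negative, +3, >>3 rounding.
         * Assumes arithmetic right shift, as asr provides on ARM. */
        for (int i = 0; i < 4; i++) {
            int a1 = tmp[i]     + tmp[8 + i];       /* a1_n = An + Cn */
            int b1 = tmp[i]     - tmp[8 + i];       /* b1_n = An - Cn */
            int d1 = tmp[4 + i] + tmp[12 + i];      /* d1_n = Bn + Dn */
            int c1 = tmp[4 + i] - tmp[12 + i];      /* c1_n = Bn - Dn */

            int a2 = a1 + d1;
            int b2 = b1 + c1;
            int c2 = b1 - c1;
            int d2 = a1 - d1;

            output[i]      = (int16_t)((a2 + (a2 < 0) + 3) >> 3);
            output[4 + i]  = (int16_t)((b2 + (b2 < 0) + 3) >> 3);
            output[8 + i]  = (int16_t)((c2 + (c2 < 0) + 3) >> 3);
            output[12 + i] = (int16_t)((d2 + (d2 < 0) + 3) >> 3);
        }
    }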
