--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libvpx/vp8/encoder/arm/armv6/walsh_v6.asm   Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,212 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT |vp8_short_walsh4x4_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
+; r0    short *input,
+; r1    short *output,
+; r2    int pitch
+|vp8_short_walsh4x4_armv6| PROC
+
+    stmdb       sp!, {r4 - r11, lr}
+
+    ldrd        r4, r5, [r0], r2
+    ldr         lr, c00040004
+    ldrd        r6, r7, [r0], r2
+
+    ; 0-3
+    qadd16      r3, r4, r5          ; [d1|a1] [1+3   |   0+2]
+    qsub16      r4, r4, r5          ; [c1|b1] [1-3   |   0-2]
+
+    ldrd        r8, r9, [r0], r2
+    ; 4-7
+    qadd16      r5, r6, r7          ; [d1|a1] [5+7   |   4+6]
+    qsub16      r6, r6, r7          ; [c1|b1] [5-7   |   4-6]
+
+    ldrd        r10, r11, [r0]
+    ; 8-11
+    qadd16      r7, r8, r9          ; [d1|a1] [9+11  |  8+10]
+    qsub16      r8, r8, r9          ; [c1|b1] [9-11  |  8-10]
+
+    ; 12-15
+    qadd16      r9, r10, r11        ; [d1|a1] [13+15 | 12+14]
+    qsub16      r10, r10, r11       ; [c1|b1] [13-15 | 12-14]
+
+
+    lsls        r2, r3, #16
+    smuad       r11, r3, lr         ; A0 = a1<<2 + d1<<2
+    addne       r11, r11, #1        ; A0 += (a1!=0)
+
+    lsls        r2, r7, #16
+    smuad       r12, r7, lr         ; C0 = a1<<2 + d1<<2
+    addne       r12, r12, #1        ; C0 += (a1!=0)
+
+    add         r0, r11, r12        ; a1_0 = A0 + C0
+    sub         r11, r11, r12       ; b1_0 = A0 - C0
+
+    lsls        r2, r5, #16
+    smuad       r12, r5, lr         ; B0 = a1<<2 + d1<<2
+    addne       r12, r12, #1        ; B0 += (a1!=0)
+
+    lsls        r2, r9, #16
+    smuad       r2, r9, lr          ; D0 = a1<<2 + d1<<2
+    addne       r2, r2, #1          ; D0 += (a1!=0)
+
+    add         lr, r12, r2         ; d1_0 = B0 + D0
+    sub         r12, r12, r2        ; c1_0 = B0 - D0
+
+    ; op[0,4,8,12]
+    adds        r2, r0, lr          ; a2 = a1_0 + d1_0
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r0, r0, lr          ; d2 = a1_0 - d1_0
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1]            ; op[0]
+
+    addmi       r0, r0, #1          ; += d2 < 0
+    add         r0, r0, #3          ; += 3
+    ldr         lr, c00040004
+    mov         r0, r0, asr #3      ; >> 3
+    strh        r0, [r1, #24]       ; op[12]
+
+    adds        r2, r11, r12        ; b2 = b1_0 + c1_0
+    addmi       r2, r2, #1          ; += b2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r0, r11, r12        ; c2 = b1_0 - c1_0
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #8]        ; op[4]
+
+    addmi       r0, r0, #1          ; += c2 < 0
+    add         r0, r0, #3          ; += 3
+    smusd       r3, r3, lr          ; A3 = a1<<2 - d1<<2
+    smusd       r7, r7, lr          ; C3 = a1<<2 - d1<<2
+    mov         r0, r0, asr #3      ; >> 3
+    strh        r0, [r1, #16]       ; op[8]
+
+
+    ; op[3,7,11,15]
+    add         r0, r3, r7          ; a1_3 = A3 + C3
+    sub         r3, r3, r7          ; b1_3 = A3 - C3
+
+    smusd       r5, r5, lr          ; B3 = a1<<2 - d1<<2
+    smusd       r9, r9, lr          ; D3 = a1<<2 - d1<<2
+    add         r7, r5, r9          ; d1_3 = B3 + D3
+    sub         r5, r5, r9          ; c1_3 = B3 - D3
+
+    adds        r2, r0, r7          ; a2 = a1_3 + d1_3
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r3, r5          ; b2 = b1_3 + c1_3
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #6]        ; op[3]
+
+    addmi       r9, r9, #1          ; += b2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r3, r5          ; c2 = b1_3 - c1_3
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #14]       ; op[7]
+
+    addmi       r2, r2, #1          ; += c2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r0, r7          ; d2 = a1_3 - d1_3
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #22]       ; op[11]
+
+    addmi       r9, r9, #1          ; += d2 < 0
+    add         r9, r9, #3          ; += 3
+    smuad       r3, r4, lr          ; A1 = b1<<2 + c1<<2
+    smuad       r5, r8, lr          ; C1 = b1<<2 + c1<<2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #30]       ; op[15]
+
+    ; op[1,5,9,13]
+    add         r0, r3, r5          ; a1_1 = A1 + C1
+    sub         r3, r3, r5          ; b1_1 = A1 - C1
+
+    smuad       r7, r6, lr          ; B1 = b1<<2 + c1<<2
+    smuad       r9, r10, lr         ; D1 = b1<<2 + c1<<2
+    add         r5, r7, r9          ; d1_1 = B1 + D1
+    sub         r7, r7, r9          ; c1_1 = B1 - D1
+
+    adds        r2, r0, r5          ; a2 = a1_1 + d1_1
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r3, r7          ; b2 = b1_1 + c1_1
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #2]        ; op[1]
+
+    addmi       r9, r9, #1          ; += b2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r3, r7          ; c2 = b1_1 - c1_1
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #10]       ; op[5]
+
+    addmi       r2, r2, #1          ; += c2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r0, r5          ; d2 = a1_1 - d1_1
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #18]       ; op[9]
+
+    addmi       r9, r9, #1          ; += d2 < 0
+    add         r9, r9, #3          ; += 3
+    smusd       r4, r4, lr          ; A2 = b1<<2 - c1<<2
+    smusd       r8, r8, lr          ; C2 = b1<<2 - c1<<2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #26]       ; op[13]
+
+
+    ; op[2,6,10,14]
+    add         r11, r4, r8         ; a1_2 = A2 + C2
+    sub         r12, r4, r8         ; b1_2 = A2 - C2
+
+    smusd       r6, r6, lr          ; B2 = b1<<2 - c1<<2
+    smusd       r10, r10, lr        ; D2 = b1<<2 - c1<<2
+    add         r4, r6, r10         ; d1_2 = B2 + D2
+    sub         r8, r6, r10         ; c1_2 = B2 - D2
+
+    adds        r2, r11, r4         ; a2 = a1_2 + d1_2
+    addmi       r2, r2, #1          ; += a2 < 0
+    add         r2, r2, #3          ; += 3
+    adds        r9, r12, r8         ; b2 = b1_2 + c1_2
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #4]        ; op[2]
+
+    addmi       r9, r9, #1          ; += b2 < 0
+    add         r9, r9, #3          ; += 3
+    subs        r2, r12, r8         ; c2 = b1_2 - c1_2
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #12]       ; op[6]
+
+    addmi       r2, r2, #1          ; += c2 < 0
+    add         r2, r2, #3          ; += 3
+    subs        r9, r11, r4         ; d2 = a1_2 - d1_2
+    mov         r2, r2, asr #3      ; >> 3
+    strh        r2, [r1, #20]       ; op[10]
+
+    addmi       r9, r9, #1          ; += d2 < 0
+    add         r9, r9, #3          ; += 3
+    mov         r9, r9, asr #3      ; >> 3
+    strh        r9, [r1, #28]       ; op[14]
+
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_short_walsh4x4_armv6|
+
+c00040004
+    DCD         0x00040004
+
+    END
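
The routine above is VP8's 4x4 Walsh-Hadamard transform over the sixteen luma DC coefficients: a horizontal butterfly pass with every term pre-scaled by 4, then a vertical pass finished by a bias-toward-zero rounding and an arithmetic shift right by 3. Below is a minimal scalar sketch in C, reconstructed from the register comments above rather than copied from libvpx (the portable reference is vp8_short_walsh4x4_c in vp8/encoder/dct.c, and the function name used here is illustrative); pitch is taken in bytes, as the ldrd post-increment by r2 implies, so it is halved when stepping 16-bit rows.

    #include <stdint.h>

    /* Scalar sketch of the 4x4 WHT performed by the assembly above. */
    void short_walsh4x4_ref(const int16_t *input, int16_t *output, int pitch)
    {
        int i;
        int a1, b1, c1, d1;     /* horizontal-pass intermediates */
        int a2, b2, c2, d2;     /* vertical-pass results         */
        const int16_t *ip = input;
        int16_t *op = output;

        /* Horizontal pass: the qadd16/qsub16 + smuad/smusd block.
         * The <<2 comes from the 0x00040004 constant; (a1 != 0)
         * mirrors the lsls/addne bias applied to each row's op[0]. */
        for (i = 0; i < 4; i++) {
            a1 = (ip[0] + ip[2]) << 2;
            d1 = (ip[1] + ip[3]) << 2;
            c1 = (ip[1] - ip[3]) << 2;
            b1 = (ip[0] - ip[2]) << 2;

            op[0] = (int16_t)(a1 + d1 + (a1 != 0));
            op[1] = (int16_t)(b1 + c1);
            op[2] = (int16_t)(b1 - c1);
            op[3] = (int16_t)(a1 - d1);

            ip += pitch / 2;    /* pitch is in bytes */
            op += 4;
        }

        ip = output;
        op = output;

        /* Vertical pass: each adds/addmi/add #3/asr #3 group above
         * implements x += (x < 0); x = (x + 3) >> 3 per column. */
        for (i = 0; i < 4; i++) {
            a1 = ip[0] + ip[8];
            b1 = ip[4] + ip[12];
            c1 = ip[4] - ip[12];
            d1 = ip[0] - ip[8];

            a2 = a1 + b1;
            b2 = c1 + d1;
            c2 = d1 - c1;
            d2 = a1 - b1;

            a2 += a2 < 0;
            b2 += b2 < 0;
            c2 += c2 < 0;
            d2 += d2 < 0;

            op[0]  = (int16_t)((a2 + 3) >> 3);
            op[4]  = (int16_t)((b2 + 3) >> 3);
            op[8]  = (int16_t)((c2 + 3) >> 3);
            op[12] = (int16_t)((d2 + 3) >> 3);

            ip++;
            op++;
        }
    }

Note how the assembly fuses the two loops: after the horizontal stage the four packed rows stay in r3-r10, and each column butterfly is rounded and stored with strh directly, so the intermediate 4x4 block never touches memory.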
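
Two ARMv6 idioms do most of the work. With lr holding the literal-pool constant 0x00040004, smuad multiplies both 16-bit halves of a packed [d1|a1] register by 4 and adds the products, while smusd subtracts them; and lsls r2, rX, #16 shifts the low halfword into the full word so the Z flag reflects a1 == 0, letting the following addne apply the (a1 != 0) bias conditionally. A C model of the two multiply instructions for this specific constant (the helper names are hypothetical, not real intrinsics):

    #include <stdint.h>

    /* smuad rd, rn, lr with lr = 0x00040004:
     * bottom halfwords multiply, top halfwords multiply, products add. */
    static int32_t smuad_by4(uint32_t rn)
    {
        int32_t lo = (int16_t)(rn & 0xffff);   /* a1 in [d1|a1] */
        int32_t hi = (int16_t)(rn >> 16);      /* d1 in [d1|a1] */
        return lo * 4 + hi * 4;                /* (a1 + d1) << 2 */
    }

    /* smusd rd, rn, lr: same halfword products, subtracted. */
    static int32_t smusd_by4(uint32_t rn)
    {
        int32_t lo = (int16_t)(rn & 0xffff);
        int32_t hi = (int16_t)(rn >> 16);
        return lo * 4 - hi * 4;                /* (a1 - d1) << 2 */
    }

Packing two halfword multiplies into one smuad/smusd is what lets the routine keep the whole first pass in eight registers with no explicit shifts.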