--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libvpx/vp9/common/arm/neon/vp9_convolve8_neon.asm	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,280 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    ; These functions are only valid when:
+    ; x_step_q4 == 16
+    ; w%4 == 0
+    ; h%4 == 0
+    ; taps == 8
+    ; VP9_FILTER_WEIGHT == 128
+    ; VP9_FILTER_SHIFT == 7
+
+    EXPORT  |vp9_convolve8_horiz_neon|
+    EXPORT  |vp9_convolve8_vert_neon|
+    IMPORT  |vp9_convolve8_horiz_c|
+    IMPORT  |vp9_convolve8_vert_c|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Multiply and accumulate by q0
+    MACRO
+    MULTIPLY_BY_Q0 $dst, $src0, $src1, $src2, $src3, $src4, $src5, $src6, $src7
+    vmull.s16 $dst, $src0, d0[0]
+    vmlal.s16 $dst, $src1, d0[1]
+    vmlal.s16 $dst, $src2, d0[2]
+    vmlal.s16 $dst, $src3, d0[3]
+    vmlal.s16 $dst, $src4, d1[0]
+    vmlal.s16 $dst, $src5, d1[1]
+    vmlal.s16 $dst, $src6, d1[2]
+    vmlal.s16 $dst, $src7, d1[3]
+    MEND
+
+; r0    const uint8_t *src
+; r1    int src_stride
+; r2    uint8_t *dst
+; r3    int dst_stride
+; sp[]const int16_t *filter_x
+; sp[]int x_step_q4
+; sp[]const int16_t *filter_y ; unused
+; sp[]int y_step_q4           ; unused
+; sp[]int w
+; sp[]int h
+
+|vp9_convolve8_horiz_neon| PROC
+    ldr             r12, [sp, #4]           ; x_step_q4
+    cmp             r12, #16
+    bne             vp9_convolve8_horiz_c
+
+    push            {r4-r10, lr}
+
+    sub             r0, r0, #3              ; adjust for taps
+
+    ldr             r5, [sp, #32]           ; filter_x
+    ldr             r6, [sp, #48]           ; w
+    ldr             r7, [sp, #52]           ; h
+
+    vld1.s16        {q0}, [r5]              ; filter_x
+
+    sub             r8, r1, r1, lsl #2      ; -src_stride * 3
+    add             r8, r8, #4              ; -src_stride * 3 + 4
+
+    sub             r4, r3, r3, lsl #2      ; -dst_stride * 3
+    add             r4, r4, #4              ; -dst_stride * 3 + 4
+
+    rsb             r9, r6, r1, lsl #2      ; reset src for outer loop
+    sub             r9, r9, #7
+    rsb             r12, r6, r3, lsl #2     ; reset dst for outer loop
+
+    mov             r10, r6                 ; w loop counter
+
+loop_horiz_v
+    vld1.8          {d24}, [r0], r1
+    vld1.8          {d25}, [r0], r1
+    vld1.8          {d26}, [r0], r1
+    vld1.8          {d27}, [r0], r8
+
+    vtrn.16         q12, q13
+    vtrn.8          d24, d25
+    vtrn.8          d26, d27
+
+    pld             [r0, r1, lsl #2]
+
+    vmovl.u8        q8, d24
+    vmovl.u8        q9, d25
+    vmovl.u8        q10, d26
+    vmovl.u8        q11, d27
+
+    ; save a few instructions in the inner loop
+    vswp            d17, d18
+    vmov            d23, d21
+
+    add             r0, r0, #3
+
+loop_horiz
+    add             r5, r0, #64
+
+    vld1.32         {d28[]}, [r0], r1
+    vld1.32         {d29[]}, [r0], r1
+    vld1.32         {d31[]}, [r0], r1
+    vld1.32         {d30[]}, [r0], r8
+
+    pld             [r5]
+
+    vtrn.16         d28, d31
+    vtrn.16         d29, d30
+    vtrn.8          d28, d29
+    vtrn.8          d31, d30
+
+    pld             [r5, r1]
+
+    ; extract to s16
+    vtrn.32         q14, q15
+    vmovl.u8        q12, d28
+    vmovl.u8        q13, d29
+
+    pld             [r5, r1, lsl #1]
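+
+    ; After the transposes above, each d register in d16-d27 holds one
+    ; filter-tap position for four output pixels (widened to s16), so
+    ; every MULTIPLY_BY_Q0 below evaluates four outputs in parallel
+    ; against the eight taps kept in q0.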
+
+    ; src[] * filter_x
+    MULTIPLY_BY_Q0  q1,  d16, d17, d20, d22, d18, d19, d23, d24
+    MULTIPLY_BY_Q0  q2,  d17, d20, d22, d18, d19, d23, d24, d26
+    MULTIPLY_BY_Q0  q14, d20, d22, d18, d19, d23, d24, d26, d27
+    MULTIPLY_BY_Q0  q15, d22, d18, d19, d23, d24, d26, d27, d25
+
+    pld             [r5, -r8]
+
+    ; += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    ; saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    ; transpose
+    vtrn.16         d2, d3
+    vtrn.32         d2, d3
+    vtrn.8          d2, d3
+
+    vst1.u32        {d2[0]}, [r2@32], r3
+    vst1.u32        {d3[0]}, [r2@32], r3
+    vst1.u32        {d2[1]}, [r2@32], r3
+    vst1.u32        {d3[1]}, [r2@32], r4
+
+    vmov            q8,  q9
+    vmov            d20, d23
+    vmov            q11, q12
+    vmov            q9,  q13
+
+    subs            r6, r6, #4              ; w -= 4
+    bgt             loop_horiz
+
+    ; outer loop
+    mov             r6, r10                 ; restore w counter
+    add             r0, r0, r9              ; src += src_stride * 4 - w
+    add             r2, r2, r12             ; dst += dst_stride * 4 - w
+    subs            r7, r7, #4              ; h -= 4
+    bgt             loop_horiz_v
+
+    pop             {r4-r10, pc}
+
+    ENDP
+
+|vp9_convolve8_vert_neon| PROC
+    ldr             r12, [sp, #12]
+    cmp             r12, #16
+    bne             vp9_convolve8_vert_c
+
+    push            {r4-r8, lr}
+
+    ; adjust for taps
+    sub             r0, r0, r1
+    sub             r0, r0, r1, lsl #1
+
+    ldr             r4, [sp, #32]           ; filter_y
+    ldr             r6, [sp, #40]           ; w
+    ldr             lr, [sp, #44]           ; h
+
+    vld1.s16        {q0}, [r4]              ; filter_y
+
+    lsl             r1, r1, #1
+    lsl             r3, r3, #1
+
+loop_vert_h
+    mov             r4, r0
+    add             r7, r0, r1, asr #1
+    mov             r5, r2
+    add             r8, r2, r3, asr #1
+    mov             r12, lr                 ; h loop counter
+
+    vld1.u32        {d16[0]}, [r4], r1
+    vld1.u32        {d16[1]}, [r7], r1
+    vld1.u32        {d18[0]}, [r4], r1
+    vld1.u32        {d18[1]}, [r7], r1
+    vld1.u32        {d20[0]}, [r4], r1
+    vld1.u32        {d20[1]}, [r7], r1
+    vld1.u32        {d22[0]}, [r4], r1
+
+    vmovl.u8        q8, d16
+    vmovl.u8        q9, d18
+    vmovl.u8        q10, d20
+    vmovl.u8        q11, d22
+
+loop_vert
+    ; always process a 4x4 block at a time
+    vld1.u32        {d24[0]}, [r7], r1
+    vld1.u32        {d26[0]}, [r4], r1
+    vld1.u32        {d26[1]}, [r7], r1
+    vld1.u32        {d24[1]}, [r4], r1
+
+    ; extract to s16
+    vmovl.u8        q12, d24
+    vmovl.u8        q13, d26
+
+    pld             [r5]
+    pld             [r8]
+
+    ; src[] * filter_y
+    MULTIPLY_BY_Q0  q1,  d16, d17, d18, d19, d20, d21, d22, d24
+
+    pld             [r5, r3]
+    pld             [r8, r3]
+
+    MULTIPLY_BY_Q0  q2,  d17, d18, d19, d20, d21, d22, d24, d26
+
+    pld             [r7]
+    pld             [r4]
+
+    MULTIPLY_BY_Q0  q14, d18, d19, d20, d21, d22, d24, d26, d27
+
+    pld             [r7, r1]
+    pld             [r4, r1]
+
+    MULTIPLY_BY_Q0  q15, d19, d20, d21, d22, d24, d26, d27, d25
+
+    ; += 64 >> 7
+    vqrshrun.s32    d2, q1, #7
+    vqrshrun.s32    d3, q2, #7
+    vqrshrun.s32    d4, q14, #7
+    vqrshrun.s32    d5, q15, #7
+
+    ; saturate
+    vqmovn.u16      d2, q1
+    vqmovn.u16      d3, q2
+
+    vst1.u32        {d2[0]}, [r5@32], r3
+    vst1.u32        {d2[1]}, [r8@32], r3
+    vst1.u32        {d3[0]}, [r5@32], r3
+    vst1.u32        {d3[1]}, [r8@32], r3
+
+    vmov            q8, q10
+    vmov            d18, d22
+    vmov            d19, d24
+    vmov            q10, q13
+    vmov            d22, d25
+
+    subs            r12, r12, #4            ; h -= 4
+    bgt             loop_vert
+
+    ; outer loop
+    add             r0, r0, #4
+    add             r2, r2, #4
+    subs            r6, r6, #4              ; w -= 4
+    bgt             loop_vert_h
+
+    pop             {r4-r8, pc}
+
+    ENDP
+    END
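
For reference, the arithmetic that each MULTIPLY_BY_Q0 / vqrshrun.s32 / vqmovn.u16 sequence above performs is the plain 8-tap convolution with VP9_FILTER_WEIGHT == 128 and VP9_FILTER_SHIFT == 7. The scalar C sketch below illustrates that fixed-point computation for the horizontal case; convolve8_horiz_ref and clip_u8 are hypothetical names for illustration, not the actual vp9_convolve8_horiz_c fallback.

    #include <stdint.h>
    #include <stdio.h>

    /* Saturate an int to the uint8_t range, as vqmovn.u16 does above. */
    static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

    /* Hypothetical scalar reference, NOT the real vp9_convolve8_horiz_c:
     * each output pixel is an 8-tap sum of src * filter products, rounded
     * with "+= 64 >> 7" (VP9_FILTER_SHIFT == 7) and saturated to 8 bits. */
    static void convolve8_horiz_ref(const uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride,
                                    const int16_t *filter_x, int w, int h)
    {
        src -= 3;                                 /* matches "sub r0, r0, #3" */
        for (int y = 0; y < h; ++y) {
            for (int x = 0; x < w; ++x) {
                int sum = 0;
                for (int k = 0; k < 8; ++k)       /* taps == 8 */
                    sum += src[x + k] * filter_x[k];
                dst[x] = clip_u8((sum + 64) >> 7);
            }
            src += src_stride;
            dst += dst_stride;
        }
    }

    int main(void)
    {
        /* Identity filter: full weight (128) on the center tap. */
        const int16_t filter[8] = { 0, 0, 0, 128, 0, 0, 0, 0 };
        uint8_t src[11] = { 0, 0, 0, 10, 20, 30, 40, 0, 0, 0, 0 };
        uint8_t dst[4];

        convolve8_horiz_ref(src + 3, 11, dst, 4, filter, 4, 1);
        for (int i = 0; i < 4; ++i)
            printf("%d ", dst[i]);                /* prints: 10 20 30 40 */
        printf("\n");
        return 0;
    }

Unlike the NEON path, this sketch carries no x_step_q4 == 16 or w%4 == 0 / h%4 == 0 restrictions; it only pins down the rounding and saturation behavior the assembly must match.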