--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libyuv/source/scale_neon.cc	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,684 @@
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)

// NEON downscalers with interpolation.
// Provided by Fritz Koenig

// Read 32x1 pixels, throw away the even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
    ".p2align     2                            \n"
  "1:                                          \n"
    // load even pixels into q0, odd into q1
    "vld2.8       {q0, q1}, [%0]!              \n"
    "subs         %2, %2, #16                  \n"  // 16 processed per loop
    "vst1.8       {q1}, [%1]!                  \n"  // store odd pixels
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst),              // %1
    "+r"(dst_width)         // %2
  :
  : "q0", "q1", "memory", "cc"  // Clobber List
  );
}

// Read 32x2, average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to a row 2 pointer
    "add          %1, %0                       \n"
    ".p2align     2                            \n"
  "1:                                          \n"
    "vld1.8       {q0, q1}, [%0]!              \n"  // load row 1 and post inc
    "vld1.8       {q2, q3}, [%1]!              \n"  // load row 2 and post inc
    "subs         %3, %3, #16                  \n"  // 16 processed per loop
    "vpaddl.u8    q0, q0                       \n"  // row 1 add adjacent
    "vpaddl.u8    q1, q1                       \n"
    "vpadal.u8    q0, q2                       \n"  // row 2 add adjacent + row 1
    "vpadal.u8    q1, q3                       \n"
    "vrshrn.u16   d0, q0, #2                   \n"  // downshift, round and pack
    "vrshrn.u16   d1, q1, #2                   \n"
    "vst1.8       {q0}, [%2]!                  \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(src_stride),       // %1
    "+r"(dst),              // %2
    "+r"(dst_width)         // %3
  :
  : "q0", "q1", "q2", "q3", "memory", "cc"  // Clobber List
  );
}

void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align     2                            \n"
  "1:                                          \n"
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"  // src line 0
    "subs         %2, %2, #8                   \n"  // 8 processed per loop
    "vst1.8       {d2}, [%1]!                  \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "q0", "q1", "memory", "cc"
  );
}

void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "add          r4, %0, %3                   \n"
    "add          r5, r4, %3                   \n"
    "add          %3, r5, %3                   \n"
    ".p2align     2                            \n"
  "1:                                          \n"
    "vld1.8       {q0}, [%0]!                  \n"  // load up 16x4
    "vld1.8       {q1}, [r4]!                  \n"
    "vld1.8       {q2}, [r5]!                  \n"
    "vld1.8       {q3}, [%3]!                  \n"
    "subs         %2, %2, #4                   \n"
    "vpaddl.u8    q0, q0                       \n"
    "vpadal.u8    q0, q1                       \n"
    "vpadal.u8    q0, q2                       \n"
    "vpadal.u8    q0, q3                       \n"
    "vpaddl.u16   q0, q0                       \n"
    "vrshrn.u32   d0, q0, #4                   \n"  // divide by 16 w/rounding
    "vmovn.u16    d0, q0                       \n"
    "vst1.32      {d0[0]}, [%1]!               \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
  );
}
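
// Illustrative scalar reference (added for exposition; not part of the
// original patch): the 2x2 box kernel above computes a rounded average of
// each 2x2 block -- vpaddl/vpadal sum the four bytes, then vrshrn #2
// divides by 4 with rounding. A minimal C sketch of the same arithmetic:
static void ScaleRowDown2Box_C_Sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst, int dst_width) {
  const uint8* s = src_ptr;               // row 0
  const uint8* t = src_ptr + src_stride;  // row 1
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // round, then /4
    s += 2;
    t += 2;
  }
}
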
// Downscale from 4 to 3 pixels. Use the NEON multilane read/write
// to load every 4th pixel into 4 different registers.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align     2                            \n"
  "1:                                          \n"
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"  // src line 0
    "subs         %2, %2, #24                  \n"
    "vmov         d2, d3                       \n"  // order d0, d1, d2
    "vst3.8       {d0, d1, d2}, [%1]!          \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  :
  : "d0", "d1", "d2", "d3", "memory", "cc"
  );
}

void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8      d24, #3                      \n"
    "add          %3, %0                       \n"
    ".p2align     2                            \n"
  "1:                                          \n"
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"  // src line 0
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"  // src line 1
    "subs         %2, %2, #24                  \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "vmovl.u8     q8, d4                       \n"
    "vmovl.u8     q9, d5                       \n"
    "vmovl.u8     q10, d6                      \n"
    "vmovl.u8     q11, d7                      \n"

    // 3 * line_0 + line_1
    "vmlal.u8     q8, d0, d24                  \n"
    "vmlal.u8     q9, d1, d24                  \n"
    "vmlal.u8     q10, d2, d24                 \n"
    "vmlal.u8     q11, d3, d24                 \n"

    // (3 * line_0 + line_1) >> 2
    "vqrshrn.u16  d0, q8, #2                   \n"
    "vqrshrn.u16  d1, q9, #2                   \n"
    "vqrshrn.u16  d2, q10, #2                  \n"
    "vqrshrn.u16  d3, q11, #2                  \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8     q8, d1                       \n"
    "vmlal.u8     q8, d0, d24                  \n"
    "vqrshrn.u16  d0, q8, #2                   \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8    d1, d1, d2                   \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8     q8, d2                       \n"
    "vmlal.u8     q8, d3, d24                  \n"
    "vqrshrn.u16  d2, q8, #2                   \n"

    "vst3.8       {d0, d1, d2}, [%1]!          \n"

    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  );
}

void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8      d24, #3                      \n"
    "add          %3, %0                       \n"
    ".p2align     2                            \n"
  "1:                                          \n"
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"  // src line 0
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"  // src line 1
    "subs         %2, %2, #24                  \n"
    // average src line 0 with src line 1
    "vrhadd.u8    q0, q0, q2                   \n"
    "vrhadd.u8    q1, q1, q3                   \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8     q3, d1                       \n"
    "vmlal.u8     q3, d0, d24                  \n"
    "vqrshrn.u16  d0, q3, #2                   \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8    d1, d1, d2                   \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8     q3, d2                       \n"
    "vmlal.u8     q3, d3, d24                  \n"
    "vqrshrn.u16  d2, q3, #2                   \n"

    "vst3.8       {d0, d1, d2}, [%1]!          \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  :
  : "q0", "q1", "q2", "q3", "d24", "memory", "cc"
  );
}
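
// Illustrative scalar reference (added for exposition; not part of the
// original patch): both 3/4 box kernels above apply the horizontal taps
// (3,1), (1,1) and (1,3) with rounding once the two source rows have been
// combined. A C sketch of that horizontal pass over one pre-averaged row:
static void ScaleRowDown34_Taps_C_Sketch(const uint8* s, uint8* d,
                                         int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    d[0] = (uint8)((s[0] * 3 + s[1] + 2) >> 2);  // matches vmlal + vqrshrn #2
    d[1] = (uint8)((s[1] + s[2] + 1) >> 1);      // matches vrhadd.u8
    d[2] = (uint8)((s[2] + s[3] * 3 + 2) >> 2);
    d += 3;
    s += 4;
  }
}
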
#define HAS_SCALEROWDOWN38_NEON
static uvec8 kShuf38 =
  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };

// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.8       {q3}, [%3]                   \n"
    ".p2align     2                            \n"
  "1:                                          \n"
    "vld1.8       {d0, d1, d2, d3}, [%0]!      \n"
    "subs         %2, %2, #12                  \n"
    "vtbl.u8      d4, {d0, d1, d2, d3}, d6     \n"
    "vtbl.u8      d5, {d0, d1, d2, d3}, d7     \n"
    "vst1.8       {d4}, [%1]!                  \n"
    "vst1.32      {d5[0]}, [%1]!               \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width)         // %2
  : "r"(&kShuf38)           // %3
  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  );
}

// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.16      {q13}, [%4]                  \n"
    "vld1.8       {q14}, [%5]                  \n"
    "vld1.8       {q15}, [%6]                  \n"
    "add          r4, %0, %3, lsl #1           \n"
    "add          %3, %0                       \n"
    ".p2align     2                            \n"
  "1:                                          \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
    "vld4.8       {d16, d17, d18, d19}, [r4]!  \n"
    "subs         %2, %2, #12                  \n"

    // Shuffle the input data around to align the data
    //  so adjacent data can be added: 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8      d0, d1                       \n"
    "vtrn.u8      d4, d5                       \n"
    "vtrn.u8      d16, d17                     \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8      d2, d3                       \n"
    "vtrn.u8      d6, d7                       \n"
    "vtrn.u8      d18, d19                     \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8    q0, q0                       \n"
    "vpaddl.u8    q2, q2                       \n"
    "vpaddl.u8    q8, q8                       \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8    d3, d3                       \n"
    "vpaddl.u8    d7, d7                       \n"
    "vpaddl.u8    d19, d19                     \n"

    // combine source lines
    "vadd.u16     q0, q2                       \n"
    "vadd.u16     q0, q8                       \n"
    "vadd.u16     d4, d3, d7                   \n"
    "vadd.u16     d4, d19                      \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //             + s[6 + st * 1] + s[7 + st * 1]
    //             + s[6 + st * 2] + s[7 + st * 2]) / 6
    "vqrdmulh.s16 q2, q2, q13                  \n"
    "vmovn.u16    d4, q2                       \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg. This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded. Then do transposes
    //  to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8     q1, d2                       \n"
    "vmovl.u8     q3, d6                       \n"
    "vmovl.u8     q9, d18                      \n"

    // combine source lines
    "vadd.u16     q1, q3                       \n"
    "vadd.u16     q1, q9                       \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32     d2, d3                       \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16     d2, d3                       \n"

    // 0+1+2, 3+4+5
    "vadd.u16     q0, q1                       \n"

    // Need to divide, but can't downshift as the value
    //  isn't a power of 2. So multiply by 65536 / n
    //  and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q15                  \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent
    "vmov.u8      d2, d4                       \n"

    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"

    "vst1.8       {d3}, [%1]!                  \n"
    "vst1.32      {d4[0]}, [%1]!               \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  : "r"(&kMult38_Div6),     // %4
    "r"(&kShuf38_2),        // %5
    "r"(&kMult38_Div9)      // %6
  : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
    "q13", "q14", "q15", "memory", "cc"
  );
}
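
// Illustrative arithmetic sketch (added for exposition; not part of the
// original patch): vqrdmulh.s16 computes a saturating, rounded
// (2 * a * b) >> 16. With b = 65536 / 18, the doubling makes the effective
// factor roughly 1/9, so a 9-pixel sum becomes a 9-pixel average:
static int16_t QRDMulhS16_Sketch(int16_t a, int16_t b) {
  int32_t v = (2 * (int32_t)a * (int32_t)b + 0x8000) >> 16;  // round
  if (v > 32767) v = 32767;    // saturate like vqrdmulh.s16
  if (v < -32768) v = -32768;
  return (int16_t)v;
}
// Example: a sum of nine pixels of value 100 is 900, and
// QRDMulhS16_Sketch(900, 65536 / 18) = 100, the 9-pixel average.
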
// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.16      {q13}, [%4]                  \n"
    "vld1.8       {q14}, [%5]                  \n"
    "add          %3, %0                       \n"
    ".p2align     2                            \n"
  "1:                                          \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    "vld4.8       {d0, d1, d2, d3}, [%0]!      \n"
    "vld4.8       {d4, d5, d6, d7}, [%3]!      \n"
    "subs         %2, %2, #12                  \n"

    // Shuffle the input data around to align the data
    //  so adjacent data can be added: 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8      d0, d1                       \n"
    "vtrn.u8      d4, d5                       \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8      d2, d3                       \n"
    "vtrn.u8      d6, d7                       \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8    q0, q0                       \n"
    "vpaddl.u8    q2, q2                       \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8    d3, d3                       \n"
    "vpaddl.u8    d7, d7                       \n"

    // combine source lines
    "vadd.u16     q0, q2                       \n"
    "vadd.u16     d4, d3, d7                   \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16  d4, q2, #2                   \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    //  0,1 reg and 3 can be added to the 4,5 reg. This
    //  requires expanding from u8 to u16 as the 0,1 and 4,5
    //  registers are already expanded. Then do transposes
    //  to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8     q1, d2                       \n"
    "vmovl.u8     q3, d6                       \n"

    // combine source lines
    "vadd.u16     q1, q3                       \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32     d2, d3                       \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16     d2, d3                       \n"

    // 0+1+2, 3+4+5
    "vadd.u16     q0, q1                       \n"

    // Need to divide, but can't downshift as the value
    //  isn't a power of 2. So multiply by 65536 / n
    //  and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q13                  \n"

    // Align for table lookup, vtbl requires registers to
    //  be adjacent
    "vmov.u8      d2, d4                       \n"

    "vtbl.u8      d3, {d0, d1, d2}, d28        \n"
    "vtbl.u8      d4, {d0, d1, d2}, d29        \n"

    "vst1.8       {d3}, [%1]!                  \n"
    "vst1.32      {d4[0]}, [%1]!               \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst_ptr),          // %1
    "+r"(dst_width),        // %2
    "+r"(src_stride)        // %3
  : "r"(&kMult38_Div6),     // %4
    "r"(&kShuf38_2)         // %5
  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  );
}
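
// Illustrative scalar reference (added for exposition; not part of the
// original patch): for every 8 source pixels over 2 rows, the kernel above
// emits 3 pixels -- two 3x2 box averages (the Div6 multiply) and one 2x2
// box average (vqrshrn #2). A minimal C sketch:
static void ScaleRowDown38_2_Box_C_Sketch(const uint8* s, ptrdiff_t stride,
                                          uint8* d, int dst_width) {
  const uint8* t = s + stride;
  int x;
  for (x = 0; x < dst_width; x += 3) {
    d[0] = (uint8)((s[0] + s[1] + s[2] + t[0] + t[1] + t[2]) / 6);
    d[1] = (uint8)((s[3] + s[4] + s[5] + t[3] + t[4] + t[5]) / 6);
    d[2] = (uint8)((s[6] + s[7] + t[6] + t[7] + 2) >> 2);
    s += 8;
    t += 8;
    d += 3;
  }
}
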
// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
  asm volatile (
    "cmp          %4, #0                       \n"
    "beq          100f                         \n"
    "add          %2, %1                       \n"
    "cmp          %4, #64                      \n"
    "beq          75f                          \n"
    "cmp          %4, #128                     \n"
    "beq          50f                          \n"
    "cmp          %4, #192                     \n"
    "beq          25f                          \n"

    "vdup.8       d5, %4                       \n"
    "rsb          %4, #256                     \n"
    "vdup.8       d4, %4                       \n"
    // General purpose row blend.
  "1:                                          \n"
    "vld1.8       {q0}, [%1]!                  \n"
    "vld1.8       {q1}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vmull.u8     q13, d0, d4                  \n"
    "vmull.u8     q14, d1, d4                  \n"
    "vmlal.u8     q13, d2, d5                  \n"
    "vmlal.u8     q14, d3, d5                  \n"
    "vrshrn.u16   d0, q13, #8                  \n"
    "vrshrn.u16   d1, q14, #8                  \n"
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          1b                           \n"
    "b            99f                          \n"

    // Blend 25 / 75.
  "25:                                         \n"
    "vld1.8       {q0}, [%1]!                  \n"
    "vld1.8       {q1}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vrhadd.u8    q0, q1                       \n"
    "vrhadd.u8    q0, q1                       \n"
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          25b                          \n"
    "b            99f                          \n"

    // Blend 50 / 50.
  "50:                                         \n"
    "vld1.8       {q0}, [%1]!                  \n"
    "vld1.8       {q1}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vrhadd.u8    q0, q1                       \n"
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          50b                          \n"
    "b            99f                          \n"

    // Blend 75 / 25.
  "75:                                         \n"
    "vld1.8       {q1}, [%1]!                  \n"
    "vld1.8       {q0}, [%2]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vrhadd.u8    q0, q1                       \n"
    "vrhadd.u8    q0, q1                       \n"
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          75b                          \n"
    "b            99f                          \n"

    // Blend 100 / 0 - Copy row unchanged.
  "100:                                        \n"
    "vld1.8       {q0}, [%1]!                  \n"
    "subs         %3, %3, #16                  \n"
    "vst1.8       {q0}, [%0]!                  \n"
    "bgt          100b                         \n"

  "99:                                         \n"
    "vst1.8       {d1[7]}, [%0]                \n"  // duplicate last pixel
                                                    //  into the extra byte
  : "+r"(dst_ptr),          // %0
    "+r"(src_ptr),          // %1
    "+r"(src_stride),       // %2
    "+r"(dst_width),        // %3
    "+r"(source_y_fraction) // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
}
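
// Illustrative scalar reference (added for exposition; not part of the
// original patch): the general case above weights the two rows by
// (256 - f) and f, then rounds and shifts by 8 (vmull/vmlal followed by
// vrshrn #8), and finally replicates the last pixel into one extra byte:
static void ScaleFilterRows_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     ptrdiff_t src_stride, int dst_width,
                                     int source_y_fraction) {
  const uint8* s0 = src_ptr;
  const uint8* s1 = src_ptr + src_stride;
  int f1 = source_y_fraction;  // 0..255; 128 is a 50/50 blend
  int f0 = 256 - f1;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((s0[x] * f0 + s1[x] * f1 + 128) >> 8);
  }
  dst_ptr[dst_width] = dst_ptr[dst_width - 1];  // extra byte, as in 99: above
}
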
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
    ".p2align     2                            \n"
  "1:                                          \n"
    // load even pixels into q0, odd into q1
    "vld2.32      {q0, q1}, [%0]!              \n"
    "vld2.32      {q2, q3}, [%0]!              \n"
    "subs         %2, %2, #8                   \n"  // 8 processed per loop
    "vst1.8       {q1}, [%1]!                  \n"  // store odd pixels
    "vst1.8       {q3}, [%1]!                  \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(dst),              // %1
    "+r"(dst_width)         // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to a row 2 pointer
    "add          %1, %1, %0                   \n"
    ".p2align     2                            \n"
  "1:                                          \n"
    "vld4.8       {d0, d2, d4, d6}, [%0]!      \n"  // load 8 ARGB pixels.
    "vld4.8       {d1, d3, d5, d7}, [%0]!      \n"  // load next 8 ARGB pixels.
    "subs         %3, %3, #8                   \n"  // 8 processed per loop.
    "vpaddl.u8    q0, q0                       \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8    q1, q1                       \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8    q2, q2                       \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8    q3, q3                       \n"  // A 16 bytes -> 8 shorts.
    "vld4.8       {d16, d18, d20, d22}, [%1]!  \n"  // load 8 more ARGB pixels.
    "vld4.8       {d17, d19, d21, d23}, [%1]!  \n"  // load last 8 ARGB pixels.
    "vpadal.u8    q0, q8                       \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8    q1, q9                       \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8    q2, q10                      \n"  // R 16 bytes -> 8 shorts.
    "vpadal.u8    q3, q11                      \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16   d0, q0, #2                   \n"  // downshift, round and pack
    "vrshrn.u16   d1, q1, #2                   \n"
    "vrshrn.u16   d2, q2, #2                   \n"
    "vrshrn.u16   d3, q3, #2                   \n"
    "vst4.8       {d0, d1, d2, d3}, [%2]!      \n"
    "bgt          1b                           \n"
  : "+r"(src_ptr),          // %0
    "+r"(src_stride),       // %1
    "+r"(dst),              // %2
    "+r"(dst_width)         // %3
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}
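
// Illustrative scalar reference (added for exposition; not part of the
// original patch): the ARGB 2x2 box kernel above averages each of the four
// channels independently; vld4.8 de-interleaves B, G, R and A into separate
// registers so the same vpaddl/vpadal/vrshrn pattern applies per channel:
static void ScaleARGBRowDown2Box_C_Sketch(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst, int dst_width) {
  const uint8* s = src_ptr;               // row 0
  const uint8* t = src_ptr + src_stride;  // row 1
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    for (c = 0; c < 4; ++c) {  // B, G, R, A
      dst[c] = (uint8)((s[c] + s[c + 4] + t[c] + t[c + 4] + 2) >> 2);
    }
    s += 8;  // two source pixels per output pixel
    t += 8;
    dst += 4;
  }
}
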
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov          r12, %3, lsl #2              \n"
    ".p2align     2                            \n"
  "1:                                          \n"
    "vld1.32      {d0[0]}, [%0], r12           \n"
    "vld1.32      {d0[1]}, [%0], r12           \n"
    "vld1.32      {d1[0]}, [%0], r12           \n"
    "vld1.32      {d1[1]}, [%0], r12           \n"
    "subs         %2, %2, #4                   \n"  // 4 pixels per loop.
    "vst1.8       {q0}, [%1]!                  \n"
    "bgt          1b                           \n"
  : "+r"(src_argb),         // %0
    "+r"(dst_argb),         // %1
    "+r"(dst_width)         // %2
  : "r"(src_stepx)          // %3
  : "memory", "cc", "r12", "q0"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov          r12, %4, lsl #2              \n"
    "add          %1, %1, %0                   \n"
    ".p2align     2                            \n"
  "1:                                          \n"
    "vld1.8       {d0}, [%0], r12              \n"  // Read 4 2x2 blocks -> 2x1
    "vld1.8       {d1}, [%1], r12              \n"
    "vld1.8       {d2}, [%0], r12              \n"
    "vld1.8       {d3}, [%1], r12              \n"
    "vld1.8       {d4}, [%0], r12              \n"
    "vld1.8       {d5}, [%1], r12              \n"
    "vld1.8       {d6}, [%0], r12              \n"
    "vld1.8       {d7}, [%1], r12              \n"
    "vaddl.u8     q0, d0, d1                   \n"
    "vaddl.u8     q1, d2, d3                   \n"
    "vaddl.u8     q2, d4, d5                   \n"
    "vaddl.u8     q3, d6, d7                   \n"
    "vswp.8       d1, d2                       \n"  // ab_cd -> ac_bd
    "vswp.8       d5, d6                       \n"  // ef_gh -> eg_fh
    "vadd.u16     q0, q0, q1                   \n"  // (a+b)_(c+d)
    "vadd.u16     q2, q2, q3                   \n"  // (e+f)_(g+h)
    "vrshrn.u16   d0, q0, #2                   \n"  // first 2 pixels.
    "vrshrn.u16   d1, q2, #2                   \n"  // next 2 pixels.
    "subs         %3, %3, #4                   \n"  // 4 pixels per loop.
    "vst1.8       {q0}, [%2]!                  \n"
    "bgt          1b                           \n"
  : "+r"(src_argb),         // %0
    "+r"(src_stride),       // %1
    "+r"(dst_argb),         // %2
    "+r"(dst_width)         // %3
  : "r"(src_stepx)          // %4
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
  );
}

#endif  // __ARM_NEON__

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
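
// Usage sketch (hypothetical caller, added for exposition; not part of the
// original patch): a plane-level 2x2 box downscale would run the row kernel
// once per output row, stepping the source pointer by two rows each time.
// Assumes dst_width is a multiple of 16, since the kernel processes 16
// pixels per loop iteration.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
static void ScalePlaneDown2Box_Sketch(int dst_width, int dst_height,
                                      int src_stride, int dst_stride,
                                      const uint8* src, uint8* dst) {
  int y;
  for (y = 0; y < dst_height; ++y) {
    libyuv::ScaleRowDown2Box_NEON(src, (ptrdiff_t)src_stride, dst, dst_width);
    src += src_stride * 2;  // advance two source rows per output row
    dst += dst_stride;
  }
}
#endif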