--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libyuv/source/rotate_neon.cc	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,405 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+static uvec8 kVTbl4x4Transpose =
+  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+                       uint8* dst, int dst_stride,
+                       int width) {
+  asm volatile (
+    // loops are on blocks of 8. loop will stop when
+    // counter gets to or below 0. starting the counter
+    // at w-8 allows for this
+    "sub %4, #8 \n"
+
+    // handle 8x8 blocks. this should be the majority of the plane
+    ".p2align 2 \n"
+    "1: \n"
+    "mov r9, %0 \n"
+
+    "vld1.8 {d0}, [r9], %1 \n"
+    "vld1.8 {d1}, [r9], %1 \n"
+    "vld1.8 {d2}, [r9], %1 \n"
+    "vld1.8 {d3}, [r9], %1 \n"
+    "vld1.8 {d4}, [r9], %1 \n"
+    "vld1.8 {d5}, [r9], %1 \n"
+    "vld1.8 {d6}, [r9], %1 \n"
+    "vld1.8 {d7}, [r9] \n"
+
+    "vtrn.8 d1, d0 \n"
+    "vtrn.8 d3, d2 \n"
+    "vtrn.8 d5, d4 \n"
+    "vtrn.8 d7, d6 \n"
+
+    "vtrn.16 d1, d3 \n"
+    "vtrn.16 d0, d2 \n"
+    "vtrn.16 d5, d7 \n"
+    "vtrn.16 d4, d6 \n"
+
+    "vtrn.32 d1, d5 \n"
+    "vtrn.32 d0, d4 \n"
+    "vtrn.32 d3, d7 \n"
+    "vtrn.32 d2, d6 \n"
+
+    "vrev16.8 q0, q0 \n"
+    "vrev16.8 q1, q1 \n"
+    "vrev16.8 q2, q2 \n"
+    "vrev16.8 q3, q3 \n"
+
+    "mov r9, %2 \n"
+
+    "vst1.8 {d1}, [r9], %3 \n"
+    "vst1.8 {d0}, [r9], %3 \n"
+    "vst1.8 {d3}, [r9], %3 \n"
+    "vst1.8 {d2}, [r9], %3 \n"
+    "vst1.8 {d5}, [r9], %3 \n"
+    "vst1.8 {d4}, [r9], %3 \n"
+    "vst1.8 {d7}, [r9], %3 \n"
+    "vst1.8 {d6}, [r9] \n"
+
+    "add %0, #8 \n"  // src += 8
+    "add %2, %2, %3, lsl #3 \n"  // dst += 8 * dst_stride
+    "subs %4, #8 \n"  // w -= 8
+    "bge 1b \n"
+
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
1.88 + "adds %4, #8 \n" 1.89 + "beq 4f \n" 1.90 + 1.91 + // some residual, so between 1 and 7 lines left to transpose 1.92 + "cmp %4, #2 \n" 1.93 + "blt 3f \n" 1.94 + 1.95 + "cmp %4, #4 \n" 1.96 + "blt 2f \n" 1.97 + 1.98 + // 4x8 block 1.99 + "mov r9, %0 \n" 1.100 + "vld1.32 {d0[0]}, [r9], %1 \n" 1.101 + "vld1.32 {d0[1]}, [r9], %1 \n" 1.102 + "vld1.32 {d1[0]}, [r9], %1 \n" 1.103 + "vld1.32 {d1[1]}, [r9], %1 \n" 1.104 + "vld1.32 {d2[0]}, [r9], %1 \n" 1.105 + "vld1.32 {d2[1]}, [r9], %1 \n" 1.106 + "vld1.32 {d3[0]}, [r9], %1 \n" 1.107 + "vld1.32 {d3[1]}, [r9] \n" 1.108 + 1.109 + "mov r9, %2 \n" 1.110 + 1.111 + "vld1.8 {q3}, [%5] \n" 1.112 + 1.113 + "vtbl.8 d4, {d0, d1}, d6 \n" 1.114 + "vtbl.8 d5, {d0, d1}, d7 \n" 1.115 + "vtbl.8 d0, {d2, d3}, d6 \n" 1.116 + "vtbl.8 d1, {d2, d3}, d7 \n" 1.117 + 1.118 + // TODO(frkoenig): Rework shuffle above to 1.119 + // write out with 4 instead of 8 writes. 1.120 + "vst1.32 {d4[0]}, [r9], %3 \n" 1.121 + "vst1.32 {d4[1]}, [r9], %3 \n" 1.122 + "vst1.32 {d5[0]}, [r9], %3 \n" 1.123 + "vst1.32 {d5[1]}, [r9] \n" 1.124 + 1.125 + "add r9, %2, #4 \n" 1.126 + "vst1.32 {d0[0]}, [r9], %3 \n" 1.127 + "vst1.32 {d0[1]}, [r9], %3 \n" 1.128 + "vst1.32 {d1[0]}, [r9], %3 \n" 1.129 + "vst1.32 {d1[1]}, [r9] \n" 1.130 + 1.131 + "add %0, #4 \n" // src += 4 1.132 + "add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride 1.133 + "subs %4, #4 \n" // w -= 4 1.134 + "beq 4f \n" 1.135 + 1.136 + // some residual, check to see if it includes a 2x8 block, 1.137 + // or less 1.138 + "cmp %4, #2 \n" 1.139 + "blt 3f \n" 1.140 + 1.141 + // 2x8 block 1.142 + "2: \n" 1.143 + "mov r9, %0 \n" 1.144 + "vld1.16 {d0[0]}, [r9], %1 \n" 1.145 + "vld1.16 {d1[0]}, [r9], %1 \n" 1.146 + "vld1.16 {d0[1]}, [r9], %1 \n" 1.147 + "vld1.16 {d1[1]}, [r9], %1 \n" 1.148 + "vld1.16 {d0[2]}, [r9], %1 \n" 1.149 + "vld1.16 {d1[2]}, [r9], %1 \n" 1.150 + "vld1.16 {d0[3]}, [r9], %1 \n" 1.151 + "vld1.16 {d1[3]}, [r9] \n" 1.152 + 1.153 + "vtrn.8 d0, d1 \n" 1.154 + 1.155 + "mov r9, %2 \n" 1.156 + 1.157 + "vst1.64 {d0}, [r9], %3 \n" 1.158 + "vst1.64 {d1}, [r9] \n" 1.159 + 1.160 + "add %0, #2 \n" // src += 2 1.161 + "add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride 1.162 + "subs %4, #2 \n" // w -= 2 1.163 + "beq 4f \n" 1.164 + 1.165 + // 1x8 block 1.166 + "3: \n" 1.167 + "vld1.8 {d0[0]}, [%0], %1 \n" 1.168 + "vld1.8 {d0[1]}, [%0], %1 \n" 1.169 + "vld1.8 {d0[2]}, [%0], %1 \n" 1.170 + "vld1.8 {d0[3]}, [%0], %1 \n" 1.171 + "vld1.8 {d0[4]}, [%0], %1 \n" 1.172 + "vld1.8 {d0[5]}, [%0], %1 \n" 1.173 + "vld1.8 {d0[6]}, [%0], %1 \n" 1.174 + "vld1.8 {d0[7]}, [%0] \n" 1.175 + 1.176 + "vst1.64 {d0}, [%2] \n" 1.177 + 1.178 + "4: \n" 1.179 + 1.180 + : "+r"(src), // %0 1.181 + "+r"(src_stride), // %1 1.182 + "+r"(dst), // %2 1.183 + "+r"(dst_stride), // %3 1.184 + "+r"(width) // %4 1.185 + : "r"(&kVTbl4x4Transpose) // %5 1.186 + : "memory", "cc", "r9", "q0", "q1", "q2", "q3" 1.187 + ); 1.188 +} 1.189 + 1.190 +static uvec8 kVTbl4x4TransposeDi = 1.191 + { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; 1.192 + 1.193 +void TransposeUVWx8_NEON(const uint8* src, int src_stride, 1.194 + uint8* dst_a, int dst_stride_a, 1.195 + uint8* dst_b, int dst_stride_b, 1.196 + int width) { 1.197 + asm volatile ( 1.198 + // loops are on blocks of 8. loop will stop when 1.199 + // counter gets to or below 0. starting the counter 1.200 + // at w-8 allow for this 1.201 + "sub %6, #8 \n" 1.202 + 1.203 + // handle 8x8 blocks. 
+    ".p2align 2 \n"
+    "1: \n"
+    "mov r9, %0 \n"
+
+    "vld2.8 {d0, d1}, [r9], %1 \n"
+    "vld2.8 {d2, d3}, [r9], %1 \n"
+    "vld2.8 {d4, d5}, [r9], %1 \n"
+    "vld2.8 {d6, d7}, [r9], %1 \n"
+    "vld2.8 {d16, d17}, [r9], %1 \n"
+    "vld2.8 {d18, d19}, [r9], %1 \n"
+    "vld2.8 {d20, d21}, [r9], %1 \n"
+    "vld2.8 {d22, d23}, [r9] \n"
+
+    "vtrn.8 q1, q0 \n"
+    "vtrn.8 q3, q2 \n"
+    "vtrn.8 q9, q8 \n"
+    "vtrn.8 q11, q10 \n"
+
+    "vtrn.16 q1, q3 \n"
+    "vtrn.16 q0, q2 \n"
+    "vtrn.16 q9, q11 \n"
+    "vtrn.16 q8, q10 \n"
+
+    "vtrn.32 q1, q9 \n"
+    "vtrn.32 q0, q8 \n"
+    "vtrn.32 q3, q11 \n"
+    "vtrn.32 q2, q10 \n"
+
+    "vrev16.8 q0, q0 \n"
+    "vrev16.8 q1, q1 \n"
+    "vrev16.8 q2, q2 \n"
+    "vrev16.8 q3, q3 \n"
+    "vrev16.8 q8, q8 \n"
+    "vrev16.8 q9, q9 \n"
+    "vrev16.8 q10, q10 \n"
+    "vrev16.8 q11, q11 \n"
+
+    "mov r9, %2 \n"
+
+    "vst1.8 {d2}, [r9], %3 \n"
+    "vst1.8 {d0}, [r9], %3 \n"
+    "vst1.8 {d6}, [r9], %3 \n"
+    "vst1.8 {d4}, [r9], %3 \n"
+    "vst1.8 {d18}, [r9], %3 \n"
+    "vst1.8 {d16}, [r9], %3 \n"
+    "vst1.8 {d22}, [r9], %3 \n"
+    "vst1.8 {d20}, [r9] \n"
+
+    "mov r9, %4 \n"
+
+    "vst1.8 {d3}, [r9], %5 \n"
+    "vst1.8 {d1}, [r9], %5 \n"
+    "vst1.8 {d7}, [r9], %5 \n"
+    "vst1.8 {d5}, [r9], %5 \n"
+    "vst1.8 {d19}, [r9], %5 \n"
+    "vst1.8 {d17}, [r9], %5 \n"
+    "vst1.8 {d23}, [r9], %5 \n"
+    "vst1.8 {d21}, [r9] \n"
+
+    "add %0, #8*2 \n"  // src += 8*2
+    "add %2, %2, %3, lsl #3 \n"  // dst_a += 8 * dst_stride_a
+    "add %4, %4, %5, lsl #3 \n"  // dst_b += 8 * dst_stride_b
+    "subs %6, #8 \n"  // w -= 8
+    "bge 1b \n"
+
+    // add 8 back to counter. if the result is 0 there are
+    // no residuals.
1.271 + "adds %6, #8 \n" 1.272 + "beq 4f \n" 1.273 + 1.274 + // some residual, so between 1 and 7 lines left to transpose 1.275 + "cmp %6, #2 \n" 1.276 + "blt 3f \n" 1.277 + 1.278 + "cmp %6, #4 \n" 1.279 + "blt 2f \n" 1.280 + 1.281 + //TODO(frkoenig): Clean this up 1.282 + // 4x8 block 1.283 + "mov r9, %0 \n" 1.284 + "vld1.64 {d0}, [r9], %1 \n" 1.285 + "vld1.64 {d1}, [r9], %1 \n" 1.286 + "vld1.64 {d2}, [r9], %1 \n" 1.287 + "vld1.64 {d3}, [r9], %1 \n" 1.288 + "vld1.64 {d4}, [r9], %1 \n" 1.289 + "vld1.64 {d5}, [r9], %1 \n" 1.290 + "vld1.64 {d6}, [r9], %1 \n" 1.291 + "vld1.64 {d7}, [r9] \n" 1.292 + 1.293 + "vld1.8 {q15}, [%7] \n" 1.294 + 1.295 + "vtrn.8 q0, q1 \n" 1.296 + "vtrn.8 q2, q3 \n" 1.297 + 1.298 + "vtbl.8 d16, {d0, d1}, d30 \n" 1.299 + "vtbl.8 d17, {d0, d1}, d31 \n" 1.300 + "vtbl.8 d18, {d2, d3}, d30 \n" 1.301 + "vtbl.8 d19, {d2, d3}, d31 \n" 1.302 + "vtbl.8 d20, {d4, d5}, d30 \n" 1.303 + "vtbl.8 d21, {d4, d5}, d31 \n" 1.304 + "vtbl.8 d22, {d6, d7}, d30 \n" 1.305 + "vtbl.8 d23, {d6, d7}, d31 \n" 1.306 + 1.307 + "mov r9, %2 \n" 1.308 + 1.309 + "vst1.32 {d16[0]}, [r9], %3 \n" 1.310 + "vst1.32 {d16[1]}, [r9], %3 \n" 1.311 + "vst1.32 {d17[0]}, [r9], %3 \n" 1.312 + "vst1.32 {d17[1]}, [r9], %3 \n" 1.313 + 1.314 + "add r9, %2, #4 \n" 1.315 + "vst1.32 {d20[0]}, [r9], %3 \n" 1.316 + "vst1.32 {d20[1]}, [r9], %3 \n" 1.317 + "vst1.32 {d21[0]}, [r9], %3 \n" 1.318 + "vst1.32 {d21[1]}, [r9] \n" 1.319 + 1.320 + "mov r9, %4 \n" 1.321 + 1.322 + "vst1.32 {d18[0]}, [r9], %5 \n" 1.323 + "vst1.32 {d18[1]}, [r9], %5 \n" 1.324 + "vst1.32 {d19[0]}, [r9], %5 \n" 1.325 + "vst1.32 {d19[1]}, [r9], %5 \n" 1.326 + 1.327 + "add r9, %4, #4 \n" 1.328 + "vst1.32 {d22[0]}, [r9], %5 \n" 1.329 + "vst1.32 {d22[1]}, [r9], %5 \n" 1.330 + "vst1.32 {d23[0]}, [r9], %5 \n" 1.331 + "vst1.32 {d23[1]}, [r9] \n" 1.332 + 1.333 + "add %0, #4*2 \n" // src += 4 * 2 1.334 + "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a 1.335 + "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b 1.336 + "subs %6, #4 \n" // w -= 4 1.337 + "beq 4f \n" 1.338 + 1.339 + // some residual, check to see if it includes a 2x8 block, 1.340 + // or less 1.341 + "cmp %6, #2 \n" 1.342 + "blt 3f \n" 1.343 + 1.344 + // 2x8 block 1.345 + "2: \n" 1.346 + "mov r9, %0 \n" 1.347 + "vld2.16 {d0[0], d2[0]}, [r9], %1 \n" 1.348 + "vld2.16 {d1[0], d3[0]}, [r9], %1 \n" 1.349 + "vld2.16 {d0[1], d2[1]}, [r9], %1 \n" 1.350 + "vld2.16 {d1[1], d3[1]}, [r9], %1 \n" 1.351 + "vld2.16 {d0[2], d2[2]}, [r9], %1 \n" 1.352 + "vld2.16 {d1[2], d3[2]}, [r9], %1 \n" 1.353 + "vld2.16 {d0[3], d2[3]}, [r9], %1 \n" 1.354 + "vld2.16 {d1[3], d3[3]}, [r9] \n" 1.355 + 1.356 + "vtrn.8 d0, d1 \n" 1.357 + "vtrn.8 d2, d3 \n" 1.358 + 1.359 + "mov r9, %2 \n" 1.360 + 1.361 + "vst1.64 {d0}, [r9], %3 \n" 1.362 + "vst1.64 {d2}, [r9] \n" 1.363 + 1.364 + "mov r9, %4 \n" 1.365 + 1.366 + "vst1.64 {d1}, [r9], %5 \n" 1.367 + "vst1.64 {d3}, [r9] \n" 1.368 + 1.369 + "add %0, #2*2 \n" // src += 2 * 2 1.370 + "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a 1.371 + "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b 1.372 + "subs %6, #2 \n" // w -= 2 1.373 + "beq 4f \n" 1.374 + 1.375 + // 1x8 block 1.376 + "3: \n" 1.377 + "vld2.8 {d0[0], d1[0]}, [%0], %1 \n" 1.378 + "vld2.8 {d0[1], d1[1]}, [%0], %1 \n" 1.379 + "vld2.8 {d0[2], d1[2]}, [%0], %1 \n" 1.380 + "vld2.8 {d0[3], d1[3]}, [%0], %1 \n" 1.381 + "vld2.8 {d0[4], d1[4]}, [%0], %1 \n" 1.382 + "vld2.8 {d0[5], d1[5]}, [%0], %1 \n" 1.383 + "vld2.8 {d0[6], d1[6]}, [%0], %1 \n" 1.384 + "vld2.8 {d0[7], d1[7]}, [%0] \n" 1.385 + 1.386 + "vst1.64 {d0}, 
+    "vst1.64 {d1}, [%4] \n"
+
+    "4: \n"
+
+    : "+r"(src),  // %0
+      "+r"(src_stride),  // %1
+      "+r"(dst_a),  // %2
+      "+r"(dst_stride_a),  // %3
+      "+r"(dst_b),  // %4
+      "+r"(dst_stride_b),  // %5
+      "+r"(width)  // %6
+    : "r"(&kVTbl4x4TransposeDi)  // %7
+    : "memory", "cc", "r9",
+      "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+  );
+}
+#endif

+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
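
Both kernels added by this patch compute the same basic operation: they transpose a strip that is 8 rows tall and `width` columns wide, with TransposeUVWx8_NEON first de-interleaving the two UV channels (vld2) and writing two transposed planes. As a rough reference for what the assembly computes, here is a minimal scalar sketch; the function name and body are illustrative only and are not part of the patch or of libyuv's API.

#include <stdint.h>

// Illustrative scalar equivalent of TransposeWx8 (assumed helper):
// source byte at row i, column j lands at destination row j, column i.
static void TransposeWx8_Scalar(const uint8_t* src, int src_stride,
                                uint8_t* dst, int dst_stride,
                                int width) {
  for (int j = 0; j < width; ++j) {
    for (int i = 0; i < 8; ++i) {
      dst[j * dst_stride + i] = src[i * src_stride + j];
    }
  }
}

The NEON versions replace the inner byte-by-byte copies with 8x8 register transposes built from vtrn.8/vtrn.16/vtrn.32 plus vrev16.8, falling back to 4-, 2- and 1-column tails for the residual columns, which is why the main loop counter starts at width - 8.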