media/libyuv/source/rotate_neon.cc

Author:      Michael Schloh von Bennewitz <michael@schloh.com>
Date:        Thu, 22 Jan 2015 13:21:57 +0100
Branch:      TOR_BUG_9701
Changeset:   15:b8a032363ba2
Permissions: -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
michael@0 3 *
michael@0 4 * Use of this source code is governed by a BSD-style license
michael@0 5 * that can be found in the LICENSE file in the root of the source
michael@0 6 * tree. An additional intellectual property rights grant can be found
michael@0 7 * in the file PATENTS. All contributing project authors may
michael@0 8 * be found in the AUTHORS file in the root of the source tree.
michael@0 9 */
michael@0 10
michael@0 11 #include "libyuv/row.h"
michael@0 12
michael@0 13 #include "libyuv/basic_types.h"
michael@0 14
michael@0 15 #ifdef __cplusplus
michael@0 16 namespace libyuv {
michael@0 17 extern "C" {
michael@0 18 #endif
michael@0 19
michael@0 20 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
michael@0 21 static uvec8 kVTbl4x4Transpose =
michael@0 22 { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
michael@0 23
// Transpose a (width x 8) block of 8-bit pixels with NEON.
// Reads 8 source rows spaced src_stride bytes apart starting at src, and
// writes width destination rows of 8 bytes each, spaced dst_stride bytes
// apart starting at dst.  The main loop transposes 8x8 tiles entirely in
// registers (vtrn.8/.16/.32 plus vrev16.8); residual widths are handled by
// dedicated 4x8, 2x8 and 1x8 tail paths.
// NOTE(review): r9 is clobbered as a scratch row pointer — confirm this is
// acceptable on targets where r9 is a reserved/platform register.
void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) {
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allow for this
    "sub %4, #8 \n"

    // handle 8x8 blocks. this should be the majority of the plane
    ".p2align 2 \n"
    "1: \n"
    "mov r9, %0 \n"

    // Load 8 rows of 8 bytes each into d0..d7.
    "vld1.8 {d0}, [r9], %1 \n"
    "vld1.8 {d1}, [r9], %1 \n"
    "vld1.8 {d2}, [r9], %1 \n"
    "vld1.8 {d3}, [r9], %1 \n"
    "vld1.8 {d4}, [r9], %1 \n"
    "vld1.8 {d5}, [r9], %1 \n"
    "vld1.8 {d6}, [r9], %1 \n"
    "vld1.8 {d7}, [r9] \n"

    // In-register 8x8 transpose: interleave at 8-, 16- and 32-bit
    // granularity, then swap bytes within 16-bit lanes.
    "vtrn.8 d1, d0 \n"
    "vtrn.8 d3, d2 \n"
    "vtrn.8 d5, d4 \n"
    "vtrn.8 d7, d6 \n"

    "vtrn.16 d1, d3 \n"
    "vtrn.16 d0, d2 \n"
    "vtrn.16 d5, d7 \n"
    "vtrn.16 d4, d6 \n"

    "vtrn.32 d1, d5 \n"
    "vtrn.32 d0, d4 \n"
    "vtrn.32 d3, d7 \n"
    "vtrn.32 d2, d6 \n"

    "vrev16.8 q0, q0 \n"
    "vrev16.8 q1, q1 \n"
    "vrev16.8 q2, q2 \n"
    "vrev16.8 q3, q3 \n"

    "mov r9, %2 \n"

    // Store the 8 transposed rows, one per destination stride.
    "vst1.8 {d1}, [r9], %3 \n"
    "vst1.8 {d0}, [r9], %3 \n"
    "vst1.8 {d3}, [r9], %3 \n"
    "vst1.8 {d2}, [r9], %3 \n"
    "vst1.8 {d5}, [r9], %3 \n"
    "vst1.8 {d4}, [r9], %3 \n"
    "vst1.8 {d7}, [r9], %3 \n"
    "vst1.8 {d6}, [r9] \n"

    "add %0, #8 \n" // src += 8
    "add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride
    "subs %4, #8 \n" // w -= 8
    "bge 1b \n"

    // add 8 back to counter. if the result is 0 there are
    // no residuals.
    "adds %4, #8 \n"
    "beq 4f \n"

    // some residual, so between 1 and 7 lines left to transpose
    "cmp %4, #2 \n"
    "blt 3f \n"

    "cmp %4, #4 \n"
    "blt 2f \n"

    // 4x8 block
    "mov r9, %0 \n"
    "vld1.32 {d0[0]}, [r9], %1 \n"
    "vld1.32 {d0[1]}, [r9], %1 \n"
    "vld1.32 {d1[0]}, [r9], %1 \n"
    "vld1.32 {d1[1]}, [r9], %1 \n"
    "vld1.32 {d2[0]}, [r9], %1 \n"
    "vld1.32 {d2[1]}, [r9], %1 \n"
    "vld1.32 {d3[0]}, [r9], %1 \n"
    "vld1.32 {d3[1]}, [r9] \n"

    "mov r9, %2 \n"

    // Shuffle via the kVTbl4x4Transpose table (input operand %5).
    "vld1.8 {q3}, [%5] \n"

    "vtbl.8 d4, {d0, d1}, d6 \n"
    "vtbl.8 d5, {d0, d1}, d7 \n"
    "vtbl.8 d0, {d2, d3}, d6 \n"
    "vtbl.8 d1, {d2, d3}, d7 \n"

    // TODO(frkoenig): Rework shuffle above to
    // write out with 4 instead of 8 writes.
    "vst1.32 {d4[0]}, [r9], %3 \n"
    "vst1.32 {d4[1]}, [r9], %3 \n"
    "vst1.32 {d5[0]}, [r9], %3 \n"
    "vst1.32 {d5[1]}, [r9] \n"

    "add r9, %2, #4 \n"
    "vst1.32 {d0[0]}, [r9], %3 \n"
    "vst1.32 {d0[1]}, [r9], %3 \n"
    "vst1.32 {d1[0]}, [r9], %3 \n"
    "vst1.32 {d1[1]}, [r9] \n"

    "add %0, #4 \n" // src += 4
    "add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride
    "subs %4, #4 \n" // w -= 4
    "beq 4f \n"

    // some residual, check to see if it includes a 2x8 block,
    // or less
    "cmp %4, #2 \n"
    "blt 3f \n"

    // 2x8 block
    "2: \n"
    "mov r9, %0 \n"
    "vld1.16 {d0[0]}, [r9], %1 \n"
    "vld1.16 {d1[0]}, [r9], %1 \n"
    "vld1.16 {d0[1]}, [r9], %1 \n"
    "vld1.16 {d1[1]}, [r9], %1 \n"
    "vld1.16 {d0[2]}, [r9], %1 \n"
    "vld1.16 {d1[2]}, [r9], %1 \n"
    "vld1.16 {d0[3]}, [r9], %1 \n"
    "vld1.16 {d1[3]}, [r9] \n"

    "vtrn.8 d0, d1 \n"

    "mov r9, %2 \n"

    "vst1.64 {d0}, [r9], %3 \n"
    "vst1.64 {d1}, [r9] \n"

    "add %0, #2 \n" // src += 2
    "add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride
    "subs %4, #2 \n" // w -= 2
    "beq 4f \n"

    // 1x8 block
    "3: \n"
    "vld1.8 {d0[0]}, [%0], %1 \n"
    "vld1.8 {d0[1]}, [%0], %1 \n"
    "vld1.8 {d0[2]}, [%0], %1 \n"
    "vld1.8 {d0[3]}, [%0], %1 \n"
    "vld1.8 {d0[4]}, [%0], %1 \n"
    "vld1.8 {d0[5]}, [%0], %1 \n"
    "vld1.8 {d0[6]}, [%0], %1 \n"
    "vld1.8 {d0[7]}, [%0] \n"

    "vst1.64 {d0}, [%2] \n"

    "4: \n"

    : "+r"(src), // %0
      "+r"(src_stride), // %1
      "+r"(dst), // %2
      "+r"(dst_stride), // %3
      "+r"(width) // %4
    : "r"(&kVTbl4x4Transpose) // %5
    : "memory", "cc", "r9", "q0", "q1", "q2", "q3"
  );
}
michael@0 186
michael@0 187 static uvec8 kVTbl4x4TransposeDi =
michael@0 188 { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
michael@0 189
michael@0 190 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
michael@0 191 uint8* dst_a, int dst_stride_a,
michael@0 192 uint8* dst_b, int dst_stride_b,
michael@0 193 int width) {
michael@0 194 asm volatile (
michael@0 195 // loops are on blocks of 8. loop will stop when
michael@0 196 // counter gets to or below 0. starting the counter
michael@0 197 // at w-8 allow for this
michael@0 198 "sub %6, #8 \n"
michael@0 199
michael@0 200 // handle 8x8 blocks. this should be the majority of the plane
michael@0 201 ".p2align 2 \n"
michael@0 202 "1: \n"
michael@0 203 "mov r9, %0 \n"
michael@0 204
michael@0 205 "vld2.8 {d0, d1}, [r9], %1 \n"
michael@0 206 "vld2.8 {d2, d3}, [r9], %1 \n"
michael@0 207 "vld2.8 {d4, d5}, [r9], %1 \n"
michael@0 208 "vld2.8 {d6, d7}, [r9], %1 \n"
michael@0 209 "vld2.8 {d16, d17}, [r9], %1 \n"
michael@0 210 "vld2.8 {d18, d19}, [r9], %1 \n"
michael@0 211 "vld2.8 {d20, d21}, [r9], %1 \n"
michael@0 212 "vld2.8 {d22, d23}, [r9] \n"
michael@0 213
michael@0 214 "vtrn.8 q1, q0 \n"
michael@0 215 "vtrn.8 q3, q2 \n"
michael@0 216 "vtrn.8 q9, q8 \n"
michael@0 217 "vtrn.8 q11, q10 \n"
michael@0 218
michael@0 219 "vtrn.16 q1, q3 \n"
michael@0 220 "vtrn.16 q0, q2 \n"
michael@0 221 "vtrn.16 q9, q11 \n"
michael@0 222 "vtrn.16 q8, q10 \n"
michael@0 223
michael@0 224 "vtrn.32 q1, q9 \n"
michael@0 225 "vtrn.32 q0, q8 \n"
michael@0 226 "vtrn.32 q3, q11 \n"
michael@0 227 "vtrn.32 q2, q10 \n"
michael@0 228
michael@0 229 "vrev16.8 q0, q0 \n"
michael@0 230 "vrev16.8 q1, q1 \n"
michael@0 231 "vrev16.8 q2, q2 \n"
michael@0 232 "vrev16.8 q3, q3 \n"
michael@0 233 "vrev16.8 q8, q8 \n"
michael@0 234 "vrev16.8 q9, q9 \n"
michael@0 235 "vrev16.8 q10, q10 \n"
michael@0 236 "vrev16.8 q11, q11 \n"
michael@0 237
michael@0 238 "mov r9, %2 \n"
michael@0 239
michael@0 240 "vst1.8 {d2}, [r9], %3 \n"
michael@0 241 "vst1.8 {d0}, [r9], %3 \n"
michael@0 242 "vst1.8 {d6}, [r9], %3 \n"
michael@0 243 "vst1.8 {d4}, [r9], %3 \n"
michael@0 244 "vst1.8 {d18}, [r9], %3 \n"
michael@0 245 "vst1.8 {d16}, [r9], %3 \n"
michael@0 246 "vst1.8 {d22}, [r9], %3 \n"
michael@0 247 "vst1.8 {d20}, [r9] \n"
michael@0 248
michael@0 249 "mov r9, %4 \n"
michael@0 250
michael@0 251 "vst1.8 {d3}, [r9], %5 \n"
michael@0 252 "vst1.8 {d1}, [r9], %5 \n"
michael@0 253 "vst1.8 {d7}, [r9], %5 \n"
michael@0 254 "vst1.8 {d5}, [r9], %5 \n"
michael@0 255 "vst1.8 {d19}, [r9], %5 \n"
michael@0 256 "vst1.8 {d17}, [r9], %5 \n"
michael@0 257 "vst1.8 {d23}, [r9], %5 \n"
michael@0 258 "vst1.8 {d21}, [r9] \n"
michael@0 259
michael@0 260 "add %0, #8*2 \n" // src += 8*2
michael@0 261 "add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a
michael@0 262 "add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b
michael@0 263 "subs %6, #8 \n" // w -= 8
michael@0 264 "bge 1b \n"
michael@0 265
michael@0 266 // add 8 back to counter. if the result is 0 there are
michael@0 267 // no residuals.
michael@0 268 "adds %6, #8 \n"
michael@0 269 "beq 4f \n"
michael@0 270
michael@0 271 // some residual, so between 1 and 7 lines left to transpose
michael@0 272 "cmp %6, #2 \n"
michael@0 273 "blt 3f \n"
michael@0 274
michael@0 275 "cmp %6, #4 \n"
michael@0 276 "blt 2f \n"
michael@0 277
michael@0 278 //TODO(frkoenig): Clean this up
michael@0 279 // 4x8 block
michael@0 280 "mov r9, %0 \n"
michael@0 281 "vld1.64 {d0}, [r9], %1 \n"
michael@0 282 "vld1.64 {d1}, [r9], %1 \n"
michael@0 283 "vld1.64 {d2}, [r9], %1 \n"
michael@0 284 "vld1.64 {d3}, [r9], %1 \n"
michael@0 285 "vld1.64 {d4}, [r9], %1 \n"
michael@0 286 "vld1.64 {d5}, [r9], %1 \n"
michael@0 287 "vld1.64 {d6}, [r9], %1 \n"
michael@0 288 "vld1.64 {d7}, [r9] \n"
michael@0 289
michael@0 290 "vld1.8 {q15}, [%7] \n"
michael@0 291
michael@0 292 "vtrn.8 q0, q1 \n"
michael@0 293 "vtrn.8 q2, q3 \n"
michael@0 294
michael@0 295 "vtbl.8 d16, {d0, d1}, d30 \n"
michael@0 296 "vtbl.8 d17, {d0, d1}, d31 \n"
michael@0 297 "vtbl.8 d18, {d2, d3}, d30 \n"
michael@0 298 "vtbl.8 d19, {d2, d3}, d31 \n"
michael@0 299 "vtbl.8 d20, {d4, d5}, d30 \n"
michael@0 300 "vtbl.8 d21, {d4, d5}, d31 \n"
michael@0 301 "vtbl.8 d22, {d6, d7}, d30 \n"
michael@0 302 "vtbl.8 d23, {d6, d7}, d31 \n"
michael@0 303
michael@0 304 "mov r9, %2 \n"
michael@0 305
michael@0 306 "vst1.32 {d16[0]}, [r9], %3 \n"
michael@0 307 "vst1.32 {d16[1]}, [r9], %3 \n"
michael@0 308 "vst1.32 {d17[0]}, [r9], %3 \n"
michael@0 309 "vst1.32 {d17[1]}, [r9], %3 \n"
michael@0 310
michael@0 311 "add r9, %2, #4 \n"
michael@0 312 "vst1.32 {d20[0]}, [r9], %3 \n"
michael@0 313 "vst1.32 {d20[1]}, [r9], %3 \n"
michael@0 314 "vst1.32 {d21[0]}, [r9], %3 \n"
michael@0 315 "vst1.32 {d21[1]}, [r9] \n"
michael@0 316
michael@0 317 "mov r9, %4 \n"
michael@0 318
michael@0 319 "vst1.32 {d18[0]}, [r9], %5 \n"
michael@0 320 "vst1.32 {d18[1]}, [r9], %5 \n"
michael@0 321 "vst1.32 {d19[0]}, [r9], %5 \n"
michael@0 322 "vst1.32 {d19[1]}, [r9], %5 \n"
michael@0 323
michael@0 324 "add r9, %4, #4 \n"
michael@0 325 "vst1.32 {d22[0]}, [r9], %5 \n"
michael@0 326 "vst1.32 {d22[1]}, [r9], %5 \n"
michael@0 327 "vst1.32 {d23[0]}, [r9], %5 \n"
michael@0 328 "vst1.32 {d23[1]}, [r9] \n"
michael@0 329
michael@0 330 "add %0, #4*2 \n" // src += 4 * 2
michael@0 331 "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a
michael@0 332 "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b
michael@0 333 "subs %6, #4 \n" // w -= 4
michael@0 334 "beq 4f \n"
michael@0 335
michael@0 336 // some residual, check to see if it includes a 2x8 block,
michael@0 337 // or less
michael@0 338 "cmp %6, #2 \n"
michael@0 339 "blt 3f \n"
michael@0 340
michael@0 341 // 2x8 block
michael@0 342 "2: \n"
michael@0 343 "mov r9, %0 \n"
michael@0 344 "vld2.16 {d0[0], d2[0]}, [r9], %1 \n"
michael@0 345 "vld2.16 {d1[0], d3[0]}, [r9], %1 \n"
michael@0 346 "vld2.16 {d0[1], d2[1]}, [r9], %1 \n"
michael@0 347 "vld2.16 {d1[1], d3[1]}, [r9], %1 \n"
michael@0 348 "vld2.16 {d0[2], d2[2]}, [r9], %1 \n"
michael@0 349 "vld2.16 {d1[2], d3[2]}, [r9], %1 \n"
michael@0 350 "vld2.16 {d0[3], d2[3]}, [r9], %1 \n"
michael@0 351 "vld2.16 {d1[3], d3[3]}, [r9] \n"
michael@0 352
michael@0 353 "vtrn.8 d0, d1 \n"
michael@0 354 "vtrn.8 d2, d3 \n"
michael@0 355
michael@0 356 "mov r9, %2 \n"
michael@0 357
michael@0 358 "vst1.64 {d0}, [r9], %3 \n"
michael@0 359 "vst1.64 {d2}, [r9] \n"
michael@0 360
michael@0 361 "mov r9, %4 \n"
michael@0 362
michael@0 363 "vst1.64 {d1}, [r9], %5 \n"
michael@0 364 "vst1.64 {d3}, [r9] \n"
michael@0 365
michael@0 366 "add %0, #2*2 \n" // src += 2 * 2
michael@0 367 "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a
michael@0 368 "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b
michael@0 369 "subs %6, #2 \n" // w -= 2
michael@0 370 "beq 4f \n"
michael@0 371
michael@0 372 // 1x8 block
michael@0 373 "3: \n"
michael@0 374 "vld2.8 {d0[0], d1[0]}, [%0], %1 \n"
michael@0 375 "vld2.8 {d0[1], d1[1]}, [%0], %1 \n"
michael@0 376 "vld2.8 {d0[2], d1[2]}, [%0], %1 \n"
michael@0 377 "vld2.8 {d0[3], d1[3]}, [%0], %1 \n"
michael@0 378 "vld2.8 {d0[4], d1[4]}, [%0], %1 \n"
michael@0 379 "vld2.8 {d0[5], d1[5]}, [%0], %1 \n"
michael@0 380 "vld2.8 {d0[6], d1[6]}, [%0], %1 \n"
michael@0 381 "vld2.8 {d0[7], d1[7]}, [%0] \n"
michael@0 382
michael@0 383 "vst1.64 {d0}, [%2] \n"
michael@0 384 "vst1.64 {d1}, [%4] \n"
michael@0 385
michael@0 386 "4: \n"
michael@0 387
michael@0 388 : "+r"(src), // %0
michael@0 389 "+r"(src_stride), // %1
michael@0 390 "+r"(dst_a), // %2
michael@0 391 "+r"(dst_stride_a), // %3
michael@0 392 "+r"(dst_b), // %4
michael@0 393 "+r"(dst_stride_b), // %5
michael@0 394 "+r"(width) // %6
michael@0 395 : "r"(&kVTbl4x4TransposeDi) // %7
michael@0 396 : "memory", "cc", "r9",
michael@0 397 "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
michael@0 398 );
michael@0 399 }
michael@0 400 #endif
michael@0 401
michael@0 402 #ifdef __cplusplus
michael@0 403 } // extern "C"
michael@0 404 } // namespace libyuv
michael@0 405 #endif

mercurial