media/libyuv/source/row_neon.cc

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
michael@0 3 *
michael@0 4 * Use of this source code is governed by a BSD-style license
michael@0 5 * that can be found in the LICENSE file in the root of the source
michael@0 6 * tree. An additional intellectual property rights grant can be found
michael@0 7 * in the file PATENTS. All contributing project authors may
michael@0 8 * be found in the AUTHORS file in the root of the source tree.
michael@0 9 */
michael@0 10
michael@0 11 #include "libyuv/row.h"
michael@0 12
michael@0 13 #ifdef __cplusplus
michael@0 14 namespace libyuv {
michael@0 15 extern "C" {
michael@0 16 #endif
michael@0 17
michael@0 18 // This module is for GCC Neon
michael@0 19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
michael@0 20
michael@0 21 // Read 8 Y, 4 U and 4 V from 422: Y goes to d0, U to the low half of d2, V to the high half of d2.
michael@0 22 #define READYUV422 \
michael@0 23 "vld1.8 {d0}, [%0]! \n" /* d0 = 8 Y, advance %0 */ \
michael@0 24 "vld1.32 {d2[0]}, [%1]! \n" /* d2 low 32 bits = 4 U */ \
michael@0 25 "vld1.32 {d2[1]}, [%2]! \n" /* d2 high 32 bits = 4 V */
michael@0 26
michael@0 27 // Read 8 Y, 2 U and 2 V from 411; each U and V is duplicated so d2 ends up with 4 U then 4 V.
michael@0 28 #define READYUV411 \
michael@0 29 "vld1.8 {d0}, [%0]! \n" /* d0 = 8 Y */ \
michael@0 30 "vld1.16 {d2[0]}, [%1]! \n" /* 2 U bytes */ \
michael@0 31 "vld1.16 {d2[1]}, [%2]! \n" /* 2 V bytes */ \
michael@0 32 "vmov.u8 d3, d2 \n" /* copy so vzip pairs each byte with itself */ \
michael@0 33 "vzip.u8 d2, d3 \n" /* d2 = U0 U0 U1 U1 V0 V0 V1 V1 */
michael@0 34
michael@0 35 // Read 8 Y, 8 U and 8 V from 444, then average adjacent U and V pairs so d2 holds 4 U then 4 V.
michael@0 36 #define READYUV444 \
michael@0 37 "vld1.8 {d0}, [%0]! \n" /* d0 = 8 Y */ \
michael@0 38 "vld1.8 {d2}, [%1]! \n" /* d2 = 8 U */ \
michael@0 39 "vld1.8 {d3}, [%2]! \n" /* d3 = 8 V */ \
michael@0 40 "vpaddl.u8 q1, q1 \n" /* pairwise sums: 4 U sums in d2, 4 V sums in d3 */ \
michael@0 41 "vrshrn.u16 d2, q1, #1 \n" /* rounded halve and narrow: d2 = 4 U, 4 V */
michael@0 42
michael@0 43 // Read 8 Y, and set 4 U and 4 V to 128 (the bias value that YUV422TORGB removes, i.e. no chroma).
michael@0 44 #define READYUV400 \
michael@0 45 "vld1.8 {d0}, [%0]! \n" /* d0 = 8 Y */ \
michael@0 46 "vmov.u8 d2, #128 \n" /* every U/V lane = 128 */
michael@0 47
michael@0 48 // Read 8 Y and 4 interleaved UV pairs from NV12; leaves d2 = 4 U then 4 V.
michael@0 49 #define READNV12 \
michael@0 50 "vld1.8 {d0}, [%0]! \n" /* d0 = 8 Y */ \
michael@0 51 "vld1.8 {d2}, [%1]! \n" /* d2 = U0 V0 U1 V1 U2 V2 U3 V3 */ \
michael@0 52 "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
michael@0 53 "vuzp.u8 d2, d3 \n" /* d2 = even bytes (U), d3 = odd bytes (V) */ \
michael@0 54 "vtrn.u32 d2, d3 \n" /* d2 = 4 U, 4 V */
michael@0 55
michael@0 56 // Read 8 Y and 4 interleaved VU pairs from NV21; leaves d2 = 4 U then 4 V.
michael@0 57 #define READNV21 \
michael@0 58 "vld1.8 {d0}, [%0]! \n" /* d0 = 8 Y */ \
michael@0 59 "vld1.8 {d2}, [%1]! \n" /* d2 = V0 U0 V1 U1 V2 U2 V3 U3 */ \
michael@0 60 "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
michael@0 61 "vuzp.u8 d3, d2 \n" /* operands swapped vs NV12: d3 = even bytes (V), d2 = odd bytes (U) */ \
michael@0 62 "vtrn.u32 d2, d3 \n" /* d2 = 4 U, 4 V */
michael@0 63
michael@0 64 // Read 8 YUY2 pixels (16 bytes): even bytes (Y) go to d0, odd bytes (U/V) are split into d2 = 4 U then 4 V.
michael@0 65 #define READYUY2 \
michael@0 66 "vld2.8 {d0, d2}, [%0]! \n" /* de-interleave: d0 = even bytes, d2 = odd bytes */ \
michael@0 67 "vmov.u8 d3, d2 \n" \
michael@0 68 "vuzp.u8 d2, d3 \n" /* d2 = even chroma bytes (U), d3 = odd chroma bytes (V) */ \
michael@0 69 "vtrn.u32 d2, d3 \n" /* d2 = 4 U, 4 V */
michael@0 70
michael@0 71 // Read 8 UYVY pixels (16 bytes): odd bytes (Y) end up in d0, even bytes (U/V) are split into d2 = 4 U then 4 V.
michael@0 72 #define READUYVY \
michael@0 73 "vld2.8 {d2, d3}, [%0]! \n" /* de-interleave: d2 = even bytes, d3 = odd bytes */ \
michael@0 74 "vmov.u8 d0, d3 \n" /* move Y to d0 */ \
michael@0 75 "vmov.u8 d3, d2 \n" \
michael@0 76 "vuzp.u8 d2, d3 \n" /* d2 = even chroma bytes (U), d3 = odd chroma bytes (V) */ \
michael@0 77 "vtrn.u32 d2, d3 \n" /* d2 = 4 U, 4 V */
michael@0 78 // Convert 8 Y (d0) and 4 U + 4 V (d2) into 8-bit B (d20), G (d21), R (d22); reads d24, d25, d26, q14, q15 set up by the caller.
michael@0 79 #define YUV422TORGB \
michael@0 80 "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\
michael@0 81 "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
michael@0 82 "vmull.s8 q9, d2, d25 \n"/* u/v G component */\
michael@0 83 "vmov.u8 d1, #0 \n"/* split odd/even y apart */\
michael@0 84 "vtrn.u8 d0, d1 \n" /* d0 = even Y, d1 = odd Y, each zero-extended to 16 bits */ \
michael@0 85 "vsub.s16 q0, q0, q15 \n"/* offset y */\
michael@0 86 "vmul.s16 q0, q0, q14 \n" /* scale y */ \
michael@0 87 "vadd.s16 d18, d19 \n" /* add the U and V contributions to G */ \
michael@0 88 "vqadd.s16 d20, d0, d16 \n" /* B */ \
michael@0 89 "vqadd.s16 d21, d1, d16 \n" \
michael@0 90 "vqadd.s16 d22, d0, d17 \n" /* R */ \
michael@0 91 "vqadd.s16 d23, d1, d17 \n" \
michael@0 92 "vqadd.s16 d16, d0, d18 \n" /* G */ \
michael@0 93 "vqadd.s16 d17, d1, d18 \n" \
michael@0 94 "vqshrun.s16 d0, q10, #6 \n" /* B */ \
michael@0 95 "vqshrun.s16 d1, q11, #6 \n" /* R (q11 holds the R sums) */ \
michael@0 96 "vqshrun.s16 d2, q8, #6 \n" /* G (q8 holds the G sums) */ \
michael@0 97 "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
michael@0 98 "vmovl.u8 q11, d1 \n" \
michael@0 99 "vmovl.u8 q8, d2 \n" \
michael@0 100 "vtrn.u8 d20, d21 \n" /* merge even/odd pixels: d20 = 8 B */ \
michael@0 101 "vtrn.u8 d22, d23 \n" /* d22 = 8 R */ \
michael@0 102 "vtrn.u8 d16, d17 \n" /* d16 = 8 G */ \
michael@0 103 "vmov.u8 d21, d16 \n" /* G into d21, giving d20 = B, d21 = G, d22 = R */
michael@0 104 // Signed 8-bit chroma coefficients; the NEON rows load only the first 8 bytes of each table (4 multipliers for U, then 4 for V).
michael@0 105 static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
michael@0 106 0, 0, 0, 0, 0, 0, 0, 0 };
michael@0 107 static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
michael@0 108 0, 0, 0, 0, 0, 0, 0, 0 };
michael@0 109 // Converts I444 to ARGB, 8 pixels per iteration; the loop test is at the bottom, so at least one iteration always runs.
michael@0 110 void I444ToARGBRow_NEON(const uint8* src_y,
michael@0 111 const uint8* src_u,
michael@0 112 const uint8* src_v,
michael@0 113 uint8* dst_argb,
michael@0 114 int width) {
michael@0 115 asm volatile (
michael@0 116 "vld1.8 {d24}, [%5] \n" // d24 = first 8 bytes of kUVToRB
michael@0 117 "vld1.8 {d25}, [%6] \n" // d25 = first 8 bytes of kUVToG
michael@0 118 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 119 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 120 "vmov.u16 q15, #16 \n" // Y offset
michael@0 121 ".p2align 2 \n"
michael@0 122 "1: \n"
michael@0 123 READYUV444
michael@0 124 YUV422TORGB
michael@0 125 "subs %4, %4, #8 \n" // 8 pixels done
michael@0 126 "vmov.u8 d23, #255 \n" // alpha = 255
michael@0 127 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" // store 8 pixels as B, G, R, A bytes
michael@0 128 "bgt 1b \n" // repeat while width > 0
michael@0 129 : "+r"(src_y), // %0
michael@0 130 "+r"(src_u), // %1
michael@0 131 "+r"(src_v), // %2
michael@0 132 "+r"(dst_argb), // %3
michael@0 133 "+r"(width) // %4
michael@0 134 : "r"(&kUVToRB), // %5
michael@0 135 "r"(&kUVToG) // %6
michael@0 136 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 137 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 138 );
michael@0 139 }
michael@0 140 // Converts I422 to ARGB, 8 pixels per iteration; the loop test is at the bottom, so at least one iteration always runs.
michael@0 141 void I422ToARGBRow_NEON(const uint8* src_y,
michael@0 142 const uint8* src_u,
michael@0 143 const uint8* src_v,
michael@0 144 uint8* dst_argb,
michael@0 145 int width) {
michael@0 146 asm volatile (
michael@0 147 "vld1.8 {d24}, [%5] \n" // d24 = first 8 bytes of kUVToRB
michael@0 148 "vld1.8 {d25}, [%6] \n" // d25 = first 8 bytes of kUVToG
michael@0 149 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 150 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 151 "vmov.u16 q15, #16 \n" // Y offset
michael@0 152 ".p2align 2 \n"
michael@0 153 "1: \n"
michael@0 154 READYUV422
michael@0 155 YUV422TORGB
michael@0 156 "subs %4, %4, #8 \n" // 8 pixels done
michael@0 157 "vmov.u8 d23, #255 \n" // alpha = 255
michael@0 158 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" // store 8 pixels as B, G, R, A bytes
michael@0 159 "bgt 1b \n" // repeat while width > 0
michael@0 160 : "+r"(src_y), // %0
michael@0 161 "+r"(src_u), // %1
michael@0 162 "+r"(src_v), // %2
michael@0 163 "+r"(dst_argb), // %3
michael@0 164 "+r"(width) // %4
michael@0 165 : "r"(&kUVToRB), // %5
michael@0 166 "r"(&kUVToG) // %6
michael@0 167 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 168 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 169 );
michael@0 170 }
michael@0 171 // Converts I411 to ARGB, 8 pixels per iteration; the loop test is at the bottom, so at least one iteration always runs.
michael@0 172 void I411ToARGBRow_NEON(const uint8* src_y,
michael@0 173 const uint8* src_u,
michael@0 174 const uint8* src_v,
michael@0 175 uint8* dst_argb,
michael@0 176 int width) {
michael@0 177 asm volatile (
michael@0 178 "vld1.8 {d24}, [%5] \n" // d24 = first 8 bytes of kUVToRB
michael@0 179 "vld1.8 {d25}, [%6] \n" // d25 = first 8 bytes of kUVToG
michael@0 180 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 181 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 182 "vmov.u16 q15, #16 \n" // Y offset
michael@0 183 ".p2align 2 \n"
michael@0 184 "1: \n"
michael@0 185 READYUV411
michael@0 186 YUV422TORGB
michael@0 187 "subs %4, %4, #8 \n" // 8 pixels done
michael@0 188 "vmov.u8 d23, #255 \n" // alpha = 255
michael@0 189 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" // store 8 pixels as B, G, R, A bytes
michael@0 190 "bgt 1b \n" // repeat while width > 0
michael@0 191 : "+r"(src_y), // %0
michael@0 192 "+r"(src_u), // %1
michael@0 193 "+r"(src_v), // %2
michael@0 194 "+r"(dst_argb), // %3
michael@0 195 "+r"(width) // %4
michael@0 196 : "r"(&kUVToRB), // %5
michael@0 197 "r"(&kUVToG) // %6
michael@0 198 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 199 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 200 );
michael@0 201 }
michael@0 202 // Converts I422 to BGRA (bytes stored as A, R, G, B), 8 pixels per iteration; at least one iteration always runs.
michael@0 203 void I422ToBGRARow_NEON(const uint8* src_y,
michael@0 204 const uint8* src_u,
michael@0 205 const uint8* src_v,
michael@0 206 uint8* dst_bgra,
michael@0 207 int width) {
michael@0 208 asm volatile (
michael@0 209 "vld1.8 {d24}, [%5] \n" // d24 = first 8 bytes of kUVToRB
michael@0 210 "vld1.8 {d25}, [%6] \n" // d25 = first 8 bytes of kUVToG
michael@0 211 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 212 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 213 "vmov.u16 q15, #16 \n" // Y offset
michael@0 214 ".p2align 2 \n"
michael@0 215 "1: \n"
michael@0 216 READYUV422
michael@0 217 YUV422TORGB
michael@0 218 "subs %4, %4, #8 \n" // 8 pixels done
michael@0 219 "vswp.u8 d20, d22 \n" // swap B and R: d20 = R, d22 = B
michael@0 220 "vmov.u8 d19, #255 \n" // alpha = 255, stored first
michael@0 221 "vst4.8 {d19, d20, d21, d22}, [%3]! \n" // store 8 pixels as A, R, G, B bytes
michael@0 222 "bgt 1b \n" // repeat while width > 0
michael@0 223 : "+r"(src_y), // %0
michael@0 224 "+r"(src_u), // %1
michael@0 225 "+r"(src_v), // %2
michael@0 226 "+r"(dst_bgra), // %3
michael@0 227 "+r"(width) // %4
michael@0 228 : "r"(&kUVToRB), // %5
michael@0 229 "r"(&kUVToG) // %6
michael@0 230 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 231 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 232 );
michael@0 233 }
michael@0 234 // Converts I422 to ABGR (bytes stored as R, G, B, A), 8 pixels per iteration; at least one iteration always runs.
michael@0 235 void I422ToABGRRow_NEON(const uint8* src_y,
michael@0 236 const uint8* src_u,
michael@0 237 const uint8* src_v,
michael@0 238 uint8* dst_abgr,
michael@0 239 int width) {
michael@0 240 asm volatile (
michael@0 241 "vld1.8 {d24}, [%5] \n" // d24 = first 8 bytes of kUVToRB
michael@0 242 "vld1.8 {d25}, [%6] \n" // d25 = first 8 bytes of kUVToG
michael@0 243 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 244 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 245 "vmov.u16 q15, #16 \n" // Y offset
michael@0 246 ".p2align 2 \n"
michael@0 247 "1: \n"
michael@0 248 READYUV422
michael@0 249 YUV422TORGB
michael@0 250 "subs %4, %4, #8 \n" // 8 pixels done
michael@0 251 "vswp.u8 d20, d22 \n" // swap B and R: d20 = R, d22 = B
michael@0 252 "vmov.u8 d23, #255 \n" // alpha = 255
michael@0 253 "vst4.8 {d20, d21, d22, d23}, [%3]! \n" // store 8 pixels as R, G, B, A bytes
michael@0 254 "bgt 1b \n" // repeat while width > 0
michael@0 255 : "+r"(src_y), // %0
michael@0 256 "+r"(src_u), // %1
michael@0 257 "+r"(src_v), // %2
michael@0 258 "+r"(dst_abgr), // %3
michael@0 259 "+r"(width) // %4
michael@0 260 : "r"(&kUVToRB), // %5
michael@0 261 "r"(&kUVToG) // %6
michael@0 262 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 263 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 264 );
michael@0 265 }
michael@0 266 // Converts I422 to RGBA (bytes stored as A, B, G, R), 8 pixels per iteration; at least one iteration always runs.
michael@0 267 void I422ToRGBARow_NEON(const uint8* src_y,
michael@0 268 const uint8* src_u,
michael@0 269 const uint8* src_v,
michael@0 270 uint8* dst_rgba,
michael@0 271 int width) {
michael@0 272 asm volatile (
michael@0 273 "vld1.8 {d24}, [%5] \n" // d24 = first 8 bytes of kUVToRB
michael@0 274 "vld1.8 {d25}, [%6] \n" // d25 = first 8 bytes of kUVToG
michael@0 275 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 276 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 277 "vmov.u16 q15, #16 \n" // Y offset
michael@0 278 ".p2align 2 \n"
michael@0 279 "1: \n"
michael@0 280 READYUV422
michael@0 281 YUV422TORGB
michael@0 282 "subs %4, %4, #8 \n" // 8 pixels done
michael@0 283 "vmov.u8 d19, #255 \n" // alpha = 255, stored first
michael@0 284 "vst4.8 {d19, d20, d21, d22}, [%3]! \n" // store 8 pixels as A, B, G, R bytes
michael@0 285 "bgt 1b \n" // repeat while width > 0
michael@0 286 : "+r"(src_y), // %0
michael@0 287 "+r"(src_u), // %1
michael@0 288 "+r"(src_v), // %2
michael@0 289 "+r"(dst_rgba), // %3
michael@0 290 "+r"(width) // %4
michael@0 291 : "r"(&kUVToRB), // %5
michael@0 292 "r"(&kUVToG) // %6
michael@0 293 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 294 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 295 );
michael@0 296 }
michael@0 297 // Converts I422 to RGB24 (3 bytes per pixel, stored as B, G, R), 8 pixels per iteration; at least one iteration always runs.
michael@0 298 void I422ToRGB24Row_NEON(const uint8* src_y,
michael@0 299 const uint8* src_u,
michael@0 300 const uint8* src_v,
michael@0 301 uint8* dst_rgb24,
michael@0 302 int width) {
michael@0 303 asm volatile (
michael@0 304 "vld1.8 {d24}, [%5] \n" // d24 = first 8 bytes of kUVToRB
michael@0 305 "vld1.8 {d25}, [%6] \n" // d25 = first 8 bytes of kUVToG
michael@0 306 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 307 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 308 "vmov.u16 q15, #16 \n" // Y offset
michael@0 309 ".p2align 2 \n"
michael@0 310 "1: \n"
michael@0 311 READYUV422
michael@0 312 YUV422TORGB
michael@0 313 "subs %4, %4, #8 \n" // 8 pixels done
michael@0 314 "vst3.8 {d20, d21, d22}, [%3]! \n" // store 8 pixels as B, G, R bytes (24 bytes)
michael@0 315 "bgt 1b \n" // repeat while width > 0
michael@0 316 : "+r"(src_y), // %0
michael@0 317 "+r"(src_u), // %1
michael@0 318 "+r"(src_v), // %2
michael@0 319 "+r"(dst_rgb24), // %3
michael@0 320 "+r"(width) // %4
michael@0 321 : "r"(&kUVToRB), // %5
michael@0 322 "r"(&kUVToG) // %6
michael@0 323 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 324 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 325 );
michael@0 326 }
michael@0 327 // Converts I422 to RAW (3 bytes per pixel, stored as R, G, B), 8 pixels per iteration; at least one iteration always runs.
michael@0 328 void I422ToRAWRow_NEON(const uint8* src_y,
michael@0 329 const uint8* src_u,
michael@0 330 const uint8* src_v,
michael@0 331 uint8* dst_raw,
michael@0 332 int width) {
michael@0 333 asm volatile (
michael@0 334 "vld1.8 {d24}, [%5] \n" // d24 = first 8 bytes of kUVToRB
michael@0 335 "vld1.8 {d25}, [%6] \n" // d25 = first 8 bytes of kUVToG
michael@0 336 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 337 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 338 "vmov.u16 q15, #16 \n" // Y offset
michael@0 339 ".p2align 2 \n"
michael@0 340 "1: \n"
michael@0 341 READYUV422
michael@0 342 YUV422TORGB
michael@0 343 "subs %4, %4, #8 \n" // 8 pixels done
michael@0 344 "vswp.u8 d20, d22 \n" // swap B and R: d20 = R, d22 = B
michael@0 345 "vst3.8 {d20, d21, d22}, [%3]! \n" // store 8 pixels as R, G, B bytes (24 bytes)
michael@0 346 "bgt 1b \n" // repeat while width > 0
michael@0 347 : "+r"(src_y), // %0
michael@0 348 "+r"(src_u), // %1
michael@0 349 "+r"(src_v), // %2
michael@0 350 "+r"(dst_raw), // %3
michael@0 351 "+r"(width) // %4
michael@0 352 : "r"(&kUVToRB), // %5
michael@0 353 "r"(&kUVToG) // %6
michael@0 354 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 355 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 356 );
michael@0 357 }
michael@0 358 // Pack 8 pixels of B (d20), G (d21), R (d22) bytes into 8 RGB565 values in q0; overwrites q8, q9 and q10.
michael@0 359 #define ARGBTORGB565 \
michael@0 360 "vshr.u8 d20, d20, #3 \n" /* B */ \
michael@0 361 "vshr.u8 d21, d21, #2 \n" /* G */ \
michael@0 362 "vshr.u8 d22, d22, #3 \n" /* R */ \
michael@0 363 "vmovl.u8 q8, d20 \n" /* B */ \
michael@0 364 "vmovl.u8 q9, d21 \n" /* G */ \
michael@0 365 "vmovl.u8 q10, d22 \n" /* R */ \
michael@0 366 "vshl.u16 q9, q9, #5 \n" /* G */ \
michael@0 367 "vshl.u16 q10, q10, #11 \n" /* R */ \
michael@0 368 "vorr q0, q8, q9 \n" /* BG */ \
michael@0 369 "vorr q0, q0, q10 \n" /* BGR */
michael@0 370 // Converts I422 to RGB565 (2 bytes per pixel), 8 pixels per iteration; at least one iteration always runs.
michael@0 371 void I422ToRGB565Row_NEON(const uint8* src_y,
michael@0 372 const uint8* src_u,
michael@0 373 const uint8* src_v,
michael@0 374 uint8* dst_rgb565,
michael@0 375 int width) {
michael@0 376 asm volatile (
michael@0 377 "vld1.8 {d24}, [%5] \n" // d24 = first 8 bytes of kUVToRB
michael@0 378 "vld1.8 {d25}, [%6] \n" // d25 = first 8 bytes of kUVToG
michael@0 379 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 380 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 381 "vmov.u16 q15, #16 \n" // Y offset
michael@0 382 ".p2align 2 \n"
michael@0 383 "1: \n"
michael@0 384 READYUV422
michael@0 385 YUV422TORGB
michael@0 386 "subs %4, %4, #8 \n" // 8 pixels done
michael@0 387 ARGBTORGB565
michael@0 388 "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
michael@0 389 "bgt 1b \n" // repeat while width > 0
michael@0 390 : "+r"(src_y), // %0
michael@0 391 "+r"(src_u), // %1
michael@0 392 "+r"(src_v), // %2
michael@0 393 "+r"(dst_rgb565), // %3
michael@0 394 "+r"(width) // %4
michael@0 395 : "r"(&kUVToRB), // %5
michael@0 396 "r"(&kUVToG) // %6
michael@0 397 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 398 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 399 );
michael@0 400 }
michael@0 401 // Pack 8 pixels of B (d20), G (d21), R (d22), A (d23) bytes into 8 ARGB1555 values in q0; overwrites q1 and q8-q11.
michael@0 402 #define ARGBTOARGB1555 \
michael@0 403 "vshr.u8 q10, q10, #3 \n" /* B and G (q10 = d20, d21) */ \
michael@0 404 "vshr.u8 d22, d22, #3 \n" /* R */ \
michael@0 405 "vshr.u8 d23, d23, #7 \n" /* A */ \
michael@0 406 "vmovl.u8 q8, d20 \n" /* B */ \
michael@0 407 "vmovl.u8 q9, d21 \n" /* G */ \
michael@0 408 "vmovl.u8 q10, d22 \n" /* R */ \
michael@0 409 "vmovl.u8 q11, d23 \n" /* A */ \
michael@0 410 "vshl.u16 q9, q9, #5 \n" /* G */ \
michael@0 411 "vshl.u16 q10, q10, #10 \n" /* R */ \
michael@0 412 "vshl.u16 q11, q11, #15 \n" /* A */ \
michael@0 413 "vorr q0, q8, q9 \n" /* BG */ \
michael@0 414 "vorr q1, q10, q11 \n" /* RA */ \
michael@0 415 "vorr q0, q0, q1 \n" /* BGRA */
michael@0 416 // Converts I422 to ARGB1555 (2 bytes per pixel, alpha bit set), 8 pixels per iteration; at least one iteration always runs.
michael@0 417 void I422ToARGB1555Row_NEON(const uint8* src_y,
michael@0 418 const uint8* src_u,
michael@0 419 const uint8* src_v,
michael@0 420 uint8* dst_argb1555,
michael@0 421 int width) {
michael@0 422 asm volatile (
michael@0 423 "vld1.8 {d24}, [%5] \n" // d24 = first 8 bytes of kUVToRB
michael@0 424 "vld1.8 {d25}, [%6] \n" // d25 = first 8 bytes of kUVToG
michael@0 425 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 426 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 427 "vmov.u16 q15, #16 \n" // Y offset
michael@0 428 ".p2align 2 \n"
michael@0 429 "1: \n"
michael@0 430 READYUV422
michael@0 431 YUV422TORGB
michael@0 432 "subs %4, %4, #8 \n" // 8 pixels done
michael@0 433 "vmov.u8 d23, #255 \n" // alpha = 255 before packing
michael@0 434 ARGBTOARGB1555
michael@0 435 "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
michael@0 436 "bgt 1b \n" // repeat while width > 0
michael@0 437 : "+r"(src_y), // %0
michael@0 438 "+r"(src_u), // %1
michael@0 439 "+r"(src_v), // %2
michael@0 440 "+r"(dst_argb1555), // %3
michael@0 441 "+r"(width) // %4
michael@0 442 : "r"(&kUVToRB), // %5
michael@0 443 "r"(&kUVToG) // %6
michael@0 444 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 445 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 446 );
michael@0 447 }
michael@0 448 // Pack 8 pixels of B (d20), G (d21), R (d22), A (d23) bytes into 8 ARGB4444 values in q0; requires d4 = 0x0f in every byte.
michael@0 449 #define ARGBTOARGB4444 \
michael@0 450 "vshr.u8 d20, d20, #4 \n" /* B */ \
michael@0 451 "vbic.32 d21, d21, d4 \n" /* G */ \
michael@0 452 "vshr.u8 d22, d22, #4 \n" /* R */ \
michael@0 453 "vbic.32 d23, d23, d4 \n" /* A */ \
michael@0 454 "vorr d0, d20, d21 \n" /* BG */ \
michael@0 455 "vorr d1, d22, d23 \n" /* RA */ \
michael@0 456 "vzip.u8 d0, d1 \n" /* BGRA */
michael@0 457 // Converts I422 to ARGB4444 (2 bytes per pixel, alpha nibble 0xf), 8 pixels per iteration; at least one iteration always runs.
michael@0 458 void I422ToARGB4444Row_NEON(const uint8* src_y,
michael@0 459 const uint8* src_u,
michael@0 460 const uint8* src_v,
michael@0 461 uint8* dst_argb4444,
michael@0 462 int width) {
michael@0 463 asm volatile (
michael@0 464 "vld1.8 {d24}, [%5] \n" // d24 = first 8 bytes of kUVToRB
michael@0 465 "vld1.8 {d25}, [%6] \n" // d25 = first 8 bytes of kUVToG
michael@0 466 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 467 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 468 "vmov.u16 q15, #16 \n" // Y offset
michael@0 469 "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
michael@0 470 ".p2align 2 \n"
michael@0 471 "1: \n"
michael@0 472 READYUV422
michael@0 473 YUV422TORGB
michael@0 474 "subs %4, %4, #8 \n" // 8 pixels done
michael@0 475 "vmov.u8 d23, #255 \n" // alpha = 255 before packing
michael@0 476 ARGBTOARGB4444
michael@0 477 "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
michael@0 478 "bgt 1b \n" // repeat while width > 0
michael@0 479 : "+r"(src_y), // %0
michael@0 480 "+r"(src_u), // %1
michael@0 481 "+r"(src_v), // %2
michael@0 482 "+r"(dst_argb4444), // %3
michael@0 483 "+r"(width) // %4
michael@0 484 : "r"(&kUVToRB), // %5
michael@0 485 "r"(&kUVToG) // %6
michael@0 486 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 487 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 488 );
michael@0 489 }
michael@0 490 // Converts Y only to ARGB through the same Y offset/scale as the YUV rows (chroma fixed at 128), 8 pixels per iteration.
michael@0 491 void YToARGBRow_NEON(const uint8* src_y,
michael@0 492 uint8* dst_argb,
michael@0 493 int width) {
michael@0 494 asm volatile (
michael@0 495 "vld1.8 {d24}, [%3] \n" // d24 = first 8 bytes of kUVToRB
michael@0 496 "vld1.8 {d25}, [%4] \n" // d25 = first 8 bytes of kUVToG
michael@0 497 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 498 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 499 "vmov.u16 q15, #16 \n" // Y offset
michael@0 500 ".p2align 2 \n"
michael@0 501 "1: \n"
michael@0 502 READYUV400
michael@0 503 YUV422TORGB
michael@0 504 "subs %2, %2, #8 \n" // 8 pixels done
michael@0 505 "vmov.u8 d23, #255 \n" // alpha = 255
michael@0 506 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" // store 8 pixels as B, G, R, A bytes
michael@0 507 "bgt 1b \n" // repeat while width > 0
michael@0 508 : "+r"(src_y), // %0
michael@0 509 "+r"(dst_argb), // %1
michael@0 510 "+r"(width) // %2
michael@0 511 : "r"(&kUVToRB), // %3
michael@0 512 "r"(&kUVToG) // %4
michael@0 513 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 514 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 515 );
michael@0 516 }
michael@0 517 // Expands 8 Y bytes per iteration to ARGB by copying Y unchanged into B, G and R with alpha 255 (no offset or scaling).
michael@0 518 void I400ToARGBRow_NEON(const uint8* src_y,
michael@0 519 uint8* dst_argb,
michael@0 520 int width) {
michael@0 521 asm volatile (
michael@0 522 ".p2align 2 \n"
michael@0 523 "vmov.u8 d23, #255 \n" // alpha = 255, set once before the loop
michael@0 524 "1: \n"
michael@0 525 "vld1.8 {d20}, [%0]! \n" // load 8 Y
michael@0 526 "vmov d21, d20 \n" // G = Y
michael@0 527 "vmov d22, d20 \n" // R = Y
michael@0 528 "subs %2, %2, #8 \n" // 8 pixels done
michael@0 529 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" // store 8 pixels as Y, Y, Y, A bytes
michael@0 530 "bgt 1b \n" // repeat while width > 0
michael@0 531 : "+r"(src_y), // %0
michael@0 532 "+r"(dst_argb), // %1
michael@0 533 "+r"(width) // %2
michael@0 534 :
michael@0 535 : "cc", "memory", "d20", "d21", "d22", "d23"
michael@0 536 );
michael@0 537 }
michael@0 538 // Converts NV12 (Y plane plus interleaved UV) to ARGB, 8 pixels per iteration; at least one iteration always runs.
michael@0 539 void NV12ToARGBRow_NEON(const uint8* src_y,
michael@0 540 const uint8* src_uv,
michael@0 541 uint8* dst_argb,
michael@0 542 int width) {
michael@0 543 asm volatile (
michael@0 544 "vld1.8 {d24}, [%4] \n" // d24 = first 8 bytes of kUVToRB
michael@0 545 "vld1.8 {d25}, [%5] \n" // d25 = first 8 bytes of kUVToG
michael@0 546 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 547 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 548 "vmov.u16 q15, #16 \n" // Y offset
michael@0 549 ".p2align 2 \n"
michael@0 550 "1: \n"
michael@0 551 READNV12
michael@0 552 YUV422TORGB
michael@0 553 "subs %3, %3, #8 \n" // 8 pixels done
michael@0 554 "vmov.u8 d23, #255 \n" // alpha = 255
michael@0 555 "vst4.8 {d20, d21, d22, d23}, [%2]! \n" // store 8 pixels as B, G, R, A bytes
michael@0 556 "bgt 1b \n" // repeat while width > 0
michael@0 557 : "+r"(src_y), // %0
michael@0 558 "+r"(src_uv), // %1
michael@0 559 "+r"(dst_argb), // %2
michael@0 560 "+r"(width) // %3
michael@0 561 : "r"(&kUVToRB), // %4
michael@0 562 "r"(&kUVToG) // %5
michael@0 563 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 564 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 565 );
michael@0 566 }
michael@0 567 // Converts NV21 (Y plane plus interleaved VU) to ARGB, 8 pixels per iteration; at least one iteration always runs.
michael@0 568 void NV21ToARGBRow_NEON(const uint8* src_y,
michael@0 569 const uint8* src_uv,
michael@0 570 uint8* dst_argb,
michael@0 571 int width) {
michael@0 572 asm volatile (
michael@0 573 "vld1.8 {d24}, [%4] \n" // d24 = first 8 bytes of kUVToRB
michael@0 574 "vld1.8 {d25}, [%5] \n" // d25 = first 8 bytes of kUVToG
michael@0 575 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 576 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 577 "vmov.u16 q15, #16 \n" // Y offset
michael@0 578 ".p2align 2 \n"
michael@0 579 "1: \n"
michael@0 580 READNV21
michael@0 581 YUV422TORGB
michael@0 582 "subs %3, %3, #8 \n" // 8 pixels done
michael@0 583 "vmov.u8 d23, #255 \n" // alpha = 255
michael@0 584 "vst4.8 {d20, d21, d22, d23}, [%2]! \n" // store 8 pixels as B, G, R, A bytes
michael@0 585 "bgt 1b \n" // repeat while width > 0
michael@0 586 : "+r"(src_y), // %0
michael@0 587 "+r"(src_uv), // %1
michael@0 588 "+r"(dst_argb), // %2
michael@0 589 "+r"(width) // %3
michael@0 590 : "r"(&kUVToRB), // %4
michael@0 591 "r"(&kUVToG) // %5
michael@0 592 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 593 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 594 );
michael@0 595 }
michael@0 596 // Converts NV12 (Y plane plus interleaved UV) to RGB565, 8 pixels per iteration; at least one iteration always runs.
michael@0 597 void NV12ToRGB565Row_NEON(const uint8* src_y,
michael@0 598 const uint8* src_uv,
michael@0 599 uint8* dst_rgb565,
michael@0 600 int width) {
michael@0 601 asm volatile (
michael@0 602 "vld1.8 {d24}, [%4] \n" // d24 = first 8 bytes of kUVToRB
michael@0 603 "vld1.8 {d25}, [%5] \n" // d25 = first 8 bytes of kUVToG
michael@0 604 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 605 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 606 "vmov.u16 q15, #16 \n" // Y offset
michael@0 607 ".p2align 2 \n"
michael@0 608 "1: \n"
michael@0 609 READNV12
michael@0 610 YUV422TORGB
michael@0 611 "subs %3, %3, #8 \n" // 8 pixels done
michael@0 612 ARGBTORGB565
michael@0 613 "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
michael@0 614 "bgt 1b \n" // repeat while width > 0
michael@0 615 : "+r"(src_y), // %0
michael@0 616 "+r"(src_uv), // %1
michael@0 617 "+r"(dst_rgb565), // %2
michael@0 618 "+r"(width) // %3
michael@0 619 : "r"(&kUVToRB), // %4
michael@0 620 "r"(&kUVToG) // %5
michael@0 621 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 622 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 623 );
michael@0 624 }
michael@0 625 // Converts NV21 (Y plane plus interleaved VU) to RGB565, 8 pixels per iteration; at least one iteration always runs.
michael@0 626 void NV21ToRGB565Row_NEON(const uint8* src_y,
michael@0 627 const uint8* src_uv,
michael@0 628 uint8* dst_rgb565,
michael@0 629 int width) {
michael@0 630 asm volatile (
michael@0 631 "vld1.8 {d24}, [%4] \n" // d24 = first 8 bytes of kUVToRB
michael@0 632 "vld1.8 {d25}, [%5] \n" // d25 = first 8 bytes of kUVToG
michael@0 633 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 634 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 635 "vmov.u16 q15, #16 \n" // Y offset
michael@0 636 ".p2align 2 \n"
michael@0 637 "1: \n"
michael@0 638 READNV21
michael@0 639 YUV422TORGB
michael@0 640 "subs %3, %3, #8 \n" // 8 pixels done
michael@0 641 ARGBTORGB565
michael@0 642 "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
michael@0 643 "bgt 1b \n" // repeat while width > 0
michael@0 644 : "+r"(src_y), // %0
michael@0 645 "+r"(src_uv), // %1
michael@0 646 "+r"(dst_rgb565), // %2
michael@0 647 "+r"(width) // %3
michael@0 648 : "r"(&kUVToRB), // %4
michael@0 649 "r"(&kUVToG) // %5
michael@0 650 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 651 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 652 );
michael@0 653 }
michael@0 654 // Converts packed YUY2 to ARGB, 8 pixels (16 source bytes) per iteration; at least one iteration always runs.
michael@0 655 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
michael@0 656 uint8* dst_argb,
michael@0 657 int width) {
michael@0 658 asm volatile (
michael@0 659 "vld1.8 {d24}, [%3] \n" // d24 = first 8 bytes of kUVToRB
michael@0 660 "vld1.8 {d25}, [%4] \n" // d25 = first 8 bytes of kUVToG
michael@0 661 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 662 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 663 "vmov.u16 q15, #16 \n" // Y offset
michael@0 664 ".p2align 2 \n"
michael@0 665 "1: \n"
michael@0 666 READYUY2
michael@0 667 YUV422TORGB
michael@0 668 "subs %2, %2, #8 \n" // 8 pixels done
michael@0 669 "vmov.u8 d23, #255 \n" // alpha = 255
michael@0 670 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" // store 8 pixels as B, G, R, A bytes
michael@0 671 "bgt 1b \n" // repeat while width > 0
michael@0 672 : "+r"(src_yuy2), // %0
michael@0 673 "+r"(dst_argb), // %1
michael@0 674 "+r"(width) // %2
michael@0 675 : "r"(&kUVToRB), // %3
michael@0 676 "r"(&kUVToG) // %4
michael@0 677 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 678 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 679 );
michael@0 680 }
michael@0 681 // Converts packed UYVY to ARGB, 8 pixels (16 source bytes) per iteration; at least one iteration always runs.
michael@0 682 void UYVYToARGBRow_NEON(const uint8* src_uyvy,
michael@0 683 uint8* dst_argb,
michael@0 684 int width) {
michael@0 685 asm volatile (
michael@0 686 "vld1.8 {d24}, [%3] \n" // d24 = first 8 bytes of kUVToRB
michael@0 687 "vld1.8 {d25}, [%4] \n" // d25 = first 8 bytes of kUVToG
michael@0 688 "vmov.u8 d26, #128 \n" // chroma bias consumed by YUV422TORGB
michael@0 689 "vmov.u16 q14, #74 \n" // Y multiplier
michael@0 690 "vmov.u16 q15, #16 \n" // Y offset
michael@0 691 ".p2align 2 \n"
michael@0 692 "1: \n"
michael@0 693 READUYVY
michael@0 694 YUV422TORGB
michael@0 695 "subs %2, %2, #8 \n" // 8 pixels done
michael@0 696 "vmov.u8 d23, #255 \n" // alpha = 255
michael@0 697 "vst4.8 {d20, d21, d22, d23}, [%1]! \n" // store 8 pixels as B, G, R, A bytes
michael@0 698 "bgt 1b \n" // repeat while width > 0
michael@0 699 : "+r"(src_uyvy), // %0
michael@0 700 "+r"(dst_argb), // %1
michael@0 701 "+r"(width) // %2
michael@0 702 : "r"(&kUVToRB), // %3
michael@0 703 "r"(&kUVToG) // %4
michael@0 704 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 705 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 706 );
michael@0 707 }
michael@0 708
michael@0 709 // Reads 16 pairs of UV per iteration and writes even bytes to dst_u and odd bytes to dst_v; at least one iteration always runs.
michael@0 710 void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
michael@0 711 int width) {
michael@0 712 asm volatile (
michael@0 713 ".p2align 2 \n"
michael@0 714 "1: \n"
michael@0 715 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
michael@0 716 "subs %3, %3, #16 \n" // 16 processed per loop
michael@0 717 "vst1.8 {q0}, [%1]! \n" // store U
michael@0 718 "vst1.8 {q1}, [%2]! \n" // store V
michael@0 719 "bgt 1b \n" // repeat while width > 0
michael@0 720 : "+r"(src_uv), // %0
michael@0 721 "+r"(dst_u), // %1
michael@0 722 "+r"(dst_v), // %2
michael@0 723 "+r"(width) // %3 // Output registers
michael@0 724 : // Input registers
michael@0 725 : "cc", "memory", "q0", "q1" // Clobber List
michael@0 726 );
michael@0 727 }
michael@0 728
michael@0 729 // Reads 16 U's and V's per iteration and writes out 16 interleaved pairs of UV; at least one iteration always runs.
michael@0 730 void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
michael@0 731 int width) {
michael@0 732 asm volatile (
michael@0 733 ".p2align 2 \n"
michael@0 734 "1: \n"
michael@0 735 "vld1.8 {q0}, [%0]! \n" // load U
michael@0 736 "vld1.8 {q1}, [%1]! \n" // load V
michael@0 737 "subs %3, %3, #16 \n" // 16 processed per loop
michael@0 738 "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
michael@0 739 "bgt 1b \n" // repeat while width > 0
michael@0 740 :
michael@0 741 "+r"(src_u), // %0
michael@0 742 "+r"(src_v), // %1
michael@0 743 "+r"(dst_uv), // %2
michael@0 744 "+r"(width) // %3 // Output registers
michael@0 745 : // Input registers
michael@0 746 : "cc", "memory", "q0", "q1" // Clobber List
michael@0 747 );
michael@0 748 }
michael@0 749
michael@0 750 // Copy multiple of 32 bytes per call using vld1.8/vst1.8 of four d registers. NOTE(review): this comment used to name vld4.8 as allowing unaligned access and being fastest on a15; the code uses vld1.8.
michael@0 751 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
michael@0 752 asm volatile (
michael@0 753 ".p2align 2 \n"
michael@0 754 "1: \n"
michael@0 755 "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
michael@0 756 "subs %2, %2, #32 \n" // 32 processed per loop
michael@0 757 "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
michael@0 758 "bgt 1b \n" // repeat while count > 0
michael@0 759 : "+r"(src), // %0
michael@0 760 "+r"(dst), // %1
michael@0 761 "+r"(count) // %2 // Output registers
michael@0 762 : // Input registers
michael@0 763 : "cc", "memory", "q0", "q1" // Clobber List
michael@0 764 );
michael@0 765 }
michael@0 766
michael@0 767 // SetRow_NEON writes 'count' bytes using a 32 bit value repeated; 16 bytes are stored per iteration and at least one iteration always runs.
michael@0 768 void SetRow_NEON(uint8* dst, uint32 v32, int count) {
michael@0 769 asm volatile (
michael@0 770 "vdup.u32 q0, %2 \n" // duplicate 4 ints
michael@0 771 "1: \n"
michael@0 772 "subs %1, %1, #16 \n" // 16 bytes per loop
michael@0 773 "vst1.8 {q0}, [%0]! \n" // store
michael@0 774 "bgt 1b \n" // repeat while count > 0
michael@0 775 : "+r"(dst), // %0
michael@0 776 "+r"(count) // %1
michael@0 777 : "r"(v32) // %2
michael@0 778 : "cc", "memory", "q0"
michael@0 779 );
michael@0 780 }
michael@0 781
michael@0 782 // TODO(fbarchard): Make fully assembler
michael@0 783 // ARGBSetRows_NEON fills 'height' rows with the 32 bit value v32: each row gets width << 2 bytes via SetRow_NEON, then dst advances by dst_stride bytes.
michael@0 784 void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
michael@0 785 int dst_stride, int height) {
michael@0 786 for (int y = 0; y < height; ++y) {
michael@0 787 SetRow_NEON(dst, v32, width << 2);  // width pixels * 4 bytes per pixel
michael@0 788 dst += dst_stride;
michael@0 789 }
michael@0 790 }
michael@0 791 // Writes the bytes of src to dst in reverse order, 16 bytes per iteration, reading src from its end backwards.
michael@0 792 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
michael@0 793 asm volatile (
michael@0 794 // Start at end of source row.
michael@0 795 "mov r3, #-16 \n" // post-index step for the load
michael@0 796 "add %0, %0, %2 \n" // src += width
michael@0 797 "sub %0, #16 \n" // point at the last 16 bytes
michael@0 798
michael@0 799 ".p2align 2 \n"
michael@0 800 "1: \n"
michael@0 801 "vld1.8 {q0}, [%0], r3 \n" // src -= 16
michael@0 802 "subs %2, #16 \n" // 16 pixels per loop.
michael@0 803 "vrev64.8 q0, q0 \n" // reverse the bytes inside each 64 bit half
michael@0 804 "vst1.8 {d1}, [%1]! \n" // high half first; the two stores advance dst by 16 in total
michael@0 805 "vst1.8 {d0}, [%1]! \n"
michael@0 806 "bgt 1b \n" // repeat while width > 0
michael@0 807 : "+r"(src), // %0
michael@0 808 "+r"(dst), // %1
michael@0 809 "+r"(width) // %2
michael@0 810 :
michael@0 811 : "cc", "memory", "r3", "q0"
michael@0 812 );
michael@0 813 }
michael@0 814 // Reads interleaved UV backwards from the end of src_uv, 8 pairs per iteration, writing reversed U to dst_u and reversed V to dst_v.
michael@0 815 void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
michael@0 816 int width) {
michael@0 817 asm volatile (
michael@0 818 // Start at end of source row.
michael@0 819 "mov r12, #-16 \n" // post-index step for the load
michael@0 820 "add %0, %0, %3, lsl #1 \n" // src_uv += width * 2
michael@0 821 "sub %0, #16 \n" // point at the last 8 UV pairs
michael@0 822
michael@0 823 ".p2align 2 \n"
michael@0 824 "1: \n"
michael@0 825 "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
michael@0 826 "subs %3, #8 \n" // 8 pixels per loop.
michael@0 827 "vrev64.8 q0, q0 \n" // reverse the 8 U bytes in d0 and the 8 V bytes in d1
michael@0 828 "vst1.8 {d0}, [%1]! \n" // dst += 8
michael@0 829 "vst1.8 {d1}, [%2]! \n"
michael@0 830 "bgt 1b \n" // repeat while width > 0
michael@0 831 : "+r"(src_uv), // %0
michael@0 832 "+r"(dst_u), // %1
michael@0 833 "+r"(dst_v), // %2
michael@0 834 "+r"(width) // %3
michael@0 835 :
michael@0 836 : "cc", "memory", "r12", "q0"
michael@0 837 );
michael@0 838 }
michael@0 839 // Writes the 4-byte pixels of src to dst in reverse pixel order, 4 pixels (16 bytes) per iteration, reading src from its end backwards.
michael@0 840 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
michael@0 841 asm volatile (
michael@0 842 // Start at end of source row.
michael@0 843 "mov r3, #-16 \n" // post-index step for the load
michael@0 844 "add %0, %0, %2, lsl #2 \n" // src += width * 4
michael@0 845 "sub %0, #16 \n" // point at the last 4 pixels
michael@0 846
michael@0 847 ".p2align 2 \n"
michael@0 848 "1: \n"
michael@0 849 "vld1.8 {q0}, [%0], r3 \n" // src -= 16
michael@0 850 "subs %2, #4 \n" // 4 pixels per loop.
michael@0 851 "vrev64.32 q0, q0 \n" // swap the two 32 bit pixels inside each 64 bit half
michael@0 852 "vst1.8 {d1}, [%1]! \n" // high half first; the two stores advance dst by 16 in total
michael@0 853 "vst1.8 {d0}, [%1]! \n"
michael@0 854 "bgt 1b \n" // repeat while width > 0
michael@0 855 : "+r"(src), // %0
michael@0 856 "+r"(dst), // %1
michael@0 857 "+r"(width) // %2
michael@0 858 :
michael@0 859 : "cc", "memory", "r3", "q0"
michael@0 860 );
michael@0 861 }
michael@0 862 // Expands RGB24 (3 bytes per pixel) to ARGB by appending alpha 255 as a fourth byte, 8 pixels per iteration.
michael@0 863 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
michael@0 864 asm volatile (
michael@0 865 "vmov.u8 d4, #255 \n" // Alpha
michael@0 866 ".p2align 2 \n"
michael@0 867 "1: \n"
michael@0 868 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
michael@0 869 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 870 "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
michael@0 871 "bgt 1b \n" // repeat while pix > 0
michael@0 872 : "+r"(src_rgb24), // %0
michael@0 873 "+r"(dst_argb), // %1
michael@0 874 "+r"(pix) // %2
michael@0 875 :
michael@0 876 : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
michael@0 877 );
michael@0 878 }
michael@0 879
// Expands 3-byte RAW pixels to 4-byte ARGB pixels: the first and third byte
// of every source pixel are exchanged (vswp d1, d3) and a constant 255 alpha
// byte is appended. Each pass converts 8 pixels (24 bytes in, 32 bytes out)
// and at least one pass always runs, so pix is effectively rounded up to a
// multiple of 8.
michael@0 880 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
michael@0 881 asm volatile (
michael@0 882 "vmov.u8 d4, #255 \n" // Alpha
michael@0 883 ".p2align 2 \n"
michael@0 884 "1: \n"
michael@0 885 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
michael@0 886 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 887 "vswp.u8 d1, d3 \n" // swap R, B
michael@0 888 "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
michael@0 889 "bgt 1b \n"
michael@0 890 : "+r"(src_raw), // %0
michael@0 891 "+r"(dst_argb), // %1
michael@0 892 "+r"(pix) // %2
michael@0 893 :
michael@0 894 : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
michael@0 895 );
michael@0 896 }
michael@0 897
// Assembly fragment: expands 8 RGB565 pixels held in q0 into 8 bit channels.
//   in:      q0 = 8 x 16 bit pixels
//   out:     d0 = B, d1 = G, d2 = R
//   scratch: d4, d5, d6 (q2 and the low half of q3)
// Each channel is widened by replicating its top bits into the low bits
// (5 -> 8 bits for B and R, 6 -> 8 bits for G). d3 is not touched.
michael@0 898 #define RGB565TOARGB \
michael@0 899 "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
michael@0 900 "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
michael@0 901 "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
michael@0 902 "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
michael@0 903 "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
michael@0 904 "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
michael@0 905 "vorr.u8 d0, d0, d4 \n" /* B */ \
michael@0 906 "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
michael@0 907 "vorr.u8 d2, d1, d5 \n" /* R */ \
michael@0 908 "vorr.u8 d1, d4, d6 \n" /* G */
michael@0 909
// Converts a row of 16 bit RGB565 pixels to 32 bit ARGB with alpha = 255.
// Each pass loads 16 bytes (8 pixels), expands them with RGB565TOARGB into
// d0 = B, d1 = G, d2 = R and stores them interleaved with d3 = 255.
// At least one pass always runs, so pix is effectively rounded up to a
// multiple of 8.
michael@0 910 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
michael@0 911 asm volatile (
michael@0 912 "vmov.u8 d3, #255 \n" // Alpha
michael@0 913 ".p2align 2 \n"
michael@0 914 "1: \n"
michael@0 915 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
michael@0 916 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 917 RGB565TOARGB
michael@0 918 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
michael@0 919 "bgt 1b \n"
michael@0 920 : "+r"(src_rgb565), // %0
michael@0 921 "+r"(dst_argb), // %1
michael@0 922 "+r"(pix) // %2
michael@0 923 :
michael@0 924 : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
michael@0 925 );
michael@0 926 }
michael@0 927
// Assembly fragment: expands 8 ARGB1555 pixels held in q0 into 8 bit channels.
//   in:      q0 = 8 x 16 bit pixels
//   out:     d0 = B, d1 = G, d2 = R, d3 = A (q0 = B,G and q1 = R,A)
//   scratch: q2, q3
// The 5 bit colour channels are widened by replicating their top 3 bits into
// the low bits; the single alpha bit becomes 0x00 or 0xff (shift down to
// bit 0, then negate).
// NOTE(review): the last line of this macro ends with a continuation
// backslash, so the line that follows is absorbed into the macro and must
// stay empty.
michael@0 928 #define ARGB1555TOARGB \
michael@0 929 "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
michael@0 930 "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
michael@0 931 "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
michael@0 932 "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
michael@0 933 "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
michael@0 934 "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
michael@0 935 "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
michael@0 936 "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
michael@0 937 "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
michael@0 938 "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
michael@0 939 "vorr.u8 q1, q1, q3 \n" /* R,A */ \
michael@0 940 "vorr.u8 q0, q0, q2 \n" /* B,G */ \
michael@0 941
michael@0 942 // RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
// Assembly fragment:
//   in:      q0 = 8 x 16 bit pixels
//   out:     d0 = B, d1 = G, d2 = R (d3 is not written)
//   scratch: d4, d5, d6
// The top bit of each pixel is shifted out and discarded; each 5 bit channel
// is widened to 8 bits by replicating its top 3 bits into the low bits.
michael@0 943 #define RGB555TOARGB \
michael@0 944 "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
michael@0 945 "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
michael@0 946 "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
michael@0 947 "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
michael@0 948 "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
michael@0 949 "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
michael@0 950 "vorr.u8 d0, d0, d4 \n" /* B */ \
michael@0 951 "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
michael@0 952 "vorr.u8 d2, d1, d5 \n" /* R */ \
michael@0 953 "vorr.u8 d1, d4, d6 \n" /* G */
michael@0 954
// Converts a row of 16 bit ARGB1555 pixels to 32 bit ARGB.
// Each pass loads 16 bytes (8 pixels), expands them with ARGB1555TOARGB into
// d0 = B, d1 = G, d2 = R, d3 = A and stores the four planes interleaved.
// At least one pass always runs, so pix is effectively rounded up to a
// multiple of 8.
michael@0 955 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
michael@0 956 int pix) {
michael@0 957 asm volatile (
// NOTE(review): ARGB1555TOARGB rewrites all of q1 (d2 and d3) on every pass,
// so this 255 never reaches the output; alpha comes from the source pixel.
michael@0 958 "vmov.u8 d3, #255 \n" // Alpha
michael@0 959 ".p2align 2 \n"
michael@0 960 "1: \n"
michael@0 961 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
michael@0 962 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 963 ARGB1555TOARGB
michael@0 964 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
michael@0 965 "bgt 1b \n"
michael@0 966 : "+r"(src_argb1555), // %0
michael@0 967 "+r"(dst_argb), // %1
michael@0 968 "+r"(pix) // %2
michael@0 969 :
michael@0 970 : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
michael@0 971 );
michael@0 972 }
michael@0 973
// Assembly fragment: expands 8 ARGB4444 pixels held in q0 into 8 bit channels.
//   in:      q0 = 8 x 16 bit pixels
//   out:     d0 = B, d1 = G, d2 = R, d3 = A
//   scratch: q2
// Every 4 bit channel is widened to 8 bits by copying the nibble into both
// halves of the byte (value | value << 4).
michael@0 974 #define ARGB4444TOARGB \
michael@0 975 "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
michael@0 976 "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
michael@0 977 "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
michael@0 978 "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
michael@0 979 "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
michael@0 980 "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
michael@0 981 "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
michael@0 982 "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
michael@0 983
// Converts a row of 16 bit ARGB4444 pixels to 32 bit ARGB.
// Each pass loads 16 bytes (8 pixels), expands them with ARGB4444TOARGB into
// d0 = B, d1 = G, d2 = R, d3 = A and stores the four planes interleaved.
// At least one pass always runs, so pix is effectively rounded up to a
// multiple of 8.
michael@0 984 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
michael@0 985 int pix) {
michael@0 986 asm volatile (
// NOTE(review): ARGB4444TOARGB rewrites all of q1 (d2 and d3) on every pass,
// so this 255 never reaches the output; alpha comes from the source pixel.
michael@0 987 "vmov.u8 d3, #255 \n" // Alpha
michael@0 988 ".p2align 2 \n"
michael@0 989 "1: \n"
michael@0 990 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
michael@0 991 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 992 ARGB4444TOARGB
michael@0 993 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
michael@0 994 "bgt 1b \n"
michael@0 995 : "+r"(src_argb4444), // %0
michael@0 996 "+r"(dst_argb), // %1
michael@0 997 "+r"(pix) // %2
michael@0 998 :
michael@0 999 : "cc", "memory", "q0", "q1", "q2" // Clobber List
michael@0 1000 );
michael@0 1001 }
michael@0 1002
// Packs 4-byte ARGB pixels into 3-byte pixels by dropping the fourth byte
// (loaded into d4 and never stored). The first three bytes keep their order.
// Each pass converts 8 pixels (32 bytes in, 24 bytes out) and at least one
// pass always runs, so pix is effectively rounded up to a multiple of 8.
michael@0 1003 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
michael@0 1004 asm volatile (
michael@0 1005 ".p2align 2 \n"
michael@0 1006 "1: \n"
michael@0 1007 "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
michael@0 1008 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 1009 "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
michael@0 1010 "bgt 1b \n"
michael@0 1011 : "+r"(src_argb), // %0
michael@0 1012 "+r"(dst_rgb24), // %1
michael@0 1013 "+r"(pix) // %2
michael@0 1014 :
michael@0 1015 : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
michael@0 1016 );
michael@0 1017 }
michael@0 1018
// Packs 4-byte ARGB pixels into 3-byte RAW pixels: the fourth byte (d4) is
// dropped and the first and third byte of each pixel are exchanged.
// Each pass converts 8 pixels (32 bytes in, 24 bytes out) and at least one
// pass always runs, so pix is effectively rounded up to a multiple of 8.
michael@0 1019 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
michael@0 1020 asm volatile (
michael@0 1021 ".p2align 2 \n"
michael@0 1022 "1: \n"
michael@0 1023 "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
michael@0 1024 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 1025 "vswp.u8 d1, d3 \n" // swap R, B
michael@0 1026 "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
michael@0 1027 "bgt 1b \n"
michael@0 1028 : "+r"(src_argb), // %0
michael@0 1029 "+r"(dst_raw), // %1
michael@0 1030 "+r"(pix) // %2
michael@0 1031 :
michael@0 1032 : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
michael@0 1033 );
michael@0 1034 }
michael@0 1035
// Extracts the luma plane from a YUY2 row: every even-indexed source byte is
// copied to dst_y. vld2.8 splits 32 source bytes into even bytes (q0) and odd
// bytes (q1); only q0 is stored. Each pass yields 16 pixels and at least one
// pass always runs, so pix is effectively rounded up to a multiple of 16.
michael@0 1036 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
michael@0 1037 asm volatile (
michael@0 1038 ".p2align 2 \n"
michael@0 1039 "1: \n"
michael@0 1040 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
michael@0 1041 "subs %2, %2, #16 \n" // 16 processed per loop.
michael@0 1042 "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
michael@0 1043 "bgt 1b \n"
michael@0 1044 : "+r"(src_yuy2), // %0
michael@0 1045 "+r"(dst_y), // %1
michael@0 1046 "+r"(pix) // %2
michael@0 1047 :
michael@0 1048 : "cc", "memory", "q0", "q1" // Clobber List
michael@0 1049 );
michael@0 1050 }
michael@0 1051
// Extracts the luma plane from a UYVY row: every odd-indexed source byte is
// copied to dst_y. vld2.8 splits 32 source bytes into even bytes (q0) and odd
// bytes (q1); only q1 is stored. Each pass yields 16 pixels and at least one
// pass always runs, so pix is effectively rounded up to a multiple of 16.
michael@0 1052 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
michael@0 1053 asm volatile (
michael@0 1054 ".p2align 2 \n"
michael@0 1055 "1: \n"
michael@0 1056 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
michael@0 1057 "subs %2, %2, #16 \n" // 16 processed per loop.
michael@0 1058 "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
michael@0 1059 "bgt 1b \n"
michael@0 1060 : "+r"(src_uyvy), // %0
michael@0 1061 "+r"(dst_y), // %1
michael@0 1062 "+r"(pix) // %2
michael@0 1063 :
michael@0 1064 : "cc", "memory", "q0", "q1" // Clobber List
michael@0 1065 );
michael@0 1066 }
michael@0 1067
// Extracts the chroma planes from a single YUY2 row without vertical
// filtering. vld4.8 splits 32 source bytes by (byte index mod 4) into
// d0..d3; byte 1 of each 4-byte group (d1) goes to dst_u and byte 3 (d3)
// goes to dst_v. Each pass consumes 16 pixels and writes 8 U and 8 V; at
// least one pass always runs, so pix is effectively rounded up to a
// multiple of 16.
michael@0 1068 void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
michael@0 1069 int pix) {
michael@0 1070 asm volatile (
michael@0 1071 ".p2align 2 \n"
michael@0 1072 "1: \n"
michael@0 1073 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
michael@0 1074 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
michael@0 1075 "vst1.8 {d1}, [%1]! \n" // store 8 U.
michael@0 1076 "vst1.8 {d3}, [%2]! \n" // store 8 V.
michael@0 1077 "bgt 1b \n"
michael@0 1078 : "+r"(src_yuy2), // %0
michael@0 1079 "+r"(dst_u), // %1
michael@0 1080 "+r"(dst_v), // %2
michael@0 1081 "+r"(pix) // %3
michael@0 1082 :
michael@0 1083 : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
michael@0 1084 );
michael@0 1085 }
michael@0 1086
// Extracts the chroma planes from a single UYVY row without vertical
// filtering. vld4.8 splits 32 source bytes by (byte index mod 4) into
// d0..d3; byte 0 of each 4-byte group (d0) goes to dst_u and byte 2 (d2)
// goes to dst_v. Each pass consumes 16 pixels and writes 8 U and 8 V; at
// least one pass always runs, so pix is effectively rounded up to a
// multiple of 16.
michael@0 1087 void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
michael@0 1088 int pix) {
michael@0 1089 asm volatile (
michael@0 1090 ".p2align 2 \n"
michael@0 1091 "1: \n"
michael@0 1092 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
michael@0 1093 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
michael@0 1094 "vst1.8 {d0}, [%1]! \n" // store 8 U.
michael@0 1095 "vst1.8 {d2}, [%2]! \n" // store 8 V.
michael@0 1096 "bgt 1b \n"
michael@0 1097 : "+r"(src_uyvy), // %0
michael@0 1098 "+r"(dst_u), // %1
michael@0 1099 "+r"(dst_v), // %2
michael@0 1100 "+r"(pix) // %3
michael@0 1101 :
michael@0 1102 : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
michael@0 1103 );
michael@0 1104 }
michael@0 1105
// Extracts chroma from two vertically adjacent YUY2 rows and averages them.
// The register holding stride_yuy2 is reused as the second-row pointer
// (src_yuy2 + stride_yuy2). For every 4-byte group, byte 1 (U) and byte 3 (V)
// of both rows are combined with a rounding average (vrhadd) and written to
// dst_u / dst_v. Each pass consumes 16 pixels per row and writes 8 U and 8 V;
// at least one pass always runs, so pix is effectively rounded up to a
// multiple of 16.
michael@0 1106 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
michael@0 1107 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 1108 asm volatile (
michael@0 1109 "add %1, %0, %1 \n" // stride + src_yuy2
michael@0 1110 ".p2align 2 \n"
michael@0 1111 "1: \n"
michael@0 1112 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
michael@0 1113 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
michael@0 1114 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
michael@0 1115 "vrhadd.u8 d1, d1, d5 \n" // average rows of U
michael@0 1116 "vrhadd.u8 d3, d3, d7 \n" // average rows of V
michael@0 1117 "vst1.8 {d1}, [%2]! \n" // store 8 U.
michael@0 1118 "vst1.8 {d3}, [%3]! \n" // store 8 V.
michael@0 1119 "bgt 1b \n"
michael@0 1120 : "+r"(src_yuy2), // %0
michael@0 1121 "+r"(stride_yuy2), // %1
michael@0 1122 "+r"(dst_u), // %2
michael@0 1123 "+r"(dst_v), // %3
michael@0 1124 "+r"(pix) // %4
michael@0 1125 :
michael@0 1126 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
michael@0 1127 );
michael@0 1128 }
michael@0 1129
// Extracts chroma from two vertically adjacent UYVY rows and averages them.
// The register holding stride_uyvy is reused as the second-row pointer
// (src_uyvy + stride_uyvy). For every 4-byte group, byte 0 (U) and byte 2 (V)
// of both rows are combined with a rounding average (vrhadd) and written to
// dst_u / dst_v. Each pass consumes 16 pixels per row and writes 8 U and 8 V;
// at least one pass always runs, so pix is effectively rounded up to a
// multiple of 16.
michael@0 1130 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
michael@0 1131 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 1132 asm volatile (
michael@0 1133 "add %1, %0, %1 \n" // stride + src_uyvy
michael@0 1134 ".p2align 2 \n"
michael@0 1135 "1: \n"
michael@0 1136 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
michael@0 1137 "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
michael@0 1138 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
michael@0 1139 "vrhadd.u8 d0, d0, d4 \n" // average rows of U
michael@0 1140 "vrhadd.u8 d2, d2, d6 \n" // average rows of V
michael@0 1141 "vst1.8 {d0}, [%2]! \n" // store 8 U.
michael@0 1142 "vst1.8 {d2}, [%3]! \n" // store 8 V.
michael@0 1143 "bgt 1b \n"
michael@0 1144 : "+r"(src_uyvy), // %0
michael@0 1145 "+r"(stride_uyvy), // %1
michael@0 1146 "+r"(dst_u), // %2
michael@0 1147 "+r"(dst_v), // %3
michael@0 1148 "+r"(pix) // %4
michael@0 1149 :
michael@0 1150 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
michael@0 1151 );
michael@0 1152 }
michael@0 1153
// Writes the byte-wise rounding average of two rows to dst_uv.
// The second row starts at src_uv + src_uv_stride; the register holding the
// stride is reused as that row's pointer. Each pass averages 16 bytes and at
// least one pass always runs, so pix is effectively rounded up to a multiple
// of 16.
michael@0 1154 void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
michael@0 1155 uint8* dst_uv, int pix) {
michael@0 1156 asm volatile (
michael@0 1157 // change the stride to row 2 pointer
michael@0 1158 "add %1, %0 \n"
michael@0 1159 "1: \n"
michael@0 1160 "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
michael@0 1161 "subs %3, %3, #16 \n" // 16 processed per loop
michael@0 1162 "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
michael@0 1163 "vrhadd.u8 q0, q1 \n" // average row 1 and 2
michael@0 1164 "vst1.8 {q0}, [%2]! \n"
michael@0 1165 "bgt 1b \n"
michael@0 1166 : "+r"(src_uv), // %0
michael@0 1167 "+r"(src_uv_stride), // %1
michael@0 1168 "+r"(dst_uv), // %2
michael@0 1169 "+r"(pix) // %3
michael@0 1170 :
michael@0 1171 : "cc", "memory", "q0", "q1" // Clobber List
michael@0 1172 );
michael@0 1173 }
michael@0 1174
michael@0 1175 // Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
// selector holds 4 byte indices into a group of 4 ARGB pixels (16 bytes).
// The same indices are applied to each of the two 4-pixel groups loaded per
// pass, giving 4 + 4 output bytes. Only the low 32 bits of d6 are set here;
// the results looked up through its upper 4 index bytes land in the upper
// halves of d4/d5 and are discarded by the vtrn that merges the two low words.
// Each pass consumes 8 pixels and writes 8 bytes; at least one pass always
// runs, so pix is effectively rounded up to a multiple of 8.
michael@0 1176 void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
michael@0 1177 uint32 selector, int pix) {
michael@0 1178 asm volatile (
michael@0 1179 "vmov.u32 d6[0], %3 \n" // selector
michael@0 1180 "1: \n"
michael@0 1181 "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
michael@0 1182 "subs %2, %2, #8 \n" // 8 processed per loop
michael@0 1183 "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
michael@0 1184 "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
michael@0 1185 "vtrn.u32 d4, d5 \n" // combine 8 pixels
michael@0 1186 "vst1.8 {d4}, [%1]! \n" // store 8.
michael@0 1187 "bgt 1b \n"
michael@0 1188 : "+r"(src_argb), // %0
michael@0 1189 "+r"(dst_bayer), // %1
michael@0 1190 "+r"(pix) // %2
michael@0 1191 : "r"(selector) // %3
michael@0 1192 : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
michael@0 1193 );
michael@0 1194 }
michael@0 1195
michael@0 1196 // Select G channels from ARGB. e.g. GGGGGGGG
// Copies byte 1 of every 4-byte pixel to dst_bayer. The selector parameter
// is unnamed and ignored; it is presumably kept so the signature matches
// ARGBToBayerRow_NEON -- TODO confirm. Each pass consumes 8 pixels and
// writes 8 bytes; at least one pass always runs, so pix is effectively
// rounded up to a multiple of 8.
michael@0 1197 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
michael@0 1198 uint32 /*selector*/, int pix) {
michael@0 1199 asm volatile (
michael@0 1200 "1: \n"
michael@0 1201 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
michael@0 1202 "subs %2, %2, #8 \n" // 8 processed per loop
michael@0 1203 "vst1.8 {d1}, [%1]! \n" // store 8 G's.
michael@0 1204 "bgt 1b \n"
michael@0 1205 : "+r"(src_argb), // %0
michael@0 1206 "+r"(dst_bayer), // %1
michael@0 1207 "+r"(pix) // %2
michael@0 1208 :
michael@0 1209 : "cc", "memory", "q0", "q1" // Clobber List
michael@0 1210 );
michael@0 1211 }
michael@0 1212
michael@0 1213 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the bytes of each group of 4 pixels (16 bytes) using a table
// lookup. shuffler points at 16 byte indices, read once before the loop:
// output byte i of every group is source byte shuffler[i] of the same group.
// Each pass handles 4 pixels and at least one pass always runs, so pix is
// effectively rounded up to a multiple of 4.
michael@0 1214 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
michael@0 1215 const uint8* shuffler, int pix) {
michael@0 1216 asm volatile (
michael@0 1217 "vld1.8 {q2}, [%3] \n" // shuffler
michael@0 1218 "1: \n"
michael@0 1219 "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
michael@0 1220 "subs %2, %2, #4 \n" // 4 processed per loop
michael@0 1221 "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
michael@0 1222 "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
michael@0 1223 "vst1.8 {q1}, [%1]! \n" // store 4.
michael@0 1224 "bgt 1b \n"
michael@0 1225 : "+r"(src_argb), // %0
michael@0 1226 "+r"(dst_argb), // %1
michael@0 1227 "+r"(pix) // %2
michael@0 1228 : "r"(shuffler) // %3
michael@0 1229 : "cc", "memory", "q0", "q1", "q2" // Clobber List
michael@0 1230 );
michael@0 1231 }
michael@0 1232
// Interleaves separate Y, U and V rows into packed YUY2 (Y0 U Y1 V).
// vld2.8 splits 16 Y bytes into even-indexed (d0) and odd-indexed (d2)
// samples; with d1 = 8 U and d3 = 8 V, vst4.8 writes the four planes
// interleaved. Each pass consumes 16 Y, 8 U and 8 V and writes 32 bytes; at
// least one pass always runs, so width is effectively rounded up to a
// multiple of 16.
michael@0 1233 void I422ToYUY2Row_NEON(const uint8* src_y,
michael@0 1234 const uint8* src_u,
michael@0 1235 const uint8* src_v,
michael@0 1236 uint8* dst_yuy2, int width) {
michael@0 1237 asm volatile (
michael@0 1238 ".p2align 2 \n"
michael@0 1239 "1: \n"
michael@0 1240 "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
michael@0 1241 "vld1.8 {d1}, [%1]! \n" // load 8 Us
michael@0 1242 "vld1.8 {d3}, [%2]! \n" // load 8 Vs
michael@0 1243 "subs %4, %4, #16 \n" // 16 pixels
michael@0 1244 "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
michael@0 1245 "bgt 1b \n"
michael@0 1246 : "+r"(src_y), // %0
michael@0 1247 "+r"(src_u), // %1
michael@0 1248 "+r"(src_v), // %2
michael@0 1249 "+r"(dst_yuy2), // %3
michael@0 1250 "+r"(width) // %4
michael@0 1251 :
michael@0 1252 : "cc", "memory", "d0", "d1", "d2", "d3"
michael@0 1253 );
michael@0 1254 }
michael@0 1255
// Interleaves separate Y, U and V rows into packed UYVY (U Y0 V Y1).
// vld2.8 splits 16 Y bytes into even-indexed (d1) and odd-indexed (d3)
// samples; with d0 = 8 U and d2 = 8 V, vst4.8 writes the four planes
// interleaved. Each pass consumes 16 Y, 8 U and 8 V and writes 32 bytes; at
// least one pass always runs, so width is effectively rounded up to a
// multiple of 16.
michael@0 1256 void I422ToUYVYRow_NEON(const uint8* src_y,
michael@0 1257 const uint8* src_u,
michael@0 1258 const uint8* src_v,
michael@0 1259 uint8* dst_uyvy, int width) {
michael@0 1260 asm volatile (
michael@0 1261 ".p2align 2 \n"
michael@0 1262 "1: \n"
michael@0 1263 "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
michael@0 1264 "vld1.8 {d0}, [%1]! \n" // load 8 Us
michael@0 1265 "vld1.8 {d2}, [%2]! \n" // load 8 Vs
michael@0 1266 "subs %4, %4, #16 \n" // 16 pixels
michael@0 1267 "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
michael@0 1268 "bgt 1b \n"
michael@0 1269 : "+r"(src_y), // %0
michael@0 1270 "+r"(src_u), // %1
michael@0 1271 "+r"(src_v), // %2
michael@0 1272 "+r"(dst_uyvy), // %3
michael@0 1273 "+r"(width) // %4
michael@0 1274 :
michael@0 1275 : "cc", "memory", "d0", "d1", "d2", "d3"
michael@0 1276 );
michael@0 1277 }
michael@0 1278
// Packs a row of 32 bit ARGB pixels into 16 bit RGB565.
// Each pass de-interleaves 8 pixels into d20..d23 (one plane per byte
// position), runs the ARGBTORGB565 fragment and stores q0 (16 bytes).
// ARGBTORGB565 is defined outside this excerpt; it presumably consumes
// d20..d23 and leaves the packed pixels in q0 -- TODO confirm against the
// macro. At least one pass always runs, so pix is effectively rounded up to
// a multiple of 8.
michael@0 1279 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
michael@0 1280 asm volatile (
michael@0 1281 ".p2align 2 \n"
michael@0 1282 "1: \n"
michael@0 1283 "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
michael@0 1284 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 1285 ARGBTORGB565
michael@0 1286 "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
michael@0 1287 "bgt 1b \n"
michael@0 1288 : "+r"(src_argb), // %0
michael@0 1289 "+r"(dst_rgb565), // %1
michael@0 1290 "+r"(pix) // %2
michael@0 1291 :
michael@0 1292 : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
michael@0 1293 );
michael@0 1294 }
michael@0 1295
// Packs a row of 32 bit ARGB pixels into 16 bit ARGB1555.
// Each pass de-interleaves 8 pixels into d20..d23 (one plane per byte
// position), runs the ARGBTOARGB1555 fragment and stores q0 (16 bytes).
// ARGBTOARGB1555 is defined outside this excerpt; it presumably consumes
// d20..d23 and leaves the packed pixels in q0 -- TODO confirm against the
// macro. At least one pass always runs, so pix is effectively rounded up to
// a multiple of 8.
michael@0 1296 void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
michael@0 1297 int pix) {
michael@0 1298 asm volatile (
michael@0 1299 ".p2align 2 \n"
michael@0 1300 "1: \n"
michael@0 1301 "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
michael@0 1302 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 1303 ARGBTOARGB1555
michael@0 1304 "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
michael@0 1305 "bgt 1b \n"
michael@0 1306 : "+r"(src_argb), // %0
michael@0 1307 "+r"(dst_argb1555), // %1
michael@0 1308 "+r"(pix) // %2
michael@0 1309 :
michael@0 1310 : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
michael@0 1311 );
michael@0 1312 }
michael@0 1313
// Packs a row of 32 bit ARGB pixels into 16 bit ARGB4444.
// Each pass de-interleaves 8 pixels into d20..d23 (one plane per byte
// position), runs the ARGBTOARGB4444 fragment and stores q0 (16 bytes).
// ARGBTOARGB4444 is defined outside this excerpt; it presumably consumes
// d20..d23 plus the 0x0f mask in d4 and leaves the packed pixels in q0 --
// TODO confirm against the macro. At least one pass always runs, so pix is
// effectively rounded up to a multiple of 8.
michael@0 1314 void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
michael@0 1315 int pix) {
michael@0 1316 asm volatile (
michael@0 1317 "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
michael@0 1318 ".p2align 2 \n"
michael@0 1319 "1: \n"
michael@0 1320 "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
michael@0 1321 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 1322 ARGBTOARGB4444
michael@0 1323 "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
michael@0 1324 "bgt 1b \n"
michael@0 1325 : "+r"(src_argb), // %0
michael@0 1326 "+r"(dst_argb4444), // %1
michael@0 1327 "+r"(pix) // %2
michael@0 1328 :
// q2 is listed because the mask is written to d4 (the low half of q2) above;
// without it the compiler could keep a live value in that register.
michael@0 1329 : "cc", "memory", "q0", "q2", "q8", "q9", "q10", "q11"
michael@0 1330 );
michael@0 1331 }
michael@0 1332
// Computes one luma byte per ARGB pixel:
//   Y = sat8(round((13 * B + 65 * G + 33 * R) / 128)) + 16   (saturating add)
// where B, G, R are bytes 0, 1, 2 of the pixel; byte 3 is loaded but unused.
// Each pass converts 8 pixels and at least one pass always runs, so pix is
// effectively rounded up to a multiple of 8.
michael@0 1333 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
michael@0 1334 asm volatile (
michael@0 1335 "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
michael@0 1336 "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
michael@0 1337 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
michael@0 1338 "vmov.u8 d27, #16 \n" // Add 16 constant
michael@0 1339 ".p2align 2 \n"
michael@0 1340 "1: \n"
michael@0 1341 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
michael@0 1342 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 1343 "vmull.u8 q2, d0, d24 \n" // B
michael@0 1344 "vmlal.u8 q2, d1, d25 \n" // G
michael@0 1345 "vmlal.u8 q2, d2, d26 \n" // R
michael@0 1346 "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
michael@0 1347 "vqadd.u8 d0, d27 \n" // + 16, saturating
michael@0 1348 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
michael@0 1349 "bgt 1b \n"
michael@0 1350 : "+r"(src_argb), // %0
michael@0 1351 "+r"(dst_y), // %1
michael@0 1352 "+r"(pix) // %2
michael@0 1353 :
michael@0 1354 : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
michael@0 1355 );
michael@0 1356 }
michael@0 1357
// Computes one luma byte per ARGB pixel with no offset added:
//   Y = sat8(round((15 * B + 75 * G + 38 * R) / 128))
// where B, G, R are bytes 0, 1, 2 of the pixel; byte 3 is loaded but unused.
// The three weights sum to 128. Each pass converts 8 pixels and at least one
// pass always runs, so pix is effectively rounded up to a multiple of 8.
michael@0 1358 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
michael@0 1359 asm volatile (
michael@0 1360 "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
michael@0 1361 "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
michael@0 1362 "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
michael@0 1363 ".p2align 2 \n"
michael@0 1364 "1: \n"
michael@0 1365 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
michael@0 1366 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 1367 "vmull.u8 q2, d0, d24 \n" // B
michael@0 1368 "vmlal.u8 q2, d1, d25 \n" // G
michael@0 1369 "vmlal.u8 q2, d2, d26 \n" // R
michael@0 1370 "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
michael@0 1371 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
michael@0 1372 "bgt 1b \n"
michael@0 1373 : "+r"(src_argb), // %0
michael@0 1374 "+r"(dst_y), // %1
michael@0 1375 "+r"(pix) // %2
michael@0 1376 :
michael@0 1377 : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
michael@0 1378 );
michael@0 1379 }
michael@0 1380
michael@0 1381 // 8x1 pixels.
// Computes one U and one V byte per ARGB pixel (no subsampling), using
// 16 bit wrap-around arithmetic followed by a saturating narrow:
//   U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
//   V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8
// where B, G, R are bytes 0, 1, 2 of the pixel; byte 3 is loaded but unused.
// Each pass converts 8 pixels and at least one pass always runs, so pix is
// effectively rounded up to a multiple of 8.
michael@0 1382 void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
michael@0 1383 int pix) {
michael@0 1384 asm volatile (
michael@0 1385 "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient
michael@0 1386 "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
michael@0 1387 "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
michael@0 1388 "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
michael@0 1389 "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
michael@0 1390 "vmov.u16 q15, #0x8080 \n" // 128.5
michael@0 1391 ".p2align 2 \n"
michael@0 1392 "1: \n"
michael@0 1393 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
michael@0 1394 "subs %3, %3, #8 \n" // 8 processed per loop.
michael@0 1395 "vmull.u8 q2, d0, d24 \n" // B
michael@0 1396 "vmlsl.u8 q2, d1, d25 \n" // G
michael@0 1397 "vmlsl.u8 q2, d2, d26 \n" // R
michael@0 1398 "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
michael@0 1399
michael@0 1400 "vmull.u8 q3, d2, d24 \n" // R
michael@0 1401 "vmlsl.u8 q3, d1, d28 \n" // G
michael@0 1402 "vmlsl.u8 q3, d0, d27 \n" // B
michael@0 1403 "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
michael@0 1404
michael@0 1405 "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
michael@0 1406 "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
michael@0 1407
michael@0 1408 "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
michael@0 1409 "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
michael@0 1410 "bgt 1b \n"
michael@0 1411 : "+r"(src_argb), // %0
michael@0 1412 "+r"(dst_u), // %1
michael@0 1413 "+r"(dst_v), // %2
michael@0 1414 "+r"(pix) // %3
michael@0 1415 :
michael@0 1416 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
michael@0 1417 );
michael@0 1418 }
michael@0 1419
michael@0 1420 // 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
// Produces one U and one V byte for every horizontal pair of ARGB pixels.
// The two pixels of a pair are summed per channel (vpaddl), so the
// coefficients are halved to compensate; the result is then biased by
// 0x8080 and narrowed with >> 8, as in the per-pixel conversion.
// The alpha plane (q3) is loaded but unused. Each pass consumes 16 pixels
// and writes 8 U and 8 V; at least one pass always runs, so pix is
// effectively rounded up to a multiple of 16.
michael@0 1421 void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
michael@0 1422 int pix) {
michael@0 1423 asm volatile (
michael@0 1424 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
michael@0 1425 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
michael@0 1426 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
michael@0 1427 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
michael@0 1428 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
michael@0 1429 "vmov.u16 q15, #0x8080 \n" // 128.5
michael@0 1430 ".p2align 2 \n"
michael@0 1431 "1: \n"
michael@0 1432 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
michael@0 1433 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
michael@0 1434
michael@0 1435 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
michael@0 1436 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
michael@0 1437 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
michael@0 1438
michael@0 1439 "subs %3, %3, #16 \n" // 16 processed per loop.
michael@0 1440 "vmul.s16 q8, q0, q10 \n" // B
michael@0 1441 "vmls.s16 q8, q1, q11 \n" // G
michael@0 1442 "vmls.s16 q8, q2, q12 \n" // R
michael@0 1443 "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
michael@0 1444
michael@0 1445 "vmul.s16 q9, q2, q10 \n" // R
michael@0 1446 "vmls.s16 q9, q1, q14 \n" // G
michael@0 1447 "vmls.s16 q9, q0, q13 \n" // B
michael@0 1448 "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
michael@0 1449
michael@0 1450 "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
michael@0 1451 "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
michael@0 1452
michael@0 1453 "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
michael@0 1454 "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
michael@0 1455 "bgt 1b \n"
michael@0 1456 : "+r"(src_argb), // %0
michael@0 1457 "+r"(dst_u), // %1
michael@0 1458 "+r"(dst_v), // %2
michael@0 1459 "+r"(pix) // %3
michael@0 1460 :
michael@0 1461 : "cc", "memory", "q0", "q1", "q2", "q3",
michael@0 1462 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 1463 );
michael@0 1464 }
michael@0 1465
michael@0 1466 // 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32.
// Produces one U and one V byte for every 4 horizontally adjacent ARGB
// pixels. Per channel, vpaddl sums adjacent pairs and vpadd sums adjacent
// pair-sums, giving the sum of 4 pixels; the rounding shift by 1 reduces
// that to twice the 4-pixel average, which the halved coefficients
// compensate for. The alpha planes (q3, q7) are loaded but unused.
// Each pass consumes 32 pixels and writes 8 U and 8 V; at least one pass
// always runs, so pix is effectively rounded up to a multiple of 32.
michael@0 1467 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
michael@0 1468 int pix) {
michael@0 1469 asm volatile (
michael@0 1470 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
michael@0 1471 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
michael@0 1472 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
michael@0 1473 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
michael@0 1474 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
michael@0 1475 "vmov.u16 q15, #0x8080 \n" // 128.5
michael@0 1476 ".p2align 2 \n"
michael@0 1477 "1: \n"
michael@0 1478 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
michael@0 1479 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
michael@0 1480 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
michael@0 1481 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
michael@0 1482 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
michael@0 1483 "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
michael@0 1484 "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
michael@0 1485 "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
michael@0 1486 "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
michael@0 1487 "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts.
michael@0 1488
michael@0 1489 "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts.
michael@0 1490 "vpadd.u16 d1, d8, d9 \n" // B
michael@0 1491 "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts.
michael@0 1492 "vpadd.u16 d3, d10, d11 \n" // G
michael@0 1493 "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts.
michael@0 1494 "vpadd.u16 d5, d12, d13 \n" // R
michael@0 1495
michael@0 1496 "vrshr.u16 q0, q0, #1 \n" // 2x average
michael@0 1497 "vrshr.u16 q1, q1, #1 \n"
michael@0 1498 "vrshr.u16 q2, q2, #1 \n"
michael@0 1499
michael@0 1500 "subs %3, %3, #32 \n" // 32 processed per loop.
michael@0 1501 "vmul.s16 q8, q0, q10 \n" // B
michael@0 1502 "vmls.s16 q8, q1, q11 \n" // G
michael@0 1503 "vmls.s16 q8, q2, q12 \n" // R
michael@0 1504 "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
michael@0 1505 "vmul.s16 q9, q2, q10 \n" // R
michael@0 1506 "vmls.s16 q9, q1, q14 \n" // G
michael@0 1507 "vmls.s16 q9, q0, q13 \n" // B
michael@0 1508 "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
michael@0 1509 "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
michael@0 1510 "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
michael@0 1511 "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
michael@0 1512 "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
michael@0 1513 "bgt 1b \n"
michael@0 1514 : "+r"(src_argb), // %0
michael@0 1515 "+r"(dst_u), // %1
michael@0 1516 "+r"(dst_v), // %2
michael@0 1517 "+r"(pix) // %3
michael@0 1518 :
michael@0 1519 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
michael@0 1520 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 1521 );
michael@0 1522 }
michael@0 1523
michael@0 1524 // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
// Assembly fragment: turns 8 (B, G, R) triples of 16 bit values into 8 U and
// 8 V bytes.
//   in:      QB, QG, QR = q registers holding the B, G and R values
//            q10 = UB/VR, q11 = UG, q12 = UR, q13 = VB, q14 = VG coefficients
//            q15 = bias added before the narrowing shift
//   out:     d0 = U, d1 = V
//   scratch: q8, q9
// All reads of QB/QG/QR happen before d0/d1 are written, so passing q0 as an
// input is safe.
michael@0 1525 #define RGBTOUV(QB, QG, QR) \
michael@0 1526 "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
michael@0 1527 "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
michael@0 1528 "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
michael@0 1529 "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
michael@0 1530 "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
michael@0 1531 "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
michael@0 1532 "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
michael@0 1533 "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
michael@0 1534 "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
michael@0 1535 "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
michael@0 1536
michael@0 1537 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// Produces one U and one V byte for every 2x2 block of ARGB pixels taken
// from two adjacent rows. The register holding src_stride_argb is reused as
// the second-row pointer (src_argb + src_stride_argb). Per channel, vpaddl
// sums horizontal pairs of the first row and vpadal adds the horizontal
// pairs of the second row, giving the sum of 4 pixels; the rounding shift by
// 1 leaves twice the 2x2 average, which the halved coefficients compensate
// for. The alpha planes (q3, q7) are loaded but unused.
// Each pass consumes 16 pixels from each row and writes 8 U and 8 V; at
// least one pass always runs, so pix is effectively rounded up to a multiple
// of 16.
michael@0 1538 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
michael@0 1539 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 1540 asm volatile (
michael@0 1541 "add %1, %0, %1 \n" // src_stride + src_argb
michael@0 1542 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
michael@0 1543 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
michael@0 1544 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
michael@0 1545 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
michael@0 1546 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
michael@0 1547 "vmov.u16 q15, #0x8080 \n" // 128.5
michael@0 1548 ".p2align 2 \n"
michael@0 1549 "1: \n"
michael@0 1550 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
michael@0 1551 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
michael@0 1552 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
michael@0 1553 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
michael@0 1554 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
michael@0 1555 "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
michael@0 1556 "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
michael@0 1557 "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
michael@0 1558 "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
michael@0 1559 "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
michael@0 1560
michael@0 1561 "vrshr.u16 q0, q0, #1 \n" // 2x average
michael@0 1562 "vrshr.u16 q1, q1, #1 \n"
michael@0 1563 "vrshr.u16 q2, q2, #1 \n"
michael@0 1564
michael@0 1565 "subs %4, %4, #16 \n" // 16 pixels per row (32 over both rows) per loop.
michael@0 1566 RGBTOUV(q0, q1, q2)
michael@0 1567 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
michael@0 1568 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
michael@0 1569 "bgt 1b \n"
michael@0 1570 : "+r"(src_argb), // %0
michael@0 1571 "+r"(src_stride_argb), // %1
michael@0 1572 "+r"(dst_u), // %2
michael@0 1573 "+r"(dst_v), // %3
michael@0 1574 "+r"(pix) // %4
michael@0 1575 :
michael@0 1576 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
michael@0 1577 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 1578 );
michael@0 1579 }
michael@0 1580
michael@0 1581 // TODO(fbarchard): Subsample match C code.
// Produces one U and one V byte for every 2x2 block of ARGB pixels taken
// from two adjacent rows, using the coefficient set loaded below
// (127 / 84 / 43 / 20 / 107, each halved). The register holding
// src_stride_argb is reused as the second-row pointer
// (src_argb + src_stride_argb). Per channel, vpaddl sums horizontal pairs of
// the first row and vpadal adds the horizontal pairs of the second row; the
// rounding shift by 1 leaves twice the 2x2 average, which the halved
// coefficients compensate for. The alpha planes (q3, q7) are loaded but
// unused.
// NOTE(review): the odd coefficients rely on the assembler evaluating the
// "/ 2" expressions; presumably integer division (127 / 2 -> 63) -- TODO
// confirm.
// Each pass consumes 16 pixels from each row and writes 8 U and 8 V; at
// least one pass always runs, so pix is effectively rounded up to a multiple
// of 16.
michael@0 1582 void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
michael@0 1583 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 1584 asm volatile (
michael@0 1585 "add %1, %0, %1 \n" // src_stride + src_argb
michael@0 1586 "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
michael@0 1587 "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
michael@0 1588 "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
michael@0 1589 "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
michael@0 1590 "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
michael@0 1591 "vmov.u16 q15, #0x8080 \n" // 128.5
michael@0 1592 ".p2align 2 \n"
michael@0 1593 "1: \n"
michael@0 1594 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
michael@0 1595 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
michael@0 1596 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
michael@0 1597 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
michael@0 1598 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
michael@0 1599 "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
michael@0 1600 "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
michael@0 1601 "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
michael@0 1602 "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
michael@0 1603 "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
michael@0 1604
michael@0 1605 "vrshr.u16 q0, q0, #1 \n" // 2x average
michael@0 1606 "vrshr.u16 q1, q1, #1 \n"
michael@0 1607 "vrshr.u16 q2, q2, #1 \n"
michael@0 1608
michael@0 1609 "subs %4, %4, #16 \n" // 16 pixels per row (32 over both rows) per loop.
michael@0 1610 RGBTOUV(q0, q1, q2)
michael@0 1611 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
michael@0 1612 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
michael@0 1613 "bgt 1b \n"
michael@0 1614 : "+r"(src_argb), // %0
michael@0 1615 "+r"(src_stride_argb), // %1
michael@0 1616 "+r"(dst_u), // %2
michael@0 1617 "+r"(dst_v), // %3
michael@0 1618 "+r"(pix) // %4
michael@0 1619 :
michael@0 1620 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
michael@0 1621 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 1622 );
michael@0 1623 }
michael@0 1624
// BGRAToUVRow_NEON: produces one row of U and one row of V from two adjacent
// rows of BGRA pixels (second row at src_bgra + src_stride_bgra).
// Each loop pass reads 16 pixels from each row, sums every 2x2 block per
// channel, and stores 8 U bytes to dst_u and 8 V bytes to dst_v.
// After the de-interleaving loads q1/q2/q3 are used as R/G/B; q0 (and q4 for
// the second row) is loaded but takes no part in the sums.
// pix is reduced by 16 per pass and the loop repeats while the result is
// positive, so one pass always runs.
// NOTE(review): presumably callers pass pix as a positive multiple of 16 -
// confirm against the call sites.
michael@0 1625 void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
michael@0 1626 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 1627 asm volatile (
michael@0 1628 "add %1, %0, %1 \n" // %1 becomes a pointer to the second row.
// Coefficients are loaded at half scale: the 2x2 sums below are only
// halved (vrshr #1), so the channel values are 2x the block average.
michael@0 1629 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
michael@0 1630 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
michael@0 1631 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
michael@0 1632 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
michael@0 1633 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
michael@0 1634 "vmov.u16 q15, #0x8080 \n" // 128.5
michael@0 1635 ".p2align 2 \n"
michael@0 1636 "1: \n"
michael@0 1637 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
michael@0 1638 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
michael@0 1639 "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
michael@0 1640 "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
michael@0 1641 "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
michael@0 1642 "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
michael@0 1643 "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
michael@0 1644 "vpadal.u8 q3, q7 \n" // B: add second-row pair sums.
michael@0 1645 "vpadal.u8 q2, q6 \n" // G: add second-row pair sums.
michael@0 1646 "vpadal.u8 q1, q5 \n" // R: add second-row pair sums.
michael@0 1647
michael@0 1648 "vrshr.u16 q1, q1, #1 \n" // 2x average
michael@0 1649 "vrshr.u16 q2, q2, #1 \n"
michael@0 1650 "vrshr.u16 q3, q3, #1 \n"
michael@0 1651
michael@0 1652 "subs %4, %4, #16 \n" // 16 pixels per row (32 total) per loop.
// RGBTOUV is a macro defined elsewhere in this file; it is given the
// registers in B, G, R order and the stores below take U from d0, V from d1.
michael@0 1653 RGBTOUV(q3, q2, q1)
michael@0 1654 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
michael@0 1655 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
michael@0 1656 "bgt 1b \n"
michael@0 1657 : "+r"(src_bgra), // %0
michael@0 1658 "+r"(src_stride_bgra), // %1
michael@0 1659 "+r"(dst_u), // %2
michael@0 1660 "+r"(dst_v), // %3
michael@0 1661 "+r"(pix) // %4
michael@0 1662 :
michael@0 1663 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
michael@0 1664 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 1665 );
michael@0 1666 }
michael@0 1667
// ABGRToUVRow_NEON: produces one row of U and one row of V from two adjacent
// rows of ABGR pixels (second row at src_abgr + src_stride_abgr).
// Each loop pass reads 16 pixels from each row, sums every 2x2 block per
// channel, and stores 8 U bytes to dst_u and 8 V bytes to dst_v.
// After the de-interleaving loads q0/q1/q2 are used as R/G/B; q3 (and q7 for
// the second row) is loaded but takes no part in the sums.
// pix is reduced by 16 per pass and the loop repeats while the result is
// positive, so one pass always runs.
// NOTE(review): presumably callers pass pix as a positive multiple of 16 -
// confirm against the call sites.
michael@0 1668 void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
michael@0 1669 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 1670 asm volatile (
michael@0 1671 "add %1, %0, %1 \n" // %1 becomes a pointer to the second row.
// Coefficients are loaded at half scale: the 2x2 sums below are only
// halved (vrshr #1), so the channel values are 2x the block average.
michael@0 1672 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
michael@0 1673 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
michael@0 1674 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
michael@0 1675 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
michael@0 1676 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
michael@0 1677 "vmov.u16 q15, #0x8080 \n" // 128.5
michael@0 1678 ".p2align 2 \n"
michael@0 1679 "1: \n"
michael@0 1680 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
michael@0 1681 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
michael@0 1682 "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
michael@0 1683 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
michael@0 1684 "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
michael@0 1685 "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
michael@0 1686 "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
michael@0 1687 "vpadal.u8 q2, q6 \n" // B: add second-row pair sums.
michael@0 1688 "vpadal.u8 q1, q5 \n" // G: add second-row pair sums.
michael@0 1689 "vpadal.u8 q0, q4 \n" // R: add second-row pair sums.
michael@0 1690
michael@0 1691 "vrshr.u16 q0, q0, #1 \n" // 2x average
michael@0 1692 "vrshr.u16 q1, q1, #1 \n"
michael@0 1693 "vrshr.u16 q2, q2, #1 \n"
michael@0 1694
michael@0 1695 "subs %4, %4, #16 \n" // 16 pixels per row (32 total) per loop.
// RGBTOUV is a macro defined elsewhere in this file; it is given the
// registers in B, G, R order and the stores below take U from d0, V from d1.
michael@0 1696 RGBTOUV(q2, q1, q0)
michael@0 1697 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
michael@0 1698 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
michael@0 1699 "bgt 1b \n"
michael@0 1700 : "+r"(src_abgr), // %0
michael@0 1701 "+r"(src_stride_abgr), // %1
michael@0 1702 "+r"(dst_u), // %2
michael@0 1703 "+r"(dst_v), // %3
michael@0 1704 "+r"(pix) // %4
michael@0 1705 :
michael@0 1706 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
michael@0 1707 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 1708 );
michael@0 1709 }
michael@0 1710
// RGBAToUVRow_NEON: produces one row of U and one row of V from two adjacent
// rows of RGBA pixels (second row at src_rgba + src_stride_rgba).
// Each loop pass reads 16 pixels from each row, sums every 2x2 block per
// channel, and stores 8 U bytes to dst_u and 8 V bytes to dst_v.
// The loads place B/G/R in q1/q2/q3 (q5/q6/q7 for the second row); the
// pairwise adds write their results one register lower, so B/G/R end up in
// q0/q1/q2. The original q0 and q4 contents take no part in the sums.
// pix is reduced by 16 per pass and the loop repeats while the result is
// positive, so one pass always runs.
// NOTE(review): presumably callers pass pix as a positive multiple of 16 -
// confirm against the call sites.
michael@0 1711 void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
michael@0 1712 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 1713 asm volatile (
michael@0 1714 "add %1, %0, %1 \n" // %1 becomes a pointer to the second row.
// Coefficients are loaded at half scale: the 2x2 sums below are only
// halved (vrshr #1), so the channel values are 2x the block average.
michael@0 1715 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
michael@0 1716 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
michael@0 1717 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
michael@0 1718 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
michael@0 1719 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
michael@0 1720 "vmov.u16 q15, #0x8080 \n" // 128.5
michael@0 1721 ".p2align 2 \n"
michael@0 1722 "1: \n"
michael@0 1723 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
michael@0 1724 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
michael@0 1725 "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts (q1 -> q0).
michael@0 1726 "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts (q2 -> q1).
michael@0 1727 "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts (q3 -> q2).
michael@0 1728 "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
michael@0 1729 "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
michael@0 1730 "vpadal.u8 q0, q5 \n" // B: add second-row pair sums.
michael@0 1731 "vpadal.u8 q1, q6 \n" // G: add second-row pair sums.
michael@0 1732 "vpadal.u8 q2, q7 \n" // R: add second-row pair sums.
michael@0 1733
michael@0 1734 "vrshr.u16 q0, q0, #1 \n" // 2x average
michael@0 1735 "vrshr.u16 q1, q1, #1 \n"
michael@0 1736 "vrshr.u16 q2, q2, #1 \n"
michael@0 1737
michael@0 1738 "subs %4, %4, #16 \n" // 16 pixels per row (32 total) per loop.
// RGBTOUV is a macro defined elsewhere in this file; it is given the
// registers in B, G, R order and the stores below take U from d0, V from d1.
michael@0 1739 RGBTOUV(q0, q1, q2)
michael@0 1740 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
michael@0 1741 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
michael@0 1742 "bgt 1b \n"
michael@0 1743 : "+r"(src_rgba), // %0
michael@0 1744 "+r"(src_stride_rgba), // %1
michael@0 1745 "+r"(dst_u), // %2
michael@0 1746 "+r"(dst_v), // %3
michael@0 1747 "+r"(pix) // %4
michael@0 1748 :
michael@0 1749 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
michael@0 1750 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 1751 );
michael@0 1752 }
michael@0 1753
// RGB24ToUVRow_NEON: produces one row of U and one row of V from two
// adjacent rows of RGB24 pixels (second row at src_rgb24 + src_stride_rgb24).
// Each loop pass reads 16 three-byte pixels from each row, sums every 2x2
// block per channel, and stores 8 U bytes to dst_u and 8 V bytes to dst_v.
// After the de-interleaving loads q0/q1/q2 are used as B/G/R.
// pix is reduced by 16 per pass and the loop repeats while the result is
// positive, so one pass always runs.
// NOTE(review): presumably callers pass pix as a positive multiple of 16 -
// confirm against the call sites.
michael@0 1754 void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
michael@0 1755 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 1756 asm volatile (
michael@0 1757 "add %1, %0, %1 \n" // %1 becomes a pointer to the second row.
// Coefficients are loaded at half scale: the 2x2 sums below are only
// halved (vrshr #1), so the channel values are 2x the block average.
michael@0 1758 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
michael@0 1759 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
michael@0 1760 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
michael@0 1761 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
michael@0 1762 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
michael@0 1763 "vmov.u16 q15, #0x8080 \n" // 128.5
michael@0 1764 ".p2align 2 \n"
michael@0 1765 "1: \n"
michael@0 1766 "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
michael@0 1767 "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
michael@0 1768 "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
michael@0 1769 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
michael@0 1770 "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
michael@0 1771 "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
michael@0 1772 "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
michael@0 1773 "vpadal.u8 q0, q4 \n" // B: add second-row pair sums.
michael@0 1774 "vpadal.u8 q1, q5 \n" // G: add second-row pair sums.
michael@0 1775 "vpadal.u8 q2, q6 \n" // R: add second-row pair sums.
michael@0 1776
michael@0 1777 "vrshr.u16 q0, q0, #1 \n" // 2x average
michael@0 1778 "vrshr.u16 q1, q1, #1 \n"
michael@0 1779 "vrshr.u16 q2, q2, #1 \n"
michael@0 1780
michael@0 1781 "subs %4, %4, #16 \n" // 16 pixels per row (32 total) per loop.
// RGBTOUV is a macro defined elsewhere in this file; it is given the
// registers in B, G, R order and the stores below take U from d0, V from d1.
michael@0 1782 RGBTOUV(q0, q1, q2)
michael@0 1783 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
michael@0 1784 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
michael@0 1785 "bgt 1b \n"
michael@0 1786 : "+r"(src_rgb24), // %0
michael@0 1787 "+r"(src_stride_rgb24), // %1
michael@0 1788 "+r"(dst_u), // %2
michael@0 1789 "+r"(dst_v), // %3
michael@0 1790 "+r"(pix) // %4
michael@0 1791 :
michael@0 1792 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
michael@0 1793 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 1794 );
michael@0 1795 }
michael@0 1796
// RAWToUVRow_NEON: produces one row of U and one row of V from two adjacent
// rows of RAW pixels (second row at src_raw + src_stride_raw).
// Each loop pass reads 16 three-byte pixels from each row, sums every 2x2
// block per channel, and stores 8 U bytes to dst_u and 8 V bytes to dst_v.
// After the de-interleaving loads q0/q1/q2 are used as R/G/B, i.e. the
// reverse channel order of RGB24ToUVRow_NEON.
// pix is reduced by 16 per pass and the loop repeats while the result is
// positive, so one pass always runs.
// NOTE(review): presumably callers pass pix as a positive multiple of 16 -
// confirm against the call sites.
michael@0 1797 void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
michael@0 1798 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 1799 asm volatile (
michael@0 1800 "add %1, %0, %1 \n" // %1 becomes a pointer to the second row.
// Coefficients are loaded at half scale: the 2x2 sums below are only
// halved (vrshr #1), so the channel values are 2x the block average.
michael@0 1801 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
michael@0 1802 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
michael@0 1803 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
michael@0 1804 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
michael@0 1805 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
michael@0 1806 "vmov.u16 q15, #0x8080 \n" // 128.5
michael@0 1807 ".p2align 2 \n"
michael@0 1808 "1: \n"
michael@0 1809 "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
michael@0 1810 "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
michael@0 1811 "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
michael@0 1812 "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
michael@0 1813 "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
michael@0 1814 "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
michael@0 1815 "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
michael@0 1816 "vpadal.u8 q2, q6 \n" // B: add second-row pair sums.
michael@0 1817 "vpadal.u8 q1, q5 \n" // G: add second-row pair sums.
michael@0 1818 "vpadal.u8 q0, q4 \n" // R: add second-row pair sums.
michael@0 1819
michael@0 1820 "vrshr.u16 q0, q0, #1 \n" // 2x average
michael@0 1821 "vrshr.u16 q1, q1, #1 \n"
michael@0 1822 "vrshr.u16 q2, q2, #1 \n"
michael@0 1823
michael@0 1824 "subs %4, %4, #16 \n" // 16 pixels per row (32 total) per loop.
// RGBTOUV is a macro defined elsewhere in this file; it is given the
// registers in B, G, R order and the stores below take U from d0, V from d1.
michael@0 1825 RGBTOUV(q2, q1, q0)
michael@0 1826 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
michael@0 1827 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
michael@0 1828 "bgt 1b \n"
michael@0 1829 : "+r"(src_raw), // %0
michael@0 1830 "+r"(src_stride_raw), // %1
michael@0 1831 "+r"(dst_u), // %2
michael@0 1832 "+r"(dst_v), // %3
michael@0 1833 "+r"(pix) // %4
michael@0 1834 :
michael@0 1835 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
michael@0 1836 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 1837 );
michael@0 1838 }
michael@0 1839
michael@0 1840 // 16x2 pixels -> 8x1. pix is number of RGB565 pixels. e.g. 16.
// Produces one row of U and one row of V from two adjacent rows of RGB565
// pixels (second row at src_rgb565 + src_stride_rgb565). Each 16-byte load
// is expanded with RGB565TOARGB (a macro defined elsewhere in this file);
// the code after each expansion reads B/G/R bytes from d0/d1/d2.
// Pair sums for the first 8 pixels go to d8/d10/d12 and for the next 8 to
// d9/d11/d13, so q4/q5/q6 end up holding 8 B/G/R sums each.
// pix is reduced by 16 per pass and the loop repeats while the result is
// positive, so one pass always runs.
// NOTE(review): presumably callers pass pix as a positive multiple of 16 -
// confirm against the call sites.
michael@0 1841 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
michael@0 1842 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 1843 asm volatile (
michael@0 1844 "add %1, %0, %1 \n" // src_stride + src_rgb565
// Coefficients are loaded at half scale: the 2x2 sums below are only
// halved (vrshr #1), so the channel values are 2x the block average.
michael@0 1845 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
michael@0 1846 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
michael@0 1847 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
michael@0 1848 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
michael@0 1849 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
michael@0 1850 "vmov.u16 q15, #0x8080 \n" // 128.5
michael@0 1851 ".p2align 2 \n"
michael@0 1852 "1: \n"
michael@0 1853 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
michael@0 1854 RGB565TOARGB
michael@0 1855 "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
michael@0 1856 "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
michael@0 1857 "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
michael@0 1858 "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
michael@0 1859 RGB565TOARGB
michael@0 1860 "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
michael@0 1861 "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
michael@0 1862 "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
michael@0 1863
// Second row: accumulate its pair sums onto the first-row sums.
michael@0 1864 "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
michael@0 1865 RGB565TOARGB
michael@0 1866 "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
michael@0 1867 "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
michael@0 1868 "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
michael@0 1869 "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
michael@0 1870 RGB565TOARGB
michael@0 1871 "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
michael@0 1872 "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
michael@0 1873 "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
michael@0 1874
michael@0 1875 "vrshr.u16 q4, q4, #1 \n" // 2x average
michael@0 1876 "vrshr.u16 q5, q5, #1 \n"
michael@0 1877 "vrshr.u16 q6, q6, #1 \n"
michael@0 1878
michael@0 1879 "subs %4, %4, #16 \n" // 16 processed per loop.
// U = (B * q10 - G * q11 - R * q12 + 0x8080) >> 8
michael@0 1880 "vmul.s16 q8, q4, q10 \n" // B
michael@0 1881 "vmls.s16 q8, q5, q11 \n" // G
michael@0 1882 "vmls.s16 q8, q6, q12 \n" // R
michael@0 1883 "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
// V = (R * q10 - G * q14 - B * q13 + 0x8080) >> 8
michael@0 1884 "vmul.s16 q9, q6, q10 \n" // R
michael@0 1885 "vmls.s16 q9, q5, q14 \n" // G
michael@0 1886 "vmls.s16 q9, q4, q13 \n" // B
michael@0 1887 "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
michael@0 1888 "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
michael@0 1889 "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
michael@0 1890 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
michael@0 1891 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
michael@0 1892 "bgt 1b \n"
michael@0 1893 : "+r"(src_rgb565), // %0
michael@0 1894 "+r"(src_stride_rgb565), // %1
michael@0 1895 "+r"(dst_u), // %2
michael@0 1896 "+r"(dst_v), // %3
michael@0 1897 "+r"(pix) // %4
michael@0 1898 :
michael@0 1899 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
michael@0 1900 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 1901 );
michael@0 1902 }
michael@0 1903
michael@0 1904 // 16x2 pixels -> 8x1. pix is number of ARGB1555 pixels. e.g. 16.
// Produces one row of U and one row of V from two adjacent rows of ARGB1555
// pixels (second row at src_argb1555 + src_stride_argb1555). Each 16-byte
// load is expanded with RGB555TOARGB (a macro defined elsewhere in this
// file); the code after each expansion reads B/G/R bytes from d0/d1/d2.
// Pair sums for the first 8 pixels go to d8/d10/d12 and for the next 8 to
// d9/d11/d13, so q4/q5/q6 end up holding 8 B/G/R sums each.
// pix is reduced by 16 per pass and the loop repeats while the result is
// positive, so one pass always runs.
// NOTE(review): presumably callers pass pix as a positive multiple of 16 -
// confirm against the call sites.
michael@0 1905 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
michael@0 1906 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 1907 asm volatile (
michael@0 1908 "add %1, %0, %1 \n" // src_stride + src_argb1555
// Coefficients are loaded at half scale: the 2x2 sums below are only
// halved (vrshr #1), so the channel values are 2x the block average.
michael@0 1909 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
michael@0 1910 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
michael@0 1911 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
michael@0 1912 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
michael@0 1913 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
michael@0 1914 "vmov.u16 q15, #0x8080 \n" // 128.5
michael@0 1915 ".p2align 2 \n"
michael@0 1916 "1: \n"
michael@0 1917 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
michael@0 1918 RGB555TOARGB
michael@0 1919 "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
michael@0 1920 "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
michael@0 1921 "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
michael@0 1922 "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
michael@0 1923 RGB555TOARGB
michael@0 1924 "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
michael@0 1925 "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
michael@0 1926 "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
michael@0 1927
// Second row: accumulate its pair sums onto the first-row sums.
michael@0 1928 "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
michael@0 1929 RGB555TOARGB
michael@0 1930 "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
michael@0 1931 "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
michael@0 1932 "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
michael@0 1933 "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
michael@0 1934 RGB555TOARGB
michael@0 1935 "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
michael@0 1936 "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
michael@0 1937 "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
michael@0 1938
michael@0 1939 "vrshr.u16 q4, q4, #1 \n" // 2x average
michael@0 1940 "vrshr.u16 q5, q5, #1 \n"
michael@0 1941 "vrshr.u16 q6, q6, #1 \n"
michael@0 1942
michael@0 1943 "subs %4, %4, #16 \n" // 16 processed per loop.
// U = (B * q10 - G * q11 - R * q12 + 0x8080) >> 8
michael@0 1944 "vmul.s16 q8, q4, q10 \n" // B
michael@0 1945 "vmls.s16 q8, q5, q11 \n" // G
michael@0 1946 "vmls.s16 q8, q6, q12 \n" // R
michael@0 1947 "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
// V = (R * q10 - G * q14 - B * q13 + 0x8080) >> 8
michael@0 1948 "vmul.s16 q9, q6, q10 \n" // R
michael@0 1949 "vmls.s16 q9, q5, q14 \n" // G
michael@0 1950 "vmls.s16 q9, q4, q13 \n" // B
michael@0 1951 "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
michael@0 1952 "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
michael@0 1953 "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
michael@0 1954 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
michael@0 1955 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
michael@0 1956 "bgt 1b \n"
michael@0 1957 : "+r"(src_argb1555), // %0
michael@0 1958 "+r"(src_stride_argb1555), // %1
michael@0 1959 "+r"(dst_u), // %2
michael@0 1960 "+r"(dst_v), // %3
michael@0 1961 "+r"(pix) // %4
michael@0 1962 :
michael@0 1963 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
michael@0 1964 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 1965 );
michael@0 1966 }
michael@0 1967
michael@0 1968 // 16x2 pixels -> 8x1. pix is number of ARGB4444 pixels. e.g. 16.
// Produces one row of U and one row of V from two adjacent rows of ARGB4444
// pixels (second row at src_argb4444 + src_stride_argb4444). Each 16-byte
// load is expanded with ARGB4444TOARGB (a macro defined elsewhere in this
// file); the code after each expansion reads B/G/R bytes from d0/d1/d2.
// Pair sums for the first 8 pixels go to d8/d10/d12 and for the next 8 to
// d9/d11/d13, so q4/q5/q6 end up holding 8 B/G/R sums each.
// pix is reduced by 16 per pass and the loop repeats while the result is
// positive, so one pass always runs.
// NOTE(review): presumably callers pass pix as a positive multiple of 16 -
// confirm against the call sites.
michael@0 1969 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
michael@0 1970 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 1971 asm volatile (
michael@0 1972 "add %1, %0, %1 \n" // src_stride + src_argb4444
// Coefficients are loaded at half scale: the 2x2 sums below are only
// halved (vrshr #1), so the channel values are 2x the block average.
michael@0 1973 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
michael@0 1974 "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
michael@0 1975 "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
michael@0 1976 "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
michael@0 1977 "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
michael@0 1978 "vmov.u16 q15, #0x8080 \n" // 128.5
michael@0 1979 ".p2align 2 \n"
michael@0 1980 "1: \n"
michael@0 1981 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
michael@0 1982 ARGB4444TOARGB
michael@0 1983 "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
michael@0 1984 "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
michael@0 1985 "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
michael@0 1986 "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
michael@0 1987 ARGB4444TOARGB
michael@0 1988 "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
michael@0 1989 "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
michael@0 1990 "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
michael@0 1991
// Second row: accumulate its pair sums onto the first-row sums.
michael@0 1992 "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
michael@0 1993 ARGB4444TOARGB
michael@0 1994 "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
michael@0 1995 "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
michael@0 1996 "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
michael@0 1997 "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
michael@0 1998 ARGB4444TOARGB
michael@0 1999 "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
michael@0 2000 "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
michael@0 2001 "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
michael@0 2002
michael@0 2003 "vrshr.u16 q4, q4, #1 \n" // 2x average
michael@0 2004 "vrshr.u16 q5, q5, #1 \n"
michael@0 2005 "vrshr.u16 q6, q6, #1 \n"
michael@0 2006
michael@0 2007 "subs %4, %4, #16 \n" // 16 processed per loop.
// U = (B * q10 - G * q11 - R * q12 + 0x8080) >> 8
michael@0 2008 "vmul.s16 q8, q4, q10 \n" // B
michael@0 2009 "vmls.s16 q8, q5, q11 \n" // G
michael@0 2010 "vmls.s16 q8, q6, q12 \n" // R
michael@0 2011 "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
// V = (R * q10 - G * q14 - B * q13 + 0x8080) >> 8
michael@0 2012 "vmul.s16 q9, q6, q10 \n" // R
michael@0 2013 "vmls.s16 q9, q5, q14 \n" // G
michael@0 2014 "vmls.s16 q9, q4, q13 \n" // B
michael@0 2015 "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
michael@0 2016 "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
michael@0 2017 "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
michael@0 2018 "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
michael@0 2019 "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
michael@0 2020 "bgt 1b \n"
michael@0 2021 : "+r"(src_argb4444), // %0
michael@0 2022 "+r"(src_stride_argb4444), // %1
michael@0 2023 "+r"(dst_u), // %2
michael@0 2024 "+r"(dst_v), // %3
michael@0 2025 "+r"(pix) // %4
michael@0 2026 :
michael@0 2027 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
michael@0 2028 "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 2029 );
michael@0 2030 }
michael@0 2031
// RGB565ToYRow_NEON: converts a row of RGB565 pixels to 8-bit Y values.
// Each pass loads 16 bytes (8 pixels), expands them with RGB565TOARGB (a
// macro defined elsewhere in this file; the code after it reads B/G/R bytes
// from d0/d1/d2), and stores 8 Y bytes computed as
//   Y = sat8(sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16).
// pix is reduced by 8 per pass and the loop repeats while the result is
// positive, so one pass always runs.
// NOTE(review): presumably callers pass pix as a positive multiple of 8 -
// confirm against the call sites.
michael@0 2032 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
michael@0 2033 asm volatile (
michael@0 2034 "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
michael@0 2035 "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
michael@0 2036 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
michael@0 2037 "vmov.u8 d27, #16 \n" // Add 16 constant
michael@0 2038 ".p2align 2 \n"
michael@0 2039 "1: \n"
michael@0 2040 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
michael@0 2041 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 2042 RGB565TOARGB
michael@0 2043 "vmull.u8 q2, d0, d24 \n" // B
michael@0 2044 "vmlal.u8 q2, d1, d25 \n" // G
michael@0 2045 "vmlal.u8 q2, d2, d26 \n" // R
michael@0 2046 "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
michael@0 2047 "vqadd.u8 d0, d27 \n" // saturating +16 offset
michael@0 2048 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
michael@0 2049 "bgt 1b \n"
michael@0 2050 : "+r"(src_rgb565), // %0
michael@0 2051 "+r"(dst_y), // %1
michael@0 2052 "+r"(pix) // %2
michael@0 2053 :
michael@0 2054 : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
michael@0 2055 );
michael@0 2056 }
michael@0 2057
// ARGB1555ToYRow_NEON: converts a row of ARGB1555 pixels to 8-bit Y values.
// Each pass loads 16 bytes (8 pixels), expands them with ARGB1555TOARGB (a
// macro defined elsewhere in this file; the code after it reads B/G/R bytes
// from d0/d1/d2), and stores 8 Y bytes computed as
//   Y = sat8(sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16).
// pix is reduced by 8 per pass and the loop repeats while the result is
// positive, so one pass always runs.
// NOTE(review): presumably callers pass pix as a positive multiple of 8 -
// confirm against the call sites.
michael@0 2058 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
michael@0 2059 asm volatile (
michael@0 2060 "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
michael@0 2061 "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
michael@0 2062 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
michael@0 2063 "vmov.u8 d27, #16 \n" // Add 16 constant
michael@0 2064 ".p2align 2 \n"
michael@0 2065 "1: \n"
michael@0 2066 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
michael@0 2067 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 2068 ARGB1555TOARGB
michael@0 2069 "vmull.u8 q2, d0, d24 \n" // B
michael@0 2070 "vmlal.u8 q2, d1, d25 \n" // G
michael@0 2071 "vmlal.u8 q2, d2, d26 \n" // R
michael@0 2072 "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
michael@0 2073 "vqadd.u8 d0, d27 \n" // saturating +16 offset
michael@0 2074 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
michael@0 2075 "bgt 1b \n"
michael@0 2076 : "+r"(src_argb1555), // %0
michael@0 2077 "+r"(dst_y), // %1
michael@0 2078 "+r"(pix) // %2
michael@0 2079 :
michael@0 2080 : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
michael@0 2081 );
michael@0 2082 }
michael@0 2083
// ARGB4444ToYRow_NEON: converts a row of ARGB4444 pixels to 8-bit Y values.
// Each pass loads 16 bytes (8 pixels), expands them with ARGB4444TOARGB (a
// macro defined elsewhere in this file; the code after it reads B/G/R bytes
// from d0/d1/d2), and stores 8 Y bytes computed as
//   Y = sat8(sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16).
// pix is reduced by 8 per pass and the loop repeats while the result is
// positive, so one pass always runs.
// NOTE(review): presumably callers pass pix as a positive multiple of 8 -
// confirm against the call sites.
michael@0 2084 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
michael@0 2085 asm volatile (
michael@0 2086 "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
michael@0 2087 "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
michael@0 2088 "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
michael@0 2089 "vmov.u8 d27, #16 \n" // Add 16 constant
michael@0 2090 ".p2align 2 \n"
michael@0 2091 "1: \n"
michael@0 2092 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
michael@0 2093 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 2094 ARGB4444TOARGB
michael@0 2095 "vmull.u8 q2, d0, d24 \n" // B
michael@0 2096 "vmlal.u8 q2, d1, d25 \n" // G
michael@0 2097 "vmlal.u8 q2, d2, d26 \n" // R
michael@0 2098 "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
michael@0 2099 "vqadd.u8 d0, d27 \n" // saturating +16 offset
michael@0 2100 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
michael@0 2101 "bgt 1b \n"
michael@0 2102 : "+r"(src_argb4444), // %0
michael@0 2103 "+r"(dst_y), // %1
michael@0 2104 "+r"(pix) // %2
michael@0 2105 :
michael@0 2106 : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
michael@0 2107 );
michael@0 2108 }
michael@0 2109
// BGRAToYRow_NEON: converts a row of BGRA pixels to 8-bit Y values.
// Each pass de-interleaves 8 four-byte pixels into d0..d3, uses d1/d2/d3 as
// R/G/B (d0 is loaded but not used as an input), and stores 8 Y bytes
// computed as
//   Y = sat8(sat8((33 * R + 65 * G + 13 * B + 64) >> 7) + 16).
// pix is reduced by 8 per pass and the loop repeats while the result is
// positive, so one pass always runs.
// NOTE(review): presumably callers pass pix as a positive multiple of 8 -
// confirm against the call sites.
michael@0 2110 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
michael@0 2111 asm volatile (
michael@0 2112 "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
michael@0 2113 "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
michael@0 2114 "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
michael@0 2115 "vmov.u8 d7, #16 \n" // Add 16 constant
michael@0 2116 ".p2align 2 \n"
michael@0 2117 "1: \n"
michael@0 2118 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
michael@0 2119 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 2120 "vmull.u8 q8, d1, d4 \n" // R
michael@0 2121 "vmlal.u8 q8, d2, d5 \n" // G
michael@0 2122 "vmlal.u8 q8, d3, d6 \n" // B
michael@0 2123 "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
michael@0 2124 "vqadd.u8 d0, d7 \n" // saturating +16 offset
michael@0 2125 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
michael@0 2126 "bgt 1b \n"
michael@0 2127 : "+r"(src_bgra), // %0
michael@0 2128 "+r"(dst_y), // %1
michael@0 2129 "+r"(pix) // %2
michael@0 2130 :
michael@0 2131 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
michael@0 2132 );
michael@0 2133 }
michael@0 2134
// ABGRToYRow_NEON: converts a row of ABGR pixels to 8-bit Y values.
// Each pass de-interleaves 8 four-byte pixels into d0..d3, uses d0/d1/d2 as
// R/G/B (d3 is loaded but not used), and stores 8 Y bytes computed as
//   Y = sat8(sat8((33 * R + 65 * G + 13 * B + 64) >> 7) + 16).
// pix is reduced by 8 per pass and the loop repeats while the result is
// positive, so one pass always runs.
// NOTE(review): presumably callers pass pix as a positive multiple of 8 -
// confirm against the call sites.
michael@0 2135 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
michael@0 2136 asm volatile (
michael@0 2137 "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
michael@0 2138 "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
michael@0 2139 "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
michael@0 2140 "vmov.u8 d7, #16 \n" // Add 16 constant
michael@0 2141 ".p2align 2 \n"
michael@0 2142 "1: \n"
michael@0 2143 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
michael@0 2144 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 2145 "vmull.u8 q8, d0, d4 \n" // R
michael@0 2146 "vmlal.u8 q8, d1, d5 \n" // G
michael@0 2147 "vmlal.u8 q8, d2, d6 \n" // B
michael@0 2148 "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
michael@0 2149 "vqadd.u8 d0, d7 \n" // saturating +16 offset
michael@0 2150 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
michael@0 2151 "bgt 1b \n"
michael@0 2152 : "+r"(src_abgr), // %0
michael@0 2153 "+r"(dst_y), // %1
michael@0 2154 "+r"(pix) // %2
michael@0 2155 :
michael@0 2156 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
michael@0 2157 );
michael@0 2158 }
michael@0 2159
// RGBAToYRow_NEON: converts a row of RGBA pixels to 8-bit Y values.
// Each pass de-interleaves 8 four-byte pixels into d0..d3, uses d1/d2/d3 as
// B/G/R (d0 is loaded but not used as an input), and stores 8 Y bytes
// computed as
//   Y = sat8(sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16).
// pix is reduced by 8 per pass and the loop repeats while the result is
// positive, so one pass always runs.
// NOTE(review): presumably callers pass pix as a positive multiple of 8 -
// confirm against the call sites.
michael@0 2160 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
michael@0 2161 asm volatile (
michael@0 2162 "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
michael@0 2163 "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
michael@0 2164 "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
michael@0 2165 "vmov.u8 d7, #16 \n" // Add 16 constant
michael@0 2166 ".p2align 2 \n"
michael@0 2167 "1: \n"
michael@0 2168 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
michael@0 2169 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 2170 "vmull.u8 q8, d1, d4 \n" // B
michael@0 2171 "vmlal.u8 q8, d2, d5 \n" // G
michael@0 2172 "vmlal.u8 q8, d3, d6 \n" // R
michael@0 2173 "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
michael@0 2174 "vqadd.u8 d0, d7 \n" // saturating +16 offset
michael@0 2175 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
michael@0 2176 "bgt 1b \n"
michael@0 2177 : "+r"(src_rgba), // %0
michael@0 2178 "+r"(dst_y), // %1
michael@0 2179 "+r"(pix) // %2
michael@0 2180 :
michael@0 2181 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
michael@0 2182 );
michael@0 2183 }
michael@0 2184
// RGB24ToYRow_NEON: converts a row of RGB24 pixels to 8-bit Y values.
// Each pass de-interleaves 8 three-byte pixels into d0/d1/d2, used as B/G/R,
// and stores 8 Y bytes computed as
//   Y = sat8(sat8((13 * B + 65 * G + 33 * R + 64) >> 7) + 16).
// pix is reduced by 8 per pass and the loop repeats while the result is
// positive, so one pass always runs.
// NOTE(review): presumably callers pass pix as a positive multiple of 8 -
// confirm against the call sites.
michael@0 2185 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
michael@0 2186 asm volatile (
michael@0 2187 "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
michael@0 2188 "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
michael@0 2189 "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
michael@0 2190 "vmov.u8 d7, #16 \n" // Add 16 constant
michael@0 2191 ".p2align 2 \n"
michael@0 2192 "1: \n"
michael@0 2193 "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
michael@0 2194 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 2195 "vmull.u8 q8, d0, d4 \n" // B
michael@0 2196 "vmlal.u8 q8, d1, d5 \n" // G
michael@0 2197 "vmlal.u8 q8, d2, d6 \n" // R
michael@0 2198 "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
michael@0 2199 "vqadd.u8 d0, d7 \n" // saturating +16 offset
michael@0 2200 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
michael@0 2201 "bgt 1b \n"
michael@0 2202 : "+r"(src_rgb24), // %0
michael@0 2203 "+r"(dst_y), // %1
michael@0 2204 "+r"(pix) // %2
michael@0 2205 :
michael@0 2206 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
michael@0 2207 );
michael@0 2208 }
michael@0 2209
// RAWToYRow_NEON: converts a row of RAW pixels to 8-bit Y values.
// Each pass de-interleaves 8 three-byte pixels into d0/d1/d2. d0 is
// multiplied by the R coefficient (33) and d2 by the B coefficient (13),
// i.e. the channels are taken as R/G/B - the reverse of RGB24ToYRow_NEON.
// Stores 8 Y bytes computed as
//   Y = sat8(sat8((33 * R + 65 * G + 13 * B + 64) >> 7) + 16).
// pix is reduced by 8 per pass and the loop repeats while the result is
// positive, so one pass always runs.
// NOTE(review): presumably callers pass pix as a positive multiple of 8 -
// confirm against the call sites.
michael@0 2210 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
michael@0 2211 asm volatile (
michael@0 2212 "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
michael@0 2213 "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
michael@0 2214 "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
michael@0 2215 "vmov.u8 d7, #16 \n" // Add 16 constant
michael@0 2216 ".p2align 2 \n"
michael@0 2217 "1: \n"
michael@0 2218 "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
michael@0 2219 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 2220 "vmull.u8 q8, d0, d4 \n" // R
michael@0 2221 "vmlal.u8 q8, d1, d5 \n" // G
michael@0 2222 "vmlal.u8 q8, d2, d6 \n" // B
michael@0 2223 "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
michael@0 2224 "vqadd.u8 d0, d7 \n" // saturating +16 offset
michael@0 2225 "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
michael@0 2226 "bgt 1b \n"
michael@0 2227 : "+r"(src_raw), // %0
michael@0 2228 "+r"(dst_y), // %1
michael@0 2229 "+r"(pix) // %2
michael@0 2230 :
michael@0 2231 : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
michael@0 2232 );
michael@0 2233 }
michael@0 2234
michael@0 2235 // Bilinear filter 16x2 -> 16x1
// Blends the row at src_ptr with the row at src_ptr + src_stride and writes
// the result to dst_ptr, 16 bytes per loop pass. source_y_fraction (f) is
// the weight of the second row out of 256 and selects one of five paths:
//   f == 0   : copy the first row unchanged (second row is never addressed)
//   f == 64  : ~75% first / 25% second, via two rounding halving adds
//   f == 128 : 50 / 50, via one rounding halving add
//   f == 192 : ~25% first / 75% second, via two rounding halving adds
//   other    : (first * (256 - f) + second * f + 128) >> 8
// dst_width is reduced by 16 per pass and each loop repeats while the result
// is positive, so one pass always runs.
// NOTE(review): vdup.8 keeps only the low byte of f and of 256 - f, so the
// general path presumably expects f in 1..255 - confirm against callers.
michael@0 2236 void InterpolateRow_NEON(uint8* dst_ptr,
michael@0 2237 const uint8* src_ptr, ptrdiff_t src_stride,
michael@0 2238 int dst_width, int source_y_fraction) {
michael@0 2239 asm volatile (
michael@0 2240 "cmp %4, #0 \n"
michael@0 2241 "beq 100f \n"
// %2 becomes a pointer to the second row (src_ptr + src_stride).
michael@0 2242 "add %2, %1 \n"
michael@0 2243 "cmp %4, #64 \n"
michael@0 2244 "beq 75f \n"
michael@0 2245 "cmp %4, #128 \n"
michael@0 2246 "beq 50f \n"
michael@0 2247 "cmp %4, #192 \n"
michael@0 2248 "beq 25f \n"
michael@0 2249
// d5 = f (second-row weight), d4 = 256 - f (first-row weight).
michael@0 2250 "vdup.8 d5, %4 \n"
michael@0 2251 "rsb %4, #256 \n"
michael@0 2252 "vdup.8 d4, %4 \n"
michael@0 2253 // General purpose row blend.
michael@0 2254 "1: \n"
michael@0 2255 "vld1.8 {q0}, [%1]! \n"
michael@0 2256 "vld1.8 {q1}, [%2]! \n"
michael@0 2257 "subs %3, %3, #16 \n"
michael@0 2258 "vmull.u8 q13, d0, d4 \n"
michael@0 2259 "vmull.u8 q14, d1, d4 \n"
michael@0 2260 "vmlal.u8 q13, d2, d5 \n"
michael@0 2261 "vmlal.u8 q14, d3, d5 \n"
michael@0 2262 "vrshrn.u16 d0, q13, #8 \n"
michael@0 2263 "vrshrn.u16 d1, q14, #8 \n"
michael@0 2264 "vst1.8 {q0}, [%0]! \n"
michael@0 2265 "bgt 1b \n"
michael@0 2266 "b 99f \n"
michael@0 2267
michael@0 2268 // Blend 25 / 75.
// q0 = first row, q1 = second row; averaging with q1 twice weights the
// second row more heavily.
michael@0 2269 "25: \n"
michael@0 2270 "vld1.8 {q0}, [%1]! \n"
michael@0 2271 "vld1.8 {q1}, [%2]! \n"
michael@0 2272 "subs %3, %3, #16 \n"
michael@0 2273 "vrhadd.u8 q0, q1 \n"
michael@0 2274 "vrhadd.u8 q0, q1 \n"
michael@0 2275 "vst1.8 {q0}, [%0]! \n"
michael@0 2276 "bgt 25b \n"
michael@0 2277 "b 99f \n"
michael@0 2278
michael@0 2279 // Blend 50 / 50.
michael@0 2280 "50: \n"
michael@0 2281 "vld1.8 {q0}, [%1]! \n"
michael@0 2282 "vld1.8 {q1}, [%2]! \n"
michael@0 2283 "subs %3, %3, #16 \n"
michael@0 2284 "vrhadd.u8 q0, q1 \n"
michael@0 2285 "vst1.8 {q0}, [%0]! \n"
michael@0 2286 "bgt 50b \n"
michael@0 2287 "b 99f \n"
michael@0 2288
michael@0 2289 // Blend 75 / 25.
// Registers are swapped relative to the 25 / 75 path: q1 = first row,
// q0 = second row, so the repeated average favours the first row.
michael@0 2290 "75: \n"
michael@0 2291 "vld1.8 {q1}, [%1]! \n"
michael@0 2292 "vld1.8 {q0}, [%2]! \n"
michael@0 2293 "subs %3, %3, #16 \n"
michael@0 2294 "vrhadd.u8 q0, q1 \n"
michael@0 2295 "vrhadd.u8 q0, q1 \n"
michael@0 2296 "vst1.8 {q0}, [%0]! \n"
michael@0 2297 "bgt 75b \n"
michael@0 2298 "b 99f \n"
michael@0 2299
michael@0 2300 // Blend 100 / 0 - Copy row unchanged.
michael@0 2301 "100: \n"
michael@0 2302 "vld1.8 {q0}, [%1]! \n"
michael@0 2303 "subs %3, %3, #16 \n"
michael@0 2304 "vst1.8 {q0}, [%0]! \n"
michael@0 2305 "bgt 100b \n"
michael@0 2306
// Common exit for all blend paths.
michael@0 2307 "99: \n"
michael@0 2308 : "+r"(dst_ptr), // %0
michael@0 2309 "+r"(src_ptr), // %1
michael@0 2310 "+r"(src_stride), // %2
michael@0 2311 "+r"(dst_width), // %3
michael@0 2312 "+r"(source_y_fraction) // %4
michael@0 2313 :
michael@0 2314 : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
michael@0 2315 );
michael@0 2316 }
michael@0 2317
michael@0 2318 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr. Blends src_argb0 (whose alpha is used) over src_argb1 into dst_argb; output alpha is forced to 255. Handles 8 pixels per pass while at least 8 remain, then finishes one pixel at a time.
michael@0 2319 void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
michael@0 2320 uint8* dst_argb, int width) {
michael@0 2321 asm volatile (
michael@0 2322 "subs %3, #8 \n" // fewer than 8 pixels: go straight to the 1 pixel loop.
michael@0 2323 "blt 89f \n"
michael@0 2324 // Blend 8 pixels.
michael@0 2325 "8: \n"
michael@0 2326 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
michael@0 2327 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
michael@0 2328 "subs %3, %3, #8 \n" // 8 processed per loop.
michael@0 2329 "vmull.u8 q10, d4, d3 \n" // db * a
michael@0 2330 "vmull.u8 q11, d5, d3 \n" // dg * a
michael@0 2331 "vmull.u8 q12, d6, d3 \n" // dr * a
michael@0 2332 "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
michael@0 2333 "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
michael@0 2334 "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
michael@0 2335 "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
michael@0 2336 "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
michael@0 2337 "vqadd.u8 q0, q0, q2 \n" // + sbg
michael@0 2338 "vqadd.u8 d2, d2, d6 \n" // + sr
michael@0 2339 "vmov.u8 d3, #255 \n" // a = 255
michael@0 2340 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
michael@0 2341 "bge 8b \n" // continue while another full group of 8 remains.
michael@0 2342
michael@0 2343 "89: \n"
michael@0 2344 "adds %3, #8-1 \n" // undo the -8 bias and pre-subtract 1 for the tail loop.
michael@0 2345 "blt 99f \n" // no leftover pixels.
michael@0 2346
michael@0 2347 // Blend 1 pixels (only lane 0 of each register is loaded and stored).
michael@0 2348 "1: \n"
michael@0 2349 "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
michael@0 2350 "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
michael@0 2351 "subs %3, %3, #1 \n" // 1 processed per loop.
michael@0 2352 "vmull.u8 q10, d4, d3 \n" // db * a
michael@0 2353 "vmull.u8 q11, d5, d3 \n" // dg * a
michael@0 2354 "vmull.u8 q12, d6, d3 \n" // dr * a
michael@0 2355 "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
michael@0 2356 "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
michael@0 2357 "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
michael@0 2358 "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
michael@0 2359 "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
michael@0 2360 "vqadd.u8 q0, q0, q2 \n" // + sbg
michael@0 2361 "vqadd.u8 d2, d2, d6 \n" // + sr
michael@0 2362 "vmov.u8 d3, #255 \n" // a = 255
michael@0 2363 "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
michael@0 2364 "bge 1b \n"
michael@0 2365
michael@0 2366 "99: \n" // done
michael@0 2367
michael@0 2368 : "+r"(src_argb0), // %0
michael@0 2369 "+r"(src_argb1), // %1
michael@0 2370 "+r"(dst_argb), // %2
michael@0 2371 "+r"(width) // %3
michael@0 2372 :
michael@0 2373 : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
michael@0 2374 );
michael@0 2375 }
michael@0 2376
michael@0 2377 // Attenuate 8 pixels at a time: B, G and R are each multiplied by the pixel's alpha and shifted right by 8 with rounding; alpha itself is stored unchanged. Assumes width is a positive multiple of 8 - TODO confirm with callers.
michael@0 2378 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
michael@0 2379 asm volatile (
michael@0 2380 // Attenuate 8 pixels.
michael@0 2381 "1: \n"
michael@0 2382 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
michael@0 2383 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 2384 "vmull.u8 q10, d0, d3 \n" // b * a
michael@0 2385 "vmull.u8 q11, d1, d3 \n" // g * a
michael@0 2386 "vmull.u8 q12, d2, d3 \n" // r * a
michael@0 2387 "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
michael@0 2388 "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
michael@0 2389 "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
michael@0 2390 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
michael@0 2391 "bgt 1b \n" // repeat while the remaining count is > 0.
michael@0 2392 : "+r"(src_argb), // %0
michael@0 2393 "+r"(dst_argb), // %1
michael@0 2394 "+r"(width) // %2
michael@0 2395 :
michael@0 2396 : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
michael@0 2397 );
michael@0 2398 }
michael@0 2399
michael@0 2400 // Quantize 8 ARGB pixels (32 bytes) per pass, in place in dst_argb. Only B, G and R are changed; alpha is written back as loaded.
michael@0 2401 // dst = (dst * scale >> 16) * interval_size + interval_offset;
michael@0 2402 void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
michael@0 2403 int interval_offset, int width) {
michael@0 2404 asm volatile (
michael@0 2405 "vdup.u16 q8, %2 \n" // scale in every 16 bit lane
michael@0 2406 "vshr.u16 q8, q8, #1 \n" // scale >>= 1 (vqdmulh below doubles the product)
michael@0 2407 "vdup.u16 q9, %3 \n" // interval multiply.
michael@0 2408 "vdup.u16 q10, %4 \n" // interval add
michael@0 2409
michael@0 2410 // 8 pixel loop.
michael@0 2411 ".p2align 2 \n"
michael@0 2412 "1: \n"
michael@0 2413 "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB (no pointer update; the store below advances it).
michael@0 2414 "subs %1, %1, #8 \n" // 8 processed per loop.
michael@0 2415 "vmovl.u8 q0, d0 \n" // b (0 .. 255)
michael@0 2416 "vmovl.u8 q1, d2 \n" // g
michael@0 2417 "vmovl.u8 q2, d4 \n" // r
michael@0 2418 "vqdmulh.s16 q0, q0, q8 \n" // b * scale
michael@0 2419 "vqdmulh.s16 q1, q1, q8 \n" // g
michael@0 2420 "vqdmulh.s16 q2, q2, q8 \n" // r
michael@0 2421 "vmul.u16 q0, q0, q9 \n" // b * interval_size
michael@0 2422 "vmul.u16 q1, q1, q9 \n" // g
michael@0 2423 "vmul.u16 q2, q2, q9 \n" // r
michael@0 2424 "vadd.u16 q0, q0, q10 \n" // b + interval_offset
michael@0 2425 "vadd.u16 q1, q1, q10 \n" // g
michael@0 2426 "vadd.u16 q2, q2, q10 \n" // r
michael@0 2427 "vqmovn.u16 d0, q0 \n" // saturate back to 8 bit b
michael@0 2428 "vqmovn.u16 d2, q1 \n" // g
michael@0 2429 "vqmovn.u16 d4, q2 \n" // r
michael@0 2430 "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
michael@0 2431 "bgt 1b \n"
michael@0 2432 : "+r"(dst_argb), // %0
michael@0 2433 "+r"(width) // %1
michael@0 2434 : "r"(scale), // %2
michael@0 2435 "r"(interval_size), // %3
michael@0 2436 "r"(interval_offset) // %4
michael@0 2437 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
michael@0 2438 );
michael@0 2439 }
michael@0 2440
michael@0 2441 // Shade 8 pixels at a time by specified value: each channel of every pixel is scaled by the matching byte of value.
michael@0 2442 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
michael@0 2443 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
michael@0 2444 void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
michael@0 2445 uint32 value) {
michael@0 2446 asm volatile (
michael@0 2447 "vdup.u32 q0, %3 \n" // duplicate scale value.
michael@0 2448 "vzip.u8 d0, d1 \n" // d0 aarrggbb.
michael@0 2449 "vshr.u16 q0, q0, #1 \n" // scale / 2.
michael@0 2450
michael@0 2451 // 8 pixel loop.
michael@0 2452 ".p2align 2 \n"
michael@0 2453 "1: \n"
michael@0 2454 "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
michael@0 2455 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 2456 "vmovl.u8 q10, d20 \n" // b (0 .. 255)
michael@0 2457 "vmovl.u8 q11, d22 \n" // g
michael@0 2458 "vmovl.u8 q12, d24 \n" // r
michael@0 2459 "vmovl.u8 q13, d26 \n" // a
michael@0 2460 "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
michael@0 2461 "vqrdmulh.s16 q11, q11, d0[1] \n" // g
michael@0 2462 "vqrdmulh.s16 q12, q12, d0[2] \n" // r
michael@0 2463 "vqrdmulh.s16 q13, q13, d0[3] \n" // a
michael@0 2464 "vqmovn.u16 d20, q10 \n" // saturate back to 8 bit b
michael@0 2465 "vqmovn.u16 d22, q11 \n" // g
michael@0 2466 "vqmovn.u16 d24, q12 \n" // r
michael@0 2467 "vqmovn.u16 d26, q13 \n" // a
michael@0 2468 "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
michael@0 2469 "bgt 1b \n"
michael@0 2470 : "+r"(src_argb), // %0
michael@0 2471 "+r"(dst_argb), // %1
michael@0 2472 "+r"(width) // %2
michael@0 2473 : "r"(value) // %3
michael@0 2474 : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
michael@0 2475 );
michael@0 2476 }
michael@0 2477
michael@0 2478 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
michael@0 2479 // Similar to ARGBToYJ but stores ARGB: the gray value is written to B, G and R, alpha is stored as loaded.
michael@0 2480 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
michael@0 2481 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
michael@0 2482 asm volatile (
michael@0 2483 "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
michael@0 2484 "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
michael@0 2485 "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
michael@0 2486 ".p2align 2 \n"
michael@0 2487 "1: \n"
michael@0 2488 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
michael@0 2489 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 2490 "vmull.u8 q2, d0, d24 \n" // B
michael@0 2491 "vmlal.u8 q2, d1, d25 \n" // G
michael@0 2492 "vmlal.u8 q2, d2, d26 \n" // R
michael@0 2493 "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B (rounding shift supplies the + 64)
michael@0 2494 "vmov d1, d0 \n" // G
michael@0 2495 "vmov d2, d0 \n" // R
michael@0 2496 "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
michael@0 2497 "bgt 1b \n"
michael@0 2498 : "+r"(src_argb), // %0
michael@0 2499 "+r"(dst_argb), // %1
michael@0 2500 "+r"(width) // %2
michael@0 2501 :
michael@0 2502 : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
michael@0 2503 );
michael@0 2504 }
michael@0 2505
michael@0 2506 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place in dst_argb. Alpha is written back as loaded; results saturate to 255.
michael@0 2507 // b = (r * 35 + g * 68 + b * 17) >> 7
michael@0 2508 // g = (r * 45 + g * 88 + b * 22) >> 7
michael@0 2509 // r = (r * 50 + g * 98 + b * 24) >> 7
michael@0 2510 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
michael@0 2511 asm volatile (
michael@0 2512 "vmov.u8 d20, #17 \n" // BB coefficient
michael@0 2513 "vmov.u8 d21, #68 \n" // BG coefficient
michael@0 2514 "vmov.u8 d22, #35 \n" // BR coefficient
michael@0 2515 "vmov.u8 d24, #22 \n" // GB coefficient
michael@0 2516 "vmov.u8 d25, #88 \n" // GG coefficient
michael@0 2517 "vmov.u8 d26, #45 \n" // GR coefficient
michael@0 2518 "vmov.u8 d28, #24 \n" // RB coefficient
michael@0 2519 "vmov.u8 d29, #98 \n" // RG coefficient
michael@0 2520 "vmov.u8 d30, #50 \n" // RR coefficient
michael@0 2521 ".p2align 2 \n"
michael@0 2522 "1: \n"
michael@0 2523 "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels (no pointer update; the store below advances it).
michael@0 2524 "subs %1, %1, #8 \n" // 8 processed per loop.
michael@0 2525 "vmull.u8 q2, d0, d20 \n" // B to Sepia B
michael@0 2526 "vmlal.u8 q2, d1, d21 \n" // G
michael@0 2527 "vmlal.u8 q2, d2, d22 \n" // R
michael@0 2528 "vmull.u8 q3, d0, d24 \n" // B to Sepia G
michael@0 2529 "vmlal.u8 q3, d1, d25 \n" // G
michael@0 2530 "vmlal.u8 q3, d2, d26 \n" // R
michael@0 2531 "vmull.u8 q8, d0, d28 \n" // B to Sepia R
michael@0 2532 "vmlal.u8 q8, d1, d29 \n" // G
michael@0 2533 "vmlal.u8 q8, d2, d30 \n" // R
michael@0 2534 "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
michael@0 2535 "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
michael@0 2536 "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
michael@0 2537 "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
michael@0 2538 "bgt 1b \n"
michael@0 2539 : "+r"(dst_argb), // %0
michael@0 2540 "+r"(width) // %1
michael@0 2541 :
michael@0 2542 : "cc", "memory", "q0", "q1", "q2", "q3", // q8 holds the Sepia R accumulator, so it must be declared clobbered too.
michael@0 2543 "q8", "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 2544 );
michael@0 2545 }
michael@0 2546
michael@0 2547 // Transform 8 ARGB pixels (32 bytes) with color matrix. matrix_argb holds 16 signed bytes: four rows of four coefficients applied to (b, g, r, a), producing output B, G, R and A in that order. Sums are shifted right by 6 and saturated to 8 bit.
michael@0 2548 // TODO(fbarchard): Was same as Sepia except matrix is provided. This function
michael@0 2549 // needs to saturate. Consider doing a non-saturating version.
michael@0 2550 void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
michael@0 2551 const int8* matrix_argb, int width) {
michael@0 2552 asm volatile (
michael@0 2553 "vld1.8 {q2}, [%3] \n" // load 16 matrix coefficients (4 rows of 4).
michael@0 2554 "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
michael@0 2555 "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
michael@0 2556
michael@0 2557 ".p2align 2 \n"
michael@0 2558 "1: \n"
michael@0 2559 "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
michael@0 2560 "subs %2, %2, #8 \n" // 8 processed per loop.
michael@0 2561 "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
michael@0 2562 "vmovl.u8 q9, d18 \n" // g
michael@0 2563 "vmovl.u8 q10, d20 \n" // r
michael@0 2564 "vmovl.u8 q11, d22 \n" // a (kept in q11: q15 is overwritten by the A accumulator below)
michael@0 2565 "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
michael@0 2566 "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
michael@0 2567 "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
michael@0 2568 "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
michael@0 2569 "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
michael@0 2570 "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
michael@0 2571 "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
michael@0 2572 "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
michael@0 2573 "vqadd.s16 q12, q12, q4 \n" // Accumulate B
michael@0 2574 "vqadd.s16 q13, q13, q5 \n" // Accumulate G
michael@0 2575 "vqadd.s16 q14, q14, q6 \n" // Accumulate R
michael@0 2576 "vqadd.s16 q15, q15, q7 \n" // Accumulate A
michael@0 2577 "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
michael@0 2578 "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
michael@0 2579 "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
michael@0 2580 "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
michael@0 2581 "vqadd.s16 q12, q12, q4 \n" // Accumulate B
michael@0 2582 "vqadd.s16 q13, q13, q5 \n" // Accumulate G
michael@0 2583 "vqadd.s16 q14, q14, q6 \n" // Accumulate R
michael@0 2584 "vqadd.s16 q15, q15, q7 \n" // Accumulate A
michael@0 2585 "vmul.s16 q4, q11, d0[3] \n" // B += A * Matrix B
michael@0 2586 "vmul.s16 q5, q11, d1[3] \n" // G += A * Matrix G
michael@0 2587 "vmul.s16 q6, q11, d2[3] \n" // R += A * Matrix R
michael@0 2588 "vmul.s16 q7, q11, d3[3] \n" // A += A * Matrix A
michael@0 2589 "vqadd.s16 q12, q12, q4 \n" // Accumulate B
michael@0 2590 "vqadd.s16 q13, q13, q5 \n" // Accumulate G
michael@0 2591 "vqadd.s16 q14, q14, q6 \n" // Accumulate R
michael@0 2592 "vqadd.s16 q15, q15, q7 \n" // Accumulate A
michael@0 2593 "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
michael@0 2594 "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
michael@0 2595 "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
michael@0 2596 "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
michael@0 2597 "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
michael@0 2598 "bgt 1b \n"
michael@0 2599 : "+r"(src_argb), // %0
michael@0 2600 "+r"(dst_argb), // %1
michael@0 2601 "+r"(width) // %2
michael@0 2602 : "r"(matrix_argb) // %3
michael@0 2603 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
michael@0 2604 "q10", "q11", "q12", "q13", "q14", "q15"
michael@0 2605 );
michael@0 2606 }
michael@0 2607
michael@0 2608 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
michael@0 2609 #ifdef HAS_ARGBMULTIPLYROW_NEON
michael@0 2610 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. Every channel, alpha included, becomes (a * b + 128) >> 8.
michael@0 2611 void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
michael@0 2612 uint8* dst_argb, int width) {
michael@0 2613 asm volatile (
michael@0 2614 // 8 pixel loop.
michael@0 2615 ".p2align 2 \n"
michael@0 2616 "1: \n"
michael@0 2617 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels (even d registers).
michael@0 2618 "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels (odd d registers, pairing each channel).
michael@0 2619 "subs %3, %3, #8 \n" // 8 processed per loop.
michael@0 2620 "vmull.u8 q0, d0, d1 \n" // multiply B
michael@0 2621 "vmull.u8 q1, d2, d3 \n" // multiply G
michael@0 2622 "vmull.u8 q2, d4, d5 \n" // multiply R
michael@0 2623 "vmull.u8 q3, d6, d7 \n" // multiply A
michael@0 2624 "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
michael@0 2625 "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
michael@0 2626 "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
michael@0 2627 "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
michael@0 2628 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
michael@0 2629 "bgt 1b \n"
michael@0 2630
michael@0 2631 : "+r"(src_argb0), // %0
michael@0 2632 "+r"(src_argb1), // %1
michael@0 2633 "+r"(dst_argb), // %2
michael@0 2634 "+r"(width) // %3
michael@0 2635 :
michael@0 2636 : "cc", "memory", "q0", "q1", "q2", "q3"
michael@0 2637 );
michael@0 2638 }
michael@0 2639 #endif // HAS_ARGBMULTIPLYROW_NEON
michael@0 2640
michael@0 2641 // Add 2 rows of ARGB pixels together, 8 pixels at a time. Every channel, alpha included, is added with unsigned saturation.
michael@0 2642 void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
michael@0 2643 uint8* dst_argb, int width) {
michael@0 2644 asm volatile (
michael@0 2645 // 8 pixel loop.
michael@0 2646 ".p2align 2 \n"
michael@0 2647 "1: \n"
michael@0 2648 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
michael@0 2649 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
michael@0 2650 "subs %3, %3, #8 \n" // 8 processed per loop.
michael@0 2651 "vqadd.u8 q0, q0, q2 \n" // add B, G
michael@0 2652 "vqadd.u8 q1, q1, q3 \n" // add R, A
michael@0 2653 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
michael@0 2654 "bgt 1b \n" // repeat while the remaining count is > 0.
michael@0 2655
michael@0 2656 : "+r"(src_argb0), // %0
michael@0 2657 "+r"(src_argb1), // %1
michael@0 2658 "+r"(dst_argb), // %2
michael@0 2659 "+r"(width) // %3
michael@0 2660 :
michael@0 2661 : "cc", "memory", "q0", "q1", "q2", "q3"
michael@0 2662 );
michael@0 2663 }
michael@0 2664
michael@0 2665 // Subtract 2 rows of ARGB pixels, 8 pixels at a time: src_argb0 - src_argb1 per channel (alpha included), clamped at 0.
michael@0 2666 void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
michael@0 2667 uint8* dst_argb, int width) {
michael@0 2668 asm volatile (
michael@0 2669 // 8 pixel loop.
michael@0 2670 ".p2align 2 \n"
michael@0 2671 "1: \n"
michael@0 2672 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
michael@0 2673 "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
michael@0 2674 "subs %3, %3, #8 \n" // 8 processed per loop.
michael@0 2675 "vqsub.u8 q0, q0, q2 \n" // subtract B, G
michael@0 2676 "vqsub.u8 q1, q1, q3 \n" // subtract R, A
michael@0 2677 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
michael@0 2678 "bgt 1b \n" // repeat while the remaining count is > 0.
michael@0 2679
michael@0 2680 : "+r"(src_argb0), // %0
michael@0 2681 "+r"(src_argb1), // %1
michael@0 2682 "+r"(dst_argb), // %2
michael@0 2683 "+r"(width) // %3
michael@0 2684 :
michael@0 2685 : "cc", "memory", "q0", "q1", "q2", "q3"
michael@0 2686 );
michael@0 2687 }
michael@0 2688
michael@0 2689 // Adds Sobel X and Sobel Y (saturating at 255) and stores Sobel into ARGB, 8 pixels per pass.
michael@0 2690 // A = 255
michael@0 2691 // R = Sobel
michael@0 2692 // G = Sobel
michael@0 2693 // B = Sobel
michael@0 2694 void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
michael@0 2695 uint8* dst_argb, int width) {
michael@0 2696 asm volatile (
michael@0 2697 "vmov.u8 d3, #255 \n" // alpha
michael@0 2698 // 8 pixel loop.
michael@0 2699 ".p2align 2 \n"
michael@0 2700 "1: \n"
michael@0 2701 "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
michael@0 2702 "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
michael@0 2703 "subs %3, %3, #8 \n" // 8 processed per loop.
michael@0 2704 "vqadd.u8 d0, d0, d1 \n" // add
michael@0 2705 "vmov.u8 d1, d0 \n" // G = Sobel
michael@0 2706 "vmov.u8 d2, d0 \n" // R = Sobel
michael@0 2707 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
michael@0 2708 "bgt 1b \n"
michael@0 2709 : "+r"(src_sobelx), // %0
michael@0 2710 "+r"(src_sobely), // %1
michael@0 2711 "+r"(dst_argb), // %2
michael@0 2712 "+r"(width) // %3
michael@0 2713 :
michael@0 2714 : "cc", "memory", "q0", "q1"
michael@0 2715 );
michael@0 2716 }
michael@0 2717
michael@0 2718 // Adds Sobel X and Sobel Y (saturating at 255) and stores Sobel into plane, 16 pixels per pass.
michael@0 2719 void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
michael@0 2720 uint8* dst_y, int width) {
michael@0 2721 asm volatile (
michael@0 2722 // 16 pixel loop.
michael@0 2723 ".p2align 2 \n"
michael@0 2724 "1: \n"
michael@0 2725 "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
michael@0 2726 "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
michael@0 2727 "subs %3, %3, #16 \n" // 16 processed per loop.
michael@0 2728 "vqadd.u8 q0, q0, q1 \n" // add
michael@0 2729 "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
michael@0 2730 "bgt 1b \n" // repeat while the remaining count is > 0.
michael@0 2731 : "+r"(src_sobelx), // %0
michael@0 2732 "+r"(src_sobely), // %1
michael@0 2733 "+r"(dst_y), // %2
michael@0 2734 "+r"(width) // %3
michael@0 2735 :
michael@0 2736 : "cc", "memory", "q0", "q1"
michael@0 2737 );
michael@0 2738 }
michael@0 2739
michael@0 2740 // Mixes Sobel X, Sobel Y and Sobel into ARGB, 8 pixels per pass.
michael@0 2741 // A = 255
michael@0 2742 // R = Sobel X
michael@0 2743 // G = Sobel
michael@0 2744 // B = Sobel Y
michael@0 2745 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
michael@0 2746 uint8* dst_argb, int width) {
michael@0 2747 asm volatile (
michael@0 2748 "vmov.u8 d3, #255 \n" // alpha
michael@0 2749 // 8 pixel loop.
michael@0 2750 ".p2align 2 \n"
michael@0 2751 "1: \n"
michael@0 2752 "vld1.8 {d2}, [%0]! \n" // load 8 sobelx (stored as R).
michael@0 2753 "vld1.8 {d0}, [%1]! \n" // load 8 sobely (stored as B).
michael@0 2754 "subs %3, %3, #8 \n" // 8 processed per loop.
michael@0 2755 "vqadd.u8 d1, d0, d2 \n" // add (saturating sum stored as G)
michael@0 2756 "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
michael@0 2757 "bgt 1b \n"
michael@0 2758 : "+r"(src_sobelx), // %0
michael@0 2759 "+r"(src_sobely), // %1
michael@0 2760 "+r"(dst_argb), // %2
michael@0 2761 "+r"(width) // %3
michael@0 2762 :
michael@0 2763 : "cc", "memory", "q0", "q1"
michael@0 2764 );
michael@0 2765 }
michael@0 2766
michael@0 2767 // SobelX as a matrix is (the code sums column 0 minus column 2 over the three rows, middle row twice, then stores the absolute value saturated to 8 bit, so the sign convention does not matter)
michael@0 2768 // -1 0 1
michael@0 2769 // -2 0 2
michael@0 2770 // -1 0 1
michael@0 2771 void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
michael@0 2772 const uint8* src_y2, uint8* dst_sobelx, int width) {
michael@0 2773 asm volatile (
michael@0 2774 ".p2align 2 \n"
michael@0 2775 "1: \n"
michael@0 2776 "vld1.8 {d0}, [%0],%5 \n" // top: 8 bytes at x, then advance by 2
michael@0 2777 "vld1.8 {d1}, [%0],%6 \n" // 8 bytes at x + 2, then advance by 6 (8 in total per pass)
michael@0 2778 "vsubl.u8 q0, d0, d1 \n" // widen to 16 bit: top[x] - top[x + 2]
michael@0 2779 "vld1.8 {d2}, [%1],%5 \n" // center * 2
michael@0 2780 "vld1.8 {d3}, [%1],%6 \n"
michael@0 2781 "vsubl.u8 q1, d2, d3 \n"
michael@0 2782 "vadd.s16 q0, q0, q1 \n" // center difference added twice for the weight of 2
michael@0 2783 "vadd.s16 q0, q0, q1 \n"
michael@0 2784 "vld1.8 {d2}, [%2],%5 \n" // bottom
michael@0 2785 "vld1.8 {d3}, [%2],%6 \n"
michael@0 2786 "subs %4, %4, #8 \n" // 8 pixels
michael@0 2787 "vsubl.u8 q1, d2, d3 \n"
michael@0 2788 "vadd.s16 q0, q0, q1 \n"
michael@0 2789 "vabs.s16 q0, q0 \n" // magnitude
michael@0 2790 "vqmovn.u16 d0, q0 \n" // saturate to 8 bit
michael@0 2791 "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
michael@0 2792 "bgt 1b \n"
michael@0 2793 : "+r"(src_y0), // %0
michael@0 2794 "+r"(src_y1), // %1
michael@0 2795 "+r"(src_y2), // %2
michael@0 2796 "+r"(dst_sobelx), // %3
michael@0 2797 "+r"(width) // %4
michael@0 2798 : "r"(2), // %5 post-increment after the first load of each row
michael@0 2799 "r"(6) // %6 post-increment after the second load of each row
michael@0 2800 : "cc", "memory", "q0", "q1" // Clobber List
michael@0 2801 );
michael@0 2802 }
michael@0 2803
michael@0 2804 // SobelY as a matrix is (the code sums src_y0 minus src_y1 at columns x, x + 1 and x + 2, middle column twice, then stores the absolute value saturated to 8 bit, so the sign convention does not matter)
michael@0 2805 // -1 -2 -1
michael@0 2806 // 0 0 0
michael@0 2807 // 1 2 1
michael@0 2808 void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
michael@0 2809 uint8* dst_sobely, int width) {
michael@0 2810 asm volatile (
michael@0 2811 ".p2align 2 \n"
michael@0 2812 "1: \n"
michael@0 2813 "vld1.8 {d0}, [%0],%4 \n" // left: 8 bytes at x, then advance by 1
michael@0 2814 "vld1.8 {d1}, [%1],%4 \n"
michael@0 2815 "vsubl.u8 q0, d0, d1 \n" // widen to 16 bit: y0[x] - y1[x]
michael@0 2816 "vld1.8 {d2}, [%0],%4 \n" // center * 2
michael@0 2817 "vld1.8 {d3}, [%1],%4 \n"
michael@0 2818 "vsubl.u8 q1, d2, d3 \n"
michael@0 2819 "vadd.s16 q0, q0, q1 \n" // center difference added twice for the weight of 2
michael@0 2820 "vadd.s16 q0, q0, q1 \n"
michael@0 2821 "vld1.8 {d2}, [%0],%5 \n" // right: 8 bytes at x + 2, then advance by 6 (8 in total per pass)
michael@0 2822 "vld1.8 {d3}, [%1],%5 \n"
michael@0 2823 "subs %3, %3, #8 \n" // 8 pixels
michael@0 2824 "vsubl.u8 q1, d2, d3 \n"
michael@0 2825 "vadd.s16 q0, q0, q1 \n"
michael@0 2826 "vabs.s16 q0, q0 \n" // magnitude
michael@0 2827 "vqmovn.u16 d0, q0 \n" // saturate to 8 bit
michael@0 2828 "vst1.8 {d0}, [%2]! \n" // store 8 sobely
michael@0 2829 "bgt 1b \n"
michael@0 2830 : "+r"(src_y0), // %0
michael@0 2831 "+r"(src_y1), // %1
michael@0 2832 "+r"(dst_sobely), // %2
michael@0 2833 "+r"(width) // %3
michael@0 2834 : "r"(1), // %4 post-increment after the first and second loads
michael@0 2835 "r"(6) // %5 post-increment after the third load
michael@0 2836 : "cc", "memory", "q0", "q1" // Clobber List
michael@0 2837 );
michael@0 2838 }
michael@0 2839 #endif // __ARM_NEON__
michael@0 2840
michael@0 2841 #ifdef __cplusplus
michael@0 2842 } // extern "C"
michael@0 2843 } // namespace libyuv
michael@0 2844 #endif

mercurial