Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
michael@0 | 3 | * |
michael@0 | 4 | * Use of this source code is governed by a BSD-style license |
michael@0 | 5 | * that can be found in the LICENSE file in the root of the source |
michael@0 | 6 | * tree. An additional intellectual property rights grant can be found |
michael@0 | 7 | * in the file PATENTS. All contributing project authors may |
michael@0 | 8 | * be found in the AUTHORS file in the root of the source tree. |
michael@0 | 9 | */ |
michael@0 | 10 | |
michael@0 | 11 | #include "libyuv/row.h" |
michael@0 | 12 | |
michael@0 | 13 | #ifdef __cplusplus |
michael@0 | 14 | namespace libyuv { |
michael@0 | 15 | extern "C" { |
michael@0 | 16 | #endif |
michael@0 | 17 | |
michael@0 | 18 | // This module is for GCC Neon |
michael@0 | 19 | #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) |
michael@0 | 20 | |
// Read 8 Y, 4 U and 4 V from 422.
// Loads 8 Y bytes into d0; packs 4 U bytes into d2[0..3] and 4 V bytes
// into d2[4..7] so that YUV422TORGB can multiply U and V in one register.
#define READYUV422                                                             \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    "vld1.32    {d2[0]}, [%1]!                 \n"                             \
    "vld1.32    {d2[1]}, [%2]!                 \n"
michael@0 | 26 | |
// Read 8 Y, 2 U and 2 V from 411.
// Loads 8 Y into d0 and 2 U + 2 V halfword-packed into d2, then duplicates
// each chroma sample (vmov + vzip) to produce the 4 U / 4 V layout that
// YUV422TORGB expects in d2.
#define READYUV411                                                             \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    "vld1.16    {d2[0]}, [%1]!                 \n"                             \
    "vld1.16    {d2[1]}, [%2]!                 \n"                             \
    "vmov.u8    d3, d2                         \n"                             \
    "vzip.u8    d2, d3                         \n"
michael@0 | 34 | |
// Read 8 Y, 8 U and 8 V from 444.
// Loads full-resolution chroma (8 U into d2, 8 V into d3), then averages
// adjacent pairs (vpaddl.u8 sums pairs, vrshrn rounds and halves) so the
// result is 4 U + 4 V in d2, matching the layout YUV422TORGB expects.
#define READYUV444                                                             \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    "vld1.8     {d2}, [%1]!                    \n"                             \
    "vld1.8     {d3}, [%2]!                    \n"                             \
    "vpaddl.u8  q1, q1                         \n"                             \
    "vrshrn.u16 d2, q1, #1                     \n"
michael@0 | 42 | |
// Read 8 Y, and set 4 U and 4 V to 128 (neutral chroma, i.e. greyscale),
// so the shared YUV422TORGB path produces R == G == B.
#define READYUV400                                                             \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    "vmov.u8    d2, #128                       \n"
michael@0 | 47 | |
// Read 8 Y and 4 UV from NV12.
// The interleaved UV plane is loaded into d2, then vuzp/vtrn separate it
// into 4 U followed by 4 V within d2 (d3 is scratch).
#define READNV12                                                               \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    "vld1.8     {d2}, [%1]!                    \n"                             \
    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
    "vuzp.u8    d2, d3                         \n"                             \
    "vtrn.u32   d2, d3                         \n"
michael@0 | 55 | |
// Read 8 Y and 4 VU from NV21.
// Same as READNV12 but the plane is V-first, so the vuzp operand order is
// swapped (d3, d2) to land U before V in d2.
#define READNV21                                                               \
    "vld1.8     {d0}, [%0]!                    \n"                             \
    "vld1.8     {d2}, [%1]!                    \n"                             \
    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
    "vuzp.u8    d3, d2                         \n"                             \
    "vtrn.u32   d2, d3                         \n"
michael@0 | 63 | |
// Read 8 YUY2 pixels (Y0 U Y1 V ...).
// vld2 deinterleaves Y into d0 and UV into d2; vuzp/vtrn then separate the
// chroma into 4 U + 4 V within d2 for YUV422TORGB.
#define READYUY2                                                               \
    "vld2.8     {d0, d2}, [%0]!                \n"                             \
    "vmov.u8    d3, d2                         \n"                             \
    "vuzp.u8    d2, d3                         \n"                             \
    "vtrn.u32   d2, d3                         \n"
michael@0 | 70 | |
// Read 8 UYVY pixels (U Y0 V Y1 ...).
// vld2 deinterleaves UV into d2 and Y into d3; the extra vmovs route Y into
// d0 before the chroma is split into 4 U + 4 V within d2.
#define READUYVY                                                               \
    "vld2.8     {d2, d3}, [%0]!                \n"                             \
    "vmov.u8    d0, d3                         \n"                             \
    "vmov.u8    d3, d2                         \n"                             \
    "vuzp.u8    d2, d3                         \n"                             \
    "vtrn.u32   d2, d3                         \n"
michael@0 | 78 | |
// Convert 8 Y (d0) and 4 U + 4 V (d2, as packed by the READ* macros) to
// 8 interleaved pixels: B in d20, G in d21, R in d22 (ready for vst3/vst4).
// Fixed-point math: chroma is biased by -128 (veor with d26 == 128), then
// multiplied by the coefficients preloaded in d24 (B/R) and d25 (G); luma is
// offset by q15 (== 16) and scaled by q14 (== 74); results are saturated and
// narrowed with a >>6.  Requires d24/d25/d26/q14/q15 set up by the caller.
#define YUV422TORGB                                                            \
    "veor.u8    d2, d26                        \n"/*subtract 128 from u and v*/\
    "vmull.s8   q8, d2, d24                    \n"/* u/v B/R component      */\
    "vmull.s8   q9, d2, d25                    \n"/* u/v G component        */\
    "vmov.u8    d1, #0                         \n"/* split odd/even y apart */\
    "vtrn.u8    d0, d1                         \n"                             \
    "vsub.s16   q0, q0, q15                    \n"/* offset y               */\
    "vmul.s16   q0, q0, q14                    \n"                             \
    "vadd.s16   d18, d19                       \n"                             \
    "vqadd.s16  d20, d0, d16                   \n" /* B */                     \
    "vqadd.s16  d21, d1, d16                   \n"                             \
    "vqadd.s16  d22, d0, d17                   \n" /* R */                     \
    "vqadd.s16  d23, d1, d17                   \n"                             \
    "vqadd.s16  d16, d0, d18                   \n" /* G */                     \
    "vqadd.s16  d17, d1, d18                   \n"                             \
    "vqshrun.s16 d0, q10, #6                   \n" /* B */                     \
    "vqshrun.s16 d1, q11, #6                   \n" /* G */                     \
    "vqshrun.s16 d2, q8, #6                    \n" /* R */                     \
    "vmovl.u8   q10, d0                        \n"/* set up for reinterleave*/\
    "vmovl.u8   q11, d1                        \n"                             \
    "vmovl.u8   q8, d2                         \n"                             \
    "vtrn.u8    d20, d21                       \n"                             \
    "vtrn.u8    d22, d23                       \n"                             \
    "vtrn.u8    d16, d17                       \n"                             \
    "vmov.u8    d21, d16                       \n"
michael@0 | 104 | |
// Signed chroma coefficients consumed by YUV422TORGB via vmull.s8.
// First 4 lanes apply to U, next 4 lanes to V; high 8 lanes are unused.
// kUVToRB: U*127 -> B contribution, V*102 -> R contribution.
// kUVToG:  U*-25 and V*-52 -> G contribution (summed by vadd d18, d19).
static vec8 kUVToRB  = { 127, 127, 127, 127, 102, 102, 102, 102,
                         0, 0, 0, 0, 0, 0, 0, 0 };
static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
                       0, 0, 0, 0, 0, 0, 0, 0 };
michael@0 | 109 | |
// Convert one row of I444 (planar Y, full-width U and V) to ARGB.
// Processes 8 pixels per loop iteration; assumes width is a positive
// multiple of 8 (callers handle any remainder) — NOTE(review): confirm.
void I444ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    "vld1.8     {d24}, [%5]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%6]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV444
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vmov.u8    d23, #255                      \n"  // opaque alpha.
    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"  // store 8 BGRA-byte-order (ARGB) pixels.
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : "r"(&kUVToRB),   // %5
      "r"(&kUVToG)     // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 140 | |
// Convert one row of I422 (planar Y, half-width U and V) to ARGB.
// 8 pixels per iteration; assumes width is a positive multiple of 8 —
// NOTE(review): callers appear to pad; confirm.
void I422ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    "vld1.8     {d24}, [%5]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%6]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vmov.u8    d23, #255                      \n"  // opaque alpha.
    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : "r"(&kUVToRB),   // %5
      "r"(&kUVToG)     // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 171 | |
// Convert one row of I411 (planar Y, quarter-width U and V) to ARGB.
// READYUV411 duplicates each chroma sample to feed the shared 422 path.
// 8 pixels per iteration; assumes width is a positive multiple of 8.
void I411ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    "vld1.8     {d24}, [%5]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%6]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV411
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vmov.u8    d23, #255                      \n"  // opaque alpha.
    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : "r"(&kUVToRB),   // %5
      "r"(&kUVToG)     // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 202 | |
// Convert one row of I422 to BGRA byte order.
// Identical to I422ToARGBRow_NEON except B and R are swapped (vswp) and the
// alpha byte is stored first (d19) instead of last.
void I422ToBGRARow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_bgra,
                        int width) {
  asm volatile (
    "vld1.8     {d24}, [%5]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%6]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vswp.u8    d20, d22                       \n"  // B <-> R.
    "vmov.u8    d19, #255                      \n"  // opaque alpha, stored first.
    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"  // store 8 BGRA pixels.
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_bgra),  // %3
      "+r"(width)      // %4
    : "r"(&kUVToRB),   // %5
      "r"(&kUVToG)     // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 234 | |
// Convert one row of I422 to ABGR byte order.
// Like I422ToARGBRow_NEON but with B and R swapped before the store.
void I422ToABGRRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_abgr,
                        int width) {
  asm volatile (
    "vld1.8     {d24}, [%5]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%6]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vswp.u8    d20, d22                       \n"  // B <-> R.
    "vmov.u8    d23, #255                      \n"  // opaque alpha.
    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"  // store 8 ABGR pixels.
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_abgr),  // %3
      "+r"(width)      // %4
    : "r"(&kUVToRB),   // %5
      "r"(&kUVToG)     // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 266 | |
// Convert one row of I422 to RGBA byte order.
// Like I422ToARGBRow_NEON but the alpha byte (d19) is stored first.
void I422ToRGBARow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_rgba,
                        int width) {
  asm volatile (
    "vld1.8     {d24}, [%5]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%6]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vmov.u8    d19, #255                      \n"  // opaque alpha, stored first.
    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"  // store 8 RGBA pixels.
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_rgba),  // %3
      "+r"(width)      // %4
    : "r"(&kUVToRB),   // %5
      "r"(&kUVToG)     // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 297 | |
// Convert one row of I422 to 24-bit RGB24 (3 bytes/pixel, no alpha).
// Uses vst3 to drop the alpha channel entirely.
void I422ToRGB24Row_NEON(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_rgb24,
                         int width) {
  asm volatile (
    "vld1.8     {d24}, [%5]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%6]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vst3.8     {d20, d21, d22}, [%3]!         \n"  // store 8 RGB24 pixels.
    "bgt        1b                             \n"
    : "+r"(src_y),      // %0
      "+r"(src_u),      // %1
      "+r"(src_v),      // %2
      "+r"(dst_rgb24),  // %3
      "+r"(width)       // %4
    : "r"(&kUVToRB),    // %5
      "r"(&kUVToG)      // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 327 | |
// Convert one row of I422 to RAW (RGB24 with B and R swapped).
// Same as I422ToRGB24Row_NEON plus a vswp of the B and R planes.
void I422ToRAWRow_NEON(const uint8* src_y,
                       const uint8* src_u,
                       const uint8* src_v,
                       uint8* dst_raw,
                       int width) {
  asm volatile (
    "vld1.8     {d24}, [%5]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%6]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vswp.u8    d20, d22                       \n"  // B <-> R.
    "vst3.8     {d20, d21, d22}, [%3]!         \n"  // store 8 RAW pixels.
    "bgt        1b                             \n"
    : "+r"(src_y),    // %0
      "+r"(src_u),    // %1
      "+r"(src_v),    // %2
      "+r"(dst_raw),  // %3
      "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 358 | |
// Pack 8 pixels of 8-bit B (d20), G (d21), R (d22) into 8 RGB565 halfwords
// in q0: each channel is truncated (B>>3, G>>2, R>>3), widened to 16 bits,
// shifted into position (G<<5, R<<11) and OR'd together.
#define ARGBTORGB565                                                           \
    "vshr.u8    d20, d20, #3                   \n"  /* B                    */ \
    "vshr.u8    d21, d21, #2                   \n"  /* G                    */ \
    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
    "vshl.u16   q10, q10, #11                  \n"  /* R                    */ \
    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
    "vorr       q0, q0, q10                    \n"  /* BGR                  */
michael@0 | 370 | |
// Convert one row of I422 to RGB565 (16 bits/pixel).
// 8 pixels per iteration; assumes width is a positive multiple of 8.
void I422ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
                          uint8* dst_rgb565,
                          int width) {
  asm volatile (
    "vld1.8     {d24}, [%5]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%6]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    ARGBTORGB565
    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels RGB565.
    "bgt        1b                             \n"
    : "+r"(src_y),       // %0
      "+r"(src_u),       // %1
      "+r"(src_v),       // %2
      "+r"(dst_rgb565),  // %3
      "+r"(width)        // %4
    : "r"(&kUVToRB),     // %5
      "r"(&kUVToG)       // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 401 | |
// Pack 8 pixels of 8-bit B (d20), G (d21), R (d22), A (d23) into 8 ARGB1555
// halfwords in q0: B and G are reduced to 5 bits via the q10 shift, R to
// 5 bits, A to 1 bit, then each field is widened, shifted into place
// (G<<5, R<<10, A<<15) and OR'd together.
#define ARGBTOARGB1555                                                         \
    "vshr.u8    q10, q10, #3                   \n"  /* B                    */ \
    "vshr.u8    d22, d22, #3                   \n"  /* R                    */ \
    "vshr.u8    d23, d23, #7                   \n"  /* A                    */ \
    "vmovl.u8   q8, d20                        \n"  /* B                    */ \
    "vmovl.u8   q9, d21                        \n"  /* G                    */ \
    "vmovl.u8   q10, d22                       \n"  /* R                    */ \
    "vmovl.u8   q11, d23                       \n"  /* A                    */ \
    "vshl.u16   q9, q9, #5                     \n"  /* G                    */ \
    "vshl.u16   q10, q10, #10                  \n"  /* R                    */ \
    "vshl.u16   q11, q11, #15                  \n"  /* A                    */ \
    "vorr       q0, q8, q9                     \n"  /* BG                   */ \
    "vorr       q1, q10, q11                   \n"  /* RA                   */ \
    "vorr       q0, q0, q1                     \n"  /* BGRA                 */
michael@0 | 416 | |
// Convert one row of I422 to ARGB1555 (16 bits/pixel, 1-bit alpha).
// Alpha is forced opaque (d23 = 255) before packing.
void I422ToARGB1555Row_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb1555,
                            int width) {
  asm volatile (
    "vld1.8     {d24}, [%5]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%6]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vmov.u8    d23, #255                      \n"  // opaque alpha.
    ARGBTOARGB1555
    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB1555.
    "bgt        1b                             \n"
    : "+r"(src_y),         // %0
      "+r"(src_u),         // %1
      "+r"(src_v),         // %2
      "+r"(dst_argb1555),  // %3
      "+r"(width)          // %4
    : "r"(&kUVToRB),       // %5
      "r"(&kUVToG)         // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 448 | |
// Pack 8 pixels of 8-bit B (d20), G (d21), R (d22), A (d23) into 8 ARGB4444
// halfwords in q0.  B and R keep their high nibble shifted down; G and A
// keep their high nibble in place (vbic clears the low nibble).
// Requires d4 == 0x0f, set up by the caller before the loop.
#define ARGBTOARGB4444                                                         \
    "vshr.u8    d20, d20, #4                   \n"  /* B                    */ \
    "vbic.32    d21, d21, d4                   \n"  /* G                    */ \
    "vshr.u8    d22, d22, #4                   \n"  /* R                    */ \
    "vbic.32    d23, d23, d4                   \n"  /* A                    */ \
    "vorr       d0, d20, d21                   \n"  /* BG                   */ \
    "vorr       d1, d22, d23                   \n"  /* RA                   */ \
    "vzip.u8    d0, d1                         \n"  /* BGRA                 */
michael@0 | 457 | |
// Convert one row of I422 to ARGB4444 (16 bits/pixel, 4-bit channels).
// Preloads d4 with the 0x0f nibble mask required by ARGBTOARGB4444.
void I422ToARGB4444Row_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb4444,
                            int width) {
  asm volatile (
    "vld1.8     {d24}, [%5]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%6]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    "vmov.u8    d4, #0x0f                      \n"  // bits to clear with vbic.
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vmov.u8    d23, #255                      \n"  // opaque alpha.
    ARGBTOARGB4444
    "vst1.8     {q0}, [%3]!                    \n"  // store 8 pixels ARGB4444.
    "bgt        1b                             \n"
    : "+r"(src_y),         // %0
      "+r"(src_u),         // %1
      "+r"(src_v),         // %2
      "+r"(dst_argb4444),  // %3
      "+r"(width)          // %4
    : "r"(&kUVToRB),       // %5
      "r"(&kUVToG)         // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 490 | |
// Convert one row of Y (greyscale, with the 16..235 video-range transform
// applied via the shared YUV422TORGB path) to ARGB.
void YToARGBRow_NEON(const uint8* src_y,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    "vld1.8     {d24}, [%3]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%4]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV400
    YUV422TORGB
    "subs       %2, %2, #8                     \n"
    "vmov.u8    d23, #255                      \n"  // opaque alpha.
    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : "r"(&kUVToRB),   // %3
      "r"(&kUVToG)     // %4
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 517 | |
// Convert one row of I400 (greyscale) to ARGB by replicating Y into
// B, G and R unchanged (no colorspace transform) with opaque alpha.
void I400ToARGBRow_NEON(const uint8* src_y,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    ".p2align  2                               \n"
    "vmov.u8    d23, #255                      \n"  // opaque alpha, loop-invariant.
  "1:                                          \n"
    "vld1.8     {d20}, [%0]!                   \n"  // load 8 Y.
    "vmov       d21, d20                       \n"  // G = Y.
    "vmov       d22, d20                       \n"  // R = Y.
    "subs       %2, %2, #8                     \n"
    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    :
    : "cc", "memory", "d20", "d21", "d22", "d23"
  );
}
michael@0 | 538 | |
// Convert one row of NV12 (planar Y plus interleaved UV) to ARGB.
// 8 pixels per iteration; assumes width is a positive multiple of 8.
void NV12ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_uv,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    "vld1.8     {d24}, [%4]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%5]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READNV12
    YUV422TORGB
    "subs       %3, %3, #8                     \n"
    "vmov.u8    d23, #255                      \n"  // opaque alpha.
    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : "r"(&kUVToRB),   // %4
      "r"(&kUVToG)     // %5
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 567 | |
// Convert one row of NV21 (planar Y plus interleaved VU) to ARGB.
// Identical to NV12ToARGBRow_NEON except for the V-first chroma reader.
void NV21ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_uv,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    "vld1.8     {d24}, [%4]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%5]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READNV21
    YUV422TORGB
    "subs       %3, %3, #8                     \n"
    "vmov.u8    d23, #255                      \n"  // opaque alpha.
    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : "r"(&kUVToRB),   // %4
      "r"(&kUVToG)     // %5
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 596 | |
// Convert one row of NV12 to RGB565 (16 bits/pixel).
void NV12ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_uv,
                          uint8* dst_rgb565,
                          int width) {
  asm volatile (
    "vld1.8     {d24}, [%4]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%5]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READNV12
    YUV422TORGB
    "subs       %3, %3, #8                     \n"
    ARGBTORGB565
    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
    "bgt        1b                             \n"
    : "+r"(src_y),       // %0
      "+r"(src_uv),      // %1
      "+r"(dst_rgb565),  // %2
      "+r"(width)        // %3
    : "r"(&kUVToRB),     // %4
      "r"(&kUVToG)       // %5
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 625 | |
// Convert one row of NV21 to RGB565 (16 bits/pixel).
// Identical to NV12ToRGB565Row_NEON except for the V-first chroma reader.
void NV21ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_uv,
                          uint8* dst_rgb565,
                          int width) {
  asm volatile (
    "vld1.8     {d24}, [%4]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%5]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READNV21
    YUV422TORGB
    "subs       %3, %3, #8                     \n"
    ARGBTORGB565
    "vst1.8     {q0}, [%2]!                    \n"  // store 8 pixels RGB565.
    "bgt        1b                             \n"
    : "+r"(src_y),       // %0
      "+r"(src_uv),      // %1
      "+r"(dst_rgb565),  // %2
      "+r"(width)        // %3
    : "r"(&kUVToRB),     // %4
      "r"(&kUVToG)       // %5
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 654 | |
// Convert one row of packed YUY2 (Y0 U Y1 V) to ARGB.
void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    "vld1.8     {d24}, [%3]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%4]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUY2
    YUV422TORGB
    "subs       %2, %2, #8                     \n"
    "vmov.u8    d23, #255                      \n"  // opaque alpha.
    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
    : "+r"(src_yuy2),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : "r"(&kUVToRB),   // %3
      "r"(&kUVToG)     // %4
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 681 | |
// Convert one row of packed UYVY (U Y0 V Y1) to ARGB.
void UYVYToARGBRow_NEON(const uint8* src_uyvy,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    "vld1.8     {d24}, [%3]                    \n"  // d24 = U/V -> B/R coefficients.
    "vld1.8     {d25}, [%4]                    \n"  // d25 = U/V -> G coefficients.
    "vmov.u8    d26, #128                      \n"  // chroma bias.
    "vmov.u16   q14, #74                       \n"  // luma gain.
    "vmov.u16   q15, #16                       \n"  // luma offset.
    ".p2align  2                               \n"
  "1:                                          \n"
    READUYVY
    YUV422TORGB
    "subs       %2, %2, #8                     \n"
    "vmov.u8    d23, #255                      \n"  // opaque alpha.
    "vst4.8     {d20, d21, d22, d23}, [%1]!    \n"  // store 8 ARGB pixels.
    "bgt        1b                             \n"
    : "+r"(src_uyvy),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : "r"(&kUVToRB),   // %3
      "r"(&kUVToG)     // %4
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 708 | |
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
// Processes 16 pairs per iteration; assumes width is a positive multiple
// of 16 — NOTE(review): remainder handling appears to live in callers.
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                     int width) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld2.8     {q0, q1}, [%0]!                \n"  // load 16 pairs of UV (deinterleaved).
    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
    "vst1.8     {q0}, [%1]!                    \n"  // store U.
    "vst1.8     {q1}, [%2]!                    \n"  // store V.
    "bgt        1b                             \n"
    : "+r"(src_uv),  // %0
      "+r"(dst_u),   // %1
      "+r"(dst_v),   // %2
      "+r"(width)    // %3  // Output registers
    :                       // Input registers
    : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
michael@0 | 728 | |
// Reads 16 U's and V's and writes out 16 pairs of UV.
// Inverse of SplitUVRow_NEON; 16 pairs per iteration, width assumed to be
// a positive multiple of 16.
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld1.8     {q0}, [%0]!                    \n"  // load U.
    "vld1.8     {q1}, [%1]!                    \n"  // load V.
    "subs       %3, %3, #16                    \n"  // 16 processed per loop.
    "vst2.u8    {q0, q1}, [%2]!                \n"  // store 16 pairs of UV (interleaved).
    "bgt        1b                             \n"
    :
      "+r"(src_u),   // %0
      "+r"(src_v),   // %1
      "+r"(dst_uv),  // %2
      "+r"(width)    // %3  // Output registers
    :                       // Input registers
    : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
michael@0 | 749 | |
// Copy multiple of 32.  vld4.8 allow unaligned and is fastest on a15.
// Copies 32 bytes per iteration; count is assumed to be a positive
// multiple of 32.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 32.
    "subs       %2, %2, #32                    \n"  // 32 processed per loop.
    "vst1.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 32.
    "bgt        1b                             \n"
    : "+r"(src),   // %0
      "+r"(dst),   // %1
      "+r"(count)  // %2  // Output registers
    :                     // Input registers
    : "cc", "memory", "q0", "q1"  // Clobber List
  );
}
michael@0 | 766 | |
// SetRow8 writes 'count' bytes using a 32 bit value repeated.
// The 32 bit pattern v32 is broadcast to a full q register and stored
// 16 bytes per iteration; count is assumed a positive multiple of 16.
void SetRow_NEON(uint8* dst, uint32 v32, int count) {
  asm volatile (
    "vdup.u32 q0, %2 \n" // duplicate 4 ints
    "1: \n"
    "subs %1, %1, #16 \n" // 16 bytes per loop
    "vst1.8 {q0}, [%0]! \n" // store
    "bgt 1b \n"
    : "+r"(dst), // %0
    "+r"(count) // %1
    : "r"(v32) // %2
    : "cc", "memory", "q0"
  );
}
michael@0 | 781 | |
michael@0 | 782 | // TODO(fbarchard): Make fully assembler |
michael@0 | 783 | // SetRow32 writes 'count' words using a 32 bit value repeated. |
michael@0 | 784 | void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, |
michael@0 | 785 | int dst_stride, int height) { |
michael@0 | 786 | for (int y = 0; y < height; ++y) { |
michael@0 | 787 | SetRow_NEON(dst, v32, width << 2); |
michael@0 | 788 | dst += dst_stride; |
michael@0 | 789 | } |
michael@0 | 790 | } |
michael@0 | 791 | |
// Horizontally mirrors a row of bytes: dst[i] = src[width - 1 - i].
// Walks src backwards 16 bytes at a time (negative post-index in r3) and
// byte-reverses each vector with vrev64 before storing the halves swapped.
// width is assumed a positive multiple of 16.
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Start at end of source row.
    "mov r3, #-16 \n"
    "add %0, %0, %2 \n"
    "sub %0, #16 \n"

    ".p2align 2 \n"
    "1: \n"
    "vld1.8 {q0}, [%0], r3 \n" // src -= 16
    "subs %2, #16 \n" // 16 pixels per loop.
    "vrev64.8 q0, q0 \n"
    "vst1.8 {d1}, [%1]! \n" // dst += 16
    "vst1.8 {d0}, [%1]! \n"
    "bgt 1b \n"
    : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(width) // %2
    :
    : "cc", "memory", "r3", "q0"
  );
}
michael@0 | 814 | |
// Horizontally mirrors an interleaved UV row while splitting it into
// separate U and V planes (8 UV pairs per iteration). Walks the source
// backwards with a negative post-index in r12; vld2 deinterleaves and
// vrev64 reverses each plane. width (in UV pairs) is assumed a positive
// multiple of 8.
void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                      int width) {
  asm volatile (
    // Start at end of source row.
    "mov r12, #-16 \n"
    "add %0, %0, %3, lsl #1 \n"
    "sub %0, #16 \n"

    ".p2align 2 \n"
    "1: \n"
    "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
    "subs %3, #8 \n" // 8 pixels per loop.
    "vrev64.8 q0, q0 \n"
    "vst1.8 {d0}, [%1]! \n" // dst += 8
    "vst1.8 {d1}, [%2]! \n"
    "bgt 1b \n"
    : "+r"(src_uv), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(width) // %3
    :
    : "cc", "memory", "r12", "q0"
  );
}
michael@0 | 839 | |
// Horizontally mirrors a row of 32 bit ARGB pixels (4 pixels = 16 bytes
// per iteration). Same scheme as MirrorRow_NEON but reverses at 32 bit
// granularity so each pixel stays intact. width is assumed a positive
// multiple of 4.
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // Start at end of source row.
    "mov r3, #-16 \n"
    "add %0, %0, %2, lsl #2 \n"
    "sub %0, #16 \n"

    ".p2align 2 \n"
    "1: \n"
    "vld1.8 {q0}, [%0], r3 \n" // src -= 16
    "subs %2, #4 \n" // 4 pixels per loop.
    "vrev64.32 q0, q0 \n"
    "vst1.8 {d1}, [%1]! \n" // dst += 16
    "vst1.8 {d0}, [%1]! \n"
    "bgt 1b \n"
    : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(width) // %2
    :
    : "cc", "memory", "r3", "q0"
  );
}
michael@0 | 862 | |
// Converts packed 24 bit RGB24 (B,G,R byte order in memory) to 32 bit ARGB
// by appending a constant 0xFF alpha byte. vld3 deinterleaves the three
// channels; vst4 re-interleaves them with alpha. 8 pixels per iteration;
// pix assumed a positive multiple of 8.
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "vmov.u8 d4, #255 \n" // Alpha
    ".p2align 2 \n"
    "1: \n"
    "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_rgb24), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
    :
    : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
  );
}
michael@0 | 879 | |
// Converts packed 24 bit RAW (opposite channel order to RGB24) to 32 bit
// ARGB: same as RGB24ToARGBRow_NEON plus a vswp of the first and third
// channels to swap R and B. 8 pixels per iteration; pix assumed a positive
// multiple of 8.
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "vmov.u8 d4, #255 \n" // Alpha
    ".p2align 2 \n"
    "1: \n"
    "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vswp.u8 d1, d3 \n" // swap R, B
    "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_raw), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
    :
    : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
  );
}
michael@0 | 897 | |
// Expands 8 RGB565 pixels held in q0 into 8 bit planes: B in d0, G in d1,
// R in d2. Each 5/6 bit field is shifted to the top of a byte and its own
// high bits are OR'd into the low bits to fill the full 0..255 range.
// Scratch: d4-d6 (q2/q3 halves); caller must list them as clobbered.
#define RGB565TOARGB \
  "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
  "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
  "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
  "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
  "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
  "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
  "vorr.u8 d0, d0, d4 \n" /* B */ \
  "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
  "vorr.u8 d2, d1, d5 \n" /* R */ \
  "vorr.u8 d1, d4, d6 \n" /* G */
michael@0 | 909 | |
// Converts packed 16 bit RGB565 pixels to 32 bit ARGB with opaque alpha.
// Loads 8 pixels, expands them with RGB565TOARGB into d0/d1/d2 and stores
// with a constant 0xFF alpha in d3. 8 pixels per iteration; pix assumed a
// positive multiple of 8.
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
  asm volatile (
    "vmov.u8 d3, #255 \n" // Alpha
    ".p2align 2 \n"
    "1: \n"
    "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    RGB565TOARGB
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_rgb565), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
  );
}
michael@0 | 927 | |
// Expands 8 ARGB1555 pixels held in q0 into 8 bit planes: B in d0, G in d1,
// R in d2, A in d3. vneg.s8 turns the 1 bit alpha (0 or 1) into 0x00/0xFF.
// Note: the final line no longer ends in '\' -- the old trailing
// continuation silently pulled the following blank line into the macro.
#define ARGB1555TOARGB \
  "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
  "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
  "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
  "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
  "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
  "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
  "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
  "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
  "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
  "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
  "vorr.u8 q1, q1, q3 \n" /* R,A */ \
  "vorr.u8 q0, q0, q2 \n" /* B,G */
michael@0 | 941 | |
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
// Expands 8 RGB555 pixels in q0 into 8 bit planes: B in d0, G in d1,
// R in d2 (each 5 bit field replicated into the low bits for full range).
// Scratch: d4-d6; caller must list q2/q3 as clobbered.
#define RGB555TOARGB \
  "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
  "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
  "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
  "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
  "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
  "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
  "vorr.u8 d0, d0, d4 \n" /* B */ \
  "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
  "vorr.u8 d2, d1, d5 \n" /* R */ \
  "vorr.u8 d1, d4, d6 \n" /* G */
michael@0 | 954 | |
// Converts packed 16 bit ARGB1555 pixels to 32 bit ARGB.
// d3 is preloaded with 255 but ARGB1555TOARGB rewrites d3 (q1 half) with
// the expanded 1 bit alpha before the store. 8 pixels per iteration;
// pix assumed a positive multiple of 8.
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  asm volatile (
    "vmov.u8 d3, #255 \n" // Alpha
    ".p2align 2 \n"
    "1: \n"
    "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGB1555TOARGB
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_argb1555), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
  );
}
michael@0 | 973 | |
// Expands 8 ARGB4444 pixels held in q0 into 8 bit planes B,G,R,A in
// d0,d1,d2,d3. Each 4 bit nibble is replicated into both halves of a byte
// (x -> x*17) via shift and OR. Scratch: q2.
#define ARGB4444TOARGB \
  "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
  "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
  "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
  "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
  "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
  "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
  "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
  "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
michael@0 | 983 | |
// Converts packed 16 bit ARGB4444 pixels to 32 bit ARGB.
// d3 is preloaded with 255 but ARGB4444TOARGB rewrites d3 (q1 half) with
// the expanded 4 bit alpha before the store. 8 pixels per iteration;
// pix assumed a positive multiple of 8.
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  asm volatile (
    "vmov.u8 d3, #255 \n" // Alpha
    ".p2align 2 \n"
    "1: \n"
    "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGB4444TOARGB
    "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
    "bgt 1b \n"
    : "+r"(src_argb4444), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
    :
    : "cc", "memory", "q0", "q1", "q2" // Clobber List
  );
}
michael@0 | 1002 | |
// Converts 32 bit ARGB to packed 24 bit RGB24 by simply dropping the
// alpha channel: vld4 deinterleaves B,G,R,A; vst3 stores only B,G,R.
// 8 pixels per iteration; pix assumed a positive multiple of 8.
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
    "+r"(dst_rgb24), // %1
    "+r"(pix) // %2
    :
    : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
  );
}
michael@0 | 1018 | |
// Converts 32 bit ARGB to packed 24 bit RAW: drops alpha and swaps the
// R and B channels before the 3-channel store. 8 pixels per iteration;
// pix assumed a positive multiple of 8.
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vswp.u8 d1, d3 \n" // swap R, B
    "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
    "+r"(dst_raw), // %1
    "+r"(pix) // %2
    :
    : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
  );
}
michael@0 | 1035 | |
// Extracts the luma plane from packed YUY2 (Y0 U Y1 V): vld2 splits even
// bytes (Y) from odd bytes (UV) and only the Y vector is stored.
// 16 pixels per iteration; pix assumed a positive multiple of 16.
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
    "subs %2, %2, #16 \n" // 16 processed per loop.
    "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
    "bgt 1b \n"
    : "+r"(src_yuy2), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
    :
    : "cc", "memory", "q0", "q1" // Clobber List
  );
}
michael@0 | 1051 | |
// Extracts the luma plane from packed UYVY (U Y0 V Y1): in UYVY the Y
// samples sit at the odd byte positions, so the second deinterleaved
// vector (q1) is stored. 16 pixels per iteration; pix assumed a positive
// multiple of 16.
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
    "subs %2, %2, #16 \n" // 16 processed per loop.
    "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
    "bgt 1b \n"
    : "+r"(src_uyvy), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
    :
    : "cc", "memory", "q0", "q1" // Clobber List
  );
}
michael@0 | 1067 | |
// Splits the chroma of a YUY2 row into separate U and V planes at 422
// subsampling (one U and one V per 2 pixels). vld4 deinterleaves the
// Y0,U,Y1,V byte lanes; only the U (d1) and V (d3) lanes are kept.
// 16 pixels (8 UV pairs) per iteration; pix assumed a multiple of 16.
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
    "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
    "vst1.8 {d1}, [%1]! \n" // store 8 U.
    "vst1.8 {d3}, [%2]! \n" // store 8 V.
    "bgt 1b \n"
    : "+r"(src_yuy2), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(pix) // %3
    :
    : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
  );
}
michael@0 | 1086 | |
// Splits the chroma of a UYVY row into separate U and V planes at 422
// subsampling. Same as YUY2ToUV422Row_NEON but UYVY byte order puts U in
// lane d0 and V in lane d2. 16 pixels (8 UV pairs) per iteration;
// pix assumed a multiple of 16.
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
    "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
    "vst1.8 {d0}, [%1]! \n" // store 8 U.
    "vst1.8 {d2}, [%2]! \n" // store 8 V.
    "bgt 1b \n"
    : "+r"(src_uyvy), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(pix) // %3
    :
    : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
  );
}
michael@0 | 1105 | |
// Produces 420-subsampled U and V from two rows of YUY2: the chroma of the
// current row and the row stride_yuy2 bytes below are averaged with
// rounding (vrhadd). Note %1 is rewritten from a stride into the
// second-row pointer before the loop. 16 pixels (8 UV pairs) per
// iteration; pix assumed a multiple of 16.
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "add %1, %0, %1 \n" // stride + src_yuy2
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
    "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
    "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
    "vrhadd.u8 d1, d1, d5 \n" // average rows of U
    "vrhadd.u8 d3, d3, d7 \n" // average rows of V
    "vst1.8 {d1}, [%2]! \n" // store 8 U.
    "vst1.8 {d3}, [%3]! \n" // store 8 V.
    "bgt 1b \n"
    : "+r"(src_yuy2), // %0
    "+r"(stride_yuy2), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(pix) // %4
    :
    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
  );
}
michael@0 | 1129 | |
// Produces 420-subsampled U and V from two rows of UYVY, averaging the
// chroma of adjacent rows with rounding (vrhadd). Same as YUY2ToUVRow_NEON
// except UYVY puts U in lane d0 and V in lane d2; %1 is rewritten from a
// stride into the second-row pointer. 16 pixels (8 UV pairs) per
// iteration; pix assumed a multiple of 16.
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "add %1, %0, %1 \n" // stride + src_uyvy
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
    "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
    "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
    "vrhadd.u8 d0, d0, d4 \n" // average rows of U
    "vrhadd.u8 d2, d2, d6 \n" // average rows of V
    "vst1.8 {d0}, [%2]! \n" // store 8 U.
    "vst1.8 {d2}, [%3]! \n" // store 8 V.
    "bgt 1b \n"
    : "+r"(src_uyvy), // %0
    "+r"(stride_uyvy), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(pix) // %4
    :
    : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
  );
}
michael@0 | 1153 | |
// Vertically averages two rows with rounding: dst_uv[i] =
// (row[i] + row_below[i] + 1) / 2, where the second row is src_uv_stride
// bytes below the first. %1 is rewritten from a stride into the
// second-row pointer. 16 bytes per iteration; pix assumed a positive
// multiple of 16.
void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
                  uint8* dst_uv, int pix) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %0 \n"
    "1: \n"
    "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
    "subs %3, %3, #16 \n" // 16 processed per loop
    "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
    "vrhadd.u8 q0, q1 \n" // average row 1 and 2
    "vst1.8 {q0}, [%2]! \n"
    "bgt 1b \n"
    : "+r"(src_uv), // %0
    "+r"(src_uv_stride), // %1
    "+r"(dst_uv), // %2
    "+r"(pix) // %3
    :
    : "cc", "memory", "q0", "q1" // Clobber List
  );
}
michael@0 | 1174 | |
// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
// 'selector' packs four byte indices (one per output byte of a 4-pixel
// group) used by vtbl to pick channels out of 4 loaded ARGB pixels; only
// d6[0] is initialized, so just the low 32 bits of each vtbl result are
// meaningful, and vtrn.u32 packs those two valid halves into d4.
// 8 pixels per iteration; pix assumed a positive multiple of 8.
void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
                         uint32 selector, int pix) {
  asm volatile (
    "vmov.u32 d6[0], %3 \n" // selector
    "1: \n"
    "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop
    "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
    "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
    "vtrn.u32 d4, d5 \n" // combine 8 pixels
    "vst1.8 {d4}, [%1]! \n" // store 8.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
    "+r"(dst_bayer), // %1
    "+r"(pix) // %2
    : "r"(selector) // %3
    : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
  );
}
michael@0 | 1195 | |
// Select G channels from ARGB. e.g. GGGGGGGG
// Specialized Bayer path: vld4 deinterleaves B,G,R,A and only the G lane
// (d1) is stored; the selector parameter is unused. 8 pixels per
// iteration; pix assumed a positive multiple of 8.
void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
                           uint32 /*selector*/, int pix) {
  asm volatile (
    "1: \n"
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop
    "vst1.8 {d1}, [%1]! \n" // store 8 G's.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
    "+r"(dst_bayer), // %1
    "+r"(pix) // %2
    :
    : "cc", "memory", "q0", "q1" // Clobber List
  );
}
michael@0 | 1212 | |
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the bytes of each ARGB pixel via a 16-byte vtbl lookup table
// ('shuffler' holds byte indices into the loaded 16 bytes). 4 pixels per
// iteration; pix assumed a positive multiple of 4.
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  asm volatile (
    "vld1.8 {q2}, [%3] \n" // shuffler
    "1: \n"
    "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
    "subs %2, %2, #4 \n" // 4 processed per loop
    "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
    "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
    "vst1.8 {q1}, [%1]! \n" // store 4.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
    : "r"(shuffler) // %3
    : "cc", "memory", "q0", "q1", "q2" // Clobber List
  );
}
michael@0 | 1232 | |
// Packs planar I422 (Y plane plus half-width U and V planes) into YUY2:
// vld2 splits 16 Y's into even (d0) / odd (d2) samples and vst4
// interleaves them with U (d1) and V (d3) as Y0 U Y1 V. 16 pixels per
// iteration; width assumed a positive multiple of 16.
void I422ToYUY2Row_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_yuy2, int width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
    "vld1.8 {d1}, [%1]! \n" // load 8 Us
    "vld1.8 {d3}, [%2]! \n" // load 8 Vs
    "subs %4, %4, #16 \n" // 16 pixels
    "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
    "bgt 1b \n"
    : "+r"(src_y), // %0
    "+r"(src_u), // %1
    "+r"(src_v), // %2
    "+r"(dst_yuy2), // %3
    "+r"(width) // %4
    :
    : "cc", "memory", "d0", "d1", "d2", "d3"
  );
}
michael@0 | 1255 | |
// Packs planar I422 into UYVY: like I422ToYUY2Row_NEON, but the lanes are
// arranged so the store emits U Y0 V Y1 (U in d0, Ys in d1/d3, V in d2).
// 16 pixels per iteration; width assumed a positive multiple of 16.
void I422ToUYVYRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_uyvy, int width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
    "vld1.8 {d0}, [%1]! \n" // load 8 Us
    "vld1.8 {d2}, [%2]! \n" // load 8 Vs
    "subs %4, %4, #16 \n" // 16 pixels
    "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
    "bgt 1b \n"
    : "+r"(src_y), // %0
    "+r"(src_u), // %1
    "+r"(src_v), // %2
    "+r"(dst_uyvy), // %3
    "+r"(width) // %4
    :
    : "cc", "memory", "d0", "d1", "d2", "d3"
  );
}
michael@0 | 1278 | |
// Converts 32 bit ARGB to packed 16 bit RGB565 via the ARGBTORGB565 macro
// (defined earlier in this file; consumes d20-d23, result in q0).
// 8 pixels per iteration; pix assumed a positive multiple of 8.
void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGBTORGB565
    "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
    "+r"(dst_rgb565), // %1
    "+r"(pix) // %2
    :
    : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
  );
}
michael@0 | 1295 | |
// Converts 32 bit ARGB to packed 16 bit ARGB1555 via the ARGBTOARGB1555
// macro (defined earlier in this file; consumes d20-d23, result in q0).
// 8 pixels per iteration; pix assumed a positive multiple of 8.
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
                            int pix) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGBTOARGB1555
    "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
    "+r"(dst_argb1555), // %1
    "+r"(pix) // %2
    :
    : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
  );
}
michael@0 | 1313 | |
// Converts 32 bit ARGB to packed 16 bit ARGB4444 via the ARGBTOARGB4444
// macro (defined earlier in this file; consumes d20-d23 plus the d4 nibble
// mask set up below, result in q0). 8 pixels per iteration; pix assumed a
// positive multiple of 8.
void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
                            int pix) {
  asm volatile (
    "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    ARGBTOARGB4444
    "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
    "+r"(dst_argb4444), // %1
    "+r"(pix) // %2
    :
    : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
  );
}
michael@0 | 1332 | |
// Converts ARGB to a luma (Y) plane using fixed point coefficients scaled
// by 128 (13/128 = 0.1016 B, 65/128 = 0.5078 G, 33/128 = 0.2578 R), with a
// rounding narrow shift by 7 and a saturating +16 offset -- i.e. a
// limited-range Y. 8 pixels per iteration; pix assumed a positive
// multiple of 8.
void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
    "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
    "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
    "vmov.u8 d27, #16 \n" // Add 16 constant
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q2, d0, d24 \n" // B
    "vmlal.u8 q2, d1, d25 \n" // G
    "vmlal.u8 q2, d2, d26 \n" // R
    "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
    "vqadd.u8 d0, d27 \n"
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
  );
}
michael@0 | 1357 | |
// Converts ARGB to a full-range (JPeg-style) luma plane: coefficients
// 15/128 = 0.117 B, 75/128 = 0.586 G, 38/128 = 0.297 R, no +16 offset.
// 8 pixels per iteration; pix assumed a positive multiple of 8.
void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
    "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
    "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
    "subs %2, %2, #8 \n" // 8 processed per loop.
    "vmull.u8 q2, d0, d24 \n" // B
    "vmlal.u8 q2, d1, d25 \n" // G
    "vmlal.u8 q2, d2, d26 \n" // R
    "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
    "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
    "+r"(dst_y), // %1
    "+r"(pix) // %2
    :
    : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
  );
}
michael@0 | 1380 | |
michael@0 | 1381 | // 8x1 pixels. |
michael@0 | 1382 | void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |
michael@0 | 1383 | int pix) { |
michael@0 | 1384 | asm volatile ( |
michael@0 | 1385 | "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient |
michael@0 | 1386 | "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient |
michael@0 | 1387 | "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient |
michael@0 | 1388 | "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient |
michael@0 | 1389 | "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient |
michael@0 | 1390 | "vmov.u16 q15, #0x8080 \n" // 128.5 |
michael@0 | 1391 | ".p2align 2 \n" |
michael@0 | 1392 | "1: \n" |
michael@0 | 1393 | "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. |
michael@0 | 1394 | "subs %3, %3, #8 \n" // 8 processed per loop. |
michael@0 | 1395 | "vmull.u8 q2, d0, d24 \n" // B |
michael@0 | 1396 | "vmlsl.u8 q2, d1, d25 \n" // G |
michael@0 | 1397 | "vmlsl.u8 q2, d2, d26 \n" // R |
michael@0 | 1398 | "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned |
michael@0 | 1399 | |
michael@0 | 1400 | "vmull.u8 q3, d2, d24 \n" // R |
michael@0 | 1401 | "vmlsl.u8 q3, d1, d28 \n" // G |
michael@0 | 1402 | "vmlsl.u8 q3, d0, d27 \n" // B |
michael@0 | 1403 | "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned |
michael@0 | 1404 | |
michael@0 | 1405 | "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U |
michael@0 | 1406 | "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V |
michael@0 | 1407 | |
michael@0 | 1408 | "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. |
michael@0 | 1409 | "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. |
michael@0 | 1410 | "bgt 1b \n" |
michael@0 | 1411 | : "+r"(src_argb), // %0 |
michael@0 | 1412 | "+r"(dst_u), // %1 |
michael@0 | 1413 | "+r"(dst_v), // %2 |
michael@0 | 1414 | "+r"(pix) // %3 |
michael@0 | 1415 | : |
michael@0 | 1416 | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15" |
michael@0 | 1417 | ); |
michael@0 | 1418 | } |
michael@0 | 1419 | |
// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
// Computes 422-subsampled U and V: vpaddl sums horizontally adjacent
// pixel pairs into 16 bit lanes, so the coefficients are pre-halved
// (#112 / 2 etc.) to keep the same overall scale as the 444 path.
// 16 ARGB pixels -> 8 U + 8 V per iteration; pix assumed a positive
// multiple of 16.
void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
    "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
    "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
    "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
    "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
    "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
    "vmov.u16 q15, #0x8080 \n" // 128.5
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.

    "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.

    "subs %3, %3, #16 \n" // 16 processed per loop.
    "vmul.s16 q8, q0, q10 \n" // B
    "vmls.s16 q8, q1, q11 \n" // G
    "vmls.s16 q8, q2, q12 \n" // R
    "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned

    "vmul.s16 q9, q2, q10 \n" // R
    "vmls.s16 q9, q1, q14 \n" // G
    "vmls.s16 q9, q0, q13 \n" // B
    "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned

    "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
    "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V

    "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
    "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
    "bgt 1b \n"
    : "+r"(src_argb), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+r"(pix) // %3
    :
    : "cc", "memory", "q0", "q1", "q2", "q3",
    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
michael@0 | 1465 | |
michael@0 | 1466 | // 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. |
michael@0 | 1467 | void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |
michael@0 | 1468 | int pix) { |
michael@0 | 1469 | asm volatile ( |
michael@0 | 1470 | "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient |
michael@0 | 1471 | "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient |
michael@0 | 1472 | "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient |
michael@0 | 1473 | "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient |
michael@0 | 1474 | "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient |
michael@0 | 1475 | "vmov.u16 q15, #0x8080 \n" // 128.5 |
michael@0 | 1476 | ".p2align 2 \n" |
michael@0 | 1477 | "1: \n" |
michael@0 | 1478 | "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. |
michael@0 | 1479 | "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. |
michael@0 | 1480 | "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. |
michael@0 | 1481 | "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. |
michael@0 | 1482 | "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. |
michael@0 | 1483 | "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. |
michael@0 | 1484 | "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. |
michael@0 | 1485 | "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. |
michael@0 | 1486 | "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. |
michael@0 | 1487 | "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts. |
michael@0 | 1488 | |
michael@0 | 1489 | "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts. |
michael@0 | 1490 | "vpadd.u16 d1, d8, d9 \n" // B |
michael@0 | 1491 | "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts. |
michael@0 | 1492 | "vpadd.u16 d3, d10, d11 \n" // G |
michael@0 | 1493 | "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts. |
michael@0 | 1494 | "vpadd.u16 d5, d12, d13 \n" // R |
michael@0 | 1495 | |
michael@0 | 1496 | "vrshr.u16 q0, q0, #1 \n" // 2x average |
michael@0 | 1497 | "vrshr.u16 q1, q1, #1 \n" |
michael@0 | 1498 | "vrshr.u16 q2, q2, #1 \n" |
michael@0 | 1499 | |
michael@0 | 1500 | "subs %3, %3, #32 \n" // 32 processed per loop. |
michael@0 | 1501 | "vmul.s16 q8, q0, q10 \n" // B |
michael@0 | 1502 | "vmls.s16 q8, q1, q11 \n" // G |
michael@0 | 1503 | "vmls.s16 q8, q2, q12 \n" // R |
michael@0 | 1504 | "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned |
michael@0 | 1505 | "vmul.s16 q9, q2, q10 \n" // R |
michael@0 | 1506 | "vmls.s16 q9, q1, q14 \n" // G |
michael@0 | 1507 | "vmls.s16 q9, q0, q13 \n" // B |
michael@0 | 1508 | "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned |
michael@0 | 1509 | "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U |
michael@0 | 1510 | "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V |
michael@0 | 1511 | "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. |
michael@0 | 1512 | "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. |
michael@0 | 1513 | "bgt 1b \n" |
michael@0 | 1514 | : "+r"(src_argb), // %0 |
michael@0 | 1515 | "+r"(dst_u), // %1 |
michael@0 | 1516 | "+r"(dst_v), // %2 |
michael@0 | 1517 | "+r"(pix) // %3 |
michael@0 | 1518 | : |
michael@0 | 1519 | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |
michael@0 | 1520 | "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |
michael@0 | 1521 | ); |
michael@0 | 1522 | } |
michael@0 | 1523 | |
michael@0 | 1524 | // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. |
michael@0 | 1525 | #define RGBTOUV(QB, QG, QR) \ |
michael@0 | 1526 | "vmul.s16 q8, " #QB ", q10 \n" /* B */ \ |
michael@0 | 1527 | "vmls.s16 q8, " #QG ", q11 \n" /* G */ \ |
michael@0 | 1528 | "vmls.s16 q8, " #QR ", q12 \n" /* R */ \ |
michael@0 | 1529 | "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ |
michael@0 | 1530 | "vmul.s16 q9, " #QR ", q10 \n" /* R */ \ |
michael@0 | 1531 | "vmls.s16 q9, " #QG ", q14 \n" /* G */ \ |
michael@0 | 1532 | "vmls.s16 q9, " #QB ", q13 \n" /* B */ \ |
michael@0 | 1533 | "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ |
michael@0 | 1534 | "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ |
michael@0 | 1535 | "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ |
michael@0 | 1536 | |
michael@0 | 1537 | // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. |
michael@0 | 1538 | void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, |
michael@0 | 1539 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 1540 | asm volatile ( |
michael@0 | 1541 | "add %1, %0, %1 \n" // src_stride + src_argb |
michael@0 | 1542 | "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient |
michael@0 | 1543 | "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient |
michael@0 | 1544 | "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient |
michael@0 | 1545 | "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient |
michael@0 | 1546 | "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient |
michael@0 | 1547 | "vmov.u16 q15, #0x8080 \n" // 128.5 |
michael@0 | 1548 | ".p2align 2 \n" |
michael@0 | 1549 | "1: \n" |
michael@0 | 1550 | "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. |
michael@0 | 1551 | "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. |
michael@0 | 1552 | "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. |
michael@0 | 1553 | "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. |
michael@0 | 1554 | "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. |
michael@0 | 1555 | "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. |
michael@0 | 1556 | "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. |
michael@0 | 1557 | "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. |
michael@0 | 1558 | "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. |
michael@0 | 1559 | "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. |
michael@0 | 1560 | |
michael@0 | 1561 | "vrshr.u16 q0, q0, #1 \n" // 2x average |
michael@0 | 1562 | "vrshr.u16 q1, q1, #1 \n" |
michael@0 | 1563 | "vrshr.u16 q2, q2, #1 \n" |
michael@0 | 1564 | |
michael@0 | 1565 | "subs %4, %4, #16 \n" // 32 processed per loop. |
michael@0 | 1566 | RGBTOUV(q0, q1, q2) |
michael@0 | 1567 | "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. |
michael@0 | 1568 | "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. |
michael@0 | 1569 | "bgt 1b \n" |
michael@0 | 1570 | : "+r"(src_argb), // %0 |
michael@0 | 1571 | "+r"(src_stride_argb), // %1 |
michael@0 | 1572 | "+r"(dst_u), // %2 |
michael@0 | 1573 | "+r"(dst_v), // %3 |
michael@0 | 1574 | "+r"(pix) // %4 |
michael@0 | 1575 | : |
michael@0 | 1576 | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |
michael@0 | 1577 | "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |
michael@0 | 1578 | ); |
michael@0 | 1579 | } |
michael@0 | 1580 | |
michael@0 | 1581 | // TODO(fbarchard): Subsample match C code. |
michael@0 | 1582 | void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, |
michael@0 | 1583 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 1584 | asm volatile ( |
michael@0 | 1585 | "add %1, %0, %1 \n" // src_stride + src_argb |
michael@0 | 1586 | "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient |
michael@0 | 1587 | "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient |
michael@0 | 1588 | "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient |
michael@0 | 1589 | "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient |
michael@0 | 1590 | "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient |
michael@0 | 1591 | "vmov.u16 q15, #0x8080 \n" // 128.5 |
michael@0 | 1592 | ".p2align 2 \n" |
michael@0 | 1593 | "1: \n" |
michael@0 | 1594 | "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. |
michael@0 | 1595 | "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. |
michael@0 | 1596 | "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. |
michael@0 | 1597 | "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. |
michael@0 | 1598 | "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. |
michael@0 | 1599 | "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. |
michael@0 | 1600 | "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. |
michael@0 | 1601 | "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. |
michael@0 | 1602 | "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. |
michael@0 | 1603 | "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. |
michael@0 | 1604 | |
michael@0 | 1605 | "vrshr.u16 q0, q0, #1 \n" // 2x average |
michael@0 | 1606 | "vrshr.u16 q1, q1, #1 \n" |
michael@0 | 1607 | "vrshr.u16 q2, q2, #1 \n" |
michael@0 | 1608 | |
michael@0 | 1609 | "subs %4, %4, #16 \n" // 32 processed per loop. |
michael@0 | 1610 | RGBTOUV(q0, q1, q2) |
michael@0 | 1611 | "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. |
michael@0 | 1612 | "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. |
michael@0 | 1613 | "bgt 1b \n" |
michael@0 | 1614 | : "+r"(src_argb), // %0 |
michael@0 | 1615 | "+r"(src_stride_argb), // %1 |
michael@0 | 1616 | "+r"(dst_u), // %2 |
michael@0 | 1617 | "+r"(dst_v), // %3 |
michael@0 | 1618 | "+r"(pix) // %4 |
michael@0 | 1619 | : |
michael@0 | 1620 | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |
michael@0 | 1621 | "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |
michael@0 | 1622 | ); |
michael@0 | 1623 | } |
michael@0 | 1624 | |
michael@0 | 1625 | void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, |
michael@0 | 1626 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 1627 | asm volatile ( |
michael@0 | 1628 | "add %1, %0, %1 \n" // src_stride + src_bgra |
michael@0 | 1629 | "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient |
michael@0 | 1630 | "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient |
michael@0 | 1631 | "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient |
michael@0 | 1632 | "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient |
michael@0 | 1633 | "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient |
michael@0 | 1634 | "vmov.u16 q15, #0x8080 \n" // 128.5 |
michael@0 | 1635 | ".p2align 2 \n" |
michael@0 | 1636 | "1: \n" |
michael@0 | 1637 | "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. |
michael@0 | 1638 | "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. |
michael@0 | 1639 | "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. |
michael@0 | 1640 | "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. |
michael@0 | 1641 | "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. |
michael@0 | 1642 | "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. |
michael@0 | 1643 | "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. |
michael@0 | 1644 | "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. |
michael@0 | 1645 | "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. |
michael@0 | 1646 | "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts. |
michael@0 | 1647 | |
michael@0 | 1648 | "vrshr.u16 q1, q1, #1 \n" // 2x average |
michael@0 | 1649 | "vrshr.u16 q2, q2, #1 \n" |
michael@0 | 1650 | "vrshr.u16 q3, q3, #1 \n" |
michael@0 | 1651 | |
michael@0 | 1652 | "subs %4, %4, #16 \n" // 32 processed per loop. |
michael@0 | 1653 | RGBTOUV(q3, q2, q1) |
michael@0 | 1654 | "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. |
michael@0 | 1655 | "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. |
michael@0 | 1656 | "bgt 1b \n" |
michael@0 | 1657 | : "+r"(src_bgra), // %0 |
michael@0 | 1658 | "+r"(src_stride_bgra), // %1 |
michael@0 | 1659 | "+r"(dst_u), // %2 |
michael@0 | 1660 | "+r"(dst_v), // %3 |
michael@0 | 1661 | "+r"(pix) // %4 |
michael@0 | 1662 | : |
michael@0 | 1663 | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |
michael@0 | 1664 | "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |
michael@0 | 1665 | ); |
michael@0 | 1666 | } |
michael@0 | 1667 | |
michael@0 | 1668 | void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, |
michael@0 | 1669 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 1670 | asm volatile ( |
michael@0 | 1671 | "add %1, %0, %1 \n" // src_stride + src_abgr |
michael@0 | 1672 | "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient |
michael@0 | 1673 | "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient |
michael@0 | 1674 | "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient |
michael@0 | 1675 | "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient |
michael@0 | 1676 | "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient |
michael@0 | 1677 | "vmov.u16 q15, #0x8080 \n" // 128.5 |
michael@0 | 1678 | ".p2align 2 \n" |
michael@0 | 1679 | "1: \n" |
michael@0 | 1680 | "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. |
michael@0 | 1681 | "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. |
michael@0 | 1682 | "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. |
michael@0 | 1683 | "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. |
michael@0 | 1684 | "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. |
michael@0 | 1685 | "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. |
michael@0 | 1686 | "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. |
michael@0 | 1687 | "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. |
michael@0 | 1688 | "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. |
michael@0 | 1689 | "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. |
michael@0 | 1690 | |
michael@0 | 1691 | "vrshr.u16 q0, q0, #1 \n" // 2x average |
michael@0 | 1692 | "vrshr.u16 q1, q1, #1 \n" |
michael@0 | 1693 | "vrshr.u16 q2, q2, #1 \n" |
michael@0 | 1694 | |
michael@0 | 1695 | "subs %4, %4, #16 \n" // 32 processed per loop. |
michael@0 | 1696 | RGBTOUV(q2, q1, q0) |
michael@0 | 1697 | "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. |
michael@0 | 1698 | "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. |
michael@0 | 1699 | "bgt 1b \n" |
michael@0 | 1700 | : "+r"(src_abgr), // %0 |
michael@0 | 1701 | "+r"(src_stride_abgr), // %1 |
michael@0 | 1702 | "+r"(dst_u), // %2 |
michael@0 | 1703 | "+r"(dst_v), // %3 |
michael@0 | 1704 | "+r"(pix) // %4 |
michael@0 | 1705 | : |
michael@0 | 1706 | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |
michael@0 | 1707 | "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |
michael@0 | 1708 | ); |
michael@0 | 1709 | } |
michael@0 | 1710 | |
michael@0 | 1711 | void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, |
michael@0 | 1712 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 1713 | asm volatile ( |
michael@0 | 1714 | "add %1, %0, %1 \n" // src_stride + src_rgba |
michael@0 | 1715 | "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient |
michael@0 | 1716 | "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient |
michael@0 | 1717 | "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient |
michael@0 | 1718 | "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient |
michael@0 | 1719 | "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient |
michael@0 | 1720 | "vmov.u16 q15, #0x8080 \n" // 128.5 |
michael@0 | 1721 | ".p2align 2 \n" |
michael@0 | 1722 | "1: \n" |
michael@0 | 1723 | "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. |
michael@0 | 1724 | "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. |
michael@0 | 1725 | "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. |
michael@0 | 1726 | "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. |
michael@0 | 1727 | "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. |
michael@0 | 1728 | "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. |
michael@0 | 1729 | "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. |
michael@0 | 1730 | "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. |
michael@0 | 1731 | "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. |
michael@0 | 1732 | "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. |
michael@0 | 1733 | |
michael@0 | 1734 | "vrshr.u16 q0, q0, #1 \n" // 2x average |
michael@0 | 1735 | "vrshr.u16 q1, q1, #1 \n" |
michael@0 | 1736 | "vrshr.u16 q2, q2, #1 \n" |
michael@0 | 1737 | |
michael@0 | 1738 | "subs %4, %4, #16 \n" // 32 processed per loop. |
michael@0 | 1739 | RGBTOUV(q0, q1, q2) |
michael@0 | 1740 | "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. |
michael@0 | 1741 | "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. |
michael@0 | 1742 | "bgt 1b \n" |
michael@0 | 1743 | : "+r"(src_rgba), // %0 |
michael@0 | 1744 | "+r"(src_stride_rgba), // %1 |
michael@0 | 1745 | "+r"(dst_u), // %2 |
michael@0 | 1746 | "+r"(dst_v), // %3 |
michael@0 | 1747 | "+r"(pix) // %4 |
michael@0 | 1748 | : |
michael@0 | 1749 | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |
michael@0 | 1750 | "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |
michael@0 | 1751 | ); |
michael@0 | 1752 | } |
michael@0 | 1753 | |
michael@0 | 1754 | void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, |
michael@0 | 1755 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 1756 | asm volatile ( |
michael@0 | 1757 | "add %1, %0, %1 \n" // src_stride + src_rgb24 |
michael@0 | 1758 | "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient |
michael@0 | 1759 | "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient |
michael@0 | 1760 | "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient |
michael@0 | 1761 | "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient |
michael@0 | 1762 | "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient |
michael@0 | 1763 | "vmov.u16 q15, #0x8080 \n" // 128.5 |
michael@0 | 1764 | ".p2align 2 \n" |
michael@0 | 1765 | "1: \n" |
michael@0 | 1766 | "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. |
michael@0 | 1767 | "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. |
michael@0 | 1768 | "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. |
michael@0 | 1769 | "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. |
michael@0 | 1770 | "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. |
michael@0 | 1771 | "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. |
michael@0 | 1772 | "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. |
michael@0 | 1773 | "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. |
michael@0 | 1774 | "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. |
michael@0 | 1775 | "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. |
michael@0 | 1776 | |
michael@0 | 1777 | "vrshr.u16 q0, q0, #1 \n" // 2x average |
michael@0 | 1778 | "vrshr.u16 q1, q1, #1 \n" |
michael@0 | 1779 | "vrshr.u16 q2, q2, #1 \n" |
michael@0 | 1780 | |
michael@0 | 1781 | "subs %4, %4, #16 \n" // 32 processed per loop. |
michael@0 | 1782 | RGBTOUV(q0, q1, q2) |
michael@0 | 1783 | "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. |
michael@0 | 1784 | "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. |
michael@0 | 1785 | "bgt 1b \n" |
michael@0 | 1786 | : "+r"(src_rgb24), // %0 |
michael@0 | 1787 | "+r"(src_stride_rgb24), // %1 |
michael@0 | 1788 | "+r"(dst_u), // %2 |
michael@0 | 1789 | "+r"(dst_v), // %3 |
michael@0 | 1790 | "+r"(pix) // %4 |
michael@0 | 1791 | : |
michael@0 | 1792 | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |
michael@0 | 1793 | "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |
michael@0 | 1794 | ); |
michael@0 | 1795 | } |
michael@0 | 1796 | |
michael@0 | 1797 | void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, |
michael@0 | 1798 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 1799 | asm volatile ( |
michael@0 | 1800 | "add %1, %0, %1 \n" // src_stride + src_raw |
michael@0 | 1801 | "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient |
michael@0 | 1802 | "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient |
michael@0 | 1803 | "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient |
michael@0 | 1804 | "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient |
michael@0 | 1805 | "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient |
michael@0 | 1806 | "vmov.u16 q15, #0x8080 \n" // 128.5 |
michael@0 | 1807 | ".p2align 2 \n" |
michael@0 | 1808 | "1: \n" |
michael@0 | 1809 | "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. |
michael@0 | 1810 | "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. |
michael@0 | 1811 | "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. |
michael@0 | 1812 | "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. |
michael@0 | 1813 | "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. |
michael@0 | 1814 | "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. |
michael@0 | 1815 | "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. |
michael@0 | 1816 | "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. |
michael@0 | 1817 | "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. |
michael@0 | 1818 | "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. |
michael@0 | 1819 | |
michael@0 | 1820 | "vrshr.u16 q0, q0, #1 \n" // 2x average |
michael@0 | 1821 | "vrshr.u16 q1, q1, #1 \n" |
michael@0 | 1822 | "vrshr.u16 q2, q2, #1 \n" |
michael@0 | 1823 | |
michael@0 | 1824 | "subs %4, %4, #16 \n" // 32 processed per loop. |
michael@0 | 1825 | RGBTOUV(q2, q1, q0) |
michael@0 | 1826 | "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. |
michael@0 | 1827 | "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. |
michael@0 | 1828 | "bgt 1b \n" |
michael@0 | 1829 | : "+r"(src_raw), // %0 |
michael@0 | 1830 | "+r"(src_stride_raw), // %1 |
michael@0 | 1831 | "+r"(dst_u), // %2 |
michael@0 | 1832 | "+r"(dst_v), // %3 |
michael@0 | 1833 | "+r"(pix) // %4 |
michael@0 | 1834 | : |
michael@0 | 1835 | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |
michael@0 | 1836 | "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |
michael@0 | 1837 | ); |
michael@0 | 1838 | } |
michael@0 | 1839 | |
michael@0 | 1840 | // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. |
michael@0 | 1841 | void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, |
michael@0 | 1842 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 1843 | asm volatile ( |
michael@0 | 1844 | "add %1, %0, %1 \n" // src_stride + src_argb |
michael@0 | 1845 | "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient |
michael@0 | 1846 | "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient |
michael@0 | 1847 | "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient |
michael@0 | 1848 | "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient |
michael@0 | 1849 | "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient |
michael@0 | 1850 | "vmov.u16 q15, #0x8080 \n" // 128.5 |
michael@0 | 1851 | ".p2align 2 \n" |
michael@0 | 1852 | "1: \n" |
michael@0 | 1853 | "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. |
michael@0 | 1854 | RGB565TOARGB |
michael@0 | 1855 | "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. |
michael@0 | 1856 | "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. |
michael@0 | 1857 | "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. |
michael@0 | 1858 | "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. |
michael@0 | 1859 | RGB565TOARGB |
michael@0 | 1860 | "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. |
michael@0 | 1861 | "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. |
michael@0 | 1862 | "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. |
michael@0 | 1863 | |
michael@0 | 1864 | "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. |
michael@0 | 1865 | RGB565TOARGB |
michael@0 | 1866 | "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. |
michael@0 | 1867 | "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. |
michael@0 | 1868 | "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. |
michael@0 | 1869 | "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. |
michael@0 | 1870 | RGB565TOARGB |
michael@0 | 1871 | "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. |
michael@0 | 1872 | "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. |
michael@0 | 1873 | "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. |
michael@0 | 1874 | |
michael@0 | 1875 | "vrshr.u16 q4, q4, #1 \n" // 2x average |
michael@0 | 1876 | "vrshr.u16 q5, q5, #1 \n" |
michael@0 | 1877 | "vrshr.u16 q6, q6, #1 \n" |
michael@0 | 1878 | |
michael@0 | 1879 | "subs %4, %4, #16 \n" // 16 processed per loop. |
michael@0 | 1880 | "vmul.s16 q8, q4, q10 \n" // B |
michael@0 | 1881 | "vmls.s16 q8, q5, q11 \n" // G |
michael@0 | 1882 | "vmls.s16 q8, q6, q12 \n" // R |
michael@0 | 1883 | "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned |
michael@0 | 1884 | "vmul.s16 q9, q6, q10 \n" // R |
michael@0 | 1885 | "vmls.s16 q9, q5, q14 \n" // G |
michael@0 | 1886 | "vmls.s16 q9, q4, q13 \n" // B |
michael@0 | 1887 | "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned |
michael@0 | 1888 | "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U |
michael@0 | 1889 | "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V |
michael@0 | 1890 | "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. |
michael@0 | 1891 | "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. |
michael@0 | 1892 | "bgt 1b \n" |
michael@0 | 1893 | : "+r"(src_rgb565), // %0 |
michael@0 | 1894 | "+r"(src_stride_rgb565), // %1 |
michael@0 | 1895 | "+r"(dst_u), // %2 |
michael@0 | 1896 | "+r"(dst_v), // %3 |
michael@0 | 1897 | "+r"(pix) // %4 |
michael@0 | 1898 | : |
michael@0 | 1899 | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |
michael@0 | 1900 | "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |
michael@0 | 1901 | ); |
michael@0 | 1902 | } |
michael@0 | 1903 | |
michael@0 | 1904 | // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. |
michael@0 | 1905 | void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, |
michael@0 | 1906 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 1907 | asm volatile ( |
michael@0 | 1908 | "add %1, %0, %1 \n" // src_stride + src_argb |
michael@0 | 1909 | "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient |
michael@0 | 1910 | "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient |
michael@0 | 1911 | "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient |
michael@0 | 1912 | "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient |
michael@0 | 1913 | "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient |
michael@0 | 1914 | "vmov.u16 q15, #0x8080 \n" // 128.5 |
michael@0 | 1915 | ".p2align 2 \n" |
michael@0 | 1916 | "1: \n" |
michael@0 | 1917 | "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. |
michael@0 | 1918 | RGB555TOARGB |
michael@0 | 1919 | "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. |
michael@0 | 1920 | "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. |
michael@0 | 1921 | "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. |
michael@0 | 1922 | "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. |
michael@0 | 1923 | RGB555TOARGB |
michael@0 | 1924 | "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. |
michael@0 | 1925 | "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. |
michael@0 | 1926 | "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. |
michael@0 | 1927 | |
michael@0 | 1928 | "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. |
michael@0 | 1929 | RGB555TOARGB |
michael@0 | 1930 | "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. |
michael@0 | 1931 | "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. |
michael@0 | 1932 | "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. |
michael@0 | 1933 | "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. |
michael@0 | 1934 | RGB555TOARGB |
michael@0 | 1935 | "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. |
michael@0 | 1936 | "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. |
michael@0 | 1937 | "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. |
michael@0 | 1938 | |
michael@0 | 1939 | "vrshr.u16 q4, q4, #1 \n" // 2x average |
michael@0 | 1940 | "vrshr.u16 q5, q5, #1 \n" |
michael@0 | 1941 | "vrshr.u16 q6, q6, #1 \n" |
michael@0 | 1942 | |
michael@0 | 1943 | "subs %4, %4, #16 \n" // 16 processed per loop. |
michael@0 | 1944 | "vmul.s16 q8, q4, q10 \n" // B |
michael@0 | 1945 | "vmls.s16 q8, q5, q11 \n" // G |
michael@0 | 1946 | "vmls.s16 q8, q6, q12 \n" // R |
michael@0 | 1947 | "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned |
michael@0 | 1948 | "vmul.s16 q9, q6, q10 \n" // R |
michael@0 | 1949 | "vmls.s16 q9, q5, q14 \n" // G |
michael@0 | 1950 | "vmls.s16 q9, q4, q13 \n" // B |
michael@0 | 1951 | "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned |
michael@0 | 1952 | "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U |
michael@0 | 1953 | "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V |
michael@0 | 1954 | "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. |
michael@0 | 1955 | "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. |
michael@0 | 1956 | "bgt 1b \n" |
michael@0 | 1957 | : "+r"(src_argb1555), // %0 |
michael@0 | 1958 | "+r"(src_stride_argb1555), // %1 |
michael@0 | 1959 | "+r"(dst_u), // %2 |
michael@0 | 1960 | "+r"(dst_v), // %3 |
michael@0 | 1961 | "+r"(pix) // %4 |
michael@0 | 1962 | : |
michael@0 | 1963 | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |
michael@0 | 1964 | "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |
michael@0 | 1965 | ); |
michael@0 | 1966 | } |
michael@0 | 1967 | |
michael@0 | 1968 | // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. |
michael@0 | 1969 | void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, |
michael@0 | 1970 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 1971 | asm volatile ( |
michael@0 | 1972 | "add %1, %0, %1 \n" // src_stride + src_argb |
michael@0 | 1973 | "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient |
michael@0 | 1974 | "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient |
michael@0 | 1975 | "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient |
michael@0 | 1976 | "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient |
michael@0 | 1977 | "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient |
michael@0 | 1978 | "vmov.u16 q15, #0x8080 \n" // 128.5 |
michael@0 | 1979 | ".p2align 2 \n" |
michael@0 | 1980 | "1: \n" |
michael@0 | 1981 | "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. |
michael@0 | 1982 | ARGB4444TOARGB |
michael@0 | 1983 | "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. |
michael@0 | 1984 | "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. |
michael@0 | 1985 | "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. |
michael@0 | 1986 | "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. |
michael@0 | 1987 | ARGB4444TOARGB |
michael@0 | 1988 | "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. |
michael@0 | 1989 | "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. |
michael@0 | 1990 | "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. |
michael@0 | 1991 | |
michael@0 | 1992 | "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. |
michael@0 | 1993 | ARGB4444TOARGB |
michael@0 | 1994 | "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. |
michael@0 | 1995 | "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. |
michael@0 | 1996 | "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. |
michael@0 | 1997 | "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. |
michael@0 | 1998 | ARGB4444TOARGB |
michael@0 | 1999 | "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. |
michael@0 | 2000 | "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. |
michael@0 | 2001 | "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. |
michael@0 | 2002 | |
michael@0 | 2003 | "vrshr.u16 q4, q4, #1 \n" // 2x average |
michael@0 | 2004 | "vrshr.u16 q5, q5, #1 \n" |
michael@0 | 2005 | "vrshr.u16 q6, q6, #1 \n" |
michael@0 | 2006 | |
michael@0 | 2007 | "subs %4, %4, #16 \n" // 16 processed per loop. |
michael@0 | 2008 | "vmul.s16 q8, q4, q10 \n" // B |
michael@0 | 2009 | "vmls.s16 q8, q5, q11 \n" // G |
michael@0 | 2010 | "vmls.s16 q8, q6, q12 \n" // R |
michael@0 | 2011 | "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned |
michael@0 | 2012 | "vmul.s16 q9, q6, q10 \n" // R |
michael@0 | 2013 | "vmls.s16 q9, q5, q14 \n" // G |
michael@0 | 2014 | "vmls.s16 q9, q4, q13 \n" // B |
michael@0 | 2015 | "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned |
michael@0 | 2016 | "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U |
michael@0 | 2017 | "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V |
michael@0 | 2018 | "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. |
michael@0 | 2019 | "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. |
michael@0 | 2020 | "bgt 1b \n" |
michael@0 | 2021 | : "+r"(src_argb4444), // %0 |
michael@0 | 2022 | "+r"(src_stride_argb4444), // %1 |
michael@0 | 2023 | "+r"(dst_u), // %2 |
michael@0 | 2024 | "+r"(dst_v), // %3 |
michael@0 | 2025 | "+r"(pix) // %4 |
michael@0 | 2026 | : |
michael@0 | 2027 | : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", |
michael@0 | 2028 | "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" |
michael@0 | 2029 | ); |
michael@0 | 2030 | } |
michael@0 | 2031 | |
michael@0 | 2032 | void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { |
michael@0 | 2033 |   asm volatile ( |
michael@0 | 2034 |     "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient |
michael@0 | 2035 |     "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient |
michael@0 | 2036 |     "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient |
michael@0 | 2037 |     "vmov.u8    d27, #16                       \n"  // Add 16 constant |
michael@0 | 2038 |     ".p2align   2                              \n" |
michael@0 | 2039 |   "1:                                          \n" |
michael@0 | 2040 |     "vld1.8     {q0}, [%0]!                    \n"  // load 8 RGB565 pixels. |
michael@0 | 2041 |     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |
michael@0 | 2042 |     RGB565TOARGB |
michael@0 | 2043 |     "vmull.u8   q2, d0, d24                    \n"  // B |
michael@0 | 2044 |     "vmlal.u8   q2, d1, d25                    \n"  // G |
michael@0 | 2045 |     "vmlal.u8   q2, d2, d26                    \n"  // R |
michael@0 | 2046 |     "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y |
michael@0 | 2047 |     "vqadd.u8   d0, d27                        \n"  // += 16 (Y offset) |
michael@0 | 2048 |     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y. |
michael@0 | 2049 |     "bgt        1b                             \n" |
michael@0 | 2050 |   : "+r"(src_rgb565),  // %0 |
michael@0 | 2051 |     "+r"(dst_y),       // %1 |
michael@0 | 2052 |     "+r"(pix)          // %2 |
michael@0 | 2053 |   : |
michael@0 | 2054 |   : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" |
michael@0 | 2055 |   ); |
michael@0 | 2056 | } |
michael@0 | 2057 | |
michael@0 | 2058 | void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { |
michael@0 | 2059 |   asm volatile ( |
michael@0 | 2060 |     "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient |
michael@0 | 2061 |     "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient |
michael@0 | 2062 |     "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient |
michael@0 | 2063 |     "vmov.u8    d27, #16                       \n"  // Add 16 constant |
michael@0 | 2064 |     ".p2align   2                              \n" |
michael@0 | 2065 |   "1:                                          \n" |
michael@0 | 2066 |     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB1555 pixels. |
michael@0 | 2067 |     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |
michael@0 | 2068 |     ARGB1555TOARGB |
michael@0 | 2069 |     "vmull.u8   q2, d0, d24                    \n"  // B |
michael@0 | 2070 |     "vmlal.u8   q2, d1, d25                    \n"  // G |
michael@0 | 2071 |     "vmlal.u8   q2, d2, d26                    \n"  // R |
michael@0 | 2072 |     "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y |
michael@0 | 2073 |     "vqadd.u8   d0, d27                        \n"  // += 16 (Y offset) |
michael@0 | 2074 |     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y. |
michael@0 | 2075 |     "bgt        1b                             \n" |
michael@0 | 2076 |   : "+r"(src_argb1555),  // %0 |
michael@0 | 2077 |     "+r"(dst_y),         // %1 |
michael@0 | 2078 |     "+r"(pix)            // %2 |
michael@0 | 2079 |   : |
michael@0 | 2080 |   : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" |
michael@0 | 2081 |   ); |
michael@0 | 2082 | } |
michael@0 | 2083 | |
michael@0 | 2084 | void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { |
michael@0 | 2085 |   asm volatile ( |
michael@0 | 2086 |     "vmov.u8    d24, #13                       \n"  // B * 0.1016 coefficient |
michael@0 | 2087 |     "vmov.u8    d25, #65                       \n"  // G * 0.5078 coefficient |
michael@0 | 2088 |     "vmov.u8    d26, #33                       \n"  // R * 0.2578 coefficient |
michael@0 | 2089 |     "vmov.u8    d27, #16                       \n"  // Add 16 constant |
michael@0 | 2090 |     ".p2align   2                              \n" |
michael@0 | 2091 |   "1:                                          \n" |
michael@0 | 2092 |     "vld1.8     {q0}, [%0]!                    \n"  // load 8 ARGB4444 pixels. |
michael@0 | 2093 |     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |
michael@0 | 2094 |     ARGB4444TOARGB |
michael@0 | 2095 |     "vmull.u8   q2, d0, d24                    \n"  // B |
michael@0 | 2096 |     "vmlal.u8   q2, d1, d25                    \n"  // G |
michael@0 | 2097 |     "vmlal.u8   q2, d2, d26                    \n"  // R |
michael@0 | 2098 |     "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit Y |
michael@0 | 2099 |     "vqadd.u8   d0, d27                        \n"  // += 16 (Y offset) |
michael@0 | 2100 |     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y. |
michael@0 | 2101 |     "bgt        1b                             \n" |
michael@0 | 2102 |   : "+r"(src_argb4444),  // %0 |
michael@0 | 2103 |     "+r"(dst_y),         // %1 |
michael@0 | 2104 |     "+r"(pix)            // %2 |
michael@0 | 2105 |   : |
michael@0 | 2106 |   : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13" |
michael@0 | 2107 |   ); |
michael@0 | 2108 | } |
michael@0 | 2109 | |
michael@0 | 2110 | void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { |
michael@0 | 2111 |   asm volatile ( |
michael@0 | 2112 |     "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient |
michael@0 | 2113 |     "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient |
michael@0 | 2114 |     "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient |
michael@0 | 2115 |     "vmov.u8    d7, #16                        \n"  // Add 16 constant |
michael@0 | 2116 |     ".p2align   2                              \n" |
michael@0 | 2117 |   "1:                                          \n" |
michael@0 | 2118 |     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA. |
michael@0 | 2119 |     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |
michael@0 | 2120 |     "vmull.u8   q8, d1, d4                     \n"  // R |
michael@0 | 2121 |     "vmlal.u8   q8, d2, d5                     \n"  // G |
michael@0 | 2122 |     "vmlal.u8   q8, d3, d6                     \n"  // B |
michael@0 | 2123 |     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y |
michael@0 | 2124 |     "vqadd.u8   d0, d7                         \n"  // += 16 (Y offset) |
michael@0 | 2125 |     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y. |
michael@0 | 2126 |     "bgt        1b                             \n" |
michael@0 | 2127 |   : "+r"(src_bgra),  // %0 |
michael@0 | 2128 |     "+r"(dst_y),     // %1 |
michael@0 | 2129 |     "+r"(pix)        // %2 |
michael@0 | 2130 |   : |
michael@0 | 2131 |   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" |
michael@0 | 2132 |   ); |
michael@0 | 2133 | } |
michael@0 | 2134 | |
michael@0 | 2135 | void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { |
michael@0 | 2136 |   asm volatile ( |
michael@0 | 2137 |     "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient |
michael@0 | 2138 |     "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient |
michael@0 | 2139 |     "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient |
michael@0 | 2140 |     "vmov.u8    d7, #16                        \n"  // Add 16 constant |
michael@0 | 2141 |     ".p2align   2                              \n" |
michael@0 | 2142 |   "1:                                          \n" |
michael@0 | 2143 |     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR. |
michael@0 | 2144 |     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |
michael@0 | 2145 |     "vmull.u8   q8, d0, d4                     \n"  // R |
michael@0 | 2146 |     "vmlal.u8   q8, d1, d5                     \n"  // G |
michael@0 | 2147 |     "vmlal.u8   q8, d2, d6                     \n"  // B |
michael@0 | 2148 |     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y |
michael@0 | 2149 |     "vqadd.u8   d0, d7                         \n"  // += 16 (Y offset) |
michael@0 | 2150 |     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y. |
michael@0 | 2151 |     "bgt        1b                             \n" |
michael@0 | 2152 |   : "+r"(src_abgr),  // %0 |
michael@0 | 2153 |     "+r"(dst_y),     // %1 |
michael@0 | 2154 |     "+r"(pix)        // %2 |
michael@0 | 2155 |   : |
michael@0 | 2156 |   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" |
michael@0 | 2157 |   ); |
michael@0 | 2158 | } |
michael@0 | 2159 | |
michael@0 | 2160 | void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { |
michael@0 | 2161 |   asm volatile ( |
michael@0 | 2162 |     "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient |
michael@0 | 2163 |     "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient |
michael@0 | 2164 |     "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient |
michael@0 | 2165 |     "vmov.u8    d7, #16                        \n"  // Add 16 constant |
michael@0 | 2166 |     ".p2align   2                              \n" |
michael@0 | 2167 |   "1:                                          \n" |
michael@0 | 2168 |     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA. |
michael@0 | 2169 |     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |
michael@0 | 2170 |     "vmull.u8   q8, d1, d4                     \n"  // B |
michael@0 | 2171 |     "vmlal.u8   q8, d2, d5                     \n"  // G |
michael@0 | 2172 |     "vmlal.u8   q8, d3, d6                     \n"  // R |
michael@0 | 2173 |     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y |
michael@0 | 2174 |     "vqadd.u8   d0, d7                         \n"  // += 16 (Y offset) |
michael@0 | 2175 |     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y. |
michael@0 | 2176 |     "bgt        1b                             \n" |
michael@0 | 2177 |   : "+r"(src_rgba),  // %0 |
michael@0 | 2178 |     "+r"(dst_y),     // %1 |
michael@0 | 2179 |     "+r"(pix)        // %2 |
michael@0 | 2180 |   : |
michael@0 | 2181 |   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" |
michael@0 | 2182 |   ); |
michael@0 | 2183 | } |
michael@0 | 2184 | |
michael@0 | 2185 | void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { |
michael@0 | 2186 |   asm volatile ( |
michael@0 | 2187 |     "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient |
michael@0 | 2188 |     "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient |
michael@0 | 2189 |     "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient |
michael@0 | 2190 |     "vmov.u8    d7, #16                        \n"  // Add 16 constant |
michael@0 | 2191 |     ".p2align   2                              \n" |
michael@0 | 2192 |   "1:                                          \n" |
michael@0 | 2193 |     "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RGB24. |
michael@0 | 2194 |     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |
michael@0 | 2195 |     "vmull.u8   q8, d0, d4                     \n"  // B |
michael@0 | 2196 |     "vmlal.u8   q8, d1, d5                     \n"  // G |
michael@0 | 2197 |     "vmlal.u8   q8, d2, d6                     \n"  // R |
michael@0 | 2198 |     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y |
michael@0 | 2199 |     "vqadd.u8   d0, d7                         \n"  // += 16 (Y offset) |
michael@0 | 2200 |     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y. |
michael@0 | 2201 |     "bgt        1b                             \n" |
michael@0 | 2202 |   : "+r"(src_rgb24),  // %0 |
michael@0 | 2203 |     "+r"(dst_y),      // %1 |
michael@0 | 2204 |     "+r"(pix)         // %2 |
michael@0 | 2205 |   : |
michael@0 | 2206 |   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" |
michael@0 | 2207 |   ); |
michael@0 | 2208 | } |
michael@0 | 2209 | |
michael@0 | 2210 | void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { |
michael@0 | 2211 |   asm volatile ( |
michael@0 | 2212 |     "vmov.u8    d4, #33                        \n"  // R * 0.2578 coefficient |
michael@0 | 2213 |     "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient |
michael@0 | 2214 |     "vmov.u8    d6, #13                        \n"  // B * 0.1016 coefficient |
michael@0 | 2215 |     "vmov.u8    d7, #16                        \n"  // Add 16 constant |
michael@0 | 2216 |     ".p2align   2                              \n" |
michael@0 | 2217 |   "1:                                          \n" |
michael@0 | 2218 |     "vld3.8     {d0, d1, d2}, [%0]!            \n"  // load 8 pixels of RAW. |
michael@0 | 2219 |     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |
michael@0 | 2220 |     "vmull.u8   q8, d0, d4                     \n"  // R (d4 is the R coefficient) |
michael@0 | 2221 |     "vmlal.u8   q8, d1, d5                     \n"  // G |
michael@0 | 2222 |     "vmlal.u8   q8, d2, d6                     \n"  // B (d6 is the B coefficient) |
michael@0 | 2223 |     "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y |
michael@0 | 2224 |     "vqadd.u8   d0, d7                         \n"  // += 16 (Y offset) |
michael@0 | 2225 |     "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y. |
michael@0 | 2226 |     "bgt        1b                             \n" |
michael@0 | 2227 |   : "+r"(src_raw),  // %0 |
michael@0 | 2228 |     "+r"(dst_y),    // %1 |
michael@0 | 2229 |     "+r"(pix)       // %2 |
michael@0 | 2230 |   : |
michael@0 | 2231 |   : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8" |
michael@0 | 2232 |   ); |
michael@0 | 2233 | } |
michael@0 | 2234 | |
michael@0 | 2235 | // Bilinear filter 16x2 -> 16x1 |
michael@0 | 2236 | void InterpolateRow_NEON(uint8* dst_ptr, |
michael@0 | 2237 |                          const uint8* src_ptr, ptrdiff_t src_stride, |
michael@0 | 2238 |                          int dst_width, int source_y_fraction) { |
michael@0 | 2239 |   asm volatile ( |
michael@0 | 2240 |     "cmp        %4, #0                         \n"  // fraction 0 -> plain copy |
michael@0 | 2241 |     "beq        100f                           \n" |
michael@0 | 2242 |     "add        %2, %1                         \n"  // %2 = second source row |
michael@0 | 2243 |     "cmp        %4, #64                        \n"  // 64/256 -> 75/25 fast path |
michael@0 | 2244 |     "beq        75f                            \n" |
michael@0 | 2245 |     "cmp        %4, #128                       \n"  // 128/256 -> 50/50 fast path |
michael@0 | 2246 |     "beq        50f                            \n" |
michael@0 | 2247 |     "cmp        %4, #192                       \n"  // 192/256 -> 25/75 fast path |
michael@0 | 2248 |     "beq        25f                            \n" |
michael@0 | 2249 |  |
michael@0 | 2250 |     "vdup.8     d5, %4                         \n"  // fraction (row 2 weight) |
michael@0 | 2251 |     "rsb        %4, #256                       \n"  // 256 - fraction (row 1 weight) |
michael@0 | 2252 |     "vdup.8     d4, %4                         \n" |
michael@0 | 2253 |     // General purpose row blend. |
michael@0 | 2254 |   "1:                                          \n" |
michael@0 | 2255 |     "vld1.8     {q0}, [%1]!                    \n" |
michael@0 | 2256 |     "vld1.8     {q1}, [%2]!                    \n" |
michael@0 | 2257 |     "subs       %3, %3, #16                    \n" |
michael@0 | 2258 |     "vmull.u8   q13, d0, d4                    \n" |
michael@0 | 2259 |     "vmull.u8   q14, d1, d4                    \n" |
michael@0 | 2260 |     "vmlal.u8   q13, d2, d5                    \n" |
michael@0 | 2261 |     "vmlal.u8   q14, d3, d5                    \n" |
michael@0 | 2262 |     "vrshrn.u16 d0, q13, #8                    \n" |
michael@0 | 2263 |     "vrshrn.u16 d1, q14, #8                    \n" |
michael@0 | 2264 |     "vst1.8     {q0}, [%0]!                    \n" |
michael@0 | 2265 |     "bgt        1b                             \n" |
michael@0 | 2266 |     "b          99f                            \n" |
michael@0 | 2267 |  |
michael@0 | 2268 |     // Blend 25 / 75. |
michael@0 | 2269 |   "25:                                         \n" |
michael@0 | 2270 |     "vld1.8     {q0}, [%1]!                    \n" |
michael@0 | 2271 |     "vld1.8     {q1}, [%2]!                    \n" |
michael@0 | 2272 |     "subs       %3, %3, #16                    \n" |
michael@0 | 2273 |     "vrhadd.u8  q0, q1                         \n"  // two rounding halves |
michael@0 | 2274 |     "vrhadd.u8  q0, q1                         \n"  // = 1/4 row1 + 3/4 row2 |
michael@0 | 2275 |     "vst1.8     {q0}, [%0]!                    \n" |
michael@0 | 2276 |     "bgt        25b                            \n" |
michael@0 | 2277 |     "b          99f                            \n" |
michael@0 | 2278 |  |
michael@0 | 2279 |     // Blend 50 / 50. |
michael@0 | 2280 |   "50:                                         \n" |
michael@0 | 2281 |     "vld1.8     {q0}, [%1]!                    \n" |
michael@0 | 2282 |     "vld1.8     {q1}, [%2]!                    \n" |
michael@0 | 2283 |     "subs       %3, %3, #16                    \n" |
michael@0 | 2284 |     "vrhadd.u8  q0, q1                         \n" |
michael@0 | 2285 |     "vst1.8     {q0}, [%0]!                    \n" |
michael@0 | 2286 |     "bgt        50b                            \n" |
michael@0 | 2287 |     "b          99f                            \n" |
michael@0 | 2288 |  |
michael@0 | 2289 |     // Blend 75 / 25. |
michael@0 | 2290 |   "75:                                         \n" |
michael@0 | 2291 |     "vld1.8     {q1}, [%1]!                    \n"  // note: rows swapped vs 25f |
michael@0 | 2292 |     "vld1.8     {q0}, [%2]!                    \n" |
michael@0 | 2293 |     "subs       %3, %3, #16                    \n" |
michael@0 | 2294 |     "vrhadd.u8  q0, q1                         \n" |
michael@0 | 2295 |     "vrhadd.u8  q0, q1                         \n" |
michael@0 | 2296 |     "vst1.8     {q0}, [%0]!                    \n" |
michael@0 | 2297 |     "bgt        75b                            \n" |
michael@0 | 2298 |     "b          99f                            \n" |
michael@0 | 2299 |  |
michael@0 | 2300 |     // Blend 100 / 0 - Copy row unchanged. |
michael@0 | 2301 |   "100:                                        \n" |
michael@0 | 2302 |     "vld1.8     {q0}, [%1]!                    \n" |
michael@0 | 2303 |     "subs       %3, %3, #16                    \n" |
michael@0 | 2304 |     "vst1.8     {q0}, [%0]!                    \n" |
michael@0 | 2305 |     "bgt        100b                           \n" |
michael@0 | 2306 |  |
michael@0 | 2307 |   "99:                                         \n" |
michael@0 | 2308 |   : "+r"(dst_ptr),          // %0 |
michael@0 | 2309 |     "+r"(src_ptr),          // %1 |
michael@0 | 2310 |     "+r"(src_stride),       // %2 |
michael@0 | 2311 |     "+r"(dst_width),        // %3 |
michael@0 | 2312 |     "+r"(source_y_fraction) // %4 |
michael@0 | 2313 |   : |
michael@0 | 2314 |   : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14" |
michael@0 | 2315 |   ); |
michael@0 | 2316 | } |
michael@0 | 2317 | |
michael@0 | 2318 | // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr |
michael@0 | 2319 | void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
michael@0 | 2320 |                        uint8* dst_argb, int width) { |
michael@0 | 2321 |   asm volatile ( |
michael@0 | 2322 |     "subs       %3, #8                         \n"  // fewer than 8? skip wide loop |
michael@0 | 2323 |     "blt        89f                            \n" |
michael@0 | 2324 |     // Blend 8 pixels. |
michael@0 | 2325 |   "8:                                          \n" |
michael@0 | 2326 |     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB0. |
michael@0 | 2327 |     "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load 8 pixels of ARGB1. |
michael@0 | 2328 |     "subs       %3, %3, #8                     \n"  // 8 processed per loop. |
michael@0 | 2329 |     "vmull.u8   q10, d4, d3                    \n"  // db * a |
michael@0 | 2330 |     "vmull.u8   q11, d5, d3                    \n"  // dg * a |
michael@0 | 2331 |     "vmull.u8   q12, d6, d3                    \n"  // dr * a |
michael@0 | 2332 |     "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8 |
michael@0 | 2333 |     "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8 |
michael@0 | 2334 |     "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8 |
michael@0 | 2335 |     "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256 |
michael@0 | 2336 |     "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256 |
michael@0 | 2337 |     "vqadd.u8   q0, q0, q2                     \n"  // + sbg |
michael@0 | 2338 |     "vqadd.u8   d2, d2, d6                     \n"  // + sr |
michael@0 | 2339 |     "vmov.u8    d3, #255                       \n"  // a = 255 |
michael@0 | 2340 |     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 pixels of ARGB. |
michael@0 | 2341 |     "bge        8b                             \n" |
michael@0 | 2342 |  |
michael@0 | 2343 |   "89:                                         \n" |
michael@0 | 2344 |     "adds       %3, #8-1                       \n"  // restore count for 1-pixel loop |
michael@0 | 2345 |     "blt        99f                            \n" |
michael@0 | 2346 |  |
michael@0 | 2347 |     // Blend 1 pixels. |
michael@0 | 2348 |   "1:                                          \n" |
michael@0 | 2349 |     "vld4.8     {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n"  // load 1 pixel ARGB0. |
michael@0 | 2350 |     "vld4.8     {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n"  // load 1 pixel ARGB1. |
michael@0 | 2351 |     "subs       %3, %3, #1                     \n"  // 1 processed per loop. |
michael@0 | 2352 |     "vmull.u8   q10, d4, d3                    \n"  // db * a |
michael@0 | 2353 |     "vmull.u8   q11, d5, d3                    \n"  // dg * a |
michael@0 | 2354 |     "vmull.u8   q12, d6, d3                    \n"  // dr * a |
michael@0 | 2355 |     "vqrshrn.u16 d20, q10, #8                  \n"  // db >>= 8 |
michael@0 | 2356 |     "vqrshrn.u16 d21, q11, #8                  \n"  // dg >>= 8 |
michael@0 | 2357 |     "vqrshrn.u16 d22, q12, #8                  \n"  // dr >>= 8 |
michael@0 | 2358 |     "vqsub.u8   q2, q2, q10                    \n"  // dbg - dbg * a / 256 |
michael@0 | 2359 |     "vqsub.u8   d6, d6, d22                    \n"  // dr - dr * a / 256 |
michael@0 | 2360 |     "vqadd.u8   q0, q0, q2                     \n"  // + sbg |
michael@0 | 2361 |     "vqadd.u8   d2, d2, d6                     \n"  // + sr |
michael@0 | 2362 |     "vmov.u8    d3, #255                       \n"  // a = 255 |
michael@0 | 2363 |     "vst4.8     {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n"  // store 1 pixel. |
michael@0 | 2364 |     "bge        1b                             \n" |
michael@0 | 2365 |  |
michael@0 | 2366 |   "99:                                         \n" |
michael@0 | 2367 |  |
michael@0 | 2368 |   : "+r"(src_argb0),    // %0 |
michael@0 | 2369 |     "+r"(src_argb1),    // %1 |
michael@0 | 2370 |     "+r"(dst_argb),     // %2 |
michael@0 | 2371 |     "+r"(width)         // %3 |
michael@0 | 2372 |   : |
michael@0 | 2373 |   : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12" |
michael@0 | 2374 |   ); |
michael@0 | 2375 | } |
michael@0 | 2376 | |
michael@0 | 2377 | // Attenuate 8 pixels at a time: each channel = (channel * a + 128) >> 8; alpha kept. |
michael@0 | 2378 | void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { |
michael@0 | 2379 |   asm volatile ( |
michael@0 | 2380 |     // Attenuate 8 pixels. |
michael@0 | 2381 |   "1:                                          \n" |
michael@0 | 2382 |     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB. |
michael@0 | 2383 |     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |
michael@0 | 2384 |     "vmull.u8   q10, d0, d3                    \n"  // b * a |
michael@0 | 2385 |     "vmull.u8   q11, d1, d3                    \n"  // g * a |
michael@0 | 2386 |     "vmull.u8   q12, d2, d3                    \n"  // r * a |
michael@0 | 2387 |     "vqrshrn.u16 d0, q10, #8                   \n"  // b >>= 8 |
michael@0 | 2388 |     "vqrshrn.u16 d1, q11, #8                   \n"  // g >>= 8 |
michael@0 | 2389 |     "vqrshrn.u16 d2, q12, #8                   \n"  // r >>= 8 |
michael@0 | 2390 |     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB. |
michael@0 | 2391 |     "bgt        1b                             \n" |
michael@0 | 2392 |   : "+r"(src_argb),   // %0 |
michael@0 | 2393 |     "+r"(dst_argb),   // %1 |
michael@0 | 2394 |     "+r"(width)       // %2 |
michael@0 | 2395 |   : |
michael@0 | 2396 |   : "cc", "memory", "q0", "q1", "q10", "q11", "q12" |
michael@0 | 2397 |   ); |
michael@0 | 2398 | } |
michael@0 | 2399 | |
michael@0 | 2400 | // Quantize 8 ARGB pixels (32 bytes), in place. |
michael@0 | 2401 | // dst = (dst * scale >> 16) * interval_size + interval_offset; |
michael@0 | 2402 | void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, |
michael@0 | 2403 |                           int interval_offset, int width) { |
michael@0 | 2404 |   asm volatile ( |
michael@0 | 2405 |     "vdup.u16   q8, %2                         \n" |
michael@0 | 2406 |     "vshr.u16   q8, q8, #1                     \n"  // scale >>= 1 (vqdmulh doubles) |
michael@0 | 2407 |     "vdup.u16   q9, %3                         \n"  // interval multiply. |
michael@0 | 2408 |     "vdup.u16   q10, %4                        \n"  // interval add |
michael@0 | 2409 |  |
michael@0 | 2410 |     // 8 pixel loop. |
michael@0 | 2411 |     ".p2align   2                              \n" |
michael@0 | 2412 |   "1:                                          \n" |
michael@0 | 2413 |     "vld4.8     {d0, d2, d4, d6}, [%0]         \n"  // load 8 ARGB pixels (no writeback; stored back below). |
michael@0 | 2414 |     "subs       %1, %1, #8                     \n"  // 8 processed per loop. |
michael@0 | 2415 |     "vmovl.u8   q0, d0                         \n"  // b (0 .. 255) |
michael@0 | 2416 |     "vmovl.u8   q1, d2                         \n" |
michael@0 | 2417 |     "vmovl.u8   q2, d4                         \n" |
michael@0 | 2418 |     "vqdmulh.s16 q0, q0, q8                    \n"  // b * scale |
michael@0 | 2419 |     "vqdmulh.s16 q1, q1, q8                    \n"  // g |
michael@0 | 2420 |     "vqdmulh.s16 q2, q2, q8                    \n"  // r |
michael@0 | 2421 |     "vmul.u16   q0, q0, q9                     \n"  // b * interval_size |
michael@0 | 2422 |     "vmul.u16   q1, q1, q9                     \n"  // g |
michael@0 | 2423 |     "vmul.u16   q2, q2, q9                     \n"  // r |
michael@0 | 2424 |     "vadd.u16   q0, q0, q10                    \n"  // b + interval_offset |
michael@0 | 2425 |     "vadd.u16   q1, q1, q10                    \n"  // g |
michael@0 | 2426 |     "vadd.u16   q2, q2, q10                    \n"  // r |
michael@0 | 2427 |     "vqmovn.u16 d0, q0                         \n" |
michael@0 | 2428 |     "vqmovn.u16 d2, q1                         \n" |
michael@0 | 2429 |     "vqmovn.u16 d4, q2                         \n" |
michael@0 | 2430 |     "vst4.8     {d0, d2, d4, d6}, [%0]!        \n"  // store 8 ARGB pixels (alpha d6 unchanged). |
michael@0 | 2431 |     "bgt        1b                             \n" |
michael@0 | 2432 |   : "+r"(dst_argb),       // %0 |
michael@0 | 2433 |     "+r"(width)           // %1 |
michael@0 | 2434 |   : "r"(scale),           // %2 |
michael@0 | 2435 |     "r"(interval_size),   // %3 |
michael@0 | 2436 |     "r"(interval_offset)  // %4 |
michael@0 | 2437 |   : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10" |
michael@0 | 2438 |   ); |
michael@0 | 2439 | } |
michael@0 | 2440 | |
michael@0 | 2441 | // Shade 8 pixels at a time by specified value. |
michael@0 | 2442 | // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. |
michael@0 | 2443 | // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. |
michael@0 | 2444 | void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, |
michael@0 | 2445 |                        uint32 value) { |
michael@0 | 2446 |   asm volatile ( |
michael@0 | 2447 |     "vdup.u32   q0, %3                         \n"  // duplicate scale value. |
michael@0 | 2448 |     "vzip.u8    d0, d1                         \n"  // d0 aarrggbb. |
michael@0 | 2449 |     "vshr.u16   q0, q0, #1                     \n"  // scale / 2 (vqrdmulh doubles). |
michael@0 | 2450 |  |
michael@0 | 2451 |     // 8 pixel loop. |
michael@0 | 2452 |     ".p2align   2                              \n" |
michael@0 | 2453 |   "1:                                          \n" |
michael@0 | 2454 |     "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB. |
michael@0 | 2455 |     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |
michael@0 | 2456 |     "vmovl.u8   q10, d20                       \n"  // b (0 .. 255) |
michael@0 | 2457 |     "vmovl.u8   q11, d22                       \n" |
michael@0 | 2458 |     "vmovl.u8   q12, d24                       \n" |
michael@0 | 2459 |     "vmovl.u8   q13, d26                       \n" |
michael@0 | 2460 |     "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale * 2 |
michael@0 | 2461 |     "vqrdmulh.s16 q11, q11, d0[1]              \n"  // g |
michael@0 | 2462 |     "vqrdmulh.s16 q12, q12, d0[2]              \n"  // r |
michael@0 | 2463 |     "vqrdmulh.s16 q13, q13, d0[3]              \n"  // a |
michael@0 | 2464 |     "vqmovn.u16 d20, q10                       \n" |
michael@0 | 2465 |     "vqmovn.u16 d22, q11                       \n" |
michael@0 | 2466 |     "vqmovn.u16 d24, q12                       \n" |
michael@0 | 2467 |     "vqmovn.u16 d26, q13                       \n" |
michael@0 | 2468 |     "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB. |
michael@0 | 2469 |     "bgt        1b                             \n" |
michael@0 | 2470 |   : "+r"(src_argb),       // %0 |
michael@0 | 2471 |     "+r"(dst_argb),       // %1 |
michael@0 | 2472 |     "+r"(width)           // %2 |
michael@0 | 2473 |   : "r"(value)            // %3 |
michael@0 | 2474 |   : "cc", "memory", "q0", "q10", "q11", "q12", "q13" |
michael@0 | 2475 |   ); |
michael@0 | 2476 | } |
michael@0 | 2477 | |
michael@0 | 2478 | // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels |
michael@0 | 2479 | // Similar to ARGBToYJ but stores ARGB. |
michael@0 | 2480 | // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; |
michael@0 | 2481 | void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { |
michael@0 | 2482 |   asm volatile ( |
michael@0 | 2483 |     "vmov.u8    d24, #15                       \n"  // B * 0.11400 coefficient |
michael@0 | 2484 |     "vmov.u8    d25, #75                       \n"  // G * 0.58700 coefficient |
michael@0 | 2485 |     "vmov.u8    d26, #38                       \n"  // R * 0.29900 coefficient |
michael@0 | 2486 |     ".p2align   2                              \n" |
michael@0 | 2487 |   "1:                                          \n" |
michael@0 | 2488 |     "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 ARGB pixels. |
michael@0 | 2489 |     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |
michael@0 | 2490 |     "vmull.u8   q2, d0, d24                    \n"  // B |
michael@0 | 2491 |     "vmlal.u8   q2, d1, d25                    \n"  // G |
michael@0 | 2492 |     "vmlal.u8   q2, d2, d26                    \n"  // R |
michael@0 | 2493 |     "vqrshrun.s16 d0, q2, #7                   \n"  // 16 bit to 8 bit gray, into B |
michael@0 | 2494 |     "vmov       d1, d0                         \n"  // G |
michael@0 | 2495 |     "vmov       d2, d0                         \n"  // R |
michael@0 | 2496 |     "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 ARGB pixels (alpha d3 unchanged). |
michael@0 | 2497 |     "bgt        1b                             \n" |
michael@0 | 2498 |   : "+r"(src_argb),   // %0 |
michael@0 | 2499 |     "+r"(dst_argb),   // %1 |
michael@0 | 2500 |     "+r"(width)       // %2 |
michael@0 | 2501 |   : |
michael@0 | 2502 |   : "cc", "memory", "q0", "q1", "q2", "q12", "q13" |
michael@0 | 2503 |   ); |
michael@0 | 2504 | } |
michael@0 | 2505 | |
michael@0 | 2506 | // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels, in place. |
michael@0 | 2507 | // b = (r * 35 + g * 68 + b * 17) >> 7 |
michael@0 | 2508 | // g = (r * 45 + g * 88 + b * 22) >> 7 |
michael@0 | 2509 | // r = (r * 50 + g * 98 + b * 24) >> 7 |
michael@0 | 2510 | void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { |
michael@0 | 2511 |   asm volatile ( |
michael@0 | 2512 |     "vmov.u8    d20, #17                       \n"  // BB coefficient |
michael@0 | 2513 |     "vmov.u8    d21, #68                       \n"  // BG coefficient |
michael@0 | 2514 |     "vmov.u8    d22, #35                       \n"  // BR coefficient |
michael@0 | 2515 |     "vmov.u8    d24, #22                       \n"  // GB coefficient |
michael@0 | 2516 |     "vmov.u8    d25, #88                       \n"  // GG coefficient |
michael@0 | 2517 |     "vmov.u8    d26, #45                       \n"  // GR coefficient |
michael@0 | 2518 |     "vmov.u8    d28, #24                       \n"  // RB coefficient (was mislabeled BB) |
michael@0 | 2519 |     "vmov.u8    d29, #98                       \n"  // RG coefficient (was mislabeled BG) |
michael@0 | 2520 |     "vmov.u8    d30, #50                       \n"  // RR coefficient (was mislabeled BR) |
michael@0 | 2521 |     ".p2align   2                              \n" |
michael@0 | 2522 |   "1:                                          \n" |
michael@0 | 2523 |     "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels (no writeback; stored back below). |
michael@0 | 2524 |     "subs       %1, %1, #8                     \n"  // 8 processed per loop. |
michael@0 | 2525 |     "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B |
michael@0 | 2526 |     "vmlal.u8   q2, d1, d21                    \n"  // G |
michael@0 | 2527 |     "vmlal.u8   q2, d2, d22                    \n"  // R |
michael@0 | 2528 |     "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G |
michael@0 | 2529 |     "vmlal.u8   q3, d1, d25                    \n"  // G |
michael@0 | 2530 |     "vmlal.u8   q3, d2, d26                    \n"  // R |
michael@0 | 2531 |     "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R |
michael@0 | 2532 |     "vmlal.u8   q8, d1, d29                    \n"  // G |
michael@0 | 2533 |     "vmlal.u8   q8, d2, d30                    \n"  // R |
michael@0 | 2534 |     "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B |
michael@0 | 2535 |     "vqshrn.u16 d1, q3, #7                     \n"  // 16 bit to 8 bit G |
michael@0 | 2536 |     "vqshrn.u16 d2, q8, #7                     \n"  // 16 bit to 8 bit R |
michael@0 | 2537 |     "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels (alpha d3 unchanged). |
michael@0 | 2538 |     "bgt        1b                             \n" |
michael@0 | 2539 |   : "+r"(dst_argb),  // %0 |
michael@0 | 2540 |     "+r"(width)      // %1 |
michael@0 | 2541 |   : |
michael@0 | 2542 |   : "cc", "memory", "q0", "q1", "q2", "q3", |
michael@0 | 2543 |     "q10", "q11", "q12", "q13", "q14", "q15" |
michael@0 | 2544 |   ); |
michael@0 | 2545 | } |
michael@0 | 2546 | |
michael@0 | 2547 | // Tranform 8 ARGB pixels (32 bytes) with color matrix. |
michael@0 | 2548 | // TODO(fbarchard): Was same as Sepia except matrix is provided. This function |
michael@0 | 2549 | // needs to saturate. Consider doing a non-saturating version. |
michael@0 | 2550 | void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, |
michael@0 | 2551 |                              const int8* matrix_argb, int width) { |
michael@0 | 2552 |   asm volatile ( |
michael@0 | 2553 |     "vld1.8     {q2}, [%3]                     \n"  // load 3 ARGB vectors. |
michael@0 | 2554 |     "vmovl.s8   q0, d4                         \n"  // B,G coefficients s16. |
michael@0 | 2555 |     "vmovl.s8   q1, d5                         \n"  // R,A coefficients s16. |
michael@0 | 2556 |  |
michael@0 | 2557 |     ".p2align   2                              \n" |
michael@0 | 2558 |   "1:                                          \n" |
michael@0 | 2559 |     "vld4.8     {d16, d18, d20, d22}, [%0]!    \n"  // load 8 ARGB pixels. |
michael@0 | 2560 |     "subs       %2, %2, #8                     \n"  // 8 processed per loop. |
michael@0 | 2561 |     "vmovl.u8   q8, d16                        \n"  // b (0 .. 255) 16 bit |
michael@0 | 2562 |     "vmovl.u8   q9, d18                        \n"  // g |
michael@0 | 2563 |     "vmovl.u8   q10, d20                       \n"  // r |
michael@0 | 2564 |     "vmovl.u8   q15, d22                       \n"  // a - NOTE(review): clobbered by A accumulator below, so the d*[3] multiplies use the accumulated A, not source alpha; verify intended. |
michael@0 | 2565 |     "vmul.s16   q12, q8, d0[0]                 \n"  // B = B * Matrix B |
michael@0 | 2566 |     "vmul.s16   q13, q8, d1[0]                 \n"  // G = B * Matrix G |
michael@0 | 2567 |     "vmul.s16   q14, q8, d2[0]                 \n"  // R = B * Matrix R |
michael@0 | 2568 |     "vmul.s16   q15, q8, d3[0]                 \n"  // A = B * Matrix A |
michael@0 | 2569 |     "vmul.s16   q4, q9, d0[1]                  \n"  // G * Matrix B term |
michael@0 | 2570 |     "vmul.s16   q5, q9, d1[1]                  \n"  // G * Matrix G term |
michael@0 | 2571 |     "vmul.s16   q6, q9, d2[1]                  \n"  // G * Matrix R term |
michael@0 | 2572 |     "vmul.s16   q7, q9, d3[1]                  \n"  // G * Matrix A term |
michael@0 | 2573 |     "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B |
michael@0 | 2574 |     "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G |
michael@0 | 2575 |     "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R |
michael@0 | 2576 |     "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A |
michael@0 | 2577 |     "vmul.s16   q4, q10, d0[2]                 \n"  // R * Matrix B term |
michael@0 | 2578 |     "vmul.s16   q5, q10, d1[2]                 \n"  // R * Matrix G term |
michael@0 | 2579 |     "vmul.s16   q6, q10, d2[2]                 \n"  // R * Matrix R term |
michael@0 | 2580 |     "vmul.s16   q7, q10, d3[2]                 \n"  // R * Matrix A term |
michael@0 | 2581 |     "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B |
michael@0 | 2582 |     "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G |
michael@0 | 2583 |     "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R |
michael@0 | 2584 |     "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A |
michael@0 | 2585 |     "vmul.s16   q4, q15, d0[3]                 \n"  // A * Matrix B term |
michael@0 | 2586 |     "vmul.s16   q5, q15, d1[3]                 \n"  // A * Matrix G term |
michael@0 | 2587 |     "vmul.s16   q6, q15, d2[3]                 \n"  // A * Matrix R term |
michael@0 | 2588 |     "vmul.s16   q7, q15, d3[3]                 \n"  // A * Matrix A term |
michael@0 | 2589 |     "vqadd.s16  q12, q12, q4                   \n"  // Accumulate B |
michael@0 | 2590 |     "vqadd.s16  q13, q13, q5                   \n"  // Accumulate G |
michael@0 | 2591 |     "vqadd.s16  q14, q14, q6                   \n"  // Accumulate R |
michael@0 | 2592 |     "vqadd.s16  q15, q15, q7                   \n"  // Accumulate A |
michael@0 | 2593 |     "vqshrun.s16 d16, q12, #6                  \n"  // 16 bit to 8 bit B |
michael@0 | 2594 |     "vqshrun.s16 d18, q13, #6                  \n"  // 16 bit to 8 bit G |
michael@0 | 2595 |     "vqshrun.s16 d20, q14, #6                  \n"  // 16 bit to 8 bit R |
michael@0 | 2596 |     "vqshrun.s16 d22, q15, #6                  \n"  // 16 bit to 8 bit A |
michael@0 | 2597 |     "vst4.8     {d16, d18, d20, d22}, [%1]!    \n"  // store 8 ARGB pixels. |
michael@0 | 2598 |     "bgt        1b                             \n" |
michael@0 | 2599 |   : "+r"(src_argb),   // %0 |
michael@0 | 2600 |     "+r"(dst_argb),   // %1 |
michael@0 | 2601 |     "+r"(width)       // %2 |
michael@0 | 2602 |   : "r"(matrix_argb)  // %3 |
michael@0 | 2603 |   : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", |
michael@0 | 2604 |     "q10", "q11", "q12", "q13", "q14", "q15" |
michael@0 | 2605 |   ); |
michael@0 | 2606 | } |
michael@0 | 2607 | |
michael@0 | 2608 | // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. |
michael@0 | 2609 | #ifdef HAS_ARGBMULTIPLYROW_NEON |
michael@0 | 2610 | // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. |
michael@0 | 2611 | void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, |
michael@0 | 2612 |                           uint8* dst_argb, int width) { |
michael@0 | 2613 |   asm volatile ( |
michael@0 | 2614 |     // 8 pixel loop. |
michael@0 | 2615 |     ".p2align   2                              \n" |
michael@0 | 2616 |   "1:                                          \n" |
michael@0 | 2617 |     "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ARGB pixels. |
michael@0 | 2618 |     "vld4.8     {d1, d3, d5, d7}, [%1]!        \n"  // load 8 more ARGB pixels. |
michael@0 | 2619 |     "subs       %3, %3, #8                     \n"  // 8 processed per loop. |
michael@0 | 2620 |     "vmull.u8   q0, d0, d1                     \n"  // multiply B |
michael@0 | 2621 |     "vmull.u8   q1, d2, d3                     \n"  // multiply G |
michael@0 | 2622 |     "vmull.u8   q2, d4, d5                     \n"  // multiply R |
michael@0 | 2623 |     "vmull.u8   q3, d6, d7                     \n"  // multiply A |
michael@0 | 2624 |     "vrshrn.u16 d0, q0, #8                     \n"  // 16 bit to 8 bit B (rounded) |
michael@0 | 2625 |     "vrshrn.u16 d1, q1, #8                     \n"  // 16 bit to 8 bit G (rounded) |
michael@0 | 2626 |     "vrshrn.u16 d2, q2, #8                     \n"  // 16 bit to 8 bit R (rounded) |
michael@0 | 2627 |     "vrshrn.u16 d3, q3, #8                     \n"  // 16 bit to 8 bit A (rounded) |
michael@0 | 2628 |     "vst4.8     {d0, d1, d2, d3}, [%2]!        \n"  // store 8 ARGB pixels. |
michael@0 | 2629 |     "bgt        1b                             \n" |
michael@0 | 2630 |  |
michael@0 | 2631 |   : "+r"(src_argb0),  // %0 |
michael@0 | 2632 |     "+r"(src_argb1),  // %1 |
michael@0 | 2633 |     "+r"(dst_argb),   // %2 |
michael@0 | 2634 |     "+r"(width)       // %3 |
michael@0 | 2635 |   : |
michael@0 | 2636 |   : "cc", "memory", "q0", "q1", "q2", "q3" |
michael@0 | 2637 |   ); |
michael@0 | 2638 | } |
michael@0 | 2639 | #endif  // HAS_ARGBMULTIPLYROW_NEON |
michael@0 | 2640 | |
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// Uses unsigned saturating adds, so each 8-bit channel clamps at 255
// instead of wrapping. Loop decrements width by 8 per pass with no
// remainder handling -- assumes width is a positive multiple of 8
// (TODO confirm callers guarantee this).
void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
    "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vqadd.u8 q0, q0, q2 \n" // saturating add B, G
    "vqadd.u8 q1, q1, q3 \n" // saturating add R, A
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"

    : "+r"(src_argb0), // %0
      "+r"(src_argb1), // %1
      "+r"(dst_argb), // %2
      "+r"(width) // %3
    :
    : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}
michael@0 | 2664 | |
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
// Computes src_argb0 - src_argb1 per channel with unsigned saturating
// subtraction, so results clamp at 0 instead of wrapping. Loop decrements
// width by 8 per pass with no remainder handling -- assumes width is a
// positive multiple of 8 (TODO confirm callers guarantee this).
void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 8 pixel loop.
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
    "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vqsub.u8 q0, q0, q2 \n" // saturating subtract B, G
    "vqsub.u8 q1, q1, q3 \n" // saturating subtract R, A
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"

    : "+r"(src_argb0), // %0
      "+r"(src_argb1), // %1
      "+r"(dst_argb), // %2
      "+r"(width) // %3
    :
    : "cc", "memory", "q0", "q1", "q2", "q3"
  );
}
michael@0 | 2688 | |
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// The combined magnitude is a saturating u8 add of the two gradient
// planes (clamped at 255), replicated into B, G and R; alpha is a
// constant 255 loaded once before the loop (d3 is never written inside
// the loop, so it stays valid across iterations). Assumes width is a
// positive multiple of 8 -- TODO confirm callers guarantee this.
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8 d3, #255 \n" // constant alpha channel
    // 8 pixel loop.
    ".p2align 2 \n"
    "1: \n"
    "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
    "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vqadd.u8 d0, d0, d1 \n" // saturating add, clamps at 255
    "vmov.u8 d1, d0 \n" // replicate into G
    "vmov.u8 d2, d0 \n" // replicate into R
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
    : "+r"(src_sobelx), // %0
      "+r"(src_sobely), // %1
      "+r"(dst_argb), // %2
      "+r"(width) // %3
    :
    : "cc", "memory", "q0", "q1"
  );
}
michael@0 | 2717 | |
// Adds Sobel X and Sobel Y and stores Sobel into plane.
// Saturating u8 add of the two gradient planes (clamped at 255) written
// to a single 8-bit output plane, 16 pixels per iteration. Assumes width
// is a positive multiple of 16 -- TODO confirm callers guarantee this.
void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  asm volatile (
    // 16 pixel loop.
    ".p2align 2 \n"
    "1: \n"
    "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
    "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
    "subs %3, %3, #16 \n" // 16 processed per loop.
    "vqadd.u8 q0, q0, q1 \n" // saturating add, clamps at 255
    "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
    "bgt 1b \n"
    : "+r"(src_sobelx), // %0
      "+r"(src_sobely), // %1
      "+r"(dst_y), // %2
      "+r"(width) // %3
    :
    : "cc", "memory", "q0", "q1"
  );
}
michael@0 | 2739 | |
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
// d2 holds sobelx (stored as R), d0 holds sobely (stored as B), and
// d1 is their saturating sum (stored as G). Alpha (d3) is a constant 255
// loaded once before the loop and never written inside it. Assumes width
// is a positive multiple of 8 -- TODO confirm callers guarantee this.
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    "vmov.u8 d3, #255 \n" // constant alpha channel
    // 8 pixel loop.
    ".p2align 2 \n"
    "1: \n"
    "vld1.8 {d2}, [%0]! \n" // load 8 sobelx -> R channel
    "vld1.8 {d0}, [%1]! \n" // load 8 sobely -> B channel
    "subs %3, %3, #8 \n" // 8 processed per loop.
    "vqadd.u8 d1, d0, d2 \n" // saturating add -> G channel
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
    "bgt 1b \n"
    : "+r"(src_sobelx), // %0
      "+r"(src_sobely), // %1
      "+r"(dst_argb), // %2
      "+r"(width) // %3
    :
    : "cc", "memory", "q0", "q1"
  );
}
michael@0 | 2766 | |
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
// Computes |col0 - col2| with the three input rows weighted 1, 2, 1.
// %5 (= 2) and %6 (= 6) are post-increment amounts: each row pointer
// reads 8 bytes at its current position, advances 2, reads 8 bytes
// (i.e. at column offset +2), then advances 6 -- a net advance of 8
// bytes per iteration. The code actually computes col0 - col2 (the
// negation of the matrix above) but takes the absolute value at the
// end, so the sign convention cancels out. The final vqmovn saturates
// the 16-bit magnitude to the u8 range. Assumes width is a positive
// multiple of 8 -- TODO confirm callers guarantee this.
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    "vld1.8 {d0}, [%0],%5 \n" // top row, column 0
    "vld1.8 {d1}, [%0],%6 \n" // top row, column 2
    "vsubl.u8 q0, d0, d1 \n" // widen to s16: col0 - col2
    "vld1.8 {d2}, [%1],%5 \n" // center row (weight 2)
    "vld1.8 {d3}, [%1],%6 \n"
    "vsubl.u8 q1, d2, d3 \n"
    "vadd.s16 q0, q0, q1 \n" // accumulate center once...
    "vadd.s16 q0, q0, q1 \n" // ...and again for weight 2
    "vld1.8 {d2}, [%2],%5 \n" // bottom row
    "vld1.8 {d3}, [%2],%6 \n"
    "subs %4, %4, #8 \n" // 8 pixels per loop
    "vsubl.u8 q1, d2, d3 \n"
    "vadd.s16 q0, q0, q1 \n"
    "vabs.s16 q0, q0 \n" // magnitude (sign cancels here)
    "vqmovn.u16 d0, q0 \n" // saturating narrow to u8
    "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
    "bgt 1b \n"
    : "+r"(src_y0), // %0
      "+r"(src_y1), // %1
      "+r"(src_y2), // %2
      "+r"(dst_sobelx), // %3
      "+r"(width) // %4
    : "r"(2), // %5
      "r"(6) // %6
    : "cc", "memory", "q0", "q1" // Clobber List
  );
}
michael@0 | 2803 | |
// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
// Computes |top - bottom| with columns weighted 1, 2, 1. src_y0 is the
// row above and src_y1 the row below the output row. %4 (= 1) and
// %5 (= 6) are post-increment amounts: each pointer advances 1, 1, then
// 6 bytes across the three column loads -- a net advance of 8 per
// iteration. The code actually computes top - bottom (the negation of
// the matrix above) but takes the absolute value at the end, so the
// sign convention cancels out. The final vqmovn saturates the 16-bit
// magnitude to the u8 range. Assumes width is a positive multiple of 8
// -- TODO confirm callers guarantee this.
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    "vld1.8 {d0}, [%0],%4 \n" // left column, top row
    "vld1.8 {d1}, [%1],%4 \n" // left column, bottom row
    "vsubl.u8 q0, d0, d1 \n" // widen to s16: top - bottom
    "vld1.8 {d2}, [%0],%4 \n" // center column (weight 2)
    "vld1.8 {d3}, [%1],%4 \n"
    "vsubl.u8 q1, d2, d3 \n"
    "vadd.s16 q0, q0, q1 \n" // accumulate center once...
    "vadd.s16 q0, q0, q1 \n" // ...and again for weight 2
    "vld1.8 {d2}, [%0],%5 \n" // right column
    "vld1.8 {d3}, [%1],%5 \n"
    "subs %3, %3, #8 \n" // 8 pixels per loop
    "vsubl.u8 q1, d2, d3 \n"
    "vadd.s16 q0, q0, q1 \n"
    "vabs.s16 q0, q0 \n" // magnitude (sign cancels here)
    "vqmovn.u16 d0, q0 \n" // saturating narrow to u8
    "vst1.8 {d0}, [%2]! \n" // store 8 sobely
    "bgt 1b \n"
    : "+r"(src_y0), // %0
      "+r"(src_y1), // %1
      "+r"(dst_sobely), // %2
      "+r"(width) // %3
    : "r"(1), // %4
      "r"(6) // %5
    : "cc", "memory", "q0", "q1" // Clobber List
  );
}
michael@0 | 2839 | #endif // __ARM_NEON__ |
michael@0 | 2840 | |
michael@0 | 2841 | #ifdef __cplusplus |
michael@0 | 2842 | } // extern "C" |
michael@0 | 2843 | } // namespace libyuv |
michael@0 | 2844 | #endif |