Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC Neon.
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)

// NEON downscalers with interpolation.
// Provided by Fritz Koenig

// Read 32x1, throw away even pixels, and write 16x1.
void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    // load even pixels into q0, odd into q1
    "vld2.8 {q0, q1}, [%0]! \n"
    "subs %2, %2, #16 \n"  // 16 processed per loop
    "vst1.8 {q1}, [%1]! \n"  // store odd pixels
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "q0", "q1"       // Clobber List
  );
}

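// Editor's sketch: a scalar equivalent of the NEON loop above, added for
// illustration only; the helper name is made up and nothing else calls it.
static void ScaleRowDown2_Sketch_C(const uint8* src_ptr, uint8* dst,
                                   int dst_width) {
  // Keep the odd pixel of each horizontal pair.
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[2 * x + 1];
  }
}
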
// Read 32x2 average down and write 16x1.
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    "vld1.8 {q0, q1}, [%0]! \n"  // load row 1 and post inc
    "vld1.8 {q2, q3}, [%1]! \n"  // load row 2 and post inc
    "subs %3, %3, #16 \n"  // 16 processed per loop
    "vpaddl.u8 q0, q0 \n"  // row 1 add adjacent
    "vpaddl.u8 q1, q1 \n"
    "vpadal.u8 q0, q2 \n"  // row 2 add adjacent + row1
    "vpadal.u8 q1, q3 \n"
    "vrshrn.u16 d0, q0, #2 \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2 \n"
    "vst1.8 {q0}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(src_stride),  // %1
    "+r"(dst),         // %2
    "+r"(dst_width)    // %3
  :
  : "q0", "q1", "q2", "q3"  // Clobber List
  );
}

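// Editor's sketch: scalar equivalent of the 2x2 box filter above
// (illustrative only; the helper name is made up).
static void ScaleRowDown2Box_Sketch_C(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;  // second source row
  for (int x = 0; x < dst_width; ++x) {
    // Average a 2x2 block with rounding, matching vrshrn.u16 #2.
    dst[x] = (s[2 * x] + s[2 * x + 1] + t[2 * x] + t[2 * x + 1] + 2) >> 2;
  }
}
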
void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    "subs %2, %2, #8 \n"  // 8 processed per loop
    "vst1.8 {d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "q0", "q1", "memory", "cc"
  );
}

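// Editor's sketch: the point-sampling loop above keeps pixel 2 of every
// group of 4 (d2 from the vld4.8 de-interleave). Illustrative helper only.
static void ScaleRowDown4_Sketch_C(const uint8* src_ptr, uint8* dst_ptr,
                                   int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[4 * x + 2];
  }
}
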
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "add r4, %0, %3 \n"
    "add r5, r4, %3 \n"
    "add %3, r5, %3 \n"
    ".p2align 2 \n"
    "1: \n"
    "vld1.8 {q0}, [%0]! \n"  // load up 16x4
    "vld1.8 {q1}, [r4]! \n"
    "vld1.8 {q2}, [r5]! \n"
    "vld1.8 {q3}, [%3]! \n"
    "subs %2, %2, #4 \n"
    "vpaddl.u8 q0, q0 \n"
    "vpadal.u8 q0, q1 \n"
    "vpadal.u8 q0, q2 \n"
    "vpadal.u8 q0, q3 \n"
    "vpaddl.u16 q0, q0 \n"
    "vrshrn.u32 d0, q0, #4 \n"  // divide by 16 w/rounding
    "vmovn.u16 d0, q0 \n"
    "vst1.32 {d0[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"(src_stride)  // %3
  : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
  );
}

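// Editor's sketch: scalar equivalent of the 4x4 box filter above
// (illustrative only; sums 16 pixels and rounds, matching vrshrn.u32 #4).
static void ScaleRowDown4Box_Sketch_C(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    int sum = 0;
    for (int r = 0; r < 4; ++r) {
      for (int c = 0; c < 4; ++c) {
        sum += src_ptr[r * src_stride + 4 * x + c];
      }
    }
    dst_ptr[x] = (sum + 8) >> 4;  // divide by 16 with rounding
  }
}
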
// Down scale from 4 to 3 pixels. Use the NEON multi-lane read/write
// to load every 4th pixel into a different register.
// Point samples 32 pixels to 24 pixels.
void ScaleRowDown34_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    "subs %2, %2, #24 \n"
    "vmov d2, d3 \n"  // order d0, d1, d2
    "vst3.8 {d0, d1, d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "d0", "d1", "d2", "d3", "memory", "cc"
  );
}

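// Editor's sketch: per group of 4 source pixels the loop above emits pixels
// 0, 1 and 3 (the vmov d2, d3 drops pixel 2). Illustrative helper only.
static void ScaleRowDown34_Sketch_C(const uint8* src_ptr, uint8* dst_ptr,
                                    int dst_width) {
  for (int x = 0; x < dst_width; x += 3, src_ptr += 4) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
  }
}
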
void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8 d24, #3 \n"
    "add %3, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"  // src line 1
    "subs %2, %2, #24 \n"

    // filter src line 0 with src line 1
    // expand chars to shorts to allow for room
    // when adding lines together
    "vmovl.u8 q8, d4 \n"
    "vmovl.u8 q9, d5 \n"
    "vmovl.u8 q10, d6 \n"
    "vmovl.u8 q11, d7 \n"

    // 3 * line_0 + line_1
    "vmlal.u8 q8, d0, d24 \n"
    "vmlal.u8 q9, d1, d24 \n"
    "vmlal.u8 q10, d2, d24 \n"
    "vmlal.u8 q11, d3, d24 \n"

    // (3 * line_0 + line_1) >> 2
    "vqrshrn.u16 d0, q8, #2 \n"
    "vqrshrn.u16 d1, q9, #2 \n"
    "vqrshrn.u16 d2, q10, #2 \n"
    "vqrshrn.u16 d3, q11, #2 \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8 q8, d1 \n"
    "vmlal.u8 q8, d0, d24 \n"
    "vqrshrn.u16 d0, q8, #2 \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8 d1, d1, d2 \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8 q8, d2 \n"
    "vmlal.u8 q8, d3, d24 \n"
    "vqrshrn.u16 d2, q8, #2 \n"

    "vst3.8 {d0, d1, d2}, [%1]! \n"

    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_stride)  // %3
  :
  : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "d24", "memory", "cc"
  );
}

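// Editor's sketch: scalar equivalent of the filter above (illustrative only).
// Rows are blended 3:1, then each group of 4 is filtered down to 3 pixels.
static void ScaleRowDown34_0_Box_Sketch_C(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3, s += 4, t += 4) {
    int m0 = (3 * s[0] + t[0] + 2) >> 2;  // vertical 3:1 blend
    int m1 = (3 * s[1] + t[1] + 2) >> 2;
    int m2 = (3 * s[2] + t[2] + 2) >> 2;
    int m3 = (3 * s[3] + t[3] + 2) >> 2;
    dst_ptr[x + 0] = (3 * m0 + m1 + 2) >> 2;
    dst_ptr[x + 1] = (m1 + m2 + 1) >> 1;
    dst_ptr[x + 2] = (m2 + 3 * m3 + 2) >> 2;
  }
}
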
void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vmov.u8 d24, #3 \n"
    "add %3, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // src line 0
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"  // src line 1
    "subs %2, %2, #24 \n"
    // average src line 0 with src line 1
    "vrhadd.u8 q0, q0, q2 \n"
    "vrhadd.u8 q1, q1, q3 \n"

    // a0 = (src[0] * 3 + s[1] * 1) >> 2
    "vmovl.u8 q3, d1 \n"
    "vmlal.u8 q3, d0, d24 \n"
    "vqrshrn.u16 d0, q3, #2 \n"

    // a1 = (src[1] * 1 + s[2] * 1) >> 1
    "vrhadd.u8 d1, d1, d2 \n"

    // a2 = (src[2] * 1 + s[3] * 3) >> 2
    "vmovl.u8 q3, d2 \n"
    "vmlal.u8 q3, d3, d24 \n"
    "vqrshrn.u16 d2, q3, #2 \n"

    "vst3.8 {d0, d1, d2}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(src_stride)  // %3
  :
  : "r4", "q0", "q1", "q2", "q3", "d24", "memory", "cc"
  );
}

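// Editor's sketch: same horizontal filter as above, but the two source rows
// are averaged 1:1 first (the vrhadd.u8 pair). Illustrative helper only.
static void ScaleRowDown34_1_Box_Sketch_C(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; x += 3, s += 4, t += 4) {
    int m0 = (s[0] + t[0] + 1) >> 1;  // vertical 1:1 blend
    int m1 = (s[1] + t[1] + 1) >> 1;
    int m2 = (s[2] + t[2] + 1) >> 1;
    int m3 = (s[3] + t[3] + 1) >> 1;
    dst_ptr[x + 0] = (3 * m0 + m1 + 2) >> 2;
    dst_ptr[x + 1] = (m1 + m2 + 1) >> 1;
    dst_ptr[x + 2] = (m2 + 3 * m3 + 2) >> 2;
  }
}
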
#define HAS_SCALEROWDOWN38_NEON
static uvec8 kShuf38 =
  { 0, 3, 6, 8, 11, 14, 16, 19, 22, 24, 27, 30, 0, 0, 0, 0 };
static uvec8 kShuf38_2 =
  { 0, 8, 16, 2, 10, 17, 4, 12, 18, 6, 14, 19, 0, 0, 0, 0 };
static vec16 kMult38_Div6 =
  { 65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12,
    65536 / 12, 65536 / 12, 65536 / 12, 65536 / 12 };
static vec16 kMult38_Div9 =
  { 65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18,
    65536 / 18, 65536 / 18, 65536 / 18, 65536 / 18 };

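// Editor's note (illustrative): vqrdmulh.s16 returns the rounded high half of
// a doubling multiply, roughly (2 * a * b + 32768) >> 16. Multiplying a sum by
// 65536 / 12 therefore divides it by 6, and multiplying by 65536 / 18 divides
// it by 9. For example, a sum of 1530 (six pixels of 255) with b = 65536 / 12
// = 5461 gives (2 * 1530 * 5461 + 32768) >> 16 = 255, i.e. 1530 / 6.
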
// 32 -> 12
void ScaleRowDown38_NEON(const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.8 {q3}, [%3] \n"
    ".p2align 2 \n"
    "1: \n"
    "vld1.8 {d0, d1, d2, d3}, [%0]! \n"
    "subs %2, %2, #12 \n"
    "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
    "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
    "vst1.8 {d4}, [%1]! \n"
    "vst1.32 {d5[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"(&kShuf38)    // %3
  : "d0", "d1", "d2", "d3", "d4", "d5", "memory", "cc"
  );
}

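// Editor's sketch: kShuf38 keeps pixels 0, 3 and 6 of every group of 8, so the
// point-sampled loop above is equivalent to this scalar form (illustration
// only; the helper name is made up).
static void ScaleRowDown38_Sketch_C(const uint8* src_ptr, uint8* dst_ptr,
                                    int dst_width) {
  for (int x = 0; x < dst_width; x += 3, src_ptr += 8) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[3];
    dst_ptr[x + 2] = src_ptr[6];
  }
}
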
// 32x3 -> 12x1
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.16 {q13}, [%4] \n"
    "vld1.8 {q14}, [%5] \n"
    "vld1.8 {q15}, [%6] \n"
    "add r4, %0, %3, lsl #1 \n"
    "add %3, %0 \n"
    ".p2align 2 \n"
    "1: \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
    "vld4.8 {d16, d17, d18, d19}, [r4]! \n"
    "subs %2, %2, #12 \n"

    // Shuffle the input data around to align the data
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8 d0, d1 \n"
    "vtrn.u8 d4, d5 \n"
    "vtrn.u8 d16, d17 \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8 d2, d3 \n"
    "vtrn.u8 d6, d7 \n"
    "vtrn.u8 d18, d19 \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8 q0, q0 \n"
    "vpaddl.u8 q2, q2 \n"
    "vpaddl.u8 q8, q8 \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8 d3, d3 \n"
    "vpaddl.u8 d7, d7 \n"
    "vpaddl.u8 d19, d19 \n"

    // combine source lines
    "vadd.u16 q0, q2 \n"
    "vadd.u16 q0, q8 \n"
    "vadd.u16 d4, d3, d7 \n"
    "vadd.u16 d4, d19 \n"

    // dst_ptr[3] = (s[6 + st * 0] + s[7 + st * 0]
    //            + s[6 + st * 1] + s[7 + st * 1]
    //            + s[6 + st * 2] + s[7 + st * 2]) / 6
    "vqrdmulh.s16 q2, q2, q13 \n"
    "vmovn.u16 d4, q2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q3, d6 \n"
    "vmovl.u8 q9, d18 \n"

    // combine source lines
    "vadd.u16 q1, q3 \n"
    "vadd.u16 q1, q9 \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32 d2, d3 \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16 d2, d3 \n"

    // 0+1+2, 3+4+5
    "vadd.u16 q0, q1 \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q15 \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent
    "vmov.u8 d2, d4 \n"

    "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
    "vtbl.u8 d4, {d0, d1, d2}, d29 \n"

    "vst1.8 {d3}, [%1]! \n"
    "vst1.32 {d4[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),       // %0
    "+r"(dst_ptr),       // %1
    "+r"(dst_width),     // %2
    "+r"(src_stride)     // %3
  : "r"(&kMult38_Div6),  // %4
    "r"(&kShuf38_2),     // %5
    "r"(&kMult38_Div9)   // %6
  : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
    "q13", "q14", "q15", "memory", "cc"
  );
}

// 32x2 -> 12x1
void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                               ptrdiff_t src_stride,
                               uint8* dst_ptr, int dst_width) {
  asm volatile (
    "vld1.16 {q13}, [%4] \n"
    "vld1.8 {q14}, [%5] \n"
    "add %3, %0 \n"
    ".p2align 2 \n"
    "1: \n"

    // d0 = 00 40 01 41 02 42 03 43
    // d1 = 10 50 11 51 12 52 13 53
    // d2 = 20 60 21 61 22 62 23 63
    // d3 = 30 70 31 71 32 72 33 73
    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"
    "vld4.8 {d4, d5, d6, d7}, [%3]! \n"
    "subs %2, %2, #12 \n"

    // Shuffle the input data around to align the data
    // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
    // d0 = 00 10 01 11 02 12 03 13
    // d1 = 40 50 41 51 42 52 43 53
    "vtrn.u8 d0, d1 \n"
    "vtrn.u8 d4, d5 \n"

    // d2 = 20 30 21 31 22 32 23 33
    // d3 = 60 70 61 71 62 72 63 73
    "vtrn.u8 d2, d3 \n"
    "vtrn.u8 d6, d7 \n"

    // d0 = 00+10 01+11 02+12 03+13
    // d2 = 40+50 41+51 42+52 43+53
    "vpaddl.u8 q0, q0 \n"
    "vpaddl.u8 q2, q2 \n"

    // d3 = 60+70 61+71 62+72 63+73
    "vpaddl.u8 d3, d3 \n"
    "vpaddl.u8 d7, d7 \n"

    // combine source lines
    "vadd.u16 q0, q2 \n"
    "vadd.u16 d4, d3, d7 \n"

    // dst_ptr[3] = (s[6] + s[7] + s[6+st] + s[7+st]) / 4
    "vqrshrn.u16 d4, q2, #2 \n"

    // Shuffle 2,3 reg around so that 2 can be added to the
    // 0,1 reg and 3 can be added to the 4,5 reg. This
    // requires expanding from u8 to u16 as the 0,1 and 4,5
    // registers are already expanded. Then do transposes
    // to get aligned.
    // q2 = xx 20 xx 30 xx 21 xx 31 xx 22 xx 32 xx 23 xx 33
    "vmovl.u8 q1, d2 \n"
    "vmovl.u8 q3, d6 \n"

    // combine source lines
    "vadd.u16 q1, q3 \n"

    // d4 = xx 20 xx 30 xx 22 xx 32
    // d5 = xx 21 xx 31 xx 23 xx 33
    "vtrn.u32 d2, d3 \n"

    // d4 = xx 20 xx 21 xx 22 xx 23
    // d5 = xx 30 xx 31 xx 32 xx 33
    "vtrn.u16 d2, d3 \n"

    // 0+1+2, 3+4+5
    "vadd.u16 q0, q1 \n"

    // Need to divide, but can't downshift as the value
    // isn't a power of 2. So multiply by 65536 / n
    // and take the upper 16 bits.
    "vqrdmulh.s16 q0, q0, q13 \n"

    // Align for table lookup, vtbl requires registers to
    // be adjacent
    "vmov.u8 d2, d4 \n"

    "vtbl.u8 d3, {d0, d1, d2}, d28 \n"
    "vtbl.u8 d4, {d0, d1, d2}, d29 \n"

    "vst1.8 {d3}, [%1]! \n"
    "vst1.32 {d4[0]}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),       // %0
    "+r"(dst_ptr),       // %1
    "+r"(dst_width),     // %2
    "+r"(src_stride)     // %3
  : "r"(&kMult38_Div6),  // %4
    "r"(&kShuf38_2)      // %5
  : "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc"
  );
}

// 16x2 -> 16x1
void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction) {
  asm volatile (
    "cmp %4, #0 \n"
    "beq 100f \n"
    "add %2, %1 \n"
    "cmp %4, #64 \n"
    "beq 75f \n"
    "cmp %4, #128 \n"
    "beq 50f \n"
    "cmp %4, #192 \n"
    "beq 25f \n"

    "vdup.8 d5, %4 \n"
    "rsb %4, #256 \n"
    "vdup.8 d4, %4 \n"
    // General purpose row blend.
    "1: \n"
    "vld1.8 {q0}, [%1]! \n"
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vmull.u8 q13, d0, d4 \n"
    "vmull.u8 q14, d1, d4 \n"
    "vmlal.u8 q13, d2, d5 \n"
    "vmlal.u8 q14, d3, d5 \n"
    "vrshrn.u16 d0, q13, #8 \n"
    "vrshrn.u16 d1, q14, #8 \n"
    "vst1.8 {q0}, [%0]! \n"
    "bgt 1b \n"
    "b 99f \n"

    // Blend 25 / 75.
    "25: \n"
    "vld1.8 {q0}, [%1]! \n"
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vrhadd.u8 q0, q1 \n"
    "vst1.8 {q0}, [%0]! \n"
    "bgt 25b \n"
    "b 99f \n"

    // Blend 50 / 50.
    "50: \n"
    "vld1.8 {q0}, [%1]! \n"
    "vld1.8 {q1}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vst1.8 {q0}, [%0]! \n"
    "bgt 50b \n"
    "b 99f \n"

    // Blend 75 / 25.
    "75: \n"
    "vld1.8 {q1}, [%1]! \n"
    "vld1.8 {q0}, [%2]! \n"
    "subs %3, %3, #16 \n"
    "vrhadd.u8 q0, q1 \n"
    "vrhadd.u8 q0, q1 \n"
    "vst1.8 {q0}, [%0]! \n"
    "bgt 75b \n"
    "b 99f \n"

    // Blend 100 / 0 - Copy row unchanged.
    "100: \n"
    "vld1.8 {q0}, [%1]! \n"
    "subs %3, %3, #16 \n"
    "vst1.8 {q0}, [%0]! \n"
    "bgt 100b \n"

    "99: \n"
    "vst1.8 {d1[7]}, [%0] \n"
  : "+r"(dst_ptr),           // %0
    "+r"(src_ptr),           // %1
    "+r"(src_stride),        // %2
    "+r"(dst_width),         // %3
    "+r"(source_y_fraction)  // %4
  :
  : "q0", "q1", "d4", "d5", "q13", "q14", "memory", "cc"
  );
}

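// Editor's sketch: the general blend path above computes a weighted average of
// two rows, where source_y_fraction (0..255) is the weight of the second row.
// The 0/64/128/192 special cases are faster approximations of the same blend,
// and the final vst1.8 {d1[7]} replicates the last output pixel once more.
// Illustrative helper only.
static void ScaleFilterRows_Sketch_C(uint8* dst_ptr,
                                     const uint8* src_ptr, ptrdiff_t src_stride,
                                     int dst_width, int source_y_fraction) {
  int y1 = source_y_fraction;   // weight of the second row
  int y0 = 256 - y1;            // weight of the first row
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] =
        (src_ptr[x] * y0 + src_ptr[src_stride + x] * y1 + 128) >> 8;
  }
  dst_ptr[dst_width] = dst_ptr[dst_width - 1];  // replicate last pixel
}
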
void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
  asm volatile (
    ".p2align 2 \n"
    "1: \n"
    // load even pixels into q0, odd into q1
    "vld2.32 {q0, q1}, [%0]! \n"
    "vld2.32 {q2, q3}, [%0]! \n"
    "subs %2, %2, #8 \n"  // 8 processed per loop
    "vst1.8 {q1}, [%1]! \n"  // store odd pixels
    "vst1.8 {q3}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst),       // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}

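// Editor's sketch: the ARGB loop above keeps the odd pixel of each pair,
// treating every pixel as one 32-bit value. Illustrative helper only.
static void ScaleARGBRowDown2_Sketch_C(const uint8* src_ptr, uint8* dst,
                                       int dst_width) {
  const uint32* src = (const uint32*)(src_ptr);
  uint32* dst32 = (uint32*)(dst);
  for (int x = 0; x < dst_width; ++x) {
    dst32[x] = src[2 * x + 1];
  }
}
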
void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                               uint8* dst, int dst_width) {
  asm volatile (
    // change the stride to row 2 pointer
    "add %1, %1, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 ARGB pixels.
    "vld4.8 {d1, d3, d5, d7}, [%0]! \n"  // load next 8 ARGB pixels.
    "subs %3, %3, #8 \n"  // 8 processed per loop.
    "vpaddl.u8 q0, q0 \n"  // B 16 bytes -> 8 shorts.
    "vpaddl.u8 q1, q1 \n"  // G 16 bytes -> 8 shorts.
    "vpaddl.u8 q2, q2 \n"  // R 16 bytes -> 8 shorts.
    "vpaddl.u8 q3, q3 \n"  // A 16 bytes -> 8 shorts.
    "vld4.8 {d16, d18, d20, d22}, [%1]! \n"  // load 8 more ARGB pixels.
    "vld4.8 {d17, d19, d21, d23}, [%1]! \n"  // load last 8 ARGB pixels.
    "vpadal.u8 q0, q8 \n"  // B 16 bytes -> 8 shorts.
    "vpadal.u8 q1, q9 \n"  // G 16 bytes -> 8 shorts.
    "vpadal.u8 q2, q10 \n"  // R 16 bytes -> 8 shorts.
    "vpadal.u8 q3, q11 \n"  // A 16 bytes -> 8 shorts.
    "vrshrn.u16 d0, q0, #2 \n"  // downshift, round and pack
    "vrshrn.u16 d1, q1, #2 \n"
    "vrshrn.u16 d2, q2, #2 \n"
    "vrshrn.u16 d3, q3, #2 \n"
    "vst4.8 {d0, d1, d2, d3}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(src_stride),  // %1
    "+r"(dst),         // %2
    "+r"(dst_width)    // %3
  :
  : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx, uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov r12, %3, lsl #2 \n"
    ".p2align 2 \n"
    "1: \n"
    "vld1.32 {d0[0]}, [%0], r12 \n"
    "vld1.32 {d0[1]}, [%0], r12 \n"
    "vld1.32 {d1[0]}, [%0], r12 \n"
    "vld1.32 {d1[1]}, [%0], r12 \n"
    "subs %2, %2, #4 \n"  // 4 pixels per loop.
    "vst1.8 {q0}, [%1]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  : "r"(src_stepx)   // %3
  : "memory", "cc", "r12", "q0"
  );
}

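// Editor's sketch: the loop above copies every src_stepx-th ARGB pixel
// (r12 holds the step in bytes). Illustrative helper only.
static void ScaleARGBRowDownEven_Sketch_C(const uint8* src_argb, int src_stepx,
                                          uint8* dst_argb, int dst_width) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}
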
// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    "mov r12, %4, lsl #2 \n"
    "add %1, %1, %0 \n"
    ".p2align 2 \n"
    "1: \n"
    "vld1.8 {d0}, [%0], r12 \n"  // Read 4 2x2 blocks -> 2x1
    "vld1.8 {d1}, [%1], r12 \n"
    "vld1.8 {d2}, [%0], r12 \n"
    "vld1.8 {d3}, [%1], r12 \n"
    "vld1.8 {d4}, [%0], r12 \n"
    "vld1.8 {d5}, [%1], r12 \n"
    "vld1.8 {d6}, [%0], r12 \n"
    "vld1.8 {d7}, [%1], r12 \n"
    "vaddl.u8 q0, d0, d1 \n"
    "vaddl.u8 q1, d2, d3 \n"
    "vaddl.u8 q2, d4, d5 \n"
    "vaddl.u8 q3, d6, d7 \n"
    "vswp.8 d1, d2 \n"  // ab_cd -> ac_bd
    "vswp.8 d5, d6 \n"  // ef_gh -> eg_fh
    "vadd.u16 q0, q0, q1 \n"  // (a+b)_(c+d)
    "vadd.u16 q2, q2, q3 \n"  // (e+f)_(g+h)
    "vrshrn.u16 d0, q0, #2 \n"  // first 2 pixels.
    "vrshrn.u16 d1, q2, #2 \n"  // next 2 pixels.
    "subs %3, %3, #4 \n"  // 4 pixels per loop.
    "vst1.8 {q0}, [%2]! \n"
    "bgt 1b \n"
  : "+r"(src_argb),    // %0
    "+r"(src_stride),  // %1
    "+r"(dst_argb),    // %2
    "+r"(dst_width)    // %3
  : "r"(src_stepx)     // %4
  : "memory", "cc", "r12", "q0", "q1", "q2", "q3"
  );
}

#endif  // __ARM_NEON__

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif