Thu, 22 Jan 2015 13:21:57 +0100
Incorporate the changes requested in the Mozilla review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPeg full range.
static const vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};

static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static const vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static const vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// vpermd for vphaddw + vpackuswb vpermd.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
};

// Constants for BGRA.
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

static const vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static const uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
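
// For reference, the coefficient tables above implement the usual libyuv
// fixed-point approximations: Y uses 7-bit coefficients plus the kAddY16
// bias, U/V use 8-bit coefficients plus the kAddUV128 bias, and pmaddubsw
// pairs each source byte (B, G, R, A in memory order) with its signed
// coefficient. The scalar sketch below is illustrative only -- the name is
// not part of libyuv and nothing in this file calls it -- and it assumes
// arithmetic right shift of negative values, which is what psraw does.
static __inline void ARGBPixelToYUV_Reference(uint8 b, uint8 g, uint8 r,
                                              uint8* y, uint8* u, uint8* v) {
  *y = (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);    // kARGBToY + 16
  *u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);  // kARGBToU + 128
  *v = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128); // kARGBToV + 128
}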

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
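
// The tables above are pshufb control masks: output byte i is taken from
// input byte mask[i] & 15, and any mask byte with its high bit set (128u)
// zeroes the output byte. In kShuffleMaskRGB24ToARGB, for example, indices
// 0,1,2 pick up one packed B,G,R triplet and the fourth slot is forced to
// 0xff afterwards by OR'ing the 0xff000000 alpha mask. A scalar sketch of
// the pshufb behaviour (illustrative name, 16-byte buffers assumed, not
// used by the row functions):
static __inline void PshufbReference(const uint8* src, const uint8* mask,
                                     uint8* dst) {
  int i;
  for (i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0u : src[mask[i] & 15];
  }
}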

// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_y
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5   // generate mask 0xff000000
    pslld xmm5, 24

    align 4
  convertloop:
    movq xmm0, qword ptr [eax]
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm0
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0
    punpckhwd xmm1, xmm1
    por xmm0, xmm5
    por xmm1, xmm5
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}
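
// A plain-C sketch of the expansion the SSE2 kernel above performs for one
// luma byte (illustrative only; assumes the little-endian B,G,R,A byte order
// used for ARGB throughout this file):
static __inline void I400ToARGBPixel_Reference(uint8 y, uint8* dst_argb) {
  dst_argb[0] = y;     // B
  dst_argb[1] = y;     // G
  dst_argb[2] = y;     // R
  dst_argb[3] = 255u;  // A - the 0xff000000 mask OR'd in by the kernel
}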

__declspec(naked) __declspec(align(16))
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
                                  int pix) {
  __asm {
    mov eax, [esp + 4]   // src_y
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5   // generate mask 0xff000000
    pslld xmm5, 24

    align 4
  convertloop:
    movq xmm0, qword ptr [eax]
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm0
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0
    punpckhwd xmm1, xmm1
    por xmm0, xmm5
    por xmm1, xmm5
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_rgb24
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5   // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, kShuffleMaskRGB24ToARGB

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb xmm0, xmm4
    movdqa [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqa [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm4
    movdqa [edx + 16], xmm1
    por xmm3, xmm5
    sub ecx, 16
    movdqa [edx + 48], xmm3
    lea edx, [edx + 64]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
  __asm {
    mov eax, [esp + 4]   // src_raw
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5   // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, kShuffleMaskRAWToARGB

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8  // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12  // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb xmm0, xmm4
    movdqa [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqa [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4  // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm4
    movdqa [edx + 16], xmm1
    por xmm3, xmm5
    sub ecx, 16
    movdqa [edx + 48], xmm3
    lea edx, [edx + 64]
    jg convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked) __declspec(align(16))
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                          int pix) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3   // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    pcmpeqb xmm4, xmm4   // generate mask 0x07e007e0 for Green
    psllw xmm4, 10
    psrlw xmm4, 5
    pcmpeqb xmm7, xmm7   // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8

    mov eax, [esp + 4]   // src_rgb565
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    sub edx, eax
    sub edx, eax

    align 4
  convertloop:
    movdqu xmm0, [eax]   // fetch 8 pixels of bgr565
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    pand xmm1, xmm3      // R in upper 5 bits
    psllw xmm2, 11       // B in upper 5 bits
    pmulhuw xmm1, xmm5   // * (256 + 8)
    pmulhuw xmm2, xmm5   // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2       // RB
    pand xmm0, xmm4      // G in middle 6 bits
    pmulhuw xmm0, xmm6   // << 5 * (256 + 4)
    por xmm0, xmm7       // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqa [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqa [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
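
// Concretely, replicating a 5-bit value v to 8 bits is (v << 3) | (v >> 2),
// and because v * 0x0108 == (v << 8) | (v << 3) for a 5-bit v, the kernel
// gets the same result from the high half of the pmulhuw product once the
// field has been shifted to the top of a 16-bit lane. A scalar sketch for a
// single RGB565 pixel (illustrative name, not used by the row functions):
static __inline void RGB565ToARGBPixel_Reference(uint16 rgb565,
                                                 uint8* dst_argb) {
  uint8 b5 = (uint8)(rgb565 & 0x1f);
  uint8 g6 = (uint8)((rgb565 >> 5) & 0x3f);
  uint8 r5 = (uint8)((rgb565 >> 11) & 0x1f);
  dst_argb[0] = (uint8)((b5 << 3) | (b5 >> 2));  // B: high bits replicated
  dst_argb[1] = (uint8)((g6 << 2) | (g6 >> 4));  // G: high bits replicated
  dst_argb[2] = (uint8)((r5 << 3) | (r5 >> 2));  // R: high bits replicated
  dst_argb[3] = 255u;                            // A: opaque
}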

// 24 instructions
__declspec(naked) __declspec(align(16))
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov eax, 0x01080108  // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3   // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    movdqa xmm4, xmm3    // generate mask 0x03e003e0 for Green
    psrlw xmm4, 6
    pcmpeqb xmm7, xmm7   // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8

    mov eax, [esp + 4]   // src_argb1555
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    sub edx, eax
    sub edx, eax

    align 4
  convertloop:
    movdqu xmm0, [eax]   // fetch 8 pixels of 1555
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    psllw xmm1, 1        // R in upper 5 bits
    psllw xmm2, 11       // B in upper 5 bits
    pand xmm1, xmm3
    pmulhuw xmm2, xmm5   // * (256 + 8)
    pmulhuw xmm1, xmm5   // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2       // RB
    movdqa xmm2, xmm0
    pand xmm0, xmm4      // G in middle 5 bits
    psraw xmm2, 8        // A
    pmulhuw xmm0, xmm6   // << 6 * (256 + 8)
    pand xmm2, xmm7
    por xmm0, xmm2       // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqa [eax * 2 + edx], xmm1       // store 4 pixels of ARGB
    movdqa [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked) __declspec(align(16))
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
    movd xmm4, eax
    pshufd xmm4, xmm4, 0
    movdqa xmm5, xmm4    // 0xf0f0f0f0 for high nibbles
    pslld xmm5, 4
    mov eax, [esp + 4]   // src_argb4444
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    sub edx, eax
    sub edx, eax

    align 4
  convertloop:
    movdqu xmm0, [eax]   // fetch 8 pixels of bgra4444
    movdqa xmm2, xmm0
    pand xmm0, xmm4      // mask low nibbles
    pand xmm2, xmm5      // mask high nibbles
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    psllw xmm1, 4
    psrlw xmm3, 4
    por xmm0, xmm1
    por xmm2, xmm3
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqa [eax * 2 + edx], xmm0       // store 4 pixels of ARGB
    movdqa [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    movdqa xmm6, kShuffleMaskARGBToRGB24

    align 4
  convertloop:
    movdqu xmm0, [eax]   // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1    // 4 bytes from 1 for 0
    psrldq xmm1, 4       // 8 bytes from 1
    pslldq xmm4, 12      // 4 bytes from 1 for 0
    movdqa xmm5, xmm2    // 8 bytes from 2 for 1
    por xmm0, xmm4       // 4 bytes from 1 for 0
    pslldq xmm5, 8       // 8 bytes from 2 for 1
    movdqu [edx], xmm0   // store 0
    por xmm1, xmm5       // 8 bytes from 2 for 1
    psrldq xmm2, 8       // 4 bytes from 2
    pslldq xmm3, 4       // 12 bytes from 3 for 2
    por xmm2, xmm3       // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1  // store 1
    movdqu [edx + 32], xmm2  // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    movdqa xmm6, kShuffleMaskARGBToRAW

    align 4
  convertloop:
    movdqu xmm0, [eax]   // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1    // 4 bytes from 1 for 0
    psrldq xmm1, 4       // 8 bytes from 1
    pslldq xmm4, 12      // 4 bytes from 1 for 0
    movdqa xmm5, xmm2    // 8 bytes from 2 for 1
    por xmm0, xmm4       // 4 bytes from 1 for 0
    pslldq xmm5, 8       // 8 bytes from 2 for 1
    movdqu [edx], xmm0   // store 0
    por xmm1, xmm5       // 8 bytes from 2 for 1
    psrldq xmm2, 8       // 4 bytes from 2
    pslldq xmm3, 4       // 12 bytes from 3 for 2
    por xmm2, xmm3       // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1  // store 1
    movdqu [edx + 32], xmm2  // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm3, xmm3   // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4   // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5   // generate mask 0xfffff800
    pslld xmm5, 11

    align 4
  convertloop:
    movdqa xmm0, [eax]   // fetch 4 pixels of argb
    movdqa xmm1, xmm0    // B
    movdqa xmm2, xmm0    // G
    pslld xmm0, 8        // R
    psrld xmm1, 3        // B
    psrld xmm2, 5        // G
    psrad xmm0, 16       // R
    pand xmm1, xmm3      // B
    pand xmm2, xmm4      // G
    pand xmm0, xmm5      // R
    por xmm1, xmm2       // BG
    por xmm0, xmm1       // BGR
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) __declspec(align(16))
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm4, xmm4   // generate mask 0x0000001f
    psrld xmm4, 27
    movdqa xmm5, xmm4    // generate mask 0x000003e0
    pslld xmm5, 5
    movdqa xmm6, xmm4    // generate mask 0x00007c00
    pslld xmm6, 10
    pcmpeqb xmm7, xmm7   // generate mask 0xffff8000
    pslld xmm7, 15

    align 4
  convertloop:
    movdqa xmm0, [eax]   // fetch 4 pixels of argb
    movdqa xmm1, xmm0    // B
    movdqa xmm2, xmm0    // G
    movdqa xmm3, xmm0    // R
    psrad xmm0, 16       // A
    psrld xmm1, 3        // B
    psrld xmm2, 6        // G
    psrld xmm3, 9        // R
    pand xmm0, xmm7      // A
    pand xmm1, xmm4      // B
    pand xmm2, xmm5      // G
    pand xmm3, xmm6      // R
    por xmm0, xmm1       // BA
    por xmm2, xmm3       // GR
    por xmm0, xmm2       // BGRA
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4]   // src_argb
    mov edx, [esp + 8]   // dst_rgb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm4, xmm4   // generate mask 0xf000f000
    psllw xmm4, 12
    movdqa xmm3, xmm4    // generate mask 0x00f000f0
    psrlw xmm3, 8

    align 4
  convertloop:
    movdqa xmm0, [eax]   // fetch 4 pixels of argb
    movdqa xmm1, xmm0
    pand xmm0, xmm3      // low nibble
    pand xmm1, xmm4      // high nibble
    psrl xmm0, 4
    psrl xmm1, 8
    por xmm0, xmm1
    packuswb xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kARGBToY

    align 4
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm4, kARGBToYJ
    movdqa xmm5, kAddYJ64

    align 4
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    paddw xmm0, xmm5     // Add .5 for rounding.
    paddw xmm2, xmm5
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToY
    vbroadcastf128 ymm5, kAddY16
    vmovdqa ymm6, kPermdARGBToY_AVX

    align 4
  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea eax, [eax + 128]
    vphaddw ymm0, ymm0, ymm1  // mutates.
    vphaddw ymm2, ymm2, ymm3
    vpsrlw ymm0, ymm0, 7
    vpsrlw ymm2, ymm2, 7
    vpackuswb ymm0, ymm0, ymm2  // mutates.
    vpermd ymm0, ymm6, ymm0     // For vphaddw + vpackuswb mutation.
    vpaddb ymm0, ymm0, ymm5
    sub ecx, 32
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    jg convertloop
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBTOYROW_AVX2

#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    vbroadcastf128 ymm4, kARGBToYJ
    vbroadcastf128 ymm5, kAddYJ64
    vmovdqa ymm6, kPermdARGBToY_AVX

    align 4
  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea eax, [eax + 128]
    vphaddw ymm0, ymm0, ymm1  // mutates.
    vphaddw ymm2, ymm2, ymm3
    vpaddw ymm0, ymm0, ymm5   // Add .5 for rounding.
    vpaddw ymm2, ymm2, ymm5
    vpsrlw ymm0, ymm0, 7
    vpsrlw ymm2, ymm2, 7
    vpackuswb ymm0, ymm0, ymm2  // mutates.
    vpermd ymm0, ymm6, ymm0     // For vphaddw + vpackuswb mutation.
    sub ecx, 32
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    jg convertloop

    vzeroupper
    ret
  }
}
#endif // HAS_ARGBTOYJROW_AVX2

__declspec(naked) __declspec(align(16))
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kARGBToY

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm4, kARGBToYJ
    movdqa xmm5, kAddYJ64

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    paddw xmm0, xmm5
    paddw xmm2, xmm5
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kBGRAToY

    align 4
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kBGRAToY

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kABGRToY

    align 4
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kABGRToY

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kRGBAToY

    align 4
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kRGBAToY

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kARGBToU
    movdqa xmm6, kARGBToV
    movdqa xmm5, kAddUV128
    sub edi, edx             // stride from u to v

    align 4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pavgb xmm0, [eax + esi]
    pavgb xmm1, [eax + esi + 16]
    pavgb xmm2, [eax + esi + 32]
    pavgb xmm3, [eax + esi + 48]
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5      // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub ecx, 16
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
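
// A scalar sketch of what one iteration above computes for each 2x2 block of
// ARGB pixels (illustrative only; src points at the top-left pixel of the
// block and src_stride_argb is the byte stride to the next row; the pavgb
// cascade in the kernel rounds slightly differently from a single 4-way
// average, and right shifts of negative values are assumed arithmetic, as
// psraw is):
static __inline void ARGBBlockToUV_Reference(const uint8* src,
                                             int src_stride_argb,
                                             uint8* u, uint8* v) {
  // step 1 - average the 2x2 block, one channel at a time (B, G, R)
  int b = (src[0] + src[4] + src[src_stride_argb + 0] +
           src[src_stride_argb + 4] + 2) >> 2;
  int g = (src[1] + src[5] + src[src_stride_argb + 1] +
           src[src_stride_argb + 5] + 2) >> 2;
  int r = (src[2] + src[6] + src[src_stride_argb + 2] +
           src[src_stride_argb + 6] + 2) >> 2;
  // step 2 - apply kARGBToU / kARGBToV and bias by kAddUV128
  *u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}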

__declspec(naked) __declspec(align(16))
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    movdqa xmm7, kARGBToUJ
    movdqa xmm6, kARGBToVJ
    movdqa xmm5, kAddUVJ128
    sub edi, edx             // stride from u to v

    align 4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pavgb xmm0, [eax + esi]
    pavgb xmm1, [eax + esi + 16]
    pavgb xmm2, [eax + esi + 32]
    pavgb xmm3, [eax + esi + 48]
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7  // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6  // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    paddw xmm0, xmm5      // +.5 rounding -> unsigned
    paddw xmm1, xmm5
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    sub ecx, 16
    movlps qword ptr [edx], xmm0        // U
    movhps qword ptr [edx + edi], xmm0  // V
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

#ifdef HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(32))
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]   // src_argb
    mov esi, [esp + 8 + 8]   // src_stride_argb
    mov edx, [esp + 8 + 12]  // dst_u
    mov edi, [esp + 8 + 16]  // dst_v
    mov ecx, [esp + 8 + 20]  // pix
    vbroadcastf128 ymm5, kAddUV128
    vbroadcastf128 ymm6, kARGBToV
    vbroadcastf128 ymm7, kARGBToU
    sub edi, edx             // stride from u to v

    align 4
  convertloop:
    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpavgb ymm0, ymm0, [eax + esi]
    vpavgb ymm1, ymm1, [eax + esi + 32]
    vpavgb ymm2, ymm2, [eax + esi + 64]
    vpavgb ymm3, ymm3, [eax + esi + 96]
    lea eax, [eax + 128]
    vshufps ymm4, ymm0, ymm1, 0x88
    vshufps ymm0, ymm0, ymm1, 0xdd
    vpavgb ymm0, ymm0, ymm4  // mutated by vshufps
    vshufps ymm4, ymm2, ymm3, 0x88
    vshufps ymm2, ymm2, ymm3, 0xdd
    vpavgb ymm2, ymm2, ymm4  // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, it's 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7  // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6  // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw ymm1, ymm1, ymm3     // mutates
    vphaddw ymm0, ymm0, ymm2
    vpsraw ymm1, ymm1, 8
    vpsraw ymm0, ymm0, 8
    vpacksswb ymm0, ymm1, ymm0   // mutates
    vpermq ymm0, ymm0, 0xd8      // For vpacksswb
    vpshufb ymm0, ymm0, kShufARGBToUV_AVX  // For vshufps + vphaddw
    vpaddb ymm0, ymm0, ymm5      // -> unsigned

    // step 3 - store 16 U and 16 V values
    sub ecx, 32
    vextractf128 [edx], ymm0, 0        // U
    vextractf128 [edx + edi], ymm0, 1  // V
    lea edx, [edx + 16]
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBTOUVROW_AVX2
michael@0 | 1248 | |
michael@0 | 1249 | __declspec(naked) __declspec(align(16)) |
michael@0 | 1250 | void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, |
michael@0 | 1251 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1252 | __asm { |
michael@0 | 1253 | push esi |
michael@0 | 1254 | push edi |
michael@0 | 1255 | mov eax, [esp + 8 + 4] // src_argb |
michael@0 | 1256 | mov esi, [esp + 8 + 8] // src_stride_argb |
michael@0 | 1257 | mov edx, [esp + 8 + 12] // dst_u |
michael@0 | 1258 | mov edi, [esp + 8 + 16] // dst_v |
michael@0 | 1259 | mov ecx, [esp + 8 + 20] // pix |
michael@0 | 1260 | movdqa xmm7, kARGBToU |
michael@0 | 1261 | movdqa xmm6, kARGBToV |
michael@0 | 1262 | movdqa xmm5, kAddUV128 |
michael@0 | 1263 | sub edi, edx // stride from u to v |
michael@0 | 1264 | |
michael@0 | 1265 | align 4 |
michael@0 | 1266 | convertloop: |
michael@0 | 1267 | /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
michael@0 | 1268 | movdqu xmm0, [eax] |
michael@0 | 1269 | movdqu xmm1, [eax + 16] |
michael@0 | 1270 | movdqu xmm2, [eax + 32] |
michael@0 | 1271 | movdqu xmm3, [eax + 48] |
michael@0 | 1272 | movdqu xmm4, [eax + esi] |
michael@0 | 1273 | pavgb xmm0, xmm4 |
michael@0 | 1274 | movdqu xmm4, [eax + esi + 16] |
michael@0 | 1275 | pavgb xmm1, xmm4 |
michael@0 | 1276 | movdqu xmm4, [eax + esi + 32] |
michael@0 | 1277 | pavgb xmm2, xmm4 |
michael@0 | 1278 | movdqu xmm4, [eax + esi + 48] |
michael@0 | 1279 | pavgb xmm3, xmm4 |
michael@0 | 1280 | lea eax, [eax + 64] |
michael@0 | 1281 | movdqa xmm4, xmm0 |
michael@0 | 1282 | shufps xmm0, xmm1, 0x88 |
michael@0 | 1283 | shufps xmm4, xmm1, 0xdd |
michael@0 | 1284 | pavgb xmm0, xmm4 |
michael@0 | 1285 | movdqa xmm4, xmm2 |
michael@0 | 1286 | shufps xmm2, xmm3, 0x88 |
michael@0 | 1287 | shufps xmm4, xmm3, 0xdd |
michael@0 | 1288 | pavgb xmm2, xmm4 |
michael@0 | 1289 | |
michael@0 | 1290 | // step 2 - convert to U and V |
michael@0 | 1291 | // from here down is very similar to Y code except |
michael@0 | 1292 | // instead of 16 different pixels, its 8 pixels of U and 8 of V |
michael@0 | 1293 | movdqa xmm1, xmm0 |
michael@0 | 1294 | movdqa xmm3, xmm2 |
michael@0 | 1295 | pmaddubsw xmm0, xmm7 // U |
michael@0 | 1296 | pmaddubsw xmm2, xmm7 |
michael@0 | 1297 | pmaddubsw xmm1, xmm6 // V |
michael@0 | 1298 | pmaddubsw xmm3, xmm6 |
michael@0 | 1299 | phaddw xmm0, xmm2 |
michael@0 | 1300 | phaddw xmm1, xmm3 |
michael@0 | 1301 | psraw xmm0, 8 |
michael@0 | 1302 | psraw xmm1, 8 |
michael@0 | 1303 | packsswb xmm0, xmm1 |
michael@0 | 1304 | paddb xmm0, xmm5 // -> unsigned |
michael@0 | 1305 | |
michael@0 | 1306 | // step 3 - store 8 U and 8 V values |
michael@0 | 1307 | sub ecx, 16 |
michael@0 | 1308 | movlps qword ptr [edx], xmm0 // U |
michael@0 | 1309 | movhps qword ptr [edx + edi], xmm0 // V |
michael@0 | 1310 | lea edx, [edx + 8] |
michael@0 | 1311 | jg convertloop |
michael@0 | 1312 | |
michael@0 | 1313 | pop edi |
michael@0 | 1314 | pop esi |
michael@0 | 1315 | ret |
michael@0 | 1316 | } |
michael@0 | 1317 | } |
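For reference, here is a minimal scalar sketch (not part of libyuv; the helper name and signature are illustrative) of what the ARGBToUVRow kernels above compute for one 2x2 block of ARGB pixels. The SIMD path averages rows and then columns with pavgb, which rounds up at each step, so the single rounded average below is only approximately equivalent; the coefficients mirror kARGBToU / kARGBToV, the >>8 mirrors psraw 8, and the +128 bias corresponds to the kAddUV128 add that the "-> unsigned" comment refers to (128 per byte is an assumption here, since that constant is defined elsewhere in the file).

static void ARGBToUV_2x2_Sketch(const uint8* argb, int stride_bytes,
                                uint8* u, uint8* v) {
  // Average the 2x2 block per channel; ARGB is stored B, G, R, A in memory.
  int b = (argb[0] + argb[4] + argb[stride_bytes + 0] + argb[stride_bytes + 4] + 2) >> 2;
  int g = (argb[1] + argb[5] + argb[stride_bytes + 1] + argb[stride_bytes + 5] + 2) >> 2;
  int r = (argb[2] + argb[6] + argb[stride_bytes + 2] + argb[stride_bytes + 6] + 2) >> 2;
  // Same coefficients as kARGBToU / kARGBToV, arithmetic >>8 as psraw, +128 bias.
  *u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}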
michael@0 | 1318 | |
michael@0 | 1319 | __declspec(naked) __declspec(align(16)) |
michael@0 | 1320 | void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, |
michael@0 | 1321 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1322 | __asm { |
michael@0 | 1323 | push esi |
michael@0 | 1324 | push edi |
michael@0 | 1325 | mov eax, [esp + 8 + 4] // src_argb |
michael@0 | 1326 | mov esi, [esp + 8 + 8] // src_stride_argb |
michael@0 | 1327 | mov edx, [esp + 8 + 12] // dst_u |
michael@0 | 1328 | mov edi, [esp + 8 + 16] // dst_v |
michael@0 | 1329 | mov ecx, [esp + 8 + 20] // pix |
michael@0 | 1330 | movdqa xmm7, kARGBToUJ |
michael@0 | 1331 | movdqa xmm6, kARGBToVJ |
michael@0 | 1332 | movdqa xmm5, kAddUVJ128 |
michael@0 | 1333 | sub edi, edx // stride from u to v |
michael@0 | 1334 | |
michael@0 | 1335 | align 4 |
michael@0 | 1336 | convertloop: |
michael@0 | 1337 | /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
michael@0 | 1338 | movdqu xmm0, [eax] |
michael@0 | 1339 | movdqu xmm1, [eax + 16] |
michael@0 | 1340 | movdqu xmm2, [eax + 32] |
michael@0 | 1341 | movdqu xmm3, [eax + 48] |
michael@0 | 1342 | movdqu xmm4, [eax + esi] |
michael@0 | 1343 | pavgb xmm0, xmm4 |
michael@0 | 1344 | movdqu xmm4, [eax + esi + 16] |
michael@0 | 1345 | pavgb xmm1, xmm4 |
michael@0 | 1346 | movdqu xmm4, [eax + esi + 32] |
michael@0 | 1347 | pavgb xmm2, xmm4 |
michael@0 | 1348 | movdqu xmm4, [eax + esi + 48] |
michael@0 | 1349 | pavgb xmm3, xmm4 |
michael@0 | 1350 | lea eax, [eax + 64] |
michael@0 | 1351 | movdqa xmm4, xmm0 |
michael@0 | 1352 | shufps xmm0, xmm1, 0x88 |
michael@0 | 1353 | shufps xmm4, xmm1, 0xdd |
michael@0 | 1354 | pavgb xmm0, xmm4 |
michael@0 | 1355 | movdqa xmm4, xmm2 |
michael@0 | 1356 | shufps xmm2, xmm3, 0x88 |
michael@0 | 1357 | shufps xmm4, xmm3, 0xdd |
michael@0 | 1358 | pavgb xmm2, xmm4 |
michael@0 | 1359 | |
michael@0 | 1360 | // step 2 - convert to U and V |
michael@0 | 1361 | // from here down is very similar to Y code except |
michael@0 | 1362 | // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
michael@0 | 1363 | movdqa xmm1, xmm0 |
michael@0 | 1364 | movdqa xmm3, xmm2 |
michael@0 | 1365 | pmaddubsw xmm0, xmm7 // U |
michael@0 | 1366 | pmaddubsw xmm2, xmm7 |
michael@0 | 1367 | pmaddubsw xmm1, xmm6 // V |
michael@0 | 1368 | pmaddubsw xmm3, xmm6 |
michael@0 | 1369 | phaddw xmm0, xmm2 |
michael@0 | 1370 | phaddw xmm1, xmm3 |
michael@0 | 1371 | paddw xmm0, xmm5 // +.5 rounding -> unsigned |
michael@0 | 1372 | paddw xmm1, xmm5 |
michael@0 | 1373 | psraw xmm0, 8 |
michael@0 | 1374 | psraw xmm1, 8 |
michael@0 | 1375 | packsswb xmm0, xmm1 |
michael@0 | 1376 | |
michael@0 | 1377 | // step 3 - store 8 U and 8 V values |
michael@0 | 1378 | sub ecx, 16 |
michael@0 | 1379 | movlps qword ptr [edx], xmm0 // U |
michael@0 | 1380 | movhps qword ptr [edx + edi], xmm0 // V |
michael@0 | 1381 | lea edx, [edx + 8] |
michael@0 | 1382 | jg convertloop |
michael@0 | 1383 | |
michael@0 | 1384 | pop edi |
michael@0 | 1385 | pop esi |
michael@0 | 1386 | ret |
michael@0 | 1387 | } |
michael@0 | 1388 | } |
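The JPEG (full-range) variant above differs from ARGBToUVRow in how it turns the word sums into bytes: the bias is added with paddw before the arithmetic shift instead of with paddb afterwards, which also supplies +0.5 rounding. A scalar sketch of that step, assuming kAddUVJ128 holds 0x8080 in each 16-bit lane (the constant is defined elsewhere in this file, so its value is an assumption here; the helper name is made up):

static inline uint8 ScaleUVJ_Sketch(int sum) {
  // sum is the pmaddubsw/phaddw result, e.g. 127*B - 84*G - 43*R for U.
  // Adding 0x8080 before the shift supplies both the +0.5 rounding and the
  // +128 bias: (sum + 0x8080) >> 8 == ((sum + 128) >> 8) + 128.
  return (uint8)((sum + 0x8080) >> 8);
}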
michael@0 | 1389 | |
michael@0 | 1390 | __declspec(naked) __declspec(align(16)) |
michael@0 | 1391 | void ARGBToUV444Row_SSSE3(const uint8* src_argb0, |
michael@0 | 1392 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1393 | __asm { |
michael@0 | 1394 | push edi |
michael@0 | 1395 | mov eax, [esp + 4 + 4] // src_argb |
michael@0 | 1396 | mov edx, [esp + 4 + 8] // dst_u |
michael@0 | 1397 | mov edi, [esp + 4 + 12] // dst_v |
michael@0 | 1398 | mov ecx, [esp + 4 + 16] // pix |
michael@0 | 1399 | movdqa xmm7, kARGBToU |
michael@0 | 1400 | movdqa xmm6, kARGBToV |
michael@0 | 1401 | movdqa xmm5, kAddUV128 |
michael@0 | 1402 | sub edi, edx // stride from u to v |
michael@0 | 1403 | |
michael@0 | 1404 | align 4 |
michael@0 | 1405 | convertloop: |
michael@0 | 1406 | /* convert to U and V */ |
michael@0 | 1407 | movdqa xmm0, [eax] // U |
michael@0 | 1408 | movdqa xmm1, [eax + 16] |
michael@0 | 1409 | movdqa xmm2, [eax + 32] |
michael@0 | 1410 | movdqa xmm3, [eax + 48] |
michael@0 | 1411 | pmaddubsw xmm0, xmm7 |
michael@0 | 1412 | pmaddubsw xmm1, xmm7 |
michael@0 | 1413 | pmaddubsw xmm2, xmm7 |
michael@0 | 1414 | pmaddubsw xmm3, xmm7 |
michael@0 | 1415 | phaddw xmm0, xmm1 |
michael@0 | 1416 | phaddw xmm2, xmm3 |
michael@0 | 1417 | psraw xmm0, 8 |
michael@0 | 1418 | psraw xmm2, 8 |
michael@0 | 1419 | packsswb xmm0, xmm2 |
michael@0 | 1420 | paddb xmm0, xmm5 |
michael@0 | 1421 | sub ecx, 16 |
michael@0 | 1422 | movdqa [edx], xmm0 |
michael@0 | 1423 | |
michael@0 | 1424 | movdqa xmm0, [eax] // V |
michael@0 | 1425 | movdqa xmm1, [eax + 16] |
michael@0 | 1426 | movdqa xmm2, [eax + 32] |
michael@0 | 1427 | movdqa xmm3, [eax + 48] |
michael@0 | 1428 | pmaddubsw xmm0, xmm6 |
michael@0 | 1429 | pmaddubsw xmm1, xmm6 |
michael@0 | 1430 | pmaddubsw xmm2, xmm6 |
michael@0 | 1431 | pmaddubsw xmm3, xmm6 |
michael@0 | 1432 | phaddw xmm0, xmm1 |
michael@0 | 1433 | phaddw xmm2, xmm3 |
michael@0 | 1434 | psraw xmm0, 8 |
michael@0 | 1435 | psraw xmm2, 8 |
michael@0 | 1436 | packsswb xmm0, xmm2 |
michael@0 | 1437 | paddb xmm0, xmm5 |
michael@0 | 1438 | lea eax, [eax + 64] |
michael@0 | 1439 | movdqa [edx + edi], xmm0 |
michael@0 | 1440 | lea edx, [edx + 16] |
michael@0 | 1441 | jg convertloop |
michael@0 | 1442 | |
michael@0 | 1443 | pop edi |
michael@0 | 1444 | ret |
michael@0 | 1445 | } |
michael@0 | 1446 | } |
michael@0 | 1447 | |
michael@0 | 1448 | __declspec(naked) __declspec(align(16)) |
michael@0 | 1449 | void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0, |
michael@0 | 1450 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1451 | __asm { |
michael@0 | 1452 | push edi |
michael@0 | 1453 | mov eax, [esp + 4 + 4] // src_argb |
michael@0 | 1454 | mov edx, [esp + 4 + 8] // dst_u |
michael@0 | 1455 | mov edi, [esp + 4 + 12] // dst_v |
michael@0 | 1456 | mov ecx, [esp + 4 + 16] // pix |
michael@0 | 1457 | movdqa xmm7, kARGBToU |
michael@0 | 1458 | movdqa xmm6, kARGBToV |
michael@0 | 1459 | movdqa xmm5, kAddUV128 |
michael@0 | 1460 | sub edi, edx // stride from u to v |
michael@0 | 1461 | |
michael@0 | 1462 | align 4 |
michael@0 | 1463 | convertloop: |
michael@0 | 1464 | /* convert to U and V */ |
michael@0 | 1465 | movdqu xmm0, [eax] // U |
michael@0 | 1466 | movdqu xmm1, [eax + 16] |
michael@0 | 1467 | movdqu xmm2, [eax + 32] |
michael@0 | 1468 | movdqu xmm3, [eax + 48] |
michael@0 | 1469 | pmaddubsw xmm0, xmm7 |
michael@0 | 1470 | pmaddubsw xmm1, xmm7 |
michael@0 | 1471 | pmaddubsw xmm2, xmm7 |
michael@0 | 1472 | pmaddubsw xmm3, xmm7 |
michael@0 | 1473 | phaddw xmm0, xmm1 |
michael@0 | 1474 | phaddw xmm2, xmm3 |
michael@0 | 1475 | psraw xmm0, 8 |
michael@0 | 1476 | psraw xmm2, 8 |
michael@0 | 1477 | packsswb xmm0, xmm2 |
michael@0 | 1478 | paddb xmm0, xmm5 |
michael@0 | 1479 | sub ecx, 16 |
michael@0 | 1480 | movdqu [edx], xmm0 |
michael@0 | 1481 | |
michael@0 | 1482 | movdqu xmm0, [eax] // V |
michael@0 | 1483 | movdqu xmm1, [eax + 16] |
michael@0 | 1484 | movdqu xmm2, [eax + 32] |
michael@0 | 1485 | movdqu xmm3, [eax + 48] |
michael@0 | 1486 | pmaddubsw xmm0, xmm6 |
michael@0 | 1487 | pmaddubsw xmm1, xmm6 |
michael@0 | 1488 | pmaddubsw xmm2, xmm6 |
michael@0 | 1489 | pmaddubsw xmm3, xmm6 |
michael@0 | 1490 | phaddw xmm0, xmm1 |
michael@0 | 1491 | phaddw xmm2, xmm3 |
michael@0 | 1492 | psraw xmm0, 8 |
michael@0 | 1493 | psraw xmm2, 8 |
michael@0 | 1494 | packsswb xmm0, xmm2 |
michael@0 | 1495 | paddb xmm0, xmm5 |
michael@0 | 1496 | lea eax, [eax + 64] |
michael@0 | 1497 | movdqu [edx + edi], xmm0 |
michael@0 | 1498 | lea edx, [edx + 16] |
michael@0 | 1499 | jg convertloop |
michael@0 | 1500 | |
michael@0 | 1501 | pop edi |
michael@0 | 1502 | ret |
michael@0 | 1503 | } |
michael@0 | 1504 | } |
michael@0 | 1505 | |
michael@0 | 1506 | __declspec(naked) __declspec(align(16)) |
michael@0 | 1507 | void ARGBToUV422Row_SSSE3(const uint8* src_argb0, |
michael@0 | 1508 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1509 | __asm { |
michael@0 | 1510 | push edi |
michael@0 | 1511 | mov eax, [esp + 4 + 4] // src_argb |
michael@0 | 1512 | mov edx, [esp + 4 + 8] // dst_u |
michael@0 | 1513 | mov edi, [esp + 4 + 12] // dst_v |
michael@0 | 1514 | mov ecx, [esp + 4 + 16] // pix |
michael@0 | 1515 | movdqa xmm7, kARGBToU |
michael@0 | 1516 | movdqa xmm6, kARGBToV |
michael@0 | 1517 | movdqa xmm5, kAddUV128 |
michael@0 | 1518 | sub edi, edx // stride from u to v |
michael@0 | 1519 | |
michael@0 | 1520 | align 4 |
michael@0 | 1521 | convertloop: |
michael@0 | 1522 | /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
michael@0 | 1523 | movdqa xmm0, [eax] |
michael@0 | 1524 | movdqa xmm1, [eax + 16] |
michael@0 | 1525 | movdqa xmm2, [eax + 32] |
michael@0 | 1526 | movdqa xmm3, [eax + 48] |
michael@0 | 1527 | lea eax, [eax + 64] |
michael@0 | 1528 | movdqa xmm4, xmm0 |
michael@0 | 1529 | shufps xmm0, xmm1, 0x88 |
michael@0 | 1530 | shufps xmm4, xmm1, 0xdd |
michael@0 | 1531 | pavgb xmm0, xmm4 |
michael@0 | 1532 | movdqa xmm4, xmm2 |
michael@0 | 1533 | shufps xmm2, xmm3, 0x88 |
michael@0 | 1534 | shufps xmm4, xmm3, 0xdd |
michael@0 | 1535 | pavgb xmm2, xmm4 |
michael@0 | 1536 | |
michael@0 | 1537 | // step 2 - convert to U and V |
michael@0 | 1538 | // from here down is very similar to Y code except |
michael@0 | 1539 | // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
michael@0 | 1540 | movdqa xmm1, xmm0 |
michael@0 | 1541 | movdqa xmm3, xmm2 |
michael@0 | 1542 | pmaddubsw xmm0, xmm7 // U |
michael@0 | 1543 | pmaddubsw xmm2, xmm7 |
michael@0 | 1544 | pmaddubsw xmm1, xmm6 // V |
michael@0 | 1545 | pmaddubsw xmm3, xmm6 |
michael@0 | 1546 | phaddw xmm0, xmm2 |
michael@0 | 1547 | phaddw xmm1, xmm3 |
michael@0 | 1548 | psraw xmm0, 8 |
michael@0 | 1549 | psraw xmm1, 8 |
michael@0 | 1550 | packsswb xmm0, xmm1 |
michael@0 | 1551 | paddb xmm0, xmm5 // -> unsigned |
michael@0 | 1552 | |
michael@0 | 1553 | // step 3 - store 8 U and 8 V values |
michael@0 | 1554 | sub ecx, 16 |
michael@0 | 1555 | movlps qword ptr [edx], xmm0 // U |
michael@0 | 1556 | movhps qword ptr [edx + edi], xmm0 // V |
michael@0 | 1557 | lea edx, [edx + 8] |
michael@0 | 1558 | jg convertloop |
michael@0 | 1559 | |
michael@0 | 1560 | pop edi |
michael@0 | 1561 | ret |
michael@0 | 1562 | } |
michael@0 | 1563 | } |
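The 422 kernels above subsample only horizontally: each U/V sample comes from two adjacent pixels of a single row, with no second-row pavgb. A scalar sketch of one output pair (illustrative only; the rounded average below matches pavgb, the helper name is made up):

static void ARGBToUV422_Pair_Sketch(const uint8* argb, uint8* u, uint8* v) {
  int b = (argb[0] + argb[4] + 1) >> 1;  // average two adjacent pixels
  int g = (argb[1] + argb[5] + 1) >> 1;
  int r = (argb[2] + argb[6] + 1) >> 1;
  *u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}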
michael@0 | 1564 | |
michael@0 | 1565 | __declspec(naked) __declspec(align(16)) |
michael@0 | 1566 | void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, |
michael@0 | 1567 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1568 | __asm { |
michael@0 | 1569 | push edi |
michael@0 | 1570 | mov eax, [esp + 4 + 4] // src_argb |
michael@0 | 1571 | mov edx, [esp + 4 + 8] // dst_u |
michael@0 | 1572 | mov edi, [esp + 4 + 12] // dst_v |
michael@0 | 1573 | mov ecx, [esp + 4 + 16] // pix |
michael@0 | 1574 | movdqa xmm7, kARGBToU |
michael@0 | 1575 | movdqa xmm6, kARGBToV |
michael@0 | 1576 | movdqa xmm5, kAddUV128 |
michael@0 | 1577 | sub edi, edx // stride from u to v |
michael@0 | 1578 | |
michael@0 | 1579 | align 4 |
michael@0 | 1580 | convertloop: |
michael@0 | 1581 | /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
michael@0 | 1582 | movdqu xmm0, [eax] |
michael@0 | 1583 | movdqu xmm1, [eax + 16] |
michael@0 | 1584 | movdqu xmm2, [eax + 32] |
michael@0 | 1585 | movdqu xmm3, [eax + 48] |
michael@0 | 1586 | lea eax, [eax + 64] |
michael@0 | 1587 | movdqa xmm4, xmm0 |
michael@0 | 1588 | shufps xmm0, xmm1, 0x88 |
michael@0 | 1589 | shufps xmm4, xmm1, 0xdd |
michael@0 | 1590 | pavgb xmm0, xmm4 |
michael@0 | 1591 | movdqa xmm4, xmm2 |
michael@0 | 1592 | shufps xmm2, xmm3, 0x88 |
michael@0 | 1593 | shufps xmm4, xmm3, 0xdd |
michael@0 | 1594 | pavgb xmm2, xmm4 |
michael@0 | 1595 | |
michael@0 | 1596 | // step 2 - convert to U and V |
michael@0 | 1597 | // from here down is very similar to Y code except |
michael@0 | 1598 | // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
michael@0 | 1599 | movdqa xmm1, xmm0 |
michael@0 | 1600 | movdqa xmm3, xmm2 |
michael@0 | 1601 | pmaddubsw xmm0, xmm7 // U |
michael@0 | 1602 | pmaddubsw xmm2, xmm7 |
michael@0 | 1603 | pmaddubsw xmm1, xmm6 // V |
michael@0 | 1604 | pmaddubsw xmm3, xmm6 |
michael@0 | 1605 | phaddw xmm0, xmm2 |
michael@0 | 1606 | phaddw xmm1, xmm3 |
michael@0 | 1607 | psraw xmm0, 8 |
michael@0 | 1608 | psraw xmm1, 8 |
michael@0 | 1609 | packsswb xmm0, xmm1 |
michael@0 | 1610 | paddb xmm0, xmm5 // -> unsigned |
michael@0 | 1611 | |
michael@0 | 1612 | // step 3 - store 8 U and 8 V values |
michael@0 | 1613 | sub ecx, 16 |
michael@0 | 1614 | movlps qword ptr [edx], xmm0 // U |
michael@0 | 1615 | movhps qword ptr [edx + edi], xmm0 // V |
michael@0 | 1616 | lea edx, [edx + 8] |
michael@0 | 1617 | jg convertloop |
michael@0 | 1618 | |
michael@0 | 1619 | pop edi |
michael@0 | 1620 | ret |
michael@0 | 1621 | } |
michael@0 | 1622 | } |
michael@0 | 1623 | |
michael@0 | 1624 | __declspec(naked) __declspec(align(16)) |
michael@0 | 1625 | void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
michael@0 | 1626 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1627 | __asm { |
michael@0 | 1628 | push esi |
michael@0 | 1629 | push edi |
michael@0 | 1630 | mov eax, [esp + 8 + 4] // src_argb |
michael@0 | 1631 | mov esi, [esp + 8 + 8] // src_stride_argb |
michael@0 | 1632 | mov edx, [esp + 8 + 12] // dst_u |
michael@0 | 1633 | mov edi, [esp + 8 + 16] // dst_v |
michael@0 | 1634 | mov ecx, [esp + 8 + 20] // pix |
michael@0 | 1635 | movdqa xmm7, kBGRAToU |
michael@0 | 1636 | movdqa xmm6, kBGRAToV |
michael@0 | 1637 | movdqa xmm5, kAddUV128 |
michael@0 | 1638 | sub edi, edx // stride from u to v |
michael@0 | 1639 | |
michael@0 | 1640 | align 4 |
michael@0 | 1641 | convertloop: |
michael@0 | 1642 | /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
michael@0 | 1643 | movdqa xmm0, [eax] |
michael@0 | 1644 | movdqa xmm1, [eax + 16] |
michael@0 | 1645 | movdqa xmm2, [eax + 32] |
michael@0 | 1646 | movdqa xmm3, [eax + 48] |
michael@0 | 1647 | pavgb xmm0, [eax + esi] |
michael@0 | 1648 | pavgb xmm1, [eax + esi + 16] |
michael@0 | 1649 | pavgb xmm2, [eax + esi + 32] |
michael@0 | 1650 | pavgb xmm3, [eax + esi + 48] |
michael@0 | 1651 | lea eax, [eax + 64] |
michael@0 | 1652 | movdqa xmm4, xmm0 |
michael@0 | 1653 | shufps xmm0, xmm1, 0x88 |
michael@0 | 1654 | shufps xmm4, xmm1, 0xdd |
michael@0 | 1655 | pavgb xmm0, xmm4 |
michael@0 | 1656 | movdqa xmm4, xmm2 |
michael@0 | 1657 | shufps xmm2, xmm3, 0x88 |
michael@0 | 1658 | shufps xmm4, xmm3, 0xdd |
michael@0 | 1659 | pavgb xmm2, xmm4 |
michael@0 | 1660 | |
michael@0 | 1661 | // step 2 - convert to U and V |
michael@0 | 1662 | // from here down is very similar to Y code except |
michael@0 | 1663 | // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
michael@0 | 1664 | movdqa xmm1, xmm0 |
michael@0 | 1665 | movdqa xmm3, xmm2 |
michael@0 | 1666 | pmaddubsw xmm0, xmm7 // U |
michael@0 | 1667 | pmaddubsw xmm2, xmm7 |
michael@0 | 1668 | pmaddubsw xmm1, xmm6 // V |
michael@0 | 1669 | pmaddubsw xmm3, xmm6 |
michael@0 | 1670 | phaddw xmm0, xmm2 |
michael@0 | 1671 | phaddw xmm1, xmm3 |
michael@0 | 1672 | psraw xmm0, 8 |
michael@0 | 1673 | psraw xmm1, 8 |
michael@0 | 1674 | packsswb xmm0, xmm1 |
michael@0 | 1675 | paddb xmm0, xmm5 // -> unsigned |
michael@0 | 1676 | |
michael@0 | 1677 | // step 3 - store 8 U and 8 V values |
michael@0 | 1678 | sub ecx, 16 |
michael@0 | 1679 | movlps qword ptr [edx], xmm0 // U |
michael@0 | 1680 | movhps qword ptr [edx + edi], xmm0 // V |
michael@0 | 1681 | lea edx, [edx + 8] |
michael@0 | 1682 | jg convertloop |
michael@0 | 1683 | |
michael@0 | 1684 | pop edi |
michael@0 | 1685 | pop esi |
michael@0 | 1686 | ret |
michael@0 | 1687 | } |
michael@0 | 1688 | } |
michael@0 | 1689 | |
michael@0 | 1690 | __declspec(naked) __declspec(align(16)) |
michael@0 | 1691 | void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, |
michael@0 | 1692 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1693 | __asm { |
michael@0 | 1694 | push esi |
michael@0 | 1695 | push edi |
michael@0 | 1696 | mov eax, [esp + 8 + 4] // src_argb |
michael@0 | 1697 | mov esi, [esp + 8 + 8] // src_stride_argb |
michael@0 | 1698 | mov edx, [esp + 8 + 12] // dst_u |
michael@0 | 1699 | mov edi, [esp + 8 + 16] // dst_v |
michael@0 | 1700 | mov ecx, [esp + 8 + 20] // pix |
michael@0 | 1701 | movdqa xmm7, kBGRAToU |
michael@0 | 1702 | movdqa xmm6, kBGRAToV |
michael@0 | 1703 | movdqa xmm5, kAddUV128 |
michael@0 | 1704 | sub edi, edx // stride from u to v |
michael@0 | 1705 | |
michael@0 | 1706 | align 4 |
michael@0 | 1707 | convertloop: |
michael@0 | 1708 | /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
michael@0 | 1709 | movdqu xmm0, [eax] |
michael@0 | 1710 | movdqu xmm1, [eax + 16] |
michael@0 | 1711 | movdqu xmm2, [eax + 32] |
michael@0 | 1712 | movdqu xmm3, [eax + 48] |
michael@0 | 1713 | movdqu xmm4, [eax + esi] |
michael@0 | 1714 | pavgb xmm0, xmm4 |
michael@0 | 1715 | movdqu xmm4, [eax + esi + 16] |
michael@0 | 1716 | pavgb xmm1, xmm4 |
michael@0 | 1717 | movdqu xmm4, [eax + esi + 32] |
michael@0 | 1718 | pavgb xmm2, xmm4 |
michael@0 | 1719 | movdqu xmm4, [eax + esi + 48] |
michael@0 | 1720 | pavgb xmm3, xmm4 |
michael@0 | 1721 | lea eax, [eax + 64] |
michael@0 | 1722 | movdqa xmm4, xmm0 |
michael@0 | 1723 | shufps xmm0, xmm1, 0x88 |
michael@0 | 1724 | shufps xmm4, xmm1, 0xdd |
michael@0 | 1725 | pavgb xmm0, xmm4 |
michael@0 | 1726 | movdqa xmm4, xmm2 |
michael@0 | 1727 | shufps xmm2, xmm3, 0x88 |
michael@0 | 1728 | shufps xmm4, xmm3, 0xdd |
michael@0 | 1729 | pavgb xmm2, xmm4 |
michael@0 | 1730 | |
michael@0 | 1731 | // step 2 - convert to U and V |
michael@0 | 1732 | // from here down is very similar to Y code except |
michael@0 | 1733 | // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
michael@0 | 1734 | movdqa xmm1, xmm0 |
michael@0 | 1735 | movdqa xmm3, xmm2 |
michael@0 | 1736 | pmaddubsw xmm0, xmm7 // U |
michael@0 | 1737 | pmaddubsw xmm2, xmm7 |
michael@0 | 1738 | pmaddubsw xmm1, xmm6 // V |
michael@0 | 1739 | pmaddubsw xmm3, xmm6 |
michael@0 | 1740 | phaddw xmm0, xmm2 |
michael@0 | 1741 | phaddw xmm1, xmm3 |
michael@0 | 1742 | psraw xmm0, 8 |
michael@0 | 1743 | psraw xmm1, 8 |
michael@0 | 1744 | packsswb xmm0, xmm1 |
michael@0 | 1745 | paddb xmm0, xmm5 // -> unsigned |
michael@0 | 1746 | |
michael@0 | 1747 | // step 3 - store 8 U and 8 V values |
michael@0 | 1748 | sub ecx, 16 |
michael@0 | 1749 | movlps qword ptr [edx], xmm0 // U |
michael@0 | 1750 | movhps qword ptr [edx + edi], xmm0 // V |
michael@0 | 1751 | lea edx, [edx + 8] |
michael@0 | 1752 | jg convertloop |
michael@0 | 1753 | |
michael@0 | 1754 | pop edi |
michael@0 | 1755 | pop esi |
michael@0 | 1756 | ret |
michael@0 | 1757 | } |
michael@0 | 1758 | } |
michael@0 | 1759 | |
michael@0 | 1760 | __declspec(naked) __declspec(align(16)) |
michael@0 | 1761 | void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
michael@0 | 1762 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1763 | __asm { |
michael@0 | 1764 | push esi |
michael@0 | 1765 | push edi |
michael@0 | 1766 | mov eax, [esp + 8 + 4] // src_argb |
michael@0 | 1767 | mov esi, [esp + 8 + 8] // src_stride_argb |
michael@0 | 1768 | mov edx, [esp + 8 + 12] // dst_u |
michael@0 | 1769 | mov edi, [esp + 8 + 16] // dst_v |
michael@0 | 1770 | mov ecx, [esp + 8 + 20] // pix |
michael@0 | 1771 | movdqa xmm7, kABGRToU |
michael@0 | 1772 | movdqa xmm6, kABGRToV |
michael@0 | 1773 | movdqa xmm5, kAddUV128 |
michael@0 | 1774 | sub edi, edx // stride from u to v |
michael@0 | 1775 | |
michael@0 | 1776 | align 4 |
michael@0 | 1777 | convertloop: |
michael@0 | 1778 | /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
michael@0 | 1779 | movdqa xmm0, [eax] |
michael@0 | 1780 | movdqa xmm1, [eax + 16] |
michael@0 | 1781 | movdqa xmm2, [eax + 32] |
michael@0 | 1782 | movdqa xmm3, [eax + 48] |
michael@0 | 1783 | pavgb xmm0, [eax + esi] |
michael@0 | 1784 | pavgb xmm1, [eax + esi + 16] |
michael@0 | 1785 | pavgb xmm2, [eax + esi + 32] |
michael@0 | 1786 | pavgb xmm3, [eax + esi + 48] |
michael@0 | 1787 | lea eax, [eax + 64] |
michael@0 | 1788 | movdqa xmm4, xmm0 |
michael@0 | 1789 | shufps xmm0, xmm1, 0x88 |
michael@0 | 1790 | shufps xmm4, xmm1, 0xdd |
michael@0 | 1791 | pavgb xmm0, xmm4 |
michael@0 | 1792 | movdqa xmm4, xmm2 |
michael@0 | 1793 | shufps xmm2, xmm3, 0x88 |
michael@0 | 1794 | shufps xmm4, xmm3, 0xdd |
michael@0 | 1795 | pavgb xmm2, xmm4 |
michael@0 | 1796 | |
michael@0 | 1797 | // step 2 - convert to U and V |
michael@0 | 1798 | // from here down is very similar to Y code except |
michael@0 | 1799 | // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
michael@0 | 1800 | movdqa xmm1, xmm0 |
michael@0 | 1801 | movdqa xmm3, xmm2 |
michael@0 | 1802 | pmaddubsw xmm0, xmm7 // U |
michael@0 | 1803 | pmaddubsw xmm2, xmm7 |
michael@0 | 1804 | pmaddubsw xmm1, xmm6 // V |
michael@0 | 1805 | pmaddubsw xmm3, xmm6 |
michael@0 | 1806 | phaddw xmm0, xmm2 |
michael@0 | 1807 | phaddw xmm1, xmm3 |
michael@0 | 1808 | psraw xmm0, 8 |
michael@0 | 1809 | psraw xmm1, 8 |
michael@0 | 1810 | packsswb xmm0, xmm1 |
michael@0 | 1811 | paddb xmm0, xmm5 // -> unsigned |
michael@0 | 1812 | |
michael@0 | 1813 | // step 3 - store 8 U and 8 V values |
michael@0 | 1814 | sub ecx, 16 |
michael@0 | 1815 | movlps qword ptr [edx], xmm0 // U |
michael@0 | 1816 | movhps qword ptr [edx + edi], xmm0 // V |
michael@0 | 1817 | lea edx, [edx + 8] |
michael@0 | 1818 | jg convertloop |
michael@0 | 1819 | |
michael@0 | 1820 | pop edi |
michael@0 | 1821 | pop esi |
michael@0 | 1822 | ret |
michael@0 | 1823 | } |
michael@0 | 1824 | } |
michael@0 | 1825 | |
michael@0 | 1826 | __declspec(naked) __declspec(align(16)) |
michael@0 | 1827 | void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, |
michael@0 | 1828 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1829 | __asm { |
michael@0 | 1830 | push esi |
michael@0 | 1831 | push edi |
michael@0 | 1832 | mov eax, [esp + 8 + 4] // src_argb |
michael@0 | 1833 | mov esi, [esp + 8 + 8] // src_stride_argb |
michael@0 | 1834 | mov edx, [esp + 8 + 12] // dst_u |
michael@0 | 1835 | mov edi, [esp + 8 + 16] // dst_v |
michael@0 | 1836 | mov ecx, [esp + 8 + 20] // pix |
michael@0 | 1837 | movdqa xmm7, kABGRToU |
michael@0 | 1838 | movdqa xmm6, kABGRToV |
michael@0 | 1839 | movdqa xmm5, kAddUV128 |
michael@0 | 1840 | sub edi, edx // stride from u to v |
michael@0 | 1841 | |
michael@0 | 1842 | align 4 |
michael@0 | 1843 | convertloop: |
michael@0 | 1844 | /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
michael@0 | 1845 | movdqu xmm0, [eax] |
michael@0 | 1846 | movdqu xmm1, [eax + 16] |
michael@0 | 1847 | movdqu xmm2, [eax + 32] |
michael@0 | 1848 | movdqu xmm3, [eax + 48] |
michael@0 | 1849 | movdqu xmm4, [eax + esi] |
michael@0 | 1850 | pavgb xmm0, xmm4 |
michael@0 | 1851 | movdqu xmm4, [eax + esi + 16] |
michael@0 | 1852 | pavgb xmm1, xmm4 |
michael@0 | 1853 | movdqu xmm4, [eax + esi + 32] |
michael@0 | 1854 | pavgb xmm2, xmm4 |
michael@0 | 1855 | movdqu xmm4, [eax + esi + 48] |
michael@0 | 1856 | pavgb xmm3, xmm4 |
michael@0 | 1857 | lea eax, [eax + 64] |
michael@0 | 1858 | movdqa xmm4, xmm0 |
michael@0 | 1859 | shufps xmm0, xmm1, 0x88 |
michael@0 | 1860 | shufps xmm4, xmm1, 0xdd |
michael@0 | 1861 | pavgb xmm0, xmm4 |
michael@0 | 1862 | movdqa xmm4, xmm2 |
michael@0 | 1863 | shufps xmm2, xmm3, 0x88 |
michael@0 | 1864 | shufps xmm4, xmm3, 0xdd |
michael@0 | 1865 | pavgb xmm2, xmm4 |
michael@0 | 1866 | |
michael@0 | 1867 | // step 2 - convert to U and V |
michael@0 | 1868 | // from here down is very similar to Y code except |
michael@0 | 1869 | // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
michael@0 | 1870 | movdqa xmm1, xmm0 |
michael@0 | 1871 | movdqa xmm3, xmm2 |
michael@0 | 1872 | pmaddubsw xmm0, xmm7 // U |
michael@0 | 1873 | pmaddubsw xmm2, xmm7 |
michael@0 | 1874 | pmaddubsw xmm1, xmm6 // V |
michael@0 | 1875 | pmaddubsw xmm3, xmm6 |
michael@0 | 1876 | phaddw xmm0, xmm2 |
michael@0 | 1877 | phaddw xmm1, xmm3 |
michael@0 | 1878 | psraw xmm0, 8 |
michael@0 | 1879 | psraw xmm1, 8 |
michael@0 | 1880 | packsswb xmm0, xmm1 |
michael@0 | 1881 | paddb xmm0, xmm5 // -> unsigned |
michael@0 | 1882 | |
michael@0 | 1883 | // step 3 - store 8 U and 8 V values |
michael@0 | 1884 | sub ecx, 16 |
michael@0 | 1885 | movlps qword ptr [edx], xmm0 // U |
michael@0 | 1886 | movhps qword ptr [edx + edi], xmm0 // V |
michael@0 | 1887 | lea edx, [edx + 8] |
michael@0 | 1888 | jg convertloop |
michael@0 | 1889 | |
michael@0 | 1890 | pop edi |
michael@0 | 1891 | pop esi |
michael@0 | 1892 | ret |
michael@0 | 1893 | } |
michael@0 | 1894 | } |
michael@0 | 1895 | |
michael@0 | 1896 | __declspec(naked) __declspec(align(16)) |
michael@0 | 1897 | void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
michael@0 | 1898 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1899 | __asm { |
michael@0 | 1900 | push esi |
michael@0 | 1901 | push edi |
michael@0 | 1902 | mov eax, [esp + 8 + 4] // src_argb |
michael@0 | 1903 | mov esi, [esp + 8 + 8] // src_stride_argb |
michael@0 | 1904 | mov edx, [esp + 8 + 12] // dst_u |
michael@0 | 1905 | mov edi, [esp + 8 + 16] // dst_v |
michael@0 | 1906 | mov ecx, [esp + 8 + 20] // pix |
michael@0 | 1907 | movdqa xmm7, kRGBAToU |
michael@0 | 1908 | movdqa xmm6, kRGBAToV |
michael@0 | 1909 | movdqa xmm5, kAddUV128 |
michael@0 | 1910 | sub edi, edx // stride from u to v |
michael@0 | 1911 | |
michael@0 | 1912 | align 4 |
michael@0 | 1913 | convertloop: |
michael@0 | 1914 | /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
michael@0 | 1915 | movdqa xmm0, [eax] |
michael@0 | 1916 | movdqa xmm1, [eax + 16] |
michael@0 | 1917 | movdqa xmm2, [eax + 32] |
michael@0 | 1918 | movdqa xmm3, [eax + 48] |
michael@0 | 1919 | pavgb xmm0, [eax + esi] |
michael@0 | 1920 | pavgb xmm1, [eax + esi + 16] |
michael@0 | 1921 | pavgb xmm2, [eax + esi + 32] |
michael@0 | 1922 | pavgb xmm3, [eax + esi + 48] |
michael@0 | 1923 | lea eax, [eax + 64] |
michael@0 | 1924 | movdqa xmm4, xmm0 |
michael@0 | 1925 | shufps xmm0, xmm1, 0x88 |
michael@0 | 1926 | shufps xmm4, xmm1, 0xdd |
michael@0 | 1927 | pavgb xmm0, xmm4 |
michael@0 | 1928 | movdqa xmm4, xmm2 |
michael@0 | 1929 | shufps xmm2, xmm3, 0x88 |
michael@0 | 1930 | shufps xmm4, xmm3, 0xdd |
michael@0 | 1931 | pavgb xmm2, xmm4 |
michael@0 | 1932 | |
michael@0 | 1933 | // step 2 - convert to U and V |
michael@0 | 1934 | // from here down is very similar to Y code except |
michael@0 | 1935 | // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
michael@0 | 1936 | movdqa xmm1, xmm0 |
michael@0 | 1937 | movdqa xmm3, xmm2 |
michael@0 | 1938 | pmaddubsw xmm0, xmm7 // U |
michael@0 | 1939 | pmaddubsw xmm2, xmm7 |
michael@0 | 1940 | pmaddubsw xmm1, xmm6 // V |
michael@0 | 1941 | pmaddubsw xmm3, xmm6 |
michael@0 | 1942 | phaddw xmm0, xmm2 |
michael@0 | 1943 | phaddw xmm1, xmm3 |
michael@0 | 1944 | psraw xmm0, 8 |
michael@0 | 1945 | psraw xmm1, 8 |
michael@0 | 1946 | packsswb xmm0, xmm1 |
michael@0 | 1947 | paddb xmm0, xmm5 // -> unsigned |
michael@0 | 1948 | |
michael@0 | 1949 | // step 3 - store 8 U and 8 V values |
michael@0 | 1950 | sub ecx, 16 |
michael@0 | 1951 | movlps qword ptr [edx], xmm0 // U |
michael@0 | 1952 | movhps qword ptr [edx + edi], xmm0 // V |
michael@0 | 1953 | lea edx, [edx + 8] |
michael@0 | 1954 | jg convertloop |
michael@0 | 1955 | |
michael@0 | 1956 | pop edi |
michael@0 | 1957 | pop esi |
michael@0 | 1958 | ret |
michael@0 | 1959 | } |
michael@0 | 1960 | } |
michael@0 | 1961 | |
michael@0 | 1962 | __declspec(naked) __declspec(align(16)) |
michael@0 | 1963 | void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, |
michael@0 | 1964 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1965 | __asm { |
michael@0 | 1966 | push esi |
michael@0 | 1967 | push edi |
michael@0 | 1968 | mov eax, [esp + 8 + 4] // src_argb |
michael@0 | 1969 | mov esi, [esp + 8 + 8] // src_stride_argb |
michael@0 | 1970 | mov edx, [esp + 8 + 12] // dst_u |
michael@0 | 1971 | mov edi, [esp + 8 + 16] // dst_v |
michael@0 | 1972 | mov ecx, [esp + 8 + 20] // pix |
michael@0 | 1973 | movdqa xmm7, kRGBAToU |
michael@0 | 1974 | movdqa xmm6, kRGBAToV |
michael@0 | 1975 | movdqa xmm5, kAddUV128 |
michael@0 | 1976 | sub edi, edx // stride from u to v |
michael@0 | 1977 | |
michael@0 | 1978 | align 4 |
michael@0 | 1979 | convertloop: |
michael@0 | 1980 | /* step 1 - subsample 16x2 argb pixels to 8x1 */ |
michael@0 | 1981 | movdqu xmm0, [eax] |
michael@0 | 1982 | movdqu xmm1, [eax + 16] |
michael@0 | 1983 | movdqu xmm2, [eax + 32] |
michael@0 | 1984 | movdqu xmm3, [eax + 48] |
michael@0 | 1985 | movdqu xmm4, [eax + esi] |
michael@0 | 1986 | pavgb xmm0, xmm4 |
michael@0 | 1987 | movdqu xmm4, [eax + esi + 16] |
michael@0 | 1988 | pavgb xmm1, xmm4 |
michael@0 | 1989 | movdqu xmm4, [eax + esi + 32] |
michael@0 | 1990 | pavgb xmm2, xmm4 |
michael@0 | 1991 | movdqu xmm4, [eax + esi + 48] |
michael@0 | 1992 | pavgb xmm3, xmm4 |
michael@0 | 1993 | lea eax, [eax + 64] |
michael@0 | 1994 | movdqa xmm4, xmm0 |
michael@0 | 1995 | shufps xmm0, xmm1, 0x88 |
michael@0 | 1996 | shufps xmm4, xmm1, 0xdd |
michael@0 | 1997 | pavgb xmm0, xmm4 |
michael@0 | 1998 | movdqa xmm4, xmm2 |
michael@0 | 1999 | shufps xmm2, xmm3, 0x88 |
michael@0 | 2000 | shufps xmm4, xmm3, 0xdd |
michael@0 | 2001 | pavgb xmm2, xmm4 |
michael@0 | 2002 | |
michael@0 | 2003 | // step 2 - convert to U and V |
michael@0 | 2004 | // from here down is very similar to Y code except |
michael@0 | 2005 | // instead of 16 different pixels, it's 8 pixels of U and 8 of V |
michael@0 | 2006 | movdqa xmm1, xmm0 |
michael@0 | 2007 | movdqa xmm3, xmm2 |
michael@0 | 2008 | pmaddubsw xmm0, xmm7 // U |
michael@0 | 2009 | pmaddubsw xmm2, xmm7 |
michael@0 | 2010 | pmaddubsw xmm1, xmm6 // V |
michael@0 | 2011 | pmaddubsw xmm3, xmm6 |
michael@0 | 2012 | phaddw xmm0, xmm2 |
michael@0 | 2013 | phaddw xmm1, xmm3 |
michael@0 | 2014 | psraw xmm0, 8 |
michael@0 | 2015 | psraw xmm1, 8 |
michael@0 | 2016 | packsswb xmm0, xmm1 |
michael@0 | 2017 | paddb xmm0, xmm5 // -> unsigned |
michael@0 | 2018 | |
michael@0 | 2019 | // step 3 - store 8 U and 8 V values |
michael@0 | 2020 | sub ecx, 16 |
michael@0 | 2021 | movlps qword ptr [edx], xmm0 // U |
michael@0 | 2022 | movhps qword ptr [edx + edi], xmm0 // V |
michael@0 | 2023 | lea edx, [edx + 8] |
michael@0 | 2024 | jg convertloop |
michael@0 | 2025 | |
michael@0 | 2026 | pop edi |
michael@0 | 2027 | pop esi |
michael@0 | 2028 | ret |
michael@0 | 2029 | } |
michael@0 | 2030 | } |
michael@0 | 2031 | #endif // HAS_ARGBTOYROW_SSSE3 |
michael@0 | 2032 | |
michael@0 | 2033 | #define YG 74 /* (int8)(1.164 * 64 + 0.5) */ |
michael@0 | 2034 | |
michael@0 | 2035 | #define UB 127 /* min(127,(int8)(2.018 * 64)) */ |
michael@0 | 2036 | #define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ |
michael@0 | 2037 | #define UR 0 |
michael@0 | 2038 | |
michael@0 | 2039 | #define VB 0 |
michael@0 | 2040 | #define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ |
michael@0 | 2041 | #define VR 102 /* (int8)(1.596 * 64 + 0.5) */ |
michael@0 | 2042 | |
michael@0 | 2043 | // Bias |
michael@0 | 2044 | #define BB UB * 128 + VB * 128 |
michael@0 | 2045 | #define BG UG * 128 + VG * 128 |
michael@0 | 2046 | #define BR UR * 128 + VR * 128 |
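A minimal scalar model of how these constants are combined by the YUVTORGB kernels below (an illustrative sketch, not a libyuv API; the helper names are made up). The coefficients are scaled by 64, the BB/BG/BR biases remove the +128 centring of U and V, Y is offset by 16 and multiplied by YG, and the final >>6 undoes the scaling before clamping to 8 bits. The SIMD code uses saturating word adds followed by packuswb, so extreme inputs can differ slightly from this single end clamp.

static inline uint8 Clamp255_Sketch(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void YuvToRgbPixel_Sketch(uint8 y, uint8 u, uint8 v,
                                 uint8* b, uint8* g, uint8* r) {
  int y1 = (y - 16) * YG;  // luma contribution, scaled by 64
  // BB/BG/BR are defined without parentheses above, hence the (BB) etc. here.
  *b = Clamp255_Sketch((UB * u + VB * v - (BB) + y1) >> 6);
  *g = Clamp255_Sketch((UG * u + VG * v - (BG) + y1) >> 6);
  *r = Clamp255_Sketch((UR * u + VR * v - (BR) + y1) >> 6);
}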
michael@0 | 2047 | |
michael@0 | 2048 | #ifdef HAS_I422TOARGBROW_AVX2 |
michael@0 | 2049 | |
michael@0 | 2050 | static const lvec8 kUVToB_AVX = { |
michael@0 | 2051 | UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, |
michael@0 | 2052 | UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB |
michael@0 | 2053 | }; |
michael@0 | 2054 | static const lvec8 kUVToR_AVX = { |
michael@0 | 2055 | UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, |
michael@0 | 2056 | UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR |
michael@0 | 2057 | }; |
michael@0 | 2058 | static const lvec8 kUVToG_AVX = { |
michael@0 | 2059 | UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, |
michael@0 | 2060 | UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG |
michael@0 | 2061 | }; |
michael@0 | 2062 | static const lvec16 kYToRgb_AVX = { |
michael@0 | 2063 | YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG |
michael@0 | 2064 | }; |
michael@0 | 2065 | static const lvec16 kYSub16_AVX = { |
michael@0 | 2066 | 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 |
michael@0 | 2067 | }; |
michael@0 | 2068 | static const lvec16 kUVBiasB_AVX = { |
michael@0 | 2069 | BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB |
michael@0 | 2070 | }; |
michael@0 | 2071 | static const lvec16 kUVBiasG_AVX = { |
michael@0 | 2072 | BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG |
michael@0 | 2073 | }; |
michael@0 | 2074 | static const lvec16 kUVBiasR_AVX = { |
michael@0 | 2075 | BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR |
michael@0 | 2076 | }; |
michael@0 | 2077 | |
michael@0 | 2078 | // 16 pixels |
michael@0 | 2079 | // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
michael@0 | 2080 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2081 | void I422ToARGBRow_AVX2(const uint8* y_buf, |
michael@0 | 2082 | const uint8* u_buf, |
michael@0 | 2083 | const uint8* v_buf, |
michael@0 | 2084 | uint8* dst_argb, |
michael@0 | 2085 | int width) { |
michael@0 | 2086 | __asm { |
michael@0 | 2087 | push esi |
michael@0 | 2088 | push edi |
michael@0 | 2089 | mov eax, [esp + 8 + 4] // Y |
michael@0 | 2090 | mov esi, [esp + 8 + 8] // U |
michael@0 | 2091 | mov edi, [esp + 8 + 12] // V |
michael@0 | 2092 | mov edx, [esp + 8 + 16] // argb |
michael@0 | 2093 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 2094 | sub edi, esi |
michael@0 | 2095 | vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
michael@0 | 2096 | vpxor ymm4, ymm4, ymm4 |
michael@0 | 2097 | |
michael@0 | 2098 | align 4 |
michael@0 | 2099 | convertloop: |
michael@0 | 2100 | vmovq xmm0, qword ptr [esi] // U |
michael@0 | 2101 | vmovq xmm1, qword ptr [esi + edi] // V |
michael@0 | 2102 | lea esi, [esi + 8] |
michael@0 | 2103 | vpunpcklbw ymm0, ymm0, ymm1 // UV |
michael@0 | 2104 | vpermq ymm0, ymm0, 0xd8 |
michael@0 | 2105 | vpunpcklwd ymm0, ymm0, ymm0 // UVUV |
michael@0 | 2106 | vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV |
michael@0 | 2107 | vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV |
michael@0 | 2108 | vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV |
michael@0 | 2109 | vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed |
michael@0 | 2110 | vpsubw ymm1, ymm1, kUVBiasG_AVX |
michael@0 | 2111 | vpsubw ymm0, ymm0, kUVBiasR_AVX |
michael@0 | 2112 | |
michael@0 | 2113 | // Step 2: Find Y contribution to 16 R,G,B values |
michael@0 | 2114 | vmovdqu xmm3, [eax] // NOLINT |
michael@0 | 2115 | lea eax, [eax + 16] |
michael@0 | 2116 | vpermq ymm3, ymm3, 0xd8 |
michael@0 | 2117 | vpunpcklbw ymm3, ymm3, ymm4 |
michael@0 | 2118 | vpsubsw ymm3, ymm3, kYSub16_AVX |
michael@0 | 2119 | vpmullw ymm3, ymm3, kYToRgb_AVX |
michael@0 | 2120 | vpaddsw ymm2, ymm2, ymm3 // B += Y |
michael@0 | 2121 | vpaddsw ymm1, ymm1, ymm3 // G += Y |
michael@0 | 2122 | vpaddsw ymm0, ymm0, ymm3 // R += Y |
michael@0 | 2123 | vpsraw ymm2, ymm2, 6 |
michael@0 | 2124 | vpsraw ymm1, ymm1, 6 |
michael@0 | 2125 | vpsraw ymm0, ymm0, 6 |
michael@0 | 2126 | vpackuswb ymm2, ymm2, ymm2 // B |
michael@0 | 2127 | vpackuswb ymm1, ymm1, ymm1 // G |
michael@0 | 2128 | vpackuswb ymm0, ymm0, ymm0 // R |
michael@0 | 2129 | |
michael@0 | 2130 | // Step 3: Weave into ARGB |
michael@0 | 2131 | vpunpcklbw ymm2, ymm2, ymm1 // BG |
michael@0 | 2132 | vpermq ymm2, ymm2, 0xd8 |
michael@0 | 2133 | vpunpcklbw ymm0, ymm0, ymm5 // RA |
michael@0 | 2134 | vpermq ymm0, ymm0, 0xd8 |
michael@0 | 2135 | vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels |
michael@0 | 2136 | vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels |
michael@0 | 2137 | vmovdqu [edx], ymm1 |
michael@0 | 2138 | vmovdqu [edx + 32], ymm2 |
michael@0 | 2139 | lea edx, [edx + 64] |
michael@0 | 2140 | sub ecx, 16 |
michael@0 | 2141 | jg convertloop |
michael@0 | 2142 | vzeroupper |
michael@0 | 2143 | |
michael@0 | 2144 | pop edi |
michael@0 | 2145 | pop esi |
michael@0 | 2146 | ret |
michael@0 | 2147 | } |
michael@0 | 2148 | } |
michael@0 | 2149 | #endif // HAS_I422TOARGBROW_AVX2 |
michael@0 | 2150 | |
michael@0 | 2151 | #ifdef HAS_I422TOARGBROW_SSSE3 |
michael@0 | 2152 | |
michael@0 | 2153 | static const vec8 kUVToB = { |
michael@0 | 2154 | UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB |
michael@0 | 2155 | }; |
michael@0 | 2156 | |
michael@0 | 2157 | static const vec8 kUVToR = { |
michael@0 | 2158 | UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR |
michael@0 | 2159 | }; |
michael@0 | 2160 | |
michael@0 | 2161 | static const vec8 kUVToG = { |
michael@0 | 2162 | UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG |
michael@0 | 2163 | }; |
michael@0 | 2164 | |
michael@0 | 2165 | static const vec8 kVUToB = { |
michael@0 | 2166 | VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, |
michael@0 | 2167 | }; |
michael@0 | 2168 | |
michael@0 | 2169 | static const vec8 kVUToR = { |
michael@0 | 2170 | VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, |
michael@0 | 2171 | }; |
michael@0 | 2172 | |
michael@0 | 2173 | static const vec8 kVUToG = { |
michael@0 | 2174 | VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, |
michael@0 | 2175 | }; |
michael@0 | 2176 | |
michael@0 | 2177 | static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG }; |
michael@0 | 2178 | static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 }; |
michael@0 | 2179 | static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; |
michael@0 | 2180 | static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; |
michael@0 | 2181 | static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; |
michael@0 | 2182 | |
michael@0 | 2183 | // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. |
michael@0 | 2184 | |
michael@0 | 2185 | // Read 8 UV from 444. |
michael@0 | 2186 | #define READYUV444 __asm { \ |
michael@0 | 2187 | __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ |
michael@0 | 2188 | __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \ |
michael@0 | 2189 | __asm lea esi, [esi + 8] \ |
michael@0 | 2190 | __asm punpcklbw xmm0, xmm1 /* UV */ \ |
michael@0 | 2191 | } |
michael@0 | 2192 | |
michael@0 | 2193 | // Read 4 UV from 422, upsample to 8 UV. |
michael@0 | 2194 | #define READYUV422 __asm { \ |
michael@0 | 2195 | __asm movd xmm0, [esi] /* U */ \ |
michael@0 | 2196 | __asm movd xmm1, [esi + edi] /* V */ \ |
michael@0 | 2197 | __asm lea esi, [esi + 4] \ |
michael@0 | 2198 | __asm punpcklbw xmm0, xmm1 /* UV */ \ |
michael@0 | 2199 | __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
michael@0 | 2200 | } |
michael@0 | 2201 | |
michael@0 | 2202 | // Read 2 UV from 411, upsample to 8 UV. |
michael@0 | 2203 | #define READYUV411 __asm { \ |
michael@0 | 2204 | __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \ |
michael@0 | 2205 | __asm movd xmm0, ebx \ |
michael@0 | 2206 | __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \ |
michael@0 | 2207 | __asm movd xmm1, ebx \ |
michael@0 | 2208 | __asm lea esi, [esi + 2] \ |
michael@0 | 2209 | __asm punpcklbw xmm0, xmm1 /* UV */ \ |
michael@0 | 2210 | __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
michael@0 | 2211 | __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \ |
michael@0 | 2212 | } |
michael@0 | 2213 | |
michael@0 | 2214 | // Read 4 UV from NV12, upsample to 8 UV. |
michael@0 | 2215 | #define READNV12 __asm { \ |
michael@0 | 2216 | __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \ |
michael@0 | 2217 | __asm lea esi, [esi + 8] \ |
michael@0 | 2218 | __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
michael@0 | 2219 | } |
michael@0 | 2220 | |
michael@0 | 2221 | // Convert 8 pixels: 8 UV and 8 Y. |
michael@0 | 2222 | #define YUVTORGB __asm { \ |
michael@0 | 2223 | /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ |
michael@0 | 2224 | __asm movdqa xmm1, xmm0 \ |
michael@0 | 2225 | __asm movdqa xmm2, xmm0 \ |
michael@0 | 2226 | __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ |
michael@0 | 2227 | __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ |
michael@0 | 2228 | __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ |
michael@0 | 2229 | __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ |
michael@0 | 2230 | __asm psubw xmm1, kUVBiasG \ |
michael@0 | 2231 | __asm psubw xmm2, kUVBiasR \ |
michael@0 | 2232 | /* Step 2: Find Y contribution to 8 R,G,B values */ \ |
michael@0 | 2233 | __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ |
michael@0 | 2234 | __asm lea eax, [eax + 8] \ |
michael@0 | 2235 | __asm punpcklbw xmm3, xmm4 \ |
michael@0 | 2236 | __asm psubsw xmm3, kYSub16 \ |
michael@0 | 2237 | __asm pmullw xmm3, kYToRgb \ |
michael@0 | 2238 | __asm paddsw xmm0, xmm3 /* B += Y */ \ |
michael@0 | 2239 | __asm paddsw xmm1, xmm3 /* G += Y */ \ |
michael@0 | 2240 | __asm paddsw xmm2, xmm3 /* R += Y */ \ |
michael@0 | 2241 | __asm psraw xmm0, 6 \ |
michael@0 | 2242 | __asm psraw xmm1, 6 \ |
michael@0 | 2243 | __asm psraw xmm2, 6 \ |
michael@0 | 2244 | __asm packuswb xmm0, xmm0 /* B */ \ |
michael@0 | 2245 | __asm packuswb xmm1, xmm1 /* G */ \ |
michael@0 | 2246 | __asm packuswb xmm2, xmm2 /* R */ \ |
michael@0 | 2247 | } |
michael@0 | 2248 | |
michael@0 | 2249 | // Convert 8 pixels: 8 VU and 8 Y. |
michael@0 | 2250 | #define YVUTORGB __asm { \ |
michael@0 | 2251 | /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ |
michael@0 | 2252 | __asm movdqa xmm1, xmm0 \ |
michael@0 | 2253 | __asm movdqa xmm2, xmm0 \ |
michael@0 | 2254 | __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \ |
michael@0 | 2255 | __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \ |
michael@0 | 2256 | __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \ |
michael@0 | 2257 | __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ |
michael@0 | 2258 | __asm psubw xmm1, kUVBiasG \ |
michael@0 | 2259 | __asm psubw xmm2, kUVBiasR \ |
michael@0 | 2260 | /* Step 2: Find Y contribution to 8 R,G,B values */ \ |
michael@0 | 2261 | __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ |
michael@0 | 2262 | __asm lea eax, [eax + 8] \ |
michael@0 | 2263 | __asm punpcklbw xmm3, xmm4 \ |
michael@0 | 2264 | __asm psubsw xmm3, kYSub16 \ |
michael@0 | 2265 | __asm pmullw xmm3, kYToRgb \ |
michael@0 | 2266 | __asm paddsw xmm0, xmm3 /* B += Y */ \ |
michael@0 | 2267 | __asm paddsw xmm1, xmm3 /* G += Y */ \ |
michael@0 | 2268 | __asm paddsw xmm2, xmm3 /* R += Y */ \ |
michael@0 | 2269 | __asm psraw xmm0, 6 \ |
michael@0 | 2270 | __asm psraw xmm1, 6 \ |
michael@0 | 2271 | __asm psraw xmm2, 6 \ |
michael@0 | 2272 | __asm packuswb xmm0, xmm0 /* B */ \ |
michael@0 | 2273 | __asm packuswb xmm1, xmm1 /* G */ \ |
michael@0 | 2274 | __asm packuswb xmm2, xmm2 /* R */ \ |
michael@0 | 2275 | } |
michael@0 | 2276 | |
michael@0 | 2277 | // 8 pixels, dest aligned 16. |
michael@0 | 2278 | // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). |
michael@0 | 2279 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2280 | void I444ToARGBRow_SSSE3(const uint8* y_buf, |
michael@0 | 2281 | const uint8* u_buf, |
michael@0 | 2282 | const uint8* v_buf, |
michael@0 | 2283 | uint8* dst_argb, |
michael@0 | 2284 | int width) { |
michael@0 | 2285 | __asm { |
michael@0 | 2286 | push esi |
michael@0 | 2287 | push edi |
michael@0 | 2288 | mov eax, [esp + 8 + 4] // Y |
michael@0 | 2289 | mov esi, [esp + 8 + 8] // U |
michael@0 | 2290 | mov edi, [esp + 8 + 12] // V |
michael@0 | 2291 | mov edx, [esp + 8 + 16] // argb |
michael@0 | 2292 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 2293 | sub edi, esi |
michael@0 | 2294 | pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
michael@0 | 2295 | pxor xmm4, xmm4 |
michael@0 | 2296 | |
michael@0 | 2297 | align 4 |
michael@0 | 2298 | convertloop: |
michael@0 | 2299 | READYUV444 |
michael@0 | 2300 | YUVTORGB |
michael@0 | 2301 | |
michael@0 | 2302 | // Step 3: Weave into ARGB |
michael@0 | 2303 | punpcklbw xmm0, xmm1 // BG |
michael@0 | 2304 | punpcklbw xmm2, xmm5 // RA |
michael@0 | 2305 | movdqa xmm1, xmm0 |
michael@0 | 2306 | punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
michael@0 | 2307 | punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
michael@0 | 2308 | movdqa [edx], xmm0 |
michael@0 | 2309 | movdqa [edx + 16], xmm1 |
michael@0 | 2310 | lea edx, [edx + 32] |
michael@0 | 2311 | sub ecx, 8 |
michael@0 | 2312 | jg convertloop |
michael@0 | 2313 | |
michael@0 | 2314 | pop edi |
michael@0 | 2315 | pop esi |
michael@0 | 2316 | ret |
michael@0 | 2317 | } |
michael@0 | 2318 | } |
michael@0 | 2319 | |
michael@0 | 2320 | // 8 pixels, dest aligned 16. |
michael@0 | 2321 | // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 pixels (24 bytes). |
michael@0 | 2322 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2323 | void I422ToRGB24Row_SSSE3(const uint8* y_buf, |
michael@0 | 2324 | const uint8* u_buf, |
michael@0 | 2325 | const uint8* v_buf, |
michael@0 | 2326 | uint8* dst_rgb24, |
michael@0 | 2327 | int width) { |
michael@0 | 2328 | __asm { |
michael@0 | 2329 | push esi |
michael@0 | 2330 | push edi |
michael@0 | 2331 | mov eax, [esp + 8 + 4] // Y |
michael@0 | 2332 | mov esi, [esp + 8 + 8] // U |
michael@0 | 2333 | mov edi, [esp + 8 + 12] // V |
michael@0 | 2334 | mov edx, [esp + 8 + 16] // rgb24 |
michael@0 | 2335 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 2336 | sub edi, esi |
michael@0 | 2337 | pxor xmm4, xmm4 |
michael@0 | 2338 | movdqa xmm5, kShuffleMaskARGBToRGB24_0 |
michael@0 | 2339 | movdqa xmm6, kShuffleMaskARGBToRGB24 |
michael@0 | 2340 | |
michael@0 | 2341 | align 4 |
michael@0 | 2342 | convertloop: |
michael@0 | 2343 | READYUV422 |
michael@0 | 2344 | YUVTORGB |
michael@0 | 2345 | |
michael@0 | 2346 | // Step 3: Weave into RRGB |
michael@0 | 2347 | punpcklbw xmm0, xmm1 // BG |
michael@0 | 2348 | punpcklbw xmm2, xmm2 // RR |
michael@0 | 2349 | movdqa xmm1, xmm0 |
michael@0 | 2350 | punpcklwd xmm0, xmm2 // BGRR first 4 pixels |
michael@0 | 2351 | punpckhwd xmm1, xmm2 // BGRR next 4 pixels |
michael@0 | 2352 | pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes. |
michael@0 | 2353 | pshufb xmm1, xmm6 // Pack into first 12 bytes. |
michael@0 | 2354 | palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1 |
michael@0 | 2355 | movq qword ptr [edx], xmm0 // First 8 bytes |
michael@0 | 2356 | movdqu [edx + 8], xmm1 // Last 16 bytes; 24 bytes total = 8 RGB pixels. |
michael@0 | 2357 | lea edx, [edx + 24] |
michael@0 | 2358 | sub ecx, 8 |
michael@0 | 2359 | jg convertloop |
michael@0 | 2360 | |
michael@0 | 2361 | pop edi |
michael@0 | 2362 | pop esi |
michael@0 | 2363 | ret |
michael@0 | 2364 | } |
michael@0 | 2365 | } |
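The pshufb/palignr sequence above packs the woven BGRR pixels down to 3 bytes each and stores 24 contiguous bytes. A scalar sketch of the net effect (illustrative only; the helper name is made up, and the RAW variant below does the same packing through its own shuffle masks):

static void PackToRGB24_Sketch(const uint8* bgrx, uint8* rgb24, int pixels) {
  for (int i = 0; i < pixels; ++i) {
    rgb24[i * 3 + 0] = bgrx[i * 4 + 0];  // B
    rgb24[i * 3 + 1] = bgrx[i * 4 + 1];  // G
    rgb24[i * 3 + 2] = bgrx[i * 4 + 2];  // R; the 4th byte is dropped
  }
}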
michael@0 | 2366 | |
michael@0 | 2367 | // 8 pixels, dest aligned 16. |
michael@0 | 2368 | // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW pixels (24 bytes). |
michael@0 | 2369 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2370 | void I422ToRAWRow_SSSE3(const uint8* y_buf, |
michael@0 | 2371 | const uint8* u_buf, |
michael@0 | 2372 | const uint8* v_buf, |
michael@0 | 2373 | uint8* dst_raw, |
michael@0 | 2374 | int width) { |
michael@0 | 2375 | __asm { |
michael@0 | 2376 | push esi |
michael@0 | 2377 | push edi |
michael@0 | 2378 | mov eax, [esp + 8 + 4] // Y |
michael@0 | 2379 | mov esi, [esp + 8 + 8] // U |
michael@0 | 2380 | mov edi, [esp + 8 + 12] // V |
michael@0 | 2381 | mov edx, [esp + 8 + 16] // raw |
michael@0 | 2382 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 2383 | sub edi, esi |
michael@0 | 2384 | pxor xmm4, xmm4 |
michael@0 | 2385 | movdqa xmm5, kShuffleMaskARGBToRAW_0 |
michael@0 | 2386 | movdqa xmm6, kShuffleMaskARGBToRAW |
michael@0 | 2387 | |
michael@0 | 2388 | align 4 |
michael@0 | 2389 | convertloop: |
michael@0 | 2390 | READYUV422 |
michael@0 | 2391 | YUVTORGB |
michael@0 | 2392 | |
michael@0 | 2393 | // Step 3: Weave into RRGB |
michael@0 | 2394 | punpcklbw xmm0, xmm1 // BG |
michael@0 | 2395 | punpcklbw xmm2, xmm2 // RR |
michael@0 | 2396 | movdqa xmm1, xmm0 |
michael@0 | 2397 | punpcklwd xmm0, xmm2 // BGRR first 4 pixels |
michael@0 | 2398 | punpckhwd xmm1, xmm2 // BGRR next 4 pixels |
michael@0 | 2399 | pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes. |
michael@0 | 2400 | pshufb xmm1, xmm6 // Pack into first 12 bytes. |
michael@0 | 2401 | palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1 |
michael@0 | 2402 | movq qword ptr [edx], xmm0 // First 8 bytes |
michael@0 | 2403 | movdqu [edx + 8], xmm1 // Last 16 bytes; 24 bytes total = 8 RGB pixels. |
michael@0 | 2404 | lea edx, [edx + 24] |
michael@0 | 2405 | sub ecx, 8 |
michael@0 | 2406 | jg convertloop |
michael@0 | 2407 | |
michael@0 | 2408 | pop edi |
michael@0 | 2409 | pop esi |
michael@0 | 2410 | ret |
michael@0 | 2411 | } |
michael@0 | 2412 | } |
michael@0 | 2413 | |
michael@0 | 2414 | // 8 pixels, dest unaligned. |
michael@0 | 2415 | // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 pixels (16 bytes). |
michael@0 | 2416 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2417 | void I422ToRGB565Row_SSSE3(const uint8* y_buf, |
michael@0 | 2418 | const uint8* u_buf, |
michael@0 | 2419 | const uint8* v_buf, |
michael@0 | 2420 | uint8* rgb565_buf, |
michael@0 | 2421 | int width) { |
michael@0 | 2422 | __asm { |
michael@0 | 2423 | push esi |
michael@0 | 2424 | push edi |
michael@0 | 2425 | mov eax, [esp + 8 + 4] // Y |
michael@0 | 2426 | mov esi, [esp + 8 + 8] // U |
michael@0 | 2427 | mov edi, [esp + 8 + 12] // V |
michael@0 | 2428 | mov edx, [esp + 8 + 16] // rgb565 |
michael@0 | 2429 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 2430 | sub edi, esi |
michael@0 | 2431 | pxor xmm4, xmm4 |
michael@0 | 2432 | pcmpeqb xmm5, xmm5 // generate mask 0x0000001f |
michael@0 | 2433 | psrld xmm5, 27 |
michael@0 | 2434 | pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 |
michael@0 | 2435 | psrld xmm6, 26 |
michael@0 | 2436 | pslld xmm6, 5 |
michael@0 | 2437 | pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 |
michael@0 | 2438 | pslld xmm7, 11 |
michael@0 | 2439 | |
michael@0 | 2440 | align 4 |
michael@0 | 2441 | convertloop: |
michael@0 | 2442 | READYUV422 |
michael@0 | 2443 | YUVTORGB |
michael@0 | 2444 | |
michael@0 | 2445 | // Step 3: Weave into RRGB |
michael@0 | 2446 | punpcklbw xmm0, xmm1 // BG |
michael@0 | 2447 | punpcklbw xmm2, xmm2 // RR |
michael@0 | 2448 | movdqa xmm1, xmm0 |
michael@0 | 2449 | punpcklwd xmm0, xmm2 // BGRR first 4 pixels |
michael@0 | 2450 | punpckhwd xmm1, xmm2 // BGRR next 4 pixels |
michael@0 | 2451 | |
michael@0 | 2452 | // Step 3b: RRGB -> RGB565 |
michael@0 | 2453 | movdqa xmm3, xmm0 // B first 4 pixels of argb |
michael@0 | 2454 | movdqa xmm2, xmm0 // G |
michael@0 | 2455 | pslld xmm0, 8 // R |
michael@0 | 2456 | psrld xmm3, 3 // B |
michael@0 | 2457 | psrld xmm2, 5 // G |
michael@0 | 2458 | psrad xmm0, 16 // R |
michael@0 | 2459 | pand xmm3, xmm5 // B |
michael@0 | 2460 | pand xmm2, xmm6 // G |
michael@0 | 2461 | pand xmm0, xmm7 // R |
michael@0 | 2462 | por xmm3, xmm2 // BG |
michael@0 | 2463 | por xmm0, xmm3 // BGR |
michael@0 | 2464 | movdqa xmm3, xmm1 // B next 4 pixels of argb |
michael@0 | 2465 | movdqa xmm2, xmm1 // G |
michael@0 | 2466 | pslld xmm1, 8 // R |
michael@0 | 2467 | psrld xmm3, 3 // B |
michael@0 | 2468 | psrld xmm2, 5 // G |
michael@0 | 2469 | psrad xmm1, 16 // R |
michael@0 | 2470 | pand xmm3, xmm5 // B |
michael@0 | 2471 | pand xmm2, xmm6 // G |
michael@0 | 2472 | pand xmm1, xmm7 // R |
michael@0 | 2473 | por xmm3, xmm2 // BG |
michael@0 | 2474 | por xmm1, xmm3 // BGR |
michael@0 | 2475 | packssdw xmm0, xmm1 |
michael@0 | 2476 | sub ecx, 8 |
michael@0 | 2477 | movdqu [edx], xmm0 // store 8 pixels of RGB565 |
michael@0 | 2478 | lea edx, [edx + 16] |
michael@0 | 2479 | jg convertloop |
michael@0 | 2480 | |
michael@0 | 2481 | pop edi |
michael@0 | 2482 | pop esi |
michael@0 | 2483 | ret |
michael@0 | 2484 | } |
michael@0 | 2485 | } |
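Per pixel, the "Step 3b" mask-and-shift sequence above amounts to the usual RGB565 packing: keep the top 5 bits of B, 6 of G and 5 of R and place them at bits 0, 5 and 11 of a 16-bit value. A scalar sketch (illustrative only; the helper name is made up):

static inline uint16 PackRGB565_Sketch(uint8 b, uint8 g, uint8 r) {
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}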
michael@0 | 2486 | |
michael@0 | 2487 | // 8 pixels, dest aligned 16. |
michael@0 | 2488 | // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
michael@0 | 2489 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2490 | void I422ToARGBRow_SSSE3(const uint8* y_buf, |
michael@0 | 2491 | const uint8* u_buf, |
michael@0 | 2492 | const uint8* v_buf, |
michael@0 | 2493 | uint8* dst_argb, |
michael@0 | 2494 | int width) { |
michael@0 | 2495 | __asm { |
michael@0 | 2496 | push esi |
michael@0 | 2497 | push edi |
michael@0 | 2498 | mov eax, [esp + 8 + 4] // Y |
michael@0 | 2499 | mov esi, [esp + 8 + 8] // U |
michael@0 | 2500 | mov edi, [esp + 8 + 12] // V |
michael@0 | 2501 | mov edx, [esp + 8 + 16] // argb |
michael@0 | 2502 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 2503 | sub edi, esi |
michael@0 | 2504 | pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
michael@0 | 2505 | pxor xmm4, xmm4 |
michael@0 | 2506 | |
michael@0 | 2507 | align 4 |
michael@0 | 2508 | convertloop: |
michael@0 | 2509 | READYUV422 |
michael@0 | 2510 | YUVTORGB |
michael@0 | 2511 | |
michael@0 | 2512 | // Step 3: Weave into ARGB |
michael@0 | 2513 | punpcklbw xmm0, xmm1 // BG |
michael@0 | 2514 | punpcklbw xmm2, xmm5 // RA |
michael@0 | 2515 | movdqa xmm1, xmm0 |
michael@0 | 2516 | punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
michael@0 | 2517 | punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
michael@0 | 2518 | movdqa [edx], xmm0 |
michael@0 | 2519 | movdqa [edx + 16], xmm1 |
michael@0 | 2520 | lea edx, [edx + 32] |
michael@0 | 2521 | sub ecx, 8 |
michael@0 | 2522 | jg convertloop |
michael@0 | 2523 | |
michael@0 | 2524 | pop edi |
michael@0 | 2525 | pop esi |
michael@0 | 2526 | ret |
michael@0 | 2527 | } |
michael@0 | 2528 | } |
michael@0 | 2529 | |
michael@0 | 2530 | // 8 pixels, dest aligned 16. |
michael@0 | 2531 | // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
michael@0 | 2532 | // Similar to I420 but duplicates UV once more. |
michael@0 | 2533 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2534 | void I411ToARGBRow_SSSE3(const uint8* y_buf, |
michael@0 | 2535 | const uint8* u_buf, |
michael@0 | 2536 | const uint8* v_buf, |
michael@0 | 2537 | uint8* dst_argb, |
michael@0 | 2538 | int width) { |
michael@0 | 2539 | __asm { |
michael@0 | 2540 | push ebx |
michael@0 | 2541 | push esi |
michael@0 | 2542 | push edi |
michael@0 | 2543 | mov eax, [esp + 12 + 4] // Y |
michael@0 | 2544 | mov esi, [esp + 12 + 8] // U |
michael@0 | 2545 | mov edi, [esp + 12 + 12] // V |
michael@0 | 2546 | mov edx, [esp + 12 + 16] // argb |
michael@0 | 2547 | mov ecx, [esp + 12 + 20] // width |
michael@0 | 2548 | sub edi, esi |
michael@0 | 2549 | pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
michael@0 | 2550 | pxor xmm4, xmm4 |
michael@0 | 2551 | |
michael@0 | 2552 | align 4 |
michael@0 | 2553 | convertloop: |
michael@0 | 2554 | READYUV411 // modifies EBX |
michael@0 | 2555 | YUVTORGB |
michael@0 | 2556 | |
michael@0 | 2557 | // Step 3: Weave into ARGB |
michael@0 | 2558 | punpcklbw xmm0, xmm1 // BG |
michael@0 | 2559 | punpcklbw xmm2, xmm5 // RA |
michael@0 | 2560 | movdqa xmm1, xmm0 |
michael@0 | 2561 | punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
michael@0 | 2562 | punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
michael@0 | 2563 | movdqa [edx], xmm0 |
michael@0 | 2564 | movdqa [edx + 16], xmm1 |
michael@0 | 2565 | lea edx, [edx + 32] |
michael@0 | 2566 | sub ecx, 8 |
michael@0 | 2567 | jg convertloop |
michael@0 | 2568 | |
michael@0 | 2569 | pop edi |
michael@0 | 2570 | pop esi |
michael@0 | 2571 | pop ebx |
michael@0 | 2572 | ret |
michael@0 | 2573 | } |
michael@0 | 2574 | } |
michael@0 | 2575 | |
michael@0 | 2576 | // 8 pixels, dest aligned 16. |
michael@0 | 2577 | // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
michael@0 | 2578 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2579 | void NV12ToARGBRow_SSSE3(const uint8* y_buf, |
michael@0 | 2580 | const uint8* uv_buf, |
michael@0 | 2581 | uint8* dst_argb, |
michael@0 | 2582 | int width) { |
michael@0 | 2583 | __asm { |
michael@0 | 2584 | push esi |
michael@0 | 2585 | mov eax, [esp + 4 + 4] // Y |
michael@0 | 2586 | mov esi, [esp + 4 + 8] // UV |
michael@0 | 2587 | mov edx, [esp + 4 + 12] // argb |
michael@0 | 2588 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 2589 | pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
michael@0 | 2590 | pxor xmm4, xmm4 |
michael@0 | 2591 | |
michael@0 | 2592 | align 4 |
michael@0 | 2593 | convertloop: |
michael@0 | 2594 | READNV12 |
michael@0 | 2595 | YUVTORGB |
michael@0 | 2596 | |
michael@0 | 2597 | // Step 3: Weave into ARGB |
michael@0 | 2598 | punpcklbw xmm0, xmm1 // BG |
michael@0 | 2599 | punpcklbw xmm2, xmm5 // RA |
michael@0 | 2600 | movdqa xmm1, xmm0 |
michael@0 | 2601 | punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
michael@0 | 2602 | punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
michael@0 | 2603 | movdqa [edx], xmm0 |
michael@0 | 2604 | movdqa [edx + 16], xmm1 |
michael@0 | 2605 | lea edx, [edx + 32] |
michael@0 | 2606 | sub ecx, 8 |
michael@0 | 2607 | jg convertloop |
michael@0 | 2608 | |
michael@0 | 2609 | pop esi |
michael@0 | 2610 | ret |
michael@0 | 2611 | } |
michael@0 | 2612 | } |
michael@0 | 2613 | |
michael@0 | 2614 | // 8 pixels, dest aligned 16. |
michael@0 | 2615 | // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
michael@0 | 2616 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2617 | void NV21ToARGBRow_SSSE3(const uint8* y_buf, |
michael@0 | 2618 | const uint8* uv_buf, |
michael@0 | 2619 | uint8* dst_argb, |
michael@0 | 2620 | int width) { |
michael@0 | 2621 | __asm { |
michael@0 | 2622 | push esi |
michael@0 | 2623 | mov eax, [esp + 4 + 4] // Y |
michael@0 | 2624 | mov esi, [esp + 4 + 8] // VU |
michael@0 | 2625 | mov edx, [esp + 4 + 12] // argb |
michael@0 | 2626 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 2627 | pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
michael@0 | 2628 | pxor xmm4, xmm4 |
michael@0 | 2629 | |
michael@0 | 2630 | align 4 |
michael@0 | 2631 | convertloop: |
michael@0 | 2632 | READNV12 |
michael@0 | 2633 | YVUTORGB |
michael@0 | 2634 | |
michael@0 | 2635 | // Step 3: Weave into ARGB |
michael@0 | 2636 | punpcklbw xmm0, xmm1 // BG |
michael@0 | 2637 | punpcklbw xmm2, xmm5 // RA |
michael@0 | 2638 | movdqa xmm1, xmm0 |
michael@0 | 2639 | punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
michael@0 | 2640 | punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
michael@0 | 2641 | movdqa [edx], xmm0 |
michael@0 | 2642 | movdqa [edx + 16], xmm1 |
michael@0 | 2643 | lea edx, [edx + 32] |
michael@0 | 2644 | sub ecx, 8 |
michael@0 | 2645 | jg convertloop |
michael@0 | 2646 | |
michael@0 | 2647 | pop esi |
michael@0 | 2648 | ret |
michael@0 | 2649 | } |
michael@0 | 2650 | } |
michael@0 | 2651 | |
michael@0 | 2652 | // 8 pixels, unaligned. |
michael@0 | 2653 | // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). |
michael@0 | 2654 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2655 | void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, |
michael@0 | 2656 | const uint8* u_buf, |
michael@0 | 2657 | const uint8* v_buf, |
michael@0 | 2658 | uint8* dst_argb, |
michael@0 | 2659 | int width) { |
michael@0 | 2660 | __asm { |
michael@0 | 2661 | push esi |
michael@0 | 2662 | push edi |
michael@0 | 2663 | mov eax, [esp + 8 + 4] // Y |
michael@0 | 2664 | mov esi, [esp + 8 + 8] // U |
michael@0 | 2665 | mov edi, [esp + 8 + 12] // V |
michael@0 | 2666 | mov edx, [esp + 8 + 16] // argb |
michael@0 | 2667 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 2668 | sub edi, esi |
michael@0 | 2669 | pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
michael@0 | 2670 | pxor xmm4, xmm4 |
michael@0 | 2671 | |
michael@0 | 2672 | align 4 |
michael@0 | 2673 | convertloop: |
michael@0 | 2674 | READYUV444 |
michael@0 | 2675 | YUVTORGB |
michael@0 | 2676 | |
michael@0 | 2677 | // Step 3: Weave into ARGB |
michael@0 | 2678 | punpcklbw xmm0, xmm1 // BG |
michael@0 | 2679 | punpcklbw xmm2, xmm5 // RA |
michael@0 | 2680 | movdqa xmm1, xmm0 |
michael@0 | 2681 | punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
michael@0 | 2682 | punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
michael@0 | 2683 | movdqu [edx], xmm0 |
michael@0 | 2684 | movdqu [edx + 16], xmm1 |
michael@0 | 2685 | lea edx, [edx + 32] |
michael@0 | 2686 | sub ecx, 8 |
michael@0 | 2687 | jg convertloop |
michael@0 | 2688 | |
michael@0 | 2689 | pop edi |
michael@0 | 2690 | pop esi |
michael@0 | 2691 | ret |
michael@0 | 2692 | } |
michael@0 | 2693 | } |
michael@0 | 2694 | |
michael@0 | 2695 | // 8 pixels, unaligned. |
michael@0 | 2696 | // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
michael@0 | 2697 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2698 | void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, |
michael@0 | 2699 | const uint8* u_buf, |
michael@0 | 2700 | const uint8* v_buf, |
michael@0 | 2701 | uint8* dst_argb, |
michael@0 | 2702 | int width) { |
michael@0 | 2703 | __asm { |
michael@0 | 2704 | push esi |
michael@0 | 2705 | push edi |
michael@0 | 2706 | mov eax, [esp + 8 + 4] // Y |
michael@0 | 2707 | mov esi, [esp + 8 + 8] // U |
michael@0 | 2708 | mov edi, [esp + 8 + 12] // V |
michael@0 | 2709 | mov edx, [esp + 8 + 16] // argb |
michael@0 | 2710 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 2711 | sub edi, esi |
michael@0 | 2712 | pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
michael@0 | 2713 | pxor xmm4, xmm4 |
michael@0 | 2714 | |
michael@0 | 2715 | align 4 |
michael@0 | 2716 | convertloop: |
michael@0 | 2717 | READYUV422 |
michael@0 | 2718 | YUVTORGB |
michael@0 | 2719 | |
michael@0 | 2720 | // Step 3: Weave into ARGB |
michael@0 | 2721 | punpcklbw xmm0, xmm1 // BG |
michael@0 | 2722 | punpcklbw xmm2, xmm5 // RA |
michael@0 | 2723 | movdqa xmm1, xmm0 |
michael@0 | 2724 | punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
michael@0 | 2725 | punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
michael@0 | 2726 | movdqu [edx], xmm0 |
michael@0 | 2727 | movdqu [edx + 16], xmm1 |
michael@0 | 2728 | lea edx, [edx + 32] |
michael@0 | 2729 | sub ecx, 8 |
michael@0 | 2730 | jg convertloop |
michael@0 | 2731 | |
michael@0 | 2732 | pop edi |
michael@0 | 2733 | pop esi |
michael@0 | 2734 | ret |
michael@0 | 2735 | } |
michael@0 | 2736 | } |
michael@0 | 2737 | |
michael@0 | 2738 | // 8 pixels, unaligned. |
michael@0 | 2739 | // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
michael@0 | 2740 | // Similar to I420 but duplicate UV once more. |
michael@0 | 2741 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2742 | void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, |
michael@0 | 2743 | const uint8* u_buf, |
michael@0 | 2744 | const uint8* v_buf, |
michael@0 | 2745 | uint8* dst_argb, |
michael@0 | 2746 | int width) { |
michael@0 | 2747 | __asm { |
michael@0 | 2748 | push ebx |
michael@0 | 2749 | push esi |
michael@0 | 2750 | push edi |
michael@0 | 2751 | mov eax, [esp + 12 + 4] // Y |
michael@0 | 2752 | mov esi, [esp + 12 + 8] // U |
michael@0 | 2753 | mov edi, [esp + 12 + 12] // V |
michael@0 | 2754 | mov edx, [esp + 12 + 16] // argb |
michael@0 | 2755 | mov ecx, [esp + 12 + 20] // width |
michael@0 | 2756 | sub edi, esi |
michael@0 | 2757 | pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
michael@0 | 2758 | pxor xmm4, xmm4 |
michael@0 | 2759 | |
michael@0 | 2760 | align 4 |
michael@0 | 2761 | convertloop: |
michael@0 | 2762 | READYUV411 // modifies EBX |
michael@0 | 2763 | YUVTORGB |
michael@0 | 2764 | |
michael@0 | 2765 | // Step 3: Weave into ARGB |
michael@0 | 2766 | punpcklbw xmm0, xmm1 // BG |
michael@0 | 2767 | punpcklbw xmm2, xmm5 // RA |
michael@0 | 2768 | movdqa xmm1, xmm0 |
michael@0 | 2769 | punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
michael@0 | 2770 | punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
michael@0 | 2771 | movdqu [edx], xmm0 |
michael@0 | 2772 | movdqu [edx + 16], xmm1 |
michael@0 | 2773 | lea edx, [edx + 32] |
michael@0 | 2774 | sub ecx, 8 |
michael@0 | 2775 | jg convertloop |
michael@0 | 2776 | |
michael@0 | 2777 | pop edi |
michael@0 | 2778 | pop esi |
michael@0 | 2779 | pop ebx |
michael@0 | 2780 | ret |
michael@0 | 2781 | } |
michael@0 | 2782 | } |
michael@0 | 2783 | |
michael@0 | 2784 | // 8 pixels, unaligned. |
michael@0 | 2785 | // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
michael@0 | 2786 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2787 | void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, |
michael@0 | 2788 | const uint8* uv_buf, |
michael@0 | 2789 | uint8* dst_argb, |
michael@0 | 2790 | int width) { |
michael@0 | 2791 | __asm { |
michael@0 | 2792 | push esi |
michael@0 | 2793 | mov eax, [esp + 4 + 4] // Y |
michael@0 | 2794 | mov esi, [esp + 4 + 8] // UV |
michael@0 | 2795 | mov edx, [esp + 4 + 12] // argb |
michael@0 | 2796 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 2797 | pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
michael@0 | 2798 | pxor xmm4, xmm4 |
michael@0 | 2799 | |
michael@0 | 2800 | align 4 |
michael@0 | 2801 | convertloop: |
michael@0 | 2802 | READNV12 |
michael@0 | 2803 | YUVTORGB |
michael@0 | 2804 | |
michael@0 | 2805 | // Step 3: Weave into ARGB |
michael@0 | 2806 | punpcklbw xmm0, xmm1 // BG |
michael@0 | 2807 | punpcklbw xmm2, xmm5 // RA |
michael@0 | 2808 | movdqa xmm1, xmm0 |
michael@0 | 2809 | punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
michael@0 | 2810 | punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
michael@0 | 2811 | movdqu [edx], xmm0 |
michael@0 | 2812 | movdqu [edx + 16], xmm1 |
michael@0 | 2813 | lea edx, [edx + 32] |
michael@0 | 2814 | sub ecx, 8 |
michael@0 | 2815 | jg convertloop |
michael@0 | 2816 | |
michael@0 | 2817 | pop esi |
michael@0 | 2818 | ret |
michael@0 | 2819 | } |
michael@0 | 2820 | } |
michael@0 | 2821 | |
michael@0 | 2822 | // 8 pixels, unaligned. |
michael@0 | 2823 | // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
michael@0 | 2824 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2825 | void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, |
michael@0 | 2826 | const uint8* uv_buf, |
michael@0 | 2827 | uint8* dst_argb, |
michael@0 | 2828 | int width) { |
michael@0 | 2829 | __asm { |
michael@0 | 2830 | push esi |
michael@0 | 2831 | mov eax, [esp + 4 + 4] // Y |
michael@0 | 2832 | mov esi, [esp + 4 + 8] // VU |
michael@0 | 2833 | mov edx, [esp + 4 + 12] // argb |
michael@0 | 2834 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 2835 | pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
michael@0 | 2836 | pxor xmm4, xmm4 |
michael@0 | 2837 | |
michael@0 | 2838 | align 4 |
michael@0 | 2839 | convertloop: |
michael@0 | 2840 | READNV12 |
michael@0 | 2841 | YVUTORGB |
michael@0 | 2842 | |
michael@0 | 2843 | // Step 3: Weave into ARGB |
michael@0 | 2844 | punpcklbw xmm0, xmm1 // BG |
michael@0 | 2845 | punpcklbw xmm2, xmm5 // RA |
michael@0 | 2846 | movdqa xmm1, xmm0 |
michael@0 | 2847 | punpcklwd xmm0, xmm2 // BGRA first 4 pixels |
michael@0 | 2848 | punpckhwd xmm1, xmm2 // BGRA next 4 pixels |
michael@0 | 2849 | movdqu [edx], xmm0 |
michael@0 | 2850 | movdqu [edx + 16], xmm1 |
michael@0 | 2851 | lea edx, [edx + 32] |
michael@0 | 2852 | sub ecx, 8 |
michael@0 | 2853 | jg convertloop |
michael@0 | 2854 | |
michael@0 | 2855 | pop esi |
michael@0 | 2856 | ret |
michael@0 | 2857 | } |
michael@0 | 2858 | } |
michael@0 | 2859 | |
michael@0 | 2860 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2861 | void I422ToBGRARow_SSSE3(const uint8* y_buf, |
michael@0 | 2862 | const uint8* u_buf, |
michael@0 | 2863 | const uint8* v_buf, |
michael@0 | 2864 | uint8* dst_bgra, |
michael@0 | 2865 | int width) { |
michael@0 | 2866 | __asm { |
michael@0 | 2867 | push esi |
michael@0 | 2868 | push edi |
michael@0 | 2869 | mov eax, [esp + 8 + 4] // Y |
michael@0 | 2870 | mov esi, [esp + 8 + 8] // U |
michael@0 | 2871 | mov edi, [esp + 8 + 12] // V |
michael@0 | 2872 | mov edx, [esp + 8 + 16] // bgra |
michael@0 | 2873 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 2874 | sub edi, esi |
michael@0 | 2875 | pxor xmm4, xmm4 |
michael@0 | 2876 | |
michael@0 | 2877 | align 4 |
michael@0 | 2878 | convertloop: |
michael@0 | 2879 | READYUV422 |
michael@0 | 2880 | YUVTORGB |
michael@0 | 2881 | |
michael@0 | 2882 | // Step 3: Weave into BGRA |
michael@0 | 2883 | pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
michael@0 | 2884 | punpcklbw xmm1, xmm0 // GB |
michael@0 | 2885 | punpcklbw xmm5, xmm2 // AR |
michael@0 | 2886 | movdqa xmm0, xmm5 |
michael@0 | 2887 | punpcklwd xmm5, xmm1 // BGRA first 4 pixels |
michael@0 | 2888 | punpckhwd xmm0, xmm1 // BGRA next 4 pixels |
michael@0 | 2889 | movdqa [edx], xmm5 |
michael@0 | 2890 | movdqa [edx + 16], xmm0 |
michael@0 | 2891 | lea edx, [edx + 32] |
michael@0 | 2892 | sub ecx, 8 |
michael@0 | 2893 | jg convertloop |
michael@0 | 2894 | |
michael@0 | 2895 | pop edi |
michael@0 | 2896 | pop esi |
michael@0 | 2897 | ret |
michael@0 | 2898 | } |
michael@0 | 2899 | } |
michael@0 | 2900 | |
michael@0 | 2901 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2902 | void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, |
michael@0 | 2903 | const uint8* u_buf, |
michael@0 | 2904 | const uint8* v_buf, |
michael@0 | 2905 | uint8* dst_bgra, |
michael@0 | 2906 | int width) { |
michael@0 | 2907 | __asm { |
michael@0 | 2908 | push esi |
michael@0 | 2909 | push edi |
michael@0 | 2910 | mov eax, [esp + 8 + 4] // Y |
michael@0 | 2911 | mov esi, [esp + 8 + 8] // U |
michael@0 | 2912 | mov edi, [esp + 8 + 12] // V |
michael@0 | 2913 | mov edx, [esp + 8 + 16] // bgra |
michael@0 | 2914 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 2915 | sub edi, esi |
michael@0 | 2916 | pxor xmm4, xmm4 |
michael@0 | 2917 | |
michael@0 | 2918 | align 4 |
michael@0 | 2919 | convertloop: |
michael@0 | 2920 | READYUV422 |
michael@0 | 2921 | YUVTORGB |
michael@0 | 2922 | |
michael@0 | 2923 | // Step 3: Weave into BGRA |
michael@0 | 2924 | pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
michael@0 | 2925 | punpcklbw xmm1, xmm0 // GB |
michael@0 | 2926 | punpcklbw xmm5, xmm2 // AR |
michael@0 | 2927 | movdqa xmm0, xmm5 |
michael@0 | 2928 | punpcklwd xmm5, xmm1 // BGRA first 4 pixels |
michael@0 | 2929 | punpckhwd xmm0, xmm1 // BGRA next 4 pixels |
michael@0 | 2930 | movdqu [edx], xmm5 |
michael@0 | 2931 | movdqu [edx + 16], xmm0 |
michael@0 | 2932 | lea edx, [edx + 32] |
michael@0 | 2933 | sub ecx, 8 |
michael@0 | 2934 | jg convertloop |
michael@0 | 2935 | |
michael@0 | 2936 | pop edi |
michael@0 | 2937 | pop esi |
michael@0 | 2938 | ret |
michael@0 | 2939 | } |
michael@0 | 2940 | } |
michael@0 | 2941 | |
michael@0 | 2942 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2943 | void I422ToABGRRow_SSSE3(const uint8* y_buf, |
michael@0 | 2944 | const uint8* u_buf, |
michael@0 | 2945 | const uint8* v_buf, |
michael@0 | 2946 | uint8* dst_abgr, |
michael@0 | 2947 | int width) { |
michael@0 | 2948 | __asm { |
michael@0 | 2949 | push esi |
michael@0 | 2950 | push edi |
michael@0 | 2951 | mov eax, [esp + 8 + 4] // Y |
michael@0 | 2952 | mov esi, [esp + 8 + 8] // U |
michael@0 | 2953 | mov edi, [esp + 8 + 12] // V |
michael@0 | 2954 | mov edx, [esp + 8 + 16] // abgr |
michael@0 | 2955 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 2956 | sub edi, esi |
michael@0 | 2957 | pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
michael@0 | 2958 | pxor xmm4, xmm4 |
michael@0 | 2959 | |
michael@0 | 2960 | align 4 |
michael@0 | 2961 | convertloop: |
michael@0 | 2962 | READYUV422 |
michael@0 | 2963 | YUVTORGB |
michael@0 | 2964 | |
michael@0 | 2965 | // Step 3: Weave into ARGB |
michael@0 | 2966 | punpcklbw xmm2, xmm1 // RG |
michael@0 | 2967 | punpcklbw xmm0, xmm5 // BA |
michael@0 | 2968 | movdqa xmm1, xmm2 |
michael@0 | 2969 | punpcklwd xmm2, xmm0 // RGBA first 4 pixels |
michael@0 | 2970 | punpckhwd xmm1, xmm0 // RGBA next 4 pixels |
michael@0 | 2971 | movdqa [edx], xmm2 |
michael@0 | 2972 | movdqa [edx + 16], xmm1 |
michael@0 | 2973 | lea edx, [edx + 32] |
michael@0 | 2974 | sub ecx, 8 |
michael@0 | 2975 | jg convertloop |
michael@0 | 2976 | |
michael@0 | 2977 | pop edi |
michael@0 | 2978 | pop esi |
michael@0 | 2979 | ret |
michael@0 | 2980 | } |
michael@0 | 2981 | } |
michael@0 | 2982 | |
michael@0 | 2983 | __declspec(naked) __declspec(align(16)) |
michael@0 | 2984 | void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, |
michael@0 | 2985 | const uint8* u_buf, |
michael@0 | 2986 | const uint8* v_buf, |
michael@0 | 2987 | uint8* dst_abgr, |
michael@0 | 2988 | int width) { |
michael@0 | 2989 | __asm { |
michael@0 | 2990 | push esi |
michael@0 | 2991 | push edi |
michael@0 | 2992 | mov eax, [esp + 8 + 4] // Y |
michael@0 | 2993 | mov esi, [esp + 8 + 8] // U |
michael@0 | 2994 | mov edi, [esp + 8 + 12] // V |
michael@0 | 2995 | mov edx, [esp + 8 + 16] // abgr |
michael@0 | 2996 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 2997 | sub edi, esi |
michael@0 | 2998 | pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
michael@0 | 2999 | pxor xmm4, xmm4 |
michael@0 | 3000 | |
michael@0 | 3001 | align 4 |
michael@0 | 3002 | convertloop: |
michael@0 | 3003 | READYUV422 |
michael@0 | 3004 | YUVTORGB |
michael@0 | 3005 | |
michael@0 | 3006 | // Step 3: Weave into ARGB |
michael@0 | 3007 | punpcklbw xmm2, xmm1 // RG |
michael@0 | 3008 | punpcklbw xmm0, xmm5 // BA |
michael@0 | 3009 | movdqa xmm1, xmm2 |
michael@0 | 3010 | punpcklwd xmm2, xmm0 // RGBA first 4 pixels |
michael@0 | 3011 | punpckhwd xmm1, xmm0 // RGBA next 4 pixels |
michael@0 | 3012 | movdqu [edx], xmm2 |
michael@0 | 3013 | movdqu [edx + 16], xmm1 |
michael@0 | 3014 | lea edx, [edx + 32] |
michael@0 | 3015 | sub ecx, 8 |
michael@0 | 3016 | jg convertloop |
michael@0 | 3017 | |
michael@0 | 3018 | pop edi |
michael@0 | 3019 | pop esi |
michael@0 | 3020 | ret |
michael@0 | 3021 | } |
michael@0 | 3022 | } |
michael@0 | 3023 | |
michael@0 | 3024 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3025 | void I422ToRGBARow_SSSE3(const uint8* y_buf, |
michael@0 | 3026 | const uint8* u_buf, |
michael@0 | 3027 | const uint8* v_buf, |
michael@0 | 3028 | uint8* dst_rgba, |
michael@0 | 3029 | int width) { |
michael@0 | 3030 | __asm { |
michael@0 | 3031 | push esi |
michael@0 | 3032 | push edi |
michael@0 | 3033 | mov eax, [esp + 8 + 4] // Y |
michael@0 | 3034 | mov esi, [esp + 8 + 8] // U |
michael@0 | 3035 | mov edi, [esp + 8 + 12] // V |
michael@0 | 3036 | mov edx, [esp + 8 + 16] // rgba |
michael@0 | 3037 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 3038 | sub edi, esi |
michael@0 | 3039 | pxor xmm4, xmm4 |
michael@0 | 3040 | |
michael@0 | 3041 | align 4 |
michael@0 | 3042 | convertloop: |
michael@0 | 3043 | READYUV422 |
michael@0 | 3044 | YUVTORGB |
michael@0 | 3045 | |
michael@0 | 3046 | // Step 3: Weave into RGBA |
michael@0 | 3047 | pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
michael@0 | 3048 | punpcklbw xmm1, xmm2 // GR |
michael@0 | 3049 | punpcklbw xmm5, xmm0 // AB |
michael@0 | 3050 | movdqa xmm0, xmm5 |
michael@0 | 3051 | punpcklwd xmm5, xmm1 // RGBA first 4 pixels |
michael@0 | 3052 | punpckhwd xmm0, xmm1 // RGBA next 4 pixels |
michael@0 | 3053 | movdqa [edx], xmm5 |
michael@0 | 3054 | movdqa [edx + 16], xmm0 |
michael@0 | 3055 | lea edx, [edx + 32] |
michael@0 | 3056 | sub ecx, 8 |
michael@0 | 3057 | jg convertloop |
michael@0 | 3058 | |
michael@0 | 3059 | pop edi |
michael@0 | 3060 | pop esi |
michael@0 | 3061 | ret |
michael@0 | 3062 | } |
michael@0 | 3063 | } |
michael@0 | 3064 | |
michael@0 | 3065 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3066 | void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, |
michael@0 | 3067 | const uint8* u_buf, |
michael@0 | 3068 | const uint8* v_buf, |
michael@0 | 3069 | uint8* dst_rgba, |
michael@0 | 3070 | int width) { |
michael@0 | 3071 | __asm { |
michael@0 | 3072 | push esi |
michael@0 | 3073 | push edi |
michael@0 | 3074 | mov eax, [esp + 8 + 4] // Y |
michael@0 | 3075 | mov esi, [esp + 8 + 8] // U |
michael@0 | 3076 | mov edi, [esp + 8 + 12] // V |
michael@0 | 3077 | mov edx, [esp + 8 + 16] // rgba |
michael@0 | 3078 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 3079 | sub edi, esi |
michael@0 | 3080 | pxor xmm4, xmm4 |
michael@0 | 3081 | |
michael@0 | 3082 | align 4 |
michael@0 | 3083 | convertloop: |
michael@0 | 3084 | READYUV422 |
michael@0 | 3085 | YUVTORGB |
michael@0 | 3086 | |
michael@0 | 3087 | // Step 3: Weave into RGBA |
michael@0 | 3088 | pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
michael@0 | 3089 | punpcklbw xmm1, xmm2 // GR |
michael@0 | 3090 | punpcklbw xmm5, xmm0 // AB |
michael@0 | 3091 | movdqa xmm0, xmm5 |
michael@0 | 3092 | punpcklwd xmm5, xmm1 // RGBA first 4 pixels |
michael@0 | 3093 | punpckhwd xmm0, xmm1 // RGBA next 4 pixels |
michael@0 | 3094 | movdqu [edx], xmm5 |
michael@0 | 3095 | movdqu [edx + 16], xmm0 |
michael@0 | 3096 | lea edx, [edx + 32] |
michael@0 | 3097 | sub ecx, 8 |
michael@0 | 3098 | jg convertloop |
michael@0 | 3099 | |
michael@0 | 3100 | pop edi |
michael@0 | 3101 | pop esi |
michael@0 | 3102 | ret |
michael@0 | 3103 | } |
michael@0 | 3104 | } |
michael@0 | 3105 | |
michael@0 | 3106 | #endif // HAS_I422TOARGBROW_SSSE3 |
michael@0 | 3107 | |
michael@0 | 3108 | #ifdef HAS_YTOARGBROW_SSE2 |
michael@0 | 3109 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3110 | void YToARGBRow_SSE2(const uint8* y_buf, |
michael@0 | 3111 | uint8* rgb_buf, |
michael@0 | 3112 | int width) { |
michael@0 | 3113 | __asm { |
michael@0 | 3114 | pxor xmm5, xmm5 |
michael@0 | 3115 | pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
michael@0 | 3116 | pslld xmm4, 24 |
michael@0 | 3117 | mov eax, 0x00100010 // 16 |
michael@0 | 3118 | movd xmm3, eax |
michael@0 | 3119 | pshufd xmm3, xmm3, 0 |
michael@0 | 3120 | mov eax, 0x004a004a // 74 |
michael@0 | 3121 | movd xmm2, eax |
michael@0 | 3122 | pshufd xmm2, xmm2, 0 |
michael@0 | 3123 | mov eax, [esp + 4] // Y |
michael@0 | 3124 | mov edx, [esp + 8] // rgb |
michael@0 | 3125 | mov ecx, [esp + 12] // width |
michael@0 | 3126 | |
michael@0 | 3127 | align 4 |
michael@0 | 3128 | convertloop: |
michael@0 | 3129 | // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 |
michael@0 | 3130 | movq xmm0, qword ptr [eax] |
michael@0 | 3131 | lea eax, [eax + 8] |
michael@0 | 3132 | punpcklbw xmm0, xmm5 // 0.Y |
michael@0 | 3133 | psubusw xmm0, xmm3 |
michael@0 | 3134 | pmullw xmm0, xmm2 |
michael@0 | 3135 | psrlw xmm0, 6 |
michael@0 | 3136 | packuswb xmm0, xmm0 // G |
michael@0 | 3137 | |
michael@0 | 3138 | // Step 2: Weave into ARGB |
michael@0 | 3139 | punpcklbw xmm0, xmm0 // GG |
michael@0 | 3140 | movdqa xmm1, xmm0 |
michael@0 | 3141 | punpcklwd xmm0, xmm0 // BGRA first 4 pixels |
michael@0 | 3142 | punpckhwd xmm1, xmm1 // BGRA next 4 pixels |
michael@0 | 3143 | por xmm0, xmm4 |
michael@0 | 3144 | por xmm1, xmm4 |
michael@0 | 3145 | movdqa [edx], xmm0 |
michael@0 | 3146 | movdqa [edx + 16], xmm1 |
michael@0 | 3147 | lea edx, [edx + 32] |
michael@0 | 3148 | sub ecx, 8 |
michael@0 | 3149 | jg convertloop |
michael@0 | 3150 | |
michael@0 | 3151 | ret |
michael@0 | 3152 | } |
michael@0 | 3153 | } |
michael@0 | 3154 | #endif // HAS_YTOARGBROW_SSE2 |
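
// Worked example of the fixed-point luma scaling above: the code subtracts a
// 16 bias (saturating), multiplies by 74 and shifts right by 6, i.e. scales by
// 74/64 = 1.15625, an approximation of the 1.164 video-range gain.  A few
// sample values, derived from the constants in the code above:
//   Y =  16  ->  (16  - 16) * 74 >> 6 = 0
//   Y = 128  ->  (128 - 16) * 74 >> 6 = 129
//   Y = 235  ->  (235 - 16) * 74 >> 6 = 253
// so the 16..235 video range expands to roughly 0..253 grey levels, which are
// then duplicated into B, G and R and OR'd with the 0xff000000 alpha mask.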
michael@0 | 3155 | |
michael@0 | 3156 | #ifdef HAS_MIRRORROW_SSSE3 |
michael@0 | 3157 | // Shuffle table for reversing the bytes. |
michael@0 | 3158 | static const uvec8 kShuffleMirror = { |
michael@0 | 3159 | 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u |
michael@0 | 3160 | }; |
michael@0 | 3161 | |
michael@0 | 3162 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3163 | void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { |
michael@0 | 3164 | __asm { |
michael@0 | 3165 | mov eax, [esp + 4] // src |
michael@0 | 3166 | mov edx, [esp + 8] // dst |
michael@0 | 3167 | mov ecx, [esp + 12] // width |
michael@0 | 3168 | movdqa xmm5, kShuffleMirror |
michael@0 | 3169 | lea eax, [eax - 16] |
michael@0 | 3170 | |
michael@0 | 3171 | align 4 |
michael@0 | 3172 | convertloop: |
michael@0 | 3173 | movdqa xmm0, [eax + ecx] |
michael@0 | 3174 | pshufb xmm0, xmm5 |
michael@0 | 3175 | sub ecx, 16 |
michael@0 | 3176 | movdqa [edx], xmm0 |
michael@0 | 3177 | lea edx, [edx + 16] |
michael@0 | 3178 | jg convertloop |
michael@0 | 3179 | ret |
michael@0 | 3180 | } |
michael@0 | 3181 | } |
michael@0 | 3182 | #endif // HAS_MIRRORROW_SSSE3 |
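
// Scalar equivalent of the byte mirror above, shown for reference only
// (MirrorRow_Sketch is an illustrative name, not part of libyuv's API):
static inline void MirrorRow_Sketch(const uint8* src, uint8* dst, int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = src[width - 1 - i];  // reverse the byte order across the row
  }
}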
michael@0 | 3183 | |
michael@0 | 3184 | #ifdef HAS_MIRRORROW_AVX2 |
michael@0 | 3185 | // Shuffle table for reversing the bytes. |
michael@0 | 3186 | static const ulvec8 kShuffleMirror_AVX2 = { |
michael@0 | 3187 | 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u, |
michael@0 | 3188 | 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u |
michael@0 | 3189 | }; |
michael@0 | 3190 | |
michael@0 | 3191 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3192 | void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { |
michael@0 | 3193 | __asm { |
michael@0 | 3194 | mov eax, [esp + 4] // src |
michael@0 | 3195 | mov edx, [esp + 8] // dst |
michael@0 | 3196 | mov ecx, [esp + 12] // width |
michael@0 | 3197 | vmovdqa ymm5, kShuffleMirror_AVX2 |
michael@0 | 3198 | lea eax, [eax - 32] |
michael@0 | 3199 | |
michael@0 | 3200 | align 4 |
michael@0 | 3201 | convertloop: |
michael@0 | 3202 | vmovdqu ymm0, [eax + ecx] |
michael@0 | 3203 | vpshufb ymm0, ymm0, ymm5 |
michael@0 | 3204 | vpermq ymm0, ymm0, 0x4e // swap high and low halves |
michael@0 | 3205 | sub ecx, 32 |
michael@0 | 3206 | vmovdqu [edx], ymm0 |
michael@0 | 3207 | lea edx, [edx + 32] |
michael@0 | 3208 | jg convertloop |
michael@0 | 3209 | vzeroupper |
michael@0 | 3210 | ret |
michael@0 | 3211 | } |
michael@0 | 3212 | } |
michael@0 | 3213 | #endif // HAS_MIRRORROW_AVX2 |
michael@0 | 3214 | |
michael@0 | 3215 | #ifdef HAS_MIRRORROW_SSE2 |
michael@0 | 3216 | // The SSE2 version uses movdqu, so it can be used on unaligned buffers when |
michael@0 | 3217 | // the SSSE3 version cannot. |
michael@0 | 3218 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3219 | void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { |
michael@0 | 3220 | __asm { |
michael@0 | 3221 | mov eax, [esp + 4] // src |
michael@0 | 3222 | mov edx, [esp + 8] // dst |
michael@0 | 3223 | mov ecx, [esp + 12] // width |
michael@0 | 3224 | lea eax, [eax - 16] |
michael@0 | 3225 | |
michael@0 | 3226 | align 4 |
michael@0 | 3227 | convertloop: |
michael@0 | 3228 | movdqu xmm0, [eax + ecx] |
michael@0 | 3229 | movdqa xmm1, xmm0 // swap bytes |
michael@0 | 3230 | psllw xmm0, 8 |
michael@0 | 3231 | psrlw xmm1, 8 |
michael@0 | 3232 | por xmm0, xmm1 |
michael@0 | 3233 | pshuflw xmm0, xmm0, 0x1b // swap words |
michael@0 | 3234 | pshufhw xmm0, xmm0, 0x1b |
michael@0 | 3235 | pshufd xmm0, xmm0, 0x4e // swap qwords |
michael@0 | 3236 | sub ecx, 16 |
michael@0 | 3237 | movdqu [edx], xmm0 |
michael@0 | 3238 | lea edx, [edx + 16] |
michael@0 | 3239 | jg convertloop |
michael@0 | 3240 | ret |
michael@0 | 3241 | } |
michael@0 | 3242 | } |
michael@0 | 3243 | #endif // HAS_MIRRORROW_SSE2 |
michael@0 | 3244 | |
michael@0 | 3245 | #ifdef HAS_MIRRORROW_UV_SSSE3 |
michael@0 | 3246 | // Shuffle table for reversing the bytes of UV channels. |
michael@0 | 3247 | static const uvec8 kShuffleMirrorUV = { |
michael@0 | 3248 | 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u |
michael@0 | 3249 | }; |
michael@0 | 3250 | |
michael@0 | 3251 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3252 | void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, |
michael@0 | 3253 | int width) { |
michael@0 | 3254 | __asm { |
michael@0 | 3255 | push edi |
michael@0 | 3256 | mov eax, [esp + 4 + 4] // src |
michael@0 | 3257 | mov edx, [esp + 4 + 8] // dst_u |
michael@0 | 3258 | mov edi, [esp + 4 + 12] // dst_v |
michael@0 | 3259 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 3260 | movdqa xmm1, kShuffleMirrorUV |
michael@0 | 3261 | lea eax, [eax + ecx * 2 - 16] |
michael@0 | 3262 | sub edi, edx |
michael@0 | 3263 | |
michael@0 | 3264 | align 4 |
michael@0 | 3265 | convertloop: |
michael@0 | 3266 | movdqa xmm0, [eax] |
michael@0 | 3267 | lea eax, [eax - 16] |
michael@0 | 3268 | pshufb xmm0, xmm1 |
michael@0 | 3269 | sub ecx, 8 |
michael@0 | 3270 | movlpd qword ptr [edx], xmm0 |
michael@0 | 3271 | movhpd qword ptr [edx + edi], xmm0 |
michael@0 | 3272 | lea edx, [edx + 8] |
michael@0 | 3273 | jg convertloop |
michael@0 | 3274 | |
michael@0 | 3275 | pop edi |
michael@0 | 3276 | ret |
michael@0 | 3277 | } |
michael@0 | 3278 | } |
michael@0 | 3279 | #endif // HAS_MIRRORROW_UV_SSSE3 |
michael@0 | 3280 | |
michael@0 | 3281 | #ifdef HAS_ARGBMIRRORROW_SSSE3 |
michael@0 | 3282 | // Shuffle table for reversing the bytes. |
michael@0 | 3283 | static const uvec8 kARGBShuffleMirror = { |
michael@0 | 3284 | 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u |
michael@0 | 3285 | }; |
michael@0 | 3286 | |
michael@0 | 3287 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3288 | void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { |
michael@0 | 3289 | __asm { |
michael@0 | 3290 | mov eax, [esp + 4] // src |
michael@0 | 3291 | mov edx, [esp + 8] // dst |
michael@0 | 3292 | mov ecx, [esp + 12] // width |
michael@0 | 3293 | lea eax, [eax - 16 + ecx * 4] // last 4 pixels. |
michael@0 | 3294 | movdqa xmm5, kARGBShuffleMirror |
michael@0 | 3295 | |
michael@0 | 3296 | align 4 |
michael@0 | 3297 | convertloop: |
michael@0 | 3298 | movdqa xmm0, [eax] |
michael@0 | 3299 | lea eax, [eax - 16] |
michael@0 | 3300 | pshufb xmm0, xmm5 |
michael@0 | 3301 | sub ecx, 4 |
michael@0 | 3302 | movdqa [edx], xmm0 |
michael@0 | 3303 | lea edx, [edx + 16] |
michael@0 | 3304 | jg convertloop |
michael@0 | 3305 | ret |
michael@0 | 3306 | } |
michael@0 | 3307 | } |
michael@0 | 3308 | #endif // HAS_ARGBMIRRORROW_SSSE3 |
michael@0 | 3309 | |
michael@0 | 3310 | #ifdef HAS_ARGBMIRRORROW_AVX2 |
michael@0 | 3311 | // Shuffle table for reversing the bytes. |
michael@0 | 3312 | static const ulvec32 kARGBShuffleMirror_AVX2 = { |
michael@0 | 3313 | 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u |
michael@0 | 3314 | }; |
michael@0 | 3315 | |
michael@0 | 3316 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3317 | void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { |
michael@0 | 3318 | __asm { |
michael@0 | 3319 | mov eax, [esp + 4] // src |
michael@0 | 3320 | mov edx, [esp + 8] // dst |
michael@0 | 3321 | mov ecx, [esp + 12] // width |
michael@0 | 3322 | lea eax, [eax - 32] |
michael@0 | 3323 | vmovdqa ymm5, kARGBShuffleMirror_AVX2 |
michael@0 | 3324 | |
michael@0 | 3325 | align 4 |
michael@0 | 3326 | convertloop: |
michael@0 | 3327 | vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order |
michael@0 | 3328 | sub ecx, 8 |
michael@0 | 3329 | vmovdqu [edx], ymm0 |
michael@0 | 3330 | lea edx, [edx + 32] |
michael@0 | 3331 | jg convertloop |
michael@0 | 3332 | vzeroupper |
michael@0 | 3333 | ret |
michael@0 | 3334 | } |
michael@0 | 3335 | } |
michael@0 | 3336 | #endif // HAS_ARGBMIRRORROW_AVX2 |
michael@0 | 3337 | |
michael@0 | 3338 | #ifdef HAS_SPLITUVROW_SSE2 |
michael@0 | 3339 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3340 | void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 3341 | __asm { |
michael@0 | 3342 | push edi |
michael@0 | 3343 | mov eax, [esp + 4 + 4] // src_uv |
michael@0 | 3344 | mov edx, [esp + 4 + 8] // dst_u |
michael@0 | 3345 | mov edi, [esp + 4 + 12] // dst_v |
michael@0 | 3346 | mov ecx, [esp + 4 + 16] // pix |
michael@0 | 3347 | pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
michael@0 | 3348 | psrlw xmm5, 8 |
michael@0 | 3349 | sub edi, edx |
michael@0 | 3350 | |
michael@0 | 3351 | align 4 |
michael@0 | 3352 | convertloop: |
michael@0 | 3353 | movdqa xmm0, [eax] |
michael@0 | 3354 | movdqa xmm1, [eax + 16] |
michael@0 | 3355 | lea eax, [eax + 32] |
michael@0 | 3356 | movdqa xmm2, xmm0 |
michael@0 | 3357 | movdqa xmm3, xmm1 |
michael@0 | 3358 | pand xmm0, xmm5 // even bytes |
michael@0 | 3359 | pand xmm1, xmm5 |
michael@0 | 3360 | packuswb xmm0, xmm1 |
michael@0 | 3361 | psrlw xmm2, 8 // odd bytes |
michael@0 | 3362 | psrlw xmm3, 8 |
michael@0 | 3363 | packuswb xmm2, xmm3 |
michael@0 | 3364 | movdqa [edx], xmm0 |
michael@0 | 3365 | movdqa [edx + edi], xmm2 |
michael@0 | 3366 | lea edx, [edx + 16] |
michael@0 | 3367 | sub ecx, 16 |
michael@0 | 3368 | jg convertloop |
michael@0 | 3369 | |
michael@0 | 3370 | pop edi |
michael@0 | 3371 | ret |
michael@0 | 3372 | } |
michael@0 | 3373 | } |
michael@0 | 3374 | |
michael@0 | 3375 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3376 | void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |
michael@0 | 3377 | int pix) { |
michael@0 | 3378 | __asm { |
michael@0 | 3379 | push edi |
michael@0 | 3380 | mov eax, [esp + 4 + 4] // src_uv |
michael@0 | 3381 | mov edx, [esp + 4 + 8] // dst_u |
michael@0 | 3382 | mov edi, [esp + 4 + 12] // dst_v |
michael@0 | 3383 | mov ecx, [esp + 4 + 16] // pix |
michael@0 | 3384 | pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
michael@0 | 3385 | psrlw xmm5, 8 |
michael@0 | 3386 | sub edi, edx |
michael@0 | 3387 | |
michael@0 | 3388 | align 4 |
michael@0 | 3389 | convertloop: |
michael@0 | 3390 | movdqu xmm0, [eax] |
michael@0 | 3391 | movdqu xmm1, [eax + 16] |
michael@0 | 3392 | lea eax, [eax + 32] |
michael@0 | 3393 | movdqa xmm2, xmm0 |
michael@0 | 3394 | movdqa xmm3, xmm1 |
michael@0 | 3395 | pand xmm0, xmm5 // even bytes |
michael@0 | 3396 | pand xmm1, xmm5 |
michael@0 | 3397 | packuswb xmm0, xmm1 |
michael@0 | 3398 | psrlw xmm2, 8 // odd bytes |
michael@0 | 3399 | psrlw xmm3, 8 |
michael@0 | 3400 | packuswb xmm2, xmm3 |
michael@0 | 3401 | movdqu [edx], xmm0 |
michael@0 | 3402 | movdqu [edx + edi], xmm2 |
michael@0 | 3403 | lea edx, [edx + 16] |
michael@0 | 3404 | sub ecx, 16 |
michael@0 | 3405 | jg convertloop |
michael@0 | 3406 | |
michael@0 | 3407 | pop edi |
michael@0 | 3408 | ret |
michael@0 | 3409 | } |
michael@0 | 3410 | } |
michael@0 | 3411 | #endif // HAS_SPLITUVROW_SSE2 |
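
// Scalar sketch of the even/odd split performed above: the 0x00ff00ff mask
// keeps the even (U) bytes and psrlw 8 keeps the odd (V) bytes before both
// are packed back to bytes.  SplitUVRow_Sketch is illustrative only and not
// part of libyuv's API:
static inline void SplitUVRow_Sketch(const uint8* src_uv, uint8* dst_u,
                                     uint8* dst_v, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_u[i] = src_uv[2 * i + 0];  // even bytes are U
    dst_v[i] = src_uv[2 * i + 1];  // odd bytes are V
  }
}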
michael@0 | 3412 | |
michael@0 | 3413 | #ifdef HAS_SPLITUVROW_AVX2 |
michael@0 | 3414 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3415 | void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 3416 | __asm { |
michael@0 | 3417 | push edi |
michael@0 | 3418 | mov eax, [esp + 4 + 4] // src_uv |
michael@0 | 3419 | mov edx, [esp + 4 + 8] // dst_u |
michael@0 | 3420 | mov edi, [esp + 4 + 12] // dst_v |
michael@0 | 3421 | mov ecx, [esp + 4 + 16] // pix |
michael@0 | 3422 | vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
michael@0 | 3423 | vpsrlw ymm5, ymm5, 8 |
michael@0 | 3424 | sub edi, edx |
michael@0 | 3425 | |
michael@0 | 3426 | align 4 |
michael@0 | 3427 | convertloop: |
michael@0 | 3428 | vmovdqu ymm0, [eax] |
michael@0 | 3429 | vmovdqu ymm1, [eax + 32] |
michael@0 | 3430 | lea eax, [eax + 64] |
michael@0 | 3431 | vpsrlw ymm2, ymm0, 8 // odd bytes |
michael@0 | 3432 | vpsrlw ymm3, ymm1, 8 |
michael@0 | 3433 | vpand ymm0, ymm0, ymm5 // even bytes |
michael@0 | 3434 | vpand ymm1, ymm1, ymm5 |
michael@0 | 3435 | vpackuswb ymm0, ymm0, ymm1 |
michael@0 | 3436 | vpackuswb ymm2, ymm2, ymm3 |
michael@0 | 3437 | vpermq ymm0, ymm0, 0xd8 |
michael@0 | 3438 | vpermq ymm2, ymm2, 0xd8 |
michael@0 | 3439 | vmovdqu [edx], ymm0 |
michael@0 | 3440 | vmovdqu [edx + edi], ymm2 |
michael@0 | 3441 | lea edx, [edx + 32] |
michael@0 | 3442 | sub ecx, 32 |
michael@0 | 3443 | jg convertloop |
michael@0 | 3444 | |
michael@0 | 3445 | pop edi |
michael@0 | 3446 | vzeroupper |
michael@0 | 3447 | ret |
michael@0 | 3448 | } |
michael@0 | 3449 | } |
michael@0 | 3450 | #endif // HAS_SPLITUVROW_AVX2 |
michael@0 | 3451 | |
michael@0 | 3452 | #ifdef HAS_MERGEUVROW_SSE2 |
michael@0 | 3453 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3454 | void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
michael@0 | 3455 | int width) { |
michael@0 | 3456 | __asm { |
michael@0 | 3457 | push edi |
michael@0 | 3458 | mov eax, [esp + 4 + 4] // src_u |
michael@0 | 3459 | mov edx, [esp + 4 + 8] // src_v |
michael@0 | 3460 | mov edi, [esp + 4 + 12] // dst_uv |
michael@0 | 3461 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 3462 | sub edx, eax |
michael@0 | 3463 | |
michael@0 | 3464 | align 4 |
michael@0 | 3465 | convertloop: |
michael@0 | 3466 | movdqa xmm0, [eax] // read 16 U's |
michael@0 | 3467 | movdqa xmm1, [eax + edx] // and 16 V's |
michael@0 | 3468 | lea eax, [eax + 16] |
michael@0 | 3469 | movdqa xmm2, xmm0 |
michael@0 | 3470 | punpcklbw xmm0, xmm1 // first 8 UV pairs |
michael@0 | 3471 | punpckhbw xmm2, xmm1 // next 8 UV pairs |
michael@0 | 3472 | movdqa [edi], xmm0 |
michael@0 | 3473 | movdqa [edi + 16], xmm2 |
michael@0 | 3474 | lea edi, [edi + 32] |
michael@0 | 3475 | sub ecx, 16 |
michael@0 | 3476 | jg convertloop |
michael@0 | 3477 | |
michael@0 | 3478 | pop edi |
michael@0 | 3479 | ret |
michael@0 | 3480 | } |
michael@0 | 3481 | } |
michael@0 | 3482 | |
michael@0 | 3483 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3484 | void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, |
michael@0 | 3485 | uint8* dst_uv, int width) { |
michael@0 | 3486 | __asm { |
michael@0 | 3487 | push edi |
michael@0 | 3488 | mov eax, [esp + 4 + 4] // src_u |
michael@0 | 3489 | mov edx, [esp + 4 + 8] // src_v |
michael@0 | 3490 | mov edi, [esp + 4 + 12] // dst_uv |
michael@0 | 3491 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 3492 | sub edx, eax |
michael@0 | 3493 | |
michael@0 | 3494 | align 4 |
michael@0 | 3495 | convertloop: |
michael@0 | 3496 | movdqu xmm0, [eax] // read 16 U's |
michael@0 | 3497 | movdqu xmm1, [eax + edx] // and 16 V's |
michael@0 | 3498 | lea eax, [eax + 16] |
michael@0 | 3499 | movdqa xmm2, xmm0 |
michael@0 | 3500 | punpcklbw xmm0, xmm1 // first 8 UV pairs |
michael@0 | 3501 | punpckhbw xmm2, xmm1 // next 8 UV pairs |
michael@0 | 3502 | movdqu [edi], xmm0 |
michael@0 | 3503 | movdqu [edi + 16], xmm2 |
michael@0 | 3504 | lea edi, [edi + 32] |
michael@0 | 3505 | sub ecx, 16 |
michael@0 | 3506 | jg convertloop |
michael@0 | 3507 | |
michael@0 | 3508 | pop edi |
michael@0 | 3509 | ret |
michael@0 | 3510 | } |
michael@0 | 3511 | } |
michael@0 | 3512 | #endif // HAS_MERGEUVROW_SSE2 |
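
// Scalar sketch of the interleave performed above by punpcklbw/punpckhbw
// (MergeUVRow_Sketch is illustrative only, not part of libyuv's API):
static inline void MergeUVRow_Sketch(const uint8* src_u, const uint8* src_v,
                                     uint8* dst_uv, int width) {
  for (int i = 0; i < width; ++i) {
    dst_uv[2 * i + 0] = src_u[i];  // U goes to the even bytes
    dst_uv[2 * i + 1] = src_v[i];  // V goes to the odd bytes
  }
}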
michael@0 | 3513 | |
michael@0 | 3514 | #ifdef HAS_MERGEUVROW_AVX2 |
michael@0 | 3515 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3516 | void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
michael@0 | 3517 | int width) { |
michael@0 | 3518 | __asm { |
michael@0 | 3519 | push edi |
michael@0 | 3520 | mov eax, [esp + 4 + 4] // src_u |
michael@0 | 3521 | mov edx, [esp + 4 + 8] // src_v |
michael@0 | 3522 | mov edi, [esp + 4 + 12] // dst_uv |
michael@0 | 3523 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 3524 | sub edx, eax |
michael@0 | 3525 | |
michael@0 | 3526 | align 4 |
michael@0 | 3527 | convertloop: |
michael@0 | 3528 | vmovdqu ymm0, [eax] // read 32 U's |
michael@0 | 3529 | vmovdqu ymm1, [eax + edx] // and 32 V's |
michael@0 | 3530 | lea eax, [eax + 32] |
michael@0 | 3531 | vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 |
michael@0 | 3532 | vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 |
michael@0 | 3533 | vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0 |
michael@0 | 3534 | vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0 |
michael@0 | 3535 | vmovdqu [edi], ymm1 |
michael@0 | 3536 | vmovdqu [edi + 32], ymm2 |
michael@0 | 3537 | lea edi, [edi + 64] |
michael@0 | 3538 | sub ecx, 32 |
michael@0 | 3539 | jg convertloop |
michael@0 | 3540 | |
michael@0 | 3541 | pop edi |
michael@0 | 3542 | vzeroupper |
michael@0 | 3543 | ret |
michael@0 | 3544 | } |
michael@0 | 3545 | } |
michael@0 | 3546 | #endif // HAS_MERGEUVROW_AVX2 |
michael@0 | 3547 | |
michael@0 | 3548 | #ifdef HAS_COPYROW_SSE2 |
michael@0 | 3549 | // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time. |
michael@0 | 3550 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3551 | void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { |
michael@0 | 3552 | __asm { |
michael@0 | 3553 | mov eax, [esp + 4] // src |
michael@0 | 3554 | mov edx, [esp + 8] // dst |
michael@0 | 3555 | mov ecx, [esp + 12] // count |
michael@0 | 3556 | |
michael@0 | 3557 | align 4 |
michael@0 | 3558 | convertloop: |
michael@0 | 3559 | movdqa xmm0, [eax] |
michael@0 | 3560 | movdqa xmm1, [eax + 16] |
michael@0 | 3561 | lea eax, [eax + 32] |
michael@0 | 3562 | movdqa [edx], xmm0 |
michael@0 | 3563 | movdqa [edx + 16], xmm1 |
michael@0 | 3564 | lea edx, [edx + 32] |
michael@0 | 3565 | sub ecx, 32 |
michael@0 | 3566 | jg convertloop |
michael@0 | 3567 | ret |
michael@0 | 3568 | } |
michael@0 | 3569 | } |
michael@0 | 3570 | #endif // HAS_COPYROW_SSE2 |
michael@0 | 3571 | |
michael@0 | 3572 | // Unaligned; handles any multiple of 1 byte via rep movsb. |
michael@0 | 3573 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3574 | void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { |
michael@0 | 3575 | __asm { |
michael@0 | 3576 | mov eax, esi |
michael@0 | 3577 | mov edx, edi |
michael@0 | 3578 | mov esi, [esp + 4] // src |
michael@0 | 3579 | mov edi, [esp + 8] // dst |
michael@0 | 3580 | mov ecx, [esp + 12] // count |
michael@0 | 3581 | rep movsb |
michael@0 | 3582 | mov edi, edx |
michael@0 | 3583 | mov esi, eax |
michael@0 | 3584 | ret |
michael@0 | 3585 | } |
michael@0 | 3586 | } |
michael@0 | 3587 | |
michael@0 | 3588 | #ifdef HAS_COPYROW_X86 |
michael@0 | 3589 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3590 | void CopyRow_X86(const uint8* src, uint8* dst, int count) { |
michael@0 | 3591 | __asm { |
michael@0 | 3592 | mov eax, esi |
michael@0 | 3593 | mov edx, edi |
michael@0 | 3594 | mov esi, [esp + 4] // src |
michael@0 | 3595 | mov edi, [esp + 8] // dst |
michael@0 | 3596 | mov ecx, [esp + 12] // count |
michael@0 | 3597 | shr ecx, 2 |
michael@0 | 3598 | rep movsd |
michael@0 | 3599 | mov edi, edx |
michael@0 | 3600 | mov esi, eax |
michael@0 | 3601 | ret |
michael@0 | 3602 | } |
michael@0 | 3603 | } |
michael@0 | 3604 | #endif // HAS_COPYROW_X86 |
michael@0 | 3605 | |
michael@0 | 3606 | #ifdef HAS_ARGBCOPYALPHAROW_SSE2 |
michael@0 | 3607 | // width in pixels |
michael@0 | 3608 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3609 | void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { |
michael@0 | 3610 | __asm { |
michael@0 | 3611 | mov eax, [esp + 4] // src |
michael@0 | 3612 | mov edx, [esp + 8] // dst |
michael@0 | 3613 | mov ecx, [esp + 12] // count |
michael@0 | 3614 | pcmpeqb xmm0, xmm0 // generate mask 0xff000000 |
michael@0 | 3615 | pslld xmm0, 24 |
michael@0 | 3616 | pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff |
michael@0 | 3617 | psrld xmm1, 8 |
michael@0 | 3618 | |
michael@0 | 3619 | align 4 |
michael@0 | 3620 | convertloop: |
michael@0 | 3621 | movdqa xmm2, [eax] |
michael@0 | 3622 | movdqa xmm3, [eax + 16] |
michael@0 | 3623 | lea eax, [eax + 32] |
michael@0 | 3624 | movdqa xmm4, [edx] |
michael@0 | 3625 | movdqa xmm5, [edx + 16] |
michael@0 | 3626 | pand xmm2, xmm0 |
michael@0 | 3627 | pand xmm3, xmm0 |
michael@0 | 3628 | pand xmm4, xmm1 |
michael@0 | 3629 | pand xmm5, xmm1 |
michael@0 | 3630 | por xmm2, xmm4 |
michael@0 | 3631 | por xmm3, xmm5 |
michael@0 | 3632 | movdqa [edx], xmm2 |
michael@0 | 3633 | movdqa [edx + 16], xmm3 |
michael@0 | 3634 | lea edx, [edx + 32] |
michael@0 | 3635 | sub ecx, 8 |
michael@0 | 3636 | jg convertloop |
michael@0 | 3637 | |
michael@0 | 3638 | ret |
michael@0 | 3639 | } |
michael@0 | 3640 | } |
michael@0 | 3641 | #endif // HAS_ARGBCOPYALPHAROW_SSE2 |
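
// Scalar sketch of the alpha copy above: the 0xff000000 mask selects the
// source alpha byte, the 0x00ffffff mask keeps the destination color bytes,
// and the two are OR'd together.  ARGBCopyAlphaRow_Sketch is illustrative
// only, not part of libyuv's API:
static inline void ARGBCopyAlphaRow_Sketch(const uint8* src, uint8* dst,
                                           int width) {
  for (int i = 0; i < width; ++i) {
    dst[4 * i + 3] = src[4 * i + 3];  // copy alpha; B, G, R stay untouched
  }
}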
michael@0 | 3642 | |
michael@0 | 3643 | #ifdef HAS_ARGBCOPYALPHAROW_AVX2 |
michael@0 | 3644 | // width in pixels |
michael@0 | 3645 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3646 | void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { |
michael@0 | 3647 | __asm { |
michael@0 | 3648 | mov eax, [esp + 4] // src |
michael@0 | 3649 | mov edx, [esp + 8] // dst |
michael@0 | 3650 | mov ecx, [esp + 12] // count |
michael@0 | 3651 | vpcmpeqb ymm0, ymm0, ymm0 |
michael@0 | 3652 | vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff |
michael@0 | 3653 | |
michael@0 | 3654 | align 4 |
michael@0 | 3655 | convertloop: |
michael@0 | 3656 | vmovdqu ymm1, [eax] |
michael@0 | 3657 | vmovdqu ymm2, [eax + 32] |
michael@0 | 3658 | lea eax, [eax + 64] |
michael@0 | 3659 | vpblendvb ymm1, ymm1, [edx], ymm0 |
michael@0 | 3660 | vpblendvb ymm2, ymm2, [edx + 32], ymm0 |
michael@0 | 3661 | vmovdqu [edx], ymm1 |
michael@0 | 3662 | vmovdqu [edx + 32], ymm2 |
michael@0 | 3663 | lea edx, [edx + 64] |
michael@0 | 3664 | sub ecx, 16 |
michael@0 | 3665 | jg convertloop |
michael@0 | 3666 | |
michael@0 | 3667 | vzeroupper |
michael@0 | 3668 | ret |
michael@0 | 3669 | } |
michael@0 | 3670 | } |
michael@0 | 3671 | #endif // HAS_ARGBCOPYALPHAROW_AVX2 |
michael@0 | 3672 | |
michael@0 | 3673 | #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 |
michael@0 | 3674 | // width in pixels |
michael@0 | 3675 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3676 | void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { |
michael@0 | 3677 | __asm { |
michael@0 | 3678 | mov eax, [esp + 4] // src |
michael@0 | 3679 | mov edx, [esp + 8] // dst |
michael@0 | 3680 | mov ecx, [esp + 12] // count |
michael@0 | 3681 | pcmpeqb xmm0, xmm0 // generate mask 0xff000000 |
michael@0 | 3682 | pslld xmm0, 24 |
michael@0 | 3683 | pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff |
michael@0 | 3684 | psrld xmm1, 8 |
michael@0 | 3685 | |
michael@0 | 3686 | align 4 |
michael@0 | 3687 | convertloop: |
michael@0 | 3688 | movq xmm2, qword ptr [eax] // 8 Y's |
michael@0 | 3689 | lea eax, [eax + 8] |
michael@0 | 3690 | punpcklbw xmm2, xmm2 |
michael@0 | 3691 | punpckhwd xmm3, xmm2 |
michael@0 | 3692 | punpcklwd xmm2, xmm2 |
michael@0 | 3693 | movdqa xmm4, [edx] |
michael@0 | 3694 | movdqa xmm5, [edx + 16] |
michael@0 | 3695 | pand xmm2, xmm0 |
michael@0 | 3696 | pand xmm3, xmm0 |
michael@0 | 3697 | pand xmm4, xmm1 |
michael@0 | 3698 | pand xmm5, xmm1 |
michael@0 | 3699 | por xmm2, xmm4 |
michael@0 | 3700 | por xmm3, xmm5 |
michael@0 | 3701 | movdqa [edx], xmm2 |
michael@0 | 3702 | movdqa [edx + 16], xmm3 |
michael@0 | 3703 | lea edx, [edx + 32] |
michael@0 | 3704 | sub ecx, 8 |
michael@0 | 3705 | jg convertloop |
michael@0 | 3706 | |
michael@0 | 3707 | ret |
michael@0 | 3708 | } |
michael@0 | 3709 | } |
michael@0 | 3710 | #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 |
michael@0 | 3711 | |
michael@0 | 3712 | #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 |
michael@0 | 3713 | // width in pixels |
michael@0 | 3714 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3715 | void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { |
michael@0 | 3716 | __asm { |
michael@0 | 3717 | mov eax, [esp + 4] // src |
michael@0 | 3718 | mov edx, [esp + 8] // dst |
michael@0 | 3719 | mov ecx, [esp + 12] // count |
michael@0 | 3720 | vpcmpeqb ymm0, ymm0, ymm0 |
michael@0 | 3721 | vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff |
michael@0 | 3722 | |
michael@0 | 3723 | align 4 |
michael@0 | 3724 | convertloop: |
michael@0 | 3725 | vpmovzxbd ymm1, qword ptr [eax] |
michael@0 | 3726 | vpmovzxbd ymm2, qword ptr [eax + 8] |
michael@0 | 3727 | lea eax, [eax + 16] |
michael@0 | 3728 | vpslld ymm1, ymm1, 24 |
michael@0 | 3729 | vpslld ymm2, ymm2, 24 |
michael@0 | 3730 | vpblendvb ymm1, ymm1, [edx], ymm0 |
michael@0 | 3731 | vpblendvb ymm2, ymm2, [edx + 32], ymm0 |
michael@0 | 3732 | vmovdqu [edx], ymm1 |
michael@0 | 3733 | vmovdqu [edx + 32], ymm2 |
michael@0 | 3734 | lea edx, [edx + 64] |
michael@0 | 3735 | sub ecx, 16 |
michael@0 | 3736 | jg convertloop |
michael@0 | 3737 | |
michael@0 | 3738 | vzeroupper |
michael@0 | 3739 | ret |
michael@0 | 3740 | } |
michael@0 | 3741 | } |
michael@0 | 3742 | #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 |
michael@0 | 3743 | |
michael@0 | 3744 | #ifdef HAS_SETROW_X86 |
michael@0 | 3745 | // SetRow_X86 writes 'count' bytes using a 32 bit value repeated. |
michael@0 | 3746 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3747 | void SetRow_X86(uint8* dst, uint32 v32, int count) { |
michael@0 | 3748 | __asm { |
michael@0 | 3749 | mov edx, edi |
michael@0 | 3750 | mov edi, [esp + 4] // dst |
michael@0 | 3751 | mov eax, [esp + 8] // v32 |
michael@0 | 3752 | mov ecx, [esp + 12] // count |
michael@0 | 3753 | shr ecx, 2 |
michael@0 | 3754 | rep stosd |
michael@0 | 3755 | mov edi, edx |
michael@0 | 3756 | ret |
michael@0 | 3757 | } |
michael@0 | 3758 | } |
michael@0 | 3759 | |
michael@0 | 3760 | // ARGBSetRows_X86 writes a 32 bit value to 'width' pixels on each of 'height' rows. |
michael@0 | 3761 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3762 | void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, |
michael@0 | 3763 | int dst_stride, int height) { |
michael@0 | 3764 | __asm { |
michael@0 | 3765 | push esi |
michael@0 | 3766 | push edi |
michael@0 | 3767 | push ebp |
michael@0 | 3768 | mov edi, [esp + 12 + 4] // dst |
michael@0 | 3769 | mov eax, [esp + 12 + 8] // v32 |
michael@0 | 3770 | mov ebp, [esp + 12 + 12] // width |
michael@0 | 3771 | mov edx, [esp + 12 + 16] // dst_stride |
michael@0 | 3772 | mov esi, [esp + 12 + 20] // height |
michael@0 | 3773 | lea ecx, [ebp * 4] |
michael@0 | 3774 | sub edx, ecx // stride - width * 4 |
michael@0 | 3775 | |
michael@0 | 3776 | align 4 |
michael@0 | 3777 | convertloop: |
michael@0 | 3778 | mov ecx, ebp |
michael@0 | 3779 | rep stosd |
michael@0 | 3780 | add edi, edx |
michael@0 | 3781 | sub esi, 1 |
michael@0 | 3782 | jg convertloop |
michael@0 | 3783 | |
michael@0 | 3784 | pop ebp |
michael@0 | 3785 | pop edi |
michael@0 | 3786 | pop esi |
michael@0 | 3787 | ret |
michael@0 | 3788 | } |
michael@0 | 3789 | } |
michael@0 | 3790 | #endif // HAS_SETROW_X86 |
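
// Scalar sketch of ARGBSetRows_X86 above: each row stores 'width' copies of
// v32 with rep stosd, then the precomputed (dst_stride - width * 4) skip
// advances edi to the start of the next row.  ARGBSetRows_Sketch is
// illustrative only, not part of libyuv's API:
static inline void ARGBSetRows_Sketch(uint8* dst, uint32 v32, int width,
                                      int dst_stride, int height) {
  for (int y = 0; y < height; ++y) {
    uint32* d = (uint32*)(dst + y * dst_stride);  // start of row y
    for (int x = 0; x < width; ++x) {
      d[x] = v32;  // fill one ARGB pixel
    }
  }
}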
michael@0 | 3791 | |
michael@0 | 3792 | #ifdef HAS_YUY2TOYROW_AVX2 |
michael@0 | 3793 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3794 | void YUY2ToYRow_AVX2(const uint8* src_yuy2, |
michael@0 | 3795 | uint8* dst_y, int pix) { |
michael@0 | 3796 | __asm { |
michael@0 | 3797 | mov eax, [esp + 4] // src_yuy2 |
michael@0 | 3798 | mov edx, [esp + 8] // dst_y |
michael@0 | 3799 | mov ecx, [esp + 12] // pix |
michael@0 | 3800 | vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
michael@0 | 3801 | vpsrlw ymm5, ymm5, 8 |
michael@0 | 3802 | |
michael@0 | 3803 | align 4 |
michael@0 | 3804 | convertloop: |
michael@0 | 3805 | vmovdqu ymm0, [eax] |
michael@0 | 3806 | vmovdqu ymm1, [eax + 32] |
michael@0 | 3807 | lea eax, [eax + 64] |
michael@0 | 3808 | vpand ymm0, ymm0, ymm5 // even bytes are Y |
michael@0 | 3809 | vpand ymm1, ymm1, ymm5 |
michael@0 | 3810 | vpackuswb ymm0, ymm0, ymm1 // mutates. |
michael@0 | 3811 | vpermq ymm0, ymm0, 0xd8 |
michael@0 | 3812 | sub ecx, 32 |
michael@0 | 3813 | vmovdqu [edx], ymm0 |
michael@0 | 3814 | lea edx, [edx + 32] |
michael@0 | 3815 | jg convertloop |
michael@0 | 3816 | vzeroupper |
michael@0 | 3817 | ret |
michael@0 | 3818 | } |
michael@0 | 3819 | } |
michael@0 | 3820 | |
michael@0 | 3821 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3822 | void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, |
michael@0 | 3823 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 3824 | __asm { |
michael@0 | 3825 | push esi |
michael@0 | 3826 | push edi |
michael@0 | 3827 | mov eax, [esp + 8 + 4] // src_yuy2 |
michael@0 | 3828 | mov esi, [esp + 8 + 8] // stride_yuy2 |
michael@0 | 3829 | mov edx, [esp + 8 + 12] // dst_u |
michael@0 | 3830 | mov edi, [esp + 8 + 16] // dst_v |
michael@0 | 3831 | mov ecx, [esp + 8 + 20] // pix |
michael@0 | 3832 | vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
michael@0 | 3833 | vpsrlw ymm5, ymm5, 8 |
michael@0 | 3834 | sub edi, edx |
michael@0 | 3835 | |
michael@0 | 3836 | align 4 |
michael@0 | 3837 | convertloop: |
michael@0 | 3838 | vmovdqu ymm0, [eax] |
michael@0 | 3839 | vmovdqu ymm1, [eax + 32] |
michael@0 | 3840 | vpavgb ymm0, ymm0, [eax + esi] |
michael@0 | 3841 | vpavgb ymm1, ymm1, [eax + esi + 32] |
michael@0 | 3842 | lea eax, [eax + 64] |
michael@0 | 3843 | vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV |
michael@0 | 3844 | vpsrlw ymm1, ymm1, 8 |
michael@0 | 3845 | vpackuswb ymm0, ymm0, ymm1 // mutates. |
michael@0 | 3846 | vpermq ymm0, ymm0, 0xd8 |
michael@0 | 3847 | vpand ymm1, ymm0, ymm5 // U |
michael@0 | 3848 | vpsrlw ymm0, ymm0, 8 // V |
michael@0 | 3849 | vpackuswb ymm1, ymm1, ymm1 // mutates. |
michael@0 | 3850 | vpackuswb ymm0, ymm0, ymm0 // mutates. |
michael@0 | 3851 | vpermq ymm1, ymm1, 0xd8 |
michael@0 | 3852 | vpermq ymm0, ymm0, 0xd8 |
michael@0 | 3853 | vextractf128 [edx], ymm1, 0 // U |
michael@0 | 3854 | vextractf128 [edx + edi], ymm0, 0 // V |
michael@0 | 3855 | lea edx, [edx + 16] |
michael@0 | 3856 | sub ecx, 32 |
michael@0 | 3857 | jg convertloop |
michael@0 | 3858 | |
michael@0 | 3859 | pop edi |
michael@0 | 3860 | pop esi |
michael@0 | 3861 | vzeroupper |
michael@0 | 3862 | ret |
michael@0 | 3863 | } |
michael@0 | 3864 | } |
michael@0 | 3865 | |
michael@0 | 3866 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3867 | void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, |
michael@0 | 3868 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 3869 | __asm { |
michael@0 | 3870 | push edi |
michael@0 | 3871 | mov eax, [esp + 4 + 4] // src_yuy2 |
michael@0 | 3872 | mov edx, [esp + 4 + 8] // dst_u |
michael@0 | 3873 | mov edi, [esp + 4 + 12] // dst_v |
michael@0 | 3874 | mov ecx, [esp + 4 + 16] // pix |
michael@0 | 3875 | vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
michael@0 | 3876 | vpsrlw ymm5, ymm5, 8 |
michael@0 | 3877 | sub edi, edx |
michael@0 | 3878 | |
michael@0 | 3879 | align 4 |
michael@0 | 3880 | convertloop: |
michael@0 | 3881 | vmovdqu ymm0, [eax] |
michael@0 | 3882 | vmovdqu ymm1, [eax + 32] |
michael@0 | 3883 | lea eax, [eax + 64] |
michael@0 | 3884 | vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV |
michael@0 | 3885 | vpsrlw ymm1, ymm1, 8 |
michael@0 | 3886 | vpackuswb ymm0, ymm0, ymm1 // mutates. |
michael@0 | 3887 | vpermq ymm0, ymm0, 0xd8 |
michael@0 | 3888 | vpand ymm1, ymm0, ymm5 // U |
michael@0 | 3889 | vpsrlw ymm0, ymm0, 8 // V |
michael@0 | 3890 | vpackuswb ymm1, ymm1, ymm1 // mutates. |
michael@0 | 3891 | vpackuswb ymm0, ymm0, ymm0 // mutates. |
michael@0 | 3892 | vpermq ymm1, ymm1, 0xd8 |
michael@0 | 3893 | vpermq ymm0, ymm0, 0xd8 |
michael@0 | 3894 | vextractf128 [edx], ymm1, 0 // U |
michael@0 | 3895 | vextractf128 [edx + edi], ymm0, 0 // V |
michael@0 | 3896 | lea edx, [edx + 16] |
michael@0 | 3897 | sub ecx, 32 |
michael@0 | 3898 | jg convertloop |
michael@0 | 3899 | |
michael@0 | 3900 | pop edi |
michael@0 | 3901 | vzeroupper |
michael@0 | 3902 | ret |
michael@0 | 3903 | } |
michael@0 | 3904 | } |
michael@0 | 3905 | |
michael@0 | 3906 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3907 | void UYVYToYRow_AVX2(const uint8* src_uyvy, |
michael@0 | 3908 | uint8* dst_y, int pix) { |
michael@0 | 3909 | __asm { |
michael@0 | 3910 | mov eax, [esp + 4] // src_uyvy |
michael@0 | 3911 | mov edx, [esp + 8] // dst_y |
michael@0 | 3912 | mov ecx, [esp + 12] // pix |
michael@0 | 3913 | |
michael@0 | 3914 | align 4 |
michael@0 | 3915 | convertloop: |
michael@0 | 3916 | vmovdqu ymm0, [eax] |
michael@0 | 3917 | vmovdqu ymm1, [eax + 32] |
michael@0 | 3918 | lea eax, [eax + 64] |
michael@0 | 3919 | vpsrlw ymm0, ymm0, 8 // odd bytes are Y |
michael@0 | 3920 | vpsrlw ymm1, ymm1, 8 |
michael@0 | 3921 | vpackuswb ymm0, ymm0, ymm1 // mutates. |
michael@0 | 3922 | vpermq ymm0, ymm0, 0xd8 |
michael@0 | 3923 | sub ecx, 32 |
michael@0 | 3924 | vmovdqu [edx], ymm0 |
michael@0 | 3925 | lea edx, [edx + 32] |
michael@0 | 3926 | jg convertloop |
michael@0 | 3927 | vzeroupper |
michael@0 | 3928 | ret |
michael@0 | 3929 | } |
michael@0 | 3930 | } |
michael@0 | 3931 | |
michael@0 | 3932 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3933 | void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, |
michael@0 | 3934 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 3935 | __asm { |
michael@0 | 3936 | push esi |
michael@0 | 3937 | push edi |
michael@0 | 3938 | mov eax, [esp + 8 + 4] // src_uyvy |
michael@0 | 3939 | mov esi, [esp + 8 + 8] // stride_uyvy |
michael@0 | 3940 | mov edx, [esp + 8 + 12] // dst_u |
michael@0 | 3941 | mov edi, [esp + 8 + 16] // dst_v |
michael@0 | 3942 | mov ecx, [esp + 8 + 20] // pix |
michael@0 | 3943 | vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
michael@0 | 3944 | vpsrlw ymm5, ymm5, 8 |
michael@0 | 3945 | sub edi, edx |
michael@0 | 3946 | |
michael@0 | 3947 | align 4 |
michael@0 | 3948 | convertloop: |
michael@0 | 3949 | vmovdqu ymm0, [eax] |
michael@0 | 3950 | vmovdqu ymm1, [eax + 32] |
michael@0 | 3951 | vpavgb ymm0, ymm0, [eax + esi] |
michael@0 | 3952 | vpavgb ymm1, ymm1, [eax + esi + 32] |
michael@0 | 3953 | lea eax, [eax + 64] |
michael@0 | 3954 | vpand ymm0, ymm0, ymm5 // UYVY -> UVUV |
michael@0 | 3955 | vpand ymm1, ymm1, ymm5 |
michael@0 | 3956 | vpackuswb ymm0, ymm0, ymm1 // mutates. |
michael@0 | 3957 | vpermq ymm0, ymm0, 0xd8 |
michael@0 | 3958 | vpand ymm1, ymm0, ymm5 // U |
michael@0 | 3959 | vpsrlw ymm0, ymm0, 8 // V |
michael@0 | 3960 | vpackuswb ymm1, ymm1, ymm1 // mutates. |
michael@0 | 3961 | vpackuswb ymm0, ymm0, ymm0 // mutates. |
michael@0 | 3962 | vpermq ymm1, ymm1, 0xd8 |
michael@0 | 3963 | vpermq ymm0, ymm0, 0xd8 |
michael@0 | 3964 | vextractf128 [edx], ymm1, 0 // U |
michael@0 | 3965 | vextractf128 [edx + edi], ymm0, 0 // V |
michael@0 | 3966 | lea edx, [edx + 16] |
michael@0 | 3967 | sub ecx, 32 |
michael@0 | 3968 | jg convertloop |
michael@0 | 3969 | |
michael@0 | 3970 | pop edi |
michael@0 | 3971 | pop esi |
michael@0 | 3972 | vzeroupper |
michael@0 | 3973 | ret |
michael@0 | 3974 | } |
michael@0 | 3975 | } |
michael@0 | 3976 | |
michael@0 | 3977 | __declspec(naked) __declspec(align(16)) |
michael@0 | 3978 | void UYVYToUV422Row_AVX2(const uint8* src_uyvy, |
michael@0 | 3979 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 3980 | __asm { |
michael@0 | 3981 | push edi |
michael@0 | 3982 | mov eax, [esp + 4 + 4] // src_uyvy |
michael@0 | 3983 | mov edx, [esp + 4 + 8] // dst_u |
michael@0 | 3984 | mov edi, [esp + 4 + 12] // dst_v |
michael@0 | 3985 | mov ecx, [esp + 4 + 16] // pix |
michael@0 | 3986 | vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
michael@0 | 3987 | vpsrlw ymm5, ymm5, 8 |
michael@0 | 3988 | sub edi, edx |
michael@0 | 3989 | |
michael@0 | 3990 | align 4 |
michael@0 | 3991 | convertloop: |
michael@0 | 3992 | vmovdqu ymm0, [eax] |
michael@0 | 3993 | vmovdqu ymm1, [eax + 32] |
michael@0 | 3994 | lea eax, [eax + 64] |
michael@0 | 3995 | vpand ymm0, ymm0, ymm5 // UYVY -> UVUV |
michael@0 | 3996 | vpand ymm1, ymm1, ymm5 |
michael@0 | 3997 | vpackuswb ymm0, ymm0, ymm1 // mutates. |
michael@0 | 3998 | vpermq ymm0, ymm0, 0xd8 |
michael@0 | 3999 | vpand ymm1, ymm0, ymm5 // U |
michael@0 | 4000 | vpsrlw ymm0, ymm0, 8 // V |
michael@0 | 4001 | vpackuswb ymm1, ymm1, ymm1 // mutates. |
michael@0 | 4002 | vpackuswb ymm0, ymm0, ymm0 // mutates. |
michael@0 | 4003 | vpermq ymm1, ymm1, 0xd8 |
michael@0 | 4004 | vpermq ymm0, ymm0, 0xd8 |
michael@0 | 4005 | vextractf128 [edx], ymm1, 0 // U |
michael@0 | 4006 | vextractf128 [edx + edi], ymm0, 0 // V |
michael@0 | 4007 | lea edx, [edx + 16] |
michael@0 | 4008 | sub ecx, 32 |
michael@0 | 4009 | jg convertloop |
michael@0 | 4010 | |
michael@0 | 4011 | pop edi |
michael@0 | 4012 | vzeroupper |
michael@0 | 4013 | ret |
michael@0 | 4014 | } |
michael@0 | 4015 | } |
michael@0 | 4016 | #endif // HAS_YUY2TOYROW_AVX2 |
michael@0 | 4017 | |
michael@0 | 4018 | #ifdef HAS_YUY2TOYROW_SSE2 |
michael@0 | 4019 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4020 | void YUY2ToYRow_SSE2(const uint8* src_yuy2, |
michael@0 | 4021 | uint8* dst_y, int pix) { |
michael@0 | 4022 | __asm { |
michael@0 | 4023 | mov eax, [esp + 4] // src_yuy2 |
michael@0 | 4024 | mov edx, [esp + 8] // dst_y |
michael@0 | 4025 | mov ecx, [esp + 12] // pix |
michael@0 | 4026 | pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
michael@0 | 4027 | psrlw xmm5, 8 |
michael@0 | 4028 | |
michael@0 | 4029 | align 4 |
michael@0 | 4030 | convertloop: |
michael@0 | 4031 | movdqa xmm0, [eax] |
michael@0 | 4032 | movdqa xmm1, [eax + 16] |
michael@0 | 4033 | lea eax, [eax + 32] |
michael@0 | 4034 | pand xmm0, xmm5 // even bytes are Y |
michael@0 | 4035 | pand xmm1, xmm5 |
michael@0 | 4036 | packuswb xmm0, xmm1 |
michael@0 | 4037 | sub ecx, 16 |
michael@0 | 4038 | movdqa [edx], xmm0 |
michael@0 | 4039 | lea edx, [edx + 16] |
michael@0 | 4040 | jg convertloop |
michael@0 | 4041 | ret |
michael@0 | 4042 | } |
michael@0 | 4043 | } |
michael@0 | 4044 | |
michael@0 | 4045 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4046 | void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, |
michael@0 | 4047 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 4048 | __asm { |
michael@0 | 4049 | push esi |
michael@0 | 4050 | push edi |
michael@0 | 4051 | mov eax, [esp + 8 + 4] // src_yuy2 |
michael@0 | 4052 | mov esi, [esp + 8 + 8] // stride_yuy2 |
michael@0 | 4053 | mov edx, [esp + 8 + 12] // dst_u |
michael@0 | 4054 | mov edi, [esp + 8 + 16] // dst_v |
michael@0 | 4055 | mov ecx, [esp + 8 + 20] // pix |
michael@0 | 4056 | pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
michael@0 | 4057 | psrlw xmm5, 8 |
michael@0 | 4058 | sub edi, edx |
michael@0 | 4059 | |
michael@0 | 4060 | align 4 |
michael@0 | 4061 | convertloop: |
michael@0 | 4062 | movdqa xmm0, [eax] |
michael@0 | 4063 | movdqa xmm1, [eax + 16] |
michael@0 | 4064 | movdqa xmm2, [eax + esi] |
michael@0 | 4065 | movdqa xmm3, [eax + esi + 16] |
michael@0 | 4066 | lea eax, [eax + 32] |
michael@0 | 4067 | pavgb xmm0, xmm2 |
michael@0 | 4068 | pavgb xmm1, xmm3 |
michael@0 | 4069 | psrlw xmm0, 8 // YUYV -> UVUV |
michael@0 | 4070 | psrlw xmm1, 8 |
michael@0 | 4071 | packuswb xmm0, xmm1 |
michael@0 | 4072 | movdqa xmm1, xmm0 |
michael@0 | 4073 | pand xmm0, xmm5 // U |
michael@0 | 4074 | packuswb xmm0, xmm0 |
michael@0 | 4075 | psrlw xmm1, 8 // V |
michael@0 | 4076 | packuswb xmm1, xmm1 |
michael@0 | 4077 | movq qword ptr [edx], xmm0 |
michael@0 | 4078 | movq qword ptr [edx + edi], xmm1 |
michael@0 | 4079 | lea edx, [edx + 8] |
michael@0 | 4080 | sub ecx, 16 |
michael@0 | 4081 | jg convertloop |
michael@0 | 4082 | |
michael@0 | 4083 | pop edi |
michael@0 | 4084 | pop esi |
michael@0 | 4085 | ret |
michael@0 | 4086 | } |
michael@0 | 4087 | } |
michael@0 | 4088 | |
michael@0 | 4089 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4090 | void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, |
michael@0 | 4091 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 4092 | __asm { |
michael@0 | 4093 | push edi |
michael@0 | 4094 | mov eax, [esp + 4 + 4] // src_yuy2 |
michael@0 | 4095 | mov edx, [esp + 4 + 8] // dst_u |
michael@0 | 4096 | mov edi, [esp + 4 + 12] // dst_v |
michael@0 | 4097 | mov ecx, [esp + 4 + 16] // pix |
michael@0 | 4098 | pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
michael@0 | 4099 | psrlw xmm5, 8 |
michael@0 | 4100 | sub edi, edx |
michael@0 | 4101 | |
michael@0 | 4102 | align 4 |
michael@0 | 4103 | convertloop: |
michael@0 | 4104 | movdqa xmm0, [eax] |
michael@0 | 4105 | movdqa xmm1, [eax + 16] |
michael@0 | 4106 | lea eax, [eax + 32] |
michael@0 | 4107 | psrlw xmm0, 8 // YUYV -> UVUV |
michael@0 | 4108 | psrlw xmm1, 8 |
michael@0 | 4109 | packuswb xmm0, xmm1 |
michael@0 | 4110 | movdqa xmm1, xmm0 |
michael@0 | 4111 | pand xmm0, xmm5 // U |
michael@0 | 4112 | packuswb xmm0, xmm0 |
michael@0 | 4113 | psrlw xmm1, 8 // V |
michael@0 | 4114 | packuswb xmm1, xmm1 |
michael@0 | 4115 | movq qword ptr [edx], xmm0 |
michael@0 | 4116 | movq qword ptr [edx + edi], xmm1 |
michael@0 | 4117 | lea edx, [edx + 8] |
michael@0 | 4118 | sub ecx, 16 |
michael@0 | 4119 | jg convertloop |
michael@0 | 4120 | |
michael@0 | 4121 | pop edi |
michael@0 | 4122 | ret |
michael@0 | 4123 | } |
michael@0 | 4124 | } |
michael@0 | 4125 | |
michael@0 | 4126 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4127 | void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, |
michael@0 | 4128 | uint8* dst_y, int pix) { |
michael@0 | 4129 | __asm { |
michael@0 | 4130 | mov eax, [esp + 4] // src_yuy2 |
michael@0 | 4131 | mov edx, [esp + 8] // dst_y |
michael@0 | 4132 | mov ecx, [esp + 12] // pix |
michael@0 | 4133 | pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
michael@0 | 4134 | psrlw xmm5, 8 |
michael@0 | 4135 | |
michael@0 | 4136 | align 4 |
michael@0 | 4137 | convertloop: |
michael@0 | 4138 | movdqu xmm0, [eax] |
michael@0 | 4139 | movdqu xmm1, [eax + 16] |
michael@0 | 4140 | lea eax, [eax + 32] |
michael@0 | 4141 | pand xmm0, xmm5 // even bytes are Y |
michael@0 | 4142 | pand xmm1, xmm5 |
michael@0 | 4143 | packuswb xmm0, xmm1 |
michael@0 | 4144 | sub ecx, 16 |
michael@0 | 4145 | movdqu [edx], xmm0 |
michael@0 | 4146 | lea edx, [edx + 16] |
michael@0 | 4147 | jg convertloop |
michael@0 | 4148 | ret |
michael@0 | 4149 | } |
michael@0 | 4150 | } |
michael@0 | 4151 | |
michael@0 | 4152 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4153 | void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, |
michael@0 | 4154 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 4155 | __asm { |
michael@0 | 4156 | push esi |
michael@0 | 4157 | push edi |
michael@0 | 4158 | mov eax, [esp + 8 + 4] // src_yuy2 |
michael@0 | 4159 | mov esi, [esp + 8 + 8] // stride_yuy2 |
michael@0 | 4160 | mov edx, [esp + 8 + 12] // dst_u |
michael@0 | 4161 | mov edi, [esp + 8 + 16] // dst_v |
michael@0 | 4162 | mov ecx, [esp + 8 + 20] // pix |
michael@0 | 4163 | pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
michael@0 | 4164 | psrlw xmm5, 8 |
michael@0 | 4165 | sub edi, edx |
michael@0 | 4166 | |
michael@0 | 4167 | align 4 |
michael@0 | 4168 | convertloop: |
michael@0 | 4169 | movdqu xmm0, [eax] |
michael@0 | 4170 | movdqu xmm1, [eax + 16] |
michael@0 | 4171 | movdqu xmm2, [eax + esi] |
michael@0 | 4172 | movdqu xmm3, [eax + esi + 16] |
michael@0 | 4173 | lea eax, [eax + 32] |
michael@0 | 4174 | pavgb xmm0, xmm2 |
michael@0 | 4175 | pavgb xmm1, xmm3 |
michael@0 | 4176 | psrlw xmm0, 8 // YUYV -> UVUV |
michael@0 | 4177 | psrlw xmm1, 8 |
michael@0 | 4178 | packuswb xmm0, xmm1 |
michael@0 | 4179 | movdqa xmm1, xmm0 |
michael@0 | 4180 | pand xmm0, xmm5 // U |
michael@0 | 4181 | packuswb xmm0, xmm0 |
michael@0 | 4182 | psrlw xmm1, 8 // V |
michael@0 | 4183 | packuswb xmm1, xmm1 |
michael@0 | 4184 | movq qword ptr [edx], xmm0 |
michael@0 | 4185 | movq qword ptr [edx + edi], xmm1 |
michael@0 | 4186 | lea edx, [edx + 8] |
michael@0 | 4187 | sub ecx, 16 |
michael@0 | 4188 | jg convertloop |
michael@0 | 4189 | |
michael@0 | 4190 | pop edi |
michael@0 | 4191 | pop esi |
michael@0 | 4192 | ret |
michael@0 | 4193 | } |
michael@0 | 4194 | } |
michael@0 | 4195 | |
michael@0 | 4196 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4197 | void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, |
michael@0 | 4198 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 4199 | __asm { |
michael@0 | 4200 | push edi |
michael@0 | 4201 | mov eax, [esp + 4 + 4] // src_yuy2 |
michael@0 | 4202 | mov edx, [esp + 4 + 8] // dst_u |
michael@0 | 4203 | mov edi, [esp + 4 + 12] // dst_v |
michael@0 | 4204 | mov ecx, [esp + 4 + 16] // pix |
michael@0 | 4205 | pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
michael@0 | 4206 | psrlw xmm5, 8 |
michael@0 | 4207 | sub edi, edx |
michael@0 | 4208 | |
michael@0 | 4209 | align 4 |
michael@0 | 4210 | convertloop: |
michael@0 | 4211 | movdqu xmm0, [eax] |
michael@0 | 4212 | movdqu xmm1, [eax + 16] |
michael@0 | 4213 | lea eax, [eax + 32] |
michael@0 | 4214 | psrlw xmm0, 8 // YUYV -> UVUV |
michael@0 | 4215 | psrlw xmm1, 8 |
michael@0 | 4216 | packuswb xmm0, xmm1 |
michael@0 | 4217 | movdqa xmm1, xmm0 |
michael@0 | 4218 | pand xmm0, xmm5 // U |
michael@0 | 4219 | packuswb xmm0, xmm0 |
michael@0 | 4220 | psrlw xmm1, 8 // V |
michael@0 | 4221 | packuswb xmm1, xmm1 |
michael@0 | 4222 | movq qword ptr [edx], xmm0 |
michael@0 | 4223 | movq qword ptr [edx + edi], xmm1 |
michael@0 | 4224 | lea edx, [edx + 8] |
michael@0 | 4225 | sub ecx, 16 |
michael@0 | 4226 | jg convertloop |
michael@0 | 4227 | |
michael@0 | 4228 | pop edi |
michael@0 | 4229 | ret |
michael@0 | 4230 | } |
michael@0 | 4231 | } |
michael@0 | 4232 | |
michael@0 | 4233 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4234 | void UYVYToYRow_SSE2(const uint8* src_uyvy, |
michael@0 | 4235 | uint8* dst_y, int pix) { |
michael@0 | 4236 | __asm { |
michael@0 | 4237 | mov eax, [esp + 4] // src_uyvy |
michael@0 | 4238 | mov edx, [esp + 8] // dst_y |
michael@0 | 4239 | mov ecx, [esp + 12] // pix |
michael@0 | 4240 | |
michael@0 | 4241 | align 4 |
michael@0 | 4242 | convertloop: |
michael@0 | 4243 | movdqa xmm0, [eax] |
michael@0 | 4244 | movdqa xmm1, [eax + 16] |
michael@0 | 4245 | lea eax, [eax + 32] |
michael@0 | 4246 | psrlw xmm0, 8 // odd bytes are Y |
michael@0 | 4247 | psrlw xmm1, 8 |
michael@0 | 4248 | packuswb xmm0, xmm1 |
michael@0 | 4249 | sub ecx, 16 |
michael@0 | 4250 | movdqa [edx], xmm0 |
michael@0 | 4251 | lea edx, [edx + 16] |
michael@0 | 4252 | jg convertloop |
michael@0 | 4253 | ret |
michael@0 | 4254 | } |
michael@0 | 4255 | } |
michael@0 | 4256 | |
michael@0 | 4257 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4258 | void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, |
michael@0 | 4259 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 4260 | __asm { |
michael@0 | 4261 | push esi |
michael@0 | 4262 | push edi |
michael@0 | 4263 | mov eax, [esp + 8 + 4] // src_uyvy |
michael@0 | 4264 | mov esi, [esp + 8 + 8] // stride_uyvy |
michael@0 | 4265 | mov edx, [esp + 8 + 12] // dst_u |
michael@0 | 4266 | mov edi, [esp + 8 + 16] // dst_v |
michael@0 | 4267 | mov ecx, [esp + 8 + 20] // pix |
michael@0 | 4268 | pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
michael@0 | 4269 | psrlw xmm5, 8 |
michael@0 | 4270 | sub edi, edx |
michael@0 | 4271 | |
michael@0 | 4272 | align 4 |
michael@0 | 4273 | convertloop: |
michael@0 | 4274 | movdqa xmm0, [eax] |
michael@0 | 4275 | movdqa xmm1, [eax + 16] |
michael@0 | 4276 | movdqa xmm2, [eax + esi] |
michael@0 | 4277 | movdqa xmm3, [eax + esi + 16] |
michael@0 | 4278 | lea eax, [eax + 32] |
michael@0 | 4279 | pavgb xmm0, xmm2 |
michael@0 | 4280 | pavgb xmm1, xmm3 |
michael@0 | 4281 | pand xmm0, xmm5 // UYVY -> UVUV |
michael@0 | 4282 | pand xmm1, xmm5 |
michael@0 | 4283 | packuswb xmm0, xmm1 |
michael@0 | 4284 | movdqa xmm1, xmm0 |
michael@0 | 4285 | pand xmm0, xmm5 // U |
michael@0 | 4286 | packuswb xmm0, xmm0 |
michael@0 | 4287 | psrlw xmm1, 8 // V |
michael@0 | 4288 | packuswb xmm1, xmm1 |
michael@0 | 4289 | movq qword ptr [edx], xmm0 |
michael@0 | 4290 | movq qword ptr [edx + edi], xmm1 |
michael@0 | 4291 | lea edx, [edx + 8] |
michael@0 | 4292 | sub ecx, 16 |
michael@0 | 4293 | jg convertloop |
michael@0 | 4294 | |
michael@0 | 4295 | pop edi |
michael@0 | 4296 | pop esi |
michael@0 | 4297 | ret |
michael@0 | 4298 | } |
michael@0 | 4299 | } |
michael@0 | 4300 | |
michael@0 | 4301 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4302 | void UYVYToUV422Row_SSE2(const uint8* src_uyvy, |
michael@0 | 4303 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 4304 | __asm { |
michael@0 | 4305 | push edi |
michael@0 | 4306 | mov eax, [esp + 4 + 4] // src_uyvy |
michael@0 | 4307 | mov edx, [esp + 4 + 8] // dst_u |
michael@0 | 4308 | mov edi, [esp + 4 + 12] // dst_v |
michael@0 | 4309 | mov ecx, [esp + 4 + 16] // pix |
michael@0 | 4310 | pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
michael@0 | 4311 | psrlw xmm5, 8 |
michael@0 | 4312 | sub edi, edx |
michael@0 | 4313 | |
michael@0 | 4314 | align 4 |
michael@0 | 4315 | convertloop: |
michael@0 | 4316 | movdqa xmm0, [eax] |
michael@0 | 4317 | movdqa xmm1, [eax + 16] |
michael@0 | 4318 | lea eax, [eax + 32] |
michael@0 | 4319 | pand xmm0, xmm5 // UYVY -> UVUV |
michael@0 | 4320 | pand xmm1, xmm5 |
michael@0 | 4321 | packuswb xmm0, xmm1 |
michael@0 | 4322 | movdqa xmm1, xmm0 |
michael@0 | 4323 | pand xmm0, xmm5 // U |
michael@0 | 4324 | packuswb xmm0, xmm0 |
michael@0 | 4325 | psrlw xmm1, 8 // V |
michael@0 | 4326 | packuswb xmm1, xmm1 |
michael@0 | 4327 | movq qword ptr [edx], xmm0 |
michael@0 | 4328 | movq qword ptr [edx + edi], xmm1 |
michael@0 | 4329 | lea edx, [edx + 8] |
michael@0 | 4330 | sub ecx, 16 |
michael@0 | 4331 | jg convertloop |
michael@0 | 4332 | |
michael@0 | 4333 | pop edi |
michael@0 | 4334 | ret |
michael@0 | 4335 | } |
michael@0 | 4336 | } |
michael@0 | 4337 | |
michael@0 | 4338 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4339 | void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, |
michael@0 | 4340 | uint8* dst_y, int pix) { |
michael@0 | 4341 | __asm { |
michael@0 | 4342 | mov eax, [esp + 4] // src_uyvy |
michael@0 | 4343 | mov edx, [esp + 8] // dst_y |
michael@0 | 4344 | mov ecx, [esp + 12] // pix |
michael@0 | 4345 | |
michael@0 | 4346 | align 4 |
michael@0 | 4347 | convertloop: |
michael@0 | 4348 | movdqu xmm0, [eax] |
michael@0 | 4349 | movdqu xmm1, [eax + 16] |
michael@0 | 4350 | lea eax, [eax + 32] |
michael@0 | 4351 | psrlw xmm0, 8 // odd bytes are Y |
michael@0 | 4352 | psrlw xmm1, 8 |
michael@0 | 4353 | packuswb xmm0, xmm1 |
michael@0 | 4354 | sub ecx, 16 |
michael@0 | 4355 | movdqu [edx], xmm0 |
michael@0 | 4356 | lea edx, [edx + 16] |
michael@0 | 4357 | jg convertloop |
michael@0 | 4358 | ret |
michael@0 | 4359 | } |
michael@0 | 4360 | } |
michael@0 | 4361 | |
michael@0 | 4362 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4363 | void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, |
michael@0 | 4364 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 4365 | __asm { |
michael@0 | 4366 | push esi |
michael@0 | 4367 | push edi |
michael@0 | 4368 | mov eax, [esp + 8 + 4] // src_uyvy |
michael@0 | 4369 | mov esi, [esp + 8 + 8] // stride_uyvy |
michael@0 | 4370 | mov edx, [esp + 8 + 12] // dst_u |
michael@0 | 4371 | mov edi, [esp + 8 + 16] // dst_v |
michael@0 | 4372 | mov ecx, [esp + 8 + 20] // pix |
michael@0 | 4373 | pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
michael@0 | 4374 | psrlw xmm5, 8 |
michael@0 | 4375 | sub edi, edx |
michael@0 | 4376 | |
michael@0 | 4377 | align 4 |
michael@0 | 4378 | convertloop: |
michael@0 | 4379 | movdqu xmm0, [eax] |
michael@0 | 4380 | movdqu xmm1, [eax + 16] |
michael@0 | 4381 | movdqu xmm2, [eax + esi] |
michael@0 | 4382 | movdqu xmm3, [eax + esi + 16] |
michael@0 | 4383 | lea eax, [eax + 32] |
michael@0 | 4384 | pavgb xmm0, xmm2 |
michael@0 | 4385 | pavgb xmm1, xmm3 |
michael@0 | 4386 | pand xmm0, xmm5 // UYVY -> UVUV |
michael@0 | 4387 | pand xmm1, xmm5 |
michael@0 | 4388 | packuswb xmm0, xmm1 |
michael@0 | 4389 | movdqa xmm1, xmm0 |
michael@0 | 4390 | pand xmm0, xmm5 // U |
michael@0 | 4391 | packuswb xmm0, xmm0 |
michael@0 | 4392 | psrlw xmm1, 8 // V |
michael@0 | 4393 | packuswb xmm1, xmm1 |
michael@0 | 4394 | movq qword ptr [edx], xmm0 |
michael@0 | 4395 | movq qword ptr [edx + edi], xmm1 |
michael@0 | 4396 | lea edx, [edx + 8] |
michael@0 | 4397 | sub ecx, 16 |
michael@0 | 4398 | jg convertloop |
michael@0 | 4399 | |
michael@0 | 4400 | pop edi |
michael@0 | 4401 | pop esi |
michael@0 | 4402 | ret |
michael@0 | 4403 | } |
michael@0 | 4404 | } |
michael@0 | 4405 | |
michael@0 | 4406 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4407 | void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, |
michael@0 | 4408 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 4409 | __asm { |
michael@0 | 4410 | push edi |
michael@0 | 4411 | mov eax, [esp + 4 + 4] // src_uyvy |
michael@0 | 4412 | mov edx, [esp + 4 + 8] // dst_u |
michael@0 | 4413 | mov edi, [esp + 4 + 12] // dst_v |
michael@0 | 4414 | mov ecx, [esp + 4 + 16] // pix |
michael@0 | 4415 | pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff |
michael@0 | 4416 | psrlw xmm5, 8 |
michael@0 | 4417 | sub edi, edx |
michael@0 | 4418 | |
michael@0 | 4419 | align 4 |
michael@0 | 4420 | convertloop: |
michael@0 | 4421 | movdqu xmm0, [eax] |
michael@0 | 4422 | movdqu xmm1, [eax + 16] |
michael@0 | 4423 | lea eax, [eax + 32] |
michael@0 | 4424 | pand xmm0, xmm5 // UYVY -> UVUV |
michael@0 | 4425 | pand xmm1, xmm5 |
michael@0 | 4426 | packuswb xmm0, xmm1 |
michael@0 | 4427 | movdqa xmm1, xmm0 |
michael@0 | 4428 | pand xmm0, xmm5 // U |
michael@0 | 4429 | packuswb xmm0, xmm0 |
michael@0 | 4430 | psrlw xmm1, 8 // V |
michael@0 | 4431 | packuswb xmm1, xmm1 |
michael@0 | 4432 | movq qword ptr [edx], xmm0 |
michael@0 | 4433 | movq qword ptr [edx + edi], xmm1 |
michael@0 | 4434 | lea edx, [edx + 8] |
michael@0 | 4435 | sub ecx, 16 |
michael@0 | 4436 | jg convertloop |
michael@0 | 4437 | |
michael@0 | 4438 | pop edi |
michael@0 | 4439 | ret |
michael@0 | 4440 | } |
michael@0 | 4441 | } |
michael@0 | 4442 | #endif // HAS_YUY2TOYROW_SSE2 |
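
// Editorial note: a minimal scalar sketch of what the YUY2/UYVY row kernels
// above (AVX2 and SSE2 variants) compute.  The helper names are hypothetical
// and not part of libyuv.  YUY2 stores bytes as Y0 U Y1 V and UYVY as
// U Y0 V Y1; the Y kernels keep one byte of each pair, and the UV kernels
// average two source rows (pavgb rounds up) before splitting U and V.  The
// UV422 variants are identical except that the two-row average is skipped.
static void YUY2ToYRow_Sketch(const uint8* src_yuy2, uint8* dst_y, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_y[x] = src_yuy2[x * 2];  // even bytes are Y.
  }
}

static void YUY2ToUVRow_Sketch(const uint8* src_yuy2, int stride_yuy2,
                               uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* next = src_yuy2 + stride_yuy2;
  int x;
  for (x = 0; x < pix; x += 2) {
    // One U and one V per 2 pixels, averaged over 2 rows with round-up.
    dst_u[x / 2] = (uint8)((src_yuy2[x * 2 + 1] + next[x * 2 + 1] + 1) >> 1);
    dst_v[x / 2] = (uint8)((src_yuy2[x * 2 + 3] + next[x * 2 + 3] + 1) >> 1);
  }
}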
michael@0 | 4443 | |
michael@0 | 4444 | #ifdef HAS_ARGBBLENDROW_SSE2 |
michael@0 | 4445 | // Blend 4 pixels at a time. |
michael@0 | 4446 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4447 | void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
michael@0 | 4448 | uint8* dst_argb, int width) { |
michael@0 | 4449 | __asm { |
michael@0 | 4450 | push esi |
michael@0 | 4451 | mov eax, [esp + 4 + 4] // src_argb0 |
michael@0 | 4452 | mov esi, [esp + 4 + 8] // src_argb1 |
michael@0 | 4453 | mov edx, [esp + 4 + 12] // dst_argb |
michael@0 | 4454 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 4455 | pcmpeqb xmm7, xmm7 // generate constant 1 |
michael@0 | 4456 | psrlw xmm7, 15 |
michael@0 | 4457 | pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff |
michael@0 | 4458 | psrlw xmm6, 8 |
michael@0 | 4459 | pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 |
michael@0 | 4460 | psllw xmm5, 8 |
michael@0 | 4461 | pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
michael@0 | 4462 | pslld xmm4, 24 |
michael@0 | 4463 | |
michael@0 | 4464 | sub ecx, 1 |
michael@0 | 4465 | je convertloop1 // only 1 pixel? |
michael@0 | 4466 | jl convertloop1b |
michael@0 | 4467 | |
michael@0 | 4468 | // 1 pixel loop until destination pointer is aligned. |
michael@0 | 4469 | alignloop1: |
michael@0 | 4470 | test edx, 15 // aligned? |
michael@0 | 4471 | je alignloop1b |
michael@0 | 4472 | movd xmm3, [eax] |
michael@0 | 4473 | lea eax, [eax + 4] |
michael@0 | 4474 | movdqa xmm0, xmm3 // src argb |
michael@0 | 4475 | pxor xmm3, xmm4 // ~alpha |
michael@0 | 4476 | movd xmm2, [esi] // _r_b |
michael@0 | 4477 | psrlw xmm3, 8 // alpha |
michael@0 | 4478 | pshufhw xmm3, xmm3, 0F5h // 8 alpha words |
michael@0 | 4479 | pshuflw xmm3, xmm3, 0F5h |
michael@0 | 4480 | pand xmm2, xmm6 // _r_b |
michael@0 | 4481 | paddw xmm3, xmm7 // 256 - alpha |
michael@0 | 4482 | pmullw xmm2, xmm3 // _r_b * alpha |
michael@0 | 4483 | movd xmm1, [esi] // _a_g |
michael@0 | 4484 | lea esi, [esi + 4] |
michael@0 | 4485 | psrlw xmm1, 8 // _a_g |
michael@0 | 4486 | por xmm0, xmm4 // set alpha to 255 |
michael@0 | 4487 | pmullw xmm1, xmm3 // _a_g * alpha |
michael@0 | 4488 | psrlw xmm2, 8 // _r_b convert to 8 bits again |
michael@0 | 4489 | paddusb xmm0, xmm2 // + src argb |
michael@0 | 4490 | pand xmm1, xmm5 // a_g_ convert to 8 bits again |
michael@0 | 4491 | paddusb xmm0, xmm1 // + src argb |
michael@0 | 4492 | sub ecx, 1 |
michael@0 | 4493 | movd [edx], xmm0 |
michael@0 | 4494 | lea edx, [edx + 4] |
michael@0 | 4495 | jge alignloop1 |
michael@0 | 4496 | |
michael@0 | 4497 | alignloop1b: |
michael@0 | 4498 | add ecx, 1 - 4 |
michael@0 | 4499 | jl convertloop4b |
michael@0 | 4500 | |
michael@0 | 4501 | // 4 pixel loop. |
michael@0 | 4502 | convertloop4: |
michael@0 | 4503 | movdqu xmm3, [eax] // src argb |
michael@0 | 4504 | lea eax, [eax + 16] |
michael@0 | 4505 | movdqa xmm0, xmm3 // src argb |
michael@0 | 4506 | pxor xmm3, xmm4 // ~alpha |
michael@0 | 4507 | movdqu xmm2, [esi] // _r_b |
michael@0 | 4508 | psrlw xmm3, 8 // alpha |
michael@0 | 4509 | pshufhw xmm3, xmm3, 0F5h // 8 alpha words |
michael@0 | 4510 | pshuflw xmm3, xmm3, 0F5h |
michael@0 | 4511 | pand xmm2, xmm6 // _r_b |
michael@0 | 4512 | paddw xmm3, xmm7 // 256 - alpha |
michael@0 | 4513 | pmullw xmm2, xmm3 // _r_b * alpha |
michael@0 | 4514 | movdqu xmm1, [esi] // _a_g |
michael@0 | 4515 | lea esi, [esi + 16] |
michael@0 | 4516 | psrlw xmm1, 8 // _a_g |
michael@0 | 4517 | por xmm0, xmm4 // set alpha to 255 |
michael@0 | 4518 | pmullw xmm1, xmm3 // _a_g * alpha |
michael@0 | 4519 | psrlw xmm2, 8 // _r_b convert to 8 bits again |
michael@0 | 4520 | paddusb xmm0, xmm2 // + src argb |
michael@0 | 4521 | pand xmm1, xmm5 // a_g_ convert to 8 bits again |
michael@0 | 4522 | paddusb xmm0, xmm1 // + src argb |
michael@0 | 4523 | sub ecx, 4 |
michael@0 | 4524 | movdqa [edx], xmm0 |
michael@0 | 4525 | lea edx, [edx + 16] |
michael@0 | 4526 | jge convertloop4 |
michael@0 | 4527 | |
michael@0 | 4528 | convertloop4b: |
michael@0 | 4529 | add ecx, 4 - 1 |
michael@0 | 4530 | jl convertloop1b |
michael@0 | 4531 | |
michael@0 | 4532 | // 1 pixel loop. |
michael@0 | 4533 | convertloop1: |
michael@0 | 4534 | movd xmm3, [eax] // src argb |
michael@0 | 4535 | lea eax, [eax + 4] |
michael@0 | 4536 | movdqa xmm0, xmm3 // src argb |
michael@0 | 4537 | pxor xmm3, xmm4 // ~alpha |
michael@0 | 4538 | movd xmm2, [esi] // _r_b |
michael@0 | 4539 | psrlw xmm3, 8 // alpha |
michael@0 | 4540 | pshufhw xmm3, xmm3, 0F5h // 8 alpha words |
michael@0 | 4541 | pshuflw xmm3, xmm3, 0F5h |
michael@0 | 4542 | pand xmm2, xmm6 // _r_b |
michael@0 | 4543 | paddw xmm3, xmm7 // 256 - alpha |
michael@0 | 4544 | pmullw xmm2, xmm3 // _r_b * alpha |
michael@0 | 4545 | movd xmm1, [esi] // _a_g |
michael@0 | 4546 | lea esi, [esi + 4] |
michael@0 | 4547 | psrlw xmm1, 8 // _a_g |
michael@0 | 4548 | por xmm0, xmm4 // set alpha to 255 |
michael@0 | 4549 | pmullw xmm1, xmm3 // _a_g * alpha |
michael@0 | 4550 | psrlw xmm2, 8 // _r_b convert to 8 bits again |
michael@0 | 4551 | paddusb xmm0, xmm2 // + src argb |
michael@0 | 4552 | pand xmm1, xmm5 // a_g_ convert to 8 bits again |
michael@0 | 4553 | paddusb xmm0, xmm1 // + src argb |
michael@0 | 4554 | sub ecx, 1 |
michael@0 | 4555 | movd [edx], xmm0 |
michael@0 | 4556 | lea edx, [edx + 4] |
michael@0 | 4557 | jge convertloop1 |
michael@0 | 4558 | |
michael@0 | 4559 | convertloop1b: |
michael@0 | 4560 | pop esi |
michael@0 | 4561 | ret |
michael@0 | 4562 | } |
michael@0 | 4563 | } |
michael@0 | 4564 | #endif // HAS_ARGBBLENDROW_SSE2 |
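
// Editorial note: a minimal scalar sketch of the blend computed by
// ARGBBlendRow_SSE2 above (and the SSSE3 variant below).  The helper name is
// hypothetical and not part of libyuv; the alignment pre/post loops of the
// kernel are omitted and the shift truncates exactly as the >> 8 does.
// Per channel: dst = sat(fg + bg * (256 - fg_alpha) / 256), dst alpha = 255.
static void ARGBBlendRow_Sketch(const uint8* src_argb0, const uint8* src_argb1,
                                uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int a = src_argb0[x * 4 + 3];
    int i;
    for (i = 0; i < 3; ++i) {  // B, G, R.
      int v = src_argb0[x * 4 + i] +
              ((src_argb1[x * 4 + i] * (256 - a)) >> 8);
      dst_argb[x * 4 + i] = (uint8)(v > 255 ? 255 : v);
    }
    dst_argb[x * 4 + 3] = 255;  // alpha forced to opaque, as por xmm0, xmm4 does.
  }
}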
michael@0 | 4565 | |
michael@0 | 4566 | #ifdef HAS_ARGBBLENDROW_SSSE3 |
michael@0 | 4567 | // Shuffle table for isolating alpha. |
michael@0 | 4568 | static const uvec8 kShuffleAlpha = { |
michael@0 | 4569 | 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, |
michael@0 | 4570 | 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 |
michael@0 | 4571 | }; |
michael@0 | 4572 | // Same as SSE2, but replaces: |
michael@0 | 4573 | // psrlw xmm3, 8 // alpha |
michael@0 | 4574 | // pshufhw xmm3, xmm3, 0F5h // 8 alpha words |
michael@0 | 4575 | // pshuflw xmm3, xmm3, 0F5h |
michael@0 | 4576 | // with: |
michael@0 | 4577 | // pshufb xmm3, kShuffleAlpha // alpha |
michael@0 | 4578 | // Blend 4 pixels at a time. |
michael@0 | 4579 | |
michael@0 | 4580 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4581 | void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, |
michael@0 | 4582 | uint8* dst_argb, int width) { |
michael@0 | 4583 | __asm { |
michael@0 | 4584 | push esi |
michael@0 | 4585 | mov eax, [esp + 4 + 4] // src_argb0 |
michael@0 | 4586 | mov esi, [esp + 4 + 8] // src_argb1 |
michael@0 | 4587 | mov edx, [esp + 4 + 12] // dst_argb |
michael@0 | 4588 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 4589 | pcmpeqb xmm7, xmm7 // generate constant 0x0001 |
michael@0 | 4590 | psrlw xmm7, 15 |
michael@0 | 4591 | pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff |
michael@0 | 4592 | psrlw xmm6, 8 |
michael@0 | 4593 | pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 |
michael@0 | 4594 | psllw xmm5, 8 |
michael@0 | 4595 | pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
michael@0 | 4596 | pslld xmm4, 24 |
michael@0 | 4597 | |
michael@0 | 4598 | sub ecx, 1 |
michael@0 | 4599 | je convertloop1 // only 1 pixel? |
michael@0 | 4600 | jl convertloop1b |
michael@0 | 4601 | |
michael@0 | 4602 | // 1 pixel loop until destination pointer is aligned. |
michael@0 | 4603 | alignloop1: |
michael@0 | 4604 | test edx, 15 // aligned? |
michael@0 | 4605 | je alignloop1b |
michael@0 | 4606 | movd xmm3, [eax] |
michael@0 | 4607 | lea eax, [eax + 4] |
michael@0 | 4608 | movdqa xmm0, xmm3 // src argb |
michael@0 | 4609 | pxor xmm3, xmm4 // ~alpha |
michael@0 | 4610 | movd xmm2, [esi] // _r_b |
michael@0 | 4611 | pshufb xmm3, kShuffleAlpha // alpha |
michael@0 | 4612 | pand xmm2, xmm6 // _r_b |
michael@0 | 4613 | paddw xmm3, xmm7 // 256 - alpha |
michael@0 | 4614 | pmullw xmm2, xmm3 // _r_b * alpha |
michael@0 | 4615 | movd xmm1, [esi] // _a_g |
michael@0 | 4616 | lea esi, [esi + 4] |
michael@0 | 4617 | psrlw xmm1, 8 // _a_g |
michael@0 | 4618 | por xmm0, xmm4 // set alpha to 255 |
michael@0 | 4619 | pmullw xmm1, xmm3 // _a_g * alpha |
michael@0 | 4620 | psrlw xmm2, 8 // _r_b convert to 8 bits again |
michael@0 | 4621 | paddusb xmm0, xmm2 // + src argb |
michael@0 | 4622 | pand xmm1, xmm5 // a_g_ convert to 8 bits again |
michael@0 | 4623 | paddusb xmm0, xmm1 // + src argb |
michael@0 | 4624 | sub ecx, 1 |
michael@0 | 4625 | movd [edx], xmm0 |
michael@0 | 4626 | lea edx, [edx + 4] |
michael@0 | 4627 | jge alignloop1 |
michael@0 | 4628 | |
michael@0 | 4629 | alignloop1b: |
michael@0 | 4630 | add ecx, 1 - 4 |
michael@0 | 4631 | jl convertloop4b |
michael@0 | 4632 | |
michael@0 | 4633 | test eax, 15 // unaligned? |
michael@0 | 4634 | jne convertuloop4 |
michael@0 | 4635 | test esi, 15 // unaligned? |
michael@0 | 4636 | jne convertuloop4 |
michael@0 | 4637 | |
michael@0 | 4638 | // 4 pixel loop. |
michael@0 | 4639 | convertloop4: |
michael@0 | 4640 | movdqa xmm3, [eax] // src argb |
michael@0 | 4641 | lea eax, [eax + 16] |
michael@0 | 4642 | movdqa xmm0, xmm3 // src argb |
michael@0 | 4643 | pxor xmm3, xmm4 // ~alpha |
michael@0 | 4644 | movdqa xmm2, [esi] // _r_b |
michael@0 | 4645 | pshufb xmm3, kShuffleAlpha // alpha |
michael@0 | 4646 | pand xmm2, xmm6 // _r_b |
michael@0 | 4647 | paddw xmm3, xmm7 // 256 - alpha |
michael@0 | 4648 | pmullw xmm2, xmm3 // _r_b * alpha |
michael@0 | 4649 | movdqa xmm1, [esi] // _a_g |
michael@0 | 4650 | lea esi, [esi + 16] |
michael@0 | 4651 | psrlw xmm1, 8 // _a_g |
michael@0 | 4652 | por xmm0, xmm4 // set alpha to 255 |
michael@0 | 4653 | pmullw xmm1, xmm3 // _a_g * alpha |
michael@0 | 4654 | psrlw xmm2, 8 // _r_b convert to 8 bits again |
michael@0 | 4655 | paddusb xmm0, xmm2 // + src argb |
michael@0 | 4656 | pand xmm1, xmm5 // a_g_ convert to 8 bits again |
michael@0 | 4657 | paddusb xmm0, xmm1 // + src argb |
michael@0 | 4658 | sub ecx, 4 |
michael@0 | 4659 | movdqa [edx], xmm0 |
michael@0 | 4660 | lea edx, [edx + 16] |
michael@0 | 4661 | jge convertloop4 |
michael@0 | 4662 | jmp convertloop4b |
michael@0 | 4663 | |
michael@0 | 4664 | // 4 pixel unaligned loop. |
michael@0 | 4665 | convertuloop4: |
michael@0 | 4666 | movdqu xmm3, [eax] // src argb |
michael@0 | 4667 | lea eax, [eax + 16] |
michael@0 | 4668 | movdqa xmm0, xmm3 // src argb |
michael@0 | 4669 | pxor xmm3, xmm4 // ~alpha |
michael@0 | 4670 | movdqu xmm2, [esi] // _r_b |
michael@0 | 4671 | pshufb xmm3, kShuffleAlpha // alpha |
michael@0 | 4672 | pand xmm2, xmm6 // _r_b |
michael@0 | 4673 | paddw xmm3, xmm7 // 256 - alpha |
michael@0 | 4674 | pmullw xmm2, xmm3 // _r_b * alpha |
michael@0 | 4675 | movdqu xmm1, [esi] // _a_g |
michael@0 | 4676 | lea esi, [esi + 16] |
michael@0 | 4677 | psrlw xmm1, 8 // _a_g |
michael@0 | 4678 | por xmm0, xmm4 // set alpha to 255 |
michael@0 | 4679 | pmullw xmm1, xmm3 // _a_g * alpha |
michael@0 | 4680 | psrlw xmm2, 8 // _r_b convert to 8 bits again |
michael@0 | 4681 | paddusb xmm0, xmm2 // + src argb |
michael@0 | 4682 | pand xmm1, xmm5 // a_g_ convert to 8 bits again |
michael@0 | 4683 | paddusb xmm0, xmm1 // + src argb |
michael@0 | 4684 | sub ecx, 4 |
michael@0 | 4685 | movdqa [edx], xmm0 |
michael@0 | 4686 | lea edx, [edx + 16] |
michael@0 | 4687 | jge convertuloop4 |
michael@0 | 4688 | |
michael@0 | 4689 | convertloop4b: |
michael@0 | 4690 | add ecx, 4 - 1 |
michael@0 | 4691 | jl convertloop1b |
michael@0 | 4692 | |
michael@0 | 4693 | // 1 pixel loop. |
michael@0 | 4694 | convertloop1: |
michael@0 | 4695 | movd xmm3, [eax] // src argb |
michael@0 | 4696 | lea eax, [eax + 4] |
michael@0 | 4697 | movdqa xmm0, xmm3 // src argb |
michael@0 | 4698 | pxor xmm3, xmm4 // ~alpha |
michael@0 | 4699 | movd xmm2, [esi] // _r_b |
michael@0 | 4700 | pshufb xmm3, kShuffleAlpha // alpha |
michael@0 | 4701 | pand xmm2, xmm6 // _r_b |
michael@0 | 4702 | paddw xmm3, xmm7 // 256 - alpha |
michael@0 | 4703 | pmullw xmm2, xmm3 // _r_b * alpha |
michael@0 | 4704 | movd xmm1, [esi] // _a_g |
michael@0 | 4705 | lea esi, [esi + 4] |
michael@0 | 4706 | psrlw xmm1, 8 // _a_g |
michael@0 | 4707 | por xmm0, xmm4 // set alpha to 255 |
michael@0 | 4708 | pmullw xmm1, xmm3 // _a_g * alpha |
michael@0 | 4709 | psrlw xmm2, 8 // _r_b convert to 8 bits again |
michael@0 | 4710 | paddusb xmm0, xmm2 // + src argb |
michael@0 | 4711 | pand xmm1, xmm5 // a_g_ convert to 8 bits again |
michael@0 | 4712 | paddusb xmm0, xmm1 // + src argb |
michael@0 | 4713 | sub ecx, 1 |
michael@0 | 4714 | movd [edx], xmm0 |
michael@0 | 4715 | lea edx, [edx + 4] |
michael@0 | 4716 | jge convertloop1 |
michael@0 | 4717 | |
michael@0 | 4718 | convertloop1b: |
michael@0 | 4719 | pop esi |
michael@0 | 4720 | ret |
michael@0 | 4721 | } |
michael@0 | 4722 | } |
michael@0 | 4723 | #endif // HAS_ARGBBLENDROW_SSSE3 |
michael@0 | 4724 | |
michael@0 | 4725 | #ifdef HAS_ARGBATTENUATEROW_SSE2 |
michael@0 | 4726 | // Attenuate 4 pixels at a time. |
michael@0 | 4727 | // Aligned to 16 bytes. |
michael@0 | 4728 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4729 | void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { |
michael@0 | 4730 | __asm { |
michael@0 | 4731 | mov eax, [esp + 4] // src_argb0 |
michael@0 | 4732 | mov edx, [esp + 8] // dst_argb |
michael@0 | 4733 | mov ecx, [esp + 12] // width |
michael@0 | 4734 | pcmpeqb xmm4, xmm4 // generate mask 0xff000000 |
michael@0 | 4735 | pslld xmm4, 24 |
michael@0 | 4736 | pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff |
michael@0 | 4737 | psrld xmm5, 8 |
michael@0 | 4738 | |
michael@0 | 4739 | align 4 |
michael@0 | 4740 | convertloop: |
michael@0 | 4741 | movdqa xmm0, [eax] // read 4 pixels |
michael@0 | 4742 | punpcklbw xmm0, xmm0 // first 2 |
michael@0 | 4743 | pshufhw xmm2, xmm0, 0FFh // 8 alpha words |
michael@0 | 4744 | pshuflw xmm2, xmm2, 0FFh |
michael@0 | 4745 | pmulhuw xmm0, xmm2 // rgb * a |
michael@0 | 4746 | movdqa xmm1, [eax] // read 4 pixels |
michael@0 | 4747 | punpckhbw xmm1, xmm1 // next 2 pixels |
michael@0 | 4748 | pshufhw xmm2, xmm1, 0FFh // 8 alpha words |
michael@0 | 4749 | pshuflw xmm2, xmm2, 0FFh |
michael@0 | 4750 | pmulhuw xmm1, xmm2 // rgb * a |
michael@0 | 4751 | movdqa xmm2, [eax] // alphas |
michael@0 | 4752 | lea eax, [eax + 16] |
michael@0 | 4753 | psrlw xmm0, 8 |
michael@0 | 4754 | pand xmm2, xmm4 |
michael@0 | 4755 | psrlw xmm1, 8 |
michael@0 | 4756 | packuswb xmm0, xmm1 |
michael@0 | 4757 | pand xmm0, xmm5 // keep original alphas |
michael@0 | 4758 | por xmm0, xmm2 |
michael@0 | 4759 | sub ecx, 4 |
michael@0 | 4760 | movdqa [edx], xmm0 |
michael@0 | 4761 | lea edx, [edx + 16] |
michael@0 | 4762 | jg convertloop |
michael@0 | 4763 | |
michael@0 | 4764 | ret |
michael@0 | 4765 | } |
michael@0 | 4766 | } |
michael@0 | 4767 | #endif // HAS_ARGBATTENUATEROW_SSE2 |
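
// Editorial note: a minimal scalar sketch of the attenuate step above.  The
// helper name is hypothetical and not part of libyuv; the kernel's fixed-point
// multiply ((c * 257) * (a * 257) >> 24) is approximated here by c * a / 255,
// so results can differ by one in the last bit.
static void ARGBAttenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int a = src_argb[x * 4 + 3];
    dst_argb[x * 4 + 0] = (uint8)(src_argb[x * 4 + 0] * a / 255);  // B
    dst_argb[x * 4 + 1] = (uint8)(src_argb[x * 4 + 1] * a / 255);  // G
    dst_argb[x * 4 + 2] = (uint8)(src_argb[x * 4 + 2] * a / 255);  // R
    dst_argb[x * 4 + 3] = (uint8)a;  // original alpha is kept.
  }
}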
michael@0 | 4768 | |
michael@0 | 4769 | #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
michael@0 | 4770 | // Shuffle table duplicating alpha. |
michael@0 | 4771 | static const uvec8 kShuffleAlpha0 = { |
michael@0 | 4772 | 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, |
michael@0 | 4773 | }; |
michael@0 | 4774 | static const uvec8 kShuffleAlpha1 = { |
michael@0 | 4775 | 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, |
michael@0 | 4776 | 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, |
michael@0 | 4777 | }; |
michael@0 | 4778 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4779 | void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { |
michael@0 | 4780 | __asm { |
michael@0 | 4781 | mov eax, [esp + 4] // src_argb0 |
michael@0 | 4782 | mov edx, [esp + 8] // dst_argb |
michael@0 | 4783 | mov ecx, [esp + 12] // width |
michael@0 | 4784 | pcmpeqb xmm3, xmm3 // generate mask 0xff000000 |
michael@0 | 4785 | pslld xmm3, 24 |
michael@0 | 4786 | movdqa xmm4, kShuffleAlpha0 |
michael@0 | 4787 | movdqa xmm5, kShuffleAlpha1 |
michael@0 | 4788 | |
michael@0 | 4789 | align 4 |
michael@0 | 4790 | convertloop: |
michael@0 | 4791 | movdqu xmm0, [eax] // read 4 pixels |
michael@0 | 4792 | pshufb xmm0, xmm4 // isolate first 2 alphas |
michael@0 | 4793 | movdqu xmm1, [eax] // read 4 pixels |
michael@0 | 4794 | punpcklbw xmm1, xmm1 // first 2 pixel rgbs |
michael@0 | 4795 | pmulhuw xmm0, xmm1 // rgb * a |
michael@0 | 4796 | movdqu xmm1, [eax] // read 4 pixels |
michael@0 | 4797 | pshufb xmm1, xmm5 // isolate next 2 alphas |
michael@0 | 4798 | movdqu xmm2, [eax] // read 4 pixels |
michael@0 | 4799 | punpckhbw xmm2, xmm2 // next 2 pixel rgbs |
michael@0 | 4800 | pmulhuw xmm1, xmm2 // rgb * a |
michael@0 | 4801 | movdqu xmm2, [eax] // mask original alpha |
michael@0 | 4802 | lea eax, [eax + 16] |
michael@0 | 4803 | pand xmm2, xmm3 |
michael@0 | 4804 | psrlw xmm0, 8 |
michael@0 | 4805 | psrlw xmm1, 8 |
michael@0 | 4806 | packuswb xmm0, xmm1 |
michael@0 | 4807 | por xmm0, xmm2 // copy original alpha |
michael@0 | 4808 | sub ecx, 4 |
michael@0 | 4809 | movdqu [edx], xmm0 |
michael@0 | 4810 | lea edx, [edx + 16] |
michael@0 | 4811 | jg convertloop |
michael@0 | 4812 | |
michael@0 | 4813 | ret |
michael@0 | 4814 | } |
michael@0 | 4815 | } |
michael@0 | 4816 | #endif // HAS_ARGBATTENUATEROW_SSSE3 |
michael@0 | 4817 | |
michael@0 | 4818 | #ifdef HAS_ARGBATTENUATEROW_AVX2 |
michael@0 | 4819 | // Shuffle table duplicating alpha. |
michael@0 | 4820 | static const ulvec8 kShuffleAlpha_AVX2 = { |
michael@0 | 4821 | 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, |
michael@0 | 4822 | 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u, |
michael@0 | 4823 | 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, |
michael@0 | 4824 | 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u, |
michael@0 | 4825 | }; |
michael@0 | 4826 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4827 | void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { |
michael@0 | 4828 | __asm { |
michael@0 | 4829 | mov eax, [esp + 4] // src_argb0 |
michael@0 | 4830 | mov edx, [esp + 8] // dst_argb |
michael@0 | 4831 | mov ecx, [esp + 12] // width |
michael@0 | 4832 | sub edx, eax |
michael@0 | 4833 | vmovdqa ymm4, kShuffleAlpha_AVX2 |
michael@0 | 4834 | vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 |
michael@0 | 4835 | vpslld ymm5, ymm5, 24 |
michael@0 | 4836 | |
michael@0 | 4837 | align 4 |
michael@0 | 4838 | convertloop: |
michael@0 | 4839 | vmovdqu ymm6, [eax] // read 8 pixels. |
michael@0 | 4840 | vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. |
michael@0 | 4841 | vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. |
michael@0 | 4842 | vpshufb ymm2, ymm0, ymm4 // low 4 alphas |
michael@0 | 4843 | vpshufb ymm3, ymm1, ymm4 // high 4 alphas |
michael@0 | 4844 | vpmulhuw ymm0, ymm0, ymm2 // rgb * a |
michael@0 | 4845 | vpmulhuw ymm1, ymm1, ymm3 // rgb * a |
michael@0 | 4846 | vpand ymm6, ymm6, ymm5 // isolate alpha |
michael@0 | 4847 | vpsrlw ymm0, ymm0, 8 |
michael@0 | 4848 | vpsrlw ymm1, ymm1, 8 |
michael@0 | 4849 | vpackuswb ymm0, ymm0, ymm1 // unmutated. |
michael@0 | 4850 | vpor ymm0, ymm0, ymm6 // copy original alpha |
michael@0 | 4851 | sub ecx, 8 |
michael@0 | 4852 | vmovdqu [eax + edx], ymm0 |
michael@0 | 4853 | lea eax, [eax + 32] |
michael@0 | 4854 | jg convertloop |
michael@0 | 4855 | |
michael@0 | 4856 | vzeroupper |
michael@0 | 4857 | ret |
michael@0 | 4858 | } |
michael@0 | 4859 | } |
michael@0 | 4860 | #endif // HAS_ARGBATTENUATEROW_AVX2 |
michael@0 | 4861 | |
michael@0 | 4862 | #ifdef HAS_ARGBUNATTENUATEROW_SSE2 |
michael@0 | 4863 | // Unattenuate 4 pixels at a time. |
michael@0 | 4864 | // Aligned to 16 bytes. |
michael@0 | 4865 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4866 | void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
michael@0 | 4867 | int width) { |
michael@0 | 4868 | __asm { |
michael@0 | 4869 | push esi |
michael@0 | 4870 | push edi |
michael@0 | 4871 | mov eax, [esp + 8 + 4] // src_argb0 |
michael@0 | 4872 | mov edx, [esp + 8 + 8] // dst_argb |
michael@0 | 4873 | mov ecx, [esp + 8 + 12] // width |
michael@0 | 4874 | |
michael@0 | 4875 | align 4 |
michael@0 | 4876 | convertloop: |
michael@0 | 4877 | movdqu xmm0, [eax] // read 4 pixels |
michael@0 | 4878 | movzx esi, byte ptr [eax + 3] // first alpha |
michael@0 | 4879 | movzx edi, byte ptr [eax + 7] // second alpha |
michael@0 | 4880 | punpcklbw xmm0, xmm0 // first 2 |
michael@0 | 4881 | movd xmm2, dword ptr fixed_invtbl8[esi * 4] |
michael@0 | 4882 | movd xmm3, dword ptr fixed_invtbl8[edi * 4] |
michael@0 | 4883 | pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a |
michael@0 | 4884 | pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words |
michael@0 | 4885 | movlhps xmm2, xmm3 |
michael@0 | 4886 | pmulhuw xmm0, xmm2 // rgb * a |
michael@0 | 4887 | |
michael@0 | 4888 | movdqu xmm1, [eax] // read 4 pixels |
michael@0 | 4889 | movzx esi, byte ptr [eax + 11] // third alpha |
michael@0 | 4890 | movzx edi, byte ptr [eax + 15] // fourth alpha |
michael@0 | 4891 | punpckhbw xmm1, xmm1 // next 2 |
michael@0 | 4892 | movd xmm2, dword ptr fixed_invtbl8[esi * 4] |
michael@0 | 4893 | movd xmm3, dword ptr fixed_invtbl8[edi * 4] |
michael@0 | 4894 | pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words |
michael@0 | 4895 | pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words |
michael@0 | 4896 | movlhps xmm2, xmm3 |
michael@0 | 4897 | pmulhuw xmm1, xmm2 // rgb * a |
michael@0 | 4898 | lea eax, [eax + 16] |
michael@0 | 4899 | |
michael@0 | 4900 | packuswb xmm0, xmm1 |
michael@0 | 4901 | sub ecx, 4 |
michael@0 | 4902 | movdqu [edx], xmm0 |
michael@0 | 4903 | lea edx, [edx + 16] |
michael@0 | 4904 | jg convertloop |
michael@0 | 4905 | pop edi |
michael@0 | 4906 | pop esi |
michael@0 | 4907 | ret |
michael@0 | 4908 | } |
michael@0 | 4909 | } |
michael@0 | 4910 | #endif // HAS_ARGBUNATTENUATEROW_SSE2 |
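
// Editorial note: a minimal scalar sketch of the unattenuate step above.  The
// helper name is hypothetical and not part of libyuv; the kernel multiplies by
// a fixed-point reciprocal looked up in fixed_invtbl8 and saturates via
// packuswb, which this sketch approximates with a plain divide.  The alpha == 0
// pass-through below is an assumption of the sketch, not taken from the kernel.
static void ARGBUnattenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int a = src_argb[x * 4 + 3];
    int i;
    for (i = 0; i < 3; ++i) {  // B, G, R.
      int v = a ? (src_argb[x * 4 + i] * 255 / a) : src_argb[x * 4 + i];
      dst_argb[x * 4 + i] = (uint8)(v > 255 ? 255 : v);
    }
    dst_argb[x * 4 + 3] = (uint8)a;  // alpha is unchanged.
  }
}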
michael@0 | 4911 | |
michael@0 | 4912 | #ifdef HAS_ARGBUNATTENUATEROW_AVX2 |
michael@0 | 4913 | // Shuffle table duplicating alpha. |
michael@0 | 4914 | static const ulvec8 kUnattenShuffleAlpha_AVX2 = { |
michael@0 | 4915 | 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u, |
michael@0 | 4916 | 0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u, |
michael@0 | 4917 | }; |
michael@0 | 4918 | // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. |
michael@0 | 4919 | // USE_GATHER is not on by default, due to being a slow instruction. |
michael@0 | 4920 | #ifdef USE_GATHER |
michael@0 | 4921 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4922 | void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
michael@0 | 4923 | int width) { |
michael@0 | 4924 | __asm { |
michael@0 | 4925 | mov eax, [esp + 4] // src_argb0 |
michael@0 | 4926 | mov edx, [esp + 8] // dst_argb |
michael@0 | 4927 | mov ecx, [esp + 12] // width |
michael@0 | 4928 | sub edx, eax |
michael@0 | 4929 | vmovdqa ymm4, kUnattenShuffleAlpha_AVX2 |
michael@0 | 4930 | |
michael@0 | 4931 | align 4 |
michael@0 | 4932 | convertloop: |
michael@0 | 4933 | vmovdqu ymm6, [eax] // read 8 pixels. |
michael@0 | 4934 | vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather. |
michael@0 | 4935 | vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. |
michael@0 | 4936 | vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. |
michael@0 | 4937 | vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. |
michael@0 | 4938 | vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a |
michael@0 | 4939 | vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a |
michael@0 | 4940 | vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. |
michael@0 | 4941 | vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a |
michael@0 | 4942 | vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas |
michael@0 | 4943 | vpmulhuw ymm0, ymm0, ymm2 // rgb * ia |
michael@0 | 4944 | vpmulhuw ymm1, ymm1, ymm3 // rgb * ia |
michael@0 | 4945 | vpackuswb ymm0, ymm0, ymm1 // unmutated. |
michael@0 | 4946 | sub ecx, 8 |
michael@0 | 4947 | vmovdqu [eax + edx], ymm0 |
michael@0 | 4948 | lea eax, [eax + 32] |
michael@0 | 4949 | jg convertloop |
michael@0 | 4950 | |
michael@0 | 4951 | vzeroupper |
michael@0 | 4952 | ret |
michael@0 | 4953 | } |
michael@0 | 4954 | } |
michael@0 | 4955 | #else // USE_GATHER |
michael@0 | 4956 | __declspec(naked) __declspec(align(16)) |
michael@0 | 4957 | void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
michael@0 | 4958 | int width) { |
michael@0 | 4959 | __asm { |
michael@0 | 4960 | |
michael@0 | 4961 | mov eax, [esp + 4] // src_argb0 |
michael@0 | 4962 | mov edx, [esp + 8] // dst_argb |
michael@0 | 4963 | mov ecx, [esp + 12] // width |
michael@0 | 4964 | sub edx, eax |
michael@0 | 4965 | vmovdqa ymm5, kUnattenShuffleAlpha_AVX2 |
michael@0 | 4966 | |
michael@0 | 4967 | push esi |
michael@0 | 4968 | push edi |
michael@0 | 4969 | |
michael@0 | 4970 | align 4 |
michael@0 | 4971 | convertloop: |
michael@0 | 4972 | // replace VPGATHER |
michael@0 | 4973 | movzx esi, byte ptr [eax + 3] // alpha0 |
michael@0 | 4974 | movzx edi, byte ptr [eax + 7] // alpha1 |
michael@0 | 4975 | vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0] |
michael@0 | 4976 | vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1] |
michael@0 | 4977 | movzx esi, byte ptr [eax + 11] // alpha2 |
michael@0 | 4978 | movzx edi, byte ptr [eax + 15] // alpha3 |
michael@0 | 4979 | vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0] |
michael@0 | 4980 | vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2] |
michael@0 | 4981 | vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3] |
michael@0 | 4982 | movzx esi, byte ptr [eax + 19] // alpha4 |
michael@0 | 4983 | movzx edi, byte ptr [eax + 23] // alpha5 |
michael@0 | 4984 | vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2] |
michael@0 | 4985 | vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4] |
michael@0 | 4986 | vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5] |
michael@0 | 4987 | movzx esi, byte ptr [eax + 27] // alpha6 |
michael@0 | 4988 | movzx edi, byte ptr [eax + 31] // alpha7 |
michael@0 | 4989 | vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4] |
michael@0 | 4990 | vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6] |
michael@0 | 4991 | vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7] |
michael@0 | 4992 | vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6] |
michael@0 | 4993 | vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0] |
michael@0 | 4994 | vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4] |
michael@0 | 4995 | vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0] |
michael@0 | 4996 | // end of VPGATHER |
michael@0 | 4997 | |
michael@0 | 4998 | vmovdqu ymm6, [eax] // read 8 pixels. |
michael@0 | 4999 | vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. |
michael@0 | 5000 | vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. |
michael@0 | 5001 | vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a |
michael@0 | 5002 | vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated. |
michael@0 | 5003 | vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a |
michael@0 | 5004 | vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas |
michael@0 | 5005 | vpmulhuw ymm0, ymm0, ymm2 // rgb * ia |
michael@0 | 5006 | vpmulhuw ymm1, ymm1, ymm3 // rgb * ia |
michael@0 | 5007 | vpackuswb ymm0, ymm0, ymm1 // unmutated. |
michael@0 | 5008 | sub ecx, 8 |
michael@0 | 5009 | vmovdqu [eax + edx], ymm0 |
michael@0 | 5010 | lea eax, [eax + 32] |
michael@0 | 5011 | jg convertloop |
michael@0 | 5012 | |
michael@0 | 5013 | pop edi |
michael@0 | 5014 | pop esi |
michael@0 | 5015 | vzeroupper |
michael@0 | 5016 | ret |
michael@0 | 5017 | } |
michael@0 | 5018 | } |
michael@0 | 5019 | #endif // USE_GATHER |
michael@0 | 5020 | #endif // HAS_ARGBUNATTENUATEROW_AVX2 |
michael@0 | 5021 | |
michael@0 | 5022 | #ifdef HAS_ARGBGRAYROW_SSSE3 |
michael@0 | 5023 | // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels. |
michael@0 | 5024 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5025 | void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { |
michael@0 | 5026 | __asm { |
michael@0 | 5027 | mov eax, [esp + 4] /* src_argb */ |
michael@0 | 5028 | mov edx, [esp + 8] /* dst_argb */ |
michael@0 | 5029 | mov ecx, [esp + 12] /* width */ |
michael@0 | 5030 | movdqa xmm4, kARGBToYJ |
michael@0 | 5031 | movdqa xmm5, kAddYJ64 |
michael@0 | 5032 | |
michael@0 | 5033 | align 4 |
michael@0 | 5034 | convertloop: |
michael@0 | 5035 | movdqa xmm0, [eax] // G |
michael@0 | 5036 | movdqa xmm1, [eax + 16] |
michael@0 | 5037 | pmaddubsw xmm0, xmm4 |
michael@0 | 5038 | pmaddubsw xmm1, xmm4 |
michael@0 | 5039 | phaddw xmm0, xmm1 |
michael@0 | 5040 | paddw xmm0, xmm5 // Add .5 for rounding. |
michael@0 | 5041 | psrlw xmm0, 7 |
michael@0 | 5042 | packuswb xmm0, xmm0 // 8 G bytes |
michael@0 | 5043 | movdqa xmm2, [eax] // A |
michael@0 | 5044 | movdqa xmm3, [eax + 16] |
michael@0 | 5045 | lea eax, [eax + 32] |
michael@0 | 5046 | psrld xmm2, 24 |
michael@0 | 5047 | psrld xmm3, 24 |
michael@0 | 5048 | packuswb xmm2, xmm3 |
michael@0 | 5049 | packuswb xmm2, xmm2 // 8 A bytes |
michael@0 | 5050 | movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA |
michael@0 | 5051 | punpcklbw xmm0, xmm0 // 8 GG words |
michael@0 | 5052 | punpcklbw xmm3, xmm2 // 8 GA words |
michael@0 | 5053 | movdqa xmm1, xmm0 |
michael@0 | 5054 | punpcklwd xmm0, xmm3 // GGGA first 4 |
michael@0 | 5055 | punpckhwd xmm1, xmm3 // GGGA next 4 |
michael@0 | 5056 | sub ecx, 8 |
michael@0 | 5057 | movdqa [edx], xmm0 |
michael@0 | 5058 | movdqa [edx + 16], xmm1 |
michael@0 | 5059 | lea edx, [edx + 32] |
michael@0 | 5060 | jg convertloop |
michael@0 | 5061 | ret |
michael@0 | 5062 | } |
michael@0 | 5063 | } |
michael@0 | 5064 | #endif // HAS_ARGBGRAYROW_SSSE3 |
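
// Editorial note: a minimal scalar sketch of the gray conversion above.  The
// helper name and the explicit coefficient parameter are editorial, not part
// of libyuv.  The SSSE3 kernel weights B, G, R with kARGBToYJ and adds
// kAddYJ64 before the >> 7; this sketch assumes that rounding constant is 64.
// Call as e.g. ARGBGrayRow_Sketch(src, dst, width, kARGBToYJ).
static void ARGBGrayRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                               int width, const int8* coeff_bgr) {
  int x;
  for (x = 0; x < width; ++x) {
    int b = src_argb[x * 4 + 0];
    int g = src_argb[x * 4 + 1];
    int r = src_argb[x * 4 + 2];
    int y = (b * coeff_bgr[0] + g * coeff_bgr[1] + r * coeff_bgr[2] + 64) >> 7;
    dst_argb[x * 4 + 0] = (uint8)y;
    dst_argb[x * 4 + 1] = (uint8)y;
    dst_argb[x * 4 + 2] = (uint8)y;
    dst_argb[x * 4 + 3] = src_argb[x * 4 + 3];  // alpha is preserved.
  }
}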
michael@0 | 5065 | |
michael@0 | 5066 | #ifdef HAS_ARGBSEPIAROW_SSSE3 |
michael@0 | 5067 | // b = (r * 35 + g * 68 + b * 17) >> 7 |
michael@0 | 5068 | // g = (r * 45 + g * 88 + b * 22) >> 7 |
michael@0 | 5069 | // r = (r * 50 + g * 98 + b * 24) >> 7 |
michael@0 | 5070 | // Constant for ARGB color to sepia tone. |
michael@0 | 5071 | static const vec8 kARGBToSepiaB = { |
michael@0 | 5072 | 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 |
michael@0 | 5073 | }; |
michael@0 | 5074 | |
michael@0 | 5075 | static const vec8 kARGBToSepiaG = { |
michael@0 | 5076 | 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 |
michael@0 | 5077 | }; |
michael@0 | 5078 | |
michael@0 | 5079 | static const vec8 kARGBToSepiaR = { |
michael@0 | 5080 | 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 |
michael@0 | 5081 | }; |
michael@0 | 5082 | |
michael@0 | 5083 | // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. |
michael@0 | 5084 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5085 | void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { |
michael@0 | 5086 | __asm { |
michael@0 | 5087 | mov eax, [esp + 4] /* dst_argb */ |
michael@0 | 5088 | mov ecx, [esp + 8] /* width */ |
michael@0 | 5089 | movdqa xmm2, kARGBToSepiaB |
michael@0 | 5090 | movdqa xmm3, kARGBToSepiaG |
michael@0 | 5091 | movdqa xmm4, kARGBToSepiaR |
michael@0 | 5092 | |
michael@0 | 5093 | align 4 |
michael@0 | 5094 | convertloop: |
michael@0 | 5095 | movdqa xmm0, [eax] // B |
michael@0 | 5096 | movdqa xmm6, [eax + 16] |
michael@0 | 5097 | pmaddubsw xmm0, xmm2 |
michael@0 | 5098 | pmaddubsw xmm6, xmm2 |
michael@0 | 5099 | phaddw xmm0, xmm6 |
michael@0 | 5100 | psrlw xmm0, 7 |
michael@0 | 5101 | packuswb xmm0, xmm0 // 8 B values |
michael@0 | 5102 | movdqa xmm5, [eax] // G |
michael@0 | 5103 | movdqa xmm1, [eax + 16] |
michael@0 | 5104 | pmaddubsw xmm5, xmm3 |
michael@0 | 5105 | pmaddubsw xmm1, xmm3 |
michael@0 | 5106 | phaddw xmm5, xmm1 |
michael@0 | 5107 | psrlw xmm5, 7 |
michael@0 | 5108 | packuswb xmm5, xmm5 // 8 G values |
michael@0 | 5109 | punpcklbw xmm0, xmm5 // 8 BG values |
michael@0 | 5110 | movdqa xmm5, [eax] // R |
michael@0 | 5111 | movdqa xmm1, [eax + 16] |
michael@0 | 5112 | pmaddubsw xmm5, xmm4 |
michael@0 | 5113 | pmaddubsw xmm1, xmm4 |
michael@0 | 5114 | phaddw xmm5, xmm1 |
michael@0 | 5115 | psrlw xmm5, 7 |
michael@0 | 5116 | packuswb xmm5, xmm5 // 8 R values |
michael@0 | 5117 | movdqa xmm6, [eax] // A |
michael@0 | 5118 | movdqa xmm1, [eax + 16] |
michael@0 | 5119 | psrld xmm6, 24 |
michael@0 | 5120 | psrld xmm1, 24 |
michael@0 | 5121 | packuswb xmm6, xmm1 |
michael@0 | 5122 | packuswb xmm6, xmm6 // 8 A values |
michael@0 | 5123 | punpcklbw xmm5, xmm6 // 8 RA values |
michael@0 | 5124 | movdqa xmm1, xmm0 // Weave BG, RA together |
michael@0 | 5125 | punpcklwd xmm0, xmm5 // BGRA first 4 |
michael@0 | 5126 | punpckhwd xmm1, xmm5 // BGRA next 4 |
michael@0 | 5127 | sub ecx, 8 |
michael@0 | 5128 | movdqa [eax], xmm0 |
michael@0 | 5129 | movdqa [eax + 16], xmm1 |
michael@0 | 5130 | lea eax, [eax + 32] |
michael@0 | 5131 | jg convertloop |
michael@0 | 5132 | ret |
michael@0 | 5133 | } |
michael@0 | 5134 | } |
michael@0 | 5135 | #endif // HAS_ARGBSEPIAROW_SSSE3 |
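
// Editorial note: a minimal scalar sketch of the sepia transform above, using
// the b/g/r formulas quoted before kARGBToSepiaB.  The helper name is
// hypothetical and not part of libyuv; the explicit clamps stand in for the
// saturation that packuswb performs in the kernel.
static void ARGBSepiaRow_Sketch(uint8* dst_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int b = dst_argb[x * 4 + 0];
    int g = dst_argb[x * 4 + 1];
    int r = dst_argb[x * 4 + 2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[x * 4 + 0] = (uint8)(sb > 255 ? 255 : sb);
    dst_argb[x * 4 + 1] = (uint8)(sg > 255 ? 255 : sg);
    dst_argb[x * 4 + 2] = (uint8)(sr > 255 ? 255 : sr);
    // dst_argb[x * 4 + 3] (alpha) is left as-is, as in the kernel.
  }
}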
michael@0 | 5136 | |
michael@0 | 5137 | #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 |
michael@0 | 5138 | // Transform 8 ARGB pixels (32 bytes) with color matrix. |
michael@0 | 5139 | // Same as Sepia except matrix is provided. |
michael@0 | 5140 | // TODO(fbarchard): packuswb only uses half of the reg. To make RGBA, combine R |
michael@0 | 5141 | // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. |
michael@0 | 5142 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5143 | void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
michael@0 | 5144 | const int8* matrix_argb, int width) { |
michael@0 | 5145 | __asm { |
michael@0 | 5146 | mov eax, [esp + 4] /* src_argb */ |
michael@0 | 5147 | mov edx, [esp + 8] /* dst_argb */ |
michael@0 | 5148 | mov ecx, [esp + 12] /* matrix_argb */ |
michael@0 | 5149 | movdqu xmm5, [ecx] |
michael@0 | 5150 | pshufd xmm2, xmm5, 0x00 |
michael@0 | 5151 | pshufd xmm3, xmm5, 0x55 |
michael@0 | 5152 | pshufd xmm4, xmm5, 0xaa |
michael@0 | 5153 | pshufd xmm5, xmm5, 0xff |
michael@0 | 5154 | mov ecx, [esp + 16] /* width */ |
michael@0 | 5155 | |
michael@0 | 5156 | align 4 |
michael@0 | 5157 | convertloop: |
michael@0 | 5158 | movdqa xmm0, [eax] // B |
michael@0 | 5159 | movdqa xmm7, [eax + 16] |
michael@0 | 5160 | pmaddubsw xmm0, xmm2 |
michael@0 | 5161 | pmaddubsw xmm7, xmm2 |
michael@0 | 5162 | movdqa xmm6, [eax] // G |
michael@0 | 5163 | movdqa xmm1, [eax + 16] |
michael@0 | 5164 | pmaddubsw xmm6, xmm3 |
michael@0 | 5165 | pmaddubsw xmm1, xmm3 |
michael@0 | 5166 | phaddsw xmm0, xmm7 // B |
michael@0 | 5167 | phaddsw xmm6, xmm1 // G |
michael@0 | 5168 | psraw xmm0, 6 // B |
michael@0 | 5169 | psraw xmm6, 6 // G |
michael@0 | 5170 | packuswb xmm0, xmm0 // 8 B values |
michael@0 | 5171 | packuswb xmm6, xmm6 // 8 G values |
michael@0 | 5172 | punpcklbw xmm0, xmm6 // 8 BG values |
michael@0 | 5173 | movdqa xmm1, [eax] // R |
michael@0 | 5174 | movdqa xmm7, [eax + 16] |
michael@0 | 5175 | pmaddubsw xmm1, xmm4 |
michael@0 | 5176 | pmaddubsw xmm7, xmm4 |
michael@0 | 5177 | phaddsw xmm1, xmm7 // R |
michael@0 | 5178 | movdqa xmm6, [eax] // A |
michael@0 | 5179 | movdqa xmm7, [eax + 16] |
michael@0 | 5180 | pmaddubsw xmm6, xmm5 |
michael@0 | 5181 | pmaddubsw xmm7, xmm5 |
michael@0 | 5182 | phaddsw xmm6, xmm7 // A |
michael@0 | 5183 | psraw xmm1, 6 // R |
michael@0 | 5184 | psraw xmm6, 6 // A |
michael@0 | 5185 | packuswb xmm1, xmm1 // 8 R values |
michael@0 | 5186 | packuswb xmm6, xmm6 // 8 A values |
michael@0 | 5187 | punpcklbw xmm1, xmm6 // 8 RA values |
michael@0 | 5188 | movdqa xmm6, xmm0 // Weave BG, RA together |
michael@0 | 5189 | punpcklwd xmm0, xmm1 // BGRA first 4 |
michael@0 | 5190 | punpckhwd xmm6, xmm1 // BGRA next 4 |
michael@0 | 5191 | sub ecx, 8 |
michael@0 | 5192 | movdqa [edx], xmm0 |
michael@0 | 5193 | movdqa [edx + 16], xmm6 |
michael@0 | 5194 | lea eax, [eax + 32] |
michael@0 | 5195 | lea edx, [edx + 32] |
michael@0 | 5196 | jg convertloop |
michael@0 | 5197 | ret |
michael@0 | 5198 | } |
michael@0 | 5199 | } |
michael@0 | 5200 | #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 |
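
// Editorial note: a minimal scalar sketch of the color-matrix transform above.
// The helper name is hypothetical and not part of libyuv.  matrix_argb holds
// four signed rows of four coefficients (for the B, G, R, A outputs); each
// output is the dot product of the source B, G, R, A bytes with one row,
// shifted right by 6 and clamped.  The 16-bit saturation that
// pmaddubsw/phaddsw apply to intermediate sums is not modeled here.
static void ARGBColorMatrixRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      const int8* matrix_argb, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int i;
    for (i = 0; i < 4; ++i) {  // 0 = B, 1 = G, 2 = R, 3 = A output.
      const int8* m = matrix_argb + i * 4;
      int v = (src_argb[x * 4 + 0] * m[0] + src_argb[x * 4 + 1] * m[1] +
               src_argb[x * 4 + 2] * m[2] + src_argb[x * 4 + 3] * m[3]) >> 6;
      dst_argb[x * 4 + i] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}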
michael@0 | 5201 | |
michael@0 | 5202 | #ifdef HAS_ARGBQUANTIZEROW_SSE2 |
michael@0 | 5203 | // Quantize 4 ARGB pixels (16 bytes). |
michael@0 | 5204 | // Aligned to 16 bytes. |
michael@0 | 5205 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5206 | void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, |
michael@0 | 5207 | int interval_offset, int width) { |
michael@0 | 5208 | __asm { |
michael@0 | 5209 | mov eax, [esp + 4] /* dst_argb */ |
michael@0 | 5210 | movd xmm2, [esp + 8] /* scale */ |
michael@0 | 5211 | movd xmm3, [esp + 12] /* interval_size */ |
michael@0 | 5212 | movd xmm4, [esp + 16] /* interval_offset */ |
michael@0 | 5213 | mov ecx, [esp + 20] /* width */ |
michael@0 | 5214 | pshuflw xmm2, xmm2, 040h |
michael@0 | 5215 | pshufd xmm2, xmm2, 044h |
michael@0 | 5216 | pshuflw xmm3, xmm3, 040h |
michael@0 | 5217 | pshufd xmm3, xmm3, 044h |
michael@0 | 5218 | pshuflw xmm4, xmm4, 040h |
michael@0 | 5219 | pshufd xmm4, xmm4, 044h |
michael@0 | 5220 | pxor xmm5, xmm5 // constant 0 |
michael@0 | 5221 | pcmpeqb xmm6, xmm6 // generate mask 0xff000000 |
michael@0 | 5222 | pslld xmm6, 24 |
michael@0 | 5223 | |
michael@0 | 5224 | align 4 |
michael@0 | 5225 | convertloop: |
michael@0 | 5226 | movdqa xmm0, [eax] // read 4 pixels |
michael@0 | 5227 | punpcklbw xmm0, xmm5 // first 2 pixels |
michael@0 | 5228 | pmulhuw xmm0, xmm2 // pixel * scale >> 16 |
michael@0 | 5229 | movdqa xmm1, [eax] // read 4 pixels |
michael@0 | 5230 | punpckhbw xmm1, xmm5 // next 2 pixels |
michael@0 | 5231 | pmulhuw xmm1, xmm2 |
michael@0 | 5232 | pmullw xmm0, xmm3 // * interval_size |
michael@0 | 5233 | movdqa xmm7, [eax] // read 4 pixels |
michael@0 | 5234 | pmullw xmm1, xmm3 |
michael@0 | 5235 | pand xmm7, xmm6 // mask alpha |
michael@0 | 5236 | paddw xmm0, xmm4 // + interval_offset |
michael@0 | 5237 | paddw xmm1, xmm4 |
michael@0 | 5238 | packuswb xmm0, xmm1 |
michael@0 | 5239 | por xmm0, xmm7 |
michael@0 | 5240 | sub ecx, 4 |
michael@0 | 5241 | movdqa [eax], xmm0 |
michael@0 | 5242 | lea eax, [eax + 16] |
michael@0 | 5243 | jg convertloop |
michael@0 | 5244 | ret |
michael@0 | 5245 | } |
michael@0 | 5246 | } |
michael@0 | 5247 | #endif // HAS_ARGBQUANTIZEROW_SSE2 |
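// Illustrative scalar sketch (not from the original source): the per-channel
// quantize math the SSE2 loop above implements. The function name is
// illustrative; alpha is carried through unchanged, which is what the
// 0xff000000 mask in the assembly is for.
static void ARGBQuantizeRow_Sketch(unsigned char* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 3; ++c) {  // B, G, R; dst_argb[3] (A) is untouched.
      int v = (dst_argb[c] * scale >> 16) * interval_size + interval_offset;
      dst_argb[c] = (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);  // packuswb clamp
    }
    dst_argb += 4;
  }
}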
michael@0 | 5248 | |
michael@0 | 5249 | #ifdef HAS_ARGBSHADEROW_SSE2 |
michael@0 | 5250 | // Shade 4 pixels at a time by specified value. |
michael@0 | 5251 | // Aligned to 16 bytes. |
michael@0 | 5252 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5253 | void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, |
michael@0 | 5254 | uint32 value) { |
michael@0 | 5255 | __asm { |
michael@0 | 5256 | mov eax, [esp + 4] // src_argb |
michael@0 | 5257 | mov edx, [esp + 8] // dst_argb |
michael@0 | 5258 | mov ecx, [esp + 12] // width |
michael@0 | 5259 | movd xmm2, [esp + 16] // value |
michael@0 | 5260 | punpcklbw xmm2, xmm2 |
michael@0 | 5261 | punpcklqdq xmm2, xmm2 |
michael@0 | 5262 | |
michael@0 | 5263 | align 4 |
michael@0 | 5264 | convertloop: |
michael@0 | 5265 | movdqa xmm0, [eax] // read 4 pixels |
michael@0 | 5266 | lea eax, [eax + 16] |
michael@0 | 5267 | movdqa xmm1, xmm0 |
michael@0 | 5268 | punpcklbw xmm0, xmm0 // first 2 |
michael@0 | 5269 | punpckhbw xmm1, xmm1 // next 2 |
michael@0 | 5270 | pmulhuw xmm0, xmm2 // argb * value |
michael@0 | 5271 | pmulhuw xmm1, xmm2 // argb * value |
michael@0 | 5272 | psrlw xmm0, 8 |
michael@0 | 5273 | psrlw xmm1, 8 |
michael@0 | 5274 | packuswb xmm0, xmm1 |
michael@0 | 5275 | sub ecx, 4 |
michael@0 | 5276 | movdqa [edx], xmm0 |
michael@0 | 5277 | lea edx, [edx + 16] |
michael@0 | 5278 | jg convertloop |
michael@0 | 5279 | |
michael@0 | 5280 | ret |
michael@0 | 5281 | } |
michael@0 | 5282 | } |
michael@0 | 5283 | #endif // HAS_ARGBSHADEROW_SSE2 |
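// Illustrative scalar sketch (not from the original source): the shade math
// used above. Each byte is replicated into a 16-bit value (x * 0x0101),
// pmulhuw keeps the top 16 bits of the product, and the extra psrlw 8 leaves
// roughly (pixel * value) / 255 in the result byte.
static void ARGBShadeRow_Sketch(const unsigned char* src_argb,
                                unsigned char* dst_argb, int width,
                                unsigned int value) {
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      unsigned int p = src_argb[c] * 0x0101u;                  // punpcklbw xmm0, xmm0
      unsigned int v = ((value >> (c * 8)) & 0xff) * 0x0101u;  // replicated value byte
      dst_argb[c] = (unsigned char)((p * v) >> 24);            // pmulhuw + psrlw 8
    }
    src_argb += 4;
    dst_argb += 4;
  }
}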
michael@0 | 5284 | |
michael@0 | 5285 | #ifdef HAS_ARGBMULTIPLYROW_SSE2 |
michael@0 | 5286 | // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. |
michael@0 | 5287 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5288 | void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
michael@0 | 5289 | uint8* dst_argb, int width) { |
michael@0 | 5290 | __asm { |
michael@0 | 5291 | push esi |
michael@0 | 5292 | mov eax, [esp + 4 + 4] // src_argb0 |
michael@0 | 5293 | mov esi, [esp + 4 + 8] // src_argb1 |
michael@0 | 5294 | mov edx, [esp + 4 + 12] // dst_argb |
michael@0 | 5295 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 5296 | pxor xmm5, xmm5 // constant 0 |
michael@0 | 5297 | |
michael@0 | 5298 | align 4 |
michael@0 | 5299 | convertloop: |
michael@0 | 5300 | movdqu xmm0, [eax] // read 4 pixels from src_argb0 |
michael@0 | 5301 | movdqu xmm2, [esi] // read 4 pixels from src_argb1 |
michael@0 | 5302 | movdqu xmm1, xmm0 |
michael@0 | 5303 | movdqu xmm3, xmm2 |
michael@0 | 5304 | punpcklbw xmm0, xmm0 // first 2 |
michael@0 | 5305 | punpckhbw xmm1, xmm1 // next 2 |
michael@0 | 5306 | punpcklbw xmm2, xmm5 // first 2 |
michael@0 | 5307 | punpckhbw xmm3, xmm5 // next 2 |
michael@0 | 5308 | pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2 |
michael@0 | 5309 | pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2 |
michael@0 | 5310 | lea eax, [eax + 16] |
michael@0 | 5311 | lea esi, [esi + 16] |
michael@0 | 5312 | packuswb xmm0, xmm1 |
michael@0 | 5313 | sub ecx, 4 |
michael@0 | 5314 | movdqu [edx], xmm0 |
michael@0 | 5315 | lea edx, [edx + 16] |
michael@0 | 5316 | jg convertloop |
michael@0 | 5317 | |
michael@0 | 5318 | pop esi |
michael@0 | 5319 | ret |
michael@0 | 5320 | } |
michael@0 | 5321 | } |
michael@0 | 5322 | #endif // HAS_ARGBMULTIPLYROW_SSE2 |
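// Illustrative scalar sketch (not from the original source): what the pmulhuw
// trick in ARGBMultiplyRow_SSE2 computes per byte. src_argb0 is widened by
// byte replication (b * 257), src_argb1 is zero-extended, and the high 16 bits
// of the product approximate (src0 * src1) / 255.
static void ARGBMultiplyRow_Sketch(const unsigned char* src_argb0,
                                   const unsigned char* src_argb1,
                                   unsigned char* dst_argb, int width) {
  for (int i = 0; i < width * 4; ++i) {
    unsigned int a = src_argb0[i] * 0x0101u;       // punpck?bw x, x
    unsigned int b = src_argb1[i];                 // punpck?bw x, zero
    dst_argb[i] = (unsigned char)((a * b) >> 16);  // pmulhuw
  }
}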
michael@0 | 5323 | |
michael@0 | 5324 | #ifdef HAS_ARGBADDROW_SSE2 |
michael@0 | 5325 | // Add 2 rows of ARGB pixels together, 4 pixels at a time. |
michael@0 | 5326 | // TODO(fbarchard): Port this to posix, neon and other math functions. |
michael@0 | 5327 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5328 | void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
michael@0 | 5329 | uint8* dst_argb, int width) { |
michael@0 | 5330 | __asm { |
michael@0 | 5331 | push esi |
michael@0 | 5332 | mov eax, [esp + 4 + 4] // src_argb0 |
michael@0 | 5333 | mov esi, [esp + 4 + 8] // src_argb1 |
michael@0 | 5334 | mov edx, [esp + 4 + 12] // dst_argb |
michael@0 | 5335 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 5336 | |
michael@0 | 5337 | sub ecx, 4 |
michael@0 | 5338 | jl convertloop49 |
michael@0 | 5339 | |
michael@0 | 5340 | align 4 |
michael@0 | 5341 | convertloop4: |
michael@0 | 5342 | movdqu xmm0, [eax] // read 4 pixels from src_argb0 |
michael@0 | 5343 | lea eax, [eax + 16] |
michael@0 | 5344 | movdqu xmm1, [esi] // read 4 pixels from src_argb1 |
michael@0 | 5345 | lea esi, [esi + 16] |
michael@0 | 5346 | paddusb xmm0, xmm1 // src_argb0 + src_argb1 |
michael@0 | 5347 | sub ecx, 4 |
michael@0 | 5348 | movdqu [edx], xmm0 |
michael@0 | 5349 | lea edx, [edx + 16] |
michael@0 | 5350 | jge convertloop4 |
michael@0 | 5351 | |
michael@0 | 5352 | convertloop49: |
michael@0 | 5353 | add ecx, 4 - 1 |
michael@0 | 5354 | jl convertloop19 |
michael@0 | 5355 | |
michael@0 | 5356 | convertloop1: |
michael@0 | 5357 | movd xmm0, [eax] // read 1 pixel from src_argb0 |
michael@0 | 5358 | lea eax, [eax + 4] |
michael@0 | 5359 | movd xmm1, [esi] // read 1 pixel from src_argb1 |
michael@0 | 5360 | lea esi, [esi + 4] |
michael@0 | 5361 | paddusb xmm0, xmm1 // src_argb0 + src_argb1 |
michael@0 | 5362 | sub ecx, 1 |
michael@0 | 5363 | movd [edx], xmm0 |
michael@0 | 5364 | lea edx, [edx + 4] |
michael@0 | 5365 | jge convertloop1 |
michael@0 | 5366 | |
michael@0 | 5367 | convertloop19: |
michael@0 | 5368 | pop esi |
michael@0 | 5369 | ret |
michael@0 | 5370 | } |
michael@0 | 5371 | } |
michael@0 | 5372 | #endif // HAS_ARGBADDROW_SSE2 |
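// Illustrative scalar sketch (not from the original source): ARGBAddRow is a
// saturating byte add (paddusb); the subtract variant below is the mirror
// image, with psubusb clamping at 0.
static void ARGBAddRow_Sketch(const unsigned char* src_argb0,
                              const unsigned char* src_argb1,
                              unsigned char* dst_argb, int width) {
  for (int i = 0; i < width * 4; ++i) {
    int s = src_argb0[i] + src_argb1[i];
    dst_argb[i] = (unsigned char)(s > 255 ? 255 : s);  // paddusb
  }
}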
michael@0 | 5373 | |
michael@0 | 5374 | #ifdef HAS_ARGBSUBTRACTROW_SSE2 |
michael@0 | 5375 | // Subtract one row of ARGB pixels from another, 4 pixels at a time. |
michael@0 | 5376 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5377 | void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
michael@0 | 5378 | uint8* dst_argb, int width) { |
michael@0 | 5379 | __asm { |
michael@0 | 5380 | push esi |
michael@0 | 5381 | mov eax, [esp + 4 + 4] // src_argb0 |
michael@0 | 5382 | mov esi, [esp + 4 + 8] // src_argb1 |
michael@0 | 5383 | mov edx, [esp + 4 + 12] // dst_argb |
michael@0 | 5384 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 5385 | |
michael@0 | 5386 | align 4 |
michael@0 | 5387 | convertloop: |
michael@0 | 5388 | movdqu xmm0, [eax] // read 4 pixels from src_argb0 |
michael@0 | 5389 | lea eax, [eax + 16] |
michael@0 | 5390 | movdqu xmm1, [esi] // read 4 pixels from src_argb1 |
michael@0 | 5391 | lea esi, [esi + 16] |
michael@0 | 5392 | psubusb xmm0, xmm1 // src_argb0 - src_argb1 |
michael@0 | 5393 | sub ecx, 4 |
michael@0 | 5394 | movdqu [edx], xmm0 |
michael@0 | 5395 | lea edx, [edx + 16] |
michael@0 | 5396 | jg convertloop |
michael@0 | 5397 | |
michael@0 | 5398 | pop esi |
michael@0 | 5399 | ret |
michael@0 | 5400 | } |
michael@0 | 5401 | } |
michael@0 | 5402 | #endif // HAS_ARGBSUBTRACTROW_SSE2 |
michael@0 | 5403 | |
michael@0 | 5404 | #ifdef HAS_ARGBMULTIPLYROW_AVX2 |
michael@0 | 5405 | // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. |
michael@0 | 5406 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5407 | void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, |
michael@0 | 5408 | uint8* dst_argb, int width) { |
michael@0 | 5409 | __asm { |
michael@0 | 5410 | push esi |
michael@0 | 5411 | mov eax, [esp + 4 + 4] // src_argb0 |
michael@0 | 5412 | mov esi, [esp + 4 + 8] // src_argb1 |
michael@0 | 5413 | mov edx, [esp + 4 + 12] // dst_argb |
michael@0 | 5414 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 5415 | vpxor ymm5, ymm5, ymm5 // constant 0 |
michael@0 | 5416 | |
michael@0 | 5417 | align 4 |
michael@0 | 5418 | convertloop: |
michael@0 | 5419 | vmovdqu ymm1, [eax] // read 8 pixels from src_argb0 |
michael@0 | 5420 | lea eax, [eax + 32] |
michael@0 | 5421 | vmovdqu ymm3, [esi] // read 8 pixels from src_argb1 |
michael@0 | 5422 | lea esi, [esi + 32] |
michael@0 | 5423 | vpunpcklbw ymm0, ymm1, ymm1 // low 4 |
michael@0 | 5424 | vpunpckhbw ymm1, ymm1, ymm1 // high 4 |
michael@0 | 5425 | vpunpcklbw ymm2, ymm3, ymm5 // low 4 |
michael@0 | 5426 | vpunpckhbw ymm3, ymm3, ymm5 // high 4 |
michael@0 | 5427 | vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4 |
michael@0 | 5428 | vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4 |
michael@0 | 5429 | vpackuswb ymm0, ymm0, ymm1 |
michael@0 | 5430 | vmovdqu [edx], ymm0 |
michael@0 | 5431 | lea edx, [edx + 32] |
michael@0 | 5432 | sub ecx, 8 |
michael@0 | 5433 | jg convertloop |
michael@0 | 5434 | |
michael@0 | 5435 | pop esi |
michael@0 | 5436 | vzeroupper |
michael@0 | 5437 | ret |
michael@0 | 5438 | } |
michael@0 | 5439 | } |
michael@0 | 5440 | #endif // HAS_ARGBMULTIPLYROW_AVX2 |
michael@0 | 5441 | |
michael@0 | 5442 | #ifdef HAS_ARGBADDROW_AVX2 |
michael@0 | 5443 | // Add 2 rows of ARGB pixels together, 8 pixels at a time. |
michael@0 | 5444 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5445 | void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, |
michael@0 | 5446 | uint8* dst_argb, int width) { |
michael@0 | 5447 | __asm { |
michael@0 | 5448 | push esi |
michael@0 | 5449 | mov eax, [esp + 4 + 4] // src_argb0 |
michael@0 | 5450 | mov esi, [esp + 4 + 8] // src_argb1 |
michael@0 | 5451 | mov edx, [esp + 4 + 12] // dst_argb |
michael@0 | 5452 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 5453 | |
michael@0 | 5454 | align 4 |
michael@0 | 5455 | convertloop: |
michael@0 | 5456 | vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 |
michael@0 | 5457 | lea eax, [eax + 32] |
michael@0 | 5458 | vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1 |
michael@0 | 5459 | lea esi, [esi + 32] |
michael@0 | 5460 | vmovdqu [edx], ymm0 |
michael@0 | 5461 | lea edx, [edx + 32] |
michael@0 | 5462 | sub ecx, 8 |
michael@0 | 5463 | jg convertloop |
michael@0 | 5464 | |
michael@0 | 5465 | pop esi |
michael@0 | 5466 | vzeroupper |
michael@0 | 5467 | ret |
michael@0 | 5468 | } |
michael@0 | 5469 | } |
michael@0 | 5470 | #endif // HAS_ARGBADDROW_AVX2 |
michael@0 | 5471 | |
michael@0 | 5472 | #ifdef HAS_ARGBSUBTRACTROW_AVX2 |
michael@0 | 5473 | // Subtract one row of ARGB pixels from another, 8 pixels at a time. |
michael@0 | 5474 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5475 | void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, |
michael@0 | 5476 | uint8* dst_argb, int width) { |
michael@0 | 5477 | __asm { |
michael@0 | 5478 | push esi |
michael@0 | 5479 | mov eax, [esp + 4 + 4] // src_argb0 |
michael@0 | 5480 | mov esi, [esp + 4 + 8] // src_argb1 |
michael@0 | 5481 | mov edx, [esp + 4 + 12] // dst_argb |
michael@0 | 5482 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 5483 | |
michael@0 | 5484 | align 4 |
michael@0 | 5485 | convertloop: |
michael@0 | 5486 | vmovdqu ymm0, [eax] // read 8 pixels from src_argb0 |
michael@0 | 5487 | lea eax, [eax + 32] |
michael@0 | 5488 | vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1 |
michael@0 | 5489 | lea esi, [esi + 32] |
michael@0 | 5490 | vmovdqu [edx], ymm0 |
michael@0 | 5491 | lea edx, [edx + 32] |
michael@0 | 5492 | sub ecx, 8 |
michael@0 | 5493 | jg convertloop |
michael@0 | 5494 | |
michael@0 | 5495 | pop esi |
michael@0 | 5496 | vzeroupper |
michael@0 | 5497 | ret |
michael@0 | 5498 | } |
michael@0 | 5499 | } |
michael@0 | 5500 | #endif // HAS_ARGBSUBTRACTROW_AVX2 |
michael@0 | 5501 | |
michael@0 | 5502 | #ifdef HAS_SOBELXROW_SSE2 |
michael@0 | 5503 | // SobelX as a matrix is |
michael@0 | 5504 | // -1 0 1 |
michael@0 | 5505 | // -2 0 2 |
michael@0 | 5506 | // -1 0 1 |
michael@0 | 5507 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5508 | void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, |
michael@0 | 5509 | const uint8* src_y2, uint8* dst_sobelx, int width) { |
michael@0 | 5510 | __asm { |
michael@0 | 5511 | push esi |
michael@0 | 5512 | push edi |
michael@0 | 5513 | mov eax, [esp + 8 + 4] // src_y0 |
michael@0 | 5514 | mov esi, [esp + 8 + 8] // src_y1 |
michael@0 | 5515 | mov edi, [esp + 8 + 12] // src_y2 |
michael@0 | 5516 | mov edx, [esp + 8 + 16] // dst_sobelx |
michael@0 | 5517 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 5518 | sub esi, eax |
michael@0 | 5519 | sub edi, eax |
michael@0 | 5520 | sub edx, eax |
michael@0 | 5521 | pxor xmm5, xmm5 // constant 0 |
michael@0 | 5522 | |
michael@0 | 5523 | align 4 |
michael@0 | 5524 | convertloop: |
michael@0 | 5525 | movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] |
michael@0 | 5526 | movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] |
michael@0 | 5527 | punpcklbw xmm0, xmm5 |
michael@0 | 5528 | punpcklbw xmm1, xmm5 |
michael@0 | 5529 | psubw xmm0, xmm1 |
michael@0 | 5530 | movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] |
michael@0 | 5531 | movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] |
michael@0 | 5532 | punpcklbw xmm1, xmm5 |
michael@0 | 5533 | punpcklbw xmm2, xmm5 |
michael@0 | 5534 | psubw xmm1, xmm2 |
michael@0 | 5535 | movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] |
michael@0 | 5536 | movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] |
michael@0 | 5537 | punpcklbw xmm2, xmm5 |
michael@0 | 5538 | punpcklbw xmm3, xmm5 |
michael@0 | 5539 | psubw xmm2, xmm3 |
michael@0 | 5540 | paddw xmm0, xmm2 |
michael@0 | 5541 | paddw xmm0, xmm1 |
michael@0 | 5542 | paddw xmm0, xmm1 |
michael@0 | 5543 | pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw |
michael@0 | 5544 | psubw xmm1, xmm0 |
michael@0 | 5545 | pmaxsw xmm0, xmm1 |
michael@0 | 5546 | packuswb xmm0, xmm0 |
michael@0 | 5547 | sub ecx, 8 |
michael@0 | 5548 | movq qword ptr [eax + edx], xmm0 |
michael@0 | 5549 | lea eax, [eax + 8] |
michael@0 | 5550 | jg convertloop |
michael@0 | 5551 | |
michael@0 | 5552 | pop edi |
michael@0 | 5553 | pop esi |
michael@0 | 5554 | ret |
michael@0 | 5555 | } |
michael@0 | 5556 | } |
michael@0 | 5557 | #endif // HAS_SOBELXROW_SSE2 |
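// Illustrative scalar sketch (not from the original source): the SobelX kernel
// above in scalar form. The three row differences are summed with the middle
// row doubled, then the absolute value is clamped to 255 (packuswb).
static void SobelXRow_Sketch(const unsigned char* src_y0,
                             const unsigned char* src_y1,
                             const unsigned char* src_y2,
                             unsigned char* dst_sobelx, int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    int sobel = a + c + 2 * b;            // -1 0 1 / -2 0 2 / -1 0 1
    if (sobel < 0) sobel = -sobel;        // abs = max(x, -x)
    dst_sobelx[i] = (unsigned char)(sobel > 255 ? 255 : sobel);
  }
}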
michael@0 | 5558 | |
michael@0 | 5559 | #ifdef HAS_SOBELYROW_SSE2 |
michael@0 | 5560 | // SobelY as a matrix is |
michael@0 | 5561 | // -1 -2 -1 |
michael@0 | 5562 | // 0 0 0 |
michael@0 | 5563 | // 1 2 1 |
michael@0 | 5564 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5565 | void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, |
michael@0 | 5566 | uint8* dst_sobely, int width) { |
michael@0 | 5567 | __asm { |
michael@0 | 5568 | push esi |
michael@0 | 5569 | mov eax, [esp + 4 + 4] // src_y0 |
michael@0 | 5570 | mov esi, [esp + 4 + 8] // src_y1 |
michael@0 | 5571 | mov edx, [esp + 4 + 12] // dst_sobely |
michael@0 | 5572 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 5573 | sub esi, eax |
michael@0 | 5574 | sub edx, eax |
michael@0 | 5575 | pxor xmm5, xmm5 // constant 0 |
michael@0 | 5576 | |
michael@0 | 5577 | align 4 |
michael@0 | 5578 | convertloop: |
michael@0 | 5579 | movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] |
michael@0 | 5580 | movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] |
michael@0 | 5581 | punpcklbw xmm0, xmm5 |
michael@0 | 5582 | punpcklbw xmm1, xmm5 |
michael@0 | 5583 | psubw xmm0, xmm1 |
michael@0 | 5584 | movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] |
michael@0 | 5585 | movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] |
michael@0 | 5586 | punpcklbw xmm1, xmm5 |
michael@0 | 5587 | punpcklbw xmm2, xmm5 |
michael@0 | 5588 | psubw xmm1, xmm2 |
michael@0 | 5589 | movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] |
michael@0 | 5590 | movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] |
michael@0 | 5591 | punpcklbw xmm2, xmm5 |
michael@0 | 5592 | punpcklbw xmm3, xmm5 |
michael@0 | 5593 | psubw xmm2, xmm3 |
michael@0 | 5594 | paddw xmm0, xmm2 |
michael@0 | 5595 | paddw xmm0, xmm1 |
michael@0 | 5596 | paddw xmm0, xmm1 |
michael@0 | 5597 | pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw |
michael@0 | 5598 | psubw xmm1, xmm0 |
michael@0 | 5599 | pmaxsw xmm0, xmm1 |
michael@0 | 5600 | packuswb xmm0, xmm0 |
michael@0 | 5601 | sub ecx, 8 |
michael@0 | 5602 | movq qword ptr [eax + edx], xmm0 |
michael@0 | 5603 | lea eax, [eax + 8] |
michael@0 | 5604 | jg convertloop |
michael@0 | 5605 | |
michael@0 | 5606 | pop esi |
michael@0 | 5607 | ret |
michael@0 | 5608 | } |
michael@0 | 5609 | } |
michael@0 | 5610 | #endif // HAS_SOBELYROW_SSE2 |
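// Illustrative scalar sketch (not from the original source): SobelY in scalar
// form. The same accumulation as SobelX, but differencing the two rows at
// column offsets 0, 1, 2 with the middle column doubled.
static void SobelYRow_Sketch(const unsigned char* src_y0,
                             const unsigned char* src_y1,
                             unsigned char* dst_sobely, int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i]     - src_y1[i];
    int b = src_y0[i + 1] - src_y1[i + 1];
    int c = src_y0[i + 2] - src_y1[i + 2];
    int sobel = a + c + 2 * b;            // -1 -2 -1 / 0 0 0 / 1 2 1
    if (sobel < 0) sobel = -sobel;
    dst_sobely[i] = (unsigned char)(sobel > 255 ? 255 : sobel);
  }
}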
michael@0 | 5611 | |
michael@0 | 5612 | #ifdef HAS_SOBELROW_SSE2 |
michael@0 | 5613 | // Adds Sobel X and Sobel Y and stores Sobel into ARGB. |
michael@0 | 5614 | // A = 255 |
michael@0 | 5615 | // R = Sobel |
michael@0 | 5616 | // G = Sobel |
michael@0 | 5617 | // B = Sobel |
michael@0 | 5618 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5619 | void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
michael@0 | 5620 | uint8* dst_argb, int width) { |
michael@0 | 5621 | __asm { |
michael@0 | 5622 | push esi |
michael@0 | 5623 | mov eax, [esp + 4 + 4] // src_sobelx |
michael@0 | 5624 | mov esi, [esp + 4 + 8] // src_sobely |
michael@0 | 5625 | mov edx, [esp + 4 + 12] // dst_argb |
michael@0 | 5626 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 5627 | sub esi, eax |
michael@0 | 5628 | pcmpeqb xmm5, xmm5 // alpha 255 |
michael@0 | 5629 | pslld xmm5, 24 // 0xff000000 |
michael@0 | 5630 | |
michael@0 | 5631 | align 4 |
michael@0 | 5632 | convertloop: |
michael@0 | 5633 | movdqa xmm0, [eax] // read 16 pixels src_sobelx |
michael@0 | 5634 | movdqa xmm1, [eax + esi] // read 16 pixels src_sobely |
michael@0 | 5635 | lea eax, [eax + 16] |
michael@0 | 5636 | paddusb xmm0, xmm1 // sobel = sobelx + sobely |
michael@0 | 5637 | movdqa xmm2, xmm0 // GG |
michael@0 | 5638 | punpcklbw xmm2, xmm0 // First 8 |
michael@0 | 5639 | punpckhbw xmm0, xmm0 // Next 8 |
michael@0 | 5640 | movdqa xmm1, xmm2 // GGGG |
michael@0 | 5641 | punpcklwd xmm1, xmm2 // First 4 |
michael@0 | 5642 | punpckhwd xmm2, xmm2 // Next 4 |
michael@0 | 5643 | por xmm1, xmm5 // GGGA |
michael@0 | 5644 | por xmm2, xmm5 |
michael@0 | 5645 | movdqa xmm3, xmm0 // GGGG |
michael@0 | 5646 | punpcklwd xmm3, xmm0 // Next 4 |
michael@0 | 5647 | punpckhwd xmm0, xmm0 // Last 4 |
michael@0 | 5648 | por xmm3, xmm5 // GGGA |
michael@0 | 5649 | por xmm0, xmm5 |
michael@0 | 5650 | sub ecx, 16 |
michael@0 | 5651 | movdqa [edx], xmm1 |
michael@0 | 5652 | movdqa [edx + 16], xmm2 |
michael@0 | 5653 | movdqa [edx + 32], xmm3 |
michael@0 | 5654 | movdqa [edx + 48], xmm0 |
michael@0 | 5655 | lea edx, [edx + 64] |
michael@0 | 5656 | jg convertloop |
michael@0 | 5657 | |
michael@0 | 5658 | pop esi |
michael@0 | 5659 | ret |
michael@0 | 5660 | } |
michael@0 | 5661 | } |
michael@0 | 5662 | #endif // HAS_SOBELROW_SSE2 |
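// Illustrative scalar sketch (not from the original source): the pixel format
// produced by SobelRow_SSE2. The saturated sum of the two gradients is
// replicated into B, G and R, with A forced to 255.
static void SobelRow_Sketch(const unsigned char* src_sobelx,
                            const unsigned char* src_sobely,
                            unsigned char* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    unsigned char g = (unsigned char)(s > 255 ? 255 : s);  // paddusb
    dst_argb[0] = g;    // B
    dst_argb[1] = g;    // G
    dst_argb[2] = g;    // R
    dst_argb[3] = 255;  // A
    dst_argb += 4;
  }
}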
michael@0 | 5663 | |
michael@0 | 5664 | #ifdef HAS_SOBELTOPLANEROW_SSE2 |
michael@0 | 5665 | // Adds Sobel X and Sobel Y and stores Sobel into a plane. |
michael@0 | 5666 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5667 | void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
michael@0 | 5668 | uint8* dst_y, int width) { |
michael@0 | 5669 | __asm { |
michael@0 | 5670 | push esi |
michael@0 | 5671 | mov eax, [esp + 4 + 4] // src_sobelx |
michael@0 | 5672 | mov esi, [esp + 4 + 8] // src_sobely |
michael@0 | 5673 | mov edx, [esp + 4 + 12] // dst_y |
michael@0 | 5674 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 5675 | sub esi, eax |
michael@0 | 5676 | |
michael@0 | 5677 | align 4 |
michael@0 | 5678 | convertloop: |
michael@0 | 5679 | movdqa xmm0, [eax] // read 16 pixels src_sobelx |
michael@0 | 5680 | movdqa xmm1, [eax + esi] // read 16 pixels src_sobely |
michael@0 | 5681 | lea eax, [eax + 16] |
michael@0 | 5682 | paddusb xmm0, xmm1 // sobel = sobelx + sobely |
michael@0 | 5683 | sub ecx, 16 |
michael@0 | 5684 | movdqa [edx], xmm0 |
michael@0 | 5685 | lea edx, [edx + 16] |
michael@0 | 5686 | jg convertloop |
michael@0 | 5687 | |
michael@0 | 5688 | pop esi |
michael@0 | 5689 | ret |
michael@0 | 5690 | } |
michael@0 | 5691 | } |
michael@0 | 5692 | #endif // HAS_SOBELTOPLANEROW_SSE2 |
michael@0 | 5693 | |
michael@0 | 5694 | #ifdef HAS_SOBELXYROW_SSE2 |
michael@0 | 5695 | // Mixes Sobel X, Sobel Y and Sobel into ARGB. |
michael@0 | 5696 | // A = 255 |
michael@0 | 5697 | // R = Sobel X |
michael@0 | 5698 | // G = Sobel |
michael@0 | 5699 | // B = Sobel Y |
michael@0 | 5700 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5701 | void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
michael@0 | 5702 | uint8* dst_argb, int width) { |
michael@0 | 5703 | __asm { |
michael@0 | 5704 | push esi |
michael@0 | 5705 | mov eax, [esp + 4 + 4] // src_sobelx |
michael@0 | 5706 | mov esi, [esp + 4 + 8] // src_sobely |
michael@0 | 5707 | mov edx, [esp + 4 + 12] // dst_argb |
michael@0 | 5708 | mov ecx, [esp + 4 + 16] // width |
michael@0 | 5709 | sub esi, eax |
michael@0 | 5710 | pcmpeqb xmm5, xmm5 // alpha 255 |
michael@0 | 5711 | |
michael@0 | 5712 | align 4 |
michael@0 | 5713 | convertloop: |
michael@0 | 5714 | movdqa xmm0, [eax] // read 16 pixels src_sobelx |
michael@0 | 5715 | movdqa xmm1, [eax + esi] // read 16 pixels src_sobely |
michael@0 | 5716 | lea eax, [eax + 16] |
michael@0 | 5717 | movdqa xmm2, xmm0 |
michael@0 | 5718 | paddusb xmm2, xmm1 // sobel = sobelx + sobely |
michael@0 | 5719 | movdqa xmm3, xmm0 // XA |
michael@0 | 5720 | punpcklbw xmm3, xmm5 |
michael@0 | 5721 | punpckhbw xmm0, xmm5 |
michael@0 | 5722 | movdqa xmm4, xmm1 // YS |
michael@0 | 5723 | punpcklbw xmm4, xmm2 |
michael@0 | 5724 | punpckhbw xmm1, xmm2 |
michael@0 | 5725 | movdqa xmm6, xmm4 // YSXA |
michael@0 | 5726 | punpcklwd xmm6, xmm3 // First 4 |
michael@0 | 5727 | punpckhwd xmm4, xmm3 // Next 4 |
michael@0 | 5728 | movdqa xmm7, xmm1 // YSXA |
michael@0 | 5729 | punpcklwd xmm7, xmm0 // Next 4 |
michael@0 | 5730 | punpckhwd xmm1, xmm0 // Last 4 |
michael@0 | 5731 | sub ecx, 16 |
michael@0 | 5732 | movdqa [edx], xmm6 |
michael@0 | 5733 | movdqa [edx + 16], xmm4 |
michael@0 | 5734 | movdqa [edx + 32], xmm7 |
michael@0 | 5735 | movdqa [edx + 48], xmm1 |
michael@0 | 5736 | lea edx, [edx + 64] |
michael@0 | 5737 | jg convertloop |
michael@0 | 5738 | |
michael@0 | 5739 | pop esi |
michael@0 | 5740 | ret |
michael@0 | 5741 | } |
michael@0 | 5742 | } |
michael@0 | 5743 | #endif // HAS_SOBELXYROW_SSE2 |
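// Illustrative scalar sketch (not from the original source): the channel
// layout built by SobelXYRow_SSE2 -- Sobel Y in B, the saturated sum in G,
// Sobel X in R, alpha forced to 255.
static void SobelXYRow_Sketch(const unsigned char* src_sobelx,
                              const unsigned char* src_sobely,
                              unsigned char* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    dst_argb[0] = src_sobely[i];                         // B = Sobel Y
    dst_argb[1] = (unsigned char)(s > 255 ? 255 : s);    // G = Sobel
    dst_argb[2] = src_sobelx[i];                         // R = Sobel X
    dst_argb[3] = 255;                                   // A
    dst_argb += 4;
  }
}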
michael@0 | 5744 | |
michael@0 | 5745 | #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
michael@0 | 5746 | // Consider float CumulativeSum. |
michael@0 | 5747 | // Consider calling CumulativeSum one row at a time as needed. |
michael@0 | 5748 | // Consider circular CumulativeSum buffer of radius * 2 + 1 height. |
michael@0 | 5749 | // Convert cumulative sum for an area to an average for 1 pixel. |
michael@0 | 5750 | // topleft is pointer to top left of CumulativeSum buffer for area. |
michael@0 | 5751 | // botleft is pointer to bottom left of CumulativeSum buffer. |
michael@0 | 5752 | // width is offset from left to right of area in CumulativeSum buffer measured |
michael@0 | 5753 | // in number of ints. |
michael@0 | 5754 | // area is the number of pixels in the area being averaged. |
michael@0 | 5755 | // dst points to pixel to store result to. |
michael@0 | 5756 | // count is number of averaged pixels to produce. |
michael@0 | 5757 | // Does 4 pixels at a time and requires the CumulativeSum pointers to be |
michael@0 | 5758 | // 16-byte aligned. |
michael@0 | 5759 | void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, |
michael@0 | 5760 | int width, int area, uint8* dst, |
michael@0 | 5761 | int count) { |
michael@0 | 5762 | __asm { |
michael@0 | 5763 | mov eax, topleft // eax topleft |
michael@0 | 5764 | mov esi, botleft // esi botleft |
michael@0 | 5765 | mov edx, width |
michael@0 | 5766 | movd xmm5, area |
michael@0 | 5767 | mov edi, dst |
michael@0 | 5768 | mov ecx, count |
michael@0 | 5769 | cvtdq2ps xmm5, xmm5 |
michael@0 | 5770 | rcpss xmm4, xmm5 // 1.0f / area |
michael@0 | 5771 | pshufd xmm4, xmm4, 0 |
michael@0 | 5772 | sub ecx, 4 |
michael@0 | 5773 | jl l4b |
michael@0 | 5774 | |
michael@0 | 5775 | cmp area, 128 // 128 pixels will not overflow 15 bits. |
michael@0 | 5776 | ja l4 |
michael@0 | 5777 | |
michael@0 | 5778 | pshufd xmm5, xmm5, 0 // area |
michael@0 | 5779 | pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0 |
michael@0 | 5780 | psrld xmm6, 16 |
michael@0 | 5781 | cvtdq2ps xmm6, xmm6 |
michael@0 | 5782 | addps xmm5, xmm6 // (65536.0 + area - 1) |
michael@0 | 5783 | mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area |
michael@0 | 5784 | cvtps2dq xmm5, xmm5 // 0.16 fixed point |
michael@0 | 5785 | packssdw xmm5, xmm5 // 16 bit shorts |
michael@0 | 5786 | |
michael@0 | 5787 | // 4 pixel loop small blocks. |
michael@0 | 5788 | align 4 |
michael@0 | 5789 | s4: |
michael@0 | 5790 | // top left |
michael@0 | 5791 | movdqa xmm0, [eax] |
michael@0 | 5792 | movdqa xmm1, [eax + 16] |
michael@0 | 5793 | movdqa xmm2, [eax + 32] |
michael@0 | 5794 | movdqa xmm3, [eax + 48] |
michael@0 | 5795 | |
michael@0 | 5796 | // - top right |
michael@0 | 5797 | psubd xmm0, [eax + edx * 4] |
michael@0 | 5798 | psubd xmm1, [eax + edx * 4 + 16] |
michael@0 | 5799 | psubd xmm2, [eax + edx * 4 + 32] |
michael@0 | 5800 | psubd xmm3, [eax + edx * 4 + 48] |
michael@0 | 5801 | lea eax, [eax + 64] |
michael@0 | 5802 | |
michael@0 | 5803 | // - bottom left |
michael@0 | 5804 | psubd xmm0, [esi] |
michael@0 | 5805 | psubd xmm1, [esi + 16] |
michael@0 | 5806 | psubd xmm2, [esi + 32] |
michael@0 | 5807 | psubd xmm3, [esi + 48] |
michael@0 | 5808 | |
michael@0 | 5809 | // + bottom right |
michael@0 | 5810 | paddd xmm0, [esi + edx * 4] |
michael@0 | 5811 | paddd xmm1, [esi + edx * 4 + 16] |
michael@0 | 5812 | paddd xmm2, [esi + edx * 4 + 32] |
michael@0 | 5813 | paddd xmm3, [esi + edx * 4 + 48] |
michael@0 | 5814 | lea esi, [esi + 64] |
michael@0 | 5815 | |
michael@0 | 5816 | packssdw xmm0, xmm1 // pack 4 pixels into 2 registers |
michael@0 | 5817 | packssdw xmm2, xmm3 |
michael@0 | 5818 | |
michael@0 | 5819 | pmulhuw xmm0, xmm5 |
michael@0 | 5820 | pmulhuw xmm2, xmm5 |
michael@0 | 5821 | |
michael@0 | 5822 | packuswb xmm0, xmm2 |
michael@0 | 5823 | movdqu [edi], xmm0 |
michael@0 | 5824 | lea edi, [edi + 16] |
michael@0 | 5825 | sub ecx, 4 |
michael@0 | 5826 | jge s4 |
michael@0 | 5827 | |
michael@0 | 5828 | jmp l4b |
michael@0 | 5829 | |
michael@0 | 5830 | // 4 pixel loop |
michael@0 | 5831 | align 4 |
michael@0 | 5832 | l4: |
michael@0 | 5833 | // top left |
michael@0 | 5834 | movdqa xmm0, [eax] |
michael@0 | 5835 | movdqa xmm1, [eax + 16] |
michael@0 | 5836 | movdqa xmm2, [eax + 32] |
michael@0 | 5837 | movdqa xmm3, [eax + 48] |
michael@0 | 5838 | |
michael@0 | 5839 | // - top right |
michael@0 | 5840 | psubd xmm0, [eax + edx * 4] |
michael@0 | 5841 | psubd xmm1, [eax + edx * 4 + 16] |
michael@0 | 5842 | psubd xmm2, [eax + edx * 4 + 32] |
michael@0 | 5843 | psubd xmm3, [eax + edx * 4 + 48] |
michael@0 | 5844 | lea eax, [eax + 64] |
michael@0 | 5845 | |
michael@0 | 5846 | // - bottom left |
michael@0 | 5847 | psubd xmm0, [esi] |
michael@0 | 5848 | psubd xmm1, [esi + 16] |
michael@0 | 5849 | psubd xmm2, [esi + 32] |
michael@0 | 5850 | psubd xmm3, [esi + 48] |
michael@0 | 5851 | |
michael@0 | 5852 | // + bottom right |
michael@0 | 5853 | paddd xmm0, [esi + edx * 4] |
michael@0 | 5854 | paddd xmm1, [esi + edx * 4 + 16] |
michael@0 | 5855 | paddd xmm2, [esi + edx * 4 + 32] |
michael@0 | 5856 | paddd xmm3, [esi + edx * 4 + 48] |
michael@0 | 5857 | lea esi, [esi + 64] |
michael@0 | 5858 | |
michael@0 | 5859 | cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area |
michael@0 | 5860 | cvtdq2ps xmm1, xmm1 |
michael@0 | 5861 | mulps xmm0, xmm4 |
michael@0 | 5862 | mulps xmm1, xmm4 |
michael@0 | 5863 | cvtdq2ps xmm2, xmm2 |
michael@0 | 5864 | cvtdq2ps xmm3, xmm3 |
michael@0 | 5865 | mulps xmm2, xmm4 |
michael@0 | 5866 | mulps xmm3, xmm4 |
michael@0 | 5867 | cvtps2dq xmm0, xmm0 |
michael@0 | 5868 | cvtps2dq xmm1, xmm1 |
michael@0 | 5869 | cvtps2dq xmm2, xmm2 |
michael@0 | 5870 | cvtps2dq xmm3, xmm3 |
michael@0 | 5871 | packssdw xmm0, xmm1 |
michael@0 | 5872 | packssdw xmm2, xmm3 |
michael@0 | 5873 | packuswb xmm0, xmm2 |
michael@0 | 5874 | movdqu [edi], xmm0 |
michael@0 | 5875 | lea edi, [edi + 16] |
michael@0 | 5876 | sub ecx, 4 |
michael@0 | 5877 | jge l4 |
michael@0 | 5878 | |
michael@0 | 5879 | l4b: |
michael@0 | 5880 | add ecx, 4 - 1 |
michael@0 | 5881 | jl l1b |
michael@0 | 5882 | |
michael@0 | 5883 | // 1 pixel loop |
michael@0 | 5884 | align 4 |
michael@0 | 5885 | l1: |
michael@0 | 5886 | movdqa xmm0, [eax] |
michael@0 | 5887 | psubd xmm0, [eax + edx * 4] |
michael@0 | 5888 | lea eax, [eax + 16] |
michael@0 | 5889 | psubd xmm0, [esi] |
michael@0 | 5890 | paddd xmm0, [esi + edx * 4] |
michael@0 | 5891 | lea esi, [esi + 16] |
michael@0 | 5892 | cvtdq2ps xmm0, xmm0 |
michael@0 | 5893 | mulps xmm0, xmm4 |
michael@0 | 5894 | cvtps2dq xmm0, xmm0 |
michael@0 | 5895 | packssdw xmm0, xmm0 |
michael@0 | 5896 | packuswb xmm0, xmm0 |
michael@0 | 5897 | movd dword ptr [edi], xmm0 |
michael@0 | 5898 | lea edi, [edi + 4] |
michael@0 | 5899 | sub ecx, 1 |
michael@0 | 5900 | jge l1 |
michael@0 | 5901 | l1b: |
michael@0 | 5902 | } |
michael@0 | 5903 | } |
michael@0 | 5904 | #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
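// Illustrative scalar sketch (not from the original source): the float path of
// the loop above (the large-area 'l4' case). Each output component is a
// classic integral-image box sum, TL - TR - BL + BR, scaled by 1 / area; the
// SSE2 code uses rcpss and cvtps2dq, so its rounding differs slightly.
static void CumulativeSumToAverageRow_Sketch(const int* topleft,
                                             const int* botleft, int width,
                                             int area, unsigned char* dst,
                                             int count) {
  const float inv_area = 1.0f / (float)area;
  for (int x = 0; x < count; ++x) {
    for (int c = 0; c < 4; ++c) {
      int sum = topleft[c] - topleft[width + c] - botleft[c] +
                botleft[width + c];
      float avg = (float)sum * inv_area;
      dst[c] = (unsigned char)(avg < 0.f ? 0.f : avg > 255.f ? 255.f : avg);
    }
    topleft += 4;  // 4 ints (one ARGB pixel) per step.
    botleft += 4;
    dst += 4;
  }
}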
michael@0 | 5905 | |
michael@0 | 5906 | #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 |
michael@0 | 5907 | // Creates a table of cumulative sums where each value is a sum of all values |
michael@0 | 5908 | // above and to the left of the value. |
michael@0 | 5909 | void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, |
michael@0 | 5910 | const int32* previous_cumsum, int width) { |
michael@0 | 5911 | __asm { |
michael@0 | 5912 | mov eax, row |
michael@0 | 5913 | mov edx, cumsum |
michael@0 | 5914 | mov esi, previous_cumsum |
michael@0 | 5915 | mov ecx, width |
michael@0 | 5916 | pxor xmm0, xmm0 |
michael@0 | 5917 | pxor xmm1, xmm1 |
michael@0 | 5918 | |
michael@0 | 5919 | sub ecx, 4 |
michael@0 | 5920 | jl l4b |
michael@0 | 5921 | test edx, 15 |
michael@0 | 5922 | jne l4b |
michael@0 | 5923 | |
michael@0 | 5924 | // 4 pixel loop |
michael@0 | 5925 | align 4 |
michael@0 | 5926 | l4: |
michael@0 | 5927 | movdqu xmm2, [eax] // 4 argb pixels 16 bytes. |
michael@0 | 5928 | lea eax, [eax + 16] |
michael@0 | 5929 | movdqa xmm4, xmm2 |
michael@0 | 5930 | |
michael@0 | 5931 | punpcklbw xmm2, xmm1 |
michael@0 | 5932 | movdqa xmm3, xmm2 |
michael@0 | 5933 | punpcklwd xmm2, xmm1 |
michael@0 | 5934 | punpckhwd xmm3, xmm1 |
michael@0 | 5935 | |
michael@0 | 5936 | punpckhbw xmm4, xmm1 |
michael@0 | 5937 | movdqa xmm5, xmm4 |
michael@0 | 5938 | punpcklwd xmm4, xmm1 |
michael@0 | 5939 | punpckhwd xmm5, xmm1 |
michael@0 | 5940 | |
michael@0 | 5941 | paddd xmm0, xmm2 |
michael@0 | 5942 | movdqa xmm2, [esi] // previous row above. |
michael@0 | 5943 | paddd xmm2, xmm0 |
michael@0 | 5944 | |
michael@0 | 5945 | paddd xmm0, xmm3 |
michael@0 | 5946 | movdqa xmm3, [esi + 16] |
michael@0 | 5947 | paddd xmm3, xmm0 |
michael@0 | 5948 | |
michael@0 | 5949 | paddd xmm0, xmm4 |
michael@0 | 5950 | movdqa xmm4, [esi + 32] |
michael@0 | 5951 | paddd xmm4, xmm0 |
michael@0 | 5952 | |
michael@0 | 5953 | paddd xmm0, xmm5 |
michael@0 | 5954 | movdqa xmm5, [esi + 48] |
michael@0 | 5955 | lea esi, [esi + 64] |
michael@0 | 5956 | paddd xmm5, xmm0 |
michael@0 | 5957 | |
michael@0 | 5958 | movdqa [edx], xmm2 |
michael@0 | 5959 | movdqa [edx + 16], xmm3 |
michael@0 | 5960 | movdqa [edx + 32], xmm4 |
michael@0 | 5961 | movdqa [edx + 48], xmm5 |
michael@0 | 5962 | |
michael@0 | 5963 | lea edx, [edx + 64] |
michael@0 | 5964 | sub ecx, 4 |
michael@0 | 5965 | jge l4 |
michael@0 | 5966 | |
michael@0 | 5967 | l4b: |
michael@0 | 5968 | add ecx, 4 - 1 |
michael@0 | 5969 | jl l1b |
michael@0 | 5970 | |
michael@0 | 5971 | // 1 pixel loop |
michael@0 | 5972 | align 4 |
michael@0 | 5973 | l1: |
michael@0 | 5974 | movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. |
michael@0 | 5975 | lea eax, [eax + 4] |
michael@0 | 5976 | punpcklbw xmm2, xmm1 |
michael@0 | 5977 | punpcklwd xmm2, xmm1 |
michael@0 | 5978 | paddd xmm0, xmm2 |
michael@0 | 5979 | movdqu xmm2, [esi] |
michael@0 | 5980 | lea esi, [esi + 16] |
michael@0 | 5981 | paddd xmm2, xmm0 |
michael@0 | 5982 | movdqu [edx], xmm2 |
michael@0 | 5983 | lea edx, [edx + 16] |
michael@0 | 5984 | sub ecx, 1 |
michael@0 | 5985 | jge l1 |
michael@0 | 5986 | |
michael@0 | 5987 | l1b: |
michael@0 | 5988 | } |
michael@0 | 5989 | } |
michael@0 | 5990 | #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 |
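// Illustrative scalar sketch (not from the original source): what the unrolled
// SSE2 loop above computes. A running sum of the current row is kept per
// component and added to the same position of the previous cumulative row.
static void ComputeCumulativeSumRow_Sketch(const unsigned char* row,
                                           int* cumsum,
                                           const int* previous_cumsum,
                                           int width) {
  int row_sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      row_sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = row_sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}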
michael@0 | 5991 | |
michael@0 | 5992 | #ifdef HAS_ARGBAFFINEROW_SSE2 |
michael@0 | 5993 | // Copy ARGB pixels from source image with slope to a row of destination. |
michael@0 | 5994 | __declspec(naked) __declspec(align(16)) |
michael@0 | 5995 | LIBYUV_API |
michael@0 | 5996 | void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, |
michael@0 | 5997 | uint8* dst_argb, const float* uv_dudv, int width) { |
michael@0 | 5998 | __asm { |
michael@0 | 5999 | push esi |
michael@0 | 6000 | push edi |
michael@0 | 6001 | mov eax, [esp + 12] // src_argb |
michael@0 | 6002 | mov esi, [esp + 16] // stride |
michael@0 | 6003 | mov edx, [esp + 20] // dst_argb |
michael@0 | 6004 | mov ecx, [esp + 24] // pointer to uv_dudv |
michael@0 | 6005 | movq xmm2, qword ptr [ecx] // uv |
michael@0 | 6006 | movq xmm7, qword ptr [ecx + 8] // dudv |
michael@0 | 6007 | mov ecx, [esp + 28] // width |
michael@0 | 6008 | shl esi, 16 // stride in the high 16 bits |
michael@0 | 6009 | add esi, 4 // 4 (ARGB pixel size) in the low 16 bits |
michael@0 | 6010 | movd xmm5, esi // (4, stride) pair for pmaddwd |
michael@0 | 6011 | sub ecx, 4 |
michael@0 | 6012 | jl l4b |
michael@0 | 6013 | |
michael@0 | 6014 | // setup for 4 pixel loop |
michael@0 | 6015 | pshufd xmm7, xmm7, 0x44 // dup dudv |
michael@0 | 6016 | pshufd xmm5, xmm5, 0 // dup 4, stride |
michael@0 | 6017 | movdqa xmm0, xmm2 // x0, y0, x1, y1 |
michael@0 | 6018 | addps xmm0, xmm7 |
michael@0 | 6019 | movlhps xmm2, xmm0 |
michael@0 | 6020 | movdqa xmm4, xmm7 |
michael@0 | 6021 | addps xmm4, xmm4 // dudv *= 2 |
michael@0 | 6022 | movdqa xmm3, xmm2 // x2, y2, x3, y3 |
michael@0 | 6023 | addps xmm3, xmm4 |
michael@0 | 6024 | addps xmm4, xmm4 // dudv *= 4 |
michael@0 | 6025 | |
michael@0 | 6026 | // 4 pixel loop |
michael@0 | 6027 | align 4 |
michael@0 | 6028 | l4: |
michael@0 | 6029 | cvttps2dq xmm0, xmm2 // x, y float to int first 2 |
michael@0 | 6030 | cvttps2dq xmm1, xmm3 // x, y float to int next 2 |
michael@0 | 6031 | packssdw xmm0, xmm1 // x, y as 8 shorts |
michael@0 | 6032 | pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. |
michael@0 | 6033 | movd esi, xmm0 |
michael@0 | 6034 | pshufd xmm0, xmm0, 0x39 // shift right |
michael@0 | 6035 | movd edi, xmm0 |
michael@0 | 6036 | pshufd xmm0, xmm0, 0x39 // shift right |
michael@0 | 6037 | movd xmm1, [eax + esi] // read pixel 0 |
michael@0 | 6038 | movd xmm6, [eax + edi] // read pixel 1 |
michael@0 | 6039 | punpckldq xmm1, xmm6 // combine pixel 0 and 1 |
michael@0 | 6040 | addps xmm2, xmm4 // x, y += dx, dy first 2 |
michael@0 | 6041 | movq qword ptr [edx], xmm1 |
michael@0 | 6042 | movd esi, xmm0 |
michael@0 | 6043 | pshufd xmm0, xmm0, 0x39 // shift right |
michael@0 | 6044 | movd edi, xmm0 |
michael@0 | 6045 | movd xmm6, [eax + esi] // read pixel 2 |
michael@0 | 6046 | movd xmm0, [eax + edi] // read pixel 3 |
michael@0 | 6047 | punpckldq xmm6, xmm0 // combine pixel 2 and 3 |
michael@0 | 6048 | addps xmm3, xmm4 // x, y += dx, dy next 2 |
michael@0 | 6049 | sub ecx, 4 |
michael@0 | 6050 | movq qword ptr [edx + 8], xmm6 |
michael@0 | 6051 | lea edx, [edx + 16] |
michael@0 | 6052 | jge l4 |
michael@0 | 6053 | |
michael@0 | 6054 | l4b: |
michael@0 | 6055 | add ecx, 4 - 1 |
michael@0 | 6056 | jl l1b |
michael@0 | 6057 | |
michael@0 | 6058 | // 1 pixel loop |
michael@0 | 6059 | align 4 |
michael@0 | 6060 | l1: |
michael@0 | 6061 | cvttps2dq xmm0, xmm2 // x, y float to int |
michael@0 | 6062 | packssdw xmm0, xmm0 // x, y as shorts |
michael@0 | 6063 | pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride |
michael@0 | 6064 | addps xmm2, xmm7 // x, y += dx, dy |
michael@0 | 6065 | movd esi, xmm0 |
michael@0 | 6066 | movd xmm0, [eax + esi] // copy a pixel |
michael@0 | 6067 | sub ecx, 1 |
michael@0 | 6068 | movd [edx], xmm0 |
michael@0 | 6069 | lea edx, [edx + 4] |
michael@0 | 6070 | jge l1 |
michael@0 | 6071 | l1b: |
michael@0 | 6072 | pop edi |
michael@0 | 6073 | pop esi |
michael@0 | 6074 | ret |
michael@0 | 6075 | } |
michael@0 | 6076 | } |
michael@0 | 6077 | #endif // HAS_ARGBAFFINEROW_SSE2 |
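// Illustrative scalar sketch (not from the original source): the sampling
// pattern of ARGBAffineRow_SSE2. (u, v) is stepped by (du, dv) per output
// pixel and the truncated integer coordinates select the source pixel; the
// assembly packs x and y into shorts and uses pmaddwd with (4, stride) to
// form the byte offset.
static void ARGBAffineRow_Sketch(const unsigned char* src_argb,
                                 int src_argb_stride, unsigned char* dst_argb,
                                 const float* uv_dudv, int width) {
  float u = uv_dudv[0], v = uv_dudv[1];
  const float du = uv_dudv[2], dv = uv_dudv[3];
  for (int x = 0; x < width; ++x) {
    const int xi = (int)u;  // cvttps2dq truncates toward zero.
    const int yi = (int)v;
    const unsigned char* p = src_argb + yi * src_argb_stride + xi * 4;
    dst_argb[0] = p[0];
    dst_argb[1] = p[1];
    dst_argb[2] = p[2];
    dst_argb[3] = p[3];
    dst_argb += 4;
    u += du;
    v += dv;
  }
}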
michael@0 | 6078 | |
michael@0 | 6079 | #ifdef HAS_INTERPOLATEROW_AVX2 |
michael@0 | 6080 | // Bilinear filter 16x2 -> 16x1 |
michael@0 | 6081 | __declspec(naked) __declspec(align(16)) |
michael@0 | 6082 | void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, |
michael@0 | 6083 | ptrdiff_t src_stride, int dst_width, |
michael@0 | 6084 | int source_y_fraction) { |
michael@0 | 6085 | __asm { |
michael@0 | 6086 | push esi |
michael@0 | 6087 | push edi |
michael@0 | 6088 | mov edi, [esp + 8 + 4] // dst_ptr |
michael@0 | 6089 | mov esi, [esp + 8 + 8] // src_ptr |
michael@0 | 6090 | mov edx, [esp + 8 + 12] // src_stride |
michael@0 | 6091 | mov ecx, [esp + 8 + 16] // dst_width |
michael@0 | 6092 | mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
michael@0 | 6093 | shr eax, 1 |
michael@0 | 6094 | // Dispatch to specialized filters if applicable. |
michael@0 | 6095 | cmp eax, 0 |
michael@0 | 6096 | je xloop100 // 0 / 128. Blend 100 / 0. |
michael@0 | 6097 | sub edi, esi |
michael@0 | 6098 | cmp eax, 32 |
michael@0 | 6099 | je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. |
michael@0 | 6100 | cmp eax, 64 |
michael@0 | 6101 | je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. |
michael@0 | 6102 | cmp eax, 96 |
michael@0 | 6103 | je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. |
michael@0 | 6104 | |
michael@0 | 6105 | vmovd xmm0, eax // high fraction 0..127 |
michael@0 | 6106 | neg eax |
michael@0 | 6107 | add eax, 128 |
michael@0 | 6108 | vmovd xmm5, eax // low fraction 128..1 |
michael@0 | 6109 | vpunpcklbw xmm5, xmm5, xmm0 |
michael@0 | 6110 | vpunpcklwd xmm5, xmm5, xmm5 |
michael@0 | 6111 | vpxor ymm0, ymm0, ymm0 |
michael@0 | 6112 | vpermd ymm5, ymm0, ymm5 |
michael@0 | 6113 | |
michael@0 | 6114 | align 4 |
michael@0 | 6115 | xloop: |
michael@0 | 6116 | vmovdqu ymm0, [esi] |
michael@0 | 6117 | vmovdqu ymm2, [esi + edx] |
michael@0 | 6118 | vpunpckhbw ymm1, ymm0, ymm2 // mutates |
michael@0 | 6119 | vpunpcklbw ymm0, ymm0, ymm2 // mutates |
michael@0 | 6120 | vpmaddubsw ymm0, ymm0, ymm5 |
michael@0 | 6121 | vpmaddubsw ymm1, ymm1, ymm5 |
michael@0 | 6122 | vpsrlw ymm0, ymm0, 7 |
michael@0 | 6123 | vpsrlw ymm1, ymm1, 7 |
michael@0 | 6124 | vpackuswb ymm0, ymm0, ymm1 // unmutates |
michael@0 | 6125 | sub ecx, 32 |
michael@0 | 6126 | vmovdqu [esi + edi], ymm0 |
michael@0 | 6127 | lea esi, [esi + 32] |
michael@0 | 6128 | jg xloop |
michael@0 | 6129 | jmp xloop99 |
michael@0 | 6130 | |
michael@0 | 6131 | // Blend 25 / 75. |
michael@0 | 6132 | align 4 |
michael@0 | 6133 | xloop25: |
michael@0 | 6134 | vmovdqu ymm0, [esi] |
michael@0 | 6135 | vpavgb ymm0, ymm0, [esi + edx] |
michael@0 | 6136 | vpavgb ymm0, ymm0, [esi + edx] |
michael@0 | 6137 | sub ecx, 32 |
michael@0 | 6138 | vmovdqu [esi + edi], ymm0 |
michael@0 | 6139 | lea esi, [esi + 32] |
michael@0 | 6140 | jg xloop25 |
michael@0 | 6141 | jmp xloop99 |
michael@0 | 6142 | |
michael@0 | 6143 | // Blend 50 / 50. |
michael@0 | 6144 | align 4 |
michael@0 | 6145 | xloop50: |
michael@0 | 6146 | vmovdqu ymm0, [esi] |
michael@0 | 6147 | vpavgb ymm0, ymm0, [esi + edx] |
michael@0 | 6148 | sub ecx, 32 |
michael@0 | 6149 | vmovdqu [esi + edi], ymm0 |
michael@0 | 6150 | lea esi, [esi + 32] |
michael@0 | 6151 | jg xloop50 |
michael@0 | 6152 | jmp xloop99 |
michael@0 | 6153 | |
michael@0 | 6154 | // Blend 75 / 25. |
michael@0 | 6155 | align 4 |
michael@0 | 6156 | xloop75: |
michael@0 | 6157 | vmovdqu ymm0, [esi + edx] |
michael@0 | 6158 | vpavgb ymm0, ymm0, [esi] |
michael@0 | 6159 | vpavgb ymm0, ymm0, [esi] |
michael@0 | 6160 | sub ecx, 32 |
michael@0 | 6161 | vmovdqu [esi + edi], ymm0 |
michael@0 | 6162 | lea esi, [esi + 32] |
michael@0 | 6163 | jg xloop75 |
michael@0 | 6164 | jmp xloop99 |
michael@0 | 6165 | |
michael@0 | 6166 | // Blend 100 / 0 - Copy row unchanged. |
michael@0 | 6167 | align 4 |
michael@0 | 6168 | xloop100: |
michael@0 | 6169 | rep movsb |
michael@0 | 6170 | |
michael@0 | 6171 | xloop99: |
michael@0 | 6172 | pop edi |
michael@0 | 6173 | pop esi |
michael@0 | 6174 | vzeroupper |
michael@0 | 6175 | ret |
michael@0 | 6176 | } |
michael@0 | 6177 | } |
michael@0 | 6178 | #endif // HAS_INTERPOLATEROW_AVX2 |
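// Illustrative scalar sketch (not from the original source): the blend the
// AVX2 loop above (and the SSSE3 variants below) computes once the fraction
// has been halved to 0..127. The special-cased 0/25/50/75/100 paths produce
// the same blend (pavgb rounds, so they can differ from this by one).
static void InterpolateRow_Sketch(unsigned char* dst_ptr,
                                  const unsigned char* src_ptr,
                                  int src_stride, int dst_width,
                                  int source_y_fraction) {
  const int f = source_y_fraction >> 1;          // 0..127, as in "shr eax, 1"
  const unsigned char* row0 = src_ptr;
  const unsigned char* row1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    // pmaddubsw of interleaved (row0, row1) bytes with weights (128 - f, f).
    dst_ptr[x] = (unsigned char)((row0[x] * (128 - f) + row1[x] * f) >> 7);
  }
}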
michael@0 | 6179 | |
michael@0 | 6180 | #ifdef HAS_INTERPOLATEROW_SSSE3 |
michael@0 | 6181 | // Bilinear filter 16x2 -> 16x1 |
michael@0 | 6182 | __declspec(naked) __declspec(align(16)) |
michael@0 | 6183 | void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
michael@0 | 6184 | ptrdiff_t src_stride, int dst_width, |
michael@0 | 6185 | int source_y_fraction) { |
michael@0 | 6186 | __asm { |
michael@0 | 6187 | push esi |
michael@0 | 6188 | push edi |
michael@0 | 6189 | mov edi, [esp + 8 + 4] // dst_ptr |
michael@0 | 6190 | mov esi, [esp + 8 + 8] // src_ptr |
michael@0 | 6191 | mov edx, [esp + 8 + 12] // src_stride |
michael@0 | 6192 | mov ecx, [esp + 8 + 16] // dst_width |
michael@0 | 6193 | mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
michael@0 | 6194 | sub edi, esi |
michael@0 | 6195 | shr eax, 1 |
michael@0 | 6196 | // Dispatch to specialized filters if applicable. |
michael@0 | 6197 | cmp eax, 0 |
michael@0 | 6198 | je xloop100 // 0 / 128. Blend 100 / 0. |
michael@0 | 6199 | cmp eax, 32 |
michael@0 | 6200 | je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. |
michael@0 | 6201 | cmp eax, 64 |
michael@0 | 6202 | je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. |
michael@0 | 6203 | cmp eax, 96 |
michael@0 | 6204 | je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. |
michael@0 | 6205 | |
michael@0 | 6206 | movd xmm0, eax // high fraction 0..127 |
michael@0 | 6207 | neg eax |
michael@0 | 6208 | add eax, 128 |
michael@0 | 6209 | movd xmm5, eax // low fraction 128..1 |
michael@0 | 6210 | punpcklbw xmm5, xmm0 |
michael@0 | 6211 | punpcklwd xmm5, xmm5 |
michael@0 | 6212 | pshufd xmm5, xmm5, 0 |
michael@0 | 6213 | |
michael@0 | 6214 | align 4 |
michael@0 | 6215 | xloop: |
michael@0 | 6216 | movdqa xmm0, [esi] |
michael@0 | 6217 | movdqa xmm2, [esi + edx] |
michael@0 | 6218 | movdqa xmm1, xmm0 |
michael@0 | 6219 | punpcklbw xmm0, xmm2 |
michael@0 | 6220 | punpckhbw xmm1, xmm2 |
michael@0 | 6221 | pmaddubsw xmm0, xmm5 |
michael@0 | 6222 | pmaddubsw xmm1, xmm5 |
michael@0 | 6223 | psrlw xmm0, 7 |
michael@0 | 6224 | psrlw xmm1, 7 |
michael@0 | 6225 | packuswb xmm0, xmm1 |
michael@0 | 6226 | sub ecx, 16 |
michael@0 | 6227 | movdqa [esi + edi], xmm0 |
michael@0 | 6228 | lea esi, [esi + 16] |
michael@0 | 6229 | jg xloop |
michael@0 | 6230 | jmp xloop99 |
michael@0 | 6231 | |
michael@0 | 6232 | // Blend 25 / 75. |
michael@0 | 6233 | align 4 |
michael@0 | 6234 | xloop25: |
michael@0 | 6235 | movdqa xmm0, [esi] |
michael@0 | 6236 | movdqa xmm1, [esi + edx] |
michael@0 | 6237 | pavgb xmm0, xmm1 |
michael@0 | 6238 | pavgb xmm0, xmm1 |
michael@0 | 6239 | sub ecx, 16 |
michael@0 | 6240 | movdqa [esi + edi], xmm0 |
michael@0 | 6241 | lea esi, [esi + 16] |
michael@0 | 6242 | jg xloop25 |
michael@0 | 6243 | jmp xloop99 |
michael@0 | 6244 | |
michael@0 | 6245 | // Blend 50 / 50. |
michael@0 | 6246 | align 4 |
michael@0 | 6247 | xloop50: |
michael@0 | 6248 | movdqa xmm0, [esi] |
michael@0 | 6249 | movdqa xmm1, [esi + edx] |
michael@0 | 6250 | pavgb xmm0, xmm1 |
michael@0 | 6251 | sub ecx, 16 |
michael@0 | 6252 | movdqa [esi + edi], xmm0 |
michael@0 | 6253 | lea esi, [esi + 16] |
michael@0 | 6254 | jg xloop50 |
michael@0 | 6255 | jmp xloop99 |
michael@0 | 6256 | |
michael@0 | 6257 | // Blend 75 / 25. |
michael@0 | 6258 | align 4 |
michael@0 | 6259 | xloop75: |
michael@0 | 6260 | movdqa xmm1, [esi] |
michael@0 | 6261 | movdqa xmm0, [esi + edx] |
michael@0 | 6262 | pavgb xmm0, xmm1 |
michael@0 | 6263 | pavgb xmm0, xmm1 |
michael@0 | 6264 | sub ecx, 16 |
michael@0 | 6265 | movdqa [esi + edi], xmm0 |
michael@0 | 6266 | lea esi, [esi + 16] |
michael@0 | 6267 | jg xloop75 |
michael@0 | 6268 | jmp xloop99 |
michael@0 | 6269 | |
michael@0 | 6270 | // Blend 100 / 0 - Copy row unchanged. |
michael@0 | 6271 | align 4 |
michael@0 | 6272 | xloop100: |
michael@0 | 6273 | movdqa xmm0, [esi] |
michael@0 | 6274 | sub ecx, 16 |
michael@0 | 6275 | movdqa [esi + edi], xmm0 |
michael@0 | 6276 | lea esi, [esi + 16] |
michael@0 | 6277 | jg xloop100 |
michael@0 | 6278 | |
michael@0 | 6279 | xloop99: |
michael@0 | 6280 | pop edi |
michael@0 | 6281 | pop esi |
michael@0 | 6282 | ret |
michael@0 | 6283 | } |
michael@0 | 6284 | } |
michael@0 | 6285 | #endif // HAS_INTERPOLATEROW_SSSE3 |
michael@0 | 6286 | |
michael@0 | 6287 | #ifdef HAS_INTERPOLATEROW_SSE2 |
michael@0 | 6288 | // Bilinear filter 16x2 -> 16x1 |
michael@0 | 6289 | __declspec(naked) __declspec(align(16)) |
michael@0 | 6290 | void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
michael@0 | 6291 | ptrdiff_t src_stride, int dst_width, |
michael@0 | 6292 | int source_y_fraction) { |
michael@0 | 6293 | __asm { |
michael@0 | 6294 | push esi |
michael@0 | 6295 | push edi |
michael@0 | 6296 | mov edi, [esp + 8 + 4] // dst_ptr |
michael@0 | 6297 | mov esi, [esp + 8 + 8] // src_ptr |
michael@0 | 6298 | mov edx, [esp + 8 + 12] // src_stride |
michael@0 | 6299 | mov ecx, [esp + 8 + 16] // dst_width |
michael@0 | 6300 | mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
michael@0 | 6301 | sub edi, esi |
michael@0 | 6302 | // Dispatch to specialized filters if applicable. |
michael@0 | 6303 | cmp eax, 0 |
michael@0 | 6304 | je xloop100 // 0 / 256. Blend 100 / 0. |
michael@0 | 6305 | cmp eax, 64 |
michael@0 | 6306 | je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. |
michael@0 | 6307 | cmp eax, 128 |
michael@0 | 6308 | je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. |
michael@0 | 6309 | cmp eax, 192 |
michael@0 | 6310 | je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. |
michael@0 | 6311 | |
michael@0 | 6312 | movd xmm5, eax // xmm5 = y fraction |
michael@0 | 6313 | punpcklbw xmm5, xmm5 |
michael@0 | 6314 | psrlw xmm5, 1 |
michael@0 | 6315 | punpcklwd xmm5, xmm5 |
michael@0 | 6316 | punpckldq xmm5, xmm5 |
michael@0 | 6317 | punpcklqdq xmm5, xmm5 |
michael@0 | 6318 | pxor xmm4, xmm4 |
michael@0 | 6319 | |
michael@0 | 6320 | align 4 |
michael@0 | 6321 | xloop: |
michael@0 | 6322 | movdqa xmm0, [esi] // row0 |
michael@0 | 6323 | movdqa xmm2, [esi + edx] // row1 |
michael@0 | 6324 | movdqa xmm1, xmm0 |
michael@0 | 6325 | movdqa xmm3, xmm2 |
michael@0 | 6326 | punpcklbw xmm2, xmm4 |
michael@0 | 6327 | punpckhbw xmm3, xmm4 |
michael@0 | 6328 | punpcklbw xmm0, xmm4 |
michael@0 | 6329 | punpckhbw xmm1, xmm4 |
michael@0 | 6330 | psubw xmm2, xmm0 // row1 - row0 |
michael@0 | 6331 | psubw xmm3, xmm1 |
michael@0 | 6332 | paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 |
michael@0 | 6333 | paddw xmm3, xmm3 |
michael@0 | 6334 | pmulhw xmm2, xmm5 // scale diff |
michael@0 | 6335 | pmulhw xmm3, xmm5 |
michael@0 | 6336 | paddw xmm0, xmm2 // sum rows |
michael@0 | 6337 | paddw xmm1, xmm3 |
michael@0 | 6338 | packuswb xmm0, xmm1 |
michael@0 | 6339 | sub ecx, 16 |
michael@0 | 6340 | movdqa [esi + edi], xmm0 |
michael@0 | 6341 | lea esi, [esi + 16] |
michael@0 | 6342 | jg xloop |
michael@0 | 6343 | jmp xloop99 |
michael@0 | 6344 | |
michael@0 | 6345 | // Blend 25 / 75. |
michael@0 | 6346 | align 4 |
michael@0 | 6347 | xloop25: |
michael@0 | 6348 | movdqa xmm0, [esi] |
michael@0 | 6349 | movdqa xmm1, [esi + edx] |
michael@0 | 6350 | pavgb xmm0, xmm1 |
michael@0 | 6351 | pavgb xmm0, xmm1 |
michael@0 | 6352 | sub ecx, 16 |
michael@0 | 6353 | movdqa [esi + edi], xmm0 |
michael@0 | 6354 | lea esi, [esi + 16] |
michael@0 | 6355 | jg xloop25 |
michael@0 | 6356 | jmp xloop99 |
michael@0 | 6357 | |
michael@0 | 6358 | // Blend 50 / 50. |
michael@0 | 6359 | align 4 |
michael@0 | 6360 | xloop50: |
michael@0 | 6361 | movdqa xmm0, [esi] |
michael@0 | 6362 | movdqa xmm1, [esi + edx] |
michael@0 | 6363 | pavgb xmm0, xmm1 |
michael@0 | 6364 | sub ecx, 16 |
michael@0 | 6365 | movdqa [esi + edi], xmm0 |
michael@0 | 6366 | lea esi, [esi + 16] |
michael@0 | 6367 | jg xloop50 |
michael@0 | 6368 | jmp xloop99 |
michael@0 | 6369 | |
michael@0 | 6370 | // Blend 75 / 25. |
michael@0 | 6371 | align 4 |
michael@0 | 6372 | xloop75: |
michael@0 | 6373 | movdqa xmm1, [esi] |
michael@0 | 6374 | movdqa xmm0, [esi + edx] |
michael@0 | 6375 | pavgb xmm0, xmm1 |
michael@0 | 6376 | pavgb xmm0, xmm1 |
michael@0 | 6377 | sub ecx, 16 |
michael@0 | 6378 | movdqa [esi + edi], xmm0 |
michael@0 | 6379 | lea esi, [esi + 16] |
michael@0 | 6380 | jg xloop75 |
michael@0 | 6381 | jmp xloop99 |
michael@0 | 6382 | |
michael@0 | 6383 | // Blend 100 / 0 - Copy row unchanged. |
michael@0 | 6384 | align 4 |
michael@0 | 6385 | xloop100: |
michael@0 | 6386 | movdqa xmm0, [esi] |
michael@0 | 6387 | sub ecx, 16 |
michael@0 | 6388 | movdqa [esi + edi], xmm0 |
michael@0 | 6389 | lea esi, [esi + 16] |
michael@0 | 6390 | jg xloop100 |
michael@0 | 6391 | |
michael@0 | 6392 | xloop99: |
michael@0 | 6393 | pop edi |
michael@0 | 6394 | pop esi |
michael@0 | 6395 | ret |
michael@0 | 6396 | } |
michael@0 | 6397 | } |
michael@0 | 6398 | #endif // HAS_INTERPOLATEROW_SSE2 |
michael@0 | 6399 | |
michael@0 | 6400 | // Bilinear filter 16x2 -> 16x1 |
michael@0 | 6401 | __declspec(naked) __declspec(align(16)) |
michael@0 | 6402 | void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
michael@0 | 6403 | ptrdiff_t src_stride, int dst_width, |
michael@0 | 6404 | int source_y_fraction) { |
michael@0 | 6405 | __asm { |
michael@0 | 6406 | push esi |
michael@0 | 6407 | push edi |
michael@0 | 6408 | mov edi, [esp + 8 + 4] // dst_ptr |
michael@0 | 6409 | mov esi, [esp + 8 + 8] // src_ptr |
michael@0 | 6410 | mov edx, [esp + 8 + 12] // src_stride |
michael@0 | 6411 | mov ecx, [esp + 8 + 16] // dst_width |
michael@0 | 6412 | mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
michael@0 | 6413 | sub edi, esi |
michael@0 | 6414 | shr eax, 1 |
michael@0 | 6415 | // Dispatch to specialized filters if applicable. |
michael@0 | 6416 | cmp eax, 0 |
michael@0 | 6417 | je xloop100 // 0 / 128. Blend 100 / 0. |
michael@0 | 6418 | cmp eax, 32 |
michael@0 | 6419 | je xloop75 // 32 / 128 is 0.25. Blend 75 / 25. |
michael@0 | 6420 | cmp eax, 64 |
michael@0 | 6421 | je xloop50 // 64 / 128 is 0.50. Blend 50 / 50. |
michael@0 | 6422 | cmp eax, 96 |
michael@0 | 6423 | je xloop25 // 96 / 128 is 0.75. Blend 25 / 75. |
michael@0 | 6424 | |
michael@0 | 6425 | movd xmm0, eax // high fraction 0..127 |
michael@0 | 6426 | neg eax |
michael@0 | 6427 | add eax, 128 |
michael@0 | 6428 | movd xmm5, eax // low fraction 128..1 |
michael@0 | 6429 | punpcklbw xmm5, xmm0 |
michael@0 | 6430 | punpcklwd xmm5, xmm5 |
michael@0 | 6431 | pshufd xmm5, xmm5, 0 |
michael@0 | 6432 | |
michael@0 | 6433 | align 4 |
michael@0 | 6434 | xloop: |
michael@0 | 6435 | movdqu xmm0, [esi] |
michael@0 | 6436 | movdqu xmm2, [esi + edx] |
michael@0 | 6437 | movdqu xmm1, xmm0 |
michael@0 | 6438 | punpcklbw xmm0, xmm2 |
michael@0 | 6439 | punpckhbw xmm1, xmm2 |
michael@0 | 6440 | pmaddubsw xmm0, xmm5 |
michael@0 | 6441 | pmaddubsw xmm1, xmm5 |
michael@0 | 6442 | psrlw xmm0, 7 |
michael@0 | 6443 | psrlw xmm1, 7 |
michael@0 | 6444 | packuswb xmm0, xmm1 |
michael@0 | 6445 | sub ecx, 16 |
michael@0 | 6446 | movdqu [esi + edi], xmm0 |
michael@0 | 6447 | lea esi, [esi + 16] |
michael@0 | 6448 | jg xloop |
michael@0 | 6449 | jmp xloop99 |
michael@0 | 6450 | |
michael@0 | 6451 | // Blend 25 / 75. |
michael@0 | 6452 | align 4 |
michael@0 | 6453 | xloop25: |
michael@0 | 6454 | movdqu xmm0, [esi] |
michael@0 | 6455 | movdqu xmm1, [esi + edx] |
michael@0 | 6456 | pavgb xmm0, xmm1 |
michael@0 | 6457 | pavgb xmm0, xmm1 |
michael@0 | 6458 | sub ecx, 16 |
michael@0 | 6459 | movdqu [esi + edi], xmm0 |
michael@0 | 6460 | lea esi, [esi + 16] |
michael@0 | 6461 | jg xloop25 |
michael@0 | 6462 | jmp xloop99 |
michael@0 | 6463 | |
michael@0 | 6464 | // Blend 50 / 50. |
michael@0 | 6465 | align 4 |
michael@0 | 6466 | xloop50: |
michael@0 | 6467 | movdqu xmm0, [esi] |
michael@0 | 6468 | movdqu xmm1, [esi + edx] |
michael@0 | 6469 | pavgb xmm0, xmm1 |
michael@0 | 6470 | sub ecx, 16 |
michael@0 | 6471 | movdqu [esi + edi], xmm0 |
michael@0 | 6472 | lea esi, [esi + 16] |
michael@0 | 6473 | jg xloop50 |
michael@0 | 6474 | jmp xloop99 |
michael@0 | 6475 | |
michael@0 | 6476 | // Blend 75 / 25. |
michael@0 | 6477 | align 4 |
michael@0 | 6478 | xloop75: |
michael@0 | 6479 | movdqu xmm1, [esi] |
michael@0 | 6480 | movdqu xmm0, [esi + edx] |
michael@0 | 6481 | pavgb xmm0, xmm1 |
michael@0 | 6482 | pavgb xmm0, xmm1 |
michael@0 | 6483 | sub ecx, 16 |
michael@0 | 6484 | movdqu [esi + edi], xmm0 |
michael@0 | 6485 | lea esi, [esi + 16] |
michael@0 | 6486 | jg xloop75 |
michael@0 | 6487 | jmp xloop99 |
michael@0 | 6488 | |
michael@0 | 6489 | // Blend 100 / 0 - Copy row unchanged. |
michael@0 | 6490 | align 4 |
michael@0 | 6491 | xloop100: |
michael@0 | 6492 | movdqu xmm0, [esi] |
michael@0 | 6493 | sub ecx, 16 |
michael@0 | 6494 | movdqu [esi + edi], xmm0 |
michael@0 | 6495 | lea esi, [esi + 16] |
michael@0 | 6496 | jg xloop100 |
michael@0 | 6497 | |
michael@0 | 6498 | xloop99: |
michael@0 | 6499 | pop edi |
michael@0 | 6500 | pop esi |
michael@0 | 6501 | ret |
michael@0 | 6502 | } |
michael@0 | 6503 | } |
michael@0 | 6504 | |
michael@0 | 6505 | #ifdef HAS_INTERPOLATEROW_SSE2 |
michael@0 | 6506 | // Bilinear filter 16x2 -> 16x1 |
michael@0 | 6507 | __declspec(naked) __declspec(align(16)) |
michael@0 | 6508 | void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
michael@0 | 6509 | ptrdiff_t src_stride, int dst_width, |
michael@0 | 6510 | int source_y_fraction) { |
michael@0 | 6511 | __asm { |
michael@0 | 6512 | push esi |
michael@0 | 6513 | push edi |
michael@0 | 6514 | mov edi, [esp + 8 + 4] // dst_ptr |
michael@0 | 6515 | mov esi, [esp + 8 + 8] // src_ptr |
michael@0 | 6516 | mov edx, [esp + 8 + 12] // src_stride |
michael@0 | 6517 | mov ecx, [esp + 8 + 16] // dst_width |
michael@0 | 6518 | mov eax, [esp + 8 + 20] // source_y_fraction (0..255) |
michael@0 | 6519 | sub edi, esi |
michael@0 | 6520 | // Dispatch to specialized filters if applicable. |
michael@0 | 6521 | cmp eax, 0 |
michael@0 | 6522 | je xloop100 // 0 / 256. Blend 100 / 0. |
michael@0 | 6523 | cmp eax, 64 |
michael@0 | 6524 | je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. |
michael@0 | 6525 | cmp eax, 128 |
michael@0 | 6526 | je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. |
michael@0 | 6527 | cmp eax, 192 |
michael@0 | 6528 | je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. |
michael@0 | 6529 | |
michael@0 | 6530 | movd xmm5, eax // xmm5 = y fraction |
michael@0 | 6531 | punpcklbw xmm5, xmm5 |
michael@0 | 6532 | psrlw xmm5, 1 |
michael@0 | 6533 | punpcklwd xmm5, xmm5 |
michael@0 | 6534 | punpckldq xmm5, xmm5 |
michael@0 | 6535 | punpcklqdq xmm5, xmm5 |
michael@0 | 6536 | pxor xmm4, xmm4 |
michael@0 | 6537 | |
michael@0 | 6538 | align 4 |
michael@0 | 6539 | xloop: |
michael@0 | 6540 | movdqu xmm0, [esi] // row0 |
michael@0 | 6541 | movdqu xmm2, [esi + edx] // row1 |
michael@0 | 6542 | movdqu xmm1, xmm0 |
michael@0 | 6543 | movdqu xmm3, xmm2 |
michael@0 | 6544 | punpcklbw xmm2, xmm4 |
michael@0 | 6545 | punpckhbw xmm3, xmm4 |
michael@0 | 6546 | punpcklbw xmm0, xmm4 |
michael@0 | 6547 | punpckhbw xmm1, xmm4 |
michael@0 | 6548 | psubw xmm2, xmm0 // row1 - row0 |
michael@0 | 6549 | psubw xmm3, xmm1 |
michael@0 | 6550 | paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 |
michael@0 | 6551 | paddw xmm3, xmm3 |
michael@0 | 6552 | pmulhw xmm2, xmm5 // scale diff |
michael@0 | 6553 | pmulhw xmm3, xmm5 |
michael@0 | 6554 | paddw xmm0, xmm2 // sum rows |
michael@0 | 6555 | paddw xmm1, xmm3 |
michael@0 | 6556 | packuswb xmm0, xmm1 |
michael@0 | 6557 | sub ecx, 16 |
michael@0 | 6558 | movdqu [esi + edi], xmm0 |
michael@0 | 6559 | lea esi, [esi + 16] |
michael@0 | 6560 | jg xloop |
michael@0 | 6561 | jmp xloop99 |
michael@0 | 6562 | |
michael@0 | 6563 | // Blend 25 / 75. |
michael@0 | 6564 | align 4 |
michael@0 | 6565 | xloop25: |
michael@0 | 6566 | movdqu xmm0, [esi] |
michael@0 | 6567 | movdqu xmm1, [esi + edx] |
michael@0 | 6568 | pavgb xmm0, xmm1 |
michael@0 | 6569 | pavgb xmm0, xmm1 |
michael@0 | 6570 | sub ecx, 16 |
michael@0 | 6571 | movdqu [esi + edi], xmm0 |
michael@0 | 6572 | lea esi, [esi + 16] |
michael@0 | 6573 | jg xloop25 |
michael@0 | 6574 | jmp xloop99 |
michael@0 | 6575 | |
michael@0 | 6576 | // Blend 50 / 50. |
michael@0 | 6577 | align 4 |
michael@0 | 6578 | xloop50: |
michael@0 | 6579 | movdqu xmm0, [esi] |
michael@0 | 6580 | movdqu xmm1, [esi + edx] |
michael@0 | 6581 | pavgb xmm0, xmm1 |
michael@0 | 6582 | sub ecx, 16 |
michael@0 | 6583 | movdqu [esi + edi], xmm0 |
michael@0 | 6584 | lea esi, [esi + 16] |
michael@0 | 6585 | jg xloop50 |
michael@0 | 6586 | jmp xloop99 |
michael@0 | 6587 | |
michael@0 | 6588 | // Blend 75 / 25. |
michael@0 | 6589 | align 4 |
michael@0 | 6590 | xloop75: |
michael@0 | 6591 | movdqu xmm1, [esi] |
michael@0 | 6592 | movdqu xmm0, [esi + edx] |
michael@0 | 6593 | pavgb xmm0, xmm1 |
michael@0 | 6594 | pavgb xmm0, xmm1 |
michael@0 | 6595 | sub ecx, 16 |
michael@0 | 6596 | movdqu [esi + edi], xmm0 |
michael@0 | 6597 | lea esi, [esi + 16] |
michael@0 | 6598 | jg xloop75 |
michael@0 | 6599 | jmp xloop99 |
michael@0 | 6600 | |
michael@0 | 6601 | // Blend 100 / 0 - Copy row unchanged. |
michael@0 | 6602 | align 4 |
michael@0 | 6603 | xloop100: |
michael@0 | 6604 | movdqu xmm0, [esi] |
michael@0 | 6605 | sub ecx, 16 |
michael@0 | 6606 | movdqu [esi + edi], xmm0 |
michael@0 | 6607 | lea esi, [esi + 16] |
michael@0 | 6608 | jg xloop100 |
michael@0 | 6609 | |
michael@0 | 6610 | xloop99: |
michael@0 | 6611 | pop edi |
michael@0 | 6612 | pop esi |
michael@0 | 6613 | ret |
michael@0 | 6614 | } |
michael@0 | 6615 | } |
michael@0 | 6616 | #endif // HAS_INTERPOLATEROW_SSE2 |
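
// A minimal scalar sketch (not part of libyuv; names and rounding are
// illustrative only) of the blend the InterpolateRow variants above
// vectorize: each output byte is approximately
//   row0 + ((row1 - row0) * source_y_fraction) / 256
// with source_y_fraction in [0..255].  The assembly special-cases the
// 0%, 25%, 50% and 75% fractions with pavgb instead of this general path.
static void InterpolateRowScalarSketch(uint8* dst_ptr, const uint8* src_ptr,
                                       ptrdiff_t src_stride, int dst_width,
                                       int source_y_fraction) {
  const uint8* src1 = src_ptr + src_stride;  // second source row
  for (int x = 0; x < dst_width; ++x) {
    int diff = (src1[x] - src_ptr[x]) * source_y_fraction;
    dst_ptr[x] = (uint8)(src_ptr[x] + (diff >> 8));  // truncating scale
  }
}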
michael@0 | 6617 | |
michael@0 | 6618 | __declspec(naked) __declspec(align(16)) |
michael@0 | 6619 | void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, |
michael@0 | 6620 | uint8* dst_uv, int pix) { |
michael@0 | 6621 | __asm { |
michael@0 | 6622 | push edi |
michael@0 | 6623 | mov eax, [esp + 4 + 4] // src_uv |
michael@0 | 6624 | mov edx, [esp + 4 + 8] // src_uv_stride |
michael@0 | 6625 | mov edi, [esp + 4 + 12] // dst_uv |
michael@0 | 6626 | mov ecx, [esp + 4 + 16] // pix |
michael@0 | 6627 | sub edi, eax |
michael@0 | 6628 | |
michael@0 | 6629 | align 4 |
michael@0 | 6630 | convertloop: |
michael@0 | 6631 | movdqa xmm0, [eax] |
michael@0 | 6632 | pavgb xmm0, [eax + edx] |
michael@0 | 6633 | sub ecx, 16 |
michael@0 | 6634 | movdqa [eax + edi], xmm0 |
michael@0 | 6635 | lea eax, [eax + 16] |
michael@0 | 6636 | jg convertloop |
michael@0 | 6637 | pop edi |
michael@0 | 6638 | ret |
michael@0 | 6639 | } |
michael@0 | 6640 | } |
michael@0 | 6641 | |
michael@0 | 6642 | #ifdef HAS_HALFROW_AVX2 |
michael@0 | 6643 | __declspec(naked) __declspec(align(16)) |
michael@0 | 6644 | void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride, |
michael@0 | 6645 | uint8* dst_uv, int pix) { |
michael@0 | 6646 | __asm { |
michael@0 | 6647 | push edi |
michael@0 | 6648 | mov eax, [esp + 4 + 4] // src_uv |
michael@0 | 6649 | mov edx, [esp + 4 + 8] // src_uv_stride |
michael@0 | 6650 | mov edi, [esp + 4 + 12] // dst_uv |
michael@0 | 6651 | mov ecx, [esp + 4 + 16] // pix |
michael@0 | 6652 | sub edi, eax |
michael@0 | 6653 | |
michael@0 | 6654 | align 4 |
michael@0 | 6655 | convertloop: |
michael@0 | 6656 | vmovdqu ymm0, [eax] |
michael@0 | 6657 | vpavgb ymm0, ymm0, [eax + edx] |
michael@0 | 6658 | sub ecx, 32 |
michael@0 | 6659 | vmovdqu [eax + edi], ymm0 |
michael@0 | 6660 | lea eax, [eax + 32] |
michael@0 | 6661 | jg convertloop |
michael@0 | 6662 | |
michael@0 | 6663 | pop edi |
michael@0 | 6664 | vzeroupper |
michael@0 | 6665 | ret |
michael@0 | 6666 | } |
michael@0 | 6667 | } |
michael@0 | 6668 | #endif // HAS_HALFROW_AVX2 |
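
// A minimal scalar sketch (not part of libyuv; names are illustrative only)
// of what the HalfRow variants above compute: a rounding average of two rows,
// matching pavgb/vpavgb semantics ((a + b + 1) >> 1).
static void HalfRowScalarSketch(const uint8* src_uv, int src_uv_stride,
                                uint8* dst_uv, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_uv[x] = (uint8)((src_uv[x] + src_uv[x + src_uv_stride] + 1) >> 1);
  }
}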
michael@0 | 6669 | |
michael@0 | 6670 | __declspec(naked) __declspec(align(16)) |
michael@0 | 6671 | void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, |
michael@0 | 6672 | uint32 selector, int pix) { |
michael@0 | 6673 | __asm { |
michael@0 | 6674 | mov eax, [esp + 4] // src_argb |
michael@0 | 6675 | mov edx, [esp + 8] // dst_bayer |
michael@0 | 6676 | movd xmm5, [esp + 12] // selector |
michael@0 | 6677 | mov ecx, [esp + 16] // pix |
michael@0 | 6678 | pshufd xmm5, xmm5, 0 |
michael@0 | 6679 | |
michael@0 | 6680 | align 4 |
michael@0 | 6681 | wloop: |
michael@0 | 6682 | movdqa xmm0, [eax] |
michael@0 | 6683 | movdqa xmm1, [eax + 16] |
michael@0 | 6684 | lea eax, [eax + 32] |
michael@0 | 6685 | pshufb xmm0, xmm5 |
michael@0 | 6686 | pshufb xmm1, xmm5 |
michael@0 | 6687 | punpckldq xmm0, xmm1 |
michael@0 | 6688 | sub ecx, 8 |
michael@0 | 6689 | movq qword ptr [edx], xmm0 |
michael@0 | 6690 | lea edx, [edx + 8] |
michael@0 | 6691 | jg wloop |
michael@0 | 6692 | ret |
michael@0 | 6693 | } |
michael@0 | 6694 | } |
michael@0 | 6695 | |
michael@0 | 6696 | // Specialized ARGB to Bayer that just isolates G channel. |
michael@0 | 6697 | __declspec(naked) __declspec(align(16)) |
michael@0 | 6698 | void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, |
michael@0 | 6699 | uint32 selector, int pix) { |
michael@0 | 6700 | __asm { |
michael@0 | 6701 | mov eax, [esp + 4] // src_argb |
michael@0 | 6702 | mov edx, [esp + 8] // dst_bayer |
michael@0 | 6703 | // selector |
michael@0 | 6704 | mov ecx, [esp + 16] // pix |
michael@0 | 6705 | pcmpeqb xmm5, xmm5 // generate mask 0x000000ff |
michael@0 | 6706 | psrld xmm5, 24 |
michael@0 | 6707 | |
michael@0 | 6708 | align 4 |
michael@0 | 6709 | wloop: |
michael@0 | 6710 | movdqa xmm0, [eax] |
michael@0 | 6711 | movdqa xmm1, [eax + 16] |
michael@0 | 6712 | lea eax, [eax + 32] |
michael@0 | 6713 | psrld xmm0, 8 // Move green to bottom. |
michael@0 | 6714 | psrld xmm1, 8 |
michael@0 | 6715 | pand xmm0, xmm5 |
michael@0 | 6716 | pand xmm1, xmm5 |
michael@0 | 6717 | packssdw xmm0, xmm1 |
michael@0 | 6718 | packuswb xmm0, xmm1 |
michael@0 | 6719 | sub ecx, 8 |
michael@0 | 6720 | movq qword ptr [edx], xmm0 |
michael@0 | 6721 | lea edx, [edx + 8] |
michael@0 | 6722 | jg wloop |
michael@0 | 6723 | ret |
michael@0 | 6724 | } |
michael@0 | 6725 | } |
michael@0 | 6726 | |
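// A minimal scalar sketch (not part of libyuv; names are illustrative only)
// of the selector-driven Bayer extraction above, based on my reading of the
// pshufb mask: each byte of 'selector' is an index into a block of four ARGB
// pixels (16 bytes), picking one channel per pixel.
static void ARGBToBayerRowScalarSketch(const uint8* src_argb, uint8* dst_bayer,
                                       uint32 selector, int pix) {
  for (int x = 0; x < pix; x += 4) {
    dst_bayer[x + 0] = src_argb[(selector >> 0) & 0xff];
    dst_bayer[x + 1] = src_argb[(selector >> 8) & 0xff];
    dst_bayer[x + 2] = src_argb[(selector >> 16) & 0xff];
    dst_bayer[x + 3] = src_argb[(selector >> 24) & 0xff];
    src_argb += 16;  // advance four ARGB pixels
  }
}
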
michael@0 | 6727 | // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
michael@0 | 6728 | __declspec(naked) __declspec(align(16)) |
michael@0 | 6729 | void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
michael@0 | 6730 | const uint8* shuffler, int pix) { |
michael@0 | 6731 | __asm { |
michael@0 | 6732 | mov eax, [esp + 4] // src_argb |
michael@0 | 6733 | mov edx, [esp + 8] // dst_argb |
michael@0 | 6734 | mov ecx, [esp + 12] // shuffler |
michael@0 | 6735 | movdqa xmm5, [ecx] |
michael@0 | 6736 | mov ecx, [esp + 16] // pix |
michael@0 | 6737 | |
michael@0 | 6738 | align 4 |
michael@0 | 6739 | wloop: |
michael@0 | 6740 | movdqa xmm0, [eax] |
michael@0 | 6741 | movdqa xmm1, [eax + 16] |
michael@0 | 6742 | lea eax, [eax + 32] |
michael@0 | 6743 | pshufb xmm0, xmm5 |
michael@0 | 6744 | pshufb xmm1, xmm5 |
michael@0 | 6745 | sub ecx, 8 |
michael@0 | 6746 | movdqa [edx], xmm0 |
michael@0 | 6747 | movdqa [edx + 16], xmm1 |
michael@0 | 6748 | lea edx, [edx + 32] |
michael@0 | 6749 | jg wloop |
michael@0 | 6750 | ret |
michael@0 | 6751 | } |
michael@0 | 6752 | } |
michael@0 | 6753 | |
michael@0 | 6754 | __declspec(naked) __declspec(align(16)) |
michael@0 | 6755 | void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, |
michael@0 | 6756 | const uint8* shuffler, int pix) { |
michael@0 | 6757 | __asm { |
michael@0 | 6758 | mov eax, [esp + 4] // src_argb |
michael@0 | 6759 | mov edx, [esp + 8] // dst_argb |
michael@0 | 6760 | mov ecx, [esp + 12] // shuffler |
michael@0 | 6761 | movdqa xmm5, [ecx] |
michael@0 | 6762 | mov ecx, [esp + 16] // pix |
michael@0 | 6763 | |
michael@0 | 6764 | align 4 |
michael@0 | 6765 | wloop: |
michael@0 | 6766 | movdqu xmm0, [eax] |
michael@0 | 6767 | movdqu xmm1, [eax + 16] |
michael@0 | 6768 | lea eax, [eax + 32] |
michael@0 | 6769 | pshufb xmm0, xmm5 |
michael@0 | 6770 | pshufb xmm1, xmm5 |
michael@0 | 6771 | sub ecx, 8 |
michael@0 | 6772 | movdqu [edx], xmm0 |
michael@0 | 6773 | movdqu [edx + 16], xmm1 |
michael@0 | 6774 | lea edx, [edx + 32] |
michael@0 | 6775 | jg wloop |
michael@0 | 6776 | ret |
michael@0 | 6777 | } |
michael@0 | 6778 | } |
michael@0 | 6779 | |
michael@0 | 6780 | #ifdef HAS_ARGBSHUFFLEROW_AVX2 |
michael@0 | 6781 | __declspec(naked) __declspec(align(16)) |
michael@0 | 6782 | void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
michael@0 | 6783 | const uint8* shuffler, int pix) { |
michael@0 | 6784 | __asm { |
michael@0 | 6785 | mov eax, [esp + 4] // src_argb |
michael@0 | 6786 | mov edx, [esp + 8] // dst_argb |
michael@0 | 6787 | mov ecx, [esp + 12] // shuffler |
michael@0 | 6788 | vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. |
michael@0 | 6789 | mov ecx, [esp + 16] // pix |
michael@0 | 6790 | |
michael@0 | 6791 | align 4 |
michael@0 | 6792 | wloop: |
michael@0 | 6793 | vmovdqu ymm0, [eax] |
michael@0 | 6794 | vmovdqu ymm1, [eax + 32] |
michael@0 | 6795 | lea eax, [eax + 64] |
michael@0 | 6796 | vpshufb ymm0, ymm0, ymm5 |
michael@0 | 6797 | vpshufb ymm1, ymm1, ymm5 |
michael@0 | 6798 | sub ecx, 16 |
michael@0 | 6799 | vmovdqu [edx], ymm0 |
michael@0 | 6800 | vmovdqu [edx + 32], ymm1 |
michael@0 | 6801 | lea edx, [edx + 64] |
michael@0 | 6802 | jg wloop |
michael@0 | 6803 | |
michael@0 | 6804 | vzeroupper |
michael@0 | 6805 | ret |
michael@0 | 6806 | } |
michael@0 | 6807 | } |
michael@0 | 6808 | #endif // HAS_ARGBSHUFFLEROW_AVX2 |
michael@0 | 6809 | |
michael@0 | 6810 | __declspec(naked) __declspec(align(16)) |
michael@0 | 6811 | void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
michael@0 | 6812 | const uint8* shuffler, int pix) { |
michael@0 | 6813 | __asm { |
michael@0 | 6814 | push ebx |
michael@0 | 6815 | push esi |
michael@0 | 6816 | mov eax, [esp + 8 + 4] // src_argb |
michael@0 | 6817 | mov edx, [esp + 8 + 8] // dst_argb |
michael@0 | 6818 | mov esi, [esp + 8 + 12] // shuffler |
michael@0 | 6819 | mov ecx, [esp + 8 + 16] // pix |
michael@0 | 6820 | pxor xmm5, xmm5 |
michael@0 | 6821 | |
michael@0 | 6822 | mov ebx, [esi] // shuffler |
michael@0 | 6823 | cmp ebx, 0x03000102 |
michael@0 | 6824 | je shuf_3012 |
michael@0 | 6825 | cmp ebx, 0x00010203 |
michael@0 | 6826 | je shuf_0123 |
michael@0 | 6827 | cmp ebx, 0x00030201 |
michael@0 | 6828 | je shuf_0321 |
michael@0 | 6829 | cmp ebx, 0x02010003 |
michael@0 | 6830 | je shuf_2103 |
michael@0 | 6831 | |
michael@0 | 6832 | // TODO(fbarchard): Use one source pointer and 3 offsets. |
michael@0 | 6833 | shuf_any1: |
michael@0 | 6834 | movzx ebx, byte ptr [esi] |
michael@0 | 6835 | movzx ebx, byte ptr [eax + ebx] |
michael@0 | 6836 | mov [edx], bl |
michael@0 | 6837 | movzx ebx, byte ptr [esi + 1] |
michael@0 | 6838 | movzx ebx, byte ptr [eax + ebx] |
michael@0 | 6839 | mov [edx + 1], bl |
michael@0 | 6840 | movzx ebx, byte ptr [esi + 2] |
michael@0 | 6841 | movzx ebx, byte ptr [eax + ebx] |
michael@0 | 6842 | mov [edx + 2], bl |
michael@0 | 6843 | movzx ebx, byte ptr [esi + 3] |
michael@0 | 6844 | movzx ebx, byte ptr [eax + ebx] |
michael@0 | 6845 | mov [edx + 3], bl |
michael@0 | 6846 | lea eax, [eax + 4] |
michael@0 | 6847 | lea edx, [edx + 4] |
michael@0 | 6848 | sub ecx, 1 |
michael@0 | 6849 | jg shuf_any1 |
michael@0 | 6850 | jmp shuf99 |
michael@0 | 6851 | |
michael@0 | 6852 | align 4 |
michael@0 | 6853 | shuf_0123: |
michael@0 | 6854 | movdqu xmm0, [eax] |
michael@0 | 6855 | lea eax, [eax + 16] |
michael@0 | 6856 | movdqa xmm1, xmm0 |
michael@0 | 6857 | punpcklbw xmm0, xmm5 |
michael@0 | 6858 | punpckhbw xmm1, xmm5 |
michael@0 | 6859 | pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB |
michael@0 | 6860 | pshuflw xmm0, xmm0, 01Bh |
michael@0 | 6861 | pshufhw xmm1, xmm1, 01Bh |
michael@0 | 6862 | pshuflw xmm1, xmm1, 01Bh |
michael@0 | 6863 | packuswb xmm0, xmm1 |
michael@0 | 6864 | sub ecx, 4 |
michael@0 | 6865 | movdqu [edx], xmm0 |
michael@0 | 6866 | lea edx, [edx + 16] |
michael@0 | 6867 | jg shuf_0123 |
michael@0 | 6868 | jmp shuf99 |
michael@0 | 6869 | |
michael@0 | 6870 | align 4 |
michael@0 | 6871 | shuf_0321: |
michael@0 | 6872 | movdqu xmm0, [eax] |
michael@0 | 6873 | lea eax, [eax + 16] |
michael@0 | 6874 | movdqa xmm1, xmm0 |
michael@0 | 6875 | punpcklbw xmm0, xmm5 |
michael@0 | 6876 | punpckhbw xmm1, xmm5 |
michael@0 | 6877 | pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB |
michael@0 | 6878 | pshuflw xmm0, xmm0, 039h |
michael@0 | 6879 | pshufhw xmm1, xmm1, 039h |
michael@0 | 6880 | pshuflw xmm1, xmm1, 039h |
michael@0 | 6881 | packuswb xmm0, xmm1 |
michael@0 | 6882 | sub ecx, 4 |
michael@0 | 6883 | movdqu [edx], xmm0 |
michael@0 | 6884 | lea edx, [edx + 16] |
michael@0 | 6885 | jg shuf_0321 |
michael@0 | 6886 | jmp shuf99 |
michael@0 | 6887 | |
michael@0 | 6888 | align 4 |
michael@0 | 6889 | shuf_2103: |
michael@0 | 6890 | movdqu xmm0, [eax] |
michael@0 | 6891 | lea eax, [eax + 16] |
michael@0 | 6892 | movdqa xmm1, xmm0 |
michael@0 | 6893 | punpcklbw xmm0, xmm5 |
michael@0 | 6894 | punpckhbw xmm1, xmm5 |
michael@0 | 6895 | pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA |
michael@0 | 6896 | pshuflw xmm0, xmm0, 093h |
michael@0 | 6897 | pshufhw xmm1, xmm1, 093h |
michael@0 | 6898 | pshuflw xmm1, xmm1, 093h |
michael@0 | 6899 | packuswb xmm0, xmm1 |
michael@0 | 6900 | sub ecx, 4 |
michael@0 | 6901 | movdqu [edx], xmm0 |
michael@0 | 6902 | lea edx, [edx + 16] |
michael@0 | 6903 | jg shuf_2103 |
michael@0 | 6904 | jmp shuf99 |
michael@0 | 6905 | |
michael@0 | 6906 | align 4 |
michael@0 | 6907 | shuf_3012: |
michael@0 | 6908 | movdqu xmm0, [eax] |
michael@0 | 6909 | lea eax, [eax + 16] |
michael@0 | 6910 | movdqa xmm1, xmm0 |
michael@0 | 6911 | punpcklbw xmm0, xmm5 |
michael@0 | 6912 | punpckhbw xmm1, xmm5 |
michael@0 | 6913 | pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB |
michael@0 | 6914 | pshuflw xmm0, xmm0, 0C6h |
michael@0 | 6915 | pshufhw xmm1, xmm1, 0C6h |
michael@0 | 6916 | pshuflw xmm1, xmm1, 0C6h |
michael@0 | 6917 | packuswb xmm0, xmm1 |
michael@0 | 6918 | sub ecx, 4 |
michael@0 | 6919 | movdqu [edx], xmm0 |
michael@0 | 6920 | lea edx, [edx + 16] |
michael@0 | 6921 | jg shuf_3012 |
michael@0 | 6922 | |
michael@0 | 6923 | shuf99: |
michael@0 | 6924 | pop esi |
michael@0 | 6925 | pop ebx |
michael@0 | 6926 | ret |
michael@0 | 6927 | } |
michael@0 | 6928 | } |
michael@0 | 6929 | |
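// A minimal scalar sketch (not part of libyuv; names are illustrative only)
// of the shuffler semantics shared by the ARGBShuffleRow variants above:
// byte j of each output pixel is byte shuffler[j] of the matching input
// pixel, which is what the shuf_any1 fallback in the SSE2 version does one
// byte at a time.
static void ARGBShuffleRowScalarSketch(const uint8* src_argb, uint8* dst_argb,
                                       const uint8* shuffler, int pix) {
  for (int x = 0; x < pix; ++x) {
    for (int j = 0; j < 4; ++j) {
      dst_argb[x * 4 + j] = src_argb[x * 4 + shuffler[j]];
    }
  }
}
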
michael@0 | 6930 | // YUY2 - Macro-pixel = 2 image pixels |
michael@0 | 6931 | // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... |
michael@0 | 6932 | |
michael@0 | 6933 | // UYVY - Macro-pixel = 2 image pixels |
michael@0 | 6934 | // U0Y0V0Y1 |
michael@0 | 6935 | |
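// A minimal scalar sketch (not part of libyuv; names are illustrative only)
// of the packing the routines below vectorize: every pair of Y samples shares
// one U and one V sample, stored as Y0 U0 Y1 V0 for YUY2.  UYVY is the same
// idea with the byte order U0 Y0 V0 Y1.
static void I422ToYUY2RowScalarSketch(const uint8* src_y, const uint8* src_u,
                                      const uint8* src_v, uint8* dst_frame,
                                      int width) {
  for (int x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[x];
    dst_frame[1] = src_u[x / 2];
    dst_frame[2] = src_y[x + 1];
    dst_frame[3] = src_v[x / 2];
    dst_frame += 4;
  }
}
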
michael@0 | 6936 | __declspec(naked) __declspec(align(16)) |
michael@0 | 6937 | void I422ToYUY2Row_SSE2(const uint8* src_y, |
michael@0 | 6938 | const uint8* src_u, |
michael@0 | 6939 | const uint8* src_v, |
michael@0 | 6940 | uint8* dst_frame, int width) { |
michael@0 | 6941 | __asm { |
michael@0 | 6942 | push esi |
michael@0 | 6943 | push edi |
michael@0 | 6944 | mov eax, [esp + 8 + 4] // src_y |
michael@0 | 6945 | mov esi, [esp + 8 + 8] // src_u |
michael@0 | 6946 | mov edx, [esp + 8 + 12] // src_v |
michael@0 | 6947 | mov edi, [esp + 8 + 16] // dst_frame |
michael@0 | 6948 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 6949 | sub edx, esi |
michael@0 | 6950 | |
michael@0 | 6951 | align 4 |
michael@0 | 6952 | convertloop: |
michael@0 | 6953 | movq xmm2, qword ptr [esi] // U |
michael@0 | 6954 | movq xmm3, qword ptr [esi + edx] // V |
michael@0 | 6955 | lea esi, [esi + 8] |
michael@0 | 6956 | punpcklbw xmm2, xmm3 // UV |
michael@0 | 6957 | movdqu xmm0, [eax] // Y |
michael@0 | 6958 | lea eax, [eax + 16] |
michael@0 | 6959 | movdqa xmm1, xmm0 |
michael@0 | 6960 | punpcklbw xmm0, xmm2 // YUYV |
michael@0 | 6961 | punpckhbw xmm1, xmm2 |
michael@0 | 6962 | movdqu [edi], xmm0 |
michael@0 | 6963 | movdqu [edi + 16], xmm1 |
michael@0 | 6964 | lea edi, [edi + 32] |
michael@0 | 6965 | sub ecx, 16 |
michael@0 | 6966 | jg convertloop |
michael@0 | 6967 | |
michael@0 | 6968 | pop edi |
michael@0 | 6969 | pop esi |
michael@0 | 6970 | ret |
michael@0 | 6971 | } |
michael@0 | 6972 | } |
michael@0 | 6973 | |
michael@0 | 6974 | __declspec(naked) __declspec(align(16)) |
michael@0 | 6975 | void I422ToUYVYRow_SSE2(const uint8* src_y, |
michael@0 | 6976 | const uint8* src_u, |
michael@0 | 6977 | const uint8* src_v, |
michael@0 | 6978 | uint8* dst_frame, int width) { |
michael@0 | 6979 | __asm { |
michael@0 | 6980 | push esi |
michael@0 | 6981 | push edi |
michael@0 | 6982 | mov eax, [esp + 8 + 4] // src_y |
michael@0 | 6983 | mov esi, [esp + 8 + 8] // src_u |
michael@0 | 6984 | mov edx, [esp + 8 + 12] // src_v |
michael@0 | 6985 | mov edi, [esp + 8 + 16] // dst_frame |
michael@0 | 6986 | mov ecx, [esp + 8 + 20] // width |
michael@0 | 6987 | sub edx, esi |
michael@0 | 6988 | |
michael@0 | 6989 | align 4 |
michael@0 | 6990 | convertloop: |
michael@0 | 6991 | movq xmm2, qword ptr [esi] // U |
michael@0 | 6992 | movq xmm3, qword ptr [esi + edx] // V |
michael@0 | 6993 | lea esi, [esi + 8] |
michael@0 | 6994 | punpcklbw xmm2, xmm3 // UV |
michael@0 | 6995 | movdqu xmm0, [eax] // Y |
michael@0 | 6996 | movdqa xmm1, xmm2 |
michael@0 | 6997 | lea eax, [eax + 16] |
michael@0 | 6998 | punpcklbw xmm1, xmm0 // UYVY |
michael@0 | 6999 | punpckhbw xmm2, xmm0 |
michael@0 | 7000 | movdqu [edi], xmm1 |
michael@0 | 7001 | movdqu [edi + 16], xmm2 |
michael@0 | 7002 | lea edi, [edi + 32] |
michael@0 | 7003 | sub ecx, 16 |
michael@0 | 7004 | jg convertloop |
michael@0 | 7005 | |
michael@0 | 7006 | pop edi |
michael@0 | 7007 | pop esi |
michael@0 | 7008 | ret |
michael@0 | 7009 | } |
michael@0 | 7010 | } |
michael@0 | 7011 | |
michael@0 | 7012 | #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 |
michael@0 | 7013 | __declspec(naked) __declspec(align(16)) |
michael@0 | 7014 | void ARGBPolynomialRow_SSE2(const uint8* src_argb, |
michael@0 | 7015 | uint8* dst_argb, const float* poly, |
michael@0 | 7016 | int width) { |
michael@0 | 7017 | __asm { |
michael@0 | 7018 | push esi |
michael@0 | 7019 | mov eax, [esp + 4 + 4] /* src_argb */ |
michael@0 | 7020 | mov edx, [esp + 4 + 8] /* dst_argb */ |
michael@0 | 7021 | mov esi, [esp + 4 + 12] /* poly */ |
michael@0 | 7022 | mov ecx, [esp + 4 + 16] /* width */ |
michael@0 | 7023 | pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. |
michael@0 | 7024 | |
michael@0 | 7025 | // 2 pixel loop. |
michael@0 | 7026 | align 4 |
michael@0 | 7027 | convertloop: |
michael@0 | 7028 | // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel |
michael@0 | 7029 | // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel |
michael@0 | 7030 | movq xmm0, qword ptr [eax] // BGRABGRA |
michael@0 | 7031 | lea eax, [eax + 8] |
michael@0 | 7032 | punpcklbw xmm0, xmm3 |
michael@0 | 7033 | movdqa xmm4, xmm0 |
michael@0 | 7034 | punpcklwd xmm0, xmm3 // pixel 0 |
michael@0 | 7035 | punpckhwd xmm4, xmm3 // pixel 1 |
michael@0 | 7036 | cvtdq2ps xmm0, xmm0 // 4 floats |
michael@0 | 7037 | cvtdq2ps xmm4, xmm4 |
michael@0 | 7038 | movdqa xmm1, xmm0 // X |
michael@0 | 7039 | movdqa xmm5, xmm4 |
michael@0 | 7040 | mulps xmm0, [esi + 16] // C1 * X |
michael@0 | 7041 | mulps xmm4, [esi + 16] |
michael@0 | 7042 | addps xmm0, [esi] // result = C0 + C1 * X |
michael@0 | 7043 | addps xmm4, [esi] |
michael@0 | 7044 | movdqa xmm2, xmm1 |
michael@0 | 7045 | movdqa xmm6, xmm5 |
michael@0 | 7046 | mulps xmm2, xmm1 // X * X |
michael@0 | 7047 | mulps xmm6, xmm5 |
michael@0 | 7048 | mulps xmm1, xmm2 // X * X * X |
michael@0 | 7049 | mulps xmm5, xmm6 |
michael@0 | 7050 | mulps xmm2, [esi + 32] // C2 * X * X |
michael@0 | 7051 | mulps xmm6, [esi + 32] |
michael@0 | 7052 | mulps xmm1, [esi + 48] // C3 * X * X * X |
michael@0 | 7053 | mulps xmm5, [esi + 48] |
michael@0 | 7054 | addps xmm0, xmm2 // result += C2 * X * X |
michael@0 | 7055 | addps xmm4, xmm6 |
michael@0 | 7056 | addps xmm0, xmm1 // result += C3 * X * X * X |
michael@0 | 7057 | addps xmm4, xmm5 |
michael@0 | 7058 | cvttps2dq xmm0, xmm0 |
michael@0 | 7059 | cvttps2dq xmm4, xmm4 |
michael@0 | 7060 | packuswb xmm0, xmm4 |
michael@0 | 7061 | packuswb xmm0, xmm0 |
michael@0 | 7062 | sub ecx, 2 |
michael@0 | 7063 | movq qword ptr [edx], xmm0 |
michael@0 | 7064 | lea edx, [edx + 8] |
michael@0 | 7065 | jg convertloop |
michael@0 | 7066 | pop esi |
michael@0 | 7067 | ret |
michael@0 | 7068 | } |
michael@0 | 7069 | } |
michael@0 | 7070 | #endif // HAS_ARGBPOLYNOMIALROW_SSE2 |
michael@0 | 7071 | |
michael@0 | 7072 | #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 |
michael@0 | 7073 | __declspec(naked) __declspec(align(16)) |
michael@0 | 7074 | void ARGBPolynomialRow_AVX2(const uint8* src_argb, |
michael@0 | 7075 | uint8* dst_argb, const float* poly, |
michael@0 | 7076 | int width) { |
michael@0 | 7077 | __asm { |
michael@0 | 7078 | mov eax, [esp + 4] /* src_argb */ |
michael@0 | 7079 | mov edx, [esp + 8] /* dst_argb */ |
michael@0 | 7080 | mov ecx, [esp + 12] /* poly */ |
michael@0 | 7081 | vbroadcastf128 ymm4, [ecx] // C0 |
michael@0 | 7082 | vbroadcastf128 ymm5, [ecx + 16] // C1 |
michael@0 | 7083 | vbroadcastf128 ymm6, [ecx + 32] // C2 |
michael@0 | 7084 | vbroadcastf128 ymm7, [ecx + 48] // C3 |
michael@0 | 7085 | mov ecx, [esp + 16] /* width */ |
michael@0 | 7086 | |
michael@0 | 7087 | // 2 pixel loop. |
michael@0 | 7088 | align 4 |
michael@0 | 7089 | convertloop: |
michael@0 | 7090 | vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels |
michael@0 | 7091 | lea eax, [eax + 8] |
michael@0 | 7092 | vcvtdq2ps ymm0, ymm0 // X 8 floats |
michael@0 | 7093 | vmulps ymm2, ymm0, ymm0 // X * X |
michael@0 | 7094 | vmulps ymm3, ymm0, ymm7 // C3 * X |
michael@0 | 7095 | vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X |
michael@0 | 7096 | vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X |
michael@0 | 7097 | vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X |
michael@0 | 7098 | vcvttps2dq ymm0, ymm0 |
michael@0 | 7099 | vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000 |
michael@0 | 7100 | vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000 |
michael@0 | 7101 | vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000 |
michael@0 | 7102 | sub ecx, 2 |
michael@0 | 7103 | vmovq qword ptr [edx], xmm0 |
michael@0 | 7104 | lea edx, [edx + 8] |
michael@0 | 7105 | jg convertloop |
michael@0 | 7106 | vzeroupper |
michael@0 | 7107 | ret |
michael@0 | 7108 | } |
michael@0 | 7109 | } |
michael@0 | 7110 | #endif // HAS_ARGBPOLYNOMIALROW_AVX2 |
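
// A minimal scalar sketch (not part of libyuv; names are illustrative only)
// of the per-channel cubic the SSE2/AVX2 polynomial rows evaluate:
//   result = C0 + C1*X + C2*X^2 + C3*X^3
// where poly holds four coefficient vectors back to back (4 floats each, one
// per ARGB channel) and the result saturates to [0..255].
static void ARGBPolynomialRowScalarSketch(const uint8* src_argb,
                                          uint8* dst_argb, const float* poly,
                                          int width) {
  for (int i = 0; i < width * 4; ++i) {
    int c = i & 3;  // channel within the pixel
    float x = (float)src_argb[i];
    float v = poly[c] + poly[4 + c] * x + poly[8 + c] * x * x +
              poly[12 + c] * x * x * x;
    if (v < 0.f) v = 0.f;      // saturate low, as packuswb would
    if (v > 255.f) v = 255.f;  // saturate high
    dst_argb[i] = (uint8)v;
  }
}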
michael@0 | 7111 | |
michael@0 | 7112 | #ifdef HAS_ARGBCOLORTABLEROW_X86 |
michael@0 | 7113 | // Transform ARGB pixels with color table. |
michael@0 | 7114 | __declspec(naked) __declspec(align(16)) |
michael@0 | 7115 | void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, |
michael@0 | 7116 | int width) { |
michael@0 | 7117 | __asm { |
michael@0 | 7118 | push esi |
michael@0 | 7119 | mov eax, [esp + 4 + 4] /* dst_argb */ |
michael@0 | 7120 | mov esi, [esp + 4 + 8] /* table_argb */ |
michael@0 | 7121 | mov ecx, [esp + 4 + 12] /* width */ |
michael@0 | 7122 | |
michael@0 | 7123 | // 1 pixel loop. |
michael@0 | 7124 | align 4 |
michael@0 | 7125 | convertloop: |
michael@0 | 7126 | movzx edx, byte ptr [eax] |
michael@0 | 7127 | lea eax, [eax + 4] |
michael@0 | 7128 | movzx edx, byte ptr [esi + edx * 4] |
michael@0 | 7129 | mov byte ptr [eax - 4], dl |
michael@0 | 7130 | movzx edx, byte ptr [eax - 4 + 1] |
michael@0 | 7131 | movzx edx, byte ptr [esi + edx * 4 + 1] |
michael@0 | 7132 | mov byte ptr [eax - 4 + 1], dl |
michael@0 | 7133 | movzx edx, byte ptr [eax - 4 + 2] |
michael@0 | 7134 | movzx edx, byte ptr [esi + edx * 4 + 2] |
michael@0 | 7135 | mov byte ptr [eax - 4 + 2], dl |
michael@0 | 7136 | movzx edx, byte ptr [eax - 4 + 3] |
michael@0 | 7137 | movzx edx, byte ptr [esi + edx * 4 + 3] |
michael@0 | 7138 | mov byte ptr [eax - 4 + 3], dl |
michael@0 | 7139 | dec ecx |
michael@0 | 7140 | jg convertloop |
michael@0 | 7141 | pop esi |
michael@0 | 7142 | ret |
michael@0 | 7143 | } |
michael@0 | 7144 | } |
michael@0 | 7145 | #endif // HAS_ARGBCOLORTABLEROW_X86 |
michael@0 | 7146 | |
michael@0 | 7147 | #ifdef HAS_RGBCOLORTABLEROW_X86 |
michael@0 | 7148 | // Transform RGB pixels with color table. |
michael@0 | 7149 | __declspec(naked) __declspec(align(16)) |
michael@0 | 7150 | void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { |
michael@0 | 7151 | __asm { |
michael@0 | 7152 | push esi |
michael@0 | 7153 | mov eax, [esp + 4 + 4] /* dst_argb */ |
michael@0 | 7154 | mov esi, [esp + 4 + 8] /* table_argb */ |
michael@0 | 7155 | mov ecx, [esp + 4 + 12] /* width */ |
michael@0 | 7156 | |
michael@0 | 7157 | // 1 pixel loop. |
michael@0 | 7158 | align 4 |
michael@0 | 7159 | convertloop: |
michael@0 | 7160 | movzx edx, byte ptr [eax] |
michael@0 | 7161 | lea eax, [eax + 4] |
michael@0 | 7162 | movzx edx, byte ptr [esi + edx * 4] |
michael@0 | 7163 | mov byte ptr [eax - 4], dl |
michael@0 | 7164 | movzx edx, byte ptr [eax - 4 + 1] |
michael@0 | 7165 | movzx edx, byte ptr [esi + edx * 4 + 1] |
michael@0 | 7166 | mov byte ptr [eax - 4 + 1], dl |
michael@0 | 7167 | movzx edx, byte ptr [eax - 4 + 2] |
michael@0 | 7168 | movzx edx, byte ptr [esi + edx * 4 + 2] |
michael@0 | 7169 | mov byte ptr [eax - 4 + 2], dl |
michael@0 | 7170 | dec ecx |
michael@0 | 7171 | jg convertloop |
michael@0 | 7172 | |
michael@0 | 7173 | pop esi |
michael@0 | 7174 | ret |
michael@0 | 7175 | } |
michael@0 | 7176 | } |
michael@0 | 7177 | #endif // HAS_RGBCOLORTABLEROW_X86 |
michael@0 | 7178 | |
michael@0 | 7179 | #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
michael@0 | 7180 | // Transform ARGB pixels with luma table; alpha is copied unchanged. |
michael@0 | 7181 | __declspec(naked) __declspec(align(16)) |
michael@0 | 7182 | void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
michael@0 | 7183 | int width, |
michael@0 | 7184 | const uint8* luma, uint32 lumacoeff) { |
michael@0 | 7185 | __asm { |
michael@0 | 7186 | push esi |
michael@0 | 7187 | push edi |
michael@0 | 7188 | mov eax, [esp + 8 + 4] /* src_argb */ |
michael@0 | 7189 | mov edi, [esp + 8 + 8] /* dst_argb */ |
michael@0 | 7190 | mov ecx, [esp + 8 + 12] /* width */ |
michael@0 | 7191 | movd xmm2, dword ptr [esp + 8 + 16] // luma table |
michael@0 | 7192 | movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff |
michael@0 | 7193 | pshufd xmm2, xmm2, 0 |
michael@0 | 7194 | pshufd xmm3, xmm3, 0 |
michael@0 | 7195 | pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 |
michael@0 | 7196 | psllw xmm4, 8 |
michael@0 | 7197 | pxor xmm5, xmm5 |
michael@0 | 7198 | |
michael@0 | 7199 | // 4 pixel loop. |
michael@0 | 7200 | align 4 |
michael@0 | 7201 | convertloop: |
michael@0 | 7202 | movdqu xmm0, qword ptr [eax] // generate luma ptr |
michael@0 | 7203 | pmaddubsw xmm0, xmm3 |
michael@0 | 7204 | phaddw xmm0, xmm0 |
michael@0 | 7205 | pand xmm0, xmm4 // mask out low bits |
michael@0 | 7206 | punpcklwd xmm0, xmm5 |
michael@0 | 7207 | paddd xmm0, xmm2 // add table base |
michael@0 | 7208 | movd esi, xmm0 |
michael@0 | 7209 | pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 |
michael@0 | 7210 | |
michael@0 | 7211 | movzx edx, byte ptr [eax] |
michael@0 | 7212 | movzx edx, byte ptr [esi + edx] |
michael@0 | 7213 | mov byte ptr [edi], dl |
michael@0 | 7214 | movzx edx, byte ptr [eax + 1] |
michael@0 | 7215 | movzx edx, byte ptr [esi + edx] |
michael@0 | 7216 | mov byte ptr [edi + 1], dl |
michael@0 | 7217 | movzx edx, byte ptr [eax + 2] |
michael@0 | 7218 | movzx edx, byte ptr [esi + edx] |
michael@0 | 7219 | mov byte ptr [edi + 2], dl |
michael@0 | 7220 | movzx edx, byte ptr [eax + 3] // copy alpha. |
michael@0 | 7221 | mov byte ptr [edi + 3], dl |
michael@0 | 7222 | |
michael@0 | 7223 | movd esi, xmm0 |
michael@0 | 7224 | pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 |
michael@0 | 7225 | |
michael@0 | 7226 | movzx edx, byte ptr [eax + 4] |
michael@0 | 7227 | movzx edx, byte ptr [esi + edx] |
michael@0 | 7228 | mov byte ptr [edi + 4], dl |
michael@0 | 7229 | movzx edx, byte ptr [eax + 5] |
michael@0 | 7230 | movzx edx, byte ptr [esi + edx] |
michael@0 | 7231 | mov byte ptr [edi + 5], dl |
michael@0 | 7232 | movzx edx, byte ptr [eax + 6] |
michael@0 | 7233 | movzx edx, byte ptr [esi + edx] |
michael@0 | 7234 | mov byte ptr [edi + 6], dl |
michael@0 | 7235 | movzx edx, byte ptr [eax + 7] // copy alpha. |
michael@0 | 7236 | mov byte ptr [edi + 7], dl |
michael@0 | 7237 | |
michael@0 | 7238 | movd esi, xmm0 |
michael@0 | 7239 | pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 |
michael@0 | 7240 | |
michael@0 | 7241 | movzx edx, byte ptr [eax + 8] |
michael@0 | 7242 | movzx edx, byte ptr [esi + edx] |
michael@0 | 7243 | mov byte ptr [edi + 8], dl |
michael@0 | 7244 | movzx edx, byte ptr [eax + 9] |
michael@0 | 7245 | movzx edx, byte ptr [esi + edx] |
michael@0 | 7246 | mov byte ptr [edi + 9], dl |
michael@0 | 7247 | movzx edx, byte ptr [eax + 10] |
michael@0 | 7248 | movzx edx, byte ptr [esi + edx] |
michael@0 | 7249 | mov byte ptr [edi + 10], dl |
michael@0 | 7250 | movzx edx, byte ptr [eax + 11] // copy alpha. |
michael@0 | 7251 | mov byte ptr [edi + 11], dl |
michael@0 | 7252 | |
michael@0 | 7253 | movd esi, xmm0 |
michael@0 | 7254 | |
michael@0 | 7255 | movzx edx, byte ptr [eax + 12] |
michael@0 | 7256 | movzx edx, byte ptr [esi + edx] |
michael@0 | 7257 | mov byte ptr [edi + 12], dl |
michael@0 | 7258 | movzx edx, byte ptr [eax + 13] |
michael@0 | 7259 | movzx edx, byte ptr [esi + edx] |
michael@0 | 7260 | mov byte ptr [edi + 13], dl |
michael@0 | 7261 | movzx edx, byte ptr [eax + 14] |
michael@0 | 7262 | movzx edx, byte ptr [esi + edx] |
michael@0 | 7263 | mov byte ptr [edi + 14], dl |
michael@0 | 7264 | movzx edx, byte ptr [eax + 15] // copy alpha. |
michael@0 | 7265 | mov byte ptr [edi + 15], dl |
michael@0 | 7266 | |
michael@0 | 7267 | sub ecx, 4 |
michael@0 | 7268 | lea eax, [eax + 16] |
michael@0 | 7269 | lea edi, [edi + 16] |
michael@0 | 7270 | jg convertloop |
michael@0 | 7271 | |
michael@0 | 7272 | pop edi |
michael@0 | 7273 | pop esi |
michael@0 | 7274 | ret |
michael@0 | 7275 | } |
michael@0 | 7276 | } |
michael@0 | 7277 | #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
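
// A minimal scalar sketch (not part of libyuv; the table layout and names
// follow my reading of the assembly above and are illustrative only) of the
// luma table lookup: a luma value is computed from B, G and R with the packed
// byte weights in lumacoeff (the alpha weight is assumed to be zero), masked
// to a multiple of 256 to select one 256-byte sub-table of 'luma', and each
// color channel is remapped through that sub-table while alpha is copied.
static void ARGBLumaColorTableRowScalarSketch(const uint8* src_argb,
                                              uint8* dst_argb, int width,
                                              const uint8* luma,
                                              uint32 lumacoeff) {
  const uint32 bw = lumacoeff & 0xff;
  const uint32 gw = (lumacoeff >> 8) & 0xff;
  const uint32 rw = (lumacoeff >> 16) & 0xff;
  for (int x = 0; x < width; ++x) {
    const uint32 l = (src_argb[0] * bw + src_argb[1] * gw +
                      src_argb[2] * rw) & 0xff00u;  // pick a sub-table
    const uint8* table = luma + l;
    dst_argb[0] = table[src_argb[0]];
    dst_argb[1] = table[src_argb[1]];
    dst_argb[2] = table[src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha copied unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}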
michael@0 | 7278 | |
michael@0 | 7279 | #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) |
michael@0 | 7280 | |
michael@0 | 7281 | #ifdef __cplusplus |
michael@0 | 7282 | } // extern "C" |
michael@0 | 7283 | } // namespace libyuv |
michael@0 | 7284 | #endif |