media/libyuv/source/row_win.cc


/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPEG full range.
static const vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};

static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static const vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

static const vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

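// A minimal scalar sketch (not part of the original file; the helper name is
// illustrative, and uint8 is assumed from libyuv/basic_types.h) of what
// kARGBToY encodes: ARGB pixels are stored B,G,R,A in memory, pmaddubsw in
// the row functions below pairs those bytes with {13, 65, 33, 0}, and the
// 7-bit fixed-point sum is biased by kAddY16.
static uint8 ScalarARGBToY(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}
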
// vpermd permutation to undo the vphaddw + vpackuswb lane mutation.
static const lvec32 kPermdARGBToY_AVX = {
  0, 4, 1, 5, 2, 6, 3, 7
};

// vpshufb for vphaddw + vpackuswb packed to shorts.
static const lvec8 kShufARGBToUV_AVX = {
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
  0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
};

// Constants for BGRA.
static const vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static const vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static const vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
static const vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static const vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static const vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static const vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static const vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static const vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

static const vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static const uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};

// Shuffle table for converting RGB24 to ARGB.
static const uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ARGB to RGB24.
static const uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static const uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RGB24 for I422ToRGB24. First 8 + next 4.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RAW, split like ARGBToRGB24_0 above.
static const uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};

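// A minimal scalar sketch (not in the original file; the helper name is
// illustrative) of the byte mapping kShuffleMaskRGB24ToARGB performs: each
// BGR triple lands in a 4-byte slot, and the mask's 12..15 source indices
// fill the alpha slots that the row functions below overwrite with 0xff via
// por.
static void ScalarRGB24ToARGB(const uint8* src_rgb24, uint8* dst_argb,
                              int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_argb[4 * i + 0] = src_rgb24[3 * i + 0];  // B
    dst_argb[4 * i + 1] = src_rgb24[3 * i + 1];  // G
    dst_argb[4 * i + 2] = src_rgb24[3 * i + 2];  // R
    dst_argb[4 * i + 3] = 255u;                  // A (opaque)
  }
}
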
// Duplicates gray value 3 times and fills in alpha opaque.
__declspec(naked) __declspec(align(16))
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_y
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    pcmpeqb xmm5, xmm5 // generate mask 0xff000000
    pslld xmm5, 24

    align 4
  convertloop:
    movq xmm0, qword ptr [eax]
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm0
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0
    punpckhwd xmm1, xmm1
    por xmm0, xmm5
    por xmm1, xmm5
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}
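
// Illustrative note (not in the original file; the helper name is
// hypothetical): punpcklbw xmm0, xmm0 doubles each gray byte and the
// punpcklwd/punpckhwd pair doubles the pairs, so every byte y appears four
// times before the alpha OR. Scalar equivalent:
static void ScalarI400ToARGB(const uint8* src_y, uint8* dst_argb, int pix) {
  for (int i = 0; i < pix; ++i) {
    const uint8 y = src_y[i];
    dst_argb[4 * i + 0] = y;     // B
    dst_argb[4 * i + 1] = y;     // G
    dst_argb[4 * i + 2] = y;     // R
    dst_argb[4 * i + 3] = 255u;  // A
  }
}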

__declspec(naked) __declspec(align(16))
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
                                  int pix) {
  __asm {
    mov eax, [esp + 4] // src_y
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    pcmpeqb xmm5, xmm5 // generate mask 0xff000000
    pslld xmm5, 24

    align 4
  convertloop:
    movq xmm0, qword ptr [eax]
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm0
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0
    punpckhwd xmm1, xmm1
    por xmm0, xmm5
    por xmm1, xmm5
    movdqu [edx], xmm0
    movdqu [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_rgb24
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    pcmpeqb xmm5, xmm5 // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, kShuffleMaskRGB24ToARGB

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:7] xmm1[8:15] }
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:11] xmm0[12:15] }
    pshufb xmm0, xmm4
    movdqa [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqa [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15] }
    pshufb xmm3, xmm4
    movdqa [edx + 16], xmm1
    por xmm3, xmm5
    sub ecx, 16
    movdqa [edx + 48], xmm3
    lea edx, [edx + 64]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_raw
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    pcmpeqb xmm5, xmm5 // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, kShuffleMaskRAWToARGB

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:7] xmm1[8:15] }
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12 // xmm1 = { xmm1[0:11] xmm0[12:15] }
    pshufb xmm0, xmm4
    movdqa [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqa [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15] }
    pshufb xmm3, xmm4
    movdqa [edx + 16], xmm1
    por xmm3, xmm5
    sub ecx, 16
    movdqa [edx + 48], xmm3
    lea edx, [edx + 64]
    jg convertloop
    ret
  }
}

// pmul method to replicate bits.
// Math to replicate bits:
// (v << 8) | (v << 3)
// v * 256 + v * 8
// v * (256 + 8)
// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
// 20 instructions.
__declspec(naked) __declspec(align(16))
void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix) {
  __asm {
    mov eax, 0x01080108 // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green
    psllw xmm4, 10
    psrlw xmm4, 5
    pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8

    mov eax, [esp + 4] // src_rgb565
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    sub edx, eax
    sub edx, eax

    align 4
  convertloop:
    movdqu xmm0, [eax] // fetch 8 pixels of bgr565
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    pand xmm1, xmm3 // R in upper 5 bits
    psllw xmm2, 11 // B in upper 5 bits
    pmulhuw xmm1, xmm5 // * (256 + 8)
    pmulhuw xmm2, xmm5 // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2 // RB
    pand xmm0, xmm4 // G in middle 6 bits
    pmulhuw xmm0, xmm6 // << 5 * (256 + 4)
    por xmm0, xmm7 // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
    movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
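
// Worked example of the multiply trick above (not in the original file;
// helper names are hypothetical): with a 5-bit field shifted to the top of a
// 16-bit lane, pmulhuw by 0x0108 yields ((v << 11) * 264) >> 16, which is
// (v * 264) >> 5 == (v << 3) | (v >> 2), the classic bit replication; e.g.
// v = 31 maps to 255. The 6-bit green path uses 0x2080 the same way.
static uint8 Expand5To8(uint8 v) {  // v in [0, 31]
  return (uint8)((v << 3) | (v >> 2));
}
static uint8 Expand6To8(uint8 v) {  // v in [0, 63]
  return (uint8)((v << 2) | (v >> 4));
}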

// 24 instructions
__declspec(naked) __declspec(align(16))
void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                            int pix) {
  __asm {
    mov eax, 0x01080108 // generate multiplier to repeat 5 bits
    movd xmm5, eax
    pshufd xmm5, xmm5, 0
    mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits
    movd xmm6, eax
    pshufd xmm6, xmm6, 0
    pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red
    psllw xmm3, 11
    movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green
    psrlw xmm4, 6
    pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha
    psllw xmm7, 8

    mov eax, [esp + 4] // src_argb1555
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    sub edx, eax
    sub edx, eax

    align 4
  convertloop:
    movdqu xmm0, [eax] // fetch 8 pixels of 1555
    movdqa xmm1, xmm0
    movdqa xmm2, xmm0
    psllw xmm1, 1 // R in upper 5 bits
    psllw xmm2, 11 // B in upper 5 bits
    pand xmm1, xmm3
    pmulhuw xmm2, xmm5 // * (256 + 8)
    pmulhuw xmm1, xmm5 // * (256 + 8)
    psllw xmm1, 8
    por xmm1, xmm2 // RB
    movdqa xmm2, xmm0
    pand xmm0, xmm4 // G in middle 5 bits
    psraw xmm2, 8 // A
    pmulhuw xmm0, xmm6 // << 6 * (256 + 8)
    pand xmm2, xmm7
    por xmm0, xmm2 // AG
    movdqa xmm2, xmm1
    punpcklbw xmm1, xmm0
    punpckhbw xmm2, xmm0
    movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB
    movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}

// 18 instructions.
__declspec(naked) __declspec(align(16))
void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                            int pix) {
  __asm {
    mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f
    movd xmm4, eax
    pshufd xmm4, xmm4, 0
    movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles
    pslld xmm5, 4
    mov eax, [esp + 4] // src_argb4444
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    sub edx, eax
    sub edx, eax

    align 4
  convertloop:
    movdqu xmm0, [eax] // fetch 8 pixels of bgra4444
    movdqa xmm2, xmm0
    pand xmm0, xmm4 // mask low nibbles
    pand xmm2, xmm5 // mask high nibbles
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    psllw xmm1, 4
    psrlw xmm3, 4
    por xmm0, xmm1
    por xmm2, xmm3
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm2
    punpckhbw xmm1, xmm2
    movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB
    movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB
    lea eax, [eax + 16]
    sub ecx, 8
    jg convertloop
    ret
  }
}
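
// Illustrative note (not in the original file; the helper name is
// hypothetical): the shift/OR pairs above implement 4-to-8 bit replication,
// (v << 4) | v == v * 17, so nibble 0xf becomes 0xff.
static uint8 Expand4To8(uint8 v) {  // v in [0, 15]
  return (uint8)((v << 4) | v);
}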

__declspec(naked) __declspec(align(16))
void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // pix
    movdqa xmm6, kShuffleMaskARGBToRGB24

    align 4
  convertloop:
    movdqu xmm0, [eax] // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1 // 4 bytes from 1 for 0
    psrldq xmm1, 4 // 8 bytes from 1
    pslldq xmm4, 12 // 4 bytes from 1 for 0
    movdqa xmm5, xmm2 // 8 bytes from 2 for 1
    por xmm0, xmm4 // 4 bytes from 1 for 0
    pslldq xmm5, 8 // 8 bytes from 2 for 1
    movdqu [edx], xmm0 // store 0
    por xmm1, xmm5 // 8 bytes from 2 for 1
    psrldq xmm2, 8 // 4 bytes from 2
    pslldq xmm3, 4 // 12 bytes from 3 for 2
    por xmm2, xmm3 // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1 // store 1
    movdqu [edx + 32], xmm2 // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // pix
    movdqa xmm6, kShuffleMaskARGBToRAW

    align 4
  convertloop:
    movdqu xmm0, [eax] // fetch 16 pixels of argb
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    lea eax, [eax + 64]
    pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
    pshufb xmm1, xmm6
    pshufb xmm2, xmm6
    pshufb xmm3, xmm6
    movdqa xmm4, xmm1 // 4 bytes from 1 for 0
    psrldq xmm1, 4 // 8 bytes from 1
    pslldq xmm4, 12 // 4 bytes from 1 for 0
    movdqa xmm5, xmm2 // 8 bytes from 2 for 1
    por xmm0, xmm4 // 4 bytes from 1 for 0
    pslldq xmm5, 8 // 8 bytes from 2 for 1
    movdqu [edx], xmm0 // store 0
    por xmm1, xmm5 // 8 bytes from 2 for 1
    psrldq xmm2, 8 // 4 bytes from 2
    pslldq xmm3, 4 // 12 bytes from 3 for 2
    por xmm2, xmm3 // 12 bytes from 3 for 2
    movdqu [edx + 16], xmm1 // store 1
    movdqu [edx + 32], xmm2 // store 2
    lea edx, [edx + 48]
    sub ecx, 16
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // pix
    pcmpeqb xmm3, xmm3 // generate mask 0x0000001f
    psrld xmm3, 27
    pcmpeqb xmm4, xmm4 // generate mask 0x000007e0
    psrld xmm4, 26
    pslld xmm4, 5
    pcmpeqb xmm5, xmm5 // generate mask 0xfffff800
    pslld xmm5, 11

    align 4
  convertloop:
    movdqa xmm0, [eax] // fetch 4 pixels of argb
    movdqa xmm1, xmm0 // B
    movdqa xmm2, xmm0 // G
    pslld xmm0, 8 // R
    psrld xmm1, 3 // B
    psrld xmm2, 5 // G
    psrad xmm0, 16 // R
    pand xmm1, xmm3 // B
    pand xmm2, xmm4 // G
    pand xmm0, xmm5 // R
    por xmm1, xmm2 // BG
    por xmm0, xmm1 // BGR
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0 // store 4 pixels of RGB565
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}
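
// Minimal scalar sketch (not in the original file; the helper name is
// illustrative and uint16 is assumed from libyuv/basic_types.h) of the pack
// above: keep the top 5/6/5 bits of B/G/R and OR them into one 16-bit pixel.
static uint16 ScalarARGBToRGB565(uint8 b, uint8 g, uint8 r) {
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}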

// TODO(fbarchard): Improve sign extension/packing.
__declspec(naked) __declspec(align(16))
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // pix
    pcmpeqb xmm4, xmm4 // generate mask 0x0000001f
    psrld xmm4, 27
    movdqa xmm5, xmm4 // generate mask 0x000003e0
    pslld xmm5, 5
    movdqa xmm6, xmm4 // generate mask 0x00007c00
    pslld xmm6, 10
    pcmpeqb xmm7, xmm7 // generate mask 0xffff8000
    pslld xmm7, 15

    align 4
  convertloop:
    movdqa xmm0, [eax] // fetch 4 pixels of argb
    movdqa xmm1, xmm0 // B
    movdqa xmm2, xmm0 // G
    movdqa xmm3, xmm0 // R
    psrad xmm0, 16 // A
    psrld xmm1, 3 // B
    psrld xmm2, 6 // G
    psrld xmm3, 9 // R
    pand xmm0, xmm7 // A
    pand xmm1, xmm4 // B
    pand xmm2, xmm5 // G
    pand xmm3, xmm6 // R
    por xmm0, xmm1 // BA
    por xmm2, xmm3 // GR
    por xmm0, xmm2 // BGRA
    packssdw xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_rgb
    mov ecx, [esp + 12] // pix
    pcmpeqb xmm4, xmm4 // generate mask 0xf000f000
    psllw xmm4, 12
    movdqa xmm3, xmm4 // generate mask 0x00f000f0
    psrlw xmm3, 8

    align 4
  convertloop:
    movdqa xmm0, [eax] // fetch 4 pixels of argb
    movdqa xmm1, xmm0
    pand xmm0, xmm3 // low nibble
    pand xmm1, xmm4 // high nibble
    psrld xmm0, 4
    psrld xmm1, 8
    por xmm0, xmm1
    packuswb xmm0, xmm0
    lea eax, [eax + 16]
    movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444
    lea edx, [edx + 8]
    sub ecx, 4
    jg convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kARGBToY

    align 4
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
__declspec(naked) __declspec(align(16))
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    movdqa xmm4, kARGBToYJ
    movdqa xmm5, kAddYJ64

    align 4
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    paddw xmm0, xmm5 // Add .5 for rounding.
    paddw xmm2, xmm5
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    vbroadcastf128 ymm4, kARGBToY
    vbroadcastf128 ymm5, kAddY16
    vmovdqa ymm6, kPermdARGBToY_AVX

    align 4
  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea eax, [eax + 128]
    vphaddw ymm0, ymm0, ymm1 // mutates.
    vphaddw ymm2, ymm2, ymm3
    vpsrlw ymm0, ymm0, 7
    vpsrlw ymm2, ymm2, 7
    vpackuswb ymm0, ymm0, ymm2 // mutates.
    vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
    vpaddb ymm0, ymm0, ymm5
    sub ecx, 32
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    jg convertloop
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYROW_AVX2
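
// Illustrative sketch (not in the original file; names and types are
// assumptions): AVX2 vphaddw and vpackuswb operate within 128-bit lanes, so
// the 32 packed Y bytes above come out with their quarters out of linear
// order. vpermd with kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7} gathers
// the dwords back into place; its semantics are simply dst[i] = src[idx[i]].
static void ScalarVpermd(const int32 idx[8], const uint32 src[8],
                         uint32 dst[8]) {
  for (int i = 0; i < 8; ++i) {
    dst[i] = src[idx[i]];  // gather dwords in permuted order
  }
}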

#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
__declspec(naked) __declspec(align(32))
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    vbroadcastf128 ymm4, kARGBToYJ
    vbroadcastf128 ymm5, kAddYJ64
    vmovdqa ymm6, kPermdARGBToY_AVX

    align 4
  convertloop:
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpmaddubsw ymm0, ymm0, ymm4
    vpmaddubsw ymm1, ymm1, ymm4
    vpmaddubsw ymm2, ymm2, ymm4
    vpmaddubsw ymm3, ymm3, ymm4
    lea eax, [eax + 128]
    vphaddw ymm0, ymm0, ymm1 // mutates.
    vphaddw ymm2, ymm2, ymm3
    vpaddw ymm0, ymm0, ymm5 // Add .5 for rounding.
    vpaddw ymm2, ymm2, ymm5
    vpsrlw ymm0, ymm0, 7
    vpsrlw ymm2, ymm2, 7
    vpackuswb ymm0, ymm0, ymm2 // mutates.
    vpermd ymm0, ymm6, ymm0 // For vphaddw + vpackuswb mutation.
    sub ecx, 32
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    jg convertloop

    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOYJROW_AVX2

__declspec(naked) __declspec(align(16))
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kARGBToY

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    movdqa xmm4, kARGBToYJ
    movdqa xmm5, kAddYJ64

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    paddw xmm0, xmm5
    paddw xmm2, xmm5
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kBGRAToY

    align 4
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kBGRAToY

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kABGRToY

    align 4
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kABGRToY

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kRGBAToY

    align 4
  convertloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_y */
    mov ecx, [esp + 12] /* pix */
    movdqa xmm5, kAddY16
    movdqa xmm4, kRGBAToY

    align 4
  convertloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // pix
    movdqa xmm7, kARGBToU
    movdqa xmm6, kARGBToV
    movdqa xmm5, kAddUV128
    sub edi, edx // stride from u to v

    align 4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pavgb xmm0, [eax + esi]
    pavgb xmm1, [eax + esi + 16]
    pavgb xmm2, [eax + esi + 32]
    pavgb xmm3, [eax + esi + 48]
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7 // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6 // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5 // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub ecx, 16
    movlps qword ptr [edx], xmm0 // U
    movhps qword ptr [edx + edi], xmm0 // V
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
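
// Minimal scalar sketch (not in the original file; helper names are
// hypothetical, and Avg mirrors pavgb's rounded average) of steps 1 and 2
// above for one 2x2 block of ARGB (B,G,R,A byte order in memory):
static uint8 Avg(uint8 a, uint8 b) {
  return (uint8)((a + b + 1) >> 1);
}
static void ScalarARGBToUV2x2(const uint8* p, int stride,
                              uint8* dst_u, uint8* dst_v) {
  // Box-filter each channel: average vertically, then horizontally.
  int b = Avg(Avg(p[0], p[stride + 0]), Avg(p[4], p[stride + 4]));
  int g = Avg(Avg(p[1], p[stride + 1]), Avg(p[5], p[stride + 5]));
  int r = Avg(Avg(p[2], p[stride + 2]), Avg(p[6], p[stride + 6]));
  // kARGBToU / kARGBToV dot products, arithmetic >> 8, then the +128 bias.
  *dst_u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *dst_v = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}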

__declspec(naked) __declspec(align(16))
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // pix
    movdqa xmm7, kARGBToUJ
    movdqa xmm6, kARGBToVJ
    movdqa xmm5, kAddUVJ128
    sub edi, edx // stride from u to v

    align 4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pavgb xmm0, [eax + esi]
    pavgb xmm1, [eax + esi + 16]
    pavgb xmm2, [eax + esi + 32]
    pavgb xmm3, [eax + esi + 48]
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7 // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6 // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    paddw xmm0, xmm5 // +.5 rounding -> unsigned
    paddw xmm1, xmm5
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    sub ecx, 16
    movlps qword ptr [edx], xmm0 // U
    movhps qword ptr [edx + edi], xmm0 // V
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
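
// Illustrative note (not in the original file; the helper name is
// hypothetical): kAddUVJ128's 0x8080 words fold the rounding and the chroma
// bias into one paddw, since (sum + 0x8080) >> 8 == ((sum + 128) >> 8) + 128.
static uint8 BiasRoundUVJ(int sum) {  // sum = signed UJ/VJ dot product
  return (uint8)((sum + 0x8080) >> 8);
}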

#ifdef HAS_ARGBTOUVROW_AVX2
__declspec(naked) __declspec(align(32))
void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // pix
    vbroadcastf128 ymm5, kAddUV128
    vbroadcastf128 ymm6, kARGBToV
    vbroadcastf128 ymm7, kARGBToU
    sub edi, edx // stride from u to v

    align 4
  convertloop:
    /* step 1 - subsample 32x2 argb pixels to 16x1 */
    vmovdqu ymm0, [eax]
    vmovdqu ymm1, [eax + 32]
    vmovdqu ymm2, [eax + 64]
    vmovdqu ymm3, [eax + 96]
    vpavgb ymm0, ymm0, [eax + esi]
    vpavgb ymm1, ymm1, [eax + esi + 32]
    vpavgb ymm2, ymm2, [eax + esi + 64]
    vpavgb ymm3, ymm3, [eax + esi + 96]
    lea eax, [eax + 128]
    vshufps ymm4, ymm0, ymm1, 0x88
    vshufps ymm0, ymm0, ymm1, 0xdd
    vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
    vshufps ymm4, ymm2, ymm3, 0x88
    vshufps ymm2, ymm2, ymm3, 0xdd
    vpavgb ymm2, ymm2, ymm4 // mutated by vshufps

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 32 different pixels, it's 16 pixels of U and 16 of V
    vpmaddubsw ymm1, ymm0, ymm7 // U
    vpmaddubsw ymm3, ymm2, ymm7
    vpmaddubsw ymm0, ymm0, ymm6 // V
    vpmaddubsw ymm2, ymm2, ymm6
    vphaddw ymm1, ymm1, ymm3 // mutates
    vphaddw ymm0, ymm0, ymm2
    vpsraw ymm1, ymm1, 8
    vpsraw ymm0, ymm0, 8
    vpacksswb ymm0, ymm1, ymm0 // mutates
    vpermq ymm0, ymm0, 0xd8 // For vpacksswb
    vpshufb ymm0, ymm0, kShufARGBToUV_AVX // For vshufps + vphaddw
    vpaddb ymm0, ymm0, ymm5 // -> unsigned

    // step 3 - store 16 U and 16 V values
    sub ecx, 32
    vextractf128 [edx], ymm0, 0 // U
    vextractf128 [edx + edi], ymm0, 1 // V
    lea edx, [edx + 16]
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif  // HAS_ARGBTOUVROW_AVX2

__declspec(naked) __declspec(align(16))
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // pix
    movdqa xmm7, kARGBToU
    movdqa xmm6, kARGBToV
    movdqa xmm5, kAddUV128
    sub edi, edx // stride from u to v

    align 4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7 // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6 // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1
    paddb xmm0, xmm5 // -> unsigned

    // step 3 - store 8 U and 8 V values
    sub ecx, 16
    movlps qword ptr [edx], xmm0 // U
    movhps qword ptr [edx + edi], xmm0 // V
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb
    mov esi, [esp + 8 + 8] // src_stride_argb
    mov edx, [esp + 8 + 12] // dst_u
    mov edi, [esp + 8 + 16] // dst_v
    mov ecx, [esp + 8 + 20] // pix
    movdqa xmm7, kARGBToUJ
    movdqa xmm6, kARGBToVJ
    movdqa xmm5, kAddUVJ128
    sub edi, edx // stride from u to v

    align 4
  convertloop:
    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    movdqu xmm4, [eax + esi]
    pavgb xmm0, xmm4
    movdqu xmm4, [eax + esi + 16]
    pavgb xmm1, xmm4
    movdqu xmm4, [eax + esi + 32]
    pavgb xmm2, xmm4
    movdqu xmm4, [eax + esi + 48]
    pavgb xmm3, xmm4
    lea eax, [eax + 64]
    movdqa xmm4, xmm0
    shufps xmm0, xmm1, 0x88
    shufps xmm4, xmm1, 0xdd
    pavgb xmm0, xmm4
    movdqa xmm4, xmm2
    shufps xmm2, xmm3, 0x88
    shufps xmm4, xmm3, 0xdd
    pavgb xmm2, xmm4

    // step 2 - convert to U and V
    // from here down is very similar to Y code except
    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    pmaddubsw xmm0, xmm7 // U
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm1, xmm6 // V
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm2
    phaddw xmm1, xmm3
    paddw xmm0, xmm5 // +.5 rounding -> unsigned
    paddw xmm1, xmm5
    psraw xmm0, 8
    psraw xmm1, 8
    packsswb xmm0, xmm1

    // step 3 - store 8 U and 8 V values
    sub ecx, 16
    movlps qword ptr [edx], xmm0 // U
    movhps qword ptr [edx + edi], xmm0 // V
    lea edx, [edx + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4] // src_argb
    mov edx, [esp + 4 + 8] // dst_u
    mov edi, [esp + 4 + 12] // dst_v
    mov ecx, [esp + 4 + 16] // pix
    movdqa xmm7, kARGBToU
    movdqa xmm6, kARGBToV
    movdqa xmm5, kAddUV128
    sub edi, edx // stride from u to v

    align 4
  convertloop:
    /* convert to U and V */
    movdqa xmm0, [eax] // U
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm7
    pmaddubsw xmm1, xmm7
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm3, xmm7
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psraw xmm0, 8
    psraw xmm2, 8
    packsswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm0

    movdqa xmm0, [eax] // V
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm6
    pmaddubsw xmm1, xmm6
    pmaddubsw xmm2, xmm6
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psraw xmm0, 8
    psraw xmm2, 8
    packsswb xmm0, xmm2
    paddb xmm0, xmm5
    lea eax, [eax + 64]
    movdqa [edx + edi], xmm0
    lea edx, [edx + 16]
    jg convertloop

    pop edi
    ret
  }
}

__declspec(naked) __declspec(align(16))
void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push edi
    mov eax, [esp + 4 + 4] // src_argb
    mov edx, [esp + 4 + 8] // dst_u
    mov edi, [esp + 4 + 12] // dst_v
    mov ecx, [esp + 4 + 16] // pix
    movdqa xmm7, kARGBToU
    movdqa xmm6, kARGBToV
    movdqa xmm5, kAddUV128
    sub edi, edx // stride from u to v

    align 4
  convertloop:
    /* convert to U and V */
    movdqu xmm0, [eax] // U
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm7
    pmaddubsw xmm1, xmm7
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm3, xmm7
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psraw xmm0, 8
    psraw xmm2, 8
    packsswb xmm0, xmm2
    paddb xmm0, xmm5
    sub ecx, 16
    movdqu [edx], xmm0

    movdqu xmm0, [eax] // V
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + 32]
    movdqu xmm3, [eax + 48]
    pmaddubsw xmm0, xmm6
    pmaddubsw xmm1, xmm6
    pmaddubsw xmm2, xmm6
    pmaddubsw xmm3, xmm6
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psraw xmm0, 8
    psraw xmm2, 8
    packsswb xmm0, xmm2
    paddb xmm0, xmm5
    lea eax, [eax + 64]
    movdqu [edx + edi], xmm0
    lea edx, [edx + 16]
    jg convertloop

    pop edi
    ret
  }
}
michael@0 1505
michael@0 1506 __declspec(naked) __declspec(align(16))
michael@0 1507 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
michael@0 1508 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1509 __asm {
michael@0 1510 push edi
michael@0 1511 mov eax, [esp + 4 + 4] // src_argb
michael@0 1512 mov edx, [esp + 4 + 8] // dst_u
michael@0 1513 mov edi, [esp + 4 + 12] // dst_v
michael@0 1514 mov ecx, [esp + 4 + 16] // pix
michael@0 1515 movdqa xmm7, kARGBToU
michael@0 1516 movdqa xmm6, kARGBToV
michael@0 1517 movdqa xmm5, kAddUV128
michael@0 1518 sub edi, edx // stride from u to v
michael@0 1519
michael@0 1520 align 4
michael@0 1521 convertloop:
michael@0 1522 /* step 1 - subsample 16x2 argb pixels to 8x1 */
michael@0 1523 movdqa xmm0, [eax]
michael@0 1524 movdqa xmm1, [eax + 16]
michael@0 1525 movdqa xmm2, [eax + 32]
michael@0 1526 movdqa xmm3, [eax + 48]
michael@0 1527 lea eax, [eax + 64]
michael@0 1528 movdqa xmm4, xmm0
michael@0 1529 shufps xmm0, xmm1, 0x88
michael@0 1530 shufps xmm4, xmm1, 0xdd
michael@0 1531 pavgb xmm0, xmm4
michael@0 1532 movdqa xmm4, xmm2
michael@0 1533 shufps xmm2, xmm3, 0x88
michael@0 1534 shufps xmm4, xmm3, 0xdd
michael@0 1535 pavgb xmm2, xmm4
michael@0 1536
michael@0 1537 // step 2 - convert to U and V
michael@0 1538 // from here down is very similar to Y code except
michael@0 1539 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
michael@0 1540 movdqa xmm1, xmm0
michael@0 1541 movdqa xmm3, xmm2
michael@0 1542 pmaddubsw xmm0, xmm7 // U
michael@0 1543 pmaddubsw xmm2, xmm7
michael@0 1544 pmaddubsw xmm1, xmm6 // V
michael@0 1545 pmaddubsw xmm3, xmm6
michael@0 1546 phaddw xmm0, xmm2
michael@0 1547 phaddw xmm1, xmm3
michael@0 1548 psraw xmm0, 8
michael@0 1549 psraw xmm1, 8
michael@0 1550 packsswb xmm0, xmm1
michael@0 1551 paddb xmm0, xmm5 // -> unsigned
michael@0 1552
michael@0 1553 // step 3 - store 8 U and 8 V values
michael@0 1554 sub ecx, 16
michael@0 1555 movlps qword ptr [edx], xmm0 // U
michael@0 1556 movhps qword ptr [edx + edi], xmm0 // V
michael@0 1557 lea edx, [edx + 8]
michael@0 1558 jg convertloop
michael@0 1559
michael@0 1560 pop edi
michael@0 1561 ret
michael@0 1562 }
michael@0 1563 }
michael@0 1564
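// Scalar sketch of step 1 above (illustration only, hypothetical name):
// shufps 0x88 gathers the even ARGB pixels, shufps 0xdd the odd ones, and
// pavgb averages each byte with rounding, i.e. (a + b + 1) >> 1.
static void AverageARGBPair_C(const uint8* two_pixels, uint8* one_pixel) {
  int i;
  for (i = 0; i < 4; ++i) {  // B, G, R, A channels.
    one_pixel[i] = (uint8)((two_pixels[i] + two_pixels[i + 4] + 1) >> 1);
  }
}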
michael@0 1565 __declspec(naked) __declspec(align(16))
michael@0 1566 void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
michael@0 1567 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1568 __asm {
michael@0 1569 push edi
michael@0 1570 mov eax, [esp + 4 + 4] // src_argb
michael@0 1571 mov edx, [esp + 4 + 8] // dst_u
michael@0 1572 mov edi, [esp + 4 + 12] // dst_v
michael@0 1573 mov ecx, [esp + 4 + 16] // pix
michael@0 1574 movdqa xmm7, kARGBToU
michael@0 1575 movdqa xmm6, kARGBToV
michael@0 1576 movdqa xmm5, kAddUV128
michael@0 1577 sub edi, edx // stride from u to v
michael@0 1578
michael@0 1579 align 4
michael@0 1580 convertloop:
michael@0 1581 /* step 1 - subsample 16x1 argb pixels to 8x1 */
michael@0 1582 movdqu xmm0, [eax]
michael@0 1583 movdqu xmm1, [eax + 16]
michael@0 1584 movdqu xmm2, [eax + 32]
michael@0 1585 movdqu xmm3, [eax + 48]
michael@0 1586 lea eax, [eax + 64]
michael@0 1587 movdqa xmm4, xmm0
michael@0 1588 shufps xmm0, xmm1, 0x88
michael@0 1589 shufps xmm4, xmm1, 0xdd
michael@0 1590 pavgb xmm0, xmm4
michael@0 1591 movdqa xmm4, xmm2
michael@0 1592 shufps xmm2, xmm3, 0x88
michael@0 1593 shufps xmm4, xmm3, 0xdd
michael@0 1594 pavgb xmm2, xmm4
michael@0 1595
michael@0 1596 // step 2 - convert to U and V
michael@0 1597 // from here down is very similar to Y code except
michael@0 1598 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
michael@0 1599 movdqa xmm1, xmm0
michael@0 1600 movdqa xmm3, xmm2
michael@0 1601 pmaddubsw xmm0, xmm7 // U
michael@0 1602 pmaddubsw xmm2, xmm7
michael@0 1603 pmaddubsw xmm1, xmm6 // V
michael@0 1604 pmaddubsw xmm3, xmm6
michael@0 1605 phaddw xmm0, xmm2
michael@0 1606 phaddw xmm1, xmm3
michael@0 1607 psraw xmm0, 8
michael@0 1608 psraw xmm1, 8
michael@0 1609 packsswb xmm0, xmm1
michael@0 1610 paddb xmm0, xmm5 // -> unsigned
michael@0 1611
michael@0 1612 // step 3 - store 8 U and 8 V values
michael@0 1613 sub ecx, 16
michael@0 1614 movlps qword ptr [edx], xmm0 // U
michael@0 1615 movhps qword ptr [edx + edi], xmm0 // V
michael@0 1616 lea edx, [edx + 8]
michael@0 1617 jg convertloop
michael@0 1618
michael@0 1619 pop edi
michael@0 1620 ret
michael@0 1621 }
michael@0 1622 }
michael@0 1623
michael@0 1624 __declspec(naked) __declspec(align(16))
michael@0 1625 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
michael@0 1626 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1627 __asm {
michael@0 1628 push esi
michael@0 1629 push edi
michael@0 1630 mov eax, [esp + 8 + 4] // src_argb
michael@0 1631 mov esi, [esp + 8 + 8] // src_stride_argb
michael@0 1632 mov edx, [esp + 8 + 12] // dst_u
michael@0 1633 mov edi, [esp + 8 + 16] // dst_v
michael@0 1634 mov ecx, [esp + 8 + 20] // pix
michael@0 1635 movdqa xmm7, kBGRAToU
michael@0 1636 movdqa xmm6, kBGRAToV
michael@0 1637 movdqa xmm5, kAddUV128
michael@0 1638 sub edi, edx // stride from u to v
michael@0 1639
michael@0 1640 align 4
michael@0 1641 convertloop:
michael@0 1642 /* step 1 - subsample 16x2 argb pixels to 8x1 */
michael@0 1643 movdqa xmm0, [eax]
michael@0 1644 movdqa xmm1, [eax + 16]
michael@0 1645 movdqa xmm2, [eax + 32]
michael@0 1646 movdqa xmm3, [eax + 48]
michael@0 1647 pavgb xmm0, [eax + esi]
michael@0 1648 pavgb xmm1, [eax + esi + 16]
michael@0 1649 pavgb xmm2, [eax + esi + 32]
michael@0 1650 pavgb xmm3, [eax + esi + 48]
michael@0 1651 lea eax, [eax + 64]
michael@0 1652 movdqa xmm4, xmm0
michael@0 1653 shufps xmm0, xmm1, 0x88
michael@0 1654 shufps xmm4, xmm1, 0xdd
michael@0 1655 pavgb xmm0, xmm4
michael@0 1656 movdqa xmm4, xmm2
michael@0 1657 shufps xmm2, xmm3, 0x88
michael@0 1658 shufps xmm4, xmm3, 0xdd
michael@0 1659 pavgb xmm2, xmm4
michael@0 1660
michael@0 1661 // step 2 - convert to U and V
michael@0 1662 // from here down is very similar to Y code except
michael@0 1663 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
michael@0 1664 movdqa xmm1, xmm0
michael@0 1665 movdqa xmm3, xmm2
michael@0 1666 pmaddubsw xmm0, xmm7 // U
michael@0 1667 pmaddubsw xmm2, xmm7
michael@0 1668 pmaddubsw xmm1, xmm6 // V
michael@0 1669 pmaddubsw xmm3, xmm6
michael@0 1670 phaddw xmm0, xmm2
michael@0 1671 phaddw xmm1, xmm3
michael@0 1672 psraw xmm0, 8
michael@0 1673 psraw xmm1, 8
michael@0 1674 packsswb xmm0, xmm1
michael@0 1675 paddb xmm0, xmm5 // -> unsigned
michael@0 1676
michael@0 1677 // step 3 - store 8 U and 8 V values
michael@0 1678 sub ecx, 16
michael@0 1679 movlps qword ptr [edx], xmm0 // U
michael@0 1680 movhps qword ptr [edx + edi], xmm0 // V
michael@0 1681 lea edx, [edx + 8]
michael@0 1682 jg convertloop
michael@0 1683
michael@0 1684 pop edi
michael@0 1685 pop esi
michael@0 1686 ret
michael@0 1687 }
michael@0 1688 }
michael@0 1689
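// Scalar sketch of the 2x2 subsample above (illustration only, hypothetical
// name): the first pavgb averages vertically across src_stride_argb, the
// second averages the horizontal pair. Two chained rounded halvings are not
// identical to one (a + b + c + d + 2) >> 2 box filter; results can differ
// by 1.
static uint8 Average2x2_C(uint8 top_l, uint8 bot_l, uint8 top_r, uint8 bot_r) {
  int left = (top_l + bot_l + 1) >> 1;      // vertical pavgb
  int right = (top_r + bot_r + 1) >> 1;
  return (uint8)((left + right + 1) >> 1);  // horizontal pavgb
}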
michael@0 1690 __declspec(naked) __declspec(align(16))
michael@0 1691 void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
michael@0 1692 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1693 __asm {
michael@0 1694 push esi
michael@0 1695 push edi
michael@0 1696 mov eax, [esp + 8 + 4] // src_argb
michael@0 1697 mov esi, [esp + 8 + 8] // src_stride_argb
michael@0 1698 mov edx, [esp + 8 + 12] // dst_u
michael@0 1699 mov edi, [esp + 8 + 16] // dst_v
michael@0 1700 mov ecx, [esp + 8 + 20] // pix
michael@0 1701 movdqa xmm7, kBGRAToU
michael@0 1702 movdqa xmm6, kBGRAToV
michael@0 1703 movdqa xmm5, kAddUV128
michael@0 1704 sub edi, edx // stride from u to v
michael@0 1705
michael@0 1706 align 4
michael@0 1707 convertloop:
michael@0 1708 /* step 1 - subsample 16x2 argb pixels to 8x1 */
michael@0 1709 movdqu xmm0, [eax]
michael@0 1710 movdqu xmm1, [eax + 16]
michael@0 1711 movdqu xmm2, [eax + 32]
michael@0 1712 movdqu xmm3, [eax + 48]
michael@0 1713 movdqu xmm4, [eax + esi]
michael@0 1714 pavgb xmm0, xmm4
michael@0 1715 movdqu xmm4, [eax + esi + 16]
michael@0 1716 pavgb xmm1, xmm4
michael@0 1717 movdqu xmm4, [eax + esi + 32]
michael@0 1718 pavgb xmm2, xmm4
michael@0 1719 movdqu xmm4, [eax + esi + 48]
michael@0 1720 pavgb xmm3, xmm4
michael@0 1721 lea eax, [eax + 64]
michael@0 1722 movdqa xmm4, xmm0
michael@0 1723 shufps xmm0, xmm1, 0x88
michael@0 1724 shufps xmm4, xmm1, 0xdd
michael@0 1725 pavgb xmm0, xmm4
michael@0 1726 movdqa xmm4, xmm2
michael@0 1727 shufps xmm2, xmm3, 0x88
michael@0 1728 shufps xmm4, xmm3, 0xdd
michael@0 1729 pavgb xmm2, xmm4
michael@0 1730
michael@0 1731 // step 2 - convert to U and V
michael@0 1732 // from here down is very similar to Y code except
michael@0 1733 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
michael@0 1734 movdqa xmm1, xmm0
michael@0 1735 movdqa xmm3, xmm2
michael@0 1736 pmaddubsw xmm0, xmm7 // U
michael@0 1737 pmaddubsw xmm2, xmm7
michael@0 1738 pmaddubsw xmm1, xmm6 // V
michael@0 1739 pmaddubsw xmm3, xmm6
michael@0 1740 phaddw xmm0, xmm2
michael@0 1741 phaddw xmm1, xmm3
michael@0 1742 psraw xmm0, 8
michael@0 1743 psraw xmm1, 8
michael@0 1744 packsswb xmm0, xmm1
michael@0 1745 paddb xmm0, xmm5 // -> unsigned
michael@0 1746
michael@0 1747 // step 3 - store 8 U and 8 V values
michael@0 1748 sub ecx, 16
michael@0 1749 movlps qword ptr [edx], xmm0 // U
michael@0 1750 movhps qword ptr [edx + edi], xmm0 // V
michael@0 1751 lea edx, [edx + 8]
michael@0 1752 jg convertloop
michael@0 1753
michael@0 1754 pop edi
michael@0 1755 pop esi
michael@0 1756 ret
michael@0 1757 }
michael@0 1758 }
michael@0 1759
michael@0 1760 __declspec(naked) __declspec(align(16))
michael@0 1761 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
michael@0 1762 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1763 __asm {
michael@0 1764 push esi
michael@0 1765 push edi
michael@0 1766 mov eax, [esp + 8 + 4] // src_argb
michael@0 1767 mov esi, [esp + 8 + 8] // src_stride_argb
michael@0 1768 mov edx, [esp + 8 + 12] // dst_u
michael@0 1769 mov edi, [esp + 8 + 16] // dst_v
michael@0 1770 mov ecx, [esp + 8 + 20] // pix
michael@0 1771 movdqa xmm7, kABGRToU
michael@0 1772 movdqa xmm6, kABGRToV
michael@0 1773 movdqa xmm5, kAddUV128
michael@0 1774 sub edi, edx // stride from u to v
michael@0 1775
michael@0 1776 align 4
michael@0 1777 convertloop:
michael@0 1778 /* step 1 - subsample 16x2 argb pixels to 8x1 */
michael@0 1779 movdqa xmm0, [eax]
michael@0 1780 movdqa xmm1, [eax + 16]
michael@0 1781 movdqa xmm2, [eax + 32]
michael@0 1782 movdqa xmm3, [eax + 48]
michael@0 1783 pavgb xmm0, [eax + esi]
michael@0 1784 pavgb xmm1, [eax + esi + 16]
michael@0 1785 pavgb xmm2, [eax + esi + 32]
michael@0 1786 pavgb xmm3, [eax + esi + 48]
michael@0 1787 lea eax, [eax + 64]
michael@0 1788 movdqa xmm4, xmm0
michael@0 1789 shufps xmm0, xmm1, 0x88
michael@0 1790 shufps xmm4, xmm1, 0xdd
michael@0 1791 pavgb xmm0, xmm4
michael@0 1792 movdqa xmm4, xmm2
michael@0 1793 shufps xmm2, xmm3, 0x88
michael@0 1794 shufps xmm4, xmm3, 0xdd
michael@0 1795 pavgb xmm2, xmm4
michael@0 1796
michael@0 1797 // step 2 - convert to U and V
michael@0 1798 // from here down is very similar to Y code except
michael@0 1799 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
michael@0 1800 movdqa xmm1, xmm0
michael@0 1801 movdqa xmm3, xmm2
michael@0 1802 pmaddubsw xmm0, xmm7 // U
michael@0 1803 pmaddubsw xmm2, xmm7
michael@0 1804 pmaddubsw xmm1, xmm6 // V
michael@0 1805 pmaddubsw xmm3, xmm6
michael@0 1806 phaddw xmm0, xmm2
michael@0 1807 phaddw xmm1, xmm3
michael@0 1808 psraw xmm0, 8
michael@0 1809 psraw xmm1, 8
michael@0 1810 packsswb xmm0, xmm1
michael@0 1811 paddb xmm0, xmm5 // -> unsigned
michael@0 1812
michael@0 1813 // step 3 - store 8 U and 8 V values
michael@0 1814 sub ecx, 16
michael@0 1815 movlps qword ptr [edx], xmm0 // U
michael@0 1816 movhps qword ptr [edx + edi], xmm0 // V
michael@0 1817 lea edx, [edx + 8]
michael@0 1818 jg convertloop
michael@0 1819
michael@0 1820 pop edi
michael@0 1821 pop esi
michael@0 1822 ret
michael@0 1823 }
michael@0 1824 }
michael@0 1825
michael@0 1826 __declspec(naked) __declspec(align(16))
michael@0 1827 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
michael@0 1828 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1829 __asm {
michael@0 1830 push esi
michael@0 1831 push edi
michael@0 1832 mov eax, [esp + 8 + 4] // src_argb
michael@0 1833 mov esi, [esp + 8 + 8] // src_stride_argb
michael@0 1834 mov edx, [esp + 8 + 12] // dst_u
michael@0 1835 mov edi, [esp + 8 + 16] // dst_v
michael@0 1836 mov ecx, [esp + 8 + 20] // pix
michael@0 1837 movdqa xmm7, kABGRToU
michael@0 1838 movdqa xmm6, kABGRToV
michael@0 1839 movdqa xmm5, kAddUV128
michael@0 1840 sub edi, edx // stride from u to v
michael@0 1841
michael@0 1842 align 4
michael@0 1843 convertloop:
michael@0 1844 /* step 1 - subsample 16x2 argb pixels to 8x1 */
michael@0 1845 movdqu xmm0, [eax]
michael@0 1846 movdqu xmm1, [eax + 16]
michael@0 1847 movdqu xmm2, [eax + 32]
michael@0 1848 movdqu xmm3, [eax + 48]
michael@0 1849 movdqu xmm4, [eax + esi]
michael@0 1850 pavgb xmm0, xmm4
michael@0 1851 movdqu xmm4, [eax + esi + 16]
michael@0 1852 pavgb xmm1, xmm4
michael@0 1853 movdqu xmm4, [eax + esi + 32]
michael@0 1854 pavgb xmm2, xmm4
michael@0 1855 movdqu xmm4, [eax + esi + 48]
michael@0 1856 pavgb xmm3, xmm4
michael@0 1857 lea eax, [eax + 64]
michael@0 1858 movdqa xmm4, xmm0
michael@0 1859 shufps xmm0, xmm1, 0x88
michael@0 1860 shufps xmm4, xmm1, 0xdd
michael@0 1861 pavgb xmm0, xmm4
michael@0 1862 movdqa xmm4, xmm2
michael@0 1863 shufps xmm2, xmm3, 0x88
michael@0 1864 shufps xmm4, xmm3, 0xdd
michael@0 1865 pavgb xmm2, xmm4
michael@0 1866
michael@0 1867 // step 2 - convert to U and V
michael@0 1868 // from here down is very similar to Y code except
michael@0 1869 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
michael@0 1870 movdqa xmm1, xmm0
michael@0 1871 movdqa xmm3, xmm2
michael@0 1872 pmaddubsw xmm0, xmm7 // U
michael@0 1873 pmaddubsw xmm2, xmm7
michael@0 1874 pmaddubsw xmm1, xmm6 // V
michael@0 1875 pmaddubsw xmm3, xmm6
michael@0 1876 phaddw xmm0, xmm2
michael@0 1877 phaddw xmm1, xmm3
michael@0 1878 psraw xmm0, 8
michael@0 1879 psraw xmm1, 8
michael@0 1880 packsswb xmm0, xmm1
michael@0 1881 paddb xmm0, xmm5 // -> unsigned
michael@0 1882
michael@0 1883 // step 3 - store 8 U and 8 V values
michael@0 1884 sub ecx, 16
michael@0 1885 movlps qword ptr [edx], xmm0 // U
michael@0 1886 movhps qword ptr [edx + edi], xmm0 // V
michael@0 1887 lea edx, [edx + 8]
michael@0 1888 jg convertloop
michael@0 1889
michael@0 1890 pop edi
michael@0 1891 pop esi
michael@0 1892 ret
michael@0 1893 }
michael@0 1894 }
michael@0 1895
michael@0 1896 __declspec(naked) __declspec(align(16))
michael@0 1897 void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
michael@0 1898 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1899 __asm {
michael@0 1900 push esi
michael@0 1901 push edi
michael@0 1902 mov eax, [esp + 8 + 4] // src_argb
michael@0 1903 mov esi, [esp + 8 + 8] // src_stride_argb
michael@0 1904 mov edx, [esp + 8 + 12] // dst_u
michael@0 1905 mov edi, [esp + 8 + 16] // dst_v
michael@0 1906 mov ecx, [esp + 8 + 20] // pix
michael@0 1907 movdqa xmm7, kRGBAToU
michael@0 1908 movdqa xmm6, kRGBAToV
michael@0 1909 movdqa xmm5, kAddUV128
michael@0 1910 sub edi, edx // stride from u to v
michael@0 1911
michael@0 1912 align 4
michael@0 1913 convertloop:
michael@0 1914 /* step 1 - subsample 16x2 argb pixels to 8x1 */
michael@0 1915 movdqa xmm0, [eax]
michael@0 1916 movdqa xmm1, [eax + 16]
michael@0 1917 movdqa xmm2, [eax + 32]
michael@0 1918 movdqa xmm3, [eax + 48]
michael@0 1919 pavgb xmm0, [eax + esi]
michael@0 1920 pavgb xmm1, [eax + esi + 16]
michael@0 1921 pavgb xmm2, [eax + esi + 32]
michael@0 1922 pavgb xmm3, [eax + esi + 48]
michael@0 1923 lea eax, [eax + 64]
michael@0 1924 movdqa xmm4, xmm0
michael@0 1925 shufps xmm0, xmm1, 0x88
michael@0 1926 shufps xmm4, xmm1, 0xdd
michael@0 1927 pavgb xmm0, xmm4
michael@0 1928 movdqa xmm4, xmm2
michael@0 1929 shufps xmm2, xmm3, 0x88
michael@0 1930 shufps xmm4, xmm3, 0xdd
michael@0 1931 pavgb xmm2, xmm4
michael@0 1932
michael@0 1933 // step 2 - convert to U and V
michael@0 1934 // from here down is very similar to Y code except
michael@0 1935 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
michael@0 1936 movdqa xmm1, xmm0
michael@0 1937 movdqa xmm3, xmm2
michael@0 1938 pmaddubsw xmm0, xmm7 // U
michael@0 1939 pmaddubsw xmm2, xmm7
michael@0 1940 pmaddubsw xmm1, xmm6 // V
michael@0 1941 pmaddubsw xmm3, xmm6
michael@0 1942 phaddw xmm0, xmm2
michael@0 1943 phaddw xmm1, xmm3
michael@0 1944 psraw xmm0, 8
michael@0 1945 psraw xmm1, 8
michael@0 1946 packsswb xmm0, xmm1
michael@0 1947 paddb xmm0, xmm5 // -> unsigned
michael@0 1948
michael@0 1949 // step 3 - store 8 U and 8 V values
michael@0 1950 sub ecx, 16
michael@0 1951 movlps qword ptr [edx], xmm0 // U
michael@0 1952 movhps qword ptr [edx + edi], xmm0 // V
michael@0 1953 lea edx, [edx + 8]
michael@0 1954 jg convertloop
michael@0 1955
michael@0 1956 pop edi
michael@0 1957 pop esi
michael@0 1958 ret
michael@0 1959 }
michael@0 1960 }
michael@0 1961
michael@0 1962 __declspec(naked) __declspec(align(16))
michael@0 1963 void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
michael@0 1964 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1965 __asm {
michael@0 1966 push esi
michael@0 1967 push edi
michael@0 1968 mov eax, [esp + 8 + 4] // src_argb
michael@0 1969 mov esi, [esp + 8 + 8] // src_stride_argb
michael@0 1970 mov edx, [esp + 8 + 12] // dst_u
michael@0 1971 mov edi, [esp + 8 + 16] // dst_v
michael@0 1972 mov ecx, [esp + 8 + 20] // pix
michael@0 1973 movdqa xmm7, kRGBAToU
michael@0 1974 movdqa xmm6, kRGBAToV
michael@0 1975 movdqa xmm5, kAddUV128
michael@0 1976 sub edi, edx // stride from u to v
michael@0 1977
michael@0 1978 align 4
michael@0 1979 convertloop:
michael@0 1980 /* step 1 - subsample 16x2 argb pixels to 8x1 */
michael@0 1981 movdqu xmm0, [eax]
michael@0 1982 movdqu xmm1, [eax + 16]
michael@0 1983 movdqu xmm2, [eax + 32]
michael@0 1984 movdqu xmm3, [eax + 48]
michael@0 1985 movdqu xmm4, [eax + esi]
michael@0 1986 pavgb xmm0, xmm4
michael@0 1987 movdqu xmm4, [eax + esi + 16]
michael@0 1988 pavgb xmm1, xmm4
michael@0 1989 movdqu xmm4, [eax + esi + 32]
michael@0 1990 pavgb xmm2, xmm4
michael@0 1991 movdqu xmm4, [eax + esi + 48]
michael@0 1992 pavgb xmm3, xmm4
michael@0 1993 lea eax, [eax + 64]
michael@0 1994 movdqa xmm4, xmm0
michael@0 1995 shufps xmm0, xmm1, 0x88
michael@0 1996 shufps xmm4, xmm1, 0xdd
michael@0 1997 pavgb xmm0, xmm4
michael@0 1998 movdqa xmm4, xmm2
michael@0 1999 shufps xmm2, xmm3, 0x88
michael@0 2000 shufps xmm4, xmm3, 0xdd
michael@0 2001 pavgb xmm2, xmm4
michael@0 2002
michael@0 2003 // step 2 - convert to U and V
michael@0 2004 // from here down is very similar to Y code except
michael@0 2005 // instead of 16 different pixels, it's 8 pixels of U and 8 of V
michael@0 2006 movdqa xmm1, xmm0
michael@0 2007 movdqa xmm3, xmm2
michael@0 2008 pmaddubsw xmm0, xmm7 // U
michael@0 2009 pmaddubsw xmm2, xmm7
michael@0 2010 pmaddubsw xmm1, xmm6 // V
michael@0 2011 pmaddubsw xmm3, xmm6
michael@0 2012 phaddw xmm0, xmm2
michael@0 2013 phaddw xmm1, xmm3
michael@0 2014 psraw xmm0, 8
michael@0 2015 psraw xmm1, 8
michael@0 2016 packsswb xmm0, xmm1
michael@0 2017 paddb xmm0, xmm5 // -> unsigned
michael@0 2018
michael@0 2019 // step 3 - store 8 U and 8 V values
michael@0 2020 sub ecx, 16
michael@0 2021 movlps qword ptr [edx], xmm0 // U
michael@0 2022 movhps qword ptr [edx + edi], xmm0 // V
michael@0 2023 lea edx, [edx + 8]
michael@0 2024 jg convertloop
michael@0 2025
michael@0 2026 pop edi
michael@0 2027 pop esi
michael@0 2028 ret
michael@0 2029 }
michael@0 2030 }
michael@0 2031 #endif // HAS_ARGBTOYROW_SSSE3
michael@0 2032
michael@0 2033 #define YG 74 /* (int8)(1.164 * 64 + 0.5) */
michael@0 2034
michael@0 2035 #define UB 127 /* 2.018 * 64 = 129, saturated to int8 max (127) */
michael@0 2036 #define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
michael@0 2037 #define UR 0
michael@0 2038
michael@0 2039 #define VB 0
michael@0 2040 #define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
michael@0 2041 #define VR 102 /* (int8)(1.596 * 64 + 0.5) */
michael@0 2042
michael@0 2043 // Bias
michael@0 2044 #define BB (UB * 128 + VB * 128)
michael@0 2045 #define BG (UG * 128 + VG * 128)
michael@0 2046 #define BR (UR * 128 + VR * 128)
michael@0 2047
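// Why the bias constants work: pmaddubsw treats the interleaved U and V
// bytes as unsigned (0..255), so for the blue channel the code computes
//   UB * U + VB * V - BB
//     = UB * U + VB * V - (UB * 128 + VB * 128)
//     = UB * (U - 128) + VB * (V - 128)
// which recovers the usual signed-chroma form without a byte subtract.
// G and R follow the same identity with BG and BR.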
michael@0 2048 #ifdef HAS_I422TOARGBROW_AVX2
michael@0 2049
michael@0 2050 static const lvec8 kUVToB_AVX = {
michael@0 2051 UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
michael@0 2052 UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
michael@0 2053 };
michael@0 2054 static const lvec8 kUVToR_AVX = {
michael@0 2055 UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
michael@0 2056 UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
michael@0 2057 };
michael@0 2058 static const lvec8 kUVToG_AVX = {
michael@0 2059 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
michael@0 2060 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
michael@0 2061 };
michael@0 2062 static const lvec16 kYToRgb_AVX = {
michael@0 2063 YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG
michael@0 2064 };
michael@0 2065 static const lvec16 kYSub16_AVX = {
michael@0 2066 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
michael@0 2067 };
michael@0 2068 static const lvec16 kUVBiasB_AVX = {
michael@0 2069 BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB
michael@0 2070 };
michael@0 2071 static const lvec16 kUVBiasG_AVX = {
michael@0 2072 BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG
michael@0 2073 };
michael@0 2074 static const lvec16 kUVBiasR_AVX = {
michael@0 2075 BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR
michael@0 2076 };
michael@0 2077
michael@0 2078 // 16 pixels
michael@0 2079 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
michael@0 2080 __declspec(naked) __declspec(align(16))
michael@0 2081 void I422ToARGBRow_AVX2(const uint8* y_buf,
michael@0 2082 const uint8* u_buf,
michael@0 2083 const uint8* v_buf,
michael@0 2084 uint8* dst_argb,
michael@0 2085 int width) {
michael@0 2086 __asm {
michael@0 2087 push esi
michael@0 2088 push edi
michael@0 2089 mov eax, [esp + 8 + 4] // Y
michael@0 2090 mov esi, [esp + 8 + 8] // U
michael@0 2091 mov edi, [esp + 8 + 12] // V
michael@0 2092 mov edx, [esp + 8 + 16] // argb
michael@0 2093 mov ecx, [esp + 8 + 20] // width
michael@0 2094 sub edi, esi
michael@0 2095 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha
michael@0 2096 vpxor ymm4, ymm4, ymm4
michael@0 2097
michael@0 2098 align 4
michael@0 2099 convertloop:
michael@0 2100 vmovq xmm0, qword ptr [esi] // U
michael@0 2101 vmovq xmm1, qword ptr [esi + edi] // V
michael@0 2102 lea esi, [esi + 8]
michael@0 2103 vpunpcklbw ymm0, ymm0, ymm1 // UV
michael@0 2104 vpermq ymm0, ymm0, 0xd8
michael@0 2105 vpunpcklwd ymm0, ymm0, ymm0 // UVUV
michael@0 2106 vpmaddubsw ymm2, ymm0, kUVToB_AVX // scale B UV
michael@0 2107 vpmaddubsw ymm1, ymm0, kUVToG_AVX // scale G UV
michael@0 2108 vpmaddubsw ymm0, ymm0, kUVToR_AVX // scale R UV
michael@0 2109 vpsubw ymm2, ymm2, kUVBiasB_AVX // unbias back to signed
michael@0 2110 vpsubw ymm1, ymm1, kUVBiasG_AVX
michael@0 2111 vpsubw ymm0, ymm0, kUVBiasR_AVX
michael@0 2112
michael@0 2113 // Step 2: Find Y contribution to 16 R,G,B values
michael@0 2114 vmovdqu xmm3, [eax] // NOLINT
michael@0 2115 lea eax, [eax + 16]
michael@0 2116 vpermq ymm3, ymm3, 0xd8
michael@0 2117 vpunpcklbw ymm3, ymm3, ymm4
michael@0 2118 vpsubsw ymm3, ymm3, kYSub16_AVX
michael@0 2119 vpmullw ymm3, ymm3, kYToRgb_AVX
michael@0 2120 vpaddsw ymm2, ymm2, ymm3 // B += Y
michael@0 2121 vpaddsw ymm1, ymm1, ymm3 // G += Y
michael@0 2122 vpaddsw ymm0, ymm0, ymm3 // R += Y
michael@0 2123 vpsraw ymm2, ymm2, 6
michael@0 2124 vpsraw ymm1, ymm1, 6
michael@0 2125 vpsraw ymm0, ymm0, 6
michael@0 2126 vpackuswb ymm2, ymm2, ymm2 // B
michael@0 2127 vpackuswb ymm1, ymm1, ymm1 // G
michael@0 2128 vpackuswb ymm0, ymm0, ymm0 // R
michael@0 2129
michael@0 2130 // Step 3: Weave into ARGB
michael@0 2131 vpunpcklbw ymm2, ymm2, ymm1 // BG
michael@0 2132 vpermq ymm2, ymm2, 0xd8
michael@0 2133 vpunpcklbw ymm0, ymm0, ymm5 // RA
michael@0 2134 vpermq ymm0, ymm0, 0xd8
michael@0 2135 vpunpcklwd ymm1, ymm2, ymm0 // BGRA first 8 pixels
michael@0 2136 vpunpckhwd ymm2, ymm2, ymm0 // BGRA next 8 pixels
michael@0 2137 vmovdqu [edx], ymm1
michael@0 2138 vmovdqu [edx + 32], ymm2
michael@0 2139 lea edx, [edx + 64]
michael@0 2140 sub ecx, 16
michael@0 2141 jg convertloop
michael@0 2142 vzeroupper
michael@0 2143
michael@0 2144 pop edi
michael@0 2145 pop esi
michael@0 2146 ret
michael@0 2147 }
michael@0 2148 }
michael@0 2149 #endif // HAS_I422TOARGBROW_AVX2
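// Note on the vpermq 0xd8 shuffles above: AVX2 unpack instructions
// interleave within each 128-bit lane, so without a fixup the results of
// vpunpcklbw would come out with the two lane halves out of order.
// Immediate 0xd8 selects qwords 0, 2, 1, 3, swapping the middle quarters
// so the in-lane unpacks yield pixels in sequential order.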
michael@0 2150
michael@0 2151 #ifdef HAS_I422TOARGBROW_SSSE3
michael@0 2152
michael@0 2153 static const vec8 kUVToB = {
michael@0 2154 UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
michael@0 2155 };
michael@0 2156
michael@0 2157 static const vec8 kUVToR = {
michael@0 2158 UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
michael@0 2159 };
michael@0 2160
michael@0 2161 static const vec8 kUVToG = {
michael@0 2162 UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
michael@0 2163 };
michael@0 2164
michael@0 2165 static const vec8 kVUToB = {
michael@0 2166 VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
michael@0 2167 };
michael@0 2168
michael@0 2169 static const vec8 kVUToR = {
michael@0 2170 VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
michael@0 2171 };
michael@0 2172
michael@0 2173 static const vec8 kVUToG = {
michael@0 2174 VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
michael@0 2175 };
michael@0 2176
michael@0 2177 static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
michael@0 2178 static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
michael@0 2179 static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
michael@0 2180 static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
michael@0 2181 static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
michael@0 2182
michael@0 2183 // TODO(fbarchard): Add a read that does half size on Y and treats 420 as 444.
michael@0 2184
michael@0 2185 // Read 8 UV from 444.
michael@0 2186 #define READYUV444 __asm { \
michael@0 2187 __asm movq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \
michael@0 2188 __asm movq xmm1, qword ptr [esi + edi] /* V */ /* NOLINT */ \
michael@0 2189 __asm lea esi, [esi + 8] \
michael@0 2190 __asm punpcklbw xmm0, xmm1 /* UV */ \
michael@0 2191 }
michael@0 2192
michael@0 2193 // Read 4 UV from 422, upsample to 8 UV.
michael@0 2194 #define READYUV422 __asm { \
michael@0 2195 __asm movd xmm0, [esi] /* U */ \
michael@0 2196 __asm movd xmm1, [esi + edi] /* V */ \
michael@0 2197 __asm lea esi, [esi + 4] \
michael@0 2198 __asm punpcklbw xmm0, xmm1 /* UV */ \
michael@0 2199 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
michael@0 2200 }
michael@0 2201
michael@0 2202 // Read 2 UV from 411, upsample to 8 UV.
michael@0 2203 #define READYUV411 __asm { \
michael@0 2204 __asm movzx ebx, word ptr [esi] /* U */ /* NOLINT */ \
michael@0 2205 __asm movd xmm0, ebx \
michael@0 2206 __asm movzx ebx, word ptr [esi + edi] /* V */ /* NOLINT */ \
michael@0 2207 __asm movd xmm1, ebx \
michael@0 2208 __asm lea esi, [esi + 2] \
michael@0 2209 __asm punpcklbw xmm0, xmm1 /* UV */ \
michael@0 2210 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
michael@0 2211 __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \
michael@0 2212 }
michael@0 2213
michael@0 2214 // Read 4 UV from NV12, upsample to 8 UV.
michael@0 2215 #define READNV12 __asm { \
michael@0 2216 __asm movq xmm0, qword ptr [esi] /* UV */ /* NOLINT */ \
michael@0 2217 __asm lea esi, [esi + 8] \
michael@0 2218 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
michael@0 2219 }
michael@0 2220
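// Scalar sketch of the READYUV422 upsample (illustration only, hypothetical
// helper name): 4 U and 4 V bytes become 8 interleaved UV pairs. punpcklbw
// interleaves U with V, and punpcklwd then duplicates each 16-bit UV pair,
// so chroma is replicated horizontally rather than interpolated.
static void UpsampleUV422_C(const uint8* u, const uint8* v, uint8 uv[16]) {
  int i;
  for (i = 0; i < 4; ++i) {
    uv[4 * i + 0] = u[i];  // after punpcklbw: U0 V0 U1 V1 ...
    uv[4 * i + 1] = v[i];
    uv[4 * i + 2] = u[i];  // after punpcklwd: U0 V0 U0 V0 U1 V1 U1 V1 ...
    uv[4 * i + 3] = v[i];
  }
}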
michael@0 2221 // Convert 8 pixels: 8 UV and 8 Y.
michael@0 2222 #define YUVTORGB __asm { \
michael@0 2223 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
michael@0 2224 __asm movdqa xmm1, xmm0 \
michael@0 2225 __asm movdqa xmm2, xmm0 \
michael@0 2226 __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
michael@0 2227 __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
michael@0 2228 __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
michael@0 2229 __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
michael@0 2230 __asm psubw xmm1, kUVBiasG \
michael@0 2231 __asm psubw xmm2, kUVBiasR \
michael@0 2232 /* Step 2: Find Y contribution to 8 R,G,B values */ \
michael@0 2233 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
michael@0 2234 __asm lea eax, [eax + 8] \
michael@0 2235 __asm punpcklbw xmm3, xmm4 \
michael@0 2236 __asm psubsw xmm3, kYSub16 \
michael@0 2237 __asm pmullw xmm3, kYToRgb \
michael@0 2238 __asm paddsw xmm0, xmm3 /* B += Y */ \
michael@0 2239 __asm paddsw xmm1, xmm3 /* G += Y */ \
michael@0 2240 __asm paddsw xmm2, xmm3 /* R += Y */ \
michael@0 2241 __asm psraw xmm0, 6 \
michael@0 2242 __asm psraw xmm1, 6 \
michael@0 2243 __asm psraw xmm2, 6 \
michael@0 2244 __asm packuswb xmm0, xmm0 /* B */ \
michael@0 2245 __asm packuswb xmm1, xmm1 /* G */ \
michael@0 2246 __asm packuswb xmm2, xmm2 /* R */ \
michael@0 2247 }
michael@0 2248
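// Scalar reference for YUVTORGB (illustration only, hypothetical names),
// using the constants above; the clamp plays the role of packuswb and the
// >> 6 of psraw 6. C's right shift of a negative value is assumed to be
// arithmetic, as it is for MSVC on x86.
static int Clamp255_C(int v) {
  return v < 0 ? 0 : (v > 255 ? 255 : v);
}
static void YuvToRgbRef_C(uint8 y, uint8 u, uint8 v,
                          uint8* b, uint8* g, uint8* r) {
  int y1 = ((int)y - 16) * YG;  // psubsw kYSub16, pmullw kYToRgb
  *b = (uint8)Clamp255_C((UB * (u - 128) + VB * (v - 128) + y1) >> 6);
  *g = (uint8)Clamp255_C((UG * (u - 128) + VG * (v - 128) + y1) >> 6);
  *r = (uint8)Clamp255_C((UR * (u - 128) + VR * (v - 128) + y1) >> 6);
}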
michael@0 2249 // Convert 8 pixels: 8 VU and 8 Y.
michael@0 2250 #define YVUTORGB __asm { \
michael@0 2251 /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
michael@0 2252 __asm movdqa xmm1, xmm0 \
michael@0 2253 __asm movdqa xmm2, xmm0 \
michael@0 2254 __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \
michael@0 2255 __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \
michael@0 2256 __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \
michael@0 2257 __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
michael@0 2258 __asm psubw xmm1, kUVBiasG \
michael@0 2259 __asm psubw xmm2, kUVBiasR \
michael@0 2260 /* Step 2: Find Y contribution to 8 R,G,B values */ \
michael@0 2261 __asm movq xmm3, qword ptr [eax] /* NOLINT */ \
michael@0 2262 __asm lea eax, [eax + 8] \
michael@0 2263 __asm punpcklbw xmm3, xmm4 \
michael@0 2264 __asm psubsw xmm3, kYSub16 \
michael@0 2265 __asm pmullw xmm3, kYToRgb \
michael@0 2266 __asm paddsw xmm0, xmm3 /* B += Y */ \
michael@0 2267 __asm paddsw xmm1, xmm3 /* G += Y */ \
michael@0 2268 __asm paddsw xmm2, xmm3 /* R += Y */ \
michael@0 2269 __asm psraw xmm0, 6 \
michael@0 2270 __asm psraw xmm1, 6 \
michael@0 2271 __asm psraw xmm2, 6 \
michael@0 2272 __asm packuswb xmm0, xmm0 /* B */ \
michael@0 2273 __asm packuswb xmm1, xmm1 /* G */ \
michael@0 2274 __asm packuswb xmm2, xmm2 /* R */ \
michael@0 2275 }
michael@0 2276
michael@0 2277 // 8 pixels, dest aligned 16.
michael@0 2278 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
michael@0 2279 __declspec(naked) __declspec(align(16))
michael@0 2280 void I444ToARGBRow_SSSE3(const uint8* y_buf,
michael@0 2281 const uint8* u_buf,
michael@0 2282 const uint8* v_buf,
michael@0 2283 uint8* dst_argb,
michael@0 2284 int width) {
michael@0 2285 __asm {
michael@0 2286 push esi
michael@0 2287 push edi
michael@0 2288 mov eax, [esp + 8 + 4] // Y
michael@0 2289 mov esi, [esp + 8 + 8] // U
michael@0 2290 mov edi, [esp + 8 + 12] // V
michael@0 2291 mov edx, [esp + 8 + 16] // argb
michael@0 2292 mov ecx, [esp + 8 + 20] // width
michael@0 2293 sub edi, esi
michael@0 2294 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
michael@0 2295 pxor xmm4, xmm4
michael@0 2296
michael@0 2297 align 4
michael@0 2298 convertloop:
michael@0 2299 READYUV444
michael@0 2300 YUVTORGB
michael@0 2301
michael@0 2302 // Step 3: Weave into ARGB
michael@0 2303 punpcklbw xmm0, xmm1 // BG
michael@0 2304 punpcklbw xmm2, xmm5 // RA
michael@0 2305 movdqa xmm1, xmm0
michael@0 2306 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
michael@0 2307 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
michael@0 2308 movdqa [edx], xmm0
michael@0 2309 movdqa [edx + 16], xmm1
michael@0 2310 lea edx, [edx + 32]
michael@0 2311 sub ecx, 8
michael@0 2312 jg convertloop
michael@0 2313
michael@0 2314 pop edi
michael@0 2315 pop esi
michael@0 2316 ret
michael@0 2317 }
michael@0 2318 }
michael@0 2319
michael@0 2320 // 8 pixels.
michael@0 2321 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes).
michael@0 2322 __declspec(naked) __declspec(align(16))
michael@0 2323 void I422ToRGB24Row_SSSE3(const uint8* y_buf,
michael@0 2324 const uint8* u_buf,
michael@0 2325 const uint8* v_buf,
michael@0 2326 uint8* dst_rgb24,
michael@0 2327 int width) {
michael@0 2328 __asm {
michael@0 2329 push esi
michael@0 2330 push edi
michael@0 2331 mov eax, [esp + 8 + 4] // Y
michael@0 2332 mov esi, [esp + 8 + 8] // U
michael@0 2333 mov edi, [esp + 8 + 12] // V
michael@0 2334 mov edx, [esp + 8 + 16] // rgb24
michael@0 2335 mov ecx, [esp + 8 + 20] // width
michael@0 2336 sub edi, esi
michael@0 2337 pxor xmm4, xmm4
michael@0 2338 movdqa xmm5, kShuffleMaskARGBToRGB24_0
michael@0 2339 movdqa xmm6, kShuffleMaskARGBToRGB24
michael@0 2340
michael@0 2341 align 4
michael@0 2342 convertloop:
michael@0 2343 READYUV422
michael@0 2344 YUVTORGB
michael@0 2345
michael@0 2346 // Step 3: Weave into RRGB
michael@0 2347 punpcklbw xmm0, xmm1 // BG
michael@0 2348 punpcklbw xmm2, xmm2 // RR
michael@0 2349 movdqa xmm1, xmm0
michael@0 2350 punpcklwd xmm0, xmm2 // BGRR first 4 pixels
michael@0 2351 punpckhwd xmm1, xmm2 // BGRR next 4 pixels
michael@0 2352 pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
michael@0 2353 pshufb xmm1, xmm6 // Pack into first 12 bytes.
michael@0 2354 palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
michael@0 2355 movq qword ptr [edx], xmm0 // First 8 bytes
michael@0 2356 movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
michael@0 2357 lea edx, [edx + 24]
michael@0 2358 sub ecx, 8
michael@0 2359 jg convertloop
michael@0 2360
michael@0 2361 pop edi
michael@0 2362 pop esi
michael@0 2363 ret
michael@0 2364 }
michael@0 2365 }
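// Scalar sketch of step 3 above (illustration only, hypothetical name): the
// two pshufb masks plus palignr amount to dropping every 4th byte (the
// duplicated R used as filler) when packing 32-bit BGRR down to 24-bit RGB24.
static void BGRXToRGB24_C(const uint8* bgrx, uint8* rgb24, int pix) {
  int i;
  for (i = 0; i < pix; ++i) {
    rgb24[0] = bgrx[0];  // B
    rgb24[1] = bgrx[1];  // G
    rgb24[2] = bgrx[2];  // R
    bgrx += 4;           // skip the filler byte
    rgb24 += 3;
  }
}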
michael@0 2366
michael@0 2367 // 8 pixels.
michael@0 2368 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes).
michael@0 2369 __declspec(naked) __declspec(align(16))
michael@0 2370 void I422ToRAWRow_SSSE3(const uint8* y_buf,
michael@0 2371 const uint8* u_buf,
michael@0 2372 const uint8* v_buf,
michael@0 2373 uint8* dst_raw,
michael@0 2374 int width) {
michael@0 2375 __asm {
michael@0 2376 push esi
michael@0 2377 push edi
michael@0 2378 mov eax, [esp + 8 + 4] // Y
michael@0 2379 mov esi, [esp + 8 + 8] // U
michael@0 2380 mov edi, [esp + 8 + 12] // V
michael@0 2381 mov edx, [esp + 8 + 16] // raw
michael@0 2382 mov ecx, [esp + 8 + 20] // width
michael@0 2383 sub edi, esi
michael@0 2384 pxor xmm4, xmm4
michael@0 2385 movdqa xmm5, kShuffleMaskARGBToRAW_0
michael@0 2386 movdqa xmm6, kShuffleMaskARGBToRAW
michael@0 2387
michael@0 2388 align 4
michael@0 2389 convertloop:
michael@0 2390 READYUV422
michael@0 2391 YUVTORGB
michael@0 2392
michael@0 2393 // Step 3: Weave into RRGB
michael@0 2394 punpcklbw xmm0, xmm1 // BG
michael@0 2395 punpcklbw xmm2, xmm2 // RR
michael@0 2396 movdqa xmm1, xmm0
michael@0 2397 punpcklwd xmm0, xmm2 // BGRR first 4 pixels
michael@0 2398 punpckhwd xmm1, xmm2 // BGRR next 4 pixels
michael@0 2399 pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes.
michael@0 2400 pshufb xmm1, xmm6 // Pack into first 12 bytes.
michael@0 2401 palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1
michael@0 2402 movq qword ptr [edx], xmm0 // First 8 bytes
michael@0 2403 movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels.
michael@0 2404 lea edx, [edx + 24]
michael@0 2405 sub ecx, 8
michael@0 2406 jg convertloop
michael@0 2407
michael@0 2408 pop edi
michael@0 2409 pop esi
michael@0 2410 ret
michael@0 2411 }
michael@0 2412 }
michael@0 2413
michael@0 2414 // 8 pixels, dest unaligned.
michael@0 2415 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes).
michael@0 2416 __declspec(naked) __declspec(align(16))
michael@0 2417 void I422ToRGB565Row_SSSE3(const uint8* y_buf,
michael@0 2418 const uint8* u_buf,
michael@0 2419 const uint8* v_buf,
michael@0 2420 uint8* rgb565_buf,
michael@0 2421 int width) {
michael@0 2422 __asm {
michael@0 2423 push esi
michael@0 2424 push edi
michael@0 2425 mov eax, [esp + 8 + 4] // Y
michael@0 2426 mov esi, [esp + 8 + 8] // U
michael@0 2427 mov edi, [esp + 8 + 12] // V
michael@0 2428 mov edx, [esp + 8 + 16] // rgb565
michael@0 2429 mov ecx, [esp + 8 + 20] // width
michael@0 2430 sub edi, esi
michael@0 2431 pxor xmm4, xmm4
michael@0 2432 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f
michael@0 2433 psrld xmm5, 27
michael@0 2434 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0
michael@0 2435 psrld xmm6, 26
michael@0 2436 pslld xmm6, 5
michael@0 2437 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800
michael@0 2438 pslld xmm7, 11
michael@0 2439
michael@0 2440 align 4
michael@0 2441 convertloop:
michael@0 2442 READYUV422
michael@0 2443 YUVTORGB
michael@0 2444
michael@0 2445 // Step 3: Weave into RRGB
michael@0 2446 punpcklbw xmm0, xmm1 // BG
michael@0 2447 punpcklbw xmm2, xmm2 // RR
michael@0 2448 movdqa xmm1, xmm0
michael@0 2449 punpcklwd xmm0, xmm2 // BGRR first 4 pixels
michael@0 2450 punpckhwd xmm1, xmm2 // BGRR next 4 pixels
michael@0 2451
michael@0 2452 // Step 3b: RRGB -> RGB565
michael@0 2453 movdqa xmm3, xmm0 // B first 4 pixels of argb
michael@0 2454 movdqa xmm2, xmm0 // G
michael@0 2455 pslld xmm0, 8 // R
michael@0 2456 psrld xmm3, 3 // B
michael@0 2457 psrld xmm2, 5 // G
michael@0 2458 psrad xmm0, 16 // R
michael@0 2459 pand xmm3, xmm5 // B
michael@0 2460 pand xmm2, xmm6 // G
michael@0 2461 pand xmm0, xmm7 // R
michael@0 2462 por xmm3, xmm2 // BG
michael@0 2463 por xmm0, xmm3 // BGR
michael@0 2464 movdqa xmm3, xmm1 // B next 4 pixels of argb
michael@0 2465 movdqa xmm2, xmm1 // G
michael@0 2466 pslld xmm1, 8 // R
michael@0 2467 psrld xmm3, 3 // B
michael@0 2468 psrld xmm2, 5 // G
michael@0 2469 psrad xmm1, 16 // R
michael@0 2470 pand xmm3, xmm5 // B
michael@0 2471 pand xmm2, xmm6 // G
michael@0 2472 pand xmm1, xmm7 // R
michael@0 2473 por xmm3, xmm2 // BG
michael@0 2474 por xmm1, xmm3 // BGR
michael@0 2475 packssdw xmm0, xmm1
michael@0 2476 sub ecx, 8
michael@0 2477 movdqu [edx], xmm0 // store 8 pixels of RGB565
michael@0 2478 lea edx, [edx + 16]
michael@0 2479 jg convertloop
michael@0 2480
michael@0 2481 pop edi
michael@0 2482 pop esi
michael@0 2483 ret
michael@0 2484 }
michael@0 2485 }
michael@0 2486
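// Scalar equivalent of step 3b above: keep the top 5/6/5 bits of B/G/R and
// pack them into one 16-bit value, matching the three masks built with
// pcmpeqb/psrld/pslld. The helper name is hypothetical.
static uint16 PackRGB565_C(uint8 b, uint8 g, uint8 r) {
  return (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
}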
michael@0 2487 // 8 pixels, dest aligned 16.
michael@0 2488 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
michael@0 2489 __declspec(naked) __declspec(align(16))
michael@0 2490 void I422ToARGBRow_SSSE3(const uint8* y_buf,
michael@0 2491 const uint8* u_buf,
michael@0 2492 const uint8* v_buf,
michael@0 2493 uint8* dst_argb,
michael@0 2494 int width) {
michael@0 2495 __asm {
michael@0 2496 push esi
michael@0 2497 push edi
michael@0 2498 mov eax, [esp + 8 + 4] // Y
michael@0 2499 mov esi, [esp + 8 + 8] // U
michael@0 2500 mov edi, [esp + 8 + 12] // V
michael@0 2501 mov edx, [esp + 8 + 16] // argb
michael@0 2502 mov ecx, [esp + 8 + 20] // width
michael@0 2503 sub edi, esi
michael@0 2504 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
michael@0 2505 pxor xmm4, xmm4
michael@0 2506
michael@0 2507 align 4
michael@0 2508 convertloop:
michael@0 2509 READYUV422
michael@0 2510 YUVTORGB
michael@0 2511
michael@0 2512 // Step 3: Weave into ARGB
michael@0 2513 punpcklbw xmm0, xmm1 // BG
michael@0 2514 punpcklbw xmm2, xmm5 // RA
michael@0 2515 movdqa xmm1, xmm0
michael@0 2516 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
michael@0 2517 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
michael@0 2518 movdqa [edx], xmm0
michael@0 2519 movdqa [edx + 16], xmm1
michael@0 2520 lea edx, [edx + 32]
michael@0 2521 sub ecx, 8
michael@0 2522 jg convertloop
michael@0 2523
michael@0 2524 pop edi
michael@0 2525 pop esi
michael@0 2526 ret
michael@0 2527 }
michael@0 2528 }
michael@0 2529
michael@0 2530 // 8 pixels, dest aligned 16.
michael@0 2531 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
michael@0 2532 // Similar to I420 but duplicate UV once more.
michael@0 2533 __declspec(naked) __declspec(align(16))
michael@0 2534 void I411ToARGBRow_SSSE3(const uint8* y_buf,
michael@0 2535 const uint8* u_buf,
michael@0 2536 const uint8* v_buf,
michael@0 2537 uint8* dst_argb,
michael@0 2538 int width) {
michael@0 2539 __asm {
michael@0 2540 push ebx
michael@0 2541 push esi
michael@0 2542 push edi
michael@0 2543 mov eax, [esp + 12 + 4] // Y
michael@0 2544 mov esi, [esp + 12 + 8] // U
michael@0 2545 mov edi, [esp + 12 + 12] // V
michael@0 2546 mov edx, [esp + 12 + 16] // argb
michael@0 2547 mov ecx, [esp + 12 + 20] // width
michael@0 2548 sub edi, esi
michael@0 2549 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
michael@0 2550 pxor xmm4, xmm4
michael@0 2551
michael@0 2552 align 4
michael@0 2553 convertloop:
michael@0 2554 READYUV411 // modifies EBX
michael@0 2555 YUVTORGB
michael@0 2556
michael@0 2557 // Step 3: Weave into ARGB
michael@0 2558 punpcklbw xmm0, xmm1 // BG
michael@0 2559 punpcklbw xmm2, xmm5 // RA
michael@0 2560 movdqa xmm1, xmm0
michael@0 2561 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
michael@0 2562 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
michael@0 2563 movdqa [edx], xmm0
michael@0 2564 movdqa [edx + 16], xmm1
michael@0 2565 lea edx, [edx + 32]
michael@0 2566 sub ecx, 8
michael@0 2567 jg convertloop
michael@0 2568
michael@0 2569 pop edi
michael@0 2570 pop esi
michael@0 2571 pop ebx
michael@0 2572 ret
michael@0 2573 }
michael@0 2574 }
michael@0 2575
michael@0 2576 // 8 pixels, dest aligned 16.
michael@0 2577 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
michael@0 2578 __declspec(naked) __declspec(align(16))
michael@0 2579 void NV12ToARGBRow_SSSE3(const uint8* y_buf,
michael@0 2580 const uint8* uv_buf,
michael@0 2581 uint8* dst_argb,
michael@0 2582 int width) {
michael@0 2583 __asm {
michael@0 2584 push esi
michael@0 2585 mov eax, [esp + 4 + 4] // Y
michael@0 2586 mov esi, [esp + 4 + 8] // UV
michael@0 2587 mov edx, [esp + 4 + 12] // argb
michael@0 2588 mov ecx, [esp + 4 + 16] // width
michael@0 2589 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
michael@0 2590 pxor xmm4, xmm4
michael@0 2591
michael@0 2592 align 4
michael@0 2593 convertloop:
michael@0 2594 READNV12
michael@0 2595 YUVTORGB
michael@0 2596
michael@0 2597 // Step 3: Weave into ARGB
michael@0 2598 punpcklbw xmm0, xmm1 // BG
michael@0 2599 punpcklbw xmm2, xmm5 // RA
michael@0 2600 movdqa xmm1, xmm0
michael@0 2601 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
michael@0 2602 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
michael@0 2603 movdqa [edx], xmm0
michael@0 2604 movdqa [edx + 16], xmm1
michael@0 2605 lea edx, [edx + 32]
michael@0 2606 sub ecx, 8
michael@0 2607 jg convertloop
michael@0 2608
michael@0 2609 pop esi
michael@0 2610 ret
michael@0 2611 }
michael@0 2612 }
michael@0 2613
michael@0 2614 // 8 pixels, dest aligned 16.
michael@0 2615 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
michael@0 2616 __declspec(naked) __declspec(align(16))
michael@0 2617 void NV21ToARGBRow_SSSE3(const uint8* y_buf,
michael@0 2618 const uint8* uv_buf,
michael@0 2619 uint8* dst_argb,
michael@0 2620 int width) {
michael@0 2621 __asm {
michael@0 2622 push esi
michael@0 2623 mov eax, [esp + 4 + 4] // Y
michael@0 2624 mov esi, [esp + 4 + 8] // VU
michael@0 2625 mov edx, [esp + 4 + 12] // argb
michael@0 2626 mov ecx, [esp + 4 + 16] // width
michael@0 2627 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
michael@0 2628 pxor xmm4, xmm4
michael@0 2629
michael@0 2630 align 4
michael@0 2631 convertloop:
michael@0 2632 READNV12
michael@0 2633 YVUTORGB
michael@0 2634
michael@0 2635 // Step 3: Weave into ARGB
michael@0 2636 punpcklbw xmm0, xmm1 // BG
michael@0 2637 punpcklbw xmm2, xmm5 // RA
michael@0 2638 movdqa xmm1, xmm0
michael@0 2639 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
michael@0 2640 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
michael@0 2641 movdqa [edx], xmm0
michael@0 2642 movdqa [edx + 16], xmm1
michael@0 2643 lea edx, [edx + 32]
michael@0 2644 sub ecx, 8
michael@0 2645 jg convertloop
michael@0 2646
michael@0 2647 pop esi
michael@0 2648 ret
michael@0 2649 }
michael@0 2650 }
michael@0 2651
michael@0 2652 // 8 pixels, unaligned.
michael@0 2653 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
michael@0 2654 __declspec(naked) __declspec(align(16))
michael@0 2655 void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
michael@0 2656 const uint8* u_buf,
michael@0 2657 const uint8* v_buf,
michael@0 2658 uint8* dst_argb,
michael@0 2659 int width) {
michael@0 2660 __asm {
michael@0 2661 push esi
michael@0 2662 push edi
michael@0 2663 mov eax, [esp + 8 + 4] // Y
michael@0 2664 mov esi, [esp + 8 + 8] // U
michael@0 2665 mov edi, [esp + 8 + 12] // V
michael@0 2666 mov edx, [esp + 8 + 16] // argb
michael@0 2667 mov ecx, [esp + 8 + 20] // width
michael@0 2668 sub edi, esi
michael@0 2669 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
michael@0 2670 pxor xmm4, xmm4
michael@0 2671
michael@0 2672 align 4
michael@0 2673 convertloop:
michael@0 2674 READYUV444
michael@0 2675 YUVTORGB
michael@0 2676
michael@0 2677 // Step 3: Weave into ARGB
michael@0 2678 punpcklbw xmm0, xmm1 // BG
michael@0 2679 punpcklbw xmm2, xmm5 // RA
michael@0 2680 movdqa xmm1, xmm0
michael@0 2681 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
michael@0 2682 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
michael@0 2683 movdqu [edx], xmm0
michael@0 2684 movdqu [edx + 16], xmm1
michael@0 2685 lea edx, [edx + 32]
michael@0 2686 sub ecx, 8
michael@0 2687 jg convertloop
michael@0 2688
michael@0 2689 pop edi
michael@0 2690 pop esi
michael@0 2691 ret
michael@0 2692 }
michael@0 2693 }
michael@0 2694
michael@0 2695 // 8 pixels, unaligned.
michael@0 2696 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
michael@0 2697 __declspec(naked) __declspec(align(16))
michael@0 2698 void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
michael@0 2699 const uint8* u_buf,
michael@0 2700 const uint8* v_buf,
michael@0 2701 uint8* dst_argb,
michael@0 2702 int width) {
michael@0 2703 __asm {
michael@0 2704 push esi
michael@0 2705 push edi
michael@0 2706 mov eax, [esp + 8 + 4] // Y
michael@0 2707 mov esi, [esp + 8 + 8] // U
michael@0 2708 mov edi, [esp + 8 + 12] // V
michael@0 2709 mov edx, [esp + 8 + 16] // argb
michael@0 2710 mov ecx, [esp + 8 + 20] // width
michael@0 2711 sub edi, esi
michael@0 2712 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
michael@0 2713 pxor xmm4, xmm4
michael@0 2714
michael@0 2715 align 4
michael@0 2716 convertloop:
michael@0 2717 READYUV422
michael@0 2718 YUVTORGB
michael@0 2719
michael@0 2720 // Step 3: Weave into ARGB
michael@0 2721 punpcklbw xmm0, xmm1 // BG
michael@0 2722 punpcklbw xmm2, xmm5 // RA
michael@0 2723 movdqa xmm1, xmm0
michael@0 2724 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
michael@0 2725 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
michael@0 2726 movdqu [edx], xmm0
michael@0 2727 movdqu [edx + 16], xmm1
michael@0 2728 lea edx, [edx + 32]
michael@0 2729 sub ecx, 8
michael@0 2730 jg convertloop
michael@0 2731
michael@0 2732 pop edi
michael@0 2733 pop esi
michael@0 2734 ret
michael@0 2735 }
michael@0 2736 }
michael@0 2737
michael@0 2738 // 8 pixels, unaligned.
michael@0 2739 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
michael@0 2740 // Similar to I420 but duplicate UV once more.
michael@0 2741 __declspec(naked) __declspec(align(16))
michael@0 2742 void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
michael@0 2743 const uint8* u_buf,
michael@0 2744 const uint8* v_buf,
michael@0 2745 uint8* dst_argb,
michael@0 2746 int width) {
michael@0 2747 __asm {
michael@0 2748 push ebx
michael@0 2749 push esi
michael@0 2750 push edi
michael@0 2751 mov eax, [esp + 12 + 4] // Y
michael@0 2752 mov esi, [esp + 12 + 8] // U
michael@0 2753 mov edi, [esp + 12 + 12] // V
michael@0 2754 mov edx, [esp + 12 + 16] // argb
michael@0 2755 mov ecx, [esp + 12 + 20] // width
michael@0 2756 sub edi, esi
michael@0 2757 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
michael@0 2758 pxor xmm4, xmm4
michael@0 2759
michael@0 2760 align 4
michael@0 2761 convertloop:
michael@0 2762 READYUV411 // modifies EBX
michael@0 2763 YUVTORGB
michael@0 2764
michael@0 2765 // Step 3: Weave into ARGB
michael@0 2766 punpcklbw xmm0, xmm1 // BG
michael@0 2767 punpcklbw xmm2, xmm5 // RA
michael@0 2768 movdqa xmm1, xmm0
michael@0 2769 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
michael@0 2770 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
michael@0 2771 movdqu [edx], xmm0
michael@0 2772 movdqu [edx + 16], xmm1
michael@0 2773 lea edx, [edx + 32]
michael@0 2774 sub ecx, 8
michael@0 2775 jg convertloop
michael@0 2776
michael@0 2777 pop edi
michael@0 2778 pop esi
michael@0 2779 pop ebx
michael@0 2780 ret
michael@0 2781 }
michael@0 2782 }
michael@0 2783
michael@0 2784 // 8 pixels, dest unaligned.
michael@0 2785 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
michael@0 2786 __declspec(naked) __declspec(align(16))
michael@0 2787 void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
michael@0 2788 const uint8* uv_buf,
michael@0 2789 uint8* dst_argb,
michael@0 2790 int width) {
michael@0 2791 __asm {
michael@0 2792 push esi
michael@0 2793 mov eax, [esp + 4 + 4] // Y
michael@0 2794 mov esi, [esp + 4 + 8] // UV
michael@0 2795 mov edx, [esp + 4 + 12] // argb
michael@0 2796 mov ecx, [esp + 4 + 16] // width
michael@0 2797 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
michael@0 2798 pxor xmm4, xmm4
michael@0 2799
michael@0 2800 align 4
michael@0 2801 convertloop:
michael@0 2802 READNV12
michael@0 2803 YUVTORGB
michael@0 2804
michael@0 2805 // Step 3: Weave into ARGB
michael@0 2806 punpcklbw xmm0, xmm1 // BG
michael@0 2807 punpcklbw xmm2, xmm5 // RA
michael@0 2808 movdqa xmm1, xmm0
michael@0 2809 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
michael@0 2810 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
michael@0 2811 movdqu [edx], xmm0
michael@0 2812 movdqu [edx + 16], xmm1
michael@0 2813 lea edx, [edx + 32]
michael@0 2814 sub ecx, 8
michael@0 2815 jg convertloop
michael@0 2816
michael@0 2817 pop esi
michael@0 2818 ret
michael@0 2819 }
michael@0 2820 }
michael@0 2821
michael@0 2822 // 8 pixels, dest unaligned.
michael@0 2823 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
michael@0 2824 __declspec(naked) __declspec(align(16))
michael@0 2825 void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
michael@0 2826 const uint8* uv_buf,
michael@0 2827 uint8* dst_argb,
michael@0 2828 int width) {
michael@0 2829 __asm {
michael@0 2830 push esi
michael@0 2831 mov eax, [esp + 4 + 4] // Y
michael@0 2832 mov esi, [esp + 4 + 8] // VU
michael@0 2833 mov edx, [esp + 4 + 12] // argb
michael@0 2834 mov ecx, [esp + 4 + 16] // width
michael@0 2835 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
michael@0 2836 pxor xmm4, xmm4
michael@0 2837
michael@0 2838 align 4
michael@0 2839 convertloop:
michael@0 2840 READNV12
michael@0 2841 YVUTORGB
michael@0 2842
michael@0 2843 // Step 3: Weave into ARGB
michael@0 2844 punpcklbw xmm0, xmm1 // BG
michael@0 2845 punpcklbw xmm2, xmm5 // RA
michael@0 2846 movdqa xmm1, xmm0
michael@0 2847 punpcklwd xmm0, xmm2 // BGRA first 4 pixels
michael@0 2848 punpckhwd xmm1, xmm2 // BGRA next 4 pixels
michael@0 2849 movdqu [edx], xmm0
michael@0 2850 movdqu [edx + 16], xmm1
michael@0 2851 lea edx, [edx + 32]
michael@0 2852 sub ecx, 8
michael@0 2853 jg convertloop
michael@0 2854
michael@0 2855 pop esi
michael@0 2856 ret
michael@0 2857 }
michael@0 2858 }
michael@0 2859
michael@0 2860 __declspec(naked) __declspec(align(16))
michael@0 2861 void I422ToBGRARow_SSSE3(const uint8* y_buf,
michael@0 2862 const uint8* u_buf,
michael@0 2863 const uint8* v_buf,
michael@0 2864 uint8* dst_bgra,
michael@0 2865 int width) {
michael@0 2866 __asm {
michael@0 2867 push esi
michael@0 2868 push edi
michael@0 2869 mov eax, [esp + 8 + 4] // Y
michael@0 2870 mov esi, [esp + 8 + 8] // U
michael@0 2871 mov edi, [esp + 8 + 12] // V
michael@0 2872 mov edx, [esp + 8 + 16] // bgra
michael@0 2873 mov ecx, [esp + 8 + 20] // width
michael@0 2874 sub edi, esi
michael@0 2875 pxor xmm4, xmm4
michael@0 2876
michael@0 2877 align 4
michael@0 2878 convertloop:
michael@0 2879 READYUV422
michael@0 2880 YUVTORGB
michael@0 2881
michael@0 2882 // Step 3: Weave into BGRA
michael@0 2883 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
michael@0 2884 punpcklbw xmm1, xmm0 // GB
michael@0 2885 punpcklbw xmm5, xmm2 // AR
michael@0 2886 movdqa xmm0, xmm5
michael@0 2887 punpcklwd xmm5, xmm1 // BGRA first 4 pixels
michael@0 2888 punpckhwd xmm0, xmm1 // BGRA next 4 pixels
michael@0 2889 movdqa [edx], xmm5
michael@0 2890 movdqa [edx + 16], xmm0
michael@0 2891 lea edx, [edx + 32]
michael@0 2892 sub ecx, 8
michael@0 2893 jg convertloop
michael@0 2894
michael@0 2895 pop edi
michael@0 2896 pop esi
michael@0 2897 ret
michael@0 2898 }
michael@0 2899 }
michael@0 2900
michael@0 2901 __declspec(naked) __declspec(align(16))
michael@0 2902 void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
michael@0 2903 const uint8* u_buf,
michael@0 2904 const uint8* v_buf,
michael@0 2905 uint8* dst_bgra,
michael@0 2906 int width) {
michael@0 2907 __asm {
michael@0 2908 push esi
michael@0 2909 push edi
michael@0 2910 mov eax, [esp + 8 + 4] // Y
michael@0 2911 mov esi, [esp + 8 + 8] // U
michael@0 2912 mov edi, [esp + 8 + 12] // V
michael@0 2913 mov edx, [esp + 8 + 16] // bgra
michael@0 2914 mov ecx, [esp + 8 + 20] // width
michael@0 2915 sub edi, esi
michael@0 2916 pxor xmm4, xmm4
michael@0 2917
michael@0 2918 align 4
michael@0 2919 convertloop:
michael@0 2920 READYUV422
michael@0 2921 YUVTORGB
michael@0 2922
michael@0 2923 // Step 3: Weave into BGRA
michael@0 2924 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
michael@0 2925 punpcklbw xmm1, xmm0 // GB
michael@0 2926 punpcklbw xmm5, xmm2 // AR
michael@0 2927 movdqa xmm0, xmm5
michael@0 2928 punpcklwd xmm5, xmm1 // BGRA first 4 pixels
michael@0 2929 punpckhwd xmm0, xmm1 // BGRA next 4 pixels
michael@0 2930 movdqu [edx], xmm5
michael@0 2931 movdqu [edx + 16], xmm0
michael@0 2932 lea edx, [edx + 32]
michael@0 2933 sub ecx, 8
michael@0 2934 jg convertloop
michael@0 2935
michael@0 2936 pop edi
michael@0 2937 pop esi
michael@0 2938 ret
michael@0 2939 }
michael@0 2940 }
michael@0 2941
michael@0 2942 __declspec(naked) __declspec(align(16))
michael@0 2943 void I422ToABGRRow_SSSE3(const uint8* y_buf,
michael@0 2944 const uint8* u_buf,
michael@0 2945 const uint8* v_buf,
michael@0 2946 uint8* dst_abgr,
michael@0 2947 int width) {
michael@0 2948 __asm {
michael@0 2949 push esi
michael@0 2950 push edi
michael@0 2951 mov eax, [esp + 8 + 4] // Y
michael@0 2952 mov esi, [esp + 8 + 8] // U
michael@0 2953 mov edi, [esp + 8 + 12] // V
michael@0 2954 mov edx, [esp + 8 + 16] // abgr
michael@0 2955 mov ecx, [esp + 8 + 20] // width
michael@0 2956 sub edi, esi
michael@0 2957 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
michael@0 2958 pxor xmm4, xmm4
michael@0 2959
michael@0 2960 align 4
michael@0 2961 convertloop:
michael@0 2962 READYUV422
michael@0 2963 YUVTORGB
michael@0 2964
michael@0 2965 // Step 3: Weave into ABGR
michael@0 2966 punpcklbw xmm2, xmm1 // RG
michael@0 2967 punpcklbw xmm0, xmm5 // BA
michael@0 2968 movdqa xmm1, xmm2
michael@0 2969 punpcklwd xmm2, xmm0 // RGBA first 4 pixels
michael@0 2970 punpckhwd xmm1, xmm0 // RGBA next 4 pixels
michael@0 2971 movdqa [edx], xmm2
michael@0 2972 movdqa [edx + 16], xmm1
michael@0 2973 lea edx, [edx + 32]
michael@0 2974 sub ecx, 8
michael@0 2975 jg convertloop
michael@0 2976
michael@0 2977 pop edi
michael@0 2978 pop esi
michael@0 2979 ret
michael@0 2980 }
michael@0 2981 }
michael@0 2982
michael@0 2983 __declspec(naked) __declspec(align(16))
michael@0 2984 void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
michael@0 2985 const uint8* u_buf,
michael@0 2986 const uint8* v_buf,
michael@0 2987 uint8* dst_abgr,
michael@0 2988 int width) {
michael@0 2989 __asm {
michael@0 2990 push esi
michael@0 2991 push edi
michael@0 2992 mov eax, [esp + 8 + 4] // Y
michael@0 2993 mov esi, [esp + 8 + 8] // U
michael@0 2994 mov edi, [esp + 8 + 12] // V
michael@0 2995 mov edx, [esp + 8 + 16] // abgr
michael@0 2996 mov ecx, [esp + 8 + 20] // width
michael@0 2997 sub edi, esi
michael@0 2998 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
michael@0 2999 pxor xmm4, xmm4
michael@0 3000
michael@0 3001 align 4
michael@0 3002 convertloop:
michael@0 3003 READYUV422
michael@0 3004 YUVTORGB
michael@0 3005
michael@0 3006 // Step 3: Weave into ABGR
michael@0 3007 punpcklbw xmm2, xmm1 // RG
michael@0 3008 punpcklbw xmm0, xmm5 // BA
michael@0 3009 movdqa xmm1, xmm2
michael@0 3010 punpcklwd xmm2, xmm0 // RGBA first 4 pixels
michael@0 3011 punpckhwd xmm1, xmm0 // RGBA next 4 pixels
michael@0 3012 movdqu [edx], xmm2
michael@0 3013 movdqu [edx + 16], xmm1
michael@0 3014 lea edx, [edx + 32]
michael@0 3015 sub ecx, 8
michael@0 3016 jg convertloop
michael@0 3017
michael@0 3018 pop edi
michael@0 3019 pop esi
michael@0 3020 ret
michael@0 3021 }
michael@0 3022 }
michael@0 3023
michael@0 3024 __declspec(naked) __declspec(align(16))
michael@0 3025 void I422ToRGBARow_SSSE3(const uint8* y_buf,
michael@0 3026 const uint8* u_buf,
michael@0 3027 const uint8* v_buf,
michael@0 3028 uint8* dst_rgba,
michael@0 3029 int width) {
michael@0 3030 __asm {
michael@0 3031 push esi
michael@0 3032 push edi
michael@0 3033 mov eax, [esp + 8 + 4] // Y
michael@0 3034 mov esi, [esp + 8 + 8] // U
michael@0 3035 mov edi, [esp + 8 + 12] // V
michael@0 3036 mov edx, [esp + 8 + 16] // rgba
michael@0 3037 mov ecx, [esp + 8 + 20] // width
michael@0 3038 sub edi, esi
michael@0 3039 pxor xmm4, xmm4
michael@0 3040
michael@0 3041 align 4
michael@0 3042 convertloop:
michael@0 3043 READYUV422
michael@0 3044 YUVTORGB
michael@0 3045
michael@0 3046 // Step 3: Weave into RGBA
michael@0 3047 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
michael@0 3048 punpcklbw xmm1, xmm2 // GR
michael@0 3049 punpcklbw xmm5, xmm0 // AB
michael@0 3050 movdqa xmm0, xmm5
michael@0 3051 punpcklwd xmm5, xmm1 // RGBA first 4 pixels
michael@0 3052 punpckhwd xmm0, xmm1 // RGBA next 4 pixels
michael@0 3053 movdqa [edx], xmm5
michael@0 3054 movdqa [edx + 16], xmm0
michael@0 3055 lea edx, [edx + 32]
michael@0 3056 sub ecx, 8
michael@0 3057 jg convertloop
michael@0 3058
michael@0 3059 pop edi
michael@0 3060 pop esi
michael@0 3061 ret
michael@0 3062 }
michael@0 3063 }
michael@0 3064
michael@0 3065 __declspec(naked) __declspec(align(16))
michael@0 3066 void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
michael@0 3067 const uint8* u_buf,
michael@0 3068 const uint8* v_buf,
michael@0 3069 uint8* dst_rgba,
michael@0 3070 int width) {
michael@0 3071 __asm {
michael@0 3072 push esi
michael@0 3073 push edi
michael@0 3074 mov eax, [esp + 8 + 4] // Y
michael@0 3075 mov esi, [esp + 8 + 8] // U
michael@0 3076 mov edi, [esp + 8 + 12] // V
michael@0 3077 mov edx, [esp + 8 + 16] // rgba
michael@0 3078 mov ecx, [esp + 8 + 20] // width
michael@0 3079 sub edi, esi
michael@0 3080 pxor xmm4, xmm4
michael@0 3081
michael@0 3082 align 4
michael@0 3083 convertloop:
michael@0 3084 READYUV422
michael@0 3085 YUVTORGB
michael@0 3086
michael@0 3087 // Step 3: Weave into RGBA
michael@0 3088 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
michael@0 3089 punpcklbw xmm1, xmm2 // GR
michael@0 3090 punpcklbw xmm5, xmm0 // AB
michael@0 3091 movdqa xmm0, xmm5
michael@0 3092 punpcklwd xmm5, xmm1 // RGBA first 4 pixels
michael@0 3093 punpckhwd xmm0, xmm1 // RGBA next 4 pixels
michael@0 3094 movdqu [edx], xmm5
michael@0 3095 movdqu [edx + 16], xmm0
michael@0 3096 lea edx, [edx + 32]
michael@0 3097 sub ecx, 8
michael@0 3098 jg convertloop
michael@0 3099
michael@0 3100 pop edi
michael@0 3101 pop esi
michael@0 3102 ret
michael@0 3103 }
michael@0 3104 }
michael@0 3105
michael@0 3106 #endif // HAS_I422TOARGBROW_SSSE3
michael@0 3107
michael@0 3108 #ifdef HAS_YTOARGBROW_SSE2
michael@0 3109 __declspec(naked) __declspec(align(16))
michael@0 3110 void YToARGBRow_SSE2(const uint8* y_buf,
michael@0 3111 uint8* rgb_buf,
michael@0 3112 int width) {
michael@0 3113 __asm {
michael@0 3114 pxor xmm5, xmm5
michael@0 3115 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
michael@0 3116 pslld xmm4, 24
michael@0 3117 mov eax, 0x00100010 // 16 bias duplicated into both words
michael@0 3118 movd xmm3, eax
michael@0 3119 pshufd xmm3, xmm3, 0
michael@0 3120 mov eax, 0x004a004a // 74 = round(1.164 * 64), 6 bit fixed point
michael@0 3121 movd xmm2, eax
michael@0 3122 pshufd xmm2, xmm2, 0
michael@0 3123 mov eax, [esp + 4] // Y
michael@0 3124 mov edx, [esp + 8] // rgb
michael@0 3125 mov ecx, [esp + 12] // width
michael@0 3126
michael@0 3127 align 4
michael@0 3128 convertloop:
michael@0 3129 // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
michael@0 3130 movq xmm0, qword ptr [eax]
michael@0 3131 lea eax, [eax + 8]
michael@0 3132 punpcklbw xmm0, xmm5 // 0.Y
michael@0 3133 psubusw xmm0, xmm3
michael@0 3134 pmullw xmm0, xmm2
michael@0 3135 psrlw xmm0, 6
michael@0 3136 packuswb xmm0, xmm0 // G
michael@0 3137
michael@0 3138 // Step 2: Weave into ARGB
michael@0 3139 punpcklbw xmm0, xmm0 // GG
michael@0 3140 movdqa xmm1, xmm0
michael@0 3141 punpcklwd xmm0, xmm0 // BGRA first 4 pixels
michael@0 3142 punpckhwd xmm1, xmm1 // BGRA next 4 pixels
michael@0 3143 por xmm0, xmm4
michael@0 3144 por xmm1, xmm4
michael@0 3145 movdqa [edx], xmm0
michael@0 3146 movdqa [edx + 16], xmm1
michael@0 3147 lea edx, [edx + 32]
michael@0 3148 sub ecx, 8
michael@0 3149 jg convertloop
michael@0 3150
michael@0 3151 ret
michael@0 3152 }
michael@0 3153 }
michael@0 3154 #endif // HAS_YTOARGBROW_SSE2
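
// A scalar sketch of the fixed point math above (illustrative only; the
// name and placement are not part of libyuv): (y - 16) * 74 >> 6
// approximates (y - 16) * 1.164, the BT.601 expansion of Y from the studio
// range [16..235] to the full range [0..255].
static void YToARGBRow_C_Sketch(const uint8* y_buf, uint8* rgb_buf,
                                int width) {
  int x;
  for (x = 0; x < width; ++x) {
    int y = y_buf[x] - 16;
    if (y < 0) y = 0;               // psubusw saturates at zero.
    y = (y * 74) >> 6;              // * 1.164 in 6 bit fixed point.
    if (y > 255) y = 255;           // packuswb saturates at 255.
    rgb_buf[x * 4 + 0] = (uint8)y;  // B
    rgb_buf[x * 4 + 1] = (uint8)y;  // G
    rgb_buf[x * 4 + 2] = (uint8)y;  // R
    rgb_buf[x * 4 + 3] = 255;       // A
  }
}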
michael@0 3155
michael@0 3156 #ifdef HAS_MIRRORROW_SSSE3
michael@0 3157 // Shuffle table for reversing the bytes.
michael@0 3158 static const uvec8 kShuffleMirror = {
michael@0 3159 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
michael@0 3160 };
michael@0 3161
michael@0 3162 __declspec(naked) __declspec(align(16))
michael@0 3163 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
michael@0 3164 __asm {
michael@0 3165 mov eax, [esp + 4] // src
michael@0 3166 mov edx, [esp + 8] // dst
michael@0 3167 mov ecx, [esp + 12] // width
michael@0 3168 movdqa xmm5, kShuffleMirror
michael@0 3169 lea eax, [eax - 16]
michael@0 3170
michael@0 3171 align 4
michael@0 3172 convertloop:
michael@0 3173 movdqa xmm0, [eax + ecx]
michael@0 3174 pshufb xmm0, xmm5
michael@0 3175 sub ecx, 16
michael@0 3176 movdqa [edx], xmm0
michael@0 3177 lea edx, [edx + 16]
michael@0 3178 jg convertloop
michael@0 3179 ret
michael@0 3180 }
michael@0 3181 }
michael@0 3182 #endif // HAS_MIRRORROW_SSSE3
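
// The kernel above reverses 16 bytes per iteration with one pshufb against
// the descending index table, while [eax + ecx] walks the source backwards
// from its end. A scalar sketch of the row semantics (illustrative name):
static void MirrorRow_C_Sketch(const uint8* src, uint8* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];  // last source byte becomes first dest byte.
  }
}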
michael@0 3183
michael@0 3184 #ifdef HAS_MIRRORROW_AVX2
michael@0 3185 // Shuffle table for reversing the bytes.
michael@0 3186 static const ulvec8 kShuffleMirror_AVX2 = {
michael@0 3187 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u,
michael@0 3188 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
michael@0 3189 };
michael@0 3190
michael@0 3191 __declspec(naked) __declspec(align(16))
michael@0 3192 void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
michael@0 3193 __asm {
michael@0 3194 mov eax, [esp + 4] // src
michael@0 3195 mov edx, [esp + 8] // dst
michael@0 3196 mov ecx, [esp + 12] // width
michael@0 3197 vmovdqa ymm5, kShuffleMirror_AVX2
michael@0 3198 lea eax, [eax - 32]
michael@0 3199
michael@0 3200 align 4
michael@0 3201 convertloop:
michael@0 3202 vmovdqu ymm0, [eax + ecx]
michael@0 3203 vpshufb ymm0, ymm0, ymm5
michael@0 3204 vpermq ymm0, ymm0, 0x4e // swap high and low halves
michael@0 3205 sub ecx, 32
michael@0 3206 vmovdqu [edx], ymm0
michael@0 3207 lea edx, [edx + 32]
michael@0 3208 jg convertloop
michael@0 3209 vzeroupper
michael@0 3210 ret
michael@0 3211 }
michael@0 3212 }
michael@0 3213 #endif // HAS_MIRRORROW_AVX2
michael@0 3214
michael@0 3215 #ifdef HAS_MIRRORROW_SSE2
michael@0 3216 // The SSE2 version uses movdqu, so it can be used on unaligned buffers when
michael@0 3217 // the SSSE3 version cannot.
michael@0 3218 __declspec(naked) __declspec(align(16))
michael@0 3219 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
michael@0 3220 __asm {
michael@0 3221 mov eax, [esp + 4] // src
michael@0 3222 mov edx, [esp + 8] // dst
michael@0 3223 mov ecx, [esp + 12] // width
michael@0 3224 lea eax, [eax - 16]
michael@0 3225
michael@0 3226 align 4
michael@0 3227 convertloop:
michael@0 3228 movdqu xmm0, [eax + ecx]
michael@0 3229 movdqa xmm1, xmm0 // swap bytes
michael@0 3230 psllw xmm0, 8
michael@0 3231 psrlw xmm1, 8
michael@0 3232 por xmm0, xmm1
michael@0 3233 pshuflw xmm0, xmm0, 0x1b // swap words
michael@0 3234 pshufhw xmm0, xmm0, 0x1b
michael@0 3235 pshufd xmm0, xmm0, 0x4e // swap qwords
michael@0 3236 sub ecx, 16
michael@0 3237 movdqu [edx], xmm0
michael@0 3238 lea edx, [edx + 16]
michael@0 3239 jg convertloop
michael@0 3240 ret
michael@0 3241 }
michael@0 3242 }
michael@0 3243 #endif // HAS_MIRRORROW_SSE2
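
// The SSE2 mirror composes a 16 byte reversal from three partial swaps:
// psllw/psrlw/por swap bytes within words, pshuflw/pshufhw (0x1b) reverse
// the words within each qword, and pshufd (0x4e) swaps the qwords. The same
// idea on a 32 bit value (an illustrative sketch, not used by libyuv):
static uint32 Reverse4Bytes_Sketch(uint32 v) {
  v = ((v & 0x00ff00ffu) << 8) | ((v >> 8) & 0x00ff00ffu);  // swap bytes.
  v = (v << 16) | (v >> 16);                                // swap words.
  return v;  // 0xAABBCCDD -> 0xDDCCBBAA
}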
michael@0 3244
michael@0 3245 #ifdef HAS_MIRRORROW_UV_SSSE3
michael@0 3246 // Shuffle table for reversing the bytes of UV channels.
michael@0 3247 static const uvec8 kShuffleMirrorUV = {
michael@0 3248 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
michael@0 3249 };
michael@0 3250
michael@0 3251 __declspec(naked) __declspec(align(16))
michael@0 3252 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
michael@0 3253 int width) {
michael@0 3254 __asm {
michael@0 3255 push edi
michael@0 3256 mov eax, [esp + 4 + 4] // src
michael@0 3257 mov edx, [esp + 4 + 8] // dst_u
michael@0 3258 mov edi, [esp + 4 + 12] // dst_v
michael@0 3259 mov ecx, [esp + 4 + 16] // width
michael@0 3260 movdqa xmm1, kShuffleMirrorUV
michael@0 3261 lea eax, [eax + ecx * 2 - 16]
michael@0 3262 sub edi, edx
michael@0 3263
michael@0 3264 align 4
michael@0 3265 convertloop:
michael@0 3266 movdqa xmm0, [eax]
michael@0 3267 lea eax, [eax - 16]
michael@0 3268 pshufb xmm0, xmm1
michael@0 3269 sub ecx, 8
michael@0 3270 movlpd qword ptr [edx], xmm0
michael@0 3271 movhpd qword ptr [edx + edi], xmm0
michael@0 3272 lea edx, [edx + 8]
michael@0 3273 jg convertloop
michael@0 3274
michael@0 3275 pop edi
michael@0 3276 ret
michael@0 3277 }
michael@0 3278 }
michael@0 3279 #endif // HAS_MIRRORROW_UV_SSSE3
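
// kShuffleMirrorUV both reverses and deinterleaves: the low qword collects
// the even (U) bytes in reverse order and the high qword the odd (V) bytes,
// so movlpd/movhpd can store the two planes directly. Scalar sketch of the
// row semantics (illustrative name; width counts UV pairs):
static void MirrorUVRow_C_Sketch(const uint8* src_uv, uint8* dst_u,
                                 uint8* dst_v, int width) {
  int x;
  src_uv += (width - 1) * 2;  // start at the last UV pair.
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[0];
    dst_v[x] = src_uv[1];
    src_uv -= 2;
  }
}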
michael@0 3280
michael@0 3281 #ifdef HAS_ARGBMIRRORROW_SSSE3
michael@0 3282 // Shuffle table for reversing the pixels (groups of 4 bytes).
michael@0 3283 static const uvec8 kARGBShuffleMirror = {
michael@0 3284 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
michael@0 3285 };
michael@0 3286
michael@0 3287 __declspec(naked) __declspec(align(16))
michael@0 3288 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
michael@0 3289 __asm {
michael@0 3290 mov eax, [esp + 4] // src
michael@0 3291 mov edx, [esp + 8] // dst
michael@0 3292 mov ecx, [esp + 12] // width
michael@0 3293 lea eax, [eax - 16 + ecx * 4] // last 4 pixels.
michael@0 3294 movdqa xmm5, kARGBShuffleMirror
michael@0 3295
michael@0 3296 align 4
michael@0 3297 convertloop:
michael@0 3298 movdqa xmm0, [eax]
michael@0 3299 lea eax, [eax - 16]
michael@0 3300 pshufb xmm0, xmm5
michael@0 3301 sub ecx, 4
michael@0 3302 movdqa [edx], xmm0
michael@0 3303 lea edx, [edx + 16]
michael@0 3304 jg convertloop
michael@0 3305 ret
michael@0 3306 }
michael@0 3307 }
michael@0 3308 #endif // HAS_ARGBMIRRORROW_SSSE3
michael@0 3309
michael@0 3310 #ifdef HAS_ARGBMIRRORROW_AVX2
michael@0 3311 // Permute table (dword indices) for reversing the pixels.
michael@0 3312 static const ulvec32 kARGBShuffleMirror_AVX2 = {
michael@0 3313 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
michael@0 3314 };
michael@0 3315
michael@0 3316 __declspec(naked) __declspec(align(16))
michael@0 3317 void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
michael@0 3318 __asm {
michael@0 3319 mov eax, [esp + 4] // src
michael@0 3320 mov edx, [esp + 8] // dst
michael@0 3321 mov ecx, [esp + 12] // width
michael@0 3322 lea eax, [eax - 32]
michael@0 3323 vmovdqa ymm5, kARGBShuffleMirror_AVX2
michael@0 3324
michael@0 3325 align 4
michael@0 3326 convertloop:
michael@0 3327 vpermd ymm0, ymm5, [eax + ecx * 4] // permute dword order
michael@0 3328 sub ecx, 8
michael@0 3329 vmovdqu [edx], ymm0
michael@0 3330 lea edx, [edx + 32]
michael@0 3331 jg convertloop
michael@0 3332 vzeroupper
michael@0 3333 ret
michael@0 3334 }
michael@0 3335 }
michael@0 3336 #endif // HAS_ARGBMIRRORROW_AVX2
michael@0 3337
michael@0 3338 #ifdef HAS_SPLITUVROW_SSE2
michael@0 3339 __declspec(naked) __declspec(align(16))
michael@0 3340 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
michael@0 3341 __asm {
michael@0 3342 push edi
michael@0 3343 mov eax, [esp + 4 + 4] // src_uv
michael@0 3344 mov edx, [esp + 4 + 8] // dst_u
michael@0 3345 mov edi, [esp + 4 + 12] // dst_v
michael@0 3346 mov ecx, [esp + 4 + 16] // pix
michael@0 3347 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
michael@0 3348 psrlw xmm5, 8
michael@0 3349 sub edi, edx
michael@0 3350
michael@0 3351 align 4
michael@0 3352 convertloop:
michael@0 3353 movdqa xmm0, [eax]
michael@0 3354 movdqa xmm1, [eax + 16]
michael@0 3355 lea eax, [eax + 32]
michael@0 3356 movdqa xmm2, xmm0
michael@0 3357 movdqa xmm3, xmm1
michael@0 3358 pand xmm0, xmm5 // even bytes
michael@0 3359 pand xmm1, xmm5
michael@0 3360 packuswb xmm0, xmm1
michael@0 3361 psrlw xmm2, 8 // odd bytes
michael@0 3362 psrlw xmm3, 8
michael@0 3363 packuswb xmm2, xmm3
michael@0 3364 movdqa [edx], xmm0
michael@0 3365 movdqa [edx + edi], xmm2
michael@0 3366 lea edx, [edx + 16]
michael@0 3367 sub ecx, 16
michael@0 3368 jg convertloop
michael@0 3369
michael@0 3370 pop edi
michael@0 3371 ret
michael@0 3372 }
michael@0 3373 }
michael@0 3374
michael@0 3375 __declspec(naked) __declspec(align(16))
michael@0 3376 void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
michael@0 3377 int pix) {
michael@0 3378 __asm {
michael@0 3379 push edi
michael@0 3380 mov eax, [esp + 4 + 4] // src_uv
michael@0 3381 mov edx, [esp + 4 + 8] // dst_u
michael@0 3382 mov edi, [esp + 4 + 12] // dst_v
michael@0 3383 mov ecx, [esp + 4 + 16] // pix
michael@0 3384 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
michael@0 3385 psrlw xmm5, 8
michael@0 3386 sub edi, edx
michael@0 3387
michael@0 3388 align 4
michael@0 3389 convertloop:
michael@0 3390 movdqu xmm0, [eax]
michael@0 3391 movdqu xmm1, [eax + 16]
michael@0 3392 lea eax, [eax + 32]
michael@0 3393 movdqa xmm2, xmm0
michael@0 3394 movdqa xmm3, xmm1
michael@0 3395 pand xmm0, xmm5 // even bytes
michael@0 3396 pand xmm1, xmm5
michael@0 3397 packuswb xmm0, xmm1
michael@0 3398 psrlw xmm2, 8 // odd bytes
michael@0 3399 psrlw xmm3, 8
michael@0 3400 packuswb xmm2, xmm3
michael@0 3401 movdqu [edx], xmm0
michael@0 3402 movdqu [edx + edi], xmm2
michael@0 3403 lea edx, [edx + 16]
michael@0 3404 sub ecx, 16
michael@0 3405 jg convertloop
michael@0 3406
michael@0 3407 pop edi
michael@0 3408 ret
michael@0 3409 }
michael@0 3410 }
michael@0 3411 #endif // HAS_SPLITUVROW_SSE2
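
// All SplitUVRow variants use the same mask/shift/pack deinterleave: pand
// with 0x00ff00ff keeps the even (U) bytes as words, psrlw 8 moves the odd
// (V) bytes down, and packuswb narrows both streams back to bytes. Scalar
// sketch of the row semantics (illustrative name):
static void SplitUVRow_C_Sketch(const uint8* src_uv, uint8* dst_u,
                                uint8* dst_v, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_u[x] = src_uv[x * 2 + 0];  // even bytes are U.
    dst_v[x] = src_uv[x * 2 + 1];  // odd bytes are V.
  }
}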
michael@0 3412
michael@0 3413 #ifdef HAS_SPLITUVROW_AVX2
michael@0 3414 __declspec(naked) __declspec(align(16))
michael@0 3415 void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
michael@0 3416 __asm {
michael@0 3417 push edi
michael@0 3418 mov eax, [esp + 4 + 4] // src_uv
michael@0 3419 mov edx, [esp + 4 + 8] // dst_u
michael@0 3420 mov edi, [esp + 4 + 12] // dst_v
michael@0 3421 mov ecx, [esp + 4 + 16] // pix
michael@0 3422 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
michael@0 3423 vpsrlw ymm5, ymm5, 8
michael@0 3424 sub edi, edx
michael@0 3425
michael@0 3426 align 4
michael@0 3427 convertloop:
michael@0 3428 vmovdqu ymm0, [eax]
michael@0 3429 vmovdqu ymm1, [eax + 32]
michael@0 3430 lea eax, [eax + 64]
michael@0 3431 vpsrlw ymm2, ymm0, 8 // odd bytes
michael@0 3432 vpsrlw ymm3, ymm1, 8
michael@0 3433 vpand ymm0, ymm0, ymm5 // even bytes
michael@0 3434 vpand ymm1, ymm1, ymm5
michael@0 3435 vpackuswb ymm0, ymm0, ymm1
michael@0 3436 vpackuswb ymm2, ymm2, ymm3
michael@0 3437 vpermq ymm0, ymm0, 0xd8
michael@0 3438 vpermq ymm2, ymm2, 0xd8
michael@0 3439 vmovdqu [edx], ymm0
michael@0 3440 vmovdqu [edx + edi], ymm2
michael@0 3441 lea edx, [edx + 32]
michael@0 3442 sub ecx, 32
michael@0 3443 jg convertloop
michael@0 3444
michael@0 3445 pop edi
michael@0 3446 vzeroupper
michael@0 3447 ret
michael@0 3448 }
michael@0 3449 }
michael@0 3450 #endif // HAS_SPLITUVROW_AVX2
michael@0 3451
michael@0 3452 #ifdef HAS_MERGEUVROW_SSE2
michael@0 3453 __declspec(naked) __declspec(align(16))
michael@0 3454 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
michael@0 3455 int width) {
michael@0 3456 __asm {
michael@0 3457 push edi
michael@0 3458 mov eax, [esp + 4 + 4] // src_u
michael@0 3459 mov edx, [esp + 4 + 8] // src_v
michael@0 3460 mov edi, [esp + 4 + 12] // dst_uv
michael@0 3461 mov ecx, [esp + 4 + 16] // width
michael@0 3462 sub edx, eax
michael@0 3463
michael@0 3464 align 4
michael@0 3465 convertloop:
michael@0 3466 movdqa xmm0, [eax] // read 16 U's
michael@0 3467 movdqa xmm1, [eax + edx] // and 16 V's
michael@0 3468 lea eax, [eax + 16]
michael@0 3469 movdqa xmm2, xmm0
michael@0 3470 punpcklbw xmm0, xmm1 // first 8 UV pairs
michael@0 3471 punpckhbw xmm2, xmm1 // next 8 UV pairs
michael@0 3472 movdqa [edi], xmm0
michael@0 3473 movdqa [edi + 16], xmm2
michael@0 3474 lea edi, [edi + 32]
michael@0 3475 sub ecx, 16
michael@0 3476 jg convertloop
michael@0 3477
michael@0 3478 pop edi
michael@0 3479 ret
michael@0 3480 }
michael@0 3481 }
michael@0 3482
michael@0 3483 __declspec(naked) __declspec(align(16))
michael@0 3484 void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
michael@0 3485 uint8* dst_uv, int width) {
michael@0 3486 __asm {
michael@0 3487 push edi
michael@0 3488 mov eax, [esp + 4 + 4] // src_u
michael@0 3489 mov edx, [esp + 4 + 8] // src_v
michael@0 3490 mov edi, [esp + 4 + 12] // dst_uv
michael@0 3491 mov ecx, [esp + 4 + 16] // width
michael@0 3492 sub edx, eax
michael@0 3493
michael@0 3494 align 4
michael@0 3495 convertloop:
michael@0 3496 movdqu xmm0, [eax] // read 16 U's
michael@0 3497 movdqu xmm1, [eax + edx] // and 16 V's
michael@0 3498 lea eax, [eax + 16]
michael@0 3499 movdqa xmm2, xmm0
michael@0 3500 punpcklbw xmm0, xmm1 // first 8 UV pairs
michael@0 3501 punpckhbw xmm2, xmm1 // next 8 UV pairs
michael@0 3502 movdqu [edi], xmm0
michael@0 3503 movdqu [edi + 16], xmm2
michael@0 3504 lea edi, [edi + 32]
michael@0 3505 sub ecx, 16
michael@0 3506 jg convertloop
michael@0 3507
michael@0 3508 pop edi
michael@0 3509 ret
michael@0 3510 }
michael@0 3511 }
michael@0 3512 #endif // HAS_MERGEUVROW_SSE2
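
// MergeUVRow is the inverse of SplitUVRow: punpcklbw/punpckhbw interleave
// 16 U bytes with 16 V bytes into 16 UV pairs per iteration. Scalar sketch
// (illustrative name):
static void MergeUVRow_C_Sketch(const uint8* src_u, const uint8* src_v,
                                uint8* dst_uv, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[x * 2 + 0] = src_u[x];
    dst_uv[x * 2 + 1] = src_v[x];
  }
}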
michael@0 3513
michael@0 3514 #ifdef HAS_MERGEUVROW_AVX2
michael@0 3515 __declspec(naked) __declspec(align(16))
michael@0 3516 void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
michael@0 3517 int width) {
michael@0 3518 __asm {
michael@0 3519 push edi
michael@0 3520 mov eax, [esp + 4 + 4] // src_u
michael@0 3521 mov edx, [esp + 4 + 8] // src_v
michael@0 3522 mov edi, [esp + 4 + 12] // dst_uv
michael@0 3523 mov ecx, [esp + 4 + 16] // width
michael@0 3524 sub edx, eax
michael@0 3525
michael@0 3526 align 4
michael@0 3527 convertloop:
michael@0 3528 vmovdqu ymm0, [eax] // read 32 U's
michael@0 3529 vmovdqu ymm1, [eax + edx] // and 32 V's
michael@0 3530 lea eax, [eax + 32]
michael@0 3531 vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2
michael@0 3532 vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3
michael@0 3533 vperm2i128 ymm1, ymm2, ymm0, 0x20 // low 128 of ymm2 and low 128 of ymm0
michael@0 3534 vperm2i128 ymm2, ymm2, ymm0, 0x31 // high 128 of ymm2 and high 128 of ymm0
michael@0 3535 vmovdqu [edi], ymm1
michael@0 3536 vmovdqu [edi + 32], ymm2
michael@0 3537 lea edi, [edi + 64]
michael@0 3538 sub ecx, 32
michael@0 3539 jg convertloop
michael@0 3540
michael@0 3541 pop edi
michael@0 3542 vzeroupper
michael@0 3543 ret
michael@0 3544 }
michael@0 3545 }
michael@0 3546 #endif // HAS_MERGEUVROW_AVX2
michael@0 3547
michael@0 3548 #ifdef HAS_COPYROW_SSE2
michael@0 3549 // CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
michael@0 3550 __declspec(naked) __declspec(align(16))
michael@0 3551 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
michael@0 3552 __asm {
michael@0 3553 mov eax, [esp + 4] // src
michael@0 3554 mov edx, [esp + 8] // dst
michael@0 3555 mov ecx, [esp + 12] // count
michael@0 3556
michael@0 3557 align 4
michael@0 3558 convertloop:
michael@0 3559 movdqa xmm0, [eax]
michael@0 3560 movdqa xmm1, [eax + 16]
michael@0 3561 lea eax, [eax + 32]
michael@0 3562 movdqa [edx], xmm0
michael@0 3563 movdqa [edx + 16], xmm1
michael@0 3564 lea edx, [edx + 32]
michael@0 3565 sub ecx, 32
michael@0 3566 jg convertloop
michael@0 3567 ret
michael@0 3568 }
michael@0 3569 }
michael@0 3570 #endif // HAS_COPYROW_SSE2
michael@0 3571
michael@0 3572 // CopyRow_ERMS handles any alignment and any byte count (multiple of 1).
michael@0 3573 __declspec(naked) __declspec(align(16))
michael@0 3574 void CopyRow_ERMS(const uint8* src, uint8* dst, int count) {
michael@0 3575 __asm {
michael@0 3576 mov eax, esi // save esi
michael@0 3577 mov edx, edi // save edi
michael@0 3578 mov esi, [esp + 4] // src
michael@0 3579 mov edi, [esp + 8] // dst
michael@0 3580 mov ecx, [esp + 12] // count
michael@0 3581 rep movsb
michael@0 3582 mov edi, edx
michael@0 3583 mov esi, eax
michael@0 3584 ret
michael@0 3585 }
michael@0 3586 }
michael@0 3587
michael@0 3588 #ifdef HAS_COPYROW_X86
michael@0 3589 __declspec(naked) __declspec(align(16))
michael@0 3590 void CopyRow_X86(const uint8* src, uint8* dst, int count) {
michael@0 3591 __asm {
michael@0 3592 mov eax, esi // save esi
michael@0 3593 mov edx, edi // save edi
michael@0 3594 mov esi, [esp + 4] // src
michael@0 3595 mov edi, [esp + 8] // dst
michael@0 3596 mov ecx, [esp + 12] // count
michael@0 3597 shr ecx, 2
michael@0 3598 rep movsd
michael@0 3599 mov edi, edx
michael@0 3600 mov esi, eax
michael@0 3601 ret
michael@0 3602 }
michael@0 3603 }
michael@0 3604 #endif // HAS_COPYROW_X86
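
// Note on the two rep string copies above: CopyRow_ERMS leans on the
// enhanced rep movsb (ERMS) fast string path of recent CPUs and handles any
// count and alignment, while CopyRow_X86 copies count / 4 dwords (shr ecx, 2
// drops the remainder), so callers are expected to pass a count that is a
// multiple of 4.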
michael@0 3605
michael@0 3606 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
michael@0 3607 // width in pixels
michael@0 3608 __declspec(naked) __declspec(align(16))
michael@0 3609 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
michael@0 3610 __asm {
michael@0 3611 mov eax, [esp + 4] // src
michael@0 3612 mov edx, [esp + 8] // dst
michael@0 3613 mov ecx, [esp + 12] // width
michael@0 3614 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
michael@0 3615 pslld xmm0, 24
michael@0 3616 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
michael@0 3617 psrld xmm1, 8
michael@0 3618
michael@0 3619 align 4
michael@0 3620 convertloop:
michael@0 3621 movdqa xmm2, [eax]
michael@0 3622 movdqa xmm3, [eax + 16]
michael@0 3623 lea eax, [eax + 32]
michael@0 3624 movdqa xmm4, [edx]
michael@0 3625 movdqa xmm5, [edx + 16]
michael@0 3626 pand xmm2, xmm0
michael@0 3627 pand xmm3, xmm0
michael@0 3628 pand xmm4, xmm1
michael@0 3629 pand xmm5, xmm1
michael@0 3630 por xmm2, xmm4
michael@0 3631 por xmm3, xmm5
michael@0 3632 movdqa [edx], xmm2
michael@0 3633 movdqa [edx + 16], xmm3
michael@0 3634 lea edx, [edx + 32]
michael@0 3635 sub ecx, 8
michael@0 3636 jg convertloop
michael@0 3637
michael@0 3638 ret
michael@0 3639 }
michael@0 3640 }
michael@0 3641 #endif // HAS_ARGBCOPYALPHAROW_SSE2
michael@0 3642
michael@0 3643 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
michael@0 3644 // width in pixels
michael@0 3645 __declspec(naked) __declspec(align(16))
michael@0 3646 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
michael@0 3647 __asm {
michael@0 3648 mov eax, [esp + 4] // src
michael@0 3649 mov edx, [esp + 8] // dst
michael@0 3650 mov ecx, [esp + 12] // width
michael@0 3651 vpcmpeqb ymm0, ymm0, ymm0
michael@0 3652 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
michael@0 3653
michael@0 3654 align 4
michael@0 3655 convertloop:
michael@0 3656 vmovdqu ymm1, [eax]
michael@0 3657 vmovdqu ymm2, [eax + 32]
michael@0 3658 lea eax, [eax + 64]
michael@0 3659 vpblendvb ymm1, ymm1, [edx], ymm0
michael@0 3660 vpblendvb ymm2, ymm2, [edx + 32], ymm0
michael@0 3661 vmovdqu [edx], ymm1
michael@0 3662 vmovdqu [edx + 32], ymm2
michael@0 3663 lea edx, [edx + 64]
michael@0 3664 sub ecx, 16
michael@0 3665 jg convertloop
michael@0 3666
michael@0 3667 vzeroupper
michael@0 3668 ret
michael@0 3669 }
michael@0 3670 }
michael@0 3671 #endif // HAS_ARGBCOPYALPHAROW_AVX2
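
// Both ARGBCopyAlphaRow variants implement a masked merge: keep the alpha
// byte (mask 0xff000000) from the source and the color bytes (0x00ffffff)
// from the destination. Scalar sketch (illustrative name):
static void ARGBCopyAlphaRow_C_Sketch(const uint8* src, uint8* dst,
                                      int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x * 4 + 3] = src[x * 4 + 3];  // copy A; leave B, G and R untouched.
  }
}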
michael@0 3672
michael@0 3673 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
michael@0 3674 // width in pixels
michael@0 3675 __declspec(naked) __declspec(align(16))
michael@0 3676 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
michael@0 3677 __asm {
michael@0 3678 mov eax, [esp + 4] // src
michael@0 3679 mov edx, [esp + 8] // dst
michael@0 3680 mov ecx, [esp + 12] // width
michael@0 3681 pcmpeqb xmm0, xmm0 // generate mask 0xff000000
michael@0 3682 pslld xmm0, 24
michael@0 3683 pcmpeqb xmm1, xmm1 // generate mask 0x00ffffff
michael@0 3684 psrld xmm1, 8
michael@0 3685
michael@0 3686 align 4
michael@0 3687 convertloop:
michael@0 3688 movq xmm2, qword ptr [eax] // 8 Y's
michael@0 3689 lea eax, [eax + 8]
michael@0 3690 punpcklbw xmm2, xmm2
michael@0 3691 punpckhwd xmm3, xmm2 // high 4 Y's into high words; stale low words are masked off below
michael@0 3692 punpcklwd xmm2, xmm2
michael@0 3693 movdqa xmm4, [edx]
michael@0 3694 movdqa xmm5, [edx + 16]
michael@0 3695 pand xmm2, xmm0
michael@0 3696 pand xmm3, xmm0
michael@0 3697 pand xmm4, xmm1
michael@0 3698 pand xmm5, xmm1
michael@0 3699 por xmm2, xmm4
michael@0 3700 por xmm3, xmm5
michael@0 3701 movdqa [edx], xmm2
michael@0 3702 movdqa [edx + 16], xmm3
michael@0 3703 lea edx, [edx + 32]
michael@0 3704 sub ecx, 8
michael@0 3705 jg convertloop
michael@0 3706
michael@0 3707 ret
michael@0 3708 }
michael@0 3709 }
michael@0 3710 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
michael@0 3711
michael@0 3712 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
michael@0 3713 // width in pixels
michael@0 3714 __declspec(naked) __declspec(align(16))
michael@0 3715 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
michael@0 3716 __asm {
michael@0 3717 mov eax, [esp + 4] // src
michael@0 3718 mov edx, [esp + 8] // dst
michael@0 3719 mov ecx, [esp + 12] // width
michael@0 3720 vpcmpeqb ymm0, ymm0, ymm0
michael@0 3721 vpsrld ymm0, ymm0, 8 // generate mask 0x00ffffff
michael@0 3722
michael@0 3723 align 4
michael@0 3724 convertloop:
michael@0 3725 vpmovzxbd ymm1, qword ptr [eax]
michael@0 3726 vpmovzxbd ymm2, qword ptr [eax + 8]
michael@0 3727 lea eax, [eax + 16]
michael@0 3728 vpslld ymm1, ymm1, 24
michael@0 3729 vpslld ymm2, ymm2, 24
michael@0 3730 vpblendvb ymm1, ymm1, [edx], ymm0
michael@0 3731 vpblendvb ymm2, ymm2, [edx + 32], ymm0
michael@0 3732 vmovdqu [edx], ymm1
michael@0 3733 vmovdqu [edx + 32], ymm2
michael@0 3734 lea edx, [edx + 64]
michael@0 3735 sub ecx, 16
michael@0 3736 jg convertloop
michael@0 3737
michael@0 3738 vzeroupper
michael@0 3739 ret
michael@0 3740 }
michael@0 3741 }
michael@0 3742 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
michael@0 3743
michael@0 3744 #ifdef HAS_SETROW_X86
michael@0 3745 // SetRow_X86 writes 'count' bytes using a repeated 32 bit value; count / 4 dwords are stored.
michael@0 3746 __declspec(naked) __declspec(align(16))
michael@0 3747 void SetRow_X86(uint8* dst, uint32 v32, int count) {
michael@0 3748 __asm {
michael@0 3749 mov edx, edi // save edi
michael@0 3750 mov edi, [esp + 4] // dst
michael@0 3751 mov eax, [esp + 8] // v32
michael@0 3752 mov ecx, [esp + 12] // count
michael@0 3753 shr ecx, 2
michael@0 3754 rep stosd
michael@0 3755 mov edi, edx
michael@0 3756 ret
michael@0 3757 }
michael@0 3758 }
michael@0 3759
michael@0 3760 // ARGBSetRows_X86 writes 'width' dwords per row using a repeated 32 bit value, for 'height' rows.
michael@0 3761 __declspec(naked) __declspec(align(16))
michael@0 3762 void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
michael@0 3763 int dst_stride, int height) {
michael@0 3764 __asm {
michael@0 3765 push esi
michael@0 3766 push edi
michael@0 3767 push ebp
michael@0 3768 mov edi, [esp + 12 + 4] // dst
michael@0 3769 mov eax, [esp + 12 + 8] // v32
michael@0 3770 mov ebp, [esp + 12 + 12] // width
michael@0 3771 mov edx, [esp + 12 + 16] // dst_stride
michael@0 3772 mov esi, [esp + 12 + 20] // height
michael@0 3773 lea ecx, [ebp * 4]
michael@0 3774 sub edx, ecx // stride - width * 4
michael@0 3775
michael@0 3776 align 4
michael@0 3777 convertloop:
michael@0 3778 mov ecx, ebp
michael@0 3779 rep stosd
michael@0 3780 add edi, edx
michael@0 3781 sub esi, 1
michael@0 3782 jg convertloop
michael@0 3783
michael@0 3784 pop ebp
michael@0 3785 pop edi
michael@0 3786 pop esi
michael@0 3787 ret
michael@0 3788 }
michael@0 3789 }
michael@0 3790 #endif // HAS_SETROW_X86
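
// ARGBSetRows_X86 walks a rectangle: after rep stosd fills 'width' dwords,
// edi has advanced width * 4 bytes, so adding dst_stride - width * 4 lands
// on the start of the next row. For example with width 100 and dst_stride
// 512, rep stosd advances 400 bytes and the add skips the remaining 112.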
michael@0 3791
michael@0 3792 #ifdef HAS_YUY2TOYROW_AVX2
michael@0 3793 __declspec(naked) __declspec(align(16))
michael@0 3794 void YUY2ToYRow_AVX2(const uint8* src_yuy2,
michael@0 3795 uint8* dst_y, int pix) {
michael@0 3796 __asm {
michael@0 3797 mov eax, [esp + 4] // src_yuy2
michael@0 3798 mov edx, [esp + 8] // dst_y
michael@0 3799 mov ecx, [esp + 12] // pix
michael@0 3800 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
michael@0 3801 vpsrlw ymm5, ymm5, 8
michael@0 3802
michael@0 3803 align 4
michael@0 3804 convertloop:
michael@0 3805 vmovdqu ymm0, [eax]
michael@0 3806 vmovdqu ymm1, [eax + 32]
michael@0 3807 lea eax, [eax + 64]
michael@0 3808 vpand ymm0, ymm0, ymm5 // even bytes are Y
michael@0 3809 vpand ymm1, ymm1, ymm5
michael@0 3810 vpackuswb ymm0, ymm0, ymm1 // mutates.
michael@0 3811 vpermq ymm0, ymm0, 0xd8
michael@0 3812 sub ecx, 32
michael@0 3813 vmovdqu [edx], ymm0
michael@0 3814 lea edx, [edx + 32]
michael@0 3815 jg convertloop
michael@0 3816 vzeroupper
michael@0 3817 ret
michael@0 3818 }
michael@0 3819 }
michael@0 3820
michael@0 3821 __declspec(naked) __declspec(align(16))
michael@0 3822 void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2,
michael@0 3823 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 3824 __asm {
michael@0 3825 push esi
michael@0 3826 push edi
michael@0 3827 mov eax, [esp + 8 + 4] // src_yuy2
michael@0 3828 mov esi, [esp + 8 + 8] // stride_yuy2
michael@0 3829 mov edx, [esp + 8 + 12] // dst_u
michael@0 3830 mov edi, [esp + 8 + 16] // dst_v
michael@0 3831 mov ecx, [esp + 8 + 20] // pix
michael@0 3832 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
michael@0 3833 vpsrlw ymm5, ymm5, 8
michael@0 3834 sub edi, edx
michael@0 3835
michael@0 3836 align 4
michael@0 3837 convertloop:
michael@0 3838 vmovdqu ymm0, [eax]
michael@0 3839 vmovdqu ymm1, [eax + 32]
michael@0 3840 vpavgb ymm0, ymm0, [eax + esi]
michael@0 3841 vpavgb ymm1, ymm1, [eax + esi + 32]
michael@0 3842 lea eax, [eax + 64]
michael@0 3843 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
michael@0 3844 vpsrlw ymm1, ymm1, 8
michael@0 3845 vpackuswb ymm0, ymm0, ymm1 // mutates.
michael@0 3846 vpermq ymm0, ymm0, 0xd8
michael@0 3847 vpand ymm1, ymm0, ymm5 // U
michael@0 3848 vpsrlw ymm0, ymm0, 8 // V
michael@0 3849 vpackuswb ymm1, ymm1, ymm1 // mutates.
michael@0 3850 vpackuswb ymm0, ymm0, ymm0 // mutates.
michael@0 3851 vpermq ymm1, ymm1, 0xd8
michael@0 3852 vpermq ymm0, ymm0, 0xd8
michael@0 3853 vextractf128 [edx], ymm1, 0 // U
michael@0 3854 vextractf128 [edx + edi], ymm0, 0 // V
michael@0 3855 lea edx, [edx + 16]
michael@0 3856 sub ecx, 32
michael@0 3857 jg convertloop
michael@0 3858
michael@0 3859 pop edi
michael@0 3860 pop esi
michael@0 3861 vzeroupper
michael@0 3862 ret
michael@0 3863 }
michael@0 3864 }
michael@0 3865
michael@0 3866 __declspec(naked) __declspec(align(16))
michael@0 3867 void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
michael@0 3868 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 3869 __asm {
michael@0 3870 push edi
michael@0 3871 mov eax, [esp + 4 + 4] // src_yuy2
michael@0 3872 mov edx, [esp + 4 + 8] // dst_u
michael@0 3873 mov edi, [esp + 4 + 12] // dst_v
michael@0 3874 mov ecx, [esp + 4 + 16] // pix
michael@0 3875 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
michael@0 3876 vpsrlw ymm5, ymm5, 8
michael@0 3877 sub edi, edx
michael@0 3878
michael@0 3879 align 4
michael@0 3880 convertloop:
michael@0 3881 vmovdqu ymm0, [eax]
michael@0 3882 vmovdqu ymm1, [eax + 32]
michael@0 3883 lea eax, [eax + 64]
michael@0 3884 vpsrlw ymm0, ymm0, 8 // YUYV -> UVUV
michael@0 3885 vpsrlw ymm1, ymm1, 8
michael@0 3886 vpackuswb ymm0, ymm0, ymm1 // mutates.
michael@0 3887 vpermq ymm0, ymm0, 0xd8
michael@0 3888 vpand ymm1, ymm0, ymm5 // U
michael@0 3889 vpsrlw ymm0, ymm0, 8 // V
michael@0 3890 vpackuswb ymm1, ymm1, ymm1 // mutates.
michael@0 3891 vpackuswb ymm0, ymm0, ymm0 // mutates.
michael@0 3892 vpermq ymm1, ymm1, 0xd8
michael@0 3893 vpermq ymm0, ymm0, 0xd8
michael@0 3894 vextractf128 [edx], ymm1, 0 // U
michael@0 3895 vextractf128 [edx + edi], ymm0, 0 // V
michael@0 3896 lea edx, [edx + 16]
michael@0 3897 sub ecx, 32
michael@0 3898 jg convertloop
michael@0 3899
michael@0 3900 pop edi
michael@0 3901 vzeroupper
michael@0 3902 ret
michael@0 3903 }
michael@0 3904 }
michael@0 3905
michael@0 3906 __declspec(naked) __declspec(align(16))
michael@0 3907 void UYVYToYRow_AVX2(const uint8* src_uyvy,
michael@0 3908 uint8* dst_y, int pix) {
michael@0 3909 __asm {
michael@0 3910 mov eax, [esp + 4] // src_uyvy
michael@0 3911 mov edx, [esp + 8] // dst_y
michael@0 3912 mov ecx, [esp + 12] // pix
michael@0 3913
michael@0 3914 align 4
michael@0 3915 convertloop:
michael@0 3916 vmovdqu ymm0, [eax]
michael@0 3917 vmovdqu ymm1, [eax + 32]
michael@0 3918 lea eax, [eax + 64]
michael@0 3919 vpsrlw ymm0, ymm0, 8 // odd bytes are Y
michael@0 3920 vpsrlw ymm1, ymm1, 8
michael@0 3921 vpackuswb ymm0, ymm0, ymm1 // mutates.
michael@0 3922 vpermq ymm0, ymm0, 0xd8
michael@0 3923 sub ecx, 32
michael@0 3924 vmovdqu [edx], ymm0
michael@0 3925 lea edx, [edx + 32]
michael@0 3926 jg convertloop
michael@0 3927 vzeroupper
michael@0 3928 ret
michael@0 3929 }
michael@0 3930 }
michael@0 3931
michael@0 3932 __declspec(naked) __declspec(align(16))
michael@0 3933 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
michael@0 3934 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 3935 __asm {
michael@0 3936 push esi
michael@0 3937 push edi
michael@0 3938 mov eax, [esp + 8 + 4] // src_uyvy
michael@0 3939 mov esi, [esp + 8 + 8] // stride_uyvy
michael@0 3940 mov edx, [esp + 8 + 12] // dst_u
michael@0 3941 mov edi, [esp + 8 + 16] // dst_v
michael@0 3942 mov ecx, [esp + 8 + 20] // pix
michael@0 3943 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
michael@0 3944 vpsrlw ymm5, ymm5, 8
michael@0 3945 sub edi, edx
michael@0 3946
michael@0 3947 align 4
michael@0 3948 convertloop:
michael@0 3949 vmovdqu ymm0, [eax]
michael@0 3950 vmovdqu ymm1, [eax + 32]
michael@0 3951 vpavgb ymm0, ymm0, [eax + esi]
michael@0 3952 vpavgb ymm1, ymm1, [eax + esi + 32]
michael@0 3953 lea eax, [eax + 64]
michael@0 3954 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
michael@0 3955 vpand ymm1, ymm1, ymm5
michael@0 3956 vpackuswb ymm0, ymm0, ymm1 // mutates.
michael@0 3957 vpermq ymm0, ymm0, 0xd8
michael@0 3958 vpand ymm1, ymm0, ymm5 // U
michael@0 3959 vpsrlw ymm0, ymm0, 8 // V
michael@0 3960 vpackuswb ymm1, ymm1, ymm1 // mutates.
michael@0 3961 vpackuswb ymm0, ymm0, ymm0 // mutates.
michael@0 3962 vpermq ymm1, ymm1, 0xd8
michael@0 3963 vpermq ymm0, ymm0, 0xd8
michael@0 3964 vextractf128 [edx], ymm1, 0 // U
michael@0 3965 vextractf128 [edx + edi], ymm0, 0 // V
michael@0 3966 lea edx, [edx + 16]
michael@0 3967 sub ecx, 32
michael@0 3968 jg convertloop
michael@0 3969
michael@0 3970 pop edi
michael@0 3971 pop esi
michael@0 3972 vzeroupper
michael@0 3973 ret
michael@0 3974 }
michael@0 3975 }
michael@0 3976
michael@0 3977 __declspec(naked) __declspec(align(16))
michael@0 3978 void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
michael@0 3979 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 3980 __asm {
michael@0 3981 push edi
michael@0 3982 mov eax, [esp + 4 + 4] // src_uyvy
michael@0 3983 mov edx, [esp + 4 + 8] // dst_u
michael@0 3984 mov edi, [esp + 4 + 12] // dst_v
michael@0 3985 mov ecx, [esp + 4 + 16] // pix
michael@0 3986 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff
michael@0 3987 vpsrlw ymm5, ymm5, 8
michael@0 3988 sub edi, edx
michael@0 3989
michael@0 3990 align 4
michael@0 3991 convertloop:
michael@0 3992 vmovdqu ymm0, [eax]
michael@0 3993 vmovdqu ymm1, [eax + 32]
michael@0 3994 lea eax, [eax + 64]
michael@0 3995 vpand ymm0, ymm0, ymm5 // UYVY -> UVUV
michael@0 3996 vpand ymm1, ymm1, ymm5
michael@0 3997 vpackuswb ymm0, ymm0, ymm1 // mutates.
michael@0 3998 vpermq ymm0, ymm0, 0xd8
michael@0 3999 vpand ymm1, ymm0, ymm5 // U
michael@0 4000 vpsrlw ymm0, ymm0, 8 // V
michael@0 4001 vpackuswb ymm1, ymm1, ymm1 // mutates.
michael@0 4002 vpackuswb ymm0, ymm0, ymm0 // mutates.
michael@0 4003 vpermq ymm1, ymm1, 0xd8
michael@0 4004 vpermq ymm0, ymm0, 0xd8
michael@0 4005 vextractf128 [edx], ymm1, 0 // U
michael@0 4006 vextractf128 [edx + edi], ymm0, 0 // V
michael@0 4007 lea edx, [edx + 16]
michael@0 4008 sub ecx, 32
michael@0 4009 jg convertloop
michael@0 4010
michael@0 4011 pop edi
michael@0 4012 vzeroupper
michael@0 4013 ret
michael@0 4014 }
michael@0 4015 }
michael@0 4016 #endif // HAS_YUY2TOYROW_AVX2
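
// Packed 4:2:2 layouts assumed by the kernels above and below:
// YUY2: Y0 U0 Y1 V0 Y2 U1 Y3 V1 ... (Y in even bytes, U/V in odd bytes)
// UYVY: U0 Y0 V0 Y1 U1 Y2 V1 Y3 ... (U/V in even bytes, Y in odd bytes)
// which is why the YUY2 luma kernels mask with 0x00ff00ff where the UYVY
// luma kernels shift right by 8, and vice versa for the chroma kernels.
// Scalar sketch of the YUY2 luma extraction (illustrative name):
static void YUY2ToYRow_C_Sketch(const uint8* src_yuy2, uint8* dst_y,
                                int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_y[x] = src_yuy2[x * 2];  // even bytes carry Y.
  }
}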
michael@0 4017
michael@0 4018 #ifdef HAS_YUY2TOYROW_SSE2
michael@0 4019 __declspec(naked) __declspec(align(16))
michael@0 4020 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
michael@0 4021 uint8* dst_y, int pix) {
michael@0 4022 __asm {
michael@0 4023 mov eax, [esp + 4] // src_yuy2
michael@0 4024 mov edx, [esp + 8] // dst_y
michael@0 4025 mov ecx, [esp + 12] // pix
michael@0 4026 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
michael@0 4027 psrlw xmm5, 8
michael@0 4028
michael@0 4029 align 4
michael@0 4030 convertloop:
michael@0 4031 movdqa xmm0, [eax]
michael@0 4032 movdqa xmm1, [eax + 16]
michael@0 4033 lea eax, [eax + 32]
michael@0 4034 pand xmm0, xmm5 // even bytes are Y
michael@0 4035 pand xmm1, xmm5
michael@0 4036 packuswb xmm0, xmm1
michael@0 4037 sub ecx, 16
michael@0 4038 movdqa [edx], xmm0
michael@0 4039 lea edx, [edx + 16]
michael@0 4040 jg convertloop
michael@0 4041 ret
michael@0 4042 }
michael@0 4043 }
michael@0 4044
michael@0 4045 __declspec(naked) __declspec(align(16))
michael@0 4046 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
michael@0 4047 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 4048 __asm {
michael@0 4049 push esi
michael@0 4050 push edi
michael@0 4051 mov eax, [esp + 8 + 4] // src_yuy2
michael@0 4052 mov esi, [esp + 8 + 8] // stride_yuy2
michael@0 4053 mov edx, [esp + 8 + 12] // dst_u
michael@0 4054 mov edi, [esp + 8 + 16] // dst_v
michael@0 4055 mov ecx, [esp + 8 + 20] // pix
michael@0 4056 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
michael@0 4057 psrlw xmm5, 8
michael@0 4058 sub edi, edx
michael@0 4059
michael@0 4060 align 4
michael@0 4061 convertloop:
michael@0 4062 movdqa xmm0, [eax]
michael@0 4063 movdqa xmm1, [eax + 16]
michael@0 4064 movdqa xmm2, [eax + esi]
michael@0 4065 movdqa xmm3, [eax + esi + 16]
michael@0 4066 lea eax, [eax + 32]
michael@0 4067 pavgb xmm0, xmm2
michael@0 4068 pavgb xmm1, xmm3
michael@0 4069 psrlw xmm0, 8 // YUYV -> UVUV
michael@0 4070 psrlw xmm1, 8
michael@0 4071 packuswb xmm0, xmm1
michael@0 4072 movdqa xmm1, xmm0
michael@0 4073 pand xmm0, xmm5 // U
michael@0 4074 packuswb xmm0, xmm0
michael@0 4075 psrlw xmm1, 8 // V
michael@0 4076 packuswb xmm1, xmm1
michael@0 4077 movq qword ptr [edx], xmm0
michael@0 4078 movq qword ptr [edx + edi], xmm1
michael@0 4079 lea edx, [edx + 8]
michael@0 4080 sub ecx, 16
michael@0 4081 jg convertloop
michael@0 4082
michael@0 4083 pop edi
michael@0 4084 pop esi
michael@0 4085 ret
michael@0 4086 }
michael@0 4087 }
michael@0 4088
michael@0 4089 __declspec(naked) __declspec(align(16))
michael@0 4090 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
michael@0 4091 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 4092 __asm {
michael@0 4093 push edi
michael@0 4094 mov eax, [esp + 4 + 4] // src_yuy2
michael@0 4095 mov edx, [esp + 4 + 8] // dst_u
michael@0 4096 mov edi, [esp + 4 + 12] // dst_v
michael@0 4097 mov ecx, [esp + 4 + 16] // pix
michael@0 4098 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
michael@0 4099 psrlw xmm5, 8
michael@0 4100 sub edi, edx
michael@0 4101
michael@0 4102 align 4
michael@0 4103 convertloop:
michael@0 4104 movdqa xmm0, [eax]
michael@0 4105 movdqa xmm1, [eax + 16]
michael@0 4106 lea eax, [eax + 32]
michael@0 4107 psrlw xmm0, 8 // YUYV -> UVUV
michael@0 4108 psrlw xmm1, 8
michael@0 4109 packuswb xmm0, xmm1
michael@0 4110 movdqa xmm1, xmm0
michael@0 4111 pand xmm0, xmm5 // U
michael@0 4112 packuswb xmm0, xmm0
michael@0 4113 psrlw xmm1, 8 // V
michael@0 4114 packuswb xmm1, xmm1
michael@0 4115 movq qword ptr [edx], xmm0
michael@0 4116 movq qword ptr [edx + edi], xmm1
michael@0 4117 lea edx, [edx + 8]
michael@0 4118 sub ecx, 16
michael@0 4119 jg convertloop
michael@0 4120
michael@0 4121 pop edi
michael@0 4122 ret
michael@0 4123 }
michael@0 4124 }
michael@0 4125
michael@0 4126 __declspec(naked) __declspec(align(16))
michael@0 4127 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
michael@0 4128 uint8* dst_y, int pix) {
michael@0 4129 __asm {
michael@0 4130 mov eax, [esp + 4] // src_yuy2
michael@0 4131 mov edx, [esp + 8] // dst_y
michael@0 4132 mov ecx, [esp + 12] // pix
michael@0 4133 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
michael@0 4134 psrlw xmm5, 8
michael@0 4135
michael@0 4136 align 4
michael@0 4137 convertloop:
michael@0 4138 movdqu xmm0, [eax]
michael@0 4139 movdqu xmm1, [eax + 16]
michael@0 4140 lea eax, [eax + 32]
michael@0 4141 pand xmm0, xmm5 // even bytes are Y
michael@0 4142 pand xmm1, xmm5
michael@0 4143 packuswb xmm0, xmm1
michael@0 4144 sub ecx, 16
michael@0 4145 movdqu [edx], xmm0
michael@0 4146 lea edx, [edx + 16]
michael@0 4147 jg convertloop
michael@0 4148 ret
michael@0 4149 }
michael@0 4150 }
michael@0 4151
michael@0 4152 __declspec(naked) __declspec(align(16))
michael@0 4153 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
michael@0 4154 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 4155 __asm {
michael@0 4156 push esi
michael@0 4157 push edi
michael@0 4158 mov eax, [esp + 8 + 4] // src_yuy2
michael@0 4159 mov esi, [esp + 8 + 8] // stride_yuy2
michael@0 4160 mov edx, [esp + 8 + 12] // dst_u
michael@0 4161 mov edi, [esp + 8 + 16] // dst_v
michael@0 4162 mov ecx, [esp + 8 + 20] // pix
michael@0 4163 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
michael@0 4164 psrlw xmm5, 8
michael@0 4165 sub edi, edx
michael@0 4166
michael@0 4167 align 4
michael@0 4168 convertloop:
michael@0 4169 movdqu xmm0, [eax]
michael@0 4170 movdqu xmm1, [eax + 16]
michael@0 4171 movdqu xmm2, [eax + esi]
michael@0 4172 movdqu xmm3, [eax + esi + 16]
michael@0 4173 lea eax, [eax + 32]
michael@0 4174 pavgb xmm0, xmm2
michael@0 4175 pavgb xmm1, xmm3
michael@0 4176 psrlw xmm0, 8 // YUYV -> UVUV
michael@0 4177 psrlw xmm1, 8
michael@0 4178 packuswb xmm0, xmm1
michael@0 4179 movdqa xmm1, xmm0
michael@0 4180 pand xmm0, xmm5 // U
michael@0 4181 packuswb xmm0, xmm0
michael@0 4182 psrlw xmm1, 8 // V
michael@0 4183 packuswb xmm1, xmm1
michael@0 4184 movq qword ptr [edx], xmm0
michael@0 4185 movq qword ptr [edx + edi], xmm1
michael@0 4186 lea edx, [edx + 8]
michael@0 4187 sub ecx, 16
michael@0 4188 jg convertloop
michael@0 4189
michael@0 4190 pop edi
michael@0 4191 pop esi
michael@0 4192 ret
michael@0 4193 }
michael@0 4194 }
michael@0 4195
michael@0 4196 __declspec(naked) __declspec(align(16))
michael@0 4197 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
michael@0 4198 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 4199 __asm {
michael@0 4200 push edi
michael@0 4201 mov eax, [esp + 4 + 4] // src_yuy2
michael@0 4202 mov edx, [esp + 4 + 8] // dst_u
michael@0 4203 mov edi, [esp + 4 + 12] // dst_v
michael@0 4204 mov ecx, [esp + 4 + 16] // pix
michael@0 4205 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
michael@0 4206 psrlw xmm5, 8
michael@0 4207 sub edi, edx
michael@0 4208
michael@0 4209 align 4
michael@0 4210 convertloop:
michael@0 4211 movdqu xmm0, [eax]
michael@0 4212 movdqu xmm1, [eax + 16]
michael@0 4213 lea eax, [eax + 32]
michael@0 4214 psrlw xmm0, 8 // YUYV -> UVUV
michael@0 4215 psrlw xmm1, 8
michael@0 4216 packuswb xmm0, xmm1
michael@0 4217 movdqa xmm1, xmm0
michael@0 4218 pand xmm0, xmm5 // U
michael@0 4219 packuswb xmm0, xmm0
michael@0 4220 psrlw xmm1, 8 // V
michael@0 4221 packuswb xmm1, xmm1
michael@0 4222 movq qword ptr [edx], xmm0
michael@0 4223 movq qword ptr [edx + edi], xmm1
michael@0 4224 lea edx, [edx + 8]
michael@0 4225 sub ecx, 16
michael@0 4226 jg convertloop
michael@0 4227
michael@0 4228 pop edi
michael@0 4229 ret
michael@0 4230 }
michael@0 4231 }
michael@0 4232
michael@0 4233 __declspec(naked) __declspec(align(16))
michael@0 4234 void UYVYToYRow_SSE2(const uint8* src_uyvy,
michael@0 4235 uint8* dst_y, int pix) {
michael@0 4236 __asm {
michael@0 4237 mov eax, [esp + 4] // src_uyvy
michael@0 4238 mov edx, [esp + 8] // dst_y
michael@0 4239 mov ecx, [esp + 12] // pix
michael@0 4240
michael@0 4241 align 4
michael@0 4242 convertloop:
michael@0 4243 movdqa xmm0, [eax]
michael@0 4244 movdqa xmm1, [eax + 16]
michael@0 4245 lea eax, [eax + 32]
michael@0 4246 psrlw xmm0, 8 // odd bytes are Y
michael@0 4247 psrlw xmm1, 8
michael@0 4248 packuswb xmm0, xmm1
michael@0 4249 sub ecx, 16
michael@0 4250 movdqa [edx], xmm0
michael@0 4251 lea edx, [edx + 16]
michael@0 4252 jg convertloop
michael@0 4253 ret
michael@0 4254 }
michael@0 4255 }
michael@0 4256
michael@0 4257 __declspec(naked) __declspec(align(16))
michael@0 4258 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
michael@0 4259 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 4260 __asm {
michael@0 4261 push esi
michael@0 4262 push edi
michael@0 4263 mov eax, [esp + 8 + 4] // src_uyvy
michael@0 4264 mov esi, [esp + 8 + 8] // stride_uyvy
michael@0 4265 mov edx, [esp + 8 + 12] // dst_u
michael@0 4266 mov edi, [esp + 8 + 16] // dst_v
michael@0 4267 mov ecx, [esp + 8 + 20] // pix
michael@0 4268 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
michael@0 4269 psrlw xmm5, 8
michael@0 4270 sub edi, edx
michael@0 4271
michael@0 4272 align 4
michael@0 4273 convertloop:
michael@0 4274 movdqa xmm0, [eax]
michael@0 4275 movdqa xmm1, [eax + 16]
michael@0 4276 movdqa xmm2, [eax + esi]
michael@0 4277 movdqa xmm3, [eax + esi + 16]
michael@0 4278 lea eax, [eax + 32]
michael@0 4279 pavgb xmm0, xmm2
michael@0 4280 pavgb xmm1, xmm3
michael@0 4281 pand xmm0, xmm5 // UYVY -> UVUV
michael@0 4282 pand xmm1, xmm5
michael@0 4283 packuswb xmm0, xmm1
michael@0 4284 movdqa xmm1, xmm0
michael@0 4285 pand xmm0, xmm5 // U
michael@0 4286 packuswb xmm0, xmm0
michael@0 4287 psrlw xmm1, 8 // V
michael@0 4288 packuswb xmm1, xmm1
michael@0 4289 movq qword ptr [edx], xmm0
michael@0 4290 movq qword ptr [edx + edi], xmm1
michael@0 4291 lea edx, [edx + 8]
michael@0 4292 sub ecx, 16
michael@0 4293 jg convertloop
michael@0 4294
michael@0 4295 pop edi
michael@0 4296 pop esi
michael@0 4297 ret
michael@0 4298 }
michael@0 4299 }
michael@0 4300
michael@0 4301 __declspec(naked) __declspec(align(16))
michael@0 4302 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
michael@0 4303 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 4304 __asm {
michael@0 4305 push edi
michael@0 4306 mov eax, [esp + 4 + 4] // src_uyvy
michael@0 4307 mov edx, [esp + 4 + 8] // dst_u
michael@0 4308 mov edi, [esp + 4 + 12] // dst_v
michael@0 4309 mov ecx, [esp + 4 + 16] // pix
michael@0 4310 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
michael@0 4311 psrlw xmm5, 8
michael@0 4312 sub edi, edx
michael@0 4313
michael@0 4314 align 4
michael@0 4315 convertloop:
michael@0 4316 movdqa xmm0, [eax]
michael@0 4317 movdqa xmm1, [eax + 16]
michael@0 4318 lea eax, [eax + 32]
michael@0 4319 pand xmm0, xmm5 // UYVY -> UVUV
michael@0 4320 pand xmm1, xmm5
michael@0 4321 packuswb xmm0, xmm1
michael@0 4322 movdqa xmm1, xmm0
michael@0 4323 pand xmm0, xmm5 // U
michael@0 4324 packuswb xmm0, xmm0
michael@0 4325 psrlw xmm1, 8 // V
michael@0 4326 packuswb xmm1, xmm1
michael@0 4327 movq qword ptr [edx], xmm0
michael@0 4328 movq qword ptr [edx + edi], xmm1
michael@0 4329 lea edx, [edx + 8]
michael@0 4330 sub ecx, 16
michael@0 4331 jg convertloop
michael@0 4332
michael@0 4333 pop edi
michael@0 4334 ret
michael@0 4335 }
michael@0 4336 }
michael@0 4337
michael@0 4338 __declspec(naked) __declspec(align(16))
michael@0 4339 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
michael@0 4340 uint8* dst_y, int pix) {
michael@0 4341 __asm {
michael@0 4342 mov eax, [esp + 4] // src_uyvy
michael@0 4343 mov edx, [esp + 8] // dst_y
michael@0 4344 mov ecx, [esp + 12] // pix
michael@0 4345
michael@0 4346 align 4
michael@0 4347 convertloop:
michael@0 4348 movdqu xmm0, [eax]
michael@0 4349 movdqu xmm1, [eax + 16]
michael@0 4350 lea eax, [eax + 32]
michael@0 4351 psrlw xmm0, 8 // odd bytes are Y
michael@0 4352 psrlw xmm1, 8
michael@0 4353 packuswb xmm0, xmm1
michael@0 4354 sub ecx, 16
michael@0 4355 movdqu [edx], xmm0
michael@0 4356 lea edx, [edx + 16]
michael@0 4357 jg convertloop
michael@0 4358 ret
michael@0 4359 }
michael@0 4360 }
michael@0 4361
michael@0 4362 __declspec(naked) __declspec(align(16))
michael@0 4363 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
michael@0 4364 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 4365 __asm {
michael@0 4366 push esi
michael@0 4367 push edi
michael@0 4368 mov eax, [esp + 8 + 4] // src_uyvy
michael@0 4369 mov esi, [esp + 8 + 8] // stride_uyvy
michael@0 4370 mov edx, [esp + 8 + 12] // dst_u
michael@0 4371 mov edi, [esp + 8 + 16] // dst_v
michael@0 4372 mov ecx, [esp + 8 + 20] // pix
michael@0 4373 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
michael@0 4374 psrlw xmm5, 8
michael@0 4375 sub edi, edx
michael@0 4376
michael@0 4377 align 4
michael@0 4378 convertloop:
michael@0 4379 movdqu xmm0, [eax]
michael@0 4380 movdqu xmm1, [eax + 16]
michael@0 4381 movdqu xmm2, [eax + esi]
michael@0 4382 movdqu xmm3, [eax + esi + 16]
michael@0 4383 lea eax, [eax + 32]
michael@0 4384 pavgb xmm0, xmm2
michael@0 4385 pavgb xmm1, xmm3
michael@0 4386 pand xmm0, xmm5 // UYVY -> UVUV
michael@0 4387 pand xmm1, xmm5
michael@0 4388 packuswb xmm0, xmm1
michael@0 4389 movdqa xmm1, xmm0
michael@0 4390 pand xmm0, xmm5 // U
michael@0 4391 packuswb xmm0, xmm0
michael@0 4392 psrlw xmm1, 8 // V
michael@0 4393 packuswb xmm1, xmm1
michael@0 4394 movq qword ptr [edx], xmm0
michael@0 4395 movq qword ptr [edx + edi], xmm1
michael@0 4396 lea edx, [edx + 8]
michael@0 4397 sub ecx, 16
michael@0 4398 jg convertloop
michael@0 4399
michael@0 4400 pop edi
michael@0 4401 pop esi
michael@0 4402 ret
michael@0 4403 }
michael@0 4404 }
michael@0 4405
michael@0 4406 __declspec(naked) __declspec(align(16))
michael@0 4407 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
michael@0 4408 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 4409 __asm {
michael@0 4410 push edi
michael@0 4411 mov eax, [esp + 4 + 4] // src_uyvy
michael@0 4412 mov edx, [esp + 4 + 8] // dst_u
michael@0 4413 mov edi, [esp + 4 + 12] // dst_v
michael@0 4414 mov ecx, [esp + 4 + 16] // pix
michael@0 4415 pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
michael@0 4416 psrlw xmm5, 8
michael@0 4417 sub edi, edx
michael@0 4418
michael@0 4419 align 4
michael@0 4420 convertloop:
michael@0 4421 movdqu xmm0, [eax]
michael@0 4422 movdqu xmm1, [eax + 16]
michael@0 4423 lea eax, [eax + 32]
michael@0 4424 pand xmm0, xmm5 // UYVY -> UVUV
michael@0 4425 pand xmm1, xmm5
michael@0 4426 packuswb xmm0, xmm1
michael@0 4427 movdqa xmm1, xmm0
michael@0 4428 pand xmm0, xmm5 // U
michael@0 4429 packuswb xmm0, xmm0
michael@0 4430 psrlw xmm1, 8 // V
michael@0 4431 packuswb xmm1, xmm1
michael@0 4432 movq qword ptr [edx], xmm0
michael@0 4433 movq qword ptr [edx + edi], xmm1
michael@0 4434 lea edx, [edx + 8]
michael@0 4435 sub ecx, 16
michael@0 4436 jg convertloop
michael@0 4437
michael@0 4438 pop edi
michael@0 4439 ret
michael@0 4440 }
michael@0 4441 }
michael@0 4442 #endif // HAS_YUY2TOYROW_SSE2
michael@0 4443
michael@0 4444 #ifdef HAS_ARGBBLENDROW_SSE2
michael@0 4445 // Blend 4 pixels at a time.
michael@0 4446 __declspec(naked) __declspec(align(16))
michael@0 4447 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
michael@0 4448 uint8* dst_argb, int width) {
michael@0 4449 __asm {
michael@0 4450 push esi
michael@0 4451 mov eax, [esp + 4 + 4] // src_argb0
michael@0 4452 mov esi, [esp + 4 + 8] // src_argb1
michael@0 4453 mov edx, [esp + 4 + 12] // dst_argb
michael@0 4454 mov ecx, [esp + 4 + 16] // width
michael@0 4455 pcmpeqb xmm7, xmm7 // generate constant 1
michael@0 4456 psrlw xmm7, 15
michael@0 4457 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
michael@0 4458 psrlw xmm6, 8
michael@0 4459 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
michael@0 4460 psllw xmm5, 8
michael@0 4461 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
michael@0 4462 pslld xmm4, 24
michael@0 4463
michael@0 4464 sub ecx, 1
michael@0 4465 je convertloop1 // only 1 pixel?
michael@0 4466 jl convertloop1b
michael@0 4467
michael@0 4468 // 1 pixel loop until destination pointer is aligned.
michael@0 4469 alignloop1:
michael@0 4470 test edx, 15 // aligned?
michael@0 4471 je alignloop1b
michael@0 4472 movd xmm3, [eax]
michael@0 4473 lea eax, [eax + 4]
michael@0 4474 movdqa xmm0, xmm3 // src argb
michael@0 4475 pxor xmm3, xmm4 // ~alpha
michael@0 4476 movd xmm2, [esi] // _r_b
michael@0 4477 psrlw xmm3, 8 // alpha
michael@0 4478 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
michael@0 4479 pshuflw xmm3, xmm3, 0F5h
michael@0 4480 pand xmm2, xmm6 // _r_b
michael@0 4481 paddw xmm3, xmm7 // 256 - alpha
michael@0 4482 pmullw xmm2, xmm3 // _r_b * alpha
michael@0 4483 movd xmm1, [esi] // _a_g
michael@0 4484 lea esi, [esi + 4]
michael@0 4485 psrlw xmm1, 8 // _a_g
michael@0 4486 por xmm0, xmm4 // set alpha to 255
michael@0 4487 pmullw xmm1, xmm3 // _a_g * alpha
michael@0 4488 psrlw xmm2, 8 // _r_b convert to 8 bits again
michael@0 4489 paddusb xmm0, xmm2 // + src argb
michael@0 4490 pand xmm1, xmm5 // a_g_ convert to 8 bits again
michael@0 4491 paddusb xmm0, xmm1 // + src argb
michael@0 4492 sub ecx, 1
michael@0 4493 movd [edx], xmm0
michael@0 4494 lea edx, [edx + 4]
michael@0 4495 jge alignloop1
michael@0 4496
michael@0 4497 alignloop1b:
michael@0 4498 add ecx, 1 - 4
michael@0 4499 jl convertloop4b
michael@0 4500
michael@0 4501 // 4 pixel loop.
michael@0 4502 convertloop4:
michael@0 4503 movdqu xmm3, [eax] // src argb
michael@0 4504 lea eax, [eax + 16]
michael@0 4505 movdqa xmm0, xmm3 // src argb
michael@0 4506 pxor xmm3, xmm4 // ~alpha
michael@0 4507 movdqu xmm2, [esi] // _r_b
michael@0 4508 psrlw xmm3, 8 // alpha
michael@0 4509 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
michael@0 4510 pshuflw xmm3, xmm3, 0F5h
michael@0 4511 pand xmm2, xmm6 // _r_b
michael@0 4512 paddw xmm3, xmm7 // 256 - alpha
michael@0 4513 pmullw xmm2, xmm3 // _r_b * alpha
michael@0 4514 movdqu xmm1, [esi] // _a_g
michael@0 4515 lea esi, [esi + 16]
michael@0 4516 psrlw xmm1, 8 // _a_g
michael@0 4517 por xmm0, xmm4 // set alpha to 255
michael@0 4518 pmullw xmm1, xmm3 // _a_g * alpha
michael@0 4519 psrlw xmm2, 8 // _r_b convert to 8 bits again
michael@0 4520 paddusb xmm0, xmm2 // + src argb
michael@0 4521 pand xmm1, xmm5 // a_g_ convert to 8 bits again
michael@0 4522 paddusb xmm0, xmm1 // + src argb
michael@0 4523 sub ecx, 4
michael@0 4524 movdqa [edx], xmm0
michael@0 4525 lea edx, [edx + 16]
michael@0 4526 jge convertloop4
michael@0 4527
michael@0 4528 convertloop4b:
michael@0 4529 add ecx, 4 - 1
michael@0 4530 jl convertloop1b
michael@0 4531
michael@0 4532 // 1 pixel loop.
michael@0 4533 convertloop1:
michael@0 4534 movd xmm3, [eax] // src argb
michael@0 4535 lea eax, [eax + 4]
michael@0 4536 movdqa xmm0, xmm3 // src argb
michael@0 4537 pxor xmm3, xmm4 // ~alpha
michael@0 4538 movd xmm2, [esi] // _r_b
michael@0 4539 psrlw xmm3, 8 // alpha
michael@0 4540 pshufhw xmm3, xmm3, 0F5h // 8 alpha words
michael@0 4541 pshuflw xmm3, xmm3, 0F5h
michael@0 4542 pand xmm2, xmm6 // _r_b
michael@0 4543 paddw xmm3, xmm7 // 256 - alpha
michael@0 4544 pmullw xmm2, xmm3 // _r_b * alpha
michael@0 4545 movd xmm1, [esi] // _a_g
michael@0 4546 lea esi, [esi + 4]
michael@0 4547 psrlw xmm1, 8 // _a_g
michael@0 4548 por xmm0, xmm4 // set alpha to 255
michael@0 4549 pmullw xmm1, xmm3 // _a_g * alpha
michael@0 4550 psrlw xmm2, 8 // _r_b convert to 8 bits again
michael@0 4551 paddusb xmm0, xmm2 // + src argb
michael@0 4552 pand xmm1, xmm5 // a_g_ convert to 8 bits again
michael@0 4553 paddusb xmm0, xmm1 // + src argb
michael@0 4554 sub ecx, 1
michael@0 4555 movd [edx], xmm0
michael@0 4556 lea edx, [edx + 4]
michael@0 4557 jge convertloop1
michael@0 4558
michael@0 4559 convertloop1b:
michael@0 4560 pop esi
michael@0 4561 ret
michael@0 4562 }
michael@0 4563 }
michael@0 4564 #endif // HAS_ARGBBLENDROW_SSE2
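
// The blend above computes, per color channel,
// dst = src + dst * (256 - src_alpha) / 256
// and forces the destination alpha to 255, splitting each pixel into _r_b
// and _a_g word lanes so pmullw has 16 bits of headroom. Scalar sketch of
// the per channel math (illustrative; not the libyuv C fallback):
static uint8 BlendChannel_Sketch(uint8 src, uint8 dst, uint8 src_alpha) {
  // ~alpha + 1 == 256 - alpha, matching the pxor/paddw pair in the kernel.
  int v = src + ((dst * (256 - src_alpha)) >> 8);
  return (uint8)(v > 255 ? 255 : v);  // paddusb saturates the same way.
}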
michael@0 4565
michael@0 4566 #ifdef HAS_ARGBBLENDROW_SSSE3
michael@0 4567 // Shuffle table for isolating alpha.
michael@0 4568 static const uvec8 kShuffleAlpha = {
michael@0 4569 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
michael@0 4570 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
michael@0 4571 };
michael@0 4572 // Same as SSE2, but replaces:
michael@0 4573 // psrlw xmm3, 8 // alpha
michael@0 4574 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words
michael@0 4575 // pshuflw xmm3, xmm3, 0F5h
michael@0 4576 // with:
michael@0 4577 // pshufb xmm3, kShuffleAlpha // alpha
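// For the first pixel, bytes {3, 0x80} make word 0 equal to that pixel's
// alpha (index 3 selects the alpha byte, 0x80 zeroes the high byte), so a
// single pshufb builds the 8 alpha words directly.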
michael@0 4578 // Blend 4 pixels at a time.
michael@0 4579
michael@0 4580 __declspec(naked) __declspec(align(16))
michael@0 4581 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
michael@0 4582 uint8* dst_argb, int width) {
michael@0 4583 __asm {
michael@0 4584 push esi
michael@0 4585 mov eax, [esp + 4 + 4] // src_argb0
michael@0 4586 mov esi, [esp + 4 + 8] // src_argb1
michael@0 4587 mov edx, [esp + 4 + 12] // dst_argb
michael@0 4588 mov ecx, [esp + 4 + 16] // width
michael@0 4589 pcmpeqb xmm7, xmm7 // generate constant 0x0001
michael@0 4590 psrlw xmm7, 15
michael@0 4591 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
michael@0 4592 psrlw xmm6, 8
michael@0 4593 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
michael@0 4594 psllw xmm5, 8
michael@0 4595 pcmpeqb xmm4, xmm4 // generate mask 0xff000000
michael@0 4596 pslld xmm4, 24
michael@0 4597
michael@0 4598 sub ecx, 1
michael@0 4599 je convertloop1 // only 1 pixel?
michael@0 4600 jl convertloop1b
michael@0 4601
michael@0 4602 // 1 pixel loop until destination pointer is aligned.
michael@0 4603 alignloop1:
michael@0 4604 test edx, 15 // aligned?
michael@0 4605 je alignloop1b
michael@0 4606 movd xmm3, [eax]
michael@0 4607 lea eax, [eax + 4]
michael@0 4608 movdqa xmm0, xmm3 // src argb
michael@0 4609 pxor xmm3, xmm4 // ~alpha
michael@0 4610 movd xmm2, [esi] // _r_b
michael@0 4611 pshufb xmm3, kShuffleAlpha // alpha
michael@0 4612 pand xmm2, xmm6 // _r_b
michael@0 4613 paddw xmm3, xmm7 // 256 - alpha
michael@0 4614 pmullw xmm2, xmm3 // _r_b * alpha
michael@0 4615 movd xmm1, [esi] // _a_g
michael@0 4616 lea esi, [esi + 4]
michael@0 4617 psrlw xmm1, 8 // _a_g
michael@0 4618 por xmm0, xmm4 // set alpha to 255
michael@0 4619 pmullw xmm1, xmm3 // _a_g * alpha
michael@0 4620 psrlw xmm2, 8 // _r_b convert to 8 bits again
michael@0 4621 paddusb xmm0, xmm2 // + src argb
michael@0 4622 pand xmm1, xmm5 // a_g_ convert to 8 bits again
michael@0 4623 paddusb xmm0, xmm1 // + src argb
michael@0 4624 sub ecx, 1
michael@0 4625 movd [edx], xmm0
michael@0 4626 lea edx, [edx + 4]
michael@0 4627 jge alignloop1
michael@0 4628
michael@0 4629 alignloop1b:
michael@0 4630 add ecx, 1 - 4
michael@0 4631 jl convertloop4b
michael@0 4632
michael@0 4633 test eax, 15 // unaligned?
michael@0 4634 jne convertuloop4
michael@0 4635 test esi, 15 // unaligned?
michael@0 4636 jne convertuloop4
michael@0 4637
michael@0 4638 // 4 pixel loop.
michael@0 4639 convertloop4:
michael@0 4640 movdqa xmm3, [eax] // src argb
michael@0 4641 lea eax, [eax + 16]
michael@0 4642 movdqa xmm0, xmm3 // src argb
michael@0 4643 pxor xmm3, xmm4 // ~alpha
michael@0 4644 movdqa xmm2, [esi] // _r_b
michael@0 4645 pshufb xmm3, kShuffleAlpha // alpha
michael@0 4646 pand xmm2, xmm6 // _r_b
michael@0 4647 paddw xmm3, xmm7 // 256 - alpha
michael@0 4648 pmullw xmm2, xmm3 // _r_b * alpha
michael@0 4649 movdqa xmm1, [esi] // _a_g
michael@0 4650 lea esi, [esi + 16]
michael@0 4651 psrlw xmm1, 8 // _a_g
michael@0 4652 por xmm0, xmm4 // set alpha to 255
michael@0 4653 pmullw xmm1, xmm3 // _a_g * alpha
michael@0 4654 psrlw xmm2, 8 // _r_b convert to 8 bits again
michael@0 4655 paddusb xmm0, xmm2 // + src argb
michael@0 4656 pand xmm1, xmm5 // a_g_ convert to 8 bits again
michael@0 4657 paddusb xmm0, xmm1 // + src argb
michael@0 4658 sub ecx, 4
michael@0 4659 movdqa [edx], xmm0
michael@0 4660 lea edx, [edx + 16]
michael@0 4661 jge convertloop4
michael@0 4662 jmp convertloop4b
michael@0 4663
michael@0 4664 // 4 pixel unaligned loop.
michael@0 4665 convertuloop4:
michael@0 4666 movdqu xmm3, [eax] // src argb
michael@0 4667 lea eax, [eax + 16]
michael@0 4668 movdqa xmm0, xmm3 // src argb
michael@0 4669 pxor xmm3, xmm4 // ~alpha
michael@0 4670 movdqu xmm2, [esi] // _r_b
michael@0 4671 pshufb xmm3, kShuffleAlpha // alpha
michael@0 4672 pand xmm2, xmm6 // _r_b
michael@0 4673 paddw xmm3, xmm7 // 256 - alpha
michael@0 4674 pmullw xmm2, xmm3 // _r_b * alpha
michael@0 4675 movdqu xmm1, [esi] // _a_g
michael@0 4676 lea esi, [esi + 16]
michael@0 4677 psrlw xmm1, 8 // _a_g
michael@0 4678 por xmm0, xmm4 // set alpha to 255
michael@0 4679 pmullw xmm1, xmm3 // _a_g * alpha
michael@0 4680 psrlw xmm2, 8 // _r_b convert to 8 bits again
michael@0 4681 paddusb xmm0, xmm2 // + src argb
michael@0 4682 pand xmm1, xmm5 // a_g_ convert to 8 bits again
michael@0 4683 paddusb xmm0, xmm1 // + src argb
michael@0 4684 sub ecx, 4
michael@0 4685 movdqa [edx], xmm0
michael@0 4686 lea edx, [edx + 16]
michael@0 4687 jge convertuloop4
michael@0 4688
michael@0 4689 convertloop4b:
michael@0 4690 add ecx, 4 - 1
michael@0 4691 jl convertloop1b
michael@0 4692
michael@0 4693 // 1 pixel loop.
michael@0 4694 convertloop1:
michael@0 4695 movd xmm3, [eax] // src argb
michael@0 4696 lea eax, [eax + 4]
michael@0 4697 movdqa xmm0, xmm3 // src argb
michael@0 4698 pxor xmm3, xmm4 // ~alpha
michael@0 4699 movd xmm2, [esi] // _r_b
michael@0 4700 pshufb xmm3, kShuffleAlpha // alpha
michael@0 4701 pand xmm2, xmm6 // _r_b
michael@0 4702 paddw xmm3, xmm7 // 256 - alpha
michael@0 4703 pmullw xmm2, xmm3 // _r_b * alpha
michael@0 4704 movd xmm1, [esi] // _a_g
michael@0 4705 lea esi, [esi + 4]
michael@0 4706 psrlw xmm1, 8 // _a_g
michael@0 4707 por xmm0, xmm4 // set alpha to 255
michael@0 4708 pmullw xmm1, xmm3 // _a_g * alpha
michael@0 4709 psrlw xmm2, 8 // _r_b convert to 8 bits again
michael@0 4710 paddusb xmm0, xmm2 // + src argb
michael@0 4711 pand xmm1, xmm5 // a_g_ convert to 8 bits again
michael@0 4712 paddusb xmm0, xmm1 // + src argb
michael@0 4713 sub ecx, 1
michael@0 4714 movd [edx], xmm0
michael@0 4715 lea edx, [edx + 4]
michael@0 4716 jge convertloop1
michael@0 4717
michael@0 4718 convertloop1b:
michael@0 4719 pop esi
michael@0 4720 ret
michael@0 4721 }
michael@0 4722 }
michael@0 4723 #endif // HAS_ARGBBLENDROW_SSSE3
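
// For reference, a scalar sketch of the per-channel math the blend rows
// above implement (illustrative only; not the upstream C fallback). The
// foreground src_argb0 is assumed pre-attenuated, as the asm expects:
//   dst = min(255, fg + bg * (256 - fg_alpha) / 256), with dst alpha = 255.
static void ARGBBlendPixel_Sketch(const uint8* fg, const uint8* bg,
                                  uint8* dst) {
  int f = 256 - fg[3];  // 256 - alpha: the paddw xmm3, xmm7 step.
  for (int c = 0; c < 3; ++c) {
    int v = fg[c] + ((bg[c] * f) >> 8);
    dst[c] = (uint8)(v > 255 ? 255 : v);  // paddusb saturates the same way.
  }
  dst[3] = 255;  // alpha forced opaque: the por xmm0, xmm4 step.
}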

#ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 pixels at a time.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb0
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm4, xmm4 // generate mask 0xff000000
    pslld xmm4, 24
    pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff
    psrld xmm5, 8

    align 4
  convertloop:
    movdqa xmm0, [eax] // read 4 pixels
    punpcklbw xmm0, xmm0 // first 2
    pshufhw xmm2, xmm0, 0FFh // 8 alpha words
    pshuflw xmm2, xmm2, 0FFh
    pmulhuw xmm0, xmm2 // rgb * a
    movdqa xmm1, [eax] // read 4 pixels
    punpckhbw xmm1, xmm1 // next 2 pixels
    pshufhw xmm2, xmm1, 0FFh // 8 alpha words
    pshuflw xmm2, xmm2, 0FFh
    pmulhuw xmm1, xmm2 // rgb * a
    movdqa xmm2, [eax] // alphas
    lea eax, [eax + 16]
    psrlw xmm0, 8
    pand xmm2, xmm4
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    pand xmm0, xmm5 // keep original alphas
    por xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop

    ret
  }
}
#endif // HAS_ARGBATTENUATEROW_SSE2
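
// A scalar sketch of the attenuate math (illustrative only). punpcklbw of a
// register with itself widens each byte b to the word b * 257, so pmulhuw
// followed by psrlw 8 computes (b * 257 * a * 257) >> 24, close to b * a / 255.
static void ARGBAttenuatePixel_Sketch(const uint8* src, uint8* dst) {
  uint32 a = src[3];
  for (int c = 0; c < 3; ++c) {
    dst[c] = (uint8)(((uint32)(src[c] * 257) * (a * 257)) >> 24);
  }
  dst[3] = (uint8)a;  // original alpha is preserved.
}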

#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
static const uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb0
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    pcmpeqb xmm3, xmm3 // generate mask 0xff000000
    pslld xmm3, 24
    movdqa xmm4, kShuffleAlpha0
    movdqa xmm5, kShuffleAlpha1

    align 4
  convertloop:
    movdqu xmm0, [eax] // read 4 pixels
    pshufb xmm0, xmm4 // isolate first 2 alphas
    movdqu xmm1, [eax] // read 4 pixels
    punpcklbw xmm1, xmm1 // first 2 pixel rgbs
    pmulhuw xmm0, xmm1 // rgb * a
    movdqu xmm1, [eax] // read 4 pixels
    pshufb xmm1, xmm5 // isolate next 2 alphas
    movdqu xmm2, [eax] // read 4 pixels
    punpckhbw xmm2, xmm2 // next 2 pixel rgbs
    pmulhuw xmm1, xmm2 // rgb * a
    movdqu xmm2, [eax] // mask original alpha
    lea eax, [eax + 16]
    pand xmm2, xmm3
    psrlw xmm0, 8
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    por xmm0, xmm2 // copy original alpha
    sub ecx, 4
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop

    ret
  }
}
#endif // HAS_ARGBATTENUATEROW_SSSE3

#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const ulvec8 kShuffleAlpha_AVX2 = {
  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
  6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
  14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u,
};
__declspec(naked) __declspec(align(16))
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] // src_argb0
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    sub edx, eax
    vmovdqa ymm4, kShuffleAlpha_AVX2
    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000
    vpslld ymm5, ymm5, 24

    align 4
  convertloop:
    vmovdqu ymm6, [eax] // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
    vpshufb ymm2, ymm0, ymm4 // low 4 alphas
    vpshufb ymm3, ymm1, ymm4 // high 4 alphas
    vpmulhuw ymm0, ymm0, ymm2 // rgb * a
    vpmulhuw ymm1, ymm1, ymm3 // rgb * a
    vpand ymm6, ymm6, ymm5 // isolate alpha
    vpsrlw ymm0, ymm0, 8
    vpsrlw ymm1, ymm1, 8
    vpackuswb ymm0, ymm0, ymm1 // unmutated.
    vpor ymm0, ymm0, ymm6 // copy original alpha
    sub ecx, 8
    vmovdqu [eax + edx], ymm0
    lea eax, [eax + 32]
    jg convertloop

    vzeroupper
    ret
  }
}
#endif // HAS_ARGBATTENUATEROW_AVX2

#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_argb0
    mov edx, [esp + 8 + 8] // dst_argb
    mov ecx, [esp + 8 + 12] // width

    align 4
  convertloop:
    movdqu xmm0, [eax] // read 4 pixels
    movzx esi, byte ptr [eax + 3] // first alpha
    movzx edi, byte ptr [eax + 7] // second alpha
    punpcklbw xmm0, xmm0 // first 2
    movd xmm2, dword ptr fixed_invtbl8[esi * 4]
    movd xmm3, dword ptr fixed_invtbl8[edi * 4]
    pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words. 1, a, a, a
    pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
    movlhps xmm2, xmm3
    pmulhuw xmm0, xmm2 // rgb * a

    movdqu xmm1, [eax] // read 4 pixels
    movzx esi, byte ptr [eax + 11] // third alpha
    movzx edi, byte ptr [eax + 15] // fourth alpha
    punpckhbw xmm1, xmm1 // next 2
    movd xmm2, dword ptr fixed_invtbl8[esi * 4]
    movd xmm3, dword ptr fixed_invtbl8[edi * 4]
    pshuflw xmm2, xmm2, 040h // first 4 inv_alpha words
    pshuflw xmm3, xmm3, 040h // next 4 inv_alpha words
    movlhps xmm2, xmm3
    pmulhuw xmm1, xmm2 // rgb * a
    lea eax, [eax + 16]

    packuswb xmm0, xmm1
    sub ecx, 4
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop
    pop edi
    pop esi
    ret
  }
}
#endif // HAS_ARGBUNATTENUATEROW_SSE2
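
// Conceptually, unattenuate divides each color channel by alpha. The rows
// above avoid the division by looking up a 16-bit fixed-point reciprocal in
// the fixed_invtbl8 table (defined elsewhere in libyuv). A scalar sketch,
// with an ordinary rounded division standing in for the table (illustrative
// only):
static void ARGBUnattenuatePixel_Sketch(const uint8* src, uint8* dst) {
  int a = src[3];
  for (int c = 0; c < 3; ++c) {
    int v = a ? (src[c] * 255 + a / 2) / a : src[c];
    dst[c] = (uint8)(v > 255 ? 255 : v);  // packuswb clamps the same way.
  }
  dst[3] = (uint8)a;
}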

#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
  0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u,
};
// TODO(fbarchard): Enable USE_GATHER for future hardware if faster.
// USE_GATHER is not on by default, due to being a slow instruction.
#ifdef USE_GATHER
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    mov eax, [esp + 4] // src_argb0
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    sub edx, eax
    vmovdqa ymm4, kUnattenShuffleAlpha_AVX2

    align 4
  convertloop:
    vmovdqu ymm6, [eax] // read 8 pixels.
    vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xffffffff for gather.
    vpsrld ymm2, ymm6, 24 // alpha in low 8 bits.
    vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm5 // ymm5 cleared. 1, a
    vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
    vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas. 1, a, a, a
    vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas
    vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
    vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
    vpackuswb ymm0, ymm0, ymm1 // unmutated.
    sub ecx, 8
    vmovdqu [eax + edx], ymm0
    lea eax, [eax + 32]
    jg convertloop

    vzeroupper
    ret
  }
}
#else  // USE_GATHER
__declspec(naked) __declspec(align(16))
void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {

    mov eax, [esp + 4] // src_argb0
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    sub edx, eax
    vmovdqa ymm5, kUnattenShuffleAlpha_AVX2

    push esi
    push edi

    align 4
  convertloop:
    // replace VPGATHER
    movzx esi, byte ptr [eax + 3] // alpha0
    movzx edi, byte ptr [eax + 7] // alpha1
    vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a0]
    vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a1]
    movzx esi, byte ptr [eax + 11] // alpha2
    movzx edi, byte ptr [eax + 15] // alpha3
    vpunpckldq xmm6, xmm0, xmm1 // [1,a1,1,a0]
    vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a2]
    vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a3]
    movzx esi, byte ptr [eax + 19] // alpha4
    movzx edi, byte ptr [eax + 23] // alpha5
    vpunpckldq xmm7, xmm2, xmm3 // [1,a3,1,a2]
    vmovd xmm0, dword ptr fixed_invtbl8[esi * 4] // [1,a4]
    vmovd xmm1, dword ptr fixed_invtbl8[edi * 4] // [1,a5]
    movzx esi, byte ptr [eax + 27] // alpha6
    movzx edi, byte ptr [eax + 31] // alpha7
    vpunpckldq xmm0, xmm0, xmm1 // [1,a5,1,a4]
    vmovd xmm2, dword ptr fixed_invtbl8[esi * 4] // [1,a6]
    vmovd xmm3, dword ptr fixed_invtbl8[edi * 4] // [1,a7]
    vpunpckldq xmm2, xmm2, xmm3 // [1,a7,1,a6]
    vpunpcklqdq xmm3, xmm6, xmm7 // [1,a3,1,a2,1,a1,1,a0]
    vpunpcklqdq xmm0, xmm0, xmm2 // [1,a7,1,a6,1,a5,1,a4]
    vinserti128 ymm3, ymm3, xmm0, 1 // [1,a7,1,a6,1,a5,1,a4,1,a3,1,a2,1,a1,1,a0]
    // end of VPGATHER

    vmovdqu ymm6, [eax] // read 8 pixels.
    vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated.
    vpunpcklwd ymm2, ymm3, ymm3 // low 4 inverted alphas. mutated. 1, 1, a, a
    vpunpckhwd ymm3, ymm3, ymm3 // high 4 inverted alphas. mutated.
    vpshufb ymm2, ymm2, ymm5 // replicate low 4 alphas. 1, a, a, a
    vpshufb ymm3, ymm3, ymm5 // replicate high 4 alphas
    vpmulhuw ymm0, ymm0, ymm2 // rgb * ia
    vpmulhuw ymm1, ymm1, ymm3 // rgb * ia
    vpackuswb ymm0, ymm0, ymm1 // unmutated.
    sub ecx, 8
    vmovdqu [eax + edx], ymm0
    lea eax, [eax + 32]
    jg convertloop

    pop edi
    pop esi
    vzeroupper
    ret
  }
}
#endif // USE_GATHER
#endif // HAS_ARGBUNATTENUATEROW_AVX2

#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
__declspec(naked) __declspec(align(16))
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_argb */
    mov ecx, [esp + 12] /* width */
    movdqa xmm4, kARGBToYJ
    movdqa xmm5, kAddYJ64

    align 4
  convertloop:
    movdqa xmm0, [eax] // G
    movdqa xmm1, [eax + 16]
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    phaddw xmm0, xmm1
    paddw xmm0, xmm5 // Add .5 for rounding.
    psrlw xmm0, 7
    packuswb xmm0, xmm0 // 8 G bytes
    movdqa xmm2, [eax] // A
    movdqa xmm3, [eax + 16]
    lea eax, [eax + 32]
    psrld xmm2, 24
    psrld xmm3, 24
    packuswb xmm2, xmm3
    packuswb xmm2, xmm2 // 8 A bytes
    movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA
    punpcklbw xmm0, xmm0 // 8 GG words
    punpcklbw xmm3, xmm2 // 8 GA words
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm3 // GGGA first 4
    punpckhwd xmm1, xmm3 // GGGA next 4
    sub ecx, 8
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    jg convertloop
    ret
  }
}
#endif // HAS_ARGBGRAYROW_SSSE3

#ifdef HAS_ARGBSEPIAROW_SSSE3
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
static const vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

static const vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

static const vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
__declspec(naked) __declspec(align(16))
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  __asm {
    mov eax, [esp + 4] /* dst_argb */
    mov ecx, [esp + 8] /* width */
    movdqa xmm2, kARGBToSepiaB
    movdqa xmm3, kARGBToSepiaG
    movdqa xmm4, kARGBToSepiaR

    align 4
  convertloop:
    movdqa xmm0, [eax] // B
    movdqa xmm6, [eax + 16]
    pmaddubsw xmm0, xmm2
    pmaddubsw xmm6, xmm2
    phaddw xmm0, xmm6
    psrlw xmm0, 7
    packuswb xmm0, xmm0 // 8 B values
    movdqa xmm5, [eax] // G
    movdqa xmm1, [eax + 16]
    pmaddubsw xmm5, xmm3
    pmaddubsw xmm1, xmm3
    phaddw xmm5, xmm1
    psrlw xmm5, 7
    packuswb xmm5, xmm5 // 8 G values
    punpcklbw xmm0, xmm5 // 8 BG values
    movdqa xmm5, [eax] // R
    movdqa xmm1, [eax + 16]
    pmaddubsw xmm5, xmm4
    pmaddubsw xmm1, xmm4
    phaddw xmm5, xmm1
    psrlw xmm5, 7
    packuswb xmm5, xmm5 // 8 R values
    movdqa xmm6, [eax] // A
    movdqa xmm1, [eax + 16]
    psrld xmm6, 24
    psrld xmm1, 24
    packuswb xmm6, xmm1
    packuswb xmm6, xmm6 // 8 A values
    punpcklbw xmm5, xmm6 // 8 RA values
    movdqa xmm1, xmm0 // Weave BG, RA together
    punpcklwd xmm0, xmm5 // BGRA first 4
    punpckhwd xmm1, xmm5 // BGRA next 4
    sub ecx, 8
    movdqa [eax], xmm0
    movdqa [eax + 16], xmm1
    lea eax, [eax + 32]
    jg convertloop
    ret
  }
}
#endif // HAS_ARGBSEPIAROW_SSSE3
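
// A scalar sketch of the sepia formulas above (illustrative only). The
// pmaddubsw/phaddw/packuswb sequence amounts to the same sum, shift and
// unsigned clamp per channel:
static void ARGBSepiaPixel_Sketch(uint8* p) {
  int b = p[0], g = p[1], r = p[2];
  int nb = (r * 35 + g * 68 + b * 17) >> 7;
  int ng = (r * 45 + g * 88 + b * 22) >> 7;
  int nr = (r * 50 + g * 98 + b * 24) >> 7;
  p[0] = (uint8)(nb > 255 ? 255 : nb);
  p[1] = (uint8)(ng > 255 ? 255 : ng);
  p[2] = (uint8)(nr > 255 ? 255 : nr);  // alpha p[3] is left unchanged.
}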

#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
__declspec(naked) __declspec(align(16))
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
  __asm {
    mov eax, [esp + 4] /* src_argb */
    mov edx, [esp + 8] /* dst_argb */
    mov ecx, [esp + 12] /* matrix_argb */
    movdqu xmm5, [ecx]
    pshufd xmm2, xmm5, 0x00
    pshufd xmm3, xmm5, 0x55
    pshufd xmm4, xmm5, 0xaa
    pshufd xmm5, xmm5, 0xff
    mov ecx, [esp + 16] /* width */

    align 4
  convertloop:
    movdqa xmm0, [eax] // B
    movdqa xmm7, [eax + 16]
    pmaddubsw xmm0, xmm2
    pmaddubsw xmm7, xmm2
    movdqa xmm6, [eax] // G
    movdqa xmm1, [eax + 16]
    pmaddubsw xmm6, xmm3
    pmaddubsw xmm1, xmm3
    phaddsw xmm0, xmm7 // B
    phaddsw xmm6, xmm1 // G
    psraw xmm0, 6 // B
    psraw xmm6, 6 // G
    packuswb xmm0, xmm0 // 8 B values
    packuswb xmm6, xmm6 // 8 G values
    punpcklbw xmm0, xmm6 // 8 BG values
    movdqa xmm1, [eax] // R
    movdqa xmm7, [eax + 16]
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm7, xmm4
    phaddsw xmm1, xmm7 // R
    movdqa xmm6, [eax] // A
    movdqa xmm7, [eax + 16]
    pmaddubsw xmm6, xmm5
    pmaddubsw xmm7, xmm5
    phaddsw xmm6, xmm7 // A
    psraw xmm1, 6 // R
    psraw xmm6, 6 // A
    packuswb xmm1, xmm1 // 8 R values
    packuswb xmm6, xmm6 // 8 A values
    punpcklbw xmm1, xmm6 // 8 RA values
    movdqa xmm6, xmm0 // Weave BG, RA together
    punpcklwd xmm0, xmm1 // BGRA first 4
    punpckhwd xmm6, xmm1 // BGRA next 4
    sub ecx, 8
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm6
    lea eax, [eax + 32]
    lea edx, [edx + 32]
    jg convertloop
    ret
  }
}
#endif // HAS_ARGBCOLORMATRIXROW_SSSE3
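
// A scalar sketch of the color matrix transform (illustrative only). Each
// output channel j is a signed dot product of the BGRA bytes with 4 signed
// coefficients, shifted right 6 and clamped to [0, 255]; the asm gets the
// same result with pmaddubsw/phaddsw/psraw/packuswb (which also saturate
// the intermediate 16-bit sums).
static void ARGBColorMatrixPixel_Sketch(const uint8* src, uint8* dst,
                                        const int8* matrix_argb) {
  for (int j = 0; j < 4; ++j) {
    const int8* m = matrix_argb + j * 4;
    int v = (src[0] * m[0] + src[1] * m[1] + src[2] * m[2] + src[3] * m[3])
            >> 6;  // arithmetic shift, as psraw does.
    dst[j] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}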

#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  __asm {
    mov eax, [esp + 4] /* dst_argb */
    movd xmm2, [esp + 8] /* scale */
    movd xmm3, [esp + 12] /* interval_size */
    movd xmm4, [esp + 16] /* interval_offset */
    mov ecx, [esp + 20] /* width */
    pshuflw xmm2, xmm2, 040h
    pshufd xmm2, xmm2, 044h
    pshuflw xmm3, xmm3, 040h
    pshufd xmm3, xmm3, 044h
    pshuflw xmm4, xmm4, 040h
    pshufd xmm4, xmm4, 044h
    pxor xmm5, xmm5 // constant 0
    pcmpeqb xmm6, xmm6 // generate mask 0xff000000
    pslld xmm6, 24

    align 4
  convertloop:
    movdqa xmm0, [eax] // read 4 pixels
    punpcklbw xmm0, xmm5 // first 2 pixels
    pmulhuw xmm0, xmm2 // pixel * scale >> 16
    movdqa xmm1, [eax] // read 4 pixels
    punpckhbw xmm1, xmm5 // next 2 pixels
    pmulhuw xmm1, xmm2
    pmullw xmm0, xmm3 // * interval_size
    movdqa xmm7, [eax] // read 4 pixels
    pmullw xmm1, xmm3
    pand xmm7, xmm6 // mask alpha
    paddw xmm0, xmm4 // + interval_offset
    paddw xmm1, xmm4
    packuswb xmm0, xmm1
    por xmm0, xmm7
    sub ecx, 4
    movdqa [eax], xmm0
    lea eax, [eax + 16]
    jg convertloop
    ret
  }
}
#endif // HAS_ARGBQUANTIZEROW_SSE2
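
// A scalar sketch of the quantize (posterize) math (illustrative only).
// The caller is assumed to pass scale as roughly 65536 / interval_size so
// the pmulhuw step maps a channel value to its interval index:
static void ARGBQuantizePixel_Sketch(uint8* p, int scale, int interval_size,
                                     int interval_offset) {
  for (int c = 0; c < 3; ++c) {
    p[c] = (uint8)(((p[c] * scale) >> 16) * interval_size + interval_offset);
  }
  // Alpha p[3] is restored from the original via the 0xff000000 mask.
}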

#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// Aligned to 16 bytes.
__declspec(naked) __declspec(align(16))
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  __asm {
    mov eax, [esp + 4] // src_argb
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // width
    movd xmm2, [esp + 16] // value
    punpcklbw xmm2, xmm2
    punpcklqdq xmm2, xmm2

    align 4
  convertloop:
    movdqa xmm0, [eax] // read 4 pixels
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0 // first 2
    punpckhbw xmm1, xmm1 // next 2
    pmulhuw xmm0, xmm2 // argb * value
    pmulhuw xmm1, xmm2 // argb * value
    psrlw xmm0, 8
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop

    ret
  }
}
#endif // HAS_ARGBSHADEROW_SSE2
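
// A scalar sketch of the shade math (illustrative only). Both the pixel and
// the matching byte of 'value' are widened to b * 257 words, so each channel
// becomes (b * 257 * v * 257) >> 24, roughly b * v / 255:
static void ARGBShadePixel_Sketch(uint8* p, uint32 value) {
  for (int c = 0; c < 4; ++c) {
    uint32 v = (value >> (c * 8)) & 0xff;
    p[c] = (uint8)(((uint32)(p[c] * 257) * (v * 257)) >> 24);
  }
}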

#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
__declspec(naked) __declspec(align(16))
void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    pxor xmm5, xmm5 // constant 0

    align 4
  convertloop:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    movdqu xmm2, [esi] // read 4 pixels from src_argb1
    movdqa xmm1, xmm0
    movdqa xmm3, xmm2
    punpcklbw xmm0, xmm0 // first 2
    punpckhbw xmm1, xmm1 // next 2
    punpcklbw xmm2, xmm5 // first 2
    punpckhbw xmm3, xmm5 // next 2
    pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
    pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
    lea eax, [eax + 16]
    lea esi, [esi + 16]
    packuswb xmm0, xmm1
    sub ecx, 4
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop

    pop esi
    ret
  }
}
#endif // HAS_ARGBMULTIPLYROW_SSE2
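
// A scalar sketch of the multiply math (illustrative only). One source is
// widened to s0 * 257 and the other zero-extended, so pmulhuw yields
// (s0 * 257 * s1) >> 16, a close approximation of s0 * s1 / 255:
static void ARGBMultiplyPixel_Sketch(const uint8* s0, const uint8* s1,
                                     uint8* dst) {
  for (int c = 0; c < 4; ++c) {
    dst[c] = (uint8)(((uint32)s0[c] * 257 * s1[c]) >> 16);
  }
}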

#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// TODO(fbarchard): Port this to posix, neon and other math functions.
__declspec(naked) __declspec(align(16))
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width

    sub ecx, 4
    jl convertloop49

    align 4
  convertloop4:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    lea eax, [eax + 16]
    movdqu xmm1, [esi] // read 4 pixels from src_argb1
    lea esi, [esi + 16]
    paddusb xmm0, xmm1 // src_argb0 + src_argb1
    sub ecx, 4
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jge convertloop4

  convertloop49:
    add ecx, 4 - 1
    jl convertloop19

  convertloop1:
    movd xmm0, [eax] // read 1 pixel from src_argb0
    lea eax, [eax + 4]
    movd xmm1, [esi] // read 1 pixel from src_argb1
    lea esi, [esi + 4]
    paddusb xmm0, xmm1 // src_argb0 + src_argb1
    sub ecx, 1
    movd [edx], xmm0
    lea edx, [edx + 4]
    jge convertloop1

  convertloop19:
    pop esi
    ret
  }
}
#endif // HAS_ARGBADDROW_SSE2

#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
__declspec(naked) __declspec(align(16))
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width

    align 4
  convertloop:
    movdqu xmm0, [eax] // read 4 pixels from src_argb0
    lea eax, [eax + 16]
    movdqu xmm1, [esi] // read 4 pixels from src_argb1
    lea esi, [esi + 16]
    psubusb xmm0, xmm1 // src_argb0 - src_argb1
    sub ecx, 4
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop

    pop esi
    ret
  }
}
#endif // HAS_ARGBSUBTRACTROW_SSE2

#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) __declspec(align(16))
void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    vpxor ymm5, ymm5, ymm5 // constant 0

    align 4
  convertloop:
    vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
    lea esi, [esi + 32]
    vpunpcklbw ymm0, ymm1, ymm1 // low 4
    vpunpckhbw ymm1, ymm1, ymm1 // high 4
    vpunpcklbw ymm2, ymm3, ymm5 // low 4
    vpunpckhbw ymm3, ymm3, ymm5 // high 4
    vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
    vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
    vpackuswb ymm0, ymm0, ymm1
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBMULTIPLYROW_AVX2

#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
__declspec(naked) __declspec(align(16))
void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width

    align 4
  convertloop:
    vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
    lea esi, [esi + 32]
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBADDROW_AVX2

#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
__declspec(naked) __declspec(align(16))
void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_argb0
    mov esi, [esp + 4 + 8] // src_argb1
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width

    align 4
  convertloop:
    vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
    lea eax, [eax + 32]
    vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
    lea esi, [esi + 32]
    vmovdqu [edx], ymm0
    lea edx, [edx + 32]
    sub ecx, 8
    jg convertloop

    pop esi
    vzeroupper
    ret
  }
}
#endif // HAS_ARGBSUBTRACTROW_AVX2

#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
__declspec(naked) __declspec(align(16))
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4] // src_y0
    mov esi, [esp + 8 + 8] // src_y1
    mov edi, [esp + 8 + 12] // src_y2
    mov edx, [esp + 8 + 16] // dst_sobelx
    mov ecx, [esp + 8 + 20] // width
    sub esi, eax
    sub edi, eax
    sub edx, eax
    pxor xmm5, xmm5 // constant 0

    align 4
  convertloop:
    movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
    movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    psubw xmm0, xmm1
    movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
    movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
    punpcklbw xmm1, xmm5
    punpcklbw xmm2, xmm5
    psubw xmm1, xmm2
    movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
    movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
    punpcklbw xmm2, xmm5
    punpcklbw xmm3, xmm5
    psubw xmm2, xmm3
    paddw xmm0, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm1
    pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
    psubw xmm1, xmm0
    pmaxsw xmm0, xmm1
    packuswb xmm0, xmm0
    sub ecx, 8
    movq qword ptr [eax + edx], xmm0
    lea eax, [eax + 8]
    jg convertloop

    pop edi
    pop esi
    ret
  }
}
#endif // HAS_SOBELXROW_SSE2
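
// A scalar sketch of one SobelX output (illustrative only). The sign of the
// kernel does not matter because the absolute value is taken at the end:
static uint8 SobelXPixel_Sketch(const uint8* y0, const uint8* y1,
                                const uint8* y2) {
  int s = (y0[0] - y0[2]) + 2 * (y1[0] - y1[2]) + (y2[0] - y2[2]);
  if (s < 0) s = -s;                  // pmaxsw(x, -x) in the asm.
  return (uint8)(s > 255 ? 255 : s);  // packuswb clamps to 255.
}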

#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
__declspec(naked) __declspec(align(16))
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_y0
    mov esi, [esp + 4 + 8] // src_y1
    mov edx, [esp + 4 + 12] // dst_sobely
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
    sub edx, eax
    pxor xmm5, xmm5 // constant 0

    align 4
  convertloop:
    movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
    movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    psubw xmm0, xmm1
    movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
    movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
    punpcklbw xmm1, xmm5
    punpcklbw xmm2, xmm5
    psubw xmm1, xmm2
    movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
    movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
    punpcklbw xmm2, xmm5
    punpcklbw xmm3, xmm5
    psubw xmm2, xmm3
    paddw xmm0, xmm2
    paddw xmm0, xmm1
    paddw xmm0, xmm1
    pxor xmm1, xmm1 // abs = max(xmm0, -xmm0). SSSE3 could use pabsw
    psubw xmm1, xmm0
    pmaxsw xmm0, xmm1
    packuswb xmm0, xmm0
    sub ecx, 8
    movq qword ptr [eax + edx], xmm0
    lea eax, [eax + 8]
    jg convertloop

    pop esi
    ret
  }
}
#endif // HAS_SOBELYROW_SSE2

#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
__declspec(naked) __declspec(align(16))
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
    pcmpeqb xmm5, xmm5 // alpha 255
    pslld xmm5, 24 // 0xff000000

    align 4
  convertloop:
    movdqa xmm0, [eax] // read 16 pixels src_sobelx
    movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    paddusb xmm0, xmm1 // sobel = sobelx + sobely
    movdqa xmm2, xmm0 // GG
    punpcklbw xmm2, xmm0 // First 8
    punpckhbw xmm0, xmm0 // Next 8
    movdqa xmm1, xmm2 // GGGG
    punpcklwd xmm1, xmm2 // First 4
    punpckhwd xmm2, xmm2 // Next 4
    por xmm1, xmm5 // GGGA
    por xmm2, xmm5
    movdqa xmm3, xmm0 // GGGG
    punpcklwd xmm3, xmm0 // Next 4
    punpckhwd xmm0, xmm0 // Last 4
    por xmm3, xmm5 // GGGA
    por xmm0, xmm5
    sub ecx, 16
    movdqa [edx], xmm1
    movdqa [edx + 16], xmm2
    movdqa [edx + 32], xmm3
    movdqa [edx + 48], xmm0
    lea edx, [edx + 64]
    jg convertloop

    pop esi
    ret
  }
}
#endif // HAS_SOBELROW_SSE2
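
// A scalar sketch of how one Sobel value is expanded to an ARGB pixel above
// (illustrative only):
static void SobelToARGBPixel_Sketch(uint8 sobelx, uint8 sobely, uint8* dst) {
  int s = sobelx + sobely;  // paddusb: add with unsigned saturation.
  uint8 g = (uint8)(s > 255 ? 255 : s);
  dst[0] = g;    // B = Sobel
  dst[1] = g;    // G = Sobel
  dst[2] = g;    // R = Sobel
  dst[3] = 255;  // A = 255
}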

#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
__declspec(naked) __declspec(align(16))
void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                          uint8* dst_y, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_y
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax

    align 4
  convertloop:
    movdqa xmm0, [eax] // read 16 pixels src_sobelx
    movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    paddusb xmm0, xmm1 // sobel = sobelx + sobely
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg convertloop

    pop esi
    ret
  }
}
#endif // HAS_SOBELTOPLANEROW_SSE2

#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
__declspec(naked) __declspec(align(16))
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4] // src_sobelx
    mov esi, [esp + 4 + 8] // src_sobely
    mov edx, [esp + 4 + 12] // dst_argb
    mov ecx, [esp + 4 + 16] // width
    sub esi, eax
    pcmpeqb xmm5, xmm5 // alpha 255

    align 4
  convertloop:
    movdqa xmm0, [eax] // read 16 pixels src_sobelx
    movdqa xmm1, [eax + esi] // read 16 pixels src_sobely
    lea eax, [eax + 16]
    movdqa xmm2, xmm0
    paddusb xmm2, xmm1 // sobel = sobelx + sobely
    movdqa xmm3, xmm0 // XA
    punpcklbw xmm3, xmm5
    punpckhbw xmm0, xmm5
    movdqa xmm4, xmm1 // YS
    punpcklbw xmm4, xmm2
    punpckhbw xmm1, xmm2
    movdqa xmm6, xmm4 // YSXA
    punpcklwd xmm6, xmm3 // First 4
    punpckhwd xmm4, xmm3 // Next 4
    movdqa xmm7, xmm1 // YSXA
    punpcklwd xmm7, xmm0 // Next 4
    punpckhwd xmm1, xmm0 // Last 4
    sub ecx, 16
    movdqa [edx], xmm6
    movdqa [edx + 16], xmm4
    movdqa [edx + 32], xmm7
    movdqa [edx + 48], xmm1
    lea edx, [edx + 64]
    jg convertloop

    pop esi
    ret
  }
}
#endif // HAS_SOBELXYROW_SSE2

#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Consider float CumulativeSum.
// Consider calling CumulativeSum one row at a time as needed.
// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
// Convert cumulative sum for an area to an average for 1 pixel.
// topleft is pointer to top left of CumulativeSum buffer for area.
// botleft is pointer to bottom left of CumulativeSum buffer.
// width is offset from left to right of area in CumulativeSum buffer measured
// in number of ints.
// area is the number of pixels in the area being averaged.
// dst points to pixel to store result to.
// count is number of averaged pixels to produce.
// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
// aligned.
void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
                                    int width, int area, uint8* dst,
                                    int count) {
  __asm {
    mov eax, topleft // eax topleft
    mov esi, botleft // esi botleft
    mov edx, width
    movd xmm5, area
    mov edi, dst
    mov ecx, count
    cvtdq2ps xmm5, xmm5
    rcpss xmm4, xmm5 // 1.0f / area
    pshufd xmm4, xmm4, 0
    sub ecx, 4
    jl l4b

    cmp area, 128 // 128 pixels will not overflow 15 bits.
    ja l4

    pshufd xmm5, xmm5, 0 // area
    pcmpeqb xmm6, xmm6 // constant of 65536.0 - 1 = 65535.0
    psrld xmm6, 16
    cvtdq2ps xmm6, xmm6
    addps xmm5, xmm6 // (65536.0 + area - 1)
    mulps xmm5, xmm4 // (65536.0 + area - 1) * 1 / area
    cvtps2dq xmm5, xmm5 // 0.16 fixed point
    packssdw xmm5, xmm5 // 16 bit shorts

    // 4 pixel loop small blocks.
    align 4
  s4:
    // top left
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]

    // - top right
    psubd xmm0, [eax + edx * 4]
    psubd xmm1, [eax + edx * 4 + 16]
    psubd xmm2, [eax + edx * 4 + 32]
    psubd xmm3, [eax + edx * 4 + 48]
    lea eax, [eax + 64]

    // - bottom left
    psubd xmm0, [esi]
    psubd xmm1, [esi + 16]
    psubd xmm2, [esi + 32]
    psubd xmm3, [esi + 48]

    // + bottom right
    paddd xmm0, [esi + edx * 4]
    paddd xmm1, [esi + edx * 4 + 16]
    paddd xmm2, [esi + edx * 4 + 32]
    paddd xmm3, [esi + edx * 4 + 48]
    lea esi, [esi + 64]

    packssdw xmm0, xmm1 // pack 4 pixels into 2 registers
    packssdw xmm2, xmm3

    pmulhuw xmm0, xmm5
    pmulhuw xmm2, xmm5

    packuswb xmm0, xmm2
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4
    jge s4

    jmp l4b

    // 4 pixel loop
    align 4
  l4:
    // top left
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]

    // - top right
    psubd xmm0, [eax + edx * 4]
    psubd xmm1, [eax + edx * 4 + 16]
    psubd xmm2, [eax + edx * 4 + 32]
    psubd xmm3, [eax + edx * 4 + 48]
    lea eax, [eax + 64]

    // - bottom left
    psubd xmm0, [esi]
    psubd xmm1, [esi + 16]
    psubd xmm2, [esi + 32]
    psubd xmm3, [esi + 48]

    // + bottom right
    paddd xmm0, [esi + edx * 4]
    paddd xmm1, [esi + edx * 4 + 16]
    paddd xmm2, [esi + edx * 4 + 32]
    paddd xmm3, [esi + edx * 4 + 48]
    lea esi, [esi + 64]

    cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area
    cvtdq2ps xmm1, xmm1
    mulps xmm0, xmm4
    mulps xmm1, xmm4
    cvtdq2ps xmm2, xmm2
    cvtdq2ps xmm3, xmm3
    mulps xmm2, xmm4
    mulps xmm3, xmm4
    cvtps2dq xmm0, xmm0
    cvtps2dq xmm1, xmm1
    cvtps2dq xmm2, xmm2
    cvtps2dq xmm3, xmm3
    packssdw xmm0, xmm1
    packssdw xmm2, xmm3
    packuswb xmm0, xmm2
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    sub ecx, 4
    jge l4

  l4b:
    add ecx, 4 - 1
    jl l1b

    // 1 pixel loop
    align 4
  l1:
    movdqa xmm0, [eax]
    psubd xmm0, [eax + edx * 4]
    lea eax, [eax + 16]
    psubd xmm0, [esi]
    paddd xmm0, [esi + edx * 4]
    lea esi, [esi + 16]
    cvtdq2ps xmm0, xmm0
    mulps xmm0, xmm4
    cvtps2dq xmm0, xmm0
    packssdw xmm0, xmm0
    packuswb xmm0, xmm0
    movd dword ptr [edi], xmm0
    lea edi, [edi + 4]
    sub ecx, 1
    jge l1
  l1b:
  }
}
#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
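
// A scalar sketch of the rectangle average computed above (illustrative
// only). With an integral image, the sum over any rectangle is the classic
// four-corner expression; here for one int32 channel of one output pixel,
// with a plain division standing in for the reciprocal multiply:
static uint8 BoxSumToAverage_Sketch(const int32* topleft, const int32* botleft,
                                    int width, int area) {
  // width is the rectangle width in int32 units (4 per ARGB pixel):
  // sum = TL - TR - BL + BR, then divide by the pixel count.
  int32 sum = topleft[0] - topleft[width] - botleft[0] + botleft[width];
  int32 avg = sum / area;
  return (uint8)(avg < 0 ? 0 : (avg > 255 ? 255 : avg));
}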

#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  __asm {
    mov eax, row
    mov edx, cumsum
    mov esi, previous_cumsum
    mov ecx, width
    pxor xmm0, xmm0
    pxor xmm1, xmm1

    sub ecx, 4
    jl l4b
    test edx, 15
    jne l4b

    // 4 pixel loop
    align 4
  l4:
    movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
    lea eax, [eax + 16]
    movdqa xmm4, xmm2

    punpcklbw xmm2, xmm1
    movdqa xmm3, xmm2
    punpcklwd xmm2, xmm1
    punpckhwd xmm3, xmm1

    punpckhbw xmm4, xmm1
    movdqa xmm5, xmm4
    punpcklwd xmm4, xmm1
    punpckhwd xmm5, xmm1

    paddd xmm0, xmm2
    movdqa xmm2, [esi] // previous row above.
    paddd xmm2, xmm0

    paddd xmm0, xmm3
    movdqa xmm3, [esi + 16]
    paddd xmm3, xmm0

    paddd xmm0, xmm4
    movdqa xmm4, [esi + 32]
    paddd xmm4, xmm0

    paddd xmm0, xmm5
    movdqa xmm5, [esi + 48]
    lea esi, [esi + 64]
    paddd xmm5, xmm0

    movdqa [edx], xmm2
    movdqa [edx + 16], xmm3
    movdqa [edx + 32], xmm4
    movdqa [edx + 48], xmm5

    lea edx, [edx + 64]
    sub ecx, 4
    jge l4

  l4b:
    add ecx, 4 - 1
    jl l1b

    // 1 pixel loop
    align 4
  l1:
    movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
    lea eax, [eax + 4]
    punpcklbw xmm2, xmm1
    punpcklwd xmm2, xmm1
    paddd xmm0, xmm2
    movdqu xmm2, [esi]
    lea esi, [esi + 16]
    paddd xmm2, xmm0
    movdqu [edx], xmm2
    lea edx, [edx + 16]
    sub ecx, 1
    jge l1

  l1b:
  }
}
#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
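
// A scalar sketch of the cumulative sum row above (illustrative only): a
// running BGRA sum along the row, plus the finished sums of the row above:
static void ComputeCumulativeSumRow_Sketch(const uint8* row, int32* cumsum,
                                           const int32* previous_cumsum,
                                           int width) {
  int32 sum[4] = {0, 0, 0, 0};  // the running sum the asm keeps in xmm0.
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      sum[c] += row[x * 4 + c];
      cumsum[x * 4 + c] = sum[c] + previous_cumsum[x * 4 + c];
    }
  }
}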
michael@0 5991
michael@0 5992 #ifdef HAS_ARGBAFFINEROW_SSE2
michael@0 5993 // Copy ARGB pixels from source image with slope to a row of destination.
michael@0 5994 __declspec(naked) __declspec(align(16))
michael@0 5995 LIBYUV_API
michael@0 5996 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
michael@0 5997 uint8* dst_argb, const float* uv_dudv, int width) {
michael@0 5998 __asm {
michael@0 5999 push esi
michael@0 6000 push edi
michael@0 6001 mov eax, [esp + 12] // src_argb
michael@0 6002 mov esi, [esp + 16] // stride
michael@0 6003 mov edx, [esp + 20] // dst_argb
michael@0 6004 mov ecx, [esp + 24] // pointer to uv_dudv
michael@0 6005 movq xmm2, qword ptr [ecx] // uv
michael@0 6006 movq xmm7, qword ptr [ecx + 8] // dudv
michael@0 6007 mov ecx, [esp + 28] // width
michael@0 6008 shl esi, 16 // 4, stride
michael@0 6009 add esi, 4
michael@0 6010 movd xmm5, esi
michael@0 6011 sub ecx, 4
michael@0 6012 jl l4b
michael@0 6013
michael@0 6014 // setup for 4 pixel loop
michael@0 6015 pshufd xmm7, xmm7, 0x44 // dup dudv
michael@0 6016 pshufd xmm5, xmm5, 0 // dup 4, stride
michael@0 6017 movdqa xmm0, xmm2 // x0, y0, x1, y1
michael@0 6018 addps xmm0, xmm7
michael@0 6019 movlhps xmm2, xmm0
michael@0 6020 movdqa xmm4, xmm7
michael@0 6021 addps xmm4, xmm4 // dudv *= 2
michael@0 6022 movdqa xmm3, xmm2 // x2, y2, x3, y3
michael@0 6023 addps xmm3, xmm4
michael@0 6024 addps xmm4, xmm4 // dudv *= 4
michael@0 6025
michael@0 6026 // 4 pixel loop
michael@0 6027 align 4
michael@0 6028 l4:
michael@0 6029 cvttps2dq xmm0, xmm2 // x, y float to int first 2
michael@0 6030 cvttps2dq xmm1, xmm3 // x, y float to int next 2
michael@0 6031 packssdw xmm0, xmm1 // x, y as 8 shorts
michael@0 6032 pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
michael@0 6033 movd esi, xmm0
michael@0 6034 pshufd xmm0, xmm0, 0x39 // shift right
michael@0 6035 movd edi, xmm0
michael@0 6036 pshufd xmm0, xmm0, 0x39 // shift right
michael@0 6037 movd xmm1, [eax + esi] // read pixel 0
michael@0 6038 movd xmm6, [eax + edi] // read pixel 1
michael@0 6039 punpckldq xmm1, xmm6 // combine pixel 0 and 1
michael@0 6040 addps xmm2, xmm4 // x, y += dx, dy first 2
michael@0 6041 movq qword ptr [edx], xmm1
michael@0 6042 movd esi, xmm0
michael@0 6043 pshufd xmm0, xmm0, 0x39 // shift right
michael@0 6044 movd edi, xmm0
michael@0 6045 movd xmm6, [eax + esi] // read pixel 2
michael@0 6046 movd xmm0, [eax + edi] // read pixel 3
michael@0 6047 punpckldq xmm6, xmm0 // combine pixel 2 and 3
michael@0 6048 addps xmm3, xmm4 // x, y += dx, dy next 2
michael@0 6049 sub ecx, 4
michael@0 6050 movq qword ptr 8[edx], xmm6
michael@0 6051 lea edx, [edx + 16]
michael@0 6052 jge l4
michael@0 6053
michael@0 6054 l4b:
michael@0 6055 add ecx, 4 - 1
michael@0 6056 jl l1b
michael@0 6057
michael@0 6058 // 1 pixel loop
michael@0 6059 align 4
michael@0 6060 l1:
michael@0 6061 cvttps2dq xmm0, xmm2 // x, y float to int
michael@0 6062 packssdw xmm0, xmm0 // x, y as shorts
michael@0 6063 pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride
michael@0 6064 addps xmm2, xmm7 // x, y += dx, dy
michael@0 6065 movd esi, xmm0
michael@0 6066 movd xmm0, [eax + esi] // copy a pixel
michael@0 6067 sub ecx, 1
michael@0 6068 movd [edx], xmm0
michael@0 6069 lea edx, [edx + 4]
michael@0 6070 jge l1
michael@0 6071 l1b:
michael@0 6072 pop edi
michael@0 6073 pop esi
michael@0 6074 ret
michael@0 6075 }
michael@0 6076 }
michael@0 6077 #endif // HAS_ARGBAFFINEROW_SSE2
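// A scalar sketch of the affine copy above (illustrative only): u and v walk
// the source in fixed increments and each destination pixel is fetched from
// offset x * 4 + y * stride, which is exactly what the pmaddwd against the
// packed (4, stride) constant computes four pixels at a time.
static void ARGBAffineRow_Sketch(const uint8* src_argb, int src_argb_stride,
                                 uint8* dst_argb, const float* uv_dudv,
                                 int width) {
  float u = uv_dudv[0];
  float v = uv_dudv[1];
  const float du = uv_dudv[2];
  const float dv = uv_dudv[3];
  int i;
  for (i = 0; i < width; ++i) {
    int x = (int)u;  // truncation toward zero, matching cvttps2dq.
    int y = (int)v;
    *(uint32*)(dst_argb) =
        *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
    dst_argb += 4;
    u += du;
    v += dv;
  }
}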
michael@0 6078
michael@0 6079 #ifdef HAS_INTERPOLATEROW_AVX2
michael@0 6080 // Bilinear filter 16x2 -> 16x1
michael@0 6081 __declspec(naked) __declspec(align(16))
michael@0 6082 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
michael@0 6083 ptrdiff_t src_stride, int dst_width,
michael@0 6084 int source_y_fraction) {
michael@0 6085 __asm {
michael@0 6086 push esi
michael@0 6087 push edi
michael@0 6088 mov edi, [esp + 8 + 4] // dst_ptr
michael@0 6089 mov esi, [esp + 8 + 8] // src_ptr
michael@0 6090 mov edx, [esp + 8 + 12] // src_stride
michael@0 6091 mov ecx, [esp + 8 + 16] // dst_width
michael@0 6092 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
michael@0 6093 shr eax, 1
michael@0 6094 // Dispatch to specialized filters if applicable.
michael@0 6095 cmp eax, 0
michael@0 6096 je xloop100 // 0 / 128. Blend 100 / 0.
michael@0 6097 sub edi, esi
michael@0 6098 cmp eax, 32
michael@0 6099 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
michael@0 6100 cmp eax, 64
michael@0 6101 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
michael@0 6102 cmp eax, 96
michael@0 6103 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
michael@0 6104
michael@0 6105 vmovd xmm0, eax // high fraction 0..127
michael@0 6106 neg eax
michael@0 6107 add eax, 128
michael@0 6108 vmovd xmm5, eax // low fraction 128..1
michael@0 6109 vpunpcklbw xmm5, xmm5, xmm0
michael@0 6110 vpunpcklwd xmm5, xmm5, xmm5
michael@0 6111 vpxor ymm0, ymm0, ymm0
michael@0 6112 vpermd ymm5, ymm0, ymm5
michael@0 6113
michael@0 6114 align 4
michael@0 6115 xloop:
michael@0 6116 vmovdqu ymm0, [esi]
michael@0 6117 vmovdqu ymm2, [esi + edx]
michael@0 6118 vpunpckhbw ymm1, ymm0, ymm2 // mutates
michael@0 6119 vpunpcklbw ymm0, ymm0, ymm2 // mutates
michael@0 6120 vpmaddubsw ymm0, ymm0, ymm5
michael@0 6121 vpmaddubsw ymm1, ymm1, ymm5
michael@0 6122 vpsrlw ymm0, ymm0, 7
michael@0 6123 vpsrlw ymm1, ymm1, 7
michael@0 6124 vpackuswb ymm0, ymm0, ymm1 // unmutates
michael@0 6125 sub ecx, 32
michael@0 6126 vmovdqu [esi + edi], ymm0
michael@0 6127 lea esi, [esi + 32]
michael@0 6128 jg xloop
michael@0 6129 jmp xloop99
michael@0 6130
michael@0 6131 // Blend 25 / 75.
michael@0 6132 align 4
michael@0 6133 xloop25:
michael@0 6134 vmovdqu ymm0, [esi]
michael@0 6135 vpavgb ymm0, ymm0, [esi + edx]
michael@0 6136 vpavgb ymm0, ymm0, [esi + edx]
michael@0 6137 sub ecx, 32
michael@0 6138 vmovdqu [esi + edi], ymm0
michael@0 6139 lea esi, [esi + 32]
michael@0 6140 jg xloop25
michael@0 6141 jmp xloop99
michael@0 6142
michael@0 6143 // Blend 50 / 50.
michael@0 6144 align 4
michael@0 6145 xloop50:
michael@0 6146 vmovdqu ymm0, [esi]
michael@0 6147 vpavgb ymm0, ymm0, [esi + edx]
michael@0 6148 sub ecx, 32
michael@0 6149 vmovdqu [esi + edi], ymm0
michael@0 6150 lea esi, [esi + 32]
michael@0 6151 jg xloop50
michael@0 6152 jmp xloop99
michael@0 6153
michael@0 6154 // Blend 75 / 25.
michael@0 6155 align 4
michael@0 6156 xloop75:
michael@0 6157 vmovdqu ymm0, [esi + edx]
michael@0 6158 vpavgb ymm0, ymm0, [esi]
michael@0 6159 vpavgb ymm0, ymm0, [esi]
michael@0 6160 sub ecx, 32
michael@0 6161 vmovdqu [esi + edi], ymm0
michael@0 6162 lea esi, [esi + 32]
michael@0 6163 jg xloop75
michael@0 6164 jmp xloop99
michael@0 6165
michael@0 6166 // Blend 100 / 0 - Copy row unchanged.
michael@0 6167 align 4
michael@0 6168 xloop100:
michael@0 6169 rep movsb
michael@0 6170
michael@0 6171 xloop99:
michael@0 6172 pop edi
michael@0 6173 pop esi
michael@0 6174 vzeroupper
michael@0 6175 ret
michael@0 6176 }
michael@0 6177 }
michael@0 6178 #endif // HAS_INTERPOLATEROW_AVX2
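// A scalar sketch of the bilinear row blend (illustrative only; the SIMD
// paths use 7-bit weights, so rounding differs slightly). The AVX2 path
// halves the 0..255 fraction to 0..127 so both weights fit vpmaddubsw's
// signed-byte operand, then shifts the weighted sum right by 7.
static void InterpolateRow_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                  ptrdiff_t src_stride, int width,
                                  int source_y_fraction) {  // 0..255
  const uint8* src_ptr1 = src_ptr + src_stride;
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  int x;
  for (x = 0; x < width; ++x) {
    dst_ptr[x] =
        (uint8)((src_ptr[x] * y0_fraction + src_ptr1[x] * y1_fraction) >> 8);
  }
}
// The 25/50/75 fast paths rely on pavgb: avg(a, b) is an exact rounded 50/50
// blend, and avg(a, avg(a, b)) approximates a 75/25 blend.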
michael@0 6179
michael@0 6180 #ifdef HAS_INTERPOLATEROW_SSSE3
michael@0 6181 // Bilinear filter 16x2 -> 16x1
michael@0 6182 __declspec(naked) __declspec(align(16))
michael@0 6183 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
michael@0 6184 ptrdiff_t src_stride, int dst_width,
michael@0 6185 int source_y_fraction) {
michael@0 6186 __asm {
michael@0 6187 push esi
michael@0 6188 push edi
michael@0 6189 mov edi, [esp + 8 + 4] // dst_ptr
michael@0 6190 mov esi, [esp + 8 + 8] // src_ptr
michael@0 6191 mov edx, [esp + 8 + 12] // src_stride
michael@0 6192 mov ecx, [esp + 8 + 16] // dst_width
michael@0 6193 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
michael@0 6194 sub edi, esi
michael@0 6195 shr eax, 1
michael@0 6196 // Dispatch to specialized filters if applicable.
michael@0 6197 cmp eax, 0
michael@0 6198 je xloop100 // 0 / 128. Blend 100 / 0.
michael@0 6199 cmp eax, 32
michael@0 6200 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
michael@0 6201 cmp eax, 64
michael@0 6202 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
michael@0 6203 cmp eax, 96
michael@0 6204 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
michael@0 6205
michael@0 6206 movd xmm0, eax // high fraction 0..127
michael@0 6207 neg eax
michael@0 6208 add eax, 128
michael@0 6209 movd xmm5, eax // low fraction 128..1
michael@0 6210 punpcklbw xmm5, xmm0
michael@0 6211 punpcklwd xmm5, xmm5
michael@0 6212 pshufd xmm5, xmm5, 0
michael@0 6213
michael@0 6214 align 4
michael@0 6215 xloop:
michael@0 6216 movdqa xmm0, [esi]
michael@0 6217 movdqa xmm2, [esi + edx]
michael@0 6218 movdqa xmm1, xmm0
michael@0 6219 punpcklbw xmm0, xmm2
michael@0 6220 punpckhbw xmm1, xmm2
michael@0 6221 pmaddubsw xmm0, xmm5
michael@0 6222 pmaddubsw xmm1, xmm5
michael@0 6223 psrlw xmm0, 7
michael@0 6224 psrlw xmm1, 7
michael@0 6225 packuswb xmm0, xmm1
michael@0 6226 sub ecx, 16
michael@0 6227 movdqa [esi + edi], xmm0
michael@0 6228 lea esi, [esi + 16]
michael@0 6229 jg xloop
michael@0 6230 jmp xloop99
michael@0 6231
michael@0 6232 // Blend 25 / 75.
michael@0 6233 align 4
michael@0 6234 xloop25:
michael@0 6235 movdqa xmm0, [esi]
michael@0 6236 movdqa xmm1, [esi + edx]
michael@0 6237 pavgb xmm0, xmm1
michael@0 6238 pavgb xmm0, xmm1
michael@0 6239 sub ecx, 16
michael@0 6240 movdqa [esi + edi], xmm0
michael@0 6241 lea esi, [esi + 16]
michael@0 6242 jg xloop25
michael@0 6243 jmp xloop99
michael@0 6244
michael@0 6245 // Blend 50 / 50.
michael@0 6246 align 4
michael@0 6247 xloop50:
michael@0 6248 movdqa xmm0, [esi]
michael@0 6249 movdqa xmm1, [esi + edx]
michael@0 6250 pavgb xmm0, xmm1
michael@0 6251 sub ecx, 16
michael@0 6252 movdqa [esi + edi], xmm0
michael@0 6253 lea esi, [esi + 16]
michael@0 6254 jg xloop50
michael@0 6255 jmp xloop99
michael@0 6256
michael@0 6257 // Blend 75 / 25.
michael@0 6258 align 4
michael@0 6259 xloop75:
michael@0 6260 movdqa xmm1, [esi]
michael@0 6261 movdqa xmm0, [esi + edx]
michael@0 6262 pavgb xmm0, xmm1
michael@0 6263 pavgb xmm0, xmm1
michael@0 6264 sub ecx, 16
michael@0 6265 movdqa [esi + edi], xmm0
michael@0 6266 lea esi, [esi + 16]
michael@0 6267 jg xloop75
michael@0 6268 jmp xloop99
michael@0 6269
michael@0 6270 // Blend 100 / 0 - Copy row unchanged.
michael@0 6271 align 4
michael@0 6272 xloop100:
michael@0 6273 movdqa xmm0, [esi]
michael@0 6274 sub ecx, 16
michael@0 6275 movdqa [esi + edi], xmm0
michael@0 6276 lea esi, [esi + 16]
michael@0 6277 jg xloop100
michael@0 6278
michael@0 6279 xloop99:
michael@0 6280 pop edi
michael@0 6281 pop esi
michael@0 6282 ret
michael@0 6283 }
michael@0 6284 }
michael@0 6285 #endif // HAS_INTERPOLATEROW_SSSE3
michael@0 6286
michael@0 6287 #ifdef HAS_INTERPOLATEROW_SSE2
michael@0 6288 // Bilinear filter 16x2 -> 16x1
michael@0 6289 __declspec(naked) __declspec(align(16))
michael@0 6290 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
michael@0 6291 ptrdiff_t src_stride, int dst_width,
michael@0 6292 int source_y_fraction) {
michael@0 6293 __asm {
michael@0 6294 push esi
michael@0 6295 push edi
michael@0 6296 mov edi, [esp + 8 + 4] // dst_ptr
michael@0 6297 mov esi, [esp + 8 + 8] // src_ptr
michael@0 6298 mov edx, [esp + 8 + 12] // src_stride
michael@0 6299 mov ecx, [esp + 8 + 16] // dst_width
michael@0 6300 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
michael@0 6301 sub edi, esi
michael@0 6302 // Dispatch to specialized filters if applicable.
michael@0 6303 cmp eax, 0
michael@0 6304 je xloop100 // 0 / 256. Blend 100 / 0.
michael@0 6305 cmp eax, 64
michael@0 6306 je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
michael@0 6307 cmp eax, 128
michael@0 6308 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
michael@0 6309 cmp eax, 192
michael@0 6310 je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
michael@0 6311
michael@0 6312 movd xmm5, eax // xmm5 = y fraction
michael@0 6313 punpcklbw xmm5, xmm5
michael@0 6314 psrlw xmm5, 1
michael@0 6315 punpcklwd xmm5, xmm5
michael@0 6316 punpckldq xmm5, xmm5
michael@0 6317 punpcklqdq xmm5, xmm5
michael@0 6318 pxor xmm4, xmm4
michael@0 6319
michael@0 6320 align 4
michael@0 6321 xloop:
michael@0 6322 movdqa xmm0, [esi] // row0
michael@0 6323 movdqa xmm2, [esi + edx] // row1
michael@0 6324 movdqa xmm1, xmm0
michael@0 6325 movdqa xmm3, xmm2
michael@0 6326 punpcklbw xmm2, xmm4
michael@0 6327 punpckhbw xmm3, xmm4
michael@0 6328 punpcklbw xmm0, xmm4
michael@0 6329 punpckhbw xmm1, xmm4
michael@0 6330 psubw xmm2, xmm0 // row1 - row0
michael@0 6331 psubw xmm3, xmm1
michael@0 6332 paddw xmm2, xmm2 // double diff so pmulhw yields diff * fraction / 256 (8.16)
michael@0 6333 paddw xmm3, xmm3
michael@0 6334 pmulhw xmm2, xmm5 // scale diff
michael@0 6335 pmulhw xmm3, xmm5
michael@0 6336 paddw xmm0, xmm2 // sum rows
michael@0 6337 paddw xmm1, xmm3
michael@0 6338 packuswb xmm0, xmm1
michael@0 6339 sub ecx, 16
michael@0 6340 movdqa [esi + edi], xmm0
michael@0 6341 lea esi, [esi + 16]
michael@0 6342 jg xloop
michael@0 6343 jmp xloop99
michael@0 6344
michael@0 6345 // Blend 25 / 75.
michael@0 6346 align 4
michael@0 6347 xloop25:
michael@0 6348 movdqa xmm0, [esi]
michael@0 6349 movdqa xmm1, [esi + edx]
michael@0 6350 pavgb xmm0, xmm1
michael@0 6351 pavgb xmm0, xmm1
michael@0 6352 sub ecx, 16
michael@0 6353 movdqa [esi + edi], xmm0
michael@0 6354 lea esi, [esi + 16]
michael@0 6355 jg xloop25
michael@0 6356 jmp xloop99
michael@0 6357
michael@0 6358 // Blend 50 / 50.
michael@0 6359 align 4
michael@0 6360 xloop50:
michael@0 6361 movdqa xmm0, [esi]
michael@0 6362 movdqa xmm1, [esi + edx]
michael@0 6363 pavgb xmm0, xmm1
michael@0 6364 sub ecx, 16
michael@0 6365 movdqa [esi + edi], xmm0
michael@0 6366 lea esi, [esi + 16]
michael@0 6367 jg xloop50
michael@0 6368 jmp xloop99
michael@0 6369
michael@0 6370 // Blend 75 / 25.
michael@0 6371 align 4
michael@0 6372 xloop75:
michael@0 6373 movdqa xmm1, [esi]
michael@0 6374 movdqa xmm0, [esi + edx]
michael@0 6375 pavgb xmm0, xmm1
michael@0 6376 pavgb xmm0, xmm1
michael@0 6377 sub ecx, 16
michael@0 6378 movdqa [esi + edi], xmm0
michael@0 6379 lea esi, [esi + 16]
michael@0 6380 jg xloop75
michael@0 6381 jmp xloop99
michael@0 6382
michael@0 6383 // Blend 100 / 0 - Copy row unchanged.
michael@0 6384 align 4
michael@0 6385 xloop100:
michael@0 6386 movdqa xmm0, [esi]
michael@0 6387 sub ecx, 16
michael@0 6388 movdqa [esi + edi], xmm0
michael@0 6389 lea esi, [esi + 16]
michael@0 6390 jg xloop100
michael@0 6391
michael@0 6392 xloop99:
michael@0 6393 pop edi
michael@0 6394 pop esi
michael@0 6395 ret
michael@0 6396 }
michael@0 6397 }
michael@0 6398 #endif // HAS_INTERPOLATEROW_SSE2
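// The SSE2 general path cannot use pmaddubsw (an SSSE3 instruction), so it
// blends by scaling the row difference instead. A scalar sketch of the same
// fixed-point math (illustrative only; pmulhw's rounding differs slightly):
static void InterpolateRowDiff_Sketch(uint8* dst_ptr, const uint8* row0,
                                      const uint8* row1, int width,
                                      int source_y_fraction) {  // 0..255
  int x;
  for (x = 0; x < width; ++x) {
    int diff = row1[x] - row0[x];
    dst_ptr[x] = (uint8)(row0[x] + ((diff * source_y_fraction) >> 8));
  }
}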
michael@0 6399
michael@0 6400 // Bilinear filter 16x2 -> 16x1
michael@0 6401 __declspec(naked) __declspec(align(16))
michael@0 6402 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
michael@0 6403 ptrdiff_t src_stride, int dst_width,
michael@0 6404 int source_y_fraction) {
michael@0 6405 __asm {
michael@0 6406 push esi
michael@0 6407 push edi
michael@0 6408 mov edi, [esp + 8 + 4] // dst_ptr
michael@0 6409 mov esi, [esp + 8 + 8] // src_ptr
michael@0 6410 mov edx, [esp + 8 + 12] // src_stride
michael@0 6411 mov ecx, [esp + 8 + 16] // dst_width
michael@0 6412 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
michael@0 6413 sub edi, esi
michael@0 6414 shr eax, 1
michael@0 6415 // Dispatch to specialized filters if applicable.
michael@0 6416 cmp eax, 0
michael@0 6417 je xloop100 // 0 / 128. Blend 100 / 0.
michael@0 6418 cmp eax, 32
michael@0 6419 je xloop75 // 32 / 128 is 0.25. Blend 75 / 25.
michael@0 6420 cmp eax, 64
michael@0 6421 je xloop50 // 64 / 128 is 0.50. Blend 50 / 50.
michael@0 6422 cmp eax, 96
michael@0 6423 je xloop25 // 96 / 128 is 0.75. Blend 25 / 75.
michael@0 6424
michael@0 6425 movd xmm0, eax // high fraction 0..127
michael@0 6426 neg eax
michael@0 6427 add eax, 128
michael@0 6428 movd xmm5, eax // low fraction 128..1
michael@0 6429 punpcklbw xmm5, xmm0
michael@0 6430 punpcklwd xmm5, xmm5
michael@0 6431 pshufd xmm5, xmm5, 0
michael@0 6432
michael@0 6433 align 4
michael@0 6434 xloop:
michael@0 6435 movdqu xmm0, [esi]
michael@0 6436 movdqu xmm2, [esi + edx]
michael@0 6437 movdqa xmm1, xmm0 // reg-reg move; no alignment constraint
michael@0 6438 punpcklbw xmm0, xmm2
michael@0 6439 punpckhbw xmm1, xmm2
michael@0 6440 pmaddubsw xmm0, xmm5
michael@0 6441 pmaddubsw xmm1, xmm5
michael@0 6442 psrlw xmm0, 7
michael@0 6443 psrlw xmm1, 7
michael@0 6444 packuswb xmm0, xmm1
michael@0 6445 sub ecx, 16
michael@0 6446 movdqu [esi + edi], xmm0
michael@0 6447 lea esi, [esi + 16]
michael@0 6448 jg xloop
michael@0 6449 jmp xloop99
michael@0 6450
michael@0 6451 // Blend 25 / 75.
michael@0 6452 align 4
michael@0 6453 xloop25:
michael@0 6454 movdqu xmm0, [esi]
michael@0 6455 movdqu xmm1, [esi + edx]
michael@0 6456 pavgb xmm0, xmm1
michael@0 6457 pavgb xmm0, xmm1
michael@0 6458 sub ecx, 16
michael@0 6459 movdqu [esi + edi], xmm0
michael@0 6460 lea esi, [esi + 16]
michael@0 6461 jg xloop25
michael@0 6462 jmp xloop99
michael@0 6463
michael@0 6464 // Blend 50 / 50.
michael@0 6465 align 4
michael@0 6466 xloop50:
michael@0 6467 movdqu xmm0, [esi]
michael@0 6468 movdqu xmm1, [esi + edx]
michael@0 6469 pavgb xmm0, xmm1
michael@0 6470 sub ecx, 16
michael@0 6471 movdqu [esi + edi], xmm0
michael@0 6472 lea esi, [esi + 16]
michael@0 6473 jg xloop50
michael@0 6474 jmp xloop99
michael@0 6475
michael@0 6476 // Blend 75 / 25.
michael@0 6477 align 4
michael@0 6478 xloop75:
michael@0 6479 movdqu xmm1, [esi]
michael@0 6480 movdqu xmm0, [esi + edx]
michael@0 6481 pavgb xmm0, xmm1
michael@0 6482 pavgb xmm0, xmm1
michael@0 6483 sub ecx, 16
michael@0 6484 movdqu [esi + edi], xmm0
michael@0 6485 lea esi, [esi + 16]
michael@0 6486 jg xloop75
michael@0 6487 jmp xloop99
michael@0 6488
michael@0 6489 // Blend 100 / 0 - Copy row unchanged.
michael@0 6490 align 4
michael@0 6491 xloop100:
michael@0 6492 movdqu xmm0, [esi]
michael@0 6493 sub ecx, 16
michael@0 6494 movdqu [esi + edi], xmm0
michael@0 6495 lea esi, [esi + 16]
michael@0 6496 jg xloop100
michael@0 6497
michael@0 6498 xloop99:
michael@0 6499 pop edi
michael@0 6500 pop esi
michael@0 6501 ret
michael@0 6502 }
michael@0 6503 }
michael@0 6504
michael@0 6505 #ifdef HAS_INTERPOLATEROW_SSE2
michael@0 6506 // Bilinear filter 16x2 -> 16x1
michael@0 6507 __declspec(naked) __declspec(align(16))
michael@0 6508 void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
michael@0 6509 ptrdiff_t src_stride, int dst_width,
michael@0 6510 int source_y_fraction) {
michael@0 6511 __asm {
michael@0 6512 push esi
michael@0 6513 push edi
michael@0 6514 mov edi, [esp + 8 + 4] // dst_ptr
michael@0 6515 mov esi, [esp + 8 + 8] // src_ptr
michael@0 6516 mov edx, [esp + 8 + 12] // src_stride
michael@0 6517 mov ecx, [esp + 8 + 16] // dst_width
michael@0 6518 mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
michael@0 6519 sub edi, esi
michael@0 6520 // Dispatch to specialized filters if applicable.
michael@0 6521 cmp eax, 0
michael@0 6522 je xloop100 // 0 / 256. Blend 100 / 0.
michael@0 6523 cmp eax, 64
michael@0 6524 je xloop75 // 64 / 256 is 0.25. Blend 75 / 25.
michael@0 6525 cmp eax, 128
michael@0 6526 je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
michael@0 6527 cmp eax, 192
michael@0 6528 je xloop25 // 192 / 256 is 0.75. Blend 25 / 75.
michael@0 6529
michael@0 6530 movd xmm5, eax // xmm5 = y fraction
michael@0 6531 punpcklbw xmm5, xmm5
michael@0 6532 psrlw xmm5, 1
michael@0 6533 punpcklwd xmm5, xmm5
michael@0 6534 punpckldq xmm5, xmm5
michael@0 6535 punpcklqdq xmm5, xmm5
michael@0 6536 pxor xmm4, xmm4
michael@0 6537
michael@0 6538 align 4
michael@0 6539 xloop:
michael@0 6540 movdqu xmm0, [esi] // row0
michael@0 6541 movdqu xmm2, [esi + edx] // row1
michael@0 6542 movdqa xmm1, xmm0 // reg-reg moves; no alignment constraint
michael@0 6543 movdqa xmm3, xmm2
michael@0 6544 punpcklbw xmm2, xmm4
michael@0 6545 punpckhbw xmm3, xmm4
michael@0 6546 punpcklbw xmm0, xmm4
michael@0 6547 punpckhbw xmm1, xmm4
michael@0 6548 psubw xmm2, xmm0 // row1 - row0
michael@0 6549 psubw xmm3, xmm1
michael@0 6550 paddw xmm2, xmm2 // double diff so pmulhw yields diff * fraction / 256 (8.16)
michael@0 6551 paddw xmm3, xmm3
michael@0 6552 pmulhw xmm2, xmm5 // scale diff
michael@0 6553 pmulhw xmm3, xmm5
michael@0 6554 paddw xmm0, xmm2 // sum rows
michael@0 6555 paddw xmm1, xmm3
michael@0 6556 packuswb xmm0, xmm1
michael@0 6557 sub ecx, 16
michael@0 6558 movdqu [esi + edi], xmm0
michael@0 6559 lea esi, [esi + 16]
michael@0 6560 jg xloop
michael@0 6561 jmp xloop99
michael@0 6562
michael@0 6563 // Blend 25 / 75.
michael@0 6564 align 4
michael@0 6565 xloop25:
michael@0 6566 movdqu xmm0, [esi]
michael@0 6567 movdqu xmm1, [esi + edx]
michael@0 6568 pavgb xmm0, xmm1
michael@0 6569 pavgb xmm0, xmm1
michael@0 6570 sub ecx, 16
michael@0 6571 movdqu [esi + edi], xmm0
michael@0 6572 lea esi, [esi + 16]
michael@0 6573 jg xloop25
michael@0 6574 jmp xloop99
michael@0 6575
michael@0 6576 // Blend 50 / 50.
michael@0 6577 align 4
michael@0 6578 xloop50:
michael@0 6579 movdqu xmm0, [esi]
michael@0 6580 movdqu xmm1, [esi + edx]
michael@0 6581 pavgb xmm0, xmm1
michael@0 6582 sub ecx, 16
michael@0 6583 movdqu [esi + edi], xmm0
michael@0 6584 lea esi, [esi + 16]
michael@0 6585 jg xloop50
michael@0 6586 jmp xloop99
michael@0 6587
michael@0 6588 // Blend 75 / 25.
michael@0 6589 align 4
michael@0 6590 xloop75:
michael@0 6591 movdqu xmm1, [esi]
michael@0 6592 movdqu xmm0, [esi + edx]
michael@0 6593 pavgb xmm0, xmm1
michael@0 6594 pavgb xmm0, xmm1
michael@0 6595 sub ecx, 16
michael@0 6596 movdqu [esi + edi], xmm0
michael@0 6597 lea esi, [esi + 16]
michael@0 6598 jg xloop75
michael@0 6599 jmp xloop99
michael@0 6600
michael@0 6601 // Blend 100 / 0 - Copy row unchanged.
michael@0 6602 align 4
michael@0 6603 xloop100:
michael@0 6604 movdqu xmm0, [esi]
michael@0 6605 sub ecx, 16
michael@0 6606 movdqu [esi + edi], xmm0
michael@0 6607 lea esi, [esi + 16]
michael@0 6608 jg xloop100
michael@0 6609
michael@0 6610 xloop99:
michael@0 6611 pop edi
michael@0 6612 pop esi
michael@0 6613 ret
michael@0 6614 }
michael@0 6615 }
michael@0 6616 #endif // HAS_INTERPOLATEROW_SSE2
michael@0 6617
michael@0 6618 __declspec(naked) __declspec(align(16))
michael@0 6619 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
michael@0 6620 uint8* dst_uv, int pix) {
michael@0 6621 __asm {
michael@0 6622 push edi
michael@0 6623 mov eax, [esp + 4 + 4] // src_uv
michael@0 6624 mov edx, [esp + 4 + 8] // src_uv_stride
michael@0 6625 mov edi, [esp + 4 + 12] // dst_uv
michael@0 6626 mov ecx, [esp + 4 + 16] // pix
michael@0 6627 sub edi, eax
michael@0 6628
michael@0 6629 align 4
michael@0 6630 convertloop:
michael@0 6631 movdqa xmm0, [eax]
michael@0 6632 pavgb xmm0, [eax + edx]
michael@0 6633 sub ecx, 16
michael@0 6634 movdqa [eax + edi], xmm0
michael@0 6635 lea eax, [eax + 16]
michael@0 6636 jg convertloop
michael@0 6637 pop edi
michael@0 6638 ret
michael@0 6639 }
michael@0 6640 }
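// A scalar sketch of HalfRow (illustrative only): each output byte is the
// rounded average of the same column in two rows, matching pavgb's
// (a + b + 1) >> 1.
static void HalfRow_Sketch(const uint8* src_uv, int src_uv_stride,
                           uint8* dst_uv, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_uv[x] = (uint8)((src_uv[x] + src_uv[x + src_uv_stride] + 1) >> 1);
  }
}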
michael@0 6641
michael@0 6642 #ifdef HAS_HALFROW_AVX2
michael@0 6643 __declspec(naked) __declspec(align(16))
michael@0 6644 void HalfRow_AVX2(const uint8* src_uv, int src_uv_stride,
michael@0 6645 uint8* dst_uv, int pix) {
michael@0 6646 __asm {
michael@0 6647 push edi
michael@0 6648 mov eax, [esp + 4 + 4] // src_uv
michael@0 6649 mov edx, [esp + 4 + 8] // src_uv_stride
michael@0 6650 mov edi, [esp + 4 + 12] // dst_uv
michael@0 6651 mov ecx, [esp + 4 + 16] // pix
michael@0 6652 sub edi, eax
michael@0 6653
michael@0 6654 align 4
michael@0 6655 convertloop:
michael@0 6656 vmovdqu ymm0, [eax]
michael@0 6657 vpavgb ymm0, ymm0, [eax + edx]
michael@0 6658 sub ecx, 32
michael@0 6659 vmovdqu [eax + edi], ymm0
michael@0 6660 lea eax, [eax + 32]
michael@0 6661 jg convertloop
michael@0 6662
michael@0 6663 pop edi
michael@0 6664 vzeroupper
michael@0 6665 ret
michael@0 6666 }
michael@0 6667 }
michael@0 6668 #endif // HAS_HALFROW_AVX2
michael@0 6669
michael@0 6670 __declspec(naked) __declspec(align(16))
michael@0 6671 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
michael@0 6672 uint32 selector, int pix) {
michael@0 6673 __asm {
michael@0 6674 mov eax, [esp + 4] // src_argb
michael@0 6675 mov edx, [esp + 8] // dst_bayer
michael@0 6676 movd xmm5, [esp + 12] // selector
michael@0 6677 mov ecx, [esp + 16] // pix
michael@0 6678 pshufd xmm5, xmm5, 0
michael@0 6679
michael@0 6680 align 4
michael@0 6681 wloop:
michael@0 6682 movdqa xmm0, [eax]
michael@0 6683 movdqa xmm1, [eax + 16]
michael@0 6684 lea eax, [eax + 32]
michael@0 6685 pshufb xmm0, xmm5
michael@0 6686 pshufb xmm1, xmm5
michael@0 6687 punpckldq xmm0, xmm1
michael@0 6688 sub ecx, 8
michael@0 6689 movq qword ptr [edx], xmm0
michael@0 6690 lea edx, [edx + 8]
michael@0 6691 jg wloop
michael@0 6692 ret
michael@0 6693 }
michael@0 6694 }
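// A scalar sketch of the Bayer selection above (illustrative only). Each
// byte of 'selector' is an offset into a block of four ARGB pixels
// (16 bytes); the pshufb applies the same four offsets to every block.
static void ARGBToBayerRow_Sketch(const uint8* src_argb, uint8* dst_bayer,
                                  uint32 selector, int pix) {
  int index0 = selector & 0xff;
  int index1 = (selector >> 8) & 0xff;
  int index2 = (selector >> 16) & 0xff;
  int index3 = (selector >> 24) & 0xff;
  int x;
  for (x = 0; x < pix; x += 4) {  // assumes pix is a multiple of 4.
    dst_bayer[x + 0] = src_argb[index0];
    dst_bayer[x + 1] = src_argb[index1];
    dst_bayer[x + 2] = src_argb[index2];
    dst_bayer[x + 3] = src_argb[index3];
    src_argb += 16;
  }
}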
michael@0 6695
michael@0 6696 // Specialized ARGB to Bayer that just isolates G channel.
michael@0 6697 __declspec(naked) __declspec(align(16))
michael@0 6698 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
michael@0 6699 uint32 selector, int pix) {
michael@0 6700 __asm {
michael@0 6701 mov eax, [esp + 4] // src_argb
michael@0 6702 mov edx, [esp + 8] // dst_bayer
michael@0 6703 // [esp + 12] selector is unused; G is isolated with a fixed mask below.
michael@0 6704 mov ecx, [esp + 16] // pix
michael@0 6705 pcmpeqb xmm5, xmm5 // generate mask 0x000000ff
michael@0 6706 psrld xmm5, 24
michael@0 6707
michael@0 6708 align 4
michael@0 6709 wloop:
michael@0 6710 movdqa xmm0, [eax]
michael@0 6711 movdqa xmm1, [eax + 16]
michael@0 6712 lea eax, [eax + 32]
michael@0 6713 psrld xmm0, 8 // Move green to bottom.
michael@0 6714 psrld xmm1, 8
michael@0 6715 pand xmm0, xmm5
michael@0 6716 pand xmm1, xmm5
michael@0 6717 packssdw xmm0, xmm1
michael@0 6718 packuswb xmm0, xmm1
michael@0 6719 sub ecx, 8
michael@0 6720 movq qword ptr [edx], xmm0
michael@0 6721 lea edx, [edx + 8]
michael@0 6722 jg wloop
michael@0 6723 ret
michael@0 6724 }
michael@0 6725 }
michael@0 6726
michael@0 6727 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
michael@0 6728 __declspec(naked) __declspec(align(16))
michael@0 6729 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
michael@0 6730 const uint8* shuffler, int pix) {
michael@0 6731 __asm {
michael@0 6732 mov eax, [esp + 4] // src_argb
michael@0 6733 mov edx, [esp + 8] // dst_argb
michael@0 6734 mov ecx, [esp + 12] // shuffler
michael@0 6735 movdqa xmm5, [ecx]
michael@0 6736 mov ecx, [esp + 16] // pix
michael@0 6737
michael@0 6738 align 4
michael@0 6739 wloop:
michael@0 6740 movdqa xmm0, [eax]
michael@0 6741 movdqa xmm1, [eax + 16]
michael@0 6742 lea eax, [eax + 32]
michael@0 6743 pshufb xmm0, xmm5
michael@0 6744 pshufb xmm1, xmm5
michael@0 6745 sub ecx, 8
michael@0 6746 movdqa [edx], xmm0
michael@0 6747 movdqa [edx + 16], xmm1
michael@0 6748 lea edx, [edx + 32]
michael@0 6749 jg wloop
michael@0 6750 ret
michael@0 6751 }
michael@0 6752 }
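// A scalar sketch of the channel shuffle (illustrative only): the first four
// bytes of 'shuffler' give, for each output channel, which input channel
// (0..3) to read; the 16-byte pshufb mask repeats that pattern per pixel.
static void ARGBShuffleRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                  const uint8* shuffler, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_argb[0] = src_argb[shuffler[0]];
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}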
michael@0 6753
michael@0 6754 __declspec(naked) __declspec(align(16))
michael@0 6755 void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
michael@0 6756 const uint8* shuffler, int pix) {
michael@0 6757 __asm {
michael@0 6758 mov eax, [esp + 4] // src_argb
michael@0 6759 mov edx, [esp + 8] // dst_argb
michael@0 6760 mov ecx, [esp + 12] // shuffler
michael@0 6761 movdqa xmm5, [ecx]
michael@0 6762 mov ecx, [esp + 16] // pix
michael@0 6763
michael@0 6764 align 4
michael@0 6765 wloop:
michael@0 6766 movdqu xmm0, [eax]
michael@0 6767 movdqu xmm1, [eax + 16]
michael@0 6768 lea eax, [eax + 32]
michael@0 6769 pshufb xmm0, xmm5
michael@0 6770 pshufb xmm1, xmm5
michael@0 6771 sub ecx, 8
michael@0 6772 movdqu [edx], xmm0
michael@0 6773 movdqu [edx + 16], xmm1
michael@0 6774 lea edx, [edx + 32]
michael@0 6775 jg wloop
michael@0 6776 ret
michael@0 6777 }
michael@0 6778 }
michael@0 6779
michael@0 6780 #ifdef HAS_ARGBSHUFFLEROW_AVX2
michael@0 6781 __declspec(naked) __declspec(align(16))
michael@0 6782 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
michael@0 6783 const uint8* shuffler, int pix) {
michael@0 6784 __asm {
michael@0 6785 mov eax, [esp + 4] // src_argb
michael@0 6786 mov edx, [esp + 8] // dst_argb
michael@0 6787 mov ecx, [esp + 12] // shuffler
michael@0 6788 vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
michael@0 6789 mov ecx, [esp + 16] // pix
michael@0 6790
michael@0 6791 align 4
michael@0 6792 wloop:
michael@0 6793 vmovdqu ymm0, [eax]
michael@0 6794 vmovdqu ymm1, [eax + 32]
michael@0 6795 lea eax, [eax + 64]
michael@0 6796 vpshufb ymm0, ymm0, ymm5
michael@0 6797 vpshufb ymm1, ymm1, ymm5
michael@0 6798 sub ecx, 16
michael@0 6799 vmovdqu [edx], ymm0
michael@0 6800 vmovdqu [edx + 32], ymm1
michael@0 6801 lea edx, [edx + 64]
michael@0 6802 jg wloop
michael@0 6803
michael@0 6804 vzeroupper
michael@0 6805 ret
michael@0 6806 }
michael@0 6807 }
michael@0 6808 #endif // HAS_ARGBSHUFFLEROW_AVX2
michael@0 6809
michael@0 6810 __declspec(naked) __declspec(align(16))
michael@0 6811 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
michael@0 6812 const uint8* shuffler, int pix) {
michael@0 6813 __asm {
michael@0 6814 push ebx
michael@0 6815 push esi
michael@0 6816 mov eax, [esp + 8 + 4] // src_argb
michael@0 6817 mov edx, [esp + 8 + 8] // dst_argb
michael@0 6818 mov esi, [esp + 8 + 12] // shuffler
michael@0 6819 mov ecx, [esp + 8 + 16] // pix
michael@0 6820 pxor xmm5, xmm5
michael@0 6821
michael@0 6822 mov ebx, [esi] // shuffler
michael@0 6823 cmp ebx, 0x03000102
michael@0 6824 je shuf_3012
michael@0 6825 cmp ebx, 0x00010203
michael@0 6826 je shuf_0123
michael@0 6827 cmp ebx, 0x00030201
michael@0 6828 je shuf_0321
michael@0 6829 cmp ebx, 0x02010003
michael@0 6830 je shuf_2103
michael@0 6831
michael@0 6832 // TODO(fbarchard): Use one source pointer and 3 offsets.
michael@0 6833 shuf_any1:
michael@0 6834 movzx ebx, byte ptr [esi]
michael@0 6835 movzx ebx, byte ptr [eax + ebx]
michael@0 6836 mov [edx], bl
michael@0 6837 movzx ebx, byte ptr [esi + 1]
michael@0 6838 movzx ebx, byte ptr [eax + ebx]
michael@0 6839 mov [edx + 1], bl
michael@0 6840 movzx ebx, byte ptr [esi + 2]
michael@0 6841 movzx ebx, byte ptr [eax + ebx]
michael@0 6842 mov [edx + 2], bl
michael@0 6843 movzx ebx, byte ptr [esi + 3]
michael@0 6844 movzx ebx, byte ptr [eax + ebx]
michael@0 6845 mov [edx + 3], bl
michael@0 6846 lea eax, [eax + 4]
michael@0 6847 lea edx, [edx + 4]
michael@0 6848 sub ecx, 1
michael@0 6849 jg shuf_any1
michael@0 6850 jmp shuf99
michael@0 6851
michael@0 6852 align 4
michael@0 6853 shuf_0123:
michael@0 6854 movdqu xmm0, [eax]
michael@0 6855 lea eax, [eax + 16]
michael@0 6856 movdqa xmm1, xmm0
michael@0 6857 punpcklbw xmm0, xmm5
michael@0 6858 punpckhbw xmm1, xmm5
michael@0 6859 pshufhw xmm0, xmm0, 01Bh // 1B = 00011011 = 0x0123 = BGRAToARGB
michael@0 6860 pshuflw xmm0, xmm0, 01Bh
michael@0 6861 pshufhw xmm1, xmm1, 01Bh
michael@0 6862 pshuflw xmm1, xmm1, 01Bh
michael@0 6863 packuswb xmm0, xmm1
michael@0 6864 sub ecx, 4
michael@0 6865 movdqu [edx], xmm0
michael@0 6866 lea edx, [edx + 16]
michael@0 6867 jg shuf_0123
michael@0 6868 jmp shuf99
michael@0 6869
michael@0 6870 align 4
michael@0 6871 shuf_0321:
michael@0 6872 movdqu xmm0, [eax]
michael@0 6873 lea eax, [eax + 16]
michael@0 6874 movdqa xmm1, xmm0
michael@0 6875 punpcklbw xmm0, xmm5
michael@0 6876 punpckhbw xmm1, xmm5
michael@0 6877 pshufhw xmm0, xmm0, 039h // 39 = 00111001 = 0x0321 = RGBAToARGB
michael@0 6878 pshuflw xmm0, xmm0, 039h
michael@0 6879 pshufhw xmm1, xmm1, 039h
michael@0 6880 pshuflw xmm1, xmm1, 039h
michael@0 6881 packuswb xmm0, xmm1
michael@0 6882 sub ecx, 4
michael@0 6883 movdqu [edx], xmm0
michael@0 6884 lea edx, [edx + 16]
michael@0 6885 jg shuf_0321
michael@0 6886 jmp shuf99
michael@0 6887
michael@0 6888 align 4
michael@0 6889 shuf_2103:
michael@0 6890 movdqu xmm0, [eax]
michael@0 6891 lea eax, [eax + 16]
michael@0 6892 movdqa xmm1, xmm0
michael@0 6893 punpcklbw xmm0, xmm5
michael@0 6894 punpckhbw xmm1, xmm5
michael@0 6895 pshufhw xmm0, xmm0, 093h // 93 = 10010011 = 0x2103 = ARGBToRGBA
michael@0 6896 pshuflw xmm0, xmm0, 093h
michael@0 6897 pshufhw xmm1, xmm1, 093h
michael@0 6898 pshuflw xmm1, xmm1, 093h
michael@0 6899 packuswb xmm0, xmm1
michael@0 6900 sub ecx, 4
michael@0 6901 movdqu [edx], xmm0
michael@0 6902 lea edx, [edx + 16]
michael@0 6903 jg shuf_2103
michael@0 6904 jmp shuf99
michael@0 6905
michael@0 6906 align 4
michael@0 6907 shuf_3012:
michael@0 6908 movdqu xmm0, [eax]
michael@0 6909 lea eax, [eax + 16]
michael@0 6910 movdqa xmm1, xmm0
michael@0 6911 punpcklbw xmm0, xmm5
michael@0 6912 punpckhbw xmm1, xmm5
michael@0 6913 pshufhw xmm0, xmm0, 0C6h // C6 = 11000110 = 0x3012 = ABGRToARGB
michael@0 6914 pshuflw xmm0, xmm0, 0C6h
michael@0 6915 pshufhw xmm1, xmm1, 0C6h
michael@0 6916 pshuflw xmm1, xmm1, 0C6h
michael@0 6917 packuswb xmm0, xmm1
michael@0 6918 sub ecx, 4
michael@0 6919 movdqu [edx], xmm0
michael@0 6920 lea edx, [edx + 16]
michael@0 6921 jg shuf_3012
michael@0 6922
michael@0 6923 shuf99:
michael@0 6924 pop esi
michael@0 6925 pop ebx
michael@0 6926 ret
michael@0 6927 }
michael@0 6928 }
michael@0 6929
michael@0 6930 // YUY2 - Macro-pixel = 2 image pixels
michael@0 6931 // Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4....
michael@0 6932
michael@0 6933 // UYVY - Macro-pixel = 2 image pixels
michael@0 6934 // U0Y0V0Y1
michael@0 6935
michael@0 6936 __declspec(naked) __declspec(align(16))
michael@0 6937 void I422ToYUY2Row_SSE2(const uint8* src_y,
michael@0 6938 const uint8* src_u,
michael@0 6939 const uint8* src_v,
michael@0 6940 uint8* dst_frame, int width) {
michael@0 6941 __asm {
michael@0 6942 push esi
michael@0 6943 push edi
michael@0 6944 mov eax, [esp + 8 + 4] // src_y
michael@0 6945 mov esi, [esp + 8 + 8] // src_u
michael@0 6946 mov edx, [esp + 8 + 12] // src_v
michael@0 6947 mov edi, [esp + 8 + 16] // dst_frame
michael@0 6948 mov ecx, [esp + 8 + 20] // width
michael@0 6949 sub edx, esi
michael@0 6950
michael@0 6951 align 4
michael@0 6952 convertloop:
michael@0 6953 movq xmm2, qword ptr [esi] // U
michael@0 6954 movq xmm3, qword ptr [esi + edx] // V
michael@0 6955 lea esi, [esi + 8]
michael@0 6956 punpcklbw xmm2, xmm3 // UV
michael@0 6957 movdqu xmm0, [eax] // Y
michael@0 6958 lea eax, [eax + 16]
michael@0 6959 movdqa xmm1, xmm0
michael@0 6960 punpcklbw xmm0, xmm2 // YUYV
michael@0 6961 punpckhbw xmm1, xmm2
michael@0 6962 movdqu [edi], xmm0
michael@0 6963 movdqu [edi + 16], xmm1
michael@0 6964 lea edi, [edi + 32]
michael@0 6965 sub ecx, 16
michael@0 6966 jg convertloop
michael@0 6967
michael@0 6968 pop edi
michael@0 6969 pop esi
michael@0 6970 ret
michael@0 6971 }
michael@0 6972 }
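// A scalar sketch of the YUY2 packing above (illustrative only; the tail for
// odd widths is omitted): two Y samples share one U and one V, interleaved
// as Y0 U Y1 V per macro-pixel.
static void I422ToYUY2Row_Sketch(const uint8* src_y, const uint8* src_u,
                                 const uint8* src_v, uint8* dst_frame,
                                 int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}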
michael@0 6973
michael@0 6974 __declspec(naked) __declspec(align(16))
michael@0 6975 void I422ToUYVYRow_SSE2(const uint8* src_y,
michael@0 6976 const uint8* src_u,
michael@0 6977 const uint8* src_v,
michael@0 6978 uint8* dst_frame, int width) {
michael@0 6979 __asm {
michael@0 6980 push esi
michael@0 6981 push edi
michael@0 6982 mov eax, [esp + 8 + 4] // src_y
michael@0 6983 mov esi, [esp + 8 + 8] // src_u
michael@0 6984 mov edx, [esp + 8 + 12] // src_v
michael@0 6985 mov edi, [esp + 8 + 16] // dst_frame
michael@0 6986 mov ecx, [esp + 8 + 20] // width
michael@0 6987 sub edx, esi
michael@0 6988
michael@0 6989 align 4
michael@0 6990 convertloop:
michael@0 6991 movq xmm2, qword ptr [esi] // U
michael@0 6992 movq xmm3, qword ptr [esi + edx] // V
michael@0 6993 lea esi, [esi + 8]
michael@0 6994 punpcklbw xmm2, xmm3 // UV
michael@0 6995 movdqu xmm0, [eax] // Y
michael@0 6996 movdqa xmm1, xmm2
michael@0 6997 lea eax, [eax + 16]
michael@0 6998 punpcklbw xmm1, xmm0 // UYVY
michael@0 6999 punpckhbw xmm2, xmm0
michael@0 7000 movdqu [edi], xmm1
michael@0 7001 movdqu [edi + 16], xmm2
michael@0 7002 lea edi, [edi + 32]
michael@0 7003 sub ecx, 16
michael@0 7004 jg convertloop
michael@0 7005
michael@0 7006 pop edi
michael@0 7007 pop esi
michael@0 7008 ret
michael@0 7009 }
michael@0 7010 }
michael@0 7011
michael@0 7012 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
michael@0 7013 __declspec(naked) __declspec(align(16))
michael@0 7014 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
michael@0 7015 uint8* dst_argb, const float* poly,
michael@0 7016 int width) {
michael@0 7017 __asm {
michael@0 7018 push esi
michael@0 7019 mov eax, [esp + 4 + 4] /* src_argb */
michael@0 7020 mov edx, [esp + 4 + 8] /* dst_argb */
michael@0 7021 mov esi, [esp + 4 + 12] /* poly */
michael@0 7022 mov ecx, [esp + 4 + 16] /* width */
michael@0 7023 pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
michael@0 7024
michael@0 7025 // 2 pixel loop.
michael@0 7026 align 4
michael@0 7027 convertloop:
michael@0 7028 // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
michael@0 7029 // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
michael@0 7030 movq xmm0, qword ptr [eax] // BGRABGRA
michael@0 7031 lea eax, [eax + 8]
michael@0 7032 punpcklbw xmm0, xmm3
michael@0 7033 movdqa xmm4, xmm0
michael@0 7034 punpcklwd xmm0, xmm3 // pixel 0
michael@0 7035 punpckhwd xmm4, xmm3 // pixel 1
michael@0 7036 cvtdq2ps xmm0, xmm0 // 4 floats
michael@0 7037 cvtdq2ps xmm4, xmm4
michael@0 7038 movdqa xmm1, xmm0 // X
michael@0 7039 movdqa xmm5, xmm4
michael@0 7040 mulps xmm0, [esi + 16] // C1 * X
michael@0 7041 mulps xmm4, [esi + 16]
michael@0 7042 addps xmm0, [esi] // result = C0 + C1 * X
michael@0 7043 addps xmm4, [esi]
michael@0 7044 movdqa xmm2, xmm1
michael@0 7045 movdqa xmm6, xmm5
michael@0 7046 mulps xmm2, xmm1 // X * X
michael@0 7047 mulps xmm6, xmm5
michael@0 7048 mulps xmm1, xmm2 // X * X * X
michael@0 7049 mulps xmm5, xmm6
michael@0 7050 mulps xmm2, [esi + 32] // C2 * X * X
michael@0 7051 mulps xmm6, [esi + 32]
michael@0 7052 mulps xmm1, [esi + 48] // C3 * X * X * X
michael@0 7053 mulps xmm5, [esi + 48]
michael@0 7054 addps xmm0, xmm2 // result += C2 * X * X
michael@0 7055 addps xmm4, xmm6
michael@0 7056 addps xmm0, xmm1 // result += C3 * X * X * X
michael@0 7057 addps xmm4, xmm5
michael@0 7058 cvttps2dq xmm0, xmm0
michael@0 7059 cvttps2dq xmm4, xmm4
michael@0 7060 packuswb xmm0, xmm4
michael@0 7061 packuswb xmm0, xmm0
michael@0 7062 sub ecx, 2
michael@0 7063 movq qword ptr [edx], xmm0
michael@0 7064 lea edx, [edx + 8]
michael@0 7065 jg convertloop
michael@0 7066 pop esi
michael@0 7067 ret
michael@0 7068 }
michael@0 7069 }
michael@0 7070 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
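// A scalar sketch of the cubic polynomial above (illustrative only): 'poly'
// holds four coefficient vectors C0..C3 of four floats each, one float per
// channel in BGRA order, and each channel x is mapped to
// C0 + C1*x + C2*x^2 + C3*x^3, truncated and saturated to a byte just as
// cvttps2dq + packuswb do.
static void ARGBPolynomialRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                     const float* poly, int width) {
  int i, c;
  for (i = 0; i < width; ++i) {
    for (c = 0; c < 4; ++c) {
      float x = (float)src_argb[c];
      float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
                poly[c + 12] * x * x * x;
      if (v < 0.f) v = 0.f;      // saturate like packuswb.
      if (v > 255.f) v = 255.f;
      dst_argb[c] = (uint8)v;    // truncate like cvttps2dq.
    }
    src_argb += 4;
    dst_argb += 4;
  }
}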
michael@0 7071
michael@0 7072 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
michael@0 7073 __declspec(naked) __declspec(align(16))
michael@0 7074 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
michael@0 7075 uint8* dst_argb, const float* poly,
michael@0 7076 int width) {
michael@0 7077 __asm {
michael@0 7078 mov eax, [esp + 4] /* src_argb */
michael@0 7079 mov edx, [esp + 8] /* dst_argb */
michael@0 7080 mov ecx, [esp + 12] /* poly */
michael@0 7081 vbroadcastf128 ymm4, [ecx] // C0
michael@0 7082 vbroadcastf128 ymm5, [ecx + 16] // C1
michael@0 7083 vbroadcastf128 ymm6, [ecx + 32] // C2
michael@0 7084 vbroadcastf128 ymm7, [ecx + 48] // C3
michael@0 7085 mov ecx, [esp + 16] /* width */
michael@0 7086
michael@0 7087 // 2 pixel loop.
michael@0 7088 align 4
michael@0 7089 convertloop:
michael@0 7090 vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels
michael@0 7091 lea eax, [eax + 8]
michael@0 7092 vcvtdq2ps ymm0, ymm0 // X 8 floats
michael@0 7093 vmulps ymm2, ymm0, ymm0 // X * X
michael@0 7094 vmulps ymm3, ymm0, ymm7 // C3 * X
michael@0 7095 vfmadd132ps ymm0, ymm4, ymm5 // result = C0 + C1 * X
michael@0 7096 vfmadd231ps ymm0, ymm2, ymm6 // result += C2 * X * X
michael@0 7097 vfmadd231ps ymm0, ymm2, ymm3 // result += C3 * X * X * X
michael@0 7098 vcvttps2dq ymm0, ymm0
michael@0 7099 vpackusdw ymm0, ymm0, ymm0 // b0g0r0a0_00000000_b0g0r0a0_00000000
michael@0 7100 vpermq ymm0, ymm0, 0xd8 // b0g0r0a0_b0g0r0a0_00000000_00000000
michael@0 7101 vpackuswb xmm0, xmm0, xmm0 // bgrabgra_00000000_00000000_00000000
michael@0 7102 sub ecx, 2
michael@0 7103 vmovq qword ptr [edx], xmm0
michael@0 7104 lea edx, [edx + 8]
michael@0 7105 jg convertloop
michael@0 7106 vzeroupper
michael@0 7107 ret
michael@0 7108 }
michael@0 7109 }
michael@0 7110 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
michael@0 7111
michael@0 7112 #ifdef HAS_ARGBCOLORTABLEROW_X86
michael@0 7113 // Transform ARGB pixels with color table.
michael@0 7114 __declspec(naked) __declspec(align(16))
michael@0 7115 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
michael@0 7116 int width) {
michael@0 7117 __asm {
michael@0 7118 push esi
michael@0 7119 mov eax, [esp + 4 + 4] /* dst_argb */
michael@0 7120 mov esi, [esp + 4 + 8] /* table_argb */
michael@0 7121 mov ecx, [esp + 4 + 12] /* width */
michael@0 7122
michael@0 7123 // 1 pixel loop.
michael@0 7124 align 4
michael@0 7125 convertloop:
michael@0 7126 movzx edx, byte ptr [eax]
michael@0 7127 lea eax, [eax + 4]
michael@0 7128 movzx edx, byte ptr [esi + edx * 4]
michael@0 7129 mov byte ptr [eax - 4], dl
michael@0 7130 movzx edx, byte ptr [eax - 4 + 1]
michael@0 7131 movzx edx, byte ptr [esi + edx * 4 + 1]
michael@0 7132 mov byte ptr [eax - 4 + 1], dl
michael@0 7133 movzx edx, byte ptr [eax - 4 + 2]
michael@0 7134 movzx edx, byte ptr [esi + edx * 4 + 2]
michael@0 7135 mov byte ptr [eax - 4 + 2], dl
michael@0 7136 movzx edx, byte ptr [eax - 4 + 3]
michael@0 7137 movzx edx, byte ptr [esi + edx * 4 + 3]
michael@0 7138 mov byte ptr [eax - 4 + 3], dl
michael@0 7139 dec ecx
michael@0 7140 jg convertloop
michael@0 7141 pop esi
michael@0 7142 ret
michael@0 7143 }
michael@0 7144 }
michael@0 7145 #endif // HAS_ARGBCOLORTABLEROW_X86
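// A scalar sketch of the in-place table lookup above (illustrative only):
// 'table_argb' is indexed as 256 interleaved 4-byte entries, so channel c of
// a pixel with value v is replaced by table_argb[v * 4 + c].
static void ARGBColorTableRow_Sketch(uint8* dst_argb, const uint8* table_argb,
                                     int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];
    dst_argb += 4;
  }
}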
michael@0 7146
michael@0 7147 #ifdef HAS_RGBCOLORTABLEROW_X86
michael@0 7148 // Transform RGB pixels with color table.
michael@0 7149 __declspec(naked) __declspec(align(16))
michael@0 7150 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
michael@0 7151 __asm {
michael@0 7152 push esi
michael@0 7153 mov eax, [esp + 4 + 4] /* dst_argb */
michael@0 7154 mov esi, [esp + 4 + 8] /* table_argb */
michael@0 7155 mov ecx, [esp + 4 + 12] /* width */
michael@0 7156
michael@0 7157 // 1 pixel loop.
michael@0 7158 align 4
michael@0 7159 convertloop:
michael@0 7160 movzx edx, byte ptr [eax]
michael@0 7161 lea eax, [eax + 4]
michael@0 7162 movzx edx, byte ptr [esi + edx * 4]
michael@0 7163 mov byte ptr [eax - 4], dl
michael@0 7164 movzx edx, byte ptr [eax - 4 + 1]
michael@0 7165 movzx edx, byte ptr [esi + edx * 4 + 1]
michael@0 7166 mov byte ptr [eax - 4 + 1], dl
michael@0 7167 movzx edx, byte ptr [eax - 4 + 2]
michael@0 7168 movzx edx, byte ptr [esi + edx * 4 + 2]
michael@0 7169 mov byte ptr [eax - 4 + 2], dl
michael@0 7170 dec ecx
michael@0 7171 jg convertloop
michael@0 7172
michael@0 7173 pop esi
michael@0 7174 ret
michael@0 7175 }
michael@0 7176 }
michael@0 7177 #endif // HAS_RGBCOLORTABLEROW_X86
michael@0 7178
michael@0 7179 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
michael@0 7180 // Transform RGB pixels with luma table.
michael@0 7181 __declspec(naked) __declspec(align(16))
michael@0 7182 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
michael@0 7183 int width,
michael@0 7184 const uint8* luma, uint32 lumacoeff) {
michael@0 7185 __asm {
michael@0 7186 push esi
michael@0 7187 push edi
michael@0 7188 mov eax, [esp + 8 + 4] /* src_argb */
michael@0 7189 mov edi, [esp + 8 + 8] /* dst_argb */
michael@0 7190 mov ecx, [esp + 8 + 12] /* width */
michael@0 7191 movd xmm2, dword ptr [esp + 8 + 16] // luma table
michael@0 7192 movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
michael@0 7193 pshufd xmm2, xmm2, 0
michael@0 7194 pshufd xmm3, xmm3, 0
michael@0 7195 pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00
michael@0 7196 psllw xmm4, 8
michael@0 7197 pxor xmm5, xmm5
michael@0 7198
michael@0 7199 // 4 pixel loop.
michael@0 7200 align 4
michael@0 7201 convertloop:
michael@0 7202 movdqu xmm0, xmmword ptr [eax] // 4 ARGB pixels; build luma table pointers
michael@0 7203 pmaddubsw xmm0, xmm3
michael@0 7204 phaddw xmm0, xmm0
michael@0 7205 pand xmm0, xmm4 // mask out low bits
michael@0 7206 punpcklwd xmm0, xmm5
michael@0 7207 paddd xmm0, xmm2 // add table base
michael@0 7208 movd esi, xmm0
michael@0 7209 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
michael@0 7210
michael@0 7211 movzx edx, byte ptr [eax]
michael@0 7212 movzx edx, byte ptr [esi + edx]
michael@0 7213 mov byte ptr [edi], dl
michael@0 7214 movzx edx, byte ptr [eax + 1]
michael@0 7215 movzx edx, byte ptr [esi + edx]
michael@0 7216 mov byte ptr [edi + 1], dl
michael@0 7217 movzx edx, byte ptr [eax + 2]
michael@0 7218 movzx edx, byte ptr [esi + edx]
michael@0 7219 mov byte ptr [edi + 2], dl
michael@0 7220 movzx edx, byte ptr [eax + 3] // copy alpha.
michael@0 7221 mov byte ptr [edi + 3], dl
michael@0 7222
michael@0 7223 movd esi, xmm0
michael@0 7224 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
michael@0 7225
michael@0 7226 movzx edx, byte ptr [eax + 4]
michael@0 7227 movzx edx, byte ptr [esi + edx]
michael@0 7228 mov byte ptr [edi + 4], dl
michael@0 7229 movzx edx, byte ptr [eax + 5]
michael@0 7230 movzx edx, byte ptr [esi + edx]
michael@0 7231 mov byte ptr [edi + 5], dl
michael@0 7232 movzx edx, byte ptr [eax + 6]
michael@0 7233 movzx edx, byte ptr [esi + edx]
michael@0 7234 mov byte ptr [edi + 6], dl
michael@0 7235 movzx edx, byte ptr [eax + 7] // copy alpha.
michael@0 7236 mov byte ptr [edi + 7], dl
michael@0 7237
michael@0 7238 movd esi, xmm0
michael@0 7239 pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32
michael@0 7240
michael@0 7241 movzx edx, byte ptr [eax + 8]
michael@0 7242 movzx edx, byte ptr [esi + edx]
michael@0 7243 mov byte ptr [edi + 8], dl
michael@0 7244 movzx edx, byte ptr [eax + 9]
michael@0 7245 movzx edx, byte ptr [esi + edx]
michael@0 7246 mov byte ptr [edi + 9], dl
michael@0 7247 movzx edx, byte ptr [eax + 10]
michael@0 7248 movzx edx, byte ptr [esi + edx]
michael@0 7249 mov byte ptr [edi + 10], dl
michael@0 7250 movzx edx, byte ptr [eax + 11] // copy alpha.
michael@0 7251 mov byte ptr [edi + 11], dl
michael@0 7252
michael@0 7253 movd esi, xmm0
michael@0 7254
michael@0 7255 movzx edx, byte ptr [eax + 12]
michael@0 7256 movzx edx, byte ptr [esi + edx]
michael@0 7257 mov byte ptr [edi + 12], dl
michael@0 7258 movzx edx, byte ptr [eax + 13]
michael@0 7259 movzx edx, byte ptr [esi + edx]
michael@0 7260 mov byte ptr [edi + 13], dl
michael@0 7261 movzx edx, byte ptr [eax + 14]
michael@0 7262 movzx edx, byte ptr [esi + edx]
michael@0 7263 mov byte ptr [edi + 14], dl
michael@0 7264 movzx edx, byte ptr [eax + 15] // copy alpha.
michael@0 7265 mov byte ptr [edi + 15], dl
michael@0 7266
michael@0 7267 sub ecx, 4
michael@0 7268 lea eax, [eax + 16]
michael@0 7269 lea edi, [edi + 16]
michael@0 7270 jg convertloop
michael@0 7271
michael@0 7272 pop edi
michael@0 7273 pop esi
michael@0 7274 ret
michael@0 7275 }
michael@0 7276 }
michael@0 7277 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
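// A scalar sketch of the luma table lookup above (illustrative only).
// 'lumacoeff' packs the B, G, R weights fed to pmaddubsw; the weighted sum,
// masked to a multiple of 256 as the pand with 0xff00 does, selects one
// 256-entry table inside 'luma'. B, G and R are looked up in that table and
// alpha is copied through.
static void ARGBLumaColorTableRow_Sketch(const uint8* src_argb,
                                         uint8* dst_argb, int width,
                                         const uint8* luma,
                                         uint32 lumacoeff) {
  const uint32 bc = lumacoeff & 0xff;
  const uint32 gc = (lumacoeff >> 8) & 0xff;
  const uint32 rc = (lumacoeff >> 16) & 0xff;
  int i;
  for (i = 0; i < width; ++i) {
    const uint32 offset =
        (src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0xff00;
    const uint8* tab = luma + offset;
    dst_argb[0] = tab[src_argb[0]];
    dst_argb[1] = tab[src_argb[1]];
    dst_argb[2] = tab[src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha copied unchanged.
    src_argb += 4;
    dst_argb += 4;
  }
}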
michael@0 7278
michael@0 7279 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
michael@0 7280
michael@0 7281 #ifdef __cplusplus
michael@0 7282 } // extern "C"
michael@0 7283 } // namespace libyuv
michael@0 7284 #endif
