media/libyuv/source/scale_win.cc


/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant added before the right shift by 2 (the kMadd
// coefficient pairs sum to 4).
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

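// Illustrative sketch (added here for clarity, not part of the upstream
// file): the kScaleAc33 and kScaleAb2 words are 16-bit reciprocals in
// 0.16 fixed point. pmulhuw computes (a * b) >> 16, so multiplying a box
// sum by 65536 / 9 approximates a divide by 9 (with a slight downward
// bias, since 65536 / 9 truncates).
static uint16 ScaleSumByReciprocal_Reference(uint16 sum, uint16 reciprocal) {
  return (uint16)(((uint32)(sum) * reciprocal) >> 16);  // what pmulhuw does
}
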
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
                          // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8         // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

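// Scalar reference sketch (an illustration added here, not part of the
// library): the psrlw/packuswb sequence above keeps the odd byte of each
// 16-bit pair, so every output pixel is the odd-indexed source pixel.
static void ScaleRowDown2_Reference(const uint8* src_ptr, uint8* dst_ptr,
                                    int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];
  }
}
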
// Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
                          // src_stride
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    pcmpeqb xmm5, xmm5    // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]

    movdqa xmm2, xmm0     // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb xmm5, xmm5        // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2          // average rows
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0         // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}

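// Scalar sketch (illustration only, not upstream code) of the 2x2 box
// filter above. The SSE2 cascade of pavgb/pavgw rounds at each step, so
// its result can differ from this exact (sum + 2) >> 2 by one.
static void ScaleRowDown2Box_Reference(const uint8* src_ptr,
                                       ptrdiff_t src_stride,
                                       uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    const uint8* s = src_ptr + x * 2;
    int sum = s[0] + s[1] + s[src_stride] + s[src_stride + 1];
    dst_ptr[x] = (uint8)((sum + 2) >> 2);
  }
}
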
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: none; this variant uses unaligned loads and stores.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
                          // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]
    psrlw xmm0, 8         // isolate odd pixels.
    psrlw xmm1, 8
    packuswb xmm0, xmm1
    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
// Alignment requirement: none; this variant uses unaligned loads and stores.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
                          // src_stride
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    pcmpeqb xmm5, xmm5    // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    lea eax, [eax + 32]

    movdqa xmm2, xmm0     // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: none; this variant uses unaligned loads and stores.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb xmm5, xmm5        // generate mask 0x00ff00ff
    psrlw xmm5, 8

    align 4
  wloop:
    movdqu xmm0, [eax]
    movdqu xmm1, [eax + 16]
    movdqu xmm2, [eax + esi]
    movdqu xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2          // average rows
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0         // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm5
    pand xmm3, xmm5
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    sub ecx, 16
    movdqu [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}

// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
                          // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    pcmpeqb xmm5, xmm5    // generate mask 0x00ff0000
    psrld xmm5, 24
    pslld xmm5, 16

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    pand xmm0, xmm5
    pand xmm1, xmm5
    packuswb xmm0, xmm1
    psrlw xmm0, 8
    packuswb xmm0, xmm0
    sub ecx, 8
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    jg wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    push edi
    mov eax, [esp + 8 + 4]    // src_ptr
    mov esi, [esp + 8 + 8]    // src_stride
    mov edx, [esp + 8 + 12]   // dst_ptr
    mov ecx, [esp + 8 + 16]   // dst_width
    lea edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb xmm7, xmm7        // generate mask 0x00ff00ff
    psrlw xmm7, 8

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    pavgb xmm0, xmm2          // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, [eax + esi * 2]
    movdqa xmm3, [eax + esi * 2 + 16]
    movdqa xmm4, [eax + edi]
    movdqa xmm5, [eax + edi + 16]
    lea eax, [eax + 32]
    pavgb xmm2, xmm4
    pavgb xmm3, xmm5
    pavgb xmm0, xmm2
    pavgb xmm1, xmm3

    movdqa xmm2, xmm0         // average columns (32 to 16 pixels)
    psrlw xmm0, 8
    movdqa xmm3, xmm1
    psrlw xmm1, 8
    pand xmm2, xmm7
    pand xmm3, xmm7
    pavgw xmm0, xmm2
    pavgw xmm1, xmm3
    packuswb xmm0, xmm1

    movdqa xmm2, xmm0         // average columns (16 to 8 pixels)
    psrlw xmm0, 8
    pand xmm2, xmm7
    pavgw xmm0, xmm2
    packuswb xmm0, xmm0

    sub ecx, 8
    movq qword ptr [edx], xmm0
    lea edx, [edx + 8]
    jg wloop

    pop edi
    pop esi
    ret
  }
}

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
                          // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    movdqa xmm3, kShuf0
    movdqa xmm4, kShuf1
    movdqa xmm5, kShuf2

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm1
    palignr xmm1, xmm0, 8
    pshufb xmm0, xmm3
    pshufb xmm1, xmm4
    pshufb xmm2, xmm5
    movq qword ptr [edx], xmm0
    movq qword ptr [edx + 8], xmm1
    movq qword ptr [edx + 16], xmm2
    lea edx, [edx + 24]
    sub ecx, 24
    jg wloop

    ret
  }
}

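// Sketch (illustration only, not upstream code): the kShuf0/kShuf1/kShuf2
// tables above drop byte 2 of every group of 4, so 32 source pixels point
// sample down to 24.
static void ScaleRowDown34_Reference(const uint8* src_ptr, uint8* dst_ptr,
                                     int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}
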
// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    movdqa xmm2, kShuf01
    movdqa xmm3, kShuf11
    movdqa xmm4, kShuf21
    movdqa xmm5, kMadd01
    movdqa xmm6, kMadd11
    movdqa xmm7, kRound34

    align 4
  wloop:
    movdqa xmm0, [eax]          // pixels 0..7
    movdqa xmm1, [eax + esi]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]      // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqa xmm0, [eax + 16]     // pixels 16..23
    movdqa xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    sub ecx, 24
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    jg wloop

    pop esi
    ret
  }
}

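// Sketch (illustration only) of the column weighting the pmaddubsw steps
// apply with kMadd01/kMadd11/kMadd21: each output pixel is a (3,1), (2,2)
// or (1,3) weighted average of two adjacent row-filtered source pixels.
static uint8 Filter34_Reference(uint8 a, uint8 b, int wa, int wb) {
  return (uint8)((a * wa + b * wb + 2) >> 2);  // wa + wb == 4; +2 is kRound34
}
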
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    movdqa xmm2, kShuf01
    movdqa xmm3, kShuf11
    movdqa xmm4, kShuf21
    movdqa xmm5, kMadd01
    movdqa xmm6, kMadd11
    movdqa xmm7, kRound34

    align 4
  wloop:
    movdqa xmm0, [eax]          // pixels 0..7
    movdqa xmm1, [eax + esi]
    pavgb xmm1, xmm0            // double pavgb weights row0 ~3/4, row1 ~1/4
    pavgb xmm0, xmm1
    pshufb xmm0, xmm2
    pmaddubsw xmm0, xmm5
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx], xmm0
    movdqu xmm0, [eax + 8]      // pixels 8..15
    movdqu xmm1, [eax + esi + 8]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm3
    pmaddubsw xmm0, xmm6
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    movq qword ptr [edx + 8], xmm0
    movdqa xmm0, [eax + 16]     // pixels 16..23
    movdqa xmm1, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm1, xmm0
    pavgb xmm0, xmm1
    pshufb xmm0, xmm4
    movdqa xmm1, kMadd21
    pmaddubsw xmm0, xmm1
    paddsw xmm0, xmm7
    psrlw xmm0, 2
    packuswb xmm0, xmm0
    sub ecx, 24
    movq qword ptr [edx + 16], xmm0
    lea edx, [edx + 24]
    jg wloop

    pop esi
    ret
  }
}

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_ptr
                          // src_stride ignored
    mov edx, [esp + 12]   // dst_ptr
    mov ecx, [esp + 16]   // dst_width
    movdqa xmm4, kShuf38a
    movdqa xmm5, kShuf38b

    align 4
  xloop:
    movdqa xmm0, [eax]        // 16 pixels -> 0,1,2,3,4,5
    movdqa xmm1, [eax + 16]   // 16 pixels -> 6,7,8,9,10,11
    lea eax, [eax + 32]
    pshufb xmm0, xmm4
    pshufb xmm1, xmm5
    paddusb xmm0, xmm1

    sub ecx, 12
    movq qword ptr [edx], xmm0  // write 12 pixels
    movhlps xmm1, xmm0
    movd [edx + 8], xmm1
    lea edx, [edx + 12]
    jg xloop

    ret
  }
}

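// Sketch (illustration only, not upstream code) of the 3/8 point sampler
// above: kShuf38a and kShuf38b select bytes 0, 3 and 6 of each group of 8,
// so 32 source pixels become 12.
static void ScaleRowDown38_Reference(const uint8* src_ptr, uint8* dst_ptr,
                                     int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[3];
    dst_ptr[x + 2] = src_ptr[6];
    src_ptr += 8;
  }
}
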
// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    movdqa xmm2, kShufAc
    movdqa xmm3, kShufAc3
    movdqa xmm4, kScaleAc33
    pxor xmm5, xmm5

    align 4
  xloop:
    movdqa xmm0, [eax]        // sum up 3 rows into xmm0/1
    movdqa xmm6, [eax + esi]
    movhlps xmm1, xmm0
    movhlps xmm7, xmm6
    punpcklbw xmm0, xmm5
    punpcklbw xmm1, xmm5
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7
    movdqa xmm6, [eax + esi * 2]
    lea eax, [eax + 16]
    movhlps xmm7, xmm6
    punpcklbw xmm6, xmm5
    punpcklbw xmm7, xmm5
    paddusw xmm0, xmm6
    paddusw xmm1, xmm7

    movdqa xmm6, xmm0         // 8 pixels -> 0,1,2 of xmm6
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    psrldq xmm0, 2
    paddusw xmm6, xmm0
    pshufb xmm6, xmm2

    movdqa xmm7, xmm1         // 8 pixels -> 3,4,5 of xmm6
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    psrldq xmm1, 2
    paddusw xmm7, xmm1
    pshufb xmm7, xmm3
    paddusw xmm6, xmm7

    pmulhuw xmm6, xmm4        // divide by 9,9,6, 9,9,6
    packuswb xmm6, xmm6

    sub ecx, 6
    movd [edx], xmm6          // write 6 pixels
    psrlq xmm6, 16
    movd [edx + 2], xmm6
    lea edx, [edx + 6]
    jg xloop

    pop esi
    ret
  }
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_ptr
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_ptr
    mov ecx, [esp + 4 + 16]   // dst_width
    movdqa xmm2, kShufAb0
    movdqa xmm3, kShufAb1
    movdqa xmm4, kShufAb2
    movdqa xmm5, kScaleAb2

    align 4
  xloop:
    movdqa xmm0, [eax]        // average 2 rows into xmm0
    pavgb xmm0, [eax + esi]
    lea eax, [eax + 16]

    movdqa xmm1, xmm0         // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb xmm1, xmm2
    movdqa xmm6, xmm0
    pshufb xmm6, xmm3
    paddusw xmm1, xmm6
    pshufb xmm0, xmm4
    paddusw xmm1, xmm0

    pmulhuw xmm1, xmm5        // divide by 3,3,2, 3,3,2
    packuswb xmm1, xmm1

    sub ecx, 6
    movd [edx], xmm1          // write 6 pixels
    psrlq xmm1, 16
    movd [edx + 2], xmm1
    lea edx, [edx + 6]
    jg xloop

    pop esi
    ret
  }
}

// Reads 16xN bytes and produces 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
__declspec(naked) __declspec(align(16))
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width,
                       int src_height) {
  __asm {
    push esi
    push edi
    push ebx
    push ebp
    mov esi, [esp + 16 + 4]    // src_ptr
    mov edx, [esp + 16 + 8]    // src_stride
    mov edi, [esp + 16 + 12]   // dst_ptr
    mov ecx, [esp + 16 + 16]   // src_width
    mov ebx, [esp + 16 + 20]   // src_height
    pxor xmm4, xmm4
    dec ebx

    align 4
  xloop:
    // first row
    movdqa xmm0, [esi]
    lea eax, [esi + edx]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm4
    punpckhbw xmm1, xmm4
    lea esi, [esi + 16]
    mov ebp, ebx
    test ebp, ebp
    je ydone

    // sum remaining rows
    align 4
  yloop:
    movdqa xmm2, [eax]    // read 16 pixels
    lea eax, [eax + edx]  // advance to next row
    movdqa xmm3, xmm2
    punpcklbw xmm2, xmm4
    punpckhbw xmm3, xmm4
    paddusw xmm0, xmm2    // sum 16 words
    paddusw xmm1, xmm3
    sub ebp, 1
    jg yloop

    align 4
  ydone:
    movdqa [edi], xmm0
    movdqa [edi + 16], xmm1
    lea edi, [edi + 32]

    sub ecx, 16
    jg xloop

    pop ebp
    pop ebx
    pop edi
    pop esi
    ret
  }
}

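// Scalar sketch (illustration only) of ScaleAddRows_SSE2: sum src_height
// rows into 16-bit accumulators. Note paddusw saturates at 65535 while
// this plain addition wraps.
static void ScaleAddRows_Reference(const uint8* src_ptr, ptrdiff_t src_stride,
                                   uint16* dst_ptr, int src_width,
                                   int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint16 sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += src_ptr[y * src_stride + x];
    }
    dst_ptr[x] = sum;
  }
}
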
// Bilinear column filtering. SSSE3 version.
// TODO(fbarchard): Port to Neon
// TODO(fbarchard): Switch the following:
//    xor ebx, ebx
//    mov bx, word ptr [esi + eax]  // 2 source x0 pixels
// To
//    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
// when drmemory bug fixed.
// https://code.google.com/p/drmemory/issues/detail?id=1396

__declspec(naked) __declspec(align(16))
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push ebx
    push esi
    push edi
    mov edi, [esp + 12 + 4]    // dst_ptr
    mov esi, [esp + 12 + 8]    // src_ptr
    mov ecx, [esp + 12 + 12]   // dst_width
    movd xmm2, [esp + 12 + 16] // x
    movd xmm3, [esp + 12 + 20] // dx
    mov eax, 0x04040000        // shuffle to line up fractions with pixel.
    movd xmm5, eax
    pcmpeqb xmm6, xmm6         // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1        // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2          // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0       // x0 x1
    punpckldq xmm3, xmm3       // dx dx
    paddd xmm3, xmm3           // dx * 2, dx * 2
    pextrw edx, xmm2, 3        // get x1 integer. preroll

    // 2 Pixel loop.
    align 4
  xloop2:
    movdqa xmm1, xmm2          // x0, x1 fractions.
    paddd xmm2, xmm3           // x += dx
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm1, 9              // 7 bit fractions.
    movzx ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd xmm4, ebx
    pshufb xmm1, xmm5          // 0011
    punpcklwd xmm0, xmm4
    pxor xmm1, xmm6            // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1       // 16 bit, 2 pixels.
    pextrw eax, xmm2, 1        // get x0 integer. next iteration.
    pextrw edx, xmm2, 3        // get x1 integer. next iteration.
    psrlw xmm0, 7              // 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0        // 8 bits, 2 pixels.
    movd ebx, xmm0
    mov [edi], bx
    lea edi, [edi + 2]
    sub ecx, 2                 // 2 pixels
    jge xloop2

    align 4
  xloop29:

    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    movzx ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd xmm0, ebx
    psrlw xmm2, 9              // 7 bit fractions.
    pshufb xmm2, xmm5          // 0011
    pxor xmm2, xmm6            // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2       // 16 bit
    psrlw xmm0, 7              // 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0        // 8 bits
    movd ebx, xmm0
    mov [edi], bl

    align 4
  xloop99:

    pop edi
    pop esi
    pop ebx
    ret
  }
}

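// Scalar sketch (illustration only) of the per-pixel math above: x is a
// 16.16 fixed point position; the top 7 fraction bits f give the blend
// weights (127 - f, f), matching pmaddubsw followed by psrlw 7.
static void ScaleFilterCols_Reference(uint8* dst_ptr, const uint8* src_ptr,
                                      int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;           // integer source position
    int f = (x >> 9) & 0x7f;    // 7 bit fraction
    dst_ptr[j] = (uint8)((src_ptr[xi] * (127 - f) + src_ptr[xi + 1] * f) >> 7);
    x += dx;
  }
}
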
// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov edx, [esp + 4]    // dst_ptr
    mov eax, [esp + 8]    // src_ptr
    mov ecx, [esp + 12]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpcklbw xmm0, xmm0
    punpckhbw xmm1, xmm1
    sub ecx, 32
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    jg wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_argb
                          // src_stride ignored
    mov edx, [esp + 12]   // dst_argb
    mov ecx, [esp + 16]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    shufps xmm0, xmm1, 0xdd   // odd pixels
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov eax, [esp + 4]    // src_argb
                          // src_stride ignored
    mov edx, [esp + 12]   // dst_argb
    mov ecx, [esp + 16]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    lea eax, [eax + 32]
    movdqa xmm2, xmm0
    shufps xmm0, xmm1, 0x88   // even pixels
    shufps xmm2, xmm1, 0xdd   // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push esi
    mov eax, [esp + 4 + 4]    // src_argb
    mov esi, [esp + 4 + 8]    // src_stride
    mov edx, [esp + 4 + 12]   // dst_argb
    mov ecx, [esp + 4 + 16]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + esi]
    movdqa xmm3, [eax + esi + 16]
    lea eax, [eax + 32]
    pavgb xmm0, xmm2          // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0         // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88   // even pixels
    shufps xmm2, xmm1, 0xdd   // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop esi
    ret
  }
}

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push ebx
    push edi
    mov eax, [esp + 8 + 4]    // src_argb
                              // src_stride ignored
    mov ebx, [esp + 8 + 12]   // src_stepx
    mov edx, [esp + 8 + 16]   // dst_argb
    mov ecx, [esp + 8 + 20]   // dst_width
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

    align 4
  wloop:
    movd xmm0, [eax]
    movd xmm1, [eax + ebx]
    punpckldq xmm0, xmm1
    movd xmm2, [eax + ebx * 2]
    movd xmm3, [eax + edi]
    lea eax, [eax + ebx * 4]
    punpckldq xmm2, xmm3
    punpcklqdq xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop edi
    pop ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push ebx
    push esi
    push edi
    mov eax, [esp + 12 + 4]    // src_argb
    mov esi, [esp + 12 + 8]    // src_stride
    mov ebx, [esp + 12 + 12]   // src_stepx
    mov edx, [esp + 12 + 16]   // dst_argb
    mov ecx, [esp + 12 + 20]   // dst_width
    lea esi, [eax + esi]       // row1 pointer
    lea ebx, [ebx * 4]
    lea edi, [ebx + ebx * 2]

    align 4
  wloop:
    movq xmm0, qword ptr [eax]  // row0 4 pairs
    movhps xmm0, qword ptr [eax + ebx]
    movq xmm1, qword ptr [eax + ebx * 2]
    movhps xmm1, qword ptr [eax + edi]
    lea eax, [eax + ebx * 4]
    movq xmm2, qword ptr [esi]  // row1 4 pairs
    movhps xmm2, qword ptr [esi + ebx]
    movq xmm3, qword ptr [esi + ebx * 2]
    movhps xmm3, qword ptr [esi + edi]
    lea esi, [esi + ebx * 4]
    pavgb xmm0, xmm2            // average rows
    pavgb xmm1, xmm3
    movdqa xmm2, xmm0           // average columns (8 to 4 pixels)
    shufps xmm0, xmm1, 0x88     // even pixels
    shufps xmm2, xmm1, 0xdd     // odd pixels
    pavgb xmm0, xmm2
    sub ecx, 4
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    jg wloop

    pop edi
    pop esi
    pop ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) __declspec(align(16))
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push edi
    push esi
    mov edi, [esp + 8 + 4]     // dst_argb
    mov esi, [esp + 8 + 8]     // src_argb
    mov ecx, [esp + 8 + 12]    // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx

    pshufd xmm2, xmm2, 0       // x0 x0 x0 x0
    pshufd xmm0, xmm3, 0x11    // dx 0 dx 0
    paddd xmm2, xmm0
    paddd xmm3, xmm3           // 0, 0, 0, dx * 2
    pshufd xmm0, xmm3, 0x05    // dx * 2, dx * 2, 0, 0
    paddd xmm2, xmm0           // x3 x2 x1 x0
    paddd xmm3, xmm3           // 0, 0, 0, dx * 4
    pshufd xmm3, xmm3, 0       // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw eax, xmm2, 1        // get x0 integer.
    pextrw edx, xmm2, 3        // get x1 integer.

    cmp ecx, 0
    jle xloop99
    sub ecx, 4
    jl xloop49

    // 4 Pixel loop.
    align 4
  xloop4:
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixel
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixel
    pextrw eax, xmm2, 5         // get x2 integer.
    pextrw edx, xmm2, 7         // get x3 integer.
    paddd xmm2, xmm3            // x += dx
    punpckldq xmm0, xmm1        // x0 x1

    movd xmm1, [esi + eax * 4]  // 1 source x2 pixel
    movd xmm4, [esi + edx * 4]  // 1 source x3 pixel
    pextrw eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw edx, xmm2, 3         // get x1 integer. next iteration.
    punpckldq xmm1, xmm4        // x2 x3
    punpcklqdq xmm0, xmm1       // x0 x1 x2 x3
    sub ecx, 4                  // 4 pixels
    movdqu [edi], xmm0
    lea edi, [edi + 16]
    jge xloop4

    align 4
  xloop49:
    test ecx, 2
    je xloop29

    // 2 pixel remainder.
    movd xmm0, [esi + eax * 4]  // 1 source x0 pixel
    movd xmm1, [esi + edx * 4]  // 1 source x1 pixel
    pextrw eax, xmm2, 5         // get x2 integer.
    punpckldq xmm0, xmm1        // x0 x1

    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]

  xloop29:
    test ecx, 1
    je xloop99

    // 1 pixel remainder.
    movd xmm0, [esi + eax * 4]  // 1 source x2 pixel
    movd dword ptr [edi], xmm0
    align 4
  xloop99:

    pop esi
    pop edi
    ret
  }
}

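// Sketch (illustration only) of the unfiltered ARGB column scaler above:
// step x by dx in 16.16 fixed point and copy the nearest 32-bit pixel.
static void ScaleARGBCols_Reference(uint8* dst_argb, const uint8* src_argb,
                                    int dst_width, int x, int dx) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  for (int j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];
    x += dx;
  }
}
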
// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) __declspec(align(16))
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push esi
    push edi
    mov edi, [esp + 8 + 4]     // dst_argb
    mov esi, [esp + 8 + 8]     // src_argb
    mov ecx, [esp + 8 + 12]    // dst_width
    movd xmm2, [esp + 8 + 16]  // x
    movd xmm3, [esp + 8 + 20]  // dx
    movdqa xmm4, kShuffleColARGB
    movdqa xmm5, kShuffleFractions
    pcmpeqb xmm6, xmm6         // generate 0x007f for inverting fraction.
    psrlw xmm6, 9
    pextrw eax, xmm2, 1        // get x0 integer. preroll
    sub ecx, 2
    jl xloop29

    movdqa xmm0, xmm2          // x1 = x0 + dx
    paddd xmm0, xmm3
    punpckldq xmm2, xmm0       // x0 x1
    punpckldq xmm3, xmm3       // dx dx
    paddd xmm3, xmm3           // dx * 2, dx * 2
    pextrw edx, xmm2, 3        // get x1 integer. preroll

    // 2 Pixel loop.
    align 4
  xloop2:
    movdqa xmm1, xmm2          // x0, x1 fractions.
    paddd xmm2, xmm3           // x += dx
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw xmm1, 9              // 7 bit fractions.
    movhps xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb xmm1, xmm5          // 0000000011111111
    pshufb xmm0, xmm4          // arrange pixels into pairs
    pxor xmm1, xmm6            // 0..7f and 7f..0
    pmaddubsw xmm0, xmm1       // argb_argb 16 bit, 2 pixels.
    pextrw eax, xmm2, 1        // get x0 integer. next iteration.
    pextrw edx, xmm2, 3        // get x1 integer. next iteration.
    psrlw xmm0, 7              // argb 8.7 fixed point to low 8 bits.
    packuswb xmm0, xmm0        // argb_argb 8 bits, 2 pixels.
    movq qword ptr [edi], xmm0
    lea edi, [edi + 8]
    sub ecx, 2                 // 2 pixels
    jge xloop2

    align 4
  xloop29:

    add ecx, 2 - 1
    jl xloop99

    // 1 pixel remainder
    psrlw xmm2, 9              // 7 bit fractions.
    movq xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb xmm2, xmm5          // 00000000
    pshufb xmm0, xmm4          // arrange pixels into pairs
    pxor xmm2, xmm6            // 0..7f and 7f..0
    pmaddubsw xmm0, xmm2       // argb 16 bit, 1 pixel.
    psrlw xmm0, 7
    packuswb xmm0, xmm0        // argb 8 bits, 1 pixel.
    movd [edi], xmm0

    align 4
  xloop99:

    pop edi
    pop esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov edx, [esp + 4]    // dst_argb
    mov eax, [esp + 8]    // src_argb
    mov ecx, [esp + 12]   // dst_width

    align 4
  wloop:
    movdqa xmm0, [eax]
    lea eax, [eax + 16]
    movdqa xmm1, xmm0
    punpckldq xmm0, xmm0
    punpckhdq xmm1, xmm1
    sub ecx, 8
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    jg wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]    // num
    cdq                   // extend num to 64 bits
    shld edx, eax, 16     // 32.16
    shl eax, 16
    idiv dword ptr [esp + 8]
    ret
  }
}

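// Sketch (illustration only) of FixedDiv_X86 in plain C: the cdq/shld/shl
// sequence builds the 64-bit value num << 16, which idiv then divides.
static int FixedDiv_Reference(int num, int div) {
  return (int)((((int64)(num)) << 16) / div);
}
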
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov eax, [esp + 4]    // num
    mov ecx, [esp + 8]    // div
    cdq                   // extend num to 64 bits
    shld edx, eax, 16     // 32.16
    shl eax, 16
    sub eax, 0x00010001
    sbb edx, 0
    sub ecx, 1
    idiv ecx
    ret
  }
}

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
