media/libyuv/source/scale_win.cc

changeset 0:6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libyuv/source/scale_win.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1320 @@
     1.4 +/*
     1.5 + *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
     1.6 + *
     1.7 + *  Use of this source code is governed by a BSD-style license
     1.8 + *  that can be found in the LICENSE file in the root of the source
     1.9 + *  tree. An additional intellectual property rights grant can be found
    1.10 + *  in the file PATENTS. All contributing project authors may
    1.11 + *  be found in the AUTHORS file in the root of the source tree.
    1.12 + */
    1.13 +
    1.14 +#include "libyuv/row.h"
    1.15 +
    1.16 +#ifdef __cplusplus
    1.17 +namespace libyuv {
    1.18 +extern "C" {
    1.19 +#endif
    1.20 +
    1.21 +// This module is for Visual C x86.
    1.22 +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
    1.23 +
    1.24 +// Offsets for source bytes 0 to 9
    1.25 +static uvec8 kShuf0 =
    1.26 +  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
    1.27 +
    1.28 +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
    1.29 +static uvec8 kShuf1 =
    1.30 +  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
    1.31 +
     1.32 +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
    1.33 +static uvec8 kShuf2 =
    1.34 +  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
    1.35 +
    1.36 +// Offsets for source bytes 0 to 10
    1.37 +static uvec8 kShuf01 =
    1.38 +  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
    1.39 +
     1.40 +// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
    1.41 +static uvec8 kShuf11 =
    1.42 +  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
    1.43 +
     1.44 +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
    1.45 +static uvec8 kShuf21 =
    1.46 +  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
    1.47 +
    1.48 +// Coefficients for source bytes 0 to 10
    1.49 +static uvec8 kMadd01 =
    1.50 +  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
    1.51 +
    1.52 +// Coefficients for source bytes 10 to 21
    1.53 +static uvec8 kMadd11 =
    1.54 +  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
    1.55 +
    1.56 +// Coefficients for source bytes 21 to 31
    1.57 +static uvec8 kMadd21 =
    1.58 +  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
    1.59 +
     1.60 +// Rounding constant for the 3/4 box filter: add 2 before the >> 2.
    1.61 +static vec16 kRound34 =
    1.62 +  { 2, 2, 2, 2, 2, 2, 2, 2 };
    1.63 +
    1.64 +static uvec8 kShuf38a =
    1.65 +  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
    1.66 +
    1.67 +static uvec8 kShuf38b =
    1.68 +  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
    1.69 +
    1.70 +// Arrange words 0,3,6 into 0,1,2
    1.71 +static uvec8 kShufAc =
    1.72 +  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
    1.73 +
    1.74 +// Arrange words 0,3,6 into 3,4,5
    1.75 +static uvec8 kShufAc3 =
    1.76 +  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
    1.77 +
    1.78 +// Scaling values for boxes of 3x3 and 2x3
    1.79 +static uvec16 kScaleAc33 =
    1.80 +  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
    1.81 +
    1.82 +// Arrange first value for pixels 0,1,2,3,4,5
    1.83 +static uvec8 kShufAb0 =
    1.84 +  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
    1.85 +
    1.86 +// Arrange second value for pixels 0,1,2,3,4,5
    1.87 +static uvec8 kShufAb1 =
    1.88 +  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
    1.89 +
    1.90 +// Arrange third value for pixels 0,1,2,3,4,5
    1.91 +static uvec8 kShufAb2 =
    1.92 +  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
    1.93 +
    1.94 +// Scaling values for boxes of 3x2 and 2x2
    1.95 +static uvec16 kScaleAb2 =
    1.96 +  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
    1.97 +
    1.98 +// Reads 32 pixels, throws half away and writes 16 pixels.
    1.99 +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
   1.100 +__declspec(naked) __declspec(align(16))
   1.101 +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   1.102 +                        uint8* dst_ptr, int dst_width) {
   1.103 +  __asm {
   1.104 +    mov        eax, [esp + 4]        // src_ptr
   1.105 +                                     // src_stride ignored
   1.106 +    mov        edx, [esp + 12]       // dst_ptr
   1.107 +    mov        ecx, [esp + 16]       // dst_width
   1.108 +
   1.109 +    align      4
   1.110 +  wloop:
   1.111 +    movdqa     xmm0, [eax]
   1.112 +    movdqa     xmm1, [eax + 16]
   1.113 +    lea        eax,  [eax + 32]
   1.114 +    psrlw      xmm0, 8               // isolate odd pixels.
   1.115 +    psrlw      xmm1, 8
   1.116 +    packuswb   xmm0, xmm1
   1.117 +    sub        ecx, 16
   1.118 +    movdqa     [edx], xmm0
   1.119 +    lea        edx, [edx + 16]
   1.120 +    jg         wloop
   1.121 +
   1.122 +    ret
   1.123 +  }
   1.124 +}
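
For reference, the psrlw 8 / packuswb pair above keeps the odd byte of each 16-bit pair, so the routine point-samples every second source pixel. A minimal scalar sketch of the same per-row behavior (the helper name and the <stdint.h> types are illustrative; libyuv itself uses its own uint8 typedef):

#include <stddef.h>
#include <stdint.h>

// Scalar equivalent of the loop above: keep the odd pixel of each pair.
// src_stride is unused, matching the SSE2 version.
static void ScaleRowDown2_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                   uint8_t* dst_ptr, int dst_width) {
  (void)src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[2 * x + 1];  // psrlw 8 + packuswb keeps byte 1 of each word
  }
}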
   1.125 +
   1.126 +// Blends 32x1 rectangle to 16x1.
   1.127 +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
   1.128 +__declspec(naked) __declspec(align(16))
   1.129 +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   1.130 +                              uint8* dst_ptr, int dst_width) {
   1.131 +  __asm {
   1.132 +    mov        eax, [esp + 4]        // src_ptr
   1.133 +                                     // src_stride
   1.134 +    mov        edx, [esp + 12]       // dst_ptr
   1.135 +    mov        ecx, [esp + 16]       // dst_width
   1.136 +    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   1.137 +    psrlw      xmm5, 8
   1.138 +
   1.139 +    align      4
   1.140 +  wloop:
   1.141 +    movdqa     xmm0, [eax]
   1.142 +    movdqa     xmm1, [eax + 16]
   1.143 +    lea        eax,  [eax + 32]
   1.144 +
   1.145 +    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
   1.146 +    psrlw      xmm0, 8
   1.147 +    movdqa     xmm3, xmm1
   1.148 +    psrlw      xmm1, 8
   1.149 +    pand       xmm2, xmm5
   1.150 +    pand       xmm3, xmm5
   1.151 +    pavgw      xmm0, xmm2
   1.152 +    pavgw      xmm1, xmm3
   1.153 +    packuswb   xmm0, xmm1
   1.154 +
   1.155 +    sub        ecx, 16
   1.156 +    movdqa     [edx], xmm0
   1.157 +    lea        edx, [edx + 16]
   1.158 +    jg         wloop
   1.159 +
   1.160 +    ret
   1.161 +  }
   1.162 +}
   1.163 +
   1.164 +// Blends 32x2 rectangle to 16x1.
   1.165 +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
   1.166 +__declspec(naked) __declspec(align(16))
   1.167 +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   1.168 +                           uint8* dst_ptr, int dst_width) {
   1.169 +  __asm {
   1.170 +    push       esi
   1.171 +    mov        eax, [esp + 4 + 4]    // src_ptr
   1.172 +    mov        esi, [esp + 4 + 8]    // src_stride
   1.173 +    mov        edx, [esp + 4 + 12]   // dst_ptr
   1.174 +    mov        ecx, [esp + 4 + 16]   // dst_width
   1.175 +    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   1.176 +    psrlw      xmm5, 8
   1.177 +
   1.178 +    align      4
   1.179 +  wloop:
   1.180 +    movdqa     xmm0, [eax]
   1.181 +    movdqa     xmm1, [eax + 16]
   1.182 +    movdqa     xmm2, [eax + esi]
   1.183 +    movdqa     xmm3, [eax + esi + 16]
   1.184 +    lea        eax,  [eax + 32]
   1.185 +    pavgb      xmm0, xmm2            // average rows
   1.186 +    pavgb      xmm1, xmm3
   1.187 +
   1.188 +    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
   1.189 +    psrlw      xmm0, 8
   1.190 +    movdqa     xmm3, xmm1
   1.191 +    psrlw      xmm1, 8
   1.192 +    pand       xmm2, xmm5
   1.193 +    pand       xmm3, xmm5
   1.194 +    pavgw      xmm0, xmm2
   1.195 +    pavgw      xmm1, xmm3
   1.196 +    packuswb   xmm0, xmm1
   1.197 +
   1.198 +    sub        ecx, 16
   1.199 +    movdqa     [edx], xmm0
   1.200 +    lea        edx, [edx + 16]
   1.201 +    jg         wloop
   1.202 +
   1.203 +    pop        esi
   1.204 +    ret
   1.205 +  }
   1.206 +}
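
The box path averages a 2x2 block per output pixel. Both pavgb and pavgw compute a rounded average, (a + b + 1) >> 1, so the result is a cascade of rounded averages rather than an exact (a + b + c + d + 2) >> 2. A scalar sketch of that arithmetic (hypothetical names, <stdint.h> types):

#include <stddef.h>
#include <stdint.h>

// pavgb/pavgw semantics: average rounded to nearest.
static inline uint8_t avg_round(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}

// Scalar sketch of ScaleRowDown2Box_SSE2: average rows first (pavgb), then
// the two columns of each pair (pavgw on the masked/shifted words).
static void ScaleRowDown2Box_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                      uint8_t* dst_ptr, int dst_width) {
  const uint8_t* row1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    uint8_t left  = avg_round(src_ptr[2 * x + 0], row1[2 * x + 0]);  // row average
    uint8_t right = avg_round(src_ptr[2 * x + 1], row1[2 * x + 1]);
    dst_ptr[x] = avg_round(left, right);                             // column average
  }
}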
   1.207 +
   1.208 +// Reads 32 pixels, throws half away and writes 16 pixels.
   1.209 +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
   1.210 +__declspec(naked) __declspec(align(16))
   1.211 +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
   1.212 +                                  ptrdiff_t src_stride,
   1.213 +                                  uint8* dst_ptr, int dst_width) {
   1.214 +  __asm {
   1.215 +    mov        eax, [esp + 4]        // src_ptr
   1.216 +                                     // src_stride ignored
   1.217 +    mov        edx, [esp + 12]       // dst_ptr
   1.218 +    mov        ecx, [esp + 16]       // dst_width
   1.219 +
   1.220 +    align      4
   1.221 +  wloop:
   1.222 +    movdqu     xmm0, [eax]
   1.223 +    movdqu     xmm1, [eax + 16]
   1.224 +    lea        eax,  [eax + 32]
   1.225 +    psrlw      xmm0, 8               // isolate odd pixels.
   1.226 +    psrlw      xmm1, 8
   1.227 +    packuswb   xmm0, xmm1
   1.228 +    sub        ecx, 16
   1.229 +    movdqu     [edx], xmm0
   1.230 +    lea        edx, [edx + 16]
   1.231 +    jg         wloop
   1.232 +
   1.233 +    ret
   1.234 +  }
   1.235 +}
   1.236 +
   1.237 +// Blends 32x1 rectangle to 16x1.
   1.238 +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
   1.239 +__declspec(naked) __declspec(align(16))
   1.240 +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
   1.241 +                                        ptrdiff_t src_stride,
   1.242 +                                        uint8* dst_ptr, int dst_width) {
   1.243 +  __asm {
   1.244 +    mov        eax, [esp + 4]        // src_ptr
   1.245 +                                     // src_stride
   1.246 +    mov        edx, [esp + 12]       // dst_ptr
   1.247 +    mov        ecx, [esp + 16]       // dst_width
   1.248 +    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   1.249 +    psrlw      xmm5, 8
   1.250 +
   1.251 +    align      4
   1.252 +  wloop:
   1.253 +    movdqu     xmm0, [eax]
   1.254 +    movdqu     xmm1, [eax + 16]
   1.255 +    lea        eax,  [eax + 32]
   1.256 +
   1.257 +    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
   1.258 +    psrlw      xmm0, 8
   1.259 +    movdqa     xmm3, xmm1
   1.260 +    psrlw      xmm1, 8
   1.261 +    pand       xmm2, xmm5
   1.262 +    pand       xmm3, xmm5
   1.263 +    pavgw      xmm0, xmm2
   1.264 +    pavgw      xmm1, xmm3
   1.265 +    packuswb   xmm0, xmm1
   1.266 +
   1.267 +    sub        ecx, 16
   1.268 +    movdqu     [edx], xmm0
   1.269 +    lea        edx, [edx + 16]
   1.270 +    jg         wloop
   1.271 +
   1.272 +    ret
   1.273 +  }
   1.274 +}
   1.275 +
   1.276 +// Blends 32x2 rectangle to 16x1.
   1.277 +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
   1.278 +__declspec(naked) __declspec(align(16))
   1.279 +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
   1.280 +                                     ptrdiff_t src_stride,
   1.281 +                                     uint8* dst_ptr, int dst_width) {
   1.282 +  __asm {
   1.283 +    push       esi
   1.284 +    mov        eax, [esp + 4 + 4]    // src_ptr
   1.285 +    mov        esi, [esp + 4 + 8]    // src_stride
   1.286 +    mov        edx, [esp + 4 + 12]   // dst_ptr
   1.287 +    mov        ecx, [esp + 4 + 16]   // dst_width
   1.288 +    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
   1.289 +    psrlw      xmm5, 8
   1.290 +
   1.291 +    align      4
   1.292 +  wloop:
   1.293 +    movdqu     xmm0, [eax]
   1.294 +    movdqu     xmm1, [eax + 16]
   1.295 +    movdqu     xmm2, [eax + esi]
   1.296 +    movdqu     xmm3, [eax + esi + 16]
   1.297 +    lea        eax,  [eax + 32]
   1.298 +    pavgb      xmm0, xmm2            // average rows
   1.299 +    pavgb      xmm1, xmm3
   1.300 +
   1.301 +    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
   1.302 +    psrlw      xmm0, 8
   1.303 +    movdqa     xmm3, xmm1
   1.304 +    psrlw      xmm1, 8
   1.305 +    pand       xmm2, xmm5
   1.306 +    pand       xmm3, xmm5
   1.307 +    pavgw      xmm0, xmm2
   1.308 +    pavgw      xmm1, xmm3
   1.309 +    packuswb   xmm0, xmm1
   1.310 +
   1.311 +    sub        ecx, 16
   1.312 +    movdqu     [edx], xmm0
   1.313 +    lea        edx, [edx + 16]
   1.314 +    jg         wloop
   1.315 +
   1.316 +    pop        esi
   1.317 +    ret
   1.318 +  }
   1.319 +}
   1.320 +
   1.321 +// Point samples 32 pixels to 8 pixels.
   1.322 +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
   1.323 +__declspec(naked) __declspec(align(16))
   1.324 +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   1.325 +                        uint8* dst_ptr, int dst_width) {
   1.326 +  __asm {
   1.327 +    mov        eax, [esp + 4]        // src_ptr
   1.328 +                                     // src_stride ignored
   1.329 +    mov        edx, [esp + 12]       // dst_ptr
   1.330 +    mov        ecx, [esp + 16]       // dst_width
   1.331 +    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
   1.332 +    psrld      xmm5, 24
   1.333 +    pslld      xmm5, 16
   1.334 +
   1.335 +    align      4
   1.336 +  wloop:
   1.337 +    movdqa     xmm0, [eax]
   1.338 +    movdqa     xmm1, [eax + 16]
   1.339 +    lea        eax,  [eax + 32]
   1.340 +    pand       xmm0, xmm5
   1.341 +    pand       xmm1, xmm5
   1.342 +    packuswb   xmm0, xmm1
   1.343 +    psrlw      xmm0, 8
   1.344 +    packuswb   xmm0, xmm0
   1.345 +    sub        ecx, 8
   1.346 +    movq       qword ptr [edx], xmm0
   1.347 +    lea        edx, [edx + 8]
   1.348 +    jg         wloop
   1.349 +
   1.350 +    ret
   1.351 +  }
   1.352 +}
   1.353 +
   1.354 +// Blends 32x4 rectangle to 8x1.
   1.355 +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
   1.356 +__declspec(naked) __declspec(align(16))
   1.357 +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   1.358 +                           uint8* dst_ptr, int dst_width) {
   1.359 +  __asm {
   1.360 +    push       esi
   1.361 +    push       edi
   1.362 +    mov        eax, [esp + 8 + 4]    // src_ptr
   1.363 +    mov        esi, [esp + 8 + 8]    // src_stride
   1.364 +    mov        edx, [esp + 8 + 12]   // dst_ptr
   1.365 +    mov        ecx, [esp + 8 + 16]   // dst_width
   1.366 +    lea        edi, [esi + esi * 2]  // src_stride * 3
   1.367 +    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
   1.368 +    psrlw      xmm7, 8
   1.369 +
   1.370 +    align      4
   1.371 +  wloop:
   1.372 +    movdqa     xmm0, [eax]
   1.373 +    movdqa     xmm1, [eax + 16]
   1.374 +    movdqa     xmm2, [eax + esi]
   1.375 +    movdqa     xmm3, [eax + esi + 16]
   1.376 +    pavgb      xmm0, xmm2            // average rows
   1.377 +    pavgb      xmm1, xmm3
   1.378 +    movdqa     xmm2, [eax + esi * 2]
   1.379 +    movdqa     xmm3, [eax + esi * 2 + 16]
   1.380 +    movdqa     xmm4, [eax + edi]
   1.381 +    movdqa     xmm5, [eax + edi + 16]
   1.382 +    lea        eax, [eax + 32]
   1.383 +    pavgb      xmm2, xmm4
   1.384 +    pavgb      xmm3, xmm5
   1.385 +    pavgb      xmm0, xmm2
   1.386 +    pavgb      xmm1, xmm3
   1.387 +
   1.388 +    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
   1.389 +    psrlw      xmm0, 8
   1.390 +    movdqa     xmm3, xmm1
   1.391 +    psrlw      xmm1, 8
   1.392 +    pand       xmm2, xmm7
   1.393 +    pand       xmm3, xmm7
   1.394 +    pavgw      xmm0, xmm2
   1.395 +    pavgw      xmm1, xmm3
   1.396 +    packuswb   xmm0, xmm1
   1.397 +
   1.398 +    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
   1.399 +    psrlw      xmm0, 8
   1.400 +    pand       xmm2, xmm7
   1.401 +    pavgw      xmm0, xmm2
   1.402 +    packuswb   xmm0, xmm0
   1.403 +
   1.404 +    sub        ecx, 8
   1.405 +    movq       qword ptr [edx], xmm0
   1.406 +    lea        edx, [edx + 8]
   1.407 +    jg         wloop
   1.408 +
   1.409 +    pop        edi
   1.410 +    pop        esi
   1.411 +    ret
   1.412 +  }
   1.413 +}
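
The 4x4 box is likewise built from a tree of rounded averages: rows 0+1 and 2+3 are averaged with pavgb, those two results are averaged again, and the same pairwise reduction is then applied across columns. A scalar sketch of that cascade (hypothetical name; avg_round is the same rounding helper as in the previous sketch, repeated so the block stands alone):

#include <stddef.h>
#include <stdint.h>

static inline uint8_t avg_round(uint8_t a, uint8_t b) {  // pavg rounding
  return (uint8_t)((a + b + 1) >> 1);
}

// Scalar sketch of ScaleRowDown4Box_SSE2: pairwise rounded averages over a
// 4x4 block, mirroring the pavgb/pavgw reduction tree above.
static void ScaleRowDown4Box_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                      uint8_t* dst_ptr, int dst_width) {
  const uint8_t* r0 = src_ptr;
  const uint8_t* r1 = src_ptr + src_stride;
  const uint8_t* r2 = src_ptr + src_stride * 2;
  const uint8_t* r3 = src_ptr + src_stride * 3;
  for (int x = 0; x < dst_width; ++x) {
    uint8_t col[4];
    for (int i = 0; i < 4; ++i) {
      int c = 4 * x + i;
      col[i] = avg_round(avg_round(r0[c], r1[c]),   // rows 0+1
                         avg_round(r2[c], r3[c]));  // rows 2+3
    }
    dst_ptr[x] = avg_round(avg_round(col[0], col[1]),   // columns, first pass
                           avg_round(col[2], col[3]));  // columns, second pass
  }
}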
   1.414 +
   1.415 +// Point samples 32 pixels to 24 pixels.
   1.416 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
   1.417 +// Then shuffled to do the scaling.
   1.418 +
   1.419 +// Note that movdqa+palign may be better than movdqu.
   1.420 +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
   1.421 +__declspec(naked) __declspec(align(16))
   1.422 +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
   1.423 +                          uint8* dst_ptr, int dst_width) {
   1.424 +  __asm {
   1.425 +    mov        eax, [esp + 4]        // src_ptr
   1.426 +                                     // src_stride ignored
   1.427 +    mov        edx, [esp + 12]       // dst_ptr
   1.428 +    mov        ecx, [esp + 16]       // dst_width
   1.429 +    movdqa     xmm3, kShuf0
   1.430 +    movdqa     xmm4, kShuf1
   1.431 +    movdqa     xmm5, kShuf2
   1.432 +
   1.433 +    align      4
   1.434 +  wloop:
   1.435 +    movdqa     xmm0, [eax]
   1.436 +    movdqa     xmm1, [eax + 16]
   1.437 +    lea        eax,  [eax + 32]
   1.438 +    movdqa     xmm2, xmm1
   1.439 +    palignr    xmm1, xmm0, 8
   1.440 +    pshufb     xmm0, xmm3
   1.441 +    pshufb     xmm1, xmm4
   1.442 +    pshufb     xmm2, xmm5
   1.443 +    movq       qword ptr [edx], xmm0
   1.444 +    movq       qword ptr [edx + 8], xmm1
   1.445 +    movq       qword ptr [edx + 16], xmm2
   1.446 +    lea        edx, [edx + 24]
   1.447 +    sub        ecx, 24
   1.448 +    jg         wloop
   1.449 +
   1.450 +    ret
   1.451 +  }
   1.452 +}
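
The kShuf0/kShuf1/kShuf2 tables implement plain 3/4 point sampling: within every group of 4 source pixels, pixel 2 is dropped and pixels 0, 1 and 3 are kept. A scalar sketch (hypothetical name):

#include <stddef.h>
#include <stdint.h>

// Scalar sketch of ScaleRowDown34_SSSE3: 3/4 point sampling drops pixel 2 of
// every group of 4 source pixels.
static void ScaleRowDown34_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  (void)src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    const uint8_t* s = src_ptr + (x / 3) * 4;
    dst_ptr[x + 0] = s[0];
    dst_ptr[x + 1] = s[1];
    dst_ptr[x + 2] = s[3];
  }
}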
   1.453 +
   1.454 +// Blends 32x2 rectangle to 24x1
   1.455 +// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
   1.456 +// Then shuffled to do the scaling.
   1.457 +
   1.458 +// Register usage:
   1.459 +// xmm0 src_row 0
   1.460 +// xmm1 src_row 1
   1.461 +// xmm2 shuf 0
   1.462 +// xmm3 shuf 1
   1.463 +// xmm4 shuf 2
   1.464 +// xmm5 madd 0
   1.465 +// xmm6 madd 1
   1.466 +// xmm7 kRound34
   1.467 +
   1.468 +// Note that movdqa+palign may be better than movdqu.
   1.469 +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
   1.470 +__declspec(naked) __declspec(align(16))
   1.471 +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
   1.472 +                                ptrdiff_t src_stride,
   1.473 +                                uint8* dst_ptr, int dst_width) {
   1.474 +  __asm {
   1.475 +    push       esi
   1.476 +    mov        eax, [esp + 4 + 4]    // src_ptr
   1.477 +    mov        esi, [esp + 4 + 8]    // src_stride
   1.478 +    mov        edx, [esp + 4 + 12]   // dst_ptr
   1.479 +    mov        ecx, [esp + 4 + 16]   // dst_width
   1.480 +    movdqa     xmm2, kShuf01
   1.481 +    movdqa     xmm3, kShuf11
   1.482 +    movdqa     xmm4, kShuf21
   1.483 +    movdqa     xmm5, kMadd01
   1.484 +    movdqa     xmm6, kMadd11
   1.485 +    movdqa     xmm7, kRound34
   1.486 +
   1.487 +    align      4
   1.488 +  wloop:
   1.489 +    movdqa     xmm0, [eax]           // pixels 0..7
   1.490 +    movdqa     xmm1, [eax + esi]
   1.491 +    pavgb      xmm0, xmm1
   1.492 +    pshufb     xmm0, xmm2
   1.493 +    pmaddubsw  xmm0, xmm5
   1.494 +    paddsw     xmm0, xmm7
   1.495 +    psrlw      xmm0, 2
   1.496 +    packuswb   xmm0, xmm0
   1.497 +    movq       qword ptr [edx], xmm0
   1.498 +    movdqu     xmm0, [eax + 8]       // pixels 8..15
   1.499 +    movdqu     xmm1, [eax + esi + 8]
   1.500 +    pavgb      xmm0, xmm1
   1.501 +    pshufb     xmm0, xmm3
   1.502 +    pmaddubsw  xmm0, xmm6
   1.503 +    paddsw     xmm0, xmm7
   1.504 +    psrlw      xmm0, 2
   1.505 +    packuswb   xmm0, xmm0
   1.506 +    movq       qword ptr [edx + 8], xmm0
   1.507 +    movdqa     xmm0, [eax + 16]      // pixels 16..23
   1.508 +    movdqa     xmm1, [eax + esi + 16]
   1.509 +    lea        eax, [eax + 32]
   1.510 +    pavgb      xmm0, xmm1
   1.511 +    pshufb     xmm0, xmm4
   1.512 +    movdqa     xmm1, kMadd21
   1.513 +    pmaddubsw  xmm0, xmm1
   1.514 +    paddsw     xmm0, xmm7
   1.515 +    psrlw      xmm0, 2
   1.516 +    packuswb   xmm0, xmm0
   1.517 +    sub        ecx, 24
   1.518 +    movq       qword ptr [edx + 16], xmm0
   1.519 +    lea        edx, [edx + 24]
   1.520 +    jg         wloop
   1.521 +
   1.522 +    pop        esi
   1.523 +    ret
   1.524 +  }
   1.525 +}
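
After the two rows are blended with pavgb, the pshufb/pmaddubsw stages apply the horizontal 3/4 filter: each group of 4 blended source pixels yields 3 outputs with weights 3:1, 2:2 and 1:3, and kRound34 adds 2 before the >> 2 so the division rounds to nearest. A scalar sketch of just that column weighting (hypothetical name; it assumes the row blend has already been applied):

#include <stdint.h>

// Scalar sketch of the horizontal 3/4 weighting: weights come from
// kMadd01/kMadd11/kMadd21 and kRound34 supplies the +2 before the >> 2.
static void ScaleRowDown34_cols_C_sketch(const uint8_t* blended_row,
                                         uint8_t* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    const uint8_t* s = blended_row + (x / 3) * 4;
    dst_ptr[x + 0] = (uint8_t)((3 * s[0] + 1 * s[1] + 2) >> 2);
    dst_ptr[x + 1] = (uint8_t)((2 * s[1] + 2 * s[2] + 2) >> 2);
    dst_ptr[x + 2] = (uint8_t)((1 * s[2] + 3 * s[3] + 2) >> 2);
  }
}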
   1.526 +
   1.527 +// Note that movdqa+palign may be better than movdqu.
   1.528 +// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
   1.529 +__declspec(naked) __declspec(align(16))
   1.530 +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
   1.531 +                                ptrdiff_t src_stride,
   1.532 +                                uint8* dst_ptr, int dst_width) {
   1.533 +  __asm {
   1.534 +    push       esi
   1.535 +    mov        eax, [esp + 4 + 4]    // src_ptr
   1.536 +    mov        esi, [esp + 4 + 8]    // src_stride
   1.537 +    mov        edx, [esp + 4 + 12]   // dst_ptr
   1.538 +    mov        ecx, [esp + 4 + 16]   // dst_width
   1.539 +    movdqa     xmm2, kShuf01
   1.540 +    movdqa     xmm3, kShuf11
   1.541 +    movdqa     xmm4, kShuf21
   1.542 +    movdqa     xmm5, kMadd01
   1.543 +    movdqa     xmm6, kMadd11
   1.544 +    movdqa     xmm7, kRound34
   1.545 +
   1.546 +    align      4
   1.547 +  wloop:
   1.548 +    movdqa     xmm0, [eax]           // pixels 0..7
   1.549 +    movdqa     xmm1, [eax + esi]
   1.550 +    pavgb      xmm1, xmm0
   1.551 +    pavgb      xmm0, xmm1
   1.552 +    pshufb     xmm0, xmm2
   1.553 +    pmaddubsw  xmm0, xmm5
   1.554 +    paddsw     xmm0, xmm7
   1.555 +    psrlw      xmm0, 2
   1.556 +    packuswb   xmm0, xmm0
   1.557 +    movq       qword ptr [edx], xmm0
   1.558 +    movdqu     xmm0, [eax + 8]       // pixels 8..15
   1.559 +    movdqu     xmm1, [eax + esi + 8]
   1.560 +    pavgb      xmm1, xmm0
   1.561 +    pavgb      xmm0, xmm1
   1.562 +    pshufb     xmm0, xmm3
   1.563 +    pmaddubsw  xmm0, xmm6
   1.564 +    paddsw     xmm0, xmm7
   1.565 +    psrlw      xmm0, 2
   1.566 +    packuswb   xmm0, xmm0
   1.567 +    movq       qword ptr [edx + 8], xmm0
   1.568 +    movdqa     xmm0, [eax + 16]      // pixels 16..23
   1.569 +    movdqa     xmm1, [eax + esi + 16]
   1.570 +    lea        eax, [eax + 32]
   1.571 +    pavgb      xmm1, xmm0
   1.572 +    pavgb      xmm0, xmm1
   1.573 +    pshufb     xmm0, xmm4
   1.574 +    movdqa     xmm1, kMadd21
   1.575 +    pmaddubsw  xmm0, xmm1
   1.576 +    paddsw     xmm0, xmm7
   1.577 +    psrlw      xmm0, 2
   1.578 +    packuswb   xmm0, xmm0
   1.579 +    sub        ecx, 24
   1.580 +    movq       qword ptr [edx + 16], xmm0
   1.581 +    lea        edx, [edx+24]
   1.582 +    jg         wloop
   1.583 +
   1.584 +    pop        esi
   1.585 +    ret
   1.586 +  }
   1.587 +}
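
The only difference from ScaleRowDown34_1_Box_SSSE3 is the row blend: the _1_ variant weights the two rows equally with a single pavgb, while the _0_ variant above applies pavgb twice, weighting the rows roughly 3:1 in favor of row 0 (with pavg rounding at each step). A one-byte sketch of that double average (hypothetical name):

#include <stdint.h>

// pavgb applied twice, as in the loop above: roughly (3 * r0 + r1) / 4 with
// pavg rounding at each step.
static inline uint8_t Blend34_0_sketch(uint8_t r0, uint8_t r1) {
  uint8_t t = (uint8_t)((r1 + r0 + 1) >> 1);  // pavgb xmm1, xmm0
  return (uint8_t)((r0 + t + 1) >> 1);        // pavgb xmm0, xmm1
}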
   1.588 +
   1.589 +// 3/8 point sampler
   1.590 +
   1.591 +// Scale 32 pixels to 12
   1.592 +__declspec(naked) __declspec(align(16))
   1.593 +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
   1.594 +                          uint8* dst_ptr, int dst_width) {
   1.595 +  __asm {
   1.596 +    mov        eax, [esp + 4]        // src_ptr
   1.597 +                                     // src_stride ignored
   1.598 +    mov        edx, [esp + 12]       // dst_ptr
   1.599 +    mov        ecx, [esp + 16]       // dst_width
   1.600 +    movdqa     xmm4, kShuf38a
   1.601 +    movdqa     xmm5, kShuf38b
   1.602 +
   1.603 +    align      4
   1.604 +  xloop:
   1.605 +    movdqa     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
   1.606 +    movdqa     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
   1.607 +    lea        eax, [eax + 32]
   1.608 +    pshufb     xmm0, xmm4
   1.609 +    pshufb     xmm1, xmm5
   1.610 +    paddusb    xmm0, xmm1
   1.611 +
   1.612 +    sub        ecx, 12
   1.613 +    movq       qword ptr [edx], xmm0  // write 12 pixels
   1.614 +    movhlps    xmm1, xmm0
   1.615 +    movd       [edx + 8], xmm1
   1.616 +    lea        edx, [edx + 12]
   1.617 +    jg         xloop
   1.618 +
   1.619 +    ret
   1.620 +  }
   1.621 +}
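
kShuf38a/kShuf38b implement 3/8 point sampling: from every 8 source pixels, offsets 0, 3 and 6 are kept. A scalar sketch (hypothetical name):

#include <stddef.h>
#include <stdint.h>

// Scalar sketch of ScaleRowDown38_SSSE3: keep source offsets 0, 3 and 6 of
// every 8 source pixels, matching kShuf38a/kShuf38b.
static void ScaleRowDown38_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                    uint8_t* dst_ptr, int dst_width) {
  (void)src_stride;
  for (int x = 0; x < dst_width; x += 3) {
    const uint8_t* s = src_ptr + (x / 3) * 8;
    dst_ptr[x + 0] = s[0];
    dst_ptr[x + 1] = s[3];
    dst_ptr[x + 2] = s[6];
  }
}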
   1.622 +
   1.623 +// Scale 16x3 pixels to 6x1 with interpolation
   1.624 +__declspec(naked) __declspec(align(16))
   1.625 +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
   1.626 +                                ptrdiff_t src_stride,
   1.627 +                                uint8* dst_ptr, int dst_width) {
   1.628 +  __asm {
   1.629 +    push       esi
   1.630 +    mov        eax, [esp + 4 + 4]    // src_ptr
   1.631 +    mov        esi, [esp + 4 + 8]    // src_stride
   1.632 +    mov        edx, [esp + 4 + 12]   // dst_ptr
   1.633 +    mov        ecx, [esp + 4 + 16]   // dst_width
   1.634 +    movdqa     xmm2, kShufAc
   1.635 +    movdqa     xmm3, kShufAc3
   1.636 +    movdqa     xmm4, kScaleAc33
   1.637 +    pxor       xmm5, xmm5
   1.638 +
   1.639 +    align      4
   1.640 +  xloop:
   1.641 +    movdqa     xmm0, [eax]           // sum up 3 rows into xmm0/1
   1.642 +    movdqa     xmm6, [eax + esi]
   1.643 +    movhlps    xmm1, xmm0
   1.644 +    movhlps    xmm7, xmm6
   1.645 +    punpcklbw  xmm0, xmm5
   1.646 +    punpcklbw  xmm1, xmm5
   1.647 +    punpcklbw  xmm6, xmm5
   1.648 +    punpcklbw  xmm7, xmm5
   1.649 +    paddusw    xmm0, xmm6
   1.650 +    paddusw    xmm1, xmm7
   1.651 +    movdqa     xmm6, [eax + esi * 2]
   1.652 +    lea        eax, [eax + 16]
   1.653 +    movhlps    xmm7, xmm6
   1.654 +    punpcklbw  xmm6, xmm5
   1.655 +    punpcklbw  xmm7, xmm5
   1.656 +    paddusw    xmm0, xmm6
   1.657 +    paddusw    xmm1, xmm7
   1.658 +
   1.659 +    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
   1.660 +    psrldq     xmm0, 2
   1.661 +    paddusw    xmm6, xmm0
   1.662 +    psrldq     xmm0, 2
   1.663 +    paddusw    xmm6, xmm0
   1.664 +    pshufb     xmm6, xmm2
   1.665 +
   1.666 +    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
   1.667 +    psrldq     xmm1, 2
   1.668 +    paddusw    xmm7, xmm1
   1.669 +    psrldq     xmm1, 2
   1.670 +    paddusw    xmm7, xmm1
   1.671 +    pshufb     xmm7, xmm3
   1.672 +    paddusw    xmm6, xmm7
   1.673 +
   1.674 +    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
   1.675 +    packuswb   xmm6, xmm6
   1.676 +
   1.677 +    sub        ecx, 6
   1.678 +    movd       [edx], xmm6           // write 6 pixels
   1.679 +    psrlq      xmm6, 16
   1.680 +    movd       [edx + 2], xmm6
   1.681 +    lea        edx, [edx + 6]
   1.682 +    jg         xloop
   1.683 +
   1.684 +    pop        esi
   1.685 +    ret
   1.686 +  }
   1.687 +}
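
The 3x3 box works on 16-bit column sums: three rows are summed per column, groups of three (and a final group of two) columns are folded together with psrldq/paddusw, and pmulhuw with kScaleAc33 performs the division as a fixed-point reciprocal multiply, (sum * (65536 / N)) >> 16. A scalar sketch of one 8-column half of the loop (hypothetical name; the real routine emits 6 pixels per iteration):

#include <stdint.h>

// Scalar sketch of one 8-column half of the loop above: sum 3 rows per
// column, fold columns into 3+3+2 groups, then divide with the same
// fixed-point reciprocals as kScaleAc33 / pmulhuw.
static void ScaleRowDown38_3_cols_C_sketch(const uint8_t* r0, const uint8_t* r1,
                                           const uint8_t* r2, uint8_t* dst) {
  uint32_t sum[8];
  for (int i = 0; i < 8; ++i) {
    sum[i] = (uint32_t)r0[i] + r1[i] + r2[i];      // vertical sums (punpcklbw/paddusw)
  }
  uint32_t s0 = sum[0] + sum[1] + sum[2];          // 3x3 box
  uint32_t s1 = sum[3] + sum[4] + sum[5];          // 3x3 box
  uint32_t s2 = sum[6] + sum[7];                   // 2x3 box
  dst[0] = (uint8_t)((s0 * (65536 / 9)) >> 16);    // (sum * 65536/9) >> 16 ~= sum / 9
  dst[1] = (uint8_t)((s1 * (65536 / 9)) >> 16);
  dst[2] = (uint8_t)((s2 * (65536 / 6)) >> 16);    // ~= sum / 6
}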
   1.688 +
   1.689 +// Scale 16x2 pixels to 6x1 with interpolation
   1.690 +__declspec(naked) __declspec(align(16))
   1.691 +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
   1.692 +                                ptrdiff_t src_stride,
   1.693 +                                uint8* dst_ptr, int dst_width) {
   1.694 +  __asm {
   1.695 +    push       esi
   1.696 +    mov        eax, [esp + 4 + 4]    // src_ptr
   1.697 +    mov        esi, [esp + 4 + 8]    // src_stride
   1.698 +    mov        edx, [esp + 4 + 12]   // dst_ptr
   1.699 +    mov        ecx, [esp + 4 + 16]   // dst_width
   1.700 +    movdqa     xmm2, kShufAb0
   1.701 +    movdqa     xmm3, kShufAb1
   1.702 +    movdqa     xmm4, kShufAb2
   1.703 +    movdqa     xmm5, kScaleAb2
   1.704 +
   1.705 +    align      4
   1.706 +  xloop:
   1.707 +    movdqa     xmm0, [eax]           // average 2 rows into xmm0
   1.708 +    pavgb      xmm0, [eax + esi]
   1.709 +    lea        eax, [eax + 16]
   1.710 +
   1.711 +    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
   1.712 +    pshufb     xmm1, xmm2
   1.713 +    movdqa     xmm6, xmm0
   1.714 +    pshufb     xmm6, xmm3
   1.715 +    paddusw    xmm1, xmm6
   1.716 +    pshufb     xmm0, xmm4
   1.717 +    paddusw    xmm1, xmm0
   1.718 +
   1.719 +    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
   1.720 +    packuswb   xmm1, xmm1
   1.721 +
   1.722 +    sub        ecx, 6
   1.723 +    movd       [edx], xmm1           // write 6 pixels
   1.724 +    psrlq      xmm1, 16
   1.725 +    movd       [edx + 2], xmm1
   1.726 +    lea        edx, [edx + 6]
   1.727 +    jg         xloop
   1.728 +
   1.729 +    pop        esi
   1.730 +    ret
   1.731 +  }
   1.732 +}
   1.733 +
   1.734 +// Reads 16xN bytes and produces 16 shorts at a time.
   1.735 +// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
   1.736 +__declspec(naked) __declspec(align(16))
   1.737 +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   1.738 +                       uint16* dst_ptr, int src_width,
   1.739 +                       int src_height) {
   1.740 +  __asm {
   1.741 +    push       esi
   1.742 +    push       edi
   1.743 +    push       ebx
   1.744 +    push       ebp
   1.745 +    mov        esi, [esp + 16 + 4]   // src_ptr
   1.746 +    mov        edx, [esp + 16 + 8]   // src_stride
   1.747 +    mov        edi, [esp + 16 + 12]  // dst_ptr
    1.748 +    mov        ecx, [esp + 16 + 16]  // src_width
    1.749 +    mov        ebx, [esp + 16 + 20]  // src_height
   1.750 +    pxor       xmm4, xmm4
   1.751 +    dec        ebx
   1.752 +
   1.753 +    align      4
   1.754 +  xloop:
   1.755 +    // first row
   1.756 +    movdqa     xmm0, [esi]
   1.757 +    lea        eax, [esi + edx]
   1.758 +    movdqa     xmm1, xmm0
   1.759 +    punpcklbw  xmm0, xmm4
   1.760 +    punpckhbw  xmm1, xmm4
   1.761 +    lea        esi, [esi + 16]
   1.762 +    mov        ebp, ebx
   1.763 +    test       ebp, ebp
   1.764 +    je         ydone
   1.765 +
   1.766 +    // sum remaining rows
   1.767 +    align      4
   1.768 +  yloop:
   1.769 +    movdqa     xmm2, [eax]       // read 16 pixels
   1.770 +    lea        eax, [eax + edx]  // advance to next row
   1.771 +    movdqa     xmm3, xmm2
   1.772 +    punpcklbw  xmm2, xmm4
   1.773 +    punpckhbw  xmm3, xmm4
   1.774 +    paddusw    xmm0, xmm2        // sum 16 words
   1.775 +    paddusw    xmm1, xmm3
   1.776 +    sub        ebp, 1
   1.777 +    jg         yloop
   1.778 +
   1.779 +    align      4
   1.780 +  ydone:
   1.781 +    movdqa     [edi], xmm0
   1.782 +    movdqa     [edi + 16], xmm1
   1.783 +    lea        edi, [edi + 32]
   1.784 +
   1.785 +    sub        ecx, 16
   1.786 +    jg         xloop
   1.787 +
   1.788 +    pop        ebp
   1.789 +    pop        ebx
   1.790 +    pop        edi
   1.791 +    pop        esi
   1.792 +    ret
   1.793 +  }
   1.794 +}
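
This is the vertical accumulation step for box scaling: each of the 16 columns in a block gets the sum of src_height source rows, widened to 16 bits; because the code uses paddusw, the sums saturate at 65535. A scalar sketch (hypothetical name):

#include <stddef.h>
#include <stdint.h>

// Scalar sketch of ScaleAddRows_SSE2: per-column sum of src_height rows into
// 16-bit words; paddusw means the accumulation saturates at 65535.
static void ScaleAddRows_C_sketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                  uint16_t* dst_ptr, int src_width, int src_height) {
  for (int x = 0; x < src_width; ++x) {
    uint32_t sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += src_ptr[(ptrdiff_t)y * src_stride + x];
    }
    dst_ptr[x] = (uint16_t)(sum > 65535 ? 65535 : sum);  // saturate like paddusw
  }
}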
   1.795 +
   1.796 +// Bilinear column filtering. SSSE3 version.
   1.797 +// TODO(fbarchard): Port to Neon
   1.798 +// TODO(fbarchard): Switch the following:
   1.799 +//    xor        ebx, ebx
   1.800 +//    mov        bx, word ptr [esi + eax]  // 2 source x0 pixels
   1.801 +// To
   1.802 +//    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
   1.803 +// when drmemory bug fixed.
   1.804 +// https://code.google.com/p/drmemory/issues/detail?id=1396
   1.805 +
   1.806 +__declspec(naked) __declspec(align(16))
   1.807 +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
   1.808 +                           int dst_width, int x, int dx) {
   1.809 +  __asm {
   1.810 +    push       ebx
   1.811 +    push       esi
   1.812 +    push       edi
   1.813 +    mov        edi, [esp + 12 + 4]    // dst_ptr
   1.814 +    mov        esi, [esp + 12 + 8]    // src_ptr
   1.815 +    mov        ecx, [esp + 12 + 12]   // dst_width
   1.816 +    movd       xmm2, [esp + 12 + 16]  // x
   1.817 +    movd       xmm3, [esp + 12 + 20]  // dx
   1.818 +    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
   1.819 +    movd       xmm5, eax
   1.820 +    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
   1.821 +    psrlw      xmm6, 9
   1.822 +    pextrw     eax, xmm2, 1         // get x0 integer. preroll
   1.823 +    sub        ecx, 2
   1.824 +    jl         xloop29
   1.825 +
   1.826 +    movdqa     xmm0, xmm2           // x1 = x0 + dx
   1.827 +    paddd      xmm0, xmm3
   1.828 +    punpckldq  xmm2, xmm0           // x0 x1
   1.829 +    punpckldq  xmm3, xmm3           // dx dx
   1.830 +    paddd      xmm3, xmm3           // dx * 2, dx * 2
   1.831 +    pextrw     edx, xmm2, 3         // get x1 integer. preroll
   1.832 +
   1.833 +    // 2 Pixel loop.
   1.834 +    align      4
   1.835 +  xloop2:
   1.836 +    movdqa     xmm1, xmm2           // x0, x1 fractions.
   1.837 +    paddd      xmm2, xmm3           // x += dx
   1.838 +    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
   1.839 +    movd       xmm0, ebx
   1.840 +    psrlw      xmm1, 9              // 7 bit fractions.
   1.841 +    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
   1.842 +    movd       xmm4, ebx
   1.843 +    pshufb     xmm1, xmm5           // 0011
   1.844 +    punpcklwd  xmm0, xmm4
   1.845 +    pxor       xmm1, xmm6           // 0..7f and 7f..0
   1.846 +    pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels.
   1.847 +    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
   1.848 +    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
   1.849 +    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
   1.850 +    packuswb   xmm0, xmm0           // 8 bits, 2 pixels.
   1.851 +    movd       ebx, xmm0
   1.852 +    mov        [edi], bx
   1.853 +    lea        edi, [edi + 2]
   1.854 +    sub        ecx, 2               // 2 pixels
   1.855 +    jge        xloop2
   1.856 +
   1.857 +    align      4
   1.858 + xloop29:
   1.859 +
   1.860 +    add        ecx, 2 - 1
   1.861 +    jl         xloop99
   1.862 +
   1.863 +    // 1 pixel remainder
   1.864 +    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
   1.865 +    movd       xmm0, ebx
   1.866 +    psrlw      xmm2, 9              // 7 bit fractions.
   1.867 +    pshufb     xmm2, xmm5           // 0011
   1.868 +    pxor       xmm2, xmm6           // 0..7f and 7f..0
   1.869 +    pmaddubsw  xmm0, xmm2           // 16 bit
   1.870 +    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
   1.871 +    packuswb   xmm0, xmm0           // 8 bits
   1.872 +    movd       ebx, xmm0
   1.873 +    mov        [edi], bl
   1.874 +
   1.875 +    align      4
   1.876 + xloop99:
   1.877 +
   1.878 +    pop        edi
   1.879 +    pop        esi
   1.880 +    pop        ebx
   1.881 +    ret
   1.882 +  }
   1.883 +}
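
Here x and dx are 16.16 fixed-point source positions. Per output pixel the integer part selects two neighboring source bytes, and the top 7 bits of the fraction drive the blend: pmaddubsw computes a * (127 - f) + b * f and psrlw 7 scales it back to 8 bits. A scalar sketch, one pixel per iteration (hypothetical name; like the assembly, it reads one byte past the last sampled position):

#include <stdint.h>

// Scalar sketch of ScaleFilterCols_SSSE3: x and dx are 16.16 fixed point; the
// top 7 bits of the fraction weight the two neighboring source pixels.
static void ScaleFilterCols_C_sketch(uint8_t* dst_ptr, const uint8_t* src_ptr,
                                     int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;              // integer source position
    int f = (x & 0xffff) >> 9;     // 7-bit fraction, 0..127
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8_t)((a * (127 - f) + b * f) >> 7);  // pmaddubsw + psrlw 7
    x += dx;
  }
}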
   1.884 +
   1.885 +// Reads 16 pixels, duplicates them and writes 32 pixels.
   1.886 +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
   1.887 +__declspec(naked) __declspec(align(16))
   1.888 +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
   1.889 +                       int dst_width, int x, int dx) {
   1.890 +  __asm {
   1.891 +    mov        edx, [esp + 4]    // dst_ptr
   1.892 +    mov        eax, [esp + 8]    // src_ptr
   1.893 +    mov        ecx, [esp + 12]   // dst_width
   1.894 +
   1.895 +    align      4
   1.896 +  wloop:
   1.897 +    movdqa     xmm0, [eax]
   1.898 +    lea        eax,  [eax + 16]
   1.899 +    movdqa     xmm1, xmm0
   1.900 +    punpcklbw  xmm0, xmm0
   1.901 +    punpckhbw  xmm1, xmm1
   1.902 +    sub        ecx, 32
   1.903 +    movdqa     [edx], xmm0
   1.904 +    movdqa     [edx + 16], xmm1
   1.905 +    lea        edx, [edx + 32]
   1.906 +    jg         wloop
   1.907 +
   1.908 +    ret
   1.909 +  }
   1.910 +}
   1.911 +
    1.912 +// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7)
   1.913 +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
   1.914 +__declspec(naked) __declspec(align(16))
   1.915 +void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
   1.916 +                            ptrdiff_t src_stride,
   1.917 +                            uint8* dst_argb, int dst_width) {
   1.918 +  __asm {
   1.919 +    mov        eax, [esp + 4]        // src_argb
   1.920 +                                     // src_stride ignored
   1.921 +    mov        edx, [esp + 12]       // dst_argb
   1.922 +    mov        ecx, [esp + 16]       // dst_width
   1.923 +
   1.924 +    align      4
   1.925 +  wloop:
   1.926 +    movdqa     xmm0, [eax]
   1.927 +    movdqa     xmm1, [eax + 16]
   1.928 +    lea        eax,  [eax + 32]
   1.929 +    shufps     xmm0, xmm1, 0xdd
   1.930 +    sub        ecx, 4
   1.931 +    movdqa     [edx], xmm0
   1.932 +    lea        edx, [edx + 16]
   1.933 +    jg         wloop
   1.934 +
   1.935 +    ret
   1.936 +  }
   1.937 +}
   1.938 +
   1.939 +// Blends 8x1 rectangle to 4x1.
   1.940 +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
   1.941 +__declspec(naked) __declspec(align(16))
   1.942 +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
   1.943 +                                  ptrdiff_t src_stride,
   1.944 +                                  uint8* dst_argb, int dst_width) {
   1.945 +  __asm {
   1.946 +    mov        eax, [esp + 4]        // src_argb
   1.947 +                                     // src_stride ignored
   1.948 +    mov        edx, [esp + 12]       // dst_argb
   1.949 +    mov        ecx, [esp + 16]       // dst_width
   1.950 +
   1.951 +    align      4
   1.952 +  wloop:
   1.953 +    movdqa     xmm0, [eax]
   1.954 +    movdqa     xmm1, [eax + 16]
   1.955 +    lea        eax,  [eax + 32]
   1.956 +    movdqa     xmm2, xmm0
   1.957 +    shufps     xmm0, xmm1, 0x88      // even pixels
   1.958 +    shufps     xmm2, xmm1, 0xdd      // odd pixels
   1.959 +    pavgb      xmm0, xmm2
   1.960 +    sub        ecx, 4
   1.961 +    movdqa     [edx], xmm0
   1.962 +    lea        edx, [edx + 16]
   1.963 +    jg         wloop
   1.964 +
   1.965 +    ret
   1.966 +  }
   1.967 +}
   1.968 +
   1.969 +// Blends 8x2 rectangle to 4x1.
   1.970 +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
   1.971 +__declspec(naked) __declspec(align(16))
   1.972 +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
   1.973 +                               ptrdiff_t src_stride,
   1.974 +                               uint8* dst_argb, int dst_width) {
   1.975 +  __asm {
   1.976 +    push       esi
   1.977 +    mov        eax, [esp + 4 + 4]    // src_argb
   1.978 +    mov        esi, [esp + 4 + 8]    // src_stride
   1.979 +    mov        edx, [esp + 4 + 12]   // dst_argb
   1.980 +    mov        ecx, [esp + 4 + 16]   // dst_width
   1.981 +
   1.982 +    align      4
   1.983 +  wloop:
   1.984 +    movdqa     xmm0, [eax]
   1.985 +    movdqa     xmm1, [eax + 16]
   1.986 +    movdqa     xmm2, [eax + esi]
   1.987 +    movdqa     xmm3, [eax + esi + 16]
   1.988 +    lea        eax,  [eax + 32]
   1.989 +    pavgb      xmm0, xmm2            // average rows
   1.990 +    pavgb      xmm1, xmm3
   1.991 +    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
   1.992 +    shufps     xmm0, xmm1, 0x88      // even pixels
   1.993 +    shufps     xmm2, xmm1, 0xdd      // odd pixels
   1.994 +    pavgb      xmm0, xmm2
   1.995 +    sub        ecx, 4
   1.996 +    movdqa     [edx], xmm0
   1.997 +    lea        edx, [edx + 16]
   1.998 +    jg         wloop
   1.999 +
  1.1000 +    pop        esi
  1.1001 +    ret
  1.1002 +  }
  1.1003 +}
  1.1004 +
  1.1005 +// Reads 4 pixels at a time.
  1.1006 +// Alignment requirement: dst_argb 16 byte aligned.
  1.1007 +__declspec(naked) __declspec(align(16))
  1.1008 +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
  1.1009 +                               int src_stepx,
  1.1010 +                               uint8* dst_argb, int dst_width) {
  1.1011 +  __asm {
  1.1012 +    push       ebx
  1.1013 +    push       edi
  1.1014 +    mov        eax, [esp + 8 + 4]    // src_argb
  1.1015 +                                     // src_stride ignored
  1.1016 +    mov        ebx, [esp + 8 + 12]   // src_stepx
  1.1017 +    mov        edx, [esp + 8 + 16]   // dst_argb
  1.1018 +    mov        ecx, [esp + 8 + 20]   // dst_width
  1.1019 +    lea        ebx, [ebx * 4]
  1.1020 +    lea        edi, [ebx + ebx * 2]
  1.1021 +
  1.1022 +    align      4
  1.1023 +  wloop:
  1.1024 +    movd       xmm0, [eax]
  1.1025 +    movd       xmm1, [eax + ebx]
  1.1026 +    punpckldq  xmm0, xmm1
  1.1027 +    movd       xmm2, [eax + ebx * 2]
  1.1028 +    movd       xmm3, [eax + edi]
  1.1029 +    lea        eax,  [eax + ebx * 4]
  1.1030 +    punpckldq  xmm2, xmm3
  1.1031 +    punpcklqdq xmm0, xmm2
  1.1032 +    sub        ecx, 4
  1.1033 +    movdqa     [edx], xmm0
  1.1034 +    lea        edx, [edx + 16]
  1.1035 +    jg         wloop
  1.1036 +
  1.1037 +    pop        edi
  1.1038 +    pop        ebx
  1.1039 +    ret
  1.1040 +  }
  1.1041 +}
  1.1042 +
  1.1043 +// Blends four 2x2 to 4x1.
  1.1044 +// Alignment requirement: dst_argb 16 byte aligned.
  1.1045 +__declspec(naked) __declspec(align(16))
  1.1046 +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
  1.1047 +                                  ptrdiff_t src_stride,
  1.1048 +                                  int src_stepx,
  1.1049 +                                  uint8* dst_argb, int dst_width) {
  1.1050 +  __asm {
  1.1051 +    push       ebx
  1.1052 +    push       esi
  1.1053 +    push       edi
  1.1054 +    mov        eax, [esp + 12 + 4]    // src_argb
  1.1055 +    mov        esi, [esp + 12 + 8]    // src_stride
  1.1056 +    mov        ebx, [esp + 12 + 12]   // src_stepx
  1.1057 +    mov        edx, [esp + 12 + 16]   // dst_argb
  1.1058 +    mov        ecx, [esp + 12 + 20]   // dst_width
  1.1059 +    lea        esi, [eax + esi]       // row1 pointer
  1.1060 +    lea        ebx, [ebx * 4]
  1.1061 +    lea        edi, [ebx + ebx * 2]
  1.1062 +
  1.1063 +    align      4
  1.1064 +  wloop:
  1.1065 +    movq       xmm0, qword ptr [eax]  // row0 4 pairs
  1.1066 +    movhps     xmm0, qword ptr [eax + ebx]
  1.1067 +    movq       xmm1, qword ptr [eax + ebx * 2]
  1.1068 +    movhps     xmm1, qword ptr [eax + edi]
  1.1069 +    lea        eax,  [eax + ebx * 4]
  1.1070 +    movq       xmm2, qword ptr [esi]  // row1 4 pairs
  1.1071 +    movhps     xmm2, qword ptr [esi + ebx]
  1.1072 +    movq       xmm3, qword ptr [esi + ebx * 2]
  1.1073 +    movhps     xmm3, qword ptr [esi + edi]
  1.1074 +    lea        esi,  [esi + ebx * 4]
  1.1075 +    pavgb      xmm0, xmm2            // average rows
  1.1076 +    pavgb      xmm1, xmm3
  1.1077 +    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
  1.1078 +    shufps     xmm0, xmm1, 0x88      // even pixels
  1.1079 +    shufps     xmm2, xmm1, 0xdd      // odd pixels
  1.1080 +    pavgb      xmm0, xmm2
  1.1081 +    sub        ecx, 4
  1.1082 +    movdqa     [edx], xmm0
  1.1083 +    lea        edx, [edx + 16]
  1.1084 +    jg         wloop
  1.1085 +
  1.1086 +    pop        edi
  1.1087 +    pop        esi
  1.1088 +    pop        ebx
  1.1089 +    ret
  1.1090 +  }
  1.1091 +}
  1.1092 +
  1.1093 +// Column scaling unfiltered. SSE2 version.
  1.1094 +__declspec(naked) __declspec(align(16))
  1.1095 +void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
  1.1096 +                        int dst_width, int x, int dx) {
  1.1097 +  __asm {
  1.1098 +    push       edi
  1.1099 +    push       esi
  1.1100 +    mov        edi, [esp + 8 + 4]    // dst_argb
  1.1101 +    mov        esi, [esp + 8 + 8]    // src_argb
  1.1102 +    mov        ecx, [esp + 8 + 12]   // dst_width
  1.1103 +    movd       xmm2, [esp + 8 + 16]  // x
  1.1104 +    movd       xmm3, [esp + 8 + 20]  // dx
  1.1105 +
  1.1106 +    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
  1.1107 +    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
  1.1108 +    paddd      xmm2, xmm0
  1.1109 +    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
  1.1110 +    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
  1.1111 +    paddd      xmm2, xmm0            // x3 x2 x1 x0
  1.1112 +    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
  1.1113 +    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4
  1.1114 +
  1.1115 +    pextrw     eax, xmm2, 1          // get x0 integer.
  1.1116 +    pextrw     edx, xmm2, 3          // get x1 integer.
  1.1117 +
  1.1118 +    cmp        ecx, 0
  1.1119 +    jle        xloop99
  1.1120 +    sub        ecx, 4
  1.1121 +    jl         xloop49
  1.1122 +
  1.1123 +    // 4 Pixel loop.
  1.1124 +    align      4
  1.1125 + xloop4:
  1.1126 +    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
  1.1127 +    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
  1.1128 +    pextrw     eax, xmm2, 5           // get x2 integer.
  1.1129 +    pextrw     edx, xmm2, 7           // get x3 integer.
  1.1130 +    paddd      xmm2, xmm3             // x += dx
  1.1131 +    punpckldq  xmm0, xmm1             // x0 x1
  1.1132 +
  1.1133 +    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
  1.1134 +    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
  1.1135 +    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
  1.1136 +    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
  1.1137 +    punpckldq  xmm1, xmm4             // x2 x3
  1.1138 +    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
  1.1139 +    sub        ecx, 4                 // 4 pixels
  1.1140 +    movdqu     [edi], xmm0
  1.1141 +    lea        edi, [edi + 16]
  1.1142 +    jge        xloop4
  1.1143 +
  1.1144 +    align      4
  1.1145 + xloop49:
  1.1146 +    test       ecx, 2
  1.1147 +    je         xloop29
  1.1148 +
   1.1149 +    // 2 pixels.
  1.1150 +    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
  1.1151 +    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
  1.1152 +    pextrw     eax, xmm2, 5           // get x2 integer.
  1.1153 +    punpckldq  xmm0, xmm1             // x0 x1
  1.1154 +
  1.1155 +    movq       qword ptr [edi], xmm0
  1.1156 +    lea        edi, [edi + 8]
  1.1157 +
  1.1158 + xloop29:
  1.1159 +    test       ecx, 1
  1.1160 +    je         xloop99
  1.1161 +
   1.1162 +    // 1 pixel.
  1.1163 +    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
  1.1164 +    movd       dword ptr [edi], xmm0
  1.1165 +    align      4
  1.1166 + xloop99:
  1.1167 +
  1.1168 +    pop        esi
  1.1169 +    pop        edi
  1.1170 +    ret
  1.1171 +  }
  1.1172 +}
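
This is the unfiltered ARGB column scaler: each output pixel is a straight copy of the 4-byte source pixel at the 16.16 fixed-point position x, which then advances by dx. A scalar sketch (hypothetical name):

#include <stdint.h>
#include <string.h>

// Scalar sketch of ScaleARGBCols_SSE2: copy the whole 4-byte ARGB pixel at
// the 16.16 fixed-point position x, stepping by dx.
static void ScaleARGBCols_C_sketch(uint8_t* dst_argb, const uint8_t* src_argb,
                                   int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    memcpy(dst_argb + 4 * j, src_argb + 4 * (x >> 16), 4);  // one ARGB pixel
    x += dx;
  }
}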
  1.1173 +
  1.1174 +// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
  1.1175 +// TODO(fbarchard): Port to Neon
  1.1176 +
  1.1177 +// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
  1.1178 +static uvec8 kShuffleColARGB = {
  1.1179 +  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  1.1180 +  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
  1.1181 +};
  1.1182 +
  1.1183 +// Shuffle table for duplicating 2 fractions into 8 bytes each
  1.1184 +static uvec8 kShuffleFractions = {
  1.1185 +  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
  1.1186 +};
  1.1187 +
  1.1188 +__declspec(naked) __declspec(align(16))
  1.1189 +void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
  1.1190 +                               int dst_width, int x, int dx) {
  1.1191 +  __asm {
  1.1192 +    push       esi
  1.1193 +    push       edi
  1.1194 +    mov        edi, [esp + 8 + 4]    // dst_argb
  1.1195 +    mov        esi, [esp + 8 + 8]    // src_argb
  1.1196 +    mov        ecx, [esp + 8 + 12]   // dst_width
  1.1197 +    movd       xmm2, [esp + 8 + 16]  // x
  1.1198 +    movd       xmm3, [esp + 8 + 20]  // dx
  1.1199 +    movdqa     xmm4, kShuffleColARGB
  1.1200 +    movdqa     xmm5, kShuffleFractions
  1.1201 +    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
  1.1202 +    psrlw      xmm6, 9
  1.1203 +    pextrw     eax, xmm2, 1         // get x0 integer. preroll
  1.1204 +    sub        ecx, 2
  1.1205 +    jl         xloop29
  1.1206 +
  1.1207 +    movdqa     xmm0, xmm2           // x1 = x0 + dx
  1.1208 +    paddd      xmm0, xmm3
  1.1209 +    punpckldq  xmm2, xmm0           // x0 x1
  1.1210 +    punpckldq  xmm3, xmm3           // dx dx
  1.1211 +    paddd      xmm3, xmm3           // dx * 2, dx * 2
  1.1212 +    pextrw     edx, xmm2, 3         // get x1 integer. preroll
  1.1213 +
  1.1214 +    // 2 Pixel loop.
  1.1215 +    align      4
  1.1216 +  xloop2:
  1.1217 +    movdqa     xmm1, xmm2           // x0, x1 fractions.
  1.1218 +    paddd      xmm2, xmm3           // x += dx
  1.1219 +    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
  1.1220 +    psrlw      xmm1, 9              // 7 bit fractions.
  1.1221 +    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
  1.1222 +    pshufb     xmm1, xmm5           // 0000000011111111
  1.1223 +    pshufb     xmm0, xmm4           // arrange pixels into pairs
  1.1224 +    pxor       xmm1, xmm6           // 0..7f and 7f..0
  1.1225 +    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
  1.1226 +    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
  1.1227 +    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
  1.1228 +    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
  1.1229 +    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
  1.1230 +    movq       qword ptr [edi], xmm0
  1.1231 +    lea        edi, [edi + 8]
  1.1232 +    sub        ecx, 2               // 2 pixels
  1.1233 +    jge        xloop2
  1.1234 +
  1.1235 +    align      4
  1.1236 + xloop29:
  1.1237 +
  1.1238 +    add        ecx, 2 - 1
  1.1239 +    jl         xloop99
  1.1240 +
  1.1241 +    // 1 pixel remainder
  1.1242 +    psrlw      xmm2, 9              // 7 bit fractions.
  1.1243 +    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
  1.1244 +    pshufb     xmm2, xmm5           // 00000000
  1.1245 +    pshufb     xmm0, xmm4           // arrange pixels into pairs
  1.1246 +    pxor       xmm2, xmm6           // 0..7f and 7f..0
  1.1247 +    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
  1.1248 +    psrlw      xmm0, 7
  1.1249 +    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
  1.1250 +    movd       [edi], xmm0
  1.1251 +
  1.1252 +    align      4
  1.1253 + xloop99:
  1.1254 +
  1.1255 +    pop        edi
  1.1256 +    pop        esi
  1.1257 +    ret
  1.1258 +  }
  1.1259 +}
  1.1260 +
  1.1261 +// Reads 4 pixels, duplicates them and writes 8 pixels.
  1.1262 +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
  1.1263 +__declspec(naked) __declspec(align(16))
  1.1264 +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
  1.1265 +                           int dst_width, int x, int dx) {
  1.1266 +  __asm {
  1.1267 +    mov        edx, [esp + 4]    // dst_argb
  1.1268 +    mov        eax, [esp + 8]    // src_argb
  1.1269 +    mov        ecx, [esp + 12]   // dst_width
  1.1270 +
  1.1271 +    align      4
  1.1272 +  wloop:
  1.1273 +    movdqa     xmm0, [eax]
  1.1274 +    lea        eax,  [eax + 16]
  1.1275 +    movdqa     xmm1, xmm0
  1.1276 +    punpckldq  xmm0, xmm0
  1.1277 +    punpckhdq  xmm1, xmm1
  1.1278 +    sub        ecx, 8
  1.1279 +    movdqa     [edx], xmm0
  1.1280 +    movdqa     [edx + 16], xmm1
  1.1281 +    lea        edx, [edx + 32]
  1.1282 +    jg         wloop
  1.1283 +
  1.1284 +    ret
  1.1285 +  }
  1.1286 +}
  1.1287 +
  1.1288 +// Divide num by div and return as 16.16 fixed point result.
  1.1289 +__declspec(naked) __declspec(align(16))
  1.1290 +int FixedDiv_X86(int num, int div) {
  1.1291 +  __asm {
  1.1292 +    mov        eax, [esp + 4]    // num
  1.1293 +    cdq                          // extend num to 64 bits
  1.1294 +    shld       edx, eax, 16      // 32.16
  1.1295 +    shl        eax, 16
  1.1296 +    idiv       dword ptr [esp + 8]
  1.1297 +    ret
  1.1298 +  }
  1.1299 +}
  1.1300 +
  1.1301 +// Divide num by div and return as 16.16 fixed point result.
  1.1302 +__declspec(naked) __declspec(align(16))
  1.1303 +int FixedDiv1_X86(int num, int div) {
  1.1304 +  __asm {
  1.1305 +    mov        eax, [esp + 4]    // num
  1.1306 +    mov        ecx, [esp + 8]    // denom
  1.1307 +    cdq                          // extend num to 64 bits
  1.1308 +    shld       edx, eax, 16      // 32.16
  1.1309 +    shl        eax, 16
  1.1310 +    sub        eax, 0x00010001
  1.1311 +    sbb        edx, 0
  1.1312 +    sub        ecx, 1
  1.1313 +    idiv       ecx
  1.1314 +    ret
  1.1315 +  }
  1.1316 +}
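
Both helpers place num << 16 into edx:eax and divide, producing a 16.16 fixed-point quotient; FixedDiv1_X86 additionally subtracts a 0x00010001 bias and divides by div - 1. Portable equivalents of the arithmetic as written above (hypothetical names):

#include <stdint.h>

// Portable equivalent of FixedDiv_X86: 16.16 fixed-point division.
static int FixedDiv_C_sketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}

// Portable equivalent of FixedDiv1_X86 as written above:
// ((num << 16) - 0x00010001) / (div - 1).
static int FixedDiv1_C_sketch(int num, int div) {
  return (int)((((int64_t)num << 16) - 0x00010001) / (div - 1));
}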
  1.1317 +
  1.1318 +#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
  1.1319 +
  1.1320 +#ifdef __cplusplus
  1.1321 +}  // extern "C"
  1.1322 +}  // namespace libyuv
  1.1323 +#endif
