media/libyuv/source/scale_win.cc

Cloned from upstream origin tor-browser at tag tor-browser-31.3.0esr-4.5-1-build1, revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant added before the >> 2 in the 3/4 box filters.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

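// Note on kScaleAc33 and kScaleAb2: pmulhuw keeps the high 16 bits of a
// 16x16 bit multiply, so multiplying a box sum by 65536 / d approximates
// an integer division by d. A scalar sketch of the trick (illustrative
// only, not part of the upstream file):
//
//   uint16 sum = ...;                 // 16 bit box sum from paddusw
//   uint16 scale = 65536 / 9;         // one lane of kScaleAc33
//   uint8 pixel = (uint8)(((uint32)sum * scale) >> 16);  // ~= sum / 9
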
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8               // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

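// Illustrative scalar equivalent of ScaleRowDown2 (a sketch, not part of
// the upstream file): keep every second source pixel. The SSE2 loop above
// produces the same result 16 pixels at a time by shifting each 16 bit
// pair right by 8 (keeping the odd byte) and repacking with packuswb.
static void ScaleRowDown2_C_Sketch(const uint8* src_ptr,
                                   uint8* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];  // odd source pixels survive.
  }
}
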
// Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        esi
    ret
  }
}

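// Illustrative scalar equivalent of ScaleRowDown2Box (a sketch, not part of
// the upstream file). pavgb/pavgw compute (a + b + 1) >> 1, so averaging
// rows first and then columns reproduces the rounded 2x2 box average of the
// assembly above.
static void ScaleRowDown2Box_C_Sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    int r0 = (s[x * 2 + 0] + t[x * 2 + 0] + 1) >> 1;  // average rows
    int r1 = (s[x * 2 + 1] + t[x * 2 + 1] + 1) >> 1;
    dst_ptr[x] = (uint8)((r0 + r1 + 1) >> 1);         // average columns
  }
}
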
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: none.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8               // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
// Alignment requirement: none.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: none.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        esi
    ret
  }
}

// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    sub        ecx, 8
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_ptr
    mov        esi, [esp + 8 + 8]    // src_stride
    mov        edx, [esp + 8 + 12]   // dst_ptr
    mov        ecx, [esp + 8 + 16]   // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, [eax + esi * 2]
    movdqa     xmm3, [eax + esi * 2 + 16]
    movdqa     xmm4, [eax + edi]
    movdqa     xmm5, [eax + edi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
    psrlw      xmm0, 8
    pand       xmm2, xmm7
    pavgw      xmm0, xmm2
    packuswb   xmm0, xmm0

    sub        ecx, 8
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm3, kShuf0
    movdqa     xmm4, kShuf1
    movdqa     xmm5, kShuf2

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    ret
  }
}

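// Illustrative scalar equivalent of ScaleRowDown34 (a sketch, not part of
// the upstream file; assumes dst_width is a multiple of 3). The kShuf
// tables keep 3 pixels of every 4, dropping source index 2 of each group.
static void ScaleRowDown34_C_Sketch(const uint8* src_ptr,
                                    uint8* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}
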
// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

    align      4
  wloop:
    movdqa     xmm0, [eax]           // pixels 0..7
    movdqa     xmm1, [eax + esi]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqa     xmm0, [eax + 16]      // pixels 16..23
    movdqa     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    sub        ecx, 24
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    jg         wloop

    pop        esi
    ret
  }
}

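// Illustrative sketch of the 3/4 box filtering above (not part of the
// upstream file). After the two source rows are averaged, kShuf01 pairs
// neighboring columns and kMadd01 weights them (3,1), (2,2) or (1,3);
// kRound34 and psrlw 2 complete the rounded divide by 4:
//
//   dst[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
//   dst[1] = (s[1] * 2 + s[2] * 2 + 2) >> 2;
//   dst[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
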
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

    align      4
  wloop:
    movdqa     xmm0, [eax]           // pixels 0..7
    movdqa     xmm1, [eax + esi]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqa     xmm0, [eax + 16]      // pixels 16..23
    movdqa     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    sub        ecx, 24
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    jg         wloop

    pop        esi
    ret
  }
}

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm4, kShuf38a
    movdqa     xmm5, kShuf38b

    align      4
  xloop:
    movdqa     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
    movdqa     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm4
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1

    sub        ecx, 12
    movq       qword ptr [edx], xmm0  // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
    jg         xloop

    ret
  }
}

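// Illustrative scalar equivalent of ScaleRowDown38 (a sketch, not part of
// the upstream file; assumes dst_width is a multiple of 3). kShuf38a and
// kShuf38b pick bytes 0, 3 and 6 of every 8, i.e. 12 pixels from 32.
static void ScaleRowDown38_C_Sketch(const uint8* src_ptr,
                                    uint8* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[3];
    dst_ptr[x + 2] = src_ptr[6];
    src_ptr += 8;
  }
}
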
// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShufAc
    movdqa     xmm3, kShufAc3
    movdqa     xmm4, kScaleAc33
    pxor       xmm5, xmm5

    align      4
  xloop:
    movdqa     xmm0, [eax]           // sum up 3 rows into xmm0/1
    movdqa     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqa     xmm6, [eax + esi * 2]
    lea        eax, [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    sub        ecx, 6
    movd       [edx], xmm6           // write 6 pixels
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6
    lea        edx, [edx + 6]
    jg         xloop

    pop        esi
    ret
  }
}

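// Illustrative sketch of what ScaleRowDown38_3_Box computes per output
// pixel (not part of the upstream file): a 3 row by 3 column box sum
// (2 columns for every third pixel), then the pmulhuw divide from
// kScaleAc33:
//
//   sum = row0[i] + row0[i + 1] + row0[i + 2]
//       + row1[i] + row1[i + 1] + row1[i + 2]
//       + row2[i] + row2[i + 1] + row2[i + 2];
//   dst = (uint8)((sum * (65536 / 9)) >> 16);  // 65536 / 6 for 2 columns
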
// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShufAb0
    movdqa     xmm3, kShufAb1
    movdqa     xmm4, kShufAb2
    movdqa     xmm5, kScaleAb2

    align      4
  xloop:
    movdqa     xmm0, [eax]           // average 2 rows into xmm0
    pavgb      xmm0, [eax + esi]
    lea        eax, [eax + 16]

    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    sub        ecx, 6
    movd       [edx], xmm1           // write 6 pixels
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1
    lea        edx, [edx + 6]
    jg         xloop

    pop        esi
    ret
  }
}

// Reads 16xN bytes and produces 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
__declspec(naked) __declspec(align(16))
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width,
                       int src_height) {
  __asm {
    push       esi
    push       edi
    push       ebx
    push       ebp
    mov        esi, [esp + 16 + 4]   // src_ptr
    mov        edx, [esp + 16 + 8]   // src_stride
    mov        edi, [esp + 16 + 12]  // dst_ptr
    mov        ecx, [esp + 16 + 16]  // src_width
    mov        ebx, [esp + 16 + 20]  // src_height
    pxor       xmm4, xmm4
    dec        ebx

    align      4
  xloop:
    // first row
    movdqa     xmm0, [esi]
    lea        eax, [esi + edx]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm4
    punpckhbw  xmm1, xmm4
    lea        esi, [esi + 16]
    mov        ebp, ebx
    test       ebp, ebp
    je         ydone

    // sum remaining rows
    align      4
  yloop:
    movdqa     xmm2, [eax]       // read 16 pixels
    lea        eax, [eax + edx]  // advance to next row
    movdqa     xmm3, xmm2
    punpcklbw  xmm2, xmm4
    punpckhbw  xmm3, xmm4
    paddusw    xmm0, xmm2        // sum 16 words
    paddusw    xmm1, xmm3
    sub        ebp, 1
    jg         yloop

    align      4
  ydone:
    movdqa     [edi], xmm0
    movdqa     [edi + 16], xmm1
    lea        edi, [edi + 32]

    sub        ecx, 16
    jg         xloop

    pop        ebp
    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}

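// Illustrative scalar equivalent of ScaleAddRows (a sketch, not part of the
// upstream file): sum src_height rows of bytes into one row of 16 bit
// words. The clamp mirrors the saturating paddusw used above.
static void ScaleAddRows_C_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint16* dst_ptr, int src_width,
                                  int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    uint32 sum = 0;
    for (y = 0; y < src_height; ++y) {
      sum += src_ptr[x + y * src_stride];
    }
    dst_ptr[x] = (uint16)(sum < 65535u ? sum : 65535u);
  }
}
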
// Bilinear column filtering. SSSE3 version.
// TODO(fbarchard): Port to Neon
// TODO(fbarchard): Switch the following:
//    xor        ebx, ebx
//    mov        bx, word ptr [esi + eax]  // 2 source x0 pixels
// To
//    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
// when drmemory bug fixed.
// https://code.google.com/p/drmemory/issues/detail?id=1396

__declspec(naked) __declspec(align(16))
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]    // dst_ptr
    mov        esi, [esp + 12 + 8]    // src_ptr
    mov        ecx, [esp + 12 + 12]   // dst_width
    movd       xmm2, [esp + 12 + 16]  // x
    movd       xmm3, [esp + 12 + 20]  // dx
    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2           // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0           // x0 x1
    punpckldq  xmm3, xmm3           // dx dx
    paddd      xmm3, xmm3           // dx * 2, dx * 2
    pextrw     edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
    align      4
  xloop2:
    movdqa     xmm1, xmm2           // x0, x1 fractions.
    paddd      xmm2, xmm3           // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9              // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5           // 0011
    punpcklwd  xmm0, xmm4
    pxor       xmm1, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // 8 bits, 2 pixels.
    movd       ebx, xmm0
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2               // 2 pixels
    jge        xloop2

    align      4
 xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9              // 7 bit fractions.
    pshufb     xmm2, xmm5           // 0011
    pxor       xmm2, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2           // 16 bit
    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // 8 bits
    movd       ebx, xmm0
    mov        [edi], bl

    align      4
 xloop99:

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

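// Illustrative sketch of the bilinear math above (not part of the upstream
// file). x and dx are 16.16 fixed point; the code keeps a 7 bit fraction,
// and the pxor with 0x7f turns f into 127 - f for the left weight:
//
//   f = (x >> 9) & 0x7f;
//   dst[i] = (uint8)((src_ptr[x >> 16] * (127 - f) +
//                     src_ptr[(x >> 16) + 1] * f) >> 7);
//   x += dx;
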
// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_ptr
    mov        eax, [esp + 8]    // src_ptr
    mov        ecx, [esp + 12]   // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0
    punpckhbw  xmm1, xmm1
    sub        ecx, 32
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    shufps     xmm0, xmm1, 0xdd      // odd pixels
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_argb
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_argb
    mov        ecx, [esp + 4 + 16]   // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        esi
    ret
  }
}

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]    // src_argb
                                     // src_stride ignored
    mov        ebx, [esp + 8 + 12]   // src_stepx
    mov        edx, [esp + 8 + 16]   // dst_argb
    mov        ecx, [esp + 8 + 20]   // dst_width
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

    align      4
  wloop:
    movd       xmm0, [eax]
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax,  [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]    // src_argb
    mov        esi, [esp + 12 + 8]    // src_stride
    mov        ebx, [esp + 12 + 12]   // src_stepx
    mov        edx, [esp + 12 + 16]   // dst_argb
    mov        ecx, [esp + 12 + 20]   // dst_width
    lea        esi, [eax + esi]       // row1 pointer
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

    align      4
  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax,  [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi,  [esi + ebx * 4]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) __declspec(align(16))
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0            // x3 x2 x1 x0
    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1          // get x0 integer.
    pextrw     edx, xmm2, 3          // get x1 integer.

    cmp        ecx, 0
    jle        xloop99
    sub        ecx, 4
    jl         xloop49

    // 4 Pixel loop.
    align      4
 xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    pextrw     edx, xmm2, 7           // get x3 integer.
    paddd      xmm2, xmm3             // x += dx
    punpckldq  xmm0, xmm1             // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4             // x2 x3
    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
    sub        ecx, 4                 // 4 pixels
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    jge        xloop4

    align      4
 xloop49:
    test       ecx, 2
    je         xloop29

    // 2 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    punpckldq  xmm0, xmm1             // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

 xloop29:
    test       ecx, 1
    je         xloop99

    // 1 pixel.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd       dword ptr [edi], xmm0
    align      4
 xloop99:

    pop        esi
    pop        edi
    ret
  }
}

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) __declspec(align(16))
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx
    movdqa     xmm4, kShuffleColARGB
    movdqa     xmm5, kShuffleFractions
    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2           // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0           // x0 x1
    punpckldq  xmm3, xmm3           // dx dx
    paddd      xmm3, xmm3           // dx * 2, dx * 2
    pextrw     edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
    align      4
  xloop2:
    movdqa     xmm1, xmm2           // x0, x1 fractions.
    paddd      xmm2, xmm3           // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9              // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5           // 0000000011111111
    pshufb     xmm0, xmm4           // arrange pixels into pairs
    pxor       xmm1, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2               // 2 pixels
    jge        xloop2

    align      4
 xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    psrlw      xmm2, 9              // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5           // 00000000
    pshufb     xmm0, xmm4           // arrange pixels into pairs
    pxor       xmm2, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

    align      4
 xloop99:

    pop        edi
    pop        esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_argb
    mov        eax, [esp + 8]    // src_argb
    mov        ecx, [esp + 12]   // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0
    punpckhdq  xmm1, xmm1
    sub        ecx, 8
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]
    ret
  }
}

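// Illustrative C equivalent of FixedDiv_X86 (a sketch, not part of the
// upstream file): widen the numerator to 64 bits, shift up 16, and divide.
static int FixedDiv_C_Sketch(int num, int div) {
  return (int)((((int64)num) << 16) / div);
}
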
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    mov        ecx, [esp + 8]    // denom
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    sub        eax, 0x00010001
    sbb        edx, 0
    sub        ecx, 1
    idiv       ecx
    ret
  }
}

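// Illustrative C equivalent of FixedDiv1_X86 (a sketch, not part of the
// upstream file): subtracting the 0x00010001 bias before dividing by
// div - 1 computes roughly ((num - 1) << 16) / (div - 1).
static int FixedDiv1_C_Sketch(int num, int div) {
  return (int)(((((int64)num) << 16) - 0x00010001) / (div - 1));
}
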
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
