media/libyuv/source/scale_win.cc

Cloned from upstream origin tor-browser at tag tor-browser-31.3.0esr-4.5-1-build1, revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant added before the >> 2 in the 3/4 box filters.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

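// Note on kScaleAc33 and kScaleAb2: pmulhuw keeps the high 16 bits of a
// 16x16 bit multiply, so multiplying a box sum by 65536 / d approximates
// an integer division by d. A scalar sketch of the trick (illustrative
// only, not part of the upstream file):
//
//   uint16 sum = ...;                 // 16 bit box sum from paddusw
//   uint16 scale = 65536 / 9;         // one lane of kScaleAc33
//   uint8 pixel = (uint8)(((uint32)sum * scale) >> 16);  // ~= sum / 9
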
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8               // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

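// Illustrative scalar equivalent of ScaleRowDown2 (a sketch, not part of
// the upstream file): keep every second source pixel. The SSE2 loop above
// produces the same result 16 pixels at a time by shifting each 16 bit
// pair right by 8 (keeping the odd byte) and repacking with packuswb.
static void ScaleRowDown2_C_Sketch(const uint8* src_ptr,
                                   uint8* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];  // odd source pixels survive.
  }
}
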
// Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        esi
    ret
  }
}

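// Illustrative scalar equivalent of ScaleRowDown2Box (a sketch, not part of
// the upstream file). pavgb/pavgw compute (a + b + 1) >> 1, so averaging
// rows first and then columns reproduces the rounded 2x2 box average of the
// assembly above.
static void ScaleRowDown2Box_C_Sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    int r0 = (s[x * 2 + 0] + t[x * 2 + 0] + 1) >> 1;  // average rows
    int r1 = (s[x * 2 + 1] + t[x * 2 + 1] + 1) >> 1;
    dst_ptr[x] = (uint8)((r0 + r1 + 1) >> 1);         // average columns
  }
}
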
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: none.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    psrlw      xmm0, 8               // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
// Alignment requirement: none.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax,  [eax + 32]

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: none.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        esi
    ret
  }
}

// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    sub        ecx, 8
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         wloop

    ret
  }
}

// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_ptr
    mov        esi, [esp + 8 + 8]    // src_stride
    mov        edx, [esp + 8 + 12]   // dst_ptr
    mov        ecx, [esp + 8 + 16]   // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, [eax + esi * 2]
    movdqa     xmm3, [eax + esi * 2 + 16]
    movdqa     xmm4, [eax + edi]
    movdqa     xmm5, [eax + edi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
    psrlw      xmm0, 8
    pand       xmm2, xmm7
    pavgw      xmm0, xmm2
    packuswb   xmm0, xmm0

    sub        ecx, 8
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm3, kShuf0
    movdqa     xmm4, kShuf1
    movdqa     xmm5, kShuf2

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    ret
  }
}

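// Illustrative scalar equivalent of ScaleRowDown34 (a sketch, not part of
// the upstream file; assumes dst_width is a multiple of 3). The kShuf
// tables keep 3 pixels of every 4, dropping source index 2 of each group.
static void ScaleRowDown34_C_Sketch(const uint8* src_ptr,
                                    uint8* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}
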
// Blends 32x2 rectangle to 24x1
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

    align      4
  wloop:
    movdqa     xmm0, [eax]           // pixels 0..7
    movdqa     xmm1, [eax + esi]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqa     xmm0, [eax + 16]      // pixels 16..23
    movdqa     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    sub        ecx, 24
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    jg         wloop

    pop        esi
    ret
  }
}

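// Illustrative sketch of the 3/4 box filtering above (not part of the
// upstream file). After the two source rows are averaged, kShuf01 pairs
// neighboring columns and kMadd01 weights them (3,1), (2,2) or (1,3);
// kRound34 and psrlw 2 complete the rounded divide by 4:
//
//   dst[0] = (s[0] * 3 + s[1] * 1 + 2) >> 2;
//   dst[1] = (s[1] * 2 + s[2] * 2 + 2) >> 2;
//   dst[2] = (s[2] * 1 + s[3] * 3 + 2) >> 2;
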
// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

    align      4
  wloop:
    movdqa     xmm0, [eax]           // pixels 0..7
    movdqa     xmm1, [eax + esi]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqa     xmm0, [eax + 16]      // pixels 16..23
    movdqa     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    sub        ecx, 24
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    jg         wloop

    pop        esi
    ret
  }
}

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm4, kShuf38a
    movdqa     xmm5, kShuf38b

    align      4
  xloop:
    movdqa     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
    movdqa     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm4
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1

    sub        ecx, 12
    movq       qword ptr [edx], xmm0  // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
    jg         xloop

    ret
  }
}

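// Illustrative scalar equivalent of ScaleRowDown38 (a sketch, not part of
// the upstream file; assumes dst_width is a multiple of 3). kShuf38a and
// kShuf38b pick bytes 0, 3 and 6 of every 8, i.e. 12 pixels from 32.
static void ScaleRowDown38_C_Sketch(const uint8* src_ptr,
                                    uint8* dst_ptr, int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[3];
    dst_ptr[x + 2] = src_ptr[6];
    src_ptr += 8;
  }
}
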
// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShufAc
    movdqa     xmm3, kShufAc3
    movdqa     xmm4, kScaleAc33
    pxor       xmm5, xmm5

    align      4
  xloop:
    movdqa     xmm0, [eax]           // sum up 3 rows into xmm0/1
    movdqa     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqa     xmm6, [eax + esi * 2]
    lea        eax, [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    sub        ecx, 6
    movd       [edx], xmm6           // write 6 pixels
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6
    lea        edx, [edx + 6]
    jg         xloop

    pop        esi
    ret
  }
}

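// Illustrative sketch of what ScaleRowDown38_3_Box computes per output
// pixel (not part of the upstream file): a 3 row by 3 column box sum
// (2 columns for every third pixel), then the pmulhuw divide from
// kScaleAc33:
//
//   sum = row0[i] + row0[i + 1] + row0[i + 2]
//       + row1[i] + row1[i + 1] + row1[i + 2]
//       + row2[i] + row2[i + 1] + row2[i + 2];
//   dst = (uint8)((sum * (65536 / 9)) >> 16);  // 65536 / 6 for 2 columns
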
// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShufAb0
    movdqa     xmm3, kShufAb1
    movdqa     xmm4, kShufAb2
    movdqa     xmm5, kScaleAb2

    align      4
  xloop:
    movdqa     xmm0, [eax]           // average 2 rows into xmm0
    pavgb      xmm0, [eax + esi]
    lea        eax, [eax + 16]

    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    sub        ecx, 6
    movd       [edx], xmm1           // write 6 pixels
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1
    lea        edx, [edx + 6]
    jg         xloop

    pop        esi
    ret
  }
}

// Reads 16xN bytes and produces 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
__declspec(naked) __declspec(align(16))
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width,
                       int src_height) {
  __asm {
    push       esi
    push       edi
    push       ebx
    push       ebp
    mov        esi, [esp + 16 + 4]   // src_ptr
    mov        edx, [esp + 16 + 8]   // src_stride
    mov        edi, [esp + 16 + 12]  // dst_ptr
    mov        ecx, [esp + 16 + 16]  // src_width
    mov        ebx, [esp + 16 + 20]  // src_height
    pxor       xmm4, xmm4
    dec        ebx

    align      4
  xloop:
    // first row
    movdqa     xmm0, [esi]
    lea        eax, [esi + edx]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm4
    punpckhbw  xmm1, xmm4
    lea        esi, [esi + 16]
    mov        ebp, ebx
    test       ebp, ebp
    je         ydone

    // sum remaining rows
    align      4
  yloop:
    movdqa     xmm2, [eax]       // read 16 pixels
    lea        eax, [eax + edx]  // advance to next row
    movdqa     xmm3, xmm2
    punpcklbw  xmm2, xmm4
    punpckhbw  xmm3, xmm4
    paddusw    xmm0, xmm2        // sum 16 words
    paddusw    xmm1, xmm3
    sub        ebp, 1
    jg         yloop

    align      4
  ydone:
    movdqa     [edi], xmm0
    movdqa     [edi + 16], xmm1
    lea        edi, [edi + 32]

    sub        ecx, 16
    jg         xloop

    pop        ebp
    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}

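// Illustrative scalar equivalent of ScaleAddRows (a sketch, not part of the
// upstream file): sum src_height rows of bytes into one row of 16 bit
// words. The clamp mirrors the saturating paddusw used above.
static void ScaleAddRows_C_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint16* dst_ptr, int src_width,
                                  int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    uint32 sum = 0;
    for (y = 0; y < src_height; ++y) {
      sum += src_ptr[x + y * src_stride];
    }
    dst_ptr[x] = (uint16)(sum < 65535u ? sum : 65535u);
  }
}
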
// Bilinear column filtering. SSSE3 version.
// TODO(fbarchard): Port to Neon
// TODO(fbarchard): Switch the following:
//    xor        ebx, ebx
//    mov        bx, word ptr [esi + eax]  // 2 source x0 pixels
// To
//    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
// when drmemory bug fixed.
// https://code.google.com/p/drmemory/issues/detail?id=1396

__declspec(naked) __declspec(align(16))
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]    // dst_ptr
    mov        esi, [esp + 12 + 8]    // src_ptr
    mov        ecx, [esp + 12 + 12]   // dst_width
    movd       xmm2, [esp + 12 + 16]  // x
    movd       xmm3, [esp + 12 + 20]  // dx
    mov        eax, 0x04040000      // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2           // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0           // x0 x1
    punpckldq  xmm3, xmm3           // dx dx
    paddd      xmm3, xmm3           // dx * 2, dx * 2
    pextrw     edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
    align      4
  xloop2:
    movdqa     xmm1, xmm2           // x0, x1 fractions.
    paddd      xmm2, xmm3           // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9              // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5           // 0011
    punpcklwd  xmm0, xmm4
    pxor       xmm1, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1           // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // 8 bits, 2 pixels.
    movd       ebx, xmm0
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2               // 2 pixels
    jge        xloop2

    align      4
 xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9              // 7 bit fractions.
    pshufb     xmm2, xmm5           // 0011
    pxor       xmm2, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2           // 16 bit
    psrlw      xmm0, 7              // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // 8 bits
    movd       ebx, xmm0
    mov        [edi], bl

    align      4
 xloop99:

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

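// Illustrative sketch of the bilinear math above (not part of the upstream
// file). x and dx are 16.16 fixed point; the code keeps a 7 bit fraction,
// and the pxor with 0x7f turns f into 127 - f for the left weight:
//
//   f = (x >> 9) & 0x7f;
//   dst[i] = (uint8)((src_ptr[x >> 16] * (127 - f) +
//                     src_ptr[(x >> 16) + 1] * f) >> 7);
//   x += dx;
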
// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_ptr
    mov        eax, [esp + 8]    // src_ptr
    mov        ecx, [esp + 12]   // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0
    punpckhbw  xmm1, xmm1
    sub        ecx, 32
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    shufps     xmm0, xmm1, 0xdd      // odd pixels
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax,  [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_argb
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_argb
    mov        ecx, [esp + 4 + 16]   // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax,  [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        esi
    ret
  }
}

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]    // src_argb
                                     // src_stride ignored
    mov        ebx, [esp + 8 + 12]   // src_stepx
    mov        edx, [esp + 8 + 16]   // dst_argb
    mov        ecx, [esp + 8 + 20]   // dst_width
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

    align      4
  wloop:
    movd       xmm0, [eax]
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax,  [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}

// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]    // src_argb
    mov        esi, [esp + 12 + 8]    // src_stride
    mov        ebx, [esp + 12 + 12]   // src_stepx
    mov        edx, [esp + 12 + 16]   // dst_argb
    mov        ecx, [esp + 12 + 20]   // dst_width
    lea        esi, [eax + esi]       // row1 pointer
    lea        ebx, [ebx * 4]
    lea        edi, [ebx + ebx * 2]

    align      4
  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax,  [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi,  [esi + ebx * 4]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) __declspec(align(16))
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11      // dx  0 dx  0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 2
    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0            // x3 x2 x1 x0
    paddd      xmm3, xmm3            // 0, 0, 0,  dx * 4
    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1          // get x0 integer.
    pextrw     edx, xmm2, 3          // get x1 integer.

    cmp        ecx, 0
    jle        xloop99
    sub        ecx, 4
    jl         xloop49

    // 4 Pixel loop.
    align      4
 xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    pextrw     edx, xmm2, 7           // get x3 integer.
    paddd      xmm2, xmm3             // x += dx
    punpckldq  xmm0, xmm1             // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixels
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixels
    pextrw     eax, xmm2, 1           // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3           // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4             // x2 x3
    punpcklqdq xmm0, xmm1             // x0 x1 x2 x3
    sub        ecx, 4                 // 4 pixels
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    jge        xloop4

    align      4
 xloop49:
    test       ecx, 2
    je         xloop29

    // 2 Pixels.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixels
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixels
    pextrw     eax, xmm2, 5           // get x2 integer.
    punpckldq  xmm0, xmm1             // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

 xloop29:
    test       ecx, 1
    je         xloop99

    // 1 pixel.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixels
    movd       dword ptr [edi], xmm0
    align      4
 xloop99:

    pop        esi
    pop        edi
    ret
  }
}

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) __declspec(align(16))
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx
    movdqa     xmm4, kShuffleColARGB
    movdqa     xmm5, kShuffleFractions
    pcmpeqb    xmm6, xmm6           // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1         // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2           // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0           // x0 x1
    punpckldq  xmm3, xmm3           // dx dx
    paddd      xmm3, xmm3           // dx * 2, dx * 2
    pextrw     edx, xmm2, 3         // get x1 integer. preroll

    // 2 Pixel loop.
    align      4
  xloop2:
    movdqa     xmm1, xmm2           // x0, x1 fractions.
    paddd      xmm2, xmm3           // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9              // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5           // 0000000011111111
    pshufb     xmm0, xmm4           // arrange pixels into pairs
    pxor       xmm1, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1           // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1         // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3         // get x1 integer. next iteration.
    psrlw      xmm0, 7              // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0           // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2               // 2 pixels
    jge        xloop2

    align      4
 xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    psrlw      xmm2, 9              // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5           // 00000000
    pshufb     xmm0, xmm4           // arrange pixels into pairs
    pxor       xmm2, xmm6           // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2           // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0           // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

    align      4
 xloop99:

    pop        edi
    pop        esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]    // dst_argb
    mov        eax, [esp + 8]    // src_argb
    mov        ecx, [esp + 12]   // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    lea        eax,  [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0
    punpckhdq  xmm1, xmm1
    sub        ecx, 8
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]
    ret
  }
}

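// Illustrative C equivalent of FixedDiv_X86 (a sketch, not part of the
// upstream file): widen the numerator to 64 bits, shift up 16, and divide.
static int FixedDiv_C_Sketch(int num, int div) {
  return (int)((((int64)num) << 16) / div);
}
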
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]    // num
    mov        ecx, [esp + 8]    // denom
    cdq                          // extend num to 64 bits
    shld       edx, eax, 16      // 32.16
    shl        eax, 16
    sub        eax, 0x00010001
    sbb        edx, 0
    sub        ecx, 1
    idiv       ecx
    ret
  }
}

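// Illustrative C equivalent of FixedDiv1_X86 (a sketch, not part of the
// upstream file): subtracting the 0x00010001 bias before dividing by
// div - 1 computes roughly ((num - 1) << 16) / (div - 1).
static int FixedDiv1_C_Sketch(int num, int div) {
  return (int)(((((int64)num) << 16) - 0x00010001) / (div - 1));
}
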
#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif
