media/libyuv/source/compare_win.cc

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2  *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
     3  *
     4  *  Use of this source code is governed by a BSD-style license
     5  *  that can be found in the LICENSE file in the root of the source
     6  *  tree. An additional intellectual property rights grant can be found
     7  *  in the file PATENTS. All contributing project authors may
     8  *  be found in the AUTHORS file in the root of the source tree.
     9  */
    11 #include "libyuv/basic_types.h"
    12 #include "libyuv/row.h"
    14 #ifdef __cplusplus
    15 namespace libyuv {
    16 extern "C" {
    17 #endif
    19 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
    // SumSquareError_SSE2: returns the sum of (src_a[i] - src_b[i])^2 over the
    // input bytes, using SSE2.
    //
    // Naked function: there is no compiler-generated prologue/epilogue, so the
    // arguments are read straight from the stack and the result is left in eax.
    //   src_a, src_b - byte buffers; both are loaded with movdqa, which faults
    //                  on addresses that are not 16-byte aligned.
    //   count        - number of bytes. The loop handles 16 bytes per pass and
    //                  tests the counter only after a pass, so it always runs
    //                  at least once and works in whole 16-byte chunks; callers
    //                  should pass a positive multiple of 16.
    // The four partial sums are 32-bit lanes added with paddd, which wraps on
    // overflow.
    21 __declspec(naked) __declspec(align(16))
    22 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
    23   __asm {
    24     mov        eax, [esp + 4]    // src_a
    25     mov        edx, [esp + 8]    // src_b
    26     mov        ecx, [esp + 12]   // count
    27     pxor       xmm0, xmm0        // xmm0 = 4 x u32 running sums, start at 0
    28     pxor       xmm5, xmm5        // xmm5 = 0, used to zero-extend bytes
    30     align      4
    31   wloop:
    32     movdqa     xmm1, [eax]       // 16 bytes of src_a (aligned load)
    33     lea        eax,  [eax + 16]  // advance src_a; lea leaves flags alone
    34     movdqa     xmm2, [edx]       // 16 bytes of src_b (aligned load)
    35     lea        edx,  [edx + 16]  // advance src_b
    36     sub        ecx, 16           // sets the flags tested by jg below; the
           // SIMD instructions in between do not modify EFLAGS.
    37     movdqa     xmm3, xmm1  // keep a copy of a for the second subtraction
    38     psubusb    xmm1, xmm2        // max(a - b, 0), unsigned saturating
    39     psubusb    xmm2, xmm3        // max(b - a, 0), unsigned saturating
    40     por        xmm1, xmm2        // one side is 0, so OR gives |a - b|
    41     movdqa     xmm2, xmm1
    42     punpcklbw  xmm1, xmm5        // low 8 differences widened to u16
    43     punpckhbw  xmm2, xmm5        // high 8 differences widened to u16
    44     pmaddwd    xmm1, xmm1        // square each word, add adjacent pairs
    45     pmaddwd    xmm2, xmm2        //   -> 4 x u32 per register
    46     paddd      xmm0, xmm1        // accumulate into the running sums
    47     paddd      xmm0, xmm2
    48     jg         wloop             // loop while remaining count > 0 (signed)
    50     pshufd     xmm1, xmm0, 0xee  // bring dwords 2,3 down to positions 0,1
    51     paddd      xmm0, xmm1        // dword0 = s0 + s2, dword1 = s1 + s3
    52     pshufd     xmm1, xmm0, 0x01  // bring dword 1 down to position 0
    53     paddd      xmm0, xmm1        // dword0 = total of all four lanes
    54     movd       eax, xmm0         // return value in eax
    55     ret
    56   }
    57 }
    59 // Visual C 2012 required for AVX2.
    60 #if _MSC_VER >= 1700
    61 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
    62 #pragma warning(disable: 4752)
    // SumSquareError_AVX2: returns the sum of (src_a[i] - src_b[i])^2 over the
    // input bytes, using 256-bit AVX2 registers.
    //
    // Naked function: arguments are read straight from the stack and the
    // result is left in eax.
    //   src_a, src_b - byte buffers; loaded with vmovdqu, so no alignment is
    //                  required.
    //   count        - number of bytes. The loop handles 32 bytes per pass and
    //                  tests the counter only after a pass, so it always runs
    //                  at least once and works in whole 32-byte chunks; callers
    //                  should pass a positive multiple of 32.
    // The eight partial sums are 32-bit lanes added with vpaddd, which wraps on
    // overflow.
    63 __declspec(naked) __declspec(align(16))
    64 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
    65   __asm {
    66     mov        eax, [esp + 4]    // src_a
    67     mov        edx, [esp + 8]    // src_b
    68     mov        ecx, [esp + 12]   // count
    69     vpxor      ymm0, ymm0, ymm0  // sum
    70     vpxor      ymm5, ymm5, ymm5  // constant 0 for unpck
    71     sub        edx, eax          // edx = src_b - src_a, so [eax + edx]
           // addresses src_b and only eax has to be advanced in the loop.
    73     align      4
    74   wloop:
    75     vmovdqu    ymm1, [eax]       // 32 bytes of src_a (unaligned load)
    76     vmovdqu    ymm2, [eax + edx] // matching 32 bytes of src_b
    77     lea        eax,  [eax + 32]  // advance both streams; keeps flags
    78     sub        ecx, 32           // sets the flags tested by jg below
    79     vpsubusb   ymm3, ymm1, ymm2  // max(a - b, 0), unsigned saturating
    80     vpsubusb   ymm2, ymm2, ymm1  // max(b - a, 0), unsigned saturating
    81     vpor       ymm1, ymm2, ymm3  // one side is 0, so OR gives |a - b|
    82     vpunpcklbw ymm2, ymm1, ymm5  // widen to u16 within each 128-bit lane;
           // this changes element order, which is harmless for a sum.
    83     vpunpckhbw ymm1, ymm1, ymm5
    84     vpmaddwd   ymm2, ymm2, ymm2  // square + hadd to u32.
    85     vpmaddwd   ymm1, ymm1, ymm1
    86     vpaddd     ymm0, ymm0, ymm1  // accumulate into the running sums
    87     vpaddd     ymm0, ymm0, ymm2
    88     jg         wloop             // loop while remaining count > 0 (signed)
    90     vpshufd    ymm1, ymm0, 0xee  // 3, 2 + 1, 0 both lanes.
    91     vpaddd     ymm0, ymm0, ymm1
    92     vpshufd    ymm1, ymm0, 0x01  // 1 + 0 both lanes.
    93     vpaddd     ymm0, ymm0, ymm1  // dword 0 of each lane = that lane's sum
    94     vpermq     ymm1, ymm0, 0x02  // move the high lane's low qword into
           // qword 0 so the two lane sums can be added.
    95     vpaddd     ymm0, ymm0, ymm1  // dword 0 = grand total
    96     vmovd      eax, xmm0         // return value in eax
    97     vzeroupper                   // clear upper ymm halves before returning
    98     ret
    99   }
   100 }
   101 #endif  // _MSC_VER >= 1700
   // Multiplier tables for the vectorised djb2 hash (hash = hash * 33 + byte).
   // Processing 16 bytes at once expands to
   //   hash * 33^16 + src[0] * 33^15 + src[1] * 33^14 + ... + src[15] * 33^0.
   // In the comments below "^" means exponentiation, and every value is the
   // power truncated to 32 bits (mod 2^32).
   // NOTE(review): the tables are read with 16-byte-aligned loads (movdqa and
   // pmulld memory operands); uvec32 is declared elsewhere and is assumed to
   // carry 16-byte alignment - confirm.
   103 #define HAS_HASHDJB2_SSE41
   // Only lane 0 is populated: the running hash lives in dword 0 of xmm0.
   104 static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 };  // 33 ^ 16
   // Weights for src[0..3].
   105 static uvec32 kHashMul0 = {
   106   0x0c3525e1,  // 33 ^ 15
   107   0xa3476dc1,  // 33 ^ 14
   108   0x3b4039a1,  // 33 ^ 13
   109   0x4f5f0981,  // 33 ^ 12
   110 };
   // Weights for src[4..7].
   111 static uvec32 kHashMul1 = {
   112   0x30f35d61,  // 33 ^ 11
   113   0x855cb541,  // 33 ^ 10
   114   0x040a9121,  // 33 ^ 9
   115   0x747c7101,  // 33 ^ 8
   116 };
   // Weights for src[8..11].
   117 static uvec32 kHashMul2 = {
   118   0xec41d4e1,  // 33 ^ 7
   119   0x4cfa3cc1,  // 33 ^ 6
   120   0x025528a1,  // 33 ^ 5
   121   0x00121881,  // 33 ^ 4
   122 };
   // Weights for src[12..15].
   123 static uvec32 kHashMul3 = {
   124   0x00008c61,  // 33 ^ 3
   125   0x00000441,  // 33 ^ 2
   126   0x00000021,  // 33 ^ 1
   127   0x00000001,  // 33 ^ 0
   128 };
   // pmulld(reg): emits the SSE4.1 instruction "pmulld xmm_dst, xmm_src" as raw
   // bytes (66 0F 38 40 /r) instead of using the mnemonic.
   // NOTE(review): presumably done so the file builds with inline assemblers
   // that do not know the SSE4.1 mnemonic - confirm.
   // "reg" is the ModRM byte for a register-to-register form:
   //   0xC0 | (dst << 3) | src, e.g. 0xC6 = xmm0, xmm6 and 0xDD = xmm3, xmm5.
   // Reference encodings used by HashDjb2_SSE41:
   130 // 27: 66 0F 38 40 C6     pmulld      xmm0,xmm6
   131 // 44: 66 0F 38 40 DD     pmulld      xmm3,xmm5
   132 // 59: 66 0F 38 40 E5     pmulld      xmm4,xmm5
   133 // 72: 66 0F 38 40 D5     pmulld      xmm2,xmm5
   134 // 83: 66 0F 38 40 CD     pmulld      xmm1,xmm5
   135 #define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
   136     _asm _emit 0x40 _asm _emit reg
   // HashDjb2_SSE41: djb2 hash (hash = hash * 33 + byte) of a byte buffer,
   // computed 16 bytes per loop pass with SSE4.1 pmulld.
   //
   // Naked function: arguments are read straight from the stack and the
   // result is left in eax.
   //   src   - input bytes; loaded with movdqu, so no alignment is required.
   //   count - number of bytes. The counter is tested only after a pass, so
   //           the loop always runs at least once and works in whole 16-byte
   //           chunks; callers should pass a positive multiple of 16.
   //   seed  - initial hash value.
   // Each pass computes hash = hash * 33^16 + sum(src[i] * 33^(15 - i)) with
   // 32-bit wrap-around arithmetic. pmulld is emitted through the pmulld()
   // macro, whose argument is a ModRM byte, so the register choices below must
   // stay in step with those byte values.
   138 __declspec(naked) __declspec(align(16))
   139 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
   140   __asm {
   141     mov        eax, [esp + 4]    // src
   142     mov        ecx, [esp + 8]    // count
   143     movd       xmm0, [esp + 12]  // seed in dword 0, upper dwords zeroed
   145     pxor       xmm7, xmm7        // constant 0 for unpck
   146     movdqa     xmm6, kHash16x33  // { 33^16, 0, 0, 0 }
   148     align      4
   149   wloop:
   150     movdqu     xmm1, [eax]       // src[0-15]
   151     lea        eax, [eax + 16]   // advance src; lea leaves flags alone
   152     pmulld(0xc6)                 // pmulld      xmm0,xmm6  hash *= 33 ^ 16
   153     movdqa     xmm5, kHashMul0   // weights 33^15 .. 33^12
   154     movdqa     xmm2, xmm1
   155     punpcklbw  xmm2, xmm7        // src[0-7]
   156     movdqa     xmm3, xmm2
   157     punpcklwd  xmm3, xmm7        // src[0-3]
   158     pmulld(0xdd)                 // pmulld     xmm3, xmm5
   159     movdqa     xmm5, kHashMul1   // weights 33^11 .. 33^8
   160     movdqa     xmm4, xmm2
   161     punpckhwd  xmm4, xmm7        // src[4-7]
   162     pmulld(0xe5)                 // pmulld     xmm4, xmm5
   163     movdqa     xmm5, kHashMul2   // weights 33^7 .. 33^4
   164     punpckhbw  xmm1, xmm7        // src[8-15]
   165     movdqa     xmm2, xmm1
   166     punpcklwd  xmm2, xmm7        // src[8-11]
   167     pmulld(0xd5)                 // pmulld     xmm2, xmm5
   168     movdqa     xmm5, kHashMul3   // weights 33^3 .. 33^0
   169     punpckhwd  xmm1, xmm7        // src[12-15]
   170     pmulld(0xcd)                 // pmulld     xmm1, xmm5
   171     paddd      xmm3, xmm4        // add 16 results
   172     paddd      xmm1, xmm2
   173     sub        ecx, 16           // sets the flags tested by jg below; the
           // SIMD instructions in between do not modify EFLAGS.
   174     paddd      xmm1, xmm3        // xmm1 = 4 partial sums
   176     pshufd     xmm2, xmm1, 0x0e  // upper 2 dwords
   177     paddd      xmm1, xmm2
   178     pshufd     xmm2, xmm1, 0x01  // dword 1 down to position 0
   179     paddd      xmm1, xmm2        // dword 0 = sum of the 16 weighted bytes
   180     paddd      xmm0, xmm1        // hash += weighted byte sum
   181     jg         wloop             // loop while remaining count > 0 (signed)
   183     movd       eax, xmm0         // return hash
   184     ret
   185   }
   186 }
   188 // Visual C 2012 required for AVX2.
   189 #if _MSC_VER >= 1700
   // HashDjb2_AVX2: djb2 hash (hash = hash * 33 + byte) of a byte buffer,
   // computed 16 bytes per loop pass.
   //
   // Naked function: arguments are read straight from the stack and the
   // result is left in eax.
   //   src   - input bytes; read 4 at a time with vpmovzxbd, no alignment
   //           required.
   //   count - number of bytes. The counter is tested only after a pass, so
   //           the loop always runs at least once and works in whole 16-byte
   //           chunks; callers should pass a positive multiple of 16.
   //   seed  - initial hash value.
   // Each pass computes hash = hash * 33^16 + sum(src[i] * 33^(15 - i)) with
   // 32-bit wrap-around arithmetic.
   //
   // Every SIMD instruction uses its VEX encoding so that legacy-SSE and AVX
   // encodings are never mixed inside the function, and vzeroupper is issued
   // before returning, matching SumSquareError_AVX2.
   190 __declspec(naked) __declspec(align(16))
   191 uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
   192   __asm {
   193     mov        eax, [esp + 4]    // src
   194     mov        ecx, [esp + 8]    // count
   195     vmovd      xmm0, [esp + 12]  // seed in dword 0, upper dwords zeroed
   196     vmovdqa    xmm6, xmmword ptr kHash16x33  // { 33^16, 0, 0, 0 }
   198     align      4
   199   wloop:
   200     vpmovzxbd  xmm3, dword ptr [eax]  // src[0-3]
   201     vpmulld    xmm0, xmm0, xmm6  // hash *= 33 ^ 16
   202     vpmovzxbd  xmm4, dword ptr [eax + 4]  // src[4-7]
   203     vpmulld    xmm3, xmm3, xmmword ptr kHashMul0  // * 33^15 .. 33^12
   204     vpmovzxbd  xmm2, dword ptr [eax + 8]  // src[8-11]
   205     vpmulld    xmm4, xmm4, xmmword ptr kHashMul1  // * 33^11 .. 33^8
   206     vpmovzxbd  xmm1, dword ptr [eax + 12]  // src[12-15]
   207     vpmulld    xmm2, xmm2, xmmword ptr kHashMul2  // * 33^7 .. 33^4
   208     lea        eax, [eax + 16]   // advance src; lea leaves flags alone
   209     vpmulld    xmm1, xmm1, xmmword ptr kHashMul3  // * 33^3 .. 33^0
   210     vpaddd     xmm3, xmm3, xmm4  // add 16 results
   211     vpaddd     xmm1, xmm1, xmm2
   212     sub        ecx, 16           // sets the flags tested by jg below; the
           // SIMD instructions in between do not modify EFLAGS.
   213     vpaddd     xmm1, xmm1, xmm3  // xmm1 = 4 partial sums
   214     vpshufd    xmm2, xmm1, 0x0e  // upper 2 dwords
   215     vpaddd     xmm1, xmm1, xmm2
   216     vpshufd    xmm2, xmm1, 0x01  // dword 1 down to position 0
   217     vpaddd     xmm1, xmm1, xmm2  // dword 0 = sum of the 16 weighted bytes
   218     vpaddd     xmm0, xmm0, xmm1  // hash += weighted byte sum
   219     jg         wloop             // loop while remaining count > 0 (signed)
   221     vmovd      eax, xmm0         // return hash
           vzeroupper                   // clear upper ymm halves before returning
   222     ret
   223   }
   224 }
   225 #endif  // _MSC_VER >= 1700
   227 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
   229 #ifdef __cplusplus
   230 }  // extern "C"
   231 }  // namespace libyuv
   232 #endif

mercurial