media/libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Thu, 22 Jan 2015 13:21:57 +0100
branch       TOR_BUG_9701
changeset    15:b8a032363ba2
permissions  -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
; Two 4-pixel source rows are packed into one 8-byte block and SADed
; against the same rows of four references; per-reference totals are
; kept as dwords in m6 (ref1, ref2) and m7 (ref3, ref4).
%macro PROCESS_4x2x4 5-6 0
  movd                  m0, [srcq +%2]
%if %1 == 1
  movd                  m6, [ref1q+%3]
  movd                  m4, [ref2q+%3]
  movd                  m7, [ref3q+%3]
  movd                  m5, [ref4q+%3]
  punpckldq             m0, [srcq +%4]
  punpckldq             m6, [ref1q+%5]
  punpckldq             m4, [ref2q+%5]
  punpckldq             m7, [ref3q+%5]
  punpckldq             m5, [ref4q+%5]
  psadbw                m6, m0
  psadbw                m4, m0
  psadbw                m7, m0
  psadbw                m5, m0
  punpckldq             m6, m4
  punpckldq             m7, m5
%else
  movd                  m1, [ref1q+%3]
  movd                  m2, [ref2q+%3]
  movd                  m3, [ref3q+%3]
  movd                  m4, [ref4q+%3]
  punpckldq             m0, [srcq +%4]
  punpckldq             m1, [ref1q+%5]
  punpckldq             m2, [ref2q+%5]
  punpckldq             m3, [ref3q+%5]
  punpckldq             m4, [ref4q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  psadbw                m4, m0
  punpckldq             m1, m2
  punpckldq             m3, m4
  paddd                 m6, m1
  paddd                 m7, m3
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
; Two 8-pixel source rows are loaded into one register (movh/movhps) and
; SADed against four references; running totals are kept in m4..m7 for
; ref1..ref4 respectively.
%macro PROCESS_8x2x4 5-6 0
  movh                  m0, [srcq +%2]
%if %1 == 1
  movh                  m4, [ref1q+%3]
  movh                  m5, [ref2q+%3]
  movh                  m6, [ref3q+%3]
  movh                  m7, [ref4q+%3]
  movhps                m0, [srcq +%4]
  movhps                m4, [ref1q+%5]
  movhps                m5, [ref2q+%5]
  movhps                m6, [ref3q+%5]
  movhps                m7, [ref4q+%5]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movh                  m1, [ref1q+%3]
  movh                  m2, [ref2q+%3]
  movh                  m3, [ref3q+%3]
  movhps                m0, [srcq +%4]
  movhps                m1, [ref1q+%5]
  movhps                m2, [ref2q+%5]
  movhps                m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movh                  m1, [ref4q+%3]
  movhps                m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
; Two aligned 16-pixel source rows (mova) are SADed against four
; unaligned references (movu); running totals are kept in m4..m7 for
; ref1..ref4 respectively.
%macro PROCESS_16x2x4 5-6 0
  ; 1st 16 px
  mova                  m0, [srcq +%2]
%if %1 == 1
  movu                  m4, [ref1q+%3]
  movu                  m5, [ref2q+%3]
  movu                  m6, [ref3q+%3]
  movu                  m7, [ref4q+%3]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movu                  m1, [ref1q+%3]
  movu                  m2, [ref2q+%3]
  movu                  m3, [ref3q+%3]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%3]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif

  ; 2nd 16 px
  mova                  m0, [srcq +%4]
  movu                  m1, [ref1q+%5]
  movu                  m2, [ref2q+%5]
  movu                  m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
  psadbw                m1, m0
  paddd                 m7, m1
%endmacro

; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_32x2x4 5-6 0
  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6
%endmacro

; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_64x2x4 5-6 0
  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
%endmacro

; void vp9_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
;                         uint8_t *ref[4], int ref_stride,
;                         unsigned int res[4]);
; where NxN is one of the block sizes instantiated below
; (64x64 down to 4x4)
%macro SADNXN4D 2
%if UNIX64
cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0

%if mmsize == 16
  ; psadbw leaves each reference's SAD split across the two 64-bit halves
  ; of its accumulator (m4..m7).  Merge the accumulators pairwise, transpose,
  ; and add the halves so m4 holds {sad_ref1, sad_ref2, sad_ref3, sad_ref4}.
  pslldq                m5, 4
  pslldq                m7, 4
  por                   m4, m5
  por                   m6, m7
  mova                  m5, m4
  mova                  m7, m6
  punpcklqdq            m4, m6
  punpckhqdq            m5, m7
  movifnidn             r4, r4mp
  paddd                 m4, m5
  movu                [r4], m4
  RET
%else
  ; MMX (4-wide) path: m6 already holds {sad_ref1, sad_ref2} and
  ; m7 holds {sad_ref3, sad_ref4} as dwords.
  movifnidn             r4, r4mp
  movq               [r4+0], m6
  movq               [r4+8], m7
  RET
%endif
%endmacro

INIT_XMM sse2
SADNXN4D 64, 64
SADNXN4D 64, 32
SADNXN4D 32, 64
SADNXN4D 32, 32
SADNXN4D 32, 16
SADNXN4D 16, 32
SADNXN4D 16, 16
SADNXN4D 16,  8
SADNXN4D  8, 16
SADNXN4D  8,  8
SADNXN4D  8,  4

INIT_MMX sse
SADNXN4D  4,  8
SADNXN4D  4,  4
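
For reference, here is a minimal C sketch of how one of the generated functions can be called, based only on the prototype documented in the comment above. The symbol name vp9_sad16x16x4d_sse2 and the flat test buffers are illustrative assumptions; in libvpx these routines are normally reached through the run-time CPU-dispatch function tables rather than called directly.

/* Usage sketch (not part of the original file).  Assumes the 16x16
 * variant is exported as vp9_sad16x16x4d_sse2, as the prototype
 * comment above suggests, and that this object file is linked in.
 * Requires C11 for _Alignas. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

void vp9_sad16x16x4d_sse2(uint8_t *src, int src_stride,
                          uint8_t *ref[4], int ref_stride,
                          unsigned int res[4]);

int main(void) {
  /* Source rows are loaded with mova, so the source block is kept
   * 16-byte aligned; reference rows use movu and may be unaligned. */
  static _Alignas(16) uint8_t src[16 * 16];
  static uint8_t ref_buf[4][16 * 16];
  uint8_t *ref[4];
  unsigned int res[4];
  int i;

  memset(src, 128, sizeof(src));
  for (i = 0; i < 4; i++) {
    memset(ref_buf[i], 128 + i, sizeof(ref_buf[i]));  /* off by i per pixel */
    ref[i] = ref_buf[i];
  }

  vp9_sad16x16x4d_sse2(src, 16, ref, 16, res);

  /* Expected SADs: 0, 256, 512, 768 (256 pixels, each differing by i). */
  for (i = 0; i < 4; i++)
    printf("SAD vs ref[%d] = %u\n", i, res[i]);
  return 0;
}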
