media/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    11 %include "vpx_ports/x86_abi_support.asm"
    13 %macro STACK_FRAME_CREATE_X3 0
       ; Function prologue shared by every *x3 SAD routine in this file.
       ; It binds the symbolic names src_ptr, src_stride, ref_ptr, ref_stride,
       ; end_ptr, ret_var, result_ptr, max_err and height to the registers or
       ; stack slots that hold them under the ABI selected at assembly time.
       ; result_ptr, max_err and height all alias the fifth argument.
       ;
       ; The C prototypes declare both strides as `int`. The strides are used
       ; as 64-bit address offsets, so each branch sign-extends them to the
       ; full register width before any memory access is made.
    14 %if ABI_IS_32BIT
    15   %define     src_ptr       rsi
    16   %define     src_stride    rax
    17   %define     ref_ptr       rdi
    18   %define     ref_stride    rdx
    19   %define     end_ptr       rcx
    20   %define     ret_var       rbx
    21   %define     result_ptr    arg(4)
    22   %define     max_err       arg(4)
    23   %define     height        dword ptr arg(4)
       ; Stack-argument branch: build a frame, save the registers that are
       ; about to be overwritten, then load the arguments through arg(n).
    24     push        rbp
    25     mov         rbp,        rsp
    26     push        rsi
    27     push        rdi
    28     push        rbx
    30     mov         rsi,        arg(0)              ; src_ptr
    31     mov         rdi,        arg(2)              ; ref_ptr
    33     movsxd      rax,        dword ptr arg(1)    ; src_stride
    34     movsxd      rdx,        dword ptr arg(3)    ; ref_stride
    35 %else
    36   %if LIBVPX_YASM_WIN64
    37     SAVE_XMM 7, u
    38     %define     src_ptr     rcx
    39     %define     src_stride  rdx
    40     %define     ref_ptr     r8
    41     %define     ref_stride  r9
    42     %define     end_ptr     r10
    43     %define     ret_var     r11
    44     %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    45     %define     max_err     [rsp+xmm_stack_space+8+4*8]
    46     %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
       ; Win64 passes the int strides in edx / r9d; the upper halves of rdx
       ; and r9 are unspecified, so widen them in place before use.
           movsxd      rdx,        edx                 ; src_stride
           movsxd      r9,         r9d                 ; ref_stride
    47   %else
    48     %define     src_ptr     rdi
    49     %define     src_stride  rsi
    50     %define     ref_ptr     rdx
    51     %define     ref_stride  rcx
    52     %define     end_ptr     r9
    53     %define     ret_var     r10
    54     %define     result_ptr  r8
    55     %define     max_err     r8
    56     %define     height      r8
       ; SysV passes the int strides in esi / ecx; the upper halves of rsi
       ; and rcx are unspecified, so widen them in place before use.
           movsxd      rsi,        esi                 ; src_stride
           movsxd      rcx,        ecx                 ; ref_stride
    57   %endif
    58 %endif
    60 %endmacro
    62 %macro STACK_FRAME_DESTROY_X3 0
       ; Function epilogue matching STACK_FRAME_CREATE_X3: clears the symbolic
       ; argument names, undoes whatever the prologue saved for the selected
       ; ABI, and returns to the caller.
       ;
       ; Redefine every alias as empty so a later use outside a
       ; CREATE/DESTROY pair no longer expands to a register.
    63   %define     src_ptr
    64   %define     src_stride
    65   %define     ref_ptr
    66   %define     ref_stride
    67   %define     end_ptr
    68   %define     ret_var
    69   %define     result_ptr
    70   %define     max_err
    71   %define     height
    73 %if ABI_IS_32BIT
       ; Pop in the reverse order of the pushes done by the prologue.
    74     pop         rbx
    75     pop         rdi
    76     pop         rsi
    77     pop         rbp
    78 %else
    79   %if LIBVPX_YASM_WIN64
       ; Counterpart of the prologue's SAVE_XMM (macro defined in the include).
    80     RESTORE_XMM
    81   %endif
    82 %endif
    83     ret
    84 %endmacro
    86 %macro PROCESS_16X2X3 5
       ; Accumulates sums of absolute differences (SAD) for two 16-byte rows
       ; against three reference positions: ref, ref+1 and ref+2.
       ;   %1  stage selector: 0 = first call (initialises the accumulators),
       ;       1 = middle call, 2 = last call (pointers are left unchanged)
       ;   %2  source pointer register; read with movdqa, so the address must
       ;       be 16-byte aligned
       ;   %3  reference pointer register; read with unaligned lddqu
       ;   %4  source stride register
       ;   %5  reference stride register
       ; Accumulators: xmm5 (ref+0), xmm6 (ref+1), xmm7 (ref+2).
       ; Scratch:      xmm0-xmm3.
       ; psadbw yields one partial sum per 64-bit half, so each accumulator
       ; holds two partial sums that the caller still has to add together.
    87 %if %1==0
       ; First row of the block: psadbw writes straight into the accumulators,
       ; which also initialises them.
    88         movdqa          xmm0,       XMMWORD PTR [%2]
    89         lddqu           xmm5,       XMMWORD PTR [%3]
    90         lddqu           xmm6,       XMMWORD PTR [%3+1]
    91         lddqu           xmm7,       XMMWORD PTR [%3+2]
    93         psadbw          xmm5,       xmm0
    94         psadbw          xmm6,       xmm0
    95         psadbw          xmm7,       xmm0
    96 %else
       ; Later rows: compute into scratch registers, then add to the
       ; running totals.
    97         movdqa          xmm0,       XMMWORD PTR [%2]
    98         lddqu           xmm1,       XMMWORD PTR [%3]
    99         lddqu           xmm2,       XMMWORD PTR [%3+1]
   100         lddqu           xmm3,       XMMWORD PTR [%3+2]
   102         psadbw          xmm1,       xmm0
   103         psadbw          xmm2,       xmm0
   104         psadbw          xmm3,       xmm0
   106         paddw           xmm5,       xmm1
   107         paddw           xmm6,       xmm2
   108         paddw           xmm7,       xmm3
   109 %endif
       ; Second row of the pair, one stride further down in both buffers.
   110         movdqa          xmm0,       XMMWORD PTR [%2+%4]
   111         lddqu           xmm1,       XMMWORD PTR [%3+%5]
   112         lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
   113         lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
   115 %if %1==0 || %1==1
       ; Advance both pointers by two rows for the next invocation; the loads
       ; above have already been issued, and lea leaves the flags untouched.
   116         lea             %2,         [%2+%4*2]
   117         lea             %3,         [%3+%5*2]
   118 %endif
   120         psadbw          xmm1,       xmm0
   121         psadbw          xmm2,       xmm0
   122         psadbw          xmm3,       xmm0
   124         paddw           xmm5,       xmm1
   125         paddw           xmm6,       xmm2
   126         paddw           xmm7,       xmm3
   127 %endmacro
   129 %macro PROCESS_8X2X3 5
       ; 8-byte-wide counterpart of PROCESS_16X2X3, using MMX registers.
       ; Accumulates SADs for two 8-byte rows against three reference
       ; positions: ref, ref+1 and ref+2.
       ;   %1  stage selector: 0 = first call (initialises the accumulators),
       ;       1 = middle call, 2 = last call (pointers are left unchanged)
       ;   %2  source pointer register
       ;   %3  reference pointer register
       ;   %4  source stride register
       ;   %5  reference stride register
       ; Accumulators: mm5 (ref+0), mm6 (ref+1), mm7 (ref+2).
       ; Scratch:      mm0-mm3.
   130 %if %1==0
       ; First row of the block: psadbw writes straight into the accumulators,
       ; which also initialises them.
   131         movq            mm0,       QWORD PTR [%2]
   132         movq            mm5,       QWORD PTR [%3]
   133         movq            mm6,       QWORD PTR [%3+1]
   134         movq            mm7,       QWORD PTR [%3+2]
   136         psadbw          mm5,       mm0
   137         psadbw          mm6,       mm0
   138         psadbw          mm7,       mm0
   139 %else
       ; Later rows: compute into scratch registers, then add to the
       ; running totals.
   140         movq            mm0,       QWORD PTR [%2]
   141         movq            mm1,       QWORD PTR [%3]
   142         movq            mm2,       QWORD PTR [%3+1]
   143         movq            mm3,       QWORD PTR [%3+2]
   145         psadbw          mm1,       mm0
   146         psadbw          mm2,       mm0
   147         psadbw          mm3,       mm0
   149         paddw           mm5,       mm1
   150         paddw           mm6,       mm2
   151         paddw           mm7,       mm3
   152 %endif
       ; Second row of the pair, one stride further down in both buffers.
   153         movq            mm0,       QWORD PTR [%2+%4]
   154         movq            mm1,       QWORD PTR [%3+%5]
   155         movq            mm2,       QWORD PTR [%3+%5+1]
   156         movq            mm3,       QWORD PTR [%3+%5+2]
   158 %if %1==0 || %1==1
       ; Advance both pointers by two rows for the next invocation; the loads
       ; above have already been issued, and lea leaves the flags untouched.
   159         lea             %2,        [%2+%4*2]
   160         lea             %3,        [%3+%5*2]
   161 %endif
   163         psadbw          mm1,       mm0
   164         psadbw          mm2,       mm0
   165         psadbw          mm3,       mm0
   167         paddw           mm5,       mm1
   168         paddw           mm6,       mm2
   169         paddw           mm7,       mm3
   170 %endmacro
   172 ;void vp9_sad16x16x3_sse3(
   173 ;    unsigned char *src_ptr,
   174 ;    int  src_stride,
   175 ;    unsigned char *ref_ptr,
   176 ;    int  ref_stride,
   177 ;    int  *results)
       ;
       ; Computes the SAD of a 16x16 source block against three reference
       ; blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2, and stores the
       ; three sums to results[0], results[1] and results[2].
       ; src_ptr rows are read with movdqa and so must be 16-byte aligned.
   178 global sym(vp9_sad16x16x3_sse3) PRIVATE
   179 sym(vp9_sad16x16x3_sse3):
   181     STACK_FRAME_CREATE_X3
       ; Eight invocations of two rows each cover the 16 rows of the block.
   183         PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
   184         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   185         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   186         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   187         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   188         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   189         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   190         PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
       ; rcx is reused as the output pointer; whichever argument alias it
       ; carried under the active ABI is not read again after this point.
   192         mov             rcx,        result_ptr
       ; Each accumulator holds one partial sum per 64-bit half. Copy the low
       ; half, shift the high half down, add the two, and store the low
       ; 32 bits as the result.
   194         movq            xmm0,       xmm5
   195         psrldq          xmm5,       8
   197         paddw           xmm0,       xmm5
   198         movd            [rcx],      xmm0
   199 ;-
   200         movq            xmm0,       xmm6
   201         psrldq          xmm6,       8
   203         paddw           xmm0,       xmm6
   204         movd            [rcx+4],    xmm0
   205 ;-
   206         movq            xmm0,       xmm7
   207         psrldq          xmm7,       8
   209         paddw           xmm0,       xmm7
   210         movd            [rcx+8],    xmm0
   212     STACK_FRAME_DESTROY_X3
   214 ;void vp9_sad16x8x3_sse3(
   215 ;    unsigned char *src_ptr,
   216 ;    int  src_stride,
   217 ;    unsigned char *ref_ptr,
   218 ;    int  ref_stride,
   219 ;    int  *results)
       ;
       ; Computes the SAD of a 16-wide, 8-row source block against three
       ; reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2, and
       ; stores the three sums to results[0], results[1] and results[2].
       ; src_ptr rows are read with movdqa and so must be 16-byte aligned.
   220 global sym(vp9_sad16x8x3_sse3) PRIVATE
   221 sym(vp9_sad16x8x3_sse3):
   223     STACK_FRAME_CREATE_X3
       ; Four invocations of two rows each cover the 8 rows of the block.
   225         PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
   226         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   227         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   228         PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
       ; rcx is reused as the output pointer; whichever argument alias it
       ; carried under the active ABI is not read again after this point.
   230         mov             rcx,        result_ptr
       ; Each accumulator holds one partial sum per 64-bit half. Copy the low
       ; half, shift the high half down, add the two, and store the low
       ; 32 bits as the result.
   232         movq            xmm0,       xmm5
   233         psrldq          xmm5,       8
   235         paddw           xmm0,       xmm5
   236         movd            [rcx],      xmm0
   237 ;-
   238         movq            xmm0,       xmm6
   239         psrldq          xmm6,       8
   241         paddw           xmm0,       xmm6
   242         movd            [rcx+4],    xmm0
   243 ;-
   244         movq            xmm0,       xmm7
   245         psrldq          xmm7,       8
   247         paddw           xmm0,       xmm7
   248         movd            [rcx+8],    xmm0
   250     STACK_FRAME_DESTROY_X3
   252 ;void vp9_sad8x16x3_sse3(
   253 ;    unsigned char *src_ptr,
   254 ;    int  src_stride,
   255 ;    unsigned char *ref_ptr,
   256 ;    int  ref_stride,
   257 ;    int  *results)
       ;
       ; Computes the SAD of an 8-wide, 16-row source block against three
       ; reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2, and
       ; stores the three sums to results[0], results[1] and results[2].
       ; NOTE(review): MMX registers are used and no emms is issued before
       ; returning; presumably the caller clears the MMX state - confirm.
   258 global sym(vp9_sad8x16x3_sse3) PRIVATE
   259 sym(vp9_sad8x16x3_sse3):
   261     STACK_FRAME_CREATE_X3
       ; Eight invocations of two rows each cover the 16 rows of the block.
   263         PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
   264         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   265         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   266         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   267         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   268         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   269         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   270         PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
       ; rcx is reused as the output pointer; whichever argument alias it
       ; carried under the active ABI is not read again after this point.
   272         mov             rcx,        result_ptr
       ; Pack the ref+0 sum (low dword of mm5) and the ref+1 sum (low dword of
       ; mm6) into one qword so results[0] and results[1] go out in one store.
   274         punpckldq       mm5,        mm6
   276         movq            [rcx],      mm5
   277         movd            [rcx+8],    mm7
   279     STACK_FRAME_DESTROY_X3
   281 ;void vp9_sad8x8x3_sse3(
   282 ;    unsigned char *src_ptr,
   283 ;    int  src_stride,
   284 ;    unsigned char *ref_ptr,
   285 ;    int  ref_stride,
   286 ;    int  *results)
       ;
       ; Computes the SAD of an 8x8 source block against three reference
       ; blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2, and stores the
       ; three sums to results[0], results[1] and results[2].
       ; NOTE(review): MMX registers are used and no emms is issued before
       ; returning; presumably the caller clears the MMX state - confirm.
   287 global sym(vp9_sad8x8x3_sse3) PRIVATE
   288 sym(vp9_sad8x8x3_sse3):
   290     STACK_FRAME_CREATE_X3
       ; Four invocations of two rows each cover the 8 rows of the block.
   292         PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
   293         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   294         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   295         PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
       ; rcx is reused as the output pointer; whichever argument alias it
       ; carried under the active ABI is not read again after this point.
   297         mov             rcx,        result_ptr
       ; Pack the ref+0 sum (low dword of mm5) and the ref+1 sum (low dword of
       ; mm6) into one qword so results[0] and results[1] go out in one store.
   299         punpckldq       mm5,        mm6
   301         movq            [rcx],      mm5
   302         movd            [rcx+8],    mm7
   304     STACK_FRAME_DESTROY_X3
   306 ;void vp9_sad4x4x3_sse3(
   307 ;    unsigned char *src_ptr,
   308 ;    int  src_stride,
   309 ;    unsigned char *ref_ptr,
   310 ;    int  ref_stride,
   311 ;    int  *results)
       ;
       ; Computes the SAD of a 4x4 source block against three reference blocks
       ; starting at ref_ptr, ref_ptr+1 and ref_ptr+2, and stores the three
       ; sums to results[0], results[1] and results[2].
       ; Two 4-byte rows are interleaved into one 8-byte MMX register so that
       ; a single psadbw covers both rows. Source and reference are
       ; interleaved the same way, so the byte pairing is preserved.
       ; NOTE(review): MMX registers are used and no emms is issued before
       ; returning; presumably the caller clears the MMX state - confirm.
   312 global sym(vp9_sad4x4x3_sse3) PRIVATE
   313 sym(vp9_sad4x4x3_sse3):
   315     STACK_FRAME_CREATE_X3
       ; Rows 0 and 1: mm0 = interleaved source, mm1 = interleaved ref+0.
   317         movd            mm0,        DWORD PTR [src_ptr]
   318         movd            mm1,        DWORD PTR [ref_ptr]
   320         movd            mm2,        DWORD PTR [src_ptr+src_stride]
   321         movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
   323         punpcklbw       mm0,        mm2
   324         punpcklbw       mm1,        mm3
       ; Rows 0 and 1 at ref+1 (mm4) and ref+2 (mm5).
   326         movd            mm4,        DWORD PTR [ref_ptr+1]
   327         movd            mm5,        DWORD PTR [ref_ptr+2]
   329         movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
   330         movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
   332         psadbw          mm1,        mm0
   334         punpcklbw       mm4,        mm2
   335         punpcklbw       mm5,        mm3
   337         psadbw          mm4,        mm0
   338         psadbw          mm5,        mm0
       ; After rows 0-1: mm1 = SAD(ref+0), mm4 = SAD(ref+1), mm5 = SAD(ref+2).
       ; Step both pointers down two rows.
   340         lea             src_ptr,    [src_ptr+src_stride*2]
   341         lea             ref_ptr,    [ref_ptr+ref_stride*2]
       ; Rows 2 and 3: mm0 = interleaved source, mm2 = interleaved ref+0.
   343         movd            mm0,        DWORD PTR [src_ptr]
   344         movd            mm2,        DWORD PTR [ref_ptr]
   346         movd            mm3,        DWORD PTR [src_ptr+src_stride]
   347         movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
   349         punpcklbw       mm0,        mm3
   350         punpcklbw       mm2,        mm6
       ; Rows 2 and 3 at ref+1 (mm3) and ref+2 (mm7); the row-3 halves are
       ; loaded into mm2 / mm6 further down, once mm2 is free again.
   352         movd            mm3,        DWORD PTR [ref_ptr+1]
   353         movd            mm7,        DWORD PTR [ref_ptr+2]
   355         psadbw          mm2,        mm0
       ; mm1 now holds the full four-row SAD for ref+0.
   357         paddw           mm1,        mm2
   359         movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
   360         movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
   362         punpcklbw       mm3,        mm2
   363         punpcklbw       mm7,        mm6
   365         psadbw          mm3,        mm0
   366         psadbw          mm7,        mm0
       ; Add the rows 0-1 partial sums: mm3 = total for ref+1, mm7 = total
       ; for ref+2.
   368         paddw           mm3,        mm4
   369         paddw           mm7,        mm5
       ; rcx is reused as the output pointer; whichever argument alias it
       ; carried under the active ABI is not read again after this point.
   371         mov             rcx,        result_ptr
       ; Pack the ref+0 and ref+1 totals into one qword so results[0] and
       ; results[1] go out in one store; results[2] is stored separately.
   373         punpckldq       mm1,        mm3
   375         movq            [rcx],      mm1
   376         movd            [rcx+8],    mm7
   378     STACK_FRAME_DESTROY_X3

mercurial