media/libvpx/vp8/common/x86/sad_sse3.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    11 %include "vpx_ports/x86_abi_support.asm"
    13 %macro STACK_FRAME_CREATE_X3 0
       ; Prologue for the five-argument routines in this file that work on a
       ; single reference pointer. It binds the symbolic operand names
       ; (src_ptr, src_stride, ref_ptr, ref_stride, end_ptr, ret_var) to the
       ; registers of the active ABI. result_ptr, max_sad and height are three
       ; names for the same fifth argument; each routine uses the one that
       ; matches its prototype. ret_var is defined but not referenced by the
       ; routines visible in this file.
       ; Must be paired with STACK_FRAME_DESTROY_X3.
    14 %if ABI_IS_32BIT
       ; 32-bit: arguments are fetched through arg(); the first four are
       ; loaded into registers, the fifth stays addressed through arg(4).
    15   %define     src_ptr       rsi
    16   %define     src_stride    rax
    17   %define     ref_ptr       rdi
    18   %define     ref_stride    rdx
    19   %define     end_ptr       rcx
    20   %define     ret_var       rbx
    21   %define     result_ptr    arg(4)
    22   %define     max_sad       arg(4)
    23   %define     height        dword ptr arg(4)
    24     push        rbp
    25     mov         rbp,        rsp
       ; rsi, rdi and rbx are saved here and restored by the DESTROY macro.
    26     push        rsi
    27     push        rdi
    28     push        rbx
    30     mov         rsi,        arg(0)              ; src_ptr
    31     mov         rdi,        arg(2)              ; ref_ptr
    33     movsxd      rax,        dword ptr arg(1)    ; src_stride
    34     movsxd      rdx,        dword ptr arg(3)    ; ref_stride
    35 %else
    36   %if LIBVPX_YASM_WIN64
       ; Win64: the first four arguments are used directly in rcx, rdx, r8, r9;
       ; the fifth is read from the stack. SAVE_XMM is provided by the
       ; included ABI support file.
       ; NOTE(review): the +8+4*8 offset presumably skips the return address
       ; and the four-slot register home area above the XMM save area --
       ; confirm against the definition of xmm_stack_space.
    37     SAVE_XMM 7, u
    38     %define     src_ptr     rcx
    39     %define     src_stride  rdx
    40     %define     ref_ptr     r8
    41     %define     ref_stride  r9
    42     %define     end_ptr     r10
    43     %define     ret_var     r11
    44     %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    45     %define     max_sad     [rsp+xmm_stack_space+8+4*8]
    46     %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
    47   %else
       ; Other 64-bit targets: all five arguments are used directly in
       ; rdi, rsi, rdx, rcx, r8; no stack frame is built.
       ; NOTE(review): height is the full 64-bit r8 here although the C
       ; prototype declares an int -- verify that callers leave the upper
       ; 32 bits in a usable state.
    48     %define     src_ptr     rdi
    49     %define     src_stride  rsi
    50     %define     ref_ptr     rdx
    51     %define     ref_stride  rcx
    52     %define     end_ptr     r9
    53     %define     ret_var     r10
    54     %define     result_ptr  r8
    55     %define     max_sad     r8
    56     %define     height      r8
    57   %endif
    58 %endif
    60 %endmacro
    62 %macro STACK_FRAME_DESTROY_X3 0
       ; Epilogue matching STACK_FRAME_CREATE_X3: clears the symbolic operand
       ; names, undoes the per-ABI prologue and returns to the caller.
       ; Redefining each name as empty stops a later routine from silently
       ; picking up a stale register binding.
    63   %define     src_ptr
    64   %define     src_stride
    65   %define     ref_ptr
    66   %define     ref_stride
    67   %define     end_ptr
    68   %define     ret_var
    69   %define     result_ptr
    70   %define     max_sad
    71   %define     height
    73 %if ABI_IS_32BIT
       ; Pops mirror the pushes of the CREATE macro in reverse order.
    74     pop         rbx
    75     pop         rdi
    76     pop         rsi
    77     pop         rbp
    78 %else
    79   %if LIBVPX_YASM_WIN64
       ; Counterpart of the SAVE_XMM issued by the CREATE macro.
    80     RESTORE_XMM
    81   %endif
    82 %endif
    83     ret
    84 %endmacro
    86 %macro STACK_FRAME_CREATE_X4 0
       ; Prologue for the "x4d" routines, whose third argument points to an
       ; array of four reference pointers. It binds src_ptr, src_stride,
       ; r0_ptr..r3_ptr, ref_stride and result_ptr for the active ABI and
       ; loads the four reference pointers via LOAD_X4_ADDRESSES.
       ; Must be paired with STACK_FRAME_DESTROY_X4.
    87 %if ABI_IS_32BIT
    88   %define     src_ptr       rsi
    89   %define     src_stride    rax
    90   %define     r0_ptr        rcx
    91   %define     r1_ptr        rdx
    92   %define     r2_ptr        rbx
    93   %define     r3_ptr        rdi
    94   %define     ref_stride    rbp
    95   %define     result_ptr    arg(4)
    96     push        rbp
    97     mov         rbp,        rsp
    98     push        rsi
    99     push        rdi
   100     push        rbx
       ; rbp is about to be reused as ref_stride, so the frame pointer is
       ; pushed a second time. Every routine pops it again (under
       ; %if ABI_IS_32BIT) before it reads result_ptr.
   102     push        rbp
   103     mov         rdi,        arg(2)              ; ref_ptr_base
       ; The third pointer lands in rax for now; rdi is both the table base
       ; and the destination of the last load.
   105     LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
   107     mov         rsi,        arg(0)              ; src_ptr
   109     movsxd      rbx,        dword ptr arg(1)    ; src_stride
   110     movsxd      rbp,        dword ptr arg(3)    ; ref_stride
       ; Swap so that rax = src_stride and rbx = r2_ptr, as defined above.
   112     xchg        rbx,        rax
   113 %else
   114   %if LIBVPX_YASM_WIN64
   115     SAVE_XMM 7, u
   116     %define     src_ptr     rcx
   117     %define     src_stride  rdx
   118     %define     r0_ptr      rsi
   119     %define     r1_ptr      r10
   120     %define     r2_ptr      r11
   121     %define     r3_ptr      r8
   122     %define     ref_stride  r9
       ; NOTE(review): +16 instead of the +8 used by the X3 prologue
       ; presumably accounts for the extra "push rsi" below -- confirm
       ; against the definition of xmm_stack_space.
   123     %define     result_ptr  [rsp+xmm_stack_space+16+4*8]
       ; rsi serves as r0_ptr; it is saved here and popped by the DESTROY macro.
   124     push        rsi
       ; r8 is the table base and, as r3_ptr, the destination of the last load.
   126     LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
   127   %else
   128     %define     src_ptr     rdi
   129     %define     src_stride  rsi
   130     %define     r0_ptr      r9
   131     %define     r1_ptr      r10
   132     %define     r2_ptr      r11
   133     %define     r3_ptr      rdx
   134     %define     ref_stride  rcx
   135     %define     result_ptr  r8
       ; rdx is the table base and, as r3_ptr, the destination of the last load.
   137     LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
   139   %endif
   140 %endif
   141 %endmacro
   143 %macro STACK_FRAME_DESTROY_X4 0
       ; Epilogue matching STACK_FRAME_CREATE_X4: clears the symbolic operand
       ; names, undoes the per-ABI prologue and returns to the caller.
       ; On 32-bit builds the second rbp pushed by the CREATE macro is NOT
       ; popped here; each routine pops it itself before using this macro.
   144   %define     src_ptr
   145   %define     src_stride
   146   %define     r0_ptr
   147   %define     r1_ptr
   148   %define     r2_ptr
   149   %define     r3_ptr
   150   %define     ref_stride
   151   %define     result_ptr
   153 %if ABI_IS_32BIT
   154     pop         rbx
   155     pop         rdi
   156     pop         rsi
   157     pop         rbp
   158 %else
   159   %if LIBVPX_YASM_WIN64
       ; Restores the rsi pushed by the CREATE macro (it was used as r0_ptr).
   160     pop         rsi
   161     RESTORE_XMM
   162   %endif
   163 %endif
   164     ret
   165 %endmacro
   167 %macro PROCESS_16X2X3 5
       ; Accumulates the sum of absolute differences of two 16-byte rows of
       ; the source against three reference positions: ref, ref+1 and ref+2.
       ;   %1  stage: 0 = first call (initialises the accumulators),
       ;              1 = middle call, 2 = last call (pointers not advanced)
       ;   %2  source pointer       %3  reference pointer
       ;   %4  source stride        %5  reference stride
       ; Accumulators: xmm5 (ref), xmm6 (ref+1), xmm7 (ref+2).
       ; Scratch: xmm0-xmm3. For stages 0 and 1, %2 and %3 advance two rows.
       ; Source rows are read with movdqa and therefore must be 16-byte
       ; aligned; reference rows are read with lddqu and may be unaligned.
   168 %if %1==0
       ; First row of the block: psadbw writes straight into the accumulators.
   169         movdqa          xmm0,       XMMWORD PTR [%2]
   170         lddqu           xmm5,       XMMWORD PTR [%3]
   171         lddqu           xmm6,       XMMWORD PTR [%3+1]
   172         lddqu           xmm7,       XMMWORD PTR [%3+2]
   174         psadbw          xmm5,       xmm0
   175         psadbw          xmm6,       xmm0
   176         psadbw          xmm7,       xmm0
   177 %else
   178         movdqa          xmm0,       XMMWORD PTR [%2]
   179         lddqu           xmm1,       XMMWORD PTR [%3]
   180         lddqu           xmm2,       XMMWORD PTR [%3+1]
   181         lddqu           xmm3,       XMMWORD PTR [%3+2]
   183         psadbw          xmm1,       xmm0
   184         psadbw          xmm2,       xmm0
   185         psadbw          xmm3,       xmm0
   187         paddw           xmm5,       xmm1
   188         paddw           xmm6,       xmm2
   189         paddw           xmm7,       xmm3
   190 %endif
       ; Second row of the pair.
   191         movdqa          xmm0,       XMMWORD PTR [%2+%4]
   192         lddqu           xmm1,       XMMWORD PTR [%3+%5]
   193         lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
   194         lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
   196 %if %1==0 || %1==1
       ; Advance both pointers by two rows; the loads above already used the
       ; old values, so the position of these lea instructions is safe.
   197         lea             %2,         [%2+%4*2]
   198         lea             %3,         [%3+%5*2]
   199 %endif
   201         psadbw          xmm1,       xmm0
   202         psadbw          xmm2,       xmm0
   203         psadbw          xmm3,       xmm0
   205         paddw           xmm5,       xmm1
   206         paddw           xmm6,       xmm2
   207         paddw           xmm7,       xmm3
   208 %endmacro
   210 %macro PROCESS_8X2X3 5
       ; 8-byte-wide counterpart of PROCESS_16X2X3, using MMX registers:
       ; accumulates the SAD of two 8-byte source rows against ref, ref+1 and
       ; ref+2.
       ;   %1  stage: 0 = first call (initialises the accumulators),
       ;              1 = middle call, 2 = last call (pointers not advanced)
       ;   %2  source pointer       %3  reference pointer
       ;   %4  source stride        %5  reference stride
       ; Accumulators: mm5 (ref), mm6 (ref+1), mm7 (ref+2).
       ; Scratch: mm0-mm3. For stages 0 and 1, %2 and %3 advance two rows.
   211 %if %1==0
       ; First row of the block: psadbw writes straight into the accumulators.
   212         movq            mm0,       QWORD PTR [%2]
   213         movq            mm5,       QWORD PTR [%3]
   214         movq            mm6,       QWORD PTR [%3+1]
   215         movq            mm7,       QWORD PTR [%3+2]
   217         psadbw          mm5,       mm0
   218         psadbw          mm6,       mm0
   219         psadbw          mm7,       mm0
   220 %else
   221         movq            mm0,       QWORD PTR [%2]
   222         movq            mm1,       QWORD PTR [%3]
   223         movq            mm2,       QWORD PTR [%3+1]
   224         movq            mm3,       QWORD PTR [%3+2]
   226         psadbw          mm1,       mm0
   227         psadbw          mm2,       mm0
   228         psadbw          mm3,       mm0
   230         paddw           mm5,       mm1
   231         paddw           mm6,       mm2
   232         paddw           mm7,       mm3
   233 %endif
       ; Second row of the pair.
   234         movq            mm0,       QWORD PTR [%2+%4]
   235         movq            mm1,       QWORD PTR [%3+%5]
   236         movq            mm2,       QWORD PTR [%3+%5+1]
   237         movq            mm3,       QWORD PTR [%3+%5+2]
   239 %if %1==0 || %1==1
       ; Advance both pointers by two rows; the loads above already used the
       ; old values.
   240         lea             %2,        [%2+%4*2]
   241         lea             %3,        [%3+%5*2]
   242 %endif
   244         psadbw          mm1,       mm0
   245         psadbw          mm2,       mm0
   246         psadbw          mm3,       mm0
   248         paddw           mm5,       mm1
   249         paddw           mm6,       mm2
   250         paddw           mm7,       mm3
   251 %endmacro
   253 %macro LOAD_X4_ADDRESSES 5
       ; Loads four consecutive pointer-sized entries from the table at %1
       ; into %2, %3, %4 and %5 (in that order).
       ; The load order matters: every use in this file passes the same
       ; register as %1 and %5, so the base may only be overwritten by the
       ; final load.
   254         mov             %2,         [%1+REG_SZ_BYTES*0]
   255         mov             %3,         [%1+REG_SZ_BYTES*1]
   257         mov             %4,         [%1+REG_SZ_BYTES*2]
   258         mov             %5,         [%1+REG_SZ_BYTES*3]
   259 %endmacro
   261 %macro PROCESS_16X2X4 8
       ; Accumulates the SAD of two 16-byte source rows against the matching
       ; rows of four independent reference blocks.
       ;   %1  stage: 0 = first call (initialises the accumulators),
       ;              1 = middle call, 2 = last call (pointers not advanced)
       ;   %2  source pointer       %3-%6  the four reference pointers
       ;   %7  source stride        %8     reference stride (shared)
       ; Accumulators: xmm4 (%3), xmm5 (%4), xmm6 (%5), xmm7 (%6).
       ; Scratch: xmm0-xmm3; xmm1 is reused for the fourth reference.
       ; For stages 0 and 1 all five pointers advance by two rows.
       ; Source rows are read with movdqa and therefore must be 16-byte
       ; aligned; reference rows are read with lddqu and may be unaligned.
   262 %if %1==0
       ; First row of the block: psadbw writes straight into the accumulators.
   263         movdqa          xmm0,       XMMWORD PTR [%2]
   264         lddqu           xmm4,       XMMWORD PTR [%3]
   265         lddqu           xmm5,       XMMWORD PTR [%4]
   266         lddqu           xmm6,       XMMWORD PTR [%5]
   267         lddqu           xmm7,       XMMWORD PTR [%6]
   269         psadbw          xmm4,       xmm0
   270         psadbw          xmm5,       xmm0
   271         psadbw          xmm6,       xmm0
   272         psadbw          xmm7,       xmm0
   273 %else
   274         movdqa          xmm0,       XMMWORD PTR [%2]
   275         lddqu           xmm1,       XMMWORD PTR [%3]
   276         lddqu           xmm2,       XMMWORD PTR [%4]
   277         lddqu           xmm3,       XMMWORD PTR [%5]
   279         psadbw          xmm1,       xmm0
   280         psadbw          xmm2,       xmm0
   281         psadbw          xmm3,       xmm0
   283         paddw           xmm4,       xmm1
       ; xmm1 has been folded into xmm4, so it is free for the 4th reference.
   284         lddqu           xmm1,       XMMWORD PTR [%6]
   285         paddw           xmm5,       xmm2
   286         paddw           xmm6,       xmm3
   288         psadbw          xmm1,       xmm0
   289         paddw           xmm7,       xmm1
   290 %endif
       ; Second row of the pair.
   291         movdqa          xmm0,       XMMWORD PTR [%2+%7]
   292         lddqu           xmm1,       XMMWORD PTR [%3+%8]
   293         lddqu           xmm2,       XMMWORD PTR [%4+%8]
   294         lddqu           xmm3,       XMMWORD PTR [%5+%8]
   296         psadbw          xmm1,       xmm0
   297         psadbw          xmm2,       xmm0
   298         psadbw          xmm3,       xmm0
   300         paddw           xmm4,       xmm1
       ; Must stay ahead of the lea below, which modifies %6.
   301         lddqu           xmm1,       XMMWORD PTR [%6+%8]
   302         paddw           xmm5,       xmm2
   303         paddw           xmm6,       xmm3
   305 %if %1==0 || %1==1
   306         lea             %2,         [%2+%7*2]
   307         lea             %3,         [%3+%8*2]
   309         lea             %4,         [%4+%8*2]
   310         lea             %5,         [%5+%8*2]
   312         lea             %6,         [%6+%8*2]
   313 %endif
   314         psadbw          xmm1,       xmm0
   315         paddw           xmm7,       xmm1
   317 %endmacro
   319 %macro PROCESS_8X2X4 8
       ; 8-byte-wide counterpart of PROCESS_16X2X4, using MMX registers:
       ; accumulates the SAD of two 8-byte source rows against the matching
       ; rows of four independent reference blocks.
       ;   %1  stage: 0 = first call (initialises the accumulators),
       ;              1 = middle call, 2 = last call (pointers not advanced)
       ;   %2  source pointer       %3-%6  the four reference pointers
       ;   %7  source stride        %8     reference stride (shared)
       ; Accumulators: mm4 (%3), mm5 (%4), mm6 (%5), mm7 (%6).
       ; Scratch: mm0-mm3; mm1 is reused for the fourth reference.
       ; For stages 0 and 1 all five pointers advance by two rows.
   320 %if %1==0
       ; First row of the block: psadbw writes straight into the accumulators.
   321         movq            mm0,        QWORD PTR [%2]
   322         movq            mm4,        QWORD PTR [%3]
   323         movq            mm5,        QWORD PTR [%4]
   324         movq            mm6,        QWORD PTR [%5]
   325         movq            mm7,        QWORD PTR [%6]
   327         psadbw          mm4,        mm0
   328         psadbw          mm5,        mm0
   329         psadbw          mm6,        mm0
   330         psadbw          mm7,        mm0
   331 %else
   332         movq            mm0,        QWORD PTR [%2]
   333         movq            mm1,        QWORD PTR [%3]
   334         movq            mm2,        QWORD PTR [%4]
   335         movq            mm3,        QWORD PTR [%5]
   337         psadbw          mm1,        mm0
   338         psadbw          mm2,        mm0
   339         psadbw          mm3,        mm0
   341         paddw           mm4,        mm1
       ; mm1 has been folded into mm4, so it is free for the 4th reference.
   342         movq            mm1,        QWORD PTR [%6]
   343         paddw           mm5,        mm2
   344         paddw           mm6,        mm3
   346         psadbw          mm1,        mm0
   347         paddw           mm7,        mm1
   348 %endif
       ; Second row of the pair.
   349         movq            mm0,        QWORD PTR [%2+%7]
   350         movq            mm1,        QWORD PTR [%3+%8]
   351         movq            mm2,        QWORD PTR [%4+%8]
   352         movq            mm3,        QWORD PTR [%5+%8]
   354         psadbw          mm1,        mm0
   355         psadbw          mm2,        mm0
   356         psadbw          mm3,        mm0
   358         paddw           mm4,        mm1
       ; Must stay ahead of the lea below, which modifies %6.
   359         movq            mm1,        QWORD PTR [%6+%8]
   360         paddw           mm5,        mm2
   361         paddw           mm6,        mm3
   363 %if %1==0 || %1==1
   364         lea             %2,         [%2+%7*2]
   365         lea             %3,         [%3+%8*2]
   367         lea             %4,         [%4+%8*2]
   368         lea             %5,         [%5+%8*2]
   370         lea             %6,         [%6+%8*2]
   371 %endif
   372         psadbw          mm1,        mm0
   373         paddw           mm7,        mm1
   375 %endmacro
   377 ;void vp8_sad16x16x3_sse3(
   378 ;    unsigned char *src_ptr,
   379 ;    int  src_stride,
   380 ;    unsigned char *ref_ptr,
   381 ;    int  ref_stride,
   382 ;    int  *results)
       ;
       ; Computes three sums of absolute differences between a 16x16 source
       ; block and the reference blocks starting at ref_ptr, ref_ptr+1 and
       ; ref_ptr+2, and stores them as three 32-bit values at results[0..2].
       ; Source rows are loaded with movdqa, so src_ptr and src_stride must
       ; keep every row 16-byte aligned.
   383 global sym(vp8_sad16x16x3_sse3) PRIVATE
   384 sym(vp8_sad16x16x3_sse3):
   386     STACK_FRAME_CREATE_X3
       ; Eight macro calls of two rows each cover the 16 rows.
   388         PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
   389         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   390         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   391         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   392         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   393         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   394         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   395         PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
       ; rcx is free to hold the output pointer now that the row loop is done.
   397         mov             rcx,        result_ptr
       ; psadbw leaves one partial sum in each 64-bit half of the register;
       ; add the high half to the low half and store the low 32 bits.
   399         movq            xmm0,       xmm5
   400         psrldq          xmm5,       8
   402         paddw           xmm0,       xmm5
   403         movd            [rcx],      xmm0
   404 ;-
   405         movq            xmm0,       xmm6
   406         psrldq          xmm6,       8
   408         paddw           xmm0,       xmm6
   409         movd            [rcx+4],    xmm0
   410 ;-
   411         movq            xmm0,       xmm7
   412         psrldq          xmm7,       8
   414         paddw           xmm0,       xmm7
   415         movd            [rcx+8],    xmm0
   417     STACK_FRAME_DESTROY_X3
   419 ;void vp8_sad16x8x3_sse3(
   420 ;    unsigned char *src_ptr,
   421 ;    int  src_stride,
   422 ;    unsigned char *ref_ptr,
   423 ;    int  ref_stride,
   424 ;    int  *results)
       ;
       ; Computes three sums of absolute differences between a 16-wide,
       ; 8-row source block and the reference blocks starting at ref_ptr,
       ; ref_ptr+1 and ref_ptr+2, and stores them as three 32-bit values at
       ; results[0..2]. Source rows are loaded with movdqa and must be
       ; 16-byte aligned.
   425 global sym(vp8_sad16x8x3_sse3) PRIVATE
   426 sym(vp8_sad16x8x3_sse3):
   428     STACK_FRAME_CREATE_X3
       ; Four macro calls of two rows each cover the 8 rows.
   430         PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
   431         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   432         PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   433         PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
       ; rcx is free to hold the output pointer now that the row loop is done.
   435         mov             rcx,        result_ptr
       ; psadbw leaves one partial sum in each 64-bit half of the register;
       ; add the high half to the low half and store the low 32 bits.
   437         movq            xmm0,       xmm5
   438         psrldq          xmm5,       8
   440         paddw           xmm0,       xmm5
   441         movd            [rcx],      xmm0
   442 ;-
   443         movq            xmm0,       xmm6
   444         psrldq          xmm6,       8
   446         paddw           xmm0,       xmm6
   447         movd            [rcx+4],    xmm0
   448 ;-
   449         movq            xmm0,       xmm7
   450         psrldq          xmm7,       8
   452         paddw           xmm0,       xmm7
   453         movd            [rcx+8],    xmm0
   455     STACK_FRAME_DESTROY_X3
   457 ;void vp8_sad8x16x3_sse3(
   458 ;    unsigned char *src_ptr,
   459 ;    int  src_stride,
   460 ;    unsigned char *ref_ptr,
   461 ;    int  ref_stride,
   462 ;    int  *results)
       ;
       ; Computes three sums of absolute differences between an 8-wide,
       ; 16-row source block and the reference blocks starting at ref_ptr,
       ; ref_ptr+1 and ref_ptr+2, and stores them as three 32-bit values at
       ; results[0..2].
       ; NOTE(review): this routine uses MMX registers and issues no emms;
       ; callers presumably clear the MMX state themselves -- confirm.
   463 global sym(vp8_sad8x16x3_sse3) PRIVATE
   464 sym(vp8_sad8x16x3_sse3):
   466     STACK_FRAME_CREATE_X3
       ; Eight macro calls of two rows each cover the 16 rows.
   468         PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
   469         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   470         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   471         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   472         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   473         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   474         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   475         PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
   477         mov             rcx,        result_ptr
       ; Pack the first two sums into one qword: mm5 = { SAD(ref), SAD(ref+1) }.
   479         punpckldq       mm5,        mm6
   481         movq            [rcx],      mm5
   482         movd            [rcx+8],    mm7
   484     STACK_FRAME_DESTROY_X3
   486 ;void vp8_sad8x8x3_sse3(
   487 ;    unsigned char *src_ptr,
   488 ;    int  src_stride,
   489 ;    unsigned char *ref_ptr,
   490 ;    int  ref_stride,
   491 ;    int  *results)
       ;
       ; Computes three sums of absolute differences between an 8x8 source
       ; block and the reference blocks starting at ref_ptr, ref_ptr+1 and
       ; ref_ptr+2, and stores them as three 32-bit values at results[0..2].
       ; NOTE(review): this routine uses MMX registers and issues no emms;
       ; callers presumably clear the MMX state themselves -- confirm.
   492 global sym(vp8_sad8x8x3_sse3) PRIVATE
   493 sym(vp8_sad8x8x3_sse3):
   495     STACK_FRAME_CREATE_X3
       ; Four macro calls of two rows each cover the 8 rows.
   497         PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
   498         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   499         PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
   500         PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
   502         mov             rcx,        result_ptr
       ; Pack the first two sums into one qword: mm5 = { SAD(ref), SAD(ref+1) }.
   504         punpckldq       mm5,        mm6
   506         movq            [rcx],      mm5
   507         movd            [rcx+8],    mm7
   509     STACK_FRAME_DESTROY_X3
   511 ;void vp8_sad4x4x3_sse3(
   512 ;    unsigned char *src_ptr,
   513 ;    int  src_stride,
   514 ;    unsigned char *ref_ptr,
   515 ;    int  ref_stride,
   516 ;    int  *results)
       ;
       ; Computes three sums of absolute differences between a 4x4 source
       ; block and the reference blocks starting at ref_ptr, ref_ptr+1 and
       ; ref_ptr+2, and stores them as three 32-bit values at results[0..2].
       ; Two 4-byte rows are interleaved into one 8-byte MMX register with
       ; punpcklbw; source and reference use the same interleave, so psadbw
       ; still pairs matching pixels.
       ; NOTE(review): this routine uses MMX registers and issues no emms;
       ; callers presumably clear the MMX state themselves -- confirm.
   517 global sym(vp8_sad4x4x3_sse3) PRIVATE
   518 sym(vp8_sad4x4x3_sse3):
   520     STACK_FRAME_CREATE_X3
       ; Rows 0 and 1: mm0 = source, mm1 = ref, mm4 = ref+1, mm5 = ref+2.
   522         movd            mm0,        DWORD PTR [src_ptr]
   523         movd            mm1,        DWORD PTR [ref_ptr]
   525         movd            mm2,        DWORD PTR [src_ptr+src_stride]
   526         movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
   528         punpcklbw       mm0,        mm2
   529         punpcklbw       mm1,        mm3
   531         movd            mm4,        DWORD PTR [ref_ptr+1]
   532         movd            mm5,        DWORD PTR [ref_ptr+2]
   534         movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
   535         movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
   537         psadbw          mm1,        mm0
   539         punpcklbw       mm4,        mm2
   540         punpcklbw       mm5,        mm3
   542         psadbw          mm4,        mm0
   543         psadbw          mm5,        mm0
   545         lea             src_ptr,    [src_ptr+src_stride*2]
   546         lea             ref_ptr,    [ref_ptr+ref_stride*2]
       ; Rows 2 and 3: mm0 = source, mm2 = ref, mm3 = ref+1, mm7 = ref+2.
   548         movd            mm0,        DWORD PTR [src_ptr]
   549         movd            mm2,        DWORD PTR [ref_ptr]
   551         movd            mm3,        DWORD PTR [src_ptr+src_stride]
   552         movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
   554         punpcklbw       mm0,        mm3
   555         punpcklbw       mm2,        mm6
   557         movd            mm3,        DWORD PTR [ref_ptr+1]
   558         movd            mm7,        DWORD PTR [ref_ptr+2]
   560         psadbw          mm2,        mm0
       ; mm1 = total SAD against ref.
   562         paddw           mm1,        mm2
   564         movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
   565         movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
   567         punpcklbw       mm3,        mm2
   568         punpcklbw       mm7,        mm6
   570         psadbw          mm3,        mm0
   571         psadbw          mm7,        mm0
       ; mm3 = total SAD against ref+1, mm7 = total SAD against ref+2.
   573         paddw           mm3,        mm4
   574         paddw           mm7,        mm5
   576         mov             rcx,        result_ptr
       ; Pack the first two sums into one qword: mm1 = { SAD(ref), SAD(ref+1) }.
   578         punpckldq       mm1,        mm3
   580         movq            [rcx],      mm1
   581         movd            [rcx+8],    mm7
   583     STACK_FRAME_DESTROY_X3
   585 ;unsigned int vp8_sad16x16_sse3(
   586 ;    unsigned char *src_ptr,
   587 ;    int  src_stride,
   588 ;    unsigned char *ref_ptr,
   589 ;    int  ref_stride,
   590 ;    int  max_sad)
       ;
       ; Returns in rax the sum of absolute differences between a 16x16
       ; source block and a 16x16 reference block.
       ; max_sad is never read by this routine: the full sum is always
       ; computed.
       ; Source rows are loaded with movdqa and must be 16-byte aligned;
       ; reference rows are loaded with movdqu and may be unaligned.
   591 ;%define lddqu movdqu
   592 global sym(vp8_sad16x16_sse3) PRIVATE
   593 sym(vp8_sad16x16_sse3):
   595     STACK_FRAME_CREATE_X3
       ; end_ptr is used as a plain loop counter here: 4 iterations of
       ; 4 rows each. xmm7 is the running sum.
   597         mov             end_ptr,    4
   598         pxor            xmm7,        xmm7
   600 .vp8_sad16x16_sse3_loop:
   601         movdqa          xmm0,       XMMWORD PTR [src_ptr]
   602         movdqu          xmm1,       XMMWORD PTR [ref_ptr]
   603         movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
   604         movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]
   606         lea             src_ptr,    [src_ptr+src_stride*2]
   607         lea             ref_ptr,    [ref_ptr+ref_stride*2]
   609         movdqa          xmm4,       XMMWORD PTR [src_ptr]
   610         movdqu          xmm5,       XMMWORD PTR [ref_ptr]
   611         movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]
   613         psadbw          xmm0,       xmm1
       ; xmm1 was consumed by the psadbw above and is reused for the 4th
       ; reference row.
   615         movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]
   617         psadbw          xmm2,       xmm3
   618         psadbw          xmm4,       xmm5
   619         psadbw          xmm6,       xmm1
   621         lea             src_ptr,    [src_ptr+src_stride*2]
   622         lea             ref_ptr,    [ref_ptr+ref_stride*2]
   624         paddw           xmm7,        xmm0
   625         paddw           xmm7,        xmm2
   626         paddw           xmm7,        xmm4
   627         paddw           xmm7,        xmm6
       ; sub sets ZF when the counter reaches zero; the lea/padd instructions
       ; in the loop body do not touch the flags.
   629         sub             end_ptr,     1
   630         jne             .vp8_sad16x16_sse3_loop
       ; Fold the two 64-bit partial sums left by psadbw into the return
       ; value.
   632         movq            xmm0,       xmm7
   633         psrldq          xmm7,       8
   634         paddw           xmm0,       xmm7
   635         movq            rax,        xmm0
   637     STACK_FRAME_DESTROY_X3
   639 ;void vp8_copy32xn_sse3(
   640 ;    unsigned char *src_ptr,
   641 ;    int  src_stride,
   642 ;    unsigned char *dst_ptr,
   643 ;    int  dst_stride,
   644 ;    int height);
       ;
       ; Copies `height` rows of 32 bytes from src_ptr to dst_ptr.
       ; The frame macro's ref_ptr / ref_stride names stand for dst_ptr /
       ; dst_stride here. Source rows are read with movdqu (any alignment);
       ; destination rows are written with movdqa, so dst_ptr and dst_stride
       ; must keep every row 16-byte aligned.
       ; Rows are copied four at a time while at least four remain, then one
       ; at a time. A height below 4 goes straight to the one-row tail, and a
       ; height <= 0 copies nothing.
       ; NOTE(review): on the non-Win64 64-bit ABI `height` is the full r8
       ; register, so the signed comparisons below rely on the caller's
       ; upper 32 bits -- confirm how callers pass this int.
   645 global sym(vp8_copy32xn_sse3) PRIVATE
   646 sym(vp8_copy32xn_sse3):
   648     STACK_FRAME_CREATE_X3
       ; Guard: the four-row loop below tests its counter only after a full
       ; pass, so it must not be entered with fewer than four rows left.
               cmp             height,     4
               jl              .block_copy_sse3_tail
   650 .block_copy_sse3_loopx4:
       ; end_ptr = address of the third source row of this group.
   651         lea             end_ptr,    [src_ptr+src_stride*2]
   653         movdqu          xmm0,       XMMWORD PTR [src_ptr]
   654         movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
   655         movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
   656         movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
   657         movdqu          xmm4,       XMMWORD PTR [end_ptr]
   658         movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
   659         movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
   660         movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]
   662         lea             src_ptr,    [src_ptr+src_stride*4]
       ; end_ptr = address of the third destination row of this group.
   664         lea             end_ptr,    [ref_ptr+ref_stride*2]
   666         movdqa          XMMWORD PTR [ref_ptr], xmm0
   667         movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
   668         movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
   669         movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
   670         movdqa          XMMWORD PTR [end_ptr], xmm4
   671         movdqa          XMMWORD PTR [end_ptr + 16], xmm5
   672         movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
   673         movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
   675         lea             ref_ptr,    [ref_ptr+ref_stride*4]
   677         sub             height,     4
   678         cmp             height,     4
   679         jge             .block_copy_sse3_loopx4
       .block_copy_sse3_tail:
   681         ;Check whether any rows are left to copy one at a time.
   682         cmp             height, 0
   683         jle             .copy_is_done
   685 .block_copy_sse3_loop:
   686         movdqu          xmm0,       XMMWORD PTR [src_ptr]
   687         movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
   688         lea             src_ptr,    [src_ptr+src_stride]
   690         movdqa          XMMWORD PTR [ref_ptr], xmm0
   691         movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
   692         lea             ref_ptr,    [ref_ptr+ref_stride]
       ; height is strictly positive on entry to this loop, so counting down
       ; to exactly zero terminates it; the intervening lea and mov
       ; instructions leave the flags untouched.
   694         sub             height,     1
   695         jne             .block_copy_sse3_loop
   697 .copy_is_done:
   698     STACK_FRAME_DESTROY_X3
   700 ;void vp8_sad16x16x4d_sse3(
   701 ;    unsigned char *src_ptr,
   702 ;    int  src_stride,
   703 ;    unsigned char *ref_ptr_base,
   704 ;    int  ref_stride,
   705 ;    int  *results)
       ;
       ; Computes the sums of absolute differences between one 16x16 source
       ; block and four reference blocks, and stores them as four 32-bit
       ; values at results[0..3]. ref_ptr_base is read as a table of four
       ; reference pointers (see STACK_FRAME_CREATE_X4); all four share
       ; ref_stride. Source rows are loaded with movdqa and must be 16-byte
       ; aligned.
   706 global sym(vp8_sad16x16x4d_sse3) PRIVATE
   707 sym(vp8_sad16x16x4d_sse3):
   709     STACK_FRAME_CREATE_X4
       ; Eight macro calls of two rows each cover the 16 rows.
   711         PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   712         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   713         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   714         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   715         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   716         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   717         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   718         PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   720 %if ABI_IS_32BIT
       ; rbp served as ref_stride; restore the frame pointer saved by the
       ; prologue before result_ptr (arg(4)) is read.
   721         pop             rbp
   722 %endif
   723         mov             rcx,        result_ptr
       ; psadbw leaves one partial sum in each 64-bit half of the register;
       ; add the high half to the low half and store the low 32 bits.
   725         movq            xmm0,       xmm4
   726         psrldq          xmm4,       8
   728         paddw           xmm0,       xmm4
   729         movd            [rcx],      xmm0
   730 ;-
   731         movq            xmm0,       xmm5
   732         psrldq          xmm5,       8
   734         paddw           xmm0,       xmm5
   735         movd            [rcx+4],    xmm0
   736 ;-
   737         movq            xmm0,       xmm6
   738         psrldq          xmm6,       8
   740         paddw           xmm0,       xmm6
   741         movd            [rcx+8],    xmm0
   742 ;-
   743         movq            xmm0,       xmm7
   744         psrldq          xmm7,       8
   746         paddw           xmm0,       xmm7
   747         movd            [rcx+12],   xmm0
   749     STACK_FRAME_DESTROY_X4
   751 ;void vp8_sad16x8x4d_sse3(
   752 ;    unsigned char *src_ptr,
   753 ;    int  src_stride,
   754 ;    unsigned char *ref_ptr_base,
   755 ;    int  ref_stride,
   756 ;    int  *results)
       ;
       ; Computes the sums of absolute differences between one 16-wide,
       ; 8-row source block and four reference blocks, and stores them as
       ; four 32-bit values at results[0..3]. ref_ptr_base is read as a table
       ; of four reference pointers (see STACK_FRAME_CREATE_X4); all four
       ; share ref_stride. Source rows are loaded with movdqa and must be
       ; 16-byte aligned.
   757 global sym(vp8_sad16x8x4d_sse3) PRIVATE
   758 sym(vp8_sad16x8x4d_sse3):
   760     STACK_FRAME_CREATE_X4
       ; Four macro calls of two rows each cover the 8 rows.
   762         PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   763         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   764         PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   765         PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   767 %if ABI_IS_32BIT
       ; rbp served as ref_stride; restore the frame pointer saved by the
       ; prologue before result_ptr (arg(4)) is read.
   768         pop             rbp
   769 %endif
   770         mov             rcx,        result_ptr
       ; psadbw leaves one partial sum in each 64-bit half of the register;
       ; add the high half to the low half and store the low 32 bits.
   772         movq            xmm0,       xmm4
   773         psrldq          xmm4,       8
   775         paddw           xmm0,       xmm4
   776         movd            [rcx],      xmm0
   777 ;-
   778         movq            xmm0,       xmm5
   779         psrldq          xmm5,       8
   781         paddw           xmm0,       xmm5
   782         movd            [rcx+4],    xmm0
   783 ;-
   784         movq            xmm0,       xmm6
   785         psrldq          xmm6,       8
   787         paddw           xmm0,       xmm6
   788         movd            [rcx+8],    xmm0
   789 ;-
   790         movq            xmm0,       xmm7
   791         psrldq          xmm7,       8
   793         paddw           xmm0,       xmm7
   794         movd            [rcx+12],   xmm0
   796     STACK_FRAME_DESTROY_X4
   798 ;void vp8_sad8x16x4d_sse3(
   799 ;    unsigned char *src_ptr,
   800 ;    int  src_stride,
   801 ;    unsigned char *ref_ptr_base,
   802 ;    int  ref_stride,
   803 ;    int  *results)
       ;
       ; Computes the sums of absolute differences between one 8-wide,
       ; 16-row source block and four reference blocks, and stores them as
       ; four 32-bit values at results[0..3]. The third argument is read as a
       ; table of four reference pointers (see STACK_FRAME_CREATE_X4); all
       ; four share ref_stride.
       ; NOTE(review): this routine uses MMX registers and issues no emms;
       ; callers presumably clear the MMX state themselves -- confirm.
   804 global sym(vp8_sad8x16x4d_sse3) PRIVATE
   805 sym(vp8_sad8x16x4d_sse3):
   807     STACK_FRAME_CREATE_X4
       ; Eight macro calls of two rows each cover the 16 rows.
   809         PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   810         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   811         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   812         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   813         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   814         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   815         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   816         PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   818 %if ABI_IS_32BIT
       ; rbp served as ref_stride; restore the frame pointer saved by the
       ; prologue before result_ptr (arg(4)) is read.
   819         pop             rbp
   820 %endif
   821         mov             rcx,        result_ptr
       ; Pack the four 32-bit sums pairwise: mm4 = { SAD0, SAD1 },
       ; mm6 = { SAD2, SAD3 }.
   823         punpckldq       mm4,        mm5
   824         punpckldq       mm6,        mm7
   826         movq            [rcx],      mm4
   827         movq            [rcx+8],    mm6
   829     STACK_FRAME_DESTROY_X4
   831 ;void vp8_sad8x8x4d_sse3(
   832 ;    unsigned char *src_ptr,
   833 ;    int  src_stride,
   834 ;    unsigned char *ref_ptr_base,
   835 ;    int  ref_stride,
   836 ;    int  *results)
       ;
       ; Computes the sums of absolute differences between one 8x8 source
       ; block and four reference blocks, and stores them as four 32-bit
       ; values at results[0..3]. The third argument is read as a table of
       ; four reference pointers (see STACK_FRAME_CREATE_X4); all four share
       ; ref_stride.
       ; NOTE(review): this routine uses MMX registers and issues no emms;
       ; callers presumably clear the MMX state themselves -- confirm.
   837 global sym(vp8_sad8x8x4d_sse3) PRIVATE
   838 sym(vp8_sad8x8x4d_sse3):
   840     STACK_FRAME_CREATE_X4
       ; Four macro calls of two rows each cover the 8 rows.
   842         PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   843         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   844         PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   845         PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
   847 %if ABI_IS_32BIT
       ; rbp served as ref_stride; restore the frame pointer saved by the
       ; prologue before result_ptr (arg(4)) is read.
   848         pop             rbp
   849 %endif
   850         mov             rcx,        result_ptr
       ; Pack the four 32-bit sums pairwise: mm4 = { SAD0, SAD1 },
       ; mm6 = { SAD2, SAD3 }.
   852         punpckldq       mm4,        mm5
   853         punpckldq       mm6,        mm7
   855         movq            [rcx],      mm4
   856         movq            [rcx+8],    mm6
   858     STACK_FRAME_DESTROY_X4
   860 ;void vp8_sad4x4x4d_sse3(
   861 ;    unsigned char *src_ptr,
   862 ;    int  src_stride,
   863 ;    unsigned char *ref_ptr_base,
   864 ;    int  ref_stride,
   865 ;    int  *results)
       ;
       ; Computes the sums of absolute differences between one 4x4 source
       ; block and four reference blocks, and stores them as four 32-bit
       ; values at results[0..3]. The third argument is read as a table of
       ; four reference pointers (see STACK_FRAME_CREATE_X4); all four share
       ; ref_stride.
       ; Two 4-byte rows are interleaved into one 8-byte MMX register with
       ; punpcklbw; source and reference use the same interleave, so psadbw
       ; still pairs matching pixels.
       ; NOTE(review): this routine uses MMX registers and issues no emms;
       ; callers presumably clear the MMX state themselves -- confirm.
   866 global sym(vp8_sad4x4x4d_sse3) PRIVATE
   867 sym(vp8_sad4x4x4d_sse3):
   869     STACK_FRAME_CREATE_X4
       ; Rows 0 and 1: mm0 = source, mm1 = ref0, mm4 = ref1, mm5 = ref2,
       ; mm6 = ref3.
   871         movd            mm0,        DWORD PTR [src_ptr]
   872         movd            mm1,        DWORD PTR [r0_ptr]
   874         movd            mm2,        DWORD PTR [src_ptr+src_stride]
   875         movd            mm3,        DWORD PTR [r0_ptr+ref_stride]
   877         punpcklbw       mm0,        mm2
   878         punpcklbw       mm1,        mm3
   880         movd            mm4,        DWORD PTR [r1_ptr]
   881         movd            mm5,        DWORD PTR [r2_ptr]
   883         movd            mm6,        DWORD PTR [r3_ptr]
   884         movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
   886         movd            mm3,        DWORD PTR [r2_ptr+ref_stride]
   887         movd            mm7,        DWORD PTR [r3_ptr+ref_stride]
   889         psadbw          mm1,        mm0
   891         punpcklbw       mm4,        mm2
   892         punpcklbw       mm5,        mm3
   894         punpcklbw       mm6,        mm7
   895         psadbw          mm4,        mm0
   897         psadbw          mm5,        mm0
   898         psadbw          mm6,        mm0
   902         lea             src_ptr,    [src_ptr+src_stride*2]
   903         lea             r0_ptr,     [r0_ptr+ref_stride*2]
   905         lea             r1_ptr,     [r1_ptr+ref_stride*2]
   906         lea             r2_ptr,     [r2_ptr+ref_stride*2]
   908         lea             r3_ptr,     [r3_ptr+ref_stride*2]
       ; Rows 2 and 3: mm0 = source, mm2 = ref0.
   910         movd            mm0,        DWORD PTR [src_ptr]
   911         movd            mm2,        DWORD PTR [r0_ptr]
   913         movd            mm3,        DWORD PTR [src_ptr+src_stride]
   914         movd            mm7,        DWORD PTR [r0_ptr+ref_stride]
   916         punpcklbw       mm0,        mm3
   917         punpcklbw       mm2,        mm7
   919         movd            mm3,        DWORD PTR [r1_ptr]
   920         movd            mm7,        DWORD PTR [r2_ptr]
   922         psadbw          mm2,        mm0
   923 %if ABI_IS_32BIT
       ; src_stride (rax) is not used past this point, so ref_stride is moved
       ; from rbp into rax and rebound to it. rbp can then be restored to the
       ; frame pointer saved by the prologue before result_ptr (arg(4)) is
       ; read.
   924         mov             rax,        rbp
   926         pop             rbp
   927 %define     ref_stride    rax
   928 %endif
       ; rsi is reused as the output pointer; whatever it was bound to by the
       ; prologue (src_ptr, src_stride or r0_ptr, depending on the ABI) is
       ; not referenced below.
   929         mov             rsi,        result_ptr
   931         paddw           mm1,        mm2
   932         movd            [rsi],      mm1
   934         movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
   935         movd            mm1,        DWORD PTR [r2_ptr+ref_stride]
   937         punpcklbw       mm3,        mm2
   938         punpcklbw       mm7,        mm1
   940         psadbw          mm3,        mm0
   941         psadbw          mm7,        mm0
   943         movd            mm2,        DWORD PTR [r3_ptr]
   944         movd            mm1,        DWORD PTR [r3_ptr+ref_stride]
       ; mm3 = total SAD against ref1, mm7 = total SAD against ref2.
   946         paddw           mm3,        mm4
   947         paddw           mm7,        mm5
   949         movd            [rsi+4],    mm3
   950         punpcklbw       mm2,        mm1
   952         movd            [rsi+8],    mm7
   953         psadbw          mm2,        mm0
       ; mm2 = total SAD against ref3.
   955         paddw           mm2,        mm6
   956         movd            [rsi+12],   mm2
   959     STACK_FRAME_DESTROY_X4

mercurial