media/libvpx/vp8/common/x86/variance_impl_mmx.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 ;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
       ;
       ; Returns the sum of the squares of the 256 16-bit values starting at
       ; src_ptr (16 loop passes, 32 bytes consumed per pass).
       ;   rax - read pointer, then the return value
       ;   rcx - loop passes remaining
       ;   mm4 - two 32-bit partial sums
    15 global sym(vp8_get_mb_ss_mmx) PRIVATE
    16 sym(vp8_get_mb_ss_mmx):
    17     push        rbp
    18     mov         rbp, rsp
    19     SHADOW_ARGS_TO_STACK 7
    20     GET_GOT     rbx
    21     push rsi
    22     push rdi
    23     sub         rsp, 8                          ; 8-byte scratch slot used to spill mm4
    24     ; end prolog
    26         mov         rax, arg(0) ;src_ptr
    27         mov         rcx, 16                     ; 16 passes x 16 words = 256 words
    28         pxor        mm4, mm4                    ; clear the accumulator
    30 .NEXTROW:
    31         movq        mm0, [rax]                  ; four words per register
    32         movq        mm1, [rax+8]
    33         movq        mm2, [rax+16]
    34         movq        mm3, [rax+24]
    35         pmaddwd     mm0, mm0                    ; square each word, add adjacent pairs -> 2 dwords
    36         pmaddwd     mm1, mm1
    37         pmaddwd     mm2, mm2
    38         pmaddwd     mm3, mm3
    40         paddd       mm4, mm0                    ; fold the pass into the running sums
    41         paddd       mm4, mm1
    42         paddd       mm4, mm2
    43         paddd       mm4, mm3
    45         add         rax, 32                     ; advance to the next 16 words
    46         dec         rcx
    47         jnz         .NEXTROW                    ; dec does not write CF, so branch on ZF alone
    48         movq        QWORD PTR [rsp], mm4        ; spill both partial sums to the scratch slot
    50         ;return sum[0]+sum[1];
    51         movsxd      rax, dword ptr [rsp]
    52         movsxd      rcx, dword ptr [rsp+4]
    53         add         rax, rcx
    56     ; begin epilog
    57     add rsp, 8
    58     pop rdi
    59     pop rsi
    60     RESTORE_GOT
    61     UNSHADOW_ARGS
    62     pop         rbp
    63     ret
    66 ;unsigned int vp8_get8x8var_mmx
    67 ;(
    68 ;    unsigned char *src_ptr,
    69 ;    int  source_stride,
    70 ;    unsigned char *ref_ptr,
    71 ;    int  recon_stride,
    72 ;    unsigned int *SSE,
    73 ;    int *Sum
    74 ;)
       ;
       ; Walks eight rows of eight bytes from src_ptr and ref_ptr, forming the
       ; per-pixel difference (src - ref). Stores the sum of the differences to
       ; *Sum and the sum of the squared differences to *SSE; always returns 0.
       ;   rax / rcx - src pointer / src stride
       ;   rbx / rdx - ref pointer / ref stride
       ;   mm5       - four 16-bit running sums of differences
       ;   mm6       - zero, used as the high half when widening bytes to words
       ;   mm7       - two 32-bit running sums of squared differences
       ; The unrolled rows are identical except that each row (bar the last)
       ; also loads the reference bytes for the following row into mm1.
    75 global sym(vp8_get8x8var_mmx) PRIVATE
    76 sym(vp8_get8x8var_mmx):
    77     push        rbp
    78     mov         rbp, rsp
    79     SHADOW_ARGS_TO_STACK 6
    80     push rsi
    81     push rdi
    82     push rbx
    83     sub         rsp, 16                         ; 16 bytes of scratch: [rsp] <- mm7, [rsp+8] <- mm5
    84     ; end prolog
    87         pxor        mm5, mm5                    ; mm5 = 0 (sum of differences)
    88         pxor        mm6, mm6                    ; mm6 = 0 (zero for unpacking)
    89         pxor        mm7, mm7                    ; mm7 = 0 (sum of squared differences)
    91         mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
    92         mov         rbx, arg(2) ;[ref_ptr]
    93         movsxd      rcx, dword ptr arg(1) ;[source_stride]
    94         movsxd      rdx, dword ptr arg(3) ;[recon_stride]
    96         ; Row 1
    97         movq        mm0, [rax]                  ; Copy eight bytes to mm0
    98         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
    99         movq        mm2, mm0                    ; Take copies
   100         movq        mm3, mm1                    ; Take copies
   102         punpcklbw   mm0, mm6                    ; unpack to higher precision
   103         punpcklbw   mm1, mm6
   104         punpckhbw   mm2, mm6                    ; unpack to higher precision
   105         punpckhbw   mm3, mm6
   106         psubsw      mm0, mm1                    ; A-B (low order) to MM0
   107         psubsw      mm2, mm3                    ; A-B (high order) to MM2
   109         paddw       mm5, mm0                    ; accumulate differences in mm5
   110         paddw       mm5, mm2                    ; accumulate differences in mm5
   112         pmaddwd     mm0, mm0                    ; square and accumulate
   113         pmaddwd     mm2, mm2                    ; square and accumulate
   114         add         rbx,rdx                     ; Inc pointer into ref data
   115         add         rax,rcx                     ; Inc pointer into the new data
   116         movq        mm1, [rbx]                  ; Copy eight bytes of the next ref row to mm1
   117         paddd       mm7, mm0                    ; accumulate in mm7
   118         paddd       mm7, mm2                    ; accumulate in mm7
   121         ; Row 2
   122         movq        mm0, [rax]                  ; Copy eight bytes to mm0
   123         movq        mm2, mm0                    ; Take copies
   124         movq        mm3, mm1                    ; Take copies
   126         punpcklbw   mm0, mm6                    ; unpack to higher precision
   127         punpcklbw   mm1, mm6
   128         punpckhbw   mm2, mm6                    ; unpack to higher precision
   129         punpckhbw   mm3, mm6
   130         psubsw      mm0, mm1                    ; A-B (low order) to MM0
   131         psubsw      mm2, mm3                    ; A-B (high order) to MM2
   133         paddw       mm5, mm0                    ; accumulate differences in mm5
   134         paddw       mm5, mm2                    ; accumulate differences in mm5
   136         pmaddwd     mm0, mm0                    ; square and accumulate
   137         pmaddwd     mm2, mm2                    ; square and accumulate
   138         add         rbx,rdx                     ; Inc pointer into ref data
   139         add         rax,rcx                     ; Inc pointer into the new data
   140         movq        mm1, [rbx]                  ; Copy eight bytes of the next ref row to mm1
   141         paddd       mm7, mm0                    ; accumulate in mm7
   142         paddd       mm7, mm2                    ; accumulate in mm7
   144         ; Row 3
   145         movq        mm0, [rax]                  ; Copy eight bytes to mm0
   146         movq        mm2, mm0                    ; Take copies
   147         movq        mm3, mm1                    ; Take copies
   149         punpcklbw   mm0, mm6                    ; unpack to higher precision
   150         punpcklbw   mm1, mm6
   151         punpckhbw   mm2, mm6                    ; unpack to higher precision
   152         punpckhbw   mm3, mm6
   153         psubsw      mm0, mm1                    ; A-B (low order) to MM0
   154         psubsw      mm2, mm3                    ; A-B (high order) to MM2
   156         paddw       mm5, mm0                    ; accumulate differences in mm5
   157         paddw       mm5, mm2                    ; accumulate differences in mm5
   159         pmaddwd     mm0, mm0                    ; square and accumulate
   160         pmaddwd     mm2, mm2                    ; square and accumulate
   161         add         rbx,rdx                     ; Inc pointer into ref data
   162         add         rax,rcx                     ; Inc pointer into the new data
   163         movq        mm1, [rbx]                  ; Copy eight bytes of the next ref row to mm1
   164         paddd       mm7, mm0                    ; accumulate in mm7
   165         paddd       mm7, mm2                    ; accumulate in mm7
   167         ; Row 4
   168         movq        mm0, [rax]                  ; Copy eight bytes to mm0
   169         movq        mm2, mm0                    ; Take copies
   170         movq        mm3, mm1                    ; Take copies
   172         punpcklbw   mm0, mm6                    ; unpack to higher precision
   173         punpcklbw   mm1, mm6
   174         punpckhbw   mm2, mm6                    ; unpack to higher precision
   175         punpckhbw   mm3, mm6
   176         psubsw      mm0, mm1                    ; A-B (low order) to MM0
   177         psubsw      mm2, mm3                    ; A-B (high order) to MM2
   179         paddw       mm5, mm0                    ; accumulate differences in mm5
   180         paddw       mm5, mm2                    ; accumulate differences in mm5
   182         pmaddwd     mm0, mm0                    ; square and accumulate
   183         pmaddwd     mm2, mm2                    ; square and accumulate
   184         add         rbx,rdx                     ; Inc pointer into ref data
   185         add         rax,rcx                     ; Inc pointer into the new data
   186         movq        mm1, [rbx]                  ; Copy eight bytes of the next ref row to mm1
   187         paddd       mm7, mm0                    ; accumulate in mm7
   188         paddd       mm7, mm2                    ; accumulate in mm7
   190         ; Row 5
   191         movq        mm0, [rax]                  ; Copy eight bytes to mm0
   192         movq        mm2, mm0                    ; Take copies
   193         movq        mm3, mm1                    ; Take copies
   195         punpcklbw   mm0, mm6                    ; unpack to higher precision
   196         punpcklbw   mm1, mm6
   197         punpckhbw   mm2, mm6                    ; unpack to higher precision
   198         punpckhbw   mm3, mm6
   199         psubsw      mm0, mm1                    ; A-B (low order) to MM0
   200         psubsw      mm2, mm3                    ; A-B (high order) to MM2
   202         paddw       mm5, mm0                    ; accumulate differences in mm5
   203         paddw       mm5, mm2                    ; accumulate differences in mm5
   205         pmaddwd     mm0, mm0                    ; square and accumulate
   206         pmaddwd     mm2, mm2                    ; square and accumulate
   207         add         rbx,rdx                     ; Inc pointer into ref data
   208         add         rax,rcx                     ; Inc pointer into the new data
   209         movq        mm1, [rbx]                  ; Copy eight bytes of the next ref row to mm1
   210         ;              movq        mm4, [rbx + rdx]
   211         paddd       mm7, mm0                    ; accumulate in mm7
   212         paddd       mm7, mm2                    ; accumulate in mm7
   214         ; Row 6
   215         movq        mm0, [rax]                  ; Copy eight bytes to mm0
   216         movq        mm2, mm0                    ; Take copies
   217         movq        mm3, mm1                    ; Take copies
   219         punpcklbw   mm0, mm6                    ; unpack to higher precision
   220         punpcklbw   mm1, mm6
   221         punpckhbw   mm2, mm6                    ; unpack to higher precision
   222         punpckhbw   mm3, mm6
   223         psubsw      mm0, mm1                    ; A-B (low order) to MM0
   224         psubsw      mm2, mm3                    ; A-B (high order) to MM2
   226         paddw       mm5, mm0                    ; accumulate differences in mm5
   227         paddw       mm5, mm2                    ; accumulate differences in mm5
   229         pmaddwd     mm0, mm0                    ; square and accumulate
   230         pmaddwd     mm2, mm2                    ; square and accumulate
   231         add         rbx,rdx                     ; Inc pointer into ref data
   232         add         rax,rcx                     ; Inc pointer into the new data
   233         movq        mm1, [rbx]                  ; Copy eight bytes of the next ref row to mm1
   234         paddd       mm7, mm0                    ; accumulate in mm7
   235         paddd       mm7, mm2                    ; accumulate in mm7
   237         ; Row 7
   238         movq        mm0, [rax]                  ; Copy eight bytes to mm0
   239         movq        mm2, mm0                    ; Take copies
   240         movq        mm3, mm1                    ; Take copies
   242         punpcklbw   mm0, mm6                    ; unpack to higher precision
   243         punpcklbw   mm1, mm6
   244         punpckhbw   mm2, mm6                    ; unpack to higher precision
   245         punpckhbw   mm3, mm6
   246         psubsw      mm0, mm1                    ; A-B (low order) to MM0
   247         psubsw      mm2, mm3                    ; A-B (high order) to MM2
   249         paddw       mm5, mm0                    ; accumulate differences in mm5
   250         paddw       mm5, mm2                    ; accumulate differences in mm5
   252         pmaddwd     mm0, mm0                    ; square and accumulate
   253         pmaddwd     mm2, mm2                    ; square and accumulate
   254         add         rbx,rdx                     ; Inc pointer into ref data
   255         add         rax,rcx                     ; Inc pointer into the new data
   256         movq        mm1, [rbx]                  ; Copy eight bytes of the next ref row to mm1
   257         paddd       mm7, mm0                    ; accumulate in mm7
   258         paddd       mm7, mm2                    ; accumulate in mm7
   260         ; Row 8
   261         movq        mm0, [rax]                  ; Copy eight bytes to mm0
   262         movq        mm2, mm0                    ; Take copies
   263         movq        mm3, mm1                    ; Take copies
   265         punpcklbw   mm0, mm6                    ; unpack to higher precision
   266         punpcklbw   mm1, mm6
   267         punpckhbw   mm2, mm6                    ; unpack to higher precision
   268         punpckhbw   mm3, mm6
   269         psubsw      mm0, mm1                    ; A-B (low order) to MM0
   270         psubsw      mm2, mm3                    ; A-B (high order) to MM2
   272         paddw       mm5, mm0                    ; accumulate differences in mm5
   273         paddw       mm5, mm2                    ; accumulate differences in mm5
   275         pmaddwd     mm0, mm0                    ; square and accumulate
   276         pmaddwd     mm2, mm2                    ; square and accumulate
   277         add         rbx,rdx                     ; Inc pointer into ref data (not dereferenced again)
   278         add         rax,rcx                     ; Inc pointer into the new data (not dereferenced again)
   279         paddd       mm7, mm0                    ; accumulate in mm7
   280         paddd       mm7, mm2                    ; accumulate in mm7
   282         ; Now accumulate the final results.
   283         movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
   284         movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
   285         movsx       rdx, WORD PTR [rsp+8]       ; sign-extend the four 16-bit difference sums
   286         movsx       rcx, WORD PTR [rsp+10]
   287         movsx       rbx, WORD PTR [rsp+12]
   288         movsx       rax, WORD PTR [rsp+14]
   289         add         rdx, rcx
   290         add         rbx, rax
   291         add         rdx, rbx    ;XSum
   292         movsxd      rax, DWORD PTR [rsp]        ; add the two 32-bit squared-difference sums
   293         movsxd      rcx, DWORD PTR [rsp+4]
   294         add         rax, rcx    ;XXSum
   295         mov         rsi, arg(4) ;SSE
   296         mov         rdi, arg(5) ;Sum
   297         mov         dword ptr [rsi], eax        ; *SSE = XXSum
   298         mov         dword ptr [rdi], edx        ; *Sum = XSum
   299         xor         rax, rax    ; return 0
   302     ; begin epilog
   303     add rsp, 16
   304     pop rbx
   305     pop rdi
   306     pop rsi
   307     UNSHADOW_ARGS
   308     pop         rbp
   309     ret
   313 ;unsigned int
   314 ;vp8_get4x4var_mmx
   315 ;(
   316 ;    unsigned char *src_ptr,
   317 ;    int  source_stride,
   318 ;    unsigned char *ref_ptr,
   319 ;    int  recon_stride,
   320 ;    unsigned int *SSE,
   321 ;    int *Sum
   322 ;)
       ;
       ; Walks four rows of four bytes from src_ptr and ref_ptr, forming the
       ; per-pixel difference (src - ref). Stores the sum of the differences to
       ; *Sum and the sum of the squared differences to *SSE; always returns 0.
       ;   rax / rcx - src pointer / src stride
       ;   rbx / rdx - ref pointer / ref stride
       ;   mm5       - four 16-bit running sums of differences
       ;   mm6       - zero, used as the high half when widening bytes to words
       ;   mm7       - two 32-bit running sums of squared differences
       ; Rows are loaded with movd (4 bytes): punpcklbw consumes only the low
       ; four bytes, so an 8-byte movq would read past the end of each row for
       ; no benefit.
   323 global sym(vp8_get4x4var_mmx) PRIVATE
   324 sym(vp8_get4x4var_mmx):
   325     push        rbp
   326     mov         rbp, rsp
   327     SHADOW_ARGS_TO_STACK 6
   328     push rsi
   329     push rdi
   330     push rbx
   331     sub         rsp, 16                         ; 16 bytes of scratch: [rsp] <- mm7, [rsp+8] <- mm5
   332     ; end prolog
   335         pxor        mm5, mm5                    ; mm5 = 0 (sum of differences)
   336         pxor        mm6, mm6                    ; mm6 = 0 (zero for unpacking)
   337         pxor        mm7, mm7                    ; mm7 = 0 (sum of squared differences)
   339         mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
   340         mov         rbx, arg(2) ;[ref_ptr]
   341         movsxd      rcx, dword ptr arg(1) ;[source_stride]
   342         movsxd      rdx, dword ptr arg(3) ;[recon_stride]
   344         ; Row 1
   345         movd        mm0, [rax]                  ; Copy four bytes to mm0
   346         movd        mm1, [rbx]                  ; Copy four bytes to mm1
   347         punpcklbw   mm0, mm6                    ; unpack to higher precision
   348         punpcklbw   mm1, mm6
   349         psubsw      mm0, mm1                    ; A-B (low order) to MM0
   350         paddw       mm5, mm0                    ; accumulate differences in mm5
   351         pmaddwd     mm0, mm0                    ; square and accumulate
   352         add         rbx,rdx                     ; Inc pointer into ref data
   353         add         rax,rcx                     ; Inc pointer into the new data
   354         movd        mm1, [rbx]                  ; Copy four bytes of the next ref row to mm1
   355         paddd       mm7, mm0                    ; accumulate in mm7
   358         ; Row 2
   359         movd        mm0, [rax]                  ; Copy four bytes to mm0
   360         punpcklbw   mm0, mm6                    ; unpack to higher precision
   361         punpcklbw   mm1, mm6
   362         psubsw      mm0, mm1                    ; A-B (low order) to MM0
   363         paddw       mm5, mm0                    ; accumulate differences in mm5
   365         pmaddwd     mm0, mm0                    ; square and accumulate
   366         add         rbx,rdx                     ; Inc pointer into ref data
   367         add         rax,rcx                     ; Inc pointer into the new data
   368         movd        mm1, [rbx]                  ; Copy four bytes of the next ref row to mm1
   369         paddd       mm7, mm0                    ; accumulate in mm7
   371         ; Row 3
   372         movd        mm0, [rax]                  ; Copy four bytes to mm0
   373         punpcklbw   mm0, mm6                    ; unpack to higher precision
   374         punpcklbw   mm1, mm6
   375         psubsw      mm0, mm1                    ; A-B (low order) to MM0
   376         paddw       mm5, mm0                    ; accumulate differences in mm5
   378         pmaddwd     mm0, mm0                    ; square and accumulate
   379         add         rbx,rdx                     ; Inc pointer into ref data
   380         add         rax,rcx                     ; Inc pointer into the new data
   381         movd        mm1, [rbx]                  ; Copy four bytes of the next ref row to mm1
   382         paddd       mm7, mm0                    ; accumulate in mm7
   384         ; Row 4
   385         movd        mm0, [rax]                  ; Copy four bytes to mm0
   387         punpcklbw   mm0, mm6                    ; unpack to higher precision
   388         punpcklbw   mm1, mm6
   389         psubsw      mm0, mm1                    ; A-B (low order) to MM0
   391         paddw       mm5, mm0                    ; accumulate differences in mm5
   393         pmaddwd     mm0, mm0                    ; square and accumulate
   394         paddd       mm7, mm0                    ; accumulate in mm7
   397         ; Now accumulate the final results.
   398         movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
   399         movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
   400         movsx       rdx, WORD PTR [rsp+8]       ; sign-extend the four 16-bit difference sums
   401         movsx       rcx, WORD PTR [rsp+10]
   402         movsx       rbx, WORD PTR [rsp+12]
   403         movsx       rax, WORD PTR [rsp+14]
   404         add         rdx, rcx
   405         add         rbx, rax
   406         add         rdx, rbx    ;XSum
   407         movsxd      rax, DWORD PTR [rsp]        ; add the two 32-bit squared-difference sums
   408         movsxd      rcx, DWORD PTR [rsp+4]
   409         add         rax, rcx    ;XXSum
   410         mov         rsi, arg(4) ;SSE
   411         mov         rdi, arg(5) ;Sum
   412         mov         dword ptr [rsi], eax        ; *SSE = XXSum
   413         mov         dword ptr [rdi], edx        ; *Sum = XSum
   414         xor         rax, rax    ; return 0
   417     ; begin epilog
   418     add rsp, 16
   419     pop rbx
   420     pop rdi
   421     pop rsi
   422     UNSHADOW_ARGS
   423     pop         rbp
   424     ret
   428 ;unsigned int
   429 ;vp8_get4x4sse_cs_mmx
   430 ;(
   431 ;    unsigned char *src_ptr,
   432 ;    int  source_stride,
   433 ;    unsigned char *ref_ptr,
   434 ;    int  recon_stride
   435 ;)
       ;
       ; Returns the sum of squared differences (src - ref) over four rows of
       ; four bytes.
       ;   rax / rcx - src pointer / src stride
       ;   rbx / rdx - ref pointer / ref stride
       ;   mm6       - zero, used as the high half when widening bytes to words
       ;   mm7       - two 32-bit running sums of squared differences
       ; Each row (bar the last) also loads the reference bytes for the
       ; following row into mm1.
   436 global sym(vp8_get4x4sse_cs_mmx) PRIVATE
   437 sym(vp8_get4x4sse_cs_mmx):
   438     push        rbp
   439     mov         rbp, rsp
   440     SHADOW_ARGS_TO_STACK 4
   441     push rsi
   442     push rdi
   443     push rbx
   444     ; end prolog
   447         pxor        mm6, mm6                    ; mm6 = 0 (zero for unpacking)
   448         pxor        mm7, mm7                    ; mm7 = 0 (sum of squared differences)
   450         mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
   451         mov         rbx, arg(2) ;[ref_ptr]
   452         movsxd      rcx, dword ptr arg(1) ;[source_stride]
   453         movsxd      rdx, dword ptr arg(3) ;[recon_stride]
   454         ; Row 1
   455         movd        mm0, [rax]                  ; Copy four bytes to mm0
   456         movd        mm1, [rbx]                  ; Copy four bytes to mm1
   457         punpcklbw   mm0, mm6                    ; unpack to higher precision
   458         punpcklbw   mm1, mm6
   459         psubsw      mm0, mm1                    ; A-B (low order) to MM0
   460         pmaddwd     mm0, mm0                    ; square and accumulate
   461         add         rbx,rdx                     ; Inc pointer into ref data
   462         add         rax,rcx                     ; Inc pointer into the new data
   463         movd        mm1, [rbx]                  ; Copy four bytes of the next ref row to mm1
   464         paddd       mm7, mm0                    ; accumulate in mm7
   466         ; Row 2
   467         movd        mm0, [rax]                  ; Copy four bytes to mm0
   468         punpcklbw   mm0, mm6                    ; unpack to higher precision
   469         punpcklbw   mm1, mm6
   470         psubsw      mm0, mm1                    ; A-B (low order) to MM0
   471         pmaddwd     mm0, mm0                    ; square and accumulate
   472         add         rbx,rdx                     ; Inc pointer into ref data
   473         add         rax,rcx                     ; Inc pointer into the new data
   474         movd        mm1, [rbx]                  ; Copy four bytes of the next ref row to mm1
   475         paddd       mm7, mm0                    ; accumulate in mm7
   477         ; Row 3
   478         movd        mm0, [rax]                  ; Copy four bytes to mm0
   479         punpcklbw   mm1, mm6
   480         punpcklbw   mm0, mm6                    ; unpack to higher precision
   481         psubsw      mm0, mm1                    ; A-B (low order) to MM0
   483         pmaddwd     mm0, mm0                    ; square and accumulate
   484         add         rbx,rdx                     ; Inc pointer into ref data
   485         add         rax,rcx                     ; Inc pointer into the new data
   486         movd        mm1, [rbx]                  ; Copy four bytes of the next ref row to mm1
   487         paddd       mm7, mm0                    ; accumulate in mm7
   489         ; Row 4
   490         movd        mm0, [rax]                  ; Copy four bytes to mm0
   491         punpcklbw   mm0, mm6                    ; unpack to higher precision
   492         punpcklbw   mm1, mm6
   493         psubsw      mm0, mm1                    ; A-B (low order) to MM0
   494         pmaddwd     mm0, mm0                    ; square and accumulate
   495         paddd       mm7, mm0                    ; accumulate in mm7
   497         movq        mm0,    mm7                 ; mm0 = both partial sums
   498         psrlq       mm7,    32                  ; bring the high partial sum down to the low dword
   500         paddd       mm0,    mm7                 ; low dword of mm0 = low sum + high sum
   501         movq        rax,    mm0                 ; result is in the low 32 bits (return type is unsigned int)
   504     ; begin epilog
   505     pop rbx
   506     pop rdi
   507     pop rsi
   508     UNSHADOW_ARGS
   509     pop         rbp
   510     ret
   512 %define mmx_filter_shift            7
       ; mmx_filter_shift is the arithmetic right shift (psraw) applied to every
       ; two-tap filter sum below.
   514 ;void vp8_filter_block2d_bil4x4_var_mmx
   515 ;(
   516 ;    unsigned char *ref_ptr,
   517 ;    int ref_pixels_per_line,
   518 ;    unsigned char *src_ptr,
   519 ;    int src_pixels_per_line,
   520 ;    unsigned short *HFilter,
   521 ;    unsigned short *VFilter,
   522 ;    int *sum,
   523 ;    unsigned int *sumsquared
   524 ;)
       ;
       ; Filters ref_ptr with a two-tap horizontal pass (HFilter) followed by a
       ; two-tap vertical pass (VFilter) over four output rows of four pixels,
       ; subtracts the matching src_ptr row from each filtered row, and stores
       ; the sum of those differences to *sum and the sum of their squares to
       ; *sumsquared.
       ;   rsi / rdi - ref pointer / src pointer
       ;   rax / rdx - HFilter / VFilter; taps are read at [reg] and [reg+8]
       ;   rcx       - output rows remaining
       ;   mm0       - zero, used as the high half when widening bytes to words
       ;   mm5       - horizontally filtered previous ref row
       ;   mm6       - four 16-bit running sums of differences
       ;   mm7       - two 32-bit running sums of squared differences
       ; NOTE(review): mmx_bi_rd is defined outside this view; presumably the
       ; rounding constant added before the shift - confirm.
   525 global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
   526 sym(vp8_filter_block2d_bil4x4_var_mmx):
   527     push        rbp
   528     mov         rbp, rsp
   529     SHADOW_ARGS_TO_STACK 8
   530     GET_GOT     rbx
   531     push rsi
   532     push rdi
   533     sub         rsp, 16                         ; reserved stack; not referenced in this function
   534     ; end prolog
   537         pxor            mm6,            mm6                 ; mm6 = 0 (sum of differences)
   538         pxor            mm7,            mm7                 ; mm7 = 0 (sum of squared differences)
   540         mov             rax,            arg(4) ;HFilter
   541         mov             rdx,            arg(5) ;VFilter
   543         mov             rsi,            arg(0) ;ref_ptr
   544         mov             rdi,            arg(2) ;src_ptr
   546         mov             rcx,            4                   ; four output rows
   547         pxor            mm0,            mm0                 ; mm0 = 0 (zero for unpacking)
   549         movd            mm1,            [rsi]               ; first ref row, pixels x..x+3
   550         movd            mm3,            [rsi+1]             ; first ref row, pixels x+1..x+4
   552         punpcklbw       mm1,            mm0                 ; widen to words
   553         pmullw          mm1,            [rax]               ; * first horizontal tap
   555         punpcklbw       mm3,            mm0                 ; widen to words
   556         pmullw          mm3,            [rax+8]             ; * second horizontal tap
   558         paddw           mm1,            mm3                 ; sum of both taps
   559         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; add mmx_bi_rd before shifting
   561         psraw           mm1,            mmx_filter_shift    ; scale back down
   562         movq            mm5,            mm1                 ; keep as "previous row" for the vertical pass
   564 %if ABI_IS_32BIT
   565         add             rsi, dword ptr  arg(1) ;ref_pixels_per_line
   566 %else
   567         movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line
   568         add             rsi, r8
   569 %endif
   571 .filter_block2d_bil4x4_var_mmx_loop:
   573         movd            mm1,            [rsi]               ; current ref row, pixels x..x+3
   574         movd            mm3,            [rsi+1]             ; current ref row, pixels x+1..x+4
   576         punpcklbw       mm1,            mm0                 ; widen to words
   577         pmullw          mm1,            [rax]               ; * first horizontal tap
   579         punpcklbw       mm3,            mm0                 ; widen to words
   580         pmullw          mm3,            [rax+8]             ; * second horizontal tap
   582         paddw           mm1,            mm3                 ; sum of both taps
   583         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; add mmx_bi_rd before shifting
   585         psraw           mm1,            mmx_filter_shift    ; mm1 = horizontally filtered current row
   586         movq            mm3,            mm5                 ; mm3 = horizontally filtered previous row
   588         movq            mm5,            mm1                 ; current row becomes the next pass's previous row
   589         pmullw          mm3,            [rdx]               ; previous row * first vertical tap
   591         pmullw          mm1,            [rdx+8]             ; current row * second vertical tap
   592         paddw           mm1,            mm3                 ; sum of both taps
   595         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ; add mmx_bi_rd before shifting
   596         psraw           mm1,            mmx_filter_shift    ; mm1 = fully filtered row
   598         movd            mm3,            [rdi]               ; four src pixels
   599         punpcklbw       mm3,            mm0                 ; widen to words
   601         psubw           mm1,            mm3                 ; filtered ref - src
   602         paddw           mm6,            mm1                 ; accumulate differences
   604         pmaddwd         mm1,            mm1                 ; square, add adjacent pairs
   605         paddd           mm7,            mm1                 ; accumulate squared differences
   607 %if ABI_IS_32BIT
   608         add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
   609         add             rdi,            dword ptr arg(3) ;src_pixels_per_line
   610 %else
   611         movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
   612         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
   613         add             rsi,            r8
   614         add             rdi,            r9
   615 %endif
   616         sub             rcx,            1                   ; one row done
   617         jnz             .filter_block2d_bil4x4_var_mmx_loop       ; loop until all four rows are done
   620         pxor            mm3,            mm3                 ;
   621         pxor            mm2,            mm2                 ;
   623         punpcklwd       mm2,            mm6                 ; low two sums placed in the upper half of each dword
   624         punpckhwd       mm3,            mm6                 ; high two sums placed in the upper half of each dword
   626         paddd           mm2,            mm3                 ; pairwise add
   627         movq            mm6,            mm2                 ;
   629         psrlq           mm6,            32                  ; bring the high dword down
   630         paddd           mm2,            mm6                 ; low dword = (sum of all four words) << 16
   632         psrad           mm2,            16                  ; arithmetic shift back down, sign-extending the total
   633         movq            mm4,            mm7                 ;
   635         psrlq           mm4,            32                  ; bring the high squared sum down
   636         paddd           mm4,            mm7                 ; low dword = low + high squared sums
   638         mov             rdi,            arg(6) ;sum
   639         mov             rsi,            arg(7) ;sumsquared
   641         movd            dword ptr [rdi],          mm2                 ; *sum
   642         movd            dword ptr [rsi],          mm4                 ; *sumsquared
   646     ; begin epilog
   647     add rsp, 16
   648     pop rdi
   649     pop rsi
   650     RESTORE_GOT
   651     UNSHADOW_ARGS
   652     pop         rbp
   653     ret
   658 ;void vp8_filter_block2d_bil_var_mmx
   659 ;(
   660 ;    unsigned char *ref_ptr,
   661 ;    int ref_pixels_per_line,
   662 ;    unsigned char *src_ptr,
   663 ;    int src_pixels_per_line,
   664 ;    unsigned int Height,
   665 ;    unsigned short *HFilter,
   666 ;    unsigned short *VFilter,
   667 ;    int *sum,
   668 ;    unsigned int *sumsquared
   669 ;)
   670 global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
   671 sym(vp8_filter_block2d_bil_var_mmx):
   672     push        rbp
   673     mov         rbp, rsp
   674     SHADOW_ARGS_TO_STACK 9
   675     GET_GOT     rbx
   676     push rsi
   677     push rdi
   678     sub         rsp, 16
   679     ; end prolog
   681         pxor            mm6,            mm6                 ;
   682         pxor            mm7,            mm7                 ;
   683         mov             rax,            arg(5) ;HFilter             ;
   685         mov             rdx,            arg(6) ;VFilter             ;
   686         mov             rsi,            arg(0) ;ref_ptr              ;
   688         mov             rdi,            arg(2) ;src_ptr              ;
   689         movsxd          rcx,            dword ptr arg(4) ;Height              ;
   691         pxor            mm0,            mm0                 ;
   692         movq            mm1,            [rsi]               ;
   694         movq            mm3,            [rsi+1]             ;
   695         movq            mm2,            mm1                 ;
   697         movq            mm4,            mm3                 ;
   698         punpcklbw       mm1,            mm0                 ;
   700         punpckhbw       mm2,            mm0                 ;
   701         pmullw          mm1,            [rax]               ;
   703         pmullw          mm2,            [rax]               ;
   704         punpcklbw       mm3,            mm0                 ;
   706         punpckhbw       mm4,            mm0                 ;
   707         pmullw          mm3,            [rax+8]             ;
   709         pmullw          mm4,            [rax+8]             ;
   710         paddw           mm1,            mm3                 ;
   712         paddw           mm2,            mm4                 ;
   713         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
   715         psraw           mm1,            mmx_filter_shift    ;
   716         paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
   718         psraw           mm2,            mmx_filter_shift    ;
   719         movq            mm5,            mm1
   721         packuswb        mm5,            mm2                 ;
   722 %if ABI_IS_32BIT
   723         add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
   724 %else
   725         movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
   726         add             rsi,            r8
   727 %endif
   729 .filter_block2d_bil_var_mmx_loop:
   731         movq            mm1,            [rsi]               ;
   732         movq            mm3,            [rsi+1]             ;
   734         movq            mm2,            mm1                 ;
   735         movq            mm4,            mm3                 ;
   737         punpcklbw       mm1,            mm0                 ;
   738         punpckhbw       mm2,            mm0                 ;
   740         pmullw          mm1,            [rax]               ;
   741         pmullw          mm2,            [rax]               ;
   743         punpcklbw       mm3,            mm0                 ;
   744         punpckhbw       mm4,            mm0                 ;
   746         pmullw          mm3,            [rax+8]             ;
   747         pmullw          mm4,            [rax+8]             ;
   749         paddw           mm1,            mm3                 ;
   750         paddw           mm2,            mm4                 ;
   752         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
   753         psraw           mm1,            mmx_filter_shift    ;
   755         paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
   756         psraw           mm2,            mmx_filter_shift    ;
   758         movq            mm3,            mm5                 ;
   759         movq            mm4,            mm5                 ;
   761         punpcklbw       mm3,            mm0                 ;
   762         punpckhbw       mm4,            mm0                 ;
   764         movq            mm5,            mm1                 ;
   765         packuswb        mm5,            mm2                 ;
   767         pmullw          mm3,            [rdx]               ;
   768         pmullw          mm4,            [rdx]               ;
   770         pmullw          mm1,            [rdx+8]             ;
   771         pmullw          mm2,            [rdx+8]             ;
   773         paddw           mm1,            mm3                 ;
   774         paddw           mm2,            mm4                 ;
   776         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
   777         paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
   779         psraw           mm1,            mmx_filter_shift    ;
   780         psraw           mm2,            mmx_filter_shift    ;
   782         movq            mm3,            [rdi]               ;
   783         movq            mm4,            mm3                 ;
   785         punpcklbw       mm3,            mm0                 ;
   786         punpckhbw       mm4,            mm0                 ;
   788         psubw           mm1,            mm3                 ;
   789         psubw           mm2,            mm4                 ;
   791         paddw           mm6,            mm1                 ;
   792         pmaddwd         mm1,            mm1                 ;
   794         paddw           mm6,            mm2                 ;
   795         pmaddwd         mm2,            mm2                 ;
   797         paddd           mm7,            mm1                 ;
   798         paddd           mm7,            mm2                 ;
   800 %if ABI_IS_32BIT
   801         add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
   802         add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
   803 %else
   804         movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
   805         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
   806         add             rsi,            r8
   807         add             rdi,            r9
   808 %endif
   809         sub             rcx,            1                   ;
   810         jnz             .filter_block2d_bil_var_mmx_loop       ;
   813         pxor            mm3,            mm3                 ;
   814         pxor            mm2,            mm2                 ;
   816         punpcklwd       mm2,            mm6                 ;
   817         punpckhwd       mm3,            mm6                 ;
   819         paddd           mm2,            mm3                 ;
   820         movq            mm6,            mm2                 ;
   822         psrlq           mm6,            32                  ;
   823         paddd           mm2,            mm6                 ;
   825         psrad           mm2,            16                  ;
   826         movq            mm4,            mm7                 ;
   828         psrlq           mm4,            32                  ;
   829         paddd           mm4,            mm7                 ;
   831         mov             rdi,            arg(7) ;sum
   832         mov             rsi,            arg(8) ;sumsquared
   834         movd            dword ptr [rdi],          mm2                 ;
   835         movd            dword ptr [rsi],          mm4                 ;
   837     ; begin epilog
   838     add rsp, 16
   839     pop rdi
   840     pop rsi
   841     RESTORE_GOT
   842     UNSHADOW_ARGS
   843     pop         rbp
   844     ret
   847 SECTION_RODATA
   848 ;short mmx_bi_rd[4] = { 64, 64, 64, 64};
   849 align 16
   850 mmx_bi_rd:
   851     times 4 dw 64

mercurial