media/libvpx/vp9/encoder/x86/vp9_variance_impl_mmx.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    ;------------------------------------------------------------------------
    ; unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
    ;
    ; Sum of squares of the 256 signed 16-bit values starting at src_ptr
    ; (16 loop iterations, 32 bytes = 16 words consumed per iteration).
    ;
    ; In:    arg(0) = src_ptr
    ; Out:   rax    = lane0 + lane1 of the 32-bit partial sums held in mm4
    ; Uses:  rax, rcx, mm0-mm4, 8 bytes of stack scratch
    ;
    ; sym(), arg(), SHADOW_ARGS_TO_STACK, GET_GOT and friends are macros
    ; supplied by vpx_ports/x86_abi_support.asm (not visible in this file).
    ; NOTE(review): the prolog shadows 7 arguments and sets up the GOT even
    ; though the prototype takes one argument and no global data is
    ; referenced below -- left as-is, confirm whether this is intentional.
    ;------------------------------------------------------------------------
    global sym(vp9_get_mb_ss_mmx) PRIVATE
    sym(vp9_get_mb_ss_mmx):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 7
        GET_GOT     rbx
        push rsi
        push rdi
        sub         rsp, 8                      ; scratch qword for the mm4 spill
        ; end prolog

            mov         rax, arg(0)             ; rax = src_ptr
            mov         rcx, 16                 ; rcx = iterations remaining
            pxor        mm4, mm4                ; mm4 = two 32-bit running sums

    .NEXTROW:
            movq        mm0, [rax]
            movq        mm1, [rax+8]
            movq        mm2, [rax+16]
            movq        mm3, [rax+24]

            pmaddwd     mm0, mm0                ; square words, add adjacent pairs
            pmaddwd     mm1, mm1
            pmaddwd     mm2, mm2
            pmaddwd     mm3, mm3

            paddd       mm4, mm0
            paddd       mm4, mm1
            paddd       mm4, mm2
            paddd       mm4, mm3

            add         rax, 32                 ; next 16 words
            dec         rcx
            ; dec leaves CF untouched, so the loop must branch on ZF alone.
            ; (The previous 'ja' also tested CF and only worked because the
            ; 'add' above happened to leave CF clear.)
            jnz         .NEXTROW

            movq        QWORD PTR [rsp], mm4

            ;return sum[0]+sum[1];
            movsxd      rax, dword ptr [rsp]
            movsxd      rcx, dword ptr [rsp+4]
            add         rax, rcx

        ; begin epilog
        add rsp, 8
        pop rdi
        pop rsi
        RESTORE_GOT
        UNSHADOW_ARGS
        pop         rbp
        ret
    ;------------------------------------------------------------------------
    ; unsigned int vp9_get8x8var_mmx
    ; (
    ;     unsigned char *src_ptr,
    ;     int  source_stride,
    ;     unsigned char *ref_ptr,
    ;     int  recon_stride,
    ;     unsigned int *SSE,
    ;     int *Sum
    ; )
    ;
    ; Walks 8 rows of 8 bytes from src_ptr and ref_ptr. Stores the sum of
    ; the per-byte differences (src - ref) to *Sum, the sum of the squared
    ; differences to *SSE, and returns 0.
    ;
    ; Register roles:
    ;   rax = current src row          rcx = source_stride (sign-extended)
    ;   rbx = current ref row          rdx = recon_stride  (sign-extended)
    ;   mm5 = four 16-bit running sums of differences
    ;   mm6 = zero, used to widen bytes to words
    ;   mm7 = two 32-bit running sums of squared differences
    ; Stack scratch: [rsp] = mm7 spill, [rsp+8] = mm5 spill
    ;------------------------------------------------------------------------
    global sym(vp9_get8x8var_mmx) PRIVATE
    sym(vp9_get8x8var_mmx):
        push        rbp
        mov         rbp, rsp
        SHADOW_ARGS_TO_STACK 6
        push rsi
        push rdi
        push rbx
        sub         rsp, 16
        ; end prolog

            mov         rax, arg(0)                 ; src_ptr
            movsxd      rcx, dword ptr arg(1)       ; source_stride
            mov         rbx, arg(2)                 ; ref_ptr
            movsxd      rdx, dword ptr arg(3)       ; recon_stride

            pxor        mm7, mm7                    ; squared-difference accumulator
            pxor        mm6, mm6                    ; constant zero
            pxor        mm5, mm5                    ; difference accumulator

    ; One expansion per row of the 8x8 block.
    %rep 8
            movq        mm0, [rax]                  ; 8 src bytes
            movq        mm1, [rbx]                  ; 8 ref bytes
            movq        mm2, mm0
            movq        mm3, mm1

            punpcklbw   mm0, mm6                    ; src bytes 0-3 -> words
            punpckhbw   mm2, mm6                    ; src bytes 4-7 -> words
            punpcklbw   mm1, mm6                    ; ref bytes 0-3 -> words
            punpckhbw   mm3, mm6                    ; ref bytes 4-7 -> words

            psubsw      mm0, mm1                    ; src - ref, low half
            psubsw      mm2, mm3                    ; src - ref, high half

            paddw       mm5, mm0                    ; Sum lanes += differences
            paddw       mm5, mm2

            pmaddwd     mm0, mm0                    ; square and pair-add
            pmaddwd     mm2, mm2
            paddd       mm7, mm0                    ; SSE lanes += squares
            paddd       mm7, mm2

            add         rax, rcx                    ; next src row
            add         rbx, rdx                    ; next ref row
    %endrep

            ; Spill both accumulators and reduce their lanes with scalar code.
            movq        QWORD PTR [rsp], mm7
            movq        QWORD PTR [rsp+8], mm5

            movsx       rdx, WORD PTR [rsp+8]
            movsx       rcx, WORD PTR [rsp+10]
            add         rdx, rcx
            movsx       rbx, WORD PTR [rsp+12]
            movsx       rax, WORD PTR [rsp+14]
            add         rbx, rax
            add         rdx, rbx                    ; rdx = Sum

            movsxd      rax, DWORD PTR [rsp]
            movsxd      rcx, DWORD PTR [rsp+4]
            add         rax, rcx                    ; rax = SSE

            mov         rsi, arg(4)                 ; SSE output pointer
            mov         rdi, arg(5)                 ; Sum output pointer
            mov         dword ptr [rsi], eax
            mov         dword ptr [rdi], edx

            xor         rax, rax                    ; return 0

        ; begin epilog
        add rsp, 16
        pop rbx
        pop rdi
        pop rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
   ;------------------------------------------------------------------------
   ; unsigned int
   ; vp9_get4x4var_mmx
   ; (
   ;     unsigned char *src_ptr,
   ;     int  source_stride,
   ;     unsigned char *ref_ptr,
   ;     int  recon_stride,
   ;     unsigned int *SSE,
   ;     int *Sum
   ; )
   ;
   ; Walks 4 rows of 4 bytes from src_ptr and ref_ptr. Stores the sum of
   ; the per-byte differences (src - ref) to *Sum, the sum of the squared
   ; differences to *SSE, and returns 0.
   ;
   ; Register roles:
   ;   rax = current src row          rcx = source_stride (sign-extended)
   ;   rbx = current ref row          rdx = recon_stride  (sign-extended)
   ;   mm5 = four 16-bit running sums of differences
   ;   mm6 = zero, used to widen bytes to words
   ;   mm7 = two 32-bit running sums of squared differences
   ; Stack scratch: [rsp] = mm7 spill, [rsp+8] = mm5 spill
   ;------------------------------------------------------------------------
   global sym(vp9_get4x4var_mmx) PRIVATE
   sym(vp9_get4x4var_mmx):
       push        rbp
       mov         rbp, rsp
       SHADOW_ARGS_TO_STACK 6
       push rsi
       push rdi
       push rbx
       sub         rsp, 16
       ; end prolog

           mov         rax, arg(0)                 ; src_ptr
           movsxd      rcx, dword ptr arg(1)       ; source_stride
           mov         rbx, arg(2)                 ; ref_ptr
           movsxd      rdx, dword ptr arg(3)       ; recon_stride

           pxor        mm7, mm7                    ; squared-difference accumulator
           pxor        mm6, mm6                    ; constant zero
           pxor        mm5, mm5                    ; difference accumulator

   ; Rows 1-3: process the row, then step both pointers to the next one.
   %rep 3
           movd        mm0, [rax]                  ; 4 src bytes
           movd        mm1, [rbx]                  ; 4 ref bytes
           punpcklbw   mm0, mm6                    ; src bytes -> words
           punpcklbw   mm1, mm6                    ; ref bytes -> words
           psubsw      mm0, mm1                    ; src - ref
           paddw       mm5, mm0                    ; Sum lanes += differences
           pmaddwd     mm0, mm0                    ; square and pair-add
           paddd       mm7, mm0                    ; SSE lanes += squares
           add         rax, rcx                    ; next src row
           add         rbx, rdx                    ; next ref row
   %endrep

           ; Row 4: last row, so the pointers are not advanced afterwards.
           movd        mm0, [rax]
           movd        mm1, [rbx]
           punpcklbw   mm0, mm6
           punpcklbw   mm1, mm6
           psubsw      mm0, mm1
           paddw       mm5, mm0
           pmaddwd     mm0, mm0
           paddd       mm7, mm0

           ; Spill both accumulators and reduce their lanes with scalar code.
           movq        QWORD PTR [rsp], mm7
           movq        QWORD PTR [rsp+8], mm5

           movsx       rdx, WORD PTR [rsp+8]
           movsx       rcx, WORD PTR [rsp+10]
           add         rdx, rcx
           movsx       rbx, WORD PTR [rsp+12]
           movsx       rax, WORD PTR [rsp+14]
           add         rbx, rax
           add         rdx, rbx                    ; rdx = Sum

           movsxd      rax, DWORD PTR [rsp]
           movsxd      rcx, DWORD PTR [rsp+4]
           add         rax, rcx                    ; rax = SSE

           mov         rsi, arg(4)                 ; SSE output pointer
           mov         rdi, arg(5)                 ; Sum output pointer
           mov         dword ptr [rsi], eax
           mov         dword ptr [rdi], edx

           xor         rax, rax                    ; return 0

       ; begin epilog
       add rsp, 16
       pop rbx
       pop rdi
       pop rsi
       UNSHADOW_ARGS
       pop         rbp
       ret
   ;------------------------------------------------------------------------
   ; unsigned int
   ; vp9_get4x4sse_cs_mmx
   ; (
   ;     unsigned char *src_ptr,
   ;     int  source_stride,
   ;     unsigned char *ref_ptr,
   ;     int  recon_stride
   ; )
   ;
   ; Walks 4 rows of 4 bytes from src_ptr and ref_ptr and returns the sum
   ; of the squared per-byte differences (src - ref).
   ;
   ; Register roles:
   ;   rax = current src row          rcx = source_stride (sign-extended)
   ;   rbx = current ref row          rdx = recon_stride  (sign-extended)
   ;   mm6 = zero, used to widen bytes to words
   ;   mm7 = two 32-bit running sums of squared differences
   ;------------------------------------------------------------------------
   global sym(vp9_get4x4sse_cs_mmx) PRIVATE
   sym(vp9_get4x4sse_cs_mmx):
       push        rbp
       mov         rbp, rsp
       SHADOW_ARGS_TO_STACK 4
       push rsi
       push rdi
       push rbx
       ; end prolog

           mov         rax, arg(0)                 ; src_ptr
           movsxd      rcx, dword ptr arg(1)       ; source_stride
           mov         rbx, arg(2)                 ; ref_ptr
           movsxd      rdx, dword ptr arg(3)       ; recon_stride

           pxor        mm7, mm7                    ; squared-difference accumulator
           pxor        mm6, mm6                    ; constant zero

   ; Rows 1-3: process the row, then step both pointers to the next one.
   %rep 3
           movd        mm0, [rax]                  ; 4 src bytes
           movd        mm1, [rbx]                  ; 4 ref bytes
           punpcklbw   mm0, mm6                    ; src bytes -> words
           punpcklbw   mm1, mm6                    ; ref bytes -> words
           psubsw      mm0, mm1                    ; src - ref
           pmaddwd     mm0, mm0                    ; square and pair-add
           paddd       mm7, mm0                    ; accumulate
           add         rax, rcx                    ; next src row
           add         rbx, rdx                    ; next ref row
   %endrep

           ; Row 4: last row, so the pointers are not advanced afterwards.
           movd        mm0, [rax]
           movd        mm1, [rbx]
           punpcklbw   mm0, mm6
           punpcklbw   mm1, mm6
           psubsw      mm0, mm1
           pmaddwd     mm0, mm0
           paddd       mm7, mm0

           ; Fold the two 32-bit lanes: low dword of mm0 = lane0 + lane1.
           movq        mm0,    mm7
           psrlq       mm7,    32                  ; lane1 moves down, high dword = 0
           paddd       mm0,    mm7
           ; The low 32 bits of rax carry the result; the high dword of mm0
           ; still holds the lane1 partial sum and is copied along with it.
           movq        rax,    mm0

       ; begin epilog
       pop rbx
       pop rdi
       pop rsi
       UNSHADOW_ARGS
       pop         rbp
       ret

mercurial