media/libvpx/vp8/common/x86/subpixel_mmx.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    13 extern sym(vp8_bilinear_filters_x86_8)
    16 %define BLOCK_HEIGHT_WIDTH 4 ; not referenced in the visible part of this file
    17 %define vp8_filter_weight 128 ; equals 1 << VP8_FILTER_SHIFT; each six-tap group in vp8_six_tap_mmx sums to it; not referenced in the visible part of this file
    18 %define VP8_FILTER_SHIFT  7 ; psraw count applied to every filter sum after the rd rounding constant has been added
    21 ;void vp8_filter_block1d_h6_mmx  -- horizontal 6-tap filter: each row turns source bytes p-2..p6 into four 16-bit outputs
    22 ;(
    23 ;    unsigned char   *src_ptr,              -- arg(0): p0 of the first row; bytes src_ptr-2 .. src_ptr+6 are read per row
    24 ;    unsigned short  *output_ptr,           -- arg(1): destination; four words (8 bytes) are stored per row
    25 ;    unsigned int    src_pixels_per_line,   -- arg(2): added to the source pointer after every row
    26 ;    unsigned int    pixel_step,            -- arg(3): never read by this routine
    27 ;    unsigned int    output_height,         -- arg(4): row count; tested only at the loop bottom, so it must be >= 1
    28 ;    unsigned int    output_width,          -- arg(5): added to output_ptr (as a byte count) after every row
    29 ;    short           * vp8_filter           -- arg(6): six taps spaced 16 bytes apart; the first four words of each are read
    30 ;)
    31 global sym(vp8_filter_block1d_h6_mmx) PRIVATE
    32 sym(vp8_filter_block1d_h6_mmx):
    33     push        rbp
    34     mov         rbp, rsp
    35     SHADOW_ARGS_TO_STACK 7
    36     GET_GOT     rbx
    37     push        rsi
    38     push        rdi
    39     ; end prolog (SHADOW_ARGS_TO_STACK, GET_GOT and arg() are macros from x86_abi_support.asm, which is not shown here)
    41         mov         rdx,    arg(6) ;vp8_filter
    43         movq        mm1,    [rdx + 16]             ; mm1 = tap 1; taps 1 and 4 are accumulated first (they are the non-positive ones in vp8_six_tap_mmx)
    44         movq        mm2,    [rdx + 32]         ; mm2 = tap 2
    45         movq        mm6,    [rdx + 48]        ; mm6 = tap 3
    46         movq        mm7,    [rdx + 64]        ; mm7 = tap 4 (taps 0 and 5 stay in memory and are read inside the loop)
    48         mov         rdi,    arg(1) ;output_ptr
    49         mov         rsi,    arg(0) ;src_ptr
    50         movsxd      rcx,    dword ptr arg(4) ;output_height
    51         movsxd      rax,    dword ptr arg(5) ;output_width, used below as the byte step between output rows
    52         pxor        mm0,    mm0              ; mm0 = 0, supplies the high bytes when unpacking bytes to words
    54 .nextrow:
    55         movq        mm3,    [rsi-2]          ; mm3 = p-2..p5
    56         movq        mm4,    mm3              ; mm4 = p-2..p5
    57         psrlq       mm3,    8                ; mm3 = p-1..p5
    58         punpcklbw   mm3,    mm0              ; mm3 = p-1..p2 as words
    59         pmullw      mm3,    mm1              ; mm3 *= tap 1
    61         movq        mm5,    mm4              ; mm5 = p-2..p5
    62         punpckhbw   mm4,    mm0              ; mm4 = p2..p5 as words
    63         pmullw      mm4,    mm7              ; mm4 *= tap 4
    64         paddsw      mm3,    mm4              ; mm3 += mm4 (signed saturating add, as for every accumulation below)
    66         movq        mm4,    mm5              ; mm4 = p-2..p5
    67         psrlq       mm5,    16               ; mm5 = p0..p5
    68         punpcklbw   mm5,    mm0              ; mm5 = p0..p3 as words
    69         pmullw      mm5,    mm2              ; mm5 *= tap 2
    70         paddsw      mm3,    mm5              ; mm3 += mm5
    72         movq        mm5,    mm4              ; mm5 = p-2..p5, kept for the tap 0 product below
    73         psrlq       mm4,    24               ; mm4 = p1..p5
    74         punpcklbw   mm4,    mm0              ; mm4 = p1..p4 as words
    75         pmullw      mm4,    mm6              ; mm4 *= tap 3
    76         paddsw      mm3,    mm4              ; mm3 += mm4
    78         ; outer taps (5, then 0) are applied last, straight from the filter memory
    79         movd        mm4,    [rsi+3]          ; mm4 = p3..p6 (a separate load: p6 lies outside the 8 bytes read at rsi-2)
    80         punpcklbw   mm4,    mm0              ; mm4 = p3..p6 as words
    81         pmullw      mm4,    [rdx+80]         ; mm4 *= tap 5
    82         paddsw      mm3,    mm4              ; mm3 += mm4
    84         punpcklbw   mm5,    mm0              ; mm5 = p-2..p1 as words
    85         pmullw      mm5,    [rdx]            ; mm5 *= tap 0
    86         paddsw      mm3,    mm5              ; mm3 += mm5
    88         paddsw      mm3,    [GLOBAL(rd)]              ; mm3 += round value (0x40 per word)
    89         psraw       mm3,    VP8_FILTER_SHIFT     ; mm3 >>= 7 (arithmetic), i.e. divide by 128
    90         packuswb    mm3,    mm0              ; pack to bytes with unsigned saturation: clamps each result to 0..255
    91         punpcklbw   mm3,    mm0              ; widen the four clamped bytes back to words
    93         movq        [rdi],  mm3              ; store four 16-bit results in the destination
    95 %if ABI_IS_32BIT
    96         add         rsi,    dword ptr arg(2) ;src_pixels_per_line ; next line
    97         add         rdi,    rax; advance output_ptr by output_width bytes
    98 %else
    99         movsxd      r8,     dword ptr arg(2) ;src_pixels_per_line
   100         add         rdi,    rax; advance output_ptr by output_width bytes
   102         add         rsi,    r8               ; next line
   103 %endif
   105         dec         rcx                      ; decrement count
   106         jnz         .nextrow                 ; next row
   108     ; begin epilog
   109     pop rdi
   110     pop rsi
   111     RESTORE_GOT
   112     UNSHADOW_ARGS
   113     pop         rbp
   114     ret
   117 ;void vp8_filter_block1dc_v6_mmx  -- vertical 6-tap filter over rows of 16-bit samples; stores four clamped bytes per output row
   118 ;(
   119 ;   short *src_ptr,                 -- arg(0): row 0 of the first output row; rows -2..+3 around the current row are read
   120 ;   unsigned char *output_ptr,      -- arg(1): destination; 4 bytes are stored per row
   121 ;    int output_pitch,              -- arg(2): added to output_ptr after every row
   122 ;   unsigned int pixels_per_line,   -- arg(3): distance between source rows, applied to src_ptr as a byte offset
   123 ;   unsigned int pixel_step,        -- arg(4): never read by this routine
   124 ;   unsigned int output_height,     -- arg(5): row count; tested only at the loop bottom, so it must be >= 1
   125 ;   unsigned int output_width,      -- arg(6): never read by this routine
   126 ;   short * vp8_filter              -- arg(7): six taps spaced 16 bytes apart; the first four words of each are read
   127 ;)
   128 global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
   129 sym(vp8_filter_block1dc_v6_mmx):
   130     push        rbp
   131     mov         rbp, rsp
   132     SHADOW_ARGS_TO_STACK 8
   133     GET_GOT     rbx
   134     push        rsi
   135     push        rdi
   136     ; end prolog (SHADOW_ARGS_TO_STACK, GET_GOT and arg() are macros from x86_abi_support.asm, which is not shown here)
   138         movq      mm5, [GLOBAL(rd)] ; mm5 = round value, loaded before rbx is repurposed. NOTE(review): GLOBAL() presumably depends on the register set up by GET_GOT in PIC builds -- confirm in x86_abi_support.asm
   139         push        rbx ; rbx is about to hold the filter pointer; restored by the pop rbx after the loop
   140         mov         rbx, arg(7) ;vp8_filter
   141         movq      mm1, [rbx + 16]             ; mm1 = tap 1; taps 1 and 4 are accumulated first (they are the non-positive ones in vp8_six_tap_mmx)
   142         movq      mm2, [rbx + 32]         ; mm2 = tap 2
   143         movq      mm6, [rbx + 48]        ; mm6 = tap 3
   144         movq      mm7, [rbx + 64]        ; mm7 = tap 4 (taps 0 and 5 stay in memory and are read inside the loop)
   146         movsxd      rdx, dword ptr arg(3) ;pixels_per_line
   147         mov         rdi, arg(1) ;output_ptr
   148         mov         rsi, arg(0) ;src_ptr
   149         sub         rsi, rdx ; rsi = row -1
   150         sub         rsi, rdx ; rsi = row -2, the first row the filter needs
   151         movsxd      rcx, DWORD PTR arg(5) ;output_height
   152         movsxd      rax, DWORD PTR arg(2) ;output_pitch, the step between destination rows
   153         pxor        mm0, mm0              ; mm0 = 00000000
   156 .nextrow_cv:
   157         movq        mm3, [rsi+rdx]        ; mm3 = p0..p3  = row -1
   158         pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.
   161         movq        mm4, [rsi + 4*rdx]      ; mm4 = p0..p3  = row 2
   162         pmullw      mm4, mm7              ; mm4 *= kernel 4 modifiers.
   163         paddsw      mm3, mm4              ; mm3 += mm4
   165         movq        mm4, [rsi + 2*rdx]           ; mm4 = p0..p3  = row 0
   166         pmullw      mm4, mm2              ; mm4 *= kernel 2 modifiers.
   167         paddsw      mm3, mm4              ; mm3 += mm4
   169         movq        mm4, [rsi]            ; mm4 = p0..p3  = row -2
   170         pmullw      mm4, [rbx]            ; mm4 *= kernel 0 modifiers.
   171         paddsw      mm3, mm4              ; mm3 += mm4
   174         add         rsi, rdx              ; move source forward 1 line to avoid 3 * pitch; this is also the only per-row source advance
   175         movq        mm4, [rsi + 2*rdx]     ; mm4 = p0..p3  = row 1
   176         pmullw      mm4, mm6              ; mm4 *= kernel 3 modifiers.
   177         paddsw      mm3, mm4              ; mm3 += mm4
   179         movq        mm4, [rsi + 4*rdx]    ; mm4 = p0..p3  = row 3
   180         pmullw      mm4, [rbx +80]        ; mm4 *= kernel 5 modifiers.
   181         paddsw      mm3, mm4              ; mm3 += mm4
   184         paddsw      mm3, mm5               ; mm3 += round value
   185         psraw       mm3, VP8_FILTER_SHIFT     ; mm3 >>= 7 (arithmetic), i.e. divide by 128
   186         packuswb    mm3, mm0              ; pack to bytes with unsigned saturation: clamps each result to 0..255
   188         movd        [rdi],mm3             ; store four result bytes in the destination
   189         ; Each iteration re-reads five of the six source rows loaded by the
   190         ; previous one.  The original authors expected the block to be in cache,
   191         ; so the cost should be small, but the repeated loads are avoidable.
   192         lea         rdi,  [rdi+rax] ; advance output_ptr by output_pitch
   193         dec         rcx                   ; decrement count
   194         jnz         .nextrow_cv           ; next row
   196         pop         rbx ; undo the push rbx made before the filter pointer was loaded
   198     ; begin epilog
   199     pop rdi
   200     pop rsi
   201     RESTORE_GOT
   202     UNSHADOW_ARGS
   203     pop         rbp
   204     ret
   207 ;void vp8_bilinear_predict8x8_mmx  -- two-pass (horizontal, then vertical) 2-tap filter; stores 8 bytes per destination row
   208 ;(
   209 ;    unsigned char  *src_ptr,          -- arg(0): 9 bytes are read from each source row (8 at [row] and 8 at [row+1])
   210 ;    int   src_pixels_per_line,        -- arg(1): added to the source pointer after every row
   211 ;    int  xoffset,                     -- arg(2): selects the horizontal taps: table base + xoffset*32
   212 ;    int  yoffset,                     -- arg(3): selects the vertical taps: table base + yoffset*32
   213 ;   unsigned char *dst_ptr,            -- arg(4): destination of the first output row
   214 ;    int dst_pitch                     -- arg(5): added to the destination pointer after every row; the loop ends at dst_ptr + 8*dst_pitch
   215 ;)
   216 global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
   217 sym(vp8_bilinear_predict8x8_mmx):
   218     push        rbp
   219     mov         rbp, rsp
   220     SHADOW_ARGS_TO_STACK 6
   221     GET_GOT     rbx
   222     push        rsi
   223     push        rdi
   224     ; end prolog (SHADOW_ARGS_TO_STACK, GET_GOT and arg() are macros from x86_abi_support.asm, which is not shown here)
   226     ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
   227     ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
   229         movsxd      rax,        dword ptr arg(2) ;xoffset
   230         mov         rdi,        arg(4) ;dst_ptr
   232         shl         rax,        5 ; offset * 32: one table entry is read as two taps 16 bytes apart
   233         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
   235         add         rax,        rcx ; HFilter
   236         mov         rsi,        arg(0) ;src_ptr
   238         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
   239         movq        mm1,        [rax]               ; mm1 = HFilter tap 0 (four words), applied to src[x]
   241         movq        mm2,        [rax+16]            ; mm2 = HFilter tap 1 (four words), applied to src[x+1]
   242         movsxd      rax,        dword ptr arg(3) ;yoffset
   244         pxor        mm0,        mm0                 ; mm0 = 0, supplies the high bytes when unpacking bytes to words
   246         shl         rax,        5 ; offset*32
   247         add         rax,        rcx ; VFilter
   249         lea         rcx,        [rdi+rdx*8]          ; rcx = dst_ptr + 8*dst_pitch, the end pointer tested at the loop bottom
   250         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx is the source step from here on)
   254         ; horizontally filter the first source row; the packed result in mm7 primes the vertical pass
   255         movq        mm3,        [rsi]               ; mm3 = src[0..7]
   256         movq        mm4,        mm3                 ; make a copy of current line
   258         punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
   259         punpckhbw   mm4,        mm0                 ; mm4 = src[4..7] as words
   261         pmullw      mm3,        mm1                 ; *= HFilter tap 0
   262         pmullw      mm4,        mm1                 ; *= HFilter tap 0
   264         movq        mm5,        [rsi+1]             ; mm5 = src[1..8]
   265         movq        mm6,        mm5                 ; copy for the high half
   267         punpcklbw   mm5,        mm0                 ; mm5 = src[1..4] as words
   268         punpckhbw   mm6,        mm0                 ; mm6 = src[5..8] as words
   270         pmullw      mm5,        mm2                 ; *= HFilter tap 1
   271         pmullw      mm6,        mm2                 ; *= HFilter tap 1
   273         paddw       mm3,        mm5                 ; horizontal sum, pixels 0..3
   274         paddw       mm4,        mm6                 ; horizontal sum, pixels 4..7
   276         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
   277         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7, i.e. divide by 128
   279         paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
   280         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7
   282         movq        mm7,        mm3                 ; mm7 = low four results
   283         packuswb    mm7,        mm4                 ; mm7 = eight horizontal results packed to bytes (the "previous row")
   285         add         rsi,        rdx                 ; next line
   286 .next_row_8x8:
   287         movq        mm3,        [rsi]               ; mm3 = src[0..7] of the current row
   288         movq        mm4,        mm3                 ; make a copy of current line
   290         punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
   291         punpckhbw   mm4,        mm0                 ; mm4 = src[4..7] as words
   293         pmullw      mm3,        mm1                 ; *= HFilter tap 0
   294         pmullw      mm4,        mm1                 ; *= HFilter tap 0
   296         movq        mm5,        [rsi+1]             ; mm5 = src[1..8]
   297         movq        mm6,        mm5                 ; copy for the high half
   299         punpcklbw   mm5,        mm0                 ; mm5 = src[1..4] as words
   300         punpckhbw   mm6,        mm0                 ; mm6 = src[5..8] as words
   302         pmullw      mm5,        mm2                 ; *= HFilter tap 1
   303         pmullw      mm6,        mm2                 ; *= HFilter tap 1
   305         paddw       mm3,        mm5                 ; horizontal sum, pixels 0..3
   306         paddw       mm4,        mm6                 ; horizontal sum, pixels 4..7
   308         movq        mm5,        mm7                 ; mm7 holds the previous row's horizontal results (8 bytes)
   309         movq        mm6,        mm7                 ; copy for the high half
   311         punpcklbw   mm5,        mm0                 ; previous row, pixels 0..3 as words
   312         punpckhbw   mm6,        mm0 ; previous row, pixels 4..7 as words
   314         pmullw      mm5,        [rax]               ; previous row *= VFilter tap 0
   315         pmullw      mm6,        [rax]               ; previous row *= VFilter tap 0
   317         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
   318         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7: finished horizontal result, pixels 0..3
   320         paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
   321         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7: finished horizontal result, pixels 4..7
   323         movq        mm7,        mm3                 ; keep the current row's horizontal result ...
   324         packuswb    mm7,        mm4                 ; ... packed to bytes in mm7 for the next iteration
   327         pmullw      mm3,        [rax+16]            ; current row *= VFilter tap 1
   328         pmullw      mm4,        [rax+16]            ; current row *= VFilter tap 1
   330         paddw       mm3,        mm5                 ; add the previous row's contribution
   331         paddw       mm4,        mm6                 ; add the previous row's contribution
   334         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
   335         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7, i.e. divide by 128
   337         paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
   338         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7
   340         packuswb    mm3,        mm4 ; clamp to 0..255 and pack the eight results to bytes
   342         movq        [rdi],      mm3                 ; store the results in the destination
   344 %if ABI_IS_32BIT
   345         add         rsi,        rdx                 ; next line
   346         add         rdi,        dword ptr arg(5) ;dst_pitch
   347 %else
   348         movsxd      r8,         dword ptr arg(5) ;dst_pitch
   349         add         rsi,        rdx                 ; next line
   350         add         rdi,        r8                  ;dst_pitch
   351 %endif
   352         cmp         rdi,        rcx                 ; reached dst_ptr + 8*dst_pitch?
   353         jne         .next_row_8x8
   355     ; begin epilog
   356     pop rdi
   357     pop rsi
   358     RESTORE_GOT
   359     UNSHADOW_ARGS
   360     pop         rbp
   361     ret
   364 ;void vp8_bilinear_predict8x4_mmx  -- two-pass (horizontal, then vertical) 2-tap filter; stores 8 bytes per destination row
   365 ;(
   366 ;    unsigned char  *src_ptr,          -- arg(0): 9 bytes are read from each source row (8 at [row] and 8 at [row+1])
   367 ;    int   src_pixels_per_line,        -- arg(1): added to the source pointer after every row
   368 ;    int  xoffset,                     -- arg(2): selects the horizontal taps: table base + xoffset*32
   369 ;    int  yoffset,                     -- arg(3): selects the vertical taps: table base + yoffset*32
   370 ;    unsigned char *dst_ptr,           -- arg(4): destination of the first output row
   371 ;    int dst_pitch                     -- arg(5): added to the destination pointer after every row; the loop ends at dst_ptr + 4*dst_pitch
   372 ;)
   373 global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
   374 sym(vp8_bilinear_predict8x4_mmx):
   375     push        rbp
   376     mov         rbp, rsp
   377     SHADOW_ARGS_TO_STACK 6
   378     GET_GOT     rbx
   379     push        rsi
   380     push        rdi
   381     ; end prolog (SHADOW_ARGS_TO_STACK, GET_GOT and arg() are macros from x86_abi_support.asm, which is not shown here)
   383     ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
   384     ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
   386         movsxd      rax,        dword ptr arg(2) ;xoffset
   387         mov         rdi,        arg(4) ;dst_ptr
   389         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
   390         shl         rax,        5 ; offset * 32: one table entry is read as two taps 16 bytes apart
   392         mov         rsi,        arg(0) ;src_ptr
   393         add         rax,        rcx ; HFilter
   395         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
   396         movq        mm1,        [rax]               ; mm1 = HFilter tap 0 (four words), applied to src[x]
   398         movq        mm2,        [rax+16]            ; mm2 = HFilter tap 1 (four words), applied to src[x+1]
   399         movsxd      rax,        dword ptr arg(3) ;yoffset
   401         pxor        mm0,        mm0                 ; mm0 = 0, supplies the high bytes when unpacking bytes to words
   402         shl         rax,        5 ; offset * 32
   404         add         rax,        rcx ; VFilter
   405         lea         rcx,        [rdi+rdx*4]          ; rcx = dst_ptr + 4*dst_pitch, the end pointer tested at the loop bottom
   407         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx is the source step from here on)
   409         ; horizontally filter the first source row; the packed result in mm7 primes the vertical pass
   410         movq        mm3,        [rsi]               ; mm3 = src[0..7]
   411         movq        mm4,        mm3                 ; make a copy of current line
   413         punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
   414         punpckhbw   mm4,        mm0                 ; mm4 = src[4..7] as words
   416         pmullw      mm3,        mm1                 ; *= HFilter tap 0
   417         pmullw      mm4,        mm1                 ; *= HFilter tap 0
   419         movq        mm5,        [rsi+1]             ; mm5 = src[1..8]
   420         movq        mm6,        mm5                 ; copy for the high half
   422         punpcklbw   mm5,        mm0                 ; mm5 = src[1..4] as words
   423         punpckhbw   mm6,        mm0                 ; mm6 = src[5..8] as words
   425         pmullw      mm5,        mm2                 ; *= HFilter tap 1
   426         pmullw      mm6,        mm2                 ; *= HFilter tap 1
   428         paddw       mm3,        mm5                 ; horizontal sum, pixels 0..3
   429         paddw       mm4,        mm6                 ; horizontal sum, pixels 4..7
   431         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
   432         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7, i.e. divide by 128
   434         paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
   435         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7
   437         movq        mm7,        mm3                 ; mm7 = low four results
   438         packuswb    mm7,        mm4                 ; mm7 = eight horizontal results packed to bytes (the "previous row")
   440         add         rsi,        rdx                 ; next line
   441 .next_row_8x4:
   442         movq        mm3,        [rsi]               ; mm3 = src[0..7] of the current row
   443         movq        mm4,        mm3                 ; make a copy of current line
   445         punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
   446         punpckhbw   mm4,        mm0                 ; mm4 = src[4..7] as words
   448         pmullw      mm3,        mm1                 ; *= HFilter tap 0
   449         pmullw      mm4,        mm1                 ; *= HFilter tap 0
   451         movq        mm5,        [rsi+1]             ; mm5 = src[1..8]
   452         movq        mm6,        mm5                 ; copy for the high half
   454         punpcklbw   mm5,        mm0                 ; mm5 = src[1..4] as words
   455         punpckhbw   mm6,        mm0                 ; mm6 = src[5..8] as words
   457         pmullw      mm5,        mm2                 ; *= HFilter tap 1
   458         pmullw      mm6,        mm2                 ; *= HFilter tap 1
   460         paddw       mm3,        mm5                 ; horizontal sum, pixels 0..3
   461         paddw       mm4,        mm6                 ; horizontal sum, pixels 4..7
   463         movq        mm5,        mm7                 ; mm7 holds the previous row's horizontal results (8 bytes)
   464         movq        mm6,        mm7                 ; copy for the high half
   466         punpcklbw   mm5,        mm0                 ; previous row, pixels 0..3 as words
   467         punpckhbw   mm6,        mm0 ; previous row, pixels 4..7 as words
   469         pmullw      mm5,        [rax]               ; previous row *= VFilter tap 0
   470         pmullw      mm6,        [rax]               ; previous row *= VFilter tap 0
   472         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
   473         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7: finished horizontal result, pixels 0..3
   475         paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
   476         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7: finished horizontal result, pixels 4..7
   478         movq        mm7,        mm3                 ; keep the current row's horizontal result ...
   479         packuswb    mm7,        mm4                 ; ... packed to bytes in mm7 for the next iteration
   482         pmullw      mm3,        [rax+16]            ; current row *= VFilter tap 1
   483         pmullw      mm4,        [rax+16]            ; current row *= VFilter tap 1
   485         paddw       mm3,        mm5                 ; add the previous row's contribution
   486         paddw       mm4,        mm6                 ; add the previous row's contribution
   489         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
   490         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7, i.e. divide by 128
   492         paddw       mm4,        [GLOBAL(rd)]                 ; mm4 += round value
   493         psraw       mm4,        VP8_FILTER_SHIFT        ; mm4 >>= 7
   495         packuswb    mm3,        mm4 ; clamp to 0..255 and pack the eight results to bytes
   497         movq        [rdi],      mm3                 ; store the results in the destination
   499 %if ABI_IS_32BIT
   500         add         rsi,        rdx                 ; next line
   501         add         rdi,        dword ptr arg(5) ;dst_pitch
   502 %else
   503         movsxd      r8,         dword ptr arg(5) ;dst_pitch
   504         add         rsi,        rdx                 ; next line
   505         add         rdi,        r8 ; advance the destination by dst_pitch
   506 %endif
   507         cmp         rdi,        rcx                 ; reached dst_ptr + 4*dst_pitch?
   508         jne         .next_row_8x4
   510     ; begin epilog
   511     pop rdi
   512     pop rsi
   513     RESTORE_GOT
   514     UNSHADOW_ARGS
   515     pop         rbp
   516     ret
   519 ;void vp8_bilinear_predict4x4_mmx  -- two-pass (horizontal, then vertical) 2-tap filter; stores 4 bytes per destination row
   520 ;(
   521 ;    unsigned char  *src_ptr,          -- arg(0): 5 bytes are read from each source row (4 at [row] and 4 at [row+1])
   522 ;    int   src_pixels_per_line,        -- arg(1): added to the source pointer after every row
   523 ;    int  xoffset,                     -- arg(2): selects the horizontal taps: table base + xoffset*32
   524 ;    int  yoffset,                     -- arg(3): selects the vertical taps: table base + yoffset*32
   525 ;    unsigned char *dst_ptr,           -- arg(4): destination of the first output row
   526 ;    int dst_pitch                     -- arg(5): added to the destination pointer after every row; the loop ends at dst_ptr + 4*dst_pitch
   527 ;)
   528 global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
   529 sym(vp8_bilinear_predict4x4_mmx):
   530     push        rbp
   531     mov         rbp, rsp
   532     SHADOW_ARGS_TO_STACK 6
   533     GET_GOT     rbx
   534     push        rsi
   535     push        rdi
   536     ; end prolog (SHADOW_ARGS_TO_STACK, GET_GOT and arg() are macros from x86_abi_support.asm, which is not shown here)
   538     ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
   539     ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
   541         movsxd      rax,        dword ptr arg(2) ;xoffset
   542         mov         rdi,        arg(4) ;dst_ptr
   544         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
   545         shl         rax,        5 ; offset * 32: one table entry is read as two taps 16 bytes apart
   547         add         rax,        rcx ; HFilter
   548         mov         rsi,        arg(0) ;src_ptr
   550         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
   551         movq        mm1,        [rax]               ; mm1 = HFilter tap 0 (four words), applied to src[x]
   553         movq        mm2,        [rax+16]            ; mm2 = HFilter tap 1 (four words), applied to src[x+1]
   554         movsxd      rax,        dword ptr arg(3) ;yoffset
   556         pxor        mm0,        mm0                 ; mm0 = 0, supplies the high bytes when unpacking bytes to words
   557         shl         rax,        5 ; offset * 32
   559         add         rax,        rcx ; VFilter
   560         lea         rcx,        [rdi+rdx*4]          ; rcx = dst_ptr + 4*dst_pitch, the end pointer tested at the loop bottom
   562         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx is the source step from here on)
   564         ; horizontally filter the first source row; the packed result in mm7 primes the vertical pass
   565         movd        mm3,        [rsi]               ; mm3 = src[0..3]
   566         punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
   568         pmullw      mm3,        mm1                 ; *= HFilter tap 0
   569         movd        mm5,        [rsi+1]             ; mm5 = src[1..4]
   571         punpcklbw   mm5,        mm0                 ; mm5 = src[1..4] as words
   572         pmullw      mm5,        mm2                 ; *= HFilter tap 1
   574         paddw       mm3,        mm5                 ; horizontal sum, pixels 0..3
   575         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
   577         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7, i.e. divide by 128
   579         movq        mm7,        mm3                 ; mm7 = the four results as words ...
   580         packuswb    mm7,        mm0                 ; ... packed to bytes in the low dword (the "previous row")
   582         add         rsi,        rdx                 ; next line
   583 .next_row_4x4:
   584         movd        mm3,        [rsi]               ; mm3 = src[0..3] of the current row
   585         punpcklbw   mm3,        mm0                 ; mm3 = src[0..3] as words
   587         pmullw      mm3,        mm1                 ; *= HFilter tap 0
   588         movd        mm5,        [rsi+1]             ; mm5 = src[1..4]
   590         punpcklbw   mm5,        mm0                 ; mm5 = src[1..4] as words
   591         pmullw      mm5,        mm2                 ; *= HFilter tap 1
   593         paddw       mm3,        mm5                 ; horizontal sum, pixels 0..3
   595         movq        mm5,        mm7                 ; mm7 holds the previous row's horizontal results (4 bytes)
   596         punpcklbw   mm5,        mm0                 ; previous row as words
   598         pmullw      mm5,        [rax]               ; previous row *= VFilter tap 0
   599         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
   601         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7: finished horizontal result
   602         movq        mm7,        mm3                 ; keep the current row's horizontal result ...
   604         packuswb    mm7,        mm0                 ; ... packed to bytes in mm7 for the next iteration
   606         pmullw      mm3,        [rax+16]            ; current row *= VFilter tap 1
   607         paddw       mm3,        mm5                 ; add the previous row's contribution
   610         paddw       mm3,        [GLOBAL(rd)]                 ; mm3 += round value
   611         psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7, i.e. divide by 128
   613         packuswb    mm3,        mm0 ; clamp to 0..255 and pack the four results to bytes
   614         movd        [rdi],      mm3                 ; store four result bytes in the destination
   616 %if ABI_IS_32BIT
   617         add         rsi,        rdx                 ; next line
   618         add         rdi,        dword ptr arg(5) ;dst_pitch
   619 %else
   620         movsxd      r8,         dword ptr arg(5) ;dst_pitch
   621         add         rsi,        rdx                 ; next line
   622         add         rdi,        r8 ; advance the destination by dst_pitch
   623 %endif
   625         cmp         rdi,        rcx                 ; reached dst_ptr + 4*dst_pitch?
   626         jne         .next_row_4x4
   628     ; begin epilog
   629     pop rdi
   630     pop rsi
   631     RESTORE_GOT
   632     UNSHADOW_ARGS
   633     pop         rbp
   634     ret
   638 SECTION_RODATA
   639 align 16
   640 rd: ; round value added to every filter sum before the psraw by VP8_FILTER_SHIFT
   641     times 4 dw 0x40 ; four words of 64, i.e. half of 128, one per MMX word lane
   643 align 16
   644 global HIDDEN_DATA(sym(vp8_six_tap_mmx))
   645 sym(vp8_six_tap_mmx): ; 8 groups of 6 taps; every tap is one value repeated in 8 words (16 bytes), matching the 16-byte tap spacing the six-tap routines above read from their vp8_filter argument
   646     times 8 dw 0 ; group 0, tap 0 (taps of a group are listed in order 0..5 and sum to 128); NOTE(review): the code that picks a group is not in this file -- presumably indexed by sub-pixel offset, confirm against the caller
   647     times 8 dw 0 ; tap 1
   648     times 8 dw 128 ; tap 2
   649     times 8 dw 0 ; tap 3
   650     times 8 dw 0 ; tap 4
   651     times 8 dw 0 ; tap 5
   653     times 8 dw 0 ; group 1, tap 0
   654     times 8 dw -6 ; tap 1
   655     times 8 dw 123 ; tap 2
   656     times 8 dw 12 ; tap 3
   657     times 8 dw -1 ; tap 4
   658     times 8 dw 0 ; tap 5
   660     times 8 dw 2 ; group 2, tap 0
   661     times 8 dw -11 ; tap 1
   662     times 8 dw 108 ; tap 2
   663     times 8 dw 36 ; tap 3
   664     times 8 dw -8 ; tap 4
   665     times 8 dw 1 ; tap 5
   667     times 8 dw 0 ; group 3, tap 0
   668     times 8 dw -9 ; tap 1
   669     times 8 dw 93 ; tap 2
   670     times 8 dw 50 ; tap 3
   671     times 8 dw -6 ; tap 4
   672     times 8 dw 0 ; tap 5
   674     times 8 dw 3 ; group 4, tap 0
   675     times 8 dw -16 ; tap 1
   676     times 8 dw 77 ; tap 2
   677     times 8 dw 77 ; tap 3
   678     times 8 dw -16 ; tap 4
   679     times 8 dw 3 ; tap 5
   681     times 8 dw 0 ; group 5, tap 0
   682     times 8 dw -6 ; tap 1
   683     times 8 dw 50 ; tap 2
   684     times 8 dw 93 ; tap 3
   685     times 8 dw -9 ; tap 4
   686     times 8 dw 0 ; tap 5
   688     times 8 dw 1 ; group 6, tap 0
   689     times 8 dw -8 ; tap 1
   690     times 8 dw 36 ; tap 2
   691     times 8 dw 108 ; tap 3
   692     times 8 dw -11 ; tap 4
   693     times 8 dw 2 ; tap 5
   695     times 8 dw 0 ; group 7, tap 0
   696     times 8 dw -1 ; tap 1
   697     times 8 dw 12 ; tap 2
   698     times 8 dw 123 ; tap 3
   699     times 8 dw -6 ; tap 4
   700     times 8 dw 0 ; tap 5

mercurial