media/libvpx/vp8/common/x86/idctllm_sse2.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI.
This solves Tor bug #9701 and complies with the disk-avoidance requirements documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 ;void vp8_idct_dequant_0_2x_sse2
    15 ; (
    16 ;   short *qcoeff       - 0
    17 ;   short *dequant      - 1
    18 ;   unsigned char *dst  - 2
    19 ;   int dst_stride      - 3
    20 ; )
       ;
       ; Handles two blocks per call using only the first coefficient of each.
       ; The word at qcoeff and the word at qcoeff+32 bytes are each multiplied
       ; by dequant[0] and rounded with (x + 4) >> 3. The two results are then
       ; added to four rows of dst, 8 bytes per row: bytes 0-3 of every row get
       ; the first block's value, bytes 4-7 the second block's. The sums are
       ; packed back to bytes with unsigned saturation before being stored.
       ; The first 4 bytes at qcoeff and at qcoeff+32 are zeroed.
       ;
       ; Registers written directly by the body: rax, rcx, rdx, xmm0-xmm5
       ; (the prolog/epilog macros may touch more).
    22 global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
    23 sym(vp8_idct_dequant_0_2x_sse2):
    24     push        rbp
    25     mov         rbp, rsp
    26     SHADOW_ARGS_TO_STACK 4
    27     GET_GOT     rbx
    28     ; end prolog
    30         mov         rdx,            arg(1) ; dequant
    31         mov         rax,            arg(0) ; qcoeff
    33         movd        xmm4,           [rax]               ; words 0-1 = qcoeff[0..1]
    34         movd        xmm5,           [rdx]               ; words 0-1 = dequant[0..1]
    36         pinsrw      xmm4,           [rax+32],   4       ; word 4 = first coeff of the block 32 bytes further on
    37         pinsrw      xmm5,           [rdx],      4       ; word 4 = dequant[0] again (same factor for both blocks)
    39         pmullw      xmm4,           xmm5                ; words 0 and 4 = the two dequantized values
    41     ; Zero out xmm5, for use unpacking
    42         pxor        xmm5,           xmm5
    44     ; clear coeffs (4 bytes, i.e. two shorts, at the start of each block)
    45         movd        [rax],          xmm5
    46         movd        [rax+32],       xmm5
    47 ; NOTE(review): leftover "pshufb" marker - presumably a reminder that one pshufb could do the broadcast below; confirm or drop
    48         mov         rax,            arg(2) ; dst
    49         movsxd      rdx,            dword ptr arg(3) ; dst_stride
    51         pshuflw     xmm4,           xmm4,       00000000b   ; copy word 0 into words 0-3
    52         pshufhw     xmm4,           xmm4,       00000000b   ; copy word 4 into words 4-7
    54         lea         rcx,            [rdx + rdx*2]           ; rcx = dst_stride * 3
    55         paddw       xmm4,           [GLOBAL(fours)]         ; + 4 for rounding
    57         psraw       xmm4,           3                       ; (x + 4) >> 3
    59         movq        xmm0,           [rax]                   ; dst row 0 (8 bytes)
    60         movq        xmm1,           [rax+rdx]               ; dst row 1
    61         movq        xmm2,           [rax+2*rdx]             ; dst row 2
    62         movq        xmm3,           [rax+rcx]               ; dst row 3
       ; widen the 8 dst bytes of each row to 8 words
    64         punpcklbw   xmm0,           xmm5
    65         punpcklbw   xmm1,           xmm5
    66         punpcklbw   xmm2,           xmm5
    67         punpcklbw   xmm3,           xmm5
    70     ; Add to predict buffer
    71         paddw       xmm0,           xmm4
    72         paddw       xmm1,           xmm4
    73         paddw       xmm2,           xmm4
    74         paddw       xmm3,           xmm4
    76     ; pack up before storing (words -> bytes, unsigned saturation)
    77         packuswb    xmm0,           xmm5
    78         packuswb    xmm1,           xmm5
    79         packuswb    xmm2,           xmm5
    80         packuswb    xmm3,           xmm5
    82     ; store blocks back out
    83         movq        [rax],          xmm0
    84         movq        [rax + rdx],    xmm1
    86         lea         rax,            [rax + 2*rdx]           ; step dst down two rows
    88         movq        [rax],          xmm2
    89         movq        [rax + rdx],    xmm3
    91     ; begin epilog
    92     RESTORE_GOT
    93     UNSHADOW_ARGS
    94     pop         rbp
    95     ret
    97 ;void vp8_idct_dequant_full_2x_sse2
    98 ; (
    99 ;   short *qcoeff       - 0
   100 ;   short *dequant      - 1
   101 ;   unsigned char *dst  - 2
   102 ;   int dst_stride      - 3
   103 ; )
       ;
       ; Full dequantize + inverse transform + add for two blocks at once.
       ; 64 bytes are read from qcoeff and then zeroed: bytes 0-31 are the
       ; 16 shorts of the first block, bytes 32-63 those of the second block.
       ; Both blocks are multiplied element-wise by the same 16 dequant
       ; factors. The data is repacked so that each register holds row x of
       ; the first block in its low half and row x of the second block in its
       ; high half, then run through two identical butterfly passes with a
       ; transpose after each. The second pass rounds with (x + 4) >> 3.
       ; The result is added to four 8-byte rows of dst (bytes 0-3: first
       ; block, bytes 4-7: second block) and stored with unsigned saturation.
       ;
       ; qcoeff and dequant are accessed with movdqa / SSE memory operands and
       ; therefore have to be 16-byte aligned.
   104 global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
   105 sym(vp8_idct_dequant_full_2x_sse2):
   106     push        rbp
   107     mov         rbp, rsp
   108     SHADOW_ARGS_TO_STACK 4
   109     SAVE_XMM 7
   110     GET_GOT     rbx
   112     push        rdi
   113     ; end prolog
   115     ; full path: every coefficient of both blocks is loaded,
   116     ; dequantized and run through the two-pass inverse transform
   117         mov         rax,            arg(0) ; qcoeff
   118         mov         rdx,            arg(1)  ; dequant
   119         mov         rdi,            arg(2) ; dst
   122     ; Zero out xmm7, for use unpacking
   123         pxor        xmm7,           xmm7
   126     ; note the transpose of xmm1 and xmm2, necessary for shuffle
   127     ;   to spit out sensible data
   128         movdqa      xmm0,           [rax]       ; first block, coeffs 0-7
   129         movdqa      xmm2,           [rax+16]    ; first block, coeffs 8-15
   130         movdqa      xmm1,           [rax+32]    ; second block, coeffs 0-7
   131         movdqa      xmm3,           [rax+48]    ; second block, coeffs 8-15
   133     ; Clear out coeffs
   134         movdqa      [rax],          xmm7
   135         movdqa      [rax+16],       xmm7
   136         movdqa      [rax+32],       xmm7
   137         movdqa      [rax+48],       xmm7
   139     ; dequantize qcoeff buffer (both blocks use the same 16 factors)
   140         pmullw      xmm0,           [rdx]
   141         pmullw      xmm2,           [rdx+16]
   142         pmullw      xmm1,           [rdx]
   143         pmullw      xmm3,           [rdx+16]
   144         movsxd      rdx,            dword ptr arg(3) ; dst_stride
   146     ; repack so block 0 row x and block 1 row x are together
   147         movdqa      xmm4,           xmm0
   148         punpckldq   xmm0,           xmm1
   149         punpckhdq   xmm4,           xmm1
   151         pshufd      xmm0,           xmm0,       11011000b   ; swap the two middle dwords
   152         pshufd      xmm1,           xmm4,       11011000b
   154         movdqa      xmm4,           xmm2
   155         punpckldq   xmm2,           xmm3
   156         punpckhdq   xmm4,           xmm3
   158         pshufd      xmm2,           xmm2,       11011000b
   159         pshufd      xmm3,           xmm4,       11011000b
   161     ; first pass
   162         psubw       xmm0,           xmm2        ; b1 = 0-2
   163         paddw       xmm2,           xmm2        ;
   165         movdqa      xmm5,           xmm1
   166         paddw       xmm2,           xmm0        ; a1 = 0+2
   168         pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
   169         lea         rcx,            [rdx + rdx*2]   ;dst_stride * 3
   170         paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
   172         movdqa      xmm7,           xmm3
   173         pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
   175         paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
   176         psubw       xmm7,           xmm5        ; c1
   178         movdqa      xmm5,           xmm1
   179         movdqa      xmm4,           xmm3
   181         pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
   182         paddw       xmm5,           xmm1
   184         pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
   185         paddw       xmm3,           xmm4
   187         paddw       xmm3,           xmm5        ; d1
   188         movdqa      xmm6,           xmm2        ; a1
   190         movdqa      xmm4,           xmm0        ; b1
   191         paddw       xmm2,           xmm3        ;0
   193         paddw       xmm4,           xmm7        ;1
   194         psubw       xmm0,           xmm7        ;2
   196         psubw       xmm6,           xmm3        ;3
   198     ; transpose for the second pass
   199         movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
   200         punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
   201         punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
   203         movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
   204         punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
   205         punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
   208         movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
   209         punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
   210         punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
   212         movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
   213         punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
   214         punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
   217         movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
   218         punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
   219         punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
   221         movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
   222         punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
   223         punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
   225         pshufd      xmm0,           xmm2,       11011000b
   226         pshufd      xmm2,           xmm1,       11011000b
   228         pshufd      xmm1,           xmm5,       11011000b
   229         pshufd      xmm3,           xmm7,       11011000b
   231     ; second pass
   232         psubw       xmm0,           xmm2            ; b1 = 0-2
   233         paddw       xmm2,           xmm2
   235         movdqa      xmm5,           xmm1
   236         paddw       xmm2,           xmm0            ; a1 = 0+2
   238         pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
   239         paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
   241         movdqa      xmm7,           xmm3
   242         pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
   244         paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
   245         psubw       xmm7,           xmm5            ; c1
   247         movdqa      xmm5,           xmm1
   248         movdqa      xmm4,           xmm3
   250         pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
   251         paddw       xmm5,           xmm1
   253         pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
   254         paddw       xmm3,           xmm4
   256         paddw       xmm3,           xmm5            ; d1
   257         paddw       xmm0,           [GLOBAL(fours)] ; rounding term goes into b1 ...
   259         paddw       xmm2,           [GLOBAL(fours)] ; ... and a1, so all four outputs carry it
   260         movdqa      xmm6,           xmm2            ; a1
   262         movdqa      xmm4,           xmm0            ; b1
   263         paddw       xmm2,           xmm3            ;0
   265         paddw       xmm4,           xmm7            ;1
   266         psubw       xmm0,           xmm7            ;2
   268         psubw       xmm6,           xmm3            ;3
   269         psraw       xmm2,           3
   271         psraw       xmm0,           3
   272         psraw       xmm4,           3
   274         psraw       xmm6,           3
   276     ; transpose to save
   277         movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
   278         punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
   279         punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
   281         movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
   282         punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
   283         punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
   286         movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
   287         punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
   288         punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
   290         movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
   291         punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
   292         punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
   295         movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
   296         punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
   297         punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
   299         movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
   300         punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
   301         punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
   303         pshufd      xmm0,           xmm2,       11011000b
   304         pshufd      xmm2,           xmm1,       11011000b
   306         pshufd      xmm1,           xmm5,       11011000b
   307         pshufd      xmm3,           xmm7,       11011000b
   309         pxor        xmm7,           xmm7        ; xmm7 was used as scratch above; zero it again for unpacking
   311     ; Load up predict blocks
   312         movq        xmm4,           [rdi]
   313         movq        xmm5,           [rdi+rdx]
   315         punpcklbw   xmm4,           xmm7
   316         punpcklbw   xmm5,           xmm7
   318         paddw       xmm0,           xmm4
   319         paddw       xmm1,           xmm5
   321         movq        xmm4,           [rdi+2*rdx]
   322         movq        xmm5,           [rdi+rcx]
   324         punpcklbw   xmm4,           xmm7
   325         punpcklbw   xmm5,           xmm7
   327         paddw       xmm2,           xmm4
   328         paddw       xmm3,           xmm5
   332     ; pack up before storing (words -> bytes, unsigned saturation)
   333         packuswb    xmm0,           xmm7
   334         packuswb    xmm1,           xmm7
   335         packuswb    xmm2,           xmm7
   336         packuswb    xmm3,           xmm7
   338     ; store blocks back out
   339         movq        [rdi],          xmm0
   340         movq        [rdi + rdx],    xmm1
   341         movq        [rdi + rdx*2],  xmm2
   342         movq        [rdi + rcx],    xmm3
   344     ; begin epilog
   345     pop         rdi
   347     RESTORE_GOT
   348     RESTORE_XMM
   349     UNSHADOW_ARGS
   350     pop         rbp
   351     ret
   353 ;void vp8_idct_dequant_dc_0_2x_sse2
   354 ; (
   355 ;   short *qcoeff       - 0
   356 ;   short *dequant      - 1
   357 ;   unsigned char *dst  - 2
   358 ;   int dst_stride      - 3
   359 ;   short *dc           - 4
   360 ; )
       ;
       ; DC-only path for two blocks whose DC values are supplied through dc.
       ; dc[0] and dc[1] are each rounded with (x + 4) >> 3 and added to four
       ; 8-byte rows of dst: bytes 0-3 of every row get the dc[0] value and
       ; bytes 4-7 the dc[1] value. The sums are stored with unsigned
       ; saturation. qcoeff and dequant are part of the signature but are
       ; neither read nor written by this routine.
   361 global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
   362 sym(vp8_idct_dequant_dc_0_2x_sse2):
   363     push        rbp
   364     mov         rbp, rsp
   365     SHADOW_ARGS_TO_STACK 5
   366     GET_GOT     rbx
   367     push        rdi
   368     ; end prolog
   370     ; special case when 2 blocks have 0 or 1 coeffs
   371     ; dc is set as first coeff, so no need to load qcoeff
   374         mov         rdi,            arg(2) ; dst
   375         mov         rdx,            arg(4) ; dc
   377     ; Zero out xmm5, for use unpacking
   378         pxor        xmm5,           xmm5
   380     ; load up 2 dc words here == 2*16 = doubleword
   381         movd        xmm4,           [rdx]
   383         movsxd      rdx,            dword ptr arg(3) ; dst_stride
   384         lea         rcx, [rdx + rdx*2]               ; rcx = dst_stride * 3
   385     ; Load up predict blocks
   386         movq        xmm0,           [rdi]
   387         movq        xmm1,           [rdi+rdx*1]
   388         movq        xmm2,           [rdi+rdx*2]
   389         movq        xmm3,           [rdi+rcx]
   391     ; Duplicate and expand dc across
   392         punpcklwd   xmm4,           xmm4        ; words: dc0 dc0 dc1 dc1
   393         punpckldq   xmm4,           xmm4        ; words 0-3 = dc0, words 4-7 = dc1
   395     ; Rounding to dequant and downshift
   396         paddw       xmm4,           [GLOBAL(fours)]
   397         psraw       xmm4,           3
   399     ; Predict buffer needs to be expanded from bytes to words
   400         punpcklbw   xmm0,           xmm5
   401         punpcklbw   xmm1,           xmm5
   402         punpcklbw   xmm2,           xmm5
   403         punpcklbw   xmm3,           xmm5
   405     ; Add to predict buffer
   406         paddw       xmm0,           xmm4
   407         paddw       xmm1,           xmm4
   408         paddw       xmm2,           xmm4
   409         paddw       xmm3,           xmm4
   411     ; pack up before storing (words -> bytes, unsigned saturation)
   412         packuswb    xmm0,           xmm5
   413         packuswb    xmm1,           xmm5
   414         packuswb    xmm2,           xmm5
   415         packuswb    xmm3,           xmm5
   417     ; store blocks back out
   418         movq        [rdi],          xmm0
   419         movq        [rdi + rdx],    xmm1
   420         movq        [rdi + rdx*2],  xmm2
   421         movq        [rdi + rcx],    xmm3
   423     ; begin epilog
   424     pop         rdi
   425     RESTORE_GOT
   426     UNSHADOW_ARGS
   427     pop         rbp
   428     ret
   429 ;void vp8_idct_dequant_dc_full_2x_sse2
   430 ; (
   431 ;   short *qcoeff       - 0
   432 ;   short *dequant      - 1
   433 ;   unsigned char *dst  - 2
   434 ;   int dst_stride      - 3
   435 ;   short *dc           - 4
   436 ; )
       ;
       ; Full dequantize + inverse transform + add for two blocks at once,
       ; with the DC terms taken from dc instead of qcoeff.
       ; 64 bytes are read from qcoeff and then zeroed: bytes 0-31 are the
       ; 16 shorts of the first block, bytes 32-63 those of the second block.
       ; Both blocks are multiplied element-wise by the same 16 dequant
       ; factors. After the data is repacked into rows, coefficient 0 of the
       ; first block is overwritten with dc[0] and coefficient 0 of the second
       ; block with dc[1]; the dc values are not multiplied by dequant here.
       ; Two identical butterfly passes follow, each followed by a transpose;
       ; the second pass rounds with (x + 4) >> 3. The result is added to
       ; four 8-byte rows of dst (bytes 0-3: first block, bytes 4-7: second
       ; block) and stored with unsigned saturation.
       ;
       ; qcoeff and dequant are accessed with movdqa / SSE memory operands and
       ; therefore have to be 16-byte aligned.
   437 global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
   438 sym(vp8_idct_dequant_dc_full_2x_sse2):
   439     push        rbp
   440     mov         rbp, rsp
   441     SHADOW_ARGS_TO_STACK 5
   442     SAVE_XMM 7
   443     GET_GOT     rbx
   444     push        rdi
   445     ; end prolog
   447     ; full path: every coefficient of both blocks is loaded and
   448     ; dequantized; the DC terms are then replaced by the values in dc
   449         mov         rax,            arg(0) ; qcoeff
   450         mov         rdx,            arg(1)  ; dequant
   452         mov         rdi,            arg(2) ; dst
   454     ; Zero out xmm7, for use unpacking
   455         pxor        xmm7,           xmm7
   458     ; note the transpose of xmm1 and xmm2, necessary for shuffle
   459     ;   to spit out sensible data
   460         movdqa      xmm0,           [rax]       ; first block, coeffs 0-7
   461         movdqa      xmm2,           [rax+16]    ; first block, coeffs 8-15
   462         movdqa      xmm1,           [rax+32]    ; second block, coeffs 0-7
   463         movdqa      xmm3,           [rax+48]    ; second block, coeffs 8-15
   465     ; Clear out coeffs
   466         movdqa      [rax],          xmm7
   467         movdqa      [rax+16],       xmm7
   468         movdqa      [rax+32],       xmm7
   469         movdqa      [rax+48],       xmm7
   471     ; dequantize qcoeff buffer (both blocks use the same 16 factors)
   472         pmullw      xmm0,           [rdx]
   473         pmullw      xmm2,           [rdx+16]
   474         pmullw      xmm1,           [rdx]
   475         pmullw      xmm3,           [rdx+16]
   477     ; DC component
   478         mov         rdx,            arg(4)
   480     ; repack so block 0 row x and block 1 row x are together
   481         movdqa      xmm4,           xmm0
   482         punpckldq   xmm0,           xmm1
   483         punpckhdq   xmm4,           xmm1
   485         pshufd      xmm0,           xmm0,       11011000b   ; swap the two middle dwords
   486         pshufd      xmm1,           xmm4,       11011000b
   488         movdqa      xmm4,           xmm2
   489         punpckldq   xmm2,           xmm3
   490         punpckhdq   xmm4,           xmm3
   492         pshufd      xmm2,           xmm2,       11011000b
   493         pshufd      xmm3,           xmm4,       11011000b
   495     ; insert DC component
   496         pinsrw      xmm0,           [rdx],      0           ; first block, coeff 0 = dc[0]
   497         pinsrw      xmm0,           [rdx+2],    4           ; second block, coeff 0 = dc[1]
   499     ; first pass
   500         psubw       xmm0,           xmm2        ; b1 = 0-2
   501         paddw       xmm2,           xmm2        ;
   503         movdqa      xmm5,           xmm1
   504         paddw       xmm2,           xmm0        ; a1 = 0+2
   506         pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
   507         paddw       xmm5,           xmm1        ; ip1 * sin(pi/8) * sqrt(2)
   509         movdqa      xmm7,           xmm3
   510         pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
   512         paddw       xmm7,           xmm3        ; ip3 * cos(pi/8) * sqrt(2)
   513         psubw       xmm7,           xmm5        ; c1
   515         movdqa      xmm5,           xmm1
   516         movdqa      xmm4,           xmm3
   518         pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
   519         paddw       xmm5,           xmm1
   521         pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
   522         paddw       xmm3,           xmm4
   524         paddw       xmm3,           xmm5        ; d1
   525         movdqa      xmm6,           xmm2        ; a1
   527         movdqa      xmm4,           xmm0        ; b1
   528         paddw       xmm2,           xmm3        ;0
   530         paddw       xmm4,           xmm7        ;1
   531         psubw       xmm0,           xmm7        ;2
   533         psubw       xmm6,           xmm3        ;3
   535     ; transpose for the second pass
   536         movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
   537         punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
   538         punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
   540         movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
   541         punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
   542         punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
   545         movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
   546         punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
   547         punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
   549         movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
   550         punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
   551         punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
   554         movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
   555         punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
   556         punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
   558         movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
   559         punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
   560         punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
   562         pshufd      xmm0,           xmm2,       11011000b
   563         pshufd      xmm2,           xmm1,       11011000b
   565         pshufd      xmm1,           xmm5,       11011000b
   566         pshufd      xmm3,           xmm7,       11011000b
   568     ; second pass
   569         psubw       xmm0,           xmm2            ; b1 = 0-2
   570         paddw       xmm2,           xmm2
   572         movdqa      xmm5,           xmm1
   573         paddw       xmm2,           xmm0            ; a1 = 0+2
   575         pmulhw      xmm5,           [GLOBAL(x_s1sqr2)]
   576         paddw       xmm5,           xmm1            ; ip1 * sin(pi/8) * sqrt(2)
   578         movdqa      xmm7,           xmm3
   579         pmulhw      xmm7,           [GLOBAL(x_c1sqr2less1)]
   581         paddw       xmm7,           xmm3            ; ip3 * cos(pi/8) * sqrt(2)
   582         psubw       xmm7,           xmm5            ; c1
   584         movdqa      xmm5,           xmm1
   585         movdqa      xmm4,           xmm3
   587         pmulhw      xmm5,           [GLOBAL(x_c1sqr2less1)]
   588         paddw       xmm5,           xmm1
   590         pmulhw      xmm3,           [GLOBAL(x_s1sqr2)]
   591         paddw       xmm3,           xmm4
   593         paddw       xmm3,           xmm5            ; d1
   594         paddw       xmm0,           [GLOBAL(fours)] ; rounding term goes into b1 ...
   596         paddw       xmm2,           [GLOBAL(fours)] ; ... and a1, so all four outputs carry it
   597         movdqa      xmm6,           xmm2            ; a1
   599         movdqa      xmm4,           xmm0            ; b1
   600         paddw       xmm2,           xmm3            ;0
   602         paddw       xmm4,           xmm7            ;1
   603         psubw       xmm0,           xmm7            ;2
   605         psubw       xmm6,           xmm3            ;3
   606         psraw       xmm2,           3
   608         psraw       xmm0,           3
   609         psraw       xmm4,           3
   611         psraw       xmm6,           3
   613     ; transpose to save
   614         movdqa      xmm7,           xmm2        ; 103 102 101 100 003 002 001 000
   615         punpcklwd   xmm2,           xmm0        ; 007 003 006 002 005 001 004 000
   616         punpckhwd   xmm7,           xmm0        ; 107 103 106 102 105 101 104 100
   618         movdqa      xmm5,           xmm4        ; 111 110 109 108 011 010 009 008
   619         punpcklwd   xmm4,           xmm6        ; 015 011 014 010 013 009 012 008
   620         punpckhwd   xmm5,           xmm6        ; 115 111 114 110 113 109 112 108
   623         movdqa      xmm1,           xmm2        ; 007 003 006 002 005 001 004 000
   624         punpckldq   xmm2,           xmm4        ; 013 009 005 001 012 008 004 000
   625         punpckhdq   xmm1,           xmm4        ; 015 011 007 003 014 010 006 002
   627         movdqa      xmm6,           xmm7        ; 107 103 106 102 105 101 104 100
   628         punpckldq   xmm7,           xmm5        ; 113 109 105 101 112 108 104 100
   629         punpckhdq   xmm6,           xmm5        ; 115 111 107 103 114 110 106 102
   632         movdqa      xmm5,           xmm2        ; 013 009 005 001 012 008 004 000
   633         punpckldq   xmm2,           xmm7        ; 112 108 012 008 104 100 004 000
   634         punpckhdq   xmm5,           xmm7        ; 113 109 013 009 105 101 005 001
   636         movdqa      xmm7,           xmm1        ; 015 011 007 003 014 010 006 002
   637         punpckldq   xmm1,           xmm6        ; 114 110 014 010 106 102 006 002
   638         punpckhdq   xmm7,           xmm6        ; 115 111 015 011 107 103 007 003
   640         pshufd      xmm0,           xmm2,       11011000b
   641         pshufd      xmm2,           xmm1,       11011000b
   643         pshufd      xmm1,           xmm5,       11011000b
   644         pshufd      xmm3,           xmm7,       11011000b
   646         pxor        xmm7,           xmm7        ; xmm7 was used as scratch above; zero it again for unpacking
   648     ; Load up predict blocks
   649         movsxd      rdx,            dword ptr arg(3) ; dst_stride (rdx held dc until now)
   650         movq        xmm4,           [rdi]
   651         movq        xmm5,           [rdi+rdx]
   652         lea         rcx,            [rdx + rdx*2]    ; rcx = dst_stride * 3
   654         punpcklbw   xmm4,           xmm7
   655         punpcklbw   xmm5,           xmm7
   657         paddw       xmm0,           xmm4
   658         paddw       xmm1,           xmm5
   660         movq        xmm4,           [rdi+rdx*2]
   661         movq        xmm5,           [rdi+rcx]
   663         punpcklbw   xmm4,           xmm7
   664         punpcklbw   xmm5,           xmm7
   666         paddw       xmm2,           xmm4
   667         paddw       xmm3,           xmm5
   671     ; pack up before storing (words -> bytes, unsigned saturation)
   672         packuswb    xmm0,           xmm7
   673         packuswb    xmm1,           xmm7
   674         packuswb    xmm2,           xmm7
   675         packuswb    xmm3,           xmm7
   681     ; store blocks back out (rdx still holds dst_stride)
   682         movq        [rdi],          xmm0
   683         movq        [rdi + rdx],    xmm1
   685         lea         rdi,            [rdi + 2*rdx]    ; step dst down two rows
   687         movq        [rdi],          xmm2
   688         movq        [rdi + rdx],    xmm3
   691     ; begin epilog
   692     pop         rdi
   693     RESTORE_GOT
   694     RESTORE_XMM
   695     UNSHADOW_ARGS
   696     pop         rbp
   697     ret
   699 SECTION_RODATA
       ; Read-only constants, each one value repeated across 8 words and
       ; 16-byte aligned so it can be used directly as an SSE memory operand.
   700 align 16
       ; Rounding term: added before the arithmetic shift right by 3.
   701 fours:
   702     times 8 dw 0x0004
   703 align 16
       ; 0x8A8C = 35468 ~= sin(pi/8) * sqrt(2) * 65536.
       ; As a signed word this is 35468 - 65536, so the routines follow
       ; pmulhw with a paddw of the original value to get x * 35468 / 65536.
   704 x_s1sqr2:
   705     times 8 dw 0x8A8C
   706 align 16
       ; 0x4E7B = 20091 ~= (cos(pi/8) * sqrt(2) - 1) * 65536.
       ; The "less 1" is restored by the paddw of the original value that
       ; follows each pmulhw with this constant.
   707 x_c1sqr2less1:
   708     times 8 dw 0x4E7B

mercurial