media/libvpx/vp8/encoder/x86/dct_sse2.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI.
This solves Tor bug #9701 and complies with the disk-avoidance requirements documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 %macro STACK_FRAME_CREATE 0
       ; Function prologue shared by both DCT entry points in this file.
       ; Binds the names `input`, `output` and `pitch` to the registers that
       ; hold the three C arguments for the ABI being assembled:
       ;   32-bit      : rsi / rdi / rax, loaded from the stack via arg(n)
       ;   Win64       : rcx / rdx / r8   (argument registers)
       ;   other 64-bit: rdi / rsi / rdx  (argument registers)
       ; `pitch` is a C `int`; every path widens it to pointer width with
       ; movsxd before it is used in address arithmetic, because the upper
       ; half of a 64-bit register carrying an `int` argument is not
       ; guaranteed by the 64-bit calling conventions.
       ; Must be paired with STACK_FRAME_DESTROY, which undoes the pushes
       ; below in reverse order.
    15 %if ABI_IS_32BIT
    16   %define       input       rsi
    17   %define       output      rdi
    18   %define       pitch       rax
    19     push        rbp
    20     mov         rbp, rsp
    21     GET_GOT     rbx
    22     push        rsi
    23     push        rdi
    24     ; end prolog
    26     mov         rsi, arg(0)                 ; input
    27     mov         rdi, arg(1)                 ; output
    29     movsxd      rax, dword ptr arg(2)       ; pitch, sign-extended
    31 %else
    32   %if LIBVPX_YASM_WIN64
    33     %define     input       rcx
    34     %define     output      rdx
    35     %define     pitch       r8
    36     SAVE_XMM 7, u                           ; NOTE(review): presumably preserves xmm6-xmm7 (defined in x86_abi_support.asm) - confirm
           movsxd      r8, r8d                     ; pitch, sign-extended
    37   %else
    38     %define     input       rdi
    39     %define     output      rsi
    40     %define     pitch       rdx
           movsxd      rdx, edx                    ; pitch, sign-extended
    41   %endif
    42 %endif
    43 %endmacro
    45 %macro STACK_FRAME_DESTROY 0
       ; Function epilogue matching STACK_FRAME_CREATE: redefines the three
       ; argument aliases as empty, restores whatever the prologue saved for
       ; the ABI being assembled, and returns to the caller.
    46   %define     input                         ; alias now expands to nothing
    47   %define     output
    48   %define     pitch
    50 %if ABI_IS_32BIT
    51     pop         rdi                         ; reverse order of the prologue pushes
    52     pop         rsi
    53     RESTORE_GOT                             ; counterpart of GET_GOT rbx in the prologue
    54     pop         rbp
    55 %else
    56   %if LIBVPX_YASM_WIN64
    57     RESTORE_XMM                             ; counterpart of SAVE_XMM in the prologue
    58   %endif
    59 %endif
    60     ret
    61 %endmacro
    63 ;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
       ;
       ; Two-pass 4x4 transform on 16-bit samples.
       ;   input  : four rows of four 16-bit values; consecutive rows are
       ;            `pitch` bytes apart (pitch is added to the pointer unscaled).
       ;   output : sixteen 16-bit results written with two movdqa stores, so
       ;            it must be 16-byte aligned.
       ; Pass 1 scales a1/b1/c1/d1 by 8 and rounds the odd outputs with
       ;   +14500 / +7500 followed by >>12.
       ; Pass 2 rounds the even outputs with +7 followed by >>4, the odd outputs
       ;   with +12000 / +51000 followed by >>16, and adds 1 to op[4..7] in
       ;   every lane whose d1 is non-zero.
       ; Lane comments list the eight 16-bit elements of a register from the
       ; highest lane down to the lowest.
    64 global sym(vp8_short_fdct4x4_sse2) PRIVATE
    65 sym(vp8_short_fdct4x4_sse2):
    67     STACK_FRAME_CREATE
    69     movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
    70     movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
    71     lea         input,          [input+2*pitch]
    72     movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
    73     movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30
    75     punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
    76     punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20
    78     movdqa      xmm2, xmm0
    79     punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
    80     punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
    81     movdqa      xmm1, xmm0
    82     punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
    83     pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
    84     pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx
    86     punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
    87     movdqa      xmm3, xmm0
    88     paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
    89     psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
    90     psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
    91     psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3
    93     movdqa      xmm1, xmm0
    94     pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]    ;a1 + b1
    95     pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]    ;a1 - b1
    96     movdqa      xmm4, xmm3
    97     pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]   ;c1*2217 + d1*5352
    98     pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)];d1*2217 - c1*5352
   100     paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
   101     paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
   102     psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
   103     psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12
   105     packssdw    xmm0, xmm1                      ;op[2] op[0]
   106     packssdw    xmm3, xmm4                      ;op[3] op[1]
   107     ; xmm0: 23 22 21 20 03 02 01 00
   108     ;
   109     ; xmm3: 33 32 31 30 13 12 11 10
   110     ;
   111     movdqa      xmm2, xmm0
   112     punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
   113     punpckhqdq  xmm2, xmm3                      ;33 32 31 30 23 22 21 20
   115     movdqa      xmm3, xmm0
   116     punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
   117     punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
   118     movdqa      xmm2, xmm0
   119     punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
   120     punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20
   122     movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]   ;rounding term for the even outputs
   123     pshufd      xmm2, xmm2, 04eh                ;swap halves: 23 22 21 20 33 32 31 30
   124     movdqa      xmm3, xmm0
   125     paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
   126     psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1
   128     pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
   129     movdqa      xmm2, xmm3                      ;save d1 for compare
   130     pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
   131     pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
   132     pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
   133     pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
   134     pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
   135     movdqa      xmm1, xmm0
   136     pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)] ;a1 + b1
   137     pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)] ;a1 - b1
   139     pxor        xmm4, xmm4                      ;zero out for compare
   140     paddd       xmm0, xmm5
   141     paddd       xmm1, xmm5
   142     pcmpeqw     xmm2, xmm4                      ;0xFFFF in every lane that is zero
   143     psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
   144     psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
   145     pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)] ;clear upper,
   146                                                      ;and keep bit 0 of lower
                                                            ;=> low four lanes = (d1 != 0)
   148     movdqa      xmm4, xmm3
   149     pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
   150     pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
   151     paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
   152     paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
   153     packssdw    xmm0, xmm1                      ;op[8] op[0]
   154     psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
   155     psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16
   157     packssdw    xmm3, xmm4                      ;op[12] op[4]
   158     movdqa      xmm1, xmm0
   159     paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
   160     punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
   161     punpckhqdq  xmm1, xmm3                      ;op[12] op[8]
   163     movdqa      XMMWORD PTR[output +  0], xmm0  ;aligned store: op[0..7]
   164     movdqa      XMMWORD PTR[output + 16], xmm1  ;aligned store: op[8..15]
   166     STACK_FRAME_DESTROY
   168 ;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
       ;
       ; Same two-pass transform as the 4x4 routine, applied to an 8-wide by
       ; 4-high region: the low four and the high four 16-bit lanes of each
       ; register are processed as two independent 4x4 blocks.
       ;   input  : four rows of eight 16-bit values, `pitch` bytes apart; rows
       ;            are read with movdqa, so each row must be 16-byte aligned.
       ;   output : 64 bytes written with movdqa (16-byte aligned). Bytes 0..31
       ;            hold the 16 results for input columns 0..3, bytes 32..63
       ;            hold the 16 results for input columns 4..7.
       ; Uses xmm0-xmm6.
   169 global sym(vp8_short_fdct8x4_sse2) PRIVATE
   170 sym(vp8_short_fdct8x4_sse2):
   172     STACK_FRAME_CREATE
   174         ; read the input data
   175         movdqa      xmm0,       [input        ]
   176         movdqa      xmm2,       [input+  pitch]
   177         lea         input,      [input+2*pitch]
   178         movdqa      xmm4,       [input        ]
   179         movdqa      xmm3,       [input+  pitch]
   181         ; transpose for the first stage
   182         movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
   183         movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27
   185         punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
   186         punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17
   188         punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
   189         punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37
   191         movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
   192         punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31
   194         punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33
   196         movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
   197         punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35
   199         punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
   200         movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33
   202         punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
   203         punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36
   205         movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
   206         punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34
   208         punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35
   210         ; xmm0 0
   211         ; xmm1 1
   212         ; xmm2 2
   213         ; xmm3 3
   215         ; first stage
   216         movdqa      xmm5,       xmm0
   217         movdqa      xmm4,       xmm1
   219         paddw       xmm0,       xmm3        ; a1 = 0 + 3
   220         paddw       xmm1,       xmm2        ; b1 = 1 + 2
   222         psubw       xmm4,       xmm2        ; c1 = 1 - 2
   223         psubw       xmm5,       xmm3        ; d1 = 0 - 3
   225         psllw       xmm5,        3          ; d1 <<= 3
   226         psllw       xmm4,        3          ; c1 <<= 3
   228         psllw       xmm0,        3          ; a1 <<= 3
   229         psllw       xmm1,        3          ; b1 <<= 3
   231         ; output 0 and 2
   232         movdqa      xmm2,       xmm0        ; a1
   234         paddw       xmm0,       xmm1        ; op[0] = a1 + b1
   235         psubw       xmm2,       xmm1        ; op[2] = a1 - b1
   237         ; output 1 and 3
   238         ; interleave c1, d1
   239         movdqa      xmm1,       xmm5        ; d1
   240         punpcklwd   xmm1,       xmm4        ; c1 d1
   241         punpckhwd   xmm5,       xmm4        ; c1 d1
   243         movdqa      xmm3,       xmm1
   244         movdqa      xmm4,       xmm5
   246         pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
   247         pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
   249         pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
   250         pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
   252         paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
   253         paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
   254         paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
   255         paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]
   257         psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
   258         psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
   259         psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
   260         psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
   262         packssdw    xmm1,       xmm4        ; op[1]
   263         packssdw    xmm3,       xmm5        ; op[3]
   265         ; done with the first pass
   266         ; transpose for the second stage
   267         movdqa      xmm4,       xmm0         ; 00 10 20 30 04 14 24 34
   268         movdqa      xmm5,       xmm2         ; 02 12 22 32 06 16 26 36
   270         punpcklwd   xmm0,       xmm1         ; 00 01 10 11 20 21 30 31
   271         punpckhwd   xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
   273         punpcklwd   xmm2,       xmm3         ; 02 03 12 13 22 23 32 33
   274         punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
   276         movdqa      xmm1,       xmm0         ; 00 01 10 11 20 21 30 31
   277         punpckldq   xmm0,       xmm2         ; 00 01 02 03 10 11 12 13
   279         punpckhdq   xmm1,       xmm2         ; 20 21 22 23 30 31 32 33
   281         movdqa      xmm2,       xmm4         ; 04 05 14 15 24 25 34 35
   282         punpckldq   xmm2,       xmm5         ; 04 05 06 07 14 15 16 17
   284         punpckhdq   xmm4,       xmm5         ; 24 25 26 27 34 35 36 37
   285         movdqa      xmm3,       xmm1         ; 20 21 22 23 30 31 32 33
   287         punpckhqdq  xmm3,       xmm4         ; 30 31 32 33 34 35 36 37
   288         punpcklqdq  xmm1,       xmm4         ; 20 21 22 23 24 25 26 27
   290         movdqa      xmm4,       xmm0         ; 00 01 02 03 10 11 12 13
   291         punpcklqdq  xmm0,       xmm2         ; 00 01 02 03 04 05 06 07
   293         punpckhqdq  xmm4,       xmm2         ; 10 11 12 13 14 15 16 17
   295         ; xmm0 row 0
   296         ; xmm4 row 1
   297         ; xmm1 row 2
   298         ; xmm3 row 3
   300         movdqa      xmm5,       xmm0
   301         movdqa      xmm2,       xmm1        ; keep row 2 for c1
   303         paddw       xmm0,       xmm3        ; a1 = 0 + 3
   304         paddw       xmm1,       xmm4        ; b1 = 1 + 2
   306         psubw       xmm4,       xmm2        ; c1 = 1 - 2
   307         psubw       xmm5,       xmm3        ; d1 = 0 - 3
   309         pxor        xmm6,       xmm6        ; zero out for compare
   311         pcmpeqw     xmm6,       xmm5        ; 0xFFFF in every lane where d1 == 0
   313         pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; invert and keep bit 0:
   314                                                                     ; each lane = (d1 != 0)
   316         ; output 0 and 2
   317         movdqa      xmm2,       xmm0        ; a1
   319         paddw       xmm0,       xmm1        ; a1 + b1
   320         psubw       xmm2,       xmm1        ; a1 - b1
   322         paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
   323         paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]
   325         psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
   326         psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4
   328         ; output 1 and 3
   329         ; interleave c1, d1
   330         movdqa      xmm1,       xmm5        ; d1
   331         punpcklwd   xmm1,       xmm4        ; c1 d1
   332         punpckhwd   xmm5,       xmm4        ; c1 d1
   334         movdqa      xmm3,       xmm1
   335         movdqa      xmm4,       xmm5
   337         pmaddwd     xmm1,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
   338         pmaddwd     xmm4,       XMMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
   340         pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
   341         pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
   343         paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
   344         paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
   345         paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
   346         paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]
   348         psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
   349         psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
   350         psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
   351         psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
   353         packssdw    xmm1,       xmm4        ; op[4]
   354         packssdw    xmm3,       xmm5        ; op[12]
   356         paddw       xmm1,       xmm6        ; op[4] += (d1!=0)
   358         movdqa      xmm4,       xmm0
   359         movdqa      xmm5,       xmm2
   361         punpcklqdq  xmm0,       xmm1        ; columns 0..3: op[0..3]  op[4..7]
   362         punpckhqdq  xmm4,       xmm1        ; columns 4..7: op[0..3]  op[4..7]
   364         punpcklqdq  xmm2,       xmm3        ; columns 0..3: op[8..11] op[12..15]
   365         punpckhqdq  xmm5,       xmm3        ; columns 4..7: op[8..11] op[12..15]
   367         movdqa      XMMWORD PTR[output + 0 ],  xmm0
   368         movdqa      XMMWORD PTR[output + 16],  xmm2
   369         movdqa      XMMWORD PTR[output + 32],  xmm4
   370         movdqa      XMMWORD PTR[output + 48],  xmm5
   372     STACK_FRAME_DESTROY
   374 SECTION_RODATA
       ; Read-only 16-byte-aligned constants used as pmaddwd / paddd / paddw /
       ; pandn memory operands by the two transforms above.

       ; pmaddwd pair (5352, 2217): yields d1*5352 + c1*2217 per dword
   375 align 16
   376 _5352_2217:
   377     dw 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217

       ; pmaddwd pair (2217, -5352): yields d1*2217 - c1*5352 per dword
   385 align 16
   386 _2217_neg5352:
   387     dw 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352

       ; pmaddwd pair (1, 1): yields a1 + b1 per dword
   395 align 16
   396 _mult_add:
   397     dw 1, 1, 1, 1, 1, 1, 1, 1

       ; 4x4 routine: bit 0 set in the low four words only
   398 align 16
   399 _cmp_mask:
   400     dw 1, 1, 1, 1, 0, 0, 0, 0

       ; 8x4 routine: bit 0 set in all eight words
   402 align 16
   403 _cmp_mask8x4:
   404     dw 1, 1, 1, 1, 1, 1, 1, 1

       ; pmaddwd pair (1, -1): yields a1 - b1 per dword
   405 align 16
   406 _mult_sub:
   407     dw 1, -1, 1, -1, 1, -1, 1, -1

       ; second-pass rounding for the even outputs, dword and word forms
   415 align 16
   416 _7:
   417     dd 7, 7, 7, 7
   418 align 16
   419 _7w:
   420     dw 7, 7, 7, 7, 7, 7, 7, 7

       ; first-pass rounding for the odd outputs (applied before >>12)
   421 align 16
   422 _14500:
   423     dd 14500, 14500, 14500, 14500
   424 align 16
   425 _7500:
   426     dd 7500, 7500, 7500, 7500

       ; second-pass rounding for the odd outputs (applied before >>16)
   427 align 16
   428 _12000:
   429     dd 12000, 12000, 12000, 12000
   430 align 16
   431 _51000:
   432     dd 51000, 51000, 51000, 51000

mercurial