media/libvpx/vp9/encoder/x86/vp9_error_sse2.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI.
This solves Tor bug #9701 and complies with the disk-avoidance requirements
documented in https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    11 %include "third_party/x86inc/x86inc.asm"
    13 SECTION .text
    15 ; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
    16 ;                         int64_t *ssz)
       ;
       ; Returns the sum over the block of (dqcoeff[i] - coeff[i])^2 and stores
       ; the sum of coeff[i]^2 through ssz. Both sums are accumulated in 64 bits.
       ;
       ; Register roles inside the loop:
       ;   m0/m1 = dqcoeff lanes, then errors, then pairwise squared-error sums
       ;   m2/m3 = coeff lanes, then pairwise sums of squares
       ;   m4    = error-sum accumulator (2 x 64-bit)
       ;   m6    = coeff-energy accumulator (2 x 64-bit)
       ;   m5    = zero (used to widen 32 -> 64 bit); reused as scratch after the loop
       ;   m7    = scratch
       ;
       ; NOTE(review): each pass reads 2*mmsize bytes from both buffers and the
       ; body executes before the first loop test, so block_size is presumably a
       ; non-zero multiple of 16 coefficients -- confirm with callers.
       ; NOTE(review): mova is the aligned-move form, so both buffers are
       ; presumably required to be 16-byte aligned -- confirm with callers.
    18 INIT_XMM sse2
       ; cglobal arguments (per x86inc): 3 args loaded into registers, 3 GPRs,
       ; 8 XMM registers used, followed by the argument names. ssz is named but is
       ; not one of the 3 loaded arguments; see the epilogue for how it is reached.
    19 cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
    20   pxor      m4, m4                 ; sse accumulator
    21   pxor      m6, m6                 ; ssz accumulator
    22   pxor      m5, m5                 ; dedicated zero register
    23   lea     uqcq, [uqcq+sizeq*2]     ; uqc = one past the last coeff (2 bytes each)
    24   lea     dqcq, [dqcq+sizeq*2]     ; dqc = one past the last dqcoeff
    25   neg    sizeq                     ; index runs from -block_size up to 0
    26 .loop:
    27   mova      m2, [uqcq+sizeq*2]             ; m2 = first register of coeff
    28   mova      m0, [dqcq+sizeq*2]             ; m0 = first register of dqcoeff
    29   mova      m3, [uqcq+sizeq*2+mmsize]      ; m3 = second register of coeff
    30   mova      m1, [dqcq+sizeq*2+mmsize]      ; m1 = second register of dqcoeff
    31   psubw     m0, m2                 ; m0 = dqcoeff - coeff (16-bit lanes)
    32   psubw     m1, m3                 ; m1 = dqcoeff - coeff (16-bit lanes)
    33   ; individual errors are max. 15bit+sign, so squares are 30bit, and
    34   ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
    35   pmaddwd   m0, m0                 ; square errors, add adjacent pairs -> 32-bit lanes
    36   pmaddwd   m1, m1
    37   pmaddwd   m2, m2                 ; same for the coeff values themselves
    38   pmaddwd   m3, m3
    39   ; accumulate in 64bit
       ; Each 32-bit lane is interleaved with the zero register m5, which
       ; zero-extends it to 64 bits before the paddq. The unpack/add pairs of the
       ; four inputs are interleaved with each other; m7 holds the low halves.
    40   punpckldq m7, m0, m5             ; m7 = low two dwords of m0, widened
    41   punpckhdq m0, m5                 ; m0 = high two dwords of m0, widened
    42   paddq     m4, m7                 ; sse += low half of m0
    43   punpckldq m7, m1, m5             ; m7 = low two dwords of m1, widened
    44   paddq     m4, m0                 ; sse += high half of m0
    45   punpckhdq m1, m5                 ; m1 = high two dwords of m1, widened
    46   paddq     m4, m7                 ; sse += low half of m1
    47   punpckldq m7, m2, m5             ; m7 = low two dwords of m2, widened
    48   paddq     m4, m1                 ; sse += high half of m1
    49   punpckhdq m2, m5                 ; m2 = high two dwords of m2, widened
    50   paddq     m6, m7                 ; ssz += low half of m2
    51   punpckldq m7, m3, m5             ; m7 = low two dwords of m3, widened
    52   paddq     m6, m2                 ; ssz += high half of m2
    53   punpckhdq m3, m5                 ; m3 = high two dwords of m3, widened
    54   paddq     m6, m7                 ; ssz += low half of m3
    55   paddq     m6, m3                 ; ssz += high half of m3
    56   add    sizeq, mmsize             ; advance by mmsize coefficients (= 2*mmsize bytes)
    57   jl .loop                         ; continue while the index is still negative
    59   ; accumulate horizontally and store in return value
       ; m5 no longer needs to be zero here, so it and m7 are reused as scratch.
    60   movhlps   m5, m4                 ; m5 low qword = high qword of sse
    61   movhlps   m7, m6                 ; m7 low qword = high qword of ssz
    62   paddq     m4, m5                 ; low qword of m4 = total sse
    63   paddq     m6, m7                 ; low qword of m6 = total ssz
    64 %if ARCH_X86_64
       ; 64-bit: the sum is returned in rax.
       ; NOTE(review): sszq is used although only 3 args were loaded by cglobal;
       ; presumably it aliases the register that already holds the 4th argument
       ; under the 64-bit calling conventions -- confirm against x86inc.
    65   movq    rax, m4
    66   movq [sszq], m6
    67 %else
       ; 32-bit: the 64-bit sum is returned in edx:eax. eax first serves as a
       ; temporary for the ssz pointer, read from its argument slot (sszm).
    68   mov     eax, sszm
    69   pshufd   m5, m4, 0x1             ; m5 dword 0 = bits 63:32 of the total sse
    70   movq  [eax], m6                  ; *ssz = total ssz
    71   movd    eax, m4                  ; eax = low 32 bits of the return value
    72   movd    edx, m5                  ; edx = high 32 bits of the return value
    73 %endif
    74   RET

mercurial