media/libvpx/vp9/encoder/x86/vp9_error_sse2.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp9/encoder/x86/vp9_error_sse2.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,74 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +%include "third_party/x86inc/x86inc.asm"
    1.15 +
    1.16 +SECTION .text
    1.17 +
    1.18 +; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
    1.19 +;                         int64_t *ssz)
    1.20 +; Returns sum((dqcoeff[i]-coeff[i])^2) and stores sum(coeff[i]^2) to *ssz.
    1.21 +INIT_XMM sse2                      ; NOTE(review): presumably selects 128-bit XMM regs (mmsize bytes each) - see x86inc
    1.22 +cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz ; NOTE(review): per x86inc cglobal - 3 args loaded, 3 GPRs, 8 vector regs; ssz is named only
    1.23 +  pxor      m4, m4                 ; m4 = 64-bit lane accumulators for sum((dqc-uqc)^2), the return value
    1.24 +  pxor      m6, m6                 ; m6 = 64-bit lane accumulators for sum(uqc^2), written to *ssz
    1.25 +  pxor      m5, m5                 ; m5 = all zeros, interleaved below to zero-extend dwords to qwords
    1.26 +  lea     uqcq, [uqcq+sizeq*2]     ; uqc -> one past the last int16 coefficient
    1.27 +  lea     dqcq, [dqcq+sizeq*2]     ; dqc -> one past the last int16 coefficient
    1.28 +  neg    sizeq                     ; index now runs from -size up towards 0
    1.29 +.loop:                             ; each pass consumes 2*mmsize bytes (= mmsize int16 values) per buffer
    1.30 +  mova      m2, [uqcq+sizeq*2]     ; NOTE(review): mova presumably needs aligned buffers - confirm with callers
    1.31 +  mova      m0, [dqcq+sizeq*2]
    1.32 +  mova      m3, [uqcq+sizeq*2+mmsize]
    1.33 +  mova      m1, [dqcq+sizeq*2+mmsize]
    1.34 +  psubw     m0, m2                 ; m0 = dqc - uqc (per-coefficient error, first half)
    1.35 +  psubw     m1, m3                 ; m1 = dqc - uqc (second half)
    1.36 +  ; individual errors are assumed to be at most 15bit+sign, so squares are 30bit,
    1.37 +  ; and thus the sum of 2 squares fits in a 31bit integer (+ unused sign bit)
    1.38 +  pmaddwd   m0, m0                 ; each dword = sum of two adjacent squared errors
    1.39 +  pmaddwd   m1, m1
    1.40 +  pmaddwd   m2, m2                 ; each dword = sum of two adjacent squared coefficients
    1.41 +  pmaddwd   m3, m3
    1.42 +  ; widen each dword to 64 bits by interleaving with zero (m5), then accumulate
    1.43 +  punpckldq m7, m0, m5             ; m7 = low two dwords of m0, zero-extended to qwords
    1.44 +  punpckhdq m0, m5                 ; m0 = high two dwords of m0, zero-extended to qwords
    1.45 +  paddq     m4, m7
    1.46 +  punpckldq m7, m1, m5
    1.47 +  paddq     m4, m0
    1.48 +  punpckhdq m1, m5
    1.49 +  paddq     m4, m7
    1.50 +  punpckldq m7, m2, m5             ; from here on the widened values are coefficient squares
    1.51 +  paddq     m4, m1                 ; last error term of this pass
    1.52 +  punpckhdq m2, m5
    1.53 +  paddq     m6, m7
    1.54 +  punpckldq m7, m3, m5
    1.55 +  paddq     m6, m2
    1.56 +  punpckhdq m3, m5
    1.57 +  paddq     m6, m7
    1.58 +  paddq     m6, m3
    1.59 +  add    sizeq, mmsize             ; advance by the mmsize coefficients just consumed
    1.60 +  jl .loop                         ; repeat while index < 0; body runs at least once - NOTE(review): size presumably a positive multiple of mmsize
    1.61 +
    1.62 +  ; fold the two 64-bit lanes of each accumulator into its low qword
    1.63 +  movhlps   m5, m4                 ; m5 low qword = m4 high qword (m5 is no longer needed as zero)
    1.64 +  movhlps   m7, m6                 ; m7 low qword = m6 high qword
    1.65 +  paddq     m4, m5                 ; m4 low qword = total squared error
    1.66 +  paddq     m6, m7                 ; m6 low qword = total squared coefficients
    1.67 +%if ARCH_X86_64
    1.68 +  movq    rax, m4                  ; 64-bit return value in rax
    1.69 +  movq [sszq], m6                  ; NOTE(review): ssz is not among the 3 loaded args; presumably still in its ABI argument register - confirm against x86inc
    1.70 +%else
    1.71 +  mov     eax, sszm                ; fetch the ssz pointer via its memory operand form (sszm)
    1.72 +  pshufd   m5, m4, 0x1             ; m5 dword 0 = m4 dword 1 (upper half of the 64-bit sum)
    1.73 +  movq  [eax], m6                  ; *ssz = total squared coefficients
    1.74 +  movd    eax, m4                  ; return value low 32 bits
    1.75 +  movd    edx, m5                  ; return value high 32 bits (edx:eax pair)
    1.76 +%endif
    1.77 +  RET

mercurial