media/libvpx/vp8/encoder/x86/quantize_ssse3.asm

changeset 6474c204b198 (Wed Dec 31 06:09:35 2014 +0100)
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%include "vp8_asm_enc_offsets.asm"


; void vp8_fast_quantize_b_ssse3 | arg
;  (BLOCK  *b,                   |  0
;   BLOCKD *d)                   |  1
;

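; Rough scalar sketch of what this routine computes, for reference only
; (field names follow the BLOCK/BLOCKD offsets used below; the SIMD code
; handles all 16 coefficients of the 4x4 block at once instead of looping):
;
;   for each coefficient z:
;       sz = z >> 15                          ; 0 or -1 (sign mask)
;       x  = abs(z)
;       y  = ((x + round) * quant_fast) >> 16
;       qcoeff  = (y ^ sz) - sz               ; reapply the sign of z
;       dqcoeff = qcoeff * dequant
;   eob = 1 + zig-zag position of the last nonzero y, or 0 if all are zero
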
global sym(vp8_fast_quantize_b_ssse3) PRIVATE
sym(vp8_fast_quantize_b_ssse3):
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx

%if ABI_IS_32BIT
    push        rdi
    push        rsi
%else
  %if LIBVPX_YASM_WIN64
    push        rdi
    push        rsi
  %endif
%endif
    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ; SysV AMD64 ABI: the arguments already arrive in rdi and rsi
    ;mov         rdi, rdi                    ; BLOCK *b
    ;mov         rsi, rsi                    ; BLOCKD *d
  %endif
%endif

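    ; load pointers to the BLOCK input arrays (offsets come from
    ; vp8_asm_enc_offsets.asm)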
    mov         rax, [rdi + vp8_block_coeff]
    mov         rcx, [rdi + vp8_block_round]
    mov         rdx, [rdi + vp8_block_quant_fast]

    ; coeff
    movdqa      xmm0, [rax]
    movdqa      xmm4, [rax + 16]

    ; round
    movdqa      xmm2, [rcx]
    movdqa      xmm3, [rcx + 16]

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz = z >> 15
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; x = abs(z)
    pabsw       xmm1, xmm1
    pabsw       xmm5, xmm5

    ; x + round
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    ; quant_fast
    pmulhw      xmm1, [rdx]
    pmulhw      xmm5, [rdx + 16]

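    ; the BLOCK pointers are no longer needed; reuse rax/rdi/rcx for the
    ; BLOCKD output arrays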
    mov         rax, [rsi + vp8_blockd_qcoeff]
    mov         rdi, [rsi + vp8_blockd_dequant]
    mov         rcx, [rsi + vp8_blockd_dqcoeff]

    movdqa      xmm2, xmm1                  ;store y for getting eob
    movdqa      xmm3, xmm5

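    ; reapply the sign: (y ^ sz) - sz negates y wherever sz == -1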
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    movdqa      [rax], xmm1                 ;store qcoeff
    movdqa      [rax + 16], xmm5

    ; dequant
    movdqa      xmm0, [rdi]
    movdqa      xmm4, [rdi + 16]

    pmullw      xmm0, xmm1                  ;dqcoeff = qcoeff * dequant
    pmullw      xmm4, xmm5
    pxor        xmm1, xmm1                  ;zero, for the eob compare below

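    ; eob: compare y against zero to flag nonzero coefficients, pack the
    ; 16 word masks to bytes, shuffle them into zig-zag scan order and
    ; collect the byte mask so bsr finds the last nonzero coefficient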
    pcmpgtw     xmm2, xmm1                  ;calculate eob
    pcmpgtw     xmm3, xmm1
    packsswb    xmm2, xmm3
    pshufb      xmm2, [GLOBAL(zz_shuf)]

    pmovmskb    edx, xmm2

    movdqa      [rcx], xmm0                 ;store dqcoeff
    movdqa      [rcx + 16], xmm4            ;store dqcoeff
    mov         rcx, [rsi + vp8_blockd_eob]

    bsr         eax, edx                    ;index of last nonzero coeff (zig-zag order)
    add         eax, 1                      ;eob is one past that index

    cmp         edx, 0                      ;if all coeffs are zero, eob = 0
    cmove       eax, edx

    mov         BYTE PTR [rcx], al          ;store eob

    ; begin epilog
%if ABI_IS_32BIT
    pop         rsi
    pop         rdi
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
  %endif
%endif

    RESTORE_GOT
    pop         rbp
    ret

SECTION_RODATA
align 16
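; byte shuffle that reorders the 16 per-coefficient nonzero flags from
; raster order into the VP8 zig-zag scan order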
zz_shuf:
    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
