;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license and patent
; grant that can be found in the LICENSE file in the root of the source
; tree. All contributing project authors may be found in the AUTHORS
; file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%include "vp8_asm_enc_offsets.asm"


;-----------------------------------------------------------------------------
; void vp8_regular_quantize_b_sse4          | arg
;  (BLOCK  *b,                              |  0
;   BLOCKD *d)                              |  1
;
; SSE4.1 implementation of the VP8 "regular" (zbin/boost) quantizer for one
; 4x4 block of 16 coefficients.  For each coefficient z, in zig-zag order:
;   x = abs(z) + round;  if x < zbin (+ boost + zbin_extra) the coefficient
;   is zeroed, otherwise y = ((x * quant) >> 16 + x) >> quant_shift, with the
;   sign of z restored.  qcoeff, dqcoeff (= qcoeff * dequant) and the EOB
;   (index of last nonzero coefficient, in zig-zag order, +1) are written
;   back through BLOCKD *d.
;
; Assembled for three ABIs, selected at build time:
;   ABI_IS_32BIT      - args on stack, qcoeff accumulated in a stack buffer
;                       (no xmm8 available)
;   LIBVPX_YASM_WIN64 - args in rcx/rdx; rdi/rsi/xmm6-8 are callee-saved and
;                       must be preserved (SAVE_XMM / push)
;   SysV x86-64       - args already in rdi/rsi; nothing extra to save
;
; Requires SSE4.1 (pextrb).  All table loads (coeff/zbin/round/quant/...)
; use movdqa, i.e. the BLOCK/BLOCKD arrays are assumed 16-byte aligned
; (guaranteed by the C-side struct layout -- see vp8_asm_enc_offsets).
;-----------------------------------------------------------------------------

global sym(vp8_regular_quantize_b_sse4) PRIVATE
sym(vp8_regular_quantize_b_sse4):

%if ABI_IS_32BIT
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx                         ; GOT pointer for GLOBAL() data refs
    push        rdi
    push        rsi

    ALIGN_STACK 16, rax                     ; saves old rsp; restored by 'pop rsp'
    %define     qcoeff      0 ; 32          ; 32-byte stack buffer for 16 int16 qcoeffs
    %define     stack_size 32
    sub         rsp, stack_size
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 8, u                           ; xmm6-xmm8 are callee-saved on Win64
    push        rdi                         ; rdi/rsi are callee-saved on Win64
    push        rsi
  %endif
%endif
    ; end prolog

    ; Normalize the two pointer args into rdi/rsi for all ABIs.
%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ;mov        rdi, rdi                    ; BLOCK *b (already in place, SysV)
    ;mov        rsi, rsi                    ; BLOCKD *d (already in place, SysV)
  %endif
%endif

    mov         rax, [rdi + vp8_block_coeff]
    mov         rcx, [rdi + vp8_block_zbin]
    mov         rdx, [rdi + vp8_block_round]
    movd        xmm7, [rdi + vp8_block_zbin_extra]

    ; z: the 16 input coefficients, 8 int16 per register
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax + 16]

    ; duplicate zbin_oq_value (zbin_extra) into all 8 lanes of xmm7
    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7

    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    ; sz: arithmetic >>15 gives 0x0000 for z >= 0, 0xffff for z < 0
    psraw       xmm0, 15
    psraw       xmm1, 15

    ; (z ^ sz)
    pxor        xmm2, xmm0
    pxor        xmm3, xmm1

    ; x = abs(z)   (two's complement: (z ^ sz) - sz)
    psubw       xmm2, xmm0
    psubw       xmm3, xmm1

    ; zbin
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; *zbin_ptr + zbin_oq_value
    paddw       xmm4, xmm7
    paddw       xmm5, xmm7

    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm3

    ; x - (*zbin_ptr + zbin_oq_value): sign of each lane is the per-lane
    ; zbin test; the per-coefficient zrun boost is applied scalar, in the
    ; zig-zag loop below
    psubw       xmm6, xmm4
    psubw       xmm7, xmm5

    ; round
    movdqa      xmm4, [rdx]
    movdqa      xmm5, [rdx + 16]

    ; rcx/rdx/rax are free again: load the remaining table pointers
    mov         rax, [rdi + vp8_block_quant_shift]
    mov         rcx, [rdi + vp8_block_quant]
    mov         rdx, [rdi + vp8_block_zrun_zbin_boost]

    ; x + round
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; quant
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; y = x * quant_ptr >> 16   (pmulhw keeps the high 16 bits)
    pmulhw      xmm4, xmm2
    pmulhw      xmm5, xmm3

    ; y += x   (xmm2/xmm3 now hold y before the quant_shift downshift)
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; zero the qcoeff accumulators; surviving lanes are filled in the loop
    pxor        xmm4, xmm4
%if ABI_IS_32BIT
    movdqa      [rsp + qcoeff], xmm4
    movdqa      [rsp + qcoeff + 16], xmm4
%else
    pxor        xmm8, xmm8
%endif

    ; quant_shift: 16 bytes, one shift count per coefficient (indexed by rc)
    movdqa      xmm5, [rax]

    ; keep b->zrun_zbin_boost base in rax so rdx can be reset after each
    ; nonzero coefficient (boost depends on the run of zeros since the last
    ; nonzero value)
    mov         rax, rdx

; ZIGZAG_LOOP rc, lane, y_reg, xminus_reg, qcoeff_reg
;   %1 = rc    : raster index of this coefficient (0..15), selects
;                quant_shift byte, label suffix, and stack slot
;   %2 = lane  : word lane (0..7) of rc within its xmm register pair
;   %3 = y_reg : xmm2 (coeffs 0-7) or xmm3 (coeffs 8-15), pre-shift y
;   %4 = xminus: xmm6/xmm7, x - (zbin + zbin_extra)
;   %5 = qcoeff: xmm4/xmm8, output accumulator (64-bit ABIs only)
; Scalar per-coefficient tail: applies the zero-run boost, the per-rc
; quant_shift, and records the quantized value.  Clobbers rcx, rdi, rdx.
%macro ZIGZAG_LOOP 5
    ; x
    pextrw      ecx, %4, %2
    ; if (x >= zbin)
    sub         cx, WORD PTR[rdx]           ; x - zbin (zbin here = zrun boost)
    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
    jl          .rq_zigzag_loop_%1          ; x < zbin
    pextrw      edi, %3, %2                 ; y
    ; downshift by quant_shift[rc]
    pextrb      ecx, xmm5, %1               ; quant_shift[rc]  (SSE4.1)
    sar         edi, cl                     ; also sets Z bit
    je          .rq_zigzag_loop_%1          ; !y -- quantized to zero, boost keeps running
%if ABI_IS_32BIT
    mov         WORD PTR[rsp + qcoeff + %1 *2], di
%else
    pinsrw      %5, edi, %2                 ; qcoeff[rc]
%endif
    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
.rq_zigzag_loop_%1:
%endmacro
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8

    mov         rcx, [rsi + vp8_blockd_dequant]
    mov         rdi, [rsi + vp8_blockd_dqcoeff]

    ; reload/alias the qcoeff accumulators into xmm4/xmm5
%if ABI_IS_32BIT
    movdqa      xmm4, [rsp + qcoeff]
    movdqa      xmm5, [rsp + qcoeff + 16]
%else
    %define     xmm5 xmm8                   ; text alias; %undef'd in epilog
%endif

    ; y ^ sz   (xmm0/xmm1 still hold the sign masks from the top)
    pxor        xmm4, xmm0
    pxor        xmm5, xmm1
    ; x = (y ^ sz) - sz  -- restore the original sign of z
    psubw       xmm4, xmm0
    psubw       xmm5, xmm1

    ; dequant
    movdqa      xmm0, [rcx]
    movdqa      xmm1, [rcx + 16]

    mov         rcx, [rsi + vp8_blockd_qcoeff]

    ; dqcoeff = qcoeff * dequant (low 16 bits)
    pmullw      xmm0, xmm4
    pmullw      xmm1, xmm5

    ; store qcoeff
    movdqa      [rcx], xmm4
    movdqa      [rcx + 16], xmm5

    ; store dqcoeff
    movdqa      [rdi], xmm0
    movdqa      [rdi + 16], xmm1

    mov         rcx, [rsi + vp8_blockd_eob]

    ; select the last value (in zig_zag order) for EOB
    ; Build a per-coefficient zero/nonzero byte mask, permute it into
    ; zig-zag order, then compute eob branchlessly.
    pxor        xmm6, xmm6
    pcmpeqw     xmm4, xmm6                  ; 0xffff where qcoeff == 0
    pcmpeqw     xmm5, xmm6

    packsswb    xmm4, xmm5                  ; 16 bytes: 0xff = zero coeff
    pshufb      xmm4, [GLOBAL(zig_zag1d)]   ; reorder bytes into zig-zag order
    pmovmskb    edx, xmm4                   ; bit i set  -> coeff i (zigzag) == 0
    xor         rdi, rdi
    mov         eax, -1
    xor         dx, ax                      ; invert: bit i set -> coeff i != 0
    bsr         eax, edx                    ; index of last nonzero (undefined if dx==0)
    sub         edi, edx                    ; edi = -edx
    sar         edi, 31                     ; edi = (edx != 0) ? -1 : 0
    add         eax, 1                      ; eob = last index + 1
    and         eax, edi                    ; force eob = 0 when all coeffs are zero

    mov         BYTE PTR [rcx], al          ; store eob

    ; begin epilog
%if ABI_IS_32BIT
    add         rsp, stack_size
    pop         rsp                         ; restore pre-ALIGN_STACK stack pointer

    pop         rsi
    pop         rdi
    RESTORE_GOT
    pop         rbp
%else
    %undef xmm5                             ; drop the xmm8 alias
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
    RESTORE_XMM
  %endif
%endif

    ret

SECTION_RODATA
align 16
; vp8/common/entropy.c: vp8_default_zig_zag1d
; pshufb control: byte i of the result takes source byte zig_zag1d[i]
zig_zag1d:
    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15