media/libvpx/vp8/encoder/x86/quantize_sse4.asm

author      Michael Schloh von Bennewitz <michael@schloh.com>
date        Thu, 22 Jan 2015 13:21:57 +0100
branch      TOR_BUG_9701
changeset   15:b8a032363ba2
permissions -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%include "vp8_asm_enc_offsets.asm"


; void vp8_regular_quantize_b_sse4 | arg
;  (BLOCK  *b,                     |  0
;   BLOCKD *d)                     |  1

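; Vectorized VP8 regular (non-fast) quantizer. Roughly, per coefficient in
; zigzag scan order, this routine computes the logic sketched below; the
; authoritative scalar version is vp8_regular_quantize_b_c in
; vp8/encoder/quantize.c, and this sketch only approximates it (names follow
; the BLOCK/BLOCKD fields):
;
;   eob = -1;
;   for (i = 0; i < 16; i++) {
;       rc = vp8_default_zig_zag1d[i];
;       z  = coeff[rc];
;       sz = z >> 15;                         /* word-wise sign mask  */
;       x  = (z ^ sz) - sz;                   /* abs(z)               */
;       if (x - (zbin[rc] + *boost++ + zbin_extra) >= 0) {
;           x += round[rc];
;           y  = (((x * quant[rc]) >> 16) + x) >> quant_shift[rc];
;           if (y) {
;               qcoeff[rc] = (y ^ sz) - sz;   /* restore the sign     */
;               eob   = i;
;               boost = zrun_zbin_boost;      /* reset the zero run   */
;           }
;       }
;   }
;   dqcoeff[rc] = qcoeff[rc] * dequant[rc];   /* for all 16 rc        */
;   *d->eob = (char)(eob + 1);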
global sym(vp8_regular_quantize_b_sse4) PRIVATE
sym(vp8_regular_quantize_b_sse4):

%if ABI_IS_32BIT
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rdi
    push        rsi

    ALIGN_STACK 16, rax
    %define     qcoeff 0 ; 32
    %define     stack_size 32
    sub         rsp, stack_size
%else
%if LIBVPX_YASM_WIN64
    SAVE_XMM 8, u
    push        rdi
    push        rsi
%endif
%endif
    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
%if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
%else
    ;mov        rdi, rdi                    ; BLOCK *b
    ;mov        rsi, rsi                    ; BLOCKD *d
%endif
%endif

    mov         rax, [rdi + vp8_block_coeff]
    mov         rcx, [rdi + vp8_block_zbin]
    mov         rdx, [rdi + vp8_block_round]
    movd        xmm7, [rdi + vp8_block_zbin_extra]

    ; z
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax + 16]

    ; duplicate zbin_oq_value
    pshuflw     xmm7, xmm7, 0
    punpcklwd   xmm7, xmm7
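    ; xmm7 now holds zbin_oq_value in all eight word lanes: pshuflw copies
    ; the low word across the low quadword, punpcklwd doubles it into the
    ; high quadword.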

    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    ; sz
    psraw       xmm0, 15
    psraw       xmm1, 15

    ; (z ^ sz)
    pxor        xmm2, xmm0
    pxor        xmm3, xmm1

    ; x = abs(z)
    psubw       xmm2, xmm0
    psubw       xmm3, xmm1
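    ; Branchless absolute value: psraw 15 turns each word into 0 (positive)
    ; or -1 (negative), so (z ^ sz) - sz leaves z unchanged or negates it.
    ; xmm0/xmm1 keep the sign masks for the sign-restore step further down.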

    ; zbin
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; *zbin_ptr + zbin_oq_value
    paddw       xmm4, xmm7
    paddw       xmm5, xmm7

    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm3

    ; x - (*zbin_ptr + zbin_oq_value)
    psubw       xmm6, xmm4
    psubw       xmm7, xmm5
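    ; xmm6/xmm7 now hold x minus the static part of the dead-zone threshold;
    ; the per-coefficient zrun_zbin_boost term is subtracted scalar-wise in
    ; ZIGZAG_LOOP below, where the x >= zbin decision is actually made.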

    ; round
    movdqa      xmm4, [rdx]
    movdqa      xmm5, [rdx + 16]

    mov         rax, [rdi + vp8_block_quant_shift]
    mov         rcx, [rdi + vp8_block_quant]
    mov         rdx, [rdi + vp8_block_zrun_zbin_boost]

    ; x + round
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; quant
    movdqa      xmm4, [rcx]
    movdqa      xmm5, [rcx + 16]

    ; y = x * quant_ptr >> 16
    pmulhw      xmm4, xmm2
    pmulhw      xmm5, xmm3

    ; y += x
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
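    ; pmulhw keeps the high 16 bits of each signed product, so xmm2/xmm3 now
    ; hold ((x * quant) >> 16) + x for all sixteen coefficients. The final
    ; downshift by quant_shift[rc] varies per lane, which no single SSE word
    ; shift can do, so it is deferred to the scalar ZIGZAG_LOOP below.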

    pxor        xmm4, xmm4
%if ABI_IS_32BIT
    movdqa      [rsp + qcoeff], xmm4
    movdqa      [rsp + qcoeff + 16], xmm4
%else
    pxor        xmm8, xmm8
%endif
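    ; Quantized values accumulate in zeroed scratch: a 32-byte stack buffer
    ; on 32-bit targets (only xmm0-xmm7 exist there), or the xmm4/xmm8
    ; register pair on 64-bit targets.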

    ; quant_shift
    movdqa      xmm5, [rax]

    ; zrun_zbin_boost
    mov         rax, rdx

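; ZIGZAG_LOOP handles one coefficient, scalar-wise, in zigzag scan order:
;   %1 = rc, the raster index of the coefficient (also the label suffix and
;        the byte lane of quant_shift in xmm5)
;   %2 = rc & 7, the word lane within the xmm registers below
;   %3 = register holding ((x * quant) >> 16) + x  (xmm2 low half, xmm3 high)
;   %4 = register holding x - (zbin + zbin_extra)  (xmm6 low half, xmm7 high)
;   %5 = qcoeff destination register on 64-bit targets (xmm4 low, xmm8 high)
; Note the single pextrb per coefficient assumes quant_shift is stored as a
; 16-entry byte array.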
%macro ZIGZAG_LOOP 5
    ; x
    pextrw      ecx, %4, %2

    ; if (x >= zbin)
    sub         cx, WORD PTR[rdx]           ; x - zbin
    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
    jl          .rq_zigzag_loop_%1          ; x < zbin

    pextrw      edi, %3, %2                 ; y

    ; downshift by quant_shift[rc]
    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
    sar         edi, cl                     ; also sets Z bit
    je          .rq_zigzag_loop_%1          ; !y
%if ABI_IS_32BIT
    mov         WORD PTR[rsp + qcoeff + %1 * 2], di
%else
    pinsrw      %5, edi, %2                 ; qcoeff[rc]
%endif
    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
.rq_zigzag_loop_%1:
%endmacro
; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8

    mov         rcx, [rsi + vp8_blockd_dequant]
    mov         rdi, [rsi + vp8_blockd_dqcoeff]

%if ABI_IS_32BIT
    movdqa      xmm4, [rsp + qcoeff]
    movdqa      xmm5, [rsp + qcoeff + 16]
%else
    %define     xmm5 xmm8
%endif
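    ; Alias xmm5 to xmm8 on 64-bit targets so the sign-restore and store
    ; code below is shared by both ABI paths (xmm4/xmm5 = qcoeff halves).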

    ; y ^ sz
    pxor        xmm4, xmm0
    pxor        xmm5, xmm1
    ; x = (y ^ sz) - sz
    psubw       xmm4, xmm0
    psubw       xmm5, xmm1

    ; dequant
    movdqa      xmm0, [rcx]
    movdqa      xmm1, [rcx + 16]

    mov         rcx, [rsi + vp8_blockd_qcoeff]

    pmullw      xmm0, xmm4
    pmullw      xmm1, xmm5

    ; store qcoeff
    movdqa      [rcx], xmm4
    movdqa      [rcx + 16], xmm5

    ; store dqcoeff
    movdqa      [rdi], xmm0
    movdqa      [rdi + 16], xmm1

    mov         rcx, [rsi + vp8_blockd_eob]

    ; select the last value (in zig_zag order) for EOB
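    ; Branchless, roughly:
    ;   mask = ~pmovmskb(pshufb(packsswb(qcoeff_lo == 0, qcoeff_hi == 0),
    ;                           zig_zag1d))
    ; Bit i of mask is set iff the coefficient at zigzag position i is
    ; nonzero, so bsr(mask) + 1 is the EOB. bsr leaves its destination
    ; undefined for a zero source, so the sub/sar pair builds an all-ones
    ; (some coefficient nonzero) or all-zero mask to force EOB to 0 when
    ; every coefficient is zero.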
    pxor        xmm6, xmm6
    pcmpeqw     xmm4, xmm6
    pcmpeqw     xmm5, xmm6

    packsswb    xmm4, xmm5
    pshufb      xmm4, [GLOBAL(zig_zag1d)]
    pmovmskb    edx, xmm4
    xor         rdi, rdi
    mov         eax, -1
    xor         dx, ax
    bsr         eax, edx
    sub         edi, edx
    sar         edi, 31
    add         eax, 1
    and         eax, edi

    mov         BYTE PTR [rcx], al          ; store eob

    ; begin epilog
%if ABI_IS_32BIT
    add         rsp, stack_size
    pop         rsp

    pop         rsi
    pop         rdi
    RESTORE_GOT
    pop         rbp
%else
    %undef xmm5
%if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
    RESTORE_XMM
%endif
%endif

    ret

SECTION_RODATA
align 16
; vp8/common/entropy.c: vp8_default_zig_zag1d
zig_zag1d:
    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
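; Control bytes for the pshufb above: output byte i takes the zero-flag of
; coefficient vp8_default_zig_zag1d[i], putting the pmovmskb bits into
; zigzag scan order.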
