media/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,218 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +%include "third_party/x86inc/x86inc.asm"
    1.15 +
    1.16 +SECTION_RODATA
    1.17 +pw_1: dw 1, 1, 1, 1, 1, 1, 1, 1                             ; eight 16-bit words, each 1 (16 bytes)
    1.18 +
    1.19 +SECTION .text
    1.20 +
    1.21 +%macro QUANTIZE_FN 2 ; %1 = function name suffix (b or b_32x32), %2 = GPR count passed to cglobal
    1.22 +cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
    1.23 +                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
    1.24 +                                eob, scan, iscan
    1.25 +  cmp                    dword skipm, 0                    ; skip != 0: only zero the outputs
    1.26 +  jne .blank
    1.27 +
    1.28 +  ; actual quantize loop - setup pointers, rounders, etc.
    1.29 +  movifnidn                   coeffq, coeffmp
    1.30 +  movifnidn                  ncoeffq, ncoeffmp
    1.31 +  mov                             r2, dequantmp            ; r2 = dequant pointer (r2 is scratch here)
    1.32 +  movifnidn                    zbinq, zbinmp
    1.33 +  movifnidn                   roundq, roundmp
    1.34 +  movifnidn                   quantq, quantmp
    1.35 +  movd                            m4, dword zbin_oqm       ; m4 = zbin_oq
    1.36 +  mova                            m0, [zbinq]              ; m0 = zbin
    1.37 +  punpcklwd                       m4, m4                   ; duplicate the low word into the low dword
    1.38 +  mova                            m1, [roundq]             ; m1 = round
    1.39 +  pshufd                          m4, m4, 0                ; m4 = zbin_oq in all 8 words
    1.40 +  mova                            m2, [quantq]             ; m2 = quant
    1.41 +  paddw                           m0, m4                   ; m0 = zbin + zbin_oq
    1.42 +%ifidn %1, b_32x32
    1.43 +  pcmpeqw                         m5, m5                   ; m5 = all ones
    1.44 +  psrlw                           m5, 15                   ; m5 = 1 in every word
    1.45 +  paddw                           m0, m5
    1.46 +  paddw                           m1, m5
    1.47 +  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
    1.48 +  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
    1.49 +%endif
    1.50 +  mova                            m3, [r2q]                ; m3 = dequant
    1.51 +  psubw                           m0, [pw_1]               ; m0 = zbin - 1, so pcmpgtw (>) below tests >= zbin
    1.52 +  mov                             r2, shiftmp
    1.53 +  mov                             r3, qcoeffmp
    1.54 +  mova                            m4, [r2]                 ; m4 = shift
    1.55 +  mov                             r4, dqcoeffmp
    1.56 +  mov                             r5, iscanmp
    1.57 +%ifidn %1, b_32x32
    1.58 +  psllw                           m4, 1                    ; m4 = shift << 1
    1.59 +%endif
    1.60 +  pxor                            m5, m5                   ; m5 = dedicated zero
    1.61 +  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
    1.62 +  lea                         coeffq, [  coeffq+ncoeffq*2] ; point every array pointer one past its end ...
    1.63 +  lea                         iscanq, [  iscanq+ncoeffq*2]
    1.64 +  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
    1.65 +  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
    1.66 +  neg                        ncoeffq                       ; ... and index with -n, counting up towards 0
    1.67 +
    1.68 +  ; get DC and first 15 AC coeffs
    1.69 +  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
    1.70 +  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i+8]
    1.71 +  pabsw                           m6, m9                   ; m6 = abs(m9)
    1.72 +  pabsw                          m11, m10                  ; m11 = abs(m10)
    1.73 +  pcmpgtw                         m7, m6, m0               ; m7 = mask: abs(c[i]) >= zbin
    1.74 +  punpckhqdq                      m0, m0                   ; copy the upper half of zbin over the lower half
    1.75 +  pcmpgtw                        m12, m11, m0              ; m12 = mask: abs(c[i+8]) >= zbin
    1.76 +  paddsw                          m6, m1                   ; m6 += round (saturating)
    1.77 +  punpckhqdq                      m1, m1                   ; copy the upper half of round over the lower half
    1.78 +  paddsw                         m11, m1                   ; m11 += round (saturating)
    1.79 +  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
    1.80 +  punpckhqdq                      m2, m2                   ; copy the upper half of quant over the lower half
    1.81 +  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
    1.82 +  paddw                           m8, m6                   ; m8 += m6
    1.83 +  paddw                          m13, m11                  ; m13 += m11
    1.84 +  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
    1.85 +  punpckhqdq                      m4, m4                   ; copy the upper half of shift over the lower half
    1.86 +  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
    1.87 +  psignw                          m8, m9                   ; m8 = reinsert sign
    1.88 +  psignw                         m13, m10                  ; m13 = reinsert sign
    1.89 +  pand                            m8, m7                   ; zero the lanes that are below zbin
    1.90 +  pand                           m13, m12
    1.91 +  mova        [qcoeffq+ncoeffq*2+ 0], m8
    1.92 +  mova        [qcoeffq+ncoeffq*2+16], m13
    1.93 +%ifidn %1, b_32x32
    1.94 +  pabsw                           m8, m8                   ; 32x32: dequantize on magnitudes so the >>1 is unsigned
    1.95 +  pabsw                          m13, m13
    1.96 +%endif
    1.97 +  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
    1.98 +  punpckhqdq                      m3, m3                   ; copy the upper half of dequant over the lower half
    1.99 +  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
   1.100 +%ifidn %1, b_32x32
   1.101 +  psrlw                           m8, 1                    ; 32x32: dqc = (abs(qc) * q) >> 1 ...
   1.102 +  psrlw                          m13, 1
   1.103 +  psignw                          m8, m9                   ; ... with the sign of c[i] put back
   1.104 +  psignw                         m13, m10
   1.105 +%endif
   1.106 +  mova       [dqcoeffq+ncoeffq*2+ 0], m8
   1.107 +  mova       [dqcoeffq+ncoeffq*2+16], m13
   1.108 +  pcmpeqw                         m8, m5                   ; m8 = mask: dqc[i] == 0
   1.109 +  pcmpeqw                        m13, m5                   ; m13 = mask: dqc[i+8] == 0
   1.110 +  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
   1.111 +  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i+8]
   1.112 +  psubw                           m6, m7                   ; m6 = iscan[i] + 1 where the zbin mask is set
   1.113 +  psubw                          m11, m12                  ; m11 = iscan[i+8] + 1 where the zbin mask is set
   1.114 +  pandn                           m8, m6                   ; m8 = eob candidate, 0 where dqc == 0
   1.115 +  pandn                          m13, m11                  ; m13 = eob candidate, 0 where dqc == 0
   1.116 +  pmaxsw                          m8, m13                  ; m8 = running per-lane max(eob)
   1.117 +  add                        ncoeffq, mmsize               ; 16 coefficients consumed per pass
   1.118 +  jz .accumulate_eob
   1.119 +
   1.120 +.ac_only_loop:
   1.121 +  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
   1.122 +  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i+8]
   1.123 +  pabsw                           m6, m9                   ; m6 = abs(m9)
   1.124 +  pabsw                          m11, m10                  ; m11 = abs(m10)
   1.125 +  pcmpgtw                         m7, m6, m0               ; m7 = mask: abs(c[i]) >= zbin
   1.126 +  pcmpgtw                        m12, m11, m0              ; m12 = mask: abs(c[i+8]) >= zbin
   1.127 +%ifidn %1, b_32x32
   1.128 +  pmovmskb                        r6, m7                   ; 32x32: if no lane of either mask is set,
   1.129 +  pmovmskb                        r2, m12                  ; all 16 outputs are zero and the math is skipped
   1.130 +  or                              r6, r2
   1.131 +  jz .skip_iter
   1.132 +%endif
   1.133 +  paddsw                          m6, m1                   ; m6 += round (saturating)
   1.134 +  paddsw                         m11, m1                   ; m11 += round (saturating)
   1.135 +  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
   1.136 +  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
   1.137 +  paddw                          m14, m6                   ; m14 += m6
   1.138 +  paddw                          m13, m11                  ; m13 += m11
   1.139 +  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
   1.140 +  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
   1.141 +  psignw                         m14, m9                   ; m14 = reinsert sign
   1.142 +  psignw                         m13, m10                  ; m13 = reinsert sign
   1.143 +  pand                           m14, m7                   ; zero the lanes that are below zbin
   1.144 +  pand                           m13, m12
   1.145 +  mova        [qcoeffq+ncoeffq*2+ 0], m14
   1.146 +  mova        [qcoeffq+ncoeffq*2+16], m13
   1.147 +%ifidn %1, b_32x32
   1.148 +  pabsw                          m14, m14                  ; 32x32: dequantize on magnitudes so the >>1 is unsigned
   1.149 +  pabsw                          m13, m13
   1.150 +%endif
   1.151 +  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
   1.152 +  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
   1.153 +%ifidn %1, b_32x32
   1.154 +  psrlw                          m14, 1                    ; 32x32: dqc = (abs(qc) * q) >> 1 ...
   1.155 +  psrlw                          m13, 1
   1.156 +  psignw                         m14, m9                   ; ... with the sign of c[i] put back
   1.157 +  psignw                         m13, m10
   1.158 +%endif
   1.159 +  mova       [dqcoeffq+ncoeffq*2+ 0], m14
   1.160 +  mova       [dqcoeffq+ncoeffq*2+16], m13
   1.161 +  pcmpeqw                        m14, m5                   ; m14 = mask: dqc[i] == 0
   1.162 +  pcmpeqw                        m13, m5                   ; m13 = mask: dqc[i+8] == 0
   1.163 +  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
   1.164 +  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i+8]
   1.165 +  psubw                           m6, m7                   ; m6 = iscan[i] + 1 where the zbin mask is set
   1.166 +  psubw                          m11, m12                  ; m11 = iscan[i+8] + 1 where the zbin mask is set
   1.167 +  pandn                          m14, m6                   ; m14 = eob candidate, 0 where dqc == 0
   1.168 +  pandn                          m13, m11                  ; m13 = eob candidate, 0 where dqc == 0
   1.169 +  pmaxsw                          m8, m14                  ; fold both candidates into the running max in m8
   1.170 +  pmaxsw                          m8, m13
   1.171 +  add                        ncoeffq, mmsize               ; 16 coefficients consumed per pass
   1.172 +  jl .ac_only_loop
   1.173 +
   1.174 +%ifidn %1, b_32x32
   1.175 +  jmp .accumulate_eob
   1.176 +.skip_iter:
   1.177 +  mova        [qcoeffq+ncoeffq*2+ 0], m5                   ; nothing reached zbin: store 16 zero qcoeffs/dqcoeffs
   1.178 +  mova        [qcoeffq+ncoeffq*2+16], m5
   1.179 +  mova       [dqcoeffq+ncoeffq*2+ 0], m5
   1.180 +  mova       [dqcoeffq+ncoeffq*2+16], m5
   1.181 +  add                        ncoeffq, mmsize               ; m8 (running eob max) is left untouched
   1.182 +  jl .ac_only_loop
   1.183 +%endif
   1.184 +
   1.185 +.accumulate_eob:
   1.186 +  ; horizontally accumulate/max eobs and write into [eob] memory pointer
   1.187 +  mov                             r2, eobmp
   1.188 +  pshufd                          m7, m8, 0xe              ; bring the high qword down
   1.189 +  pmaxsw                          m8, m7                   ; words 0-3 = max(low half, high half)
   1.190 +  pshuflw                         m7, m8, 0xe              ; bring words 2-3 down
   1.191 +  pmaxsw                          m8, m7                   ; words 0-1 = max so far
   1.192 +  pshuflw                         m7, m8, 0x1              ; bring word 1 down
   1.193 +  pmaxsw                          m8, m7                   ; word 0 = max over all 8 lanes
   1.194 +  pextrw                         r3d, m8, 0                ; register form: pextrw to memory is an SSE4.1 encoding
   1.195 +  mov                           [r2], r3w                  ; *eob = word 0 (r3 = qcoeff pointer, no longer needed)
   1.196 +  RET
   1.197 +
   1.198 +.blank:                                                     ; skip-block, i.e. just write all zeroes
   1.199 +  mov                             r0, dqcoeffmp
   1.200 +  movifnidn                  ncoeffq, ncoeffmp
   1.201 +  mov                             r2, qcoeffmp
   1.202 +  mov                             r3, eobmp
   1.203 +  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
   1.204 +  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2] ; point both outputs one past their end ...
   1.205 +  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
   1.206 +  neg                        ncoeffq                       ; ... and index with -n, counting up towards 0
   1.207 +  pxor                            m7, m7                   ; m7 = 0
   1.208 +.blank_loop:
   1.209 +  mova       [dqcoeffq+ncoeffq*2+ 0], m7                   ; clear 16 dqcoeffs and 16 qcoeffs per pass
   1.210 +  mova       [dqcoeffq+ncoeffq*2+16], m7
   1.211 +  mova        [qcoeffq+ncoeffq*2+ 0], m7
   1.212 +  mova        [qcoeffq+ncoeffq*2+16], m7
   1.213 +  add                        ncoeffq, mmsize
   1.214 +  jl .blank_loop
   1.215 +  mov                    word [eobq], 0                    ; *eob = 0
   1.216 +  RET
   1.217 +%endmacro
   1.218 +
  1.219 +INIT_XMM ssse3                                             ; must precede the instantiations: selects XMM registers / ssse3 for cglobal
  1.220 +QUANTIZE_FN b, 6                                           ; instantiate quantize_b with 6 GPRs
  1.221 +QUANTIZE_FN b_32x32, 7                                     ; instantiate quantize_b_32x32 with 7 GPRs (its loop uses r6 for the mask bits)

mercurial