media/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    11 %include "third_party/x86inc/x86inc.asm"
       ; Read-only data: pw_1 is eight 16-bit words, each equal to 1.
       ; QUANTIZE_FN subtracts it from the zbin vector (psubw m0, [pw_1]).
    13 SECTION_RODATA
    14 pw_1: times 8 dw 1
       ; Everything that follows is emitted into the code section.
    16 SECTION .text
    18 %macro QUANTIZE_FN 2
       ; QUANTIZE_FN <suffix>, <n>
       ;   %1 = function name suffix: the macro emits quantize_%1. When %1 is
       ;        b_32x32 the extra %ifidn-guarded steps below are assembled too.
       ;   %2 = forwarded to cglobal as its second numeric argument.
       ;
       ; Behaviour of the generated function:
       ;   * skip != 0: qcoeff and dqcoeff are filled with zeroes, *eob = 0.
       ;   * otherwise: each 16-bit coefficient is compared (by absolute value)
       ;     against zbin + zbin_oq; survivors are rounded, scaled by quant and
       ;     shift, given their sign back and stored to qcoeff. qcoeff * dequant
       ;     is stored to dqcoeff. *eob receives, as a 16-bit value, the largest
       ;     iscan[i] + 1 over the coefficients whose dqcoeff is non-zero.
       ;   Coefficients are handled 16 per step (two 8-word vectors). The first
       ;   step uses the low/high halves of zbin/round/quant/shift/dequant for
       ;   its two vectors; later steps use the high half only.
       ;   All vector loads/stores use mova, so the buffers are accessed as
       ;   aligned 16-byte vectors.
    19 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
    20                                 shift, qcoeff, dqcoeff, dequant, zbin_oq, \
    21                                 eob, scan, iscan
    22   cmp                    dword skipm, 0
    23   jne .blank
    25   ; actual quantize loop - setup pointers, rounders, etc.
    26   movifnidn                   coeffq, coeffmp
    27   movifnidn                  ncoeffq, ncoeffmp
    28   mov                             r2, dequantmp
    29   movifnidn                    zbinq, zbinmp
    30   movifnidn                   roundq, roundmp
    31   movifnidn                   quantq, quantmp
    32   movd                            m4, dword zbin_oqm       ; m4 = zbin_oq
    33   mova                            m0, [zbinq]              ; m0 = zbin
    34   punpcklwd                       m4, m4
    35   mova                            m1, [roundq]             ; m1 = round
    36   pshufd                          m4, m4, 0
    37   mova                            m2, [quantq]             ; m2 = quant
    38   paddw                           m0, m4                   ; m0 = zbin + zbin_oq
    39 %ifidn %1, b_32x32
    40   pcmpeqw                         m5, m5
    41   psrlw                           m5, 15
    42   paddw                           m0, m5
    43   paddw                           m1, m5
    44   psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
    45   psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
    46 %endif
    47   mova                            m3, [r2q]                ; m3 = dequant
    48   psubw                           m0, [pw_1]               ; m0 -= 1, so pcmpgtw below acts as abs(c) >= zbin
    49   mov                             r2, shiftmp
    50   mov                             r3, qcoeffmp
    51   mova                            m4, [r2]                 ; m4 = shift
    52   mov                             r4, dqcoeffmp
    53   mov                             r5, iscanmp
    54 %ifidn %1, b_32x32
    55   psllw                           m4, 1                    ; m4 = shift * 2 for the 32x32 variant
    56 %endif
    57   pxor                            m5, m5                   ; m5 = dedicated zero
    58   DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
       ; Point every buffer at its end and count ncoeff up from -n towards zero,
       ; so one register serves as both index and loop counter.
    59   lea                         coeffq, [  coeffq+ncoeffq*2]
    60   lea                         iscanq, [  iscanq+ncoeffq*2]
    61   lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
    62   lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
    63   neg                        ncoeffq
    65   ; get DC and first 15 AC coeffs
    66   mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
    67   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
    68   pabsw                           m6, m9                   ; m6 = abs(m9)
    69   pabsw                          m11, m10                  ; m11 = abs(m10)
    70   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
    71   punpckhqdq                      m0, m0
    72   pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
    73   paddsw                          m6, m1                   ; m6 += round
    74   punpckhqdq                      m1, m1
    75   paddsw                         m11, m1                   ; m11 += round
    76   pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
    77   punpckhqdq                      m2, m2
    78   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
    79   paddw                           m8, m6                   ; m8 += m6
    80   paddw                          m13, m11                  ; m13 += m11
    81   pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
    82   punpckhqdq                      m4, m4
    83   pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
    84   psignw                          m8, m9                   ; m8 = reinsert sign
    85   psignw                         m13, m10                  ; m13 = reinsert sign
    86   pand                            m8, m7
    87   pand                           m13, m12
    88   mova        [qcoeffq+ncoeffq*2+ 0], m8
    89   mova        [qcoeffq+ncoeffq*2+16], m13
    90 %ifidn %1, b_32x32
    91   pabsw                           m8, m8
    92   pabsw                          m13, m13
    93 %endif
    94   pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
    95   punpckhqdq                      m3, m3
    96   pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
    97 %ifidn %1, b_32x32
    98   psrlw                           m8, 1
    99   psrlw                          m13, 1
   100   psignw                          m8, m9
   101   psignw                         m13, m10
   102 %endif
   103   mova       [dqcoeffq+ncoeffq*2+ 0], m8
   104   mova       [dqcoeffq+ncoeffq*2+16], m13
   105   pcmpeqw                         m8, m5                   ; m8 = dqc[i] == 0
   106   pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0
   107   mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
   108   mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
   109   psubw                           m6, m7                   ; m6 = scan[i] + 1
   110   psubw                          m11, m12                  ; m11 = scan[i] + 1
   111   pandn                           m8, m6                   ; m8 = max(eob)
   112   pandn                          m13, m11                  ; m13 = max(eob)
   113   pmaxsw                          m8, m13
   114   add                        ncoeffq, mmsize
   115   jz .accumulate_eob
   117 .ac_only_loop:
   118   mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
   119   mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
   120   pabsw                           m6, m9                   ; m6 = abs(m9)
   121   pabsw                          m11, m10                  ; m11 = abs(m10)
   122   pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
   123   pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
   124 %ifidn %1, b_32x32
       ; 32x32 only: if neither vector has a coefficient above the threshold,
       ; take the zero-store shortcut at .skip_iter.
   125   pmovmskb                        r6, m7
   126   pmovmskb                        r2, m12
   127   or                              r6, r2
   128   jz .skip_iter
   129 %endif
   130   paddsw                          m6, m1                   ; m6 += round
   131   paddsw                         m11, m1                   ; m11 += round
   132   pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
   133   pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
   134   paddw                          m14, m6                   ; m14 += m6
   135   paddw                          m13, m11                  ; m13 += m11
   136   pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
   137   pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
   138   psignw                         m14, m9                   ; m14 = reinsert sign
   139   psignw                         m13, m10                  ; m13 = reinsert sign
   140   pand                           m14, m7
   141   pand                           m13, m12
   142   mova        [qcoeffq+ncoeffq*2+ 0], m14
   143   mova        [qcoeffq+ncoeffq*2+16], m13
   144 %ifidn %1, b_32x32
   145   pabsw                          m14, m14
   146   pabsw                          m13, m13
   147 %endif
   148   pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
   149   pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
   150 %ifidn %1, b_32x32
   151   psrlw                          m14, 1
   152   psrlw                          m13, 1
   153   psignw                         m14, m9
   154   psignw                         m13, m10
   155 %endif
   156   mova       [dqcoeffq+ncoeffq*2+ 0], m14
   157   mova       [dqcoeffq+ncoeffq*2+16], m13
   158   pcmpeqw                        m14, m5                   ; m14 = dqc[i] == 0
   159   pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0
   160   mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
   161   mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = scan[i]
   162   psubw                           m6, m7                   ; m6 = scan[i] + 1
   163   psubw                          m11, m12                  ; m11 = scan[i] + 1
   164   pandn                          m14, m6                   ; m14 = max(eob)
   165   pandn                          m13, m11                  ; m13 = max(eob)
   166   pmaxsw                          m8, m14
   167   pmaxsw                          m8, m13
   168   add                        ncoeffq, mmsize
   169   jl .ac_only_loop
   171 %ifidn %1, b_32x32
   172   jmp .accumulate_eob
   173 .skip_iter:
       ; Nothing in this group passed the threshold: store zeroes (m5) to both
       ; outputs and leave the running eob maximum in m8 untouched.
   174   mova        [qcoeffq+ncoeffq*2+ 0], m5
   175   mova        [qcoeffq+ncoeffq*2+16], m5
   176   mova       [dqcoeffq+ncoeffq*2+ 0], m5
   177   mova       [dqcoeffq+ncoeffq*2+16], m5
   178   add                        ncoeffq, mmsize
   179   jl .ac_only_loop
   180 %endif
   182 .accumulate_eob:
   183   ; horizontally accumulate/max eobs and write into [eob] memory pointer
   184   mov                             r2, eobmp
   185   pshufd                          m7, m8, 0xe
   186   pmaxsw                          m8, m7
   187   pshuflw                         m7, m8, 0xe
   188   pmaxsw                          m8, m7
   189   pshuflw                         m7, m8, 0x1
   190   pmaxsw                          m8, m7
       ; pextrw with a memory destination is an SSE4.1 encoding and must not be
       ; used in an SSSE3 function. Extract into a register (SSE2 form) and do
       ; a 16-bit store instead. r0 held the coeff pointer, which is no longer
       ; read on any path that reaches this label.
   191   pextrw                         r0d, m8, 0
         mov                           [r2], r0w                  ; *eob = max(scan[i] + 1), 16 bits
   192   RET
   194   ; skip-block, i.e. just write all zeroes
   195 .blank:
   196   mov                             r0, dqcoeffmp
   197   movifnidn                  ncoeffq, ncoeffmp
   198   mov                             r2, qcoeffmp
   199   mov                             r3, eobmp
   200   DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
   201   lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
   202   lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
   203   neg                        ncoeffq
   204   pxor                            m7, m7
   205 .blank_loop:
   206   mova       [dqcoeffq+ncoeffq*2+ 0], m7
   207   mova       [dqcoeffq+ncoeffq*2+16], m7
   208   mova        [qcoeffq+ncoeffq*2+ 0], m7
   209   mova        [qcoeffq+ncoeffq*2+16], m7
   210   add                        ncoeffq, mmsize
   211   jl .blank_loop
   212   mov                    word [eobq], 0
   213   RET
   214 %endmacro
   216 INIT_XMM ssse3
       ; Expand QUANTIZE_FN twice. The first argument is the name suffix
       ; (quantize_b and quantize_b_32x32); the second is passed through to
       ; cglobal as its second numeric argument. The 32x32 variant additionally
       ; uses r6 in its loop (pmovmskb r6, m7), which is presumably why it asks
       ; for 7 instead of 6 - confirm against x86inc's cglobal documentation.
   217 QUANTIZE_FN b, 6
   218 QUANTIZE_FN b_32x32, 7

mercurial