media/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Thu, 15 Jan 2015 15:59:08 +0100
branch       TOR_BUG_9701
changeset    10:ac0c01689b40
permissions  -rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
this solves Tor bug #9701, complying with the disk avoidance requirements
documented at https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text
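
; SAD_FN emits the cglobal prologue shared by all of the SAD kernels below.
; Arguments:
;   %1  block width in pixels (only used to form the function name)
;   %2  block height in pixels (likewise only used in the function name)
;   %3  register-count selector: 5 for the wide (64/32) kernels, 7 for the
;       narrower ones, which additionally set up the src_stride3/ref_stride3
;       (3 * stride) helper registers
;   %4  0 for plain SAD, nonzero for the _avg variants, which take an extra
;       second_pred pointer argument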
%macro SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro
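
; Each SADnXN macro below accumulates the per-lane sums produced by psadbw
; into m0 with paddd.  The xmm variants fold the upper and lower 64-bit
; halves of m0 with movhlps/paddd before returning the total in eax.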

; unsigned int vp9_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
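;
; As an illustrative reference only (not part of the original file), the
; plain SAD kernels generated below compute, in C terms, roughly:
;
;   unsigned int sad_wxh(const uint8_t *src, int src_stride,
;                        const uint8_t *ref, int ref_stride,
;                        int w, int h) {      /* hypothetical helper name */
;     unsigned int sad = 0;
;     for (int y = 0; y < h; y++) {
;       for (int x = 0; x < w; x++)
;         sad += abs(src[x] - ref[x]);
;       src += src_stride;
;       ref += ref_stride;
;     }
;     return sad;
;   }
;
; The _avg variants first average ref with the packed second_pred block using
; pavgb (which rounds up: (a + b + 1) >> 1) and take the SAD of that average
; against src.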
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2

; unsigned int vp9_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                        uint8_t *ref, int ref_stride);
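; Two 32-byte rows are handled per loop iteration (the second row via a
; ref_strideq/src_strideq offset), so n_rows counts row pairs (%1/2).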
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
  mov              n_rowsd, %1/2
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2

; unsigned int vp9_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                       uint8_t *ref, int ref_stride);
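; Four 16-byte rows are handled per iteration using the precomputed
; src_stride3/ref_stride3 offsets (the 7-register SAD_FN layout), so
; n_rows counts groups of four rows (%1/4).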
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN  8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN  8, 1 ; sad16x8_avg_sse2

; unsigned int vp9_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
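; Two 8-byte rows are packed into each xmm register with movh/movhps, so a
; single psadbw covers two rows and each iteration covers four rows (%1/4).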
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN  8 ; sad8x8_sse2
SAD8XN  4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN  8, 1 ; sad8x8_avg_sse2
SAD8XN  4, 1 ; sad8x4_avg_sse2

; unsigned int vp9_sad4x{4,8}_sse(uint8_t *src, int src_stride,
;                                 uint8_t *ref, int ref_stride);
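; Built with INIT_MMX sse: 4-byte rows are packed in pairs into mm registers
; with punpckldq, four rows per iteration.  psadbw on an mm register yields a
; single sum, so no movhlps fold is needed before the final movd.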
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m3, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m6, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m6
  psadbw                m1, m2
  psadbw                m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movd                 eax, m0
  RET
%endmacro

INIT_MMX sse
SAD4XN  8 ; sad4x8_sse
SAD4XN  4 ; sad4x4_sse
SAD4XN  8, 1 ; sad4x8_avg_sse
SAD4XN  4, 1 ; sad4x4_avg_sse
