media/libvpx/vp9/encoder/x86/vp9_sad4d_sse2.asm

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
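; Computes SADs of a 4x2 source block against the co-located block in four
; reference buffers. %1 == 1 selects the initializing form, which leaves the
; running sums packed in m6 (ref1/ref2) and m7 (ref3/ref4); otherwise the new
; SADs are added to those sums. %2/%4 are the source offsets of the two rows,
; %3/%5 the reference offsets, and the optional %6 advances all pointers by
; two rows when set.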
%macro PROCESS_4x2x4 5-6 0
  movd                  m0, [srcq +%2]
%if %1 == 1
  movd                  m6, [ref1q+%3]
  movd                  m4, [ref2q+%3]
  movd                  m7, [ref3q+%3]
  movd                  m5, [ref4q+%3]
  punpckldq             m0, [srcq +%4]
  punpckldq             m6, [ref1q+%5]
  punpckldq             m4, [ref2q+%5]
  punpckldq             m7, [ref3q+%5]
  punpckldq             m5, [ref4q+%5]
  psadbw                m6, m0
  psadbw                m4, m0
  psadbw                m7, m0
  psadbw                m5, m0
  punpckldq             m6, m4
  punpckldq             m7, m5
%else
  movd                  m1, [ref1q+%3]
  movd                  m2, [ref2q+%3]
  movd                  m3, [ref3q+%3]
  movd                  m4, [ref4q+%3]
  punpckldq             m0, [srcq +%4]
  punpckldq             m1, [ref1q+%5]
  punpckldq             m2, [ref2q+%5]
  punpckldq             m3, [ref3q+%5]
  punpckldq             m4, [ref4q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  psadbw                m4, m0
  punpckldq             m1, m2
  punpckldq             m3, m4
  paddd                 m6, m1
  paddd                 m7, m3
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
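; Same contract as PROCESS_4x2x4, but for 8-pixel-wide rows: movh/movhps pack
; the two 8-byte rows into one 16-byte register, so a single psadbw per
; reference covers both rows. The accumulators are m4..m7, one per reference.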
%macro PROCESS_8x2x4 5-6 0
  movh                  m0, [srcq +%2]
%if %1 == 1
  movh                  m4, [ref1q+%3]
  movh                  m5, [ref2q+%3]
  movh                  m6, [ref3q+%3]
  movh                  m7, [ref4q+%3]
  movhps                m0, [srcq +%4]
  movhps                m4, [ref1q+%5]
  movhps                m5, [ref2q+%5]
  movhps                m6, [ref3q+%5]
  movhps                m7, [ref4q+%5]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movh                  m1, [ref1q+%3]
  movh                  m2, [ref2q+%3]
  movh                  m3, [ref3q+%3]
  movhps                m0, [srcq +%4]
  movhps                m1, [ref1q+%5]
  movhps                m2, [ref2q+%5]
  movhps                m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movh                  m1, [ref4q+%3]
  movhps                m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
%endmacro

; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
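; 16-pixel-wide variant: each row fills a full XMM register, so the two rows
; are handled back to back. Source loads are aligned (mova); reference loads
; use movu because the candidate blocks need not be 16-byte aligned.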
%macro PROCESS_16x2x4 5-6 0
  ; 1st 16 px
  mova                  m0, [srcq +%2]
%if %1 == 1
  movu                  m4, [ref1q+%3]
  movu                  m5, [ref2q+%3]
  movu                  m6, [ref3q+%3]
  movu                  m7, [ref4q+%3]
  psadbw                m4, m0
  psadbw                m5, m0
  psadbw                m6, m0
  psadbw                m7, m0
%else
  movu                  m1, [ref1q+%3]
  movu                  m2, [ref2q+%3]
  movu                  m3, [ref3q+%3]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%3]
  paddd                 m5, m2
  paddd                 m6, m3
  psadbw                m1, m0
  paddd                 m7, m1
%endif

  ; 2nd 16 px
  mova                  m0, [srcq +%4]
  movu                  m1, [ref1q+%5]
  movu                  m2, [ref2q+%5]
  movu                  m3, [ref3q+%5]
  psadbw                m1, m0
  psadbw                m2, m0
  psadbw                m3, m0
  paddd                 m4, m1
  movu                  m1, [ref4q+%5]
  paddd                 m5, m2
  paddd                 m6, m3
%if %6 == 1
  lea                 srcq, [srcq +src_strideq*2]
  lea                ref1q, [ref1q+ref_strideq*2]
  lea                ref2q, [ref2q+ref_strideq*2]
  lea                ref3q, [ref3q+ref_strideq*2]
  lea                ref4q, [ref4q+ref_strideq*2]
%endif
  psadbw                m1, m0
  paddd                 m7, m1
%endmacro

; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
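; The 32- and 64-pixel-wide variants below simply split each row into
; 16-pixel halves and reuse PROCESS_16x2x4 on adjacent offsets; only the
; second half is allowed to advance the pointers.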
%macro PROCESS_32x2x4 5-6 0
  PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16
  PROCESS_16x2x4  0, %4, %5, %4 + 16, %5 + 16, %6
%endmacro

; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro PROCESS_64x2x4 5-6 0
  PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32
  PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
%endmacro

; void vp9_sadNxNx4d_sse2(uint8_t *src, int src_stride,
;                         uint8_t *ref[4], int ref_stride,
;                         unsigned int res[4]);
; where NxN = 64x64, 64x32, 32x64, 32x32, 32x16, 16x32, 16x16, 16x8,
;             8x16, 8x8 or 8x4
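; For reference, a minimal C sketch of what each instantiation computes: the
; SAD of one source block against the same-sized block in each of the four
; reference buffers. This is illustrative only; the function name and layout
; here are assumptions, not part of the libvpx API.
;
;   #include <stdint.h>
;   static void sad_wxhx4d_ref(const uint8_t *src, int src_stride,
;                              uint8_t *const ref[4], int ref_stride,
;                              unsigned int res[4], int w, int h) {
;     for (int r = 0; r < 4; ++r) {
;       unsigned int sad = 0;
;       for (int y = 0; y < h; ++y) {
;         for (int x = 0; x < w; ++x) {
;           int d = src[y * src_stride + x] - ref[r][y * ref_stride + x];
;           sad += d < 0 ? -d : d;
;         }
;       }
;       res[r] = sad;
;     }
;   }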
%macro SADNXN4D 2
%if UNIX64
cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                              res, ref2, ref3, ref4
%else
cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                              ref2, ref3, ref4
%endif
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  mov                ref2q, [ref1q+gprsize*1]
  mov                ref3q, [ref1q+gprsize*2]
  mov                ref4q, [ref1q+gprsize*3]
  mov                ref1q, [ref1q+gprsize*0]

  PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
  PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0

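  ; In the XMM build, m4..m7 each hold the partial sums for one reference,
  ; one per 64-bit register half (psadbw produces one sum per half). The
  ; shuffle sequence below folds the halves together and stores the four
  ; totals to res[0..3]. In the MMX build (mmsize == 8) m6/m7 already hold
  ; the four dword results and are stored directly.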
%if mmsize == 16
  pslldq                m5, 4
  pslldq                m7, 4
  por                   m4, m5
  por                   m6, m7
  mova                  m5, m4
  mova                  m7, m6
  punpcklqdq            m4, m6
  punpckhqdq            m5, m7
  movifnidn             r4, r4mp
  paddd                 m4, m5
  movu                [r4], m4
  RET
%else
  movifnidn             r4, r4mp
  movq              [r4+0], m6
  movq              [r4+8], m7
  RET
%endif
%endmacro

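; Instantiate the 128-bit (XMM) SSE2 versions for all block sizes 8 pixels
; wide and wider.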
INIT_XMM sse2
SADNXN4D 64, 64
SADNXN4D 64, 32
SADNXN4D 32, 64
SADNXN4D 32, 32
SADNXN4D 32, 16
SADNXN4D 16, 32
SADNXN4D 16, 16
SADNXN4D 16,  8
SADNXN4D  8, 16
SADNXN4D  8,  8
SADNXN4D  8,  4

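; The 4-pixel-wide blocks fit in 64-bit registers, so they are built on MMX
; registers and rely on psadbw, which is available on MMX registers with SSE.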
INIT_MMX sse
SADNXN4D  4,  8
SADNXN4D  4,  4