media/libvpx/vp8/common/arm/neon/variance_neon.asm

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Thu, 15 Jan 2015 15:59:08 +0100
branch       TOR_BUG_9701
changeset    10:ac0c01689b40
permissions  -rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
this solves Tor bug #9701, complying with the disk-avoidance requirements
documented at https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_variance16x16_neon|
    EXPORT  |vp8_variance16x8_neon|
    EXPORT  |vp8_variance8x16_neon|
    EXPORT  |vp8_variance8x8_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

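;================================
;The equivalent C signature, added here to mirror the prototype comments
;given for the 16x8 and 8x16 functions below:
;unsigned int vp8_variance16x16_c(
;   unsigned char *src_ptr,
;   int source_stride,
;   unsigned char *ref_ptr,
;   int recon_stride,
;   unsigned int *sse)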
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|vp8_variance16x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8
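    ;the loop below processes two 16-byte rows per pass, so 8 passes cover
    ;all 16 rows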

variance16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    ;VPADAL adds adjacent pairs of elements of a vector and accumulates
    ;the results into the elements of the destination vector. The explanation
    ;in the ARM guide is wrong.
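    ;For example, vpadal.s16 q8, q11 performs, for i = 0..3:
    ;    q8.s32[i] += q11.s16[2*i] + q11.s16[2*i+1]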
    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
    ;vmov.32        r1, d1[0]
    ;mul            r0, r0, r0
    ;str            r1, [r12]
    ;sub            r0, r1, r0, lsr #8

    ;while sum is signed, sum * sum is always positive and must be treated as
    ;unsigned to avoid propagating the sign bit.
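    ;variance = sse - sum * sum / (16 * 16); 16 * 16 = 256 = 2^8, hence the
    ;right shift by 8 below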
    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.u32        d10, d10, #8
    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP
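
;For reference, a rough C equivalent of the arithmetic performed by
;vp8_variance16x16_neon above. This is only an illustrative sketch, not part
;of the build, and the local names used here are made up:
;
;   unsigned int variance16x16(unsigned char *src_ptr, int source_stride,
;                              unsigned char *ref_ptr, int recon_stride,
;                              unsigned int *sse)
;   {
;       int sum = 0;              /* signed sum of pixel differences */
;       unsigned int sse_acc = 0; /* sum of squared differences */
;       int i, j;
;
;       for (i = 0; i < 16; i++) {
;           for (j = 0; j < 16; j++) {
;               int diff = src_ptr[j] - ref_ptr[j];
;               sum += diff;
;               sse_acc += (unsigned int)(diff * diff);
;           }
;           src_ptr += source_stride;
;           ref_ptr += recon_stride;
;       }
;
;       *sse = sse_acc;
;       return sse_acc - ((unsigned int)(sum * sum) >> 8);
;   }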

;================================
;unsigned int vp8_variance16x8_c(
;   unsigned char *src_ptr,
;   int source_stride,
;   unsigned char *ref_ptr,
;   int recon_stride,
;   unsigned int *sse)
|vp8_variance16x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #4
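    ;two 16-byte rows per pass, so 4 passes cover all 8 rows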

variance16x8_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

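    ;variance = sse - sum * sum / (16 * 8); 16 * 8 = 128 = 2^7, hence the
    ;right shift by 7 below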
    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.u32        d10, d10, #7
    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

;=================================
;unsigned int vp8_variance8x16_c(
;   unsigned char *src_ptr,
;   int source_stride,
;   unsigned char *ref_ptr,
;   int recon_stride,
;   unsigned int *sse)

|vp8_variance8x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8
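    ;two 8-byte rows per pass (one d register each), so 8 passes cover all
    ;16 rows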

variance8x16_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d2, d6

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    bne             variance8x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

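    ;variance = sse - sum * sum / (8 * 16); 8 * 16 = 128 = 2^7, hence the
    ;right shift by 7 below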
    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.u32        d10, d10, #7
    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

;==================================
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|vp8_variance8x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #2
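    ;four 8-byte rows per pass, so 2 passes cover all 8 rows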

variance8x8_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance8x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

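    ;variance = sse - sum * sum / (8 * 8); 8 * 8 = 64 = 2^6, hence the
    ;right shift by 6 below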
    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.u32        d10, d10, #6
    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

    END
