media/libvpx/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm

Thu, 15 Jan 2015 15:59:08 +0100

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 15 Jan 2015 15:59:08 +0100
branch:      TOR_BUG_9701
changeset:   10:ac0c01689b40
permissions: -rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 ;-----------------
michael@0 13
; ARMASM (RVCT) syntax.  Export the sub-pixel variance entry point and
; assert the AAPCS 8-byte stack alignment contract (REQUIRE8/PRESERVE8).
michael@0 14 EXPORT |vp8_sub_pixel_variance16x16_neon_func|
michael@0 15 ARM
michael@0 16 REQUIRE8
michael@0 17 PRESERVE8
michael@0 18
michael@0 19 AREA ||.text||, CODE, READONLY, ALIGN=2
michael@0 20 ; r0 unsigned char *src_ptr,
michael@0 21 ; r1 int src_pixels_per_line,
michael@0 22 ; r2 int xoffset,
michael@0 23 ; r3 int yoffset,
michael@0 24 ; stack(r4) unsigned char *dst_ptr,
michael@0 25 ; stack(r5) int dst_pixels_per_line,
michael@0 26 ; stack(r6) unsigned int *sse
michael@0 27 ;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.
michael@0 28
; Bilinear filter tap pairs {Filter[0], Filter[1]} = {128 - 16*k, 16*k}
; for sub-pixel offsets k = 0..7.  Each pair sums to 128 (Q7 fixed point),
; so filtered sums are normalized with a 7-bit rounding narrow (vqrshrn #7).
; Indexed as word pairs: entry address = base + (offset << 3).
michael@0 29 bilinear_taps_coeff
michael@0 30 DCD 128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
michael@0 31
;-----------------------------------------------------------------------
; unsigned int vp8_sub_pixel_variance16x16_neon_func(
;         const unsigned char *src_ptr,    [r0]
;         int src_pixels_per_line,         [r1]
;         int xoffset,                     [r2]  1/8-pel, 0..7
;         int yoffset,                     [r3]  1/8-pel, 0..7
;         const unsigned char *dst_ptr,    [stack -> r4]
;         int dst_pixels_per_line,         [stack -> r5]
;         unsigned int *sse)               [stack -> r6]
;
; Bilinearly interpolates a 16x16 block from src at (xoffset, yoffset),
; then computes SSE and sum of differences against dst.  Three paths,
; selected on the offsets:
;   - both non-zero: horizontal pass into a 272-byte (17 rows x 16 cols)
;     stack buffer, then vertical pass into a 256-byte stack buffer;
;   - xoffset only / yoffset only: single pass into a stack buffer.
; All paths join at sub_pixel_variance16x16_neon, which writes SSE to
; *sse and returns variance = sse - sum*sum/256 in r0.
; Clobbers r2, r3, r12, lr and NEON q0-q15 (no callee-saved d8-d15 are
; preserved here; callers are project-internal C code).
;-----------------------------------------------------------------------
michael@0 32 |vp8_sub_pixel_variance16x16_neon_func| PROC
michael@0 33 push {r4-r6, lr}
michael@0 34
michael@0 35 adr r12, bilinear_taps_coeff
michael@0 36 ldr r4, [sp, #16] ;load *dst_ptr from stack
michael@0 37 ldr r5, [sp, #20] ;load dst_pixels_per_line from stack
michael@0 38 ldr r6, [sp, #24] ;load *sse from stack
michael@0 39
michael@0 40 cmp r2, #0 ;skip first_pass filter if xoffset=0
michael@0 41 beq secondpass_bfilter16x16_only
michael@0 42
michael@0 43 add r2, r12, r2, lsl #3 ;calculate filter location
michael@0 44
michael@0 45 cmp r3, #0 ;skip second_pass filter if yoffset=0
michael@0 46
michael@0 47 vld1.s32 {d31}, [r2] ;load first_pass filter
michael@0 48
michael@0 49 beq firstpass_bfilter16x16_only
michael@0 50
michael@0 51 sub sp, sp, #272 ;reserve space on stack for temporary storage
michael@0 52 vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
michael@0 53 mov lr, sp
michael@0 54 vld1.u8 {d5, d6, d7}, [r0], r1
michael@0 55
michael@0 56 mov r2, #3 ;loop counter
michael@0 57 vld1.u8 {d8, d9, d10}, [r0], r1
michael@0 58
michael@0 59 vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
michael@0 60 vld1.u8 {d11, d12, d13}, [r0], r1
michael@0 61
michael@0 62 vdup.8 d1, d31[4]
michael@0 63
michael@0 64 ;First Pass: output_height lines x output_width columns (17x16)
; 17 rows are produced because the vertical pass needs one row below the
; block.  Each iteration filters 4 rows (loads for the next iteration are
; interleaved with the stores); 3 iterations + the 5-row tail = 17 rows.
michael@0 65 vp8e_filt_blk2d_fp16x16_loop_neon
michael@0 66 pld [r0]
michael@0 67 pld [r0, r1]
michael@0 68 pld [r0, r1, lsl #1]
michael@0 69
michael@0 70 vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
michael@0 71 vmull.u8 q8, d3, d0
michael@0 72 vmull.u8 q9, d5, d0
michael@0 73 vmull.u8 q10, d6, d0
michael@0 74 vmull.u8 q11, d8, d0
michael@0 75 vmull.u8 q12, d9, d0
michael@0 76 vmull.u8 q13, d11, d0
michael@0 77 vmull.u8 q14, d12, d0
michael@0 78
michael@0 79 vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
michael@0 80 vext.8 d5, d5, d6, #1
michael@0 81 vext.8 d8, d8, d9, #1
michael@0 82 vext.8 d11, d11, d12, #1
michael@0 83
michael@0 84 vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
michael@0 85 vmlal.u8 q9, d5, d1
michael@0 86 vmlal.u8 q11, d8, d1
michael@0 87 vmlal.u8 q13, d11, d1
michael@0 88
michael@0 89 vext.8 d3, d3, d4, #1
michael@0 90 vext.8 d6, d6, d7, #1
michael@0 91 vext.8 d9, d9, d10, #1
michael@0 92 vext.8 d12, d12, d13, #1
michael@0 93
michael@0 94 vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
michael@0 95 vmlal.u8 q10, d6, d1
michael@0 96 vmlal.u8 q12, d9, d1
michael@0 97 vmlal.u8 q14, d12, d1
michael@0 98
michael@0 99 subs r2, r2, #1
michael@0 100
michael@0 101 vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
michael@0 102 vqrshrn.u16 d15, q8, #7
michael@0 103 vqrshrn.u16 d16, q9, #7
michael@0 104 vqrshrn.u16 d17, q10, #7
michael@0 105 vqrshrn.u16 d18, q11, #7
michael@0 106 vqrshrn.u16 d19, q12, #7
michael@0 107 vqrshrn.u16 d20, q13, #7
michael@0 108
michael@0 109 vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
michael@0 110 vqrshrn.u16 d21, q14, #7
michael@0 111 vld1.u8 {d5, d6, d7}, [r0], r1
michael@0 112
michael@0 113 vst1.u8 {d14, d15, d16, d17}, [lr]! ;store result
michael@0 114 vld1.u8 {d8, d9, d10}, [r0], r1
michael@0 115 vst1.u8 {d18, d19, d20, d21}, [lr]!
michael@0 116 vld1.u8 {d11, d12, d13}, [r0], r1
michael@0 117
michael@0 118 bne vp8e_filt_blk2d_fp16x16_loop_neon
michael@0 119
michael@0 120 ;First-pass filtering for rest 5 lines
michael@0 121 vld1.u8 {d14, d15, d16}, [r0], r1
michael@0 122
michael@0 123 vmull.u8 q9, d2, d0 ;(src_ptr[0] * Filter[0])
michael@0 124 vmull.u8 q10, d3, d0
michael@0 125 vmull.u8 q11, d5, d0
michael@0 126 vmull.u8 q12, d6, d0
michael@0 127 vmull.u8 q13, d8, d0
michael@0 128 vmull.u8 q14, d9, d0
michael@0 129
michael@0 130 vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
michael@0 131 vext.8 d5, d5, d6, #1
michael@0 132 vext.8 d8, d8, d9, #1
michael@0 133
michael@0 134 vmlal.u8 q9, d2, d1 ;(src_ptr[0] * Filter[1])
michael@0 135 vmlal.u8 q11, d5, d1
michael@0 136 vmlal.u8 q13, d8, d1
michael@0 137
michael@0 138 vext.8 d3, d3, d4, #1
michael@0 139 vext.8 d6, d6, d7, #1
michael@0 140 vext.8 d9, d9, d10, #1
michael@0 141
michael@0 142 vmlal.u8 q10, d3, d1 ;(src_ptr[0] * Filter[1])
michael@0 143 vmlal.u8 q12, d6, d1
michael@0 144 vmlal.u8 q14, d9, d1
michael@0 145
michael@0 146 vmull.u8 q1, d11, d0
michael@0 147 vmull.u8 q2, d12, d0
michael@0 148 vmull.u8 q3, d14, d0
michael@0 149 vmull.u8 q4, d15, d0
michael@0 150
michael@0 151 vext.8 d11, d11, d12, #1 ;construct src_ptr[1]
michael@0 152 vext.8 d14, d14, d15, #1
michael@0 153
michael@0 154 vmlal.u8 q1, d11, d1 ;(src_ptr[0] * Filter[1])
michael@0 155 vmlal.u8 q3, d14, d1
michael@0 156
michael@0 157 vext.8 d12, d12, d13, #1
michael@0 158 vext.8 d15, d15, d16, #1
michael@0 159
michael@0 160 vmlal.u8 q2, d12, d1 ;(src_ptr[0] * Filter[1])
michael@0 161 vmlal.u8 q4, d15, d1
michael@0 162
michael@0 163 vqrshrn.u16 d10, q9, #7 ;shift/round/saturate to u8
michael@0 164 vqrshrn.u16 d11, q10, #7
michael@0 165 vqrshrn.u16 d12, q11, #7
michael@0 166 vqrshrn.u16 d13, q12, #7
michael@0 167 vqrshrn.u16 d14, q13, #7
michael@0 168 vqrshrn.u16 d15, q14, #7
michael@0 169 vqrshrn.u16 d16, q1, #7
michael@0 170 vqrshrn.u16 d17, q2, #7
michael@0 171 vqrshrn.u16 d18, q3, #7
michael@0 172 vqrshrn.u16 d19, q4, #7
michael@0 173
michael@0 174 vst1.u8 {d10, d11, d12, d13}, [lr]! ;store result
michael@0 175 vst1.u8 {d14, d15, d16, d17}, [lr]!
michael@0 176 vst1.u8 {d18, d19}, [lr]!
michael@0 177
michael@0 178 ;Second pass: 16x16
michael@0 179 ;secondpass_filter
; Vertical bilinear filter: rewinds lr to the start of the 17x16
; intermediate and writes the final 16x16 prediction into a second
; 256-byte stack buffer at [r3] (stack now 272 + 256 = 528 bytes deep,
; matching the #528 released at the end).
michael@0 180 add r3, r12, r3, lsl #3
michael@0 181 sub lr, lr, #272
michael@0 182
michael@0 183 vld1.u32 {d31}, [r3] ;load second_pass filter
michael@0 184
michael@0 185 sub sp, sp, #256
michael@0 186 mov r3, sp
michael@0 187
michael@0 188 vld1.u8 {d22, d23}, [lr]! ;load src data
michael@0 189
michael@0 190 vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
michael@0 191 vdup.8 d1, d31[4]
michael@0 192 mov r12, #4 ;loop counter
michael@0 193
michael@0 194 vp8e_filt_blk2d_sp16x16_loop_neon
michael@0 195 vld1.u8 {d24, d25}, [lr]!
michael@0 196 vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
michael@0 197 vld1.u8 {d26, d27}, [lr]!
michael@0 198 vmull.u8 q2, d23, d0
michael@0 199 vld1.u8 {d28, d29}, [lr]!
michael@0 200 vmull.u8 q3, d24, d0
michael@0 201 vld1.u8 {d30, d31}, [lr]!
michael@0 202
michael@0 203 vmull.u8 q4, d25, d0
michael@0 204 vmull.u8 q5, d26, d0
michael@0 205 vmull.u8 q6, d27, d0
michael@0 206 vmull.u8 q7, d28, d0
michael@0 207 vmull.u8 q8, d29, d0
michael@0 208
michael@0 209 vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
michael@0 210 vmlal.u8 q2, d25, d1
michael@0 211 vmlal.u8 q3, d26, d1
michael@0 212 vmlal.u8 q4, d27, d1
michael@0 213 vmlal.u8 q5, d28, d1
michael@0 214 vmlal.u8 q6, d29, d1
michael@0 215 vmlal.u8 q7, d30, d1
michael@0 216 vmlal.u8 q8, d31, d1
michael@0 217
michael@0 218 subs r12, r12, #1
michael@0 219
michael@0 220 vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
michael@0 221 vqrshrn.u16 d3, q2, #7
michael@0 222 vqrshrn.u16 d4, q3, #7
michael@0 223 vqrshrn.u16 d5, q4, #7
michael@0 224 vqrshrn.u16 d6, q5, #7
michael@0 225 vqrshrn.u16 d7, q6, #7
michael@0 226 vqrshrn.u16 d8, q7, #7
michael@0 227 vqrshrn.u16 d9, q8, #7
michael@0 228
michael@0 229 vst1.u8 {d2, d3}, [r3]! ;store result
michael@0 230 vst1.u8 {d4, d5}, [r3]!
michael@0 231 vst1.u8 {d6, d7}, [r3]!
; q15 holds the last row loaded this iteration; it becomes the top row
; (q11) of the next iteration's vertical filter.
michael@0 232 vmov q11, q15
michael@0 233 vst1.u8 {d8, d9}, [r3]!
michael@0 234
michael@0 235 bne vp8e_filt_blk2d_sp16x16_loop_neon
michael@0 236
michael@0 237 b sub_pixel_variance16x16_neon
michael@0 238
michael@0 239 ;--------------------
; Path: xoffset != 0, yoffset == 0 — horizontal filter only, reading
; straight from src into the stack buffer (4 rows per iteration).
michael@0 240 firstpass_bfilter16x16_only
michael@0 241 mov r2, #4 ;loop counter
michael@0 242 sub sp, sp, #528 ;reserve space on stack for temporary storage
michael@0 243 vdup.8 d0, d31[0] ;first_pass filter (d0 d1)
michael@0 244 vdup.8 d1, d31[4]
michael@0 245 mov r3, sp
michael@0 246
michael@0 247 ;First Pass: output_height lines x output_width columns (16x16)
michael@0 248 vp8e_filt_blk2d_fpo16x16_loop_neon
michael@0 249 vld1.u8 {d2, d3, d4}, [r0], r1 ;load src data
michael@0 250 vld1.u8 {d5, d6, d7}, [r0], r1
michael@0 251 vld1.u8 {d8, d9, d10}, [r0], r1
michael@0 252 vld1.u8 {d11, d12, d13}, [r0], r1
michael@0 253
michael@0 254 pld [r0]
michael@0 255 pld [r0, r1]
michael@0 256 pld [r0, r1, lsl #1]
michael@0 257
michael@0 258 vmull.u8 q7, d2, d0 ;(src_ptr[0] * Filter[0])
michael@0 259 vmull.u8 q8, d3, d0
michael@0 260 vmull.u8 q9, d5, d0
michael@0 261 vmull.u8 q10, d6, d0
michael@0 262 vmull.u8 q11, d8, d0
michael@0 263 vmull.u8 q12, d9, d0
michael@0 264 vmull.u8 q13, d11, d0
michael@0 265 vmull.u8 q14, d12, d0
michael@0 266
michael@0 267 vext.8 d2, d2, d3, #1 ;construct src_ptr[1]
michael@0 268 vext.8 d5, d5, d6, #1
michael@0 269 vext.8 d8, d8, d9, #1
michael@0 270 vext.8 d11, d11, d12, #1
michael@0 271
michael@0 272 vmlal.u8 q7, d2, d1 ;(src_ptr[0] * Filter[1])
michael@0 273 vmlal.u8 q9, d5, d1
michael@0 274 vmlal.u8 q11, d8, d1
michael@0 275 vmlal.u8 q13, d11, d1
michael@0 276
michael@0 277 vext.8 d3, d3, d4, #1
michael@0 278 vext.8 d6, d6, d7, #1
michael@0 279 vext.8 d9, d9, d10, #1
michael@0 280 vext.8 d12, d12, d13, #1
michael@0 281
michael@0 282 vmlal.u8 q8, d3, d1 ;(src_ptr[0] * Filter[1])
michael@0 283 vmlal.u8 q10, d6, d1
michael@0 284 vmlal.u8 q12, d9, d1
michael@0 285 vmlal.u8 q14, d12, d1
michael@0 286
michael@0 287 subs r2, r2, #1
michael@0 288
michael@0 289 vqrshrn.u16 d14, q7, #7 ;shift/round/saturate to u8
michael@0 290 vqrshrn.u16 d15, q8, #7
michael@0 291 vqrshrn.u16 d16, q9, #7
michael@0 292 vqrshrn.u16 d17, q10, #7
michael@0 293 vqrshrn.u16 d18, q11, #7
michael@0 294 vqrshrn.u16 d19, q12, #7
michael@0 295 vqrshrn.u16 d20, q13, #7
michael@0 296 vst1.u8 {d14, d15}, [r3]! ;store result
michael@0 297 vqrshrn.u16 d21, q14, #7
michael@0 298
michael@0 299 vst1.u8 {d16, d17}, [r3]!
michael@0 300 vst1.u8 {d18, d19}, [r3]!
michael@0 301 vst1.u8 {d20, d21}, [r3]!
michael@0 302
michael@0 303 bne vp8e_filt_blk2d_fpo16x16_loop_neon
michael@0 304
michael@0 305 b sub_pixel_variance16x16_neon
michael@0 306
michael@0 307 ;---------------------
; Path: xoffset == 0, yoffset != 0 — vertical filter only, reading
; straight from src (r0/r1) instead of the intermediate buffer.
michael@0 308 secondpass_bfilter16x16_only
michael@0 309 ;Second pass: 16x16
michael@0 310 ;secondpass_filter
michael@0 311 sub sp, sp, #528 ;reserve space on stack for temporary storage
michael@0 312 add r3, r12, r3, lsl #3
michael@0 313 mov r12, #4 ;loop counter
michael@0 314 vld1.u32 {d31}, [r3] ;load second_pass filter
michael@0 315 vld1.u8 {d22, d23}, [r0], r1 ;load src data
michael@0 316 mov r3, sp
michael@0 317
michael@0 318 vdup.8 d0, d31[0] ;second_pass filter parameters (d0 d1)
michael@0 319 vdup.8 d1, d31[4]
michael@0 320
michael@0 321 vp8e_filt_blk2d_spo16x16_loop_neon
michael@0 322 vld1.u8 {d24, d25}, [r0], r1
michael@0 323 vmull.u8 q1, d22, d0 ;(src_ptr[0] * Filter[0])
michael@0 324 vld1.u8 {d26, d27}, [r0], r1
michael@0 325 vmull.u8 q2, d23, d0
michael@0 326 vld1.u8 {d28, d29}, [r0], r1
michael@0 327 vmull.u8 q3, d24, d0
michael@0 328 vld1.u8 {d30, d31}, [r0], r1
michael@0 329
michael@0 330 vmull.u8 q4, d25, d0
michael@0 331 vmull.u8 q5, d26, d0
michael@0 332 vmull.u8 q6, d27, d0
michael@0 333 vmull.u8 q7, d28, d0
michael@0 334 vmull.u8 q8, d29, d0
michael@0 335
michael@0 336 vmlal.u8 q1, d24, d1 ;(src_ptr[pixel_step] * Filter[1])
michael@0 337 vmlal.u8 q2, d25, d1
michael@0 338 vmlal.u8 q3, d26, d1
michael@0 339 vmlal.u8 q4, d27, d1
michael@0 340 vmlal.u8 q5, d28, d1
michael@0 341 vmlal.u8 q6, d29, d1
michael@0 342 vmlal.u8 q7, d30, d1
michael@0 343 vmlal.u8 q8, d31, d1
michael@0 344
michael@0 345 vqrshrn.u16 d2, q1, #7 ;shift/round/saturate to u8
michael@0 346 vqrshrn.u16 d3, q2, #7
michael@0 347 vqrshrn.u16 d4, q3, #7
michael@0 348 vqrshrn.u16 d5, q4, #7
michael@0 349 vqrshrn.u16 d6, q5, #7
michael@0 350 vqrshrn.u16 d7, q6, #7
michael@0 351 vqrshrn.u16 d8, q7, #7
michael@0 352 vqrshrn.u16 d9, q8, #7
michael@0 353
michael@0 354 vst1.u8 {d2, d3}, [r3]! ;store result
michael@0 355 subs r12, r12, #1
michael@0 356 vst1.u8 {d4, d5}, [r3]!
; q15 holds the last row loaded this iteration; it becomes the top row
; (q11) of the next iteration's vertical filter.
michael@0 357 vmov q11, q15
michael@0 358 vst1.u8 {d6, d7}, [r3]!
michael@0 359 vst1.u8 {d8, d9}, [r3]!
michael@0 360
michael@0 361 bne vp8e_filt_blk2d_spo16x16_loop_neon
michael@0 362
michael@0 363 b sub_pixel_variance16x16_neon
michael@0 364
michael@0 365 ;----------------------------
michael@0 366 ;variance16x16
; Shared tail.  On entry r3 points just past the 256-byte filtered block
; (rewound by #256 below); r4/r5 are dst and its stride.  Accumulates
; sum in q8 and SSE in q9/q10 over 2 rows per iteration, 8 iterations.
michael@0 367 sub_pixel_variance16x16_neon
michael@0 368 vmov.i8 q8, #0 ;q8 - sum
michael@0 369 vmov.i8 q9, #0 ;q9, q10 - sse
michael@0 370 vmov.i8 q10, #0
michael@0 371
michael@0 372 sub r3, r3, #256
michael@0 373 mov r12, #8
michael@0 374
michael@0 375 sub_pixel_variance16x16_neon_loop
michael@0 376 vld1.8 {q0}, [r3]! ;Load up source and reference
michael@0 377 vld1.8 {q2}, [r4], r5
michael@0 378 vld1.8 {q1}, [r3]!
michael@0 379 vld1.8 {q3}, [r4], r5
michael@0 380
; u8 widening subtract: the 16-bit result, read as signed, is the true
; difference for operands in 0..255.
michael@0 381 vsubl.u8 q11, d0, d4 ;diff
michael@0 382 vsubl.u8 q12, d1, d5
michael@0 383 vsubl.u8 q13, d2, d6
michael@0 384 vsubl.u8 q14, d3, d7
michael@0 385
michael@0 386 vpadal.s16 q8, q11 ;sum
michael@0 387 vmlal.s16 q9, d22, d22 ;sse
michael@0 388 vmlal.s16 q10, d23, d23
michael@0 389
michael@0 390 subs r12, r12, #1
michael@0 391
michael@0 392 vpadal.s16 q8, q12
michael@0 393 vmlal.s16 q9, d24, d24
michael@0 394 vmlal.s16 q10, d25, d25
michael@0 395 vpadal.s16 q8, q13
michael@0 396 vmlal.s16 q9, d26, d26
michael@0 397 vmlal.s16 q10, d27, d27
michael@0 398 vpadal.s16 q8, q14
michael@0 399 vmlal.s16 q9, d28, d28
michael@0 400 vmlal.s16 q10, d29, d29
michael@0 401
michael@0 402 bne sub_pixel_variance16x16_neon_loop
michael@0 403
michael@0 404 vadd.u32 q10, q9, q10 ;accumulate sse
michael@0 405 vpaddl.s32 q0, q8 ;accumulate sum
michael@0 406
michael@0 407 vpaddl.u32 q1, q10
michael@0 408 vadd.s64 d0, d0, d1
michael@0 409 vadd.u64 d1, d2, d3
michael@0 410
; variance = sse - (sum*sum)/256; the /256 (16x16 pixels) is the
; vshr #8 below.  d1[0] = sse, d0[0] = sum at this point.
michael@0 411 vmull.s32 q5, d0, d0
michael@0 412 vst1.32 {d1[0]}, [r6] ;store sse
michael@0 413 vshr.u32 d10, d10, #8
michael@0 414 vsub.u32 d0, d1, d10
michael@0 415
; release the 528 bytes of scratch (272+256 on the two-pass path,
; a single 528 on each single-pass path).
michael@0 416 add sp, sp, #528
michael@0 417 vmov.32 r0, d0[0] ;return
michael@0 418
michael@0 419 pop {r4-r6,pc}
michael@0 420
michael@0 421 ENDP
michael@0 422
michael@0 423 END

mercurial