media/libvpx/vp8/common/arm/neon/bilinearpredict16x16_neon.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_bilinear_predict16x16_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;-----------------------------------------------------------------------
; vp8_bilinear_predict16x16_neon
;
; Two-tap bilinear prediction of a 16x16 block: an optional horizontal
; (first) pass selected by xoffset followed by an optional vertical
; (second) pass selected by yoffset.  Every output byte is
;     sat_u8((a * filter[0] + b * filter[1] + 64) >> 7)
; (vmull/vmlal followed by vqrshrn #7).
;
; In:
;   r0      unsigned char *src_ptr
;   r1      int src_pixels_per_line
;   r2      int xoffset     (index into bifilter16_coeff, 8 bytes/entry)
;   r3      int yoffset     (index into bifilter16_coeff, 8 bytes/entry)
;   [sp]    unsigned char *dst_ptr   (stack argument, loaded into r4)
;   [sp+4]  int dst_pitch            (stack argument, loaded into r5)
;
; Paths:
;   xoffset != 0, yoffset != 0 : first pass writes 17 rows x 16 bytes
;                                (272 bytes) to a stack scratch buffer,
;                                second pass filters that into dst.
;   xoffset != 0, yoffset == 0 : first pass only, straight into dst.
;   xoffset == 0               : second pass only, straight from src.
;
; Saved/restored: r4, r5, d8-d15 (callee-saved under the AAPCS).
; Clobbered:      r0, r2, r3, r12, q0-q3, q8-q15, flags.
; Note: the first pass loads 24 bytes per source row ({dN, dN+1, dN+2}).
;-----------------------------------------------------------------------
|vp8_bilinear_predict16x16_neon| PROC
    push            {r4-r5, lr}
    vpush           {d8-d15}                ;q4-q7 are used below and are
                                            ;callee-saved (AAPCS)

    adr             r12, bifilter16_coeff
    ldr             r4, [sp, #76]           ;dst_ptr:   12 bytes (r4,r5,lr) +
                                            ;64 bytes (d8-d15) above sp
    ldr             r5, [sp, #80]           ;dst_pitch

    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
    beq             secondpass_bfilter16x16_only

    add             r2, r12, r2, lsl #3     ;calculate filter location

    cmp             r3, #0                  ;skip second_pass filter if yoffset=0

    vld1.s32        {d31}, [r2]             ;load first_pass filter
                                            ;(NEON load leaves the flags from
                                            ;the cmp above untouched)

    beq             firstpass_bfilter16x16_only

    sub             sp, sp, #272            ;reserve space on stack for temporary storage
                                            ;(17 rows x 16 bytes)
    vld1.u8         {d2, d3, d4}, [r0], r1  ;load src data
    mov             lr, sp                  ;lr = write pointer into scratch
    vld1.u8         {d5, d6, d7}, [r0], r1

    mov             r2, #3                  ;loop counter (3 x 4 rows)
    vld1.u8         {d8, d9, d10}, [r0], r1

    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
    vld1.u8         {d11, d12, d13}, [r0], r1

    vdup.8          d1, d31[4]

;First Pass: output_height lines x output_width columns (17x16)
;Each iteration filters the 4 rows already held in d2-d13, stores them
;to the scratch buffer and preloads the next 4 rows.
filt_blk2d_fp16x16_loop_neon
    pld             [r0]
    pld             [r0, r1]
    pld             [r0, r1, lsl #1]

    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp8_filter[0])
    vmull.u8        q8, d3, d0
    vmull.u8        q9, d5, d0
    vmull.u8        q10, d6, d0
    vmull.u8        q11, d8, d0
    vmull.u8        q12, d9, d0
    vmull.u8        q13, d11, d0
    vmull.u8        q14, d12, d0

    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d5, d6, #1
    vext.8          d8, d8, d9, #1
    vext.8          d11, d11, d12, #1

    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q9, d5, d1
    vmlal.u8        q11, d8, d1
    vmlal.u8        q13, d11, d1

    vext.8          d3, d3, d4, #1
    vext.8          d6, d6, d7, #1
    vext.8          d9, d9, d10, #1
    vext.8          d12, d12, d13, #1

    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q10, d6, d1
    vmlal.u8        q12, d9, d1
    vmlal.u8        q14, d12, d1

    subs            r2, r2, #1              ;flags consumed by the bne below

    vqrshrn.u16     d14, q7, #7             ;shift/round/saturate to u8
    vqrshrn.u16     d15, q8, #7
    vqrshrn.u16     d16, q9, #7
    vqrshrn.u16     d17, q10, #7
    vqrshrn.u16     d18, q11, #7
    vqrshrn.u16     d19, q12, #7
    vqrshrn.u16     d20, q13, #7

    vld1.u8         {d2, d3, d4}, [r0], r1  ;load src data
    vqrshrn.u16     d21, q14, #7
    vld1.u8         {d5, d6, d7}, [r0], r1

    vst1.u8         {d14, d15, d16, d17}, [lr]!     ;store result
    vld1.u8         {d8, d9, d10}, [r0], r1
    vst1.u8         {d18, d19, d20, d21}, [lr]!
    vld1.u8         {d11, d12, d13}, [r0], r1

    bne             filt_blk2d_fp16x16_loop_neon

;First-pass filtering for the remaining 5 lines (rows 12-16): four rows
;are already in d2-d13, the 17th row is loaded here.
    vld1.u8         {d14, d15, d16}, [r0], r1

    vmull.u8        q9, d2, d0              ;(src_ptr[0] * vp8_filter[0])
    vmull.u8        q10, d3, d0
    vmull.u8        q11, d5, d0
    vmull.u8        q12, d6, d0
    vmull.u8        q13, d8, d0
    vmull.u8        q14, d9, d0

    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d5, d6, #1
    vext.8          d8, d8, d9, #1

    vmlal.u8        q9, d2, d1              ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q11, d5, d1
    vmlal.u8        q13, d8, d1

    vext.8          d3, d3, d4, #1
    vext.8          d6, d6, d7, #1
    vext.8          d9, d9, d10, #1

    vmlal.u8        q10, d3, d1             ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q12, d6, d1
    vmlal.u8        q14, d9, d1

    vmull.u8        q1, d11, d0
    vmull.u8        q2, d12, d0
    vmull.u8        q3, d14, d0
    vmull.u8        q4, d15, d0

    vext.8          d11, d11, d12, #1       ;construct src_ptr[1]
    vext.8          d14, d14, d15, #1

    vmlal.u8        q1, d11, d1             ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q3, d14, d1

    vext.8          d12, d12, d13, #1
    vext.8          d15, d15, d16, #1

    vmlal.u8        q2, d12, d1             ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q4, d15, d1

    vqrshrn.u16     d10, q9, #7             ;shift/round/saturate to u8
    vqrshrn.u16     d11, q10, #7
    vqrshrn.u16     d12, q11, #7
    vqrshrn.u16     d13, q12, #7
    vqrshrn.u16     d14, q13, #7
    vqrshrn.u16     d15, q14, #7
    vqrshrn.u16     d16, q1, #7
    vqrshrn.u16     d17, q2, #7
    vqrshrn.u16     d18, q3, #7
    vqrshrn.u16     d19, q4, #7

    vst1.u8         {d10, d11, d12, d13}, [lr]!     ;store result
    vst1.u8         {d14, d15, d16, d17}, [lr]!
    vst1.u8         {d18, d19}, [lr]!

;Second pass: 16x16
;secondpass_filter
    add             r3, r12, r3, lsl #3
    sub             lr, lr, #272            ;rewind lr to the start of the scratch buffer

    vld1.u32        {d31}, [r3]             ;load second_pass filter

    vld1.u8         {d22, d23}, [lr]!       ;load src data

    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
    vdup.8          d1, d31[4]
    mov             r12, #4                 ;loop counter (4 x 4 rows)

;q11 always holds the row above the four rows loaded in each iteration.
filt_blk2d_sp16x16_loop_neon
    vld1.u8         {d24, d25}, [lr]!
    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp8_filter[0])
    vld1.u8         {d26, d27}, [lr]!
    vmull.u8        q2, d23, d0
    vld1.u8         {d28, d29}, [lr]!
    vmull.u8        q3, d24, d0
    vld1.u8         {d30, d31}, [lr]!

    vmull.u8        q4, d25, d0
    vmull.u8        q5, d26, d0
    vmull.u8        q6, d27, d0
    vmull.u8        q7, d28, d0
    vmull.u8        q8, d29, d0

    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * vp8_filter[1])
    vmlal.u8        q2, d25, d1
    vmlal.u8        q3, d26, d1
    vmlal.u8        q4, d27, d1
    vmlal.u8        q5, d28, d1
    vmlal.u8        q6, d29, d1
    vmlal.u8        q7, d30, d1
    vmlal.u8        q8, d31, d1

    subs            r12, r12, #1            ;flags consumed by the bne below

    vqrshrn.u16     d2, q1, #7              ;shift/round/saturate to u8
    vqrshrn.u16     d3, q2, #7
    vqrshrn.u16     d4, q3, #7
    vqrshrn.u16     d5, q4, #7
    vqrshrn.u16     d6, q5, #7
    vqrshrn.u16     d7, q6, #7
    vqrshrn.u16     d8, q7, #7
    vqrshrn.u16     d9, q8, #7

    vst1.u8         {d2, d3}, [r4], r5      ;store result
    vst1.u8         {d4, d5}, [r4], r5
    vst1.u8         {d6, d7}, [r4], r5
    vmov            q11, q15                ;last row loaded becomes the first
                                            ;row of the next iteration
    vst1.u8         {d8, d9}, [r4], r5

    bne             filt_blk2d_sp16x16_loop_neon

    add             sp, sp, #272            ;release the scratch buffer

    vpop            {d8-d15}
    pop             {r4-r5,pc}

;--------------------
firstpass_bfilter16x16_only
    mov             r2, #4                  ;loop counter (4 x 4 rows)
    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
    vdup.8          d1, d31[4]

;First Pass: output_height lines x output_width columns (16x16)
filt_blk2d_fpo16x16_loop_neon
    vld1.u8         {d2, d3, d4}, [r0], r1  ;load src data
    vld1.u8         {d5, d6, d7}, [r0], r1
    vld1.u8         {d8, d9, d10}, [r0], r1
    vld1.u8         {d11, d12, d13}, [r0], r1

    pld             [r0]
    pld             [r0, r1]
    pld             [r0, r1, lsl #1]

    vmull.u8        q7, d2, d0              ;(src_ptr[0] * vp8_filter[0])
    vmull.u8        q8, d3, d0
    vmull.u8        q9, d5, d0
    vmull.u8        q10, d6, d0
    vmull.u8        q11, d8, d0
    vmull.u8        q12, d9, d0
    vmull.u8        q13, d11, d0
    vmull.u8        q14, d12, d0

    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
    vext.8          d5, d5, d6, #1
    vext.8          d8, d8, d9, #1
    vext.8          d11, d11, d12, #1

    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q9, d5, d1
    vmlal.u8        q11, d8, d1
    vmlal.u8        q13, d11, d1

    vext.8          d3, d3, d4, #1
    vext.8          d6, d6, d7, #1
    vext.8          d9, d9, d10, #1
    vext.8          d12, d12, d13, #1

    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * vp8_filter[1])
    vmlal.u8        q10, d6, d1
    vmlal.u8        q12, d9, d1
    vmlal.u8        q14, d12, d1

    subs            r2, r2, #1              ;flags consumed by the bne below

    vqrshrn.u16     d14, q7, #7             ;shift/round/saturate to u8
    vqrshrn.u16     d15, q8, #7
    vqrshrn.u16     d16, q9, #7
    vqrshrn.u16     d17, q10, #7
    vqrshrn.u16     d18, q11, #7
    vqrshrn.u16     d19, q12, #7
    vqrshrn.u16     d20, q13, #7
    vst1.u8         {d14, d15}, [r4], r5    ;store result
    vqrshrn.u16     d21, q14, #7

    vst1.u8         {d16, d17}, [r4], r5
    vst1.u8         {d18, d19}, [r4], r5
    vst1.u8         {d20, d21}, [r4], r5

    bne             filt_blk2d_fpo16x16_loop_neon

    vpop            {d8-d15}
    pop             {r4-r5,pc}

;---------------------
secondpass_bfilter16x16_only
;Second pass: 16x16
;secondpass_filter
    add             r3, r12, r3, lsl #3
    mov             r12, #4                 ;loop counter (4 x 4 rows)
    vld1.u32        {d31}, [r3]             ;load second_pass filter
    vld1.u8         {d22, d23}, [r0], r1    ;load src data

    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
    vdup.8          d1, d31[4]

;q11 always holds the row above the four rows loaded in each iteration.
filt_blk2d_spo16x16_loop_neon
    vld1.u8         {d24, d25}, [r0], r1
    vmull.u8        q1, d22, d0             ;(src_ptr[0] * vp8_filter[0])
    vld1.u8         {d26, d27}, [r0], r1
    vmull.u8        q2, d23, d0
    vld1.u8         {d28, d29}, [r0], r1
    vmull.u8        q3, d24, d0
    vld1.u8         {d30, d31}, [r0], r1

    vmull.u8        q4, d25, d0
    vmull.u8        q5, d26, d0
    vmull.u8        q6, d27, d0
    vmull.u8        q7, d28, d0
    vmull.u8        q8, d29, d0

    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * vp8_filter[1])
    vmlal.u8        q2, d25, d1
    vmlal.u8        q3, d26, d1
    vmlal.u8        q4, d27, d1
    vmlal.u8        q5, d28, d1
    vmlal.u8        q6, d29, d1
    vmlal.u8        q7, d30, d1
    vmlal.u8        q8, d31, d1

    vqrshrn.u16     d2, q1, #7              ;shift/round/saturate to u8
    vqrshrn.u16     d3, q2, #7
    vqrshrn.u16     d4, q3, #7
    vqrshrn.u16     d5, q4, #7
    vqrshrn.u16     d6, q5, #7
    vqrshrn.u16     d7, q6, #7
    vqrshrn.u16     d8, q7, #7
    vqrshrn.u16     d9, q8, #7

    vst1.u8         {d2, d3}, [r4], r5      ;store result
    subs            r12, r12, #1            ;flags consumed by the bne below
    vst1.u8         {d4, d5}, [r4], r5
    vmov            q11, q15                ;last row loaded becomes the first
                                            ;row of the next iteration
    vst1.u8         {d6, d7}, [r4], r5
    vst1.u8         {d8, d9}, [r4], r5

    bne             filt_blk2d_spo16x16_loop_neon

    vpop            {d8-d15}
    pop             {r4-r5,pc}

    ENDP

;-----------------
;Bilinear filter taps: 8 entries of two 32-bit words {filter[0], filter[1]},
;indexed by xoffset/yoffset (entry address = table + offset * 8).  Only the
;low byte of each word is used (vdup.8 from byte lanes 0 and 4).
;Kept directly after the code so that it stays within range of the adr above.

bifilter16_coeff
    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112

    END

mercurial