media/libvpx/vp8/common/arm/neon/sixtappredict8x8_neon.asm

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Thu, 15 Jan 2015 15:59:08 +0100
branch       TOR_BUG_9701
changeset    10:ac0c01689b40
permissions  -rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_sixtap_predict8x8_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

filter8_coeff
    DCD     0,   0, 128,   0,   0,  0,  0,  0
    DCD     0,  -6, 123,  12,  -1,  0,  0,  0
    DCD     2, -11, 108,  36,  -8,  1,  0,  0
    DCD     0,  -9,  93,  50,  -6,  0,  0,  0
    DCD     3, -16,  77,  77, -16,  3,  0,  0
    DCD     0,  -6,  50,  93,  -9,  0,  0,  0
    DCD     1,  -8,  36, 108, -11,  2,  0,  0
    DCD     0,  -1,  12, 123,  -6,  0,  0,  0

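; Each row of filter8_coeff is one 6-tap kernel padded to eight 32-bit words
; (32 bytes), so a sub-pixel offset n selects its kernel with base + (n << 5).
; The taps are stored signed; the code takes vabs of the loaded words and
; applies the known-negative taps with vmlsl. As an illustrative sketch (not
; the exact reference code), one output pixel of a single filtering pass is:
;
;   out = clip_u8((f0*p[-2] + f1*p[-1] + f2*p[0]
;                + f3*p[1]  + f4*p[2]  + f5*p[3] + 64) >> 7)
;
; where f0..f5 are the signed taps of the selected row, and the "+64, >> 7"
; rounding matches the vqrshrun.s16 #7 narrowing used throughout this file.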
; r0        unsigned char  *src_ptr,
; r1        int  src_pixels_per_line,
; r2        int  xoffset,
; r3        int  yoffset,
; stack(r4) unsigned char *dst_ptr,
; stack(r5) int  dst_pitch

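; Roughly, in C-style pseudocode (a sketch for orientation only, not the
; actual reference implementation), the common path below computes:
;
;   uint8_t tmp[13 * 8];   /* 8 rows in the 64-byte stack area, 5 in regs */
;   /* first pass: horizontal 6-tap over 13 rows, starting 2 rows/cols back */
;   for (r = 0; r < 13; r++)
;       for (c = 0; c < 8; c++)
;           tmp[r * 8 + c] =
;               filter6(src_ptr + (r - 2) * src_pixels_per_line + c - 2,
;                       1, hfilter);
;   /* second pass: vertical 6-tap down the 13 intermediate rows */
;   for (r = 0; r < 8; r++)
;       for (c = 0; c < 8; c++)
;           dst_ptr[r * dst_pitch + c] = filter6(tmp + r * 8 + c, 8, vfilter);
;
; filter6() is a hypothetical helper applying the formula noted under
; filter8_coeff at the given pixel step; hfilter and vfilter stand for the
; filter8_coeff rows selected by xoffset and yoffset. If xoffset == 0 only
; the vertical pass runs (secondpass_filter8x8_only); if yoffset == 0 only
; the horizontal pass runs (firstpass_filter8x8_only).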
|vp8_sixtap_predict8x8_neon| PROC
    push            {r4-r5, lr}

    adr             r12, filter8_coeff

    ldr             r4, [sp, #12]           ;load parameters from stack
    ldr             r5, [sp, #16]           ;load parameters from stack

    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
    beq             secondpass_filter8x8_only

    add             r2, r12, r2, lsl #5     ;calculate filter location

    cmp             r3, #0                  ;skip second_pass filter if yoffset=0

    vld1.s32        {q14, q15}, [r2]        ;load first_pass filter

    beq             firstpass_filter8x8_only

    sub             sp, sp, #64             ;reserve space on stack for temporary storage
    mov             lr, sp

    vabs.s32        q12, q14
    vabs.s32        q13, q15

    mov             r2, #2                  ;loop counter
    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)
    sub             r0, r0, r1, lsl #1

    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
    vdup.8          d1, d24[4]
    vdup.8          d2, d25[0]

;First pass: output_height lines x output_width columns (13x8)
    vld1.u8         {q3}, [r0], r1          ;load src data
    vdup.8          d3, d25[4]
    vld1.u8         {q4}, [r0], r1
    vdup.8          d4, d26[0]
    vld1.u8         {q5}, [r0], r1
    vdup.8          d5, d26[4]
    vld1.u8         {q6}, [r0], r1

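; The loop below produces four intermediate rows per iteration (r2 = 2
; iterations -> 8 rows); the five extra rows needed by the vertical pass are
; filtered after the loop. For each row, vext builds the shifted copies
; src_ptr[-1]..src_ptr[3] from the 16-byte load, and the taps are accumulated
; with vmull/vmlal/vmlsl. The largest tap, vp8_filter[3], is kept in a
; separate accumulator (q3-q6) and folded in with a saturating vqadd.s16 so
; the 16-bit partial sums saturate instead of wrapping.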
filt_blk2d_fp8x8_loop_neon
    pld             [r0]
    pld             [r0, r1]
    pld             [r0, r1, lsl #1]

    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8        q8, d8, d0
    vmull.u8        q9, d10, d0
    vmull.u8        q10, d12, d0

    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
    vext.8          d29, d8, d9, #1
    vext.8          d30, d10, d11, #1
    vext.8          d31, d12, d13, #1

    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q8, d29, d1
    vmlsl.u8        q9, d30, d1
    vmlsl.u8        q10, d31, d1

    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
    vext.8          d29, d8, d9, #4
    vext.8          d30, d10, d11, #4
    vext.8          d31, d12, d13, #4

    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q8, d29, d4
    vmlsl.u8        q9, d30, d4
    vmlsl.u8        q10, d31, d4

    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
    vext.8          d29, d8, d9, #2
    vext.8          d30, d10, d11, #2
    vext.8          d31, d12, d13, #2

    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q8, d29, d2
    vmlal.u8        q9, d30, d2
    vmlal.u8        q10, d31, d2

    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
    vext.8          d29, d8, d9, #5
    vext.8          d30, d10, d11, #5
    vext.8          d31, d12, d13, #5

    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8        q8, d29, d5
    vmlal.u8        q9, d30, d5
    vmlal.u8        q10, d31, d5

    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
    vext.8          d29, d8, d9, #3
    vext.8          d30, d10, d11, #3
    vext.8          d31, d12, d13, #3

    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp8_filter[3])
    vmull.u8        q4, d29, d3
    vmull.u8        q5, d30, d3
    vmull.u8        q6, d31, d3

    subs            r2, r2, #1

    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
    vqadd.s16       q8, q4
    vqadd.s16       q9, q5
    vqadd.s16       q10, q6

    vld1.u8         {q3}, [r0], r1          ;load src data

    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
    vqrshrun.s16    d23, q8, #7
    vqrshrun.s16    d24, q9, #7
    vqrshrun.s16    d25, q10, #7

    vst1.u8         {d22}, [lr]!            ;store result
    vld1.u8         {q4}, [r0], r1
    vst1.u8         {d23}, [lr]!
    vld1.u8         {q5}, [r0], r1
    vst1.u8         {d24}, [lr]!
    vld1.u8         {q6}, [r0], r1
    vst1.u8         {d25}, [lr]!

    bne             filt_blk2d_fp8x8_loop_neon

;first_pass filtering on the remaining 5 lines of data
;   vld1.u8         {q3}, [r0], r1          ;load src data
;   vld1.u8         {q4}, [r0], r1
;   vld1.u8         {q5}, [r0], r1
;   vld1.u8         {q6}, [r0], r1
    vld1.u8         {q7}, [r0], r1

    vmull.u8        q8, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8        q9, d8, d0
    vmull.u8        q10, d10, d0
    vmull.u8        q11, d12, d0
    vmull.u8        q12, d14, d0

    vext.8          d27, d6, d7, #1         ;construct src_ptr[-1]
    vext.8          d28, d8, d9, #1
    vext.8          d29, d10, d11, #1
    vext.8          d30, d12, d13, #1
    vext.8          d31, d14, d15, #1

    vmlsl.u8        q8, d27, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q9, d28, d1
    vmlsl.u8        q10, d29, d1
    vmlsl.u8        q11, d30, d1
    vmlsl.u8        q12, d31, d1

    vext.8          d27, d6, d7, #4         ;construct src_ptr[2]
    vext.8          d28, d8, d9, #4
    vext.8          d29, d10, d11, #4
    vext.8          d30, d12, d13, #4
    vext.8          d31, d14, d15, #4

    vmlsl.u8        q8, d27, d4             ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q9, d28, d4
    vmlsl.u8        q10, d29, d4
    vmlsl.u8        q11, d30, d4
    vmlsl.u8        q12, d31, d4

    vext.8          d27, d6, d7, #2         ;construct src_ptr[0]
    vext.8          d28, d8, d9, #2
    vext.8          d29, d10, d11, #2
    vext.8          d30, d12, d13, #2
    vext.8          d31, d14, d15, #2

    vmlal.u8        q8, d27, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q9, d28, d2
    vmlal.u8        q10, d29, d2
    vmlal.u8        q11, d30, d2
    vmlal.u8        q12, d31, d2

    vext.8          d27, d6, d7, #5         ;construct src_ptr[3]
    vext.8          d28, d8, d9, #5
    vext.8          d29, d10, d11, #5
    vext.8          d30, d12, d13, #5
    vext.8          d31, d14, d15, #5

    vmlal.u8        q8, d27, d5             ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8        q9, d28, d5
    vmlal.u8        q10, d29, d5
    vmlal.u8        q11, d30, d5
    vmlal.u8        q12, d31, d5

    vext.8          d27, d6, d7, #3         ;construct src_ptr[1]
    vext.8          d28, d8, d9, #3
    vext.8          d29, d10, d11, #3
    vext.8          d30, d12, d13, #3
    vext.8          d31, d14, d15, #3

    vmull.u8        q3, d27, d3             ;(src_ptr[1] * vp8_filter[3])
    vmull.u8        q4, d28, d3
    vmull.u8        q5, d29, d3
    vmull.u8        q6, d30, d3
    vmull.u8        q7, d31, d3

    vqadd.s16       q8, q3                  ;sum of all (src_data*filter_parameters)
    vqadd.s16       q9, q4
    vqadd.s16       q10, q5
    vqadd.s16       q11, q6
    vqadd.s16       q12, q7

    add             r3, r12, r3, lsl #5

    vqrshrun.s16    d26, q8, #7             ;shift/round/saturate to u8
    sub             lr, lr, #64
    vqrshrun.s16    d27, q9, #7
    vld1.u8         {q9}, [lr]!             ;load intermediate data from stack
    vqrshrun.s16    d28, q10, #7
    vld1.u8         {q10}, [lr]!

    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter

    vqrshrun.s16    d29, q11, #7
    vld1.u8         {q11}, [lr]!

    vabs.s32        q7, q5
    vabs.s32        q8, q6

    vqrshrun.s16    d30, q12, #7
    vld1.u8         {q12}, [lr]!

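; At this point the 13 horizontally filtered rows sit in q9-q12 (the 8 rows
; reloaded from the stack buffer) and d26-d30 (the 5 rows produced above).
; The second-pass loop below is the vertical half of the C-style sketch near
; the top of the file: four output rows per iteration, each 8-byte result row
; stored to dst_ptr with a dst_pitch stride.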
;Second pass: 8x8
    mov             r3, #2                  ;loop counter

    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
    vdup.8          d1, d14[4]
    vdup.8          d2, d15[0]
    vdup.8          d3, d15[4]
    vdup.8          d4, d16[0]
    vdup.8          d5, d16[4]

filt_blk2d_sp8x8_loop_neon
    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8        q4, d19, d0
    vmull.u8        q5, d20, d0
    vmull.u8        q6, d21, d0

    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q4, d20, d1
    vmlsl.u8        q5, d21, d1
    vmlsl.u8        q6, d22, d1

    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q4, d23, d4
    vmlsl.u8        q5, d24, d4
    vmlsl.u8        q6, d25, d4

    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q4, d21, d2
    vmlal.u8        q5, d22, d2
    vmlal.u8        q6, d23, d2

    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8        q4, d24, d5
    vmlal.u8        q5, d25, d5
    vmlal.u8        q6, d26, d5

    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp8_filter[3])
    vmull.u8        q8, d22, d3
    vmull.u8        q9, d23, d3
    vmull.u8        q10, d24, d3

    subs            r3, r3, #1

    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
    vqadd.s16       q8, q4
    vqadd.s16       q9, q5
    vqadd.s16       q10, q6

    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
    vqrshrun.s16    d7, q8, #7
    vqrshrun.s16    d8, q9, #7
    vqrshrun.s16    d9, q10, #7

    vmov            q9, q11
    vst1.u8         {d6}, [r4], r5          ;store result
    vmov            q10, q12
    vst1.u8         {d7}, [r4], r5
    vmov            q11, q13
    vst1.u8         {d8}, [r4], r5
    vmov            q12, q14
    vst1.u8         {d9}, [r4], r5
    vmov            d26, d30

    bne             filt_blk2d_sp8x8_loop_neon

    add             sp, sp, #64
    pop             {r4-r5,pc}

;---------------------
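; yoffset == 0: only the horizontal 6-tap filter is needed, so this path
; reads exactly 8 source rows and writes straight to dst_ptr, skipping the
; intermediate stack buffer. The per-pixel arithmetic matches the first-pass
; loop above.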
firstpass_filter8x8_only
    ;add            r2, r12, r2, lsl #5     ;calculate filter location
    ;vld1.s32       {q14, q15}, [r2]        ;load first_pass filter
    vabs.s32        q12, q14
    vabs.s32        q13, q15

    mov             r2, #2                  ;loop counter
    sub             r0, r0, #2              ;move srcptr back to (line-2) and (column-2)

    vdup.8          d0, d24[0]              ;first_pass filter (d0-d5)
    vdup.8          d1, d24[4]
    vdup.8          d2, d25[0]
    vdup.8          d3, d25[4]
    vdup.8          d4, d26[0]
    vdup.8          d5, d26[4]

;First pass: output_height lines x output_width columns (8x8)
filt_blk2d_fpo8x8_loop_neon
    vld1.u8         {q3}, [r0], r1          ;load src data
    vld1.u8         {q4}, [r0], r1
    vld1.u8         {q5}, [r0], r1
    vld1.u8         {q6}, [r0], r1

    pld             [r0]
    pld             [r0, r1]
    pld             [r0, r1, lsl #1]

    vmull.u8        q7, d6, d0              ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8        q8, d8, d0
    vmull.u8        q9, d10, d0
    vmull.u8        q10, d12, d0

    vext.8          d28, d6, d7, #1         ;construct src_ptr[-1]
    vext.8          d29, d8, d9, #1
    vext.8          d30, d10, d11, #1
    vext.8          d31, d12, d13, #1

    vmlsl.u8        q7, d28, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q8, d29, d1
    vmlsl.u8        q9, d30, d1
    vmlsl.u8        q10, d31, d1

    vext.8          d28, d6, d7, #4         ;construct src_ptr[2]
    vext.8          d29, d8, d9, #4
    vext.8          d30, d10, d11, #4
    vext.8          d31, d12, d13, #4

    vmlsl.u8        q7, d28, d4             ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q8, d29, d4
    vmlsl.u8        q9, d30, d4
    vmlsl.u8        q10, d31, d4

    vext.8          d28, d6, d7, #2         ;construct src_ptr[0]
    vext.8          d29, d8, d9, #2
    vext.8          d30, d10, d11, #2
    vext.8          d31, d12, d13, #2

    vmlal.u8        q7, d28, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q8, d29, d2
    vmlal.u8        q9, d30, d2
    vmlal.u8        q10, d31, d2

    vext.8          d28, d6, d7, #5         ;construct src_ptr[3]
    vext.8          d29, d8, d9, #5
    vext.8          d30, d10, d11, #5
    vext.8          d31, d12, d13, #5

    vmlal.u8        q7, d28, d5             ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8        q8, d29, d5
    vmlal.u8        q9, d30, d5
    vmlal.u8        q10, d31, d5

    vext.8          d28, d6, d7, #3         ;construct src_ptr[1]
    vext.8          d29, d8, d9, #3
    vext.8          d30, d10, d11, #3
    vext.8          d31, d12, d13, #3

    vmull.u8        q3, d28, d3             ;(src_ptr[1] * vp8_filter[3])
    vmull.u8        q4, d29, d3
    vmull.u8        q5, d30, d3
    vmull.u8        q6, d31, d3
;
    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
    vqadd.s16       q8, q4
    vqadd.s16       q9, q5
    vqadd.s16       q10, q6

    subs            r2, r2, #1

    vqrshrun.s16    d22, q7, #7             ;shift/round/saturate to u8
    vqrshrun.s16    d23, q8, #7
    vqrshrun.s16    d24, q9, #7
    vqrshrun.s16    d25, q10, #7

    vst1.u8         {d22}, [r4], r5         ;store result
    vst1.u8         {d23}, [r4], r5
    vst1.u8         {d24}, [r4], r5
    vst1.u8         {d25}, [r4], r5

    bne             filt_blk2d_fpo8x8_loop_neon

    pop             {r4-r5,pc}

;---------------------
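; xoffset == 0: only the vertical 6-tap filter is needed. The 13 source rows
; (starting two rows above the block) are loaded straight into d18-d30 and
; filtered down the columns, so no first pass and no stack buffer are used.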
secondpass_filter8x8_only
    sub             r0, r0, r1, lsl #1
    add             r3, r12, r3, lsl #5

    vld1.u8         {d18}, [r0], r1         ;load src data
    vld1.s32        {q5, q6}, [r3]          ;load second_pass filter
    vld1.u8         {d19}, [r0], r1
    vabs.s32        q7, q5
    vld1.u8         {d20}, [r0], r1
    vabs.s32        q8, q6
    vld1.u8         {d21}, [r0], r1
    mov             r3, #2                  ;loop counter
    vld1.u8         {d22}, [r0], r1
    vdup.8          d0, d14[0]              ;second_pass filter parameters (d0-d5)
    vld1.u8         {d23}, [r0], r1
    vdup.8          d1, d14[4]
    vld1.u8         {d24}, [r0], r1
    vdup.8          d2, d15[0]
    vld1.u8         {d25}, [r0], r1
    vdup.8          d3, d15[4]
    vld1.u8         {d26}, [r0], r1
    vdup.8          d4, d16[0]
    vld1.u8         {d27}, [r0], r1
    vdup.8          d5, d16[4]
    vld1.u8         {d28}, [r0], r1
    vld1.u8         {d29}, [r0], r1
    vld1.u8         {d30}, [r0], r1

;Second pass: 8x8
filt_blk2d_spo8x8_loop_neon
    vmull.u8        q3, d18, d0             ;(src_ptr[-2] * vp8_filter[0])
    vmull.u8        q4, d19, d0
    vmull.u8        q5, d20, d0
    vmull.u8        q6, d21, d0

    vmlsl.u8        q3, d19, d1             ;-(src_ptr[-1] * vp8_filter[1])
    vmlsl.u8        q4, d20, d1
    vmlsl.u8        q5, d21, d1
    vmlsl.u8        q6, d22, d1

    vmlsl.u8        q3, d22, d4             ;-(src_ptr[2] * vp8_filter[4])
    vmlsl.u8        q4, d23, d4
    vmlsl.u8        q5, d24, d4
    vmlsl.u8        q6, d25, d4

    vmlal.u8        q3, d20, d2             ;(src_ptr[0] * vp8_filter[2])
    vmlal.u8        q4, d21, d2
    vmlal.u8        q5, d22, d2
    vmlal.u8        q6, d23, d2

    vmlal.u8        q3, d23, d5             ;(src_ptr[3] * vp8_filter[5])
    vmlal.u8        q4, d24, d5
    vmlal.u8        q5, d25, d5
    vmlal.u8        q6, d26, d5

    vmull.u8        q7, d21, d3             ;(src_ptr[1] * vp8_filter[3])
    vmull.u8        q8, d22, d3
    vmull.u8        q9, d23, d3
    vmull.u8        q10, d24, d3

    subs            r3, r3, #1

    vqadd.s16       q7, q3                  ;sum of all (src_data*filter_parameters)
    vqadd.s16       q8, q4
    vqadd.s16       q9, q5
    vqadd.s16       q10, q6

    vqrshrun.s16    d6, q7, #7              ;shift/round/saturate to u8
    vqrshrun.s16    d7, q8, #7
    vqrshrun.s16    d8, q9, #7
    vqrshrun.s16    d9, q10, #7

    vmov            q9, q11
    vst1.u8         {d6}, [r4], r5          ;store result
    vmov            q10, q12
    vst1.u8         {d7}, [r4], r5
    vmov            q11, q13
    vst1.u8         {d8}, [r4], r5
    vmov            q12, q14
    vst1.u8         {d9}, [r4], r5
    vmov            d26, d30

    bne             filt_blk2d_spo8x8_loop_neon

    pop             {r4-r5,pc}

    ENDP

;-----------------

    END
