media/libvpx/vp8/encoder/arm/neon/subtract_neon.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp8/encoder/arm/neon/subtract_neon.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,199 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +    EXPORT |vp8_subtract_b_neon|
    1.15 +    EXPORT |vp8_subtract_mby_neon|
    1.16 +    EXPORT |vp8_subtract_mbuv_neon|
    1.17 +
    1.18 +    INCLUDE vp8_asm_enc_offsets.asm
    1.19 +
    1.20 +    ARM
    1.21 +    REQUIRE8
    1.22 +    PRESERVE8
    1.23 +
    1.24 +    AREA ||.text||, CODE, READONLY, ALIGN=2
    1.25 +
    1.26 +;void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
     1.27 +|vp8_subtract_b_neon| PROC
     1.28 +    ; 4x4 residual: be->src_diff = src - bd->predictor (16-bit output)
     1.29 +    stmfd   sp!, {r4-r7}                    ; save AAPCS callee-saved core regs
     1.30 +    ; in: r0 = be (BLOCK*), r1 = bd (BLOCKD*), r2 = pitch (pred stride, bytes)
     1.31 +    ldr     r3, [r0, #vp8_block_base_src]  ; r3 = &be->base_src
     1.32 +    ldr     r4, [r0, #vp8_block_src]       ; r4 = be->src (byte offset)
     1.33 +    ldr     r5, [r0, #vp8_block_src_diff]  ; r5 = be->src_diff (output)
     1.34 +    ldr     r3, [r3]                       ; r3 = *base_src
     1.35 +    ldr     r6, [r0, #vp8_block_src_stride] ; r6 = source stride (bytes)
     1.36 +    add     r3, r3, r4                      ; src = *base_src + src
     1.37 +    ldr     r7, [r1, #vp8_blockd_predictor] ; r7 = bd->predictor
     1.38 +
     1.39 +    vld1.8          {d0}, [r3], r6          ;load src (8 bytes; only 4 diffs stored below)
     1.40 +    vld1.8          {d1}, [r7], r2          ;load pred
     1.41 +    vld1.8          {d2}, [r3], r6
     1.42 +    vld1.8          {d3}, [r7], r2
     1.43 +    vld1.8          {d4}, [r3], r6
     1.44 +    vld1.8          {d5}, [r7], r2
     1.45 +    vld1.8          {d6}, [r3], r6
     1.46 +    vld1.8          {d7}, [r7], r2
     1.47 +
     1.48 +    vsubl.u8        q10, d0, d1             ; row 0: src - pred, widened u8 -> s16
     1.49 +    vsubl.u8        q11, d2, d3             ; row 1
     1.50 +    vsubl.u8        q12, d4, d5             ; row 2
     1.51 +    vsubl.u8        q13, d6, d7             ; row 3
     1.52 +
     1.53 +    mov             r2, r2, lsl #1          ; diff rows are shorts: byte stride = pitch * 2
     1.54 +
     1.55 +    vst1.16         {d20}, [r5], r2         ;store diff (4 shorts per row)
     1.56 +    vst1.16         {d22}, [r5], r2
     1.57 +    vst1.16         {d24}, [r5], r2
     1.58 +    vst1.16         {d26}, [r5], r2
     1.59 +
     1.60 +    ldmfd   sp!, {r4-r7}                    ; restore saved regs
     1.61 +    bx              lr
     1.62 +
     1.63 +    ENDP
    1.64 +
    1.65 +
    1.66 +;==========================================
     1.67 +;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride,
     1.68 +;                           unsigned char *pred, int pred_stride)
     1.69 +|vp8_subtract_mby_neon| PROC
     1.70 +    push            {r4-r7}                 ; in: r0=diff, r1=src, r2=src_stride, r3=pred
     1.71 +    mov             r12, #4                 ; loop counter: 4 passes x 4 rows = 16 rows
     1.72 +    ldr             r4, [sp, #16]           ; pred_stride (stack arg, above 4 pushed regs)
     1.73 +    mov             r6, #32                 ; "diff" stride x2 (16 shorts = 32 bytes per row)
     1.74 +    add             r5, r0, #16             ; second diff pointer (cols 8-15 of each row)
     1.75 +    ; NOTE(review): clobbers q4-q7 (d8-d15), callee-saved under AAPCS VFP - confirm callers
     1.76 +subtract_mby_loop
     1.77 +    vld1.8          {q0}, [r1], r2          ;load src (one 16-byte row per q reg)
     1.78 +    vld1.8          {q1}, [r3], r4          ;load pred
     1.79 +    vld1.8          {q2}, [r1], r2
     1.80 +    vld1.8          {q3}, [r3], r4
     1.81 +    vld1.8          {q4}, [r1], r2
     1.82 +    vld1.8          {q5}, [r3], r4
     1.83 +    vld1.8          {q6}, [r1], r2
     1.84 +    vld1.8          {q7}, [r3], r4
     1.85 +
     1.86 +    vsubl.u8        q8, d0, d2              ; row 0 cols 0-7: src - pred, u8 -> s16
     1.87 +    vsubl.u8        q9, d1, d3              ; row 0 cols 8-15
     1.88 +    vsubl.u8        q10, d4, d6             ; row 1
     1.89 +    vsubl.u8        q11, d5, d7
     1.90 +    vsubl.u8        q12, d8, d10            ; row 2
     1.91 +    vsubl.u8        q13, d9, d11
     1.92 +    vsubl.u8        q14, d12, d14           ; row 3
     1.93 +    vsubl.u8        q15, d13, d15
     1.94 +
     1.95 +    vst1.16         {q8}, [r0], r6          ;store diff (r0 writes left halves,
     1.96 +    vst1.16         {q9}, [r5], r6          ; r5 writes right halves, same rows)
     1.97 +    vst1.16         {q10}, [r0], r6
     1.98 +    vst1.16         {q11}, [r5], r6
     1.99 +    vst1.16         {q12}, [r0], r6
    1.100 +    vst1.16         {q13}, [r5], r6
    1.101 +    vst1.16         {q14}, [r0], r6
    1.102 +    vst1.16         {q15}, [r5], r6
    1.103 +
    1.104 +    subs            r12, r12, #1            ; one 4-row pass done
    1.105 +    bne             subtract_mby_loop
    1.106 +
    1.107 +    pop             {r4-r7}
    1.108 +    bx              lr
    1.109 +    ENDP
   1.110 +
   1.111 +;=================================
    1.112 +;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc,
   1.113 +;                         int src_stride, unsigned char *upred,
   1.114 +;                         unsigned char *vpred, int pred_stride)
   1.115 +
    1.116 +|vp8_subtract_mbuv_neon| PROC
    1.117 +    push            {r4-r7}             ; in: r0=diff, r1=usrc, r2=vsrc, r3=src_stride
    1.118 +    ldr             r4, [sp, #16]       ; upred (stack args sit above the 4 pushed regs)
    1.119 +    ldr             r5, [sp, #20]       ; vpred
    1.120 +    ldr             r6, [sp, #24]       ; pred_stride
    1.121 +    add             r0, r0, #512        ; short *udiff = diff + 256; (skip 16x16 luma diffs)
    1.122 +    mov             r12, #32            ; "diff" stride x2 (two 8-short rows = 32 bytes)
    1.123 +    add             r7, r0, #16         ; second diff pointer (r0 = even rows, r7 = odd rows)
    1.124 +    ; NOTE(review): clobbers q4-q7 (d8-d15), callee-saved under AAPCS VFP - confirm callers
    1.125 +;u - 8x8 block, one d register per row
    1.126 +    vld1.8          {d0}, [r1], r3      ;load usrc
    1.127 +    vld1.8          {d1}, [r4], r6      ;load upred
    1.128 +    vld1.8          {d2}, [r1], r3
    1.129 +    vld1.8          {d3}, [r4], r6
    1.130 +    vld1.8          {d4}, [r1], r3
    1.131 +    vld1.8          {d5}, [r4], r6
    1.132 +    vld1.8          {d6}, [r1], r3
    1.133 +    vld1.8          {d7}, [r4], r6
    1.134 +    vld1.8          {d8}, [r1], r3
    1.135 +    vld1.8          {d9}, [r4], r6
    1.136 +    vld1.8          {d10}, [r1], r3
    1.137 +    vld1.8          {d11}, [r4], r6
    1.138 +    vld1.8          {d12}, [r1], r3
    1.139 +    vld1.8          {d13}, [r4], r6
    1.140 +    vld1.8          {d14}, [r1], r3
    1.141 +    vld1.8          {d15}, [r4], r6
    1.142 +
    1.143 +    vsubl.u8        q8, d0, d1          ; per row: usrc - upred, widened u8 -> s16
    1.144 +    vsubl.u8        q9, d2, d3
    1.145 +    vsubl.u8        q10, d4, d5
    1.146 +    vsubl.u8        q11, d6, d7
    1.147 +    vsubl.u8        q12, d8, d9
    1.148 +    vsubl.u8        q13, d10, d11
    1.149 +    vsubl.u8        q14, d12, d13
    1.150 +    vsubl.u8        q15, d14, d15
    1.151 +
    1.152 +    vst1.16         {q8}, [r0], r12     ;store diff (even rows via r0, odd via r7)
    1.153 +    vst1.16         {q9}, [r7], r12
    1.154 +    vst1.16         {q10}, [r0], r12
    1.155 +    vst1.16         {q11}, [r7], r12
    1.156 +    vst1.16         {q12}, [r0], r12
    1.157 +    vst1.16         {q13}, [r7], r12
    1.158 +    vst1.16         {q14}, [r0], r12
    1.159 +    vst1.16         {q15}, [r7], r12
    1.160 +
    1.161 +;v - same pattern; r0/r7 have advanced past the 8x8 u diffs (128 bytes each)
    1.162 +    vld1.8          {d0}, [r2], r3      ;load vsrc
    1.163 +    vld1.8          {d1}, [r5], r6      ;load vpred
    1.164 +    vld1.8          {d2}, [r2], r3
    1.165 +    vld1.8          {d3}, [r5], r6
    1.166 +    vld1.8          {d4}, [r2], r3
    1.167 +    vld1.8          {d5}, [r5], r6
    1.168 +    vld1.8          {d6}, [r2], r3
    1.169 +    vld1.8          {d7}, [r5], r6
    1.170 +    vld1.8          {d8}, [r2], r3
    1.171 +    vld1.8          {d9}, [r5], r6
    1.172 +    vld1.8          {d10}, [r2], r3
    1.173 +    vld1.8          {d11}, [r5], r6
    1.174 +    vld1.8          {d12}, [r2], r3
    1.175 +    vld1.8          {d13}, [r5], r6
    1.176 +    vld1.8          {d14}, [r2], r3
    1.177 +    vld1.8          {d15}, [r5], r6
    1.178 +
    1.179 +    vsubl.u8        q8, d0, d1          ; per row: vsrc - vpred, widened u8 -> s16
    1.180 +    vsubl.u8        q9, d2, d3
    1.181 +    vsubl.u8        q10, d4, d5
    1.182 +    vsubl.u8        q11, d6, d7
    1.183 +    vsubl.u8        q12, d8, d9
    1.184 +    vsubl.u8        q13, d10, d11
    1.185 +    vsubl.u8        q14, d12, d13
    1.186 +    vsubl.u8        q15, d14, d15
    1.187 +
    1.188 +    vst1.16         {q8}, [r0], r12     ;store diff (even rows via r0, odd via r7)
    1.189 +    vst1.16         {q9}, [r7], r12
    1.190 +    vst1.16         {q10}, [r0], r12
    1.191 +    vst1.16         {q11}, [r7], r12
    1.192 +    vst1.16         {q12}, [r0], r12
    1.193 +    vst1.16         {q13}, [r7], r12
    1.194 +    vst1.16         {q14}, [r0], r12
    1.195 +    vst1.16         {q15}, [r7], r12
    1.196 +
    1.197 +    pop             {r4-r7}
    1.198 +    bx              lr
    1.199 +
    1.200 +    ENDP
   1.201 +
   1.202 +    END

mercurial