media/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,272 @@
     1.4 +;
     1.5 +;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +
    1.15 +    EXPORT  |vp8_subtract_mby_armv6|     ; make the three entry points below visible to the linker
    1.16 +    EXPORT  |vp8_subtract_mbuv_armv6|
    1.17 +    EXPORT  |vp8_subtract_b_armv6|
    1.18 +
    1.19 +    INCLUDE vp8_asm_enc_offsets.asm      ; presumably supplies the vp8_block_* / vp8_blockd_* offsets used below (not defined in this file)
    1.20 +
    1.21 +    ARM                                  ; assemble what follows as ARM (not Thumb) instructions
    1.22 +    REQUIRE8                             ; this code requires 8-byte stack alignment
    1.23 +    PRESERVE8                            ; this code preserves 8-byte stack alignment
    1.24 +
    1.25 +    AREA ||.text||, CODE, READONLY, ALIGN=2  ; read-only code section, aligned to 2^2 = 4 bytes
    1.26 +
    1.27 +; r0    BLOCK *be     (base_src, src, src_diff and src_stride are read from it)
    1.28 +; r1    BLOCKD *bd    (only the predictor pointer is read from it)
    1.29 +; r2    int pitch     (predictor row step in bytes; diff row step in shorts)
    1.30 +|vp8_subtract_b_armv6| PROC
    1.31 +    ; Writes a 4x4 block of 16-bit differences (src - pred) to be->src_diff.
    1.32 +    stmfd   sp!, {r4-r9}        ; save r4-r9; restored before returning
    1.33 +
    1.34 +    ldr     r4, [r0, #vp8_block_base_src]   ; r4 = be->base_src
    1.35 +    ldr     r5, [r0, #vp8_block_src]        ; r5 = be->src (offset added to *base_src)
    1.36 +    ldr     r6, [r0, #vp8_block_src_diff]   ; r6 = be->src_diff (output pointer)
    1.37 +
    1.38 +    ldr     r3, [r4]            ; r3 = *base_src
    1.39 +    ldr     r7, [r0, #vp8_block_src_stride] ; r7 = source row step in bytes
    1.40 +    add     r3, r3, r5          ; src = *base_src + src
    1.41 +    ldr     r8, [r1, #vp8_blockd_predictor] ; r8 = bd->predictor
    1.42 +
    1.43 +    mov     r9, #4              ; loop count: one iteration per row
    1.44 +
    1.45 +loop_block
    1.46 +    ; r0/r1 no longer hold be/bd from here on; they are reused as scratch.
    1.47 +    ldr     r0, [r3], r7        ; load 4 src bytes, then src += src_stride
    1.48 +    ldr     r1, [r8], r2        ; load 4 pred bytes, then pred += pitch
    1.49 +
    1.50 +    uxtb16  r4, r0              ; [s2 | s0]  bytes 0 and 2 zero-extended to halfwords
    1.51 +    uxtb16  r5, r1              ; [p2 | p0]
    1.52 +    uxtb16  r0, r0, ror #8      ; [s3 | s1]  bytes 1 and 3 zero-extended to halfwords
    1.53 +    uxtb16  r1, r1, ror #8      ; [p3 | p1]
    1.54 +
    1.55 +    usub16  r4, r4, r5          ; [d2 | d0]  per-halfword s - p
    1.56 +    usub16  r5, r0, r1          ; [d3 | d1]
    1.57 +
    1.58 +    subs    r9, r9, #1          ; decrement loop counter; nothing below alters the flags before bne
    1.59 +
    1.60 +    pkhbt   r0, r4, r5, lsl #16 ; [d1 | d0]  re-interleave into pixel order
    1.61 +    pkhtb   r1, r5, r4, asr #16 ; [d3 | d2]
    1.62 +
    1.63 +    str     r0, [r6, #0]        ; diff[0..1]
    1.64 +    str     r1, [r6, #4]        ; diff[2..3]
    1.65 +
    1.66 +    add     r6, r6, r2, lsl #1  ; diff += pitch shorts (pitch * 2 bytes)
    1.67 +    bne     loop_block          ; uses the flags set by subs above
    1.68 +
    1.69 +    ldmfd   sp!, {r4-r9}
    1.70 +    bx      lr                  ; interworking-safe return, matching the other routines in this file
    1.71 +
    1.72 +    ENDP
    1.73 +
    1.74 +
    1.75 +; r0    short *diff             (U output starts 512 bytes past this pointer)
    1.76 +; r1    unsigned char *usrc
    1.77 +; r2    unsigned char *vsrc
    1.78 +; r3    int src_stride          (row step in bytes for both usrc and vsrc)
    1.79 +; sp    unsigned char *upred    ([sp, #0] on entry)
    1.80 +; sp    unsigned char *vpred    ([sp, #4] on entry)
    1.81 +; sp    int pred_stride         ([sp, #8] on entry; row step for upred and vpred)
    1.82 +|vp8_subtract_mbuv_armv6| PROC
    1.83 +    ; Writes 8 rows x 8 U differences, then 8 rows x 8 V differences, as contiguous shorts.
    1.84 +    stmfd   sp!, {r4-r11}       ; 8 registers = 32 bytes, so stacked args now start at [sp, #32]
    1.85 +
    1.86 +    add     r0, r0, #512        ; diff += 256 shorts: skip the region vp8_subtract_mby_armv6 fills, point at Cb (U)
    1.87 +    mov     r4, #8              ; loop count: 8 rows
    1.88 +    ldr     r5, [sp, #32]       ; upred
    1.89 +    ldr     r12, [sp, #40]      ; pred_stride
    1.90 +
    1.91 +    ; Subtract U block: each iteration handles one row as two 4-pixel groups (A), (B)
    1.92 +loop_u
    1.93 +    ldr     r6, [r1]            ; usrc      (A)
    1.94 +    ldr     r7, [r5]            ; upred     (A)
    1.95 +
    1.96 +    uxtb16  r8, r6              ; [s2 | s0] (A)
    1.97 +    uxtb16  r9, r7              ; [p2 | p0] (A)
    1.98 +    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
    1.99 +    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
   1.100 +
   1.101 +    usub16  r6, r8, r9          ; [d2 | d0] (A)
   1.102 +    usub16  r7, r10, r11        ; [d3 | d1] (A)
   1.103 +
   1.104 +    ldr     r10, [r1, #4]       ; usrc      (B), loaded early while (A) is still being packed
   1.105 +    ldr     r11, [r5, #4]       ; upred     (B)
   1.106 +
   1.107 +    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
   1.108 +    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
   1.109 +
   1.110 +    str     r8, [r0], #4        ; diff      (A)
   1.111 +    uxtb16  r8, r10             ; [s2 | s0] (B), r8 is free again after the store above
   1.112 +    str     r9, [r0], #4        ; diff      (A)
   1.113 +
   1.114 +    uxtb16  r9, r11             ; [p2 | p0] (B)
   1.115 +    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
   1.116 +    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
   1.117 +
   1.118 +    usub16  r6, r8, r9          ; [d2 | d0] (B)
   1.119 +    usub16  r7, r10, r11        ; [d3 | d1] (B)
   1.120 +
   1.121 +    add     r1, r1, r3          ; usrc += src_stride (next row)
   1.122 +    add     r5, r5, r12         ; upred += pred_stride (next row)
   1.123 +
   1.124 +    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
   1.125 +    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
   1.126 +
   1.127 +    str     r8, [r0], #4        ; diff      (B)
   1.128 +    subs    r4, r4, #1          ; update loop counter; the str below leaves the flags intact for bne
   1.129 +    str     r9, [r0], #4        ; diff      (B)
   1.130 +
   1.131 +    bne     loop_u
   1.132 +
   1.133 +    ldr     r5, [sp, #36]       ; vpred
   1.134 +    mov     r4, #8              ; loop count: 8 rows
   1.135 +
   1.136 +    ; Subtract V block: same scheme as loop_u; r0 continues right after the U output
   1.137 +loop_v
   1.138 +    ldr     r6, [r2]            ; vsrc      (A)
   1.139 +    ldr     r7, [r5]            ; vpred     (A)
   1.140 +
   1.141 +    uxtb16  r8, r6              ; [s2 | s0] (A)
   1.142 +    uxtb16  r9, r7              ; [p2 | p0] (A)
   1.143 +    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
   1.144 +    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
   1.145 +
   1.146 +    usub16  r6, r8, r9          ; [d2 | d0] (A)
   1.147 +    usub16  r7, r10, r11        ; [d3 | d1] (A)
   1.148 +
   1.149 +    ldr     r10, [r2, #4]       ; vsrc      (B)
   1.150 +    ldr     r11, [r5, #4]       ; vpred     (B)
   1.151 +
   1.152 +    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
   1.153 +    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
   1.154 +
   1.155 +    str     r8, [r0], #4        ; diff      (A)
   1.156 +    uxtb16  r8, r10             ; [s2 | s0] (B)
   1.157 +    str     r9, [r0], #4        ; diff      (A)
   1.158 +
   1.159 +    uxtb16  r9, r11             ; [p2 | p0] (B)
   1.160 +    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
   1.161 +    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
   1.162 +
   1.163 +    usub16  r6, r8, r9          ; [d2 | d0] (B)
   1.164 +    usub16  r7, r10, r11        ; [d3 | d1] (B)
   1.165 +
   1.166 +    add     r2, r2, r3          ; vsrc += src_stride (next row)
   1.167 +    add     r5, r5, r12         ; vpred += pred_stride (next row)
   1.168 +
   1.169 +    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
   1.170 +    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
   1.171 +
   1.172 +    str     r8, [r0], #4        ; diff      (B)
   1.173 +    subs    r4, r4, #1          ; update loop counter
   1.174 +    str     r9, [r0], #4        ; diff      (B)
   1.175 +
   1.176 +    bne     loop_v
   1.177 +
   1.178 +    ldmfd   sp!, {r4-r11}
   1.179 +    bx      lr
   1.180 +
   1.181 +    ENDP
   1.182 +
   1.183 +
   1.184 +; r0    short *diff             (output, written as 16 contiguous rows of 16 shorts)
   1.185 +; r1    unsigned char *src
   1.186 +; r2    int src_stride          (row step in bytes for src)
   1.187 +; r3    unsigned char *pred
   1.188 +; sp    int pred_stride         ([sp, #0] on entry; row step in bytes for pred)
   1.189 +|vp8_subtract_mby_armv6| PROC
   1.190 +    ; Writes 16 rows x 16 differences (src - pred); each row is four 4-pixel groups (A)-(D).
   1.191 +    stmfd   sp!, {r4-r11}       ; 8 registers = 32 bytes, so the stacked arg is now at [sp, #32]
   1.192 +    ldr     r12, [sp, #32]      ; pred_stride
   1.193 +    mov     r4, #16             ; loop count: 16 rows
   1.194 +loop
   1.195 +    ldr     r6, [r1]            ; src       (A)
   1.196 +    ldr     r7, [r3]            ; pred      (A)
   1.197 +
   1.198 +    uxtb16  r8, r6              ; [s2 | s0] (A)
   1.199 +    uxtb16  r9, r7              ; [p2 | p0] (A)
   1.200 +    uxtb16  r10, r6, ror #8     ; [s3 | s1] (A)
   1.201 +    uxtb16  r11, r7, ror #8     ; [p3 | p1] (A)
   1.202 +
   1.203 +    usub16  r6, r8, r9          ; [d2 | d0] (A)
   1.204 +    usub16  r7, r10, r11        ; [d3 | d1] (A)
   1.205 +
   1.206 +    ldr     r10, [r1, #4]       ; src       (B), next group loaded before (A) is stored
   1.207 +    ldr     r11, [r3, #4]       ; pred      (B)
   1.208 +
   1.209 +    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
   1.210 +    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
   1.211 +
   1.212 +    str     r8, [r0], #4        ; diff      (A)
   1.213 +    uxtb16  r8, r10             ; [s2 | s0] (B), r8 is free again after the store above
   1.214 +    str     r9, [r0], #4        ; diff      (A)
   1.215 +
   1.216 +    uxtb16  r9, r11             ; [p2 | p0] (B)
   1.217 +    uxtb16  r10, r10, ror #8    ; [s3 | s1] (B)
   1.218 +    uxtb16  r11, r11, ror #8    ; [p3 | p1] (B)
   1.219 +
   1.220 +    usub16  r6, r8, r9          ; [d2 | d0] (B)
   1.221 +    usub16  r7, r10, r11        ; [d3 | d1] (B)
   1.222 +
   1.223 +    ldr     r10, [r1, #8]       ; src       (C)
   1.224 +    ldr     r11, [r3, #8]       ; pred      (C)
   1.225 +
   1.226 +    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
   1.227 +    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
   1.228 +
   1.229 +    str     r8, [r0], #4        ; diff      (B)
   1.230 +    uxtb16  r8, r10             ; [s2 | s0] (C)
   1.231 +    str     r9, [r0], #4        ; diff      (B)
   1.232 +
   1.233 +    uxtb16  r9, r11             ; [p2 | p0] (C)
   1.234 +    uxtb16  r10, r10, ror #8    ; [s3 | s1] (C)
   1.235 +    uxtb16  r11, r11, ror #8    ; [p3 | p1] (C)
   1.236 +
   1.237 +    usub16  r6, r8, r9          ; [d2 | d0] (C)
   1.238 +    usub16  r7, r10, r11        ; [d3 | d1] (C)
   1.239 +
   1.240 +    ldr     r10, [r1, #12]      ; src       (D)
   1.241 +    ldr     r11, [r3, #12]      ; pred      (D)
   1.242 +
   1.243 +    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (C)
   1.244 +    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (C)
   1.245 +
   1.246 +    str     r8, [r0], #4        ; diff      (C)
   1.247 +    uxtb16  r8, r10             ; [s2 | s0] (D)
   1.248 +    str     r9, [r0], #4        ; diff      (C)
   1.249 +
   1.250 +    uxtb16  r9, r11             ; [p2 | p0] (D)
   1.251 +    uxtb16  r10, r10, ror #8    ; [s3 | s1] (D)
   1.252 +    uxtb16  r11, r11, ror #8    ; [p3 | p1] (D)
   1.253 +
   1.254 +    usub16  r6, r8, r9          ; [d2 | d0] (D)
   1.255 +    usub16  r7, r10, r11        ; [d3 | d1] (D)
   1.256 +
   1.257 +    add     r1, r1, r2          ; src += src_stride (next row)
   1.258 +    add     r3, r3, r12         ; pred += pred_stride (next row)
   1.259 +
   1.260 +    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (D)
   1.261 +    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (D)
   1.262 +
   1.263 +    str     r8, [r0], #4        ; diff      (D)
   1.264 +    subs    r4, r4, #1          ; update loop counter; the str below leaves the flags intact for bne
   1.265 +    str     r9, [r0], #4        ; diff      (D)
   1.266 +
   1.267 +    bne     loop
   1.268 +
   1.269 +    ldmfd   sp!, {r4-r11}
   1.270 +    bx      lr
   1.271 +
   1.272 +    ENDP
   1.273 +
   1.274 +    END
   1.275 +

mercurial