media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,103 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +; Make the routine's symbol visible to the linker so other objects can reference it.
    1.15 +    EXPORT  |vp8_short_walsh4x4_neon|
    1.16 +; Assemble as ARM (not Thumb) instructions; declare that this object requires and preserves 8-byte stack alignment.
    1.17 +    ARM
    1.18 +    REQUIRE8
    1.19 +    PRESERVE8
    1.20 +; Place what follows in a read-only code area named .text, aligned to 2^2 = 4 bytes.
    1.21 +    AREA ||.text||, CODE, READONLY, ALIGN=2
    1.22 +;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
    1.23 +; r0   short *input,   4 rows of 4 shorts; each row is loaded with an @64 (8-byte) alignment qualifier
    1.24 +; r1   short *output,  16 shorts stored contiguously with an @128 (16-byte) alignment qualifier
    1.25 +; r2   int pitch       byte offset added to r0 to step from one input row to the next
    1.26 +|vp8_short_walsh4x4_neon| PROC
    1.27 +    ; Leaf routine, no stack use. Works in d0-d7, q8-q11 and q15; r0 is advanced by the loads.
    1.28 +    vld1.16         {d0}, [r0@64], r2   ; d0 = input row 0, then r0 += pitch
    1.29 +    vld1.16         {d1}, [r0@64], r2   ; d1 = input row 1, then r0 += pitch
    1.30 +    vld1.16         {d2}, [r0@64], r2   ; d2 = input row 2, then r0 += pitch
    1.31 +    vld1.16         {d3}, [r0@64]       ; d3 = input row 3 (no post-increment)
    1.32 +
    1.33 +    ;First for-loop: each 16-bit lane processes one input row
    1.34 +    ;transpose d0, d1, d2, d3. Then lane i of d0/d1/d2/d3 holds ip[0]/ip[1]/ip[2]/ip[3] of row i
    1.35 +    vtrn.32         d0, d2              ; d0 = {row0[0..1], row2[0..1]}, d2 = {row0[2..3], row2[2..3]}
    1.36 +    vtrn.32         d1, d3              ; d1 = {row1[0..1], row3[0..1]}, d3 = {row1[2..3], row3[2..3]}
    1.37 +
    1.38 +    vmov.s32        q15, #3             ; q15 = {3, 3, 3, 3}: bias added before the final >> 3
    1.39 +
    1.40 +    vtrn.16         d0, d1              ; d0 = element 0 of rows 0-3, d1 = element 1 of rows 0-3
    1.41 +    vtrn.16         d2, d3              ; d2 = element 2 of rows 0-3, d3 = element 3 of rows 0-3
    1.42 +
    1.43 +    vadd.s16        d4, d0, d2          ; ip[0] + ip[2]
    1.44 +    vadd.s16        d5, d1, d3          ; ip[1] + ip[3]
    1.45 +    vsub.s16        d6, d1, d3          ; ip[1] - ip[3]
    1.46 +    vsub.s16        d7, d0, d2          ; ip[0] - ip[2]
    1.47 +
    1.48 +    vshl.s16        d4, d4, #2          ; a1 = (ip[0] + ip[2]) << 2
    1.49 +    vshl.s16        d5, d5, #2          ; d1 = (ip[1] + ip[3]) << 2
    1.50 +    vshl.s16        d6, d6, #2          ; c1 = (ip[1] - ip[3]) << 2
    1.51 +    vceq.s16        d16, d4, #0         ; a1 == 0 (lane = all ones where true, 0 otherwise)
    1.52 +    vshl.s16        d7, d7, #2          ; b1 = (ip[0] - ip[2]) << 2
    1.53 +
    1.54 +    vadd.s16        d0, d4, d5          ; a1 + d1
    1.55 +    vmvn            d16, d16            ; a1 != 0 (lane = -1 where true, 0 otherwise)
    1.56 +    vsub.s16        d3, d4, d5          ; op[3] = a1 - d1
    1.57 +    vadd.s16        d1, d7, d6          ; op[1] = b1 + c1
    1.58 +    vsub.s16        d2, d7, d6          ; op[2] = b1 - c1
    1.59 +    vsub.s16        d0, d0, d16         ; op[0] = a1 + d1 + (a1 != 0); subtracting the -1 mask adds 1
    1.60 +
    1.61 +    ;Second for-loop: each lane processes one column of the first-pass result
    1.62 +    ;transpose d0, d1, d2, d3. Then lane i of d0/d1/d2/d3 holds ip[0]/ip[4]/ip[8]/ip[12] of column i
    1.63 +    vtrn.32         d1, d3              ; same 4x4 transpose as above, undoing the first one:
    1.64 +    vtrn.32         d0, d2              ;   32-bit pairs are exchanged first,
    1.65 +    vtrn.16         d2, d3              ;   then 16-bit elements, so that
    1.66 +    vtrn.16         d0, d1              ;   d0-d3 = rows 0-3 of the first-pass output
    1.67 +
    1.68 +    vaddl.s16       q8, d0, d2          ; a1 = ip[0]+ip[8]   (widened to 32 bits)
    1.69 +    vaddl.s16       q9, d1, d3          ; d1 = ip[4]+ip[12]  (widened to 32 bits)
    1.70 +    vsubl.s16       q10, d1, d3         ; c1 = ip[4]-ip[12]  (widened to 32 bits)
    1.71 +    vsubl.s16       q11, d0, d2         ; b1 = ip[0]-ip[8]   (widened to 32 bits)
    1.72 +
    1.73 +    vadd.s32        q0, q8, q9          ; a2 = a1 + d1
    1.74 +    vadd.s32        q1, q11, q10        ; b2 = b1 + c1
    1.75 +    vsub.s32        q2, q11, q10        ; c2 = b1 - c1
    1.76 +    vsub.s32        q3, q8, q9          ; d2 = a1 - d1
    1.77 +
    1.78 +    vclt.s32        q8, q0, #0          ; q8  = -1 where a2 < 0, else 0
    1.79 +    vclt.s32        q9, q1, #0          ; q9  = -1 where b2 < 0, else 0
    1.80 +    vclt.s32        q10, q2, #0         ; q10 = -1 where c2 < 0, else 0
    1.81 +    vclt.s32        q11, q3, #0         ; q11 = -1 where d2 < 0, else 0
    1.82 +
    1.83 +    ; subtract the mask, i.e. -1 (or 0): negative values are incremented by 1
    1.84 +    vsub.s32        q0, q0, q8          ; a2 += a2 < 0
    1.85 +    vsub.s32        q1, q1, q9          ; b2 += b2 < 0
    1.86 +    vsub.s32        q2, q2, q10         ; c2 += c2 < 0
    1.87 +    vsub.s32        q3, q3, q11         ; d2 += d2 < 0
    1.88 +
    1.89 +    vadd.s32        q8, q0, q15         ; a2 + 3
    1.90 +    vadd.s32        q9, q1, q15         ; b2 + 3
    1.91 +    vadd.s32        q10, q2, q15        ; c2 + 3
    1.92 +    vadd.s32        q11, q3, q15        ; d2 + 3
    1.93 +
    1.94 +    ; vrshrn is not a substitute for the adds above: it would add 1 << (3-1) = 4, not 3
    1.95 +    vshrn.s32       d0, q8, #3          ; op[0..3]   = (a2 + 3) >> 3, narrowed to 16 bits
    1.96 +    vshrn.s32       d1, q9, #3          ; op[4..7]   = (b2 + 3) >> 3, narrowed to 16 bits
    1.97 +    vshrn.s32       d2, q10, #3         ; op[8..11]  = (c2 + 3) >> 3, narrowed to 16 bits
    1.98 +    vshrn.s32       d3, q11, #3         ; op[12..15] = (d2 + 3) >> 3, narrowed to 16 bits
    1.99 +
   1.100 +    vst1.16         {q0, q1}, [r1@128]  ; write d0-d3 (all 16 results) to output in one store
   1.101 +
   1.102 +    bx              lr                  ; return to caller
   1.103 +
   1.104 +    ENDP
   1.105 +
   1.106 +    END

mercurial