media/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm

changeset 0:6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,116 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +
    1.15 +    EXPORT  |vp8_mse16x16_neon|
    1.16 +    EXPORT  |vp8_get4x4sse_cs_neon|
    1.17 +
    1.18 +    ARM
    1.19 +    REQUIRE8
    1.20 +    PRESERVE8
    1.21 +
    1.22 +    AREA ||.text||, CODE, READONLY, ALIGN=2
    1.23 +;============================
    1.24 +; r0    unsigned char *src_ptr
    1.25 +; r1    int source_stride
    1.26 +; r2    unsigned char *ref_ptr
    1.27 +; r3    int  recon_stride
    1.28 +; stack unsigned int *sse
     1.29 +;note: in this function, sum is never used, so this part of the calculation
     1.30 +;from vp8_variance() can be omitted.
    1.31 +
    1.32 +|vp8_mse16x16_neon| PROC
    1.33 +    vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
    1.34 +    vmov.i8         q8, #0
    1.35 +    vmov.i8         q9, #0
    1.36 +    vmov.i8         q10, #0
    1.37 +
    1.38 +    mov             r12, #8
    1.39 +
    1.40 +mse16x16_neon_loop
    1.41 +    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    1.42 +    vld1.8          {q2}, [r2], r3
    1.43 +    vld1.8          {q1}, [r0], r1
    1.44 +    vld1.8          {q3}, [r2], r3
    1.45 +
    1.46 +    vsubl.u8        q11, d0, d4
    1.47 +    vsubl.u8        q12, d1, d5
    1.48 +    vsubl.u8        q13, d2, d6
    1.49 +    vsubl.u8        q14, d3, d7
    1.50 +
    1.51 +    vmlal.s16       q7, d22, d22
    1.52 +    vmlal.s16       q8, d23, d23
    1.53 +
    1.54 +    subs            r12, r12, #1
    1.55 +
    1.56 +    vmlal.s16       q9, d24, d24
    1.57 +    vmlal.s16       q10, d25, d25
    1.58 +    vmlal.s16       q7, d26, d26
    1.59 +    vmlal.s16       q8, d27, d27
    1.60 +    vmlal.s16       q9, d28, d28
    1.61 +    vmlal.s16       q10, d29, d29
    1.62 +
    1.63 +    bne             mse16x16_neon_loop
    1.64 +
    1.65 +    vadd.u32        q7, q7, q8
    1.66 +    vadd.u32        q9, q9, q10
    1.67 +
    1.68 +    ldr             r12, [sp]               ;load *sse from stack
    1.69 +
    1.70 +    vadd.u32        q10, q7, q9
    1.71 +    vpaddl.u32      q1, q10
    1.72 +    vadd.u64        d0, d2, d3
    1.73 +
    1.74 +    vst1.32         {d0[0]}, [r12]
    1.75 +    vmov.32         r0, d0[0]
    1.76 +
    1.77 +    bx              lr
    1.78 +
    1.79 +    ENDP
    1.80 +
    1.81 +
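For reference, the loop above computes a plain sum of squared differences over a
16x16 block: each iteration loads two rows of source and reference, widens the
byte differences to 16 bits (vsubl.u8; the unsigned result reinterpreted as s16
is the correct signed difference, since |a - b| < 256), and squares them into
four 32-bit accumulators with vmlal.s16. The trailing vpaddl/vadd sequence is
the horizontal reduction of q7..q10 into a single total, which is both stored
through *sse and returned in r0. A minimal C sketch of the equivalent
computation, assuming the prototype implied by the register comments above (the
_ref suffix is ours, not libvpx's):

    /* Minimal C sketch of what vp8_mse16x16_neon computes. */
    unsigned int vp8_mse16x16_ref(const unsigned char *src_ptr, int source_stride,
                                  const unsigned char *ref_ptr, int recon_stride,
                                  unsigned int *sse)
    {
        unsigned int total = 0;
        int r, c;

        for (r = 0; r < 16; r++) {            /* NEON loop: 8 iterations x 2 rows */
            for (c = 0; c < 16; c++) {
                int diff = src_ptr[c] - ref_ptr[c];
                total += (unsigned int)(diff * diff);
            }
            src_ptr += source_stride;
            ref_ptr += recon_stride;
        }

        *sse = total;   /* vst1.32 {d0[0]}, [r12] */
        return total;   /* vmov.32 r0, d0[0]      */
    }
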
    1.82 +;=============================
    1.83 +; r0    unsigned char *src_ptr,
    1.84 +; r1    int  source_stride,
    1.85 +; r2    unsigned char *ref_ptr,
    1.86 +; r3    int  recon_stride
    1.87 +|vp8_get4x4sse_cs_neon| PROC
    1.88 +    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    1.89 +    vld1.8          {d4}, [r2], r3
    1.90 +    vld1.8          {d1}, [r0], r1
    1.91 +    vld1.8          {d5}, [r2], r3
    1.92 +    vld1.8          {d2}, [r0], r1
    1.93 +    vld1.8          {d6}, [r2], r3
    1.94 +    vld1.8          {d3}, [r0], r1
    1.95 +    vld1.8          {d7}, [r2], r3
    1.96 +
    1.97 +    vsubl.u8        q11, d0, d4
    1.98 +    vsubl.u8        q12, d1, d5
    1.99 +    vsubl.u8        q13, d2, d6
   1.100 +    vsubl.u8        q14, d3, d7
   1.101 +
   1.102 +    vmull.s16       q7, d22, d22
   1.103 +    vmull.s16       q8, d24, d24
   1.104 +    vmull.s16       q9, d26, d26
   1.105 +    vmull.s16       q10, d28, d28
   1.106 +
   1.107 +    vadd.u32        q7, q7, q8
   1.108 +    vadd.u32        q9, q9, q10
   1.109 +    vadd.u32        q9, q7, q9
   1.110 +
   1.111 +    vpaddl.u32      q1, q9
   1.112 +    vadd.u64        d0, d2, d3
   1.113 +
   1.114 +    vmov.32         r0, d0[0]
   1.115 +    bx              lr
   1.116 +
   1.117 +    ENDP
   1.118 +
   1.119 +    END
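
vp8_get4x4sse_cs_neon is the 4x4 counterpart: it loads eight bytes per row but
squares only the low four widened differences (d22, d24, d26, d28), so just the
first four pixels of each row contribute, and it returns the SSE directly in r0
with no *sse out-parameter. A minimal C sketch under the same prototype
assumptions (again, the _ref suffix is ours):

    /* Minimal C sketch of what vp8_get4x4sse_cs_neon computes. */
    unsigned int vp8_get4x4sse_cs_ref(const unsigned char *src_ptr, int source_stride,
                                      const unsigned char *ref_ptr, int recon_stride)
    {
        unsigned int sse = 0;
        int r, c;

        for (r = 0; r < 4; r++) {
            for (c = 0; c < 4; c++) {
                int diff = src_ptr[c] - ref_ptr[c];
                sse += (unsigned int)(diff * diff);
            }
            src_ptr += source_stride;
            ref_ptr += recon_stride;
        }

        return sse;     /* vmov.32 r0, d0[0] */
    }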
