media/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12     EXPORT  |vp8_mse16x16_neon|                 ;make both entry points visible to the linker
    13     EXPORT  |vp8_get4x4sse_cs_neon|
    15     ARM                                         ;assemble the following code as ARM (A32) instructions
    16     REQUIRE8                                    ;this code requires 8-byte stack alignment from its callers
    17     PRESERVE8                                   ;this code preserves 8-byte stack alignment
    19     AREA ||.text||, CODE, READONLY, ALIGN=2     ;read-only code section, aligned to 2^2 = 4 bytes
    20 ;============================
    21 ; vp8_mse16x16_neon: sum of squared differences (SSE) over a 16x16 block of bytes.
    22 ; In:  r0 = unsigned char *src_ptr, r1 = int source_stride
    23 ;      r2 = unsigned char *ref_ptr, r3 = int recon_stride
    24 ;      [sp] = unsigned int *sse
    25 ; Out: r0 = 32-bit SSE; the same value is also stored to *sse
    26 ; Clobbers: r12, q0-q3, q8-q15, flags. Callee-saved d8-d15 (q4-q7) are left untouched.
    27 ; Note: unlike vp8_variance(), the pixel sum is never needed here, so it is not computed.
    29 |vp8_mse16x16_neon| PROC
    30     vmov.i8         q15, #0                     ;q15, q8, q9, q10 - sse (q15, not q7: d8-d15 are callee-saved)
    31     vmov.i8         q8, #0
    32     vmov.i8         q9, #0
    33     vmov.i8         q10, #0
    35     mov             r12, #8                     ;8 iterations x 2 rows each = 16 rows
    37 mse16x16_neon_loop
    38     vld1.8          {q0}, [r0], r1              ;Load up source and reference (two 16-byte rows each)
    39     vld1.8          {q2}, [r2], r3
    40     vld1.8          {q1}, [r0], r1
    41     vld1.8          {q3}, [r2], r3
    43     vsubl.u8        q11, d0, d4                 ;src - ref widened to 16 bits, 8 pixels per q register
    44     vsubl.u8        q12, d1, d5
    45     vsubl.u8        q13, d2, d6
    46     vsubl.u8        q14, d3, d7
    48     vmlal.s16       q15, d22, d22               ;accumulate diff*diff into 32-bit lanes
    49     vmlal.s16       q8, d23, d23
    51     subs            r12, r12, #1                ;sets the flags tested by bne below (NEON ops leave them alone)
    53     vmlal.s16       q9, d24, d24
    54     vmlal.s16       q10, d25, d25
    55     vmlal.s16       q15, d26, d26
    56     vmlal.s16       q8, d27, d27
    57     vmlal.s16       q9, d28, d28
    58     vmlal.s16       q10, d29, d29
    60     bne             mse16x16_neon_loop
    62     vadd.u32        q15, q15, q8                ;fold the four accumulators into one
    63     vadd.u32        q9, q9, q10
    65     ldr             r12, [sp]               ;load *sse from stack
    67     vadd.u32        q10, q15, q9
    68     vpaddl.u32      q1, q10                     ;4 x u32 -> 2 x u64 pairwise sums
    69     vadd.u64        d0, d2, d3                  ;d0 = total
    71     vst1.32         {d0[0]}, [r12]              ;*sse = low 32 bits of the total
    72     vmov.32         r0, d0[0]                   ;return the same value
    74     bx              lr
    76     ENDP
    79 ;=============================
    80 ; vp8_get4x4sse_cs_neon: returns in r0 the sum of squared differences of a 4x4 block.
    81 ; In:  r0 = unsigned char *src_ptr, r1 = int source_stride,
    82 ;      r2 = unsigned char *ref_ptr, r3 = int recon_stride
    83 ; Clobbers: q0-q3, q8-q15 (callee-saved d8-d15 untouched). Each row load reads 8 bytes; only the first 4 are used.
    84 |vp8_get4x4sse_cs_neon| PROC
    85     vld1.8          {d0}, [r0], r1              ;Load up source and reference (4 rows, 8 bytes read per row)
    86     vld1.8          {d4}, [r2], r3
    87     vld1.8          {d1}, [r0], r1
    88     vld1.8          {d5}, [r2], r3
    89     vld1.8          {d2}, [r0], r1
    90     vld1.8          {d6}, [r2], r3
    91     vld1.8          {d3}, [r0], r1
    92     vld1.8          {d7}, [r2], r3
    94     vsubl.u8        q11, d0, d4                 ;src - ref widened to 16 bits, one row per q register
    95     vsubl.u8        q12, d1, d5
    96     vsubl.u8        q13, d2, d6
    97     vsubl.u8        q14, d3, d7
    99     vmull.s16       q15, d22, d22               ;square the first 4 differences of each row (q15, not q7: d8-d15 are callee-saved)
   100     vmull.s16       q8, d24, d24
   101     vmull.s16       q9, d26, d26
   102     vmull.s16       q10, d28, d28
   104     vadd.u32        q15, q15, q8                ;fold the four rows of squares together
   105     vadd.u32        q9, q9, q10
   106     vadd.u32        q9, q15, q9
   108     vpaddl.u32      q1, q9                      ;4 x u32 -> 2 x u64 pairwise sums
   109     vadd.u64        d0, d2, d3                  ;d0 = total
   111     vmov.32         r0, d0[0]                   ;return low 32 bits of the total
   112     bx              lr
   114     ENDP
   116     END

mercurial