;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


    EXPORT  |vp8_variance16x16_neon|
    EXPORT  |vp8_variance16x8_neon|
    EXPORT  |vp8_variance8x16_neon|
    EXPORT  |vp8_variance8x8_neon|

    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|vp8_variance16x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8

variance16x16_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    ;VPADAL adds adjacent pairs of elements of a vector, and accumulates
    ;the results into the elements of the destination vector. The explanation
    ;in the ARM guide is wrong.
    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
    ;vmov.32        r1, d1[0]
    ;mul            r0, r0, r0
    ;str            r1, [r12]
    ;sub            r0, r1, r0, lsr #8

    ; while sum is signed, sum * sum is always positive and must be treated as
    ; unsigned to avoid propagating the sign bit.
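    ; at this point d0 holds the accumulated sum and d1 the accumulated sse;
    ; variance = sse - (sum * sum) / (16 * 16), so below the squared sum is
    ; shifted right by 8 (divided by the 256 pixels in the block) before it
    ; is subtracted from the sse.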
    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.u32        d10, d10, #8
    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

;================================
;unsigned int vp8_variance16x8_c(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *sse)
|vp8_variance16x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #4

variance16x8_neon_loop
    vld1.8          {q0}, [r0], r1              ;Load up source and reference
    vld1.8          {q2}, [r2], r3
    vld1.8          {q1}, [r0], r1
    vld1.8          {q3}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance16x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.u32        d10, d10, #7
    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

;=================================
;unsigned int vp8_variance8x16_c(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *sse)

|vp8_variance8x16_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #8

variance8x16_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d2, d6

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25

    bne             variance8x16_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
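    ; an 8x16 block contains 128 pixels, so here the squared sum is divided
    ; by 128 (shift right by 7) rather than 256 before the subtraction.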
    vshr.u32        d10, d10, #7
    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

;==================================
; r0    unsigned char *src_ptr
; r1    int source_stride
; r2    unsigned char *ref_ptr
; r3    int recon_stride
; stack unsigned int *sse
|vp8_variance8x8_neon| PROC
    vmov.i8         q8, #0                      ;q8 - sum
    vmov.i8         q9, #0                      ;q9, q10 - sse
    vmov.i8         q10, #0

    mov             r12, #2

variance8x8_neon_loop
    vld1.8          {d0}, [r0], r1              ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4                 ;calculate diff
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vpadal.s16      q8, q11                     ;calculate sum
    vmlal.s16       q9, d22, d22                ;calculate sse
    vmlal.s16       q10, d23, d23

    subs            r12, r12, #1

    vpadal.s16      q8, q12
    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vpadal.s16      q8, q13
    vmlal.s16       q9, d26, d26
    vmlal.s16       q10, d27, d27
    vpadal.s16      q8, q14
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             variance8x8_neon_loop

    vadd.u32        q10, q9, q10                ;accumulate sse
    vpaddl.s32      q0, q8                      ;accumulate sum

    ldr             r12, [sp]                   ;load *sse from stack

    vpaddl.u32      q1, q10
    vadd.s64        d0, d0, d1
    vadd.u64        d1, d2, d3

    vmull.s32       q5, d0, d0
    vst1.32         {d1[0]}, [r12]              ;store sse
    vshr.u32        d10, d10, #6                ;8x8 = 64 pixels: sum*sum/64
    vsub.u32        d0, d1, d10

    vmov.32         r0, d0[0]                   ;return
    bx              lr

    ENDP

    END