michael@0: ;
michael@0: ;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
michael@0: ;
michael@0: ;  Use of this source code is governed by a BSD-style license
michael@0: ;  that can be found in the LICENSE file in the root of the source
michael@0: ;  tree. An additional intellectual property rights grant can be found
michael@0: ;  in the file PATENTS.  All contributing project authors may
michael@0: ;  be found in the AUTHORS file in the root of the source tree.
michael@0: ;
michael@0: 
michael@0: 
michael@0:     EXPORT  |vp8_fast_quantize_b_neon|        ; single-block entry point (PROC defined below)
michael@0:     EXPORT  |vp8_fast_quantize_b_pair_neon|   ; two-block entry point (PROC defined below)
michael@0: ; vp8_block_* / vp8_blockd_* offsets used below are not defined in this file; presumably this include provides them
michael@0:     INCLUDE vp8_asm_enc_offsets.asm
michael@0: ; ARM (not Thumb) encoding; REQUIRE8/PRESERVE8 declare that 8-byte stack alignment is needed and kept
michael@0:     ARM
michael@0:     REQUIRE8
michael@0:     PRESERVE8
michael@0: ; ALIGN=4 aligns the area to 2^4 = 16 bytes, which the ALIGN 16 in front of inv_zig_zag depends on
michael@0:     AREA ||.text||, CODE, READONLY, ALIGN=4
michael@0: ; Quantize two 16-coefficient blocks in one call; writes qcoeff, dqcoeff and a one-byte eob for each.
michael@0: ;vp8_fast_quantize_b_pair_neon(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2);  r0=b1 r1=b2 r2=d1 r3=d2
michael@0: |vp8_fast_quantize_b_pair_neon| PROC
michael@0: ; round/quant are read from b1 and dequant from d1 only; b2/d2 supply just coeff, qcoeff, dqcoeff, eob.
michael@0:     stmfd           sp!, {r4-r9}        ; save the core registers used as scratch
michael@0:     vstmdb          sp!, {q4-q7}        ; q4-q7 (d8-d15) are callee-saved under the AAPCS
michael@0: 
michael@0:     ldr             r4, [r0, #vp8_block_coeff]       ; r4 = b1 coeff pointer
michael@0:     ldr             r5, [r0, #vp8_block_quant_fast]  ; r5 = b1 quant_fast pointer
michael@0:     ldr             r6, [r0, #vp8_block_round]       ; r6 = b1 round pointer
michael@0: 
michael@0:     vld1.16         {q0, q1}, [r4@128]  ; load z: 16 x s16; @128 requires a 16-byte aligned address
michael@0: 
michael@0:     ldr             r7, [r2, #vp8_blockd_qcoeff]     ; r7 = d1 qcoeff pointer
michael@0: 
michael@0:     vabs.s16        q4, q0              ; calculate x = abs(z)
michael@0:     vabs.s16        q5, q1
michael@0: 
michael@0:     ;arithmetic right shift by 15 replicates the sign bit: all 0 if z is non-negative, all 1 if negative
michael@0:     vshr.s16        q2, q0, #15         ; sz
michael@0:     vshr.s16        q3, q1, #15
michael@0: 
michael@0:     vld1.s16        {q6, q7}, [r6@128]  ; load round_ptr [0-15] (b1's table, used for both blocks)
michael@0:     vld1.s16        {q8, q9}, [r5@128]  ; load quant_ptr [0-15] (b1's table, used for both blocks)
michael@0: 
michael@0:     ldr             r4, [r1, #vp8_block_coeff]       ; r4 = b2 coeff pointer (b1's is no longer needed)
michael@0: 
michael@0:     vadd.s16        q4, q6              ; x + Round
michael@0:     vadd.s16        q5, q7
michael@0: 
michael@0:     vld1.16         {q0, q1}, [r4@128]  ; load z2 (q0/q1 reused; sz and x already derived from z)
michael@0: 
michael@0:     vqdmulh.s16     q4, q8              ; (2*(x+Round)*Quant) >> 16, saturating; the later >>1 gives y = ((x+Round)*Quant) >> 16
michael@0:     vqdmulh.s16     q5, q9
michael@0: 
michael@0:     vabs.s16        q10, q0             ; calculate x2 = abs(z_2)
michael@0:     vabs.s16        q11, q1
michael@0:     vshr.s16        q12, q0, #15        ; sz2
michael@0:     vshr.s16        q13, q1, #15
michael@0: 
michael@0:     ;modify data to have its original sign (for block 1 the >>1 is applied after this flip)
michael@0:     veor.s16        q4, q2              ; y^sz
michael@0:     veor.s16        q5, q3
michael@0: 
michael@0:     vadd.s16        q10, q6             ; x2 + Round
michael@0:     vadd.s16        q11, q7
michael@0: 
michael@0:     ldr             r8, [r2, #vp8_blockd_dequant]    ; r8 = d1 dequant pointer
michael@0: 
michael@0:     vqdmulh.s16     q10, q8             ; y2 = ((Round+abs(z2)) * Quant) >> 16 (once the later >>1 is applied)
michael@0:     vqdmulh.s16     q11, q9
michael@0: 
michael@0:     vshr.s16        q4, #1              ; right shift 1 after vqdmulh; arithmetic, so (~y)>>1 == ~(y>>1) and the order vs. the eor is harmless
michael@0:     vshr.s16        q5, #1
michael@0: 
michael@0:     vld1.s16        {q6, q7}, [r8@128]  ; load dequant_ptr [0-15] (q6/q7 reused; Round is no longer needed)
michael@0: 
michael@0:     vsub.s16        q4, q2              ; x1 = (y^sz)-sz: negates y where sz is all 1 (2's complement), unchanged where sz is 0
michael@0:     vsub.s16        q5, q3
michael@0: 
michael@0:     vshr.s16        q10, #1             ; right shift 1 after vqdmulh
michael@0:     vshr.s16        q11, #1
michael@0: 
michael@0:     ldr             r9, [r2, #vp8_blockd_dqcoeff]    ; r9 = d1 dqcoeff pointer
michael@0: 
michael@0:     veor.s16        q10, q12            ; y2^sz2
michael@0:     veor.s16        q11, q13
michael@0: 
michael@0:     vst1.s16        {q4, q5}, [r7]      ; store: d1 qcoeff = x1 (no alignment qualifier on this store)
michael@0: 
michael@0: 
michael@0:     vsub.s16        q10, q12            ; x2 = (y2^sz2)-sz2 (same sign restore as x1)
michael@0:     vsub.s16        q11, q13
michael@0: 
michael@0:     ldr             r6, [r3, #vp8_blockd_qcoeff]     ; r6 = d2 qcoeff pointer
michael@0: 
michael@0:     vmul.s16        q2, q6, q4          ; x1 * Dequant (q2/q3 reused; sz is no longer needed)
michael@0:     vmul.s16        q3, q7, q5
michael@0: 
michael@0:     adr             r0, inv_zig_zag     ; load ptr of inverse zigzag table (overwrites b1; r0 is not restored)
michael@0: 
michael@0:     vceq.s16        q8, q8              ; set q8 to all 1 (every lane equals itself); Quant is no longer needed
michael@0: 
michael@0:     vst1.s16        {q10, q11}, [r6]    ; store: d2 qcoeff = x2
michael@0: 
michael@0:     vmul.s16        q12, q6, q10        ; x2 * Dequant
michael@0:     vmul.s16        q13, q7, q11
michael@0: 
michael@0:     vld1.16         {q6, q7}, [r0@128]  ; load inverse scan order (q6/q7 reused; Dequant is no longer needed)
michael@0: 
michael@0:     vtst.16         q14, q4, q8         ; now find eob of block 1
michael@0:     vtst.16         q15, q5, q8         ; non-zero element is set to all 1
michael@0: 
michael@0:     vst1.s16        {q2, q3}, [r9]      ; store d1 dqcoeff = x1 * Dequant
michael@0: 
michael@0:     ldr             r7, [r3, #vp8_blockd_dqcoeff]    ; r7 = d2 dqcoeff pointer
michael@0: 
michael@0:     vand            q0, q6, q14         ; get all valid numbers from scan array (0 where x1 is 0)
michael@0:     vand            q1, q7, q15
michael@0: 
michael@0:     vst1.s16        {q12, q13}, [r7]    ; store d2 dqcoeff = x2 * Dequant
michael@0: 
michael@0:     vtst.16         q2, q10, q8         ; now find eob of block 2
michael@0:     vtst.16         q3, q11, q8         ; non-zero element is set to all 1
michael@0: 
michael@0:     vmax.u16        q0, q0, q1          ; find maximum value in q0, q1 (block 1: 16 -> 8 candidates)
michael@0: 
michael@0:     vand            q10, q6, q2         ; get all valid numbers from scan array (0 where x2 is 0)
michael@0:     vand            q11, q7, q3
michael@0:     vmax.u16        q10, q10, q11       ; find maximum value in q10, q11 (block 2: 16 -> 8 candidates)
michael@0: 
michael@0:     vmax.u16        d0, d0, d1          ; 8 -> 4 candidates (block 1)
michael@0:     vmax.u16        d20, d20, d21       ; 8 -> 4 candidates (block 2)
michael@0:     vmovl.u16       q0, d0              ; widen the 4 candidates to u32
michael@0:     vmovl.u16       q10, d20
michael@0: 
michael@0:     vmax.u32        d0, d0, d1          ; 4 -> 2 candidates
michael@0:     vmax.u32        d20, d20, d21
michael@0:     vpmax.u32       d0, d0, d0          ; pairwise max: lane 0 now holds the overall maximum (0 if every coefficient quantized to 0)
michael@0:     vpmax.u32       d20, d20, d20
michael@0: 
michael@0:     ldr             r4, [r2, #vp8_blockd_eob]        ; r4 = d1 eob pointer
michael@0:     ldr             r5, [r3, #vp8_blockd_eob]        ; r5 = d2 eob pointer
michael@0: 
michael@0:     vst1.8          {d0[0]}, [r4]       ; store eob of block 1: low byte of the maximum (table entries are 1..16)
michael@0:     vst1.8          {d20[0]}, [r5]      ; store eob of block 2
michael@0: 
michael@0:     vldmia          sp!, {q4-q7}        ; restore in reverse order of the saves
michael@0:     ldmfd           sp!, {r4-r9}
michael@0:     bx              lr
michael@0: 
michael@0:     ENDP
michael@0: ; Quantize one 16-coefficient block; writes qcoeff, dqcoeff and a one-byte eob.
michael@0: ;void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d)  r0=b r1=d  (the original comment gave this signature under the name vp8_fast_quantize_b_c)
michael@0: |vp8_fast_quantize_b_neon| PROC
michael@0: ; An all-zero input block branches to zero_output, which writes eob = 0 and zero qcoeff/dqcoeff directly.
michael@0:     stmfd           sp!, {r4-r7}        ; save scratch core registers (q4-q7 are not touched here, so no NEON save)
michael@0: 
michael@0:     ldr             r3, [r0, #vp8_block_coeff]       ; r3 = coeff pointer
michael@0:     ldr             r4, [r0, #vp8_block_quant_fast]  ; r4 = quant_fast pointer
michael@0:     ldr             r5, [r0, #vp8_block_round]       ; r5 = round pointer
michael@0: 
michael@0:     vld1.16         {q0, q1}, [r3@128]  ; load z: 16 x s16; @128 requires a 16-byte aligned address
michael@0:     vorr.s16        q14, q0, q1         ; check if all zero (step 1): OR the two halves of z
michael@0:     ldr             r6, [r1, #vp8_blockd_qcoeff]     ; r6 = qcoeff pointer
michael@0:     ldr             r7, [r1, #vp8_blockd_dqcoeff]    ; r7 = dqcoeff pointer
michael@0:     vorr.s16        d28, d28, d29       ; check if all zero (step 2): fold the OR down to 64 bits
michael@0: 
michael@0:     vabs.s16        q12, q0             ; calculate x = abs(z)
michael@0:     vabs.s16        q13, q1
michael@0: 
michael@0:     ;arithmetic right shift by 15 replicates the sign bit: all 0 if z is non-negative, all 1 if negative
michael@0:     vshr.s16        q2, q0, #15         ; sz
michael@0:     vmov            r2, r3, d28         ; check if all zero (step 3): copy the OR to r2/r3 (coeff pointer in r3 is no longer needed)
michael@0:     vshr.s16        q3, q1, #15
michael@0: 
michael@0:     vld1.s16        {q14, q15}, [r5@128]; load round_ptr [0-15] (q14 reused; its OR was already copied out)
michael@0:     vld1.s16        {q8, q9}, [r4@128]  ; load quant_ptr [0-15]
michael@0: 
michael@0:     vadd.s16        q12, q14            ; x + Round
michael@0:     vadd.s16        q13, q15
michael@0: 
michael@0:     adr             r0, inv_zig_zag     ; load ptr of inverse zigzag table (overwrites b; r0 is not restored)
michael@0: 
michael@0:     vqdmulh.s16     q12, q8             ; (2*(x+Round)*Quant) >> 16, saturating; the >>1 below gives y = ((x+Round)*Quant) >> 16
michael@0:     vqdmulh.s16     q13, q9
michael@0: 
michael@0:     vld1.16         {q10, q11}, [r0@128]; load inverse scan order
michael@0: 
michael@0:     vceq.s16        q8, q8              ; set q8 to all 1 (every lane equals itself); Quant is no longer needed
michael@0: 
michael@0:     ldr             r4, [r1, #vp8_blockd_dequant]    ; r4 = dequant pointer (quant pointer is no longer needed)
michael@0: 
michael@0:     vshr.s16        q12, #1             ; right shift 1 after vqdmulh
michael@0:     vshr.s16        q13, #1
michael@0: 
michael@0:     ldr             r5, [r1, #vp8_blockd_eob]        ; r5 = eob pointer (round pointer is no longer needed)
michael@0: 
michael@0:     orr             r2, r2, r3          ; check if all zero (step 4): r2 = OR of every coefficient's bits
michael@0:     cmp             r2, #0              ; check if all zero (step 5)
michael@0:     beq             zero_output         ; check if all zero (step 6): take the shortcut, results computed so far are discarded
michael@0: 
michael@0:     ;modify data to have its original sign
michael@0:     veor.s16        q12, q2             ; y^sz
michael@0:     veor.s16        q13, q3
michael@0: 
michael@0:     vsub.s16        q12, q2             ; x1 = (y^sz)-sz: negates y where sz is all 1 (2's complement), unchanged where sz is 0
michael@0:     vsub.s16        q13, q3
michael@0: 
michael@0:     vld1.s16        {q2, q3}, [r4@128]  ; load dequant_ptr[i] (q2/q3 reused; sz is no longer needed)
michael@0: 
michael@0:     vtst.16         q14, q12, q8        ; now find eob
michael@0:     vtst.16         q15, q13, q8        ; non-zero element is set to all 1
michael@0: 
michael@0:     vst1.s16        {q12, q13}, [r6@128]; store: qcoeff = x1 (16-byte aligned store)
michael@0: 
michael@0:     vand            q10, q10, q14       ; get all valid numbers from scan array (0 where x1 is 0)
michael@0:     vand            q11, q11, q15
michael@0: 
michael@0: 
michael@0:     vmax.u16        q0, q10, q11        ; find maximum value in q10, q11 (16 -> 8 candidates; z in q0 is no longer needed)
michael@0:     vmax.u16        d0, d0, d1          ; 8 -> 4 candidates
michael@0:     vmovl.u16       q0, d0              ; widen the 4 candidates to u32
michael@0: 
michael@0:     vmul.s16        q2, q12             ; x * Dequant
michael@0:     vmul.s16        q3, q13
michael@0: 
michael@0:     vmax.u32        d0, d0, d1          ; 4 -> 2 candidates
michael@0:     vpmax.u32       d0, d0, d0          ; pairwise max: lane 0 now holds the overall maximum
michael@0: 
michael@0:     vst1.s16        {q2, q3}, [r7@128]  ; store dqcoeff = x * Dequant
michael@0: 
michael@0:     vst1.8          {d0[0]}, [r5]       ; store eob: low byte of the maximum (table entries are 1..16)
michael@0: 
michael@0:     ldmfd           sp!, {r4-r7}
michael@0:     bx              lr
michael@0: 
michael@0: zero_output
michael@0:     strb            r2, [r5]            ; store eob = 0 (r2 is zero on this path)
michael@0:     vst1.s16        {q0, q1}, [r6@128]  ; qcoeff = 0 (q0/q1 still hold z, which is all zero here)
michael@0:     vst1.s16        {q0, q1}, [r7@128]  ; dqcoeff = 0
michael@0: 
michael@0:     ldmfd           sp!, {r4-r7}
michael@0:     bx              lr
michael@0: 
michael@0:     ENDP
michael@0: 
michael@0: ; default inverse zigzag table is defined in vp8/common/entropy.c; the 16 entries here are the values 1..16, each once
michael@0:     ALIGN 16    ; enable use of @128 bit aligned loads
michael@0: inv_zig_zag
michael@0:     DCW 0x0001, 0x0002, 0x0006, 0x0007  ; entries for coefficients 0-3
michael@0:     DCW 0x0003, 0x0005, 0x0008, 0x000d  ; entries for coefficients 4-7
michael@0:     DCW 0x0004, 0x0009, 0x000c, 0x000e  ; entries for coefficients 8-11
michael@0:     DCW 0x000a, 0x000b, 0x000f, 0x0010  ; entries for coefficients 12-15
michael@0: ; both procedures above take the largest entry whose coefficient quantized to non-zero as eob, so 0 means none
michael@0:     END
michael@0: