media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 EXPORT |vp8_short_idct4x4llm_neon| ; make the routine visible to other object files
michael@0 13 ARM ; assemble the following code as ARM (A32) instructions
michael@0 14 REQUIRE8 ; declare that this code requires 8-byte stack alignment
michael@0 15 PRESERVE8 ; declare that this code preserves 8-byte stack alignment
michael@0 16
michael@0 17 AREA ||.text||, CODE, READONLY, ALIGN=2 ; read-only code section aligned to 2^2 = 4 bytes
michael@0 18
michael@0 19 ;*************************************************************
michael@0 20 ;void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred, int pitch,
michael@0 21 ; unsigned char *dst, int stride)
michael@0 22 ;r0 short * input
michael@0 23 ;r1 unsigned char * pred
michael@0 24 ;r2 int pitch
michael@0 25 ;r3 unsigned char * dst
michael@0 26 ;sp int stride
michael@0 27 ;*************************************************************
michael@0 28
michael@0 29 ; static const int cospi8sqrt2minus1=20091;
michael@0 30 ; static const int sinpi8sqrt2 =35468;
michael@0 31 ; static const int rounding = 0;
michael@0 32
michael@0 33 ; Optimization note: The data produced by dequantization is signed
michael@0 34 ; 13-bit data in the range [-4096, 4095]. This makes it safe to use
; the NEON "vqdmulh" instruction, since the product cannot go out of range
michael@0 36 ; (13+16+1=30bits<32bits). This instruction returns the high half
michael@0 37 ; of the multiplication result, which is what the IDCT needs.
michael@0 38
michael@0 39 |vp8_short_idct4x4llm_neon| PROC ; 4x4 inverse DCT of input, added to pred, saturated to u8 and stored to dst
michael@0 40 adr r12, idct_coeff
michael@0 41 vld1.16 {q1, q2}, [r0] ; d2..d5 = input[0..3], input[4..7], input[8..11], input[12..15]
michael@0 42 vld1.16 {d0}, [r12] ; d0[0] = cospi8sqrt2minus1, d0[2] = sinpi8sqrt2
michael@0 43 ; Scratch registers are q3 and q8-q10 only: d8-d15 (q4-q7) are callee-saved under the AAPCS and nothing is saved/restored here.
michael@0 44 vswp d3, d4 ;q1(vp[0] vp[8]) q2(vp[4] vp[12])
michael@0 45 ldr r0, [sp] ; stride (r0 is free again once input has been loaded)
michael@0 46 ; ---- first pass ----
michael@0 47 vqdmulh.s16 q3, q2, d0[2] ; (2 * x * sinpi8sqrt2) >> 16, constant taken as s16
michael@0 48 vqdmulh.s16 q8, q2, d0[0] ; (2 * x * cospi8sqrt2minus1) >> 16
michael@0 49
michael@0 50 vqadd.s16 d20, d2, d3 ;a1
michael@0 51 vqsub.s16 d21, d2, d3 ;b1
michael@0 52
michael@0 53 vshr.s16 q3, q3, #1 ; undo the doubling performed by vqdmulh
michael@0 54 vshr.s16 q8, q8, #1
michael@0 55
michael@0 56 vqadd.s16 q3, q3, q2 ;add x back since sinpi8sqrt2 > 65536/2 (a negative number when read as s16)
michael@0 57 vqadd.s16 q8, q8, q2 ;add x back: going by its name, the constant holds the multiplier minus 1
michael@0 58
michael@0 59 ;d6 - c1:temp1
michael@0 60 ;d7 - d1:temp2
michael@0 61 ;d16 - d1:temp1
michael@0 62 ;d17 - c1:temp2
michael@0 63
michael@0 64 vqsub.s16 d18, d6, d17 ;c1
michael@0 65 vqadd.s16 d19, d7, d16 ;d1
michael@0 66
michael@0 67 vqadd.s16 d2, d20, d19 ; a1 + d1
michael@0 68 vqadd.s16 d3, d21, d18 ; b1 + c1
michael@0 69 vqsub.s16 d4, d21, d18 ; b1 - c1
michael@0 70 vqsub.s16 d5, d20, d19 ; a1 - d1
michael@0 71 ; transpose the 4x4 block of 16-bit values held in d2..d5
michael@0 72 vtrn.32 d2, d4
michael@0 73 vtrn.32 d3, d5
michael@0 74 vtrn.16 d2, d3
michael@0 75 vtrn.16 d4, d5
michael@0 76 ; ---- second pass: same butterfly on the transposed data ----
michael@0 77 vswp d3, d4
michael@0 78
michael@0 79 vqdmulh.s16 q3, q2, d0[2]
michael@0 80 vqdmulh.s16 q8, q2, d0[0]
michael@0 81
michael@0 82 vqadd.s16 d20, d2, d3 ;a1
michael@0 83 vqsub.s16 d21, d2, d3 ;b1
michael@0 84
michael@0 85 vshr.s16 q3, q3, #1
michael@0 86 vshr.s16 q8, q8, #1
michael@0 87
michael@0 88 vqadd.s16 q3, q3, q2 ;add x back since sinpi8sqrt2 > 65536/2 (a negative number when read as s16)
michael@0 89 vqadd.s16 q8, q8, q2
michael@0 90
michael@0 91 vqsub.s16 d18, d6, d17 ;c1
michael@0 92 vqadd.s16 d19, d7, d16 ;d1
michael@0 93
michael@0 94 vqadd.s16 d2, d20, d19
michael@0 95 vqadd.s16 d3, d21, d18
michael@0 96 vqsub.s16 d4, d21, d18
michael@0 97 vqsub.s16 d5, d20, d19
michael@0 98 ; rounding shift: (x + 4) >> 3
michael@0 99 vrshr.s16 d2, d2, #3
michael@0 100 vrshr.s16 d3, d3, #3
michael@0 101 vrshr.s16 d4, d4, #3
michael@0 102 vrshr.s16 d5, d5, #3
michael@0 103 ; transpose back: d2..d5 become output rows 0..3 (q1 = rows 0-1, q2 = rows 2-3)
michael@0 104 vtrn.32 d2, d4
michael@0 105 vtrn.32 d3, d5
michael@0 106 vtrn.16 d2, d3
michael@0 107 vtrn.16 d4, d5
michael@0 108
michael@0 109 ; load prediction data
michael@0 110 vld1.32 d6[0], [r1], r2 ; 4 bytes per row; r1 advances by pitch after each load
michael@0 111 vld1.32 d6[1], [r1], r2
michael@0 112 vld1.32 d7[0], [r1], r2
michael@0 113 vld1.32 d7[1], [r1], r2
michael@0 114
michael@0 115 ; add prediction and residual
michael@0 116 vaddw.u8 q1, q1, d6 ; prediction bytes are widened to 16 bits before the add
michael@0 117 vaddw.u8 q2, q2, d7
michael@0 118 ; narrow to unsigned 8-bit with saturation
michael@0 119 vqmovun.s16 d1, q1
michael@0 120 vqmovun.s16 d2, q2 ; d2 overlaps q1, which has already been consumed above
michael@0 121
michael@0 122 ; store to destination
michael@0 123 vst1.32 d1[0], [r3], r0 ; 4 bytes per row; r3 advances by stride after each store
michael@0 124 vst1.32 d1[1], [r3], r0
michael@0 125 vst1.32 d2[0], [r3], r0
michael@0 126 vst1.32 d2[1], [r3], r0
michael@0 127
michael@0 128 bx lr
michael@0 129
michael@0 130 ENDP
michael@0 131
michael@0 132 ;-----------------
michael@0 133
michael@0 134 idct_coeff ; constant table, loaded into d0 by "vld1.16 {d0}, [r12]"
michael@0 135 DCD 0x4e7b4e7b, 0x8a8c8a8c ; read as four 16-bit lanes: 0x4e7b, 0x4e7b, 0x8a8c, 0x8a8c
michael@0 136
michael@0 137 ;20091, 20091, 35468, 35468 (cospi8sqrt2minus1 twice, then sinpi8sqrt2 twice)
michael@0 138
michael@0 139 END

mercurial