media/libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    EXPORT  |vp8_short_idct4x4llm_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

;*************************************************************
; void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred,
;                                int pitch, unsigned char *dst, int stride)
;
; 4x4 inverse DCT of `input`, added to the 4x4 prediction block and
; stored, saturated to unsigned 8 bit, in the destination block.
;
; In:
;   r0    short         *input   16 coefficients, loaded as two q registers
;   r1    unsigned char *pred    4 bytes are read per row, 4 rows
;   r2    int            pitch   byte step between rows of pred
;   r3    unsigned char *dst     4 bytes are written per row, 4 rows
;   [sp]  int            stride  byte step between rows of dst
;
; Clobbers: r0, r1, r3, r12, q0-q3 (d0-d7), q8-q10 (d16-d21).
;
; Only caller-saved NEON registers are used as scratch. d8-d15 are
; callee-saved under the AAPCS, so temporaries are kept in d16-d21 and the
; routine stays a leaf that never touches the stack (apart from reading
; the fifth argument).
;*************************************************************

; static const int cospi8sqrt2minus1=20091;
; static const int sinpi8sqrt2      =35468;
; static const int rounding = 0;

; Optimization note: The data resulting from dequantization are signed
; 13-bit values in the range [-4096, 4095]. This allows the use of the
; "vqdmulh" (neon) instruction since it won't go out of range
; (13+16+1=30bits<32bits). This instruction gives the high half
; result of the multiplication that is needed in IDCT.

|vp8_short_idct4x4llm_neon| PROC
    adr             r12, idct_coeff
    vld1.16         {q1, q2}, [r0]          ; d2..d5 = the four input rows
    vld1.16         {d0}, [r12]             ; d0 = {20091, 20091, 35468, 35468}

    vswp            d3, d4                  ; q2 = (row 1, row 3): vp[4], vp[12]
    ldr             r0, [sp]                ; r0 = stride (input pointer no longer needed)

    ;-------------------------------------------------------------
    ; first pass
    ;-------------------------------------------------------------
    vqdmulh.s16     q3, q2, d0[2]           ; x * sinpi8sqrt2       (doubled high half)
    vqdmulh.s16     q8, q2, d0[0]           ; x * cospi8sqrt2minus1 (doubled high half)

    vqadd.s16       d20, d2, d3             ; a1
    vqsub.s16       d21, d2, d3             ; b1

    vshr.s16        q3, q3, #1              ; undo the doubling done by vqdmulh
    vshr.s16        q8, q8, #1

    vqadd.s16       q3, q3, q2              ; 35468 > 65536/2, so the 16-bit constant is
                                            ; read as negative (35468 - 65536); add x back
    vqadd.s16       q8, q8, q2              ; + x: the constant is cospi8sqrt2 minus 1

    ; d6  - c1:temp1
    ; d7  - d1:temp2
    ; d16 - d1:temp1
    ; d17 - c1:temp2

    vqsub.s16       d18, d6, d17            ; c1
    vqadd.s16       d19, d7, d16            ; d1

    vqadd.s16       d2, d20, d19            ; a1 + d1
    vqadd.s16       d3, d21, d18            ; b1 + c1
    vqsub.s16       d4, d21, d18            ; b1 - c1
    vqsub.s16       d5, d20, d19            ; a1 - d1

    ; transpose the 4x4 block of 16-bit values held in d2..d5
    vtrn.32         d2, d4
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5

    vswp            d3, d4                  ; same register layout as before the first pass

    ;-------------------------------------------------------------
    ; second pass
    ;-------------------------------------------------------------
    vqdmulh.s16     q3, q2, d0[2]
    vqdmulh.s16     q8, q2, d0[0]

    vqadd.s16       d20, d2, d3             ; a1
    vqsub.s16       d21, d2, d3             ; b1

    vshr.s16        q3, q3, #1
    vshr.s16        q8, q8, #1

    vqadd.s16       q3, q3, q2              ; same corrections as in the first pass
    vqadd.s16       q8, q8, q2

    vqsub.s16       d18, d6, d17            ; c1
    vqadd.s16       d19, d7, d16            ; d1

    vqadd.s16       d2, d20, d19
    vqadd.s16       d3, d21, d18
    vqsub.s16       d4, d21, d18
    vqsub.s16       d5, d20, d19

    ; rounding right shift by 3 on every result
    vrshr.s16       d2, d2, #3
    vrshr.s16       d3, d3, #3
    vrshr.s16       d4, d4, #3
    vrshr.s16       d5, d5, #3

    ; transpose again: q1 = (d2, d3) and q2 = (d4, d5) feed the stores below
    vtrn.32         d2, d4
    vtrn.32         d3, d5
    vtrn.16         d2, d3
    vtrn.16         d4, d5

    ; load prediction data, 4 bytes per row, stepping by pitch
    vld1.32         d6[0], [r1], r2
    vld1.32         d6[1], [r1], r2
    vld1.32         d7[0], [r1], r2
    vld1.32         d7[1], [r1], r2

    ; add prediction and residual (prediction bytes widened to 16 bit)
    vaddw.u8        q1, q1, d6
    vaddw.u8        q2, q2, d7

    ; narrow to unsigned 8 bit with saturation
    vqmovun.s16     d1, q1
    vqmovun.s16     d2, q2

    ; store to destination, 4 bytes per row, stepping by stride
    vst1.32         d1[0], [r3], r0
    vst1.32         d1[1], [r3], r0
    vst1.32         d2[0], [r3], r0
    vst1.32         d2[1], [r3], r0

    bx              lr

    ENDP

;-----------------
; Multiplier table read into d0 by the routine above:
; lanes 0-1 = cospi8sqrt2minus1, lanes 2-3 = sinpi8sqrt2.

idct_coeff
    DCD     0x4e7b4e7b, 0x8a8c8a8c

;20091, 20091, 35468, 35468

    END

mercurial