media/libvpx/vp8/encoder/x86/dct_mmx.asm

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 ;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
       ;
       ;  Two-pass 4x4 transform of 16-bit samples using MMX.
       ;    input  - four rows of four words are read from input, input+pitch,
       ;             input+2*pitch and input+3*pitch (pitch is used as a byte
       ;             offset and is sign-extended from 32 bits).
       ;    output - 16 words (32 bytes) are written contiguously.
       ;  Registers used: rax, rcx, rsi, rdi, mm0-mm6. rsi and rdi are saved and
       ;  restored here.
       ;  NOTE(review): no emms is executed before ret - presumably the caller is
       ;  responsible for clearing MMX state; confirm.
    15 global sym(vp8_short_fdct4x4_mmx) PRIVATE
    16 sym(vp8_short_fdct4x4_mmx):
    17     push        rbp
    18     mov         rbp,        rsp
    19     SHADOW_ARGS_TO_STACK 3              ; macro, presumably from x86_abi_support.asm; makes the 3 args reachable via arg(n)
    20     GET_GOT     rbx                     ; macro, presumably sets up GLOBAL() addressing; undone by RESTORE_GOT
    21     push        rsi
    22     push        rdi
    23     ; end prolog
    25         mov         rsi,        arg(0)      ; input
    26         mov         rdi,        arg(1)      ; output
    28         movsxd      rax,        dword ptr arg(2) ;pitch (bytes between rows, sign-extended)
    30         lea         rcx,        [rsi + rax*2] ; rcx = address of row 2
    31         ; read the input data
    32         movq        mm0,        [rsi]        ; row 0
    33         movq        mm1,        [rsi + rax]  ; row 1
    35         movq        mm2,        [rcx]        ; row 2
    36         movq        mm4,        [rcx + rax]  ; row 3
    38         ; transpose for the first stage
    39         movq        mm3,        mm0         ; 00 01 02 03
    40         movq        mm5,        mm2         ; 20 21 22 23
    42         punpcklwd   mm0,        mm1         ; 00 10 01 11
    43         punpckhwd   mm3,        mm1         ; 02 12 03 13
    45         punpcklwd   mm2,        mm4         ; 20 30 21 31
    46         punpckhwd   mm5,        mm4         ; 22 32 23 33
    48         movq        mm1,        mm0         ; 00 10 01 11
    49         punpckldq   mm0,        mm2         ; 00 10 20 30
    51         punpckhdq   mm1,        mm2         ; 01 11 21 31
    53         movq        mm2,        mm3         ; 02 12 03 13
    54         punpckldq   mm2,        mm5         ; 02 12 22 32
    56         punpckhdq   mm3,        mm5         ; 03 13 23 33
    58         ; mm0 0   (each register now holds one sample position of all four rows,
    59         ; mm1 1    so every word lane below processes one input row)
    60         ; mm2 2
    61         ; mm3 3
    63         ; first stage
    64         movq        mm5,        mm0
    65         movq        mm4,        mm1
    67         paddw       mm0,        mm3         ; a1 = 0 + 3
    68         paddw       mm1,        mm2         ; b1 = 1 + 2
    70         psubw       mm4,        mm2         ; c1 = 1 - 2
    71         psubw       mm5,        mm3         ; d1 = 0 - 3
    73         psllw       mm5,        3           ; d1 <<= 3
    74         psllw       mm4,        3           ; c1 <<= 3
    76         psllw       mm0,        3           ; a1 <<= 3
    77         psllw       mm1,        3           ; b1 <<= 3
    79         ; output 0 and 2
    80         movq        mm2,        mm0         ; a1
    82         paddw       mm0,        mm1         ; op[0] = a1 + b1
    83         psubw       mm2,        mm1         ; op[2] = a1 - b1
    85         ; output 1 and 3
    86         ; interleave c1, d1 (d1 lands in the low word of each pair for pmaddwd)
    87         movq        mm1,        mm5         ; d1
    88         punpcklwd   mm1,        mm4         ; d1[0] c1[0] d1[1] c1[1]
    89         punpckhwd   mm5,        mm4         ; d1[2] c1[2] d1[3] c1[3]
    91         movq        mm3,        mm1
    92         movq        mm4,        mm5
    94         pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
    95         pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
    97         pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
    98         pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
   100         paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]
   101         paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]
   102         paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]
   103         paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]
   105         psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
   106         psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
   107         psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
   108         psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
   110         packssdw    mm1,        mm4         ; op[1] (dwords packed back to 4 words, signed saturation)
   111         packssdw    mm3,        mm5         ; op[3] (dwords packed back to 4 words, signed saturation)
   113         ; done with vertical
   114         ; transpose for the second stage
   115         movq        mm4,        mm0         ; 00 10 20 30
   116         movq        mm5,        mm2         ; 02 12 22 32
   118         punpcklwd   mm0,        mm1         ; 00 01 10 11
   119         punpckhwd   mm4,        mm1         ; 20 21 30 31
   121         punpcklwd   mm2,        mm3         ; 02 03 12 13
   122         punpckhwd   mm5,        mm3         ; 22 23 32 33
   124         movq        mm1,        mm0         ; 00 01 10 11
   125         punpckldq   mm0,        mm2         ; 00 01 02 03
   127         punpckhdq   mm1,        mm2         ; 10 11 12 13
   129         movq        mm2,        mm4         ; 20 21 30 31
   130         punpckldq   mm2,        mm5         ; 20 21 22 23
   132         punpckhdq   mm4,        mm5         ; 30 31 32 33
   134         ; mm0 0
   135         ; mm1 1
   136         ; mm2 2
   137         ; mm4 3   (row 3 is in mm4, not mm3, for this stage)
   139         movq        mm5,        mm0
   140         movq        mm3,        mm1
   142         paddw       mm0,        mm4         ; a1 = 0 + 3
   143         paddw       mm1,        mm2         ; b1 = 1 + 2
   145         psubw       mm3,        mm2         ; c1 = 1 - 2
   146         psubw       mm5,        mm4         ; d1 = 0 - 3
   148         pxor        mm6,        mm6         ; zero out for compare
   150         pcmpeqw     mm6,        mm5         ; each word = 0xFFFF where d1 == 0, else 0
   152         pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; invert, then AND with 1:
   153                                                                 ; each word = (d1 != 0) ? 1 : 0
   155         ; output 0 and 2
   156         movq        mm2,        mm0         ; a1
   158         paddw       mm0,        mm1         ; a1 + b1
   159         psubw       mm2,        mm1         ; a1 - b1
   161         paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]
   162         paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]
   164         psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4
   165         psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4
   167         movq        MMWORD PTR[rdi + 0 ],  mm0   ; output words 0..3
   168         movq        MMWORD PTR[rdi + 16],  mm2   ; output words 8..11
   170         ; output 1 and 3
   171         ; interleave c1, d1 (d1 lands in the low word of each pair for pmaddwd)
   172         movq        mm1,        mm5         ; d1
   173         punpcklwd   mm1,        mm3         ; d1[0] c1[0] d1[1] c1[1]
   174         punpckhwd   mm5,        mm3         ; d1[2] c1[2] d1[3] c1[3]
   176         movq        mm3,        mm1
   177         movq        mm4,        mm5
   179         pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
   180         pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
   182         pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
   183         pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
   185         paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]
   186         paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]
   187         paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]
   188         paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]
   190         psrad       mm1,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
   191         psrad       mm4,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
   192         psrad       mm3,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
   193         psrad       mm5,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
   195         packssdw    mm1,        mm4         ; op[4]
   196         packssdw    mm3,        mm5         ; op[12]
   198         paddw       mm1,        mm6         ; op[4] += (d1!=0)
   200         movq        MMWORD PTR[rdi + 8 ],  mm1   ; output words 4..7
   201         movq        MMWORD PTR[rdi + 24],  mm3   ; output words 12..15
   203      ; begin epilog
   204     pop         rdi
   205     pop         rsi
   206     RESTORE_GOT
   207     UNSHADOW_ARGS
   208     pop         rbp
   209     ret
   211 SECTION_RODATA                  ; constant tables read by vp8_short_fdct4x4_mmx via GLOBAL(); each is 8 bytes to match the MMWORD operands
   212 align 8
   213 _5352_2217:                     ; pmaddwd multipliers: word pair (5352, 2217), once per dword
   214     dw 5352
   215     dw 2217
   216     dw 5352
   217     dw 2217
   218 align 8
   219 _2217_neg5352:                  ; pmaddwd multipliers: word pair (2217, -5352), once per dword
   220     dw 2217
   221     dw -5352
   222     dw 2217
   223     dw -5352
   224 align 8
   225 _cmp_mask:                      ; 1 in every word; pandn operand that reduces the d1==0 compare mask to a 0/1 flag
   226     times 4 dw 1
   227 align 8
   228 _7w:                            ; 7 in every word; added before the >>4 in the second stage
   229     times 4 dw 7
   230 align 8
   231 _14500:                         ; dword constant added before the >>12 of the first-stage c1*2217 + d1*5352 term
   232     times 2 dd 14500
   233 align 8
   234 _7500:                          ; dword constant added before the >>12 of the first-stage d1*2217 - c1*5352 term
   235     times 2 dd 7500
   236 align 8
   237 _12000:                         ; dword constant added before the >>16 of the second-stage c1*2217 + d1*5352 term
   238     times 2 dd 12000
   239 align 8
   240 _51000:                         ; dword constant added before the >>16 of the second-stage d1*2217 - c1*5352 term
   241     times 2 dd 51000

mercurial