;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
;
; Two-pass 4x4 transform of 16-bit samples using MMX (a forward DCT,
; going by the routine's name).
;
; In:    arg(0) = input   four rows of four 16-bit words are read at byte
;                         offsets 0, pitch, 2*pitch and 3*pitch
;        arg(1) = output  32 bytes (16 words) are written at offsets 0..31
;        arg(2) = pitch   32-bit signed, sign-extended, and used unscaled
;                         as the distance in BYTES between input rows
; Out:   nothing is returned in a register; results go to [output]
; Uses:  rax, rcx, rsi, rdi (rsi/rdi are saved and restored here),
;        mm0-mm6
;
; Argument access, GOT handling and symbol decoration go through the
; macros supplied by x86_abi_support.asm (sym, arg, SHADOW_ARGS_TO_STACK,
; GET_GOT, GLOBAL, ...).
;
; NOTE(review): this routine does not execute emms; presumably the caller
; is expected to clear the MMX state before any x87 code runs - confirm.
;-----------------------------------------------------------------------
global sym(vp8_short_fdct4x4_mmx) PRIVATE
sym(vp8_short_fdct4x4_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)              ; input
    mov         rdi,        arg(1)              ; output

    movsxd      rax,        dword ptr arg(2)    ; pitch (bytes per input row)

    lea         rcx,        [rsi + rax*2]       ; rcx = address of row 2

    ; read the input data: one 4-word row per register
    movq        mm0,        [rsi]               ; row 0
    movq        mm1,        [rsi + rax]         ; row 1

    movq        mm2,        [rcx]               ; row 2
    movq        mm4,        [rcx + rax]         ; row 3

    ; transpose for the first stage
    ; (comments give "rc" = row,column of each word, lowest word first)
    movq        mm3,        mm0                 ; 00 01 02 03
    movq        mm5,        mm2                 ; 20 21 22 23

    punpcklwd   mm0,        mm1                 ; 00 10 01 11
    punpckhwd   mm3,        mm1                 ; 02 12 03 13

    punpcklwd   mm2,        mm4                 ; 20 30 21 31
    punpckhwd   mm5,        mm4                 ; 22 32 23 33

    movq        mm1,        mm0                 ; 00 10 01 11
    punpckldq   mm0,        mm2                 ; 00 10 20 30

    punpckhdq   mm1,        mm2                 ; 01 11 21 31

    movq        mm2,        mm3                 ; 02 12 03 13
    punpckldq   mm2,        mm5                 ; 02 12 22 32

    punpckhdq   mm3,        mm5                 ; 03 13 23 33

    ; register map after the transpose:
    ; mm0    0
    ; mm1    1
    ; mm2    2
    ; mm3    3

    ; first stage
    movq        mm5,        mm0
    movq        mm4,        mm1

    paddw       mm0,        mm3                 ; a1 = 0 + 3
    paddw       mm1,        mm2                 ; b1 = 1 + 2

    psubw       mm4,        mm2                 ; c1 = 1 - 2
    psubw       mm5,        mm3                 ; d1 = 0 - 3

    ; scale all four intermediates by 8
    psllw       mm5,        3
    psllw       mm4,        3

    psllw       mm0,        3
    psllw       mm1,        3

    ; output 0 and 2
    movq        mm2,        mm0                 ; a1

    paddw       mm0,        mm1                 ; op[0] = a1 + b1
    psubw       mm2,        mm1                 ; op[2] = a1 - b1

    ; output 1 and 3
    ; interleave c1, d1 so that pmaddwd can form both products at once
    movq        mm1,        mm5                 ; d1
    punpcklwd   mm1,        mm4                 ; c1 d1
    punpckhwd   mm5,        mm4                 ; c1 d1

    movq        mm3,        mm1
    movq        mm4,        mm5

    pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
    pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

    pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
    pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

    paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]
    paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]
    paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]
    paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]

    psrad       mm1,        12                  ; (c1 * 2217 + d1 * 5352 + 14500)>>12
    psrad       mm4,        12                  ; (c1 * 2217 + d1 * 5352 + 14500)>>12
    psrad       mm3,        12                  ; (d1 * 2217 - c1 * 5352 +  7500)>>12
    psrad       mm5,        12                  ; (d1 * 2217 - c1 * 5352 +  7500)>>12

    packssdw    mm1,        mm4                 ; op[1]
    packssdw    mm3,        mm5                 ; op[3]

    ; done with vertical
    ; transpose for the second stage
    movq        mm4,        mm0                 ; 00 10 20 30
    movq        mm5,        mm2                 ; 02 12 22 32

    punpcklwd   mm0,        mm1                 ; 00 01 10 11
    punpckhwd   mm4,        mm1                 ; 20 21 30 31

    punpcklwd   mm2,        mm3                 ; 02 03 12 13
    punpckhwd   mm5,        mm3                 ; 22 23 32 33

    movq        mm1,        mm0                 ; 00 01 10 11
    punpckldq   mm0,        mm2                 ; 00 01 02 03

    punpckhdq   mm1,        mm2                 ; 10 11 12 13

    movq        mm2,        mm4                 ; 20 21 30 31
    punpckldq   mm2,        mm5                 ; 20 21 22 23

    punpckhdq   mm4,        mm5                 ; 30 31 32 33

    ; register map after the transpose (note: 3 lives in mm4, not mm3):
    ; mm0    0
    ; mm1    1
    ; mm2    2
    ; mm4    3

    movq        mm5,        mm0
    movq        mm3,        mm1

    paddw       mm0,        mm4                 ; a1 = 0 + 3
    paddw       mm1,        mm2                 ; b1 = 1 + 2

    psubw       mm3,        mm2                 ; c1 = 1 - 2
    psubw       mm5,        mm4                 ; d1 = 0 - 3

    ; build a per-word (d1 != 0) value in mm6 for the op[4] correction
    pxor        mm6,        mm6                 ; zero out for compare

    pcmpeqw     mm6,        mm5                 ; 0xFFFF where d1 == 0

    pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]      ; ~mask & 1:
                                                ; 1 where d1 != 0, else 0

    ; output 0 and 2
    movq        mm2,        mm0                 ; a1

    paddw       mm0,        mm1                 ; a1 + b1
    psubw       mm2,        mm1                 ; a1 - b1

    paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]
    paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]

    psraw       mm0,        4                   ; op[0] = (a1 + b1 + 7)>>4
    psraw       mm2,        4                   ; op[8] = (a1 - b1 + 7)>>4

    movq        MMWORD PTR[rdi + 0 ],  mm0
    movq        MMWORD PTR[rdi + 16],  mm2

    ; output 1 and 3
    ; interleave c1, d1
    movq        mm1,        mm5                 ; d1
    punpcklwd   mm1,        mm3                 ; c1 d1
    punpckhwd   mm5,        mm3                 ; c1 d1

    movq        mm3,        mm1
    movq        mm4,        mm5

    pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
    pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352

    pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
    pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

    paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]
    paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]
    paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]
    paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]

    psrad       mm1,        16                  ; (c1 * 2217 + d1 * 5352 + 12000)>>16
    psrad       mm4,        16                  ; (c1 * 2217 + d1 * 5352 + 12000)>>16
    psrad       mm3,        16                  ; (d1 * 2217 - c1 * 5352 + 51000)>>16
    psrad       mm5,        16                  ; (d1 * 2217 - c1 * 5352 + 51000)>>16

    packssdw    mm1,        mm4                 ; op[4]
    packssdw    mm3,        mm5                 ; op[12]

    paddw       mm1,        mm6                 ; op[4] += (d1!=0)

    movq        MMWORD PTR[rdi + 8 ],  mm1
    movq        MMWORD PTR[rdi + 24],  mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; pmaddwd multiplier pairs; each is applied to interleaved (d1, c1) words
align 8
_5352_2217:
    dw 5352
    dw 2217
    dw 5352
    dw 2217
align 8
_2217_neg5352:
    dw 2217
    dw -5352
    dw 2217
    dw -5352
; per-word mask that keeps only bit 0 (used to form d1 != 0 as 0/1)
align 8
_cmp_mask:
    times 4 dw 1
; rounding term added before the >>4 in the second pass
align 8
_7w:
    times 4 dw 7
; rounding terms added before the >>12 in the first pass
align 8
_14500:
    times 2 dd 14500
align 8
_7500:
    times 2 dd 7500
; rounding terms added before the >>16 in the second pass
align 8
_12000:
    times 2 dd 12000
align 8
_51000:
    times 2 dd 51000