;
; jf3dnflt.asm - floating-point FDCT (3DNow!)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the forward DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jfdctflt.c; see the jfdctflt.c for more details.
;
; [TAB8]
;
; Provenance: recovered from a patch that adds this file as
; media/libjpeg/simd/jf3dnflt.asm (hunk @@ -0,0 +1,320 @@).

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Constant table.  Every entry stores the same single-precision value twice
; ("times 2 dd"), so one 64-bit memory operand feeds both lanes of a pfmul.
; --------------------------------------------------------------------------
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_fdct_float_3dnow)

EXTN(jconst_fdct_float_3dnow):

PD_0_382        times 2 dd      0.382683432365089771728460
PD_0_707        times 2 dd      0.707106781186547524400844
PD_0_541        times 2 dd      0.541196100146196984399723
PD_1_306        times 2 dd      1.306562964876376527856643

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32

;---------------------------------------------------------------------------
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_float_3dnow (FAST_FLOAT * data)
;
; In:     data is read from [eax+8], where eax is esp right after
;         "push ebp" (i.e. the first stack argument of a 32-bit near call).
; Out:    the block addressed by data is overwritten in place.
; Regs:   eax = frame of the caller (see prologue), never changed afterwards
;         ebx = GOT base set up by get_GOT; saved/restored via pushpic/poppic
;         ecx = loop counter, edx = running block pointer
;               (neither needs to be preserved; esi and edi are unused)
;         mm0-mm7 = working registers; femms is issued before returning
; Stack:  WK_NUM mmword scratch slots, wk(0)..wk(WK_NUM-1), placed directly
;         below an ebp that has been aligned to SIZEOF_MMWORD.
;
; NOTE(review): EXTN, POINTER, MMWORD, MMBLOCK, GOTOFF, pushpic/poppic,
; get_GOT, alignx and alignz are not defined in this file; they presumably
; come from the two include files above.
;---------------------------------------------------------------------------

%define WK_NUM          2
%define data(b)         (b)+8           ; FAST_FLOAT * data

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]

        align   16
        global  EXTN(jsimd_fdct_float_3dnow)

EXTN(jsimd_fdct_float_3dnow):
        ; ---- Prologue: build an aligned frame that remembers the real one.
        push    ebp
        mov     eax, esp                        ; eax -> saved ebp (original frame)
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp], eax                      ; keep original frame pointer in the aligned slot
        mov     ebp, esp                        ; ebp = aligned frame pointer
        lea     esp, [wk(0)]                    ; reserve the wk[] scratch area
        pushpic ebx

        get_GOT ebx                             ; get GOT address

        ; ------------------------------------------------------------------
        ; Pass 1: process rows.
        ; DCTSIZE/2 iterations; edx advances by two rows' worth of
        ; FAST_FLOATs (2*DCTSIZE*SIZEOF_FAST_FLOAT) per iteration.
        ; ------------------------------------------------------------------

        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
        mov     ecx, DCTSIZE/2
        alignx  16,7
.row_pass:

        movq    mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]

        ; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)

        ; Transpose the 2x2 tiles so each register holds one column index
        ; for both rows.
        movq      mm4, mm0
        punpckldq mm0, mm1                      ; mm0 = (00 10) = data0
        punpckhdq mm4, mm1                      ; mm4 = (01 11) = data1
        movq      mm5, mm2
        punpckldq mm2, mm3                      ; mm2 = (06 16) = data6
        punpckhdq mm5, mm3                      ; mm5 = (07 17) = data7

        movq    mm6, mm4
        movq    mm7, mm0
        pfsub   mm4, mm2                        ; mm4 = tmp6 = data1 - data6
        pfsub   mm0, mm5                        ; mm0 = tmp7 = data0 - data7
        pfadd   mm6, mm2                        ; mm6 = tmp1 = data1 + data6
        pfadd   mm7, mm5                        ; mm7 = tmp0 = data0 + data7

        movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
        movq    mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]

        ; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)

        ; Spill tmp6/tmp7: mm4 and mm0 are reused by the next transpose and
        ; the values are only needed again in the odd part.
        movq    MMWORD [wk(0)], mm4             ; wk(0) = tmp6
        movq    MMWORD [wk(1)], mm0             ; wk(1) = tmp7

        movq      mm4, mm1
        punpckldq mm1, mm3                      ; mm1 = (02 12) = data2
        punpckhdq mm4, mm3                      ; mm4 = (03 13) = data3
        movq      mm0, mm2
        punpckldq mm2, mm5                      ; mm2 = (04 14) = data4
        punpckhdq mm0, mm5                      ; mm0 = (05 15) = data5

        movq    mm3, mm4
        movq    mm5, mm1
        pfadd   mm4, mm2                        ; mm4 = tmp3 = data3 + data4
        pfadd   mm1, mm0                        ; mm1 = tmp2 = data2 + data5
        pfsub   mm3, mm2                        ; mm3 = tmp4 = data3 - data4
        pfsub   mm5, mm0                        ; mm5 = tmp5 = data2 - data5

        ; -- Even part

        movq    mm2, mm7
        movq    mm0, mm6
        pfsub   mm7, mm4                        ; mm7 = tmp13
        pfsub   mm6, mm1                        ; mm6 = tmp12
        pfadd   mm2, mm4                        ; mm2 = tmp10
        pfadd   mm0, mm1                        ; mm0 = tmp11

        pfadd   mm6, mm7
        pfmul   mm6, [GOTOFF(ebx,PD_0_707)]     ; mm6 = z1

        movq    mm4, mm2
        movq    mm1, mm7
        pfsub   mm2, mm0                        ; mm2 = data4
        pfsub   mm7, mm6                        ; mm7 = data6
        pfadd   mm4, mm0                        ; mm4 = data0
        pfadd   mm1, mm6                        ; mm1 = data2

        movq    MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
        movq    MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1

        ; -- Odd part

        movq    mm0, MMWORD [wk(0)]             ; mm0 = tmp6
        movq    mm6, MMWORD [wk(1)]             ; mm6 = tmp7

        pfadd   mm3, mm5                        ; mm3 = tmp10
        pfadd   mm5, mm0                        ; mm5 = tmp11
        pfadd   mm0, mm6                        ; mm0 = tmp12, mm6 = tmp7

        pfmul   mm5, [GOTOFF(ebx,PD_0_707)]     ; mm5 = z3

        movq    mm2, mm3                        ; mm2 = tmp10
        pfsub   mm3, mm0
        pfmul   mm3, [GOTOFF(ebx,PD_0_382)]     ; mm3 = z5
        pfmul   mm2, [GOTOFF(ebx,PD_0_541)]     ; mm2 = MULTIPLY(tmp10,FIX_0_54119610)
        pfmul   mm0, [GOTOFF(ebx,PD_1_306)]     ; mm0 = MULTIPLY(tmp12,FIX_1_30656296)
        pfadd   mm2, mm3                        ; mm2 = z2
        pfadd   mm0, mm3                        ; mm0 = z4

        movq    mm7, mm6
        pfsub   mm6, mm5                        ; mm6 = z13
        pfadd   mm7, mm5                        ; mm7 = z11

        movq    mm4, mm6
        movq    mm1, mm7
        pfsub   mm6, mm2                        ; mm6 = data3
        pfsub   mm7, mm0                        ; mm7 = data7
        pfadd   mm4, mm2                        ; mm4 = data5
        pfadd   mm1, mm0                        ; mm1 = data1

        movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
        movq    MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1

        add     edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
        dec     ecx
        jnz     near .row_pass

        ; ------------------------------------------------------------------
        ; Pass 2: process columns.
        ; DCTSIZE/2 iterations; edx advances by two FAST_FLOATs
        ; (2*SIZEOF_FAST_FLOAT) per iteration.
        ; ------------------------------------------------------------------

        mov     edx, POINTER [data(eax)]        ; (FAST_FLOAT *)
        mov     ecx, DCTSIZE/2
        alignx  16,7
.col_pass:

        movq    mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]

        ; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)

        movq      mm4, mm0                      ; transpose coefficients
        punpckldq mm0, mm1                      ; mm0 = (00 01) = data0
        punpckhdq mm4, mm1                      ; mm4 = (10 11) = data1
        movq      mm5, mm2                      ; transpose coefficients
        punpckldq mm2, mm3                      ; mm2 = (60 61) = data6
        punpckhdq mm5, mm3                      ; mm5 = (70 71) = data7

        movq    mm6, mm4
        movq    mm7, mm0
        pfsub   mm4, mm2                        ; mm4 = tmp6 = data1 - data6
        pfsub   mm0, mm5                        ; mm0 = tmp7 = data0 - data7
        pfadd   mm6, mm2                        ; mm6 = tmp1 = data1 + data6
        pfadd   mm7, mm5                        ; mm7 = tmp0 = data0 + data7

        movq    mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]

        ; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)

        ; Same spill as in pass 1: tmp6/tmp7 wait in wk[] for the odd part.
        movq    MMWORD [wk(0)], mm4             ; wk(0) = tmp6
        movq    MMWORD [wk(1)], mm0             ; wk(1) = tmp7

        movq      mm4, mm1                      ; transpose coefficients
        punpckldq mm1, mm3                      ; mm1 = (20 21) = data2
        punpckhdq mm4, mm3                      ; mm4 = (30 31) = data3
        movq      mm0, mm2                      ; transpose coefficients
        punpckldq mm2, mm5                      ; mm2 = (40 41) = data4
        punpckhdq mm0, mm5                      ; mm0 = (50 51) = data5

        movq    mm3, mm4
        movq    mm5, mm1
        pfadd   mm4, mm2                        ; mm4 = tmp3 = data3 + data4
        pfadd   mm1, mm0                        ; mm1 = tmp2 = data2 + data5
        pfsub   mm3, mm2                        ; mm3 = tmp4 = data3 - data4
        pfsub   mm5, mm0                        ; mm5 = tmp5 = data2 - data5

        ; -- Even part

        movq    mm2, mm7
        movq    mm0, mm6
        pfsub   mm7, mm4                        ; mm7 = tmp13
        pfsub   mm6, mm1                        ; mm6 = tmp12
        pfadd   mm2, mm4                        ; mm2 = tmp10
        pfadd   mm0, mm1                        ; mm0 = tmp11

        pfadd   mm6, mm7
        pfmul   mm6, [GOTOFF(ebx,PD_0_707)]     ; mm6 = z1

        movq    mm4, mm2
        movq    mm1, mm7
        pfsub   mm2, mm0                        ; mm2 = data4
        pfsub   mm7, mm6                        ; mm7 = data6
        pfadd   mm4, mm0                        ; mm4 = data0
        pfadd   mm1, mm6                        ; mm1 = data2

        movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
        movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1

        ; -- Odd part

        movq    mm0, MMWORD [wk(0)]             ; mm0 = tmp6
        movq    mm6, MMWORD [wk(1)]             ; mm6 = tmp7

        pfadd   mm3, mm5                        ; mm3 = tmp10
        pfadd   mm5, mm0                        ; mm5 = tmp11
        pfadd   mm0, mm6                        ; mm0 = tmp12, mm6 = tmp7

        pfmul   mm5, [GOTOFF(ebx,PD_0_707)]     ; mm5 = z3

        movq    mm2, mm3                        ; mm2 = tmp10
        pfsub   mm3, mm0
        pfmul   mm3, [GOTOFF(ebx,PD_0_382)]     ; mm3 = z5
        pfmul   mm2, [GOTOFF(ebx,PD_0_541)]     ; mm2 = MULTIPLY(tmp10,FIX_0_54119610)
        pfmul   mm0, [GOTOFF(ebx,PD_1_306)]     ; mm0 = MULTIPLY(tmp12,FIX_1_30656296)
        pfadd   mm2, mm3                        ; mm2 = z2
        pfadd   mm0, mm3                        ; mm0 = z4

        movq    mm7, mm6
        pfsub   mm6, mm5                        ; mm6 = z13
        pfadd   mm7, mm5                        ; mm7 = z11

        movq    mm4, mm6
        movq    mm1, mm7
        pfsub   mm6, mm2                        ; mm6 = data3
        pfsub   mm7, mm0                        ; mm7 = data7
        pfadd   mm4, mm2                        ; mm4 = data5
        pfadd   mm1, mm0                        ; mm1 = data1

        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
        movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1

        add     edx, byte 2*SIZEOF_FAST_FLOAT
        dec     ecx
        jnz     near .col_pass

        femms                                   ; empty MMX/3DNow! state

        ; ---- Epilogue: undo the prologue in reverse order.
        poppic  ebx
        mov     esp, ebp                        ; esp <- aligned ebp
        pop     esp                             ; esp <- original ebp
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16