;
; ji3dnflt.asm - floating-point IDCT (3DNow! & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see the jidctflt.c for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Read-only constants for the 3DNow! floating-point IDCT.
;
; Every floating-point constant is emitted twice ("times 2 dd") so that one
; 64-bit MMX operand carries the same value in both packed-single lanes.
; The table is exported so that code outside this file can refer to it.
;
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_idct_float_3dnow)

EXTN(jconst_idct_float_3dnow):

; Butterfly multipliers used by the IDCT passes in this file.
PD_1_414        times 2 dd  1.414213562373095048801689
PD_1_847        times 2 dd  1.847759065022573512256366
PD_1_082        times 2 dd  1.082392200292393968799446
PD_2_613        times 2 dd  2.613125929752753055713286
; Added to each result in pass 2 so that the rounded, descaled integer
; value ends up in the low 16 bits of each dword of the float bit pattern.
PD_RNDINT_MAGIC times 2 dd  100663296.0 ; (float)(0x00C00000 << 3)
; Eight copies of the sample centre value, added bytewise to packed output.
PB_CENTERJSAMP  times 8 db  CENTERJSAMPLE

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_3dnow (void * dct_table, JCOEFPTR coef_block,
;                         JSAMPARRAY output_buf, JDIMENSION output_col)
;
; Arguments are read from the stack, at the offsets given by the %defines
; below, relative to the slot holding the caller's saved ebp.
;
; Register usage:
;   saved and restored : ebx, esi, edi, ebp (and esp)
;   used as scratch    : eax, ecx, edx, mm0-mm7
;   ebx holds the GOT address (set by get_GOT) except where it is
;   temporarily saved with pushpic/poppic and reused as scratch.
;
; Stack frame (ebp is aligned down to SIZEOF_MMWORD):
;   [ebp+0]             pointer to the caller's saved-ebp slot
;   wk(0)..wk(WK_NUM-1) mmword scratch slots just below ebp
;   workspace           DCTSIZE2 FAST_FLOATs below wk(0); pass 1 writes it,
;                       pass 2 reads it
;
; The routine executes femms before returning, so the MMX/3DNow! state is
; empty on exit.
;

%define dct_table(b)    (b)+8           ; void * dct_table
%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
%define output_col(b)   (b)+20          ; JDIMENSION output_col

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          2
%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

        align   16
        global  EXTN(jsimd_idct_float_3dnow)

EXTN(jsimd_idct_float_3dnow):
        push    ebp
        mov     eax,esp                         ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp],eax                       ; remember it at [original_ebp]
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [workspace]                ; reserve wk[] + workspace[]
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                     ; get GOT address

        ; ---- Pass 1: process columns from input, store into work array.
        ;
        ; Each iteration handles two adjacent columns (one per packed-float
        ; lane), hence DCTSIZE/2 iterations.  The results are written
        ; transposed into the workspace.

        ; eax is still the value set in the prologue, so the reload below
        ; stays disabled.
        ; NOTE(review): this relies on get_GOT leaving eax intact; the macro
        ; is defined in jsimdext.inc and is not visible here - confirm.
;       mov     eax, [original_ebp]
        mov     edx, POINTER [dct_table(eax)]   ; quantptr
        mov     esi, JCOEFPTR [coef_block(eax)] ; inptr
        lea     edi, [workspace]                ; FAST_FLOAT * wsptr
        mov     ecx, DCTSIZE/2                  ; ctr
        alignx  16,7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
        ; Shortcut: if rows 1..7 of both columns are all zero, only the DC
        ; term contributes and the column result is a constant.
        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
        jnz     short .columnDCT

        pushpic ebx                     ; save GOT address
        mov     ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
        mov     eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
        or      ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
        or      eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
        or      ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
        or      eax,ebx
        ; The jnz below consumes the flags of the "or" above.
        ; NOTE(review): assumes poppic leaves the flags untouched - confirm
        ; against its definition in jsimdext.inc.
        poppic  ebx                     ; restore GOT address
        jnz     short .columnDCT

        ; -- AC terms all zero

        movd    mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]

        ; Sign-extend the two 16-bit values to 32 bits, then convert to float.
        punpcklwd mm0,mm0
        psrad     mm0,(DWORD_BIT-WORD_BIT)
        pi2fd     mm0,mm0

        pfmul   mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        ; Broadcast each column's value to both lanes.
        movq      mm1,mm0
        punpckldq mm0,mm0
        punpckhdq mm1,mm1

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
        movq    MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
        movq    MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
        movq    MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
        movq    MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
        jmp     near .nextcolumn
        alignx  16,7
%endif
.columnDCT:

        ; -- Even part
        ;
        ; Load rows 0,2,4,6, sign-extend word->dword, convert to float and
        ; dequantize by multiplying with the matching quantization entries.

        movd    mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
        movd    mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
        movd    mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movd    mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]

        punpcklwd mm0,mm0
        punpcklwd mm1,mm1
        psrad     mm0,(DWORD_BIT-WORD_BIT)
        psrad     mm1,(DWORD_BIT-WORD_BIT)
        pi2fd     mm0,mm0
        pi2fd     mm1,mm1

        pfmul   mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        pfmul   mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        punpcklwd mm2,mm2
        punpcklwd mm3,mm3
        psrad     mm2,(DWORD_BIT-WORD_BIT)
        psrad     mm3,(DWORD_BIT-WORD_BIT)
        pi2fd     mm2,mm2
        pi2fd     mm3,mm3

        pfmul   mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        pfmul   mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movq    mm4,mm0
        movq    mm5,mm1
        pfsub   mm0,mm2                 ; mm0=tmp11
        pfsub   mm1,mm3
        pfadd   mm4,mm2                 ; mm4=tmp10
        pfadd   mm5,mm3                 ; mm5=tmp13

        pfmul   mm1,[GOTOFF(ebx,PD_1_414)]
        pfsub   mm1,mm5                 ; mm1=tmp12

        movq    mm6,mm4
        movq    mm7,mm0
        pfsub   mm4,mm5                 ; mm4=tmp3
        pfsub   mm0,mm1                 ; mm0=tmp2
        pfadd   mm6,mm5                 ; mm6=tmp0
        pfadd   mm7,mm1                 ; mm7=tmp1

        ; All eight MMX registers are needed for the odd part; spill two.
        movq    MMWORD [wk(1)], mm4     ; tmp3
        movq    MMWORD [wk(0)], mm0     ; tmp2

        ; -- Odd part
        ;
        ; Same load / sign-extend / convert / dequantize for rows 1,3,5,7.

        movd    mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movd    mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
        movd    mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movd    mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]

        punpcklwd mm2,mm2
        punpcklwd mm3,mm3
        psrad     mm2,(DWORD_BIT-WORD_BIT)
        psrad     mm3,(DWORD_BIT-WORD_BIT)
        pi2fd     mm2,mm2
        pi2fd     mm3,mm3

        pfmul   mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        pfmul   mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        punpcklwd mm5,mm5
        punpcklwd mm1,mm1
        psrad     mm5,(DWORD_BIT-WORD_BIT)
        psrad     mm1,(DWORD_BIT-WORD_BIT)
        pi2fd     mm5,mm5
        pi2fd     mm1,mm1

        pfmul   mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
        pfmul   mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

        movq    mm4,mm2
        movq    mm0,mm5
        pfadd   mm2,mm1                 ; mm2=z11
        pfadd   mm5,mm3                 ; mm5=z13
        pfsub   mm4,mm1                 ; mm4=z12
        pfsub   mm0,mm3                 ; mm0=z10

        movq    mm1,mm2
        pfsub   mm2,mm5
        pfadd   mm1,mm5                 ; mm1=tmp7

        pfmul   mm2,[GOTOFF(ebx,PD_1_414)]      ; mm2=tmp11

        movq    mm3,mm0
        pfadd   mm0,mm4
        pfmul   mm0,[GOTOFF(ebx,PD_1_847)]      ; mm0=z5
        pfmul   mm3,[GOTOFF(ebx,PD_2_613)]      ; mm3=(z10 * 2.613125930)
        pfmul   mm4,[GOTOFF(ebx,PD_1_082)]      ; mm4=(z12 * 1.082392200)
        pfsubr  mm3,mm0                 ; mm3=tmp12 (reverse sub: mm0-mm3)
        pfsub   mm4,mm0                 ; mm4=tmp10

        ; -- Final output stage

        pfsub   mm3,mm1                 ; mm3=tmp6
        movq    mm5,mm6
        movq    mm0,mm7
        pfadd   mm6,mm1                 ; mm6=data0=(00 01)
        pfadd   mm7,mm3                 ; mm7=data1=(10 11)
        pfsub   mm5,mm1                 ; mm5=data7=(70 71)
        pfsub   mm0,mm3                 ; mm0=data6=(60 61)
        pfsub   mm2,mm3                 ; mm2=tmp5

        movq      mm1,mm6               ; transpose coefficients
        punpckldq mm6,mm7               ; mm6=(00 10)
        punpckhdq mm1,mm7               ; mm1=(01 11)
        movq      mm3,mm0               ; transpose coefficients
        punpckldq mm0,mm5               ; mm0=(60 70)
        punpckhdq mm3,mm5               ; mm3=(61 71)

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
        movq    MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
        movq    MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3

        movq    mm7, MMWORD [wk(0)]     ; mm7=tmp2
        movq    mm5, MMWORD [wk(1)]     ; mm5=tmp3

        pfadd   mm4,mm2                 ; mm4=tmp4
        movq    mm6,mm7
        movq    mm1,mm5
        pfadd   mm7,mm2                 ; mm7=data2=(20 21)
        pfadd   mm5,mm4                 ; mm5=data4=(40 41)
        pfsub   mm6,mm2                 ; mm6=data5=(50 51)
        pfsub   mm1,mm4                 ; mm1=data3=(30 31)

        movq      mm0,mm7               ; transpose coefficients
        punpckldq mm7,mm1               ; mm7=(20 30)
        punpckhdq mm0,mm1               ; mm0=(21 31)
        movq      mm3,mm5               ; transpose coefficients
        punpckldq mm5,mm6               ; mm5=(40 50)
        punpckhdq mm3,mm6               ; mm3=(41 51)

        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
        movq    MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
        movq    MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3

.nextcolumn:
        ; Advance by two columns of input / quant table and by two
        ; (transposed) rows of the workspace.
        add     esi, byte 2*SIZEOF_JCOEF                ; coef_block
        add     edx, byte 2*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
        dec     ecx                                     ; ctr
        jnz     near .columnloop

        ; -- Prefetch the next coefficient block
        ;
        ; esi has advanced by 8 coefficients in the loop above, so
        ; esi + (DCTSIZE2-8)*SIZEOF_JCOEF is the first byte after this block.

        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
        prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

        ; ---- Pass 2: process rows from work array, store into output array.
        ;
        ; Each iteration produces two output rows of 8 samples each.
        ; eax = output_col, edi = current position in output_buf.

        mov     eax, [original_ebp]
        lea     esi, [workspace]                        ; FAST_FLOAT * wsptr
        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
        mov     eax, JDIMENSION [output_col(eax)]
        mov     ecx, DCTSIZE/2                          ; ctr
        alignx  16,7
.rowloop:

        ; -- Even part

        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        movq    mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

        movq    mm4,mm0
        movq    mm5,mm1
        pfsub   mm0,mm2                 ; mm0=tmp11
        pfsub   mm1,mm3
        pfadd   mm4,mm2                 ; mm4=tmp10
        pfadd   mm5,mm3                 ; mm5=tmp13

        pfmul   mm1,[GOTOFF(ebx,PD_1_414)]
        pfsub   mm1,mm5                 ; mm1=tmp12

        movq    mm6,mm4
        movq    mm7,mm0
        pfsub   mm4,mm5                 ; mm4=tmp3
        pfsub   mm0,mm1                 ; mm0=tmp2
        pfadd   mm6,mm5                 ; mm6=tmp0
        pfadd   mm7,mm1                 ; mm7=tmp1

        movq    MMWORD [wk(1)], mm4     ; tmp3
        movq    MMWORD [wk(0)], mm0     ; tmp2

        ; -- Odd part

        movq    mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
        movq    mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
        movq    mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

        movq    mm4,mm2
        movq    mm0,mm5
        pfadd   mm2,mm1                 ; mm2=z11
        pfadd   mm5,mm3                 ; mm5=z13
        pfsub   mm4,mm1                 ; mm4=z12
        pfsub   mm0,mm3                 ; mm0=z10

        movq    mm1,mm2
        pfsub   mm2,mm5
        pfadd   mm1,mm5                 ; mm1=tmp7

        pfmul   mm2,[GOTOFF(ebx,PD_1_414)]      ; mm2=tmp11

        movq    mm3,mm0
        pfadd   mm0,mm4
        pfmul   mm0,[GOTOFF(ebx,PD_1_847)]      ; mm0=z5
        pfmul   mm3,[GOTOFF(ebx,PD_2_613)]      ; mm3=(z10 * 2.613125930)
        pfmul   mm4,[GOTOFF(ebx,PD_1_082)]      ; mm4=(z12 * 1.082392200)
        pfsubr  mm3,mm0                 ; mm3=tmp12 (reverse sub: mm0-mm3)
        pfsub   mm4,mm0                 ; mm4=tmp10

        ; -- Final output stage

        pfsub   mm3,mm1                 ; mm3=tmp6
        movq    mm5,mm6
        movq    mm0,mm7
        pfadd   mm6,mm1                 ; mm6=data0=(00 10)
        pfadd   mm7,mm3                 ; mm7=data1=(01 11)
        pfsub   mm5,mm1                 ; mm5=data7=(07 17)
        pfsub   mm0,mm3                 ; mm0=data6=(06 16)
        pfsub   mm2,mm3                 ; mm2=tmp5

        ; Float -> integer by adding the magic constant: afterwards the low
        ; word of each dword holds the rounded, descaled value ("**" marks
        ; the high word, which is masked or shifted away below).
        movq    mm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]       ; mm1=[PD_RNDINT_MAGIC]
        pcmpeqd mm3,mm3
        psrld   mm3,WORD_BIT            ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}

        pfadd   mm6,mm1                 ; mm6=roundint(data0/8)=(00 ** 10 **)
        pfadd   mm7,mm1                 ; mm7=roundint(data1/8)=(01 ** 11 **)
        pfadd   mm0,mm1                 ; mm0=roundint(data6/8)=(06 ** 16 **)
        pfadd   mm5,mm1                 ; mm5=roundint(data7/8)=(07 ** 17 **)

        ; Interleave pairs of 16-bit results into one register.
        pand    mm6,mm3                 ; mm6=(00 -- 10 --)
        pslld   mm7,WORD_BIT            ; mm7=(-- 01 -- 11)
        pand    mm0,mm3                 ; mm0=(06 -- 16 --)
        pslld   mm5,WORD_BIT            ; mm5=(-- 07 -- 17)
        por     mm6,mm7                 ; mm6=(00 01 10 11)
        por     mm0,mm5                 ; mm0=(06 07 16 17)

        movq    mm1, MMWORD [wk(0)]     ; mm1=tmp2
        movq    mm3, MMWORD [wk(1)]     ; mm3=tmp3

        pfadd   mm4,mm2                 ; mm4=tmp4
        movq    mm7,mm1
        movq    mm5,mm3
        pfadd   mm1,mm2                 ; mm1=data2=(02 12)
        pfadd   mm3,mm4                 ; mm3=data4=(04 14)
        pfsub   mm7,mm2                 ; mm7=data5=(05 15)
        pfsub   mm5,mm4                 ; mm5=data3=(03 13)

        movq    mm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]       ; mm2=[PD_RNDINT_MAGIC]
        pcmpeqd mm4,mm4
        psrld   mm4,WORD_BIT            ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}

        pfadd   mm3,mm2                 ; mm3=roundint(data4/8)=(04 ** 14 **)
        pfadd   mm7,mm2                 ; mm7=roundint(data5/8)=(05 ** 15 **)
        pfadd   mm1,mm2                 ; mm1=roundint(data2/8)=(02 ** 12 **)
        pfadd   mm5,mm2                 ; mm5=roundint(data3/8)=(03 ** 13 **)

        pand    mm3,mm4                 ; mm3=(04 -- 14 --)
        pslld   mm7,WORD_BIT            ; mm7=(-- 05 -- 15)
        pand    mm1,mm4                 ; mm1=(02 -- 12 --)
        pslld   mm5,WORD_BIT            ; mm5=(-- 03 -- 13)
        por     mm3,mm7                 ; mm3=(04 05 14 15)
        por     mm1,mm5                 ; mm1=(02 03 12 13)

        movq    mm2,[GOTOFF(ebx,PB_CENTERJSAMP)]        ; mm2=[PB_CENTERJSAMP]

        ; Pack words to bytes with signed saturation, then add the sample
        ; centre value to every byte.
        packsswb mm6,mm3                ; mm6=(00 01 10 11 04 05 14 15)
        packsswb mm1,mm0                ; mm1=(02 03 12 13 06 07 16 17)
        paddb    mm6,mm2
        paddb    mm1,mm2

        movq      mm4,mm6               ; transpose coefficients(phase 2)
        punpcklwd mm6,mm1               ; mm6=(00 01 02 03 10 11 12 13)
        punpckhwd mm4,mm1               ; mm4=(04 05 06 07 14 15 16 17)

        movq      mm7,mm6               ; transpose coefficients(phase 3)
        punpckldq mm6,mm4               ; mm6=(00 01 02 03 04 05 06 07)
        punpckhdq mm7,mm4               ; mm7=(10 11 12 13 14 15 16 17)

        ; ebx is borrowed as a second row pointer while storing the rows.
        pushpic ebx                     ; save GOT address

        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
        mov     ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
        movq    MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
        movq    MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7

        poppic  ebx                     ; restore GOT address

        add     esi, byte 2*SIZEOF_FAST_FLOAT   ; wsptr
        add     edi, byte 2*SIZEOF_JSAMPROW
        dec     ecx                             ; ctr
        jnz     near .rowloop

        femms                           ; empty MMX/3DNow! state

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        mov     esp,ebp                 ; esp <- aligned ebp
        pop     esp                     ; esp <- original ebp
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16