;
; jfmmxint.asm - accurate integer FDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a slow-but-accurate integer implementation of the
; forward DCT (Discrete Cosine Transform). The following code is based
; directly on the IJG's original jfdctint.c; see the jfdctint.c for
; more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Fixed-point configuration.
;
; CONST_BITS : fractional bits carried by the F_x_xxx multipliers.
; PASS1_BITS : extra fractional bits kept in the pass 1 (row) output.
; DESCALE_P1 : right shift applied to 32-bit products in pass 1.
; DESCALE_P2 : right shift applied to 32-bit products in pass 2.

%define CONST_BITS      13
%define PASS1_BITS      2

%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
%define DESCALE_P2      (CONST_BITS+PASS1_BITS)

%if CONST_BITS == 13
F_0_298         equ      2446           ; FIX(0.298631336)
F_0_390         equ      3196           ; FIX(0.390180644)
F_0_541         equ      4433           ; FIX(0.541196100)
F_0_765         equ      6270           ; FIX(0.765366865)
F_0_899         equ      7373           ; FIX(0.899976223)
F_1_175         equ      9633           ; FIX(1.175875602)
F_1_501         equ     12299           ; FIX(1.501321110)
F_1_847         equ     15137           ; FIX(1.847759065)
F_1_961         equ     16069           ; FIX(1.961570560)
F_2_053         equ     16819           ; FIX(2.053119869)
F_2_562         equ     20995           ; FIX(2.562915447)
F_3_072         equ     25172           ; FIX(3.072711026)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; The multipliers are therefore supplied as integers carrying 30 fractional
; bits and rounded down to CONST_BITS fractional bits here.
%define DESCALE(x,n)    (((x)+(1<<((n)-1)))>>(n))
F_0_298         equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
F_0_390         equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
F_0_541         equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
F_0_765         equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
F_0_899         equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
F_1_175         equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
F_1_501         equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
F_1_847         equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
F_1_961         equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
F_2_053         equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
F_2_562         equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
F_3_072         equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
%endif

; --------------------------------------------------------------------------
; Constant table.
;
; Each PW_* entry is a pair of 16-bit multipliers repeated twice so that it
; fills one mmword; pmaddwd against interleaved (a b a b) input words then
; produces two 32-bit sums a*c0 + b*c1.  The PD_*/PW_DESCALE entries are the
; rounding biases added before the corresponding right shift.

        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_fdct_islow_mmx)

EXTN(jconst_fdct_islow_mmx):

PW_F130_F054    times 2 dw  (F_0_541+F_0_765), F_0_541
PW_F054_MF130   times 2 dw  F_0_541, (F_0_541-F_1_847)
PW_MF078_F117   times 2 dw  (F_1_175-F_1_961), F_1_175
PW_F117_F078    times 2 dw  F_1_175, (F_1_175-F_0_390)
PW_MF060_MF089  times 2 dw  (F_0_298-F_0_899),-F_0_899
PW_MF089_F060   times 2 dw -F_0_899, (F_1_501-F_0_899)
PW_MF050_MF256  times 2 dw  (F_2_053-F_2_562),-F_2_562
PW_MF256_F050   times 2 dw -F_2_562, (F_3_072-F_2_562)
PD_DESCALE_P1   times 2 dd  1 << (DESCALE_P1-1)
PD_DESCALE_P2   times 2 dd  1 << (DESCALE_P2-1)
PW_DESCALE_P2X  times 4 dw  1 << (PASS1_BITS-1)

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_islow_mmx (DCTELEM * data)
;
; In:     data  - first stack argument, read at [eax+8] where eax is the
;                 stack pointer right after "push ebp".  The block is
;                 transformed in place.
; Out:    nothing in registers; results are written back through "data".
; Stack:  wk[WK_NUM] mmword scratch slots just below the 8-byte-aligned ebp.
; Uses:   eax, ecx, edx, mm0-mm7 and flags without saving them.  ebx is
;         bracketed by pushpic/poppic and loaded by get_GOT (macros from
;         jsimdext.inc; presumably active only for PIC builds - confirm).
;         emms is executed before returning.
;
; Pass 1 (rows) leaves its output scaled up by 2^PASS1_BITS; pass 2
; (columns) shifts that scaling back out.  Each loop iteration handles four
; lines of the block at once, one per 16-bit lane.
;

%define data(b)         (b)+8           ; DCTELEM * data

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          2

        align   16
        global  EXTN(jsimd_fdct_islow_mmx)

EXTN(jsimd_fdct_islow_mmx):
        push        ebp
        mov         eax, esp                    ; eax -> saved ebp (unaligned frame)
        sub         esp, byte 4                 ; room for the back-link below
        and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
        mov         [esp], eax                  ; back-link to the unaligned frame
        mov         ebp, esp                    ; ebp = aligned ebp
        lea         esp, [wk(0)]                ; reserve the wk[] scratch area
        pushpic     ebx
;       push        ecx                         ; need not be preserved
;       push        edx                         ; need not be preserved
;       push        esi                         ; unused
;       push        edi                         ; unused

        get_GOT     ebx                         ; get GOT address

        ; ---- Pass 1: process rows.

        mov         edx, POINTER [data(eax)]    ; (DCTELEM *)
        mov         ecx, DCTSIZE/4              ; iterations, four rows each
        alignx      16,7
.rowloop:

        movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
        movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
        movq        mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
        movq        mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]

        ; mm0=(20 21 22 23), mm2=(24 25 26 27)
        ; mm1=(30 31 32 33), mm3=(34 35 36 37)

        movq        mm4, mm0                    ; transpose, phase 1 (word interleave)
        punpcklwd   mm0, mm1                    ; mm0=(20 30 21 31)
        punpckhwd   mm4, mm1                    ; mm4=(22 32 23 33)
        movq        mm5, mm2                    ; transpose, phase 1 (word interleave)
        punpcklwd   mm2, mm3                    ; mm2=(24 34 25 35)
        punpckhwd   mm5, mm3                    ; mm5=(26 36 27 37)

        movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
        movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
        movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
        movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]

        ; mm6=(00 01 02 03), mm1=(04 05 06 07)
        ; mm7=(10 11 12 13), mm3=(14 15 16 17)

        ; All eight MMX registers are live: spill two half-transposed rows.
        movq        MMWORD [wk(0)], mm4         ; wk(0)=(22 32 23 33)
        movq        MMWORD [wk(1)], mm2         ; wk(1)=(24 34 25 35)

        movq        mm4, mm6                    ; transpose, phase 1 (word interleave)
        punpcklwd   mm6, mm7                    ; mm6=(00 10 01 11)
        punpckhwd   mm4, mm7                    ; mm4=(02 12 03 13)
        movq        mm2, mm1                    ; transpose, phase 1 (word interleave)
        punpcklwd   mm1, mm3                    ; mm1=(04 14 05 15)
        punpckhwd   mm2, mm3                    ; mm2=(06 16 07 17)

        movq        mm7, mm6                    ; transpose, phase 2 (dword interleave)
        punpckldq   mm6, mm0                    ; mm6=(00 10 20 30)=data0
        punpckhdq   mm7, mm0                    ; mm7=(01 11 21 31)=data1
        movq        mm3, mm2                    ; transpose, phase 2 (dword interleave)
        punpckldq   mm2, mm5                    ; mm2=(06 16 26 36)=data6
        punpckhdq   mm3, mm5                    ; mm3=(07 17 27 37)=data7

        movq        mm0, mm7
        movq        mm5, mm6
        psubw       mm7, mm2                    ; mm7=data1-data6=tmp6
        psubw       mm6, mm3                    ; mm6=data0-data7=tmp7
        paddw       mm0, mm2                    ; mm0=data1+data6=tmp1
        paddw       mm5, mm3                    ; mm5=data0+data7=tmp0

        ; Swap the spilled rows back in and park tmp6/tmp7 for the odd part.
        movq        mm2, MMWORD [wk(0)]         ; mm2=(22 32 23 33)
        movq        mm3, MMWORD [wk(1)]         ; mm3=(24 34 25 35)
        movq        MMWORD [wk(0)], mm7         ; wk(0)=tmp6
        movq        MMWORD [wk(1)], mm6         ; wk(1)=tmp7

        movq        mm7, mm4                    ; transpose, phase 2 (dword interleave)
        punpckldq   mm4, mm2                    ; mm4=(02 12 22 32)=data2
        punpckhdq   mm7, mm2                    ; mm7=(03 13 23 33)=data3
        movq        mm6, mm1                    ; transpose, phase 2 (dword interleave)
        punpckldq   mm1, mm3                    ; mm1=(04 14 24 34)=data4
        punpckhdq   mm6, mm3                    ; mm6=(05 15 25 35)=data5

        movq        mm2, mm7
        movq        mm3, mm4
        paddw       mm7, mm1                    ; mm7=data3+data4=tmp3
        paddw       mm4, mm6                    ; mm4=data2+data5=tmp2
        psubw       mm2, mm1                    ; mm2=data3-data4=tmp4
        psubw       mm3, mm6                    ; mm3=data2-data5=tmp5

        ; -- Even part

        movq        mm1, mm5
        movq        mm6, mm0
        paddw       mm5, mm7                    ; mm5=tmp10
        paddw       mm0, mm4                    ; mm0=tmp11
        psubw       mm1, mm7                    ; mm1=tmp13
        psubw       mm6, mm4                    ; mm6=tmp12

        movq        mm7, mm5
        paddw       mm5, mm0                    ; mm5=tmp10+tmp11
        psubw       mm7, mm0                    ; mm7=tmp10-tmp11

        psllw       mm5, PASS1_BITS             ; mm5=data0
        psllw       mm7, PASS1_BITS             ; mm7=data4

        movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
        movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7

        ; (Original)
        ; z1 = (tmp12 + tmp13) * 0.541196100;
        ; data2 = z1 + tmp13 * 0.765366865;
        ; data6 = z1 + tmp12 * -1.847759065;
        ;
        ; (This implementation)
        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);

        movq        mm4, mm1                    ; mm1=tmp13
        movq        mm0, mm1
        punpcklwd   mm4, mm6                    ; mm6=tmp12
        punpckhwd   mm0, mm6
        movq        mm1, mm4
        movq        mm6, mm0
        pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]     ; mm4=data2L
        pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]     ; mm0=data2H
        pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]    ; mm1=data6L
        pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]    ; mm6=data6H

        ; Round, shift out DESCALE_P1 bits, then saturate back to words.
        paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
        paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P1)]
        psrad       mm4, DESCALE_P1
        psrad       mm0, DESCALE_P1
        paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
        paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P1)]
        psrad       mm1, DESCALE_P1
        psrad       mm6, DESCALE_P1

        packssdw    mm4, mm0                    ; mm4=data2
        packssdw    mm1, mm6                    ; mm1=data6

        movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
        movq        MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1

        ; -- Odd part

        movq        mm5, MMWORD [wk(0)]         ; mm5=tmp6
        movq        mm7, MMWORD [wk(1)]         ; mm7=tmp7

        movq        mm0, mm2                    ; mm2=tmp4
        movq        mm6, mm3                    ; mm3=tmp5
        paddw       mm0, mm5                    ; mm0=z3
        paddw       mm6, mm7                    ; mm6=z4

        ; (Original)
        ; z5 = (z3 + z4) * 1.175875602;
        ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
        ; z3 += z5; z4 += z5;
        ;
        ; (This implementation)
        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);

        movq        mm4, mm0
        movq        mm1, mm0
        punpcklwd   mm4, mm6
        punpckhwd   mm1, mm6
        movq        mm0, mm4
        movq        mm6, mm1
        pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]    ; mm4=z3L
        pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]    ; mm1=z3H
        pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]     ; mm0=z4L
        pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]     ; mm6=z4H

        ; z3 is needed twice (data7 and data3); keep it in the scratch slots.
        movq        MMWORD [wk(0)], mm4         ; wk(0)=z3L
        movq        MMWORD [wk(1)], mm1         ; wk(1)=z3H

        ; (Original)
        ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
        ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
        ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
        ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
        ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
        ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
        ;
        ; (This implementation)
        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
        ; data7 = tmp4 + z3; data5 = tmp5 + z4;
        ; data3 = tmp6 + z3; data1 = tmp7 + z4;

        movq        mm4, mm2
        movq        mm1, mm2
        punpcklwd   mm4, mm7
        punpckhwd   mm1, mm7
        movq        mm2, mm4
        movq        mm7, mm1
        pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]   ; mm4=tmp4L
        pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]   ; mm1=tmp4H
        pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]    ; mm2=tmp7L
        pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]    ; mm7=tmp7H

        paddd       mm4, MMWORD [wk(0)]         ; mm4=data7L
        paddd       mm1, MMWORD [wk(1)]         ; mm1=data7H
        paddd       mm2, mm0                    ; mm2=data1L
        paddd       mm7, mm6                    ; mm7=data1H

        paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P1)]
        paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
        psrad       mm4, DESCALE_P1
        psrad       mm1, DESCALE_P1
        paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P1)]
        paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
        psrad       mm2, DESCALE_P1
        psrad       mm7, DESCALE_P1

        packssdw    mm4, mm1                    ; mm4=data7
        packssdw    mm2, mm7                    ; mm2=data1

        movq        MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
        movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2

        movq        mm1, mm3
        movq        mm7, mm3
        punpcklwd   mm1, mm5
        punpckhwd   mm7, mm5
        movq        mm3, mm1
        movq        mm5, mm7
        pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]   ; mm1=tmp5L
        pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]   ; mm7=tmp5H
        pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]    ; mm3=tmp6L
        pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]    ; mm5=tmp6H

        paddd       mm1, mm0                    ; mm1=data5L
        paddd       mm7, mm6                    ; mm7=data5H
        paddd       mm3, MMWORD [wk(0)]         ; mm3=data3L
        paddd       mm5, MMWORD [wk(1)]         ; mm5=data3H

        paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P1)]
        paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P1)]
        psrad       mm1, DESCALE_P1
        psrad       mm7, DESCALE_P1
        paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P1)]
        paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P1)]
        psrad       mm3, DESCALE_P1
        psrad       mm5, DESCALE_P1

        packssdw    mm1, mm7                    ; mm1=data5
        packssdw    mm3, mm5                    ; mm3=data3

        movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
        movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3

        add         edx, byte 4*DCTSIZE*SIZEOF_DCTELEM  ; advance four rows
        dec         ecx
        jnz         near .rowloop

        ; ---- Pass 2: process columns.

        mov         edx, POINTER [data(eax)]    ; (DCTELEM *)
        mov         ecx, DCTSIZE/4              ; iterations, four columns each
        alignx      16,7
.columnloop:

        movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
        movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
        movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
        movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]

        ; mm0=(02 12 22 32), mm2=(42 52 62 72)
        ; mm1=(03 13 23 33), mm3=(43 53 63 73)

        movq        mm4, mm0                    ; transpose, phase 1 (word interleave)
        punpcklwd   mm0, mm1                    ; mm0=(02 03 12 13)
        punpckhwd   mm4, mm1                    ; mm4=(22 23 32 33)
        movq        mm5, mm2                    ; transpose, phase 1 (word interleave)
        punpcklwd   mm2, mm3                    ; mm2=(42 43 52 53)
        punpckhwd   mm5, mm3                    ; mm5=(62 63 72 73)

        movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
        movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
        movq        mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
        movq        mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]

        ; mm6=(00 10 20 30), mm1=(40 50 60 70)
        ; mm7=(01 11 21 31), mm3=(41 51 61 71)

        ; All eight MMX registers are live: spill two half-transposed rows.
        movq        MMWORD [wk(0)], mm4         ; wk(0)=(22 23 32 33)
        movq        MMWORD [wk(1)], mm2         ; wk(1)=(42 43 52 53)

        movq        mm4, mm6                    ; transpose, phase 1 (word interleave)
        punpcklwd   mm6, mm7                    ; mm6=(00 01 10 11)
        punpckhwd   mm4, mm7                    ; mm4=(20 21 30 31)
        movq        mm2, mm1                    ; transpose, phase 1 (word interleave)
        punpcklwd   mm1, mm3                    ; mm1=(40 41 50 51)
        punpckhwd   mm2, mm3                    ; mm2=(60 61 70 71)

        movq        mm7, mm6                    ; transpose, phase 2 (dword interleave)
        punpckldq   mm6, mm0                    ; mm6=(00 01 02 03)=data0
        punpckhdq   mm7, mm0                    ; mm7=(10 11 12 13)=data1
        movq        mm3, mm2                    ; transpose, phase 2 (dword interleave)
        punpckldq   mm2, mm5                    ; mm2=(60 61 62 63)=data6
        punpckhdq   mm3, mm5                    ; mm3=(70 71 72 73)=data7

        movq        mm0, mm7
        movq        mm5, mm6
        psubw       mm7, mm2                    ; mm7=data1-data6=tmp6
        psubw       mm6, mm3                    ; mm6=data0-data7=tmp7
        paddw       mm0, mm2                    ; mm0=data1+data6=tmp1
        paddw       mm5, mm3                    ; mm5=data0+data7=tmp0

        ; Swap the spilled rows back in and park tmp6/tmp7 for the odd part.
        movq        mm2, MMWORD [wk(0)]         ; mm2=(22 23 32 33)
        movq        mm3, MMWORD [wk(1)]         ; mm3=(42 43 52 53)
        movq        MMWORD [wk(0)], mm7         ; wk(0)=tmp6
        movq        MMWORD [wk(1)], mm6         ; wk(1)=tmp7

        movq        mm7, mm4                    ; transpose, phase 2 (dword interleave)
        punpckldq   mm4, mm2                    ; mm4=(20 21 22 23)=data2
        punpckhdq   mm7, mm2                    ; mm7=(30 31 32 33)=data3
        movq        mm6, mm1                    ; transpose, phase 2 (dword interleave)
        punpckldq   mm1, mm3                    ; mm1=(40 41 42 43)=data4
        punpckhdq   mm6, mm3                    ; mm6=(50 51 52 53)=data5

        movq        mm2, mm7
        movq        mm3, mm4
        paddw       mm7, mm1                    ; mm7=data3+data4=tmp3
        paddw       mm4, mm6                    ; mm4=data2+data5=tmp2
        psubw       mm2, mm1                    ; mm2=data3-data4=tmp4
        psubw       mm3, mm6                    ; mm3=data2-data5=tmp5

        ; -- Even part

        movq        mm1, mm5
        movq        mm6, mm0
        paddw       mm5, mm7                    ; mm5=tmp10
        paddw       mm0, mm4                    ; mm0=tmp11
        psubw       mm1, mm7                    ; mm1=tmp13
        psubw       mm6, mm4                    ; mm6=tmp12

        movq        mm7, mm5
        paddw       mm5, mm0                    ; mm5=tmp10+tmp11
        psubw       mm7, mm0                    ; mm7=tmp10-tmp11

        ; Round and shift out the PASS1_BITS scaling added by pass 1.
        paddw       mm5, [GOTOFF(ebx,PW_DESCALE_P2X)]
        paddw       mm7, [GOTOFF(ebx,PW_DESCALE_P2X)]
        psraw       mm5, PASS1_BITS             ; mm5=data0
        psraw       mm7, PASS1_BITS             ; mm7=data4

        movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
        movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7

        ; (Original)
        ; z1 = (tmp12 + tmp13) * 0.541196100;
        ; data2 = z1 + tmp13 * 0.765366865;
        ; data6 = z1 + tmp12 * -1.847759065;
        ;
        ; (This implementation)
        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);

        movq        mm4, mm1                    ; mm1=tmp13
        movq        mm0, mm1
        punpcklwd   mm4, mm6                    ; mm6=tmp12
        punpckhwd   mm0, mm6
        movq        mm1, mm4
        movq        mm6, mm0
        pmaddwd     mm4, [GOTOFF(ebx,PW_F130_F054)]     ; mm4=data2L
        pmaddwd     mm0, [GOTOFF(ebx,PW_F130_F054)]     ; mm0=data2H
        pmaddwd     mm1, [GOTOFF(ebx,PW_F054_MF130)]    ; mm1=data6L
        pmaddwd     mm6, [GOTOFF(ebx,PW_F054_MF130)]    ; mm6=data6H

        ; Round, shift out DESCALE_P2 bits, then saturate back to words.
        paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
        paddd       mm0, [GOTOFF(ebx,PD_DESCALE_P2)]
        psrad       mm4, DESCALE_P2
        psrad       mm0, DESCALE_P2
        paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
        paddd       mm6, [GOTOFF(ebx,PD_DESCALE_P2)]
        psrad       mm1, DESCALE_P2
        psrad       mm6, DESCALE_P2

        packssdw    mm4, mm0                    ; mm4=data2
        packssdw    mm1, mm6                    ; mm1=data6

        movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
        movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1

        ; -- Odd part

        movq        mm5, MMWORD [wk(0)]         ; mm5=tmp6
        movq        mm7, MMWORD [wk(1)]         ; mm7=tmp7

        movq        mm0, mm2                    ; mm2=tmp4
        movq        mm6, mm3                    ; mm3=tmp5
        paddw       mm0, mm5                    ; mm0=z3
        paddw       mm6, mm7                    ; mm6=z4

        ; (Original)
        ; z5 = (z3 + z4) * 1.175875602;
        ; z3 = z3 * -1.961570560; z4 = z4 * -0.390180644;
        ; z3 += z5; z4 += z5;
        ;
        ; (This implementation)
        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);

        movq        mm4, mm0
        movq        mm1, mm0
        punpcklwd   mm4, mm6
        punpckhwd   mm1, mm6
        movq        mm0, mm4
        movq        mm6, mm1
        pmaddwd     mm4, [GOTOFF(ebx,PW_MF078_F117)]    ; mm4=z3L
        pmaddwd     mm1, [GOTOFF(ebx,PW_MF078_F117)]    ; mm1=z3H
        pmaddwd     mm0, [GOTOFF(ebx,PW_F117_F078)]     ; mm0=z4L
        pmaddwd     mm6, [GOTOFF(ebx,PW_F117_F078)]     ; mm6=z4H

        ; z3 is needed twice (data7 and data3); keep it in the scratch slots.
        movq        MMWORD [wk(0)], mm4         ; wk(0)=z3L
        movq        MMWORD [wk(1)], mm1         ; wk(1)=z3H

        ; (Original)
        ; z1 = tmp4 + tmp7; z2 = tmp5 + tmp6;
        ; tmp4 = tmp4 * 0.298631336; tmp5 = tmp5 * 2.053119869;
        ; tmp6 = tmp6 * 3.072711026; tmp7 = tmp7 * 1.501321110;
        ; z1 = z1 * -0.899976223; z2 = z2 * -2.562915447;
        ; data7 = tmp4 + z1 + z3; data5 = tmp5 + z2 + z4;
        ; data3 = tmp6 + z2 + z3; data1 = tmp7 + z1 + z4;
        ;
        ; (This implementation)
        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
        ; data7 = tmp4 + z3; data5 = tmp5 + z4;
        ; data3 = tmp6 + z3; data1 = tmp7 + z4;

        movq        mm4, mm2
        movq        mm1, mm2
        punpcklwd   mm4, mm7
        punpckhwd   mm1, mm7
        movq        mm2, mm4
        movq        mm7, mm1
        pmaddwd     mm4, [GOTOFF(ebx,PW_MF060_MF089)]   ; mm4=tmp4L
        pmaddwd     mm1, [GOTOFF(ebx,PW_MF060_MF089)]   ; mm1=tmp4H
        pmaddwd     mm2, [GOTOFF(ebx,PW_MF089_F060)]    ; mm2=tmp7L
        pmaddwd     mm7, [GOTOFF(ebx,PW_MF089_F060)]    ; mm7=tmp7H

        paddd       mm4, MMWORD [wk(0)]         ; mm4=data7L
        paddd       mm1, MMWORD [wk(1)]         ; mm1=data7H
        paddd       mm2, mm0                    ; mm2=data1L
        paddd       mm7, mm6                    ; mm7=data1H

        paddd       mm4, [GOTOFF(ebx,PD_DESCALE_P2)]
        paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
        psrad       mm4, DESCALE_P2
        psrad       mm1, DESCALE_P2
        paddd       mm2, [GOTOFF(ebx,PD_DESCALE_P2)]
        paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
        psrad       mm2, DESCALE_P2
        psrad       mm7, DESCALE_P2

        packssdw    mm4, mm1                    ; mm4=data7
        packssdw    mm2, mm7                    ; mm2=data1

        movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
        movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2

        movq        mm1, mm3
        movq        mm7, mm3
        punpcklwd   mm1, mm5
        punpckhwd   mm7, mm5
        movq        mm3, mm1
        movq        mm5, mm7
        pmaddwd     mm1, [GOTOFF(ebx,PW_MF050_MF256)]   ; mm1=tmp5L
        pmaddwd     mm7, [GOTOFF(ebx,PW_MF050_MF256)]   ; mm7=tmp5H
        pmaddwd     mm3, [GOTOFF(ebx,PW_MF256_F050)]    ; mm3=tmp6L
        pmaddwd     mm5, [GOTOFF(ebx,PW_MF256_F050)]    ; mm5=tmp6H

        paddd       mm1, mm0                    ; mm1=data5L
        paddd       mm7, mm6                    ; mm7=data5H
        paddd       mm3, MMWORD [wk(0)]         ; mm3=data3L
        paddd       mm5, MMWORD [wk(1)]         ; mm5=data3H

        paddd       mm1, [GOTOFF(ebx,PD_DESCALE_P2)]
        paddd       mm7, [GOTOFF(ebx,PD_DESCALE_P2)]
        psrad       mm1, DESCALE_P2
        psrad       mm7, DESCALE_P2
        paddd       mm3, [GOTOFF(ebx,PD_DESCALE_P2)]
        paddd       mm5, [GOTOFF(ebx,PD_DESCALE_P2)]
        psrad       mm3, DESCALE_P2
        psrad       mm5, DESCALE_P2

        packssdw    mm1, mm7                    ; mm1=data5
        packssdw    mm3, mm5                    ; mm3=data3

        movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
        movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3

        add         edx, byte 4*SIZEOF_DCTELEM  ; advance four columns
        dec         ecx
        jnz         near .columnloop

        emms                                    ; empty MMX state

;       pop         edi                         ; unused
;       pop         esi                         ; unused
;       pop         edx                         ; need not be preserved
;       pop         ecx                         ; need not be preserved
        poppic      ebx
        mov         esp, ebp                    ; esp <- aligned ebp
        pop         esp                         ; esp <- original ebp
        pop         ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16