;
; jfmmxfst.asm - fast integer FDCT (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a fast, not so accurate integer implementation of
; the forward DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
; for more details.
;
; [TAB8]
;
; NOTE(review): this text was restored from a unified-diff rendering
; (new file b/media/libjpeg/simd/jfmmxfst.asm, hunk @@ -0,0 +1,397 @@).
; The per-statement diff markers were removed and every statement was put
; back on its own line so that NASM does not swallow code as comment text.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Fixed-point multiplier constants.
;
; F_x_yyy is the DCT constant x.yyy scaled by 2^CONST_BITS and rounded.
; For CONST_BITS == 8 the values are spelled out; for any other setting they
; are derived from 30-bit fixed-point literals with DESCALE (round, then
; shift right).

%define CONST_BITS      8       ; 14 is also OK.

%if CONST_BITS == 8
F_0_382         equ      98             ; FIX(0.382683433)
F_0_541         equ     139             ; FIX(0.541196100)
F_0_707         equ     181             ; FIX(0.707106781)
F_1_306         equ     334             ; FIX(1.306562965)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
F_0_382         equ     DESCALE( 410903207,30-CONST_BITS)       ; FIX(0.382683433)
F_0_541         equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
F_0_707         equ     DESCALE( 759250124,30-CONST_BITS)       ; FIX(0.707106781)
F_1_306         equ     DESCALE(1402911301,30-CONST_BITS)       ; FIX(1.306562965)
%endif

; --------------------------------------------------------------------------
        SECTION SEG_CONST

; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
;
; pmulhw keeps the high 16 bits of each signed 16x16-bit product, so with
; the operand pre-shifted left by PRE_MULTIPLY_SCALE_BITS and the constant
; stored shifted left by CONST_SHIFT:
;   ((x << PRE_MULTIPLY_SCALE_BITS) * (F << CONST_SHIFT)) >> 16
;       == (x * F) >> CONST_BITS
; which is the multiplication by the real-valued constant.

%define PRE_MULTIPLY_SCALE_BITS   2
%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)

        alignz  16
        global  EXTN(jconst_fdct_ifast_mmx)

EXTN(jconst_fdct_ifast_mmx):

; Each constant is replicated into the four word lanes of one mmword.
PW_F0707        times 4 dw  F_0_707 << CONST_SHIFT
PW_F0382        times 4 dw  F_0_382 << CONST_SHIFT
PW_F0541        times 4 dw  F_0_541 << CONST_SHIFT
PW_F1306        times 4 dw  F_1_306 << CONST_SHIFT

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform the forward DCT on one block of samples.
;
; GLOBAL(void)
; jsimd_fdct_ifast_mmx (DCTELEM * data)
;
; Argument:
;   data   pointer to the coefficient block; read from the stack at
;          [eax+8], where eax is esp right after "push ebp" (saved ebp at
;          +0, return address at +4, first argument at +8).
;          The block is transformed in place: every MMBLOCK location that
;          is loaded in a pass is stored back in that pass.
;
; Registers:
;   eax       anchor for the caller's frame (never modified after setup)
;   ebx       GOT base for GOTOFF() constant addressing; wrapped in
;             pushpic/poppic
;   ecx       loop counter (DCTSIZE/4 iterations per pass)
;   edx       cursor into the block
;   ebp       aligned frame pointer; wk(0)/wk(1) live just below it
;   mm0-mm7   working registers; emms is executed before returning
;
; NOTE(review): EXTN, MMBLOCK, GOTOFF, get_GOT, pushpic/poppic, alignx,
; alignz, POINTER, MMWORD and the SIZEOF_*/DCTSIZE symbols are defined in
; the included files, not here; confirm their exact expansion there.
; The lane comments below such as (20 21 22 23) are element indices of the
; block; judging from them, MMBLOCK(r,c,...) presumably addresses the c-th
; 4-word group of row r.
;

%define data(b)         (b)+8           ; DCTELEM * data

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          2

        align   16
        global  EXTN(jsimd_fdct_ifast_mmx)

EXTN(jsimd_fdct_ifast_mmx):
        ; Build a frame whose ebp is a multiple of SIZEOF_MMWORD so that the
        ; wk[] scratch slots are aligned.  The unaligned anchor (eax) is
        ; saved at [ebp] and is what the epilogue pops back into esp.
        push    ebp
        mov     eax,esp                         ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp],eax
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [wk(0)]                    ; reserve wk[0..WK_NUM-1]
        pushpic ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
;       push    esi             ; unused
;       push    edi             ; unused

        get_GOT ebx             ; get GOT address

        ; ---- Pass 1: process rows.
        ;
        ; Each iteration loads four rows, transposes them as 4x4 word tiles
        ; so that every register holds one dataN term for four rows at
        ; once, runs the 1-D DCT on those registers, and stores the results
        ; back still in transposed-tile form.  edx then advances by four
        ; rows.

        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
        mov     ecx, DCTSIZE/4
        alignx  16,7
.rowloop:

        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
        movq    mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
        movq    mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]

        ; mm0=(20 21 22 23), mm2=(24 25 26 27)
        ; mm1=(30 31 32 33), mm3=(34 35 36 37)

        movq      mm4,mm0               ; transpose coefficients(phase 1)
        punpcklwd mm0,mm1               ; mm0=(20 30 21 31)
        punpckhwd mm4,mm1               ; mm4=(22 32 23 33)
        movq      mm5,mm2               ; transpose coefficients(phase 1)
        punpcklwd mm2,mm3               ; mm2=(24 34 25 35)
        punpckhwd mm5,mm3               ; mm5=(26 36 27 37)

        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
        movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
        movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]

        ; mm6=(00 01 02 03), mm1=(04 05 06 07)
        ; mm7=(10 11 12 13), mm3=(14 15 16 17)

        ; All eight MMX registers are live; spill two half-transposed rows.
        movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
        movq    MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)

        movq      mm4,mm6               ; transpose coefficients(phase 1)
        punpcklwd mm6,mm7               ; mm6=(00 10 01 11)
        punpckhwd mm4,mm7               ; mm4=(02 12 03 13)
        movq      mm2,mm1               ; transpose coefficients(phase 1)
        punpcklwd mm1,mm3               ; mm1=(04 14 05 15)
        punpckhwd mm2,mm3               ; mm2=(06 16 07 17)

        movq      mm7,mm6               ; transpose coefficients(phase 2)
        punpckldq mm6,mm0               ; mm6=(00 10 20 30)=data0
        punpckhdq mm7,mm0               ; mm7=(01 11 21 31)=data1
        movq      mm3,mm2               ; transpose coefficients(phase 2)
        punpckldq mm2,mm5               ; mm2=(06 16 26 36)=data6
        punpckhdq mm3,mm5               ; mm3=(07 17 27 37)=data7

        movq    mm0,mm7
        movq    mm5,mm6
        psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
        psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
        paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
        paddw   mm5,mm3                 ; mm5=data0+data7=tmp0

        ; Swap: reload the spilled transpose halves, park tmp6/tmp7 in wk[]
        ; until the odd part needs them.
        movq    mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
        movq    mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
        movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7

        movq      mm7,mm4               ; transpose coefficients(phase 2)
        punpckldq mm4,mm2               ; mm4=(02 12 22 32)=data2
        punpckhdq mm7,mm2               ; mm7=(03 13 23 33)=data3
        movq      mm6,mm1               ; transpose coefficients(phase 2)
        punpckldq mm1,mm3               ; mm1=(04 14 24 34)=data4
        punpckhdq mm6,mm3               ; mm6=(05 15 25 35)=data5

        movq    mm2,mm7
        movq    mm3,mm4
        paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
        paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
        psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
        psubw   mm3,mm6                 ; mm3=data2-data5=tmp5

        ; -- Even part

        movq    mm1,mm5
        movq    mm6,mm0
        psubw   mm5,mm7                 ; mm5=tmp13
        psubw   mm0,mm4                 ; mm0=tmp12
        paddw   mm1,mm7                 ; mm1=tmp10
        paddw   mm6,mm4                 ; mm6=tmp11

        paddw   mm0,mm5                 ; mm0=tmp12+tmp13
        psllw   mm0,PRE_MULTIPLY_SCALE_BITS
        pmulhw  mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1

        movq    mm7,mm1
        movq    mm4,mm5
        psubw   mm1,mm6                 ; mm1=data4
        psubw   mm5,mm0                 ; mm5=data6
        paddw   mm7,mm6                 ; mm7=data0
        paddw   mm4,mm0                 ; mm4=data2

        movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
        movq    MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4

        ; -- Odd part

        movq    mm6, MMWORD [wk(0)]     ; mm6=tmp6
        movq    mm0, MMWORD [wk(1)]     ; mm0=tmp7

        paddw   mm2,mm3                 ; mm2=tmp10
        paddw   mm3,mm6                 ; mm3=tmp11
        paddw   mm6,mm0                 ; mm6=tmp12, mm0=tmp7

        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
        psllw   mm6,PRE_MULTIPLY_SCALE_BITS

        psllw   mm3,PRE_MULTIPLY_SCALE_BITS
        pmulhw  mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3

        movq    mm1,mm2                 ; mm1=tmp10
        psubw   mm2,mm6                 ; mm2=tmp10-tmp12 (both pre-shifted)
        pmulhw  mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
        pmulhw  mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
        pmulhw  mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
        paddw   mm1,mm2                 ; mm1=z2
        paddw   mm6,mm2                 ; mm6=z4

        movq    mm5,mm0
        psubw   mm0,mm3                 ; mm0=z13
        paddw   mm5,mm3                 ; mm5=z11

        movq    mm7,mm0
        movq    mm4,mm5
        psubw   mm0,mm1                 ; mm0=data3
        psubw   mm5,mm6                 ; mm5=data7
        paddw   mm7,mm1                 ; mm7=data5
        paddw   mm4,mm6                 ; mm4=data1

        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
        movq    MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
        movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4

        add     edx, byte 4*DCTSIZE*SIZEOF_DCTELEM      ; next four rows
        dec     ecx
        jnz     near .rowloop

        ; ---- Pass 2: process columns.
        ;
        ; Same arithmetic as pass 1.  eax still holds the frame anchor, so
        ; the data pointer is re-read from the argument slot.  Each
        ; iteration touches the first mmword of all eight rows and then
        ; advances edx by four elements.

        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
        mov     ecx, DCTSIZE/4
        alignx  16,7
.columnloop:

        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
        movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
        movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]

        ; mm0=(02 12 22 32), mm2=(42 52 62 72)
        ; mm1=(03 13 23 33), mm3=(43 53 63 73)

        movq      mm4,mm0               ; transpose coefficients(phase 1)
        punpcklwd mm0,mm1               ; mm0=(02 03 12 13)
        punpckhwd mm4,mm1               ; mm4=(22 23 32 33)
        movq      mm5,mm2               ; transpose coefficients(phase 1)
        punpcklwd mm2,mm3               ; mm2=(42 43 52 53)
        punpckhwd mm5,mm3               ; mm5=(62 63 72 73)

        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
        movq    mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
        movq    mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]

        ; mm6=(00 10 20 30), mm1=(40 50 60 70)
        ; mm7=(01 11 21 31), mm3=(41 51 61 71)

        ; Spill two half-transposed rows; all MMX registers are in use.
        movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
        movq    MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)

        movq      mm4,mm6               ; transpose coefficients(phase 1)
        punpcklwd mm6,mm7               ; mm6=(00 01 10 11)
        punpckhwd mm4,mm7               ; mm4=(20 21 30 31)
        movq      mm2,mm1               ; transpose coefficients(phase 1)
        punpcklwd mm1,mm3               ; mm1=(40 41 50 51)
        punpckhwd mm2,mm3               ; mm2=(60 61 70 71)

        movq      mm7,mm6               ; transpose coefficients(phase 2)
        punpckldq mm6,mm0               ; mm6=(00 01 02 03)=data0
        punpckhdq mm7,mm0               ; mm7=(10 11 12 13)=data1
        movq      mm3,mm2               ; transpose coefficients(phase 2)
        punpckldq mm2,mm5               ; mm2=(60 61 62 63)=data6
        punpckhdq mm3,mm5               ; mm3=(70 71 72 73)=data7

        movq    mm0,mm7
        movq    mm5,mm6
        psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
        psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
        paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
        paddw   mm5,mm3                 ; mm5=data0+data7=tmp0

        ; Swap: reload the spilled transpose halves, park tmp6/tmp7 in wk[].
        movq    mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
        movq    mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
        movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7

        movq      mm7,mm4               ; transpose coefficients(phase 2)
        punpckldq mm4,mm2               ; mm4=(20 21 22 23)=data2
        punpckhdq mm7,mm2               ; mm7=(30 31 32 33)=data3
        movq      mm6,mm1               ; transpose coefficients(phase 2)
        punpckldq mm1,mm3               ; mm1=(40 41 42 43)=data4
        punpckhdq mm6,mm3               ; mm6=(50 51 52 53)=data5

        movq    mm2,mm7
        movq    mm3,mm4
        paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
        paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
        psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
        psubw   mm3,mm6                 ; mm3=data2-data5=tmp5

        ; -- Even part

        movq    mm1,mm5
        movq    mm6,mm0
        psubw   mm5,mm7                 ; mm5=tmp13
        psubw   mm0,mm4                 ; mm0=tmp12
        paddw   mm1,mm7                 ; mm1=tmp10
        paddw   mm6,mm4                 ; mm6=tmp11

        paddw   mm0,mm5                 ; mm0=tmp12+tmp13
        psllw   mm0,PRE_MULTIPLY_SCALE_BITS
        pmulhw  mm0,[GOTOFF(ebx,PW_F0707)] ; mm0=z1

        movq    mm7,mm1
        movq    mm4,mm5
        psubw   mm1,mm6                 ; mm1=data4
        psubw   mm5,mm0                 ; mm5=data6
        paddw   mm7,mm6                 ; mm7=data0
        paddw   mm4,mm0                 ; mm4=data2

        movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
        movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4

        ; -- Odd part

        movq    mm6, MMWORD [wk(0)]     ; mm6=tmp6
        movq    mm0, MMWORD [wk(1)]     ; mm0=tmp7

        paddw   mm2,mm3                 ; mm2=tmp10
        paddw   mm3,mm6                 ; mm3=tmp11
        paddw   mm6,mm0                 ; mm6=tmp12, mm0=tmp7

        psllw   mm2,PRE_MULTIPLY_SCALE_BITS
        psllw   mm6,PRE_MULTIPLY_SCALE_BITS

        psllw   mm3,PRE_MULTIPLY_SCALE_BITS
        pmulhw  mm3,[GOTOFF(ebx,PW_F0707)] ; mm3=z3

        movq    mm1,mm2                 ; mm1=tmp10
        psubw   mm2,mm6                 ; mm2=tmp10-tmp12 (both pre-shifted)
        pmulhw  mm2,[GOTOFF(ebx,PW_F0382)] ; mm2=z5
        pmulhw  mm1,[GOTOFF(ebx,PW_F0541)] ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
        pmulhw  mm6,[GOTOFF(ebx,PW_F1306)] ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
        paddw   mm1,mm2                 ; mm1=z2
        paddw   mm6,mm2                 ; mm6=z4

        movq    mm5,mm0
        psubw   mm0,mm3                 ; mm0=z13
        paddw   mm5,mm3                 ; mm5=z11

        movq    mm7,mm0
        movq    mm4,mm5
        psubw   mm0,mm1                 ; mm0=data3
        psubw   mm5,mm6                 ; mm5=data7
        paddw   mm7,mm1                 ; mm7=data5
        paddw   mm4,mm6                 ; mm4=data1

        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
        movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
        movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4

        add     edx, byte 4*SIZEOF_DCTELEM      ; next four elements
        dec     ecx
        jnz     near .columnloop

        emms            ; empty MMX state

        ; Tear the frame down in the reverse order of the prologue:
        ; [ebp] holds the anchor saved there by "mov [esp],eax".
;       pop     edi             ; unused
;       pop     esi             ; unused
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        poppic  ebx
        mov     esp,ebp         ; esp <- aligned ebp
        pop     esp             ; esp <- original ebp
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16