;
; jcqnt3dn.asm - sample data conversion and quantization (3DNow! & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32

; --------------------------------------------------------------------------
; GLOBAL(void)
; jsimd_convsamp_float_3dnow (JSAMPARRAY sample_data, JDIMENSION start_col,
;                             FAST_FLOAT * workspace);
;
; Reads 8 sample bytes from each input row (starting at start_col), removes
; the 0x80 bias so the samples become signed, converts them to floats and
; writes them to the workspace.  Two rows are handled per loop pass.
;
; Arguments are read from the stack through ebp (32-bit code).
;
; Register roles inside the loop:
;   esi = cursor into the row-pointer array (sample_data)
;   eax = start_col (column offset applied to every row)
;   edi = cursor into the float workspace
;   ecx = remaining row pairs (starts at DCTSIZE/2)
;   ebx = pointer to the even row, edx = pointer to the odd row
;   mm7 = 0x80 in every byte
;
; ebp, ebx, esi and edi are saved and restored.  eax, ecx, edx and the
; MMX registers are overwritten; the MMX/3DNow! state is cleared with
; femms before returning.
;
; Lane notation in the comments below: digits 0-7 are the samples of the
; even row, 8-F the samples of the odd row, '.' is a don't-care byte.
; Lanes are listed from least to most significant.
;

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; FAST_FLOAT * workspace

        align   16
        global  EXTN(jsimd_convsamp_float_3dnow)

EXTN(jsimd_convsamp_float_3dnow):
        push    ebp
        mov     ebp, esp
        push    ebx
        push    esi
        push    edi
        ; ecx and edx are used as scratch and are deliberately not saved.

        ; Build the byte bias in mm7: all-ones words, shifted left by 7
        ; (0xFF80 = -128 per word), then packed to bytes with signed
        ; saturation, which leaves 0x80 in each of the eight bytes.
        pcmpeqw  mm7, mm7
        psllw    mm7, 7
        packsswb mm7, mm7

        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
        mov     eax, JDIMENSION [start_col]
        mov     edi, POINTER [workspace]        ; (FAST_FLOAT *)
        mov     ecx, DCTSIZE/2                  ; two rows per pass
        alignx  16, 7
.rowpair_loop:
        ; Fetch the two row pointers for this pass.
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *) even row
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *) odd row

        ; Load one MMWORD of samples from each row.
        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

        ; Remove the bias: unsigned sample -> signed byte.
        psubb   mm0, mm7                ; mm0 = (0 1 2 3 4 5 6 7)
        psubb   mm1, mm7                ; mm1 = (8 9 A B C D E F)

        ; Widen bytes to words.  The sample always lands in the HIGH byte
        ; of its word; whatever sits in the low byte is discarded by the
        ; arithmetic shift further down, so the stale contents of
        ; mm2/mm3 used as the first operand do not matter.
        punpcklbw mm2, mm0              ; mm2 = (.0 .1 .2 .3)
        punpckhbw mm0, mm0              ; mm0 = (.4 .5 .6 .7)
        punpcklbw mm3, mm1              ; mm3 = (.8 .9 .A .B)
        punpckhbw mm1, mm1              ; mm1 = (.C .D .E .F)

        ; Widen words to dwords for the even row; the sample is now the
        ; top byte of each dword.
        punpcklwd mm4, mm2              ; mm4 = (...0 ...1)
        punpckhwd mm2, mm2              ; mm2 = (...2 ...3)
        punpcklwd mm5, mm0              ; mm5 = (...4 ...5)
        punpckhwd mm0, mm0              ; mm0 = (...6 ...7)

        ; The arithmetic right shift brings the top byte down with sign
        ; extension, giving signed 32-bit integers; pi2fd turns each
        ; integer pair into a pair of floats.
        psrad   mm4, (DWORD_BIT-BYTE_BIT)       ; mm4 = int32 (0 1)
        psrad   mm2, (DWORD_BIT-BYTE_BIT)       ; mm2 = int32 (2 3)
        pi2fd   mm4, mm4
        pi2fd   mm2, mm2
        psrad   mm5, (DWORD_BIT-BYTE_BIT)       ; mm5 = int32 (4 5)
        psrad   mm0, (DWORD_BIT-BYTE_BIT)       ; mm0 = int32 (6 7)
        pi2fd   mm5, mm5
        pi2fd   mm0, mm0

        ; Store the even row (row 0 of this pair) as floats.
        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm2
        movq    MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
        movq    MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0

        ; Same widening for the odd row.
        punpcklwd mm6, mm3              ; mm6 = (...8 ...9)
        punpckhwd mm3, mm3              ; mm3 = (...A ...B)
        punpcklwd mm4, mm1              ; mm4 = (...C ...D)
        punpckhwd mm1, mm1              ; mm1 = (...E ...F)

        psrad   mm6, (DWORD_BIT-BYTE_BIT)       ; mm6 = int32 (8 9)
        psrad   mm3, (DWORD_BIT-BYTE_BIT)       ; mm3 = int32 (A B)
        pi2fd   mm6, mm6
        pi2fd   mm3, mm3
        psrad   mm4, (DWORD_BIT-BYTE_BIT)       ; mm4 = int32 (C D)
        psrad   mm1, (DWORD_BIT-BYTE_BIT)       ; mm1 = int32 (E F)
        pi2fd   mm4, mm4
        pi2fd   mm1, mm1

        ; Store the odd row (row 1 of this pair) as floats.
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm6
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm3
        movq    MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1

        ; Advance past the two row pointers and the two output rows.
        add     esi, byte 2*SIZEOF_JSAMPROW
        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
        dec     ecx
        jnz     near .rowpair_loop

        femms                           ; empty MMX/3DNow! state

        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret


; --------------------------------------------------------------------------
; GLOBAL(void)
; jsimd_quantize_float_3dnow (JCOEFPTR coef_block, FAST_FLOAT * divisors,
;                             FAST_FLOAT * workspace);
;
; Multiplies each workspace float by the matching entry of the divisors
; table, rounds the product to an integer and stores it into coef_block.
; Sixteen coefficients are produced per loop pass.
;
; NOTE(review): the table is applied with pfmul, so it presumably holds
; reciprocals of the quantization divisors - confirm against the caller.
;
; Rounding uses the "magic number" trick: after adding 12582912.0F
; (bit pattern 0x4B400000, i.e. (float)0x00C00000) to a product, the code
; keeps only the low 16-bit word of each 32-bit lane as the result.
;
; Arguments are read from the stack through ebp (32-bit code).
;
; Register roles inside the loop:
;   esi = cursor into workspace
;   edx = cursor into divisors
;   edi = cursor into coef_block
;   eax = remaining passes (starts at DCTSIZE2/16)
;   mm7 = rounding constant replicated in both float lanes
;
; ebp, esi and edi are saved and restored; ebx and ecx are never touched.
; eax, edx and the MMX registers are overwritten; the MMX/3DNow! state is
; cleared with femms before returning.
;
; Lane notation in the comments below: "rc" is the 16-bit result for
; row r (0 or 1 within the pass), column c; ".." is a don't-care word.
; Lanes are listed from least to most significant.
;

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; FAST_FLOAT * divisors
%define workspace       ebp+16          ; FAST_FLOAT * workspace

%define RNDINT_MAGIC_BITS 0x4B400000    ; bit pattern of 12582912.0F

        align   16
        global  EXTN(jsimd_quantize_float_3dnow)

EXTN(jsimd_quantize_float_3dnow):
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi
        ; edx is used as scratch and is deliberately not saved.

        ; Replicate the rounding constant into both float lanes of mm7.
        mov     eax, RNDINT_MAGIC_BITS
        movd    mm7, eax
        punpckldq mm7, mm7              ; mm7 = {12582912.0F 12582912.0F}

        mov     esi, POINTER [workspace]
        mov     edx, POINTER [divisors]
        mov     edi, JCOEFPTR [coef_block]
        mov     eax, DCTSIZE2/16        ; 16 coefficients per pass
        alignx  16, 7
.quant_loop:
        ; ---- row 0 of this pass: load, scale ----
        movq    mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        movq    mm1, MMWORD [MMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
        pfmul   mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        pfmul   mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(0,2,esi,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(0,3,esi,SIZEOF_FAST_FLOAT)]
        pfmul   mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
        pfmul   mm3, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]

        ; Add the rounding constant; the wanted integer is then the low
        ; word of every dword lane.
        pfadd   mm0, mm7                ; mm0 = (00 .. 01 ..)
        pfadd   mm1, mm7                ; mm1 = (02 .. 03 ..)
        pfadd   mm2, mm7                ; mm2 = (04 .. 05 ..)
        pfadd   mm3, mm7                ; mm3 = (06 .. 07 ..)

        ; Gather the four low words of each register pair into one
        ; register by two rounds of word interleaving.
        movq    mm4, mm0
        punpcklwd mm0, mm1              ; mm0 = (00 02 .. ..)
        punpckhwd mm4, mm1              ; mm4 = (01 03 .. ..)
        movq    mm5, mm2
        punpcklwd mm2, mm3              ; mm2 = (04 06 .. ..)
        punpckhwd mm5, mm3              ; mm5 = (05 07 .. ..)

        punpcklwd mm0, mm4              ; mm0 = (00 01 02 03)
        punpcklwd mm2, mm5              ; mm2 = (04 05 06 07)

        ; ---- row 1 of this pass: load, scale ----
        movq    mm6, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        movq    mm1, MMWORD [MMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
        pfmul   mm6, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        pfmul   mm1, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(1,2,esi,SIZEOF_FAST_FLOAT)]
        movq    mm4, MMWORD [MMBLOCK(1,3,esi,SIZEOF_FAST_FLOAT)]
        pfmul   mm3, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
        pfmul   mm4, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]

        pfadd   mm6, mm7                ; mm6 = (10 .. 11 ..)
        pfadd   mm1, mm7                ; mm1 = (12 .. 13 ..)
        pfadd   mm3, mm7                ; mm3 = (14 .. 15 ..)
        pfadd   mm4, mm7                ; mm4 = (16 .. 17 ..)

        movq    mm5, mm6
        punpcklwd mm6, mm1              ; mm6 = (10 12 .. ..)
        punpckhwd mm5, mm1              ; mm5 = (11 13 .. ..)
        movq    mm1, mm3
        punpcklwd mm3, mm4              ; mm3 = (14 16 .. ..)
        punpckhwd mm1, mm4              ; mm1 = (15 17 .. ..)

        punpcklwd mm6, mm5              ; mm6 = (10 11 12 13)
        punpcklwd mm3, mm1              ; mm3 = (14 15 16 17)

        ; Write both rows of 16-bit coefficients.
        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm2
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm6
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

        ; Step all three cursors forward by 16 elements.
        add     esi, byte 16*SIZEOF_FAST_FLOAT
        add     edx, byte 16*SIZEOF_FAST_FLOAT
        add     edi, byte 16*SIZEOF_JCOEF
        dec     eax
        jnz     near .quant_loop

        femms                           ; empty MMX/3DNow! state

        pop     edi
        pop     esi
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16