;
; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32

; --------------------------------------------------------------------------
; Sample conversion: fetch 8-byte sample rows, re-centre each byte by
; subtracting 0x80, widen to 32-bit signed integers, convert to single
; precision float and write the result into the workspace.
;
; GLOBAL(void)
; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
;                           FAST_FLOAT * workspace);
;
; Arguments are read from the stack through ebp (see the %defines below).
;
; Register roles:
;   esi      cursor into the row-pointer array, advanced two rows per pass
;   eax      start_col (constant for the whole call)
;   edi      output cursor into workspace
;   ecx      passes left; starts at DCTSIZE/2 because each pass does 2 rows
;   ebx/edx  row pointers for the first / second row of the current pass
;   mm7      0x80 replicated into every byte
;
; Saved and restored: ebp, ebx, esi, edi.
; Overwritten:        eax, ecx, edx, mm0-mm7, xmm0-xmm7, flags.
; The MMX state is cleared with emms before returning.
;
; Results are stored with movaps, which requires the destination addresses
; to be 16-byte aligned.
;
; NOTE(review): XMMBLOCK is defined in the included headers, not here; the
; "row, group-of-four" reading used in the comments below is inferred from
; the 2*DCTSIZE*SIZEOF_FAST_FLOAT step applied to edi - confirm against the
; macro definition.
;
; Lane notation in the comments: s0..s7 are the eight samples of the first
; row of a pass, t0..t7 those of the second row; "?" marks stale bits that
; came from the previous contents of the destination register.

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; FAST_FLOAT * workspace

        align   16
        global  EXTN(jsimd_convsamp_float_sse)

EXTN(jsimd_convsamp_float_sse):
        push    ebp
        mov     ebp, esp
        push    ebx
        ; ecx and edx are used as scratch but need not be preserved,
        ; so they are deliberately not pushed.
        push    esi
        push    edi

        ; Build the per-byte bias without touching memory:
        ;   all ones -> 0xFF80 per word -> saturating pack gives 0x80 bytes.
        pcmpeqw  mm7, mm7
        psllw    mm7, 7
        packsswb mm7, mm7               ; mm7 = 0x8080808080808080

        mov     esi, JSAMPARRAY [sample_data]   ; array of row pointers
        mov     eax, JDIMENSION [start_col]
        mov     edi, POINTER [workspace]        ; float output cursor
        mov     ecx, DCTSIZE/2                  ; two rows per pass
        alignx  16,7
.convloop:
        ; Pick up the two row pointers handled by this pass.
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]

        ; Load 8 sample bytes from each row, starting at start_col.
        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]

        ; Re-centre: byte - 0x80 (wraps modulo 256, giving a signed byte).
        psubb   mm0, mm7                ; mm0 = s0 s1 s2 s3 s4 s5 s6 s7
        psubb   mm1, mm7                ; mm1 = t0 t1 t2 t3 t4 t5 t6 t7

        ; Bytes -> words with the sample in the HIGH byte of each word.
        ; The low byte is whatever the destination held (or a duplicate);
        ; it is shifted out further down.
        punpcklbw mm2, mm0              ; mm2 = ?s0 ?s1 ?s2 ?s3
        punpckhbw mm0, mm0              ; mm0 = ?s4 ?s5 ?s6 ?s7
        punpcklbw mm3, mm1              ; mm3 = ?t0 ?t1 ?t2 ?t3
        punpckhbw mm1, mm1              ; mm1 = ?t4 ?t5 ?t6 ?t7

        ; Words -> dwords, sample now in the TOP byte of each dword.
        punpcklwd mm4, mm2              ; mm4 = ???s0 ???s1
        punpckhwd mm2, mm2              ; mm2 = ???s2 ???s3
        punpcklwd mm5, mm0              ; mm5 = ???s4 ???s5
        punpckhwd mm0, mm0              ; mm0 = ???s6 ???s7

        ; Arithmetic shift drops the stale bits and sign-extends the sample;
        ; cvtpi2ps then fills the low two float lanes of the xmm register.
        psrad    mm4, (DWORD_BIT-BYTE_BIT)      ; mm4 = s0 s1   (int32)
        psrad    mm2, (DWORD_BIT-BYTE_BIT)      ; mm2 = s2 s3
        cvtpi2ps xmm0, mm4                      ; xmm0 low  = s0 s1 (float)
        cvtpi2ps xmm1, mm2                      ; xmm1 low  = s2 s3
        psrad    mm5, (DWORD_BIT-BYTE_BIT)      ; mm5 = s4 s5
        psrad    mm0, (DWORD_BIT-BYTE_BIT)      ; mm0 = s6 s7
        cvtpi2ps xmm2, mm5                      ; xmm2 low  = s4 s5
        cvtpi2ps xmm3, mm0                      ; xmm3 low  = s6 s7

        ; Same widening for the second row.  mm4 is free again here because
        ; its value was already consumed by the conversion into xmm0.
        punpcklwd mm6, mm3              ; mm6 = ???t0 ???t1
        punpckhwd mm3, mm3              ; mm3 = ???t2 ???t3
        punpcklwd mm4, mm1              ; mm4 = ???t4 ???t5
        punpckhwd mm1, mm1              ; mm1 = ???t6 ???t7

        psrad    mm6, (DWORD_BIT-BYTE_BIT)      ; mm6 = t0 t1
        psrad    mm3, (DWORD_BIT-BYTE_BIT)      ; mm3 = t2 t3
        cvtpi2ps xmm4, mm6                      ; xmm4 low  = t0 t1
        cvtpi2ps xmm5, mm3                      ; xmm5 low  = t2 t3
        psrad    mm4, (DWORD_BIT-BYTE_BIT)      ; mm4 = t4 t5
        psrad    mm1, (DWORD_BIT-BYTE_BIT)      ; mm1 = t6 t7
        cvtpi2ps xmm6, mm4                      ; xmm6 low  = t4 t5
        cvtpi2ps xmm7, mm1                      ; xmm7 low  = t6 t7

        ; Glue the half-filled registers together into full 4-float vectors.
        movlhps xmm0, xmm1              ; xmm0 = s0 s1 s2 s3
        movlhps xmm2, xmm3              ; xmm2 = s4 s5 s6 s7
        movlhps xmm4, xmm5              ; xmm4 = t0 t1 t2 t3
        movlhps xmm6, xmm7              ; xmm6 = t4 t5 t6 t7

        ; Aligned stores of the two converted rows.
        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6

        ; Step both cursors past the two rows just handled.
        add     esi, byte 2*SIZEOF_JSAMPROW
        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
        dec     ecx
        jnz     near .convloop

        emms                            ; leave the MMX state empty

        pop     edi
        pop     esi
        ; (edx and ecx were never pushed - nothing to restore for them)
        pop     ebx
        pop     ebp
        ret


; --------------------------------------------------------------------------
; Quantization: multiply each workspace value by the matching entry of
; divisors, convert the product to a 32-bit integer, narrow it to 16 bits
; with signed saturation and store it into coef_block.
;
; GLOBAL(void)
; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
;                           FAST_FLOAT * workspace);
;
; Arguments are read from the stack through ebp (see the %defines below).
;
; Register roles:
;   esi      input cursor into workspace
;   edx      input cursor into divisors
;   edi      output cursor into coef_block
;   eax      passes left; starts at DCTSIZE2/16, 16 values per pass
;
; Saved and restored: ebp, esi, edi.
; Overwritten:        eax, edx, mm0-mm7, xmm0-xmm7, flags.
; ebx and ecx are not touched.
; The MMX state is cleared with emms before returning.
;
; workspace and divisors are accessed with movaps / mulps memory operands,
; which require 16-byte aligned addresses.  The float->int conversion
; (cvtps2pi) rounds according to the rounding mode currently set in MXCSR.
;
; NOTE(review): the values are *multiplied* by the divisors table, so the
; table presumably holds reciprocals prepared by the caller - confirm.

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; FAST_FLOAT * divisors
%define workspace       ebp+16          ; FAST_FLOAT * workspace

        align   16
        global  EXTN(jsimd_quantize_float_sse)

EXTN(jsimd_quantize_float_sse):
        push    ebp
        mov     ebp, esp
        ; ebx and ecx are unused and edx need not be preserved,
        ; so only esi and edi are saved.
        push    esi
        push    edi

        mov     esi, POINTER [workspace]
        mov     edx, POINTER [divisors]
        mov     edi, JCOEFPTR [coef_block]
        mov     eax, DCTSIZE2/16                ; 16 coefficients per pass
        alignx  16,7
.quantloop:
        ; Four vectors of four floats each, scaled element-wise.
        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

        ; cvtps2pi only converts the low two lanes, so the upper halves are
        ; first copied down into scratch registers.
        movhlps xmm4, xmm0              ; xmm4 low = lanes 2,3 of xmm0
        movhlps xmm5, xmm1              ; xmm5 low = lanes 2,3 of xmm1

        cvtps2pi mm0, xmm0              ; mm0 = int32 of xmm0 lanes 0,1
        cvtps2pi mm1, xmm1              ; mm1 = int32 of xmm1 lanes 0,1
        cvtps2pi mm4, xmm4              ; mm4 = int32 of xmm0 lanes 2,3
        cvtps2pi mm5, xmm5              ; mm5 = int32 of xmm1 lanes 2,3

        movhlps xmm6, xmm2              ; xmm6 low = lanes 2,3 of xmm2
        movhlps xmm7, xmm3              ; xmm7 low = lanes 2,3 of xmm3

        cvtps2pi mm2, xmm2              ; mm2 = int32 of xmm2 lanes 0,1
        cvtps2pi mm3, xmm3              ; mm3 = int32 of xmm3 lanes 0,1
        cvtps2pi mm6, xmm6              ; mm6 = int32 of xmm2 lanes 2,3
        cvtps2pi mm7, xmm7              ; mm7 = int32 of xmm3 lanes 2,3

        ; Narrow each pair of int32 halves to four saturated int16 values,
        ; restoring the original lane order 0,1,2,3.
        packssdw mm0, mm4
        packssdw mm1, mm5
        packssdw mm2, mm6
        packssdw mm3, mm7

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

        ; Advance all three cursors by the 16 elements just processed.
        add     esi, byte 16*SIZEOF_FAST_FLOAT
        add     edx, byte 16*SIZEOF_FAST_FLOAT
        add     edi, byte 16*SIZEOF_JCOEF
        dec     eax
        jnz     short .quantloop

        emms                            ; leave the MMX state empty

        pop     edi
        pop     esi
        ; (edx, ecx and ebx were never pushed - nothing to restore for them)
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16