;
; jcqnts2f.asm - sample data conversion and quantization (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
;                            FAST_FLOAT * workspace);
;
; Arguments are read from the stack through ebp (32-bit code).
;
; Each loop pass takes two consecutive row pointers from sample_data, reads
; 8 samples from each row starting at column start_col, subtracts 0x80 from
; every byte, sign-extends the result to 32 bits, converts it to single
; precision float, and writes the 2 x 8 floats to workspace.  The loop runs
; DCTSIZE/2 times.
;
; workspace is written with movaps, so it is expected to be 16-byte aligned
; (assuming the XMMBLOCK offsets are multiples of 16 -- the macro lives in
; the include files, confirm there).
;
; Register use:
;   esi = cursor into the row-pointer array     edi = output cursor
;   eax = start_col                             ecx = row pairs remaining
;   ebx / edx = the two row pointers of the current pair
;   xmm7 = 0x80 in every byte                   xmm0-xmm3 = scratch
; Saved/restored: ebp, ebx, esi, edi.
; Modified without saving: eax, ecx, edx, xmm0-xmm3, xmm7, flags.
;

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; FAST_FLOAT * workspace

        align   16
        global  EXTN(jsimd_convsamp_float_sse2)

EXTN(jsimd_convsamp_float_sse2):
        push    ebp
        mov     ebp,esp
        push    ebx
        push    esi
        push    edi
        ; ecx and edx are used below as scratch; they need not be preserved
        ; and are therefore not saved.

        ; Pick up the three arguments.
        mov     edi, POINTER [workspace]        ; (FAST_FLOAT *)
        mov     eax, JDIMENSION [start_col]
        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)

        ; Build the centering constant in a register, no memory load needed:
        ;   all-ones words -> shift left 7 = 0xFF80 (-128 as a signed word)
        ;   -> signed-saturating pack to bytes = 0x80 in every byte.
        pcmpeqw  xmm7,xmm7
        psllw    xmm7,7
        packsswb xmm7,xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)

        mov     ecx, DCTSIZE/2          ; two rows are consumed per pass
        alignx  16,7
.rowpair:
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; first row  (JSAMPLE *)
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; second row (JSAMPLE *)

        ; Read 8 samples from each row; both loads are done before any store.
        movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

        ; Subtracting 0x80 per byte turns the sample into a signed byte.
        psubb   xmm0,xmm7               ; xmm0=(01234567)
        psubb   xmm1,xmm7               ; xmm1=(89ABCDEF)

        ; ---- first row ----
        ; Widen so that every sample ends up in the top byte of a dword.
        ; The lower bytes are don't-care ('*'): for xmm2 they are whatever the
        ; register held before, and the arithmetic right shift discards them
        ; while sign-extending the sample.
        punpcklbw xmm0,xmm0             ; xmm0=(*0*1*2*3*4*5*6*7)
        punpcklwd xmm2,xmm0             ; xmm2=(***0***1***2***3)
        punpckhwd xmm0,xmm0             ; xmm0=(***4***5***6***7)
        psrad     xmm2,(DWORD_BIT-BYTE_BIT)     ; xmm2=(0123) as int32
        psrad     xmm0,(DWORD_BIT-BYTE_BIT)     ; xmm0=(4567) as int32
        cvtdq2ps  xmm2,xmm2             ; xmm2=(0123) as float
        cvtdq2ps  xmm0,xmm0             ; xmm0=(4567) as float
        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0

        ; ---- second row ---- (same steps, using xmm1/xmm3)
        punpcklbw xmm1,xmm1             ; xmm1=(*8*9*A*B*C*D*E*F)
        punpcklwd xmm3,xmm1             ; xmm3=(***8***9***A***B)
        punpckhwd xmm1,xmm1             ; xmm1=(***C***D***E***F)
        psrad     xmm3,(DWORD_BIT-BYTE_BIT)     ; xmm3=(89AB) as int32
        psrad     xmm1,(DWORD_BIT-BYTE_BIT)     ; xmm1=(CDEF) as int32
        cvtdq2ps  xmm3,xmm3             ; xmm3=(89AB) as float
        cvtdq2ps  xmm1,xmm1             ; xmm1=(CDEF) as float
        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1

        ; Step both cursors past the two rows just handled.
        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
        add     esi, byte 2*SIZEOF_JSAMPROW
        dec     ecx                     ; must stay last: jnz tests its ZF
        jnz     short .rowpair

        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret


; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into
; coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
;                            FAST_FLOAT * workspace);
;
; Arguments are read from the stack through ebp (32-bit code).
;
; Each loop pass handles 16 coefficients: 16 floats from workspace are
; multiplied element-wise by the matching 16 floats from divisors, converted
; to 32-bit integers with cvtps2dq (which rounds according to the current
; MXCSR rounding mode), narrowed to 16 bits with signed saturation, and
; stored to coef_block.  The loop runs DCTSIZE2/16 times.
;
; NOTE(review): the table is called "divisors" but is applied with mulps, so
; it presumably holds reciprocals prepared by the caller -- confirm there.
;
; movaps/movdqa and mulps with a memory operand are used on all three
; buffers, so they are expected to be 16-byte aligned (assuming the XMMBLOCK
; offsets are multiples of 16 -- the macro lives in the include files).
;
; Register use:
;   esi = workspace cursor      edx = divisors cursor
;   edi = coef_block cursor     eax = passes remaining
;   xmm0-xmm3 = scratch
; Saved/restored: ebp, esi, edi.
; Modified without saving: eax, edx, xmm0-xmm3, flags.
;

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; FAST_FLOAT * divisors
%define workspace       ebp+16          ; FAST_FLOAT * workspace

        align   16
        global  EXTN(jsimd_quantize_float_sse2)

EXTN(jsimd_quantize_float_sse2):
        push    ebp
        mov     ebp,esp
        push    esi
        push    edi
        ; ebx and ecx are unused here; edx is scratch and need not be
        ; preserved, so none of them is saved.

        ; Pick up the three arguments.
        mov     edi, JCOEFPTR [coef_block]
        mov     edx, POINTER [divisors]
        mov     esi, POINTER [workspace]

        mov     eax, DCTSIZE2/16        ; 16 coefficients per pass
        alignx  16,7
.next16:
        ; Load 16 workspace values (4 registers x 4 floats).
        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]

        ; Scale each value by the matching entry of the divisors table.
        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

        ; float -> int32
        cvtps2dq xmm0,xmm0
        cvtps2dq xmm1,xmm1
        cvtps2dq xmm2,xmm2
        cvtps2dq xmm3,xmm3

        ; int32 -> int16 with signed saturation; each result register now
        ; holds 8 coefficients.
        packssdw xmm0,xmm1
        packssdw xmm2,xmm3

        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2

        ; Step all three cursors past the 16 elements just handled.
        add     edi, byte 16*SIZEOF_JCOEF
        add     edx, byte 16*SIZEOF_FAST_FLOAT
        add     esi, byte 16*SIZEOF_FAST_FLOAT
        dec     eax                     ; must stay last: jnz tests its ZF
        jnz     short .next16

        pop     edi
        pop     esi
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16