--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libjpeg/simd/jcqnts2i.asm	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,200 @@
+;
+; jcqnts2i.asm - sample data conversion and quantization (SSE2)
+;
+; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+;
+; Based on
+; x86 SIMD extension for IJG JPEG library
+; Copyright (C) 1999-2006, MIYASAKA Masaru.
+; For conditions of distribution and use, see copyright notice in jsimdext.inc
+;
+; This file should be assembled with NASM (Netwide Assembler),
+; can *not* be assembled with Microsoft's MASM or any compatible
+; assembler (including Borland's Turbo Assembler).
+; NASM is available from http://nasm.sourceforge.net/ or
+; http://sourceforge.net/project/showfiles.php?group_id=6208
+;
+; [TAB8]
+
+%include "jsimdext.inc"
+%include "jdct.inc"
+
+; --------------------------------------------------------------------------
+        SECTION SEG_TEXT
+        BITS    32
+;
+; Load data into workspace, applying unsigned->signed conversion
+;
+; GLOBAL(void)
+; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
+;                      DCTELEM * workspace);
+;
+
+%define sample_data     ebp+8           ; JSAMPARRAY sample_data
+%define start_col       ebp+12          ; JDIMENSION start_col
+%define workspace       ebp+16          ; DCTELEM * workspace
+
+        align   16
+        global  EXTN(jsimd_convsamp_sse2)
+
+EXTN(jsimd_convsamp_sse2):
+        push    ebp
+        mov     ebp,esp
+        push    ebx
+;       push    ecx             ; need not be preserved
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        pxor    xmm6,xmm6               ; xmm6=(all 0's)
+        pcmpeqw xmm7,xmm7
+        psllw   xmm7,7                  ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
+
+        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
+        mov     eax, JDIMENSION [start_col]
+        mov     edi, POINTER [workspace]        ; (DCTELEM *)
+        mov     ecx, DCTSIZE/4
+        alignx  16,7
+.convloop:
+        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+
+        movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]       ; xmm0=(01234567)
+        movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]       ; xmm1=(89ABCDEF)
+
+        mov     ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+        mov     edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
+
+        movq    xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]       ; xmm2=(GHIJKLMN)
+        movq    xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]       ; xmm3=(OPQRSTUV)
+
+        punpcklbw xmm0,xmm6             ; xmm0=(01234567)
+        punpcklbw xmm1,xmm6             ; xmm1=(89ABCDEF)
+        paddw   xmm0,xmm7
+        paddw   xmm1,xmm7
+        punpcklbw xmm2,xmm6             ; xmm2=(GHIJKLMN)
+        punpcklbw xmm3,xmm6             ; xmm3=(OPQRSTUV)
+        paddw   xmm2,xmm7
+        paddw   xmm3,xmm7
+
+        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+        add     esi, byte 4*SIZEOF_JSAMPROW
+        add     edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
+        dec     ecx
+        jnz     short .convloop
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; need not be preserved
+        pop     ebx
+        pop     ebp
+        ret
+
+; --------------------------------------------------------------------------
+;
+; Quantize/descale the coefficients, and store into coef_block
+;
+; This implementation is based on an algorithm described in
+; "How to optimize for the Pentium family of microprocessors"
+; (http://www.agner.org/assem/).
+;
+; GLOBAL(void)
+; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
+;                      DCTELEM * workspace);
+;
+
+%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
+%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
+%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
+
+%define coef_block      ebp+8           ; JCOEFPTR coef_block
+%define divisors        ebp+12          ; DCTELEM * divisors
+%define workspace       ebp+16          ; DCTELEM * workspace
+
+        align   16
+        global  EXTN(jsimd_quantize_sse2)
+
+EXTN(jsimd_quantize_sse2):
+        push    ebp
+        mov     ebp,esp
+;       push    ebx             ; unused
+;       push    ecx             ; unused
+;       push    edx             ; need not be preserved
+        push    esi
+        push    edi
+
+        mov     esi, POINTER [workspace]
+        mov     edx, POINTER [divisors]
+        mov     edi, JCOEFPTR [coef_block]
+        mov     eax, DCTSIZE2/32
+        alignx  16,7
+.quantloop:
+        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
+        movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
+        movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
+        movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
+        movdqa  xmm0,xmm4
+        movdqa  xmm1,xmm5
+        movdqa  xmm2,xmm6
+        movdqa  xmm3,xmm7
+        psraw   xmm4,(WORD_BIT-1)
+        psraw   xmm5,(WORD_BIT-1)
+        psraw   xmm6,(WORD_BIT-1)
+        psraw   xmm7,(WORD_BIT-1)
+        pxor    xmm0,xmm4
+        pxor    xmm1,xmm5
+        pxor    xmm2,xmm6
+        pxor    xmm3,xmm7
+        psubw   xmm0,xmm4               ; if (xmm0 < 0) xmm0 = -xmm0;
+        psubw   xmm1,xmm5               ; if (xmm1 < 0) xmm1 = -xmm1;
+        psubw   xmm2,xmm6               ; if (xmm2 < 0) xmm2 = -xmm2;
+        psubw   xmm3,xmm7               ; if (xmm3 < 0) xmm3 = -xmm3;
+
+        paddw   xmm0, XMMWORD [CORRECTION(0,0,edx)]     ; correction + roundfactor
+        paddw   xmm1, XMMWORD [CORRECTION(1,0,edx)]
+        paddw   xmm2, XMMWORD [CORRECTION(2,0,edx)]
+        paddw   xmm3, XMMWORD [CORRECTION(3,0,edx)]
+        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)]     ; reciprocal
+        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
+        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
+        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
+        pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)]          ; scale
+        pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
+        pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
+        pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]
+
+        pxor    xmm0,xmm4
+        pxor    xmm1,xmm5
+        pxor    xmm2,xmm6
+        pxor    xmm3,xmm7
+        psubw   xmm0,xmm4
+        psubw   xmm1,xmm5
+        psubw   xmm2,xmm6
+        psubw   xmm3,xmm7
+        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
+        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
+        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
+        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3
+
+        add     esi, byte 32*SIZEOF_DCTELEM
+        add     edx, byte 32*SIZEOF_DCTELEM
+        add     edi, byte 32*SIZEOF_JCOEF
+        dec     eax
+        jnz     near .quantloop
+
+        pop     edi
+        pop     esi
+;       pop     edx             ; need not be preserved
+;       pop     ecx             ; unused
+;       pop     ebx             ; unused
+        pop     ebp
+        ret
+
+; For some reason, the OS X linker does not honor the request to align the
+; segment unless we do this.
+        align   16
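
For readers cross-checking the SSE2 code above, here is a rough scalar sketch in C of what jsimd_convsamp_sse2 computes. The typedefs and constants are simplified stand-ins for the real definitions in jpeglib.h/jmorecfg.h (8-bit JSAMPLE, DCTSIZE == 8, CENTERJSAMPLE == 128), so treat this as an illustration rather than the library's actual portable fallback (convsamp() in jcdctmgr.c).

    #include <stddef.h>

    typedef unsigned char JSAMPLE;        /* assumes 8-bit samples */
    typedef JSAMPLE *JSAMPROW;
    typedef JSAMPROW *JSAMPARRAY;
    typedef unsigned int JDIMENSION;
    typedef short DCTELEM;                /* 16-bit DCT workspace element */

    #define DCTSIZE       8
    #define CENTERJSAMPLE 128

    /* Copy an 8x8 block of samples into the DCT workspace, converting
     * unsigned 0..255 to signed -128..127. */
    static void convsamp_ref(JSAMPARRAY sample_data, JDIMENSION start_col,
                             DCTELEM *workspace)
    {
      for (int row = 0; row < DCTSIZE; row++) {
        JSAMPROW src = sample_data[row] + start_col;
        for (int col = 0; col < DCTSIZE; col++)
          *workspace++ = (DCTELEM)src[col] - CENTERJSAMPLE;
      }
    }

The SSE2 version gets the same -128 bias by zero-extending the bytes to words with punpcklbw against a zero register and adding the constant 0xFF80 with paddw, processing four rows per iteration of .convloop.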
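Similarly, the quantization loop follows the reciprocal-multiplication scheme from Agner Fog's optimization manual cited in the header comment: take the magnitude of each coefficient, add a correction term, perform two unsigned multiply-high steps (reciprocal, then scale), and restore the sign. Below is a scalar sketch, again with simplified types and under the assumption that the divisors table is laid out exactly as the RECIPROCAL/CORRECTION/SCALE macros in the patch describe (three 64-entry sub-tables back to back), with pmulhuw modelled as an unsigned 16x16->32 multiply keeping the high word.

    typedef short DCTELEM;
    typedef short JCOEF;
    typedef JCOEF *JCOEFPTR;

    #define DCTSIZE2 64

    static void quantize_ref(JCOEFPTR coef_block, const DCTELEM *divisors,
                             const DCTELEM *workspace)
    {
      const unsigned short *reciprocal = (const unsigned short *)divisors;
      const unsigned short *correction = reciprocal + DCTSIZE2;
      const unsigned short *scale      = reciprocal + 2 * DCTSIZE2;

      for (int i = 0; i < DCTSIZE2; i++) {
        DCTELEM x = workspace[i];
        unsigned short sign = (x < 0) ? 0xFFFF : 0;                 /* psraw xmmN,15   */
        unsigned short mag  = (unsigned short)((x ^ sign) - sign);  /* |x|: pxor+psubw */

        mag = (unsigned short)(mag + correction[i]);                        /* paddw   */
        mag = (unsigned short)(((unsigned int)mag * reciprocal[i]) >> 16);  /* pmulhuw */
        mag = (unsigned short)(((unsigned int)mag * scale[i]) >> 16);       /* pmulhuw */

        coef_block[i] = (JCOEF)((mag ^ sign) - sign);               /* restore the sign */
      }
    }

The sign-magnitude trick (psraw/pxor/psubw bracketing the pmulhuw pair) lets the whole block be quantized with unsigned 16-bit multiplies, avoiding division and branches entirely.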