;
; jcqntmmx.asm - sample data conversion and quantization (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM * workspace);
;
; ABI:      32-bit cdecl; all arguments on the stack relative to ebp.
; In:       sample_data = array of row pointers (JSAMPROW *)
;           start_col   = starting column offset within each row
;           workspace   = output DCTELEM block (8x8 words)
; Register roles inside the loop:
;           esi = current row-pointer slot, eax = start_col,
;           edi = workspace write cursor, ecx = loop counter
;           (DCTSIZE/4 iterations; 4 rows converted per iteration),
;           ebx/edx = row base pointers,
;           mm6 = all zeros (for byte->word zero-extension),
;           mm7 = eight copies of -128 (the level-shift bias).
; Clobbers: eax, ecx, edx, mm0-mm7, flags; ebx/esi/edi preserved.
;

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; DCTELEM * workspace

        align   16
        global  EXTN(jsimd_convsamp_mmx)

EXTN(jsimd_convsamp_mmx):
        push    ebp
        mov     ebp,esp
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        pxor    mm6,mm6                 ; mm6=(all 0's)
        pcmpeqw mm7,mm7                 ; all 1's ...
        psllw   mm7,7                   ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
                                        ; = four words of -128: adding this
                                        ; recenters unsigned samples [0,255]
                                        ; to signed [-128,127]

        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
        mov     eax, JDIMENSION [start_col]
        mov     edi, POINTER [workspace]        ; (DCTELEM *)
        mov     ecx, DCTSIZE/4          ; process 4 rows per iteration
        alignx  16,7
.convloop:
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; mm0=(01234567)
        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; mm1=(89ABCDEF)

        mov     ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; mm2=(GHIJKLMN)
        movq    mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; mm3=(OPQRSTUV)

        ; unpack against mm6 (zeros) = zero-extend 8 bytes to 8 words
        movq    mm4,mm0
        punpcklbw mm0,mm6               ; mm0=(0123)
        punpckhbw mm4,mm6               ; mm4=(4567)
        movq    mm5,mm1
        punpcklbw mm1,mm6               ; mm1=(89AB)
        punpckhbw mm5,mm6               ; mm5=(CDEF)

        ; level shift: sample - 128 (add the -128 bias words in mm7)
        paddw   mm0,mm7
        paddw   mm4,mm7
        paddw   mm1,mm7
        paddw   mm5,mm7

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5

        ; same conversion for rows 2 and 3 of this group
        movq    mm0,mm2
        punpcklbw mm2,mm6               ; mm2=(GHIJ)
        punpckhbw mm0,mm6               ; mm0=(KLMN)
        movq    mm4,mm3
        punpcklbw mm3,mm6               ; mm3=(OPQR)
        punpckhbw mm4,mm6               ; mm4=(STUV)

        paddw   mm2,mm7
        paddw   mm0,mm7
        paddw   mm3,mm7
        paddw   mm4,mm7

        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4

        add     esi, byte 4*SIZEOF_JSAMPROW         ; advance 4 row pointers
        add     edi, byte 4*DCTSIZE*SIZEOF_DCTELEM  ; advance 4 output rows
        dec     ecx
        jnz     short .convloop

        emms            ; empty MMX state (required before FP/caller code)

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
; "How to optimize for the Pentium family of microprocessors"
; (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM * divisors,
;                     DCTELEM * workspace);
;
; ABI:      32-bit cdecl; all arguments on the stack relative to ebp.
; The divisors table is laid out as four consecutive DCTSIZE2-word planes,
; addressed by the accessor macros below (SHIFT is defined for layout
; completeness but is not referenced in this routine).
; Register roles inside the loop:
;           esi = workspace read cursor, edx = divisors cursor,
;           edi = coef_block write cursor,
;           ah = outer loop counter, al = inner loop counter
;           (two nested counters packed into eax),
;           mm2/mm3 = sign masks of the current 8 coefficients.
; Clobbers: eax, edx, mm0-mm7, flags; ebx/ecx/esi/edi preserved.
;

%define RECIPROCAL(m,n,b) MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b) MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b)      MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
%define SHIFT(m,n,b)      MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; DCTELEM * divisors
%define workspace       ebp+16          ; DCTELEM * workspace

        align   16
        global  EXTN(jsimd_quantize_mmx)

EXTN(jsimd_quantize_mmx):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; unused
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     esi, POINTER [workspace]
        mov     edx, POINTER [divisors]
        mov     edi, JCOEFPTR [coef_block]
        mov     ah, 2                   ; outer counter: 2 half-blocks
        alignx  16,7
.quantloop1:
        mov     al, DCTSIZE2/8/2        ; inner counter: 8 coeffs per pass
        alignx  16,7
.quantloop2:
        movq    mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
        movq    mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]

        movq    mm0,mm2
        movq    mm1,mm3

        psraw   mm2,(WORD_BIT-1)        ; -1 if value < 0, 0 otherwise
        psraw   mm3,(WORD_BIT-1)

        ; absolute value via the sign mask: x^m - m == -x when m==-1,
        ; and == x when m==0 (the mask is reused at the end to restore sign)
        pxor    mm0,mm2                 ; val = -val
        pxor    mm1,mm3
        psubw   mm0,mm2
        psubw   mm1,mm3

        ;
        ; MMX is an annoyingly crappy instruction set. It has two
        ; misfeatures that are causing problems here:
        ;
        ; - All multiplications are signed.
        ;
        ; - The second operand for the shifts is not treated as packed.
        ;
        ;
        ; We work around the first problem by implementing this algorithm:
        ;
        ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
        ; {
        ;   enum { SHORT_BIT = 16 };
        ;   signed short sx = (signed short) x;
        ;   signed short sy = (signed short) y;
        ;   signed long sz;
        ;
        ;   sz = (long) sx * (long) sy;     /* signed multiply */
        ;
        ;   if (sx < 0) sz += (long) sy << SHORT_BIT;
        ;   if (sy < 0) sz += (long) sx << SHORT_BIT;
        ;
        ;   return (unsigned long) sz;
        ; }
        ;
        ; (note that a negative sx adds _sy_ and vice versa)
        ;
        ; For the second problem, we replace the shift by a multiplication.
        ; Unfortunately that means we have to deal with the signed issue again.
        ;

        paddw   mm0, MMWORD [CORRECTION(0,0,edx)]       ; correction + roundfactor
        paddw   mm1, MMWORD [CORRECTION(0,1,edx)]

        movq    mm4,mm0                 ; store current value for later
        movq    mm5,mm1
        pmulhw  mm0, MMWORD [RECIPROCAL(0,0,edx)]       ; reciprocal
        pmulhw  mm1, MMWORD [RECIPROCAL(0,1,edx)]
        paddw   mm0,mm4 ; reciprocal is always negative (MSB=1),
        paddw   mm1,mm5 ; so we always need to add the initial value
                        ; (input value is never negative as we
                        ; inverted it at the start of this routine)

        ; here it gets a bit tricky as both scale
        ; and mm0/mm1 can be negative
        movq    mm6, MMWORD [SCALE(0,0,edx)]    ; scale
        movq    mm7, MMWORD [SCALE(0,1,edx)]
        movq    mm4,mm0
        movq    mm5,mm1
        pmulhw  mm0,mm6
        pmulhw  mm1,mm7

        psraw   mm6,(WORD_BIT-1)        ; determine if scale is negative
        psraw   mm7,(WORD_BIT-1)

        pand    mm6,mm4                 ; and add input if it is
        pand    mm7,mm5
        paddw   mm0,mm6
        paddw   mm1,mm7

        psraw   mm4,(WORD_BIT-1)        ; then check if negative input
        psraw   mm5,(WORD_BIT-1)

        pand    mm4, MMWORD [SCALE(0,0,edx)]    ; and add scale if it is
        pand    mm5, MMWORD [SCALE(0,1,edx)]
        paddw   mm0,mm4
        paddw   mm1,mm5

        ; restore the original sign using the masks saved in mm2/mm3
        pxor    mm0,mm2                 ; val = -val
        pxor    mm1,mm3
        psubw   mm0,mm2
        psubw   mm1,mm3

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1

        add     esi, byte 8*SIZEOF_DCTELEM
        add     edx, byte 8*SIZEOF_DCTELEM
        add     edi, byte 8*SIZEOF_JCOEF
        dec     al
        jnz     near .quantloop2
        dec     ah
        jnz     near .quantloop1        ; to avoid branch misprediction

        emms            ; empty MMX state (required before FP/caller code)

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; unused
;       pop     ebx             ; unused
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16