1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libjpeg/simd/jcsammmx.asm Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,324 @@ 1.4 +; 1.5 +; jcsammmx.asm - downsampling (MMX) 1.6 +; 1.7 +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB 1.8 +; 1.9 +; Based on 1.10 +; x86 SIMD extension for IJG JPEG library 1.11 +; Copyright (C) 1999-2006, MIYASAKA Masaru. 1.12 +; For conditions of distribution and use, see copyright notice in jsimdext.inc 1.13 +; 1.14 +; This file should be assembled with NASM (Netwide Assembler), 1.15 +; can *not* be assembled with Microsoft's MASM or any compatible 1.16 +; assembler (including Borland's Turbo Assembler). 1.17 +; NASM is available from http://nasm.sourceforge.net/ or 1.18 +; http://sourceforge.net/project/showfiles.php?group_id=6208 1.19 +; 1.20 +; [TAB8] 1.21 + 1.22 +%include "jsimdext.inc" 1.23 + 1.24 +; -------------------------------------------------------------------------- 1.25 + SECTION SEG_TEXT 1.26 + BITS 32 1.27 +; 1.28 +; Downsample pixel values of a single component. 1.29 +; This version handles the common case of 2:1 horizontal and 1:1 vertical, 1.30 +; without smoothing. 
;
; GLOBAL(void)
; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Outline of the routine:
;   1. output_cols = width_blocks * 8.  Returns at once if that is zero.
;   2. expand_right_edge: when output_cols*2 is greater than image_width,
;      each of the first max_v_samp_factor input rows is padded on the right
;      by repeating its last sample (row[image_width-1]) up to
;      output_cols*2 samples.  The input rows are written to by this step.
;   3. h2v1_downsample: for each of v_samp_factor rows, every horizontal
;      pair of input bytes becomes one output byte:
;          out[i] = (in[2*i] + in[2*i+1] + bias) >> 1
;      where bias alternates 0, 1, 0, 1, ... across output columns.
;
; Register usage: esi and edi are saved and restored; eax, ecx and edx are
; modified and not restored; ebx is not touched.  mm0-mm3, mm6 and mm7 are
; written, and emms is executed before returning on the path that used them.
;

; Arguments are addressed relative to the frame pointer set up below
; (b = ebp): +8 is the first argument, each following one 4 bytes higher.
%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_mmx)

EXTN(jsimd_h2v1_downsample_mmx):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
        jz      near .return            ; flags come from the shl: nothing to do
                                        ; when output_cols == 0

        mov     edx, JDIMENSION [img_width(ebp)]

        ; -- expand_right_edge
        ;
        ; ecx becomes the number of padding samples needed per row
        ; (output_cols*2 - image_width).  The original output_cols is kept
        ; on the stack and recovered at .expand_end.

        push    ecx
        shl     ecx,1                   ; output_cols * 2
        sub     ecx,edx
        jle     short .expand_end       ; signed test: no padding required

        mov     eax, INT [max_v_samp(ebp)]
        test    eax,eax
        jle     short .expand_end       ; no rows to pad

        cld                             ; stosb must advance edi upwards
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        alignx  16,7
.expandloop:
        ; eax (row counter) and ecx (pad count) are saved because al is
        ; borrowed for the fill value and rep stosb counts ecx down to zero.
        push    eax
        push    ecx

        mov     edi, JSAMPROW [esi]
        add     edi,edx                 ; edi -> first byte past the image width
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of this row

        rep stosb                       ; replicate it ecx times

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .expandloop

.expand_end:
        pop     ecx                     ; output_cols

        ; -- h2v1_downsample

        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
        test    eax,eax
        jle     near .return            ; signed test; no MMX register has been
                                        ; touched yet, so skipping emms is fine

        ; Build the two loop constants.  edx (image_width) is no longer
        ; needed and is reused as scratch here.
        mov     edx, 0x00010000         ; bias pattern
        movd    mm7,edx
        pcmpeqw mm6,mm6                 ; mm6 = all ones
        punpckldq mm7,mm7               ; mm7={0, 1, 0, 1}
        psrlw   mm6,BYTE_BIT            ; mm6={0xFF 0x00 0xFF 0x00 ..}

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
        alignx  16,7
.rowloop:
        ; Save the column count and both row-pointer cursors; esi/edi are
        ; reloaded with the actual sample pointers for the inner loop.
        push    ecx
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]             ; inptr
        mov     edi, JSAMPROW [edi]             ; outptr
        alignx  16,7
.columnloop:
        ; Each pass reads two MMWORDs of input and writes one MMWORD of
        ; output.  NOTE(review): the exit test below is jnz, so ecx must be an
        ; exact multiple of SIZEOF_MMWORD -- presumably 8, matching the
        ; shift-by-3 above; confirm against jsimdext.inc.

        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]
        movq    mm2,mm0
        movq    mm3,mm1

        pand    mm0,mm6                 ; even-indexed bytes, one per word
        psrlw   mm2,BYTE_BIT            ; odd-indexed bytes, one per word
        pand    mm1,mm6
        psrlw   mm3,BYTE_BIT

        paddw   mm0,mm2                 ; pair sums
        paddw   mm1,mm3
        paddw   mm0,mm7                 ; + alternating 0/1 bias
        paddw   mm1,mm7
        psrlw   mm0,1                   ; / 2
        psrlw   mm1,1

        ; Words -> bytes.  Each word is at most (255+255+1)>>1 = 255, so the
        ; unsigned saturation of packuswb never changes a value here.
        packuswb mm0,mm1

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
        add     edi, byte 1*SIZEOF_MMWORD       ; outptr
        sub     ecx, byte SIZEOF_MMWORD         ; outcol
        jnz     short .columnloop

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte SIZEOF_JSAMPROW       ; input_data
        add     edi, byte SIZEOF_JSAMPROW       ; output_data
        dec     eax                             ; rowctr
        jg      short .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
;       pop     ebx             ; unused
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Outline of the routine:
;   1. output_cols = width_blocks * 8.  Returns at once if that is zero.
;   2. expand_right_edge: when output_cols*2 is greater than image_width,
;      each of the first max_v_samp_factor input rows is padded on the right
;      by repeating its last sample (row[image_width-1]) up to
;      output_cols*2 samples.  The input rows are written to by this step.
;   3. h2v2_downsample: for each of v_samp_factor output rows, two
;      consecutive input rows are read and every 2x2 group of input bytes
;      becomes one output byte:
;          out[i] = (r0[2*i] + r0[2*i+1] + r1[2*i] + r1[2*i+1] + bias) >> 2
;      where bias alternates 1, 2, 1, 2, ... across output columns.
;
; Register usage: esi and edi are saved and restored; eax, ecx and edx are
; modified and not restored; ebx is not touched.  mm0-mm7 are written, and
; emms is executed before returning on the path that used them.
;

; Arguments are addressed relative to the frame pointer set up below
; (b = ebp): +8 is the first argument, each following one 4 bytes higher.
%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_mmx)

EXTN(jsimd_h2v2_downsample_mmx):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
        jz      near .return            ; flags come from the shl: nothing to do
                                        ; when output_cols == 0

        mov     edx, JDIMENSION [img_width(ebp)]

        ; -- expand_right_edge
        ;
        ; ecx becomes the number of padding samples needed per row
        ; (output_cols*2 - image_width).  The original output_cols is kept
        ; on the stack and recovered at .expand_end.

        push    ecx
        shl     ecx,1                   ; output_cols * 2
        sub     ecx,edx
        jle     short .expand_end       ; signed test: no padding required

        mov     eax, INT [max_v_samp(ebp)]
        test    eax,eax
        jle     short .expand_end       ; no rows to pad

        cld                             ; stosb must advance edi upwards
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        alignx  16,7
.expandloop:
        ; eax (row counter) and ecx (pad count) are saved because al is
        ; borrowed for the fill value and rep stosb counts ecx down to zero.
        push    eax
        push    ecx

        mov     edi, JSAMPROW [esi]
        add     edi,edx                 ; edi -> first byte past the image width
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of this row

        rep stosb                       ; replicate it ecx times

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .expandloop

.expand_end:
        pop     ecx                     ; output_cols

        ; -- h2v2_downsample

        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
        test    eax,eax
        jle     near .return            ; signed test; no MMX register has been
                                        ; touched yet, so skipping emms is fine

        ; Build the two loop constants.  edx (image_width) is no longer
        ; needed and is reused as scratch here and as inptr0 in the loop.
        mov     edx, 0x00020001         ; bias pattern
        movd    mm7,edx
        pcmpeqw mm6,mm6                 ; mm6 = all ones
        punpckldq mm7,mm7               ; mm7={1, 2, 1, 2}
        psrlw   mm6,BYTE_BIT            ; mm6={0xFF 0x00 0xFF 0x00 ..}

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
        alignx  16,7
.rowloop:
        ; Save the column count and both row-pointer cursors; edx/esi/edi
        ; are loaded with the actual sample pointers for the inner loop.
        push    ecx
        push    edi
        push    esi

        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1
        mov     edi, JSAMPROW [edi]                     ; outptr
        alignx  16,7
.columnloop:
        ; Each pass reads two MMWORDs from each of the two input rows and
        ; writes one MMWORD of output.  NOTE(review): the exit test below is
        ; jnz, so ecx must be an exact multiple of SIZEOF_MMWORD -- presumably
        ; 8, matching the shift-by-3 above; confirm against jsimdext.inc.

        movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]
        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]
        movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]

        ; First MMWORD of each row: horizontal pair sums, one per word.
        movq    mm4,mm0
        movq    mm5,mm1
        pand    mm0,mm6                 ; even-indexed bytes
        psrlw   mm4,BYTE_BIT            ; odd-indexed bytes
        pand    mm1,mm6
        psrlw   mm5,BYTE_BIT
        paddw   mm0,mm4
        paddw   mm1,mm5

        ; Second MMWORD of each row: same treatment.
        movq    mm4,mm2
        movq    mm5,mm3
        pand    mm2,mm6
        psrlw   mm4,BYTE_BIT
        pand    mm3,mm6
        psrlw   mm5,BYTE_BIT
        paddw   mm2,mm4
        paddw   mm3,mm5

        paddw   mm0,mm1                 ; add the two rows together
        paddw   mm2,mm3
        paddw   mm0,mm7                 ; + alternating 1/2 bias
        paddw   mm2,mm7
        psrlw   mm0,2                   ; / 4
        psrlw   mm2,2

        ; Words -> bytes.  Each word is at most (4*255+2)>>2 = 255, so the
        ; unsigned saturation of packuswb never changes a value here.
        packuswb mm0,mm2

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     edx, byte 2*SIZEOF_MMWORD       ; inptr0
        add     esi, byte 2*SIZEOF_MMWORD       ; inptr1
        add     edi, byte 1*SIZEOF_MMWORD       ; outptr
        sub     ecx, byte SIZEOF_MMWORD         ; outcol
        jnz     near .columnloop

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte 2*SIZEOF_JSAMPROW     ; input_data
        add     edi, byte 1*SIZEOF_JSAMPROW     ; output_data
        dec     eax                             ; rowctr
        jg      near .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
;       pop     ebx             ; unused
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16