;
; jcsamss2.asm - downsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Downsample one component: 2:1 horizontally, 1:1 vertically, no smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Arguments are read from the stack through ebp (see the %defines below).
;
; Phase 1 (right-edge padding): with output_cols = width_blocks * 8, every one
;   of the max_v_samp_factor input rows has the bytes from image_width up to
;   output_cols * 2 filled with a copy of its last real sample.  Skipped when
;   there is nothing to fill or max_v_samp_factor <= 0.
; Phase 2 (averaging): for each of the v_samp_factor rows, every output sample
;   is (left + right + bias) >> 1, the bias alternating 0,1,0,1,... across
;   output columns.
;
; Registers: esi, edi and ebp are saved and restored; ebx is never touched;
;   eax, ecx, edx, xmm0-xmm3, xmm6 and xmm7 are overwritten.
; All sample loads and stores use movdqa, so the row pointers have to be
;   16-byte aligned.  When fewer than SIZEOF_XMMWORD output columns remain,
;   a whole XMMWORD is still stored to the output row.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
        push    ebp
        mov     ebp, esp
        ; ebx is unused, and ecx/edx need not be preserved, so only esi/edi
        ; are saved here.
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols (width_blocks * DCTSIZE)
        jz      near .done              ; zero columns: nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]

        ; ---- phase 1: pad the right edge of the input rows

        push    ecx                     ; remember output_cols for phase 2
        shl     ecx, 1                  ; input columns needed = output_cols * 2
        sub     ecx, edx                ; ecx = number of bytes to fill per row
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax, eax
        jle     short .pad_done

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> current row pointer
        cld                             ; stosb must walk upwards
        alignx  16,7
.pad_row:
        push    eax                     ; al is borrowed for the fill value
        push    ecx                     ; rep stosb counts ecx down to zero

        mov     edi, JSAMPROW [esi]
        add     edi, edx                ; edi -> first byte past the real samples
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of this row

        rep stosb

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols again

        ; ---- phase 2: average horizontal pairs

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = rows left to produce
        test    eax, eax
        jle     near .done

        pcmpeqw xmm6, xmm6
        psrlw   xmm6, BYTE_BIT          ; xmm6 = {0xFF 0x00 0xFF 0x00 ..}
        mov     edx, 0x00010000         ; rounding bias, one dword's worth
        movd    xmm7, edx
        pshufd  xmm7, xmm7, 0x00        ; xmm7 = {0, 1, 0, 1, 0, 1, 0, 1}

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input row pointer
        mov     edi, JSAMPARRAY [output_data(ebp)]      ; edi -> output row pointer
        alignx  16,7
.next_row:
        push    ecx                     ; the column loop consumes all three
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]     ; esi = read cursor in the input row
        mov     edi, JSAMPROW [edi]     ; edi = write cursor in the output row

        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .full_pair
        alignx  16,7

.tail_half:
        ; Fewer than SIZEOF_XMMWORD output columns remain: read a single
        ; XMMWORD of input and substitute zeros for the second one.
        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
        pxor    xmm1, xmm1
        mov     ecx, SIZEOF_XMMWORD     ; makes the subtraction below end the row
        jmp     short .average
        alignx  16,7

.full_pair:
        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]

.average:
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm1

        psrlw   xmm2, BYTE_BIT          ; xmm2/xmm3 = high byte of each word
        pand    xmm0, xmm6              ; xmm0/xmm1 = low byte of each word
        psrlw   xmm3, BYTE_BIT
        pand    xmm1, xmm6

        paddw   xmm0, xmm2              ; pair sums, one per word
        paddw   xmm1, xmm3
        paddw   xmm0, xmm7              ; + alternating 0/1 bias
        paddw   xmm1, xmm7
        psrlw   xmm0, 1                 ; / 2
        psrlw   xmm1, 1

        packuswb xmm0, xmm1             ; words back to bytes

        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

        add     esi, byte 2*SIZEOF_XMMWORD      ; advance input cursor
        add     edi, byte 1*SIZEOF_XMMWORD      ; advance output cursor
        sub     ecx, byte SIZEOF_XMMWORD        ; output columns still to do
        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .full_pair
        test    ecx, ecx
        jnz     short .tail_half

        pop     esi
        pop     edi
        pop     ecx

        add     edi, byte SIZEOF_JSAMPROW       ; next output row pointer
        add     esi, byte SIZEOF_JSAMPROW       ; next input row pointer
        dec     eax                             ; one row fewer to go
        jg      near .next_row

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Downsample one component: 2:1 horizontally and 2:1 vertically, no smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Arguments are read from the stack through ebp (see the %defines below).
;
; Phase 1 (right-edge padding): with output_cols = width_blocks * 8, every one
;   of the max_v_samp_factor input rows has the bytes from image_width up to
;   output_cols * 2 filled with a copy of its last real sample.  Skipped when
;   there is nothing to fill or max_v_samp_factor <= 0.
; Phase 2 (averaging): each of the v_samp_factor output rows is built from two
;   consecutive input rows; every output sample is the sum of a 2x2 group of
;   input samples plus a bias, shifted right by 2.  The bias alternates
;   1,2,1,2,... across output columns.
;
; Registers: esi, edi and ebp are saved and restored; ebx is never touched;
;   eax, ecx, edx and xmm0-xmm7 are overwritten.
; All sample loads and stores use movdqa, so the row pointers have to be
;   16-byte aligned.  When fewer than SIZEOF_XMMWORD output columns remain,
;   a whole XMMWORD is still stored to the output row.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
        push    ebp
        mov     ebp, esp
        ; ebx is unused, and ecx/edx need not be preserved, so only esi/edi
        ; are saved here.
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols (width_blocks * DCTSIZE)
        jz      near .done              ; zero columns: nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]

        ; ---- phase 1: pad the right edge of the input rows

        push    ecx                     ; remember output_cols for phase 2
        shl     ecx, 1                  ; input columns needed = output_cols * 2
        sub     ecx, edx                ; ecx = number of bytes to fill per row
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax, eax
        jle     short .pad_done

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> current row pointer
        cld                             ; stosb must walk upwards
        alignx  16,7
.pad_row:
        push    eax                     ; al is borrowed for the fill value
        push    ecx                     ; rep stosb counts ecx down to zero

        mov     edi, JSAMPROW [esi]
        add     edi, edx                ; edi -> first byte past the real samples
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of this row

        rep stosb

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols again

        ; ---- phase 2: average 2x2 groups

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = output rows left
        test    eax, eax
        jle     near .done

        pcmpeqw xmm6, xmm6
        psrlw   xmm6, BYTE_BIT          ; xmm6 = {0xFF 0x00 0xFF 0x00 ..}
        mov     edx, 0x00020001         ; rounding bias, one dword's worth
        movd    xmm7, edx
        pshufd  xmm7, xmm7, 0x00        ; xmm7 = {1, 2, 1, 2, 1, 2, 1, 2}

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input row pointers
        mov     edi, JSAMPARRAY [output_data(ebp)]      ; edi -> output row pointer
        alignx  16,7
.next_row:
        push    ecx                     ; the column loop consumes all three
        push    edi
        push    esi

        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; edx = cursor in upper input row
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = cursor in lower input row
        mov     edi, JSAMPROW [edi]                     ; edi = cursor in output row

        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .full_pair
        alignx  16,7

.tail_half:
        ; Fewer than SIZEOF_XMMWORD output columns remain: read a single
        ; XMMWORD from each input row and substitute zeros for the second.
        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        mov     ecx, SIZEOF_XMMWORD     ; makes the subtraction below end the row
        jmp     short .average
        alignx  16,7

.full_pair:
        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]

.average:
        ; Horizontal pair sums for the first XMMWORD of each row.
        movdqa  xmm4, xmm0
        movdqa  xmm5, xmm1
        psrlw   xmm4, BYTE_BIT          ; high byte of each word
        pand    xmm0, xmm6              ; low byte of each word
        psrlw   xmm5, BYTE_BIT
        pand    xmm1, xmm6
        paddw   xmm0, xmm4
        paddw   xmm1, xmm5

        ; Same again for the second XMMWORD of each row.
        movdqa  xmm4, xmm2
        movdqa  xmm5, xmm3
        psrlw   xmm4, BYTE_BIT
        pand    xmm2, xmm6
        psrlw   xmm5, BYTE_BIT
        pand    xmm3, xmm6
        paddw   xmm2, xmm4
        paddw   xmm3, xmm5

        paddw   xmm0, xmm1              ; upper row + lower row
        paddw   xmm2, xmm3
        paddw   xmm0, xmm7              ; + alternating 1/2 bias
        paddw   xmm2, xmm7
        psrlw   xmm0, 2                 ; / 4
        psrlw   xmm2, 2

        packuswb xmm0, xmm2             ; words back to bytes

        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

        add     edx, byte 2*SIZEOF_XMMWORD      ; advance upper-row cursor
        add     esi, byte 2*SIZEOF_XMMWORD      ; advance lower-row cursor
        add     edi, byte 1*SIZEOF_XMMWORD      ; advance output cursor
        sub     ecx, byte SIZEOF_XMMWORD        ; output columns still to do
        cmp     ecx, byte SIZEOF_XMMWORD
        jae     near .full_pair
        test    ecx, ecx
        jnz     near .tail_half

        pop     esi
        pop     edi
        pop     ecx

        add     edi, byte 1*SIZEOF_JSAMPROW     ; next output row pointer
        add     esi, byte 2*SIZEOF_JSAMPROW     ; skip the two input rows just used
        dec     eax                             ; one output row fewer to go
        jg      near .next_row

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16