;
; jdsammmx.asm - upsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; Syntax : NASM (Intel operand order), 32-bit code (BITS 32).
; Args   : every routine reads its four arguments from the stack at
;          [ebp+8] .. [ebp+20] (see the %define blocks).
; Saved  : ebp, esi, edi and (where used) ebx are pushed/popped by each
;          routine; eax, ecx, edx and mm0-mm7 are used as scratch.
; Macros : EXTN, pushpic/poppic/movpic, get_GOT, GOTOFF, alignx, alignz and
;          the SIZEOF_*/type-name symbols come from jsimdext.inc.
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_fancy_upsample_mmx)

EXTN(jconst_fancy_upsample_mmx):

; Packed-word constants (4 x 16-bit lanes = one MMX register each).
; PW_THREE is the triangle-filter weight of the nearer sample; the others
; are the rounding biases added before the >>2 (h2v1) or >>4 (h2v2) shifts.
PW_ONE          times 4 dw  1
PW_TWO          times 4 dw  2
PW_THREE        times 4 dw  3
PW_SEVEN        times 4 dw  7
PW_EIGHT        times 4 dw  8

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY * output_data_ptr);
;
; Per input sample s[i] two output samples are produced:
;       out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
;       out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
; At the row edges the missing neighbour is replaced by the edge sample
; itself (s[-1] := s[0] for the first block, s[8] := s[7] in the last block).
;
; Register roles:
;       eax = colctr (rounded up to a multiple of SIZEOF_MMWORD before the
;             column loop), ecx = rowctr
;       esi = input_data  -> inptr  (inside a row)
;       edi = output_data -> outptr (inside a row)
;       ebx = GOT address used by GOTOFF()
;       mm0 = all zeros, mm7 = left neighbour of the current block,
;       mm6 = right neighbour of the current block
;
; Returns immediately if downsampled_width or max_v_samp_factor is zero.
; NOTE: when downsampled_width is not a multiple of SIZEOF_MMWORD, a copy of
; the last sample is stored one sample past the end of the input row, so the
; input row buffer must have room for it.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_fancy_upsample_mmx)

EXTN(jsimd_h2v1_fancy_upsample_mmx):
        push    ebp
        mov     ebp,esp
        pushpic ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx             ; get GOT address

        mov     eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
        test    eax,eax
        jz      near .return

        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
        test    ecx,ecx
        jz      near .return

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        push    eax                     ; colctr
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]     ; inptr
        mov     edi, JSAMPROW [edi]     ; outptr

        ; If the width is not a whole number of MMX words, replicate the
        ; last real sample once so the final block sees s[n] == s[n-1].
        test    eax, SIZEOF_MMWORD-1
        jz      short .skip
        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
        pxor    mm0,mm0                 ; mm0=(all 0's)
        pcmpeqb mm7,mm7
        psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mask = lowest byte only
        pand    mm7, MMWORD [esi+0*SIZEOF_MMWORD] ; mm7=s[0], used as s[-1]

        add     eax, byte SIZEOF_MMWORD-1
        and     eax, byte -SIZEOF_MMWORD        ; round colctr up
        cmp     eax, byte SIZEOF_MMWORD
        ja      short .columnloop
        alignx  16,7

.columnloop_last:
        ; Last block of the row: right neighbour is the block's own sample 7.
        pcmpeqb mm6,mm6
        psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mask = highest byte only
        pand    mm6, MMWORD [esi+0*SIZEOF_MMWORD]
        jmp     short .upsample
        alignx  16,7

.columnloop:
        ; Right neighbour is sample 0 of the next block, moved to byte 7.
        movq    mm6, MMWORD [esi+1*SIZEOF_MMWORD]
        psllq   mm6,(SIZEOF_MMWORD-1)*BYTE_BIT

.upsample:
        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm2,mm1
        movq    mm3,mm1                 ; mm1=( 0  1  2  3  4  5  6  7)
        psllq   mm2,BYTE_BIT            ; mm2=( -  0  1  2  3  4  5  6)
        psrlq   mm3,BYTE_BIT            ; mm3=( 1  2  3  4  5  6  7  -)

        por     mm2,mm7                 ; mm2=(-1  0  1  2  3  4  5  6)
        por     mm3,mm6                 ; mm3=( 1  2  3  4  5  6  7  8)

        ; Carry this block's sample 7 forward as the next block's s[-1].
        movq    mm7,mm1
        psrlq   mm7,(SIZEOF_MMWORD-1)*BYTE_BIT  ; mm7=( 7  -  -  -  -  -  -  -)

        ; Widen bytes to words so 3*s + neighbour + bias cannot overflow.
        movq      mm4,mm1
        punpcklbw mm1,mm0               ; mm1=( 0  1  2  3)
        punpckhbw mm4,mm0               ; mm4=( 4  5  6  7)
        movq      mm5,mm2
        punpcklbw mm2,mm0               ; mm2=(-1  0  1  2)
        punpckhbw mm5,mm0               ; mm5=( 3  4  5  6)
        movq      mm6,mm3
        punpcklbw mm3,mm0               ; mm3=( 1  2  3  4)
        punpckhbw mm6,mm0               ; mm6=( 5  6  7  8)

        pmullw  mm1,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
        paddw   mm2,[GOTOFF(ebx,PW_ONE)]
        paddw   mm5,[GOTOFF(ebx,PW_ONE)]
        paddw   mm3,[GOTOFF(ebx,PW_TWO)]
        paddw   mm6,[GOTOFF(ebx,PW_TWO)]

        paddw   mm2,mm1
        paddw   mm5,mm4
        psrlw   mm2,2                   ; mm2=OutLE=( 0  2  4  6)
        psrlw   mm5,2                   ; mm5=OutHE=( 8 10 12 14)
        paddw   mm3,mm1
        paddw   mm6,mm4
        psrlw   mm3,2                   ; mm3=OutLO=( 1  3  5  7)
        psrlw   mm6,2                   ; mm6=OutHO=( 9 11 13 15)

        ; Interleave: even results stay in the low byte of each word, odd
        ; results are shifted into the high byte.
        psllw   mm3,BYTE_BIT
        psllw   mm6,BYTE_BIT
        por     mm2,mm3                 ; mm2=OutL=( 0  1  2  3  4  5  6  7)
        por     mm5,mm6                 ; mm5=OutH=( 8  9 10 11 12 13 14 15)

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm5

        sub     eax, byte SIZEOF_MMWORD
        add     esi, byte 1*SIZEOF_MMWORD       ; inptr
        add     edi, byte 2*SIZEOF_MMWORD       ; outptr
        cmp     eax, byte SIZEOF_MMWORD
        ja      near .columnloop
        test    eax,eax
        jnz     near .columnloop_last

        pop     esi
        pop     edi
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; input_data
        add     edi, byte SIZEOF_JSAMPROW       ; output_data
        dec     ecx                             ; rowctr
        jg      near .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        poppic  ebx
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
;                                JDIMENSION downsampled_width,
;                                JSAMPARRAY input_data,
;                                JSAMPARRAY * output_data_ptr);
;
; Each loop iteration consumes one input row plus its neighbours
; input_data[-1] and input_data[+1] and produces two output rows:
;       Int0[i] = 3*row[0][i] + row[-1][i]      (upper output row)
;       Int1[i] = 3*row[0][i] + row[+1][i]      (lower output row)
;       out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
;       out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
; The 16-bit intermediates Int0/Int1 are parked in the output rows themselves
; (8 words occupy exactly the 16 output bytes of a block) and are overwritten
; by the final samples.
;
; Register roles inside a row:
;       eax = colctr, ecx = inptr1(above), ebx = inptr0, esi = inptr1(below)
;       edx = outptr0, edi = outptr1
;       ebx is temporarily swapped for the GOT address around the SIMD code
;       via pushpic/movpic/poppic.
; Stack scratch (8-byte aligned):
;       wk(0)/wk(1) = left-neighbour intermediate for upper/lower row
;       wk(2)/wk(3) = right-neighbour intermediate for upper/lower row
;
; rowctr is decremented by 2 per iteration; input_data advances by one row
; and output_data by two.  Returns immediately if downsampled_width or
; max_v_samp_factor is zero.
; NOTE: as in the h2v1 case, a dummy sample is stored one past the end of
; each of the three input rows when the width is not a multiple of
; SIZEOF_MMWORD.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          4
%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr

        align   16
        global  EXTN(jsimd_h2v2_fancy_upsample_mmx)

EXTN(jsimd_h2v2_fancy_upsample_mmx):
        push    ebp
        mov     eax,esp                         ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp],eax
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [wk(0)]
        pushpic eax             ; make a room for GOT address
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                     ; get GOT address
        movpic  POINTER [gotptr], ebx   ; save GOT address

        mov     edx,eax                         ; edx = original ebp
        mov     eax, JDIMENSION [downsamp_width(edx)]  ; colctr
        test    eax,eax
        jz      near .return

        mov     ecx, INT [max_v_samp(edx)]      ; rowctr
        test    ecx,ecx
        jz      near .return

        mov     esi, JSAMPARRAY [input_data(edx)]       ; input_data
        mov     edi, POINTER [output_data_ptr(edx)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        push    eax                                     ; colctr
        push    ecx
        push    edi
        push    esi

        mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1

        ; Replicate the last real sample of all three input rows when the
        ; width is not a whole number of MMX words.  edx (outptr0) is saved
        ; because dl is used as the byte scratch register.
        test    eax, SIZEOF_MMWORD-1
        jz      short .skip
        push    edx
        mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
        pop     edx
.skip:
        ; -- process the first column block

        movq    mm0, MMWORD [ebx+0*SIZEOF_MMWORD]       ; mm0=row[ 0][0]
        movq    mm1, MMWORD [ecx+0*SIZEOF_MMWORD]       ; mm1=row[-1][0]
        movq    mm2, MMWORD [esi+0*SIZEOF_MMWORD]       ; mm2=row[+1][0]

        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        pxor      mm3,mm3               ; mm3=(all 0's)
        movq      mm4,mm0
        punpcklbw mm0,mm3               ; mm0=row[ 0][0]( 0 1 2 3)
        punpckhbw mm4,mm3               ; mm4=row[ 0][0]( 4 5 6 7)
        movq      mm5,mm1
        punpcklbw mm1,mm3               ; mm1=row[-1][0]( 0 1 2 3)
        punpckhbw mm5,mm3               ; mm5=row[-1][0]( 4 5 6 7)
        movq      mm6,mm2
        punpcklbw mm2,mm3               ; mm2=row[+1][0]( 0 1 2 3)
        punpckhbw mm6,mm3               ; mm6=row[+1][0]( 4 5 6 7)

        pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]

        pcmpeqb mm7,mm7
        psrlq   mm7,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mask = lowest word only

        paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
        paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
        paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
        paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)

        movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1       ; temporarily save
        movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5       ; the intermediate data
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm6

        ; Left edge: Int[-1] := Int[0] for both output rows.
        pand    mm1,mm7                 ; mm1=( 0 - - -)
        pand    mm2,mm7                 ; mm2=( 0 - - -)

        movq    MMWORD [wk(0)], mm1
        movq    MMWORD [wk(1)], mm2

        poppic  ebx

        add     eax, byte SIZEOF_MMWORD-1
        and     eax, byte -SIZEOF_MMWORD        ; round colctr up
        cmp     eax, byte SIZEOF_MMWORD
        ja      short .columnloop
        alignx  16,7

.columnloop_last:
        ; -- process the last column block

        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        ; Right edge: Int[8] := Int[7] (highest word of the stored IntH).
        pcmpeqb mm1,mm1
        psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT
        movq    mm2,mm1

        pand    mm1, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm1=( - - - 7)
        pand    mm2, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm2=( - - - 7)

        movq    MMWORD [wk(2)], mm1
        movq    MMWORD [wk(3)], mm2

        jmp     short .upsample
        alignx  16,7

.columnloop:
        ; -- process the next column block

        movq    mm0, MMWORD [ebx+1*SIZEOF_MMWORD]       ; mm0=row[ 0][1]
        movq    mm1, MMWORD [ecx+1*SIZEOF_MMWORD]       ; mm1=row[-1][1]
        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]       ; mm2=row[+1][1]

        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        pxor      mm3,mm3               ; mm3=(all 0's)
        movq      mm4,mm0
        punpcklbw mm0,mm3               ; mm0=row[ 0][1]( 0 1 2 3)
        punpckhbw mm4,mm3               ; mm4=row[ 0][1]( 4 5 6 7)
        movq      mm5,mm1
        punpcklbw mm1,mm3               ; mm1=row[-1][1]( 0 1 2 3)
        punpckhbw mm5,mm3               ; mm5=row[-1][1]( 4 5 6 7)
        movq      mm6,mm2
        punpcklbw mm2,mm3               ; mm2=row[+1][1]( 0 1 2 3)
        punpckhbw mm6,mm3               ; mm6=row[+1][1]( 4 5 6 7)

        pmullw  mm0,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]

        paddw   mm1,mm0                 ; mm1=Int0L=( 0 1 2 3)
        paddw   mm5,mm4                 ; mm5=Int0H=( 4 5 6 7)
        paddw   mm2,mm0                 ; mm2=Int1L=( 0 1 2 3)
        paddw   mm6,mm4                 ; mm6=Int1H=( 4 5 6 7)

        ; The next block's intermediates go into the next block's output
        ; slots; they are consumed on the following iteration.
        movq    MMWORD [edx+2*SIZEOF_MMWORD], mm1       ; temporarily save
        movq    MMWORD [edx+3*SIZEOF_MMWORD], mm5       ; the intermediate data
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm6

        psllq   mm1,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm1=( - - - 0)
        psllq   mm2,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm2=( - - - 0)

        movq    MMWORD [wk(2)], mm1
        movq    MMWORD [wk(3)], mm2

.upsample:
        ; -- process the upper row

        movq    mm7, MMWORD [edx+0*SIZEOF_MMWORD]       ; mm7=Int0L=( 0 1 2 3)
        movq    mm3, MMWORD [edx+1*SIZEOF_MMWORD]       ; mm3=Int0H=( 4 5 6 7)

        movq    mm0,mm7
        movq    mm4,mm3
        psrlq   mm0,2*BYTE_BIT                  ; mm0=( 1 2 3 -)
        psllq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( - - - 4)
        movq    mm5,mm7
        movq    mm6,mm3
        psrlq   mm5,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm5=( 3 - - -)
        psllq   mm6,2*BYTE_BIT                  ; mm6=( - 4 5 6)

        por     mm0,mm4                         ; mm0=( 1 2 3 4)
        por     mm5,mm6                         ; mm5=( 3 4 5 6)

        movq    mm1,mm7
        movq    mm2,mm3
        psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
        psrlq   mm2,2*BYTE_BIT                  ; mm2=( 5 6 7 -)
        movq    mm4,mm3
        psrlq   mm4,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm4=( 7 - - -)

        por     mm1, MMWORD [wk(0)]             ; mm1=(-1 0 1 2)
        por     mm2, MMWORD [wk(2)]             ; mm2=( 5 6 7 8)

        movq    MMWORD [wk(0)], mm4             ; left neighbour for next block

        pmullw  mm7,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm3,[GOTOFF(ebx,PW_THREE)]
        paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm5,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm0,[GOTOFF(ebx,PW_SEVEN)]
        paddw   mm2,[GOTOFF(ebx,PW_SEVEN)]

        paddw   mm1,mm7
        paddw   mm5,mm3
        psrlw   mm1,4                   ; mm1=Out0LE=( 0  2  4  6)
        psrlw   mm5,4                   ; mm5=Out0HE=( 8 10 12 14)
        paddw   mm0,mm7
        paddw   mm2,mm3
        psrlw   mm0,4                   ; mm0=Out0LO=( 1  3  5  7)
        psrlw   mm2,4                   ; mm2=Out0HO=( 9 11 13 15)

        psllw   mm0,BYTE_BIT
        psllw   mm2,BYTE_BIT
        por     mm1,mm0                 ; mm1=Out0L=( 0  1  2  3  4  5  6  7)
        por     mm5,mm2                 ; mm5=Out0H=( 8  9 10 11 12 13 14 15)

        movq    MMWORD [edx+0*SIZEOF_MMWORD], mm1
        movq    MMWORD [edx+1*SIZEOF_MMWORD], mm5

        ; -- process the lower row

        movq    mm6, MMWORD [edi+0*SIZEOF_MMWORD]       ; mm6=Int1L=( 0 1 2 3)
        movq    mm4, MMWORD [edi+1*SIZEOF_MMWORD]       ; mm4=Int1H=( 4 5 6 7)

        movq    mm7,mm6
        movq    mm3,mm4
        psrlq   mm7,2*BYTE_BIT                  ; mm7=( 1 2 3 -)
        psllq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( - - - 4)
        movq    mm0,mm6
        movq    mm2,mm4
        psrlq   mm0,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm0=( 3 - - -)
        psllq   mm2,2*BYTE_BIT                  ; mm2=( - 4 5 6)

        por     mm7,mm3                         ; mm7=( 1 2 3 4)
        por     mm0,mm2                         ; mm0=( 3 4 5 6)

        movq    mm1,mm6
        movq    mm5,mm4
        psllq   mm1,2*BYTE_BIT                  ; mm1=( - 0 1 2)
        psrlq   mm5,2*BYTE_BIT                  ; mm5=( 5 6 7 -)
        movq    mm3,mm4
        psrlq   mm3,(SIZEOF_MMWORD-2)*BYTE_BIT  ; mm3=( 7 - - -)

        por     mm1, MMWORD [wk(1)]             ; mm1=(-1 0 1 2)
        por     mm5, MMWORD [wk(3)]             ; mm5=( 5 6 7 8)

        movq    MMWORD [wk(1)], mm3             ; left neighbour for next block

        pmullw  mm6,[GOTOFF(ebx,PW_THREE)]
        pmullw  mm4,[GOTOFF(ebx,PW_THREE)]
        paddw   mm1,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm0,[GOTOFF(ebx,PW_EIGHT)]
        paddw   mm7,[GOTOFF(ebx,PW_SEVEN)]
        paddw   mm5,[GOTOFF(ebx,PW_SEVEN)]

        paddw   mm1,mm6
        paddw   mm0,mm4
        psrlw   mm1,4                   ; mm1=Out1LE=( 0  2  4  6)
        psrlw   mm0,4                   ; mm0=Out1HE=( 8 10 12 14)
        paddw   mm7,mm6
        paddw   mm5,mm4
        psrlw   mm7,4                   ; mm7=Out1LO=( 1  3  5  7)
        psrlw   mm5,4                   ; mm5=Out1HO=( 9 11 13 15)

        psllw   mm7,BYTE_BIT
        psllw   mm5,BYTE_BIT
        por     mm1,mm7                 ; mm1=Out1L=( 0  1  2  3  4  5  6  7)
        por     mm0,mm5                 ; mm0=Out1H=( 8  9 10 11 12 13 14 15)

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm1
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm0

        poppic  ebx

        sub     eax, byte SIZEOF_MMWORD
        add     ecx, byte 1*SIZEOF_MMWORD       ; inptr1(above)
        add     ebx, byte 1*SIZEOF_MMWORD       ; inptr0
        add     esi, byte 1*SIZEOF_MMWORD       ; inptr1(below)
        add     edx, byte 2*SIZEOF_MMWORD       ; outptr0
        add     edi, byte 2*SIZEOF_MMWORD       ; outptr1
        cmp     eax, byte SIZEOF_MMWORD
        ja      near .columnloop
        test    eax,eax
        jnz     near .columnloop_last

        pop     esi
        pop     edi
        pop     ecx
        pop     eax

        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     ecx, byte 2                     ; rowctr
        jg      near .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        mov     esp,ebp         ; esp <- aligned ebp
        pop     esp             ; esp <- original ebp
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
;                          JDIMENSION output_width,
;                          JSAMPARRAY input_data,
;                          JSAMPARRAY * output_data_ptr);
;
; Every input sample is written twice (punpck?bw of a register with itself).
; output_width is rounded up to a multiple of 2*SIZEOF_MMWORD, so whole
; blocks are always read and written; the row buffers must be padded
; accordingly.  The column loop is unrolled twice.
;
; Register roles:
;       edx = rounded output width, eax = colctr, ecx = rowctr
;       esi = input_data -> inptr, edi = output_data -> outptr
;
; Returns immediately if the rounded width or max_v_samp_factor is zero.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define output_width(b)         (b)+12          ; JDIMENSION output_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_upsample_mmx)

EXTN(jsimd_h2v1_upsample_mmx):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     edx, JDIMENSION [output_width(ebp)]
        add     edx, byte (2*SIZEOF_MMWORD)-1
        and     edx, byte -(2*SIZEOF_MMWORD)    ; round up to whole blocks
        jz      short .return

        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
        test    ecx,ecx
        jz      short .return

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]             ; inptr
        mov     edi, JSAMPROW [edi]             ; outptr
        mov     eax,edx                         ; colctr
        alignx  16,7
.columnloop:

        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]

        movq      mm1,mm0
        punpcklbw mm0,mm0               ; duplicate samples 0-3
        punpckhbw mm1,mm1               ; duplicate samples 4-7

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]

        movq      mm3,mm2
        punpcklbw mm2,mm2
        punpckhbw mm3,mm3

        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
        add     edi, byte 4*SIZEOF_MMWORD       ; outptr
        jmp     short .columnloop
        alignx  16,7

.nextrow:
        pop     esi
        pop     edi

        add     esi, byte SIZEOF_JSAMPROW       ; input_data
        add     edi, byte SIZEOF_JSAMPROW       ; output_data
        dec     ecx                             ; rowctr
        jg      short .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
;       pop     ebx             ; unused
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
;                          JDIMENSION output_width,
;                          JSAMPARRAY input_data,
;                          JSAMPARRAY * output_data_ptr);
;
; Same sample doubling as jsimd_h2v1_upsample_mmx, but every expanded block
; is stored to two consecutive output rows.  rowctr is decremented by 2 per
; iteration; input_data advances by one row and output_data by two.
;
; Register roles:
;       edx = rounded output width, eax = colctr, ecx = rowctr
;       esi = input_data -> inptr
;       ebx = outptr0, edi = output_data -> outptr1
;
; Returns immediately if the rounded width or max_v_samp_factor is zero.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define output_width(b)         (b)+12          ; JDIMENSION output_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v2_upsample_mmx)

EXTN(jsimd_h2v2_upsample_mmx):
        push    ebp
        mov     ebp,esp
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     edx, JDIMENSION [output_width(ebp)]
        add     edx, byte (2*SIZEOF_MMWORD)-1
        and     edx, byte -(2*SIZEOF_MMWORD)    ; round up to whole blocks
        jz      near .return

        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
        test    ecx,ecx
        jz      short .return

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]                     ; inptr
        mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1
        mov     eax,edx                                 ; colctr
        alignx  16,7
.columnloop:

        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]

        movq      mm1,mm0
        punpcklbw mm0,mm0               ; duplicate samples 0-3
        punpckhbw mm1,mm1               ; duplicate samples 4-7

        movq    MMWORD [ebx+0*SIZEOF_MMWORD], mm0
        movq    MMWORD [ebx+1*SIZEOF_MMWORD], mm1
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mm1

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        movq    mm2, MMWORD [esi+1*SIZEOF_MMWORD]

        movq      mm3,mm2
        punpcklbw mm2,mm2
        punpckhbw mm3,mm3

        movq    MMWORD [ebx+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [ebx+3*SIZEOF_MMWORD], mm3
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mm2
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mm3

        sub     eax, byte 2*SIZEOF_MMWORD
        jz      short .nextrow

        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
        add     ebx, byte 4*SIZEOF_MMWORD       ; outptr0
        add     edi, byte 4*SIZEOF_MMWORD       ; outptr1
        jmp     short .columnloop
        alignx  16,7

.nextrow:
        pop     esi
        pop     edi

        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     ecx, byte 2                     ; rowctr
        jg      short .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16