;
; jdsamss2.asm - upsampling (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

; NOTE(review): EXTN, alignz/alignx, pushpic/poppic/movpic, get_GOT, GOTOFF,
; SEG_CONST/SEG_TEXT and the JSAMPLE/JSAMPROW/XMMWORD size names used below
; are not defined in this file; presumably they come from jsimdext.inc.
%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_fancy_upsample_sse2)

; Word constants for the two "fancy" upsamplers in this file.  Each one is
; the same 16-bit value repeated 8 times (one full XMM register):
;   PW_THREE            weight applied to the nearer sample (pmullw)
;   PW_ONE, PW_TWO      biases added before the ">> 2" in the h2v1 routine
;   PW_SEVEN, PW_EIGHT  biases added before the ">> 4" in the h2v2 routine
EXTN(jconst_fancy_upsample_sse2):

PW_ONE          times 8 dw  1
PW_TWO          times 8 dw  2
PW_THREE        times 8 dw  3
PW_SEVEN        times 8 dw  7
PW_EIGHT        times 8 dw  8

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
;                                 JDIMENSION downsampled_width,
;                                 JSAMPARRAY input_data,
;                                 JSAMPARRAY * output_data_ptr);
;
; What the code below computes for every input row, per input sample i:
;   out[2*i]   = (3*in[i] + in[i-1] + 1) >> 2
;   out[2*i+1] = (3*in[i] + in[i+1] + 2) >> 2
; At the left edge in[-1] is replaced by in[0]; at the right edge the
; missing neighbour is replaced by the last sample of the final block.
;
; Register roles inside the row loop:
;   eax  = colctr (rounded up to a whole number of XMMWORD blocks after the
;          first block has been set up)
;   ecx  = rowctr, counts down from max_v_samp_factor
;   esi  = inptr, edi = outptr (input_data / output_data between rows)
;   ebx  = GOT address, used through GOTOFF() to reach the PW_* constants
;   xmm0 = all zeros (for byte->word unpacking)
;   xmm7 = left neighbour of the current block, in byte lane 0
;   xmm6 = right neighbour of the current block, in byte lane 15
;
; All row accesses use movdqa and always cover whole XMMWORD blocks, and a
; "dummy sample" may be written one position past downsampled_width in the
; input row.
; NOTE(review): this presumably requires 16-byte aligned, padded row
; buffers - confirm against the caller.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
        push    ebp
        mov     ebp,esp
        pushpic ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx             ; get GOT address

        ; Nothing to do for a zero width or a zero row count.
        mov     eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
        test    eax,eax
        jz      near .return

        mov     ecx, INT [max_v_samp(ebp)]      ; rowctr
        test    ecx,ecx
        jz      near .return

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        ; Save the per-row state; it is restored after the column loop.
        push    eax                     ; colctr
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]     ; inptr
        mov     edi, JSAMPROW [edi]     ; outptr

        ; If the width is not a whole number of blocks, duplicate the last
        ; real sample into the slot right after it, so that the last real
        ; sample sees itself as its right-hand neighbour.
        test    eax, SIZEOF_XMMWORD-1
        jz      short .skip
        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
        pxor    xmm0,xmm0               ; xmm0=(all 0's)
        ; Left edge: build a mask that keeps only byte lane 0 and use it to
        ; take sample 0 of the first block as the "-1" neighbour.
        pcmpeqb xmm7,xmm7
        psrldq  xmm7,(SIZEOF_XMMWORD-1)
        pand    xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]

        ; Round colctr up to a multiple of SIZEOF_XMMWORD.  More than one
        ; block left -> .columnloop; otherwise fall through to the last-block
        ; setup.
        add     eax, byte SIZEOF_XMMWORD-1
        and     eax, byte -SIZEOF_XMMWORD
        cmp     eax, byte SIZEOF_XMMWORD
        ja      short .columnloop
        alignx  16,7

.columnloop_last:
        ; Last block: there is no following block, so the right neighbour
        ; is the top byte lane of the current block itself.
        pcmpeqb xmm6,xmm6
        pslldq  xmm6,(SIZEOF_XMMWORD-1)
        pand    xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
        jmp     short .upsample
        alignx  16,7

.columnloop:
        ; Not the last block: the right neighbour is byte 0 of the next
        ; block, moved up into byte lane 15.
        movdqa  xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
        pslldq  xmm6,(SIZEOF_XMMWORD-1)

.upsample:
        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqa  xmm2,xmm1
        movdqa  xmm3,xmm1               ; xmm1=( 0  1  2 ... 13 14 15)
        pslldq  xmm2,1                  ; xmm2=(--  0  1 ... 12 13 14)
        psrldq  xmm3,1                  ; xmm3=( 1  2  3 ... 14 15 --)

        por     xmm2,xmm7               ; xmm2=(-1  0  1 ... 12 13 14)
        por     xmm3,xmm6               ; xmm3=( 1  2  3 ... 14 15 16)

        ; xmm7 has been consumed above; reload it with this block's last
        ; sample so it becomes the left neighbour of the next block.
        movdqa  xmm7,xmm1
        psrldq  xmm7,(SIZEOF_XMMWORD-1) ; xmm7=(15 -- -- ... -- -- --)

        ; Widen current / left / right samples from bytes to words.
        movdqa    xmm4,xmm1
        punpcklbw xmm1,xmm0             ; xmm1=( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4,xmm0             ; xmm4=( 8  9 10 11 12 13 14 15)
        movdqa    xmm5,xmm2
        punpcklbw xmm2,xmm0             ; xmm2=(-1  0  1  2  3  4  5  6)
        punpckhbw xmm5,xmm0             ; xmm5=( 7  8  9 10 11 12 13 14)
        movdqa    xmm6,xmm3
        punpcklbw xmm3,xmm0             ; xmm3=( 1  2  3  4  5  6  7  8)
        punpckhbw xmm6,xmm0             ; xmm6=( 9 10 11 12 13 14 15 16)

        ; 3*current, left+1, right+2
        pmullw  xmm1,[GOTOFF(ebx,PW_THREE)]
        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
        paddw   xmm2,[GOTOFF(ebx,PW_ONE)]
        paddw   xmm5,[GOTOFF(ebx,PW_ONE)]
        paddw   xmm3,[GOTOFF(ebx,PW_TWO)]
        paddw   xmm6,[GOTOFF(ebx,PW_TWO)]

        paddw   xmm2,xmm1
        paddw   xmm5,xmm4
        psrlw   xmm2,2                  ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm5,2                  ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
        paddw   xmm3,xmm1
        paddw   xmm6,xmm4
        psrlw   xmm3,2                  ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm6,2                  ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

        ; Interleave: even outputs stay in the low byte of each word, odd
        ; outputs are shifted into the high byte and merged in.
        psllw   xmm3,BYTE_BIT
        psllw   xmm6,BYTE_BIT
        por     xmm2,xmm3               ; xmm2=OutL=( 0  1  2 ... 13 14 15)
        por     xmm5,xmm6               ; xmm5=OutH=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5

        ; One input block consumed, two output blocks produced.
        sub     eax, byte SIZEOF_XMMWORD
        add     esi, byte 1*SIZEOF_XMMWORD      ; inptr
        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr
        cmp     eax, byte SIZEOF_XMMWORD
        ja      near .columnloop                ; more than one block left
        test    eax,eax
        jnz     near .columnloop_last           ; exactly one block left

        pop     esi
        pop     edi
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; input_data
        add     edi, byte SIZEOF_JSAMPROW       ; output_data
        dec     ecx                             ; rowctr
        jg      near .rowloop

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        poppic  ebx
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
;                                 JDIMENSION downsampled_width,
;                                 JSAMPARRAY input_data,
;                                 JSAMPARRAY * output_data_ptr);
;
; What the code below computes: each pass of the row loop reads three input
; rows (the one above, the current one, the one below) and writes two output
; rows.  It works in two stages per 16-sample block:
;   vertical:    Int0 = 3*row[0] + row[-1]      (for the upper output row)
;                Int1 = 3*row[0] + row[+1]      (for the lower output row)
;   horizontal:  out[2*i]   = (3*Int[i] + Int[i-1] + 8) >> 4
;                out[2*i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
; The 16-bit Int values are parked in the output rows themselves and then
; overwritten by the final bytes (16 words take the same 2 XMMWORDs as the
; 32 output bytes they turn into).
;
; Register roles inside the column loop:
;   eax = colctr, ecx = inptr1(above), ebx = inptr0, esi = inptr1(below),
;   edx = outptr0, edi = outptr1.
; ebx doubles as the GOT pointer: it is saved with pushpic, reloaded from
; gotptr while the PW_* constants are needed, and restored with poppic.
;
; Scratch slots on the aligned stack frame:
;   wk(0) / wk(1) = left-neighbour Int word (lane 0) for upper / lower row
;   wk(2) / wk(3) = right-neighbour Int word (lane 7) for upper / lower row
;
; The routine reads the row pointer at input_data[-1], may write one dummy
; sample past downsampled_width in each of the three input rows, and uses
; movdqa on whole blocks.
; NOTE(review): this presumably requires 16-byte aligned, padded row buffers
; and a valid row pointer before input_data[0] - confirm against the caller.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define downsamp_width(b)       (b)+12          ; JDIMENSION downsampled_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          4
%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr

        align   16
        global  EXTN(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
        ; Build a 16-byte aligned frame: eax keeps the unaligned frame base
        ; (arguments are addressed relative to it), its value is stored at
        ; [aligned ebp], and wk(0..WK_NUM-1) live just below aligned ebp.
        push    ebp
        mov     eax,esp                         ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [esp],eax
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [wk(0)]
        pushpic eax             ; make a room for GOT address
        push    ebx
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                     ; get GOT address
        movpic  POINTER [gotptr], ebx   ; save GOT address

        ; Nothing to do for a zero width or a zero row count.
        mov     edx,eax                         ; edx = original ebp
        mov     eax, JDIMENSION [downsamp_width(edx)]  ; colctr
        test    eax,eax
        jz      near .return

        mov     ecx, INT [max_v_samp(edx)]      ; rowctr
        test    ecx,ecx
        jz      near .return

        mov     esi, JSAMPARRAY [input_data(edx)]       ; input_data
        mov     edi, POINTER [output_data_ptr(edx)]
        mov     edi, JSAMPARRAY [edi]                   ; output_data
        alignx  16,7
.rowloop:
        ; Save the per-row state; it is restored after the column loop.
        push    eax                                     ; colctr
        push    ecx
        push    edi
        push    esi

        mov     ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; outptr1

        ; If the width is not a whole number of blocks, duplicate the last
        ; real sample of each of the three input rows into the slot right
        ; after it.  edx (outptr0) is saved because dl is the scratch byte.
        test    eax, SIZEOF_XMMWORD-1
        jz      short .skip
        push    edx
        mov     dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
        pop     edx
.skip:
        ; -- process the first column block

        movdqa  xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
        movdqa  xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
        movdqa  xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]

        ; ebx (inptr0) is parked on the stack while it serves as GOT pointer.
        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        pxor      xmm3,xmm3             ; xmm3=(all 0's)
        movdqa    xmm4,xmm0
        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
        movdqa    xmm5,xmm1
        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
        movdqa    xmm6,xmm2
        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

        pmullw  xmm0,[GOTOFF(ebx,PW_THREE)]
        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]

        ; xmm7 = mask that keeps only word lane 0
        pcmpeqb xmm7,xmm7
        psrldq  xmm7,(SIZEOF_XMMWORD-2)

        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

        movdqa  XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
        movdqa  XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6

        ; Left edge: the first Int word acts as its own left neighbour.
        pand    xmm1,xmm7               ; xmm1=( 0 -- -- -- -- -- -- --)
        pand    xmm2,xmm7               ; xmm2=( 0 -- -- -- -- -- -- --)

        movdqa  XMMWORD [wk(0)], xmm1
        movdqa  XMMWORD [wk(1)], xmm2

        poppic  ebx

        ; Round colctr up to a multiple of SIZEOF_XMMWORD.  More than one
        ; block left -> .columnloop; otherwise fall through to the last-block
        ; setup.
        add     eax, byte SIZEOF_XMMWORD-1
        and     eax, byte -SIZEOF_XMMWORD
        cmp     eax, byte SIZEOF_XMMWORD
        ja      short .columnloop
        alignx  16,7

.columnloop_last:
        ; -- process the last column block

        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        ; No following block: the right neighbour is the top word lane of
        ; the current block's own intermediate data.
        pcmpeqb xmm1,xmm1
        pslldq  xmm1,(SIZEOF_XMMWORD-2)
        movdqa  xmm2,xmm1

        pand    xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
        pand    xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]

        movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
        movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

        jmp     near .upsample
        alignx  16,7

.columnloop:
        ; -- process the next column block
        ; (vertical stage for the block AFTER the current one; its results
        ;  go to output blocks 2 and 3, and its first Int word becomes the
        ;  right neighbour of the current block)

        movdqa  xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
        movdqa  xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
        movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]

        pushpic ebx
        movpic  ebx, POINTER [gotptr]   ; load GOT address

        pxor      xmm3,xmm3             ; xmm3=(all 0's)
        movdqa    xmm4,xmm0
        punpcklbw xmm0,xmm3             ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4,xmm3             ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
        movdqa    xmm5,xmm1
        punpcklbw xmm1,xmm3             ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm5,xmm3             ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
        movdqa    xmm6,xmm2
        punpcklbw xmm2,xmm3             ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm6,xmm3             ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

        pmullw  xmm0,[GOTOFF(ebx,PW_THREE)]
        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]

        paddw   xmm1,xmm0               ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
        paddw   xmm5,xmm4               ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
        paddw   xmm2,xmm0               ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
        paddw   xmm6,xmm4               ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

        movdqa  XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
        movdqa  XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
        movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6

        pslldq  xmm1,(SIZEOF_XMMWORD-2) ; xmm1=(-- -- -- -- -- -- --  0)
        pslldq  xmm2,(SIZEOF_XMMWORD-2) ; xmm2=(-- -- -- -- -- -- --  0)

        movdqa  XMMWORD [wk(2)], xmm1
        movdqa  XMMWORD [wk(3)], xmm2

.upsample:
        ; -- process the upper row
        ; (ebx still holds the GOT address here; inptr0 is on the stack)

        movdqa  xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]

        movdqa  xmm0,xmm7               ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
        movdqa  xmm4,xmm3               ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
        psrldq  xmm0,2                  ; xmm0=( 1  2  3  4  5  6  7 --)
        pslldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
        movdqa  xmm5,xmm7
        movdqa  xmm6,xmm3
        psrldq  xmm5,(SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
        pslldq  xmm6,2                  ; xmm6=(--  8  9 10 11 12 13 14)

        por     xmm0,xmm4               ; xmm0=( 1  2  3  4  5  6  7  8)
        por     xmm5,xmm6               ; xmm5=( 7  8  9 10 11 12 13 14)

        movdqa  xmm1,xmm7
        movdqa  xmm2,xmm3
        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
        psrldq  xmm2,2                  ; xmm2=( 9 10 11 12 13 14 15 --)
        movdqa  xmm4,xmm3
        psrldq  xmm4,(SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)

        por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
        por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)

        ; wk(0) has been consumed; store this block's last Int word as the
        ; left neighbour for the next block.
        movdqa  XMMWORD [wk(0)], xmm4

        ; 3*current, left+8, right+7
        pmullw  xmm7,[GOTOFF(ebx,PW_THREE)]
        pmullw  xmm3,[GOTOFF(ebx,PW_THREE)]
        paddw   xmm1,[GOTOFF(ebx,PW_EIGHT)]
        paddw   xmm5,[GOTOFF(ebx,PW_EIGHT)]
        paddw   xmm0,[GOTOFF(ebx,PW_SEVEN)]
        paddw   xmm2,[GOTOFF(ebx,PW_SEVEN)]

        paddw   xmm1,xmm7
        paddw   xmm5,xmm3
        psrlw   xmm1,4                  ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm5,4                  ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
        paddw   xmm0,xmm7
        paddw   xmm2,xmm3
        psrlw   xmm0,4                  ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm2,4                  ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

        ; Interleave even (low byte) and odd (high byte) outputs.
        psllw   xmm0,BYTE_BIT
        psllw   xmm2,BYTE_BIT
        por     xmm1,xmm0               ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
        por     xmm5,xmm2               ; xmm5=Out0H=(16 17 18 ... 29 30 31)

        ; Final bytes replace the intermediate words parked in outptr0.
        movdqa  XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5

        ; -- process the lower row

        movdqa  xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
        movdqa  xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]

        movdqa  xmm7,xmm6               ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
        movdqa  xmm3,xmm4               ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
        psrldq  xmm7,2                  ; xmm7=( 1  2  3  4  5  6  7 --)
        pslldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
        movdqa  xmm0,xmm6
        movdqa  xmm2,xmm4
        psrldq  xmm0,(SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
        pslldq  xmm2,2                  ; xmm2=(--  8  9 10 11 12 13 14)

        por     xmm7,xmm3               ; xmm7=( 1  2  3  4  5  6  7  8)
        por     xmm0,xmm2               ; xmm0=( 7  8  9 10 11 12 13 14)

        movdqa  xmm1,xmm6
        movdqa  xmm5,xmm4
        pslldq  xmm1,2                  ; xmm1=(--  0  1  2  3  4  5  6)
        psrldq  xmm5,2                  ; xmm5=( 9 10 11 12 13 14 15 --)
        movdqa  xmm3,xmm4
        psrldq  xmm3,(SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)

        por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
        por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)

        ; Same hand-over as wk(0), for the lower row.
        movdqa  XMMWORD [wk(1)], xmm3

        pmullw  xmm6,[GOTOFF(ebx,PW_THREE)]
        pmullw  xmm4,[GOTOFF(ebx,PW_THREE)]
        paddw   xmm1,[GOTOFF(ebx,PW_EIGHT)]
        paddw   xmm0,[GOTOFF(ebx,PW_EIGHT)]
        paddw   xmm7,[GOTOFF(ebx,PW_SEVEN)]
        paddw   xmm5,[GOTOFF(ebx,PW_SEVEN)]

        paddw   xmm1,xmm6
        paddw   xmm0,xmm4
        psrlw   xmm1,4                  ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm0,4                  ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
        paddw   xmm7,xmm6
        paddw   xmm5,xmm4
        psrlw   xmm7,4                  ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm5,4                  ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

        psllw   xmm7,BYTE_BIT
        psllw   xmm5,BYTE_BIT
        por     xmm1,xmm7               ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
        por     xmm0,xmm5               ; xmm0=Out1H=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0

        ; Restore ebx = inptr0 before the pointers are advanced.
        poppic  ebx

        ; One input block consumed per row, two output blocks produced.
        sub     eax, byte SIZEOF_XMMWORD
        add     ecx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
        add     ebx, byte 1*SIZEOF_XMMWORD      ; inptr0
        add     esi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
        add     edx, byte 2*SIZEOF_XMMWORD      ; outptr0
        add     edi, byte 2*SIZEOF_XMMWORD      ; outptr1
        cmp     eax, byte SIZEOF_XMMWORD
        ja      near .columnloop                ; more than one block left
        test    eax,eax
        jnz     near .columnloop_last           ; exactly one block left

        pop     esi
        pop     edi
        pop     ecx
        pop     eax

        ; Each pass consumes one input row and produces two output rows.
        add     esi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     edi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     ecx, byte 2                     ; rowctr
        jg      near .rowloop

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
        pop     ebx
        mov     esp,ebp         ; esp <- aligned ebp
        pop     esp             ; esp <- original ebp
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case
; of 2:1 horizontal and 1:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
;                           JDIMENSION output_width,
;                           JSAMPARRAY input_data,
;                           JSAMPARRAY * output_data_ptr);
;
; Every input sample is written twice, side by side, into the output row
; (punpcklbw/punpckhbw of a register with itself).  output_width is rounded
; up to a multiple of 2*SIZEOF_XMMWORD, and whole blocks are moved with
; movdqa.
; NOTE(review): this presumably requires 16-byte aligned, padded row
; buffers - confirm against the caller.
;
; Registers: edx = rounded output width, ecx = rows left, eax = output
; bytes left in the current row, esi = source, edi = destination.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define output_width(b)         (b)+12          ; JDIMENSION output_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
        push    ebp
        mov     ebp, esp
;       push    ebx                     ; not used by this routine
;       push    ecx                     ; caller-saved
;       push    edx                     ; caller-saved
        push    esi
        push    edi

        ; edx = output_width rounded up to a whole pair of XMMWORDs
        mov     edx, JDIMENSION [output_width(ebp)]
        add     edx, byte (2*SIZEOF_XMMWORD)-1
        and     edx, byte -(2*SIZEOF_XMMWORD)
        jz      short .done             ; zero width: nothing to do

        mov     ecx, INT [max_v_samp(ebp)]
        test    ecx, ecx
        jz      short .done             ; zero rows: nothing to do

        mov     esi, JSAMPARRAY [input_data(ebp)]
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]   ; edi = *output_data_ptr
        alignx  16,7
.row_loop:
        push    edi                     ; keep the row-array cursors
        push    esi

        mov     esi, JSAMPROW [esi]     ; source row
        mov     edi, JSAMPROW [edi]     ; destination row
        mov     eax, edx                ; output bytes to produce
        alignx  16,7
.col_loop:
        ; first input block -> output blocks 0 and 1
        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]

        movdqa    xmm1, xmm0
        punpcklbw xmm0, xmm0            ; low 8 samples, each doubled
        punpckhbw xmm1, xmm1            ; high 8 samples, each doubled

        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

        sub     eax, byte 2*SIZEOF_XMMWORD
        jz      short .row_done

        ; second input block -> output blocks 2 and 3
        movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]

        movdqa    xmm3, xmm2
        punpcklbw xmm2, xmm2
        punpckhbw xmm3, xmm3

        movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3

        sub     eax, byte 2*SIZEOF_XMMWORD
        jz      short .row_done

        add     esi, byte 2*SIZEOF_XMMWORD      ; two input blocks consumed
        add     edi, byte 4*SIZEOF_XMMWORD      ; four output blocks written
        jmp     short .col_loop
        alignx  16,7

.row_done:
        pop     esi
        pop     edi

        add     esi, byte SIZEOF_JSAMPROW       ; next input row pointer
        add     edi, byte SIZEOF_JSAMPROW       ; next output row pointer
        dec     ecx
        jg      short .row_loop

.done:
        pop     edi
        pop     esi
;       pop     edx                     ; caller-saved
;       pop     ecx                     ; caller-saved
;       pop     ebx                     ; not used by this routine
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
;                           JDIMENSION output_width,
;                           JSAMPARRAY input_data,
;                           JSAMPARRAY * output_data_ptr);
;
; Same horizontal doubling as the h2v1 routine, but every expanded block is
; stored to two consecutive output rows, and the row counter drops by 2 for
; each input row processed.
;
; Registers: edx = rounded output width, ecx = rows left, eax = output
; bytes left in the current row, esi = source, ebx / edi = the two
; destination rows.
;

%define max_v_samp(b)           (b)+8           ; int max_v_samp_factor
%define output_width(b)         (b)+12          ; JDIMENSION output_width
%define input_data(b)           (b)+16          ; JSAMPARRAY input_data
%define output_data_ptr(b)      (b)+20          ; JSAMPARRAY * output_data_ptr

        align   16
        global  EXTN(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
        push    ebp
        mov     ebp, esp
        push    ebx
;       push    ecx                     ; caller-saved
;       push    edx                     ; caller-saved
        push    esi
        push    edi

        ; edx = output_width rounded up to a whole pair of XMMWORDs
        mov     edx, JDIMENSION [output_width(ebp)]
        add     edx, byte (2*SIZEOF_XMMWORD)-1
        and     edx, byte -(2*SIZEOF_XMMWORD)
        jz      near .done              ; zero width: nothing to do

        mov     ecx, INT [max_v_samp(ebp)]
        test    ecx, ecx
        jz      near .done              ; zero rows: nothing to do

        mov     esi, JSAMPARRAY [input_data(ebp)]
        mov     edi, POINTER [output_data_ptr(ebp)]
        mov     edi, JSAMPARRAY [edi]   ; edi = *output_data_ptr
        alignx  16,7
.row_loop:
        push    edi                     ; keep the row-array cursors
        push    esi

        mov     esi, JSAMPROW [esi]                     ; source row
        mov     ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]   ; upper destination row
        mov     edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]   ; lower destination row
        mov     eax, edx                                ; output bytes per row
        alignx  16,7
.col_loop:
        ; first input block -> output blocks 0 and 1 of both rows
        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]

        movdqa    xmm1, xmm0
        punpcklbw xmm0, xmm0            ; low 8 samples, each doubled
        punpckhbw xmm1, xmm1            ; high 8 samples, each doubled

        movdqa  XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

        sub     eax, byte 2*SIZEOF_XMMWORD
        jz      short .row_done

        ; second input block -> output blocks 2 and 3 of both rows
        movdqa  xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]

        movdqa    xmm3, xmm2
        punpcklbw xmm2, xmm2
        punpckhbw xmm3, xmm3

        movdqa  XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
        movdqa  XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3

        sub     eax, byte 2*SIZEOF_XMMWORD
        jz      short .row_done

        add     esi, byte 2*SIZEOF_XMMWORD      ; two input blocks consumed
        add     ebx, byte 4*SIZEOF_XMMWORD      ; four output blocks written
        add     edi, byte 4*SIZEOF_XMMWORD      ; (in each destination row)
        jmp     short .col_loop
        alignx  16,7

.row_done:
        pop     esi
        pop     edi

        add     esi, byte 1*SIZEOF_JSAMPROW     ; one input row consumed
        add     edi, byte 2*SIZEOF_JSAMPROW     ; two output rows produced
        sub     ecx, byte 2
        jg      short .row_loop

.done:
        pop     edi
        pop     esi
;       pop     edx                     ; caller-saved
;       pop     ecx                     ; caller-saved
        pop     ebx
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16