;
; jdmrgmmx.asm - merged upsampling/color conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; Calling convention (as used by this code): 32-bit x86, all four arguments
; on the stack.  After "push ebp" they are found at +8/+12/+16/+20 from the
; saved-ebp slot (see the %defines below).  ebx, esi, edi and ebp are saved
; and restored; eax, ecx, edx and the MMX registers are used as scratch.
;
; Register roles inside the loops:
;   esi = inptr0 (Y samples)     ebx = inptr1 (Cb samples)
;   edx = inptr2 (Cr samples)    edi = outptr
;   ecx = number of output columns still to be written
;   eax = GOT address, reloaded at the top of every .columnloop pass because
;         al doubles as Yctr and eax is reused as scratch in the tail stores
;
; Stack locals (below the 8-byte-aligned ebp):
;   wk(0) = (B-Y)H   wk(1) = (R-Y)H   wk(2) = (G-Y)H   gotptr = GOT address
;
; Loop structure: one .columnloop pass reads 8 Cb and 8 Cr samples and
; computes the (R-Y)/(G-Y)/(B-Y) terms for both halves (L = samples 0-3,
; H = samples 4-7).  The Y loop then runs up to twice (Yctr = 2): the first
; pass uses the L terms still held in mm0/mm2/mm4, the second reloads the
; H terms from wk[].  Each Y pass reads 8 Y samples and writes up to 8
; pixels, so every chroma sample is shared by two adjacent pixels.
;
; NOTE(review): mmA..mmH, the PW_*/PD_* constants, SCALEBITS and the
; EXTN/get_GOT/pushpic/movpic/GOTOFF/alignx macros are not defined in this
; file; presumably they come from jcolsamp.inc and the including file.
;

%define output_width(b)         (b)+8           ; JDIMENSION output_width
%define input_buf(b)            (b)+12          ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b)     (b)+16          ; JDIMENSION in_row_group_ctr
%define output_buf(b)           (b)+20          ; JSAMPARRAY output_buf

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          3
%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr

        align   16
        global  EXTN(jsimd_h2v1_merged_upsample_mmx)

EXTN(jsimd_h2v1_merged_upsample_mmx):
        ; Build an 8-byte-aligned frame.  eax keeps the unaligned frame base
        ; (address of the saved ebp) so the stack arguments stay reachable;
        ; the same value is stored at [ebp] so the epilogue can "pop esp".
        push    ebp
        mov     eax,esp                         ; eax = original ebp
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp],eax
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [wk(0)]
        pushpic eax                             ; make a room for GOT address
        push    ebx
;       push    ecx                             ; need not be preserved
;       push    edx                             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                             ; get GOT address
        movpic  POINTER [gotptr], ebx           ; save GOT address

        mov     ecx, JDIMENSION [output_width(eax)]     ; col
        test    ecx,ecx
        jz      near .return                    ; zero width: no MMX used yet,
                                                ; so emms is skipped as well

        push    ecx                             ; ecx is reused as row index below

        mov     edi, JSAMPIMAGE [input_buf(eax)]
        mov     ecx, JDIMENSION [in_row_group_ctr(eax)]
        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
        mov     edi, JSAMPARRAY [output_buf(eax)]
        mov     esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]         ; inptr0
        mov     ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]         ; inptr1
        mov     edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]         ; inptr2
        mov     edi, JSAMPROW [edi]                             ; outptr

        pop     ecx                             ; col

        alignx  16,7
.columnloop:
        movpic  eax, POINTER [gotptr]           ; load GOT address (eax)

        movq    mm6, MMWORD [ebx]               ; mm6=Cb(01234567)
        movq    mm7, MMWORD [edx]               ; mm7=Cr(01234567)

        pxor    mm1,mm1                         ; mm1=(all 0's)
        pcmpeqw mm3,mm3
        psllw   mm3,7                   ; mm3={0xFF80 0xFF80 0xFF80 0xFF80}

        ; Zero-extend the chroma bytes to words, then add 0xFF80 (= -128)
        ; to each word to remove the chroma bias.
        movq      mm4,mm6
        punpckhbw mm6,mm1                       ; mm6=Cb(4567)=CbH
        punpcklbw mm4,mm1                       ; mm4=Cb(0123)=CbL
        movq      mm0,mm7
        punpckhbw mm7,mm1                       ; mm7=Cr(4567)=CrH
        punpcklbw mm0,mm1                       ; mm0=Cr(0123)=CrL

        paddw   mm6,mm3
        paddw   mm4,mm3
        paddw   mm7,mm3
        paddw   mm0,mm3

        ; (Original)
        ; R = Y                + 1.40200 * Cr
        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
        ; B = Y + 1.77200 * Cb
        ;
        ; (This implementation)
        ; R = Y                + 0.40200 * Cr + Cr
        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
        ; B = Y - 0.22800 * Cb + Cb + Cb

        movq    mm5,mm6                         ; mm5=CbH
        movq    mm2,mm4                         ; mm2=CbL
        paddw   mm6,mm6                         ; mm6=2*CbH
        paddw   mm4,mm4                         ; mm4=2*CbL
        movq    mm1,mm7                         ; mm1=CrH
        movq    mm3,mm0                         ; mm3=CrL
        paddw   mm7,mm7                         ; mm7=2*CrH
        paddw   mm0,mm0                         ; mm0=2*CrL

        pmulhw  mm6,[GOTOFF(eax,PW_MF0228)]     ; mm6=(2*CbH * -FIX(0.22800))
        pmulhw  mm4,[GOTOFF(eax,PW_MF0228)]     ; mm4=(2*CbL * -FIX(0.22800))
        pmulhw  mm7,[GOTOFF(eax,PW_F0402)]      ; mm7=(2*CrH * FIX(0.40200))
        pmulhw  mm0,[GOTOFF(eax,PW_F0402)]      ; mm0=(2*CrL * FIX(0.40200))

        ; Add PW_ONE before the arithmetic shift right by 1.
        paddw   mm6,[GOTOFF(eax,PW_ONE)]
        paddw   mm4,[GOTOFF(eax,PW_ONE)]
        psraw   mm6,1                           ; mm6=(CbH * -FIX(0.22800))
        psraw   mm4,1                           ; mm4=(CbL * -FIX(0.22800))
        paddw   mm7,[GOTOFF(eax,PW_ONE)]
        paddw   mm0,[GOTOFF(eax,PW_ONE)]
        psraw   mm7,1                           ; mm7=(CrH * FIX(0.40200))
        psraw   mm0,1                           ; mm0=(CrL * FIX(0.40200))

        paddw   mm6,mm5
        paddw   mm4,mm2
        paddw   mm6,mm5                 ; mm6=(CbH * FIX(1.77200))=(B-Y)H
        paddw   mm4,mm2                 ; mm4=(CbL * FIX(1.77200))=(B-Y)L
        paddw   mm7,mm1                 ; mm7=(CrH * FIX(1.40200))=(R-Y)H
        paddw   mm0,mm3                 ; mm0=(CrL * FIX(1.40200))=(R-Y)L

        ; Park the H-half results; mm6/mm7 are needed as temporaries below.
        movq    MMWORD [wk(0)], mm6             ; wk(0)=(B-Y)H
        movq    MMWORD [wk(1)], mm7             ; wk(1)=(R-Y)H

        ; Interleave Cb and Cr words so one pmaddwd per dword yields
        ; Cb*-FIX(0.344) + Cr*FIX(0.285).
        movq      mm6,mm5
        movq      mm7,mm2
        punpcklwd mm5,mm1
        punpckhwd mm6,mm1
        pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
        pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
        punpcklwd mm2,mm3
        punpckhwd mm7,mm3
        pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
        pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]

        paddd   mm5,[GOTOFF(eax,PD_ONEHALF)]
        paddd   mm6,[GOTOFF(eax,PD_ONEHALF)]
        psrad   mm5,SCALEBITS
        psrad   mm6,SCALEBITS
        paddd   mm2,[GOTOFF(eax,PD_ONEHALF)]
        paddd   mm7,[GOTOFF(eax,PD_ONEHALF)]
        psrad   mm2,SCALEBITS
        psrad   mm7,SCALEBITS

        packssdw mm5,mm6        ; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
        packssdw mm2,mm7        ; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
        psubw    mm5,mm1        ; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
        psubw    mm2,mm3        ; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L

        movq    MMWORD [wk(2)], mm5             ; wk(2)=(G-Y)H

        ; From here on only al (Yctr) of eax is meaningful; the GOT address
        ; is reloaded at the next .columnloop pass.
        mov     al,2                            ; Yctr
        jmp     short .Yloop_1st
        alignx  16,7

.Yloop_2nd:
        ; Second Y pass: switch to the H-half chroma terms saved in wk[].
        movq    mm0, MMWORD [wk(1)]             ; mm0=(R-Y)H
        movq    mm2, MMWORD [wk(2)]             ; mm2=(G-Y)H
        movq    mm4, MMWORD [wk(0)]             ; mm4=(B-Y)H
        alignx  16,7

.Yloop_1st:
        movq    mm7, MMWORD [esi]               ; mm7=Y(01234567)

        ; Split Y into even and odd samples (as words) so each chroma word
        ; is added to the two pixels it covers.
        pcmpeqw mm6,mm6
        psrlw   mm6,BYTE_BIT                    ; mm6={0xFF 0x00 0xFF 0x00 ..}
        pand    mm6,mm7                         ; mm6=Y(0246)=YE
        psrlw   mm7,BYTE_BIT                    ; mm7=Y(1357)=YO

        movq    mm1,mm0                         ; mm1=mm0=(R-Y)(L/H)
        movq    mm3,mm2                         ; mm3=mm2=(G-Y)(L/H)
        movq    mm5,mm4                         ; mm5=mm4=(B-Y)(L/H)

        ; packuswb saturates each sum to an unsigned byte.
        paddw    mm0,mm6                ; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
        paddw    mm1,mm7                ; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
        packuswb mm0,mm0                ; mm0=(R0 R2 R4 R6 ** ** ** **)
        packuswb mm1,mm1                ; mm1=(R1 R3 R5 R7 ** ** ** **)

        paddw    mm2,mm6                ; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
        paddw    mm3,mm7                ; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
        packuswb mm2,mm2                ; mm2=(G0 G2 G4 G6 ** ** ** **)
        packuswb mm3,mm3                ; mm3=(G1 G3 G5 G7 ** ** ** **)

        paddw    mm4,mm6                ; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
        paddw    mm5,mm7                ; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
        packuswb mm4,mm4                ; mm4=(B0 B2 B4 B6 ** ** ** **)
        packuswb mm5,mm5                ; mm5=(B1 B3 B5 B7 ** ** ** **)

%if RGB_PIXELSIZE == 3 ; ---------------

        ; In the lane diagrams "cp" means component c of pixel p.

        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
        ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)

        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
        punpcklbw mmE,mmB               ; mmE=(20 01 22 03 24 05 26 07)
        punpcklbw mmD,mmF               ; mmD=(11 21 13 23 15 25 17 27)

        movq      mmG,mmA
        movq      mmH,mmA
        punpcklwd mmA,mmE               ; mmA=(00 10 20 01 02 12 22 03)
        punpckhwd mmG,mmE               ; mmG=(04 14 24 05 06 16 26 07)

        psrlq     mmH,2*BYTE_BIT        ; mmH=(02 12 04 14 06 16 -- --)
        psrlq     mmE,2*BYTE_BIT        ; mmE=(22 03 24 05 26 07 -- --)

        movq      mmC,mmD
        movq      mmB,mmD
        punpcklwd mmD,mmH               ; mmD=(11 21 02 12 13 23 04 14)
        punpckhwd mmC,mmH               ; mmC=(15 25 06 16 17 27 -- --)

        psrlq     mmB,2*BYTE_BIT        ; mmB=(13 23 15 25 17 27 -- --)

        movq      mmF,mmE
        punpcklwd mmE,mmB               ; mmE=(22 03 13 23 24 05 15 25)
        punpckhwd mmF,mmB               ; mmF=(26 07 17 27 -- -- -- --)

        punpckldq mmA,mmD               ; mmA=(00 10 20 01 11 21 02 12)
        punpckldq mmE,mmG               ; mmE=(22 03 13 23 04 14 24 05)
        punpckldq mmC,mmF               ; mmC=(15 25 06 16 26 07 17 27)

        ; Fewer than 8 columns left: take the partial-store path.
        cmp     ecx, byte SIZEOF_MMWORD
        jb      short .column_st16

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC

        sub     ecx, byte SIZEOF_MMWORD
        jz      near .endcolumn

        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
        add     esi, byte SIZEOF_MMWORD                 ; inptr0
        dec     al                                      ; Yctr
        jnz     near .Yloop_2nd

        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
        add     edx, byte SIZEOF_MMWORD                 ; inptr2
        jmp     near .columnloop
        alignx  16,7

        ; Tail: ecx is converted from columns to bytes, then the remaining
        ; bytes are written in 16/8/4/2/1-byte steps so nothing is stored
        ; past the end of the row.
.column_st16:
        lea     ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
        cmp     ecx, byte 2*SIZEOF_MMWORD
        jb      short .column_st8
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
        movq    mmA,mmC
        sub     ecx, byte 2*SIZEOF_MMWORD
        add     edi, byte 2*SIZEOF_MMWORD
        jmp     short .column_st4
.column_st8:
        cmp     ecx, byte SIZEOF_MMWORD
        jb      short .column_st4
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    mmA,mmE
        sub     ecx, byte SIZEOF_MMWORD
        add     edi, byte SIZEOF_MMWORD
.column_st4:
        movd    eax,mmA                 ; eax is free here: GOT/Yctr no longer needed
        cmp     ecx, byte SIZEOF_DWORD
        jb      short .column_st2
        mov     DWORD [edi+0*SIZEOF_DWORD], eax
        psrlq   mmA,DWORD_BIT
        movd    eax,mmA
        sub     ecx, byte SIZEOF_DWORD
        add     edi, byte SIZEOF_DWORD
.column_st2:
        cmp     ecx, byte SIZEOF_WORD
        jb      short .column_st1
        mov     WORD [edi+0*SIZEOF_WORD], ax
        shr     eax,WORD_BIT
        sub     ecx, byte SIZEOF_WORD
        add     edi, byte SIZEOF_WORD
.column_st1:
        cmp     ecx, byte SIZEOF_BYTE
        jb      short .endcolumn
        mov     BYTE [edi+0*SIZEOF_BYTE], al

%else ; RGB_PIXELSIZE == 4 ; -----------

        ; mm6/mm7 (the Y temporaries) are free now and become the filler
        ; component: all ones with RGBX_FILLER_0XFF, zero otherwise.
%ifdef RGBX_FILLER_0XFF
        pcmpeqb   mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
        pcmpeqb   mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
%else
        pxor      mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
        pxor      mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
%endif
        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
        ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)

        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
        punpcklbw mmE,mmG               ; mmE=(20 30 22 32 24 34 26 36)
        punpcklbw mmB,mmD               ; mmB=(01 11 03 13 05 15 07 17)
        punpcklbw mmF,mmH               ; mmF=(21 31 23 33 25 35 27 37)

        movq      mmC,mmA
        punpcklwd mmA,mmE               ; mmA=(00 10 20 30 02 12 22 32)
        punpckhwd mmC,mmE               ; mmC=(04 14 24 34 06 16 26 36)
        movq      mmG,mmB
        punpcklwd mmB,mmF               ; mmB=(01 11 21 31 03 13 23 33)
        punpckhwd mmG,mmF               ; mmG=(05 15 25 35 07 17 27 37)

        movq      mmD,mmA
        punpckldq mmA,mmB               ; mmA=(00 10 20 30 01 11 21 31)
        punpckhdq mmD,mmB               ; mmD=(02 12 22 32 03 13 23 33)
        movq      mmH,mmC
        punpckldq mmC,mmG               ; mmC=(04 14 24 34 05 15 25 35)
        punpckhdq mmH,mmG               ; mmH=(06 16 26 36 07 17 27 37)

        ; Fewer than 8 columns left: take the partial-store path.
        cmp     ecx, byte SIZEOF_MMWORD
        jb      short .column_st16

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mmH

        sub     ecx, byte SIZEOF_MMWORD
        jz      short .endcolumn

        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
        add     esi, byte SIZEOF_MMWORD                 ; inptr0
        dec     al                                      ; Yctr
        jnz     near .Yloop_2nd

        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
        add     edx, byte SIZEOF_MMWORD                 ; inptr2
        jmp     near .columnloop
        alignx  16,7

        ; Tail: ecx stays in columns here (4 / 2 / 1 pixels per step, each
        ; mmword holding two 4-byte pixels).
.column_st16:
        cmp     ecx, byte SIZEOF_MMWORD/2
        jb      short .column_st8
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
        movq    mmA,mmC
        movq    mmD,mmH
        sub     ecx, byte SIZEOF_MMWORD/2
        add     edi, byte 2*SIZEOF_MMWORD
.column_st8:
        cmp     ecx, byte SIZEOF_MMWORD/4
        jb      short .column_st4
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    mmA,mmD
        sub     ecx, byte SIZEOF_MMWORD/4
        add     edi, byte 1*SIZEOF_MMWORD
.column_st4:
        cmp     ecx, byte SIZEOF_MMWORD/8
        jb      short .endcolumn
        movd    DWORD [edi+0*SIZEOF_DWORD], mmA

%endif ; RGB_PIXELSIZE ; ---------------

.endcolumn:
        emms                                    ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx                             ; need not be preserved
;       pop     ecx                             ; need not be preserved
        pop     ebx
        mov     esp,ebp                         ; esp <- aligned ebp
        pop     esp                             ; esp <- original ebp
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
;
; GLOBAL(void)
; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
;                                 JSAMPIMAGE input_buf,
;                                 JDIMENSION in_row_group_ctr,
;                                 JSAMPARRAY output_buf);
;
; Produces two output rows by calling jsimd_h2v1_merged_upsample_mmx twice
; with a temporary three-entry input_buf array and argument frame built on
; the stack.
;
; Calling convention (as used by this code): 32-bit x86, all four arguments
; on the stack at ebp+8/+12/+16/+20.  ebx, esi, edi and ebp are saved and
; restored; eax, ecx and edx are used as scratch.
;
; How the rows are selected:
;   * The temporary input_buf[0] is input_buf[0] advanced by
;     in_row_group_ctr entries.  The callee indexes it by in_row_group_ctr
;     again, so the first call reads luma row 2*in_row_group_ctr.
;   * input_buf[1] and input_buf[2] are passed unchanged, so both calls read
;     chroma row in_row_group_ctr.
;   * Before the second call, the temporary input_buf[0] and the output_buf
;     argument slot are each advanced by one row pointer, selecting luma row
;     2*in_row_group_ctr+1 and output_buf[1].
;
; The second call relies on the callee preserving ebx/esi/edi and leaving
; the other stack arguments (output_width, in_row_group_ctr, input_buf)
; untouched, since they are not pushed again.
;

%define output_width(b)         (b)+8           ; JDIMENSION output_width
%define input_buf(b)            (b)+12          ; JSAMPIMAGE input_buf
%define in_row_group_ctr(b)     (b)+16          ; JDIMENSION in_row_group_ctr
%define output_buf(b)           (b)+20          ; JSAMPARRAY output_buf

        align   16
        global  EXTN(jsimd_h2v2_merged_upsample_mmx)

EXTN(jsimd_h2v2_merged_upsample_mmx):
        push    ebp
        mov     ebp,esp
        push    ebx
;       push    ecx                             ; need not be preserved
;       push    edx                             ; need not be preserved
        push    esi
        push    edi

        mov     eax, JDIMENSION [output_width(ebp)]

        mov     edi, JSAMPIMAGE [input_buf(ebp)]
        mov     ecx, JDIMENSION [in_row_group_ctr(ebp)]
        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
        mov     edi, JSAMPARRAY [output_buf(ebp)]
        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]  ; pre-advance the luma row array

        ; Temporary input_buf[3], pushed last-to-first so that ebx ends up
        ; pointing at element 0.
        push    edx                             ; inptr2
        push    ebx                             ; inptr1
        push    esi                             ; inptr00
        mov     ebx,esp

        ; Argument frame for the callee, pushed right to left.  The
        ; output_buf slot lands at [ebx-1*SIZEOF_POINTER].
        push    edi                             ; output_buf (outptr0)
        push    ecx                             ; in_row_group_ctr
        push    ebx                             ; input_buf
        push    eax                             ; output_width

        call    near EXTN(jsimd_h2v1_merged_upsample_mmx)

        ; Patch the frame in place for the second row instead of rebuilding it.
        add     esi, byte SIZEOF_JSAMPROW       ; inptr01
        add     edi, byte SIZEOF_JSAMPROW       ; outptr1
        mov     POINTER [ebx+0*SIZEOF_POINTER], esi
        mov     POINTER [ebx-1*SIZEOF_POINTER], edi

        call    near EXTN(jsimd_h2v1_merged_upsample_mmx)

        ; Drop the 4 arguments plus the 3-entry temporary array.
        add     esp, byte 7*SIZEOF_DWORD

        pop     edi
        pop     esi
;       pop     edx                             ; need not be preserved
;       pop     ecx                             ; need not be preserved
        pop     ebx
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16