media/libjpeg/simd/jdmrgmmx.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 ;
     2 ; jdmrgmmx.asm - merged upsampling/color conversion (MMX)
     3 ;
     4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
     5 ;
     6 ; Based on
     7 ; x86 SIMD extension for IJG JPEG library
     8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
    10 ;
    11 ; This file should be assembled with NASM (Netwide Assembler); it
    12 ; can *not* be assembled with Microsoft's MASM or any compatible
    13 ; assembler (including Borland's Turbo Assembler).
    14 ; NASM is available from http://nasm.sourceforge.net/ or
    15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
    16 ;
    17 ; [TAB8]
    19 %include "jcolsamp.inc"
    21 ; --------------------------------------------------------------------------
    22 ;
    23 ; Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
    24 ;
    25 ; GLOBAL(void)
    26 ; jsimd_h2v1_merged_upsample_mmx (JDIMENSION output_width,
    27 ;                                 JSAMPIMAGE input_buf,
    28 ;                                 JDIMENSION in_row_group_ctr,
    29 ;                                 JSAMPARRAY output_buf);
    30 ;
       ; Arguments are read from the stack, the first one at [frame base + 8]
       ; (see the %defines below).  The routine returns immediately when
       ; output_width is zero.
       ;
       ; Rows used:
       ;   inptr0 = input_buf[0][in_row_group_ctr]   (Y)
       ;   inptr1 = input_buf[1][in_row_group_ctr]   (Cb)
       ;   inptr2 = input_buf[2][in_row_group_ctr]   (Cr)
       ;   outptr = output_buf[0]
       ;
       ; Each pass of .columnloop loads 8 Cb and 8 Cr samples and converts
       ; them once into (R-Y), (G-Y) and (B-Y) terms.  Those terms are then
       ; combined with two groups of 8 Y samples (Yctr = 2), every chroma
       ; sample being shared by one even and one odd Y sample.  Each group
       ; yields 8 output pixels of RGB_PIXELSIZE bytes.
       ;
       ; Input rows are always read with full 8-byte loads; only the stores
       ; are trimmed to output_width (.column_st16 and following).
       ; NOTE(review): this presumably relies on the caller padding the
       ; input rows to a multiple of the load size -- confirm.
       ;
       ; Registers: ebx, esi, edi and ebp are saved and restored.  eax, ecx,
       ; edx and the MMX registers are clobbered.  emms is executed on every
       ; path that has touched MMX registers.
       ;
       ; pushpic, get_GOT, movpic, GOTOFF, alignx and the mmA..mmH register
       ; aliases are not defined in this file (presumably they come from the
       ; included headers).
    32 %define output_width(b)	(b)+8			; JDIMENSION output_width
    33 %define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
    34 %define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
    35 %define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
       ; Work area: wk(0)..wk(WK_NUM-1) sit directly below the aligned ebp,
       ; and gotptr is the pointer-sized slot just below wk(0).
    37 %define original_ebp	ebp+0
    38 %define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
    39 %define WK_NUM		3
    40 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
    42 	align	16
    43 	global	EXTN(jsimd_h2v1_merged_upsample_mmx)
    45 EXTN(jsimd_h2v1_merged_upsample_mmx):
       ; Prologue: build an MMWORD-aligned frame.  The unaligned frame base
       ; is stored at [aligned ebp] so that the epilogue can restore it with
       ; "pop esp".
    46 	push	ebp
    47 	mov	eax,esp				; eax = original ebp (frame base: [eax]=saved ebp, args from [eax+8])
    48 	sub	esp, byte 4
    49 	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
    50 	mov	[esp],eax
    51 	mov	ebp,esp				; ebp = aligned ebp
    52 	lea	esp, [wk(0)]
    53 	pushpic	eax		; make room for the GOT address
    54 	push	ebx
    55 ;	push	ecx		; need not be preserved
    56 ;	push	edx		; need not be preserved
    57 	push	esi
    58 	push	edi
    60 	get_GOT	ebx			; get GOT address
    61 	movpic	POINTER [gotptr], ebx	; save GOT address
    63 	mov	ecx, JDIMENSION [output_width(eax)]	; col
    64 	test	ecx,ecx
    65 	jz	near .return
       ; col is parked on the stack while ecx serves as the row index.
    67 	push	ecx
    69 	mov	edi, JSAMPIMAGE [input_buf(eax)]
    70 	mov	ecx, JDIMENSION [in_row_group_ctr(eax)]
    71 	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
    72 	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
    73 	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
    74 	mov	edi, JSAMPARRAY [output_buf(eax)]
    75 	mov	esi, JSAMPROW [esi+ecx*SIZEOF_JSAMPROW]		; inptr0
    76 	mov	ebx, JSAMPROW [ebx+ecx*SIZEOF_JSAMPROW]		; inptr1
    77 	mov	edx, JSAMPROW [edx+ecx*SIZEOF_JSAMPROW]		; inptr2
    78 	mov	edi, JSAMPROW [edi]				; outptr
    80 	pop	ecx			; col
    82 	alignx	16,7
    83 .columnloop:
       ; From here on: esi=inptr0 (Y), ebx=inptr1 (Cb), edx=inptr2 (Cr),
       ; edi=outptr, ecx=output columns still to be written.
       ; eax is reloaded on every pass because al is reused as Yctr below
       ; and eax is overwritten by the partial-store code.
    84 	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
    86 	movq      mm6, MMWORD [ebx]	; mm6=Cb(01234567)
    87 	movq      mm7, MMWORD [edx]	; mm7=Cr(01234567)
    89 	pxor      mm1,mm1		; mm1=(all 0's)
    90 	pcmpeqw   mm3,mm3
    91 	psllw     mm3,7			; mm3={0xFF80 0xFF80 0xFF80 0xFF80}
       ; Zero-extend the chroma bytes to words (interleave with mm1 = 0).
    93 	movq      mm4,mm6
    94 	punpckhbw mm6,mm1		; mm6=Cb(4567)=CbH
    95 	punpcklbw mm4,mm1		; mm4=Cb(0123)=CbL
    96 	movq      mm0,mm7
    97 	punpckhbw mm7,mm1		; mm7=Cr(4567)=CrH
    98 	punpcklbw mm0,mm1		; mm0=Cr(0123)=CrL
       ; Adding 0xFF80 (-128 as a signed word) re-centres Cb/Cr around zero.
   100 	paddw     mm6,mm3
   101 	paddw     mm4,mm3
   102 	paddw     mm7,mm3
   103 	paddw     mm0,mm3
   105 	; (Original)
   106 	; R = Y                + 1.40200 * Cr
   107 	; G = Y - 0.34414 * Cb - 0.71414 * Cr
   108 	; B = Y + 1.77200 * Cb
   109 	;
   110 	; (This implementation)
   111 	; R = Y                + 0.40200 * Cr + Cr
   112 	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
   113 	; B = Y - 0.22800 * Cb + Cb + Cb
       ; Keep unscaled copies (mm5/mm2 = Cb, mm1/mm3 = Cr) for the "+Cb+Cb",
       ; "+Cr" and "-Cr" terms, and double the operands that go into pmulhw.
   115 	movq	mm5,mm6			; mm5=CbH
   116 	movq	mm2,mm4			; mm2=CbL
   117 	paddw	mm6,mm6			; mm6=2*CbH
   118 	paddw	mm4,mm4			; mm4=2*CbL
   119 	movq	mm1,mm7			; mm1=CrH
   120 	movq	mm3,mm0			; mm3=CrL
   121 	paddw	mm7,mm7			; mm7=2*CrH
   122 	paddw	mm0,mm0			; mm0=2*CrL
   124 	pmulhw	mm6,[GOTOFF(eax,PW_MF0228)]	; mm6=(2*CbH * -FIX(0.22800))
   125 	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbL * -FIX(0.22800))
   126 	pmulhw	mm7,[GOTOFF(eax,PW_F0402)]	; mm7=(2*CrH * FIX(0.40200))
   127 	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrL * FIX(0.40200))
       ; Halve the doubled products with rounding: (x + 1) >> 1.
   129 	paddw	mm6,[GOTOFF(eax,PW_ONE)]
   130 	paddw	mm4,[GOTOFF(eax,PW_ONE)]
   131 	psraw	mm6,1			; mm6=(CbH * -FIX(0.22800))
   132 	psraw	mm4,1			; mm4=(CbL * -FIX(0.22800))
   133 	paddw	mm7,[GOTOFF(eax,PW_ONE)]
   134 	paddw	mm0,[GOTOFF(eax,PW_ONE)]
   135 	psraw	mm7,1			; mm7=(CrH * FIX(0.40200))
   136 	psraw	mm0,1			; mm0=(CrL * FIX(0.40200))
   138 	paddw	mm6,mm5
   139 	paddw	mm4,mm2
   140 	paddw	mm6,mm5			; mm6=(CbH * FIX(1.77200))=(B-Y)H
   141 	paddw	mm4,mm2			; mm4=(CbL * FIX(1.77200))=(B-Y)L
   142 	paddw	mm7,mm1			; mm7=(CrH * FIX(1.40200))=(R-Y)H
   143 	paddw	mm0,mm3			; mm0=(CrL * FIX(1.40200))=(R-Y)L
       ; The high-half terms are spilled to wk[]; they are reloaded at
       ; .Yloop_2nd for the second group of Y samples.
   145 	movq	MMWORD [wk(0)], mm6	; wk(0)=(B-Y)H
   146 	movq	MMWORD [wk(1)], mm7	; wk(1)=(R-Y)H
       ; G term: interleave Cb and Cr words so that each pmaddwd produces
       ; one dword holding Cb*const + Cr*const for a single sample.
   148 	movq      mm6,mm5
   149 	movq      mm7,mm2
   150 	punpcklwd mm5,mm1
   151 	punpckhwd mm6,mm1
   152 	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
   153 	pmaddwd   mm6,[GOTOFF(eax,PW_MF0344_F0285)]
   154 	punpcklwd mm2,mm3
   155 	punpckhwd mm7,mm3
   156 	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
   157 	pmaddwd   mm7,[GOTOFF(eax,PW_MF0344_F0285)]
       ; Descale the dword products: add PD_ONEHALF, then shift right
       ; arithmetically by SCALEBITS.
   159 	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
   160 	paddd     mm6,[GOTOFF(eax,PD_ONEHALF)]
   161 	psrad     mm5,SCALEBITS
   162 	psrad     mm6,SCALEBITS
   163 	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
   164 	paddd     mm7,[GOTOFF(eax,PD_ONEHALF)]
   165 	psrad     mm2,SCALEBITS
   166 	psrad     mm7,SCALEBITS
   168 	packssdw  mm5,mm6	; mm5=CbH*-FIX(0.344)+CrH*FIX(0.285)
   169 	packssdw  mm2,mm7	; mm2=CbL*-FIX(0.344)+CrL*FIX(0.285)
   170 	psubw     mm5,mm1	; mm5=CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H
   171 	psubw     mm2,mm3	; mm2=CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L
   173 	movq	MMWORD [wk(2)], mm5	; wk(2)=(G-Y)H
       ; At this point mm0=(R-Y)L, mm2=(G-Y)L and mm4=(B-Y)L, ready for the
       ; first Y group.  Writing al overwrites the low byte of the GOT
       ; address in eax; no GOT-relative access follows until eax is
       ; reloaded at .columnloop.
   175 	mov	al,2			; Yctr
   176 	jmp	short .Yloop_1st
   177 	alignx	16,7
   179 .Yloop_2nd:
       ; Second group of 8 Y samples: use the high-half chroma terms.
   180 	movq	mm0, MMWORD [wk(1)]	; mm0=(R-Y)H
   181 	movq	mm2, MMWORD [wk(2)]	; mm2=(G-Y)H
   182 	movq	mm4, MMWORD [wk(0)]	; mm4=(B-Y)H
   183 	alignx	16,7
   185 .Yloop_1st:
   186 	movq	mm7, MMWORD [esi]	; mm7=Y(01234567)
       ; Split Y into even-indexed and odd-indexed samples, one per word.
   188 	pcmpeqw	mm6,mm6
   189 	psrlw	mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
   190 	pand	mm6,mm7			; mm6=Y(0246)=YE
   191 	psrlw	mm7,BYTE_BIT		; mm7=Y(1357)=YO
   193 	movq	mm1,mm0			; mm1=mm0=(R-Y)(L/H)
   194 	movq	mm3,mm2			; mm3=mm2=(G-Y)(L/H)
   195 	movq	mm5,mm4			; mm5=mm4=(B-Y)(L/H)
       ; Add Y to each colour-difference term; packuswb then saturates the
       ; signed words to unsigned bytes (0..255).
   197 	paddw     mm0,mm6		; mm0=((R-Y)+YE)=RE=(R0 R2 R4 R6)
   198 	paddw     mm1,mm7		; mm1=((R-Y)+YO)=RO=(R1 R3 R5 R7)
   199 	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
   200 	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
   202 	paddw     mm2,mm6		; mm2=((G-Y)+YE)=GE=(G0 G2 G4 G6)
   203 	paddw     mm3,mm7		; mm3=((G-Y)+YO)=GO=(G1 G3 G5 G7)
   204 	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
   205 	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
   207 	paddw     mm4,mm6		; mm4=((B-Y)+YE)=BE=(B0 B2 B4 B6)
   208 	paddw     mm5,mm7		; mm5=((B-Y)+YO)=BO=(B1 B3 B5 B7)
   209 	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
   210 	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
       ; Interleave the per-component bytes into pixel order.  In the
       ; comments below each two-digit entry is (output byte position within
       ; the pixel, pixel index), e.g. "21" = byte 2 of pixel 1.
       ; NOTE(review): mmA..mmH are aliases for the mm registers that are
       ; defined outside this file, presumably according to the selected
       ; component order -- confirm in the included headers.
   212 %if RGB_PIXELSIZE == 3 ; ---------------
   214 	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
   215 	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
   216 	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
   217 	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
   219 	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
   220 	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
   221 	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
   223 	movq      mmG,mmA
   224 	movq      mmH,mmA
   225 	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
   226 	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
   228 	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
   229 	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
   231 	movq      mmC,mmD
   232 	movq      mmB,mmD
   233 	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
   234 	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
   236 	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
   238 	movq      mmF,mmE
   239 	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
   240 	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
   242 	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
   243 	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
   244 	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
       ; Fewer than SIZEOF_MMWORD columns left: take the partial-store path.
   246 	cmp	ecx, byte SIZEOF_MMWORD
   247 	jb	short .column_st16
       ; Full group: store three mmwords (8 pixels of 3 bytes).
   249 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
   250 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
   251 	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
   253 	sub	ecx, byte SIZEOF_MMWORD
   254 	jz	near .endcolumn
   256 	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
   257 	add	esi, byte SIZEOF_MMWORD			; inptr0
   258 	dec	al			; Yctr
   259 	jnz	near .Yloop_2nd
       ; Both Y groups are done: advance to the next 8 chroma samples.
   261 	add	ebx, byte SIZEOF_MMWORD			; inptr1
   262 	add	edx, byte SIZEOF_MMWORD			; inptr2
   263 	jmp	near .columnloop
   264 	alignx	16,7
   266 .column_st16:
       ; Partial store: ecx becomes the number of bytes still to write and
       ; is written out in pieces of 16, 8, 4, 2 and 1 bytes.  mmA always
       ; holds the next unwritten bytes.
   267 	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
   268 	cmp	ecx, byte 2*SIZEOF_MMWORD
   269 	jb	short .column_st8
   270 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
   271 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
   272 	movq	mmA,mmC
   273 	sub	ecx, byte 2*SIZEOF_MMWORD
   274 	add	edi, byte 2*SIZEOF_MMWORD
   275 	jmp	short .column_st4
   276 .column_st8:
   277 	cmp	ecx, byte SIZEOF_MMWORD
   278 	jb	short .column_st4
   279 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
   280 	movq	mmA,mmE
   281 	sub	ecx, byte SIZEOF_MMWORD
   282 	add	edi, byte SIZEOF_MMWORD
   283 .column_st4:
       ; eax (GOT address / Yctr) is overwritten here; the loop is not
       ; re-entered from this path.
   284 	movd	eax,mmA
   285 	cmp	ecx, byte SIZEOF_DWORD
   286 	jb	short .column_st2
   287 	mov	DWORD [edi+0*SIZEOF_DWORD], eax
   288 	psrlq	mmA,DWORD_BIT
   289 	movd	eax,mmA
   290 	sub	ecx, byte SIZEOF_DWORD
   291 	add	edi, byte SIZEOF_DWORD
   292 .column_st2:
   293 	cmp	ecx, byte SIZEOF_WORD
   294 	jb	short .column_st1
   295 	mov	WORD [edi+0*SIZEOF_WORD], ax
   296 	shr	eax,WORD_BIT
   297 	sub	ecx, byte SIZEOF_WORD
   298 	add	edi, byte SIZEOF_WORD
   299 .column_st1:
   300 	cmp	ecx, byte SIZEOF_BYTE
   301 	jb	short .endcolumn
   302 	mov	BYTE [edi+0*SIZEOF_BYTE], al
       ; Execution falls through to .endcolumn.
   304 %else ; RGB_PIXELSIZE == 4 ; -----------
       ; The fourth byte of every pixel is a filler: all ones when
       ; RGBX_FILLER_0XFF is defined, zero otherwise.
   306 %ifdef RGBX_FILLER_0XFF
   307 	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
   308 	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
   309 %else
   310 	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
   311 	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
   312 %endif
   313 	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
   314 	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
   315 	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
   316 	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
   318 	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
   319 	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
   320 	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
   321 	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
   323 	movq      mmC,mmA
   324 	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
   325 	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
   326 	movq      mmG,mmB
   327 	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
   328 	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
   330 	movq      mmD,mmA
   331 	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
   332 	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
   333 	movq      mmH,mmC
   334 	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
   335 	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
       ; Fewer than SIZEOF_MMWORD columns left: take the partial-store path.
   337 	cmp	ecx, byte SIZEOF_MMWORD
   338 	jb	short .column_st16
       ; Full group: store four mmwords (8 pixels of 4 bytes).
   340 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
   341 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
   342 	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
   343 	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
   345 	sub	ecx, byte SIZEOF_MMWORD
   346 	jz	short .endcolumn
   348 	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
   349 	add	esi, byte SIZEOF_MMWORD			; inptr0
   350 	dec	al			; Yctr
   351 	jnz	near .Yloop_2nd
       ; Both Y groups are done: advance to the next 8 chroma samples.
   353 	add	ebx, byte SIZEOF_MMWORD			; inptr1
   354 	add	edx, byte SIZEOF_MMWORD			; inptr2
   355 	jmp	near .columnloop
   356 	alignx	16,7
   358 .column_st16:
       ; Partial store: ecx still counts columns (pixels).  Two mmwords are
       ; written for SIZEOF_MMWORD/2 columns, one mmword for SIZEOF_MMWORD/4
       ; columns and one dword for the last column.  mmA (then mmD) always
       ; holds the next unwritten pixels.
   359 	cmp	ecx, byte SIZEOF_MMWORD/2
   360 	jb	short .column_st8
   361 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
   362 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
   363 	movq	mmA,mmC
   364 	movq	mmD,mmH
   365 	sub	ecx, byte SIZEOF_MMWORD/2
   366 	add	edi, byte 2*SIZEOF_MMWORD
   367 .column_st8:
   368 	cmp	ecx, byte SIZEOF_MMWORD/4
   369 	jb	short .column_st4
   370 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
   371 	movq	mmA,mmD
   372 	sub	ecx, byte SIZEOF_MMWORD/4
   373 	add	edi, byte 1*SIZEOF_MMWORD
   374 .column_st4:
   375 	cmp	ecx, byte SIZEOF_MMWORD/8
   376 	jb	short .endcolumn
   377 	movd	DWORD [edi+0*SIZEOF_DWORD], mmA
   379 %endif ; RGB_PIXELSIZE ; ---------------
   381 .endcolumn:
   382 	emms		; empty MMX state
   384 .return:
       ; Epilogue.  The zero-width early exit jumps straight here, skipping
       ; emms.  esp is reset to the aligned ebp, whose slot holds the
       ; unaligned frame base saved by the prologue.
   385 	pop	edi
   386 	pop	esi
   387 ;	pop	edx		; need not be preserved
   388 ;	pop	ecx		; need not be preserved
   389 	pop	ebx
   390 	mov	esp,ebp		; esp <- aligned ebp
   391 	pop	esp		; esp <- original ebp
   392 	pop	ebp
   393 	ret
   395 ; --------------------------------------------------------------------------
   396 ;
   397 ; Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
   398 ;
   399 ; GLOBAL(void)
   400 ; jsimd_h2v2_merged_upsample_mmx (JDIMENSION output_width,
   401 ;                                 JSAMPIMAGE input_buf,
   402 ;                                 JDIMENSION in_row_group_ctr,
   403 ;                                 JSAMPARRAY output_buf);
   404 ;
       ; Implemented as two calls to jsimd_h2v1_merged_upsample_mmx, one per
       ; output row.  Both calls use the same in_row_group_ctr and the same
       ; input_buf[1] / input_buf[2] arrays, so they read the same Cb and Cr
       ; rows.
       ;
       ; The Y row is selected by passing a temporary three-entry image
       ; array, built on the stack, whose entry 0 is input_buf[0] advanced by
       ; in_row_group_ctr row pointers.  The callee indexes that entry by
       ; in_row_group_ctr again, so the first call reads
       ; input_buf[0][2*in_row_group_ctr] and the second call (entry 0
       ; advanced by one more row pointer) reads
       ; input_buf[0][2*in_row_group_ctr+1].  The output goes to
       ; output_buf[0] and then to output_buf[1].
       ;
       ; Registers: ebx, esi, edi and ebp are saved and restored; eax, ecx
       ; and edx are not preserved.
   406 %define output_width(b)	(b)+8			; JDIMENSION output_width
   407 %define input_buf(b)		(b)+12		; JSAMPIMAGE input_buf
   408 %define in_row_group_ctr(b)	(b)+16		; JDIMENSION in_row_group_ctr
   409 %define output_buf(b)		(b)+20		; JSAMPARRAY output_buf
   411 	align	16
   412 	global	EXTN(jsimd_h2v2_merged_upsample_mmx)
   414 EXTN(jsimd_h2v2_merged_upsample_mmx):
   415 	push	ebp
   416 	mov	ebp,esp
   417 	push	ebx
   418 ;	push	ecx		; need not be preserved
   419 ;	push	edx		; need not be preserved
   420 	push	esi
   421 	push	edi
   423 	mov	eax, JDIMENSION [output_width(ebp)]
   425 	mov	edi, JSAMPIMAGE [input_buf(ebp)]
   426 	mov	ecx, JDIMENSION [in_row_group_ctr(ebp)]
   427 	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
   428 	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
   429 	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
   430 	mov	edi, JSAMPARRAY [output_buf(ebp)]
       ; esi = address of row-pointer entry in_row_group_ctr of input_buf[0];
       ; it becomes entry 0 of the temporary image array.
   431 	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
       ; Build the temporary image array {esi, input_buf[1], input_buf[2]}
       ; on the stack (pushed in reverse order); ebx points at its entry 0.
   433 	push	edx			; inptr2
   434 	push	ebx			; inptr1
   435 	push	esi			; inptr00
   436 	mov	ebx,esp
       ; Push the four callee arguments, last argument first.  The
       ; output_buf argument ends up in the slot directly below the
       ; temporary array, i.e. at [ebx-1*SIZEOF_POINTER].
   438 	push	edi			; output_buf (outptr0)
   439 	push	ecx			; in_row_group_ctr
   440 	push	ebx			; input_buf
   441 	push	eax			; output_width
   443 	call	near EXTN(jsimd_h2v1_merged_upsample_mmx)
       ; Second row: advance the Y entry and the output_buf argument by one
       ; row pointer and patch them in place on the stack.  This relies on
       ; the callee preserving ebx, esi and edi and leaving its stack
       ; arguments unmodified.
   445 	add	esi, byte SIZEOF_JSAMPROW	; inptr01
   446 	add	edi, byte SIZEOF_JSAMPROW	; outptr1
   447 	mov	POINTER [ebx+0*SIZEOF_POINTER], esi
   448 	mov	POINTER [ebx-1*SIZEOF_POINTER], edi
   450 	call	near EXTN(jsimd_h2v1_merged_upsample_mmx)
       ; Drop the 4 argument slots and the 3 temporary array entries.
   452 	add	esp, byte 7*SIZEOF_DWORD
   454 	pop	edi
   455 	pop	esi
   456 ;	pop	edx		; need not be preserved
   457 ;	pop	ecx		; need not be preserved
   458 	pop	ebx
   459 	pop	ebp
   460 	ret
   462 ; For some reason, the OS X linker does not honor the request to align the
   463 ; segment unless we do this.
   464 	align	16

mercurial