media/libjpeg/simd/jdsammmx.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.

     1 ;
     2 ; jdsammmx.asm - upsampling (MMX)
     3 ;
     4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
     5 ;
     6 ; Based on
     7 ; x86 SIMD extension for IJG JPEG library
     8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
    10 ;
    11 ; This file should be assembled with NASM (Netwide Assembler); it
    12 ; can *not* be assembled with Microsoft's MASM or any compatible
    13 ; assembler (including Borland's Turbo Assembler).
    14 ; NASM is available from http://nasm.sourceforge.net/ or
    15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
    16 ;
    17 ; [TAB8]
    19 %include "jsimdext.inc"
    21 ; --------------------------------------------------------------------------
    22 	SECTION	SEG_CONST
       ; Word constants for the two "fancy" (triangle filter) upsamplers in
       ; this file.  Each entry is four 16-bit words, i.e. one 64-bit MMX
       ; memory operand.  They are addressed through GOTOFF(ebx, ...).
    24 	alignz	16
    25 	global	EXTN(jconst_fancy_upsample_mmx)
    27 EXTN(jconst_fancy_upsample_mmx):
    29 PW_ONE		times 4 dw  1	; h2v1: added to the even outputs before >>2
    30 PW_TWO		times 4 dw  2	; h2v1: added to the odd outputs before >>2
    31 PW_THREE	times 4 dw  3	; pmullw weight of the nearer sample
    32 PW_SEVEN	times 4 dw  7	; h2v2: added to the odd outputs before >>4
    33 PW_EIGHT	times 4 dw  8	; h2v2: added to the even outputs before >>4
    35 	alignz	16
    37 ; --------------------------------------------------------------------------
    38 	SECTION	SEG_TEXT
    39 	BITS	32
    40 ;
    41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
    42 ;
    43 ; The upsampling algorithm is linear interpolation between pixel centers,
    44 ; also known as a "triangle filter".  This is a good compromise between
    45 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
    46 ; of the way between input pixel centers.
    47 ;
    48 ; GLOBAL(void)
    49 ; jsimd_h2v1_fancy_upsample_mmx (int max_v_samp_factor,
    50 ;                                JDIMENSION downsampled_width,
    51 ;                                JSAMPARRAY input_data,
    52 ;                                JSAMPARRAY * output_data_ptr);
    53 ;
       ; For every input sample s[i] the code below emits two output samples:
       ;   out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
       ;   out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
       ; For the first block s[-1] is taken to be s[0]; in the last column
       ; block the right neighbour of sample 7 is sample 7 itself.
       ;
       ; Register usage inside the loops:
       ;   eax = colctr (rounded up to a multiple of SIZEOF_MMWORD)
       ;   ecx = rowctr
       ;   esi = input_data (row pointer array), then inptr inside a row
       ;   edi = output_data (row pointer array), then outptr inside a row
       ;   ebx = GOT address used by the GOTOFF() constant references
       ;   mm0 = all zeros (for byte->word unpacking)
       ;   mm7 = left neighbour of the current block (in the lowest byte)
       ;   mm6 = right neighbour of the current block (in the highest byte)
       ; edx (dl) is used as scratch.  Returns nothing.
       ;
       ; pushpic/poppic/get_GOT/GOTOFF/alignx are macros defined outside this
       ; file (presumably in jsimdext.inc).
    55 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor
    56 %define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
    57 %define input_data(b)		(b)+16		; JSAMPARRAY input_data
    58 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
    60 	align	16
    61 	global	EXTN(jsimd_h2v1_fancy_upsample_mmx)
    63 EXTN(jsimd_h2v1_fancy_upsample_mmx):
    64 	push	ebp
    65 	mov	ebp,esp
    66 	pushpic	ebx
    67 ;	push	ecx		; need not be preserved
    68 ;	push	edx		; need not be preserved
    69 	push	esi
    70 	push	edi
    72 	get_GOT	ebx		; get GOT address
    74 	mov	eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
    75 	test	eax,eax
    76 	jz	near .return		; zero width: nothing to do
    78 	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
    79 	test	ecx,ecx
    80 	jz	near .return		; zero rows: nothing to do
    82 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
    83 	mov	edi, POINTER [output_data_ptr(ebp)]
    84 	mov	edi, JSAMPARRAY [edi]			; output_data
    85 	alignx	16,7
    86 .rowloop:
       ; Save the per-row state; eax/esi/edi are reused inside the row.
    87 	push	eax			; colctr
    88 	push	edi
    89 	push	esi
    91 	mov	esi, JSAMPROW [esi]	; inptr
    92 	mov	edi, JSAMPROW [edi]	; outptr
       ; If the width is not a multiple of SIZEOF_MMWORD, copy the last real
       ; sample to the position just past the end of the input row, so that
       ; the right neighbour of the last sample is defined.
       ; NOTE(review): this stores into the caller's input row; it assumes
       ; the row buffer has room for one extra sample -- confirm at caller.
    94 	test	eax, SIZEOF_MMWORD-1
    95 	jz	short .skip
    96 	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    97 	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
    98 .skip:
    99 	pxor	mm0,mm0			; mm0=(all 0's)
       ; mm7 = sample 0 of the row in the lowest byte: the first block uses
       ; its own first sample as the missing left neighbour.
   100 	pcmpeqb	mm7,mm7			; mm7=(all 1's)
   101 	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT	; keep only the lowest byte set
   102 	pand	mm7, MMWORD [esi+0*SIZEOF_MMWORD]
   104 	add	eax, byte SIZEOF_MMWORD-1
   105 	and	eax, byte -SIZEOF_MMWORD	; round colctr up to whole MMWORDs
   106 	cmp	eax, byte SIZEOF_MMWORD
   107 	ja	short .columnloop	; more than one block left
   108 	alignx	16,7
   110 .columnloop_last:
       ; Last block of the row: there is no next block, so the right
       ; neighbour is sample 7 of this block, kept in the highest byte.
   111 	pcmpeqb	mm6,mm6
   112 	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT	; keep only the highest byte set
   113 	pand	mm6, MMWORD [esi+0*SIZEOF_MMWORD]
   114 	jmp	short .upsample
   115 	alignx	16,7
   117 .columnloop:
       ; Not the last block: the right neighbour is sample 0 of the next
       ; block, moved into the highest byte.
   118 	movq	mm6, MMWORD [esi+1*SIZEOF_MMWORD]
   119 	psllq	mm6,(SIZEOF_MMWORD-1)*BYTE_BIT
   121 .upsample:
   122 	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]
   123 	movq	mm2,mm1
   124 	movq	mm3,mm1			; mm1=( 0 1 2 3 4 5 6 7)
   125 	psllq	mm2,BYTE_BIT		; mm2=( - 0 1 2 3 4 5 6)
   126 	psrlq	mm3,BYTE_BIT		; mm3=( 1 2 3 4 5 6 7 -)
   128 	por	mm2,mm7			; mm2=(-1 0 1 2 3 4 5 6)
   129 	por	mm3,mm6			; mm3=( 1 2 3 4 5 6 7 8)
       ; Carry sample 7 into mm7: it is the left neighbour of the next block.
   131 	movq	mm7,mm1
   132 	psrlq	mm7,(SIZEOF_MMWORD-1)*BYTE_BIT	; mm7=( 7 - - - - - - -)
       ; Widen bytes to 16-bit words (mm0 supplies the zero high bytes).
   134 	movq      mm4,mm1
   135 	punpcklbw mm1,mm0		; mm1=( 0 1 2 3)
   136 	punpckhbw mm4,mm0		; mm4=( 4 5 6 7)
   137 	movq      mm5,mm2
   138 	punpcklbw mm2,mm0		; mm2=(-1 0 1 2)
   139 	punpckhbw mm5,mm0		; mm5=( 3 4 5 6)
   140 	movq      mm6,mm3
   141 	punpcklbw mm3,mm0		; mm3=( 1 2 3 4)
   142 	punpckhbw mm6,mm0		; mm6=( 5 6 7 8)
       ; mm1/mm4 = 3*current; left neighbours get +1, right neighbours get +2.
   144 	pmullw	mm1,[GOTOFF(ebx,PW_THREE)]
   145 	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
   146 	paddw	mm2,[GOTOFF(ebx,PW_ONE)]
   147 	paddw	mm5,[GOTOFF(ebx,PW_ONE)]
   148 	paddw	mm3,[GOTOFF(ebx,PW_TWO)]
   149 	paddw	mm6,[GOTOFF(ebx,PW_TWO)]
   151 	paddw	mm2,mm1
   152 	paddw	mm5,mm4
   153 	psrlw	mm2,2			; mm2=OutLE=( 0  2  4  6)
   154 	psrlw	mm5,2			; mm5=OutHE=( 8 10 12 14)
   155 	paddw	mm3,mm1
   156 	paddw	mm6,mm4
   157 	psrlw	mm3,2			; mm3=OutLO=( 1  3  5  7)
   158 	psrlw	mm6,2			; mm6=OutHO=( 9 11 13 15)
       ; Interleave: odd outputs go to the high byte of each word, even
       ; outputs stay in the low byte.
   160 	psllw	mm3,BYTE_BIT
   161 	psllw	mm6,BYTE_BIT
   162 	por	mm2,mm3			; mm2=OutL=( 0  1  2  3  4  5  6  7)
   163 	por	mm5,mm6			; mm5=OutH=( 8  9 10 11 12 13 14 15)
   165 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
   166 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm5
       ; One input MMWORD consumed, two output MMWORDs produced.
   168 	sub	eax, byte SIZEOF_MMWORD
   169 	add	esi, byte 1*SIZEOF_MMWORD	; inptr
   170 	add	edi, byte 2*SIZEOF_MMWORD	; outptr
   171 	cmp	eax, byte SIZEOF_MMWORD
   172 	ja	near .columnloop	; more than one block left
   173 	test	eax,eax
   174 	jnz	near .columnloop_last	; exactly one block left
   176 	pop	esi
   177 	pop	edi
   178 	pop	eax
   180 	add	esi, byte SIZEOF_JSAMPROW	; input_data
   181 	add	edi, byte SIZEOF_JSAMPROW	; output_data
   182 	dec	ecx				; rowctr
   183 	jg	near .rowloop
   185 	emms		; empty MMX state
   187 .return:
       ; The early exits above jump here before any MMX instruction has run,
       ; so skipping emms on those paths is harmless.
   188 	pop	edi
   189 	pop	esi
   190 ;	pop	edx		; need not be preserved
   191 ;	pop	ecx		; need not be preserved
   192 	poppic	ebx
   193 	pop	ebp
   194 	ret
   196 ; --------------------------------------------------------------------------
   197 ;
   198 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
   199 ; Again a triangle filter; see comments for h2v1 case, above.
   200 ;
   201 ; GLOBAL(void)
   202 ; jsimd_h2v2_fancy_upsample_mmx (int max_v_samp_factor,
   203 ;                                JDIMENSION downsampled_width,
   204 ;                                JSAMPARRAY input_data,
   205 ;                                JSAMPARRAY * output_data_ptr);
   206 ;
       ; Each column block is processed in two passes:
       ;   vertical:   Int0[i] = 3*row[0][i] + row[-1][i]
       ;               Int1[i] = 3*row[0][i] + row[+1][i]
       ;               (16-bit words, parked temporarily in the output rows)
       ;   horizontal: out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
       ;               out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
       ; Int0 yields the upper output row (outptr0), Int1 the lower (outptr1).
       ; At the row edges the missing neighbour is the edge word itself.
       ;
       ; Register usage inside the column loop:
       ;   eax = colctr (rounded up to a multiple of SIZEOF_MMWORD)
       ;   ecx = inptr1(above), ebx = inptr0, esi = inptr1(below)
       ;   edx = outptr0, edi = outptr1; rowctr is kept on the stack
       ;   ebx is temporarily switched to the saved GOT address around each
       ;   group of GOTOFF() references (pushpic/movpic ... poppic; these
       ;   macros are defined outside this file, presumably in jsimdext.inc)
       ; Work area:
       ;   wk(0)/wk(1) = left-neighbour word for Int0/Int1 (lowest word)
       ;   wk(2)/wk(3) = right-neighbour word for Int0/Int1 (highest word)
       ;
       ; NOTE(review): the row pointers at input_data[-1] and input_data[+1]
       ; are dereferenced, so the caller must supply valid rows there.
   208 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor
   209 %define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
   210 %define input_data(b)		(b)+16		; JSAMPARRAY input_data
   211 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
       ; [ebp+0] of the aligned frame holds the pre-alignment frame pointer
       ; (stored by "mov [esp],eax" in the prologue).  wk(i) slots lie below
       ; the aligned ebp; gotptr is the slot directly below wk(0).
   213 %define original_ebp	ebp+0
   214 %define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
   215 %define WK_NUM		4
   216 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
   218 	align	16
   219 	global	EXTN(jsimd_h2v2_fancy_upsample_mmx)
   221 EXTN(jsimd_h2v2_fancy_upsample_mmx):
       ; Build a frame whose ebp is aligned to SIZEOF_MMWORD so the wk()
       ; slots are aligned.  eax keeps the unaligned frame pointer, through
       ; which the arguments are addressed.
   222 	push	ebp
   223 	mov	eax,esp				; eax = original ebp
   224 	sub	esp, byte 4			; room for the saved eax
   225 	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
   226 	mov	[esp],eax			; fill the original_ebp slot
   227 	mov	ebp,esp				; ebp = aligned ebp
   228 	lea	esp, [wk(0)]			; reserve wk[WK_NUM]
   229 	pushpic	eax		; make room for the GOT address (gotptr)
   230 	push	ebx
   231 ;	push	ecx		; need not be preserved
   232 ;	push	edx		; need not be preserved
   233 	push	esi
   234 	push	edi
   236 	get_GOT	ebx			; get GOT address
   237 	movpic	POINTER [gotptr], ebx	; save GOT address
   239 	mov	edx,eax				; edx = original ebp
   240 	mov	eax, JDIMENSION [downsamp_width(edx)]  ; colctr
   241 	test	eax,eax
   242 	jz	near .return		; zero width: nothing to do
   244 	mov	ecx, INT [max_v_samp(edx)]	; rowctr
   245 	test	ecx,ecx
   246 	jz	near .return		; zero rows: nothing to do
   248 	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
   249 	mov	edi, POINTER [output_data_ptr(edx)]
   250 	mov	edi, JSAMPARRAY [edi]			; output_data
   251 	alignx	16,7
   252 .rowloop:
       ; Save the per-row state; all four registers are reused below.
   253 	push	eax					; colctr
   254 	push	ecx					; rowctr
   255 	push	edi
   256 	push	esi
   258 	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
   259 	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
   260 	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
   261 	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
   262 	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
       ; If the width is not a multiple of SIZEOF_MMWORD, copy the last real
       ; sample of each of the three input rows one position past its end.
       ; edx (outptr0) is saved because dl is used as scratch.
       ; NOTE(review): this stores into the caller's input rows; it assumes
       ; each row buffer has room for one extra sample -- confirm at caller.
   264 	test	eax, SIZEOF_MMWORD-1
   265 	jz	short .skip
   266 	push	edx
   267 	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
   268 	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
   269 	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
   270 	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
   271 	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
   272 	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
   273 	pop	edx
   274 .skip:
   275 	; -- process the first column block
   277 	movq	mm0, MMWORD [ebx+0*SIZEOF_MMWORD]	; mm0=row[ 0][0]
   278 	movq	mm1, MMWORD [ecx+0*SIZEOF_MMWORD]	; mm1=row[-1][0]
   279 	movq	mm2, MMWORD [esi+0*SIZEOF_MMWORD]	; mm2=row[+1][0]
       ; The input is already loaded, so ebx (inptr0) can be parked and
       ; reused as the GOT address until the matching poppic.
   281 	pushpic	ebx
   282 	movpic	ebx, POINTER [gotptr]	; load GOT address
   284 	pxor      mm3,mm3		; mm3=(all 0's)
   285 	movq      mm4,mm0
   286 	punpcklbw mm0,mm3		; mm0=row[ 0][0]( 0 1 2 3)
   287 	punpckhbw mm4,mm3		; mm4=row[ 0][0]( 4 5 6 7)
   288 	movq      mm5,mm1
   289 	punpcklbw mm1,mm3		; mm1=row[-1][0]( 0 1 2 3)
   290 	punpckhbw mm5,mm3		; mm5=row[-1][0]( 4 5 6 7)
   291 	movq      mm6,mm2
   292 	punpcklbw mm2,mm3		; mm2=row[+1][0]( 0 1 2 3)
   293 	punpckhbw mm6,mm3		; mm6=row[+1][0]( 4 5 6 7)
   295 	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]
   296 	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
       ; mm7 = mask with only the lowest 16-bit word set.
   298 	pcmpeqb	mm7,mm7
   299 	psrlq	mm7,(SIZEOF_MMWORD-2)*BYTE_BIT
   301 	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
   302 	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
   303 	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
   304 	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
   306 	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1	; temporarily save
   307 	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5	; the intermediate data
   308 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm2
   309 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm6
       ; The first block uses its own word 0 as the missing left neighbour.
   311 	pand	mm1,mm7			; mm1=( 0 - - -)
   312 	pand	mm2,mm7			; mm2=( 0 - - -)
   314 	movq	MMWORD [wk(0)], mm1
   315 	movq	MMWORD [wk(1)], mm2
   317 	poppic	ebx			; ebx = inptr0 again
   319 	add	eax, byte SIZEOF_MMWORD-1
   320 	and	eax, byte -SIZEOF_MMWORD	; round colctr up to whole MMWORDs
   321 	cmp	eax, byte SIZEOF_MMWORD
   322 	ja	short .columnloop	; more than one block left
   323 	alignx	16,7
   325 .columnloop_last:
   326 	; -- process the last column block
       ; No next block exists: the right neighbour is word 7 of the
       ; intermediates already stored for this block, kept in the top word.
       ; The pushpic here is balanced by the poppic at the end of .upsample.
   328 	pushpic	ebx
   329 	movpic	ebx, POINTER [gotptr]	; load GOT address
   331 	pcmpeqb	mm1,mm1
   332 	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT	; only the highest word set
   333 	movq	mm2,mm1
   335 	pand	mm1, MMWORD [edx+1*SIZEOF_MMWORD]	; mm1=( - - - 7)
   336 	pand	mm2, MMWORD [edi+1*SIZEOF_MMWORD]	; mm2=( - - - 7)
   338 	movq	MMWORD [wk(2)], mm1
   339 	movq	MMWORD [wk(3)], mm2
   341 	jmp	short .upsample
   342 	alignx	16,7
   344 .columnloop:
   345 	; -- process the next column block
       ; Compute the intermediates of the NEXT block (input offset 1,
       ; output offsets 2 and 3) so that its word 0 is available as the
       ; right neighbour of the current block.
   347 	movq	mm0, MMWORD [ebx+1*SIZEOF_MMWORD]	; mm0=row[ 0][1]
   348 	movq	mm1, MMWORD [ecx+1*SIZEOF_MMWORD]	; mm1=row[-1][1]
   349 	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]	; mm2=row[+1][1]
       ; Balanced by the poppic at the end of .upsample.
   351 	pushpic	ebx
   352 	movpic	ebx, POINTER [gotptr]	; load GOT address
   354 	pxor      mm3,mm3		; mm3=(all 0's)
   355 	movq      mm4,mm0
   356 	punpcklbw mm0,mm3		; mm0=row[ 0][1]( 0 1 2 3)
   357 	punpckhbw mm4,mm3		; mm4=row[ 0][1]( 4 5 6 7)
   358 	movq      mm5,mm1
   359 	punpcklbw mm1,mm3		; mm1=row[-1][1]( 0 1 2 3)
   360 	punpckhbw mm5,mm3		; mm5=row[-1][1]( 4 5 6 7)
   361 	movq      mm6,mm2
   362 	punpcklbw mm2,mm3		; mm2=row[+1][1]( 0 1 2 3)
   363 	punpckhbw mm6,mm3		; mm6=row[+1][1]( 4 5 6 7)
   365 	pmullw	mm0,[GOTOFF(ebx,PW_THREE)]
   366 	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
   368 	paddw	mm1,mm0			; mm1=Int0L=( 0 1 2 3)
   369 	paddw	mm5,mm4			; mm5=Int0H=( 4 5 6 7)
   370 	paddw	mm2,mm0			; mm2=Int1L=( 0 1 2 3)
   371 	paddw	mm6,mm4			; mm6=Int1H=( 4 5 6 7)
   373 	movq	MMWORD [edx+2*SIZEOF_MMWORD], mm1	; temporarily save
   374 	movq	MMWORD [edx+3*SIZEOF_MMWORD], mm5	; the intermediate data
   375 	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
   376 	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm6
   378 	psllq	mm1,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm1=( - - - 0)
   379 	psllq	mm2,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm2=( - - - 0)
   381 	movq	MMWORD [wk(2)], mm1
   382 	movq	MMWORD [wk(3)], mm2
   384 .upsample:
   385 	; -- process the upper row
       ; Reload the current block's intermediates from the output row and
       ; build the shifted neighbour vectors.
   387 	movq	mm7, MMWORD [edx+0*SIZEOF_MMWORD]	; mm7=Int0L=( 0 1 2 3)
   388 	movq	mm3, MMWORD [edx+1*SIZEOF_MMWORD]	; mm3=Int0H=( 4 5 6 7)
   390 	movq	mm0,mm7
   391 	movq	mm4,mm3
   392 	psrlq	mm0,2*BYTE_BIT			; mm0=( 1 2 3 -)
   393 	psllq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( - - - 4)
   394 	movq	mm5,mm7
   395 	movq	mm6,mm3
   396 	psrlq	mm5,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm5=( 3 - - -)
   397 	psllq	mm6,2*BYTE_BIT			; mm6=( - 4 5 6)
   399 	por	mm0,mm4				; mm0=( 1 2 3 4)
   400 	por	mm5,mm6				; mm5=( 3 4 5 6)
   402 	movq	mm1,mm7
   403 	movq	mm2,mm3
   404 	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
   405 	psrlq	mm2,2*BYTE_BIT			; mm2=( 5 6 7 -)
   406 	movq	mm4,mm3
   407 	psrlq	mm4,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm4=( 7 - - -)
   409 	por	mm1, MMWORD [wk(0)]		; mm1=(-1 0 1 2)
   410 	por	mm2, MMWORD [wk(2)]		; mm2=( 5 6 7 8)
       ; Word 7 becomes the left neighbour of the next block.
   412 	movq	MMWORD [wk(0)], mm4
       ; mm7/mm3 = 3*current; left neighbours get +8, right neighbours +7.
   414 	pmullw	mm7,[GOTOFF(ebx,PW_THREE)]
   415 	pmullw	mm3,[GOTOFF(ebx,PW_THREE)]
   416 	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
   417 	paddw	mm5,[GOTOFF(ebx,PW_EIGHT)]
   418 	paddw	mm0,[GOTOFF(ebx,PW_SEVEN)]
   419 	paddw	mm2,[GOTOFF(ebx,PW_SEVEN)]
   421 	paddw	mm1,mm7
   422 	paddw	mm5,mm3
   423 	psrlw	mm1,4			; mm1=Out0LE=( 0  2  4  6)
   424 	psrlw	mm5,4			; mm5=Out0HE=( 8 10 12 14)
   425 	paddw	mm0,mm7
   426 	paddw	mm2,mm3
   427 	psrlw	mm0,4			; mm0=Out0LO=( 1  3  5  7)
   428 	psrlw	mm2,4			; mm2=Out0HO=( 9 11 13 15)
       ; Interleave: odd outputs into the high byte of each word.
   430 	psllw	mm0,BYTE_BIT
   431 	psllw	mm2,BYTE_BIT
   432 	por	mm1,mm0			; mm1=Out0L=( 0  1  2  3  4  5  6  7)
   433 	por	mm5,mm2			; mm5=Out0H=( 8  9 10 11 12 13 14 15)
       ; Final samples overwrite the intermediates parked in the output row.
   435 	movq	MMWORD [edx+0*SIZEOF_MMWORD], mm1
   436 	movq	MMWORD [edx+1*SIZEOF_MMWORD], mm5
   438 	; -- process the lower row
       ; Same computation as the upper row, on Int1 with wk(1)/wk(3).
   440 	movq	mm6, MMWORD [edi+0*SIZEOF_MMWORD]	; mm6=Int1L=( 0 1 2 3)
   441 	movq	mm4, MMWORD [edi+1*SIZEOF_MMWORD]	; mm4=Int1H=( 4 5 6 7)
   443 	movq	mm7,mm6
   444 	movq	mm3,mm4
   445 	psrlq	mm7,2*BYTE_BIT			; mm7=( 1 2 3 -)
   446 	psllq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( - - - 4)
   447 	movq	mm0,mm6
   448 	movq	mm2,mm4
   449 	psrlq	mm0,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm0=( 3 - - -)
   450 	psllq	mm2,2*BYTE_BIT			; mm2=( - 4 5 6)
   452 	por	mm7,mm3				; mm7=( 1 2 3 4)
   453 	por	mm0,mm2				; mm0=( 3 4 5 6)
   455 	movq	mm1,mm6
   456 	movq	mm5,mm4
   457 	psllq	mm1,2*BYTE_BIT			; mm1=( - 0 1 2)
   458 	psrlq	mm5,2*BYTE_BIT			; mm5=( 5 6 7 -)
   459 	movq	mm3,mm4
   460 	psrlq	mm3,(SIZEOF_MMWORD-2)*BYTE_BIT	; mm3=( 7 - - -)
   462 	por	mm1, MMWORD [wk(1)]		; mm1=(-1 0 1 2)
   463 	por	mm5, MMWORD [wk(3)]		; mm5=( 5 6 7 8)
   465 	movq	MMWORD [wk(1)], mm3
   467 	pmullw	mm6,[GOTOFF(ebx,PW_THREE)]
   468 	pmullw	mm4,[GOTOFF(ebx,PW_THREE)]
   469 	paddw	mm1,[GOTOFF(ebx,PW_EIGHT)]
   470 	paddw	mm0,[GOTOFF(ebx,PW_EIGHT)]
   471 	paddw	mm7,[GOTOFF(ebx,PW_SEVEN)]
   472 	paddw	mm5,[GOTOFF(ebx,PW_SEVEN)]
   474 	paddw	mm1,mm6
   475 	paddw	mm0,mm4
   476 	psrlw	mm1,4			; mm1=Out1LE=( 0  2  4  6)
   477 	psrlw	mm0,4			; mm0=Out1HE=( 8 10 12 14)
   478 	paddw	mm7,mm6
   479 	paddw	mm5,mm4
   480 	psrlw	mm7,4			; mm7=Out1LO=( 1  3  5  7)
   481 	psrlw	mm5,4			; mm5=Out1HO=( 9 11 13 15)
   483 	psllw	mm7,BYTE_BIT
   484 	psllw	mm5,BYTE_BIT
   485 	por	mm1,mm7			; mm1=Out1L=( 0  1  2  3  4  5  6  7)
   486 	por	mm0,mm5			; mm0=Out1H=( 8  9 10 11 12 13 14 15)
   488 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm1
   489 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm0
   491 	poppic	ebx			; ebx = inptr0 again
       ; One input MMWORD per row consumed, two output MMWORDs per row
       ; produced.
   493 	sub	eax, byte SIZEOF_MMWORD
   494 	add	ecx, byte 1*SIZEOF_MMWORD	; inptr1(above)
   495 	add	ebx, byte 1*SIZEOF_MMWORD	; inptr0
   496 	add	esi, byte 1*SIZEOF_MMWORD	; inptr1(below)
   497 	add	edx, byte 2*SIZEOF_MMWORD	; outptr0
   498 	add	edi, byte 2*SIZEOF_MMWORD	; outptr1
   499 	cmp	eax, byte SIZEOF_MMWORD
   500 	ja	near .columnloop	; more than one block left
   501 	test	eax,eax
   502 	jnz	near .columnloop_last	; exactly one block left
   504 	pop	esi
   505 	pop	edi
   506 	pop	ecx
   507 	pop	eax
       ; One input row yields two output rows.
   509 	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
   510 	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
   511 	sub	ecx, byte 2			; rowctr
   512 	jg	near .rowloop
   514 	emms		; empty MMX state
   516 .return:
       ; The early exits above jump here before any MMX instruction has run.
   517 	pop	edi
   518 	pop	esi
   519 ;	pop	edx		; need not be preserved
   520 ;	pop	ecx		; need not be preserved
   521 	pop	ebx
   522 	mov	esp,ebp		; esp <- aligned ebp
   523 	pop	esp		; esp <- original ebp
   524 	pop	ebp
   525 	ret
   527 ; --------------------------------------------------------------------------
   528 ;
   529 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
   530 ; It's still a box filter.
   531 ;
   532 ; GLOBAL(void)
   533 ; jsimd_h2v1_upsample_mmx (int max_v_samp_factor,
   534 ;                          JDIMENSION output_width,
   535 ;                          JSAMPARRAY input_data,
   536 ;                          JSAMPARRAY * output_data_ptr);
   537 ;
       ; Every input sample is written twice in a row to the output
       ; (punpcklbw/punpckhbw of a register with itself duplicates each byte).
       ;
       ; Register usage:
       ;   edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
       ;   eax = colctr (output samples left in the current row)
       ;   ecx = rowctr
       ;   esi = input_data (row pointer array), then inptr inside a row
       ;   edi = output_data (row pointer array), then outptr inside a row
       ;
       ; NOTE(review): whole 2*SIZEOF_MMWORD groups are always stored, so up
       ; to 2*SIZEOF_MMWORD-1 samples past output_width may be written (and
       ; the matching input read); presumably the row buffers are padded --
       ; confirm at caller.
   539 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor
   540 %define output_width(b)	(b)+12		; JDIMENSION output_width
   541 %define input_data(b)		(b)+16		; JSAMPARRAY input_data
   542 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
   544 	align	16
   545 	global	EXTN(jsimd_h2v1_upsample_mmx)
   547 EXTN(jsimd_h2v1_upsample_mmx):
   548 	push	ebp
   549 	mov	ebp,esp
   550 ;	push	ebx		; unused
   551 ;	push	ecx		; need not be preserved
   552 ;	push	edx		; need not be preserved
   553 	push	esi
   554 	push	edi
   556 	mov	edx, JDIMENSION [output_width(ebp)]
   557 	add	edx, byte (2*SIZEOF_MMWORD)-1
   558 	and	edx, byte -(2*SIZEOF_MMWORD)	; round up; also sets ZF for the jz
   559 	jz	short .return		; zero width: nothing to do
   561 	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
   562 	test	ecx,ecx
   563 	jz	short .return		; zero rows: nothing to do
   565 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
   566 	mov	edi, POINTER [output_data_ptr(ebp)]
   567 	mov	edi, JSAMPARRAY [edi]			; output_data
   568 	alignx	16,7
   569 .rowloop:
   570 	push	edi
   571 	push	esi
   573 	mov	esi, JSAMPROW [esi]		; inptr
   574 	mov	edi, JSAMPROW [edi]		; outptr
   575 	mov	eax,edx				; colctr
   576 	alignx	16,7
   577 .columnloop:
       ; The loop body is unrolled twice: each half turns one input MMWORD
       ; into two output MMWORDs and can end the row on its own.
   579 	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
   581 	movq      mm1,mm0
   582 	punpcklbw mm0,mm0		; duplicate samples 0..3
   583 	punpckhbw mm1,mm1		; duplicate samples 4..7
   585 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
   586 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
   588 	sub	eax, byte 2*SIZEOF_MMWORD
   589 	jz	short .nextrow
   591 	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]
   593 	movq      mm3,mm2
   594 	punpcklbw mm2,mm2
   595 	punpckhbw mm3,mm3
   597 	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
   598 	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
   600 	sub	eax, byte 2*SIZEOF_MMWORD
   601 	jz	short .nextrow
   603 	add	esi, byte 2*SIZEOF_MMWORD	; inptr
   604 	add	edi, byte 4*SIZEOF_MMWORD	; outptr
   605 	jmp	short .columnloop
   606 	alignx	16,7
   608 .nextrow:
   609 	pop	esi
   610 	pop	edi
   612 	add	esi, byte SIZEOF_JSAMPROW	; input_data
   613 	add	edi, byte SIZEOF_JSAMPROW	; output_data
   614 	dec	ecx				; rowctr
   615 	jg	short .rowloop
   617 	emms		; empty MMX state
   619 .return:
       ; The early exits above jump here before any MMX instruction has run.
   620 	pop	edi
   621 	pop	esi
   622 ;	pop	edx		; need not be preserved
   623 ;	pop	ecx		; need not be preserved
   624 ;	pop	ebx		; unused
   625 	pop	ebp
   626 	ret
   628 ; --------------------------------------------------------------------------
   629 ;
   630 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
   631 ; It's still a box filter.
   632 ;
   633 ; GLOBAL(void)
   634 ; jsimd_h2v2_upsample_mmx (int max_v_samp_factor,
   635 ;                          JDIMENSION output_width,
   636 ;                          JSAMPARRAY input_data,
   637 ;                          JSAMPARRAY * output_data_ptr);
   638 ;
       ; Every input sample is written twice in a row, and the resulting row
       ; is stored to two consecutive output rows.
       ;
       ; Register usage:
       ;   edx = output_width rounded up to a multiple of 2*SIZEOF_MMWORD
       ;   eax = colctr (output samples left in the current row)
       ;   ecx = rowctr (counts output rows, decremented by 2 per input row)
       ;   esi = input_data (row pointer array), then inptr inside a row
       ;   edi = output_data (row pointer array), then outptr1 inside a row
       ;   ebx = outptr0 (callee-saved, hence the push/pop)
       ;
       ; NOTE(review): whole 2*SIZEOF_MMWORD groups are always stored, so up
       ; to 2*SIZEOF_MMWORD-1 samples past output_width may be written (and
       ; the matching input read); presumably the row buffers are padded --
       ; confirm at caller.
   640 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor
   641 %define output_width(b)	(b)+12		; JDIMENSION output_width
   642 %define input_data(b)		(b)+16		; JSAMPARRAY input_data
   643 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
   645 	align	16
   646 	global	EXTN(jsimd_h2v2_upsample_mmx)
   648 EXTN(jsimd_h2v2_upsample_mmx):
   649 	push	ebp
   650 	mov	ebp,esp
   651 	push	ebx
   652 ;	push	ecx		; need not be preserved
   653 ;	push	edx		; need not be preserved
   654 	push	esi
   655 	push	edi
   657 	mov	edx, JDIMENSION [output_width(ebp)]
   658 	add	edx, byte (2*SIZEOF_MMWORD)-1
   659 	and	edx, byte -(2*SIZEOF_MMWORD)	; round up; also sets ZF for the jz
   660 	jz	near .return		; zero width: nothing to do
   662 	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
   663 	test	ecx,ecx
   664 	jz	short .return		; zero rows: nothing to do
   666 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
   667 	mov	edi, POINTER [output_data_ptr(ebp)]
   668 	mov	edi, JSAMPARRAY [edi]			; output_data
   669 	alignx	16,7
   670 .rowloop:
   671 	push	edi
   672 	push	esi
   674 	mov	esi, JSAMPROW [esi]			; inptr
   675 	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
   676 	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
   677 	mov	eax,edx					; colctr
   678 	alignx	16,7
   679 .columnloop:
       ; The loop body is unrolled twice: each half turns one input MMWORD
       ; into two output MMWORDs, stored identically to both output rows.
   681 	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]
   683 	movq      mm1,mm0
   684 	punpcklbw mm0,mm0		; duplicate samples 0..3
   685 	punpckhbw mm1,mm1		; duplicate samples 4..7
   687 	movq	MMWORD [ebx+0*SIZEOF_MMWORD], mm0
   688 	movq	MMWORD [ebx+1*SIZEOF_MMWORD], mm1
   689 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0
   690 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mm1
   692 	sub	eax, byte 2*SIZEOF_MMWORD
   693 	jz	short .nextrow
   695 	movq	mm2, MMWORD [esi+1*SIZEOF_MMWORD]
   697 	movq      mm3,mm2
   698 	punpcklbw mm2,mm2
   699 	punpckhbw mm3,mm3
   701 	movq	MMWORD [ebx+2*SIZEOF_MMWORD], mm2
   702 	movq	MMWORD [ebx+3*SIZEOF_MMWORD], mm3
   703 	movq	MMWORD [edi+2*SIZEOF_MMWORD], mm2
   704 	movq	MMWORD [edi+3*SIZEOF_MMWORD], mm3
   706 	sub	eax, byte 2*SIZEOF_MMWORD
   707 	jz	short .nextrow
   709 	add	esi, byte 2*SIZEOF_MMWORD	; inptr
   710 	add	ebx, byte 4*SIZEOF_MMWORD	; outptr0
   711 	add	edi, byte 4*SIZEOF_MMWORD	; outptr1
   712 	jmp	short .columnloop
   713 	alignx	16,7
   715 .nextrow:
   716 	pop	esi
   717 	pop	edi
       ; One input row yields two output rows.
   719 	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
   720 	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
   721 	sub	ecx, byte 2			; rowctr
   722 	jg	short .rowloop
   724 	emms		; empty MMX state
   726 .return:
       ; The early exits above jump here before any MMX instruction has run.
   727 	pop	edi
   728 	pop	esi
   729 ;	pop	edx		; need not be preserved
   730 ;	pop	ecx		; need not be preserved
   731 	pop	ebx
   732 	pop	ebp
   733 	ret
   735 ; For some reason, the OS X linker does not honor the request to align the
   736 ; segment unless we do this.
   737 	align	16

mercurial