media/libjpeg/simd/jdsamss2.asm

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ;
     2 ; jdsamss2.asm - upsampling (SSE2)
     3 ;
     4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
     5 ;
     6 ; Based on
     7 ; x86 SIMD extension for IJG JPEG library
     8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
    10 ;
    11 ; This file should be assembled with NASM (Netwide Assembler),
    12 ; can *not* be assembled with Microsoft's MASM or any compatible
    13 ; assembler (including Borland's Turbo Assembler).
    14 ; NASM is available from http://nasm.sourceforge.net/ or
    15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
    16 ;
    17 ; [TAB8]
    19 %include "jsimdext.inc"
    21 ; --------------------------------------------------------------------------
    22 	SECTION	SEG_CONST
       ; Constant table for the two "fancy" (triangle filter) upsamplers in this
       ; file.  Every PW_* entry is `times 8 dw N`, i.e. eight 16-bit words all
       ; holding N, so one entry fills every word lane of an XMM register:
       ;   PW_THREE            - multiplier applied to the nearer sample/row
       ;   PW_ONE  / PW_TWO    - added before the >>2 in the h2v1 fancy routine
       ;                         (even / odd output samples respectively)
       ;   PW_EIGHT / PW_SEVEN - added before the >>4 in the h2v2 fancy routine
       ;                         (even / odd output samples respectively)
       ; alignz and EXTN are macros that are not defined in this file
       ; (presumably they come from jsimdext.inc).
    24 	alignz	16
    25 	global	EXTN(jconst_fancy_upsample_sse2)
    27 EXTN(jconst_fancy_upsample_sse2):
    29 PW_ONE		times 8 dw  1
    30 PW_TWO		times 8 dw  2
    31 PW_THREE	times 8 dw  3
    32 PW_SEVEN	times 8 dw  7
    33 PW_EIGHT	times 8 dw  8
    35 	alignz	16
    37 ; --------------------------------------------------------------------------
    38 	SECTION	SEG_TEXT
    39 	BITS	32
       ; Everything below is assembled as 32-bit code.  Each routine reads its
       ; four arguments from the stack at +8, +12, +16 and +20 from the frame
       ; base (see the %define offset macros in front of each routine).
    40 ;
    41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
    42 ;
    43 ; The upsampling algorithm is linear interpolation between pixel centers,
    44 ; also known as a "triangle filter".  This is a good compromise between
    45 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
    46 ; of the way between input pixel centers.
    47 ;
    48 ; GLOBAL(void)
    49 ; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
    50 ;                                 JDIMENSION downsampled_width,
    51 ;                                 JSAMPARRAY input_data,
    52 ;                                 JSAMPARRAY * output_data_ptr);
    53 ;
    55 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor
    56 %define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
    57 %define input_data(b)		(b)+16		; JSAMPARRAY input_data
    58 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
       ;
       ; What the code below computes, with s[i] an input sample of one row:
       ;     out[2*i]   = (3*s[i] + s[i-1] + 1) >> 2
       ;     out[2*i+1] = (3*s[i] + s[i+1] + 2) >> 2
       ; For the first sample of a row the missing left neighbour is s[0]
       ; itself; for the last 16-sample block the missing right neighbour is
       ; that block's own last byte.
       ;
       ; Register roles inside the loops:
       ;   eax  = colctr  (remaining input samples, rounded up to a multiple
       ;                   of SIZEOF_XMMWORD once the row has started)
       ;   ecx  = rowctr  (starts at max_v_samp_factor)
       ;   esi  = input_data  (row-pointer array) / inptr  while inside a row
       ;   edi  = output_data (row-pointer array) / outptr while inside a row
       ;   ebx  = GOT address, used through GOTOFF() to reach the PW_* table
       ;   xmm0 = all zeros (used to widen bytes to words)
       ;   xmm7 = left neighbour of the block's sample 0, in byte lane 0
       ;   xmm6 = right neighbour of the block's sample 15, in byte lane 15
       ;
       ; ebp, ebx (via pushpic/poppic), esi and edi are saved and restored;
       ; eax, ecx, edx and xmm0-xmm7 are overwritten.
       ;
       ; All row data is accessed with movdqa, so the row pointers have to be
       ; 16-byte aligned, and rows are read/written in whole XMMWORD units
       ; (one input XMMWORD in, two output XMMWORDs out per iteration).
       ; NOTE(review): this relies on the caller padding the row buffers;
       ; confirm against the allocation code.
       ;
       ; pushpic, poppic, get_GOT, GOTOFF, alignx and EXTN are macros that are
       ; not defined in this file (presumably from jsimdext.inc).
    60 	align	16
    61 	global	EXTN(jsimd_h2v1_fancy_upsample_sse2)
    63 EXTN(jsimd_h2v1_fancy_upsample_sse2):
    64 	push	ebp
    65 	mov	ebp,esp
    66 	pushpic	ebx
    67 ;	push	ecx		; need not be preserved
    68 ;	push	edx		; need not be preserved
    69 	push	esi
    70 	push	edi
    72 	get_GOT	ebx		; get GOT address
       ; Nothing to do when either the width or the row count is zero.
    74 	mov	eax, JDIMENSION [downsamp_width(ebp)]  ; colctr
    75 	test	eax,eax
    76 	jz	near .return
    78 	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
    79 	test	ecx,ecx
    80 	jz	near .return
    82 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
    83 	mov	edi, POINTER [output_data_ptr(ebp)]
    84 	mov	edi, JSAMPARRAY [edi]			; output_data
    85 	alignx	16,7
    86 .rowloop:
       ; Save the per-row state: the column loop below destroys eax and turns
       ; esi/edi from row-array pointers into sample pointers.
    87 	push	eax			; colctr
    88 	push	edi
    89 	push	esi
    91 	mov	esi, JSAMPROW [esi]	; inptr
    92 	mov	edi, JSAMPROW [edi]	; outptr
       ; If the width is not a multiple of SIZEOF_XMMWORD, copy the last real
       ; sample to the slot just past it (a write into the INPUT row), so that
       ; the right neighbour of the last real sample is a copy of itself.
    94 	test	eax, SIZEOF_XMMWORD-1
    95 	jz	short .skip
    96 	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
    97 	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
    98 .skip:
    99 	pxor	xmm0,xmm0		; xmm0=(all 0's)
       ; xmm7 = mask with only byte lane 0 set, ANDed with the first block:
       ; sample 0 becomes its own left neighbour for the first block.
   100 	pcmpeqb	xmm7,xmm7
   101 	psrldq	xmm7,(SIZEOF_XMMWORD-1)
   102 	pand	xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]
       ; Round colctr up to a whole number of XMMWORD blocks; take the
       ; .columnloop path only when more than one block remains.
   104 	add	eax, byte SIZEOF_XMMWORD-1
   105 	and	eax, byte -SIZEOF_XMMWORD
   106 	cmp	eax, byte SIZEOF_XMMWORD
   107 	ja	short .columnloop
   108 	alignx	16,7
   110 .columnloop_last:
       ; Last block of the row: there is no following block, so the right
       ; neighbour of lane 15 is lane 15 of this block (mask keeps only the
       ; top byte lane).
   111 	pcmpeqb	xmm6,xmm6
   112 	pslldq	xmm6,(SIZEOF_XMMWORD-1)
   113 	pand	xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]
   114 	jmp	short .upsample
   115 	alignx	16,7
   117 .columnloop:
       ; Not the last block: the right neighbour of lane 15 is sample 0 of the
       ; next block, moved up into byte lane 15.
   118 	movdqa	xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]
   119 	pslldq	xmm6,(SIZEOF_XMMWORD-1)
   121 .upsample:
       ; Build the "left neighbour" (xmm2) and "right neighbour" (xmm3) vectors
       ; by shifting the current block one byte each way and filling the
       ; vacated lane from xmm7 / xmm6.
   122 	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
   123 	movdqa	xmm2,xmm1
   124 	movdqa	xmm3,xmm1		; xmm1=( 0  1  2 ... 13 14 15)
   125 	pslldq	xmm2,1			; xmm2=(--  0  1 ... 12 13 14)
   126 	psrldq	xmm3,1			; xmm3=( 1  2  3 ... 14 15 --)
   128 	por	xmm2,xmm7		; xmm2=(-1  0  1 ... 12 13 14)
   129 	por	xmm3,xmm6		; xmm3=( 1  2  3 ... 14 15 16)
       ; Remember this block's last sample as the left neighbour for the next
       ; iteration.
   131 	movdqa	xmm7,xmm1
   132 	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; xmm7=(15 -- -- ... -- -- --)
       ; Widen all three vectors from 16 bytes to 2 x 8 words (zero-extended
       ; by interleaving with xmm0).
   134 	movdqa    xmm4,xmm1
   135 	punpcklbw xmm1,xmm0		; xmm1=( 0  1  2  3  4  5  6  7)
   136 	punpckhbw xmm4,xmm0		; xmm4=( 8  9 10 11 12 13 14 15)
   137 	movdqa    xmm5,xmm2
   138 	punpcklbw xmm2,xmm0		; xmm2=(-1  0  1  2  3  4  5  6)
   139 	punpckhbw xmm5,xmm0		; xmm5=( 7  8  9 10 11 12 13 14)
   140 	movdqa    xmm6,xmm3
   141 	punpcklbw xmm3,xmm0		; xmm3=( 1  2  3  4  5  6  7  8)
   142 	punpckhbw xmm6,xmm0		; xmm6=( 9 10 11 12 13 14 15 16)
       ; centre*3, left+1 (even outputs), right+2 (odd outputs)
   144 	pmullw	xmm1,[GOTOFF(ebx,PW_THREE)]
   145 	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
   146 	paddw	xmm2,[GOTOFF(ebx,PW_ONE)]
   147 	paddw	xmm5,[GOTOFF(ebx,PW_ONE)]
   148 	paddw	xmm3,[GOTOFF(ebx,PW_TWO)]
   149 	paddw	xmm6,[GOTOFF(ebx,PW_TWO)]
   151 	paddw	xmm2,xmm1
   152 	paddw	xmm5,xmm4
   153 	psrlw	xmm2,2			; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
   154 	psrlw	xmm5,2			; xmm5=OutHE=(16 18 20 22 24 26 28 30)
   155 	paddw	xmm3,xmm1
   156 	paddw	xmm6,xmm4
   157 	psrlw	xmm3,2			; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
   158 	psrlw	xmm6,2			; xmm6=OutHO=(17 19 21 23 25 27 29 31)
       ; Interleave even and odd results: each odd result is moved into the
       ; high byte of its word and ORed onto the even result in the low byte.
       ; BYTE_BIT is defined outside this file (presumably 8).
   160 	psllw	xmm3,BYTE_BIT
   161 	psllw	xmm6,BYTE_BIT
   162 	por	xmm2,xmm3		; xmm2=OutL=( 0  1  2 ... 13 14 15)
   163 	por	xmm5,xmm6		; xmm5=OutH=(16 17 18 ... 29 30 31)
   165 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
   166 	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5
       ; Advance one input block / two output blocks.  More than one block
       ; left -> .columnloop; exactly one left -> .columnloop_last; none -> row
       ; finished.
   168 	sub	eax, byte SIZEOF_XMMWORD
   169 	add	esi, byte 1*SIZEOF_XMMWORD	; inptr
   170 	add	edi, byte 2*SIZEOF_XMMWORD	; outptr
   171 	cmp	eax, byte SIZEOF_XMMWORD
   172 	ja	near .columnloop
   173 	test	eax,eax
   174 	jnz	near .columnloop_last
       ; Restore the row-array pointers and the original column count, then
       ; step both arrays to the next row (one output row per input row).
   176 	pop	esi
   177 	pop	edi
   178 	pop	eax
   180 	add	esi, byte SIZEOF_JSAMPROW	; input_data
   181 	add	edi, byte SIZEOF_JSAMPROW	; output_data
   182 	dec	ecx				; rowctr
   183 	jg	near .rowloop
   185 .return:
   186 	pop	edi
   187 	pop	esi
   188 ;	pop	edx		; need not be preserved
   189 ;	pop	ecx		; need not be preserved
   190 	poppic	ebx
   191 	pop	ebp
   192 	ret
   194 ; --------------------------------------------------------------------------
   195 ;
   196 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
   197 ; Again a triangle filter; see comments for h2v1 case, above.
   198 ;
   199 ; GLOBAL(void)
   200 ; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
   201 ;                                 JDIMENSION downsampled_width,
   202 ;                                 JSAMPARRAY input_data,
   203 ;                                 JSAMPARRAY * output_data_ptr);
   204 ;
   206 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor
   207 %define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width
   208 %define input_data(b)		(b)+16		; JSAMPARRAY input_data
   209 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
       ; Stack frame used by this routine (ebp is re-aligned to 16 bytes):
       ;   [ebp+0]            pointer to the caller-frame base (where the
       ;                      pushed ebp lives); arguments are at +8.. from it
       ;   wk(0)..wk(3)       four XMMWORD scratch slots directly below ebp
       ;   gotptr             one pointer slot directly below wk(0)
   211 %define original_ebp	ebp+0
   212 %define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
   213 %define WK_NUM		4
   214 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
       ;
       ; Two-pass scheme per 16-sample input block:
       ;   vertical pass   Int0 = 3*row[0] + row[-1]   (upper output row)
       ;                   Int1 = 3*row[0] + row[+1]   (lower output row)
       ;                   kept as 16-bit words and parked temporarily in the
       ;                   output rows themselves
       ;   horizontal pass out[2*i]   = (3*Int[i] + Int[i-1] + 8) >> 4
       ;                   out[2*i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
       ; Scratch slot usage:
       ;   wk(0)/wk(1) = left  neighbour Int[-1] (word lane 0) for upper/lower row
       ;   wk(2)/wk(3) = right neighbour Int[16] (word lane 7) for upper/lower row
       ;
       ; Register roles inside a row:
       ;   eax = colctr, ecx = inptr1(above), ebx = inptr0, esi = inptr1(below),
       ;   edx = outptr0, edi = outptr1
       ; ebx is also needed as the GOT register for GOTOFF(); the code therefore
       ; brackets every use of the PW_* table with pushpic ebx / movpic ebx,
       ; [gotptr] ... poppic ebx.  These macros are not defined in this file
       ; (presumably from jsimdext.inc, and presumably empty in non-PIC builds).
       ;
       ; The row above the first input row (input_data[-1]) and the row below
       ; (input_data[+1]) are both read, so the caller must provide them.
   216 	align	16
   217 	global	EXTN(jsimd_h2v2_fancy_upsample_sse2)
   219 EXTN(jsimd_h2v2_fancy_upsample_sse2):
       ; Prologue: keep the unaligned frame base in eax, carve out a 16-byte
       ; aligned ebp, store the unaligned base at [ebp], then move esp below
       ; the wk[] scratch area.
   220 	push	ebp
   221 	mov	eax,esp				; eax = original ebp
   222 	sub	esp, byte 4
   223 	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
   224 	mov	[esp],eax
   225 	mov	ebp,esp				; ebp = aligned ebp
   226 	lea	esp, [wk(0)]
   227 	pushpic	eax		; make a room for GOT address
   228 	push	ebx
   229 ;	push	ecx		; need not be preserved
   230 ;	push	edx		; need not be preserved
   231 	push	esi
   232 	push	edi
   234 	get_GOT	ebx			; get GOT address
   235 	movpic	POINTER [gotptr], ebx	; save GOT address
       ; eax still holds the unaligned frame base saved at the top; arguments
       ; are addressed through edx until edx is reused as outptr0.
   237 	mov	edx,eax				; edx = original ebp
   238 	mov	eax, JDIMENSION [downsamp_width(edx)]  ; colctr
   239 	test	eax,eax
   240 	jz	near .return
   242 	mov	ecx, INT [max_v_samp(edx)]	; rowctr
   243 	test	ecx,ecx
   244 	jz	near .return
   246 	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data
   247 	mov	edi, POINTER [output_data_ptr(edx)]
   248 	mov	edi, JSAMPARRAY [edi]			; output_data
   249 	alignx	16,7
   250 .rowloop:
       ; Save per-row state; all four registers are repurposed below.
   251 	push	eax					; colctr
   252 	push	ecx
   253 	push	edi
   254 	push	esi
   256 	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)
   257 	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
   258 	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)
   259 	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
   260 	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
       ; If the width is not a multiple of SIZEOF_XMMWORD, duplicate the last
       ; real sample of each of the three input rows into the slot just past
       ; it (writes into the INPUT rows).  edx is saved because dl is used as
       ; the byte temporary.
   262 	test	eax, SIZEOF_XMMWORD-1
   263 	jz	short .skip
   264 	push	edx
   265 	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]
   266 	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl
   267 	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]
   268 	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl
   269 	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]
   270 	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample
   271 	pop	edx
   272 .skip:
   273 	; -- process the first column block
   275 	movdqa	xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]	; xmm0=row[ 0][0]
   276 	movdqa	xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]	; xmm1=row[-1][0]
   277 	movdqa	xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm2=row[+1][0]
       ; ebx (inptr0) has been consumed above; borrow it as the GOT register.
   279 	pushpic	ebx
   280 	movpic	ebx, POINTER [gotptr]	; load GOT address
   282 	pxor      xmm3,xmm3		; xmm3=(all 0's)
   283 	movdqa    xmm4,xmm0
   284 	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
   285 	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
   286 	movdqa    xmm5,xmm1
   287 	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
   288 	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
   289 	movdqa    xmm6,xmm2
   290 	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
   291 	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
   293 	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]
   294 	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
       ; xmm7 = mask with only word lane 0 set (used just below to isolate
       ; Int[0] as the left neighbour of the very first column).
   296 	pcmpeqb	xmm7,xmm7
   297 	psrldq	xmm7,(SIZEOF_XMMWORD-2)
   299 	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
   300 	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
   301 	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
   302 	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
   304 	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1	; temporarily save
   305 	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5	; the intermediate data
   306 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2
   307 	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6
   309 	pand	xmm1,xmm7		; xmm1=( 0 -- -- -- -- -- -- --)
   310 	pand	xmm2,xmm7		; xmm2=( 0 -- -- -- -- -- -- --)
       ; Left neighbour for the first block = Int[0] itself (edge replication).
   312 	movdqa	XMMWORD [wk(0)], xmm1
   313 	movdqa	XMMWORD [wk(1)], xmm2
   315 	poppic	ebx
       ; Round colctr up to whole XMMWORD blocks; more than one block left ->
       ; .columnloop, otherwise fall into .columnloop_last.
   317 	add	eax, byte SIZEOF_XMMWORD-1
   318 	and	eax, byte -SIZEOF_XMMWORD
   319 	cmp	eax, byte SIZEOF_XMMWORD
   320 	ja	short .columnloop
   321 	alignx	16,7
   323 .columnloop_last:
   324 	; -- process the last column block
   326 	pushpic	ebx
   327 	movpic	ebx, POINTER [gotptr]	; load GOT address
       ; No next block: the right neighbour is word lane 7 of this block's own
       ; high half (Int[15]), taken from the intermediate data parked in the
       ; output rows.
   329 	pcmpeqb	xmm1,xmm1
   330 	pslldq	xmm1,(SIZEOF_XMMWORD-2)
   331 	movdqa	xmm2,xmm1
   333 	pand	xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]
   334 	pand	xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]
   336 	movdqa	XMMWORD [wk(2)], xmm1	; xmm1=(-- -- -- -- -- -- -- 15)
   337 	movdqa	XMMWORD [wk(3)], xmm2	; xmm2=(-- -- -- -- -- -- -- 15)
   339 	jmp	near .upsample
   340 	alignx	16,7
   342 .columnloop:
   343 	; -- process the next column block
       ; Vertical pass for the NEXT input block; its intermediate words are
       ; parked at outptr+2/3*SIZEOF_XMMWORD, just past the current block's.
   345 	movdqa	xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]	; xmm0=row[ 0][1]
   346 	movdqa	xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]	; xmm1=row[-1][1]
   347 	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]	; xmm2=row[+1][1]
   349 	pushpic	ebx
   350 	movpic	ebx, POINTER [gotptr]	; load GOT address
   352 	pxor      xmm3,xmm3		; xmm3=(all 0's)
   353 	movdqa    xmm4,xmm0
   354 	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
   355 	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
   356 	movdqa    xmm5,xmm1
   357 	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
   358 	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
   359 	movdqa    xmm6,xmm2
   360 	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
   361 	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)
   363 	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]
   364 	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
   366 	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
   367 	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
   368 	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
   369 	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)
   371 	movdqa	XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1	; temporarily save
   372 	movdqa	XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5	; the intermediate data
   373 	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
   374 	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6
       ; Word 0 of the next block, moved up to word lane 7, is the right
       ; neighbour of the current block's last word.
   376 	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; xmm1=(-- -- -- -- -- -- --  0)
   377 	pslldq	xmm2,(SIZEOF_XMMWORD-2)	; xmm2=(-- -- -- -- -- -- --  0)
   379 	movdqa	XMMWORD [wk(2)], xmm1
   380 	movdqa	XMMWORD [wk(3)], xmm2
   382 .upsample:
       ; Horizontal pass over the CURRENT block.  On both entry paths ebx has
       ; already been pushed (pushpic) and loaded from gotptr; it is restored
       ; by the poppic after the lower row.
   383 	; -- process the upper row
   385 	movdqa	xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]
   386 	movdqa	xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]
   388 	movdqa	xmm0,xmm7		; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
   389 	movdqa	xmm4,xmm3		; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
   390 	psrldq	xmm0,2			; xmm0=( 1  2  3  4  5  6  7 --)
   391 	pslldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(-- -- -- -- -- -- --  8)
   392 	movdqa	xmm5,xmm7
   393 	movdqa	xmm6,xmm3
   394 	psrldq	xmm5,(SIZEOF_XMMWORD-2)	; xmm5=( 7 -- -- -- -- -- -- --)
   395 	pslldq	xmm6,2			; xmm6=(--  8  9 10 11 12 13 14)
   397 	por	xmm0,xmm4		; xmm0=( 1  2  3  4  5  6  7  8)
   398 	por	xmm5,xmm6		; xmm5=( 7  8  9 10 11 12 13 14)
   400 	movdqa	xmm1,xmm7
   401 	movdqa	xmm2,xmm3
   402 	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
   403 	psrldq	xmm2,2			; xmm2=( 9 10 11 12 13 14 15 --)
   404 	movdqa	xmm4,xmm3
   405 	psrldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(15 -- -- -- -- -- -- --)
   407 	por	xmm1, XMMWORD [wk(0)]	; xmm1=(-1  0  1  2  3  4  5  6)
   408 	por	xmm2, XMMWORD [wk(2)]	; xmm2=( 9 10 11 12 13 14 15 16)
       ; This block's last word becomes the left neighbour for the next block.
   410 	movdqa	XMMWORD [wk(0)], xmm4
       ; centre*3, left+8 (even outputs), right+7 (odd outputs), then >>4
   412 	pmullw	xmm7,[GOTOFF(ebx,PW_THREE)]
   413 	pmullw	xmm3,[GOTOFF(ebx,PW_THREE)]
   414 	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]
   415 	paddw	xmm5,[GOTOFF(ebx,PW_EIGHT)]
   416 	paddw	xmm0,[GOTOFF(ebx,PW_SEVEN)]
   417 	paddw	xmm2,[GOTOFF(ebx,PW_SEVEN)]
   419 	paddw	xmm1,xmm7
   420 	paddw	xmm5,xmm3
   421 	psrlw	xmm1,4			; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
   422 	psrlw	xmm5,4			; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
   423 	paddw	xmm0,xmm7
   424 	paddw	xmm2,xmm3
   425 	psrlw	xmm0,4			; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
   426 	psrlw	xmm2,4			; xmm2=Out0HO=(17 19 21 23 25 27 29 31)
       ; Interleave: odd results go to the high byte of each word, even results
       ; stay in the low byte (BYTE_BIT is defined outside this file,
       ; presumably 8).
   428 	psllw	xmm0,BYTE_BIT
   429 	psllw	xmm2,BYTE_BIT
   430 	por	xmm1,xmm0		; xmm1=Out0L=( 0  1  2 ... 13 14 15)
   431 	por	xmm5,xmm2		; xmm5=Out0H=(16 17 18 ... 29 30 31)
       ; Final pixels overwrite the intermediate words parked in the same
       ; place.
   433 	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1
   434 	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5
   436 	; -- process the lower row
       ; Same computation as the upper row, on Int1 with wk(1)/wk(3).
   438 	movdqa	xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]
   439 	movdqa	xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]
   441 	movdqa	xmm7,xmm6		; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
   442 	movdqa	xmm3,xmm4		; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
   443 	psrldq	xmm7,2			; xmm7=( 1  2  3  4  5  6  7 --)
   444 	pslldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(-- -- -- -- -- -- --  8)
   445 	movdqa	xmm0,xmm6
   446 	movdqa	xmm2,xmm4
   447 	psrldq	xmm0,(SIZEOF_XMMWORD-2)	; xmm0=( 7 -- -- -- -- -- -- --)
   448 	pslldq	xmm2,2			; xmm2=(--  8  9 10 11 12 13 14)
   450 	por	xmm7,xmm3		; xmm7=( 1  2  3  4  5  6  7  8)
   451 	por	xmm0,xmm2		; xmm0=( 7  8  9 10 11 12 13 14)
   453 	movdqa	xmm1,xmm6
   454 	movdqa	xmm5,xmm4
   455 	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)
   456 	psrldq	xmm5,2			; xmm5=( 9 10 11 12 13 14 15 --)
   457 	movdqa	xmm3,xmm4
   458 	psrldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(15 -- -- -- -- -- -- --)
   460 	por	xmm1, XMMWORD [wk(1)]	; xmm1=(-1  0  1  2  3  4  5  6)
   461 	por	xmm5, XMMWORD [wk(3)]	; xmm5=( 9 10 11 12 13 14 15 16)
   463 	movdqa	XMMWORD [wk(1)], xmm3
   465 	pmullw	xmm6,[GOTOFF(ebx,PW_THREE)]
   466 	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]
   467 	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]
   468 	paddw	xmm0,[GOTOFF(ebx,PW_EIGHT)]
   469 	paddw	xmm7,[GOTOFF(ebx,PW_SEVEN)]
   470 	paddw	xmm5,[GOTOFF(ebx,PW_SEVEN)]
   472 	paddw	xmm1,xmm6
   473 	paddw	xmm0,xmm4
   474 	psrlw	xmm1,4			; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
   475 	psrlw	xmm0,4			; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
   476 	paddw	xmm7,xmm6
   477 	paddw	xmm5,xmm4
   478 	psrlw	xmm7,4			; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
   479 	psrlw	xmm5,4			; xmm5=Out1HO=(17 19 21 23 25 27 29 31)
   481 	psllw	xmm7,BYTE_BIT
   482 	psllw	xmm5,BYTE_BIT
   483 	por	xmm1,xmm7		; xmm1=Out1L=( 0  1  2 ... 13 14 15)
   484 	por	xmm0,xmm5		; xmm0=Out1H=(16 17 18 ... 29 30 31)
   486 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1
   487 	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0
       ; Give ebx back to inptr0 before the pointers are advanced.
   489 	poppic	ebx
       ; Advance one input block on all three input rows and two output blocks
       ; on both output rows.  More than one block left -> .columnloop; exactly
       ; one left -> .columnloop_last; none -> row finished.
   491 	sub	eax, byte SIZEOF_XMMWORD
   492 	add	ecx, byte 1*SIZEOF_XMMWORD	; inptr1(above)
   493 	add	ebx, byte 1*SIZEOF_XMMWORD	; inptr0
   494 	add	esi, byte 1*SIZEOF_XMMWORD	; inptr1(below)
   495 	add	edx, byte 2*SIZEOF_XMMWORD	; outptr0
   496 	add	edi, byte 2*SIZEOF_XMMWORD	; outptr1
   497 	cmp	eax, byte SIZEOF_XMMWORD
   498 	ja	near .columnloop
   499 	test	eax,eax
   500 	jnz	near .columnloop_last
   502 	pop	esi
   503 	pop	edi
   504 	pop	ecx
   505 	pop	eax
       ; Each input row produced two output rows, so the output array advances
       ; by two row pointers and rowctr drops by two.
   507 	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
   508 	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
   509 	sub	ecx, byte 2			; rowctr
   510 	jg	near .rowloop
   512 .return:
       ; Epilogue: restore callee-saved registers, then unwind through the
       ; aligned frame: [ebp] holds the unaligned frame base saved in the
       ; prologue, which in turn points at the caller's pushed ebp.
   513 	pop	edi
   514 	pop	esi
   515 ;	pop	edx		; need not be preserved
   516 ;	pop	ecx		; need not be preserved
   517 	pop	ebx
   518 	mov	esp,ebp		; esp <- aligned ebp
   519 	pop	esp		; esp <- original ebp
   520 	pop	ebp
   521 	ret
   523 ; --------------------------------------------------------------------------
   524 ;
   525 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
   526 ; It's still a box filter.
   527 ;
   528 ; GLOBAL(void)
   529 ; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
   530 ;                           JDIMENSION output_width,
   531 ;                           JSAMPARRAY input_data,
   532 ;                           JSAMPARRAY * output_data_ptr);
   533 ;
   535 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor
   536 %define output_width(b)	(b)+12		; JDIMENSION output_width
   537 %define input_data(b)		(b)+16		; JSAMPARRAY input_data
   538 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
       ;
       ; Every input byte is written twice in a row to the output
       ; (punpcklbw/punpckhbw of a register with itself), so one input XMMWORD
       ; becomes two output XMMWORDs.
       ;
       ; Register roles:
       ;   edx = output width rounded up to a multiple of 2*SIZEOF_XMMWORD
       ;   eax = colctr, counted in OUTPUT bytes (reloaded from edx per row)
       ;   ecx = rowctr (starts at max_v_samp_factor)
       ;   esi = input_data  (row-pointer array) / inptr  while inside a row
       ;   edi = output_data (row-pointer array) / outptr while inside a row
       ; ebp, esi and edi are saved and restored; eax, ecx, edx and xmm0-xmm3
       ; are overwritten; ebx is not touched.
       ;
       ; All loads/stores are movdqa, so row pointers must be 16-byte aligned,
       ; and because the width is rounded up, up to 2*SIZEOF_XMMWORD-1 bytes
       ; past output_width may be written in each output row.
       ; NOTE(review): relies on the caller padding the rows; confirm.
   540 	align	16
   541 	global	EXTN(jsimd_h2v1_upsample_sse2)
   543 EXTN(jsimd_h2v1_upsample_sse2):
   544 	push	ebp
   545 	mov	ebp,esp
   546 ;	push	ebx		; unused
   547 ;	push	ecx		; need not be preserved
   548 ;	push	edx		; need not be preserved
   549 	push	esi
   550 	push	edi
       ; Round the width up; the `and` sets ZF, so the jz catches a zero width.
   552 	mov	edx, JDIMENSION [output_width(ebp)]
   553 	add	edx, byte (2*SIZEOF_XMMWORD)-1
   554 	and	edx, byte -(2*SIZEOF_XMMWORD)
   555 	jz	short .return
   557 	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
   558 	test	ecx,ecx
   559 	jz	short .return
   561 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
   562 	mov	edi, POINTER [output_data_ptr(ebp)]
   563 	mov	edi, JSAMPARRAY [edi]			; output_data
   564 	alignx	16,7
   565 .rowloop:
   566 	push	edi
   567 	push	esi
   569 	mov	esi, JSAMPROW [esi]		; inptr
   570 	mov	edi, JSAMPROW [edi]		; outptr
   571 	mov	eax,edx				; colctr
   572 	alignx	16,7
   573 .columnloop:
       ; The loop body is unrolled twice: each half expands one input XMMWORD
       ; into two output XMMWORDs and leaves the row as soon as colctr hits 0.
       ; colctr is a multiple of 2*SIZEOF_XMMWORD, so it reaches exactly zero.
   575 	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
   577 	movdqa    xmm1,xmm0
   578 	punpcklbw xmm0,xmm0
   579 	punpckhbw xmm1,xmm1
   581 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
   582 	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
   584 	sub	eax, byte 2*SIZEOF_XMMWORD
   585 	jz	short .nextrow
   587 	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
   589 	movdqa    xmm3,xmm2
   590 	punpcklbw xmm2,xmm2
   591 	punpckhbw xmm3,xmm3
   593 	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
   594 	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
   596 	sub	eax, byte 2*SIZEOF_XMMWORD
   597 	jz	short .nextrow
   599 	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
   600 	add	edi, byte 4*SIZEOF_XMMWORD	; outptr
   601 	jmp	short .columnloop
   602 	alignx	16,7
   604 .nextrow:
       ; Restore the row-array pointers and step both to the next row.
   605 	pop	esi
   606 	pop	edi
   608 	add	esi, byte SIZEOF_JSAMPROW	; input_data
   609 	add	edi, byte SIZEOF_JSAMPROW	; output_data
   610 	dec	ecx				; rowctr
   611 	jg	short .rowloop
   613 .return:
   614 	pop	edi
   615 	pop	esi
   616 ;	pop	edx		; need not be preserved
   617 ;	pop	ecx		; need not be preserved
   618 ;	pop	ebx		; unused
   619 	pop	ebp
   620 	ret
   622 ; --------------------------------------------------------------------------
   623 ;
   624 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
   625 ; It's still a box filter.
   626 ;
   627 ; GLOBAL(void)
   628 ; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
   629 ;                           JDIMENSION output_width,
   630 ;                           JSAMPARRAY input_data,
   631 ;                           JSAMPARRAY * output_data_ptr);
   632 ;
   634 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor
   635 %define output_width(b)	(b)+12		; JDIMENSION output_width
   636 %define input_data(b)		(b)+16		; JSAMPARRAY input_data
   637 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr
       ;
       ; Every input byte is duplicated horizontally (punpcklbw/punpckhbw of a
       ; register with itself) and the resulting vectors are stored to two
       ; consecutive output rows, so one input row yields two identical
       ; output rows of twice the width.
       ;
       ; Register roles:
       ;   edx = output width rounded up to a multiple of 2*SIZEOF_XMMWORD
       ;   eax = colctr, counted in OUTPUT bytes (reloaded from edx per row)
       ;   ecx = rowctr (starts at max_v_samp_factor, drops by 2 per input row)
       ;   esi = input_data (row-pointer array) / inptr while inside a row
       ;   edi = output_data (row-pointer array) / outptr1 while inside a row
       ;   ebx = outptr0 while inside a row
       ; ebp, ebx, esi and edi are saved and restored; eax, ecx, edx and
       ; xmm0-xmm3 are overwritten.
       ;
       ; All loads/stores are movdqa, so row pointers must be 16-byte aligned,
       ; and because the width is rounded up, up to 2*SIZEOF_XMMWORD-1 bytes
       ; past output_width may be written in each output row.
       ; NOTE(review): relies on the caller padding the rows; confirm.
   639 	align	16
   640 	global	EXTN(jsimd_h2v2_upsample_sse2)
   642 EXTN(jsimd_h2v2_upsample_sse2):
   643 	push	ebp
   644 	mov	ebp,esp
   645 	push	ebx
   646 ;	push	ecx		; need not be preserved
   647 ;	push	edx		; need not be preserved
   648 	push	esi
   649 	push	edi
       ; Round the width up; the `and` sets ZF, so the jz catches a zero width.
   651 	mov	edx, JDIMENSION [output_width(ebp)]
   652 	add	edx, byte (2*SIZEOF_XMMWORD)-1
   653 	and	edx, byte -(2*SIZEOF_XMMWORD)
   654 	jz	near .return
   656 	mov	ecx, INT [max_v_samp(ebp)]	; rowctr
   657 	test	ecx,ecx
   658 	jz	near .return
   660 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
   661 	mov	edi, POINTER [output_data_ptr(ebp)]
   662 	mov	edi, JSAMPARRAY [edi]			; output_data
   663 	alignx	16,7
   664 .rowloop:
   665 	push	edi
   666 	push	esi
   668 	mov	esi, JSAMPROW [esi]			; inptr
   669 	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0
   670 	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1
   671 	mov	eax,edx					; colctr
   672 	alignx	16,7
   673 .columnloop:
       ; The loop body is unrolled twice: each half expands one input XMMWORD
       ; into two XMMWORDs, stores them to both output rows, and leaves the
       ; row as soon as colctr hits 0 (colctr is a multiple of
       ; 2*SIZEOF_XMMWORD, so it reaches exactly zero).
   675 	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
   677 	movdqa    xmm1,xmm0
   678 	punpcklbw xmm0,xmm0
   679 	punpckhbw xmm1,xmm1
   681 	movdqa	XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0
   682 	movdqa	XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1
   683 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
   684 	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1
   686 	sub	eax, byte 2*SIZEOF_XMMWORD
   687 	jz	short .nextrow
   689 	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]
   691 	movdqa    xmm3,xmm2
   692 	punpcklbw xmm2,xmm2
   693 	punpckhbw xmm3,xmm3
   695 	movdqa	XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2
   696 	movdqa	XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3
   697 	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2
   698 	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3
   700 	sub	eax, byte 2*SIZEOF_XMMWORD
   701 	jz	short .nextrow
   703 	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
   704 	add	ebx, byte 4*SIZEOF_XMMWORD	; outptr0
   705 	add	edi, byte 4*SIZEOF_XMMWORD	; outptr1
   706 	jmp	short .columnloop
   707 	alignx	16,7
   709 .nextrow:
       ; Restore the row-array pointers; one input row was consumed and two
       ; output rows were produced.
   710 	pop	esi
   711 	pop	edi
   713 	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data
   714 	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data
   715 	sub	ecx, byte 2			; rowctr
   716 	jg	short .rowloop
   718 .return:
   719 	pop	edi
   720 	pop	esi
   721 ;	pop	edx		; need not be preserved
   722 ;	pop	ecx		; need not be preserved
   723 	pop	ebx
   724 	pop	ebp
   725 	ret
   727 ; For some reason, the OS X linker does not honor the request to align the
   728 ; segment unless we do this.
       ; (The `align 16` below pads the end of the current section up to the
       ; next 16-byte boundary.)
   729 	align	16

mercurial