The Tor Browser: media/libjpeg/simd/jdsamss2.asm@b8a032363ba2

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

1 ;

     2 ; jdsamss2.asm - upsampling (SSE2)

3 ;

     4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

5 ;

     6 ; Based on

     7 ; x86 SIMD extension for IJG JPEG library

     8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

     9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

    10 ;

    11 ; This file should be assembled with NASM (Netwide Assembler),

    12 ; can *not* be assembled with Microsoft's MASM or any compatible

    13 ; assembler (including Borland's Turbo Assembler).

    14 ; NASM is available from http://nasm.sourceforge.net/ or

    15 ; http://sourceforge.net/project/showfiles.php?group_id=6208

    16 ;

    17 ; [TAB8]

    19 %include "jsimdext.inc"

    21 ; --------------------------------------------------------------------------

    22 	SECTION	SEG_CONST

    24 	alignz	16

    25 	global	EXTN(jconst_fancy_upsample_sse2)

    27 EXTN(jconst_fancy_upsample_sse2):

       ; Each constant is eight 16-bit words (one XMM register wide) so it can
       ; be used directly as a memory operand of pmullw/paddw in the fancy
       ; upsampling routines further down in this file.
    29 PW_ONE		times 8 dw  1	; h2v1 fancy: rounding bias added to the left-neighbour term (then >>2)

    30 PW_TWO		times 8 dw  2	; h2v1 fancy: rounding bias added to the right-neighbour term (then >>2)

    31 PW_THREE	times 8 dw  3	; weight applied to the centre sample in both fancy routines

    32 PW_SEVEN	times 8 dw  7	; h2v2 fancy: rounding bias added to the right-neighbour term (then >>4)

    33 PW_EIGHT	times 8 dw  8	; h2v2 fancy: rounding bias added to the left-neighbour term (then >>4)

    35 	alignz	16	; pad the constant table to a 16-byte boundary

    37 ; --------------------------------------------------------------------------

    38 	SECTION	SEG_TEXT

    39 	BITS	32	; everything below is 32-bit code

    40 ;

    41 ; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.

    42 ;

    43 ; The upsampling algorithm is linear interpolation between pixel centers,

    44 ; also known as a "triangle filter".  This is a good compromise between

    45 ; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4

    46 ; of the way between input pixel centers.

    47 ;

    48 ; GLOBAL(void)

    49 ; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,

    50 ;                                 JDIMENSION downsampled_width,

    51 ;                                 JSAMPARRAY input_data,

    52 ;                                 JSAMPARRAY * output_data_ptr);

    53 ;

    55 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor

    56 %define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width

    57 %define input_data(b)		(b)+16		; JSAMPARRAY input_data

    58 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr

       ; The macros above are stack-argument offsets relative to the frame
       ; pointer that the prologue sets up with "push ebp / mov ebp,esp".
       ;
       ; Per block of 16 input samples s[0..15] the routine produces 32 output
       ; bytes:
       ;   out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
       ;   out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
       ; At the row edges the missing neighbour is replaced by the edge sample
       ; itself (see xmm7 before the loop and xmm6 in .columnloop_last).
       ;
       ; Register roles inside the loops:
       ;   eax = colctr (input samples left, rounded up to a multiple of 16)
       ;   ecx = rowctr, esi = inptr, edi = outptr, ebx = GOT base (get_GOT)
       ;   xmm0 = zero, xmm7 = left neighbour carried from the previous block
       ; NOTE(review): pushpic/poppic/get_GOT/GOTOFF come from jsimdext.inc,
       ; which is not visible here; presumably they only emit code in PIC
       ; builds -- confirm there.
       ; NOTE(review): all loads/stores use movdqa, so the row buffers are
       ; assumed to be 16-byte aligned and padded to a whole number of
       ; xmmwords -- confirm against the allocator.

    60 	align	16

    61 	global	EXTN(jsimd_h2v1_fancy_upsample_sse2)

    63 EXTN(jsimd_h2v1_fancy_upsample_sse2):

    64 	push	ebp

    65 	mov	ebp,esp

    66 	pushpic	ebx

    67 ;	push	ecx		; need not be preserved

    68 ;	push	edx		; need not be preserved

    69 	push	esi

    70 	push	edi

    72 	get_GOT	ebx		; get GOT address

    74 	mov	eax, JDIMENSION [downsamp_width(ebp)]  ; colctr

    75 	test	eax,eax

    76 	jz	near .return	; nothing to do for a zero-width row

    78 	mov	ecx, INT [max_v_samp(ebp)]	; rowctr

    79 	test	ecx,ecx

    80 	jz	near .return	; nothing to do for zero rows

    82 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data

    83 	mov	edi, POINTER [output_data_ptr(ebp)]

    84 	mov	edi, JSAMPARRAY [edi]			; output_data

    85 	alignx	16,7

    86 .rowloop:

       ; Save the per-row state; it is restored after the column loop.
    87 	push	eax			; colctr

    88 	push	edi

    89 	push	esi

    91 	mov	esi, JSAMPROW [esi]	; inptr

    92 	mov	edi, JSAMPROW [edi]	; outptr

       ; If the width is not a multiple of 16, duplicate the last sample one
       ; position past the end so the final real sample sees itself as its
       ; right-hand neighbour.  This writes into the input row buffer.
    94 	test	eax, SIZEOF_XMMWORD-1

    95 	jz	short .skip

    96 	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]

    97 	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample

    98 .skip:

    99 	pxor	xmm0,xmm0		; xmm0=(all 0's)

       ; Build the left neighbour for the first block: keep only byte 0 of the
       ; first xmmword, i.e. sample 0 stands in for the nonexistent sample -1.
   100 	pcmpeqb	xmm7,xmm7		; xmm7=(all 1's)

   101 	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; mask selecting byte 0 only

   102 	pand	xmm7, XMMWORD [esi+0*SIZEOF_XMMWORD]

       ; Round colctr up to a whole number of 16-sample blocks.  With more
       ; than one block left take the general path, otherwise fall through
       ; to the last-block path.
   104 	add	eax, byte SIZEOF_XMMWORD-1

   105 	and	eax, byte -SIZEOF_XMMWORD

   106 	cmp	eax, byte SIZEOF_XMMWORD

   107 	ja	short .columnloop

   108 	alignx	16,7

   110 .columnloop_last:

       ; Last block of the row: there is no following block, so the right
       ; neighbour of sample 15 is sample 15 itself (kept in byte 15 of xmm6).
   111 	pcmpeqb	xmm6,xmm6

   112 	pslldq	xmm6,(SIZEOF_XMMWORD-1)	; mask selecting byte 15 only

   113 	pand	xmm6, XMMWORD [esi+0*SIZEOF_XMMWORD]

   114 	jmp	short .upsample

   115 	alignx	16,7

   117 .columnloop:

       ; General block: the right neighbour of sample 15 is the first sample
       ; of the next xmmword, moved into byte 15 of xmm6.
   118 	movdqa	xmm6, XMMWORD [esi+1*SIZEOF_XMMWORD]

   119 	pslldq	xmm6,(SIZEOF_XMMWORD-1)

   121 .upsample:

   122 	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]

   123 	movdqa	xmm2,xmm1

   124 	movdqa	xmm3,xmm1		; xmm1=( 0  1  2 ... 13 14 15)

   125 	pslldq	xmm2,1			; xmm2=(--  0  1 ... 12 13 14)

   126 	psrldq	xmm3,1			; xmm3=( 1  2  3 ... 14 15 --)

   128 	por	xmm2,xmm7		; xmm2=(-1  0  1 ... 12 13 14)

   129 	por	xmm3,xmm6		; xmm3=( 1  2  3 ... 14 15 16)

       ; Carry sample 15 of this block forward as the left neighbour of the
       ; next block.
   131 	movdqa	xmm7,xmm1

   132 	psrldq	xmm7,(SIZEOF_XMMWORD-1)	; xmm7=(15 -- -- ... -- -- --)

       ; Widen bytes to 16-bit words (interleave with zero) so the weighted
       ; sums cannot overflow a byte.
   134 	movdqa    xmm4,xmm1

   135 	punpcklbw xmm1,xmm0		; xmm1=( 0  1  2  3  4  5  6  7)

   136 	punpckhbw xmm4,xmm0		; xmm4=( 8  9 10 11 12 13 14 15)

   137 	movdqa    xmm5,xmm2

   138 	punpcklbw xmm2,xmm0		; xmm2=(-1  0  1  2  3  4  5  6)

   139 	punpckhbw xmm5,xmm0		; xmm5=( 7  8  9 10 11 12 13 14)

   140 	movdqa    xmm6,xmm3

   141 	punpcklbw xmm3,xmm0		; xmm3=( 1  2  3  4  5  6  7  8)

   142 	punpckhbw xmm6,xmm0		; xmm6=( 9 10 11 12 13 14 15 16)

       ; centre*3, left+1, right+2
   144 	pmullw	xmm1,[GOTOFF(ebx,PW_THREE)]

   145 	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]

   146 	paddw	xmm2,[GOTOFF(ebx,PW_ONE)]

   147 	paddw	xmm5,[GOTOFF(ebx,PW_ONE)]

   148 	paddw	xmm3,[GOTOFF(ebx,PW_TWO)]

   149 	paddw	xmm6,[GOTOFF(ebx,PW_TWO)]

       ; even outputs = (3*centre + left + 1) >> 2
   151 	paddw	xmm2,xmm1

   152 	paddw	xmm5,xmm4

   153 	psrlw	xmm2,2			; xmm2=OutLE=( 0  2  4  6  8 10 12 14)

   154 	psrlw	xmm5,2			; xmm5=OutHE=(16 18 20 22 24 26 28 30)

       ; odd outputs = (3*centre + right + 2) >> 2
   155 	paddw	xmm3,xmm1

   156 	paddw	xmm6,xmm4

   157 	psrlw	xmm3,2			; xmm3=OutLO=( 1  3  5  7  9 11 13 15)

   158 	psrlw	xmm6,2			; xmm6=OutHO=(17 19 21 23 25 27 29 31)

       ; Interleave: odd results go to the high byte of each word, even
       ; results stay in the low byte.
   160 	psllw	xmm3,BYTE_BIT

   161 	psllw	xmm6,BYTE_BIT

   162 	por	xmm2,xmm3		; xmm2=OutL=( 0  1  2 ... 13 14 15)

   163 	por	xmm5,xmm6		; xmm5=OutH=(16 17 18 ... 29 30 31)

   165 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2

   166 	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm5

       ; Advance one input xmmword / two output xmmwords and decide which
       ; path handles the next block (general, last, or row finished).
   168 	sub	eax, byte SIZEOF_XMMWORD

   169 	add	esi, byte 1*SIZEOF_XMMWORD	; inptr

   170 	add	edi, byte 2*SIZEOF_XMMWORD	; outptr

   171 	cmp	eax, byte SIZEOF_XMMWORD

   172 	ja	near .columnloop

   173 	test	eax,eax

   174 	jnz	near .columnloop_last

       ; Row finished: restore the row-array pointers and colctr.
   176 	pop	esi

   177 	pop	edi

   178 	pop	eax

   180 	add	esi, byte SIZEOF_JSAMPROW	; input_data

   181 	add	edi, byte SIZEOF_JSAMPROW	; output_data

   182 	dec	ecx				; rowctr

   183 	jg	near .rowloop	; continue while rowctr > 0

   185 .return:

   186 	pop	edi

   187 	pop	esi

   188 ;	pop	edx		; need not be preserved

   189 ;	pop	ecx		; need not be preserved

   190 	poppic	ebx

   191 	pop	ebp

   192 	ret

   194 ; --------------------------------------------------------------------------

   195 ;

   196 ; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.

   197 ; Again a triangle filter; see comments for h2v1 case, above.

   198 ;

   199 ; GLOBAL(void)

   200 ; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,

   201 ;                                 JDIMENSION downsampled_width,

   202 ;                                 JSAMPARRAY input_data,

   203 ;                                 JSAMPARRAY * output_data_ptr);

   204 ;

   206 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor

   207 %define downsamp_width(b)	(b)+12	; JDIMENSION downsampled_width

   208 %define input_data(b)		(b)+16		; JSAMPARRAY input_data

   209 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr

   211 %define original_ebp	ebp+0

   212 %define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]

   213 %define WK_NUM		4

   214 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr

       ; Two-pass triangle filter.  For every input row the routine writes two
       ; output rows (outptr0 / outptr1):
       ;   1. vertical pass, 16-bit intermediates
       ;        Int0[i] = 3*row[0][i] + row[-1][i]   (for outptr0)
       ;        Int1[i] = 3*row[0][i] + row[+1][i]   (for outptr1)
       ;      The intermediates are parked in the output rows themselves
       ;      (16 words occupy the same 32 bytes as the final 32 samples).
       ;   2. horizontal pass on the intermediates
       ;        out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
       ;        out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
       ;
       ; Stack scratch (16-byte aligned, addressed through the aligned ebp):
       ;   wk(0) / wk(1) = left-neighbour word for the upper / lower row
       ;   wk(2) / wk(3) = right-neighbour word for the upper / lower row
       ;   gotptr        = saved GOT base (slot reserved by "pushpic eax")
       ;
       ; Register roles inside the column loop:
       ;   eax = colctr, ecx = inptr1(above), ebx = inptr0, esi = inptr1(below)
       ;   edx = outptr0, edi = outptr1
       ;   ebx is temporarily replaced by the GOT base around the SIMD code
       ;   (pushpic ebx / movpic ebx,[gotptr] ... poppic ebx).
       ; NOTE(review): pushpic/poppic/movpic/get_GOT/GOTOFF are defined in
       ; jsimdext.inc, which is not visible here; presumably they only emit
       ; code in PIC builds -- confirm there.
       ; NOTE(review): the row above the first input row is read
       ; ([esi-1*SIZEOF_JSAMPROW]); the caller is assumed to provide valid
       ; context rows -- confirm against the caller.

   216 	align	16

   217 	global	EXTN(jsimd_h2v2_fancy_upsample_sse2)

   219 EXTN(jsimd_h2v2_fancy_upsample_sse2):

       ; Prologue: build a 16-byte-aligned frame so wk() can be accessed with
       ; movdqa.  The unaligned frame base is stored at [aligned ebp] and is
       ; reloaded by "pop esp" in the epilogue.
   220 	push	ebp

   221 	mov	eax,esp				; eax = original ebp

   222 	sub	esp, byte 4

   223 	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits

   224 	mov	[esp],eax

   225 	mov	ebp,esp				; ebp = aligned ebp

   226 	lea	esp, [wk(0)]	; reserve the WK_NUM xmmword scratch slots

   227 	pushpic	eax		; make room for GOT address

   228 	push	ebx

   229 ;	push	ecx		; need not be preserved

   230 ;	push	edx		; need not be preserved

   231 	push	esi

   232 	push	edi

   234 	get_GOT	ebx			; get GOT address

   235 	movpic	POINTER [gotptr], ebx	; save GOT address

       ; Arguments are addressed through edx (the unaligned frame base),
       ; because ebp now points at the aligned scratch frame.
   237 	mov	edx,eax				; edx = original ebp

   238 	mov	eax, JDIMENSION [downsamp_width(edx)]  ; colctr

   239 	test	eax,eax

   240 	jz	near .return

   242 	mov	ecx, INT [max_v_samp(edx)]	; rowctr

   243 	test	ecx,ecx

   244 	jz	near .return

   246 	mov	esi, JSAMPARRAY [input_data(edx)]	; input_data

   247 	mov	edi, POINTER [output_data_ptr(edx)]

   248 	mov	edi, JSAMPARRAY [edi]			; output_data

   249 	alignx	16,7

   250 .rowloop:

       ; Save the per-row state; it is restored after the column loop.
   251 	push	eax					; colctr

   252 	push	ecx

   253 	push	edi

   254 	push	esi

   256 	mov	ecx, JSAMPROW [esi-1*SIZEOF_JSAMPROW]	; inptr1(above)

   257 	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0

   258 	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1(below)

   259 	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0

   260 	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1

       ; If the width is not a multiple of 16, duplicate the last sample of
       ; each of the three input rows one position past the end.  edx holds
       ; outptr0, so it is saved while dl is used as the byte scratch.
   262 	test	eax, SIZEOF_XMMWORD-1

   263 	jz	short .skip

   264 	push	edx

   265 	mov	dl, JSAMPLE [ecx+(eax-1)*SIZEOF_JSAMPLE]

   266 	mov	JSAMPLE [ecx+eax*SIZEOF_JSAMPLE], dl

   267 	mov	dl, JSAMPLE [ebx+(eax-1)*SIZEOF_JSAMPLE]

   268 	mov	JSAMPLE [ebx+eax*SIZEOF_JSAMPLE], dl

   269 	mov	dl, JSAMPLE [esi+(eax-1)*SIZEOF_JSAMPLE]

   270 	mov	JSAMPLE [esi+eax*SIZEOF_JSAMPLE], dl	; insert a dummy sample

   271 	pop	edx

   272 .skip:

   273 	; -- process the first column block

   275 	movdqa	xmm0, XMMWORD [ebx+0*SIZEOF_XMMWORD]	; xmm0=row[ 0][0]

   276 	movdqa	xmm1, XMMWORD [ecx+0*SIZEOF_XMMWORD]	; xmm1=row[-1][0]

   277 	movdqa	xmm2, XMMWORD [esi+0*SIZEOF_XMMWORD]	; xmm2=row[+1][0]

   279 	pushpic	ebx	; save inptr0 while ebx holds the GOT base

   280 	movpic	ebx, POINTER [gotptr]	; load GOT address

       ; Widen all three rows from bytes to 16-bit words.
   282 	pxor      xmm3,xmm3		; xmm3=(all 0's)

   283 	movdqa    xmm4,xmm0

   284 	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)

   285 	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)

   286 	movdqa    xmm5,xmm1

   287 	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)

   288 	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)

   289 	movdqa    xmm6,xmm2

   290 	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)

   291 	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

       ; Vertical pass: 3*row[0] + row[-1] and 3*row[0] + row[+1].
   293 	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]

   294 	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]

   296 	pcmpeqb	xmm7,xmm7

   297 	psrldq	xmm7,(SIZEOF_XMMWORD-2)	; xmm7 = mask selecting word 0 only

   299 	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)

   300 	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)

   301 	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)

   302 	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

   304 	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1	; temporarily save

   305 	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5	; the intermediate data

   306 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm2

   307 	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm6

       ; Left edge: intermediate word 0 stands in for the nonexistent word -1.
   309 	pand	xmm1,xmm7		; xmm1=( 0 -- -- -- -- -- -- --)

   310 	pand	xmm2,xmm7		; xmm2=( 0 -- -- -- -- -- -- --)

   312 	movdqa	XMMWORD [wk(0)], xmm1

   313 	movdqa	XMMWORD [wk(1)], xmm2

   315 	poppic	ebx	; ebx = inptr0 again

       ; Round colctr up to a whole number of 16-sample blocks and pick the
       ; general or the last-block path.
   317 	add	eax, byte SIZEOF_XMMWORD-1

   318 	and	eax, byte -SIZEOF_XMMWORD

   319 	cmp	eax, byte SIZEOF_XMMWORD

   320 	ja	short .columnloop

   321 	alignx	16,7

   323 .columnloop_last:

   324 	; -- process the last column block

   326 	pushpic	ebx	; save inptr0 while ebx holds the GOT base

   327 	movpic	ebx, POINTER [gotptr]	; load GOT address

       ; Right edge: there is no following block, so intermediate word 15 of
       ; the current block is its own right-hand neighbour.
   329 	pcmpeqb	xmm1,xmm1

   330 	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; mask selecting the top word only

   331 	movdqa	xmm2,xmm1

   333 	pand	xmm1, XMMWORD [edx+1*SIZEOF_XMMWORD]

   334 	pand	xmm2, XMMWORD [edi+1*SIZEOF_XMMWORD]

   336 	movdqa	XMMWORD [wk(2)], xmm1	; xmm1=(-- -- -- -- -- -- -- 15)

   337 	movdqa	XMMWORD [wk(3)], xmm2	; xmm2=(-- -- -- -- -- -- -- 15)

   339 	jmp	near .upsample

   340 	alignx	16,7

   342 .columnloop:

   343 	; -- process the next column block

       ; Run the vertical pass one block ahead so that the current block's
       ; right-hand neighbour (word 0 of the next block) is available.
   345 	movdqa	xmm0, XMMWORD [ebx+1*SIZEOF_XMMWORD]	; xmm0=row[ 0][1]

   346 	movdqa	xmm1, XMMWORD [ecx+1*SIZEOF_XMMWORD]	; xmm1=row[-1][1]

   347 	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]	; xmm2=row[+1][1]

   349 	pushpic	ebx	; save inptr0 while ebx holds the GOT base

   350 	movpic	ebx, POINTER [gotptr]	; load GOT address

   352 	pxor      xmm3,xmm3		; xmm3=(all 0's)

   353 	movdqa    xmm4,xmm0

   354 	punpcklbw xmm0,xmm3		; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)

   355 	punpckhbw xmm4,xmm3		; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)

   356 	movdqa    xmm5,xmm1

   357 	punpcklbw xmm1,xmm3		; xmm1=row[-1]( 0  1  2  3  4  5  6  7)

   358 	punpckhbw xmm5,xmm3		; xmm5=row[-1]( 8  9 10 11 12 13 14 15)

   359 	movdqa    xmm6,xmm2

   360 	punpcklbw xmm2,xmm3		; xmm2=row[+1]( 0  1  2  3  4  5  6  7)

   361 	punpckhbw xmm6,xmm3		; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

   363 	pmullw	xmm0,[GOTOFF(ebx,PW_THREE)]

   364 	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]

   366 	paddw	xmm1,xmm0		; xmm1=Int0L=( 0  1  2  3  4  5  6  7)

   367 	paddw	xmm5,xmm4		; xmm5=Int0H=( 8  9 10 11 12 13 14 15)

   368 	paddw	xmm2,xmm0		; xmm2=Int1L=( 0  1  2  3  4  5  6  7)

   369 	paddw	xmm6,xmm4		; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

   371 	movdqa	XMMWORD [edx+2*SIZEOF_XMMWORD], xmm1	; temporarily save

   372 	movdqa	XMMWORD [edx+3*SIZEOF_XMMWORD], xmm5	; the intermediate data

   373 	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2

   374 	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm6

       ; Word 0 of the next block, moved to the top word, is the right-hand
       ; neighbour of word 15 of the current block.
   376 	pslldq	xmm1,(SIZEOF_XMMWORD-2)	; xmm1=(-- -- -- -- -- -- --  0)

   377 	pslldq	xmm2,(SIZEOF_XMMWORD-2)	; xmm2=(-- -- -- -- -- -- --  0)

   379 	movdqa	XMMWORD [wk(2)], xmm1

   380 	movdqa	XMMWORD [wk(3)], xmm2

   382 .upsample:

   383 	; -- process the upper row

       ; Reload the current block's intermediates that were parked in outptr0.
   385 	movdqa	xmm7, XMMWORD [edx+0*SIZEOF_XMMWORD]

   386 	movdqa	xmm3, XMMWORD [edx+1*SIZEOF_XMMWORD]

   388 	movdqa	xmm0,xmm7		; xmm7=Int0L=( 0  1  2  3  4  5  6  7)

   389 	movdqa	xmm4,xmm3		; xmm3=Int0H=( 8  9 10 11 12 13 14 15)

   390 	psrldq	xmm0,2			; xmm0=( 1  2  3  4  5  6  7 --)

   391 	pslldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(-- -- -- -- -- -- --  8)

   392 	movdqa	xmm5,xmm7

   393 	movdqa	xmm6,xmm3

   394 	psrldq	xmm5,(SIZEOF_XMMWORD-2)	; xmm5=( 7 -- -- -- -- -- -- --)

   395 	pslldq	xmm6,2			; xmm6=(--  8  9 10 11 12 13 14)

   397 	por	xmm0,xmm4		; xmm0=( 1  2  3  4  5  6  7  8)

   398 	por	xmm5,xmm6		; xmm5=( 7  8  9 10 11 12 13 14)

   400 	movdqa	xmm1,xmm7

   401 	movdqa	xmm2,xmm3

   402 	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)

   403 	psrldq	xmm2,2			; xmm2=( 9 10 11 12 13 14 15 --)

   404 	movdqa	xmm4,xmm3

   405 	psrldq	xmm4,(SIZEOF_XMMWORD-2)	; xmm4=(15 -- -- -- -- -- -- --)

   407 	por	xmm1, XMMWORD [wk(0)]	; xmm1=(-1  0  1  2  3  4  5  6)

   408 	por	xmm2, XMMWORD [wk(2)]	; xmm2=( 9 10 11 12 13 14 15 16)

       ; Word 15 becomes the left-hand neighbour for the next block.
   410 	movdqa	XMMWORD [wk(0)], xmm4

       ; centre*3, left+8, right+7
   412 	pmullw	xmm7,[GOTOFF(ebx,PW_THREE)]

   413 	pmullw	xmm3,[GOTOFF(ebx,PW_THREE)]

   414 	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]

   415 	paddw	xmm5,[GOTOFF(ebx,PW_EIGHT)]

   416 	paddw	xmm0,[GOTOFF(ebx,PW_SEVEN)]

   417 	paddw	xmm2,[GOTOFF(ebx,PW_SEVEN)]

       ; even outputs = (3*centre + left + 8) >> 4
   419 	paddw	xmm1,xmm7

   420 	paddw	xmm5,xmm3

   421 	psrlw	xmm1,4			; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)

   422 	psrlw	xmm5,4			; xmm5=Out0HE=(16 18 20 22 24 26 28 30)

       ; odd outputs = (3*centre + right + 7) >> 4
   423 	paddw	xmm0,xmm7

   424 	paddw	xmm2,xmm3

   425 	psrlw	xmm0,4			; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)

   426 	psrlw	xmm2,4			; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

       ; Interleave: odd results into the high byte, even results in the low
       ; byte of each word.
   428 	psllw	xmm0,BYTE_BIT

   429 	psllw	xmm2,BYTE_BIT

   430 	por	xmm1,xmm0		; xmm1=Out0L=( 0  1  2 ... 13 14 15)

   431 	por	xmm5,xmm2		; xmm5=Out0H=(16 17 18 ... 29 30 31)

       ; Overwrite the parked intermediates with the final upper-row samples.
   433 	movdqa	XMMWORD [edx+0*SIZEOF_XMMWORD], xmm1

   434 	movdqa	XMMWORD [edx+1*SIZEOF_XMMWORD], xmm5

   436 	; -- process the lower row

       ; Same horizontal pass on the Int1 intermediates parked in outptr1,
       ; using wk(1)/wk(3) as the left/right neighbours.
   438 	movdqa	xmm6, XMMWORD [edi+0*SIZEOF_XMMWORD]

   439 	movdqa	xmm4, XMMWORD [edi+1*SIZEOF_XMMWORD]

   441 	movdqa	xmm7,xmm6		; xmm6=Int1L=( 0  1  2  3  4  5  6  7)

   442 	movdqa	xmm3,xmm4		; xmm4=Int1H=( 8  9 10 11 12 13 14 15)

   443 	psrldq	xmm7,2			; xmm7=( 1  2  3  4  5  6  7 --)

   444 	pslldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(-- -- -- -- -- -- --  8)

   445 	movdqa	xmm0,xmm6

   446 	movdqa	xmm2,xmm4

   447 	psrldq	xmm0,(SIZEOF_XMMWORD-2)	; xmm0=( 7 -- -- -- -- -- -- --)

   448 	pslldq	xmm2,2			; xmm2=(--  8  9 10 11 12 13 14)

   450 	por	xmm7,xmm3		; xmm7=( 1  2  3  4  5  6  7  8)

   451 	por	xmm0,xmm2		; xmm0=( 7  8  9 10 11 12 13 14)

   453 	movdqa	xmm1,xmm6

   454 	movdqa	xmm5,xmm4

   455 	pslldq	xmm1,2			; xmm1=(--  0  1  2  3  4  5  6)

   456 	psrldq	xmm5,2			; xmm5=( 9 10 11 12 13 14 15 --)

   457 	movdqa	xmm3,xmm4

   458 	psrldq	xmm3,(SIZEOF_XMMWORD-2)	; xmm3=(15 -- -- -- -- -- -- --)

   460 	por	xmm1, XMMWORD [wk(1)]	; xmm1=(-1  0  1  2  3  4  5  6)

   461 	por	xmm5, XMMWORD [wk(3)]	; xmm5=( 9 10 11 12 13 14 15 16)

       ; Word 15 becomes the left-hand neighbour for the next block.
   463 	movdqa	XMMWORD [wk(1)], xmm3

   465 	pmullw	xmm6,[GOTOFF(ebx,PW_THREE)]

   466 	pmullw	xmm4,[GOTOFF(ebx,PW_THREE)]

   467 	paddw	xmm1,[GOTOFF(ebx,PW_EIGHT)]

   468 	paddw	xmm0,[GOTOFF(ebx,PW_EIGHT)]

   469 	paddw	xmm7,[GOTOFF(ebx,PW_SEVEN)]

   470 	paddw	xmm5,[GOTOFF(ebx,PW_SEVEN)]

   472 	paddw	xmm1,xmm6

   473 	paddw	xmm0,xmm4

   474 	psrlw	xmm1,4			; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)

   475 	psrlw	xmm0,4			; xmm0=Out1HE=(16 18 20 22 24 26 28 30)

   476 	paddw	xmm7,xmm6

   477 	paddw	xmm5,xmm4

   478 	psrlw	xmm7,4			; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)

   479 	psrlw	xmm5,4			; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

   481 	psllw	xmm7,BYTE_BIT

   482 	psllw	xmm5,BYTE_BIT

   483 	por	xmm1,xmm7		; xmm1=Out1L=( 0  1  2 ... 13 14 15)

   484 	por	xmm0,xmm5		; xmm0=Out1H=(16 17 18 ... 29 30 31)

   486 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm1

   487 	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm0

   489 	poppic	ebx	; ebx = inptr0 again

       ; Advance one input xmmword / two output xmmwords per row and decide
       ; which path handles the next block (general, last, or row finished).
   491 	sub	eax, byte SIZEOF_XMMWORD

   492 	add	ecx, byte 1*SIZEOF_XMMWORD	; inptr1(above)

   493 	add	ebx, byte 1*SIZEOF_XMMWORD	; inptr0

   494 	add	esi, byte 1*SIZEOF_XMMWORD	; inptr1(below)

   495 	add	edx, byte 2*SIZEOF_XMMWORD	; outptr0

   496 	add	edi, byte 2*SIZEOF_XMMWORD	; outptr1

   497 	cmp	eax, byte SIZEOF_XMMWORD

   498 	ja	near .columnloop

   499 	test	eax,eax

   500 	jnz	near .columnloop_last

       ; Row finished: restore the row-array pointers, rowctr and colctr.
   502 	pop	esi

   503 	pop	edi

   504 	pop	ecx

   505 	pop	eax

       ; One input row produced two output rows.
   507 	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data

   508 	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data

   509 	sub	ecx, byte 2			; rowctr

   510 	jg	near .rowloop	; continue while rowctr > 0

   512 .return:

   513 	pop	edi

   514 	pop	esi

   515 ;	pop	edx		; need not be preserved

   516 ;	pop	ecx		; need not be preserved

   517 	pop	ebx

       ; Discard the scratch area (and the gotptr slot), then reload the
       ; unaligned stack pointer that the prologue stored at [aligned ebp].
   518 	mov	esp,ebp		; esp <- aligned ebp

   519 	pop	esp		; esp <- original ebp

   520 	pop	ebp

   521 	ret

   523 ; --------------------------------------------------------------------------

   524 ;

   525 ; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.

   526 ; It's still a box filter.

   527 ;

   528 ; GLOBAL(void)

   529 ; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,

   530 ;                           JDIMENSION output_width,

   531 ;                           JSAMPARRAY input_data,

   532 ;                           JSAMPARRAY * output_data_ptr);

   533 ;

   535 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor

   536 %define output_width(b)	(b)+12		; JDIMENSION output_width

   537 %define input_data(b)		(b)+16		; JSAMPARRAY input_data

   538 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr

       ; Box-filter 2:1 horizontal upsampling: every input byte is written
       ; twice in a row (punpcklbw/punpckhbw of a register with itself).
       ;
       ; Register roles:
       ;   edx = output width rounded up to a multiple of 2*SIZEOF_XMMWORD
       ;   eax = colctr (output bytes left in the current row)
       ;   ecx = rowctr, esi = inptr, edi = outptr
       ; The column loop is unrolled twice; each half turns one input xmmword
       ; into two output xmmwords.
       ; NOTE(review): because of the rounding, up to 2*SIZEOF_XMMWORD-1
       ; bytes past output_width may be written, and all accesses use
       ; movdqa; rows are assumed to be 16-byte aligned and padded
       ; accordingly -- confirm against the allocator.

   540 	align	16

   541 	global	EXTN(jsimd_h2v1_upsample_sse2)

   543 EXTN(jsimd_h2v1_upsample_sse2):

   544 	push	ebp

   545 	mov	ebp,esp

   546 ;	push	ebx		; unused

   547 ;	push	ecx		; need not be preserved

   548 ;	push	edx		; need not be preserved

   549 	push	esi

   550 	push	edi

       ; edx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD;
       ; the "and" sets ZF, so a zero width returns immediately.
   552 	mov	edx, JDIMENSION [output_width(ebp)]

   553 	add	edx, byte (2*SIZEOF_XMMWORD)-1

   554 	and	edx, byte -(2*SIZEOF_XMMWORD)

   555 	jz	short .return

   557 	mov	ecx, INT [max_v_samp(ebp)]	; rowctr

   558 	test	ecx,ecx

   559 	jz	short .return

   561 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data

   562 	mov	edi, POINTER [output_data_ptr(ebp)]

   563 	mov	edi, JSAMPARRAY [edi]			; output_data

   564 	alignx	16,7

   565 .rowloop:

       ; Save the row-array pointers; they are restored in .nextrow.
   566 	push	edi

   567 	push	esi

   569 	mov	esi, JSAMPROW [esi]		; inptr

   570 	mov	edi, JSAMPROW [edi]		; outptr

   571 	mov	eax,edx				; colctr

   572 	alignx	16,7

   573 .columnloop:

       ; First half: input xmmword 0 -> output xmmwords 0 and 1.
   575 	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]

   577 	movdqa    xmm1,xmm0

   578 	punpcklbw xmm0,xmm0	; low 8 bytes, each duplicated

   579 	punpckhbw xmm1,xmm1	; high 8 bytes, each duplicated

   581 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

   582 	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

   584 	sub	eax, byte 2*SIZEOF_XMMWORD

   585 	jz	short .nextrow

       ; Second half: input xmmword 1 -> output xmmwords 2 and 3.
   587 	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]

   589 	movdqa    xmm3,xmm2

   590 	punpcklbw xmm2,xmm2

   591 	punpckhbw xmm3,xmm3

   593 	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2

   594 	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3

   596 	sub	eax, byte 2*SIZEOF_XMMWORD

   597 	jz	short .nextrow

   599 	add	esi, byte 2*SIZEOF_XMMWORD	; inptr

   600 	add	edi, byte 4*SIZEOF_XMMWORD	; outptr

   601 	jmp	short .columnloop

   602 	alignx	16,7

   604 .nextrow:

   605 	pop	esi

   606 	pop	edi

   608 	add	esi, byte SIZEOF_JSAMPROW	; input_data

   609 	add	edi, byte SIZEOF_JSAMPROW	; output_data

   610 	dec	ecx				; rowctr

   611 	jg	short .rowloop	; continue while rowctr > 0

   613 .return:

   614 	pop	edi

   615 	pop	esi

   616 ;	pop	edx		; need not be preserved

   617 ;	pop	ecx		; need not be preserved

   618 ;	pop	ebx		; unused

   619 	pop	ebp

   620 	ret

   622 ; --------------------------------------------------------------------------

   623 ;

   624 ; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.

   625 ; It's still a box filter.

   626 ;

   627 ; GLOBAL(void)

   628 ; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,

   629 ;                           JDIMENSION output_width,

   630 ;                           JSAMPARRAY input_data,

   631 ;                           JSAMPARRAY * output_data_ptr);

   632 ;

   634 %define max_v_samp(b)		(b)+8			; int max_v_samp_factor

   635 %define output_width(b)	(b)+12		; JDIMENSION output_width

   636 %define input_data(b)		(b)+16		; JSAMPARRAY input_data

   637 %define output_data_ptr(b)	(b)+20		; JSAMPARRAY * output_data_ptr

       ; Box-filter 2:1 horizontal and 2:1 vertical upsampling: every input
       ; byte is duplicated horizontally and the resulting row is stored to
       ; two consecutive output rows.
       ;
       ; Register roles:
       ;   edx = output width rounded up to a multiple of 2*SIZEOF_XMMWORD
       ;   eax = colctr (output bytes left in the current row)
       ;   ecx = rowctr (counts output rows, decremented by 2 per input row)
       ;   esi = inptr, ebx = outptr0, edi = outptr1
       ; ebx is callee-saved here with a plain push/pop because it is used as
       ; a data pointer; this routine references no constants through GOTOFF.
       ; NOTE(review): because of the rounding, up to 2*SIZEOF_XMMWORD-1
       ; bytes past output_width may be written, and all accesses use
       ; movdqa; rows are assumed to be 16-byte aligned and padded
       ; accordingly -- confirm against the allocator.

   639 	align	16

   640 	global	EXTN(jsimd_h2v2_upsample_sse2)

   642 EXTN(jsimd_h2v2_upsample_sse2):

   643 	push	ebp

   644 	mov	ebp,esp

   645 	push	ebx

   646 ;	push	ecx		; need not be preserved

   647 ;	push	edx		; need not be preserved

   648 	push	esi

   649 	push	edi

       ; edx = output_width rounded up to a multiple of 2*SIZEOF_XMMWORD;
       ; the "and" sets ZF, so a zero width returns immediately.
   651 	mov	edx, JDIMENSION [output_width(ebp)]

   652 	add	edx, byte (2*SIZEOF_XMMWORD)-1

   653 	and	edx, byte -(2*SIZEOF_XMMWORD)

   654 	jz	near .return

   656 	mov	ecx, INT [max_v_samp(ebp)]	; rowctr

   657 	test	ecx,ecx

   658 	jz	near .return

   660 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data

   661 	mov	edi, POINTER [output_data_ptr(ebp)]

   662 	mov	edi, JSAMPARRAY [edi]			; output_data

   663 	alignx	16,7

   664 .rowloop:

       ; Save the row-array pointers; they are restored in .nextrow.
   665 	push	edi

   666 	push	esi

   668 	mov	esi, JSAMPROW [esi]			; inptr

   669 	mov	ebx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]	; outptr0

   670 	mov	edi, JSAMPROW [edi+1*SIZEOF_JSAMPROW]	; outptr1

   671 	mov	eax,edx					; colctr

   672 	alignx	16,7

   673 .columnloop:

       ; First half: input xmmword 0 -> output xmmwords 0 and 1 of both rows.
   675 	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]

   677 	movdqa    xmm1,xmm0

   678 	punpcklbw xmm0,xmm0	; low 8 bytes, each duplicated

   679 	punpckhbw xmm1,xmm1	; high 8 bytes, each duplicated

   681 	movdqa	XMMWORD [ebx+0*SIZEOF_XMMWORD], xmm0

   682 	movdqa	XMMWORD [ebx+1*SIZEOF_XMMWORD], xmm1

   683 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

   684 	movdqa	XMMWORD [edi+1*SIZEOF_XMMWORD], xmm1

   686 	sub	eax, byte 2*SIZEOF_XMMWORD

   687 	jz	short .nextrow

       ; Second half: input xmmword 1 -> output xmmwords 2 and 3 of both rows.
   689 	movdqa	xmm2, XMMWORD [esi+1*SIZEOF_XMMWORD]

   691 	movdqa    xmm3,xmm2

   692 	punpcklbw xmm2,xmm2

   693 	punpckhbw xmm3,xmm3

   695 	movdqa	XMMWORD [ebx+2*SIZEOF_XMMWORD], xmm2

   696 	movdqa	XMMWORD [ebx+3*SIZEOF_XMMWORD], xmm3

   697 	movdqa	XMMWORD [edi+2*SIZEOF_XMMWORD], xmm2

   698 	movdqa	XMMWORD [edi+3*SIZEOF_XMMWORD], xmm3

   700 	sub	eax, byte 2*SIZEOF_XMMWORD

   701 	jz	short .nextrow

   703 	add	esi, byte 2*SIZEOF_XMMWORD	; inptr

   704 	add	ebx, byte 4*SIZEOF_XMMWORD	; outptr0

   705 	add	edi, byte 4*SIZEOF_XMMWORD	; outptr1

   706 	jmp	short .columnloop

   707 	alignx	16,7

   709 .nextrow:

   710 	pop	esi

   711 	pop	edi

       ; One input row produced two output rows.
   713 	add	esi, byte 1*SIZEOF_JSAMPROW	; input_data

   714 	add	edi, byte 2*SIZEOF_JSAMPROW	; output_data

   715 	sub	ecx, byte 2			; rowctr

   716 	jg	short .rowloop	; continue while rowctr > 0

   718 .return:

   719 	pop	edi

   720 	pop	esi

   721 ;	pop	edx		; need not be preserved

   722 ;	pop	ecx		; need not be preserved

   723 	pop	ebx

   724 	pop	ebp

   725 	ret

   727 ; For some reason, the OS X linker does not honor the request to align the

   728 ; segment unless we do this.

   729 	align	16

The Tor Browser / file revision

media/libjpeg/simd/jdsamss2.asm@b8a032363ba2

media/libjpeg/simd/jdsamss2.asm