The Tor Browser: media/libjpeg/simd/jiss2flt.asm@b8a032363ba2

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

1 ;

     2 ; jiss2flt.asm - floating-point IDCT (SSE & SSE2)

3 ;

     4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

5 ;

     6 ; Based on

     7 ; x86 SIMD extension for IJG JPEG library

     8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

     9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

    10 ;

    11 ; This file should be assembled with NASM (Netwide Assembler),

    12 ; and can *not* be assembled with Microsoft's MASM or any compatible

    13 ; assembler (including Borland's Turbo Assembler).

    14 ; NASM is available from http://nasm.sourceforge.net/ or

    15 ; http://sourceforge.net/project/showfiles.php?group_id=6208

    16 ;

    17 ; This file contains a floating-point implementation of the inverse DCT

    18 ; (Discrete Cosine Transform). The following code is based directly on

    19 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.

    20 ;

    21 ; [TAB8]

    23 %include "jsimdext.inc"

    24 %include "jdct.inc"

    26 ; --------------------------------------------------------------------------

    28 %macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
       	; SHUFPS imm8 0x44 is binary 01 00 01 00: result lanes 0-1 are lanes
       	; 0-1 of %1 and result lanes 2-3 are lanes 0-1 of %2, i.e. the low
       	; 64-bit halves of both operands are concatenated into %1.

    29 	shufps	%1,%2,0x44

    30 %endmacro

    32 %macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
       	; SHUFPS imm8 0xEE is binary 11 10 11 10: result lanes 0-1 are lanes
       	; 2-3 of %1 and result lanes 2-3 are lanes 2-3 of %2, i.e. the high
       	; 64-bit halves of both operands are concatenated into %1.

    33 	shufps	%1,%2,0xEE

    34 %endmacro

    36 ; --------------------------------------------------------------------------

    37 	SECTION	SEG_CONST
       	; Read-only constants used by jsimd_idct_float_sse2.  Every PD_* entry
       	; is one single-precision value repeated four times so that it can be
       	; applied to a whole XMM register at once.  The table is read with
       	; movaps/movdqa and as a direct mulps memory operand, all of which
       	; require 16-byte alignment (presumably what "alignz 16" provides;
       	; the macro is defined outside this file).
       	; With c(k) = cos(k*pi/16) the multipliers are:
       	;   1.414213562 = 2*c(4)          1.847759065 = 2*c(2)
       	;   1.082392200 = 2*(c(2)-c(6))   2.613125930 = 2*(c(2)+c(6))

    39 	alignz	16

    40 	global	EXTN(jconst_idct_float_sse2)

    42 EXTN(jconst_idct_float_sse2):

    44 PD_1_414	times 4 dd  1.414213562373095048801689

    45 PD_1_847	times 4 dd  1.847759065022573512256366

    46 PD_1_082	times 4 dd  1.082392200292393968799446

    47 PD_M2_613	times 4 dd -2.613125929752753055713286

       	; 100663296.0 = 1.5 * 2^26.  A single-precision value of that size has
       	; a unit in the last place of 8, so adding it to x leaves round(x/8)
       	; in the low mantissa bits; pass 2 extracts the low 16 bits.
    48 PD_RNDINT_MAGIC	times 4 dd  100663296.0	; (float)(0x00C00000 << 3)

       	; Byte bias added after the signed saturating pack in pass 2
       	; (CENTERJSAMPLE is defined outside this file).
    49 PB_CENTERJSAMP	times 16 db CENTERJSAMPLE

    51 	alignz	16

    53 ; --------------------------------------------------------------------------

    54 	SECTION	SEG_TEXT

    55 	BITS	32

    56 ;

    57 ; Perform dequantization and inverse DCT on one block of coefficients.

    58 ;

    59 ; GLOBAL(void)

    60 ; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,

    61 ;                        JSAMPARRAY output_buf, JDIMENSION output_col)

    62 ;

       ; Arguments are taken from the stack (32-bit C calling convention).
       ; ebx, esi, edi and ebp are saved and restored; eax, ecx, edx and the
       ; XMM registers are used as scratch.
       ;
       ; Stack frame built by the prologue (higher addresses first), where
       ; "b" is the value of esp right after "push ebp":
       ;   [b+20]       output_col
       ;   [b+16]       output_buf
       ;   [b+12]       coef_block
       ;   [b+8]        dct_table
       ;   [b+4]        return address
       ;   [b+0]        caller's ebp
       ;   ...          padding introduced by the alignment of esp
       ;   [ebp+0]      original_ebp: holds the value b
       ;   wk(1)        one XMMWORD directly below ebp
       ;   wk(0)        one XMMWORD below wk(1)
       ;   workspace    DCTSIZE2 FAST_FLOATs directly below wk(0)
       ;   below that   saved ebx, esi, edi
       ; ebp is aligned to SIZEOF_XMMWORD, so wk() can be accessed with
       ; movaps; the workspace is aligned as well provided its size is a
       ; multiple of SIZEOF_XMMWORD (sizes are defined outside this file).

    64 %define dct_table(b)	(b)+8			; void * dct_table

    65 %define coef_block(b)	(b)+12		; JCOEFPTR coef_block

    66 %define output_buf(b)	(b)+16		; JSAMPARRAY output_buf

    67 %define output_col(b)	(b)+20		; JDIMENSION output_col

    69 %define original_ebp	ebp+0

    70 %define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]

    71 %define WK_NUM		2

    72 %define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT

    73 					; FAST_FLOAT workspace[DCTSIZE2]

    75 	align	16

    76 	global	EXTN(jsimd_idct_float_sse2)

    78 EXTN(jsimd_idct_float_sse2):

    79 	push	ebp

    80 	mov	eax,esp				; eax = address of the saved ebp (argument base)

    81 	sub	esp, byte 4			; room for the saved frame address

    82 	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits

    83 	mov	[esp],eax			; [original_ebp] = unaligned frame address

    84 	mov	ebp,esp				; ebp = aligned ebp

    85 	lea	esp, [workspace]		; reserve wk[] and workspace[]

    86 	push	ebx

    87 ;	push	ecx		; need not be preserved

    88 ;	push	edx		; need not be preserved

    89 	push	esi

    90 	push	edi

       	; get_GOT is defined outside this file; ebx is used afterwards as the
       	; base register of every GOTOFF() constant reference.
    92 	get_GOT	ebx		; get GOT address

    94 	; ---- Pass 1: process columns from input, store into work array.
       	;
       	; The loop runs DCTSIZE/4 times and handles four adjacent columns per
       	; iteration: each movq fetches four coefficients of one row, which are
       	; sign-extended to 32 bits (punpcklwd + psrad), converted to float and
       	; multiplied by the matching dct_table entries.  The results are
       	; transposed before they are written to the workspace.
       	;
       	; Register roles: edx = quantptr, esi = inptr, edi = wsptr, ecx = ctr,
       	; ebx = GOT base.  eax is expected to still hold the argument base set
       	; up in the prologue (this assumes get_GOT does not modify eax), which
       	; is why the reload below is commented out.

    96 ;	mov	eax, [original_ebp]

    97 	mov	edx, POINTER [dct_table(eax)]	; quantptr

    98 	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr

    99 	lea	edi, [workspace]			; FAST_FLOAT * wsptr

   100 	mov	ecx, DCTSIZE/4				; ctr

   101 	alignx	16,7

   102 .columnloop:

   103 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE

       	; Cheap early-out: if the dwords read from rows 1 and 2 contain any
       	; set bit, go straight to the full IDCT.
   104 	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]

   105 	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]

   106 	jnz	near .columnDCT

       	; Full test: OR rows 1..7 of the four columns together.  packsswb
       	; saturates, so a non-zero word always yields a non-zero byte; the
       	; low dword is therefore zero only if all these AC terms are zero.
   108 	movq	xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]

   109 	movq	xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]

   110 	movq	xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]

   111 	movq	xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]

   112 	movq	xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]

   113 	movq	xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

   114 	movq	xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

   115 	por	xmm1,xmm2

   116 	por	xmm3,xmm4

   117 	por	xmm5,xmm6

   118 	por	xmm1,xmm3

   119 	por	xmm5,xmm7

   120 	por	xmm1,xmm5

   121 	packsswb xmm1,xmm1

   122 	movd	eax,xmm1

   123 	test	eax,eax

   124 	jnz	short .columnDCT

   126 	; -- AC terms all zero
       	; Only row 0 (the DC terms) is non-zero, so each dequantized DC value
       	; is broadcast and written to all eight workspace entries of its
       	; column (two XMMWORD stores of four identical floats each).

   128 	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]

   130 	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)

   131 	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)

   132 	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)

   134 	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

   136 	movaps	xmm1,xmm0

   137 	movaps	xmm2,xmm0

   138 	movaps	xmm3,xmm0

   140 	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)

   141 	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)

   142 	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)

   143 	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)

   145 	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0

   146 	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0

   147 	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1

   148 	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1

   149 	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2

   150 	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2

   151 	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3

   152 	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3

   153 	jmp	near .nextcolumn

   154 	alignx	16,7

   155 %endif

   156 .columnDCT:

   158 	; -- Even part
       	; Rows 0, 2, 4, 6 of the four columns.  The tmpNN / zNN names in the
       	; comments are the variable names of the C reference (jidctflt.c).

   160 	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]

   161 	movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]

   162 	movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]

   163 	movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]

   165 	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)

   166 	punpcklwd xmm1,xmm1		; xmm1=(20 20 21 21 22 22 23 23)

   167 	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)

   168 	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in2=(20 21 22 23)

   169 	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)

   170 	cvtdq2ps  xmm1,xmm1			; xmm1=in2=(20 21 22 23)

   172 	punpcklwd xmm2,xmm2		; xmm2=(40 40 41 41 42 42 43 43)

   173 	punpcklwd xmm3,xmm3		; xmm3=(60 60 61 61 62 62 63 63)

   174 	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in4=(40 41 42 43)

   175 	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in6=(60 61 62 63)

   176 	cvtdq2ps  xmm2,xmm2			; xmm2=in4=(40 41 42 43)

   177 	cvtdq2ps  xmm3,xmm3			; xmm3=in6=(60 61 62 63)

   179 	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

   180 	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

   181 	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

   182 	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

   184 	movaps	xmm4,xmm0

   185 	movaps	xmm5,xmm1

   186 	subps	xmm0,xmm2		; xmm0=tmp11

   187 	subps	xmm1,xmm3

   188 	addps	xmm4,xmm2		; xmm4=tmp10

   189 	addps	xmm5,xmm3		; xmm5=tmp13

   191 	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]

   192 	subps	xmm1,xmm5		; xmm1=tmp12

   194 	movaps	xmm6,xmm4

   195 	movaps	xmm7,xmm0

   196 	subps	xmm4,xmm5		; xmm4=tmp3

   197 	subps	xmm0,xmm1		; xmm0=tmp2

   198 	addps	xmm6,xmm5		; xmm6=tmp0

   199 	addps	xmm7,xmm1		; xmm7=tmp1

       	; tmp2/tmp3 are spilled to wk[] because all eight XMM registers are
       	; needed for the odd part; tmp0/tmp1 stay in xmm6/xmm7.
   201 	movaps	XMMWORD [wk(1)], xmm4	; tmp3

   202 	movaps	XMMWORD [wk(0)], xmm0	; tmp2

   204 	; -- Odd part
       	; Rows 1, 3, 5, 7 of the same four columns.

   206 	movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]

   207 	movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]

   208 	movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]

   209 	movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]

   211 	punpcklwd xmm2,xmm2		; xmm2=(10 10 11 11 12 12 13 13)

   212 	punpcklwd xmm3,xmm3		; xmm3=(30 30 31 31 32 32 33 33)

   213 	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in1=(10 11 12 13)

   214 	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in3=(30 31 32 33)

   215 	cvtdq2ps  xmm2,xmm2			; xmm2=in1=(10 11 12 13)

   216 	cvtdq2ps  xmm3,xmm3			; xmm3=in3=(30 31 32 33)

   218 	punpcklwd xmm5,xmm5		; xmm5=(50 50 51 51 52 52 53 53)

   219 	punpcklwd xmm1,xmm1		; xmm1=(70 70 71 71 72 72 73 73)

   220 	psrad     xmm5,(DWORD_BIT-WORD_BIT)	; xmm5=in5=(50 51 52 53)

   221 	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in7=(70 71 72 73)

   222 	cvtdq2ps  xmm5,xmm5			; xmm5=in5=(50 51 52 53)

   223 	cvtdq2ps  xmm1,xmm1			; xmm1=in7=(70 71 72 73)

   225 	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

   226 	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

   227 	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

   228 	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

   230 	movaps	xmm4,xmm2

   231 	movaps	xmm0,xmm5

   232 	addps	xmm2,xmm1		; xmm2=z11

   233 	addps	xmm5,xmm3		; xmm5=z13

   234 	subps	xmm4,xmm1		; xmm4=z12

   235 	subps	xmm0,xmm3		; xmm0=z10

   237 	movaps	xmm1,xmm2

   238 	subps	xmm2,xmm5

   239 	addps	xmm1,xmm5		; xmm1=tmp7

   241 	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11

   243 	movaps	xmm3,xmm0

   244 	addps	xmm0,xmm4

   245 	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5

   246 	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)

   247 	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)

   248 	addps	xmm3,xmm0		; xmm3=tmp12

   249 	subps	xmm4,xmm0		; xmm4=tmp10

   251 	; -- Final output stage
       	; Combine the even and odd halves into the eight output rows, then
       	; transpose in two phases (unpcklps/unpckhps on register pairs, then
       	; the unpcklps2/unpckhps2 macros).  wk[] is reused to hold the
       	; half-transposed rows 6/7 once tmp2/tmp3 have been reloaded.

   253 	subps	xmm3,xmm1		; xmm3=tmp6

   254 	movaps	xmm5,xmm6

   255 	movaps	xmm0,xmm7

   256 	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)

   257 	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)

   258 	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)

   259 	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)

   260 	subps	xmm2,xmm3		; xmm2=tmp5

   262 	movaps    xmm1,xmm6		; transpose coefficients(phase 1)

   263 	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)

   264 	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)

   265 	movaps    xmm3,xmm0		; transpose coefficients(phase 1)

   266 	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)

   267 	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)

   269 	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2

   270 	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3

   272 	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)

   273 	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)

   275 	addps	xmm4,xmm2		; xmm4=tmp4

   276 	movaps	xmm0,xmm7

   277 	movaps	xmm3,xmm5

   278 	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)

   279 	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)

   280 	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)

   281 	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)

   283 	movaps    xmm2,xmm7		; transpose coefficients(phase 1)

   284 	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)

   285 	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)

   286 	movaps    xmm4,xmm5		; transpose coefficients(phase 1)

   287 	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)

   288 	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)

   290 	movaps    xmm3,xmm6		; transpose coefficients(phase 2)

   291 	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)

   292 	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)

   293 	movaps    xmm0,xmm1		; transpose coefficients(phase 2)

   294 	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)

   295 	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)

   297 	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)

   298 	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)

   300 	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6

   301 	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3

   302 	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1

   303 	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0

   305 	movaps    xmm6,xmm5		; transpose coefficients(phase 2)

   306 	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)

   307 	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)

   308 	movaps    xmm3,xmm4		; transpose coefficients(phase 2)

   309 	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)

   310 	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)

   312 	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5

   313 	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6

   314 	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4

   315 	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3

   317 .nextcolumn:
       	; Step to the next group of four columns: four coefficients and four
       	; multipliers to the right, four DCTSIZE-float rows further into the
       	; (transposed) workspace.

   318 	add	esi, byte 4*SIZEOF_JCOEF		; coef_block

   319 	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr

   320 	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr

   321 	dec	ecx					; ctr

   322 	jnz	near .columnloop

   324 	; -- Prefetch the next coefficient block
       	; The loop has advanced esi by DCTSIZE coefficients in total, so with
       	; DCTSIZE = 8 the address esi + (DCTSIZE2-8)*SIZEOF_JCOEF is the first
       	; byte after the current block.  Four hints are issued 32 bytes apart.

   326 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]

   327 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]

   328 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]

   329 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

   331 	; ---- Pass 2: process rows from work array, store into output array.
       	;
       	; The loop runs DCTSIZE/4 times; each iteration reads eight XMMWORDs
       	; from the workspace and writes eight samples to each of four output
       	; rows.  Register roles: esi = wsptr, edi = pointer into the array of
       	; output row pointers, eax = output_col, ecx = ctr, ebx = GOT base.
       	; The arithmetic is the same butterfly network as in pass 1, followed
       	; by descaling (divide by 8 with rounding), packing to bytes and a
       	; final transpose.

   333 	mov	eax, [original_ebp]

   334 	lea	esi, [workspace]			; FAST_FLOAT * wsptr

   335 	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)

   336 	mov	eax, JDIMENSION [output_col(eax)]

   337 	mov	ecx, DCTSIZE/4				; ctr

   338 	alignx	16,7

   339 .rowloop:

   341 	; -- Even part

   343 	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]

   344 	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]

   345 	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]

   346 	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

   348 	movaps	xmm4,xmm0

   349 	movaps	xmm5,xmm1

   350 	subps	xmm0,xmm2		; xmm0=tmp11

   351 	subps	xmm1,xmm3

   352 	addps	xmm4,xmm2		; xmm4=tmp10

   353 	addps	xmm5,xmm3		; xmm5=tmp13

   355 	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]

   356 	subps	xmm1,xmm5		; xmm1=tmp12

   358 	movaps	xmm6,xmm4

   359 	movaps	xmm7,xmm0

   360 	subps	xmm4,xmm5		; xmm4=tmp3

   361 	subps	xmm0,xmm1		; xmm0=tmp2

   362 	addps	xmm6,xmm5		; xmm6=tmp0

   363 	addps	xmm7,xmm1		; xmm7=tmp1

   365 	movaps	XMMWORD [wk(1)], xmm4	; tmp3

   366 	movaps	XMMWORD [wk(0)], xmm0	; tmp2

   368 	; -- Odd part

   370 	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]

   371 	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]

   372 	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]

   373 	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

   375 	movaps	xmm4,xmm2

   376 	movaps	xmm0,xmm5

   377 	addps	xmm2,xmm1		; xmm2=z11

   378 	addps	xmm5,xmm3		; xmm5=z13

   379 	subps	xmm4,xmm1		; xmm4=z12

   380 	subps	xmm0,xmm3		; xmm0=z10

   382 	movaps	xmm1,xmm2

   383 	subps	xmm2,xmm5

   384 	addps	xmm1,xmm5		; xmm1=tmp7

   386 	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11

   388 	movaps	xmm3,xmm0

   389 	addps	xmm0,xmm4

   390 	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5

   391 	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)

   392 	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)

   393 	addps	xmm3,xmm0		; xmm3=tmp12

   394 	subps	xmm4,xmm0		; xmm4=tmp10

   396 	; -- Final output stage

   398 	subps	xmm3,xmm1		; xmm3=tmp6

   399 	movaps	xmm5,xmm6

   400 	movaps	xmm0,xmm7

   401 	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)

   402 	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)

   403 	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)

   404 	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)

   405 	subps	xmm2,xmm3		; xmm2=tmp5

       	; Descale and convert to integer without a conversion instruction:
       	; PD_RNDINT_MAGIC is 1.5 * 2^26, a magnitude at which a float has a
       	; step of 8, so after the addition the low mantissa bits hold
       	; round(x/8) in two's complement (rounding follows the current MXCSR
       	; mode, presumably round-to-nearest).  Only the low 16 bits of each
       	; dword are used below: pand keeps them in place for one register,
       	; pslld moves them to the upper word for its partner, and por merges
       	; the two into interleaved 16-bit values.
   407 	movaps	xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm1=[PD_RNDINT_MAGIC]

   408 	pcmpeqd	xmm3,xmm3

   409 	psrld	xmm3,WORD_BIT		; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

   411 	addps	xmm6,xmm1	; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)

   412 	addps	xmm7,xmm1	; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)

   413 	addps	xmm0,xmm1	; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)

   414 	addps	xmm5,xmm1	; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

   416 	pand	xmm6,xmm3		; xmm6=(00 -- 10 -- 20 -- 30 --)

   417 	pslld	xmm7,WORD_BIT		; xmm7=(-- 01 -- 11 -- 21 -- 31)

   418 	pand	xmm0,xmm3		; xmm0=(06 -- 16 -- 26 -- 36 --)

   419 	pslld	xmm5,WORD_BIT		; xmm5=(-- 07 -- 17 -- 27 -- 37)

   420 	por	xmm6,xmm7		; xmm6=(00 01 10 11 20 21 30 31)

   421 	por	xmm0,xmm5		; xmm0=(06 07 16 17 26 27 36 37)

   423 	movaps	xmm1, XMMWORD [wk(0)]	; xmm1=tmp2

   424 	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=tmp3

   426 	addps	xmm4,xmm2		; xmm4=tmp4

   427 	movaps	xmm7,xmm1

   428 	movaps	xmm5,xmm3

   429 	addps	xmm1,xmm2		; xmm1=data2=(02 12 22 32)

   430 	addps	xmm3,xmm4		; xmm3=data4=(04 14 24 34)

   431 	subps	xmm7,xmm2		; xmm7=data5=(05 15 25 35)

   432 	subps	xmm5,xmm4		; xmm5=data3=(03 13 23 33)

       	; Same rounding and word-interleaving sequence for data2..data5.
   434 	movaps	xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm2=[PD_RNDINT_MAGIC]

   435 	pcmpeqd	xmm4,xmm4

   436 	psrld	xmm4,WORD_BIT		; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

   438 	addps	xmm3,xmm2	; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)

   439 	addps	xmm7,xmm2	; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)

   440 	addps	xmm1,xmm2	; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)

   441 	addps	xmm5,xmm2	; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

   443 	pand	xmm3,xmm4		; xmm3=(04 -- 14 -- 24 -- 34 --)

   444 	pslld	xmm7,WORD_BIT		; xmm7=(-- 05 -- 15 -- 25 -- 35)

   445 	pand	xmm1,xmm4		; xmm1=(02 -- 12 -- 22 -- 32 --)

   446 	pslld	xmm5,WORD_BIT		; xmm5=(-- 03 -- 13 -- 23 -- 33)

   447 	por	xmm3,xmm7		; xmm3=(04 05 14 15 24 25 34 35)

   448 	por	xmm1,xmm5		; xmm1=(02 03 12 13 22 23 32 33)

       	; packsswb clamps each 16-bit value to the signed byte range; adding
       	; PB_CENTERJSAMP (paddb, wrapping) then shifts the result into the
       	; sample range.  The punpck*/pshufd steps afterwards finish the
       	; transpose so that each 64-bit half holds one complete output row.
   450 	movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm2=[PB_CENTERJSAMP]

   452 	packsswb  xmm6,xmm3	; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)

   453 	packsswb  xmm1,xmm0	; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)

   454 	paddb     xmm6,xmm2

   455 	paddb     xmm1,xmm2

   457 	movdqa    xmm4,xmm6	; transpose coefficients(phase 2)

   458 	punpcklwd xmm6,xmm1	; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)

   459 	punpckhwd xmm4,xmm1	; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

   461 	movdqa    xmm7,xmm6	; transpose coefficients(phase 3)

   462 	punpckldq xmm6,xmm4	; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)

   463 	punpckhdq xmm7,xmm4	; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

   465 	pshufd	xmm5,xmm6,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)

   466 	pshufd	xmm3,xmm7,0x4E	; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

       	; ebx is borrowed as a second row pointer for the stores, so the GOT
       	; address is saved first and restored afterwards (pushpic/poppic are
       	; defined outside this file).  Each movq writes the low eight bytes
       	; of a register to row pointer + output_col.
   468 	pushpic	ebx			; save GOT address

   470 	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]

   471 	mov	ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]

   472 	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6

   473 	movq	XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7

   474 	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]

   475 	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]

   476 	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5

   477 	movq	XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3

   479 	poppic	ebx			; restore GOT address

   481 	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr

   482 	add	edi, byte 4*SIZEOF_JSAMPROW

   483 	dec	ecx				; ctr

   484 	jnz	near .rowloop

       	; Epilogue: undo the prologue in reverse order.  esp is back at the
       	; three saved registers here; [ebp] holds the unaligned frame address
       	; stored by the prologue, which "pop esp" loads directly into esp.
   486 	pop	edi

   487 	pop	esi

   488 ;	pop	edx		; need not be preserved

   489 ;	pop	ecx		; need not be preserved

   490 	pop	ebx

   491 	mov	esp,ebp		; esp <- aligned ebp

   492 	pop	esp		; esp <- saved frame address (points at caller's ebp)

   493 	pop	ebp

   494 	ret

   496 ; For some reason, the OS X linker does not honor the request to align the

   497 ; segment unless we do this.

   498 	align	16

The Tor Browser / file revision

media/libjpeg/simd/jiss2flt.asm@b8a032363ba2

media/libjpeg/simd/jiss2flt.asm