media/libjpeg/simd/jiss2flt.asm

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ;
     2 ; jiss2flt.asm - floating-point IDCT (SSE & SSE2)
     3 ;
     4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
     5 ;
     6 ; Based on
     7 ; x86 SIMD extension for IJG JPEG library
     8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
    10 ;
    11 ; This file should be assembled with NASM (Netwide Assembler); it
    12 ; can *not* be assembled with Microsoft's MASM or any compatible
    13 ; assembler (including Borland's Turbo Assembler).
    14 ; NASM is available from http://nasm.sourceforge.net/ or
    15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
    16 ;
    17 ; This file contains a floating-point implementation of the inverse DCT
    18 ; (Discrete Cosine Transform). The following code is based directly on
    19 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
    20 ;
    21 ; [TAB8]
    23 %include "jsimdext.inc"
    24 %include "jdct.inc"
    26 ; --------------------------------------------------------------------------
    28 %macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
    29 	shufps	%1,%2,0x44
    30 %endmacro
    32 %macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
    33 	shufps	%1,%2,0xEE
    34 %endmacro
    36 ; --------------------------------------------------------------------------
       ; Constant table used by jsimd_idct_float_sse2.  Each PD_* entry is one
       ; single-precision value replicated 4 times (one 16-byte vector), and
       ; PB_CENTERJSAMP is one byte value replicated 16 times.  The table start
       ; is padded to a 16-byte boundary by the alignz below.
    37 	SECTION	SEG_CONST
    39 	alignz	16
    40 	global	EXTN(jconst_idct_float_sse2)
    42 EXTN(jconst_idct_float_sse2):
    44 PD_1_414	times 4 dd  1.414213562373095048801689	; sqrt(2)
    45 PD_1_847	times 4 dd  1.847759065022573512256366	; 2*cos(pi/8)
    46 PD_1_082	times 4 dd  1.082392200292393968799446	; 2*(cos(pi/8)-cos(3*pi/8))
    47 PD_M2_613	times 4 dd -2.613125929752753055713286	; -2*(cos(pi/8)+cos(3*pi/8))
       ; 100663296.0 = 3*2^25.  Floats in [2^26,2^27) have a spacing of 8, so
       ; adding this value rounds to a multiple of 8 and leaves round(x/8) in
       ; the low-order mantissa bits (used by pass 2 of the IDCT).
    48 PD_RNDINT_MAGIC	times 4 dd  100663296.0	; (float)(0x00C00000 << 3)
    49 PB_CENTERJSAMP	times 16 db CENTERJSAMPLE	; added to every packed output byte
    51 	alignz	16
    53 ; --------------------------------------------------------------------------
    54 	SECTION	SEG_TEXT
    55 	BITS	32
    56 ;
    57 ; Perform dequantization and inverse DCT on one block of coefficients.
    58 ;
    59 ; GLOBAL(void)
    60 ; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
    61 ;                        JSAMPARRAY output_buf, JDIMENSION output_col)
    62 ;
       ; 32-bit code; the four arguments are read from the stack at +8..+20
       ; relative to the frame base captured in the prologue (see the %defines
       ; below).
       ;
       ; Registers: ebx, esi and edi are pushed on entry and popped on exit;
       ;            ebp is restored on exit; eax, ecx, edx and xmm0-xmm7 are
       ;            overwritten.  ebx holds the GOT address (get_GOT) and is
       ;            temporarily reused as a row pointer in pass 2.
       ; Stack:     realigned to SIZEOF_XMMWORD; holds wk[WK_NUM] (xmmword
       ;            spill slots) and below them workspace[DCTSIZE2].
       ;
       ; Pass 1 reads coef_block 4 columns at a time, multiplies by dct_table,
       ; runs the column IDCT and stores the result transposed in workspace.
       ; Pass 2 runs the row IDCT on workspace, converts to bytes and stores
       ; 8 bytes to each of 4 output rows per iteration at
       ; output_buf[row] + output_col*SIZEOF_JSAMPLE.
       ;
    64 %define dct_table(b)	(b)+8			; void * dct_table
    65 %define coef_block(b)	(b)+12		; JCOEFPTR coef_block
    66 %define output_buf(b)	(b)+16		; JSAMPARRAY output_buf
    67 %define output_col(b)	(b)+20		; JDIMENSION output_col
    69 %define original_ebp	ebp+0
    70 %define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
    71 %define WK_NUM		2
    72 %define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
    73 					; FAST_FLOAT workspace[DCTSIZE2]
    75 	align	16
    76 	global	EXTN(jsimd_idct_float_sse2)
    78 EXTN(jsimd_idct_float_sse2):
    79 	push	ebp
    80 	mov	eax,esp				; eax = unaligned frame base ("original ebp"): args are at eax+8..
    81 	sub	esp, byte 4
    82 	and	esp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
    83 	mov	[esp],eax			; saved at [original_ebp] for pass 2 and the epilogue
    84 	mov	ebp,esp				; ebp = aligned ebp
    85 	lea	esp, [workspace]		; reserve wk[] and workspace[] below ebp
    86 	push	ebx
    87 ;	push	ecx		; need not be preserved
    88 ;	push	edx		; need not be preserved
    89 	push	esi
    90 	push	edi
    92 	get_GOT	ebx		; get GOT address
    94 	; ---- Pass 1: process columns from input, store into work array.
       	; Each iteration handles 4 columns (ctr = DCTSIZE/4) and writes them
       	; transposed into the workspace.
    96 ;	mov	eax, [original_ebp]
       	; (reload not needed: eax still holds the frame base set in the
       	; prologue.  NOTE(review): this relies on get_GOT leaving eax intact;
       	; the macro is defined outside this file -- confirm.)
    97 	mov	edx, POINTER [dct_table(eax)]	; quantptr
    98 	mov	esi, JCOEFPTR [coef_block(eax)]		; inptr
    99 	lea	edi, [workspace]			; FAST_FLOAT * wsptr
   100 	mov	ecx, DCTSIZE/4				; ctr
   101 	alignx	16,7
   102 .columnloop:
   103 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
       	; Shortcut test: if rows 1..7 of these 4 columns are all zero, only
       	; the dequantized DC term needs to be broadcast.  A cheap dword test
       	; on rows 1 and 2 comes first; eax is free to clobber from here on.
   104 	mov	eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
   105 	or	eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
   106 	jnz	near .columnDCT
   108 	movq	xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
   109 	movq	xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
   110 	movq	xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
   111 	movq	xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
   112 	movq	xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
   113 	movq	xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
   114 	movq	xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
   115 	por	xmm1,xmm2
   116 	por	xmm3,xmm4
   117 	por	xmm5,xmm6
   118 	por	xmm1,xmm3
   119 	por	xmm5,xmm7
   120 	por	xmm1,xmm5
   121 	packsswb xmm1,xmm1		; saturating pack: a non-zero word stays a non-zero byte
   122 	movd	eax,xmm1		; so the 4 OR-ed words can be tested in one dword
   123 	test	eax,eax
   124 	jnz	short .columnDCT
   126 	; -- AC terms all zero
   128 	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
   130 	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
   131 	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
   132 	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
   134 	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
   136 	movaps	xmm1,xmm0
   137 	movaps	xmm2,xmm0
   138 	movaps	xmm3,xmm0
   140 	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
   141 	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
   142 	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
   143 	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
   145 	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
   146 	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
   147 	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
   148 	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
   149 	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
   150 	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
   151 	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
   152 	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
   153 	jmp	near .nextcolumn
   154 	alignx	16,7
   155 %endif
   156 .columnDCT:
   158 	; -- Even part
       	; Load 4 coefficients from each even row, sign-extend the words to
       	; dwords (punpcklwd + psrad), convert to float and dequantize.
   160 	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
   161 	movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
   162 	movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
   163 	movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
   165 	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
   166 	punpcklwd xmm1,xmm1		; xmm1=(20 20 21 21 22 22 23 23)
   167 	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
   168 	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in2=(20 21 22 23)
   169 	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
   170 	cvtdq2ps  xmm1,xmm1			; xmm1=in2=(20 21 22 23)
   172 	punpcklwd xmm2,xmm2		; xmm2=(40 40 41 41 42 42 43 43)
   173 	punpcklwd xmm3,xmm3		; xmm3=(60 60 61 61 62 62 63 63)
   174 	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in4=(40 41 42 43)
   175 	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in6=(60 61 62 63)
   176 	cvtdq2ps  xmm2,xmm2			; xmm2=in4=(40 41 42 43)
   177 	cvtdq2ps  xmm3,xmm3			; xmm3=in6=(60 61 62 63)
   179 	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
   180 	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
   181 	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
   182 	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
   184 	movaps	xmm4,xmm0
   185 	movaps	xmm5,xmm1
   186 	subps	xmm0,xmm2		; xmm0=tmp11
   187 	subps	xmm1,xmm3
   188 	addps	xmm4,xmm2		; xmm4=tmp10
   189 	addps	xmm5,xmm3		; xmm5=tmp13
   191 	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
   192 	subps	xmm1,xmm5		; xmm1=tmp12
   194 	movaps	xmm6,xmm4
   195 	movaps	xmm7,xmm0
   196 	subps	xmm4,xmm5		; xmm4=tmp3
   197 	subps	xmm0,xmm1		; xmm0=tmp2
   198 	addps	xmm6,xmm5		; xmm6=tmp0
   199 	addps	xmm7,xmm1		; xmm7=tmp1
       	; tmp2/tmp3 are spilled to wk[] because the odd part needs xmm0-xmm5.
   201 	movaps	XMMWORD [wk(1)], xmm4	; tmp3
   202 	movaps	XMMWORD [wk(0)], xmm0	; tmp2
   204 	; -- Odd part
   206 	movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
   207 	movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
   208 	movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
   209 	movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
   211 	punpcklwd xmm2,xmm2		; xmm2=(10 10 11 11 12 12 13 13)
   212 	punpcklwd xmm3,xmm3		; xmm3=(30 30 31 31 32 32 33 33)
   213 	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in1=(10 11 12 13)
   214 	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in3=(30 31 32 33)
   215 	cvtdq2ps  xmm2,xmm2			; xmm2=in1=(10 11 12 13)
   216 	cvtdq2ps  xmm3,xmm3			; xmm3=in3=(30 31 32 33)
   218 	punpcklwd xmm5,xmm5		; xmm5=(50 50 51 51 52 52 53 53)
   219 	punpcklwd xmm1,xmm1		; xmm1=(70 70 71 71 72 72 73 73)
   220 	psrad     xmm5,(DWORD_BIT-WORD_BIT)	; xmm5=in5=(50 51 52 53)
   221 	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in7=(70 71 72 73)
   222 	cvtdq2ps  xmm5,xmm5			; xmm5=in5=(50 51 52 53)
   223 	cvtdq2ps  xmm1,xmm1			; xmm1=in7=(70 71 72 73)
   225 	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
   226 	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
   227 	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
   228 	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
   230 	movaps	xmm4,xmm2
   231 	movaps	xmm0,xmm5
   232 	addps	xmm2,xmm1		; xmm2=z11
   233 	addps	xmm5,xmm3		; xmm5=z13
   234 	subps	xmm4,xmm1		; xmm4=z12
   235 	subps	xmm0,xmm3		; xmm0=z10
   237 	movaps	xmm1,xmm2
   238 	subps	xmm2,xmm5
   239 	addps	xmm1,xmm5		; xmm1=tmp7
   241 	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
   243 	movaps	xmm3,xmm0
   244 	addps	xmm0,xmm4
   245 	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
   246 	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
   247 	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
   248 	addps	xmm3,xmm0		; xmm3=tmp12
   249 	subps	xmm4,xmm0		; xmm4=tmp10
   251 	; -- Final output stage
   253 	subps	xmm3,xmm1		; xmm3=tmp6
   254 	movaps	xmm5,xmm6
   255 	movaps	xmm0,xmm7
   256 	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
   257 	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
   258 	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
   259 	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
   260 	subps	xmm2,xmm3		; xmm2=tmp5
   262 	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
   263 	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
   264 	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
   265 	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
   266 	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
   267 	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
   269 	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
   270 	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
   272 	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
   273 	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
   275 	addps	xmm4,xmm2		; xmm4=tmp4
   276 	movaps	xmm0,xmm7
   277 	movaps	xmm3,xmm5
   278 	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
   279 	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
   280 	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
   281 	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
   283 	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
   284 	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
   285 	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
   286 	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
   287 	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
   288 	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
   290 	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
   291 	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
   292 	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
   293 	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
   294 	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
   295 	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
   297 	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
   298 	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
   300 	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
   301 	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
   302 	movaps	XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
   303 	movaps	XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
   305 	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
   306 	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
   307 	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
   308 	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
   309 	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
   310 	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
   312 	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
   313 	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
   314 	movaps	XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
   315 	movaps	XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
   317 .nextcolumn:
   318 	add	esi, byte 4*SIZEOF_JCOEF		; coef_block
   319 	add	edx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
   320 	add	edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
   321 	dec	ecx					; ctr
   322 	jnz	near .columnloop
   324 	; -- Prefetch the next coefficient block
       	; esi has advanced by (DCTSIZE/4)*4 coefficients during pass 1; the
       	; "-8" below compensates for that (presumably DCTSIZE == 8 -- the
       	; constant is defined outside this file).
   326 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
   327 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
   328 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
   329 	prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
   331 	; ---- Pass 2: process rows from work array, store into output array.
       	; Each iteration produces 4 complete output rows (ctr = DCTSIZE/4).
   333 	mov	eax, [original_ebp]			; reload frame base (eax may have been clobbered in pass 1)
   334 	lea	esi, [workspace]			; FAST_FLOAT * wsptr
   335 	mov	edi, JSAMPARRAY [output_buf(eax)]	; (JSAMPROW *)
   336 	mov	eax, JDIMENSION [output_col(eax)]	; eax = output_col from here on
   337 	mov	ecx, DCTSIZE/4				; ctr
   338 	alignx	16,7
   339 .rowloop:
   341 	; -- Even part
   343 	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
   344 	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
   345 	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
   346 	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
   348 	movaps	xmm4,xmm0
   349 	movaps	xmm5,xmm1
   350 	subps	xmm0,xmm2		; xmm0=tmp11
   351 	subps	xmm1,xmm3
   352 	addps	xmm4,xmm2		; xmm4=tmp10
   353 	addps	xmm5,xmm3		; xmm5=tmp13
   355 	mulps	xmm1,[GOTOFF(ebx,PD_1_414)]
   356 	subps	xmm1,xmm5		; xmm1=tmp12
   358 	movaps	xmm6,xmm4
   359 	movaps	xmm7,xmm0
   360 	subps	xmm4,xmm5		; xmm4=tmp3
   361 	subps	xmm0,xmm1		; xmm0=tmp2
   362 	addps	xmm6,xmm5		; xmm6=tmp0
   363 	addps	xmm7,xmm1		; xmm7=tmp1
   365 	movaps	XMMWORD [wk(1)], xmm4	; tmp3
   366 	movaps	XMMWORD [wk(0)], xmm0	; tmp2
   368 	; -- Odd part
   370 	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
   371 	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
   372 	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
   373 	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
   375 	movaps	xmm4,xmm2
   376 	movaps	xmm0,xmm5
   377 	addps	xmm2,xmm1		; xmm2=z11
   378 	addps	xmm5,xmm3		; xmm5=z13
   379 	subps	xmm4,xmm1		; xmm4=z12
   380 	subps	xmm0,xmm3		; xmm0=z10
   382 	movaps	xmm1,xmm2
   383 	subps	xmm2,xmm5
   384 	addps	xmm1,xmm5		; xmm1=tmp7
   386 	mulps	xmm2,[GOTOFF(ebx,PD_1_414)]	; xmm2=tmp11
   388 	movaps	xmm3,xmm0
   389 	addps	xmm0,xmm4
   390 	mulps	xmm0,[GOTOFF(ebx,PD_1_847)]	; xmm0=z5
   391 	mulps	xmm3,[GOTOFF(ebx,PD_M2_613)]	; xmm3=(z10 * -2.613125930)
   392 	mulps	xmm4,[GOTOFF(ebx,PD_1_082)]	; xmm4=(z12 * 1.082392200)
   393 	addps	xmm3,xmm0		; xmm3=tmp12
   394 	subps	xmm4,xmm0		; xmm4=tmp10
   396 	; -- Final output stage
   398 	subps	xmm3,xmm1		; xmm3=tmp6
   399 	movaps	xmm5,xmm6
   400 	movaps	xmm0,xmm7
   401 	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
   402 	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
   403 	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
   404 	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
   405 	subps	xmm2,xmm3		; xmm2=tmp5
       	; Float -> integer without a convert instruction: adding
       	; PD_RNDINT_MAGIC (3*2^25) moves each value into [2^26,2^27), where
       	; the float spacing is 8, so the low 16 bits of each dword then hold
       	; round(data/8).  Even-numbered results keep their low word (pand),
       	; odd-numbered ones are shifted into the high word (pslld), and the
       	; two are merged with por.
   407 	movaps	xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm1=[PD_RNDINT_MAGIC]
   408 	pcmpeqd	xmm3,xmm3
   409 	psrld	xmm3,WORD_BIT		; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
   411 	addps	xmm6,xmm1	; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
   412 	addps	xmm7,xmm1	; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
   413 	addps	xmm0,xmm1	; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
   414 	addps	xmm5,xmm1	; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
   416 	pand	xmm6,xmm3		; xmm6=(00 -- 10 -- 20 -- 30 --)
   417 	pslld	xmm7,WORD_BIT		; xmm7=(-- 01 -- 11 -- 21 -- 31)
   418 	pand	xmm0,xmm3		; xmm0=(06 -- 16 -- 26 -- 36 --)
   419 	pslld	xmm5,WORD_BIT		; xmm5=(-- 07 -- 17 -- 27 -- 37)
   420 	por	xmm6,xmm7		; xmm6=(00 01 10 11 20 21 30 31)
   421 	por	xmm0,xmm5		; xmm0=(06 07 16 17 26 27 36 37)
   423 	movaps	xmm1, XMMWORD [wk(0)]	; xmm1=tmp2
   424 	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=tmp3
   426 	addps	xmm4,xmm2		; xmm4=tmp4
   427 	movaps	xmm7,xmm1
   428 	movaps	xmm5,xmm3
   429 	addps	xmm1,xmm2		; xmm1=data2=(02 12 22 32)
   430 	addps	xmm3,xmm4		; xmm3=data4=(04 14 24 34)
   431 	subps	xmm7,xmm2		; xmm7=data5=(05 15 25 35)
   432 	subps	xmm5,xmm4		; xmm5=data3=(03 13 23 33)
   434 	movaps	xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]	; xmm2=[PD_RNDINT_MAGIC]
   435 	pcmpeqd	xmm4,xmm4
   436 	psrld	xmm4,WORD_BIT		; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
   438 	addps	xmm3,xmm2	; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
   439 	addps	xmm7,xmm2	; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
   440 	addps	xmm1,xmm2	; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
   441 	addps	xmm5,xmm2	; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
   443 	pand	xmm3,xmm4		; xmm3=(04 -- 14 -- 24 -- 34 --)
   444 	pslld	xmm7,WORD_BIT		; xmm7=(-- 05 -- 15 -- 25 -- 35)
   445 	pand	xmm1,xmm4		; xmm1=(02 -- 12 -- 22 -- 32 --)
   446 	pslld	xmm5,WORD_BIT		; xmm5=(-- 03 -- 13 -- 23 -- 33)
   447 	por	xmm3,xmm7		; xmm3=(04 05 14 15 24 25 34 35)
   448 	por	xmm1,xmm5		; xmm1=(02 03 12 13 22 23 32 33)
   450 	movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]	; xmm2=[PB_CENTERJSAMP]
       	; packsswb saturates each word to a signed byte; paddb then adds
       	; CENTERJSAMPLE to every byte.
   452 	packsswb  xmm6,xmm3	; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
   453 	packsswb  xmm1,xmm0	; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
   454 	paddb     xmm6,xmm2
   455 	paddb     xmm1,xmm2
   457 	movdqa    xmm4,xmm6	; transpose coefficients(phase 2)
   458 	punpcklwd xmm6,xmm1	; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
   459 	punpckhwd xmm4,xmm1	; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
   461 	movdqa    xmm7,xmm6	; transpose coefficients(phase 3)
   462 	punpckldq xmm6,xmm4	; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
   463 	punpckhdq xmm7,xmm4	; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
   465 	pshufd	xmm5,xmm6,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
   466 	pshufd	xmm3,xmm7,0x4E	; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
   468 	pushpic	ebx			; save GOT address (ebx is borrowed as a row pointer below)
       	; Store the low 8 bytes of each register to one output row:
       	; xmm6 -> row 0, xmm7 -> row 2, xmm5 -> row 1, xmm3 -> row 3.
   470 	mov	edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
   471 	mov	ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
   472 	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
   473 	movq	XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
   474 	mov	edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
   475 	mov	ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
   476 	movq	XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
   477 	movq	XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
   479 	poppic	ebx			; restore GOT address
   481 	add	esi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
   482 	add	edi, byte 4*SIZEOF_JSAMPROW
   483 	dec	ecx				; ctr
   484 	jnz	near .rowloop
   486 	pop	edi
   487 	pop	esi
   488 ;	pop	edx		; need not be preserved
   489 ;	pop	ecx		; need not be preserved
   490 	pop	ebx
   491 	mov	esp,ebp		; esp <- aligned ebp
   492 	pop	esp		; esp <- original ebp (the unaligned frame base stored at [ebp] in the prologue)
   493 	pop	ebp		; restore the caller's ebp
   494 	ret
   496 ; For some reason, the OS X linker does not honor the request to align the
   497 ; segment unless we do this.
   498 	align	16

mercurial