media/libjpeg/simd/jiss2flt-64.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.

     1 ;
     2 ; jiss2flt-64.asm - floating-point IDCT (64-bit SSE & SSE2)
     3 ;
     4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
     5 ; Copyright 2009 D. R. Commander
     6 ;
     7 ; Based on
     8 ; x86 SIMD extension for IJG JPEG library
     9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
    10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
    11 ;
    12 ; This file should be assembled with NASM (Netwide Assembler); it
    13 ; can *not* be assembled with Microsoft's MASM or any compatible
    14 ; assembler (including Borland's Turbo Assembler).
    15 ; NASM is available from http://nasm.sourceforge.net/ or
    16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
    17 ;
    18 ; This file contains a floating-point implementation of the inverse DCT
    19 ; (Discrete Cosine Transform). The following code is based directly on
    20 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.
    21 ;
    22 ; [TAB8]
    24 %include "jsimdext.inc"
    25 %include "jdct.inc"
    27 ; --------------------------------------------------------------------------
    29 %macro	unpcklps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
    30 	shufps	%1,%2,0x44
    31 %endmacro
    33 %macro	unpckhps2 2	; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
    34 	shufps	%1,%2,0xEE
    35 %endmacro
    37 ; --------------------------------------------------------------------------
       ; Constant table for the float IDCT.  Every entry is 16 bytes wide (four
       ; identical floats, or sixteen identical bytes) so that it can be used
       ; directly as a packed memory operand (mulps/addps/movdqa) below.
    38 	SECTION	SEG_CONST
    40 	alignz	16
    41 	global	EXTN(jconst_idct_float_sse2)
    43 EXTN(jconst_idct_float_sse2):
       ; Multipliers used by the even/odd butterflies of both passes
       ; (1.414... is sqrt(2); see jidctflt.c for the derivation of the others).
    45 PD_1_414	times 4 dd  1.414213562373095048801689
    46 PD_1_847	times 4 dd  1.847759065022573512256366
    47 PD_1_082	times 4 dd  1.082392200292393968799446
    48 PD_M2_613	times 4 dd -2.613125929752753055713286
       ; Rounding constant added to the pass 2 results.  100663296.0 lies in
       ; [2^26, 2^27), where a single-precision float has a step of 8, so after
       ; the addition the low mantissa bits hold the rounded value of data/8
       ; (the row loop then keeps only the low 16 bits of each lane).
    49 PD_RNDINT_MAGIC	times 4 dd  100663296.0	; (float)(0x00C00000 << 3)
       ; CENTERJSAMPLE replicated into 16 bytes; added with paddb after the
       ; results have been packed to bytes.
    50 PB_CENTERJSAMP	times 16 db CENTERJSAMPLE
    52 	alignz	16
    54 ; --------------------------------------------------------------------------
    55 	SECTION	SEG_TEXT
    56 	BITS	64
    57 ;
    58 ; Perform dequantization and inverse DCT on one block of coefficients.
    59 ;
    60 ; GLOBAL(void)
    61 ; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
    62 ;                        JSAMPARRAY output_buf, JDIMENSION output_col)
    63 ;
    65 ; r10 = void * dct_table
    66 ; r11 = JCOEFPTR coef_block
    67 ; r12 = JSAMPARRAY output_buf
    68 ; r13 = JDIMENSION output_col
       ; Stack frame, addressed through the realigned rbp set up in the prologue:
       ;   [original_rbp]   slot at rbp+0 holding the rsp value captured right
       ;                    after "push rbp" (restored by "pop rsp" on exit)
       ;   wk(0..WK_NUM-1)  XMMWORD scratch slots immediately below rbp
       ;   workspace        DCTSIZE2 FAST_FLOATs immediately below wk(0);
       ;                    pass 1 writes it, pass 2 reads it
    70 %define original_rbp	rbp+0
    71 %define wk(i)		rbp-(WK_NUM-(i))*SIZEOF_XMMWORD	; xmmword wk[WK_NUM]
    72 %define WK_NUM		2
    73 %define workspace	wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
    74 					; FAST_FLOAT workspace[DCTSIZE2]
    76 	align	16
    77 	global	EXTN(jsimd_idct_float_sse2)
    79 EXTN(jsimd_idct_float_sse2):
       ; -- Prologue: save rbp, then build an XMMWORD-aligned frame so that the
       ;    movaps accesses to wk() and workspace are aligned.
    80 	push	rbp
    81 	mov	rax,rsp				; rax = original rbp
       ; NOTE(review): only 4 bytes are reserved here although the store of rax
       ; below is 8 bytes wide.  The rounding-down by the following "and" makes
       ; this harmless when rsp is 16-byte aligned after the push; presumably a
       ; carry-over from a 32-bit version of this code -- confirm.
    82 	sub	rsp, byte 4
    83 	and	rsp, byte (-SIZEOF_XMMWORD)	; align to 128 bits
    84 	mov	[rsp],rax
    85 	mov	rbp,rsp				; rbp = aligned rbp
    86 	lea	rsp, [workspace]
       ; collect_args is defined outside this file (presumably jsimdext.inc);
       ; per the register comments above it leaves the four arguments in
       ; r10..r13 -- verify against the macro definition.
    87 	collect_args
    88 	push	rbx				; rbx is used as a row pointer in pass 2
    90 	; ---- Pass 1: process columns from input, store into work array.
    92 	mov	rdx, r10	; quantptr
    93 	mov	rsi, r11		; inptr
    94 	lea	rdi, [workspace]			; FAST_FLOAT * wsptr
    95 	mov	rcx, DCTSIZE/4				; ctr
       ; Pass 1 loop body.  Each iteration handles four columns: rsi (coefficients)
       ; and rdx (dequantization multipliers) advance by four elements and rdi
       ; (workspace) by 4*DCTSIZE floats at .nextcolumn; rcx counts DCTSIZE/4
       ; iterations.  Results are written to the workspace transposed.
    96 .columnloop:
    97 %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
       ; Shortcut for columns whose AC terms are all zero.  First a cheap scalar
       ; probe of one dword each from rows 1 and 2, then an OR over the four
       ; coefficients of every row 1..7.
    98 	mov	eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    99 	or	eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
   100 	jnz	near .columnDCT
   102 	movq	xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
   103 	movq	xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
   104 	movq	xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
   105 	movq	xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
   106 	movq	xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
   107 	movq	xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
   108 	movq	xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
   109 	por	xmm1,xmm2
   110 	por	xmm3,xmm4
   111 	por	xmm5,xmm6
   112 	por	xmm1,xmm3
   113 	por	xmm5,xmm7
   114 	por	xmm1,xmm5
       ; Saturating pack squeezes the four OR-ed words into four bytes (a nonzero
       ; word stays nonzero), so one 32-bit move can test all of them at once.
   115 	packsswb xmm1,xmm1
   116 	movd	eax,xmm1
   117 	test	rax,rax
   118 	jnz	short .columnDCT
   120 	; -- AC terms all zero
       ; Only the dequantized DC term contributes: broadcast each column's DC
       ; value across a vector and store it to both workspace halves for that
       ; column.
   122 	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
   124 	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
   125 	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
   126 	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
   128 	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
   130 	movaps	xmm1,xmm0
   131 	movaps	xmm2,xmm0
   132 	movaps	xmm3,xmm0
   134 	shufps	xmm0,xmm0,0x00			; xmm0=(00 00 00 00)
   135 	shufps	xmm1,xmm1,0x55			; xmm1=(01 01 01 01)
   136 	shufps	xmm2,xmm2,0xAA			; xmm2=(02 02 02 02)
   137 	shufps	xmm3,xmm3,0xFF			; xmm3=(03 03 03 03)
   139 	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
   140 	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
   141 	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
   142 	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
   143 	movaps	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
   144 	movaps	XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
   145 	movaps	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
   146 	movaps	XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
   147 	jmp	near .nextcolumn
   148 %endif
       ; Full 8-point IDCT on four columns at once.
   149 .columnDCT:
   151 	; -- Even part
       ; Load four 16-bit coefficients from each even row.  Duplicating every
       ; word (punpcklwd x,x) and arithmetic-shifting right sign-extends them to
       ; dwords; cvtdq2ps converts to float and mulps applies the dequantization
       ; multipliers from the table at rdx.
   153 	movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
   154 	movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
   155 	movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
   156 	movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
   158 	punpcklwd xmm0,xmm0		; xmm0=(00 00 01 01 02 02 03 03)
   159 	punpcklwd xmm1,xmm1		; xmm1=(20 20 21 21 22 22 23 23)
   160 	psrad     xmm0,(DWORD_BIT-WORD_BIT)	; xmm0=in0=(00 01 02 03)
   161 	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in2=(20 21 22 23)
   162 	cvtdq2ps  xmm0,xmm0			; xmm0=in0=(00 01 02 03)
   163 	cvtdq2ps  xmm1,xmm1			; xmm1=in2=(20 21 22 23)
   165 	punpcklwd xmm2,xmm2		; xmm2=(40 40 41 41 42 42 43 43)
   166 	punpcklwd xmm3,xmm3		; xmm3=(60 60 61 61 62 62 63 63)
   167 	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in4=(40 41 42 43)
   168 	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in6=(60 61 62 63)
   169 	cvtdq2ps  xmm2,xmm2			; xmm2=in4=(40 41 42 43)
   170 	cvtdq2ps  xmm3,xmm3			; xmm3=in6=(60 61 62 63)
   172 	mulps     xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
   173 	mulps     xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
   174 	mulps     xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
   175 	mulps     xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
   177 	movaps	xmm4,xmm0
   178 	movaps	xmm5,xmm1
   179 	subps	xmm0,xmm2		; xmm0=tmp11
   180 	subps	xmm1,xmm3
   181 	addps	xmm4,xmm2		; xmm4=tmp10
   182 	addps	xmm5,xmm3		; xmm5=tmp13
   184 	mulps	xmm1,[rel PD_1_414]
   185 	subps	xmm1,xmm5		; xmm1=tmp12
   187 	movaps	xmm6,xmm4
   188 	movaps	xmm7,xmm0
   189 	subps	xmm4,xmm5		; xmm4=tmp3
   190 	subps	xmm0,xmm1		; xmm0=tmp2
   191 	addps	xmm6,xmm5		; xmm6=tmp0
   192 	addps	xmm7,xmm1		; xmm7=tmp1
       ; Spill tmp2/tmp3 to the wk() slots: the odd part below needs the
       ; registers, while tmp0/tmp1 stay live in xmm6/xmm7.
   194 	movaps	XMMWORD [wk(1)], xmm4	; tmp3
   195 	movaps	XMMWORD [wk(0)], xmm0	; tmp2
   197 	; -- Odd part
       ; Same load / sign-extend / convert / dequantize sequence for the odd rows.
   199 	movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
   200 	movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
   201 	movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
   202 	movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
   204 	punpcklwd xmm2,xmm2		; xmm2=(10 10 11 11 12 12 13 13)
   205 	punpcklwd xmm3,xmm3		; xmm3=(30 30 31 31 32 32 33 33)
   206 	psrad     xmm2,(DWORD_BIT-WORD_BIT)	; xmm2=in1=(10 11 12 13)
   207 	psrad     xmm3,(DWORD_BIT-WORD_BIT)	; xmm3=in3=(30 31 32 33)
   208 	cvtdq2ps  xmm2,xmm2			; xmm2=in1=(10 11 12 13)
   209 	cvtdq2ps  xmm3,xmm3			; xmm3=in3=(30 31 32 33)
   211 	punpcklwd xmm5,xmm5		; xmm5=(50 50 51 51 52 52 53 53)
   212 	punpcklwd xmm1,xmm1		; xmm1=(70 70 71 71 72 72 73 73)
   213 	psrad     xmm5,(DWORD_BIT-WORD_BIT)	; xmm5=in5=(50 51 52 53)
   214 	psrad     xmm1,(DWORD_BIT-WORD_BIT)	; xmm1=in7=(70 71 72 73)
   215 	cvtdq2ps  xmm5,xmm5			; xmm5=in5=(50 51 52 53)
   216 	cvtdq2ps  xmm1,xmm1			; xmm1=in7=(70 71 72 73)
   218 	mulps     xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
   219 	mulps     xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
   220 	mulps     xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
   221 	mulps     xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
   223 	movaps	xmm4,xmm2
   224 	movaps	xmm0,xmm5
   225 	addps	xmm2,xmm1		; xmm2=z11
   226 	addps	xmm5,xmm3		; xmm5=z13
   227 	subps	xmm4,xmm1		; xmm4=z12
   228 	subps	xmm0,xmm3		; xmm0=z10
   230 	movaps	xmm1,xmm2
   231 	subps	xmm2,xmm5
   232 	addps	xmm1,xmm5		; xmm1=tmp7
   234 	mulps	xmm2,[rel PD_1_414]	; xmm2=tmp11
   236 	movaps	xmm3,xmm0
   237 	addps	xmm0,xmm4
   238 	mulps	xmm0,[rel PD_1_847]	; xmm0=z5
   239 	mulps	xmm3,[rel PD_M2_613]	; xmm3=(z10 * -2.613125930)
   240 	mulps	xmm4,[rel PD_1_082]	; xmm4=(z12 * 1.082392200)
   241 	addps	xmm3,xmm0		; xmm3=tmp12
   242 	subps	xmm4,xmm0		; xmm4=tmp10
   244 	; -- Final output stage
   246 	subps	xmm3,xmm1		; xmm3=tmp6
   247 	movaps	xmm5,xmm6
   248 	movaps	xmm0,xmm7
   249 	addps	xmm6,xmm1		; xmm6=data0=(00 01 02 03)
   250 	addps	xmm7,xmm3		; xmm7=data1=(10 11 12 13)
   251 	subps	xmm5,xmm1		; xmm5=data7=(70 71 72 73)
   252 	subps	xmm0,xmm3		; xmm0=data6=(60 61 62 63)
   253 	subps	xmm2,xmm3		; xmm2=tmp5
   255 	movaps    xmm1,xmm6		; transpose coefficients(phase 1)
   256 	unpcklps  xmm6,xmm7		; xmm6=(00 10 01 11)
   257 	unpckhps  xmm1,xmm7		; xmm1=(02 12 03 13)
   258 	movaps    xmm3,xmm0		; transpose coefficients(phase 1)
   259 	unpcklps  xmm0,xmm5		; xmm0=(60 70 61 71)
   260 	unpckhps  xmm3,xmm5		; xmm3=(62 72 63 73)
       ; Swap through wk(): reload the spilled tmp2/tmp3 first, then reuse the
       ; same two slots to park the half-transposed data6/data7 pairs.
   262 	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=tmp2
   263 	movaps	xmm5, XMMWORD [wk(1)]	; xmm5=tmp3
   265 	movaps	XMMWORD [wk(0)], xmm0	; wk(0)=(60 70 61 71)
   266 	movaps	XMMWORD [wk(1)], xmm3	; wk(1)=(62 72 63 73)
   268 	addps	xmm4,xmm2		; xmm4=tmp4
   269 	movaps	xmm0,xmm7
   270 	movaps	xmm3,xmm5
   271 	addps	xmm7,xmm2		; xmm7=data2=(20 21 22 23)
   272 	addps	xmm5,xmm4		; xmm5=data4=(40 41 42 43)
   273 	subps	xmm0,xmm2		; xmm0=data5=(50 51 52 53)
   274 	subps	xmm3,xmm4		; xmm3=data3=(30 31 32 33)
   276 	movaps    xmm2,xmm7		; transpose coefficients(phase 1)
   277 	unpcklps  xmm7,xmm3		; xmm7=(20 30 21 31)
   278 	unpckhps  xmm2,xmm3		; xmm2=(22 32 23 33)
   279 	movaps    xmm4,xmm5		; transpose coefficients(phase 1)
   280 	unpcklps  xmm5,xmm0		; xmm5=(40 50 41 51)
   281 	unpckhps  xmm4,xmm0		; xmm4=(42 52 43 53)
   283 	movaps    xmm3,xmm6		; transpose coefficients(phase 2)
   284 	unpcklps2 xmm6,xmm7		; xmm6=(00 10 20 30)
   285 	unpckhps2 xmm3,xmm7		; xmm3=(01 11 21 31)
   286 	movaps    xmm0,xmm1		; transpose coefficients(phase 2)
   287 	unpcklps2 xmm1,xmm2		; xmm1=(02 12 22 32)
   288 	unpckhps2 xmm0,xmm2		; xmm0=(03 13 23 33)
   290 	movaps	xmm7, XMMWORD [wk(0)]	; xmm7=(60 70 61 71)
   291 	movaps	xmm2, XMMWORD [wk(1)]	; xmm2=(62 72 63 73)
   293 	movaps	XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
   294 	movaps	XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
   295 	movaps	XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
   296 	movaps	XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
   298 	movaps    xmm6,xmm5		; transpose coefficients(phase 2)
   299 	unpcklps2 xmm5,xmm7		; xmm5=(40 50 60 70)
   300 	unpckhps2 xmm6,xmm7		; xmm6=(41 51 61 71)
   301 	movaps    xmm3,xmm4		; transpose coefficients(phase 2)
   302 	unpcklps2 xmm4,xmm2		; xmm4=(42 52 62 72)
   303 	unpckhps2 xmm3,xmm2		; xmm3=(43 53 63 73)
   305 	movaps	XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
   306 	movaps	XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
   307 	movaps	XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
   308 	movaps	XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
       ; Step all three pointers to the next group of four columns; both the
       ; zero-AC shortcut and the full path arrive here.
   310 .nextcolumn:
   311 	add	rsi, byte 4*SIZEOF_JCOEF		; coef_block
   312 	add	rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE	; quantptr
   313 	add	rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT	; wsptr
   314 	dec	rcx					; ctr
   315 	jnz	near .columnloop
   317 	; -- Prefetch the next coefficient block
       ; rsi was advanced by the column loop, so these offsets are relative to
       ; its end-of-pass-1 value; four hints 32 bytes apart.
   319 	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
   320 	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
   321 	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
   322 	prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
   324 	; ---- Pass 2: process rows from work array, store into output array.
       ; Register roles for the row loop:
       ;   rsi = wsptr (workspace written by pass 1)
       ;   rdi = output_buf (array of JSAMPROW pointers)
       ;   rax = output_col, used as a 64-bit index in the output stores
       ;   rcx = loop counter
       ; (The former load of [original_rbp] into rax was dead -- rax was
       ; overwritten with output_col before any use -- and has been dropped.)
   327 	lea	rsi, [workspace]			; FAST_FLOAT * wsptr
   328 	mov	rdi, r12	; (JSAMPROW *)
       ; output_col is a 32-bit JDIMENSION; the calling convention does not
       ; promise that the upper half of its argument register is zero, so
       ; zero-extend instead of copying all 64 bits of r13.
   329 	mov	eax, r13d	; rax = (uint64) output_col
   330 	mov	rcx, DCTSIZE/4				; ctr
       ; Pass 2 loop body.  Each iteration turns four workspace rows into four
       ; rows of eight output samples: rsi advances by four floats, rdi by four
       ; JSAMPROW pointers, rcx counts DCTSIZE/4 iterations.  rax holds
       ; output_col (set up before the loop).
   331 .rowloop:
   333 	; -- Even part
   335 	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
   336 	movaps	xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
   337 	movaps	xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
   338 	movaps	xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
   340 	movaps	xmm4,xmm0
   341 	movaps	xmm5,xmm1
   342 	subps	xmm0,xmm2		; xmm0=tmp11
   343 	subps	xmm1,xmm3
   344 	addps	xmm4,xmm2		; xmm4=tmp10
   345 	addps	xmm5,xmm3		; xmm5=tmp13
   347 	mulps	xmm1,[rel PD_1_414]
   348 	subps	xmm1,xmm5		; xmm1=tmp12
   350 	movaps	xmm6,xmm4
   351 	movaps	xmm7,xmm0
   352 	subps	xmm4,xmm5		; xmm4=tmp3
   353 	subps	xmm0,xmm1		; xmm0=tmp2
   354 	addps	xmm6,xmm5		; xmm6=tmp0
   355 	addps	xmm7,xmm1		; xmm7=tmp1
       ; Spill tmp2/tmp3; they are reloaded after data0/1/6/7 have been packed.
   357 	movaps	XMMWORD [wk(1)], xmm4	; tmp3
   358 	movaps	XMMWORD [wk(0)], xmm0	; tmp2
   360 	; -- Odd part
   362 	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
   363 	movaps	xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
   364 	movaps	xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
   365 	movaps	xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
   367 	movaps	xmm4,xmm2
   368 	movaps	xmm0,xmm5
   369 	addps	xmm2,xmm1		; xmm2=z11
   370 	addps	xmm5,xmm3		; xmm5=z13
   371 	subps	xmm4,xmm1		; xmm4=z12
   372 	subps	xmm0,xmm3		; xmm0=z10
   374 	movaps	xmm1,xmm2
   375 	subps	xmm2,xmm5
   376 	addps	xmm1,xmm5		; xmm1=tmp7
   378 	mulps	xmm2,[rel PD_1_414]	; xmm2=tmp11
   380 	movaps	xmm3,xmm0
   381 	addps	xmm0,xmm4
   382 	mulps	xmm0,[rel PD_1_847]	; xmm0=z5
   383 	mulps	xmm3,[rel PD_M2_613]	; xmm3=(z10 * -2.613125930)
   384 	mulps	xmm4,[rel PD_1_082]	; xmm4=(z12 * 1.082392200)
   385 	addps	xmm3,xmm0		; xmm3=tmp12
   386 	subps	xmm4,xmm0		; xmm4=tmp10
   388 	; -- Final output stage
   390 	subps	xmm3,xmm1		; xmm3=tmp6
   391 	movaps	xmm5,xmm6
   392 	movaps	xmm0,xmm7
   393 	addps	xmm6,xmm1		; xmm6=data0=(00 10 20 30)
   394 	addps	xmm7,xmm3		; xmm7=data1=(01 11 21 31)
   395 	subps	xmm5,xmm1		; xmm5=data7=(07 17 27 37)
   396 	subps	xmm0,xmm3		; xmm0=data6=(06 16 26 36)
   397 	subps	xmm2,xmm3		; xmm2=tmp5
       ; Float -> integer without a convert instruction: adding PD_RNDINT_MAGIC
       ; leaves the rounded data/8 in the low word of each dword lane.  The
       ; 0x0000FFFF mask keeps that word for one value of a pair, the shift
       ; moves the other value's low word to the high half, and por merges
       ; them into interleaved 16-bit results.
   399 	movaps	xmm1,[rel PD_RNDINT_MAGIC]	; xmm1=[rel PD_RNDINT_MAGIC]
   400 	pcmpeqd	xmm3,xmm3
   401 	psrld	xmm3,WORD_BIT		; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
   403 	addps	xmm6,xmm1	; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
   404 	addps	xmm7,xmm1	; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
   405 	addps	xmm0,xmm1	; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
   406 	addps	xmm5,xmm1	; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
   408 	pand	xmm6,xmm3		; xmm6=(00 -- 10 -- 20 -- 30 --)
   409 	pslld	xmm7,WORD_BIT		; xmm7=(-- 01 -- 11 -- 21 -- 31)
   410 	pand	xmm0,xmm3		; xmm0=(06 -- 16 -- 26 -- 36 --)
   411 	pslld	xmm5,WORD_BIT		; xmm5=(-- 07 -- 17 -- 27 -- 37)
   412 	por	xmm6,xmm7		; xmm6=(00 01 10 11 20 21 30 31)
   413 	por	xmm0,xmm5		; xmm0=(06 07 16 17 26 27 36 37)
   415 	movaps	xmm1, XMMWORD [wk(0)]	; xmm1=tmp2
   416 	movaps	xmm3, XMMWORD [wk(1)]	; xmm3=tmp3
   418 	addps	xmm4,xmm2		; xmm4=tmp4
   419 	movaps	xmm7,xmm1
   420 	movaps	xmm5,xmm3
   421 	addps	xmm1,xmm2		; xmm1=data2=(02 12 22 32)
   422 	addps	xmm3,xmm4		; xmm3=data4=(04 14 24 34)
   423 	subps	xmm7,xmm2		; xmm7=data5=(05 15 25 35)
   424 	subps	xmm5,xmm4		; xmm5=data3=(03 13 23 33)
       ; Same round-and-interleave sequence for data2..data5; the constant and
       ; the mask are rebuilt because their registers were reused above.
   426 	movaps	xmm2,[rel PD_RNDINT_MAGIC]	; xmm2=[rel PD_RNDINT_MAGIC]
   427 	pcmpeqd	xmm4,xmm4
   428 	psrld	xmm4,WORD_BIT		; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
   430 	addps	xmm3,xmm2	; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
   431 	addps	xmm7,xmm2	; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
   432 	addps	xmm1,xmm2	; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
   433 	addps	xmm5,xmm2	; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
   435 	pand	xmm3,xmm4		; xmm3=(04 -- 14 -- 24 -- 34 --)
   436 	pslld	xmm7,WORD_BIT		; xmm7=(-- 05 -- 15 -- 25 -- 35)
   437 	pand	xmm1,xmm4		; xmm1=(02 -- 12 -- 22 -- 32 --)
   438 	pslld	xmm5,WORD_BIT		; xmm5=(-- 03 -- 13 -- 23 -- 33)
   439 	por	xmm3,xmm7		; xmm3=(04 05 14 15 24 25 34 35)
   440 	por	xmm1,xmm5		; xmm1=(02 03 12 13 22 23 32 33)
       ; Narrow the 16-bit results to bytes with signed saturation, then add
       ; CENTERJSAMPLE to every byte to shift them into the sample range.
   442 	movdqa    xmm2,[rel PB_CENTERJSAMP]	; xmm2=[rel PB_CENTERJSAMP]
   444 	packsswb  xmm6,xmm3	; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
   445 	packsswb  xmm1,xmm0	; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
   446 	paddb     xmm6,xmm2
   447 	paddb     xmm1,xmm2
   449 	movdqa    xmm4,xmm6	; transpose coefficients(phase 2)
   450 	punpcklwd xmm6,xmm1	; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
   451 	punpckhwd xmm4,xmm1	; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
   453 	movdqa    xmm7,xmm6	; transpose coefficients(phase 3)
   454 	punpckldq xmm6,xmm4	; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
   455 	punpckhdq xmm7,xmm4	; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
       ; Swap the 64-bit halves so that rows 1 and 3 also sit in the low half
       ; of a register, where movq can store them.
   457 	pshufd	xmm5,xmm6,0x4E	; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
   458 	pshufd	xmm3,xmm7,0x4E	; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
       ; Store eight samples to each of the four output rows at column offset
       ; rax; rdx and rbx are scratch row pointers.
   460 	mov	rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
   461 	mov	rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
   462 	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
   463 	movq	XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
   464 	mov	rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
   465 	mov	rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
   466 	movq	XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
   467 	movq	XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
   469 	add	rsi, byte 4*SIZEOF_FAST_FLOAT	; wsptr
   470 	add	rdi, byte 4*SIZEOF_JSAMPROW
   471 	dec	rcx				; ctr
   472 	jnz	near .rowloop
       ; -- Epilogue: undo the prologue in reverse order.  rbx was pushed after
       ;    collect_args, so it is popped before uncollect_args.
   474 	pop	rbx
   475 	uncollect_args
   476 	mov	rsp,rbp		; rsp <- aligned rbp
       ; [rbp] holds the rsp value saved by the prologue right after "push rbp";
       ; popping it into rsp discards the aligned frame in one step.
   477 	pop	rsp		; rsp <- original rbp
   478 	pop	rbp
   479 	ret
   481 ; For some reason, the OS X linker does not honor the request to align the
   482 ; segment unless we do this.
   483 	align	16

mercurial