The Tor Browser: media/libjpeg/simd/jiss2flt-64.asm@6474c204b198

Cloned from the upstream tor-browser origin at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

1 ;

     2 ; jiss2flt-64.asm - floating-point IDCT (64-bit SSE & SSE2)

3 ;

     4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

     5 ; Copyright 2009 D. R. Commander

6 ;

     7 ; Based on

     8 ; x86 SIMD extension for IJG JPEG library

     9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

    10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

    11 ;

    12 ; This file should be assembled with NASM (Netwide Assembler),

    13 ; can *not* be assembled with Microsoft's MASM or any compatible

    14 ; assembler (including Borland's Turbo Assembler).

    15 ; NASM is available from http://nasm.sourceforge.net/ or

    16 ; http://sourceforge.net/project/showfiles.php?group_id=6208

    17 ;

    18 ; This file contains a floating-point implementation of the inverse DCT

    19 ; (Discrete Cosine Transform). The following code is based directly on

    20 ; the IJG's original jidctflt.c; see the jidctflt.c for more details.

    21 ;

    22 ; [TAB8]

    24 %include "jsimdext.inc"

    25 %include "jdct.inc"

    27 ; --------------------------------------------------------------------------

    ; --------------------------------------------------------------------------
    ; Two-operand helpers built on SHUFPS.  Both overwrite %1 and leave %2
    ; untouched.  Element numbering below: %1 = (0 1 2 3), %2 = (4 5 6 7).

    ; unpcklps2 dst, src
    ;   dst <- (0 1 4 5): low two dwords of dst followed by low two dwords of src.
    %macro unpcklps2 2
            shufps  %1, %2, 0x44
    %endmacro

    ; unpckhps2 dst, src
    ;   dst <- (2 3 6 7): high two dwords of dst followed by high two dwords of src.
    %macro unpckhps2 2
            shufps  %1, %2, 0xEE
    %endmacro

    37 ; --------------------------------------------------------------------------

    ; Read-only constants used by the floating-point IDCT below.  Each PD_*
    ; entry is one value replicated into four dwords so that it can be used
    ; directly as a 128-bit memory operand.
            SECTION SEG_CONST

            alignz  16
            global  EXTN(jconst_idct_float_sse2)

    EXTN(jconst_idct_float_sse2):

    PD_1_414        times 4 dd  1.414213562373095048801689
    PD_1_847        times 4 dd  1.847759065022573512256366
    PD_1_082        times 4 dd  1.082392200292393968799446
    PD_M2_613       times 4 dd -2.613125929752753055713286
    PD_RNDINT_MAGIC times 4 dd  100663296.0     ; (float)(0x00C00000 << 3)
    PB_CENTERJSAMP  times 16 db CENTERJSAMPLE   ; sample bias, one copy per byte lane

            alignz  16

    54 ; --------------------------------------------------------------------------

    55 	SECTION	SEG_TEXT

    56 	BITS	64

    ;
    ; Perform dequantization and inverse DCT on one block of coefficients.
    ;
    ; GLOBAL(void)
    ; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
    ;                        JSAMPARRAY output_buf, JDIMENSION output_col)
    ;
    ; Arguments as used by this routine after collect_args:
    ;   r10 = void * dct_table
    ;   r11 = JCOEFPTR coef_block
    ;   r12 = JSAMPARRAY output_buf
    ;   r13 = JDIMENSION output_col   (32-bit value; only r13d is meaningful)
    ;
    ; Structure:
    ;   Pass 1 walks the input in DCTSIZE/4 iterations, four columns at a time:
    ;          coefficients are widened to float, multiplied by the matching
    ;          dct_table entries, transformed, transposed and stored into the
    ;          on-stack workspace.  A group whose AC terms are all zero takes
    ;          a shortcut that just replicates the scaled DC values.
    ;   Pass 2 walks the workspace in DCTSIZE/4 iterations, producing four
    ;          output rows of eight samples each per iteration.
    ;
    ; Registers written: rax, rbx (saved/restored here), rcx, rdx, rsi, rdi,
    ; xmm0-xmm7.  collect_args/uncollect_args come from jsimdext.inc and are not
    ; visible here.
    ; NOTE(review): collect_args may depend on rax still holding the value set
    ; in the prologue; keep the prologue instruction order -- confirm against
    ; jsimdext.inc.
    ;
    ; Stack frame (rbp = 16-byte aligned frame base):
    ;   [rbp+0]       stack pointer as it was right after `push rbp`
    ;   wk(0..1)      two xmmword scratch slots just below rbp
    ;   workspace     FAST_FLOAT workspace[DCTSIZE2], just below wk(0)

    %define original_rbp    rbp+0
    %define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
    %define WK_NUM          2
    %define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
                                            ; FAST_FLOAT workspace[DCTSIZE2]

            align   16
            global  EXTN(jsimd_idct_float_sse2)

    EXTN(jsimd_idct_float_sse2):
            push    rbp
            mov     rax, rsp                        ; rax = rsp after the push (points at saved rbp)
            sub     rsp, byte 4
            and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
            mov     [rsp], rax                      ; remember the unaligned rsp at [original_rbp]
            mov     rbp, rsp                        ; rbp = aligned frame base
            lea     rsp, [workspace]                ; reserve wk[] and workspace[]
            collect_args
            push    rbx

            ; ---- Pass 1: process columns from input, store into work array.

            mov     rdx, r10                        ; quantptr
            mov     rsi, r11                        ; inptr
            lea     rdi, [workspace]                ; FAST_FLOAT * wsptr
            mov     rcx, DCTSIZE/4                  ; ctr
    .columnloop:
    %ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
            ; Quick reject: any non-zero bit in the first two coefficients of
            ; rows 1 and 2 means the full transform is needed.
            mov     eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
            or      eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
            jnz     near .columnDCT

            ; Full check: OR together rows 1..7 of the four columns.
            movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
            movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
            movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
            movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
            movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
            movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
            movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
            por     xmm1, xmm2
            por     xmm3, xmm4
            por     xmm5, xmm6
            por     xmm1, xmm3
            por     xmm5, xmm7
            por     xmm1, xmm5
            packsswb xmm1, xmm1                     ; squeeze the four words into four bytes
            movd    eax, xmm1                       ; (movd also clears the upper half of rax)
            test    rax, rax
            jnz     short .columnDCT

            ; -- AC terms all zero

            movq    xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]

            punpcklwd xmm0, xmm0                    ; xmm0=(00 00 01 01 02 02 03 03)
            psrad   xmm0, (DWORD_BIT-WORD_BIT)      ; xmm0=in0=(00 01 02 03)
            cvtdq2ps xmm0, xmm0                     ; xmm0=in0=(00 01 02 03)

            mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

            movaps  xmm1, xmm0
            movaps  xmm2, xmm0
            movaps  xmm3, xmm0

            shufps  xmm0, xmm0, 0x00                ; xmm0=(00 00 00 00)
            shufps  xmm1, xmm1, 0x55                ; xmm1=(01 01 01 01)
            shufps  xmm2, xmm2, 0xAA                ; xmm2=(02 02 02 02)
            shufps  xmm3, xmm3, 0xFF                ; xmm3=(03 03 03 03)

            movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
            movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
            movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
            movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
            movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
            movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
            movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
            movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
            jmp     near .nextcolumn
    %endif
    .columnDCT:

            ; -- Even part

            movq    xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
            movq    xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
            movq    xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
            movq    xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]

            ; Sign-extend each 16-bit coefficient to 32 bits (duplicate the
            ; word, then arithmetic-shift right) and convert to float.
            punpcklwd xmm0, xmm0                    ; xmm0=(00 00 01 01 02 02 03 03)
            punpcklwd xmm1, xmm1                    ; xmm1=(20 20 21 21 22 22 23 23)
            psrad   xmm0, (DWORD_BIT-WORD_BIT)      ; xmm0=in0=(00 01 02 03)
            psrad   xmm1, (DWORD_BIT-WORD_BIT)      ; xmm1=in2=(20 21 22 23)
            cvtdq2ps xmm0, xmm0                     ; xmm0=in0=(00 01 02 03)
            cvtdq2ps xmm1, xmm1                     ; xmm1=in2=(20 21 22 23)

            punpcklwd xmm2, xmm2                    ; xmm2=(40 40 41 41 42 42 43 43)
            punpcklwd xmm3, xmm3                    ; xmm3=(60 60 61 61 62 62 63 63)
            psrad   xmm2, (DWORD_BIT-WORD_BIT)      ; xmm2=in4=(40 41 42 43)
            psrad   xmm3, (DWORD_BIT-WORD_BIT)      ; xmm3=in6=(60 61 62 63)
            cvtdq2ps xmm2, xmm2                     ; xmm2=in4=(40 41 42 43)
            cvtdq2ps xmm3, xmm3                     ; xmm3=in6=(60 61 62 63)

            ; Dequantize.
            mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
            mulps   xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
            mulps   xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
            mulps   xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

            movaps  xmm4, xmm0
            movaps  xmm5, xmm1
            subps   xmm0, xmm2                      ; xmm0=tmp11
            subps   xmm1, xmm3
            addps   xmm4, xmm2                      ; xmm4=tmp10
            addps   xmm5, xmm3                      ; xmm5=tmp13

            mulps   xmm1, [rel PD_1_414]
            subps   xmm1, xmm5                      ; xmm1=tmp12

            movaps  xmm6, xmm4
            movaps  xmm7, xmm0
            subps   xmm4, xmm5                      ; xmm4=tmp3
            subps   xmm0, xmm1                      ; xmm0=tmp2
            addps   xmm6, xmm5                      ; xmm6=tmp0
            addps   xmm7, xmm1                      ; xmm7=tmp1

            ; Spill tmp2/tmp3: all eight xmm registers are needed for the odd part.
            movaps  XMMWORD [wk(1)], xmm4           ; tmp3
            movaps  XMMWORD [wk(0)], xmm0           ; tmp2

            ; -- Odd part

            movq    xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
            movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
            movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
            movq    xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]

            punpcklwd xmm2, xmm2                    ; xmm2=(10 10 11 11 12 12 13 13)
            punpcklwd xmm3, xmm3                    ; xmm3=(30 30 31 31 32 32 33 33)
            psrad   xmm2, (DWORD_BIT-WORD_BIT)      ; xmm2=in1=(10 11 12 13)
            psrad   xmm3, (DWORD_BIT-WORD_BIT)      ; xmm3=in3=(30 31 32 33)
            cvtdq2ps xmm2, xmm2                     ; xmm2=in1=(10 11 12 13)
            cvtdq2ps xmm3, xmm3                     ; xmm3=in3=(30 31 32 33)

            punpcklwd xmm5, xmm5                    ; xmm5=(50 50 51 51 52 52 53 53)
            punpcklwd xmm1, xmm1                    ; xmm1=(70 70 71 71 72 72 73 73)
            psrad   xmm5, (DWORD_BIT-WORD_BIT)      ; xmm5=in5=(50 51 52 53)
            psrad   xmm1, (DWORD_BIT-WORD_BIT)      ; xmm1=in7=(70 71 72 73)
            cvtdq2ps xmm5, xmm5                     ; xmm5=in5=(50 51 52 53)
            cvtdq2ps xmm1, xmm1                     ; xmm1=in7=(70 71 72 73)

            mulps   xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
            mulps   xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
            mulps   xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
            mulps   xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]

            movaps  xmm4, xmm2
            movaps  xmm0, xmm5
            addps   xmm2, xmm1                      ; xmm2=z11
            addps   xmm5, xmm3                      ; xmm5=z13
            subps   xmm4, xmm1                      ; xmm4=z12
            subps   xmm0, xmm3                      ; xmm0=z10

            movaps  xmm1, xmm2
            subps   xmm2, xmm5
            addps   xmm1, xmm5                      ; xmm1=tmp7

            mulps   xmm2, [rel PD_1_414]            ; xmm2=tmp11

            movaps  xmm3, xmm0
            addps   xmm0, xmm4
            mulps   xmm0, [rel PD_1_847]            ; xmm0=z5
            mulps   xmm3, [rel PD_M2_613]           ; xmm3=(z10 * -2.613125930)
            mulps   xmm4, [rel PD_1_082]            ; xmm4=(z12 * 1.082392200)
            addps   xmm3, xmm0                      ; xmm3=tmp12
            subps   xmm4, xmm0                      ; xmm4=tmp10

            ; -- Final output stage

            subps   xmm3, xmm1                      ; xmm3=tmp6
            movaps  xmm5, xmm6
            movaps  xmm0, xmm7
            addps   xmm6, xmm1                      ; xmm6=data0=(00 01 02 03)
            addps   xmm7, xmm3                      ; xmm7=data1=(10 11 12 13)
            subps   xmm5, xmm1                      ; xmm5=data7=(70 71 72 73)
            subps   xmm0, xmm3                      ; xmm0=data6=(60 61 62 63)
            subps   xmm2, xmm3                      ; xmm2=tmp5

            movaps  xmm1, xmm6                      ; transpose coefficients(phase 1)
            unpcklps xmm6, xmm7                     ; xmm6=(00 10 01 11)
            unpckhps xmm1, xmm7                     ; xmm1=(02 12 03 13)
            movaps  xmm3, xmm0                      ; transpose coefficients(phase 1)
            unpcklps xmm0, xmm5                     ; xmm0=(60 70 61 71)
            unpckhps xmm3, xmm5                     ; xmm3=(62 72 63 73)

            ; Swap the spilled tmp2/tmp3 back in and park the half-transposed
            ; rows 6/7 in the same scratch slots.
            movaps  xmm7, XMMWORD [wk(0)]           ; xmm7=tmp2
            movaps  xmm5, XMMWORD [wk(1)]           ; xmm5=tmp3

            movaps  XMMWORD [wk(0)], xmm0           ; wk(0)=(60 70 61 71)
            movaps  XMMWORD [wk(1)], xmm3           ; wk(1)=(62 72 63 73)

            addps   xmm4, xmm2                      ; xmm4=tmp4
            movaps  xmm0, xmm7
            movaps  xmm3, xmm5
            addps   xmm7, xmm2                      ; xmm7=data2=(20 21 22 23)
            addps   xmm5, xmm4                      ; xmm5=data4=(40 41 42 43)
            subps   xmm0, xmm2                      ; xmm0=data5=(50 51 52 53)
            subps   xmm3, xmm4                      ; xmm3=data3=(30 31 32 33)

            movaps  xmm2, xmm7                      ; transpose coefficients(phase 1)
            unpcklps xmm7, xmm3                     ; xmm7=(20 30 21 31)
            unpckhps xmm2, xmm3                     ; xmm2=(22 32 23 33)
            movaps  xmm4, xmm5                      ; transpose coefficients(phase 1)
            unpcklps xmm5, xmm0                     ; xmm5=(40 50 41 51)
            unpckhps xmm4, xmm0                     ; xmm4=(42 52 43 53)

            movaps  xmm3, xmm6                      ; transpose coefficients(phase 2)
            unpcklps2 xmm6, xmm7                    ; xmm6=(00 10 20 30)
            unpckhps2 xmm3, xmm7                    ; xmm3=(01 11 21 31)
            movaps  xmm0, xmm1                      ; transpose coefficients(phase 2)
            unpcklps2 xmm1, xmm2                    ; xmm1=(02 12 22 32)
            unpckhps2 xmm0, xmm2                    ; xmm0=(03 13 23 33)

            movaps  xmm7, XMMWORD [wk(0)]           ; xmm7=(60 70 61 71)
            movaps  xmm2, XMMWORD [wk(1)]           ; xmm2=(62 72 63 73)

            movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
            movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
            movaps  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
            movaps  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0

            movaps  xmm6, xmm5                      ; transpose coefficients(phase 2)
            unpcklps2 xmm5, xmm7                    ; xmm5=(40 50 60 70)
            unpckhps2 xmm6, xmm7                    ; xmm6=(41 51 61 71)
            movaps  xmm3, xmm4                      ; transpose coefficients(phase 2)
            unpcklps2 xmm4, xmm2                    ; xmm4=(42 52 62 72)
            unpckhps2 xmm3, xmm2                    ; xmm3=(43 53 63 73)

            movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
            movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
            movaps  XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
            movaps  XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3

    .nextcolumn:
            add     rsi, byte 4*SIZEOF_JCOEF                ; coef_block
            add     rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
            add     rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
            dec     rcx                                     ; ctr
            jnz     near .columnloop

            ; -- Prefetch the next coefficient block

            prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
            prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
            prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
            prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

            ; ---- Pass 2: process rows from work array, store into output array.

            lea     rsi, [workspace]                ; FAST_FLOAT * wsptr
            mov     rdi, r12                        ; (JSAMPROW *)
            ; output_col is a 32-bit JDIMENSION: the upper half of its argument
            ; register is not guaranteed to be zero, so zero-extend it before
            ; rax is used as a 64-bit index in the row stores below.
            mov     eax, r13d                       ; rax = (uint64) output_col
            mov     rcx, DCTSIZE/4                  ; ctr
    .rowloop:

            ; -- Even part

            movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
            movaps  xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
            movaps  xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
            movaps  xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]

            movaps  xmm4, xmm0
            movaps  xmm5, xmm1
            subps   xmm0, xmm2                      ; xmm0=tmp11
            subps   xmm1, xmm3
            addps   xmm4, xmm2                      ; xmm4=tmp10
            addps   xmm5, xmm3                      ; xmm5=tmp13

            mulps   xmm1, [rel PD_1_414]
            subps   xmm1, xmm5                      ; xmm1=tmp12

            movaps  xmm6, xmm4
            movaps  xmm7, xmm0
            subps   xmm4, xmm5                      ; xmm4=tmp3
            subps   xmm0, xmm1                      ; xmm0=tmp2
            addps   xmm6, xmm5                      ; xmm6=tmp0
            addps   xmm7, xmm1                      ; xmm7=tmp1

            movaps  XMMWORD [wk(1)], xmm4           ; tmp3
            movaps  XMMWORD [wk(0)], xmm0           ; tmp2

            ; -- Odd part

            movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
            movaps  xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
            movaps  xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
            movaps  xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]

            movaps  xmm4, xmm2
            movaps  xmm0, xmm5
            addps   xmm2, xmm1                      ; xmm2=z11
            addps   xmm5, xmm3                      ; xmm5=z13
            subps   xmm4, xmm1                      ; xmm4=z12
            subps   xmm0, xmm3                      ; xmm0=z10

            movaps  xmm1, xmm2
            subps   xmm2, xmm5
            addps   xmm1, xmm5                      ; xmm1=tmp7

            mulps   xmm2, [rel PD_1_414]            ; xmm2=tmp11

            movaps  xmm3, xmm0
            addps   xmm0, xmm4
            mulps   xmm0, [rel PD_1_847]            ; xmm0=z5
            mulps   xmm3, [rel PD_M2_613]           ; xmm3=(z10 * -2.613125930)
            mulps   xmm4, [rel PD_1_082]            ; xmm4=(z12 * 1.082392200)
            addps   xmm3, xmm0                      ; xmm3=tmp12
            subps   xmm4, xmm0                      ; xmm4=tmp10

            ; -- Final output stage

            subps   xmm3, xmm1                      ; xmm3=tmp6
            movaps  xmm5, xmm6
            movaps  xmm0, xmm7
            addps   xmm6, xmm1                      ; xmm6=data0=(00 10 20 30)
            addps   xmm7, xmm3                      ; xmm7=data1=(01 11 21 31)
            subps   xmm5, xmm1                      ; xmm5=data7=(07 17 27 37)
            subps   xmm0, xmm3                      ; xmm0=data6=(06 16 26 36)
            subps   xmm2, xmm3                      ; xmm2=tmp5

            ; Descale and round without a float->int conversion: after adding
            ; PD_RNDINT_MAGIC the low word of each dword holds roundint(x/8).
            movaps  xmm1, [rel PD_RNDINT_MAGIC]     ; xmm1=[rel PD_RNDINT_MAGIC]
            pcmpeqd xmm3, xmm3
            psrld   xmm3, WORD_BIT                  ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}

            addps   xmm6, xmm1      ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
            addps   xmm7, xmm1      ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
            addps   xmm0, xmm1      ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
            addps   xmm5, xmm1      ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)

            ; Interleave pairs of results as 16-bit words: even column keeps
            ; its low word, odd column is shifted into the high word.
            pand    xmm6, xmm3                      ; xmm6=(00 -- 10 -- 20 -- 30 --)
            pslld   xmm7, WORD_BIT                  ; xmm7=(-- 01 -- 11 -- 21 -- 31)
            pand    xmm0, xmm3                      ; xmm0=(06 -- 16 -- 26 -- 36 --)
            pslld   xmm5, WORD_BIT                  ; xmm5=(-- 07 -- 17 -- 27 -- 37)
            por     xmm6, xmm7                      ; xmm6=(00 01 10 11 20 21 30 31)
            por     xmm0, xmm5                      ; xmm0=(06 07 16 17 26 27 36 37)

            movaps  xmm1, XMMWORD [wk(0)]           ; xmm1=tmp2
            movaps  xmm3, XMMWORD [wk(1)]           ; xmm3=tmp3

            addps   xmm4, xmm2                      ; xmm4=tmp4
            movaps  xmm7, xmm1
            movaps  xmm5, xmm3
            addps   xmm1, xmm2                      ; xmm1=data2=(02 12 22 32)
            addps   xmm3, xmm4                      ; xmm3=data4=(04 14 24 34)
            subps   xmm7, xmm2                      ; xmm7=data5=(05 15 25 35)
            subps   xmm5, xmm4                      ; xmm5=data3=(03 13 23 33)

            movaps  xmm2, [rel PD_RNDINT_MAGIC]     ; xmm2=[rel PD_RNDINT_MAGIC]
            pcmpeqd xmm4, xmm4
            psrld   xmm4, WORD_BIT                  ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}

            addps   xmm3, xmm2      ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
            addps   xmm7, xmm2      ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
            addps   xmm1, xmm2      ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
            addps   xmm5, xmm2      ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)

            pand    xmm3, xmm4                      ; xmm3=(04 -- 14 -- 24 -- 34 --)
            pslld   xmm7, WORD_BIT                  ; xmm7=(-- 05 -- 15 -- 25 -- 35)
            pand    xmm1, xmm4                      ; xmm1=(02 -- 12 -- 22 -- 32 --)
            pslld   xmm5, WORD_BIT                  ; xmm5=(-- 03 -- 13 -- 23 -- 33)
            por     xmm3, xmm7                      ; xmm3=(04 05 14 15 24 25 34 35)
            por     xmm1, xmm5                      ; xmm1=(02 03 12 13 22 23 32 33)

            movdqa  xmm2, [rel PB_CENTERJSAMP]      ; xmm2=[rel PB_CENTERJSAMP]

            ; Saturate to signed bytes, then add the sample bias in every lane.
            packsswb xmm6, xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
            packsswb xmm1, xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
            paddb   xmm6, xmm2
            paddb   xmm1, xmm2

            movdqa  xmm4, xmm6      ; transpose coefficients(phase 2)
            punpcklwd xmm6, xmm1    ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
            punpckhwd xmm4, xmm1    ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)

            movdqa  xmm7, xmm6      ; transpose coefficients(phase 3)
            punpckldq xmm6, xmm4    ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
            punpckhdq xmm7, xmm4    ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)

            ; Bring rows 1 and 3 into the low quadwords so movq can store them.
            pshufd  xmm5, xmm6, 0x4E ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
            pshufd  xmm3, xmm7, 0x4E ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)

            ; Store eight samples into each of four output rows at column rax.
            mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
            mov     rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
            movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
            movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
            mov     rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
            mov     rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
            movq    XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
            movq    XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3

            add     rsi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
            add     rdi, byte 4*SIZEOF_JSAMPROW
            dec     rcx                             ; ctr
            jnz     near .rowloop

            pop     rbx
            uncollect_args
            mov     rsp, rbp                        ; rsp <- aligned frame base
            pop     rsp                             ; rsp <- stack pointer saved in the prologue
            pop     rbp                             ; restore the caller's rbp
            ret

   481 ; For some reason, the OS X linker does not honor the request to align the

   482 ; segment unless we do this.

   483 	align	16

The Tor Browser / file revision

media/libjpeg/simd/jiss2flt-64.asm@6474c204b198

media/libjpeg/simd/jiss2flt-64.asm