The Tor Browser: media/libjpeg/simd/jcqnts2i-64.asm@6474c204b198

Cloned from the upstream tor-browser origin at tor-browser-31.3.0esr-4.5-1-build1
(revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f) for hacking purposes.

1 ;

     2 ; jcqnts2i-64.asm - sample data conversion and quantization (64-bit SSE2)

3 ;

     4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB

     5 ; Copyright 2009 D. R. Commander

6 ;

     7 ; Based on

     8 ; x86 SIMD extension for IJG JPEG library

     9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.

    10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc

    11 ;

    12 ; This file should be assembled with NASM (Netwide Assembler),

    13 ; can *not* be assembled with Microsoft's MASM or any compatible

    14 ; assembler (including Borland's Turbo Assembler).

    15 ; NASM is available from http://nasm.sourceforge.net/ or

    16 ; http://sourceforge.net/project/showfiles.php?group_id=6208

    17 ;

    18 ; [TAB8]

    20 %include "jsimdext.inc"

    21 %include "jdct.inc"

    23 ; --------------------------------------------------------------------------

    24 	SECTION	SEG_TEXT

    25 	BITS	64

    ;
    ; Load data into workspace, applying unsigned->signed conversion
    ;
    ; GLOBAL(void)
    ; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
    ;                      DCTELEM * workspace);
    ;
    ; For each of DCTSIZE rows of sample_data, reads 8 one-byte samples
    ; starting at column start_col, widens every byte to a 16-bit word,
    ; subtracts 128 and stores the 8 words through XMMBLOCK rows of workspace.
    ;
    ; In (after collect_args):
    ;   r10 = JSAMPARRAY sample_data
    ;   r11 = JDIMENSION start_col  (only the low 32 bits are meaningful)
    ;   r12 = DCTELEM * workspace   (stored with movdqa, so it has to be
    ;                                16-byte aligned)
    ; Out:  nothing (void)
    ; Uses: rax, rbx (saved/restored here), rcx, rdx, rsi, rdi,
    ;       xmm0-xmm3, xmm6, xmm7, flags
    ;
    ; NOTE(review): collect_args/uncollect_args are macros from jsimdext.inc;
    ; they are presumed to map the ABI argument registers onto r10-r12 and to
    ; preserve whatever else the target ABI requires -- confirm there.
    ;

    align   16
    global  EXTN(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
    push    rbp
    mov     rax, rsp                ; NOTE(review): presumably read by collect_args -- confirm
    mov     rbp, rsp
    collect_args
    push    rbx                     ; rbx serves as a row pointer below

    pxor    xmm6, xmm6              ; xmm6 = (all 0's): high bytes for widening
    pcmpeqw xmm7, xmm7              ; xmm7 = (all 1's)
    psllw   xmm7, 7                 ; xmm7 = {0xFF80 0xFF80 ..}; adding it == subtracting 128

    mov     rsi, r10                ; rsi = sample_data (array of row pointers)
    ; start_col is a 32-bit JDIMENSION.  The calling conventions leave the
    ; upper half of a register holding a 32-bit argument undefined, so copy
    ; only the low dword; writing eax zero-extends into rax, which is used
    ; as a 64-bit index below.
    mov     eax, r11d               ; rax = (uint64) start_col
    mov     rdi, r12                ; rdi = workspace
    mov     rcx, DCTSIZE/4          ; 4 rows are converted per iteration

.convloop:
    mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]       ; (JSAMPLE *) row 0
    mov     rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]       ; (JSAMPLE *) row 1

    movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]   ; xmm0=(01234567)
    movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]   ; xmm1=(89ABCDEF)

    mov     rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]       ; (JSAMPLE *) row 2
    mov     rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]       ; (JSAMPLE *) row 3

    movq    xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]   ; xmm2=(GHIJKLMN)
    movq    xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]   ; xmm3=(OPQRSTUV)

    ; Zero-extend bytes to words, then re-centre around zero.
    punpcklbw xmm0, xmm6            ; xmm0=(01234567) as words
    punpcklbw xmm1, xmm6            ; xmm1=(89ABCDEF) as words
    paddw     xmm0, xmm7
    paddw     xmm1, xmm7
    punpcklbw xmm2, xmm6            ; xmm2=(GHIJKLMN) as words
    punpcklbw xmm3, xmm6            ; xmm3=(OPQRSTUV) as words
    paddw     xmm2, xmm7
    paddw     xmm3, xmm7

    movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
    movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
    movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
    movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

    add     rsi, byte 4*SIZEOF_JSAMPROW             ; next 4 row pointers
    add     rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM      ; next 4 workspace rows
    dec     rcx
    jnz     short .convloop

    pop     rbx
    uncollect_args
    pop     rbp
    ret

    ; --------------------------------------------------------------------------
    ;
    ; Quantize/descale the coefficients, and store into coef_block
    ;
    ; This implementation is based on an algorithm described in
    ;   "How to optimize for the Pentium family of microprocessors"
    ;   (http://www.agner.org/assem/).
    ;
    ; GLOBAL(void)
    ; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM * divisors,
    ;                      DCTELEM * workspace);
    ;
    ; Per 16-bit element the code below computes
    ;   sign   = workspace >> 15 (arithmetic: 0 or -1)
    ;   mag    = (workspace ^ sign) - sign              ; absolute value
    ;   mag    = high16((mag + correction) * reciprocal)
    ;   mag    = high16(mag * scale)
    ;   result = (mag ^ sign) - sign                    ; sign restored
    ; where both multiplies are unsigned (pmulhuw).
    ;
    ; In (after collect_args):
    ;   r10 = JCOEFPTR coef_block   (stored with movdqa)
    ;   r11 = DCTELEM * divisors    (used as an aligned memory operand)
    ;   r12 = DCTELEM * workspace   (loaded with movdqa)
    ;   All three therefore have to be 16-byte aligned.
    ; Out:  nothing (void)
    ; Uses: rax, rdx, rsi, rdi, xmm0-xmm7, flags
    ;
    ; NOTE(review): collect_args/uncollect_args and XMMBLOCK are macros from
    ; jsimdext.inc; their exact expansion is not visible here -- confirm there.
    ;

    ; The three tables are addressed off the same divisors pointer, DCTSIZE
    ; XMMBLOCK rows apart from one another.
%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)

    align   16
    global  EXTN(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
    push    rbp
    mov     rax, rsp                ; NOTE(review): presumably read by collect_args -- confirm
    mov     rbp, rsp
    collect_args

    mov     rdi, r10                ; rdi = coef_block (destination)
    mov     rdx, r11                ; rdx = divisors
    mov     rsi, r12                ; rsi = workspace (source)
    mov     rax, DCTSIZE2/32        ; 32 coefficients are handled per pass

.next_group:
    ; Fetch four rows; keep one copy for the sign mask (xmm4-7) and one for
    ; the magnitude (xmm0-3).
    movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
    movdqa  xmm0, xmm4
    movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
    movdqa  xmm1, xmm5
    movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
    movdqa  xmm2, xmm6
    movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
    movdqa  xmm3, xmm7

    ; xmm4-7 = 0x0000 for non-negative words, 0xFFFF for negative ones.
    psraw   xmm4, (WORD_BIT-1)
    psraw   xmm5, (WORD_BIT-1)
    psraw   xmm6, (WORD_BIT-1)
    psraw   xmm7, (WORD_BIT-1)

    ; xmm0-3 = absolute value: (x ^ mask) - mask.
    pxor    xmm0, xmm4
    pxor    xmm1, xmm5
    pxor    xmm2, xmm6
    pxor    xmm3, xmm7
    psubw   xmm0, xmm4
    psubw   xmm1, xmm5
    psubw   xmm2, xmm6
    psubw   xmm3, xmm7

    ; correction + roundfactor
    paddw   xmm0, XMMWORD [CORRECTION(0,0,rdx)]
    paddw   xmm1, XMMWORD [CORRECTION(1,0,rdx)]
    paddw   xmm2, XMMWORD [CORRECTION(2,0,rdx)]
    paddw   xmm3, XMMWORD [CORRECTION(3,0,rdx)]

    ; reciprocal: keep the high word of the unsigned product
    pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]
    pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
    pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
    pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]

    ; scale: second unsigned high-word multiply
    pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)]
    pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
    pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
    pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]

    ; Put the original sign back: (v ^ mask) - mask.
    pxor    xmm0, xmm4
    pxor    xmm1, xmm5
    pxor    xmm2, xmm6
    pxor    xmm3, xmm7
    psubw   xmm0, xmm4
    psubw   xmm1, xmm5
    psubw   xmm2, xmm6
    psubw   xmm3, xmm7

    movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
    movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
    movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
    movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

    ; Step all three pointers past the 32 elements just processed.
    add     rsi, byte 32*SIZEOF_DCTELEM
    add     rdx, byte 32*SIZEOF_DCTELEM
    add     rdi, byte 32*SIZEOF_JCOEF
    dec     rax
    jnz     near .next_group

    uncollect_args
    pop     rbp
    ret

   185 ; For some reason, the OS X linker does not honor the request to align the

   186 ; segment unless we do this.

   187 	align	16

The Tor Browser / file revision