media/libjpeg/simd/jcqntsse.asm

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ;
     2 ; jcqntsse.asm - sample data conversion and quantization (SSE & MMX)
     3 ;
     4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
     5 ;
     6 ; Based on
     7 ; x86 SIMD extension for IJG JPEG library
     8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
    10 ;
    11 ; This file should be assembled with NASM (Netwide Assembler); it
    12 ; can *not* be assembled with Microsoft's MASM or any compatible
    13 ; assembler (including Borland's Turbo Assembler).
    14 ; NASM is available from http://nasm.sourceforge.net/ or
    15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
    16 ;
    17 ; [TAB8]
    19 %include "jsimdext.inc"
    20 %include "jdct.inc"
    22 ; --------------------------------------------------------------------------
       ; Everything below is emitted into the section named by SEG_TEXT and is
       ; assembled as 32-bit code.
       ; NOTE(review): SEG_TEXT is not defined in this file; presumably it comes
       ; from jsimdext.inc -- confirm.
    23 	SECTION	SEG_TEXT
    24 	BITS	32
    25 ;
    26 ; Load data into workspace, applying unsigned->signed conversion
    27 ;
    28 ; GLOBAL(void)
    29 ; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
    30 ;                           FAST_FLOAT * workspace);
    31 ;
       ; Each loop pass takes two consecutive row pointers from sample_data,
       ; reads 8 bytes of samples from each row starting at sample index
       ; start_col, subtracts 0x80 from every byte, widens the results to
       ; signed 32-bit integers, converts them to single-precision floats and
       ; stores the 16 floats to workspace.  The loop runs DCTSIZE/2 times.
       ;
       ; Arguments are read from the stack relative to ebp (see the %defines
       ; below).  ebx, esi and edi are saved and restored; eax, ecx, edx,
       ; mm0-mm7, xmm0-xmm7 and the flags are overwritten.  The MMX state is
       ; cleared with emms before returning.  Nothing is returned.
       ;
       ; The results are written with movaps, so every address formed by
       ; XMMBLOCK(...) from workspace must be 16-byte aligned.
       ;
       ; NOTE(review): EXTN, alignx, XMMBLOCK and the DCTSIZE / SIZEOF_* /
       ; *_BIT constants are defined outside this file; statements below about
       ; their values or effect are assumptions to be confirmed.
       ;
    33 %define sample_data	ebp+8		; JSAMPARRAY sample_data
    34 %define start_col	ebp+12		; JDIMENSION start_col
    35 %define workspace	ebp+16		; FAST_FLOAT * workspace
    37 	align	16
    38 	global	EXTN(jsimd_convsamp_float_sse)
    40 EXTN(jsimd_convsamp_float_sse):
    41 	push	ebp
    42 	mov	ebp,esp
    43 	push	ebx
    44 ;	push	ecx		; need not be preserved
    45 ;	push	edx		; need not be preserved
    46 	push	esi
    47 	push	edi
       ; Build a register full of 0x80 bytes without loading a constant from
       ; memory: all-ones words -> shifted left by 7 (0xFF80 per word) ->
       ; packed to bytes with signed saturation (0x80 per byte).
    49 	pcmpeqw  mm7,mm7
    50 	psllw    mm7,7
    51 	packsswb mm7,mm7		; mm7 = PB_CENTERJSAMPLE (0x808080..)
       ; esi = array of row pointers, eax = first sample index in each row,
       ; edi = output pointer, ecx = pass counter (two rows per pass).
    53 	mov	esi, JSAMPARRAY [sample_data]	; (JSAMPROW *)
    54 	mov	eax, JDIMENSION [start_col]
    55 	mov	edi, POINTER [workspace]	; (FAST_FLOAT *)
    56 	mov	ecx, DCTSIZE/2
       ; presumably aligns the loop entry point -- macro not visible here
    57 	alignx	16,7
    58 .convloop:
       ; ebx / edx = sample pointers of the two rows handled in this pass
    59 	mov	ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; (JSAMPLE *)
    60 	mov	edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; (JSAMPLE *)
       ; 8 bytes per row; movq has no alignment requirement
    62 	movq	mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    63 	movq	mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
       ; Byte-wise subtraction of 0x80 (wrapping) turns the unsigned range
       ; 0..255 into the two's-complement range -128..127.
    65 	psubb	mm0,mm7				; mm0=(01234567)
    66 	psubb	mm1,mm7				; mm1=(89ABCDEF)
       ; Widen bytes to words.  The source byte ends up in the HIGH byte of
       ; each word; the low byte is whatever the destination register held
       ; before (marked '*' in the comments) and is discarded by the shift
       ; further down.
    68 	punpcklbw mm2,mm0			; mm2=(*0*1*2*3)
    69 	punpckhbw mm0,mm0			; mm0=(*4*5*6*7)
    70 	punpcklbw mm3,mm1			; mm3=(*8*9*A*B)
    71 	punpckhbw mm1,mm1			; mm1=(*C*D*E*F)
       ; Widen words to dwords the same way: the sample byte is now the top
       ; byte of each dword, with don't-care data below it.
    73 	punpcklwd mm4,mm2			; mm4=(***0***1)
    74 	punpckhwd mm2,mm2			; mm2=(***2***3)
    75 	punpcklwd mm5,mm0			; mm5=(***4***5)
    76 	punpckhwd mm0,mm0			; mm0=(***6***7)
       ; The arithmetic right shift brings the sample down to the low bits,
       ; sign-extends it and drops the don't-care bits (shift count is
       ; presumably 32-8 = 24).  cvtpi2ps then converts the two int32 values
       ; to floats in the low half of the xmm register; its high half is left
       ; unchanged and is filled in by movlhps later.
    78 	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(01)
    79 	psrad     mm2,(DWORD_BIT-BYTE_BIT)	; mm2=(23)
    80 	cvtpi2ps  xmm0,mm4			; xmm0=(01**)
    81 	cvtpi2ps  xmm1,mm2			; xmm1=(23**)
    82 	psrad     mm5,(DWORD_BIT-BYTE_BIT)	; mm5=(45)
    83 	psrad     mm0,(DWORD_BIT-BYTE_BIT)	; mm0=(67)
    84 	cvtpi2ps  xmm2,mm5			; xmm2=(45**)
    85 	cvtpi2ps  xmm3,mm0			; xmm3=(67**)
       ; Second row.  mm4 is reused as a destination here; its previous value
       ; was already consumed by the conversion into xmm0 above.
    87 	punpcklwd mm6,mm3			; mm6=(***8***9)
    88 	punpckhwd mm3,mm3			; mm3=(***A***B)
    89 	punpcklwd mm4,mm1			; mm4=(***C***D)
    90 	punpckhwd mm1,mm1			; mm1=(***E***F)
    92 	psrad     mm6,(DWORD_BIT-BYTE_BIT)	; mm6=(89)
    93 	psrad     mm3,(DWORD_BIT-BYTE_BIT)	; mm3=(AB)
    94 	cvtpi2ps  xmm4,mm6			; xmm4=(89**)
    95 	cvtpi2ps  xmm5,mm3			; xmm5=(AB**)
    96 	psrad     mm4,(DWORD_BIT-BYTE_BIT)	; mm4=(CD)
    97 	psrad     mm1,(DWORD_BIT-BYTE_BIT)	; mm1=(EF)
    98 	cvtpi2ps  xmm6,mm4			; xmm6=(CD**)
    99 	cvtpi2ps  xmm7,mm1			; xmm7=(EF**)
       ; movlhps copies the low two floats of the source into the high half
       ; of the destination, giving four packed floats per register.
   101 	movlhps   xmm0,xmm1			; xmm0=(0123)
   102 	movlhps   xmm2,xmm3			; xmm2=(4567)
   103 	movlhps   xmm4,xmm5			; xmm4=(89AB)
   104 	movlhps   xmm6,xmm7			; xmm6=(CDEF)
       ; Aligned stores of the 16 floats.  XMMBLOCK(r,c,...) presumably
       ; addresses row r, 4-float group c of the output -- confirm against the
       ; macro definition.
   106 	movaps	XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
   107 	movaps	XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2
   108 	movaps	XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
   109 	movaps	XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
       ; Step past the two row pointers just used and past two output rows of
       ; DCTSIZE floats each, then repeat until ecx reaches zero.
   111 	add	esi, byte 2*SIZEOF_JSAMPROW
   112 	add	edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
   113 	dec	ecx
   114 	jnz	near .convloop
   116 	emms		; empty MMX state
   118 	pop	edi
   119 	pop	esi
   120 ;	pop	edx		; need not be preserved
   121 ;	pop	ecx		; need not be preserved
   122 	pop	ebx
   123 	pop	ebp
   124 	ret
   127 ; --------------------------------------------------------------------------
   128 ;
   129 ; Quantize/descale the coefficients, and store into coef_block
   130 ;
   131 ; GLOBAL(void)
   132 ; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
   133 ;                           FAST_FLOAT * workspace);
   134 ;
       ; Each loop pass multiplies 16 floats from workspace element-wise by
       ; the 16 floats at the same offsets in divisors, converts the products
       ; to signed 32-bit integers with cvtps2pi (which rounds according to
       ; the current MXCSR rounding mode), narrows them to 16 bits with signed
       ; saturation (packssdw) and stores the 16 results through coef_block.
       ; The loop runs DCTSIZE2/16 times.
       ;
       ; The values are multiplied, not divided, so divisors presumably holds
       ; reciprocals of the quantization steps -- confirm against the code
       ; that fills that table.
       ;
       ; Arguments are read from the stack relative to ebp (see the %defines
       ; below).  esi and edi are saved and restored; eax, edx, mm0-mm7,
       ; xmm0-xmm7 and the flags are overwritten; ebx and ecx are not touched.
       ; The MMX state is cleared with emms before returning.  Nothing is
       ; returned.
       ;
       ; workspace and divisors are read with movaps / mulps memory operands,
       ; so the addresses formed by XMMBLOCK(...) must be 16-byte aligned.
       ; The movq stores to coef_block carry no alignment requirement.
       ;
       ; NOTE(review): EXTN, alignx, XMMBLOCK, MMBLOCK and the DCTSIZE2 /
       ; SIZEOF_* constants are defined outside this file; statements below
       ; about their effect are assumptions to be confirmed.
       ;
   136 %define coef_block	ebp+8		; JCOEFPTR coef_block
   137 %define divisors	ebp+12		; FAST_FLOAT * divisors
   138 %define workspace	ebp+16		; FAST_FLOAT * workspace
   140 	align	16
   141 	global	EXTN(jsimd_quantize_float_sse)
   143 EXTN(jsimd_quantize_float_sse):
   144 	push	ebp
   145 	mov	ebp,esp
   146 ;	push	ebx		; unused
   147 ;	push	ecx		; unused
   148 ;	push	edx		; need not be preserved
   149 	push	esi
   150 	push	edi
       ; esi = input floats, edx = multiplier table, edi = output pointer,
       ; eax = pass counter (16 coefficients per pass).
   152 	mov	esi, POINTER [workspace]
   153 	mov	edx, POINTER [divisors]
   154 	mov	edi, JCOEFPTR [coef_block]
   155 	mov	eax, DCTSIZE2/16
       ; presumably aligns the loop entry point -- macro not visible here
   156 	alignx	16,7
   157 .quantloop:
       ; Load four groups of 4 floats and scale each by the matching group of
       ; the multiplier table.
   158 	movaps	xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
   159 	movaps	xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
   160 	mulps	xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
   161 	mulps	xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
   162 	movaps	xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
   163 	movaps	xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
   164 	mulps	xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
   165 	mulps	xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
       ; cvtps2pi only converts the low two floats of its source, so the high
       ; two floats of each register are first copied down into the low half
       ; of a spare register with movhlps.
   167 	movhlps  xmm4,xmm0
   168 	movhlps  xmm5,xmm1
       ; float -> int32, two values per MMX register
   170 	cvtps2pi mm0,xmm0
   171 	cvtps2pi mm1,xmm1
   172 	cvtps2pi mm4,xmm4
   173 	cvtps2pi mm5,xmm5
   175 	movhlps  xmm6,xmm2
   176 	movhlps  xmm7,xmm3
   178 	cvtps2pi mm2,xmm2
   179 	cvtps2pi mm3,xmm3
   180 	cvtps2pi mm6,xmm6
   181 	cvtps2pi mm7,xmm7
       ; Narrow 2+2 int32 values to four int16 values with signed saturation;
       ; the destination's pair lands in the low half, the source's pair in
       ; the high half, which restores the original element order.
   183 	packssdw mm0,mm4
   184 	packssdw mm1,mm5
   185 	packssdw mm2,mm6
   186 	packssdw mm3,mm7
       ; Store the 16 results, 8 bytes at a time.  MMBLOCK(r,c,...) presumably
       ; addresses row r, 4-coefficient group c of the output -- confirm
       ; against the macro definition.
   188 	movq	MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
   189 	movq	MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
   190 	movq	MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
   191 	movq	MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3
       ; Advance all three pointers by 16 elements and repeat until eax
       ; reaches zero.
   193 	add	esi, byte 16*SIZEOF_FAST_FLOAT
   194 	add	edx, byte 16*SIZEOF_FAST_FLOAT
   195 	add	edi, byte 16*SIZEOF_JCOEF
   196 	dec	eax
   197 	jnz	short .quantloop
   199 	emms		; empty MMX state
   201 	pop	edi
   202 	pop	esi
   203 ;	pop	edx		; need not be preserved
   204 ;	pop	ecx		; unused
   205 ;	pop	ebx		; unused
   206 	pop	ebp
   207 	ret
   209 ; For some reason, the OS X linker does not honor the request to align the
   210 ; segment unless we do this.
       ; The directive below pads the end of this file's code up to the next
       ; 16-byte boundary.
   211 	align	16

mercurial