media/libjpeg/simd/jdclrmmx.asm

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ;
     2 ; jdclrmmx.asm - colorspace conversion (MMX)
     3 ;
     4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
     5 ;
     6 ; Based on
     7 ; x86 SIMD extension for IJG JPEG library
     8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
    10 ;
    11 ; This file should be assembled with NASM (Netwide Assembler),
    12 ; can *not* be assembled with Microsoft's MASM or any compatible
    13 ; assembler (including Borland's Turbo Assembler).
    14 ; NASM is available from http://nasm.sourceforge.net/ or
    15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
    16 ;
    17 ; [TAB8]
    19 %include "jcolsamp.inc"
    21 ; --------------------------------------------------------------------------
    22 ;
    23 ; Convert some rows of samples to the output colorspace.
    24 ;
    25 ; GLOBAL(void)
    26 ; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
    27 ;                            JSAMPIMAGE input_buf, JDIMENSION input_row,
    28 ;                            JSAMPARRAY output_buf, int num_rows)
    29 ;
       ; Arguments (all read from the stack through the offset macros below):
       ;   out_width   number of columns to convert in each row; the function
       ;               returns immediately when it is 0.
       ;   input_buf   array of three component planes (indices 0, 1, 2); the
       ;               column-loop comments label them Y, Cb and Cr.
       ;   input_row   index of the first row pointer used in each input plane.
       ;   output_buf  array of output row pointers, one per converted row.
       ;   num_rows    number of rows to convert; the function returns
       ;               immediately when it is <= 0.
       ;
       ; Registers: ebx, esi and edi are saved and restored; eax, ecx and edx
       ; are not.  mm0-mm7 are all used, and emms is executed before returning
       ; once at least one row has been processed.
       ;
       ; Stack-argument offsets.  The base (b) passed by the function body is
       ; the value esp has right after its initial "push ebp", so (b)+0 is the
       ; saved ebp and (b)+8 is the first argument.
       ; NOTE(review): (b)+4 is then presumably the return address of a 32-bit
       ; stack-argument call -- confirm against the calling convention in use.
    31 %define out_width(b)	(b)+8			; JDIMENSION out_width
    32 %define input_buf(b)	(b)+12		; JSAMPIMAGE input_buf
    33 %define input_row(b)	(b)+16		; JDIMENSION input_row
    34 %define output_buf(b)	(b)+20		; JSAMPARRAY output_buf
    35 %define num_rows(b)	(b)+24		; int num_rows
       ; Local frame, addressed from the MMWORD-aligned ebp set up by the
       ; prologue:
       ;   original_ebp  the slot at [ebp+0]; the prologue stores the unaligned
       ;                 stack address there and the epilogue's "pop esp" reads
       ;                 it back.  The macro itself is not referenced by name
       ;                 in the code visible in this file.
       ;   wk(i)         i-th mmword scratch slot; the WK_NUM slots sit directly
       ;                 below ebp, with wk(0) at the lowest address.
       ;   gotptr        pointer-sized slot directly below wk(0); holds the GOT
       ;                 address saved by the prologue.
    37 %define original_ebp	ebp+0
    38 %define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
    39 %define WK_NUM		2
    40 %define gotptr		wk(0)-SIZEOF_POINTER	; void * gotptr
    42 	align	16
    43 	global	EXTN(jsimd_ycc_rgb_convert_mmx)
    45 EXTN(jsimd_ycc_rgb_convert_mmx):
       ; ---- Prologue: build an MMWORD-aligned local frame ---------------------
       ; eax keeps the unaligned stack address so the arguments stay reachable
       ; as out_width(eax) .. num_rows(eax); ebp becomes the aligned frame base
       ; used by wk() and gotptr.
    46 	push	ebp
    47 	mov	eax,esp				; eax = esp after the push (address of saved ebp)
    48 	sub	esp, byte 4
    49 	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
    50 	mov	[esp],eax			; remember the unaligned address for the epilogue
    51 	mov	ebp,esp				; ebp = aligned ebp
    52 	lea	esp, [wk(0)]			; reserve the wk[] scratch area below ebp
    53 	pushpic	eax		; make a room for GOT address
    54 	push	ebx
    55 ;	push	ecx		; need not be preserved
    56 ;	push	edx		; need not be preserved
    57 	push	esi
    58 	push	edi
       	; get_GOT / movpic / pushpic / GOTOFF / alignx are macros defined
       	; outside this file.
    60 	get_GOT	ebx			; get GOT address
    61 	movpic	POINTER [gotptr], ebx	; save GOT address
       	; Zero-width rows: nothing to do.  No MMX instruction has executed
       	; yet, so leaving through .return (which skips emms) is safe.
    63 	mov	ecx, JDIMENSION [out_width(eax)]	; num_cols
    64 	test	ecx,ecx
    65 	jz	near .return
    67 	push	ecx			; free ecx to hold input_row for a moment
       	; esi/ebx/edx = address of the input_row-th row pointer in planes
       	; 0/1/2 of input_buf.
    69 	mov	edi, JSAMPIMAGE [input_buf(eax)]
    70 	mov	ecx, JDIMENSION [input_row(eax)]
    71 	mov	esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
    72 	mov	ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
    73 	mov	edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
    74 	lea	esi, [esi+ecx*SIZEOF_JSAMPROW]
    75 	lea	ebx, [ebx+ecx*SIZEOF_JSAMPROW]
    76 	lea	edx, [edx+ecx*SIZEOF_JSAMPROW]
    78 	pop	ecx			; ecx = num_cols again
       	; eax is overwritten with num_rows here, so the arguments are no
       	; longer reachable through eax after this point.
    80 	mov	edi, JSAMPARRAY [output_buf(eax)]
    81 	mov	eax, INT [num_rows(eax)]
    82 	test	eax,eax
    83 	jle	near .return		; signed test: zero or negative row count
    84 	alignx	16,7
    85 .rowloop:
       	; Save the row-level state (row counter, the four row-pointer cursors
       	; and the column count).  The column loop reuses every one of these
       	; registers; .nextrow pops them in the reverse order.
    86 	push	eax
    87 	push	edi
    88 	push	edx
    89 	push	ebx
    90 	push	esi
    91 	push	ecx			; col
    93 	mov	esi, JSAMPROW [esi]	; inptr0
    94 	mov	ebx, JSAMPROW [ebx]	; inptr1
    95 	mov	edx, JSAMPROW [edx]	; inptr2
    96 	mov	edi, JSAMPROW [edi]	; outptr
    97 	movpic	eax, POINTER [gotptr]	; load GOT address (eax)
    98 	alignx	16,7
    99 .columnloop:
       	; One iteration consumes one MMWORD each of Y, Cb and Cr and emits
       	; RGB_PIXELSIZE MMWORDs of output.  ecx (columns left) is examined only
       	; just before the stores, so the three input loads always read a full
       	; MMWORD even on the last, partial group of a row.
       	; NOTE(review): presumably the caller's row buffers are padded far
       	; enough for that over-read -- confirm.
   101 	movq	mm5, MMWORD [ebx]	; mm5=Cb(01234567)
   102 	movq	mm1, MMWORD [edx]	; mm1=Cr(01234567)
       	; Build the two constants in registers: a low-byte mask per word
       	; (mm4) and 0xFF80 per word (mm7), i.e. -128 as a signed word.
   104 	pcmpeqw	mm4,mm4
   105 	pcmpeqw	mm7,mm7
   106 	psrlw	mm4,BYTE_BIT
   107 	psllw	mm7,7			; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
   108 	movq	mm0,mm4			; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}
       	; Split the chroma bytes into even-column and odd-column words.
   110 	pand	mm4,mm5			; mm4=Cb(0246)=CbE
   111 	psrlw	mm5,BYTE_BIT		; mm5=Cb(1357)=CbO
   112 	pand	mm0,mm1			; mm0=Cr(0246)=CrE
   113 	psrlw	mm1,BYTE_BIT		; mm1=Cr(1357)=CrO
       	; Adding -128 re-centres the chroma samples around zero.
   115 	paddw	mm4,mm7
   116 	paddw	mm5,mm7
   117 	paddw	mm0,mm7
   118 	paddw	mm1,mm7
   120 	; (Original)
   121 	; R = Y                + 1.40200 * Cr
   122 	; G = Y - 0.34414 * Cb - 0.71414 * Cr
   123 	; B = Y + 1.77200 * Cb
   124 	;
   125 	; (This implementation)
   126 	; R = Y                + 0.40200 * Cr + Cr
   127 	; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
   128 	; B = Y - 0.22800 * Cb + Cb + Cb
       	;
       	; Keep unscaled copies of the chroma words (mm2/mm3/mm6/mm7) for the
       	; "+ Cb + Cb", "+ Cr" and "- Cr" terms, and double the working copies
       	; before the high-half multiply so that one extra fraction bit is left
       	; to round away afterwards.
   130 	movq	mm2,mm4			; mm2=CbE
   131 	movq	mm3,mm5			; mm3=CbO
   132 	paddw	mm4,mm4			; mm4=2*CbE
   133 	paddw	mm5,mm5			; mm5=2*CbO
   134 	movq	mm6,mm0			; mm6=CrE
   135 	movq	mm7,mm1			; mm7=CrO
   136 	paddw	mm0,mm0			; mm0=2*CrE
   137 	paddw	mm1,mm1			; mm1=2*CrO
       	; The PW_* / PD_* constant tables and SCALEBITS are defined outside
       	; this file; they are reached GOT-relative through eax.
   139 	pmulhw	mm4,[GOTOFF(eax,PW_MF0228)]	; mm4=(2*CbE * -FIX(0.22800))
   140 	pmulhw	mm5,[GOTOFF(eax,PW_MF0228)]	; mm5=(2*CbO * -FIX(0.22800))
   141 	pmulhw	mm0,[GOTOFF(eax,PW_F0402)]	; mm0=(2*CrE * FIX(0.40200))
   142 	pmulhw	mm1,[GOTOFF(eax,PW_F0402)]	; mm1=(2*CrO * FIX(0.40200))
       	; Add PW_ONE, then shift right by one: drops the extra fraction bit.
       	; NOTE(review): presumably PW_ONE is 1 in every word, making this a
       	; round-to-nearest -- confirm against its definition.
   144 	paddw	mm4,[GOTOFF(eax,PW_ONE)]
   145 	paddw	mm5,[GOTOFF(eax,PW_ONE)]
   146 	psraw	mm4,1			; mm4=(CbE * -FIX(0.22800))
   147 	psraw	mm5,1			; mm5=(CbO * -FIX(0.22800))
   148 	paddw	mm0,[GOTOFF(eax,PW_ONE)]
   149 	paddw	mm1,[GOTOFF(eax,PW_ONE)]
   150 	psraw	mm0,1			; mm0=(CrE * FIX(0.40200))
   151 	psraw	mm1,1			; mm1=(CrO * FIX(0.40200))
       	; Add the unscaled chroma back: twice Cb for B-Y, once Cr for R-Y.
   153 	paddw	mm4,mm2
   154 	paddw	mm5,mm3
   155 	paddw	mm4,mm2			; mm4=(CbE * FIX(1.77200))=(B-Y)E
   156 	paddw	mm5,mm3			; mm5=(CbO * FIX(1.77200))=(B-Y)O
   157 	paddw	mm0,mm6			; mm0=(CrE * FIX(1.40200))=(R-Y)E
   158 	paddw	mm1,mm7			; mm1=(CrO * FIX(1.40200))=(R-Y)O
       	; Spill (B-Y) to the scratch slots so that mm4/mm5 are free for the
       	; green computation; it is added back to Y further down.
   160 	movq	MMWORD [wk(0)], mm4	; wk(0)=(B-Y)E
   161 	movq	MMWORD [wk(1)], mm5	; wk(1)=(B-Y)O
       	; Green: interleave (Cb,Cr) word pairs so that each pmaddwd lane
       	; yields Cb*c0 + Cr*c1 as a dword, using the coefficient pair stored
       	; in PW_MF0344_F0285.
   163 	movq      mm4,mm2
   164 	movq      mm5,mm3
   165 	punpcklwd mm2,mm6
   166 	punpckhwd mm4,mm6
   167 	pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
   168 	pmaddwd   mm4,[GOTOFF(eax,PW_MF0344_F0285)]
   169 	punpcklwd mm3,mm7
   170 	punpckhwd mm5,mm7
   171 	pmaddwd   mm3,[GOTOFF(eax,PW_MF0344_F0285)]
   172 	pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]
       	; Add PD_ONEHALF, then shift the dword sums down by SCALEBITS.
   174 	paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
   175 	paddd     mm4,[GOTOFF(eax,PD_ONEHALF)]
   176 	psrad     mm2,SCALEBITS
   177 	psrad     mm4,SCALEBITS
   178 	paddd     mm3,[GOTOFF(eax,PD_ONEHALF)]
   179 	paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
   180 	psrad     mm3,SCALEBITS
   181 	psrad     mm5,SCALEBITS
       	; Narrow back to words, then subtract Cr once to turn the +0.28586*Cr
       	; term into the required -0.71414*Cr.
   183 	packssdw  mm2,mm4	; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
   184 	packssdw  mm3,mm5	; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
   185 	psubw     mm2,mm6	; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
   186 	psubw     mm3,mm7	; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O
   188 	movq      mm5, MMWORD [esi]	; mm5=Y(01234567)
       	; Split luma into even/odd words the same way as the chroma.
   190 	pcmpeqw   mm4,mm4
   191 	psrlw     mm4,BYTE_BIT		; mm4={0xFF 0x00 0xFF 0x00 ..}
   192 	pand      mm4,mm5		; mm4=Y(0246)=YE
   193 	psrlw     mm5,BYTE_BIT		; mm5=Y(1357)=YO
       	; Add Y to each colour difference; packuswb then saturates every
       	; word to an unsigned byte (only the low four bytes are meaningful).
   195 	paddw     mm0,mm4		; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
   196 	paddw     mm1,mm5		; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
   197 	packuswb  mm0,mm0		; mm0=(R0 R2 R4 R6 ** ** ** **)
   198 	packuswb  mm1,mm1		; mm1=(R1 R3 R5 R7 ** ** ** **)
   200 	paddw     mm2,mm4		; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
   201 	paddw     mm3,mm5		; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
   202 	packuswb  mm2,mm2		; mm2=(G0 G2 G4 G6 ** ** ** **)
   203 	packuswb  mm3,mm3		; mm3=(G1 G3 G5 G7 ** ** ** **)
   205 	paddw     mm4, MMWORD [wk(0)]	; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
   206 	paddw     mm5, MMWORD [wk(1)]	; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
   207 	packuswb  mm4,mm4		; mm4=(B0 B2 B4 B6 ** ** ** **)
   208 	packuswb  mm5,mm5		; mm5=(B1 B3 B5 B7 ** ** ** **)
       	; From here on the registers are named mmA..mmH.  Those names are
       	; defined outside this file.
       	; NOTE(review): presumably they map mm0..mm7 onto output byte
       	; positions according to the configured RGB ordering -- confirm.
       	; In the layout comments below each two-digit code is
       	; <byte position within the pixel><column index>.
   210 %if RGB_PIXELSIZE == 3 ; ---------------
   212 	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
   213 	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
   214 	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
   215 	; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)
       	; Interleave the six planar half-registers into three MMWORDs of
       	; packed 3-byte pixels (mmA, mmE, mmC in output order).
   217 	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
   218 	punpcklbw mmE,mmB		; mmE=(20 01 22 03 24 05 26 07)
   219 	punpcklbw mmD,mmF		; mmD=(11 21 13 23 15 25 17 27)
   221 	movq      mmG,mmA
   222 	movq      mmH,mmA
   223 	punpcklwd mmA,mmE		; mmA=(00 10 20 01 02 12 22 03)
   224 	punpckhwd mmG,mmE		; mmG=(04 14 24 05 06 16 26 07)
   226 	psrlq     mmH,2*BYTE_BIT	; mmH=(02 12 04 14 06 16 -- --)
   227 	psrlq     mmE,2*BYTE_BIT	; mmE=(22 03 24 05 26 07 -- --)
   229 	movq      mmC,mmD
   230 	movq      mmB,mmD
   231 	punpcklwd mmD,mmH		; mmD=(11 21 02 12 13 23 04 14)
   232 	punpckhwd mmC,mmH		; mmC=(15 25 06 16 17 27 -- --)
   234 	psrlq     mmB,2*BYTE_BIT	; mmB=(13 23 15 25 17 27 -- --)
   236 	movq      mmF,mmE
   237 	punpcklwd mmE,mmB		; mmE=(22 03 13 23 24 05 15 25)
   238 	punpckhwd mmF,mmB		; mmF=(26 07 17 27 -- -- -- --)
   240 	punpckldq mmA,mmD		; mmA=(00 10 20 01 11 21 02 12)
   241 	punpckldq mmE,mmG		; mmE=(22 03 13 23 04 14 24 05)
   242 	punpckldq mmC,mmF		; mmC=(15 25 06 16 26 07 17 27)
       	; A full group of SIZEOF_MMWORD columns left?  If not, take the
       	; partial-store path.
   244 	cmp	ecx, byte SIZEOF_MMWORD
   245 	jb	short .column_st16
   247 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
   248 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
   249 	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
   251 	sub	ecx, byte SIZEOF_MMWORD
   252 	jz	short .nextrow
   254 	add	esi, byte SIZEOF_MMWORD			; inptr0
   255 	add	ebx, byte SIZEOF_MMWORD			; inptr1
   256 	add	edx, byte SIZEOF_MMWORD			; inptr2
   257 	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
   258 	jmp	near .columnloop
   259 	alignx	16,7
       	; ---- Partial group, 3 bytes per pixel --------------------------------
       	; ecx is converted from columns to bytes, and the remaining bytes are
       	; written in chunks of two MMWORDs, one MMWORD, a dword, a word and a
       	; byte.  After each chunk the next unwritten data is moved into mmA
       	; (and then eax).
   261 .column_st16:
   262 	lea	ecx, [ecx+ecx*2]	; imul ecx, RGB_PIXELSIZE
   263 	cmp	ecx, byte 2*SIZEOF_MMWORD
   264 	jb	short .column_st8
   265 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
   266 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmE
   267 	movq	mmA,mmC
   268 	sub	ecx, byte 2*SIZEOF_MMWORD
   269 	add	edi, byte 2*SIZEOF_MMWORD
       	; Fewer than SIZEOF_MMWORD columns reach this path, so less than one
       	; further MMWORD of bytes can remain: skip the single-MMWORD step.
   270 	jmp	short .column_st4
   271 .column_st8:
   272 	cmp	ecx, byte SIZEOF_MMWORD
   273 	jb	short .column_st4
   274 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
   275 	movq	mmA,mmE
   276 	sub	ecx, byte SIZEOF_MMWORD
   277 	add	edi, byte SIZEOF_MMWORD
   278 .column_st4:
       	; eax stops holding the GOT address here; it is reloaded from gotptr
       	; at the top of the next row.
   279 	movd	eax,mmA
   280 	cmp	ecx, byte SIZEOF_DWORD
   281 	jb	short .column_st2
   282 	mov	DWORD [edi+0*SIZEOF_DWORD], eax
   283 	psrlq	mmA,DWORD_BIT
   284 	movd	eax,mmA
   285 	sub	ecx, byte SIZEOF_DWORD
   286 	add	edi, byte SIZEOF_DWORD
   287 .column_st2:
   288 	cmp	ecx, byte SIZEOF_WORD
   289 	jb	short .column_st1
   290 	mov	WORD [edi+0*SIZEOF_WORD], ax
   291 	shr	eax,WORD_BIT
   292 	sub	ecx, byte SIZEOF_WORD
   293 	add	edi, byte SIZEOF_WORD
   294 .column_st1:
   295 	cmp	ecx, byte SIZEOF_BYTE
   296 	jb	short .nextrow
   297 	mov	BYTE [edi+0*SIZEOF_BYTE], al
   299 %else ; RGB_PIXELSIZE == 4 ; -----------
       	; The fourth byte of every pixel is a filler: all ones when
       	; RGBX_FILLER_0XFF is defined, zero otherwise.
   301 %ifdef RGBX_FILLER_0XFF
   302 	pcmpeqb   mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
   303 	pcmpeqb   mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
   304 %else
   305 	pxor      mm6,mm6		; mm6=(X0 X2 X4 X6 ** ** ** **)
   306 	pxor      mm7,mm7		; mm7=(X1 X3 X5 X7 ** ** ** **)
   307 %endif
   308 	; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
   309 	; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
   310 	; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
   311 	; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)
       	; Three rounds of unpacking (bytes, words, dwords) transpose the
       	; eight planar half-registers into four MMWORDs of packed 4-byte
       	; pixels (mmA, mmD, mmC, mmH in output order).
   313 	punpcklbw mmA,mmC		; mmA=(00 10 02 12 04 14 06 16)
   314 	punpcklbw mmE,mmG		; mmE=(20 30 22 32 24 34 26 36)
   315 	punpcklbw mmB,mmD		; mmB=(01 11 03 13 05 15 07 17)
   316 	punpcklbw mmF,mmH		; mmF=(21 31 23 33 25 35 27 37)
   318 	movq      mmC,mmA
   319 	punpcklwd mmA,mmE		; mmA=(00 10 20 30 02 12 22 32)
   320 	punpckhwd mmC,mmE		; mmC=(04 14 24 34 06 16 26 36)
   321 	movq      mmG,mmB
   322 	punpcklwd mmB,mmF		; mmB=(01 11 21 31 03 13 23 33)
   323 	punpckhwd mmG,mmF		; mmG=(05 15 25 35 07 17 27 37)
   325 	movq      mmD,mmA
   326 	punpckldq mmA,mmB		; mmA=(00 10 20 30 01 11 21 31)
   327 	punpckhdq mmD,mmB		; mmD=(02 12 22 32 03 13 23 33)
   328 	movq      mmH,mmC
   329 	punpckldq mmC,mmG		; mmC=(04 14 24 34 05 15 25 35)
   330 	punpckhdq mmH,mmG		; mmH=(06 16 26 36 07 17 27 37)
       	; A full group of SIZEOF_MMWORD columns left?  If not, take the
       	; partial-store path.
   332 	cmp	ecx, byte SIZEOF_MMWORD
   333 	jb	short .column_st16
   335 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
   336 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
   337 	movq	MMWORD [edi+2*SIZEOF_MMWORD], mmC
   338 	movq	MMWORD [edi+3*SIZEOF_MMWORD], mmH
   340 	sub	ecx, byte SIZEOF_MMWORD
   341 	jz	short .nextrow
   343 	add	esi, byte SIZEOF_MMWORD			; inptr0
   344 	add	ebx, byte SIZEOF_MMWORD			; inptr1
   345 	add	edx, byte SIZEOF_MMWORD			; inptr2
   346 	add	edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD	; outptr
   347 	jmp	near .columnloop
   348 	alignx	16,7
       	; ---- Partial group, 4 bytes per pixel --------------------------------
       	; ecx still counts columns here.  The remaining pixels are written as
       	; two MMWORDs, then one MMWORD, then one dword, with the unwritten
       	; registers shifted down into mmA/mmD after each step.
   350 .column_st16:
   351 	cmp	ecx, byte SIZEOF_MMWORD/2
   352 	jb	short .column_st8
   353 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
   354 	movq	MMWORD [edi+1*SIZEOF_MMWORD], mmD
   355 	movq	mmA,mmC
   356 	movq	mmD,mmH
   357 	sub	ecx, byte SIZEOF_MMWORD/2
   358 	add	edi, byte 2*SIZEOF_MMWORD
   359 .column_st8:
   360 	cmp	ecx, byte SIZEOF_MMWORD/4
   361 	jb	short .column_st4
   362 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mmA
   363 	movq	mmA,mmD
   364 	sub	ecx, byte SIZEOF_MMWORD/4
   365 	add	edi, byte 1*SIZEOF_MMWORD
   366 .column_st4:
   367 	cmp	ecx, byte SIZEOF_MMWORD/8
   368 	jb	short .nextrow
   369 	movd	DWORD [edi+0*SIZEOF_DWORD], mmA
   371 %endif ; RGB_PIXELSIZE ; ---------------
   373 	alignx	16,7
   375 .nextrow:
       	; Restore the row-level state saved at .rowloop (reverse push order),
       	; then step every row-pointer cursor to the next row.
   376 	pop	ecx
   377 	pop	esi
   378 	pop	ebx
   379 	pop	edx
   380 	pop	edi
   381 	pop	eax
   383 	add	esi, byte SIZEOF_JSAMPROW
   384 	add	ebx, byte SIZEOF_JSAMPROW
   385 	add	edx, byte SIZEOF_JSAMPROW
   386 	add	edi, byte SIZEOF_JSAMPROW	; output_buf
   387 	dec	eax				; num_rows
   388 	jg	near .rowloop
       	; Reached only after at least one row was converted; the early exits
       	; jump straight to .return because they never touched an MMX register.
   390 	emms		; empty MMX state
   392 .return:
       	; ---- Epilogue: undo the prologue in reverse ----------------------------
   393 	pop	edi
   394 	pop	esi
   395 ;	pop	edx		; need not be preserved
   396 ;	pop	ecx		; need not be preserved
   397 	pop	ebx
   398 	mov	esp,ebp		; esp <- aligned ebp
   399 	pop	esp		; esp <- unaligned address stored by the prologue
   400 	pop	ebp
   401 	ret
   403 ; For some reason, the OS X linker does not honor the request to align the
   404 ; segment unless we do this.
   405 	align	16

mercurial