media/libjpeg/simd/jcsamss2.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 ;
     2 ; jcsamss2.asm - downsampling (SSE2)
     3 ;
     4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
     5 ;
     6 ; Based on
     7 ; x86 SIMD extension for IJG JPEG library
     8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
    10 ;
    11 ; This file should be assembled with NASM (Netwide Assembler);
    12 ; it can *not* be assembled with Microsoft's MASM or any compatible
    13 ; assembler (including Borland's Turbo Assembler).
    14 ; NASM is available from http://nasm.sourceforge.net/ or
    15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
    16 ;
    17 ; [TAB8]
    19 %include "jsimdext.inc"
    21 ; --------------------------------------------------------------------------
    22 	SECTION	SEG_TEXT
    23 	BITS	32
    24 ;
    25 ; Downsample pixel values of a single component.
    26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
    27 ; without smoothing.
    28 ;
    29 ; GLOBAL(void)
    30 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
    31 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
    32 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
    33 ;
       ; Overview (as implemented below):
       ;   1. output_cols = width_blocks * 8.  If output_cols*2 > image_width,
       ;      each of the first max_v_samp_factor input rows is padded in place:
       ;      the bytes from image_width up to output_cols*2 are filled with a
       ;      copy of the row's last sample (the byte at row[image_width-1]).
       ;   2. For each of v_samp_factor rows, every horizontal pair of input
       ;      samples (a,b) becomes one output sample (a + b + bias) >> 1, where
       ;      bias alternates 0,1,0,1,... across output columns.
       ;
       ; Arguments are read from the stack relative to ebp (see the %defines).
       ; esi and edi are saved and restored; eax, ecx, edx, xmm0-xmm3, xmm6 and
       ; xmm7 are overwritten and not restored.
       ;
       ; Every row access in the downsampling loop uses movdqa, which requires
       ; an aligned memory operand, and the loop always loads and stores whole
       ; XMMWORDs, including in the remainder path.
       ; NOTE(review): presumably the caller allocates rows aligned and padded
       ; enough for both the edge expansion and the full-width accesses --
       ; confirm against the row allocator.
       ;
       ; Stack offsets of the arguments relative to ebp after the prologue
       ; ([ebp+0] = saved ebp, [ebp+4] = return address).
    35 %define img_width(b)	(b)+8			; JDIMENSION image_width
    36 %define max_v_samp(b)	(b)+12		; int max_v_samp_factor
    37 %define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
    38 %define width_blks(b)	(b)+20		; JDIMENSION width_blocks
    39 %define input_data(b)	(b)+24		; JSAMPARRAY input_data
    40 %define output_data(b)	(b)+28		; JSAMPARRAY output_data
    42 	align	16
    43 	global	EXTN(jsimd_h2v1_downsample_sse2)
    45 EXTN(jsimd_h2v1_downsample_sse2):
    46 	push	ebp
    47 	mov	ebp,esp			; ebp = frame base used by the argument macros
    48 ;	push	ebx		; unused
    49 ;	push	ecx		; need not be preserved
    50 ;	push	edx		; need not be preserved
    51 	push	esi
    52 	push	edi
    54 	mov	ecx, JDIMENSION [width_blks(ebp)]
    55 	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
    56 	jz	near .return		; ZF from shl: output_cols == 0, nothing to do
    58 	mov	edx, JDIMENSION [img_width(ebp)]
    60 	; -- expand_right_edge
       	; Pad each input row on the right, from image_width up to
       	; output_cols*2 bytes, by replicating the row's last sample.
    62 	push	ecx			; keep output_cols; ecx becomes the pad count
    63 	shl	ecx,1				; output_cols * 2
    64 	sub	ecx,edx			; ecx = output_cols*2 - image_width (pad count)
    65 	jle	short .expand_end	; pad count <= 0: no padding needed
    67 	mov	eax, INT [max_v_samp(ebp)]	; eax = number of rows to pad
    68 	test	eax,eax
    69 	jle	short .expand_end
    71 	cld				; direction flag clear: stosb advances edi upward
    72 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
    73 	alignx	16,7
    74 .expandloop:
    75 	push	eax			; row counter; al is reused as the fill value
    76 	push	ecx			; pad count; rep stosb counts ecx down to zero
    78 	mov	edi, JSAMPROW [esi]
    79 	add	edi,edx			; edi = row start + image_width
    80 	mov	al, JSAMPLE [edi-1]	; al = last real sample of the row
    82 	rep stosb			; write al to ecx bytes starting at edi
    84 	pop	ecx
    85 	pop	eax
    87 	add	esi, byte SIZEOF_JSAMPROW	; advance to the next row pointer
    88 	dec	eax
    89 	jg	short .expandloop
    91 .expand_end:
    92 	pop	ecx				; output_cols
    94 	; -- h2v1_downsample
    96 	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
    97 	test	eax,eax
    98 	jle	near .return
   100 	mov	edx, 0x00010000		; bias pattern
   101 	movd	xmm7,edx
   102 	pcmpeqw	xmm6,xmm6		; xmm6 = all bits set
   103 	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
   104 	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
   106 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
   107 	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
   108 	alignx	16,7
   109 .rowloop:
   110 	push	ecx			; output_cols, restored for the next row
   111 	push	edi			; position in the output_data row-pointer array
   112 	push	esi			; position in the input_data row-pointer array
   114 	mov	esi, JSAMPROW [esi]		; inptr
   115 	mov	edi, JSAMPROW [edi]		; outptr
   117 	cmp	ecx, byte SIZEOF_XMMWORD
   118 	jae	short .columnloop
   119 	alignx	16,7
       	; Remainder path: fewer than SIZEOF_XMMWORD output columns are left.
       	; Only one XMMWORD of input is loaded and the second is taken as zero.
       	; ecx is forced to SIZEOF_XMMWORD so that the subtraction after the
       	; store leaves 0 and the column loop terminates.
   121 .columnloop_r8:
   122 	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
   123 	pxor	xmm1,xmm1
   124 	mov	ecx, SIZEOF_XMMWORD
   125 	jmp	short .downsample
   126 	alignx	16,7
       	; Main path: two XMMWORDs of input produce one XMMWORD of output.
   128 .columnloop:
   129 	movdqa	xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
   130 	movdqa	xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
   132 .downsample:
   133 	movdqa	xmm2,xmm0
   134 	movdqa	xmm3,xmm1
   136 	pand	xmm0,xmm6		; keep the low byte of each word (first sample of each pair)
   137 	psrlw	xmm2,BYTE_BIT		; move the high byte of each word down (second sample)
   138 	pand	xmm1,xmm6
   139 	psrlw	xmm3,BYTE_BIT
   141 	paddw	xmm0,xmm2		; per-word sum of each sample pair
   142 	paddw	xmm1,xmm3
   143 	paddw	xmm0,xmm7		; add the alternating 0/1 bias
   144 	paddw	xmm1,xmm7
   145 	psrlw	xmm0,1			; divide by 2
   146 	psrlw	xmm1,1
   148 	packuswb xmm0,xmm1		; narrow words to bytes: xmm0 -> low half, xmm1 -> high half
   150 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
   152 	sub	ecx, byte SIZEOF_XMMWORD	; outcol
   153 	add	esi, byte 2*SIZEOF_XMMWORD	; inptr
   154 	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
   155 	cmp	ecx, byte SIZEOF_XMMWORD
   156 	jae	short .columnloop
   157 	test	ecx,ecx
   158 	jnz	short .columnloop_r8	; a partial group of columns remains
   160 	pop	esi
   161 	pop	edi
   162 	pop	ecx
   164 	add	esi, byte SIZEOF_JSAMPROW	; input_data
   165 	add	edi, byte SIZEOF_JSAMPROW	; output_data
   166 	dec	eax				; rowctr
   167 	jg	near .rowloop
   169 .return:
   170 	pop	edi
   171 	pop	esi
   172 ;	pop	edx		; need not be preserved
   173 ;	pop	ecx		; need not be preserved
   174 ;	pop	ebx		; unused
   175 	pop	ebp
   176 	ret
   178 ; --------------------------------------------------------------------------
   179 ;
   180 ; Downsample pixel values of a single component.
   181 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
   182 ; without smoothing.
   183 ;
   184 ; GLOBAL(void)
   185 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
   186 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
   187 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
   188 ;
       ; Overview (as implemented below):
       ;   1. output_cols = width_blocks * 8.  If output_cols*2 > image_width,
       ;      each of the first max_v_samp_factor input rows is padded in place:
       ;      the bytes from image_width up to output_cols*2 are filled with a
       ;      copy of the row's last sample (the byte at row[image_width-1]).
       ;   2. For each of v_samp_factor output rows, two consecutive input rows
       ;      are consumed.  Every 2x2 group of input samples becomes one output
       ;      sample (sum of the four + bias) >> 2, where bias alternates
       ;      1,2,1,2,... across output columns.
       ;
       ; Arguments are read from the stack relative to ebp (see the %defines).
       ; esi and edi are saved and restored; eax, ecx, edx, xmm0-xmm7 are
       ; overwritten and not restored.
       ;
       ; Every row access in the downsampling loop uses movdqa, which requires
       ; an aligned memory operand, and the loop always loads and stores whole
       ; XMMWORDs, including in the remainder path.
       ; NOTE(review): presumably the caller allocates rows aligned and padded
       ; enough for both the edge expansion and the full-width accesses --
       ; confirm against the row allocator.
       ;
       ; Stack offsets of the arguments relative to ebp after the prologue
       ; ([ebp+0] = saved ebp, [ebp+4] = return address).
   190 %define img_width(b)	(b)+8			; JDIMENSION image_width
   191 %define max_v_samp(b)	(b)+12		; int max_v_samp_factor
   192 %define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
   193 %define width_blks(b)	(b)+20		; JDIMENSION width_blocks
   194 %define input_data(b)	(b)+24		; JSAMPARRAY input_data
   195 %define output_data(b)	(b)+28	; JSAMPARRAY output_data
   197 	align	16
   198 	global	EXTN(jsimd_h2v2_downsample_sse2)
   200 EXTN(jsimd_h2v2_downsample_sse2):
   201 	push	ebp
   202 	mov	ebp,esp			; ebp = frame base used by the argument macros
   203 ;	push	ebx		; unused
   204 ;	push	ecx		; need not be preserved
   205 ;	push	edx		; need not be preserved
   206 	push	esi
   207 	push	edi
   209 	mov	ecx, JDIMENSION [width_blks(ebp)]
   210 	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
   211 	jz	near .return		; ZF from shl: output_cols == 0, nothing to do
   213 	mov	edx, JDIMENSION [img_width(ebp)]
   215 	; -- expand_right_edge
       	; Pad each input row on the right, from image_width up to
       	; output_cols*2 bytes, by replicating the row's last sample.
   217 	push	ecx			; keep output_cols; ecx becomes the pad count
   218 	shl	ecx,1				; output_cols * 2
   219 	sub	ecx,edx			; ecx = output_cols*2 - image_width (pad count)
   220 	jle	short .expand_end	; pad count <= 0: no padding needed
   222 	mov	eax, INT [max_v_samp(ebp)]	; eax = number of rows to pad
   223 	test	eax,eax
   224 	jle	short .expand_end
   226 	cld				; direction flag clear: stosb advances edi upward
   227 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
   228 	alignx	16,7
   229 .expandloop:
   230 	push	eax			; row counter; al is reused as the fill value
   231 	push	ecx			; pad count; rep stosb counts ecx down to zero
   233 	mov	edi, JSAMPROW [esi]
   234 	add	edi,edx			; edi = row start + image_width
   235 	mov	al, JSAMPLE [edi-1]	; al = last real sample of the row
   237 	rep stosb			; write al to ecx bytes starting at edi
   239 	pop	ecx
   240 	pop	eax
   242 	add	esi, byte SIZEOF_JSAMPROW	; advance to the next row pointer
   243 	dec	eax
   244 	jg	short .expandloop
   246 .expand_end:
   247 	pop	ecx				; output_cols
   249 	; -- h2v2_downsample
   251 	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
   252 	test	eax,eax
   253 	jle	near .return
   255 	mov	edx, 0x00020001		; bias pattern
   256 	movd	xmm7,edx
   257 	pcmpeqw	xmm6,xmm6		; xmm6 = all bits set
   258 	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
   259 	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
   261 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
   262 	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
   263 	alignx	16,7
   264 .rowloop:
   265 	push	ecx			; output_cols, restored for the next row
   266 	push	edi			; position in the output_data row-pointer array
   267 	push	esi			; position in the input_data row-pointer array
   269 	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
   270 	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
   271 	mov	edi, JSAMPROW [edi]			; outptr
   273 	cmp	ecx, byte SIZEOF_XMMWORD
   274 	jae	short .columnloop
   275 	alignx	16,7
       	; Remainder path: fewer than SIZEOF_XMMWORD output columns are left.
       	; Only one XMMWORD per input row is loaded and the second is taken as
       	; zero.  ecx is forced to SIZEOF_XMMWORD so that the subtraction
       	; after the store leaves 0 and the column loop terminates.
   277 .columnloop_r8:
   278 	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
   279 	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
   280 	pxor	xmm2,xmm2
   281 	pxor	xmm3,xmm3
   282 	mov	ecx, SIZEOF_XMMWORD
   283 	jmp	short .downsample
   284 	alignx	16,7
       	; Main path: two XMMWORDs from each of the two input rows produce one
       	; XMMWORD of output (xmm0/xmm2 from inptr0, xmm1/xmm3 from inptr1).
   286 .columnloop:
   287 	movdqa	xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
   288 	movdqa	xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
   289 	movdqa	xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
   290 	movdqa	xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
   292 .downsample:
       	; First XMMWORD of each row: horizontal pair sums as words.
   293 	movdqa	xmm4,xmm0
   294 	movdqa	xmm5,xmm1
   295 	pand	xmm0,xmm6		; keep the low byte of each word (first sample of each pair)
   296 	psrlw	xmm4,BYTE_BIT		; move the high byte of each word down (second sample)
   297 	pand	xmm1,xmm6
   298 	psrlw	xmm5,BYTE_BIT
   299 	paddw	xmm0,xmm4		; xmm0 = pair sums from inptr0
   300 	paddw	xmm1,xmm5		; xmm1 = pair sums from inptr1
       	; Second XMMWORD of each row: same computation (xmm4/xmm5 are reused).
   302 	movdqa	xmm4,xmm2
   303 	movdqa	xmm5,xmm3
   304 	pand	xmm2,xmm6
   305 	psrlw	xmm4,BYTE_BIT
   306 	pand	xmm3,xmm6
   307 	psrlw	xmm5,BYTE_BIT
   308 	paddw	xmm2,xmm4
   309 	paddw	xmm3,xmm5
   311 	paddw	xmm0,xmm1		; add the two rows: sum of each 2x2 group
   312 	paddw	xmm2,xmm3
   313 	paddw	xmm0,xmm7		; add the alternating 1/2 bias
   314 	paddw	xmm2,xmm7
   315 	psrlw	xmm0,2			; divide by 4
   316 	psrlw	xmm2,2
   318 	packuswb xmm0,xmm2		; narrow words to bytes: xmm0 -> low half, xmm2 -> high half
   320 	movdqa	XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
   322 	sub	ecx, byte SIZEOF_XMMWORD	; outcol
   323 	add	edx, byte 2*SIZEOF_XMMWORD	; inptr0
   324 	add	esi, byte 2*SIZEOF_XMMWORD	; inptr1
   325 	add	edi, byte 1*SIZEOF_XMMWORD	; outptr
   326 	cmp	ecx, byte SIZEOF_XMMWORD
   327 	jae	near .columnloop
   328 	test	ecx,ecx
   329 	jnz	near .columnloop_r8	; a partial group of columns remains
   331 	pop	esi
   332 	pop	edi
   333 	pop	ecx
   335 	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
   336 	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
   337 	dec	eax				; rowctr
   338 	jg	near .rowloop
   340 .return:
   341 	pop	edi
   342 	pop	esi
   343 ;	pop	edx		; need not be preserved
   344 ;	pop	ecx		; need not be preserved
   345 ;	pop	ebx		; unused
   346 	pop	ebp
   347 	ret
   349 ; For some reason, the OS X linker does not honor the request to align the
   350 ; segment unless we do this.
   351 	align	16

mercurial