media/libjpeg/simd/jcsammmx.asm

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ;
     2 ; jcsammmx.asm - downsampling (MMX)
     3 ;
     4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
     5 ;
     6 ; Based on
     7 ; x86 SIMD extension for IJG JPEG library
     8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
    10 ;
    11 ; This file should be assembled with NASM (Netwide Assembler),
    12 ; can *not* be assembled with Microsoft's MASM or any compatible
    13 ; assembler (including Borland's Turbo Assembler).
    14 ; NASM is available from http://nasm.sourceforge.net/ or
    15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
    16 ;
    17 ; [TAB8]
    19 %include "jsimdext.inc"
    21 ; --------------------------------------------------------------------------
    22 	SECTION	SEG_TEXT
    23 	BITS	32
    24 ;
    25 ; Downsample pixel values of a single component.
    26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
    27 ; without smoothing.
    28 ;
       ; Operation, as implemented below:
       ;   1. output_cols = width_blocks << 3.  If that is zero, return at once.
       ;   2. If output_cols*2 > image_width, each of the first max_v_samp_factor
       ;      input rows is padded IN PLACE on the right: the last real sample
       ;      (offset image_width-1) is replicated up to offset output_cols*2-1.
       ;   3. For each of v_samp_factor rows, every horizontal pair of input
       ;      samples (a,b) yields one output sample (a + b + bias) >> 1, where
       ;      the bias alternates 0,1,0,1,... across output columns.
       ;
       ; Stack arguments are read relative to ebp after the standard frame setup
       ; (first argument at ebp+8).  esi, edi and ebp are saved and restored;
       ; eax, ecx, edx, mm0-mm3, mm6 and mm7 are overwritten.
       ; NOTE(review): the column loop counts ecx down by SIZEOF_MMWORD and exits
       ; only on exactly zero, so it assumes SIZEOF_MMWORD divides output_cols
       ; (presumably SIZEOF_MMWORD == 8, from jsimdext.inc -- confirm).
       ;
    29 ; GLOBAL(void)
    30 ; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
    31 ;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
    32 ;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
    33 ;
       ; Argument offsets from the frame pointer (b = ebp):
    35 %define img_width(b)	(b)+8			; JDIMENSION image_width
    36 %define max_v_samp(b)	(b)+12		; int max_v_samp_factor
    37 %define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
    38 %define width_blks(b)	(b)+20		; JDIMENSION width_blocks
    39 %define input_data(b)	(b)+24		; JSAMPARRAY input_data
    40 %define output_data(b)	(b)+28	; JSAMPARRAY output_data
    42 	align	16
    43 	global	EXTN(jsimd_h2v1_downsample_mmx)
    45 EXTN(jsimd_h2v1_downsample_mmx):
    46 	push	ebp
    47 	mov	ebp,esp			; ebp = frame pointer used by the argument macros
    48 ;	push	ebx		; unused
    49 ;	push	ecx		; need not be preserved
    50 ;	push	edx		; need not be preserved
    51 	push	esi			; saved here, restored at .return
    52 	push	edi			; saved here, restored at .return
    54 	mov	ecx, JDIMENSION [width_blks(ebp)]
    55 	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
    56 	jz	near .return		; output_cols == 0: nothing to do
    58 	mov	edx, JDIMENSION [img_width(ebp)]	; edx = image_width
    60 	; -- expand_right_edge
       	; Pad each input row on the right so that a full output_cols*2 samples
       	; are available to the downsampling loop.
    62 	push	ecx				; keep output_cols for the downsample stage
    63 	shl	ecx,1				; output_cols * 2
    64 	sub	ecx,edx				; ecx = number of pad bytes per row
    65 	jle	short .expand_end		; rows already wide enough (signed test)
    67 	mov	eax, INT [max_v_samp(ebp)]	; eax = number of rows to pad
    68 	test	eax,eax
    69 	jle	short .expand_end		; no rows to pad
    71 	cld					; make stosb advance edi upward
    72 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
    73 	alignx	16,7
    74 .expandloop:
    75 	push	eax				; row counter: al is about to be overwritten
    76 	push	ecx				; pad count: rep stosb consumes ecx
    78 	mov	edi, JSAMPROW [esi]		; edi = start of current input row
    79 	add	edi,edx				; edi = first byte past the real samples
    80 	mov	al, JSAMPLE [edi-1]		; al = last real sample of the row
    82 	rep stosb				; replicate it over the ecx pad bytes
    84 	pop	ecx				; restore pad count
    85 	pop	eax				; restore row counter
    87 	add	esi, byte SIZEOF_JSAMPROW	; next row pointer in input_data
    88 	dec	eax
    89 	jg	short .expandloop		; loop while rows remain
    91 .expand_end:
    92 	pop	ecx				; output_cols
    94 	; -- h2v1_downsample
    96 	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
    97 	test	eax,eax
    98 	jle	near .return			; no rows to process (signed test)
   100 	mov       edx, 0x00010000	; bias pattern
   101 	movd      mm7,edx		; low dword of mm7 = words {0, 1}
   102 	pcmpeqw   mm6,mm6		; mm6 = all ones
   103 	punpckldq mm7,mm7		; mm7={0, 1, 0, 1}
   104 	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
       	; mm6 is used below as a per-word mask selecting the low byte of each
       	; 16-bit word (presumably BYTE_BIT == 8 -- confirm in jsimdext.inc).
   106 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
   107 	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
   108 	alignx	16,7
   109 .rowloop:
   110 	push	ecx			; output_cols, consumed by the column loop
   111 	push	edi			; output row-pointer cursor
   112 	push	esi			; input row-pointer cursor
   114 	mov	esi, JSAMPROW [esi]		; inptr
   115 	mov	edi, JSAMPROW [edi]		; outptr
   116 	alignx	16,7
   117 .columnloop:
       	; One iteration reads two MMWORDs of input and writes one MMWORD of output.
   119 	movq	mm0, MMWORD [esi+0*SIZEOF_MMWORD]	; first input MMWORD
   120 	movq	mm1, MMWORD [esi+1*SIZEOF_MMWORD]	; second input MMWORD
   121 	movq	mm2,mm0			; copies for extracting the odd samples
   122 	movq	mm3,mm1
   124 	pand	mm0,mm6			; mm0 = even-offset samples as words
   125 	psrlw	mm2,BYTE_BIT		; mm2 = odd-offset samples as words
   126 	pand	mm1,mm6			; same split for the second MMWORD
   127 	psrlw	mm3,BYTE_BIT
   129 	paddw	mm0,mm2			; pairwise sums a+b
   130 	paddw	mm1,mm3
   131 	paddw	mm0,mm7			; add alternating 0/1 bias
   132 	paddw	mm1,mm7
   133 	psrlw	mm0,1			; (a + b + bias) >> 1
   134 	psrlw	mm1,1
   136 	packuswb mm0,mm1		; narrow words to bytes: mm0 results low, mm1 high
   138 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0	; store the output samples
   140 	add	esi, byte 2*SIZEOF_MMWORD	; inptr
   141 	add	edi, byte 1*SIZEOF_MMWORD	; outptr
   142 	sub	ecx, byte SIZEOF_MMWORD		; outcol
   143 	jnz	short .columnloop		; continue until the count reaches exactly 0
   145 	pop	esi			; restore input row-pointer cursor
   146 	pop	edi			; restore output row-pointer cursor
   147 	pop	ecx			; restore output_cols
   149 	add	esi, byte SIZEOF_JSAMPROW	; input_data
   150 	add	edi, byte SIZEOF_JSAMPROW	; output_data
   151 	dec	eax				; rowctr
   152 	jg	short .rowloop
   154 	emms		; empty MMX state
   156 .return:
   157 	pop	edi
   158 	pop	esi
   159 ;	pop	edx		; need not be preserved
   160 ;	pop	ecx		; need not be preserved
   161 ;	pop	ebx		; unused
   162 	pop	ebp
   163 	ret
   165 ; --------------------------------------------------------------------------
   166 ;
   167 ; Downsample pixel values of a single component.
   168 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
   169 ; without smoothing.
   170 ;
       ; Operation, as implemented below:
       ;   1. output_cols = width_blocks << 3.  If that is zero, return at once.
       ;   2. If output_cols*2 > image_width, each of the first max_v_samp_factor
       ;      input rows is padded IN PLACE on the right: the last real sample
       ;      (offset image_width-1) is replicated up to offset output_cols*2-1.
       ;   3. The row loop runs v_samp_factor times; each pass consumes two
       ;      consecutive input rows and produces one output row.  Every 2x2
       ;      group of input samples yields one output sample
       ;      (a + b + c + d + bias) >> 2, where the bias alternates 1,2,1,2,...
       ;      across output columns.
       ;
       ; Stack arguments are read relative to ebp after the standard frame setup
       ; (first argument at ebp+8).  esi, edi and ebp are saved and restored;
       ; eax, ecx, edx, mm0-mm7 are overwritten.
       ; NOTE(review): the column loop counts ecx down by SIZEOF_MMWORD and exits
       ; only on exactly zero, so it assumes SIZEOF_MMWORD divides output_cols
       ; (presumably SIZEOF_MMWORD == 8, from jsimdext.inc -- confirm).
       ;
   171 ; GLOBAL(void)
   172 ; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
   173 ;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
   174 ;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
   175 ;
       ; Argument offsets from the frame pointer (b = ebp):
   177 %define img_width(b)	(b)+8			; JDIMENSION image_width
   178 %define max_v_samp(b)	(b)+12		; int max_v_samp_factor
   179 %define v_samp(b)			(b)+16		; JDIMENSION v_samp_factor
   180 %define width_blks(b)	(b)+20		; JDIMENSION width_blocks
   181 %define input_data(b)	(b)+24		; JSAMPARRAY input_data
   182 %define output_data(b)	(b)+28	; JSAMPARRAY output_data
   184 	align	16
   185 	global	EXTN(jsimd_h2v2_downsample_mmx)
   187 EXTN(jsimd_h2v2_downsample_mmx):
   188 	push	ebp
   189 	mov	ebp,esp			; ebp = frame pointer used by the argument macros
   190 ;	push	ebx		; unused
   191 ;	push	ecx		; need not be preserved
   192 ;	push	edx		; need not be preserved
   193 	push	esi			; saved here, restored at .return
   194 	push	edi			; saved here, restored at .return
   196 	mov	ecx, JDIMENSION [width_blks(ebp)]
   197 	shl	ecx,3			; imul ecx,DCTSIZE (ecx = output_cols)
   198 	jz	near .return		; output_cols == 0: nothing to do
   200 	mov	edx, JDIMENSION [img_width(ebp)]	; edx = image_width
   202 	; -- expand_right_edge
       	; Pad each input row on the right so that a full output_cols*2 samples
       	; are available to the downsampling loop.
   204 	push	ecx				; keep output_cols for the downsample stage
   205 	shl	ecx,1				; output_cols * 2
   206 	sub	ecx,edx				; ecx = number of pad bytes per row
   207 	jle	short .expand_end		; rows already wide enough (signed test)
   209 	mov	eax, INT [max_v_samp(ebp)]	; eax = number of rows to pad
   210 	test	eax,eax
   211 	jle	short .expand_end		; no rows to pad
   213 	cld					; make stosb advance edi upward
   214 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
   215 	alignx	16,7
   216 .expandloop:
   217 	push	eax				; row counter: al is about to be overwritten
   218 	push	ecx				; pad count: rep stosb consumes ecx
   220 	mov	edi, JSAMPROW [esi]		; edi = start of current input row
   221 	add	edi,edx				; edi = first byte past the real samples
   222 	mov	al, JSAMPLE [edi-1]		; al = last real sample of the row
   224 	rep stosb				; replicate it over the ecx pad bytes
   226 	pop	ecx				; restore pad count
   227 	pop	eax				; restore row counter
   229 	add	esi, byte SIZEOF_JSAMPROW	; next row pointer in input_data
   230 	dec	eax
   231 	jg	short .expandloop		; loop while rows remain
   233 .expand_end:
   234 	pop	ecx				; output_cols
   236 	; -- h2v2_downsample
   238 	mov	eax, JDIMENSION [v_samp(ebp)]	; rowctr
   239 	test	eax,eax
   240 	jle	near .return			; no rows to process (signed test)
   242 	mov       edx, 0x00020001	; bias pattern
   243 	movd      mm7,edx		; low dword of mm7 = words {1, 2}
   244 	pcmpeqw   mm6,mm6		; mm6 = all ones
   245 	punpckldq mm7,mm7		; mm7={1, 2, 1, 2}
   246 	psrlw     mm6,BYTE_BIT		; mm6={0xFF 0x00 0xFF 0x00 ..}
       	; mm6 is used below as a per-word mask selecting the low byte of each
       	; 16-bit word (presumably BYTE_BIT == 8 -- confirm in jsimdext.inc).
   248 	mov	esi, JSAMPARRAY [input_data(ebp)]	; input_data
   249 	mov	edi, JSAMPARRAY [output_data(ebp)]	; output_data
   250 	alignx	16,7
   251 .rowloop:
   252 	push	ecx			; output_cols, consumed by the column loop
   253 	push	edi			; output row-pointer cursor
   254 	push	esi			; input row-pointer cursor
   256 	mov	edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]	; inptr0
   257 	mov	esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]	; inptr1
   258 	mov	edi, JSAMPROW [edi]			; outptr
   259 	alignx	16,7
   260 .columnloop:
       	; One iteration reads two MMWORDs from each of the two input rows and
       	; writes one MMWORD of output.
   262 	movq	mm0, MMWORD [edx+0*SIZEOF_MMWORD]	; row 0, first MMWORD
   263 	movq	mm1, MMWORD [esi+0*SIZEOF_MMWORD]	; row 1, first MMWORD
   264 	movq	mm2, MMWORD [edx+1*SIZEOF_MMWORD]	; row 0, second MMWORD
   265 	movq	mm3, MMWORD [esi+1*SIZEOF_MMWORD]	; row 1, second MMWORD
   267 	movq	mm4,mm0			; copies for extracting the odd samples
   268 	movq	mm5,mm1
   269 	pand	mm0,mm6			; even-offset samples of row 0 as words
   270 	psrlw	mm4,BYTE_BIT		; odd-offset samples of row 0 as words
   271 	pand	mm1,mm6			; even-offset samples of row 1
   272 	psrlw	mm5,BYTE_BIT		; odd-offset samples of row 1
   273 	paddw	mm0,mm4			; row 0 horizontal pair sums (first MMWORD)
   274 	paddw	mm1,mm5			; row 1 horizontal pair sums (first MMWORD)
   276 	movq	mm4,mm2			; repeat the split for the second MMWORDs
   277 	movq	mm5,mm3
   278 	pand	mm2,mm6
   279 	psrlw	mm4,BYTE_BIT
   280 	pand	mm3,mm6
   281 	psrlw	mm5,BYTE_BIT
   282 	paddw	mm2,mm4			; row 0 horizontal pair sums (second MMWORD)
   283 	paddw	mm3,mm5			; row 1 horizontal pair sums (second MMWORD)
   285 	paddw	mm0,mm1			; 2x2 sums for the first four output samples
   286 	paddw	mm2,mm3			; 2x2 sums for the next four output samples
   287 	paddw	mm0,mm7			; add alternating 1/2 bias
   288 	paddw	mm2,mm7
   289 	psrlw	mm0,2			; (a + b + c + d + bias) >> 2
   290 	psrlw	mm2,2
   292 	packuswb mm0,mm2		; narrow words to bytes: mm0 results low, mm2 high
   294 	movq	MMWORD [edi+0*SIZEOF_MMWORD], mm0	; store the output samples
   296 	add	edx, byte 2*SIZEOF_MMWORD	; inptr0
   297 	add	esi, byte 2*SIZEOF_MMWORD	; inptr1
   298 	add	edi, byte 1*SIZEOF_MMWORD	; outptr
   299 	sub	ecx, byte SIZEOF_MMWORD		; outcol
   300 	jnz	near .columnloop		; continue until the count reaches exactly 0
   302 	pop	esi			; restore input row-pointer cursor
   303 	pop	edi			; restore output row-pointer cursor
   304 	pop	ecx			; restore output_cols
   306 	add	esi, byte 2*SIZEOF_JSAMPROW	; input_data
   307 	add	edi, byte 1*SIZEOF_JSAMPROW	; output_data
   308 	dec	eax				; rowctr
   309 	jg	near .rowloop
   311 	emms		; empty MMX state
   313 .return:
   314 	pop	edi
   315 	pop	esi
   316 ;	pop	edx		; need not be preserved
   317 ;	pop	ecx		; need not be preserved
   318 ;	pop	ebx		; unused
   319 	pop	ebp
   320 	ret
   322 ; For some reason, the OS X linker does not honor the request to align the
   323 ; segment unless we do this.
   324 	align	16

mercurial