media/libjpeg/simd/jcsamss2-64.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 ;
     2 ; jcsamss2-64.asm - downsampling (64-bit SSE2)
     3 ;
     4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
     5 ; Copyright 2009 D. R. Commander
     6 ;
     7 ; Based on
     8 ; x86 SIMD extension for IJG JPEG library
     9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
    10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
    11 ;
    12 ; This file should be assembled with NASM (Netwide Assembler),
    13 ; can *not* be assembled with Microsoft's MASM or any compatible
    14 ; assembler (including Borland's Turbo Assembler).
    15 ; NASM is available from http://nasm.sourceforge.net/ or
    16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
    17 ;
    18 ; [TAB8]
    20 %include "jsimdext.inc"
    22 ; --------------------------------------------------------------------------
    23 	SECTION	SEG_TEXT
    24 	BITS	64
    25 ;
    26 ; Downsample pixel values of a single component.
    27 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
    28 ; without smoothing.
    29 ;
    30 ; GLOBAL(void)
    31 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
    32 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
    33 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
    34 ;
    36 ; r10 = JDIMENSION image_width
    37 ; r11 = int max_v_samp_factor
    38 ; r12 = JDIMENSION v_samp_factor
    39 ; r13 = JDIMENSION width_blocks
    40 ; r14 = JSAMPARRAY input_data
    41 ; r15 = JSAMPARRAY output_data
    43 	align	16
    44 	global	EXTN(jsimd_h2v1_downsample_sse2)
    46 EXTN(jsimd_h2v1_downsample_sse2):
    47 	push	rbp
    48 	mov	rax,rsp
    49 	mov	rbp,rsp
    50 	collect_args
    52 	mov rcx, r13
    53 	shl	rcx,3			; imul rcx,DCTSIZE (rcx = output_cols)
    54 	jz	near .return
    56 	mov rdx, r10
    58 	; -- expand_right_edge
    60 	push	rcx
    61 	shl	rcx,1				; output_cols * 2
    62 	sub	rcx,rdx
    63 	jle	short .expand_end
    65 	mov	rax, r11
    66 	test	rax,rax
    67 	jle	short .expand_end
    69 	cld
    70 	mov	rsi, r14	; input_data
    71 .expandloop:
    72 	push	rax
    73 	push	rcx
    75 	mov	rdi, JSAMPROW [rsi]
    76 	add	rdi,rdx
    77 	mov	al, JSAMPLE [rdi-1]
    79 	rep stosb
    81 	pop	rcx
    82 	pop	rax
    84 	add	rsi, byte SIZEOF_JSAMPROW
    85 	dec	rax
    86 	jg	short .expandloop
    88 .expand_end:
    89 	pop	rcx				; output_cols
    91 	; -- h2v1_downsample
    93 	mov	rax, r12	; rowctr
    94 	test	eax,eax
    95 	jle	near .return
    97 	mov	rdx, 0x00010000		; bias pattern
    98 	movd	xmm7,edx
    99 	pcmpeqw	xmm6,xmm6
   100 	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
   101 	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
   103 	mov	rsi, r14	; input_data
   104 	mov	rdi, r15	; output_data
   105 .rowloop:
   106 	push	rcx
   107 	push	rdi
   108 	push	rsi
   110 	mov	rsi, JSAMPROW [rsi]		; inptr
   111 	mov rdi, JSAMPROW [rdi]		; outptr
   113 	cmp	rcx, byte SIZEOF_XMMWORD
   114 	jae	short .columnloop
   116 .columnloop_r8:
   117 	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
   118 	pxor	xmm1,xmm1
   119 	mov	rcx, SIZEOF_XMMWORD
   120 	jmp	short .downsample
   122 .columnloop:
   123 	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
   124 	movdqa	xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
   126 .downsample:
   127 	movdqa	xmm2,xmm0
   128 	movdqa	xmm3,xmm1
   130 	pand	xmm0,xmm6
   131 	psrlw	xmm2,BYTE_BIT
   132 	pand	xmm1,xmm6
   133 	psrlw	xmm3,BYTE_BIT
   135 	paddw	xmm0,xmm2
   136 	paddw	xmm1,xmm3
   137 	paddw	xmm0,xmm7
   138 	paddw	xmm1,xmm7
   139 	psrlw	xmm0,1
   140 	psrlw	xmm1,1
   142 	packuswb xmm0,xmm1
   144 	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
   146 	sub	rcx, byte SIZEOF_XMMWORD	; outcol
   147 	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
   148 	add	rdi, byte 1*SIZEOF_XMMWORD	; outptr
   149 	cmp	rcx, byte SIZEOF_XMMWORD
   150 	jae	short .columnloop
   151 	test	rcx,rcx
   152 	jnz	short .columnloop_r8
   154 	pop	rsi
   155 	pop	rdi
   156 	pop	rcx
   158 	add	rsi, byte SIZEOF_JSAMPROW	; input_data
   159 	add	rdi, byte SIZEOF_JSAMPROW	; output_data
   160 	dec	rax				; rowctr
   161 	jg	near .rowloop
   163 .return:
   164 	uncollect_args
   165 	pop	rbp
   166 	ret
   168 ; --------------------------------------------------------------------------
   169 ;
   170 ; Downsample pixel values of a single component.
   171 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
   172 ; without smoothing.
   173 ;
   174 ; GLOBAL(void)
   175 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
   176 ;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
   177 ;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
   178 ;
   180 ; r10 = JDIMENSION image_width
   181 ; r11 = int max_v_samp_factor
   182 ; r12 = JDIMENSION v_samp_factor
   183 ; r13 = JDIMENSION width_blocks
   184 ; r14 = JSAMPARRAY input_data
   185 ; r15 = JSAMPARRAY output_data
   187 	align	16
   188 	global	EXTN(jsimd_h2v2_downsample_sse2)
   190 EXTN(jsimd_h2v2_downsample_sse2):
   191 	push	rbp
   192 	mov	rax,rsp
   193 	mov	rbp,rsp
   194 	collect_args
   196 	mov	rcx, r13
   197 	shl	rcx,3			; imul rcx,DCTSIZE (rcx = output_cols)
   198 	jz	near .return
   200 	mov	rdx, r10
   202 	; -- expand_right_edge
   204 	push	rcx
   205 	shl	rcx,1				; output_cols * 2
   206 	sub	rcx,rdx
   207 	jle	short .expand_end
   209 	mov	rax, r11
   210 	test	rax,rax
   211 	jle	short .expand_end
   213 	cld
   214 	mov	rsi, r14	; input_data
   215 .expandloop:
   216 	push	rax
   217 	push	rcx
   219 	mov	rdi, JSAMPROW [rsi]
   220 	add	rdi,rdx
   221 	mov	al, JSAMPLE [rdi-1]
   223 	rep stosb
   225 	pop	rcx
   226 	pop	rax
   228 	add	rsi, byte SIZEOF_JSAMPROW
   229 	dec	rax
   230 	jg	short .expandloop
   232 .expand_end:
   233 	pop	rcx				; output_cols
   235 	; -- h2v2_downsample
   237 	mov	rax, r12	; rowctr
   238 	test	rax,rax
   239 	jle	near .return
   241 	mov	rdx, 0x00020001		; bias pattern
   242 	movd	xmm7,edx
   243 	pcmpeqw	xmm6,xmm6
   244 	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
   245 	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}
   247 	mov	rsi, r14	; input_data
   248 	mov	rdi, r15	; output_data
   249 .rowloop:
   250 	push	rcx
   251 	push	rdi
   252 	push	rsi
   254 	mov	rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; inptr0
   255 	mov	rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; inptr1
   256 	mov	rdi, JSAMPROW [rdi]			; outptr
   258 	cmp	rcx, byte SIZEOF_XMMWORD
   259 	jae	short .columnloop
   261 .columnloop_r8:
   262 	movdqa	xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
   263 	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
   264 	pxor	xmm2,xmm2
   265 	pxor	xmm3,xmm3
   266 	mov	rcx, SIZEOF_XMMWORD
   267 	jmp	short .downsample
   269 .columnloop:
   270 	movdqa	xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
   271 	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
   272 	movdqa	xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
   273 	movdqa	xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
   275 .downsample:
   276 	movdqa	xmm4,xmm0
   277 	movdqa	xmm5,xmm1
   278 	pand	xmm0,xmm6
   279 	psrlw	xmm4,BYTE_BIT
   280 	pand	xmm1,xmm6
   281 	psrlw	xmm5,BYTE_BIT
   282 	paddw	xmm0,xmm4
   283 	paddw	xmm1,xmm5
   285 	movdqa	xmm4,xmm2
   286 	movdqa	xmm5,xmm3
   287 	pand	xmm2,xmm6
   288 	psrlw	xmm4,BYTE_BIT
   289 	pand	xmm3,xmm6
   290 	psrlw	xmm5,BYTE_BIT
   291 	paddw	xmm2,xmm4
   292 	paddw	xmm3,xmm5
   294 	paddw	xmm0,xmm1
   295 	paddw	xmm2,xmm3
   296 	paddw	xmm0,xmm7
   297 	paddw	xmm2,xmm7
   298 	psrlw	xmm0,2
   299 	psrlw	xmm2,2
   301 	packuswb xmm0,xmm2
   303 	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
   305 	sub	rcx, byte SIZEOF_XMMWORD	; outcol
   306 	add	rdx, byte 2*SIZEOF_XMMWORD	; inptr0
   307 	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr1
   308 	add	rdi, byte 1*SIZEOF_XMMWORD	; outptr
   309 	cmp	rcx, byte SIZEOF_XMMWORD
   310 	jae	near .columnloop
   311 	test	rcx,rcx
   312 	jnz	near .columnloop_r8
   314 	pop	rsi
   315 	pop	rdi
   316 	pop	rcx
   318 	add	rsi, byte 2*SIZEOF_JSAMPROW	; input_data
   319 	add	rdi, byte 1*SIZEOF_JSAMPROW	; output_data
   320 	dec	rax				; rowctr
   321 	jg	near .rowloop
   323 .return:
   324 	uncollect_args
   325 	pop	rbp
   326 	ret
   328 ; For some reason, the OS X linker does not honor the request to align the
   329 ; segment unless we do this.
   330 	align	16

mercurial