media/libjpeg/simd/jdsamss2-64.asm

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ;
     2 ; jdsamss2-64.asm - upsampling (64-bit SSE2)
     3 ;
     4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
     5 ; Copyright 2009 D. R. Commander
     6 ;
     7 ; Based on
     8 ; x86 SIMD extension for IJG JPEG library
     9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
    10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
    11 ;
    12 ; This file should be assembled with NASM (Netwide Assembler),
    13 ; can *not* be assembled with Microsoft's MASM or any compatible
    14 ; assembler (including Borland's Turbo Assembler).
    15 ; NASM is available from http://nasm.sourceforge.net/ or
    16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
    17 ;
    18 ; [TAB8]
    20 %include "jsimdext.inc"
    ; --------------------------------------------------------------------------
; Word constants shared by the two "fancy" (triangle filter) upsamplers below.
; Each entry is one XMMWORD holding eight identical 16-bit lanes.
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_fancy_upsample_sse2)

EXTN(jconst_fancy_upsample_sse2):

PW_ONE:         dw      1, 1, 1, 1, 1, 1, 1, 1  ; h2v1: rounding bias, even outputs
PW_TWO:         dw      2, 2, 2, 2, 2, 2, 2, 2  ; h2v1: rounding bias, odd outputs
PW_THREE:       dw      3, 3, 3, 3, 3, 3, 3, 3  ; weight of the nearer sample
PW_SEVEN:       dw      7, 7, 7, 7, 7, 7, 7, 7  ; h2v2: rounding bias, odd outputs
PW_EIGHT:       dw      8, 8, 8, 8, 8, 8, 8, 8  ; h2v2: rounding bias, even outputs

        alignz  16
    ; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Fancy processing for the common case of 2:1 horizontal and 1:1 vertical.
;
; The upsampling algorithm is linear interpolation between pixel centers,
; also known as a "triangle filter".  This is a good compromise between
; speed and visual quality.  The centers of the output pixels are 1/4 and 3/4
; of the way between input pixel centers.
;
; For every input sample s[i] two output samples are produced:
;   out[2i]   = (3*s[i] + s[i-1] + 1) >> 2
;   out[2i+1] = (3*s[i] + s[i+1] + 2) >> 2
; with s[-1] taken as s[0] and the sample past the end taken as the last
; sample of the final 16-sample block.
;
; GLOBAL(void)
; jsimd_h2v1_fancy_upsample_sse2 (int max_v_samp_factor,
;                                 JDIMENSION downsampled_width,
;                                 JSAMPARRAY input_data,
;                                 JSAMPARRAY * output_data_ptr);
;
; After collect_args:
; r10 = int max_v_samp_factor
; r11 = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY * output_data_ptr
;
; Notes:
;  * Both int and JDIMENSION are 32-bit; only the low halves of r10/r11 are
;    read, because the upper 32 bits of such arguments are not defined by
;    the calling convention.
;  * All row accesses use movdqa, so every input and output row must be
;    16-byte aligned.  Whole 16-sample blocks are read and whole 32-sample
;    blocks are written, and when the width is not a multiple of 16 one
;    dummy byte is stored into the input row just past the last sample.
;    NOTE(review): this presumes the caller pads the row buffers - confirm.

        align   16
        global  EXTN(jsimd_h2v1_fancy_upsample_sse2)

EXTN(jsimd_h2v1_fancy_upsample_sse2):
        push    rbp
        mov     rax, rsp        ; NOTE(review): looks unused here, but collect_args
                                ; (jsimdext.inc) may read rax - do not remove
        mov     rbp, rsp
        collect_args

        mov     eax, r11d       ; colctr (32-bit JDIMENSION, zero-extended)
        test    rax, rax
        jz      near .return

        movsxd  rcx, r10d       ; rowctr (32-bit int, sign-extended)
        test    rcx, rcx
        jz      near .return

        mov     rsi, r12                        ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]           ; output_data
.rowloop:
        push    rax                             ; colctr
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]             ; inptr
        mov     rdi, JSAMPROW [rdi]             ; outptr

        ; If the width is not a multiple of 16, replicate the last sample
        ; once so that the right-hand neighbour of the last pixel is defined.
        test    rax, SIZEOF_XMMWORD-1
        jz      short .skip
        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
.skip:
        pxor    xmm0, xmm0                      ; xmm0=(all 0's)
        pcmpeqb xmm7, xmm7
        psrldq  xmm7, (SIZEOF_XMMWORD-1)        ; mask selecting byte 0 only
        pand    xmm7, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm7=( 0 -- ... --)
                                                ; left neighbour of sample 0

        ; Round colctr up to a whole number of 16-sample blocks.
        add     rax, byte SIZEOF_XMMWORD-1
        and     rax, byte -SIZEOF_XMMWORD
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .columnloop

.columnloop_last:
        ; Last block: right neighbour of sample 15 is sample 15 itself.
        pcmpeqb xmm6, xmm6
        pslldq  xmm6, (SIZEOF_XMMWORD-1)        ; mask selecting byte 15 only
        pand    xmm6, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        jmp     short .upsample

.columnloop:
        ; Not the last block: right neighbour is sample 0 of the next block.
        movdqa  xmm6, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        pslldq  xmm6, (SIZEOF_XMMWORD-1)

.upsample:
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, xmm1
        movdqa  xmm3, xmm1              ; xmm1=( 0  1  2 ... 13 14 15)
        pslldq  xmm2, 1                 ; xmm2=(--  0  1 ... 12 13 14)
        psrldq  xmm3, 1                 ; xmm3=( 1  2  3 ... 14 15 --)

        por     xmm2, xmm7              ; xmm2=(-1  0  1 ... 12 13 14)
        por     xmm3, xmm6              ; xmm3=( 1  2  3 ... 14 15 16)

        ; Carry sample 15 over as the left neighbour for the next block.
        movdqa  xmm7, xmm1
        psrldq  xmm7, (SIZEOF_XMMWORD-1)        ; xmm7=(15 -- -- ... -- -- --)

        ; Widen bytes to words.
        movdqa    xmm4, xmm1
        punpcklbw xmm1, xmm0            ; xmm1=( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4, xmm0            ; xmm4=( 8  9 10 11 12 13 14 15)
        movdqa    xmm5, xmm2
        punpcklbw xmm2, xmm0            ; xmm2=(-1  0  1  2  3  4  5  6)
        punpckhbw xmm5, xmm0            ; xmm5=( 7  8  9 10 11 12 13 14)
        movdqa    xmm6, xmm3
        punpcklbw xmm3, xmm0            ; xmm3=( 1  2  3  4  5  6  7  8)
        punpckhbw xmm6, xmm0            ; xmm6=( 9 10 11 12 13 14 15 16)

        pmullw  xmm1, [rel PW_THREE]
        pmullw  xmm4, [rel PW_THREE]
        paddw   xmm2, [rel PW_ONE]
        paddw   xmm5, [rel PW_ONE]
        paddw   xmm3, [rel PW_TWO]
        paddw   xmm6, [rel PW_TWO]

        paddw   xmm2, xmm1
        paddw   xmm5, xmm4
        psrlw   xmm2, 2                 ; xmm2=OutLE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm5, 2                 ; xmm5=OutHE=(16 18 20 22 24 26 28 30)
        paddw   xmm3, xmm1
        paddw   xmm6, xmm4
        psrlw   xmm3, 2                 ; xmm3=OutLO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm6, 2                 ; xmm6=OutHO=(17 19 21 23 25 27 29 31)

        ; Interleave even/odd results: odd bytes go to the high half of
        ; each word.
        psllw   xmm3, BYTE_BIT
        psllw   xmm6, BYTE_BIT
        por     xmm2, xmm3              ; xmm2=OutL=( 0  1  2 ... 13 14 15)
        por     xmm5, xmm6              ; xmm5=OutH=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm5

        sub     rax, byte SIZEOF_XMMWORD
        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
        cmp     rax, byte SIZEOF_XMMWORD
        ja      near .columnloop
        test    rax, rax
        jnz     near .columnloop_last

        pop     rsi
        pop     rdi
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rcx                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
   ; --------------------------------------------------------------------------
;
; Fancy processing for the common case of 2:1 horizontal and 2:1 vertical.
; Again a triangle filter; see comments for h2v1 case, above.
;
; Each input row produces two output rows.  Work is done in two passes per
; 16-sample block:
;   1. vertical:   Int0 = 3*row[0] + row[-1]   (upper output row)
;                  Int1 = 3*row[0] + row[+1]   (lower output row)
;      The 16-bit intermediates are parked in the output rows themselves.
;   2. horizontal: out[2i]   = (3*Int[i] + Int[i-1] + 8) >> 4
;                  out[2i+1] = (3*Int[i] + Int[i+1] + 7) >> 4
;      with the edge neighbours replicated from the first / last column.
;
; GLOBAL(void)
; jsimd_h2v2_fancy_upsample_sse2 (int max_v_samp_factor,
;                                 JDIMENSION downsampled_width,
;                                 JSAMPARRAY input_data,
;                                 JSAMPARRAY * output_data_ptr);
;
; After collect_args:
; r10 = int max_v_samp_factor
; r11 = JDIMENSION downsampled_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY * output_data_ptr
;
; Notes:
;  * Both int and JDIMENSION are 32-bit; only the low halves of r10/r11 are
;    read, because the upper 32 bits of such arguments are not defined by
;    the calling convention.
;  * input_data[-1] and input_data[+1] are dereferenced for every row, and
;    one dummy byte may be stored past the end of each of the three input
;    rows.  All row accesses use movdqa (16-byte alignment required).
;    NOTE(review): presumes the caller supplies context rows and padded,
;    aligned buffers - confirm against the caller.
;
; wk[] is a 16-byte-aligned scratch area just below the aligned rbp:
;   wk(0)/wk(1) = left-neighbour column  for the upper / lower row
;   wk(2)/wk(3) = right-neighbour column for the upper / lower row

%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          4

        align   16
        global  EXTN(jsimd_h2v2_fancy_upsample_sse2)

EXTN(jsimd_h2v2_fancy_upsample_sse2):
        push    rbp
        mov     rax, rsp                        ; rax = original rsp (after push)
        sub     rsp, byte 4
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp], rax                      ; remember original rsp
        mov     rbp, rsp                        ; rbp = aligned frame base
        lea     rsp, [wk(0)]                    ; reserve wk[0..WK_NUM-1]
        collect_args
        push    rbx

        mov     eax, r11d       ; colctr (32-bit JDIMENSION, zero-extended)
        test    rax, rax
        jz      near .return

        movsxd  rcx, r10d       ; rowctr (32-bit int, sign-extended)
        test    rcx, rcx
        jz      near .return

        mov     rsi, r12                        ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]           ; output_data
.rowloop:
        push    rax                                     ; colctr
        push    rcx
        push    rdi
        push    rsi

        mov     rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]   ; inptr1(above)
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1(below)
        mov     rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1

        ; If the width is not a multiple of 16, replicate the last sample of
        ; each of the three input rows once.  rdx (outptr0) is saved because
        ; dl is used as the byte scratch register.
        test    rax, SIZEOF_XMMWORD-1
        jz      short .skip
        push    rdx
        mov     dl, JSAMPLE [rcx+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rcx+rax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [rbx+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rbx+rax*SIZEOF_JSAMPLE], dl
        mov     dl, JSAMPLE [rsi+(rax-1)*SIZEOF_JSAMPLE]
        mov     JSAMPLE [rsi+rax*SIZEOF_JSAMPLE], dl    ; insert a dummy sample
        pop     rdx
.skip:
        ; -- process the first column block

        movdqa  xmm0, XMMWORD [rbx+0*SIZEOF_XMMWORD]    ; xmm0=row[ 0][0]
        movdqa  xmm1, XMMWORD [rcx+0*SIZEOF_XMMWORD]    ; xmm1=row[-1][0]
        movdqa  xmm2, XMMWORD [rsi+0*SIZEOF_XMMWORD]    ; xmm2=row[+1][0]

        pxor      xmm3, xmm3            ; xmm3=(all 0's)
        movdqa    xmm4, xmm0
        punpcklbw xmm0, xmm3            ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4, xmm3            ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
        movdqa    xmm5, xmm1
        punpcklbw xmm1, xmm3            ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm5, xmm3            ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
        movdqa    xmm6, xmm2
        punpcklbw xmm2, xmm3            ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm6, xmm3            ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

        pmullw  xmm0, [rel PW_THREE]
        pmullw  xmm4, [rel PW_THREE]

        pcmpeqb xmm7, xmm7
        psrldq  xmm7, (SIZEOF_XMMWORD-2)        ; mask selecting word 0 only

        paddw   xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
        paddw   xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
        paddw   xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
        paddw   xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1    ; temporarily save
        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5    ; the intermediate data
        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm6

        ; Left neighbour of column 0 is column 0 itself.
        pand    xmm1, xmm7              ; xmm1=( 0 -- -- -- -- -- -- --)
        pand    xmm2, xmm7              ; xmm2=( 0 -- -- -- -- -- -- --)

        movdqa  XMMWORD [wk(0)], xmm1
        movdqa  XMMWORD [wk(1)], xmm2

        ; Round colctr up to a whole number of 16-sample blocks.
        add     rax, byte SIZEOF_XMMWORD-1
        and     rax, byte -SIZEOF_XMMWORD
        cmp     rax, byte SIZEOF_XMMWORD
        ja      short .columnloop

.columnloop_last:
        ; -- process the last column block
        ; Right neighbour of column 15 is column 15 itself.

        pcmpeqb xmm1, xmm1
        pslldq  xmm1, (SIZEOF_XMMWORD-2)        ; mask selecting word 7 only
        movdqa  xmm2, xmm1

        pand    xmm1, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        pand    xmm2, XMMWORD [rdi+1*SIZEOF_XMMWORD]

        movdqa  XMMWORD [wk(2)], xmm1   ; xmm1=(-- -- -- -- -- -- -- 15)
        movdqa  XMMWORD [wk(3)], xmm2   ; xmm2=(-- -- -- -- -- -- -- 15)

        jmp     near .upsample

.columnloop:
        ; -- process the next column block
        ; Compute the vertical intermediates of the NEXT block now, so that
        ; its column 0 can serve as right neighbour of the current block.

        movdqa  xmm0, XMMWORD [rbx+1*SIZEOF_XMMWORD]    ; xmm0=row[ 0][1]
        movdqa  xmm1, XMMWORD [rcx+1*SIZEOF_XMMWORD]    ; xmm1=row[-1][1]
        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]    ; xmm2=row[+1][1]

        pxor      xmm3, xmm3            ; xmm3=(all 0's)
        movdqa    xmm4, xmm0
        punpcklbw xmm0, xmm3            ; xmm0=row[ 0]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm4, xmm3            ; xmm4=row[ 0]( 8  9 10 11 12 13 14 15)
        movdqa    xmm5, xmm1
        punpcklbw xmm1, xmm3            ; xmm1=row[-1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm5, xmm3            ; xmm5=row[-1]( 8  9 10 11 12 13 14 15)
        movdqa    xmm6, xmm2
        punpcklbw xmm2, xmm3            ; xmm2=row[+1]( 0  1  2  3  4  5  6  7)
        punpckhbw xmm6, xmm3            ; xmm6=row[+1]( 8  9 10 11 12 13 14 15)

        pmullw  xmm0, [rel PW_THREE]
        pmullw  xmm4, [rel PW_THREE]

        paddw   xmm1, xmm0              ; xmm1=Int0L=( 0  1  2  3  4  5  6  7)
        paddw   xmm5, xmm4              ; xmm5=Int0H=( 8  9 10 11 12 13 14 15)
        paddw   xmm2, xmm0              ; xmm2=Int1L=( 0  1  2  3  4  5  6  7)
        paddw   xmm6, xmm4              ; xmm6=Int1H=( 8  9 10 11 12 13 14 15)

        movdqa  XMMWORD [rdx+2*SIZEOF_XMMWORD], xmm1    ; temporarily save
        movdqa  XMMWORD [rdx+3*SIZEOF_XMMWORD], xmm5    ; the intermediate data
        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm6

        pslldq  xmm1, (SIZEOF_XMMWORD-2)        ; xmm1=(-- -- -- -- -- -- --  0)
        pslldq  xmm2, (SIZEOF_XMMWORD-2)        ; xmm2=(-- -- -- -- -- -- --  0)

        movdqa  XMMWORD [wk(2)], xmm1
        movdqa  XMMWORD [wk(3)], xmm2

.upsample:
        ; -- process the upper row

        movdqa  xmm7, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [rdx+1*SIZEOF_XMMWORD]

        movdqa  xmm0, xmm7              ; xmm7=Int0L=( 0  1  2  3  4  5  6  7)
        movdqa  xmm4, xmm3              ; xmm3=Int0H=( 8  9 10 11 12 13 14 15)
        psrldq  xmm0, 2                 ; xmm0=( 1  2  3  4  5  6  7 --)
        pslldq  xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(-- -- -- -- -- -- --  8)
        movdqa  xmm5, xmm7
        movdqa  xmm6, xmm3
        psrldq  xmm5, (SIZEOF_XMMWORD-2) ; xmm5=( 7 -- -- -- -- -- -- --)
        pslldq  xmm6, 2                 ; xmm6=(--  8  9 10 11 12 13 14)

        por     xmm0, xmm4              ; xmm0=( 1  2  3  4  5  6  7  8)
        por     xmm5, xmm6              ; xmm5=( 7  8  9 10 11 12 13 14)

        movdqa  xmm1, xmm7
        movdqa  xmm2, xmm3
        pslldq  xmm1, 2                 ; xmm1=(--  0  1  2  3  4  5  6)
        psrldq  xmm2, 2                 ; xmm2=( 9 10 11 12 13 14 15 --)
        movdqa  xmm4, xmm3
        psrldq  xmm4, (SIZEOF_XMMWORD-2) ; xmm4=(15 -- -- -- -- -- -- --)

        por     xmm1, XMMWORD [wk(0)]   ; xmm1=(-1  0  1  2  3  4  5  6)
        por     xmm2, XMMWORD [wk(2)]   ; xmm2=( 9 10 11 12 13 14 15 16)

        movdqa  XMMWORD [wk(0)], xmm4   ; left neighbour for the next block

        pmullw  xmm7, [rel PW_THREE]
        pmullw  xmm3, [rel PW_THREE]
        paddw   xmm1, [rel PW_EIGHT]
        paddw   xmm5, [rel PW_EIGHT]
        paddw   xmm0, [rel PW_SEVEN]
        paddw   xmm2, [rel PW_SEVEN]

        paddw   xmm1, xmm7
        paddw   xmm5, xmm3
        psrlw   xmm1, 4                 ; xmm1=Out0LE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm5, 4                 ; xmm5=Out0HE=(16 18 20 22 24 26 28 30)
        paddw   xmm0, xmm7
        paddw   xmm2, xmm3
        psrlw   xmm0, 4                 ; xmm0=Out0LO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm2, 4                 ; xmm2=Out0HO=(17 19 21 23 25 27 29 31)

        psllw   xmm0, BYTE_BIT
        psllw   xmm2, BYTE_BIT
        por     xmm1, xmm0              ; xmm1=Out0L=( 0  1  2 ... 13 14 15)
        por     xmm5, xmm2              ; xmm5=Out0H=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdx+0*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdx+1*SIZEOF_XMMWORD], xmm5

        ; -- process the lower row

        movdqa  xmm6, XMMWORD [rdi+0*SIZEOF_XMMWORD]
        movdqa  xmm4, XMMWORD [rdi+1*SIZEOF_XMMWORD]

        movdqa  xmm7, xmm6              ; xmm6=Int1L=( 0  1  2  3  4  5  6  7)
        movdqa  xmm3, xmm4              ; xmm4=Int1H=( 8  9 10 11 12 13 14 15)
        psrldq  xmm7, 2                 ; xmm7=( 1  2  3  4  5  6  7 --)
        pslldq  xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(-- -- -- -- -- -- --  8)
        movdqa  xmm0, xmm6
        movdqa  xmm2, xmm4
        psrldq  xmm0, (SIZEOF_XMMWORD-2) ; xmm0=( 7 -- -- -- -- -- -- --)
        pslldq  xmm2, 2                 ; xmm2=(--  8  9 10 11 12 13 14)

        por     xmm7, xmm3              ; xmm7=( 1  2  3  4  5  6  7  8)
        por     xmm0, xmm2              ; xmm0=( 7  8  9 10 11 12 13 14)

        movdqa  xmm1, xmm6
        movdqa  xmm5, xmm4
        pslldq  xmm1, 2                 ; xmm1=(--  0  1  2  3  4  5  6)
        psrldq  xmm5, 2                 ; xmm5=( 9 10 11 12 13 14 15 --)
        movdqa  xmm3, xmm4
        psrldq  xmm3, (SIZEOF_XMMWORD-2) ; xmm3=(15 -- -- -- -- -- -- --)

        por     xmm1, XMMWORD [wk(1)]   ; xmm1=(-1  0  1  2  3  4  5  6)
        por     xmm5, XMMWORD [wk(3)]   ; xmm5=( 9 10 11 12 13 14 15 16)

        movdqa  XMMWORD [wk(1)], xmm3   ; left neighbour for the next block

        pmullw  xmm6, [rel PW_THREE]
        pmullw  xmm4, [rel PW_THREE]
        paddw   xmm1, [rel PW_EIGHT]
        paddw   xmm0, [rel PW_EIGHT]
        paddw   xmm7, [rel PW_SEVEN]
        paddw   xmm5, [rel PW_SEVEN]

        paddw   xmm1, xmm6
        paddw   xmm0, xmm4
        psrlw   xmm1, 4                 ; xmm1=Out1LE=( 0  2  4  6  8 10 12 14)
        psrlw   xmm0, 4                 ; xmm0=Out1HE=(16 18 20 22 24 26 28 30)
        paddw   xmm7, xmm6
        paddw   xmm5, xmm4
        psrlw   xmm7, 4                 ; xmm7=Out1LO=( 1  3  5  7  9 11 13 15)
        psrlw   xmm5, 4                 ; xmm5=Out1HO=(17 19 21 23 25 27 29 31)

        psllw   xmm7, BYTE_BIT
        psllw   xmm5, BYTE_BIT
        por     xmm1, xmm7              ; xmm1=Out1L=( 0  1  2 ... 13 14 15)
        por     xmm0, xmm5              ; xmm0=Out1H=(16 17 18 ... 29 30 31)

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm0

        sub     rax, byte SIZEOF_XMMWORD
        add     rcx, byte 1*SIZEOF_XMMWORD      ; inptr1(above)
        add     rbx, byte 1*SIZEOF_XMMWORD      ; inptr0
        add     rsi, byte 1*SIZEOF_XMMWORD      ; inptr1(below)
        add     rdx, byte 2*SIZEOF_XMMWORD      ; outptr0
        add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr1
        cmp     rax, byte SIZEOF_XMMWORD
        ja      near .columnloop
        test    rax, rax
        jnz     near .columnloop_last

        pop     rsi
        pop     rdi
        pop     rcx
        pop     rax

        ; One input row yields two output rows.
        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     rcx, byte 2                     ; rowctr
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        mov     rsp, rbp                ; rsp <- aligned rbp
        pop     rsp                     ; rsp <- original rsp (saved in prologue)
        pop     rbp
        ret
   ; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 1:1 vertical.
; It's still a box filter: every input sample is simply written twice.
;
; GLOBAL(void)
; jsimd_h2v1_upsample_sse2 (int max_v_samp_factor,
;                           JDIMENSION output_width,
;                           JSAMPARRAY input_data,
;                           JSAMPARRAY * output_data_ptr);
;
; After collect_args:
; r10 = int max_v_samp_factor
; r11 = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY * output_data_ptr
;
; Notes:
;  * Both int and JDIMENSION are 32-bit; only the low halves of r10/r11 are
;    read, because the upper 32 bits of such arguments are not defined by
;    the calling convention.
;  * output_width is rounded up to a multiple of 32 and that many bytes are
;    written per row with movdqa (16-byte alignment required).
;    NOTE(review): presumes the caller pads and aligns the row buffers -
;    confirm.

        align   16
        global  EXTN(jsimd_h2v1_upsample_sse2)

EXTN(jsimd_h2v1_upsample_sse2):
        push    rbp
        mov     rax, rsp        ; NOTE(review): looks unused here, but collect_args
                                ; (jsimdext.inc) may read rax - do not remove
        mov     rbp, rsp
        collect_args

        mov     edx, r11d       ; output_width (32-bit JDIMENSION, zero-extended)
        add     rdx, byte (2*SIZEOF_XMMWORD)-1
        and     rdx, byte -(2*SIZEOF_XMMWORD)   ; round up to 32 output samples
        jz      near .return

        movsxd  rcx, r10d       ; rowctr (32-bit int, sign-extended)
        test    rcx, rcx
        jz      short .return

        mov     rsi, r12                        ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]           ; output_data
.rowloop:
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]             ; inptr
        mov     rdi, JSAMPROW [rdi]             ; outptr
        mov     rax, rdx                        ; colctr (output samples left)
.columnloop:
        ; Loop body is unrolled twice: 16 input -> 32 output samples each.

        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        movdqa    xmm1, xmm0
        punpcklbw xmm0, xmm0                    ; duplicate samples 0..7
        punpckhbw xmm1, xmm1                    ; duplicate samples 8..15

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

        movdqa    xmm3, xmm2
        punpcklbw xmm2, xmm2
        punpckhbw xmm3, xmm3

        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr
        jmp     short .columnloop

.nextrow:
        pop     rsi
        pop     rdi

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rcx                             ; rowctr
        jg      short .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
   ; --------------------------------------------------------------------------
;
; Fast processing for the common case of 2:1 horizontal and 2:1 vertical.
; It's still a box filter: every input sample is written twice horizontally,
; and the resulting row is stored into two consecutive output rows.
;
; GLOBAL(void)
; jsimd_h2v2_upsample_sse2 (int max_v_samp_factor,
;                           JDIMENSION output_width,
;                           JSAMPARRAY input_data,
;                           JSAMPARRAY * output_data_ptr);
;
; After collect_args:
; r10 = int max_v_samp_factor
; r11 = JDIMENSION output_width
; r12 = JSAMPARRAY input_data
; r13 = JSAMPARRAY * output_data_ptr
;
; Notes:
;  * Both int and JDIMENSION are 32-bit; only the low halves of r10/r11 are
;    read, because the upper 32 bits of such arguments are not defined by
;    the calling convention.
;  * output_width is rounded up to a multiple of 32 and that many bytes are
;    written to each output row with movdqa (16-byte alignment required).
;    NOTE(review): presumes the caller pads and aligns the row buffers -
;    confirm.

        align   16
        global  EXTN(jsimd_h2v2_upsample_sse2)

EXTN(jsimd_h2v2_upsample_sse2):
        push    rbp
        mov     rax, rsp        ; NOTE(review): looks unused here, but collect_args
                                ; (jsimdext.inc) may read rax - do not remove
        mov     rbp, rsp
        collect_args
        push    rbx

        mov     edx, r11d       ; output_width (32-bit JDIMENSION, zero-extended)
        add     rdx, byte (2*SIZEOF_XMMWORD)-1
        and     rdx, byte -(2*SIZEOF_XMMWORD)   ; round up to 32 output samples
        jz      near .return

        movsxd  rcx, r10d       ; rowctr (32-bit int, sign-extended)
        test    rcx, rcx
        jz      near .return

        mov     rsi, r12                        ; input_data
        mov     rdi, r13
        mov     rdi, JSAMPARRAY [rdi]           ; output_data
.rowloop:
        push    rdi
        push    rsi

        mov     rsi, JSAMPROW [rsi]                     ; inptr
        mov     rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]   ; outptr0
        mov     rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]   ; outptr1
        mov     rax, rdx                                ; colctr
.columnloop:
        ; Loop body is unrolled twice: 16 input -> 2 x 32 output samples each.

        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]

        movdqa    xmm1, xmm0
        punpcklbw xmm0, xmm0                    ; duplicate samples 0..7
        punpckhbw xmm1, xmm1                    ; duplicate samples 8..15

        movdqa  XMMWORD [rbx+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [rbx+1*SIZEOF_XMMWORD], xmm1
        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
        movdqa  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmm1

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        movdqa  xmm2, XMMWORD [rsi+1*SIZEOF_XMMWORD]

        movdqa    xmm3, xmm2
        punpcklbw xmm2, xmm2
        punpckhbw xmm3, xmm3

        movdqa  XMMWORD [rbx+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rbx+3*SIZEOF_XMMWORD], xmm3
        movdqa  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmm2
        movdqa  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmm3

        sub     rax, byte 2*SIZEOF_XMMWORD
        jz      short .nextrow

        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rbx, byte 4*SIZEOF_XMMWORD      ; outptr0
        add     rdi, byte 4*SIZEOF_XMMWORD      ; outptr1
        jmp     short .columnloop

.nextrow:
        pop     rsi
        pop     rdi

        ; One input row yields two output rows.
        add     rsi, byte 1*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 2*SIZEOF_JSAMPROW     ; output_data
        sub     rcx, byte 2                     ; rowctr
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16

mercurial