;
; jcsamss2-64.asm - downsampling (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman for Cendio AB
; Copyright 2009 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
	SECTION	SEG_TEXT
	BITS	64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;

; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

	align	16
	global	EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
	push	rbp
	mov	rax,rsp
	mov	rbp,rsp
	collect_args

	mov	rcx, r13
	shl	rcx,3			; imul rcx,DCTSIZE (rcx = output_cols)
	jz	near .return

	mov	rdx, r10

	; -- expand_right_edge

	push	rcx
	shl	rcx,1			; output_cols * 2
	sub	rcx,rdx
	jle	short .expand_end

	mov	rax, r11
	test	rax,rax
	jle	short .expand_end

	cld
	mov	rsi, r14		; input_data
.expandloop:
	push	rax
	push	rcx

	mov	rdi, JSAMPROW [rsi]
	add	rdi,rdx
	mov	al, JSAMPLE [rdi-1]

	rep stosb

	pop	rcx
	pop	rax

	add	rsi, byte SIZEOF_JSAMPROW
	dec	rax
	jg	short .expandloop

.expand_end:
	pop	rcx			; output_cols

	; -- h2v1_downsample

	mov	rax, r12		; rowctr
	test	eax,eax
	jle	near .return

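	; Each pair of adjacent input samples is averaged into one output
	; sample:  out = (s[2*i] + s[2*i+1] + bias) >> 1.  The word mask in
	; xmm6 (0x00FF per word) extracts the even-indexed samples and a
	; psrlw by BYTE_BIT extracts the odd-indexed ones; xmm7 supplies a
	; bias that alternates 0,1 across output columns so the rounding
	; error is spread rather than always rounding the same direction
	; (matching the unsmoothed h2v1 downsampler in jcsample.c).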
	mov	rdx, 0x00010000		; bias pattern
	movd	xmm7,edx
	pcmpeqw	xmm6,xmm6
	pshufd	xmm7,xmm7,0x00		; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}

	mov	rsi, r14		; input_data
	mov	rdi, r15		; output_data
.rowloop:
	push	rcx
	push	rdi
	push	rsi

	mov	rsi, JSAMPROW [rsi]	; inptr
	mov	rdi, JSAMPROW [rdi]	; outptr

	cmp	rcx, byte SIZEOF_XMMWORD
	jae	short .columnloop

.columnloop_r8:
	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
	pxor	xmm1,xmm1
	mov	rcx, SIZEOF_XMMWORD
	jmp	short .downsample

.columnloop:
	movdqa	xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
	movdqa	xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
	movdqa	xmm2,xmm0
	movdqa	xmm3,xmm1

	pand	xmm0,xmm6
	psrlw	xmm2,BYTE_BIT
	pand	xmm1,xmm6
	psrlw	xmm3,BYTE_BIT

	paddw	xmm0,xmm2
	paddw	xmm1,xmm3
	paddw	xmm0,xmm7
	paddw	xmm1,xmm7
	psrlw	xmm0,1
	psrlw	xmm1,1

	packuswb xmm0,xmm1

	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

	sub	rcx, byte SIZEOF_XMMWORD	; outcol
	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr
	add	rdi, byte 1*SIZEOF_XMMWORD	; outptr
	cmp	rcx, byte SIZEOF_XMMWORD
	jae	short .columnloop
	test	rcx,rcx
	jnz	short .columnloop_r8

	pop	rsi
	pop	rdi
	pop	rcx

	add	rsi, byte SIZEOF_JSAMPROW	; input_data
	add	rdi, byte SIZEOF_JSAMPROW	; output_data
	dec	rax				; rowctr
	jg	near .rowloop

.return:
	uncollect_args
	pop	rbp
	ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;

; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

	align	16
	global	EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
	push	rbp
	mov	rax,rsp
	mov	rbp,rsp
	collect_args

	mov	rcx, r13
	shl	rcx,3			; imul rcx,DCTSIZE (rcx = output_cols)
	jz	near .return

	mov	rdx, r10

	; -- expand_right_edge

	push	rcx
	shl	rcx,1			; output_cols * 2
	sub	rcx,rdx
	jle	short .expand_end

	mov	rax, r11
	test	rax,rax
	jle	short .expand_end

	cld
	mov	rsi, r14		; input_data
.expandloop:
	push	rax
	push	rcx

	mov	rdi, JSAMPROW [rsi]
	add	rdi,rdx
	mov	al, JSAMPLE [rdi-1]

	rep stosb

	pop	rcx
	pop	rax

	add	rsi, byte SIZEOF_JSAMPROW
	dec	rax
	jg	short .expandloop

.expand_end:
	pop	rcx			; output_cols

	; -- h2v2_downsample

	mov	rax, r12		; rowctr
	test	rax,rax
	jle	near .return

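	; Each 2x2 block of input samples is averaged into one output sample:
	;   out = (p00 + p01 + p10 + p11 + bias) >> 2.
	; The even- and odd-indexed samples of each row are summed with
	; pand/psrlw as in the h2v1 case, the two input rows are then added
	; together, and xmm7 supplies a bias that alternates 1,2 across
	; output columns to distribute the rounding error (matching the
	; unsmoothed h2v2 downsampler in jcsample.c).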
	mov	rdx, 0x00020001		; bias pattern
	movd	xmm7,edx
	pcmpeqw	xmm6,xmm6
	pshufd	xmm7,xmm7,0x00		; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
	psrlw	xmm6,BYTE_BIT		; xmm6={0xFF 0x00 0xFF 0x00 ..}

	mov	rsi, r14		; input_data
	mov	rdi, r15		; output_data
.rowloop:
	push	rcx
	push	rdi
	push	rsi

	mov	rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]	; inptr0
	mov	rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]	; inptr1
	mov	rdi, JSAMPROW [rdi]			; outptr

	cmp	rcx, byte SIZEOF_XMMWORD
	jae	short .columnloop

.columnloop_r8:
	movdqa	xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
	pxor	xmm2,xmm2
	pxor	xmm3,xmm3
	mov	rcx, SIZEOF_XMMWORD
	jmp	short .downsample

.columnloop:
	movdqa	xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
	movdqa	xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
	movdqa	xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
	movdqa	xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
	movdqa	xmm4,xmm0
	movdqa	xmm5,xmm1
	pand	xmm0,xmm6
	psrlw	xmm4,BYTE_BIT
	pand	xmm1,xmm6
	psrlw	xmm5,BYTE_BIT
	paddw	xmm0,xmm4
	paddw	xmm1,xmm5

	movdqa	xmm4,xmm2
	movdqa	xmm5,xmm3
	pand	xmm2,xmm6
	psrlw	xmm4,BYTE_BIT
	pand	xmm3,xmm6
	psrlw	xmm5,BYTE_BIT
	paddw	xmm2,xmm4
	paddw	xmm3,xmm5

	paddw	xmm0,xmm1
	paddw	xmm2,xmm3
	paddw	xmm0,xmm7
	paddw	xmm2,xmm7
	psrlw	xmm0,2
	psrlw	xmm2,2

	packuswb xmm0,xmm2

	movdqa	XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

	sub	rcx, byte SIZEOF_XMMWORD	; outcol
	add	rdx, byte 2*SIZEOF_XMMWORD	; inptr0
	add	rsi, byte 2*SIZEOF_XMMWORD	; inptr1
	add	rdi, byte 1*SIZEOF_XMMWORD	; outptr
	cmp	rcx, byte SIZEOF_XMMWORD
	jae	near .columnloop
	test	rcx,rcx
	jnz	near .columnloop_r8

	pop	rsi
	pop	rdi
	pop	rcx

	add	rsi, byte 2*SIZEOF_JSAMPROW	; input_data
	add	rdi, byte 1*SIZEOF_JSAMPROW	; output_data
	dec	rax				; rowctr
	jg	near .rowloop

.return:
	uncollect_args
	pop	rbp
	ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
	align	16