;
; jcsamss2.asm - downsampling (SSE2)
;
; Copyright 2009 Pierre Ossman for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Syntax : NASM (Intel operand order), 32-bit code.
; In     : all six arguments are read from the stack relative to ebp
;          (see the %define offsets below).
; Out    : nothing is returned in a register; the output rows are written.
; Saved  : ebp, esi, edi are pushed/popped here.  ebx is never touched.
; Clobb  : eax, ecx, edx, xmm0-xmm3, xmm6, xmm7, flags; DF is cleared (cld)
;          when right-edge padding is performed.
; Memory : every row is accessed with movdqa, so the row pointers found in
;          input_data/output_data must be 16-byte aligned.
;
; Steps:
;   1. Pad each of the max_v_samp_factor input rows on the right, by
;      replicating the last real sample, up to 2 * output_cols samples
;      (output_cols = width_blocks * 8).
;   2. For each of the v_samp_factor rows, average horizontally adjacent
;      sample pairs:  out = (even + odd + bias) >> 1, where bias alternates
;      0, 1, 0, 1, ... across output columns.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
        push    ebp
        mov     ebp, esp
;       push    ebx                     ; unused
;       push    ecx                     ; need not be preserved
;       push    edx                     ; need not be preserved
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols = width_blocks * 8
                                        ; (the original notes this as
                                        ;  imul ecx,DCTSIZE)
        jz      near .done              ; zero blocks: nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]

        ; -- expand_right_edge
        ;
        ; ecx becomes the number of padding samples each input row needs:
        ; 2 * output_cols - image_width.  output_cols is parked on the stack
        ; meanwhile and recovered at .expand_done.

        push    ecx
        shl     ecx, 1                  ; output_cols * 2
        sub     ecx, edx                ; ecx = samples to append per row
        jle     short .expand_done      ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax, eax
        jle     short .expand_done

        cld                             ; stosb must advance edi upward
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        alignx  16,7
.expand_row:
        push    eax                     ; al is reused as the fill value
        push    ecx                     ; rep stosb counts ecx down to zero

        mov     edi, JSAMPROW [esi]
        add     edi, edx                ; edi = first sample past image_width
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of this row

        rep stosb                       ; append ecx copies of al

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next entry of input_data
        dec     eax
        jg      short .expand_row

.expand_done:
        pop     ecx                     ; ecx = output_cols again

        ; -- h2v1_downsample

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = row counter
        test    eax, eax
        jle     near .done

        mov     edx, 0x00010000         ; bias pattern: low word 0, high word 1
        movd    xmm7, edx
        pcmpeqw xmm6, xmm6              ; all bits set
        pshufd  xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
                                        ; i.e. keeps the low byte of each word

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
        alignx  16,7
.next_row:
        ; Keep the column count and both row-array cursors on the stack
        ; while ecx/esi/edi are used as per-row working registers.
        push    ecx
        push    edi
        push    esi

        mov     esi, JSAMPROW [esi]     ; esi = inptr
        mov     edi, JSAMPROW [edi]     ; edi = outptr

        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .full_chunk
        alignx  16,7

.tail_chunk:
        ; Fewer than SIZEOF_XMMWORD output columns remain: load only one
        ; input vector and substitute zeros for the second one.  ecx is
        ; forced to SIZEOF_XMMWORD so the subtraction below leaves 0 and the
        ; row ends after this pass.
        ; NOTE(review): a whole XMMWORD is still stored to the output row
        ; (the half produced from the zero vector is 0) - assumes the output
        ; buffer has room for it; confirm against the caller's allocation.
        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
        pxor    xmm1, xmm1
        mov     ecx, SIZEOF_XMMWORD
        jmp     short .average
        alignx  16,7

.full_chunk:
        ; Two input vectors yield one output vector.
        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]

.average:
        movdqa  xmm2, xmm0
        movdqa  xmm3, xmm1

        pand    xmm0, xmm6              ; even-indexed samples, as words
        psrlw   xmm2, BYTE_BIT          ; odd-indexed samples, as words
        pand    xmm1, xmm6
        psrlw   xmm3, BYTE_BIT

        paddw   xmm0, xmm2              ; even + odd
        paddw   xmm1, xmm3
        paddw   xmm0, xmm7              ; + alternating 0/1 bias
        paddw   xmm1, xmm7
        psrlw   xmm0, 1                 ; / 2
        psrlw   xmm1, 1

        packuswb xmm0, xmm1             ; words -> bytes, xmm0 low, xmm1 high

        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

        sub     ecx, byte SIZEOF_XMMWORD        ; outcol
        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .full_chunk
        test    ecx, ecx
        jnz     short .tail_chunk       ; a partial chunk is left over

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte SIZEOF_JSAMPROW       ; input_data
        add     edi, byte SIZEOF_JSAMPROW       ; output_data
        dec     eax                             ; rowctr
        jg      near .next_row

.done:
        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
;       pop     ecx                     ; need not be preserved
;       pop     ebx                     ; unused
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Syntax : NASM (Intel operand order), 32-bit code.
; In     : all six arguments are read from the stack relative to ebp
;          (see the %define offsets below).
; Out    : nothing is returned in a register; the output rows are written.
; Saved  : ebp, esi, edi are pushed/popped here.  ebx is never touched.
; Clobb  : eax, ecx, edx, xmm0-xmm7, flags; DF is cleared (cld) when
;          right-edge padding is performed.
; Memory : every row is accessed with movdqa, so the row pointers found in
;          input_data/output_data must be 16-byte aligned.
;
; Steps:
;   1. Pad each of the max_v_samp_factor input rows on the right, by
;      replicating the last real sample, up to 2 * output_cols samples
;      (output_cols = width_blocks * 8).
;   2. For each of the v_samp_factor output rows, consume two input rows and
;      average every 2x2 group of samples:
;      out = (sum of the four samples + bias) >> 2, where bias alternates
;      1, 2, 1, 2, ... across output columns.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
        push    ebp
        mov     ebp, esp
;       push    ebx                     ; unused
;       push    ecx                     ; need not be preserved
;       push    edx                     ; need not be preserved
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols = width_blocks * 8
                                        ; (the original notes this as
                                        ;  imul ecx,DCTSIZE)
        jz      near .done              ; zero blocks: nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]

        ; -- expand_right_edge
        ;
        ; ecx becomes the number of padding samples each input row needs:
        ; 2 * output_cols - image_width.  output_cols is parked on the stack
        ; meanwhile and recovered at .expand_done.

        push    ecx
        shl     ecx, 1                  ; output_cols * 2
        sub     ecx, edx                ; ecx = samples to append per row
        jle     short .expand_done      ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax, eax
        jle     short .expand_done

        cld                             ; stosb must advance edi upward
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        alignx  16,7
.expand_row:
        push    eax                     ; al is reused as the fill value
        push    ecx                     ; rep stosb counts ecx down to zero

        mov     edi, JSAMPROW [esi]
        add     edi, edx                ; edi = first sample past image_width
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of this row

        rep stosb                       ; append ecx copies of al

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next entry of input_data
        dec     eax
        jg      short .expand_row

.expand_done:
        pop     ecx                     ; ecx = output_cols again

        ; -- h2v2_downsample

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = row counter
        test    eax, eax
        jle     near .done

        mov     edx, 0x00020001         ; bias pattern: low word 1, high word 2
        movd    xmm7, edx
        pcmpeqw xmm6, xmm6              ; all bits set
        pshufd  xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
        psrlw   xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}
                                        ; i.e. keeps the low byte of each word

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
        alignx  16,7
.next_row:
        ; Keep the column count and both row-array cursors on the stack
        ; while ecx/edx/esi/edi are used as per-row working registers.
        push    ecx
        push    edi
        push    esi

        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; edx = inptr0
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = inptr1
        mov     edi, JSAMPROW [edi]                     ; edi = outptr

        cmp     ecx, byte SIZEOF_XMMWORD
        jae     short .full_chunk
        alignx  16,7

.tail_chunk:
        ; Fewer than SIZEOF_XMMWORD output columns remain: load only the
        ; first vector of each input row and substitute zeros for the second
        ; ones.  ecx is forced to SIZEOF_XMMWORD so the subtraction below
        ; leaves 0 and the row pair ends after this pass.
        ; NOTE(review): a whole XMMWORD is still stored to the output row
        ; (the half produced from the zero vectors is 0) - assumes the output
        ; buffer has room for it; confirm against the caller's allocation.
        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
        pxor    xmm2, xmm2
        pxor    xmm3, xmm3
        mov     ecx, SIZEOF_XMMWORD
        jmp     short .average
        alignx  16,7

.full_chunk:
        ; Two vectors from each of the two input rows yield one output vector.
        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]

.average:
        ; First vector of each row: even + odd samples, as words.
        movdqa  xmm4, xmm0
        movdqa  xmm5, xmm1
        pand    xmm0, xmm6
        psrlw   xmm4, BYTE_BIT
        pand    xmm1, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm0, xmm4
        paddw   xmm1, xmm5

        ; Second vector of each row: same treatment.
        movdqa  xmm4, xmm2
        movdqa  xmm5, xmm3
        pand    xmm2, xmm6
        psrlw   xmm4, BYTE_BIT
        pand    xmm3, xmm6
        psrlw   xmm5, BYTE_BIT
        paddw   xmm2, xmm4
        paddw   xmm3, xmm5

        paddw   xmm0, xmm1              ; row 0 + row 1: 2x2 sums
        paddw   xmm2, xmm3
        paddw   xmm0, xmm7              ; + alternating 1/2 bias
        paddw   xmm2, xmm7
        psrlw   xmm0, 2                 ; / 4
        psrlw   xmm2, 2

        packuswb xmm0, xmm2             ; words -> bytes, xmm0 low, xmm2 high

        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0

        sub     ecx, byte SIZEOF_XMMWORD        ; outcol
        add     edx, byte 2*SIZEOF_XMMWORD      ; inptr0
        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr1
        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     ecx, byte SIZEOF_XMMWORD
        jae     near .full_chunk
        test    ecx, ecx
        jnz     near .tail_chunk        ; a partial chunk is left over

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte 2*SIZEOF_JSAMPROW     ; input_data
        add     edi, byte 1*SIZEOF_JSAMPROW     ; output_data
        dec     eax                             ; rowctr
        jg      near .next_row

.done:
        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
;       pop     ecx                     ; need not be preserved
;       pop     ebx                     ; unused
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16