;
; jcsammmx.asm - downsampling (MMX)
;
; Copyright 2009 Pierre Ossman for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Arguments are read from the stack relative to ebp (first one at ebp+8).
;
; Phase 1 (expand_right_edge): for each of max_v_samp_factor input rows, the
;   last real sample (row[image_width-1]) is replicated into the bytes
;   row[image_width .. 2*output_cols-1], where output_cols = width_blocks*8.
;   Each input row must therefore have room for 2*output_cols bytes.
;   NOTE(review): with image_width == 0 the replicate step would read the
;   byte just before the row start; callers presumably never pass 0.
; Phase 2 (h2v1_downsample): for each of v_samp_factor rows, every pair of
;   horizontally adjacent input samples is averaged into one output sample,
;   out = (a + b + bias) >> 1, with bias alternating 0,1,0,1,... across
;   output columns.  Eight output samples are produced per iteration.
;
; Preserved: esi, edi, ebp (saved/restored below); ebx is never touched.
; Clobbered: eax, ecx, edx, flags, mm0-mm3, mm6, mm7.
; emms is executed before returning on the path that used MMX registers.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_mmx)

EXTN(jsimd_h2v1_downsample_mmx):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
        jz      near .return            ; zero output columns: nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]

        ; -- expand_right_edge

        push    ecx                     ; keep output_cols for the downsample phase
        shl     ecx,1                   ; output_cols * 2
        sub     ecx,edx                 ; ecx = number of pad bytes per row
        jle     short .expand_end       ; rows already wide enough: no padding

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax,eax
        jle     short .expand_end

        cld                             ; stosb must advance edi upward
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        alignx  16,7
.expandloop:
        push    eax                     ; al is about to be reused as fill value
        push    ecx                     ; rep stosb consumes ecx

        mov     edi, JSAMPROW [esi]
        add     edi,edx                 ; edi = &row[image_width]
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row

        rep stosb                       ; replicate it over the pad bytes

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .expandloop

.expand_end:
        pop     ecx                     ; output_cols

        ; -- h2v1_downsample

        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
        test    eax,eax
        jle     near .return

        mov     edx, 0x00010000         ; bias pattern
        movd    mm7,edx
        pcmpeqw mm6,mm6                 ; mm6 = all ones
        punpckldq mm7,mm7               ; mm7={0, 1, 0, 1}
        psrlw   mm6,BYTE_BIT            ; mm6={0xFF 0x00 0xFF 0x00 ..}

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
        alignx  16,7
.rowloop:
        push    ecx                     ; column count is consumed per row
        push    edi                     ; row-pointer cursors are reused below
        push    esi

        mov     esi, JSAMPROW [esi]     ; inptr
        mov     edi, JSAMPROW [edi]     ; outptr
        alignx  16,7
.columnloop:
        ; Each pass reads 16 input samples and writes 8 output samples.

        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]
        movq    mm2,mm0
        movq    mm3,mm1

        pand    mm0,mm6                 ; low byte of each word  (even samples)
        psrlw   mm2,BYTE_BIT            ; high byte of each word (odd samples)
        pand    mm1,mm6
        psrlw   mm3,BYTE_BIT

        paddw   mm0,mm2                 ; even + odd, as 16-bit sums
        paddw   mm1,mm3
        paddw   mm0,mm7                 ; + alternating bias
        paddw   mm1,mm7
        psrlw   mm0,1                   ; / 2
        psrlw   mm1,1

        packuswb mm0,mm1                ; 8 words -> 8 bytes

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
        add     edi, byte 1*SIZEOF_MMWORD       ; outptr
        sub     ecx, byte SIZEOF_MMWORD         ; outcol
        jnz     short .columnloop

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte SIZEOF_JSAMPROW       ; input_data
        add     edi, byte SIZEOF_JSAMPROW       ; output_data
        dec     eax                             ; rowctr
        jg      short .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
;       pop     ebx             ; unused
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Arguments are read from the stack relative to ebp (first one at ebp+8).
;
; Phase 1 (expand_right_edge): identical to the h2v1 routine; pads
;   max_v_samp_factor input rows out to 2*output_cols bytes by replicating
;   row[image_width-1].
; Phase 2 (h2v2_downsample): for each of v_samp_factor output rows, two
;   consecutive input rows are consumed and every 2x2 group of samples is
;   averaged, out = (a + b + c + d + bias) >> 2, with bias alternating
;   1,2,1,2,... across output columns.  The input row-pointer array is
;   advanced by two entries per output row.
;
; Preserved: esi, edi, ebp (saved/restored below); ebx is never touched.
; Clobbered: eax, ecx, edx, flags, mm0-mm7.
; emms is executed before returning on the path that used MMX registers.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_mmx)

EXTN(jsimd_h2v2_downsample_mmx):
        push    ebp
        mov     ebp,esp
;       push    ebx             ; unused
;       push    ecx             ; need not be preserved
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
        jz      near .return            ; zero output columns: nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]

        ; -- expand_right_edge

        push    ecx                     ; keep output_cols for the downsample phase
        shl     ecx,1                   ; output_cols * 2
        sub     ecx,edx                 ; ecx = number of pad bytes per row
        jle     short .expand_end       ; rows already wide enough: no padding

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax,eax
        jle     short .expand_end

        cld                             ; stosb must advance edi upward
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        alignx  16,7
.expandloop:
        push    eax                     ; al is about to be reused as fill value
        push    ecx                     ; rep stosb consumes ecx

        mov     edi, JSAMPROW [esi]
        add     edi,edx                 ; edi = &row[image_width]
        mov     al, JSAMPLE [edi-1]     ; al = last real sample of the row

        rep stosb                       ; replicate it over the pad bytes

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .expandloop

.expand_end:
        pop     ecx                     ; output_cols

        ; -- h2v2_downsample

        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
        test    eax,eax
        jle     near .return

        mov     edx, 0x00020001         ; bias pattern
        movd    mm7,edx
        pcmpeqw mm6,mm6                 ; mm6 = all ones
        punpckldq mm7,mm7               ; mm7={1, 2, 1, 2}
        psrlw   mm6,BYTE_BIT            ; mm6={0xFF 0x00 0xFF 0x00 ..}

        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
        alignx  16,7
.rowloop:
        push    ecx                     ; column count is consumed per row
        push    edi                     ; row-pointer cursors are reused below
        push    esi

        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1
        mov     edi, JSAMPROW [edi]                     ; outptr
        alignx  16,7
.columnloop:
        ; Each pass reads 16 samples from each of the two input rows and
        ; writes 8 output samples.

        movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]
        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]
        movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]

        ; Horizontal pair sums for the first 8 samples of each row.
        movq    mm4,mm0
        movq    mm5,mm1
        pand    mm0,mm6                 ; even samples
        psrlw   mm4,BYTE_BIT            ; odd samples
        pand    mm1,mm6
        psrlw   mm5,BYTE_BIT
        paddw   mm0,mm4
        paddw   mm1,mm5

        ; Horizontal pair sums for the second 8 samples of each row.
        movq    mm4,mm2
        movq    mm5,mm3
        pand    mm2,mm6
        psrlw   mm4,BYTE_BIT
        pand    mm3,mm6
        psrlw   mm5,BYTE_BIT
        paddw   mm2,mm4
        paddw   mm3,mm5

        paddw   mm0,mm1                 ; add the two rows: 2x2 sums
        paddw   mm2,mm3
        paddw   mm0,mm7                 ; + alternating bias
        paddw   mm2,mm7
        psrlw   mm0,2                   ; / 4
        psrlw   mm2,2

        packuswb mm0,mm2                ; 8 words -> 8 bytes

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     edx, byte 2*SIZEOF_MMWORD       ; inptr0
        add     esi, byte 2*SIZEOF_MMWORD       ; inptr1
        add     edi, byte 1*SIZEOF_MMWORD       ; outptr
        sub     ecx, byte SIZEOF_MMWORD         ; outcol
        jnz     near .columnloop

        pop     esi
        pop     edi
        pop     ecx

        add     esi, byte 2*SIZEOF_JSAMPROW     ; input_data
        add     edi, byte 1*SIZEOF_JSAMPROW     ; output_data
        dec     eax                             ; rowctr
        jg      near .rowloop

        emms            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; need not be preserved
;       pop     ebx             ; unused
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16