;
; jcgrymmx.asm - grayscale colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2011 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_mmx (JDIMENSION img_width,
;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                             JDIMENSION output_row, int num_rows);
;
; Arguments are read from the stack relative to a base register "b" that
; points at the saved ebp (so the first argument lives at b+8).
;
; Register roles inside the loops:
;   eax = rows remaining (outer loop) / GOT address (inner loop; the row
;         count is saved with pushpic around the inner loop)
;   ecx = columns remaining in the current row
;   esi = input row pointer array  -> inptr   inside a row
;   edi = output row pointer array -> outptr0 inside a row
;   ebp = 64-bit aligned frame base; wk(0..WK_NUM-1) and gotptr sit below it
;

; ---- stack arguments ------------------------------------------------------
%define img_width(b)    (b)+8                   ; JDIMENSION img_width
%define input_buf(b)    (b)+12                  ; JSAMPARRAY input_buf
%define output_buf(b)   (b)+16                  ; JSAMPIMAGE output_buf
%define output_row(b)   (b)+20                  ; JDIMENSION output_row
%define num_rows(b)     (b)+24                  ; int num_rows

; ---- aligned local frame --------------------------------------------------
%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          2
%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr

        align   16
        global  EXTN(jsimd_rgb_gray_convert_mmx)

EXTN(jsimd_rgb_gray_convert_mmx):
        ; --- prologue: build a frame whose base is aligned to an mmword ---
        push    ebp
        mov     eax, esp                        ; eax -> saved ebp (argument base)
        sub     esp, byte 4                     ; slot for the unaligned frame base
        and     esp, byte (-SIZEOF_MMWORD)      ; round esp down to 64-bit alignment
        mov     [esp], eax                      ; remembered for "pop esp" on exit
        mov     ebp, esp                        ; ebp = aligned frame base
        lea     esp, [wk(0)]                    ; reserve wk[WK_NUM] below ebp
        pushpic eax                             ; reserve the gotptr slot
        push    ebx
        ; ecx and edx are deliberately not saved (need not be preserved)
        push    esi
        push    edi

        get_GOT ebx                             ; ebx = GOT address
        movpic  POINTER [gotptr], ebx           ; keep it in the frame

        ; --- fetch arguments; nothing to do for an empty image ------------
        mov     ecx, JDIMENSION [img_width(eax)]        ; ecx = num_cols
        test    ecx, ecx
        jz      near .done

        push    ecx                             ; ecx is borrowed for output_row

        mov     esi, JSAMPIMAGE [output_buf(eax)]
        mov     ecx, JDIMENSION [output_row(eax)]
        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]       ; component 0 rows
        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]  ; edi -> &output_buf[0][output_row]

        pop     ecx                             ; ecx = num_cols again

        mov     esi, JSAMPARRAY [input_buf(eax)]
        mov     eax, INT [num_rows(eax)]        ; eax = rows to convert
        test    eax, eax
        jle     near .done
        alignx  16,7

; ==========================================================================
; Outer loop: one iteration per image row.
; ==========================================================================
.next_row:
        pushpic eax                             ; row counter (eax is reused below)
        push    edi
        push    esi
        push    ecx                             ; col

        mov     esi, JSAMPROW [esi]             ; inptr
        mov     edi, JSAMPROW [edi]             ; outptr0
        movpic  eax, POINTER [gotptr]           ; load GOT address (eax)

        cmp     ecx, byte SIZEOF_MMWORD
        jae     short .full_group               ; a whole group of pixels is available
        alignx  16,7

%if RGB_PIXELSIZE == 3 ; ---------------

        ; ------------------------------------------------------------------
        ; Row tail (fewer than SIZEOF_MMWORD columns left), 3 bytes/pixel.
        ; The remaining byte count is split by its set bits (1, 2, 4, 8, 16)
        ; and the pieces are gathered from the end of the row backwards, so
        ; that the registers end up laid out as for a full group.
        ; ------------------------------------------------------------------
.tail_ld1:
        push    eax                             ; eax/edx are scratch for the
        push    edx                             ;   byte and word pieces
        lea     ecx, [ecx+ecx*2]                ; imul ecx,RGB_PIXELSIZE
        test    cl, SIZEOF_BYTE
        jz      short .tail_ld2
        sub     ecx, byte SIZEOF_BYTE
        xor     eax, eax
        mov     al, BYTE [esi+ecx]
.tail_ld2:
        test    cl, SIZEOF_WORD
        jz      short .tail_ld4
        sub     ecx, byte SIZEOF_WORD
        xor     edx, edx
        mov     dx, WORD [esi+ecx]
        shl     eax, WORD_BIT                   ; make room below the earlier byte
        or      eax, edx
.tail_ld4:
        movd    mmA, eax
        pop     edx
        pop     eax
        test    cl, SIZEOF_DWORD
        jz      short .tail_ld8
        sub     ecx, byte SIZEOF_DWORD
        movd    mmG, DWORD [esi+ecx]
        psllq   mmA, DWORD_BIT
        por     mmA, mmG
.tail_ld8:
        test    cl, SIZEOF_MMWORD
        jz      short .tail_ld16
        movq    mmG, mmA
        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
        mov     ecx, SIZEOF_MMWORD              ; makes this the row's last pass
        jmp     short .convert
.tail_ld16:
        test    cl, 2*SIZEOF_MMWORD
        mov     ecx, SIZEOF_MMWORD              ; mov leaves the flags from "test" intact
        jz      short .convert
        movq    mmF, mmA
        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
        jmp     short .convert
        alignx  16,7

        ; ------------------------------------------------------------------
        ; Full group: three mmwords of packed 3-byte pixels.
        ; ------------------------------------------------------------------
.full_group:
        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
        movq    mmF, MMWORD [esi+2*SIZEOF_MMWORD]

.convert:
        ; Notation "cp": c = component index within the pixel, p = pixel index.
        ; mmA=(00 10 20 01 11 21 02 12)
        ; mmG=(22 03 13 23 04 14 24 05)
        ; mmF=(15 25 06 16 26 07 17 27)

        movq      mmD, mmA
        psllq     mmA, 4*BYTE_BIT       ; mmA=(-- -- -- -- 00 10 20 01)
        psrlq     mmD, 4*BYTE_BIT       ; mmD=(11 21 02 12 -- -- -- --)

        punpckhbw mmA, mmG              ; mmA=(00 04 10 14 20 24 01 05)
        psllq     mmG, 4*BYTE_BIT       ; mmG=(-- -- -- -- 22 03 13 23)

        punpcklbw mmD, mmF              ; mmD=(11 15 21 25 02 06 12 16)
        punpckhbw mmG, mmF              ; mmG=(22 26 03 07 13 17 23 27)

        movq      mmE, mmA
        psllq     mmA, 4*BYTE_BIT       ; mmA=(-- -- -- -- 00 04 10 14)
        psrlq     mmE, 4*BYTE_BIT       ; mmE=(20 24 01 05 -- -- -- --)

        punpckhbw mmA, mmD              ; mmA=(00 02 04 06 10 12 14 16)
        psllq     mmD, 4*BYTE_BIT       ; mmD=(-- -- -- -- 11 15 21 25)

        punpcklbw mmE, mmG              ; mmE=(20 22 24 26 01 03 05 07)
        punpckhbw mmD, mmG              ; mmD=(11 13 15 17 21 23 25 27)

        pxor      mmH, mmH              ; zero, used to widen bytes to words

        movq      mmC, mmA
        punpcklbw mmA, mmH              ; mmA=(00 02 04 06)
        punpckhbw mmC, mmH              ; mmC=(10 12 14 16)

        movq      mmB, mmE
        punpcklbw mmE, mmH              ; mmE=(20 22 24 26)
        punpckhbw mmB, mmH              ; mmB=(01 03 05 07)

        movq      mmF, mmD
        punpcklbw mmD, mmH              ; mmD=(11 13 15 17)
        punpckhbw mmF, mmH              ; mmF=(21 23 25 27)

%else ; RGB_PIXELSIZE == 4 ; -----------

        ; ------------------------------------------------------------------
        ; Row tail (fewer than SIZEOF_MMWORD columns left), 4 bytes/pixel.
        ; The remaining column count is split by its set bits and the pieces
        ; are gathered from the end of the row backwards.
        ; ------------------------------------------------------------------
.tail_ld1:
        test    cl, SIZEOF_MMWORD/8
        jz      short .tail_ld2
        sub     ecx, byte SIZEOF_MMWORD/8
        movd    mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
.tail_ld2:
        test    cl, SIZEOF_MMWORD/4
        jz      short .tail_ld4
        sub     ecx, byte SIZEOF_MMWORD/4
        movq    mmF, mmA
        movq    mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
.tail_ld4:
        test    cl, SIZEOF_MMWORD/2
        mov     ecx, SIZEOF_MMWORD              ; mov leaves the flags from "test" intact
        jz      short .convert
        movq    mmD, mmA
        movq    mmC, mmF
        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
        jmp     short .convert
        alignx  16,7

        ; ------------------------------------------------------------------
        ; Full group: four mmwords of packed 4-byte pixels.
        ; ------------------------------------------------------------------
.full_group:
        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
        movq    mmD, MMWORD [esi+2*SIZEOF_MMWORD]
        movq    mmC, MMWORD [esi+3*SIZEOF_MMWORD]

.convert:
        ; Notation "cp": c = component index within the pixel, p = pixel index.
        ; mmA=(00 10 20 30 01 11 21 31)
        ; mmF=(02 12 22 32 03 13 23 33)
        ; mmD=(04 14 24 34 05 15 25 35)
        ; mmC=(06 16 26 36 07 17 27 37)

        movq      mmB, mmA
        punpcklbw mmA, mmF              ; mmA=(00 02 10 12 20 22 30 32)
        punpckhbw mmB, mmF              ; mmB=(01 03 11 13 21 23 31 33)

        movq      mmG, mmD
        punpcklbw mmD, mmC              ; mmD=(04 06 14 16 24 26 34 36)
        punpckhbw mmG, mmC              ; mmG=(05 07 15 17 25 27 35 37)

        movq      mmE, mmA
        punpcklwd mmA, mmD              ; mmA=(00 02 04 06 10 12 14 16)
        punpckhwd mmE, mmD              ; mmE=(20 22 24 26 30 32 34 36)

        movq      mmH, mmB
        punpcklwd mmB, mmG              ; mmB=(01 03 05 07 11 13 15 17)
        punpckhwd mmH, mmG              ; mmH=(21 23 25 27 31 33 35 37)

        pxor      mmF, mmF              ; zero, used to widen bytes to words

        movq      mmC, mmA
        punpcklbw mmA, mmF              ; mmA=(00 02 04 06)
        punpckhbw mmC, mmF              ; mmC=(10 12 14 16)

        movq      mmD, mmB
        punpcklbw mmB, mmF              ; mmB=(01 03 05 07)
        punpckhbw mmD, mmF              ; mmD=(11 13 15 17)

        movq      mmG, mmE
        punpcklbw mmE, mmF              ; mmE=(20 22 24 26)
        punpckhbw mmG, mmF              ; mmG=(30 32 34 36)

        ; mmF is still zero here, so the bytes of mmH land in the high half
        ; of each word and the word shifts below bring them back down.
        punpcklbw mmF, mmH
        punpckhbw mmH, mmH
        psrlw     mmF, BYTE_BIT         ; mmF=(21 23 25 27)
        psrlw     mmH, BYTE_BIT         ; mmH=(31 33 35 37)

%endif ; RGB_PIXELSIZE ; ---------------

        ; From here on the components are addressed as mm0..mm5.
        ; NOTE(review): the mmA..mmH -> mm0..mm7 mapping is presumably set up
        ; by jcolsamp.inc for the selected pixel format - confirm there.
        ;
        ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
        ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO

        ; (Original)
        ; Y =  0.29900 * R + 0.58700 * G + 0.11400 * B
        ;
        ; (This implementation)
        ; Y =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
        ;
        ; The green weight is split in two so that each pmaddwd handles one
        ; (R,G) or (B,G) word pair.

        ; --- R/G partial sums, odd pixels ---
        movq      mm6, mm1
        punpcklwd mm1, mm3
        punpckhwd mm6, mm3
        pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)]      ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)]      ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)

        movq      mm7, mm6              ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)

        ; --- R/G partial sums, even pixels (spilled to the work area) ---
        movq      mm6, mm0
        punpcklwd mm0, mm2
        punpckhwd mm6, mm2
        pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)]      ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)]      ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)

        movq      MMWORD [wk(0)], mm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
        movq      MMWORD [wk(1)], mm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

        movq      mm0, mm5              ; mm0=BO
        movq      mm6, mm4              ; mm6=BE

        ; --- B/G partial sums, odd pixels, then finish YO ---
        movq      mm4, mm0
        punpcklwd mm0, mm3
        punpckhwd mm4, mm3
        pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)]      ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)]      ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)

        movq      mm3,[GOTOFF(eax,PD_ONEHALF)]          ; mm3=[PD_ONEHALF]

        paddd     mm0, mm1
        paddd     mm4, mm7
        paddd     mm0, mm3              ; add the rounding term
        paddd     mm4, mm3
        psrld     mm0, SCALEBITS        ; mm0=YOL
        psrld     mm4, SCALEBITS        ; mm4=YOH
        packssdw  mm0, mm4              ; mm0=YO

        ; --- B/G partial sums, even pixels, then finish YE ---
        movq      mm4, mm6
        punpcklwd mm6, mm2
        punpckhwd mm4, mm2
        pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)]      ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)]      ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)

        movq      mm2,[GOTOFF(eax,PD_ONEHALF)]          ; mm2=[PD_ONEHALF]

        paddd     mm6, MMWORD [wk(0)]
        paddd     mm4, MMWORD [wk(1)]
        paddd     mm6, mm2              ; add the rounding term
        paddd     mm4, mm2
        psrld     mm6, SCALEBITS        ; mm6=YEL
        psrld     mm4, SCALEBITS        ; mm4=YEH
        packssdw  mm6, mm4              ; mm6=YE

        ; --- interleave: odd results go into the high byte of each word ---
        psllw     mm0, BYTE_BIT
        por       mm6, mm0              ; mm6=Y
        movq      MMWORD [edi], mm6     ; Save Y

        ; --- advance within the row ---
        sub     ecx, byte SIZEOF_MMWORD
        add     esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; inptr
        add     edi, byte SIZEOF_MMWORD                 ; outptr0
        cmp     ecx, byte SIZEOF_MMWORD
        jae     near .full_group        ; another whole group remains
        test    ecx, ecx
        jnz     near .tail_ld1          ; a partial group remains

        ; --- row finished: restore the per-row state and advance ---
        pop     ecx                     ; col
        pop     esi
        pop     edi
        poppic  eax                     ; row counter

        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
        add     edi, byte SIZEOF_JSAMPROW
        dec     eax                             ; num_rows
        jg      near .next_row

        emms                            ; empty MMX state

        ; --- epilogue (also the early-exit target; no MMX used before it) ---
.done:
        pop     edi
        pop     esi
        ; edx and ecx were not saved (need not be preserved)
        pop     ebx
        mov     esp, ebp                ; esp <- aligned ebp
        pop     esp                     ; esp <- original ebp
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16