;
; jdclrmmx.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_ycc_rgb_convert_mmx (JDIMENSION out_width,
;                            JSAMPIMAGE input_buf, JDIMENSION input_row,
;                            JSAMPARRAY output_buf, int num_rows)
;
; Target:  32-bit x86, MMX.  All five arguments are read from the caller's
;          stack at [original esp + 8 .. + 24]; the routine ends with a
;          plain `ret`, so the caller removes them.
; Out:     nothing is returned in a register; pixels are written through
;          output_buf.
; Saved:   ebx, esi, edi, ebp (pushed/popped here).
; Clobb:   eax, ecx, edx, mm0-mm7, flags.  `emms` is executed before
;          returning whenever the row loop has run.
;
; Register roles:
;   esi / ebx / edx  Y / Cb / Cr: row-pointer cursors in .rowloop,
;                    sample pointers inside .columnloop
;   edi              output row-pointer cursor / output byte pointer
;   ecx              columns still to convert in the current row
;   eax              rows remaining (.rowloop); inside .columnloop it holds
;                    the GOT address and is reused as scratch by the
;                    3-byte-pixel tail stores
;
; Every pass of .columnloop loads one full MMWORD from each input row
; before the remaining-column count is examined, so the input rows must be
; readable in whole-MMWORD units even when out_width is not a multiple of
; SIZEOF_MMWORD.  Output stores, in contrast, are trimmed to the exact
; number of remaining bytes by the .column_st* tails.
;
; NOTE(review): EXTN, pushpic/movpic/get_GOT/GOTOFF, alignx, the PW_*/PD_*
; constants, SCALEBITS, RGB_PIXELSIZE and the mmA..mmH register aliases are
; not defined in this file; presumably they come from jcolsamp.inc and the
; file that includes this one -- confirm there.
;

%define out_width(b)    (b)+8           ; JDIMENSION out_width
%define input_buf(b)    (b)+12          ; JSAMPIMAGE input_buf
%define input_row(b)    (b)+16          ; JDIMENSION input_row
%define output_buf(b)   (b)+20          ; JSAMPARRAY output_buf
%define num_rows(b)     (b)+24          ; int num_rows

%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          2
%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr

        align   16
        global  EXTN(jsimd_ycc_rgb_convert_mmx)

EXTN(jsimd_ycc_rgb_convert_mmx):
        push    ebp
        mov     eax,esp                 ; eax = original esp (-> saved ebp);
                                        ;   arguments are addressed from eax
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp],eax               ; keep original esp at [aligned ebp]
        mov     ebp,esp                 ; ebp = aligned ebp
        lea     esp, [wk(0)]            ; reserve the wk[] workspace below ebp
        pushpic eax                     ; make room for the GOT address
        push    ebx
;       push    ecx                     ; need not be preserved
;       push    edx                     ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                     ; get GOT address
        movpic  POINTER [gotptr], ebx   ; save GOT address

        mov     ecx, JDIMENSION [out_width(eax)]        ; num_cols
        test    ecx,ecx
        jz      near .return            ; nothing to do for zero-width rows

        push    ecx                     ; ecx is borrowed for input_row below

        mov     edi, JSAMPIMAGE [input_buf(eax)]
        mov     ecx, JDIMENSION [input_row(eax)]
        mov     esi, JSAMPARRAY [edi+0*SIZEOF_JSAMPARRAY]
        mov     ebx, JSAMPARRAY [edi+1*SIZEOF_JSAMPARRAY]
        mov     edx, JSAMPARRAY [edi+2*SIZEOF_JSAMPARRAY]
        lea     esi, [esi+ecx*SIZEOF_JSAMPROW]  ; esi = &input_buf[0][input_row]
        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]  ; ebx = &input_buf[1][input_row]
        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]  ; edx = &input_buf[2][input_row]

        pop     ecx                     ; ecx = num_cols again

        mov     edi, JSAMPARRAY [output_buf(eax)]
        mov     eax, INT [num_rows(eax)]
        test    eax,eax
        jle     near .return            ; signed test: num_rows <= 0 -> done
        alignx  16,7
.rowloop:
        ; Save the per-row cursors and the column count; the column loop
        ; below overwrites all of them and .nextrow restores them.
        push    eax
        push    edi
        push    edx
        push    ebx
        push    esi
        push    ecx                     ; col

        mov     esi, JSAMPROW [esi]     ; inptr0
        mov     ebx, JSAMPROW [ebx]     ; inptr1
        mov     edx, JSAMPROW [edx]     ; inptr2
        mov     edi, JSAMPROW [edi]     ; outptr
        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
        alignx  16,7
.columnloop:

        movq    mm5, MMWORD [ebx]       ; mm5=Cb(01234567)
        movq    mm1, MMWORD [edx]       ; mm1=Cr(01234567)

        pcmpeqw mm4,mm4
        pcmpeqw mm7,mm7
        psrlw   mm4,BYTE_BIT
        psllw   mm7,7                   ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
        movq    mm0,mm4                 ; mm0=mm4={0xFF 0x00 0xFF 0x00 ..}

        pand    mm4,mm5                 ; mm4=Cb(0246)=CbE
        psrlw   mm5,BYTE_BIT            ; mm5=Cb(1357)=CbO
        pand    mm0,mm1                 ; mm0=Cr(0246)=CrE
        psrlw   mm1,BYTE_BIT            ; mm1=Cr(1357)=CrO

        ; Add 0xFF80 (= -128 as a signed word) to every chroma sample.
        paddw   mm4,mm7
        paddw   mm5,mm7
        paddw   mm0,mm7
        paddw   mm1,mm7

        ; (Original)
        ; R = Y                + 1.40200 * Cr
        ; G = Y - 0.34414 * Cb - 0.71414 * Cr
        ; B = Y + 1.77200 * Cb
        ;
        ; (This implementation)
        ; R = Y                + 0.40200 * Cr + Cr
        ; G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
        ; B = Y - 0.22800 * Cb + Cb + Cb

        movq    mm2,mm4                 ; mm2=CbE
        movq    mm3,mm5                 ; mm3=CbO
        paddw   mm4,mm4                 ; mm4=2*CbE
        paddw   mm5,mm5                 ; mm5=2*CbO
        movq    mm6,mm0                 ; mm6=CrE
        movq    mm7,mm1                 ; mm7=CrO
        paddw   mm0,mm0                 ; mm0=2*CrE
        paddw   mm1,mm1                 ; mm1=2*CrO

        pmulhw  mm4,[GOTOFF(eax,PW_MF0228)]     ; mm4=(2*CbE * -FIX(0.22800))
        pmulhw  mm5,[GOTOFF(eax,PW_MF0228)]     ; mm5=(2*CbO * -FIX(0.22800))
        pmulhw  mm0,[GOTOFF(eax,PW_F0402)]      ; mm0=(2*CrE * FIX(0.40200))
        pmulhw  mm1,[GOTOFF(eax,PW_F0402)]      ; mm1=(2*CrO * FIX(0.40200))

        ; The inputs were doubled above; adding PW_ONE and shifting right
        ; by one halves the product again (presumably with rounding, if
        ; PW_ONE holds 1 in each word -- defined outside this file).
        paddw   mm4,[GOTOFF(eax,PW_ONE)]
        paddw   mm5,[GOTOFF(eax,PW_ONE)]
        psraw   mm4,1                   ; mm4=(CbE * -FIX(0.22800))
        psraw   mm5,1                   ; mm5=(CbO * -FIX(0.22800))
        paddw   mm0,[GOTOFF(eax,PW_ONE)]
        paddw   mm1,[GOTOFF(eax,PW_ONE)]
        psraw   mm0,1                   ; mm0=(CrE * FIX(0.40200))
        psraw   mm1,1                   ; mm1=(CrO * FIX(0.40200))

        paddw   mm4,mm2
        paddw   mm5,mm3
        paddw   mm4,mm2                 ; mm4=(CbE * FIX(1.77200))=(B-Y)E
        paddw   mm5,mm3                 ; mm5=(CbO * FIX(1.77200))=(B-Y)O
        paddw   mm0,mm6                 ; mm0=(CrE * FIX(1.40200))=(R-Y)E
        paddw   mm1,mm7                 ; mm1=(CrO * FIX(1.40200))=(R-Y)O

        ; All eight MMX registers are needed for the green term, so the
        ; blue differences are parked in the stack workspace.
        movq    MMWORD [wk(0)], mm4     ; wk(0)=(B-Y)E
        movq    MMWORD [wk(1)], mm5     ; wk(1)=(B-Y)O

        ; Interleave Cb and Cr words so that one pmaddwd computes
        ; Cb*c0 + Cr*c1 per dword.
        movq      mm4,mm2
        movq      mm5,mm3
        punpcklwd mm2,mm6
        punpckhwd mm4,mm6
        pmaddwd   mm2,[GOTOFF(eax,PW_MF0344_F0285)]
        pmaddwd   mm4,[GOTOFF(eax,PW_MF0344_F0285)]
        punpcklwd mm3,mm7
        punpckhwd mm5,mm7
        pmaddwd   mm3,[GOTOFF(eax,PW_MF0344_F0285)]
        pmaddwd   mm5,[GOTOFF(eax,PW_MF0344_F0285)]

        paddd     mm2,[GOTOFF(eax,PD_ONEHALF)]
        paddd     mm4,[GOTOFF(eax,PD_ONEHALF)]
        psrad     mm2,SCALEBITS
        psrad     mm4,SCALEBITS
        paddd     mm3,[GOTOFF(eax,PD_ONEHALF)]
        paddd     mm5,[GOTOFF(eax,PD_ONEHALF)]
        psrad     mm3,SCALEBITS
        psrad     mm5,SCALEBITS

        packssdw  mm2,mm4       ; mm2=CbE*-FIX(0.344)+CrE*FIX(0.285)
        packssdw  mm3,mm5       ; mm3=CbO*-FIX(0.344)+CrO*FIX(0.285)
        psubw     mm2,mm6       ; mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E
        psubw     mm3,mm7       ; mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O

        movq      mm5, MMWORD [esi]     ; mm5=Y(01234567)

        pcmpeqw   mm4,mm4
        psrlw     mm4,BYTE_BIT          ; mm4={0xFF 0x00 0xFF 0x00 ..}
        pand      mm4,mm5               ; mm4=Y(0246)=YE
        psrlw     mm5,BYTE_BIT          ; mm5=Y(1357)=YO

        ; packuswb saturates each signed word to an unsigned byte.
        paddw     mm0,mm4               ; mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6)
        paddw     mm1,mm5               ; mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7)
        packuswb  mm0,mm0               ; mm0=(R0 R2 R4 R6 ** ** ** **)
        packuswb  mm1,mm1               ; mm1=(R1 R3 R5 R7 ** ** ** **)

        paddw     mm2,mm4               ; mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6)
        paddw     mm3,mm5               ; mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7)
        packuswb  mm2,mm2               ; mm2=(G0 G2 G4 G6 ** ** ** **)
        packuswb  mm3,mm3               ; mm3=(G1 G3 G5 G7 ** ** ** **)

        paddw     mm4, MMWORD [wk(0)]   ; mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6)
        paddw     mm5, MMWORD [wk(1)]   ; mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7)
        packuswb  mm4,mm4               ; mm4=(B0 B2 B4 B6 ** ** ** **)
        packuswb  mm5,mm5               ; mm5=(B1 B3 B5 B7 ** ** ** **)

%if RGB_PIXELSIZE == 3 ; ---------------

        ; In the layout comments below, "cp" means byte c of pixel p
        ; (e.g. 12 = second output byte of pixel 2).

        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
        ; mmG=(** ** ** ** ** ** ** **), mmH=(** ** ** ** ** ** ** **)

        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
        punpcklbw mmE,mmB               ; mmE=(20 01 22 03 24 05 26 07)
        punpcklbw mmD,mmF               ; mmD=(11 21 13 23 15 25 17 27)

        movq      mmG,mmA
        movq      mmH,mmA
        punpcklwd mmA,mmE               ; mmA=(00 10 20 01 02 12 22 03)
        punpckhwd mmG,mmE               ; mmG=(04 14 24 05 06 16 26 07)

        psrlq     mmH,2*BYTE_BIT        ; mmH=(02 12 04 14 06 16 -- --)
        psrlq     mmE,2*BYTE_BIT        ; mmE=(22 03 24 05 26 07 -- --)

        movq      mmC,mmD
        movq      mmB,mmD
        punpcklwd mmD,mmH               ; mmD=(11 21 02 12 13 23 04 14)
        punpckhwd mmC,mmH               ; mmC=(15 25 06 16 17 27 -- --)

        psrlq     mmB,2*BYTE_BIT        ; mmB=(13 23 15 25 17 27 -- --)

        movq      mmF,mmE
        punpcklwd mmE,mmB               ; mmE=(22 03 13 23 24 05 15 25)
        punpckhwd mmF,mmB               ; mmF=(26 07 17 27 -- -- -- --)

        punpckldq mmA,mmD               ; mmA=(00 10 20 01 11 21 02 12)
        punpckldq mmE,mmG               ; mmE=(22 03 13 23 04 14 24 05)
        punpckldq mmC,mmF               ; mmC=(15 25 06 16 26 07 17 27)

        cmp     ecx, byte SIZEOF_MMWORD
        jb      short .column_st16      ; fewer than 8 columns left -> tail

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC

        sub     ecx, byte SIZEOF_MMWORD
        jz      short .nextrow

        add     esi, byte SIZEOF_MMWORD                 ; inptr0
        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
        add     edx, byte SIZEOF_MMWORD                 ; inptr2
        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
        jmp     near .columnloop
        alignx  16,7

        ; Tail: ecx is converted from columns to bytes, then the remaining
        ; bytes are stored in 16/8/4/2/1-byte pieces so nothing is written
        ; past the last pixel.
.column_st16:
        lea     ecx, [ecx+ecx*2]        ; imul ecx, RGB_PIXELSIZE
        cmp     ecx, byte 2*SIZEOF_MMWORD
        jb      short .column_st8
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmE
        movq    mmA,mmC                 ; next pending bytes -> mmA
        sub     ecx, byte 2*SIZEOF_MMWORD
        add     edi, byte 2*SIZEOF_MMWORD
        jmp     short .column_st4       ; fewer than 8 bytes can remain
.column_st8:
        cmp     ecx, byte SIZEOF_MMWORD
        jb      short .column_st4
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    mmA,mmE                 ; next pending bytes -> mmA
        sub     ecx, byte SIZEOF_MMWORD
        add     edi, byte SIZEOF_MMWORD
.column_st4:
        movd    eax,mmA                 ; eax (GOT address) becomes scratch;
                                        ;   .rowloop reloads it when needed
        cmp     ecx, byte SIZEOF_DWORD
        jb      short .column_st2
        mov     DWORD [edi+0*SIZEOF_DWORD], eax
        psrlq   mmA,DWORD_BIT
        movd    eax,mmA
        sub     ecx, byte SIZEOF_DWORD
        add     edi, byte SIZEOF_DWORD
.column_st2:
        cmp     ecx, byte SIZEOF_WORD
        jb      short .column_st1
        mov     WORD [edi+0*SIZEOF_WORD], ax
        shr     eax,WORD_BIT
        sub     ecx, byte SIZEOF_WORD
        add     edi, byte SIZEOF_WORD
.column_st1:
        cmp     ecx, byte SIZEOF_BYTE
        jb      short .nextrow
        mov     BYTE [edi+0*SIZEOF_BYTE], al

%else ; RGB_PIXELSIZE == 4 ; -----------

%ifdef RGBX_FILLER_0XFF
        pcmpeqb   mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
        pcmpeqb   mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
%else
        pxor      mm6,mm6               ; mm6=(X0 X2 X4 X6 ** ** ** **)
        pxor      mm7,mm7               ; mm7=(X1 X3 X5 X7 ** ** ** **)
%endif
        ; In the layout comments below, "cp" means byte c of pixel p.

        ; mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **)
        ; mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **)
        ; mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **)
        ; mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **)

        punpcklbw mmA,mmC               ; mmA=(00 10 02 12 04 14 06 16)
        punpcklbw mmE,mmG               ; mmE=(20 30 22 32 24 34 26 36)
        punpcklbw mmB,mmD               ; mmB=(01 11 03 13 05 15 07 17)
        punpcklbw mmF,mmH               ; mmF=(21 31 23 33 25 35 27 37)

        movq      mmC,mmA
        punpcklwd mmA,mmE               ; mmA=(00 10 20 30 02 12 22 32)
        punpckhwd mmC,mmE               ; mmC=(04 14 24 34 06 16 26 36)
        movq      mmG,mmB
        punpcklwd mmB,mmF               ; mmB=(01 11 21 31 03 13 23 33)
        punpckhwd mmG,mmF               ; mmG=(05 15 25 35 07 17 27 37)

        movq      mmD,mmA
        punpckldq mmA,mmB               ; mmA=(00 10 20 30 01 11 21 31)
        punpckhdq mmD,mmB               ; mmD=(02 12 22 32 03 13 23 33)
        movq      mmH,mmC
        punpckldq mmC,mmG               ; mmC=(04 14 24 34 05 15 25 35)
        punpckhdq mmH,mmG               ; mmH=(06 16 26 36 07 17 27 37)

        cmp     ecx, byte SIZEOF_MMWORD
        jb      short .column_st16      ; fewer than 8 columns left -> tail

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
        movq    MMWORD [edi+2*SIZEOF_MMWORD], mmC
        movq    MMWORD [edi+3*SIZEOF_MMWORD], mmH

        sub     ecx, byte SIZEOF_MMWORD
        jz      short .nextrow

        add     esi, byte SIZEOF_MMWORD                 ; inptr0
        add     ebx, byte SIZEOF_MMWORD                 ; inptr1
        add     edx, byte SIZEOF_MMWORD                 ; inptr2
        add     edi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; outptr
        jmp     near .columnloop
        alignx  16,7

        ; Tail: ecx stays in columns here; each MMWORD holds two 4-byte
        ; pixels, so the remainder is stored as 4, 2 and 1 pixel(s).
.column_st16:
        cmp     ecx, byte SIZEOF_MMWORD/2
        jb      short .column_st8
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    MMWORD [edi+1*SIZEOF_MMWORD], mmD
        movq    mmA,mmC                 ; next pending pixels -> mmA/mmD
        movq    mmD,mmH
        sub     ecx, byte SIZEOF_MMWORD/2
        add     edi, byte 2*SIZEOF_MMWORD
.column_st8:
        cmp     ecx, byte SIZEOF_MMWORD/4
        jb      short .column_st4
        movq    MMWORD [edi+0*SIZEOF_MMWORD], mmA
        movq    mmA,mmD
        sub     ecx, byte SIZEOF_MMWORD/4
        add     edi, byte 1*SIZEOF_MMWORD
.column_st4:
        cmp     ecx, byte SIZEOF_MMWORD/8
        jb      short .nextrow
        movd    DWORD [edi+0*SIZEOF_DWORD], mmA

%endif ; RGB_PIXELSIZE ; ---------------

        alignx  16,7

.nextrow:
        ; Restore the values saved at the top of .rowloop (reverse order).
        pop     ecx
        pop     esi
        pop     ebx
        pop     edx
        pop     edi
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW
        add     ebx, byte SIZEOF_JSAMPROW
        add     edx, byte SIZEOF_JSAMPROW
        add     edi, byte SIZEOF_JSAMPROW       ; output_buf
        dec     eax                             ; num_rows
        jg      near .rowloop

        emms                            ; empty MMX state

.return:
        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
;       pop     ecx                     ; need not be preserved
        pop     ebx
        mov     esp,ebp                 ; esp <- aligned ebp
        pop     esp                     ; esp <- original esp (-> saved ebp)
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16