;
; jcclrmmx.asm - colorspace conversion (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
;                            JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                            JDIMENSION output_row, int num_rows);
;
; Syntax:  NASM, 32-bit x86, MMX.
; In:      all five arguments are read from the caller's stack at the
;          offsets defined just below (relative to the saved-ebp slot).
; Out:     nothing is returned; the three rows-of-rows reached through
;          output_buf[0..2] are written starting at row index output_row.
; Saved:   ebp, ebx, esi, edi are pushed/popped here.  ecx and edx are
;          deliberately not saved (see the commented-out push/pop pairs).
; Stack:   WK_NUM mmwords of scratch (wk[]) below an ebp that is re-aligned
;          to SIZEOF_MMWORD, plus one pointer-sized slot (gotptr).
; Notes:   - Returns immediately, without touching any MMX register, when
;            img_width == 0 or num_rows <= 0.
;          - Output is always stored one whole MMWORD at a time, including
;            for the final partial group of a row.
;            NOTE(review): this presumably relies on the caller's row
;            buffers being padded to a multiple of SIZEOF_MMWORD - confirm.
;          - RGB_PIXELSIZE (3 or 4) selects one of two load/deinterleave
;            paths at assembly time; it is not defined in this file.
;          - mmA..mmH, pushpic/poppic/movpic, get_GOT, GOTOFF, alignx, EXTN
;            and the PW_*/PD_* constant tables are not defined in this file;
;            presumably they come from jcolsamp.inc / jsimdext.inc.
;

; Argument offsets.  (b) is the address of the saved-ebp slot, so +4 is the
; return address and the first argument sits at +8.
%define img_width(b)    (b)+8           ; JDIMENSION img_width
%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
%define output_row(b)   (b)+20          ; JDIMENSION output_row
%define num_rows(b)     (b)+24          ; int num_rows

; Local frame, addressed from the *aligned* ebp set up in the prologue.
%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
%define WK_NUM          8
%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr

        align   16
        global  EXTN(jsimd_rgb_ycc_convert_mmx)

EXTN(jsimd_rgb_ycc_convert_mmx):
        ; ---- Prologue: build an MMWORD-aligned frame -----------------------
        push    ebp
        mov     eax,esp                 ; eax = address of the saved ebp
                                        ;   (arguments are at eax+8 ...)
        sub     esp, byte 4             ; room for the back-link stored below
        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
        mov     [esp],eax               ; [original_ebp] = pre-alignment esp
        mov     ebp,esp                 ; ebp = aligned ebp
        lea     esp, [wk(0)]            ; reserve wk[0..WK_NUM-1]
        pushpic eax                     ; make room for the GOT address
        push    ebx
;       push    ecx                     ; need not be preserved
;       push    edx                     ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                     ; get GOT address
        movpic  POINTER [gotptr], ebx   ; save GOT address

        mov     ecx, JDIMENSION [img_width(eax)]        ; num_cols
        test    ecx,ecx
        jz      near .return            ; zero-width image: nothing to do

        push    ecx                     ; ecx is borrowed for output_row

        ; edi/ebx/edx = &output_buf[0..2][output_row]
        mov     esi, JSAMPIMAGE [output_buf(eax)]
        mov     ecx, JDIMENSION [output_row(eax)]
        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
        mov     ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
        mov     edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]
        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]
        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]

        pop     ecx                     ; ecx = num_cols again

        mov     esi, JSAMPARRAY [input_buf(eax)]
        mov     eax, INT [num_rows(eax)]        ; last use of eax as arg base
        test    eax,eax
        jle     near .return            ; num_rows <= 0 (signed): nothing to do
        alignx  16,7

        ; ---- Row loop ------------------------------------------------------
        ; On entry to each iteration:
        ;   esi = current slot in input_buf      eax = rows remaining
        ;   edi/ebx/edx = current slots in the three output row arrays
        ;   ecx = num_cols
.rowloop:
        pushpic eax                     ; eax is reloaded with the GOT address
                                        ;   below; restored by poppic
        push    edx
        push    ebx
        push    edi
        push    esi
        push    ecx                     ; col

        mov     esi, JSAMPROW [esi]     ; inptr
        mov     edi, JSAMPROW [edi]     ; outptr0
        mov     ebx, JSAMPROW [ebx]     ; outptr1
        mov     edx, JSAMPROW [edx]     ; outptr2
        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)

        cmp     ecx, byte SIZEOF_MMWORD
        jae     short .columnloop       ; a full group is available
                                        ; otherwise fall into the tail loader
        alignx  16,7

%if RGB_PIXELSIZE == 3 ; ---------------

        ; ---- Tail loader, 3 bytes/pixel ------------------------------------
        ; Fewer than SIZEOF_MMWORD columns remain.  ecx becomes the remaining
        ; byte count, which is consumed from its lowest bit upward
        ; (1, 2, 4, 8, 16 bytes) while reading the buffer from its end back
        ; toward esi, so only bytes inside the row are loaded.
        ; eax/edx are borrowed as scratch and restored at .column_ld4.
.column_ld1:
        push    eax
        push    edx
        lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE
        test    cl, SIZEOF_BYTE
        jz      short .column_ld2
        sub     ecx, byte SIZEOF_BYTE
        xor     eax,eax
        mov     al, BYTE [esi+ecx]
.column_ld2:
        test    cl, SIZEOF_WORD
        jz      short .column_ld4
        sub     ecx, byte SIZEOF_WORD
        xor     edx,edx
        mov     dx, WORD [esi+ecx]
        shl     eax, WORD_BIT           ; eax was only cleared if the byte step
                                        ;   ran; NOTE(review): otherwise the
                                        ;   upper half appears to be don't-care
        or      eax,edx
.column_ld4:
        movd    mmA,eax
        pop     edx
        pop     eax
        test    cl, SIZEOF_DWORD
        jz      short .column_ld8
        sub     ecx, byte SIZEOF_DWORD
        movd    mmG, DWORD [esi+ecx]
        psllq   mmA, DWORD_BIT
        por     mmA,mmG
.column_ld8:
        test    cl, SIZEOF_MMWORD
        jz      short .column_ld16
        movq    mmG,mmA
        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
        mov     ecx, SIZEOF_MMWORD      ; so the loop tail leaves ecx == 0
        jmp     short .rgb_ycc_cnv
.column_ld16:
        test    cl, 2*SIZEOF_MMWORD
        mov     ecx, SIZEOF_MMWORD      ; mov leaves the flags from test intact
        jz      short .rgb_ycc_cnv
        movq    mmF,mmA
        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
        jmp     short .rgb_ycc_cnv
        alignx  16,7

        ; ---- Full group, 3 bytes/pixel: three MMWORDs of packed input ------
.columnloop:
        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
        movq    mmF, MMWORD [esi+2*SIZEOF_MMWORD]

        ; Deinterleave.  In the lane maps below each entry is "cp":
        ; c = byte index within the pixel (0..2), p = pixel index (0..7).
.rgb_ycc_cnv:
                                        ; mmA=(00 10 20 01 11 21 02 12)
                                        ; mmG=(22 03 13 23 04 14 24 05)
                                        ; mmF=(15 25 06 16 26 07 17 27)

        movq    mmD,mmA
        psllq   mmA,4*BYTE_BIT          ; mmA=(-- -- -- -- 00 10 20 01)
        psrlq   mmD,4*BYTE_BIT          ; mmD=(11 21 02 12 -- -- -- --)

        punpckhbw mmA,mmG               ; mmA=(00 04 10 14 20 24 01 05)
        psllq   mmG,4*BYTE_BIT          ; mmG=(-- -- -- -- 22 03 13 23)

        punpcklbw mmD,mmF               ; mmD=(11 15 21 25 02 06 12 16)
        punpckhbw mmG,mmF               ; mmG=(22 26 03 07 13 17 23 27)

        movq    mmE,mmA
        psllq   mmA,4*BYTE_BIT          ; mmA=(-- -- -- -- 00 04 10 14)
        psrlq   mmE,4*BYTE_BIT          ; mmE=(20 24 01 05 -- -- -- --)

        punpckhbw mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
        psllq   mmD,4*BYTE_BIT          ; mmD=(-- -- -- -- 11 15 21 25)

        punpcklbw mmE,mmG               ; mmE=(20 22 24 26 01 03 05 07)
        punpckhbw mmD,mmG               ; mmD=(11 13 15 17 21 23 25 27)

        pxor    mmH,mmH                 ; zero, used to widen bytes to words

        movq    mmC,mmA
        punpcklbw mmA,mmH               ; mmA=(00 02 04 06)
        punpckhbw mmC,mmH               ; mmC=(10 12 14 16)

        movq    mmB,mmE
        punpcklbw mmE,mmH               ; mmE=(20 22 24 26)
        punpckhbw mmB,mmH               ; mmB=(01 03 05 07)

        movq    mmF,mmD
        punpcklbw mmD,mmH               ; mmD=(11 13 15 17)
        punpckhbw mmF,mmH               ; mmF=(21 23 25 27)

%else ; RGB_PIXELSIZE == 4 ; -----------

        ; ---- Tail loader, 4 bytes/pixel ------------------------------------
        ; ecx stays a pixel count here; its bits 0, 1 and 2 select a 1-, 2-
        ; and 4-pixel load respectively, again reading from the end of the
        ; remaining data back toward esi.
.column_ld1:
        test    cl, SIZEOF_MMWORD/8
        jz      short .column_ld2
        sub     ecx, byte SIZEOF_MMWORD/8
        movd    mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
        test    cl, SIZEOF_MMWORD/4
        jz      short .column_ld4
        sub     ecx, byte SIZEOF_MMWORD/4
        movq    mmF,mmA
        movq    mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld4:
        test    cl, SIZEOF_MMWORD/2
        mov     ecx, SIZEOF_MMWORD      ; mov leaves the flags from test intact;
                                        ;   the loop tail will then reach 0
        jz      short .rgb_ycc_cnv
        movq    mmD,mmA
        movq    mmC,mmF
        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
        jmp     short .rgb_ycc_cnv
        alignx  16,7

        ; ---- Full group, 4 bytes/pixel: four MMWORDs of packed input -------
.columnloop:
        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
        movq    mmD, MMWORD [esi+2*SIZEOF_MMWORD]
        movq    mmC, MMWORD [esi+3*SIZEOF_MMWORD]

        ; Deinterleave.  In the lane maps below each entry is "cp":
        ; c = byte index within the pixel (0..3), p = pixel index (0..7).
.rgb_ycc_cnv:
                                        ; mmA=(00 10 20 30 01 11 21 31)
                                        ; mmF=(02 12 22 32 03 13 23 33)
                                        ; mmD=(04 14 24 34 05 15 25 35)
                                        ; mmC=(06 16 26 36 07 17 27 37)

        movq    mmB,mmA
        punpcklbw mmA,mmF               ; mmA=(00 02 10 12 20 22 30 32)
        punpckhbw mmB,mmF               ; mmB=(01 03 11 13 21 23 31 33)

        movq    mmG,mmD
        punpcklbw mmD,mmC               ; mmD=(04 06 14 16 24 26 34 36)
        punpckhbw mmG,mmC               ; mmG=(05 07 15 17 25 27 35 37)

        movq    mmE,mmA
        punpcklwd mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
        punpckhwd mmE,mmD               ; mmE=(20 22 24 26 30 32 34 36)

        movq    mmH,mmB
        punpcklwd mmB,mmG               ; mmB=(01 03 05 07 11 13 15 17)
        punpckhwd mmH,mmG               ; mmH=(21 23 25 27 31 33 35 37)

        pxor    mmF,mmF                 ; zero, used to widen bytes to words

        movq    mmC,mmA
        punpcklbw mmA,mmF               ; mmA=(00 02 04 06)
        punpckhbw mmC,mmF               ; mmC=(10 12 14 16)

        movq    mmD,mmB
        punpcklbw mmB,mmF               ; mmB=(01 03 05 07)
        punpckhbw mmD,mmF               ; mmD=(11 13 15 17)

        movq    mmG,mmE
        punpcklbw mmE,mmF               ; mmE=(20 22 24 26)
        punpckhbw mmG,mmF               ; mmG=(30 32 34 36)

        ; mmF is still needed as a destination, so the last pair is widened
        ; by placing each byte in the high half of a word and shifting down.
        punpcklbw mmF,mmH
        punpckhbw mmH,mmH
        psrlw   mmF,BYTE_BIT            ; mmF=(21 23 25 27)
        psrlw   mmH,BYTE_BIT            ; mmH=(31 33 35 37)

%endif ; RGB_PIXELSIZE ; ---------------

        ; From here on the planes are addressed by physical register number.
        ; NOTE(review): the mmA..mmH -> mm0..mm7 mapping is not visible in this
        ; file; it is presumably chosen so that the following holds.

        ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
        ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO

        ; (Original)
        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
        ;
        ; (This implementation)
        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
        ;
        ; The G term of Y is split in two so that every product is a
        ; two-term pmaddwd: (R,G) pairs give the 0.299/0.337 part and
        ; (B,G) pairs give the 0.114/0.250 part.
        ; Suffixes used below: E/O = even/odd columns, L/H = low/high
        ; dword pair of the register.

        ; Spill the planes that are needed again after their registers
        ; have been reused.
        movq    MMWORD [wk(0)], mm0     ; wk(0)=RE
        movq    MMWORD [wk(1)], mm1     ; wk(1)=RO
        movq    MMWORD [wk(2)], mm4     ; wk(2)=BE
        movq    MMWORD [wk(3)], mm5     ; wk(3)=BO

        ; ---- Odd columns: (R,G) part of Y, and Cb --------------------------
        movq    mm6,mm1
        punpcklwd mm1,mm3
        punpckhwd mm6,mm3
        movq    mm7,mm1
        movq    mm4,mm6
        pmaddwd mm1,[GOTOFF(eax,PW_F0299_F0337)]        ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
        pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)]        ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
        pmaddwd mm7,[GOTOFF(eax,PW_MF016_MF033)]        ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
        pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)]        ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

        movq    MMWORD [wk(4)], mm1     ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
        movq    MMWORD [wk(5)], mm6     ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

        ; B*0.5 without a multiply: put B in the high word of each dword,
        ; then shift right by one.
        pxor    mm1,mm1
        pxor    mm6,mm6
        punpcklwd mm1,mm5               ; mm1=BOL
        punpckhwd mm6,mm5               ; mm6=BOH
        psrld   mm1,1                   ; mm1=BOL*FIX(0.500)
        psrld   mm6,1                   ; mm6=BOH*FIX(0.500)

        movq    mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)]       ; mm5=[PD_ONEHALFM1_CJ]

        paddd   mm7,mm1
        paddd   mm4,mm6
        paddd   mm7,mm5
        paddd   mm4,mm5
        psrld   mm7,SCALEBITS           ; mm7=CbOL
        psrld   mm4,SCALEBITS           ; mm4=CbOH
        packssdw mm7,mm4                ; mm7=CbO

        movq    mm1, MMWORD [wk(2)]     ; mm1=BE

        ; ---- Even columns: (R,G) part of Y, and Cb -------------------------
        movq    mm6,mm0
        punpcklwd mm0,mm2
        punpckhwd mm6,mm2
        movq    mm5,mm0
        movq    mm4,mm6
        pmaddwd mm0,[GOTOFF(eax,PW_F0299_F0337)]        ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
        pmaddwd mm6,[GOTOFF(eax,PW_F0299_F0337)]        ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
        pmaddwd mm5,[GOTOFF(eax,PW_MF016_MF033)]        ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
        pmaddwd mm4,[GOTOFF(eax,PW_MF016_MF033)]        ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

        movq    MMWORD [wk(6)], mm0     ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
        movq    MMWORD [wk(7)], mm6     ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

        pxor    mm0,mm0
        pxor    mm6,mm6
        punpcklwd mm0,mm1               ; mm0=BEL
        punpckhwd mm6,mm1               ; mm6=BEH
        psrld   mm0,1                   ; mm0=BEL*FIX(0.500)
        psrld   mm6,1                   ; mm6=BEH*FIX(0.500)

        movq    mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)]       ; mm1=[PD_ONEHALFM1_CJ]

        paddd   mm5,mm0
        paddd   mm4,mm6
        paddd   mm5,mm1
        paddd   mm4,mm1
        psrld   mm5,SCALEBITS           ; mm5=CbEL
        psrld   mm4,SCALEBITS           ; mm4=CbEH
        packssdw mm5,mm4                ; mm5=CbE

        ; Re-interleave: even results stay in the low byte of each word,
        ; odd results are moved to the high byte.
        psllw   mm7,BYTE_BIT
        por     mm5,mm7                 ; mm5=Cb
        movq    MMWORD [ebx], mm5       ; Save Cb

        movq    mm0, MMWORD [wk(3)]     ; mm0=BO
        movq    mm6, MMWORD [wk(2)]     ; mm6=BE
        movq    mm1, MMWORD [wk(1)]     ; mm1=RO

        ; ---- Odd columns: finish Y, and Cr ---------------------------------
        movq    mm4,mm0
        punpcklwd mm0,mm3
        punpckhwd mm4,mm3
        movq    mm7,mm0
        movq    mm5,mm4
        pmaddwd mm0,[GOTOFF(eax,PW_F0114_F0250)]        ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
        pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)]        ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
        pmaddwd mm7,[GOTOFF(eax,PW_MF008_MF041)]        ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
        pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)]        ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

        movq    mm3,[GOTOFF(eax,PD_ONEHALF)]    ; mm3=[PD_ONEHALF]

        paddd   mm0, MMWORD [wk(4)]     ; add the (R,G) part computed earlier
        paddd   mm4, MMWORD [wk(5)]
        paddd   mm0,mm3
        paddd   mm4,mm3
        psrld   mm0,SCALEBITS           ; mm0=YOL
        psrld   mm4,SCALEBITS           ; mm4=YOH
        packssdw mm0,mm4                ; mm0=YO

        pxor    mm3,mm3
        pxor    mm4,mm4
        punpcklwd mm3,mm1               ; mm3=ROL
        punpckhwd mm4,mm1               ; mm4=ROH
        psrld   mm3,1                   ; mm3=ROL*FIX(0.500)
        psrld   mm4,1                   ; mm4=ROH*FIX(0.500)

        movq    mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)]       ; mm1=[PD_ONEHALFM1_CJ]

        paddd   mm7,mm3
        paddd   mm5,mm4
        paddd   mm7,mm1
        paddd   mm5,mm1
        psrld   mm7,SCALEBITS           ; mm7=CrOL
        psrld   mm5,SCALEBITS           ; mm5=CrOH
        packssdw mm7,mm5                ; mm7=CrO

        movq    mm3, MMWORD [wk(0)]     ; mm3=RE

        ; ---- Even columns: finish Y, and Cr --------------------------------
        movq    mm4,mm6
        punpcklwd mm6,mm2
        punpckhwd mm4,mm2
        movq    mm1,mm6
        movq    mm5,mm4
        pmaddwd mm6,[GOTOFF(eax,PW_F0114_F0250)]        ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
        pmaddwd mm4,[GOTOFF(eax,PW_F0114_F0250)]        ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
        pmaddwd mm1,[GOTOFF(eax,PW_MF008_MF041)]        ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
        pmaddwd mm5,[GOTOFF(eax,PW_MF008_MF041)]        ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

        movq    mm2,[GOTOFF(eax,PD_ONEHALF)]    ; mm2=[PD_ONEHALF]

        paddd   mm6, MMWORD [wk(6)]     ; add the (R,G) part computed earlier
        paddd   mm4, MMWORD [wk(7)]
        paddd   mm6,mm2
        paddd   mm4,mm2
        psrld   mm6,SCALEBITS           ; mm6=YEL
        psrld   mm4,SCALEBITS           ; mm4=YEH
        packssdw mm6,mm4                ; mm6=YE

        psllw   mm0,BYTE_BIT
        por     mm6,mm0                 ; mm6=Y
        movq    MMWORD [edi], mm6       ; Save Y

        pxor    mm2,mm2
        pxor    mm4,mm4
        punpcklwd mm2,mm3               ; mm2=REL
        punpckhwd mm4,mm3               ; mm4=REH
        psrld   mm2,1                   ; mm2=REL*FIX(0.500)
        psrld   mm4,1                   ; mm4=REH*FIX(0.500)

        movq    mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)]       ; mm0=[PD_ONEHALFM1_CJ]

        paddd   mm1,mm2
        paddd   mm5,mm4
        paddd   mm1,mm0
        paddd   mm5,mm0
        psrld   mm1,SCALEBITS           ; mm1=CrEL
        psrld   mm5,SCALEBITS           ; mm5=CrEH
        packssdw mm1,mm5                ; mm1=CrE

        psllw   mm7,BYTE_BIT
        por     mm1,mm7                 ; mm1=Cr
        movq    MMWORD [edx], mm1       ; Save Cr

        ; ---- Advance to the next group of SIZEOF_MMWORD columns ------------
        sub     ecx, byte SIZEOF_MMWORD
        add     esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; inptr
        add     edi, byte SIZEOF_MMWORD                 ; outptr0
        add     ebx, byte SIZEOF_MMWORD                 ; outptr1
        add     edx, byte SIZEOF_MMWORD                 ; outptr2
        cmp     ecx, byte SIZEOF_MMWORD
        jae     near .columnloop        ; another full group remains
        test    ecx,ecx
        jnz     near .column_ld1        ; 1..SIZEOF_MMWORD-1 columns remain

        ; ---- End of row: restore the per-row pointers and step them --------
        pop     ecx                     ; col
        pop     esi
        pop     edi
        pop     ebx
        pop     edx
        poppic  eax

        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
        add     edi, byte SIZEOF_JSAMPROW
        add     ebx, byte SIZEOF_JSAMPROW
        add     edx, byte SIZEOF_JSAMPROW
        dec     eax                             ; num_rows
        jg      near .rowloop

        emms                            ; empty MMX state

        ; ---- Epilogue (also the target of both early exits) ----------------
.return:
        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
;       pop     ecx                     ; need not be preserved
        pop     ebx
        mov     esp,ebp                 ; esp <- aligned ebp
        pop     esp                     ; esp <- pre-alignment esp stored at
                                        ;   [original_ebp] by the prologue
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16