;
; jcclrss2.asm - colorspace conversion (SSE2)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                             JDIMENSION output_row, int num_rows);
;
; Overview
;   For each of num_rows rows, packed RGB pixels (RGB_PIXELSIZE bytes each,
;   3 or 4) are read from input_buf[row] and three separate byte planes are
;   written to output_buf[0..2][output_row + row] (Y, Cb, Cr respectively).
;   Pixels are handled SIZEOF_XMMWORD at a time.
;
; Arguments
;   All five are read from the stack through eax, which holds the value of
;   esp right after "push ebp"; the first argument is therefore at eax+8.
;
; Register roles inside the row loop
;   esi = inptr                    ecx = columns still to convert
;   edi = outptr0 (Y)              eax = GOT address (PIC builds), otherwise
;   ebx = outptr1 (Cb)                   the remaining-row counter
;   edx = outptr2 (Cr)
;   xmmA..xmmH are register aliases that are not defined in this file
;   (presumably they come from jcolsamp.inc -- confirm there).
;
; Alignment / padding
;   Input rows are read with movdqu (any alignment).  Output rows are written
;   with movdqa, so every output row pointer must be 16-byte aligned.  A
;   trailing partial group of columns is still stored as one full vector, so
;   output rows are presumably padded to a multiple of SIZEOF_XMMWORD bytes
;   -- verify against the caller.
;
; Preserved registers: ebx, esi, edi, ebp.  eax, ecx, edx and xmm0-xmm7 are
; modified.
;

; Argument offsets relative to b = address of the saved ebp.
%define img_width(b)    (b)+8           ; JDIMENSION img_width
%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
%define output_row(b)   (b)+20          ; JDIMENSION output_row
%define num_rows(b)     (b)+24          ; int num_rows

; Local frame: [ebp] holds the pre-alignment stack pointer, the WK_NUM
; vector scratch slots wk(0)..wk(WK_NUM-1) lie directly below the aligned
; ebp, and the saved GOT pointer sits just below wk(0).
%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          8
%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr

        align   16

        global  EXTN(jsimd_rgb_ycc_convert_sse2)

EXTN(jsimd_rgb_ycc_convert_sse2):
        ; ---- Prologue: build a 16-byte-aligned frame -----------------------
        push    ebp
        mov     eax,esp                         ; eax -> saved ebp (argument base)
        sub     esp, byte 4                     ; room for the saved stack pointer
        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [esp],eax                       ; remember the unaligned frame address
        mov     ebp,esp                         ; ebp = aligned frame pointer
        lea     esp, [wk(0)]                    ; reserve the wk[] scratch vectors
        pushpic eax                             ; reserve the gotptr slot
        push    ebx
;       push    ecx                             ; need not be preserved
;       push    edx                             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx                             ; get GOT address
        movpic  POINTER [gotptr], ebx           ; save GOT address

        ; ---- Nothing to do for an empty image width ------------------------
        mov     ecx, JDIMENSION [img_width(eax)]
        test    ecx,ecx
        jz      near .return

        push    ecx                             ; ecx is borrowed for output_row below

        ; ---- Locate the first output row of each of the three planes -------
        mov     esi, JSAMPIMAGE [output_buf(eax)]
        mov     ecx, JDIMENSION [output_row(eax)]
        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
        mov     ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
        mov     edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]  ; edi = &output_buf[0][output_row]
        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]  ; ebx = &output_buf[1][output_row]
        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]  ; edx = &output_buf[2][output_row]

        pop     ecx                             ; ecx = img_width again

        mov     esi, JSAMPARRAY [input_buf(eax)]
        mov     eax, INT [num_rows(eax)]        ; eax = row counter from here on
        test    eax,eax
        jle     near .return                    ; signed: zero or negative row count
        alignx  16,7

        ; ====================================================================
        ; Row loop.  On entry esi/edi/ebx/edx point into the row-pointer
        ; arrays, ecx = img_width, eax = rows remaining.  All of them are
        ; saved here and restored at the bottom of the loop.
        ; ====================================================================
.rowloop:
        pushpic eax
        push    edx
        push    ebx
        push    edi
        push    esi
        push    ecx                             ; col

        mov     esi, JSAMPROW [esi]             ; inptr
        mov     edi, JSAMPROW [edi]             ; outptr0
        mov     ebx, JSAMPROW [ebx]             ; outptr1
        mov     edx, JSAMPROW [edx]             ; outptr2
        movpic  eax, POINTER [gotptr]           ; load GOT address (eax)

        cmp     ecx, byte SIZEOF_XMMWORD
        jae     near .columnloop                ; a full vector of pixels is available
        alignx  16,7

%if RGB_PIXELSIZE == 3 ; ---------------

        ; --------------------------------------------------------------------
        ; Partial group (fewer than SIZEOF_XMMWORD pixels left), 3 bytes per
        ; pixel.  The remaining byte count is consumed from the end of the
        ; row towards its start in 1/2/4/8/16/32-byte pieces, one piece per
        ; set bit of the count.  Each piece is merged below the bytes already
        ; gathered so the data ends up in the same register layout that
        ; .columnloop produces.  Lanes that are never loaded keep whatever
        ; they held before.
        ; --------------------------------------------------------------------
.column_ld1:
        push    eax                             ; eax/edx serve as scratch here
        push    edx
        lea     ecx,[ecx+ecx*2]                 ; imul ecx,RGB_PIXELSIZE
        test    cl, SIZEOF_BYTE
        jz      short .column_ld2
        sub     ecx, byte SIZEOF_BYTE
        movzx   eax, BYTE [esi+ecx]
.column_ld2:
        test    cl, SIZEOF_WORD
        jz      short .column_ld4
        sub     ecx, byte SIZEOF_WORD
        movzx   edx, WORD [esi+ecx]
        shl     eax, WORD_BIT
        or      eax,edx
.column_ld4:
        movd    xmmA,eax
        pop     edx
        pop     eax
        test    cl, SIZEOF_DWORD
        jz      short .column_ld8
        sub     ecx, byte SIZEOF_DWORD
        movd    xmmF, XMM_DWORD [esi+ecx]
        pslldq  xmmA, SIZEOF_DWORD
        por     xmmA,xmmF
.column_ld8:
        test    cl, SIZEOF_MMWORD
        jz      short .column_ld16
        sub     ecx, byte SIZEOF_MMWORD
        movq    xmmB, XMM_MMWORD [esi+ecx]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmB
.column_ld16:
        ; The byte count is below 3*SIZEOF_XMMWORD, so the SIZEOF_XMMWORD and
        ; 2*SIZEOF_XMMWORD bits are never both set: at most one of the two
        ; branches below loads data.
        test    cl, SIZEOF_XMMWORD
        jz      short .column_ld32
        movdqa  xmmF,xmmA
        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
        mov     ecx, SIZEOF_XMMWORD             ; run exactly one conversion pass
        jmp     short .rgb_ycc_cnv
.column_ld32:
        test    cl, 2*SIZEOF_XMMWORD
        mov     ecx, SIZEOF_XMMWORD             ; mov leaves the flags from test intact
        jz      short .rgb_ycc_cnv
        movdqa  xmmB,xmmA
        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_ycc_cnv
        alignx  16,7

        ; --------------------------------------------------------------------
        ; Full group: load 3 vectors = SIZEOF_XMMWORD packed 3-byte pixels.
        ; --------------------------------------------------------------------
.columnloop:
        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
        movdqu  xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]

        ; --------------------------------------------------------------------
        ; De-interleave.  In the lane diagrams "cp" means byte c of pixel p
        ; (c = 0..2, p = 0..F).  The result is six vectors of 16-bit values:
        ; one per byte position, split into even and odd pixels.
        ; --------------------------------------------------------------------
.rgb_ycc_cnv:
        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

        movdqa    xmmG,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

        movdqa    xmmD,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

        movdqa    xmmE,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

        pxor      xmmH,xmmH     ; zero, used to widen bytes to words

        movdqa    xmmC,xmmA
        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa    xmmB,xmmE
        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)

        movdqa    xmmF,xmmD
        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

        ; --------------------------------------------------------------------
        ; Partial group (fewer than SIZEOF_XMMWORD pixels left), 4 bytes per
        ; pixel.  Here ecx stays a pixel count; one piece of 1/2/4/8 pixels
        ; is fetched per set bit, working from the end of the row towards
        ; its start.  Lanes that are never loaded keep whatever they held
        ; before.
        ; --------------------------------------------------------------------
.column_ld1:
        test    cl, SIZEOF_XMMWORD/16
        jz      short .column_ld2
        sub     ecx, byte SIZEOF_XMMWORD/16
        movd    xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld2:
        test    cl, SIZEOF_XMMWORD/8
        jz      short .column_ld4
        sub     ecx, byte SIZEOF_XMMWORD/8
        movq    xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmE
.column_ld4:
        test    cl, SIZEOF_XMMWORD/4
        jz      short .column_ld8
        sub     ecx, byte SIZEOF_XMMWORD/4
        movdqa  xmmE,xmmA
        movdqu  xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
.column_ld8:
        test    cl, SIZEOF_XMMWORD/2
        mov     ecx, SIZEOF_XMMWORD             ; mov leaves the flags from test intact
        jz      short .rgb_ycc_cnv
        movdqa  xmmF,xmmA
        movdqa  xmmH,xmmE
        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_ycc_cnv
        alignx  16,7

        ; --------------------------------------------------------------------
        ; Full group: load 4 vectors = SIZEOF_XMMWORD packed 4-byte pixels.
        ; --------------------------------------------------------------------
.columnloop:
        movdqu  xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
        movdqu  xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]

        ; --------------------------------------------------------------------
        ; De-interleave.  In the lane diagrams "cp" means byte c of pixel p
        ; (c = 0..3, p = 0..F).  The result is eight vectors of 16-bit
        ; values: one per byte position, split into even and odd pixels.
        ; --------------------------------------------------------------------
.rgb_ycc_cnv:
        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

        movdqa    xmmD,xmmA
        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

        movdqa    xmmC,xmmF
        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

        movdqa    xmmB,xmmA
        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

        movdqa    xmmG,xmmD
        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

        movdqa    xmmE,xmmA
        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

        movdqa    xmmH,xmmB
        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

        pxor      xmmF,xmmF     ; zero, used to widen bytes to words

        movdqa    xmmC,xmmA
        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa    xmmD,xmmB
        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)

        movdqa    xmmG,xmmE
        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)

        ; No zero register is left over for the last pair: the data bytes are
        ; unpacked into the HIGH byte of each word, then shifted back down.
        punpcklbw xmmF,xmmH
        punpckhbw xmmH,xmmH
        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

        ; ====================================================================
        ; Colorspace arithmetic.  Suffixes used below:
        ;   E / O = even / odd pixels,  L / H = low / high four of a vector.
        ; ====================================================================

        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

        ; (Original)
        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
        ;
        ; (This implementation)
        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
        ;
        ; The G weight of Y is split in two so that every product is a
        ; two-term pmaddwd.  The 0.5 terms use no multiply: the sample is
        ; unpacked into the upper 16 bits of a dword (the lower half comes
        ; from a zeroed register) and shifted right by one, giving
        ; sample * 2^15.  That equals sample * FIX(0.5) provided SCALEBITS
        ; is 16 -- SCALEBITS is defined outside this file, confirm there.

        ; Spill R and B: all eight xmm registers are needed as temporaries.
        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=RE
        movdqa    XMMWORD [wk(1)], xmm1 ; wk(1)=RO
        movdqa    XMMWORD [wk(2)], xmm4 ; wk(2)=BE
        movdqa    XMMWORD [wk(3)], xmm5 ; wk(3)=BO

        ; ---- Odd pixels: R/G products for Y (kept for later) and for Cb ----
        movdqa    xmm6,xmm1
        punpcklwd xmm1,xmm3
        punpckhwd xmm6,xmm3
        movdqa    xmm7,xmm1
        movdqa    xmm4,xmm6
        pmaddwd   xmm1,[GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
        pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
        pmaddwd   xmm7,[GOTOFF(eax,PW_MF016_MF033)] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
        pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

        movdqa    XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
        movdqa    XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

        ; ---- Cb, odd pixels ------------------------------------------------
        pxor      xmm1,xmm1
        pxor      xmm6,xmm6
        punpcklwd xmm1,xmm5             ; xmm1=BOL
        punpckhwd xmm6,xmm5             ; xmm6=BOH
        psrld     xmm1,1                ; xmm1=BOL*FIX(0.500)
        psrld     xmm6,1                ; xmm6=BOH*FIX(0.500)

        movdqa    xmm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm5=[PD_ONEHALFM1_CJ]

        paddd     xmm7,xmm1
        paddd     xmm4,xmm6
        paddd     xmm7,xmm5
        paddd     xmm4,xmm5
        psrld     xmm7,SCALEBITS        ; xmm7=CbOL
        psrld     xmm4,SCALEBITS        ; xmm4=CbOH
        packssdw  xmm7,xmm4             ; xmm7=CbO

        movdqa    xmm1, XMMWORD [wk(2)] ; xmm1=BE

        ; ---- Even pixels: R/G products for Y (kept for later) and for Cb ---
        movdqa    xmm6,xmm0
        punpcklwd xmm0,xmm2
        punpckhwd xmm6,xmm2
        movdqa    xmm5,xmm0
        movdqa    xmm4,xmm6
        pmaddwd   xmm0,[GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
        pmaddwd   xmm6,[GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
        pmaddwd   xmm5,[GOTOFF(eax,PW_MF016_MF033)] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
        pmaddwd   xmm4,[GOTOFF(eax,PW_MF016_MF033)] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

        movdqa    XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
        movdqa    XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

        ; ---- Cb, even pixels -----------------------------------------------
        pxor      xmm0,xmm0
        pxor      xmm6,xmm6
        punpcklwd xmm0,xmm1             ; xmm0=BEL
        punpckhwd xmm6,xmm1             ; xmm6=BEH
        psrld     xmm0,1                ; xmm0=BEL*FIX(0.500)
        psrld     xmm6,1                ; xmm6=BEH*FIX(0.500)

        movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]

        paddd     xmm5,xmm0
        paddd     xmm4,xmm6
        paddd     xmm5,xmm1
        paddd     xmm4,xmm1
        psrld     xmm5,SCALEBITS        ; xmm5=CbEL
        psrld     xmm4,SCALEBITS        ; xmm4=CbEH
        packssdw  xmm5,xmm4             ; xmm5=CbE

        ; Odd results move to the high byte of each word and even results
        ; stay in the low byte, which puts the bytes back into pixel order.
        psllw     xmm7,BYTE_BIT
        por       xmm5,xmm7             ; xmm5=Cb
        movdqa    XMMWORD [ebx], xmm5   ; Save Cb

        movdqa    xmm0, XMMWORD [wk(3)] ; xmm0=BO
        movdqa    xmm6, XMMWORD [wk(2)] ; xmm6=BE
        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=RO

        ; ---- Odd pixels: B/G products for Y and for Cr ---------------------
        movdqa    xmm4,xmm0
        punpcklwd xmm0,xmm3
        punpckhwd xmm4,xmm3
        movdqa    xmm7,xmm0
        movdqa    xmm5,xmm4
        pmaddwd   xmm0,[GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
        pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
        pmaddwd   xmm7,[GOTOFF(eax,PW_MF008_MF041)] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
        pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

        movdqa    xmm3,[GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]

        ; ---- Y, odd pixels: add the R/G part saved in wk(4)/wk(5) ----------
        paddd     xmm0, XMMWORD [wk(4)]
        paddd     xmm4, XMMWORD [wk(5)]
        paddd     xmm0,xmm3
        paddd     xmm4,xmm3
        psrld     xmm0,SCALEBITS        ; xmm0=YOL
        psrld     xmm4,SCALEBITS        ; xmm4=YOH
        packssdw  xmm0,xmm4             ; xmm0=YO

        ; ---- Cr, odd pixels ------------------------------------------------
        pxor      xmm3,xmm3
        pxor      xmm4,xmm4
        punpcklwd xmm3,xmm1             ; xmm3=ROL
        punpckhwd xmm4,xmm1             ; xmm4=ROH
        psrld     xmm3,1                ; xmm3=ROL*FIX(0.500)
        psrld     xmm4,1                ; xmm4=ROH*FIX(0.500)

        movdqa    xmm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm1=[PD_ONEHALFM1_CJ]

        paddd     xmm7,xmm3
        paddd     xmm5,xmm4
        paddd     xmm7,xmm1
        paddd     xmm5,xmm1
        psrld     xmm7,SCALEBITS        ; xmm7=CrOL
        psrld     xmm5,SCALEBITS        ; xmm5=CrOH
        packssdw  xmm7,xmm5             ; xmm7=CrO

        movdqa    xmm3, XMMWORD [wk(0)] ; xmm3=RE

        ; ---- Even pixels: B/G products for Y and for Cr --------------------
        movdqa    xmm4,xmm6
        punpcklwd xmm6,xmm2
        punpckhwd xmm4,xmm2
        movdqa    xmm1,xmm6
        movdqa    xmm5,xmm4
        pmaddwd   xmm6,[GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
        pmaddwd   xmm4,[GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
        pmaddwd   xmm1,[GOTOFF(eax,PW_MF008_MF041)] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
        pmaddwd   xmm5,[GOTOFF(eax,PW_MF008_MF041)] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

        movdqa    xmm2,[GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]

        ; ---- Y, even pixels: add the R/G part saved in wk(6)/wk(7) ---------
        paddd     xmm6, XMMWORD [wk(6)]
        paddd     xmm4, XMMWORD [wk(7)]
        paddd     xmm6,xmm2
        paddd     xmm4,xmm2
        psrld     xmm6,SCALEBITS        ; xmm6=YEL
        psrld     xmm4,SCALEBITS        ; xmm4=YEH
        packssdw  xmm6,xmm4             ; xmm6=YE

        psllw     xmm0,BYTE_BIT         ; odd Y -> high byte of each word
        por       xmm6,xmm0             ; xmm6=Y
        movdqa    XMMWORD [edi], xmm6   ; Save Y

        ; ---- Cr, even pixels -----------------------------------------------
        pxor      xmm2,xmm2
        pxor      xmm4,xmm4
        punpcklwd xmm2,xmm3             ; xmm2=REL
        punpckhwd xmm4,xmm3             ; xmm4=REH
        psrld     xmm2,1                ; xmm2=REL*FIX(0.500)
        psrld     xmm4,1                ; xmm4=REH*FIX(0.500)

        movdqa    xmm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; xmm0=[PD_ONEHALFM1_CJ]

        paddd     xmm1,xmm2
        paddd     xmm5,xmm4
        paddd     xmm1,xmm0
        paddd     xmm5,xmm0
        psrld     xmm1,SCALEBITS        ; xmm1=CrEL
        psrld     xmm5,SCALEBITS        ; xmm5=CrEH
        packssdw  xmm1,xmm5             ; xmm1=CrE

        psllw     xmm7,BYTE_BIT         ; odd Cr -> high byte of each word
        por       xmm1,xmm7             ; xmm1=Cr
        movdqa    XMMWORD [edx], xmm1   ; Save Cr

        ; ---- Advance to the next group of columns --------------------------
        sub     ecx, byte SIZEOF_XMMWORD
        add     esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
        add     edi, byte SIZEOF_XMMWORD                ; outptr0
        add     ebx, byte SIZEOF_XMMWORD                ; outptr1
        add     edx, byte SIZEOF_XMMWORD                ; outptr2
        cmp     ecx, byte SIZEOF_XMMWORD
        jae     near .columnloop        ; unsigned: another full group remains
        test    ecx,ecx
        jnz     near .column_ld1        ; 1..SIZEOF_XMMWORD-1 columns remain

        ; ---- End of row: restore the row-level state saved at .rowloop -----
        pop     ecx                     ; col
        pop     esi
        pop     edi
        pop     ebx
        pop     edx
        poppic  eax

        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
        add     edi, byte SIZEOF_JSAMPROW
        add     ebx, byte SIZEOF_JSAMPROW
        add     edx, byte SIZEOF_JSAMPROW
        dec     eax                             ; num_rows
        jg      near .rowloop

        ; ---- Epilogue ------------------------------------------------------
.return:
        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
;       pop     ecx                     ; need not be preserved
        pop     ebx
        mov     esp,ebp                 ; esp = aligned frame pointer
        pop     esp                     ; esp = frame address stored by the prologue
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16