;
; jcgryss2.asm - grayscale colorspace conversion (SSE2)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; Copyright (C) 2011, D. R. Commander.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace
; (here: RGB input -> a single luma/Y output plane).
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                              JDIMENSION output_row, int num_rows);
;
; Arguments : read from the stack through the accessor macros below; the
;             base "b" is the value of esp right after "push ebp".
; Preserved : ebx, esi, edi, ebp, esp (saved/restored by this routine).
; Clobbered : eax, ecx, flags, and the SIMD registers used in the body.
; Scratch   : WK_NUM xmmwords on a 16-byte-aligned stack area, plus one
;             pointer-sized slot (gotptr) for the GOT address.
; Helpers   : EXTN, pushpic/poppic/movpic, get_GOT, GOTOFF, alignx and the
;             xmmA..xmmH names come from the included headers (not shown
;             in this file).
;

; ---- incoming arguments, addressed from the frame base ----
%define img_width(b)    (b)+8           ; JDIMENSION img_width
%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
%define output_row(b)   (b)+20          ; JDIMENSION output_row
%define num_rows(b)     (b)+24          ; int num_rows

; ---- local scratch, addressed from the aligned ebp ----
%define WK_NUM          2
%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr

    align 16

    global EXTN(jsimd_rgb_gray_convert_sse2)

EXTN(jsimd_rgb_gray_convert_sse2):
    ; --- prologue: build a 16-byte-aligned frame --------------------------
    push      ebp
    mov       eax, esp                  ; eax = frame base ([eax] = caller's ebp)
    sub       esp, byte 4               ; slot that will hold the frame base
    and       esp, byte (-SIZEOF_XMMWORD) ; round esp down to a 128-bit boundary
    mov       [esp], eax                ; remember the frame base for the epilogue
    mov       ebp, esp                  ; ebp = aligned frame pointer
    lea       esp, [wk(0)]              ; reserve wk[0..WK_NUM-1] below ebp
    pushpic   eax                       ; make room for the GOT address
    push      ebx
;   push      ecx                       ; need not be preserved
;   push      edx                       ; need not be preserved
    push      esi
    push      edi

    get_GOT   ebx                       ; get GOT address
    movpic    POINTER [gotptr], ebx     ; save GOT address

    ; --- argument fetch / early outs --------------------------------------
    mov       ecx, JDIMENSION [img_width(eax)]
    test      ecx, ecx
    jz        near .done                ; zero-width image: nothing to do

    push      ecx                       ; ecx is borrowed as a temporary below

    mov       esi, JSAMPIMAGE [output_buf(eax)]
    mov       ecx, JDIMENSION [output_row(eax)]
    mov       edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY] ; output_buf[0]
    lea       edi, [edi+ecx*SIZEOF_JSAMPROW]            ; &output_buf[0][output_row]

    pop       ecx                       ; ecx = img_width again

    mov       esi, JSAMPARRAY [input_buf(eax)]
    mov       eax, INT [num_rows(eax)]
    test      eax, eax
    jle       near .done                ; num_rows <= 0: nothing to do
    alignx 16,7

    ; --- per-row loop ------------------------------------------------------
    ; On entry to each iteration:
    ;   eax = rows still to convert, ecx = img_width,
    ;   esi = pointer into the input row-pointer array,
    ;   edi = pointer into the output row-pointer array.
.next_row:
    pushpic   eax
    push      edi
    push      esi
    push      ecx                       ; col

    mov       esi, JSAMPROW [esi]       ; inptr
    mov       edi, JSAMPROW [edi]       ; outptr0
    movpic    eax, POINTER [gotptr]     ; load GOT address (eax)

    cmp       ecx, byte SIZEOF_XMMWORD
    jae       near .col_block           ; a full block of columns is available
    alignx 16,7

%if RGB_PIXELSIZE == 3 ; ---------------

    ; Tail handling: fewer than SIZEOF_XMMWORD columns remain (ecx).
    ; ecx is turned into a byte count and the leftover bytes are gathered
    ; from the end of the row backwards, one size class per set bit of the
    ; count, so that no load reaches past the last pixel.
.tail_ld1:
    push      eax                       ; eax/edx are borrowed as scratch here
    push      edx
    lea       ecx, [ecx+ecx*2]          ; imul ecx,RGB_PIXELSIZE
    test      cl, SIZEOF_BYTE
    jz        short .tail_ld2
    sub       ecx, byte SIZEOF_BYTE
    movzx     eax, BYTE [esi+ecx]
.tail_ld2:
    test      cl, SIZEOF_WORD
    jz        short .tail_ld4
    sub       ecx, byte SIZEOF_WORD
    movzx     edx, WORD [esi+ecx]
    shl       eax, WORD_BIT             ; earlier piece moves up, new word goes below it
    or        eax, edx
.tail_ld4:
    movd      xmmA, eax
    pop       edx
    pop       eax                       ; scratch registers restored
    test      cl, SIZEOF_DWORD
    jz        short .tail_ld8
    sub       ecx, byte SIZEOF_DWORD
    movd      xmmF, XMM_DWORD [esi+ecx]
    pslldq    xmmA, SIZEOF_DWORD
    por       xmmA, xmmF
.tail_ld8:
    test      cl, SIZEOF_MMWORD
    jz        short .tail_ld16
    sub       ecx, byte SIZEOF_MMWORD
    movq      xmmB, XMM_MMWORD [esi+ecx]
    pslldq    xmmA, SIZEOF_MMWORD
    por       xmmA, xmmB
.tail_ld16:
    test      cl, SIZEOF_XMMWORD
    jz        short .tail_ld32
    movdqa    xmmF, xmmA                ; gathered tail becomes the second vector
    movdqu    xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
    mov       ecx, SIZEOF_XMMWORD       ; so the column loop ends after this block
    jmp       short .convert
.tail_ld32:
    test      cl, 2*SIZEOF_XMMWORD
    mov       ecx, SIZEOF_XMMWORD       ; mov leaves the flags from "test" intact
    jz        short .convert
    movdqa    xmmB, xmmA                ; gathered tail becomes the third vector
    movdqu    xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqu    xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
    jmp       short .convert
    alignx 16,7

    ; Full block: three unaligned vector loads of packed 3-byte pixels.
.col_block:
    movdqu    xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqu    xmmF, XMMWORD [esi+1*SIZEOF_XMMWORD]
    movdqu    xmmB, XMMWORD [esi+2*SIZEOF_XMMWORD]

.convert:
    ; Lane notation "cp": c = component index within the pixel,
    ; p = pixel index (hex) within the block.
    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    ; De-interleave pass 1
    movdqa    xmmG, xmmA
    pslldq    xmmA, 8                   ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    psrldq    xmmG, 8                   ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

    punpckhbw xmmA, xmmF                ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    pslldq    xmmF, 8                   ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

    punpcklbw xmmG, xmmB                ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    punpckhbw xmmF, xmmB                ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

    ; De-interleave pass 2
    movdqa    xmmD, xmmA
    pslldq    xmmA, 8                   ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    psrldq    xmmD, 8                   ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

    punpckhbw xmmA, xmmG                ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    pslldq    xmmG, 8                   ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

    punpcklbw xmmD, xmmF                ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    punpckhbw xmmG, xmmF                ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

    ; De-interleave pass 3
    movdqa    xmmE, xmmA
    pslldq    xmmA, 8                   ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    psrldq    xmmE, 8                   ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

    punpckhbw xmmA, xmmD                ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    pslldq    xmmD, 8                   ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

    punpcklbw xmmE, xmmG                ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    punpckhbw xmmD, xmmG                ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

    ; Widen bytes to words by interleaving with zero
    pxor      xmmH, xmmH

    movdqa    xmmC, xmmA
    punpcklbw xmmA, xmmH                ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw xmmC, xmmH                ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa    xmmB, xmmE
    punpcklbw xmmE, xmmH                ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw xmmB, xmmH                ; xmmB=(01 03 05 07 09 0B 0D 0F)

    movdqa    xmmF, xmmD
    punpcklbw xmmD, xmmH                ; xmmD=(11 13 15 17 19 1B 1D 1F)
    punpckhbw xmmF, xmmH                ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

    ; Tail handling: fewer than SIZEOF_XMMWORD columns remain (ecx).
    ; The leftover pixels are gathered from the end of the row backwards,
    ; one size class per set bit of the pixel count, so that no load
    ; reaches past the last pixel.
.tail_ld1:
    test      cl, SIZEOF_XMMWORD/16
    jz        short .tail_ld2
    sub       ecx, byte SIZEOF_XMMWORD/16
    movd      xmmA, XMM_DWORD [esi+ecx*RGB_PIXELSIZE]
.tail_ld2:
    test      cl, SIZEOF_XMMWORD/8
    jz        short .tail_ld4
    sub       ecx, byte SIZEOF_XMMWORD/8
    movq      xmmE, XMM_MMWORD [esi+ecx*RGB_PIXELSIZE]
    pslldq    xmmA, SIZEOF_MMWORD
    por       xmmA, xmmE
.tail_ld4:
    test      cl, SIZEOF_XMMWORD/4
    jz        short .tail_ld8
    sub       ecx, byte SIZEOF_XMMWORD/4
    movdqa    xmmE, xmmA                ; gathered tail becomes the second vector
    movdqu    xmmA, XMMWORD [esi+ecx*RGB_PIXELSIZE]
.tail_ld8:
    test      cl, SIZEOF_XMMWORD/2
    mov       ecx, SIZEOF_XMMWORD       ; mov leaves the flags from "test" intact;
                                        ; the column loop ends after this block
    jz        short .convert
    movdqa    xmmF, xmmA                ; gathered tail becomes vectors three and four
    movdqa    xmmH, xmmE
    movdqu    xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqu    xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
    jmp       short .convert
    alignx 16,7

    ; Full block: four unaligned vector loads of packed 4-byte pixels.
.col_block:
    movdqu    xmmA, XMMWORD [esi+0*SIZEOF_XMMWORD]
    movdqu    xmmE, XMMWORD [esi+1*SIZEOF_XMMWORD]
    movdqu    xmmF, XMMWORD [esi+2*SIZEOF_XMMWORD]
    movdqu    xmmH, XMMWORD [esi+3*SIZEOF_XMMWORD]

.convert:
    ; Lane notation "cp": c = component index within the pixel,
    ; p = pixel index (hex) within the block.
    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    ; De-interleave pass 1 (bytes)
    movdqa    xmmD, xmmA
    punpcklbw xmmA, xmmE                ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    punpckhbw xmmD, xmmE                ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

    movdqa    xmmC, xmmF
    punpcklbw xmmF, xmmH                ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    punpckhbw xmmC, xmmH                ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

    ; De-interleave pass 2 (words)
    movdqa    xmmB, xmmA
    punpcklwd xmmA, xmmF                ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    punpckhwd xmmB, xmmF                ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

    movdqa    xmmG, xmmD
    punpcklwd xmmD, xmmC                ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    punpckhwd xmmG, xmmC                ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

    ; De-interleave pass 3 (bytes)
    movdqa    xmmE, xmmA
    punpcklbw xmmA, xmmD                ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    punpckhbw xmmE, xmmD                ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

    movdqa    xmmH, xmmB
    punpcklbw xmmB, xmmG                ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    punpckhbw xmmH, xmmG                ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

    ; Widen bytes to words by interleaving with zero
    pxor      xmmF, xmmF

    movdqa    xmmC, xmmA
    punpcklbw xmmA, xmmF                ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw xmmC, xmmF                ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa    xmmD, xmmB
    punpcklbw xmmB, xmmF                ; xmmB=(01 03 05 07 09 0B 0D 0F)
    punpckhbw xmmD, xmmF                ; xmmD=(11 13 15 17 19 1B 1D 1F)

    movdqa    xmmG, xmmE
    punpcklbw xmmE, xmmF                ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw xmmG, xmmF                ; xmmG=(30 32 34 36 38 3A 3C 3E)

    ; Last pair: the zero register itself is a destination, so the bytes
    ; are placed in the high half of each word and shifted back down.
    punpcklbw xmmF, xmmH
    punpckhbw xmmH, xmmH
    psrlw     xmmF, BYTE_BIT            ; xmmF=(21 23 25 27 29 2B 2D 2F)
    psrlw     xmmH, BYTE_BIT            ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ;
    ; NOTE(review): the G weight is split across the two pmaddwd constant
    ; tables (PW_F0299_F0337 / PW_F0114_F0250), presumably so each
    ; coefficient fits a signed word -- the tables are defined outside
    ; this file; confirm there.

    ; R/G partial sums, odd pixels
    movdqa    xmm6, xmm1
    punpcklwd xmm1, xmm3
    punpckhwd xmm6, xmm3
    pmaddwd   xmm1, [GOTOFF(eax,PW_F0299_F0337)] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    pmaddwd   xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)

    movdqa    xmm7, xmm6                ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)

    ; R/G partial sums, even pixels (parked in wk[] until B is folded in)
    movdqa    xmm6, xmm0
    punpcklwd xmm0, xmm2
    punpckhwd xmm6, xmm2
    pmaddwd   xmm0, [GOTOFF(eax,PW_F0299_F0337)] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    pmaddwd   xmm6, [GOTOFF(eax,PW_F0299_F0337)] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)

    movdqa    XMMWORD [wk(0)], xmm0     ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    movdqa    XMMWORD [wk(1)], xmm6     ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

    movdqa    xmm0, xmm5                ; xmm0=BO
    movdqa    xmm6, xmm4                ; xmm6=BE

    ; B/G partial sums, odd pixels
    movdqa    xmm4, xmm0
    punpcklwd xmm0, xmm3
    punpckhwd xmm4, xmm3
    pmaddwd   xmm0, [GOTOFF(eax,PW_F0114_F0250)] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    pmaddwd   xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)

    movdqa    xmm3, [GOTOFF(eax,PD_ONEHALF)] ; xmm3=[PD_ONEHALF]

    ; Combine, add the rounding term, scale down and narrow: odd Y
    paddd     xmm0, xmm1
    paddd     xmm4, xmm7
    paddd     xmm0, xmm3
    paddd     xmm4, xmm3
    psrld     xmm0, SCALEBITS           ; xmm0=YOL
    psrld     xmm4, SCALEBITS           ; xmm4=YOH
    packssdw  xmm0, xmm4                ; xmm0=YO

    ; B/G partial sums, even pixels
    movdqa    xmm4, xmm6
    punpcklwd xmm6, xmm2
    punpckhwd xmm4, xmm2
    pmaddwd   xmm6, [GOTOFF(eax,PW_F0114_F0250)] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    pmaddwd   xmm4, [GOTOFF(eax,PW_F0114_F0250)] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)

    movdqa    xmm2, [GOTOFF(eax,PD_ONEHALF)] ; xmm2=[PD_ONEHALF]

    ; Combine with the parked R/G sums, round, scale down and narrow: even Y
    paddd     xmm6, XMMWORD [wk(0)]
    paddd     xmm4, XMMWORD [wk(1)]
    paddd     xmm6, xmm2
    paddd     xmm4, xmm2
    psrld     xmm6, SCALEBITS           ; xmm6=YEL
    psrld     xmm4, SCALEBITS           ; xmm4=YEH
    packssdw  xmm6, xmm4                ; xmm6=YE

    ; Re-interleave: odd samples go to the high byte of each word
    psllw     xmm0, BYTE_BIT
    por       xmm6, xmm0                ; xmm6=Y
    movdqa    XMMWORD [edi], xmm6       ; Save Y (aligned store: outptr0 must be 16-byte aligned)

    ; --- advance to the next block of columns -----------------------------
    sub       ecx, byte SIZEOF_XMMWORD
    add       esi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD ; inptr
    add       edi, byte SIZEOF_XMMWORD  ; outptr0
    cmp       ecx, byte SIZEOF_XMMWORD
    jae       near .col_block           ; another full block remains
    test      ecx, ecx
    jnz       near .tail_ld1            ; partial block remains

    ; --- advance to the next row ------------------------------------------
    pop       ecx                       ; col
    pop       esi
    pop       edi
    poppic    eax

    add       esi, byte SIZEOF_JSAMPROW ; input_buf
    add       edi, byte SIZEOF_JSAMPROW
    dec       eax                       ; num_rows
    jg        near .next_row

    ; --- epilogue ----------------------------------------------------------
.done:
    pop       edi
    pop       esi
;   pop       edx                       ; need not be preserved
;   pop       ecx                       ; need not be preserved
    pop       ebx
    mov       esp, ebp                  ; esp <- aligned ebp
    pop       esp                       ; esp <- frame base stored by the prologue
    pop       ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align 16