; Provenance: recovered from a changeset diff that adds
; b/media/libjpeg/simd/jcgryss2-64.asm (Wed Dec 31 06:09:35 2014 +0100).
;
; jcgryss2-64.asm - grayscale colorspace conversion (64-bit SSE2)
;
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; Copyright (C) 2011, D. R. Commander.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                              JDIMENSION output_row, int num_rows);
;
; Register roles after collect_args:
;
; r10 = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13 = JDIMENSION output_row
; r14 = int num_rows
;
; Notes:
;  * img_width, output_row and num_rows are read through their 32-bit
;    sub-registers (r10d/r13d/r14d).  The x86-64 calling conventions do not
;    define the upper 32 bits of a register that carries a 32-bit argument,
;    so reading the full 64-bit register could pick up garbage.
;    (JDIMENSION is assumed to be a 32-bit unsigned type, as in the IJG
;    headers; its definition is not visible in this file.)
;  * Output rows are written with movdqa, so each output row pointer must be
;    16-byte aligned.  A full 16-byte group is stored even for the final
;    partial group of a row, so each output row needs room for img_width
;    rounded up to a multiple of SIZEOF_XMMWORD samples.
;  * For the partial tail group, input lanes beyond img_width are not
;    initialised; the corresponding output samples are unspecified.
;  * Working registers: rax = remaining rows, rcx = remaining columns,
;    rsi = input row list / inptr, rdi = output row list / outptr0.
;    wk(0)/wk(1) are two xmmword scratch slots below the aligned rbp.
;

%define wk(i)   rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM  2

        align       16

        global      EXTN(jsimd_rgb_gray_convert_sse2)

EXTN(jsimd_rgb_gray_convert_sse2):
        push        rbp
        mov         rax, rsp                        ; rax = rsp after saving rbp
        sub         rsp, byte 4
        and         rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov         [rsp], rax                      ; reloaded by "pop rsp" in the epilogue
        mov         rbp, rsp                        ; rbp = aligned frame base
        lea         rsp, [wk(0)]                    ; reserve wk[WK_NUM]
        collect_args
        push        rbx

        mov         ecx, r10d                       ; col = img_width (32-bit arg, zero-extended)
        test        rcx, rcx
        jz          near .return

        push        rcx                             ; rcx is reused as the row index below

        mov         rsi, r12
        mov         ecx, r13d                       ; output_row (32-bit arg, zero-extended)
        mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
        lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &output_buf[0][output_row]

        pop         rcx                             ; rcx = img_width again

        mov         rsi, r11
        mov         eax, r14d
        test        eax, eax                        ; signed 32-bit test: also rejects num_rows < 0
        jle         near .return
.rowloop:
        push        rdi
        push        rsi
        push        rcx                             ; col

        mov         rsi, JSAMPROW [rsi]             ; inptr
        mov         rdi, JSAMPROW [rdi]             ; outptr0

        cmp         rcx, byte SIZEOF_XMMWORD
        jae         near .columnloop

%if RGB_PIXELSIZE == 3 ; ---------------

        ; Tail load (fewer than SIZEOF_XMMWORD pixels left): assemble the
        ; remaining col*3 bytes piecewise, from the end of the row backwards.
.column_ld1:
        push        rax
        push        rdx
        lea         rcx, [rcx+rcx*2]                ; imul ecx,RGB_PIXELSIZE
        test        cl, SIZEOF_BYTE
        jz          short .column_ld2
        sub         rcx, byte SIZEOF_BYTE
        movzx       rax, BYTE [rsi+rcx]
.column_ld2:
        test        cl, SIZEOF_WORD
        jz          short .column_ld4
        sub         rcx, byte SIZEOF_WORD
        movzx       rdx, WORD [rsi+rcx]
        shl         rax, WORD_BIT
        or          rax, rdx
.column_ld4:
        movd        xmmA, eax
        pop         rdx
        pop         rax
        test        cl, SIZEOF_DWORD
        jz          short .column_ld8
        sub         rcx, byte SIZEOF_DWORD
        movd        xmmF, XMM_DWORD [rsi+rcx]
        pslldq      xmmA, SIZEOF_DWORD
        por         xmmA, xmmF
.column_ld8:
        test        cl, SIZEOF_MMWORD
        jz          short .column_ld16
        sub         rcx, byte SIZEOF_MMWORD
        movq        xmmB, XMM_MMWORD [rsi+rcx]
        pslldq      xmmA, SIZEOF_MMWORD
        por         xmmA, xmmB
.column_ld16:
        test        cl, SIZEOF_XMMWORD
        jz          short .column_ld32
        movdqa      xmmF, xmmA
        movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        mov         rcx, SIZEOF_XMMWORD             ; treat the tail as one full group
        jmp         short .rgb_gray_cnv
.column_ld32:
        test        cl, 2*SIZEOF_XMMWORD
        mov         rcx, SIZEOF_XMMWORD             ; mov leaves the flags from test intact
        jz          short .rgb_gray_cnv
        movdqa      xmmB, xmmA
        movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp         short .rgb_gray_cnv

.columnloop:
        movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_gray_cnv:
        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

        movdqa      xmmG, xmmA
        pslldq      xmmA, 8         ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
        psrldq      xmmG, 8         ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

        punpckhbw   xmmA, xmmF      ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
        pslldq      xmmF, 8         ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

        punpcklbw   xmmG, xmmB      ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
        punpckhbw   xmmF, xmmB      ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

        movdqa      xmmD, xmmA
        pslldq      xmmA, 8         ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
        psrldq      xmmD, 8         ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

        punpckhbw   xmmA, xmmG      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
        pslldq      xmmG, 8         ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

        punpcklbw   xmmD, xmmF      ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
        punpckhbw   xmmG, xmmF      ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

        movdqa      xmmE, xmmA
        pslldq      xmmA, 8         ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
        psrldq      xmmE, 8         ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

        punpckhbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        pslldq      xmmD, 8         ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

        punpcklbw   xmmE, xmmG      ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
        punpckhbw   xmmD, xmmG      ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

        pxor        xmmH, xmmH      ; zero source for byte -> word widening

        movdqa      xmmC, xmmA
        punpcklbw   xmmA, xmmH      ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw   xmmC, xmmH      ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa      xmmB, xmmE
        punpcklbw   xmmE, xmmH      ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw   xmmB, xmmH      ; xmmB=(01 03 05 07 09 0B 0D 0F)

        movdqa      xmmF, xmmD
        punpcklbw   xmmD, xmmH      ; xmmD=(11 13 15 17 19 1B 1D 1F)
        punpckhbw   xmmF, xmmH      ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

        ; Tail load (fewer than SIZEOF_XMMWORD pixels left): assemble the
        ; remaining pixels piecewise, from the end of the row backwards.
.column_ld1:
        test        cl, SIZEOF_XMMWORD/16
        jz          short .column_ld2
        sub         rcx, byte SIZEOF_XMMWORD/16
        movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
        test        cl, SIZEOF_XMMWORD/8
        jz          short .column_ld4
        sub         rcx, byte SIZEOF_XMMWORD/8
        movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
        pslldq      xmmA, SIZEOF_MMWORD
        por         xmmA, xmmE
.column_ld4:
        test        cl, SIZEOF_XMMWORD/4
        jz          short .column_ld8
        sub         rcx, byte SIZEOF_XMMWORD/4
        movdqa      xmmE, xmmA
        movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
        test        cl, SIZEOF_XMMWORD/2
        mov         rcx, SIZEOF_XMMWORD             ; mov leaves the flags from test intact
        jz          short .rgb_gray_cnv
        movdqa      xmmF, xmmA
        movdqa      xmmH, xmmE
        movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp         short .rgb_gray_cnv

.columnloop:
        movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
        movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_gray_cnv:
        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

        movdqa      xmmD, xmmA
        punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
        punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

        movdqa      xmmC, xmmF
        punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
        punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

        movdqa      xmmB, xmmA
        punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
        punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

        movdqa      xmmG, xmmD
        punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
        punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

        movdqa      xmmE, xmmA
        punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

        movdqa      xmmH, xmmB
        punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
        punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

        pxor        xmmF, xmmF      ; zero source for byte -> word widening

        movdqa      xmmC, xmmA
        punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa      xmmD, xmmB
        punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
        punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)

        movdqa      xmmG, xmmE
        punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)

        ; xmmF is still zero here: interleave puts the data bytes in the
        ; high half of each word, then the shift moves them down.
        punpcklbw   xmmF, xmmH
        punpckhbw   xmmH, xmmH
        psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
        psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

        ; (Original)
        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
        ;
        ; (This implementation)
        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G

        movdqa      xmm6, xmm1
        punpcklwd   xmm1, xmm3
        punpckhwd   xmm6, xmm3
        pmaddwd     xmm1, [rel PW_F0299_F0337]      ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
        pmaddwd     xmm6, [rel PW_F0299_F0337]      ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)

        movdqa      xmm7, xmm6                      ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)

        movdqa      xmm6, xmm0
        punpcklwd   xmm0, xmm2
        punpckhwd   xmm6, xmm2
        pmaddwd     xmm0, [rel PW_F0299_F0337]      ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
        pmaddwd     xmm6, [rel PW_F0299_F0337]      ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)

        movdqa      XMMWORD [wk(0)], xmm0           ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
        movdqa      XMMWORD [wk(1)], xmm6           ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

        movdqa      xmm0, xmm5                      ; xmm0=BO
        movdqa      xmm6, xmm4                      ; xmm6=BE

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm3
        punpckhwd   xmm4, xmm3
        pmaddwd     xmm0, [rel PW_F0114_F0250]      ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
        pmaddwd     xmm4, [rel PW_F0114_F0250]      ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)

        movdqa      xmm3, [rel PD_ONEHALF]          ; xmm3=[PD_ONEHALF]

        paddd       xmm0, xmm1
        paddd       xmm4, xmm7
        paddd       xmm0, xmm3
        paddd       xmm4, xmm3
        psrld       xmm0, SCALEBITS                 ; xmm0=YOL
        psrld       xmm4, SCALEBITS                 ; xmm4=YOH
        packssdw    xmm0, xmm4                      ; xmm0=YO

        movdqa      xmm4, xmm6
        punpcklwd   xmm6, xmm2
        punpckhwd   xmm4, xmm2
        pmaddwd     xmm6, [rel PW_F0114_F0250]      ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
        pmaddwd     xmm4, [rel PW_F0114_F0250]      ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)

        movdqa      xmm2, [rel PD_ONEHALF]          ; xmm2=[PD_ONEHALF]

        paddd       xmm6, XMMWORD [wk(0)]
        paddd       xmm4, XMMWORD [wk(1)]
        paddd       xmm6, xmm2
        paddd       xmm4, xmm2
        psrld       xmm6, SCALEBITS                 ; xmm6=YEL
        psrld       xmm4, SCALEBITS                 ; xmm4=YEH
        packssdw    xmm6, xmm4                      ; xmm6=YE

        psllw       xmm0, BYTE_BIT                  ; odd samples into the high byte of each word
        por         xmm6, xmm0                      ; xmm6=Y
        movdqa      XMMWORD [rdi], xmm6             ; Save Y (aligned store)

        sub         rcx, byte SIZEOF_XMMWORD
        add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
        add         rdi, byte SIZEOF_XMMWORD                ; outptr0
        cmp         rcx, byte SIZEOF_XMMWORD
        jae         near .columnloop
        test        rcx, rcx
        jnz         near .column_ld1                ; 1..15 pixels left: partial load

        pop         rcx                             ; col
        pop         rsi
        pop         rdi

        add         rsi, byte SIZEOF_JSAMPROW       ; input_buf
        add         rdi, byte SIZEOF_JSAMPROW
        dec         rax                             ; num_rows
        jg          near .rowloop

.return:
        pop         rbx
        uncollect_args
        mov         rsp, rbp                        ; rsp <- aligned rbp
        pop         rsp                             ; rsp <- value saved in the prologue
        pop         rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align       16