media/libjpeg/simd/jcsamss2-64.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 ;
michael@0 2 ; jcsamss2-64.asm - downsampling (64-bit SSE2)
michael@0 3 ;
michael@0 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
michael@0 5 ; Copyright 2009 D. R. Commander
michael@0 6 ;
michael@0 7 ; Based on
michael@0 8 ; x86 SIMD extension for IJG JPEG library
michael@0 9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
michael@0 10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
michael@0 11 ;
michael@0 12 ; This file should be assembled with NASM (Netwide Assembler),
michael@0 13 ; can *not* be assembled with Microsoft's MASM or any compatible
michael@0 14 ; assembler (including Borland's Turbo Assembler).
michael@0 15 ; NASM is available from http://nasm.sourceforge.net/ or
michael@0 16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
michael@0 17 ;
michael@0 18 ; [TAB8]
michael@0 19
michael@0 20 %include "jsimdext.inc"
michael@0 21
michael@0 22 ; --------------------------------------------------------------------------
michael@0 23 SECTION SEG_TEXT
michael@0 24 BITS 64
michael@0 25 ;
michael@0 26 ; Downsample pixel values of a single component.
michael@0 27 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
michael@0 28 ; without smoothing.
michael@0 29 ;
michael@0 30 ; GLOBAL(void)
michael@0 31 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
michael@0 32 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
michael@0 33 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
michael@0 34 ;
michael@0 35 ; Argument registers as used below (presumably filled in by collect_args, which is defined outside this file):
michael@0 36 ; r10 = JDIMENSION image_width
michael@0 37 ; r11 = int max_v_samp_factor
michael@0 38 ; r12 = JDIMENSION v_samp_factor
michael@0 39 ; r13 = JDIMENSION width_blocks
michael@0 40 ; r14 = JSAMPARRAY input_data
michael@0 41 ; r15 = JSAMPARRAY output_data
michael@0 42 ; r10-r13 carry 32-bit values: only their low dwords are read, because the x86-64 ABIs leave the upper halves undefined.
michael@0 43 align 16
michael@0 44 global EXTN(jsimd_h2v1_downsample_sse2)
michael@0 45
michael@0 46 EXTN(jsimd_h2v1_downsample_sse2):
michael@0 47 push rbp
michael@0 48 mov rax,rsp ; NOTE(review): rax is not read in this function before collect_args; presumably the macro consumes it - confirm in jsimdext.inc
michael@0 49 mov rbp,rsp
michael@0 50 collect_args
michael@0 51
michael@0 52 mov ecx, r13d ; width_blocks (32-bit); the dword move zero-extends into rcx
michael@0 53 shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
michael@0 54 jz near .return ; nothing to do when width_blocks == 0
michael@0 55
michael@0 56 mov edx, r10d ; rdx = image_width, zero-extended
michael@0 57
michael@0 58 ; -- expand_right_edge: replicate the last real sample of each input row up to output_cols*2 columns
michael@0 59
michael@0 60 push rcx ; keep output_cols for the downsample stage
michael@0 61 shl rcx,1 ; output_cols * 2
michael@0 62 sub rcx,rdx ; rcx = number of padding columns
michael@0 63 jle short .expand_end ; row is already wide enough
michael@0 64
michael@0 65 mov eax, r11d ; max_v_samp_factor (32-bit int) = number of rows to pad
michael@0 66 test eax,eax ; signed 32-bit test; a non-negative count is zero-extended in rax for the dec below
michael@0 67 jle short .expand_end
michael@0 68
michael@0 69 cld ; rep stosb must advance rdi upward
michael@0 70 mov rsi, r14 ; input_data
michael@0 71 .expandloop:
michael@0 72 push rax ; al is about to be overwritten with the fill value
michael@0 73 push rcx ; rep stosb consumes rcx
michael@0 74
michael@0 75 mov rdi, JSAMPROW [rsi]
michael@0 76 add rdi,rdx ; rdi = first column past the real image data
michael@0 77 mov al, JSAMPLE [rdi-1] ; al = last real sample of this row
michael@0 78
michael@0 79 rep stosb ; store rcx copies of al
michael@0 80
michael@0 81 pop rcx
michael@0 82 pop rax
michael@0 83
michael@0 84 add rsi, byte SIZEOF_JSAMPROW ; next row pointer
michael@0 85 dec rax
michael@0 86 jg short .expandloop
michael@0 87
michael@0 88 .expand_end:
michael@0 89 pop rcx ; output_cols
michael@0 90
michael@0 91 ; -- h2v1_downsample
michael@0 92
michael@0 93 mov eax, r12d ; rowctr = v_samp_factor (32-bit), zero-extended so the 64-bit dec below is exact
michael@0 94 test eax,eax
michael@0 95 jle near .return
michael@0 96
michael@0 97 mov edx, 0x00010000 ; bias pattern (only edx is consumed by movd)
michael@0 98 movd xmm7,edx
michael@0 99 pcmpeqw xmm6,xmm6 ; all ones
michael@0 100 pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
michael@0 101 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
michael@0 102
michael@0 103 mov rsi, r14 ; input_data
michael@0 104 mov rdi, r15 ; output_data
michael@0 105 .rowloop:
michael@0 106 push rcx ; output_cols is consumed by the column loop
michael@0 107 push rdi
michael@0 108 push rsi
michael@0 109
michael@0 110 mov rsi, JSAMPROW [rsi] ; inptr
michael@0 111 mov rdi, JSAMPROW [rdi] ; outptr
michael@0 112
michael@0 113 cmp rcx, byte SIZEOF_XMMWORD
michael@0 114 jae short .columnloop
michael@0 115
michael@0 116 .columnloop_r8: ; fewer than SIZEOF_XMMWORD output columns remain
michael@0 117 movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
michael@0 118 pxor xmm1,xmm1 ; second input vector is treated as zero
michael@0 119 mov rcx, SIZEOF_XMMWORD ; so the sub below leaves rcx == 0 and the loop ends
michael@0 120 jmp short .downsample
michael@0 121
michael@0 122 .columnloop:
michael@0 123 movdqa xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
michael@0 124 movdqa xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]
michael@0 125
michael@0 126 .downsample:
michael@0 127 movdqa xmm2,xmm0
michael@0 128 movdqa xmm3,xmm1
michael@0 129
michael@0 130 pand xmm0,xmm6 ; even-indexed samples as words
michael@0 131 psrlw xmm2,BYTE_BIT ; odd-indexed samples as words
michael@0 132 pand xmm1,xmm6
michael@0 133 psrlw xmm3,BYTE_BIT
michael@0 134
michael@0 135 paddw xmm0,xmm2 ; sum of each horizontal pair
michael@0 136 paddw xmm1,xmm3
michael@0 137 paddw xmm0,xmm7 ; add alternating 0/1 bias
michael@0 138 paddw xmm1,xmm7
michael@0 139 psrlw xmm0,1 ; divide by 2
michael@0 140 psrlw xmm1,1
michael@0 141
michael@0 142 packuswb xmm0,xmm1 ; words back to bytes
michael@0 143
michael@0 144 movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
michael@0 145
michael@0 146 sub rcx, byte SIZEOF_XMMWORD ; outcol
michael@0 147 add rsi, byte 2*SIZEOF_XMMWORD ; inptr
michael@0 148 add rdi, byte 1*SIZEOF_XMMWORD ; outptr
michael@0 149 cmp rcx, byte SIZEOF_XMMWORD
michael@0 150 jae short .columnloop
michael@0 151 test rcx,rcx
michael@0 152 jnz short .columnloop_r8
michael@0 153
michael@0 154 pop rsi
michael@0 155 pop rdi
michael@0 156 pop rcx
michael@0 157
michael@0 158 add rsi, byte SIZEOF_JSAMPROW ; input_data
michael@0 159 add rdi, byte SIZEOF_JSAMPROW ; output_data
michael@0 160 dec rax ; rowctr
michael@0 161 jg near .rowloop
michael@0 162
michael@0 163 .return:
michael@0 164 uncollect_args
michael@0 165 pop rbp
michael@0 166 ret
michael@0 167
michael@0 168 ; --------------------------------------------------------------------------
michael@0 169 ;
michael@0 170 ; Downsample pixel values of a single component.
michael@0 171 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
michael@0 172 ; without smoothing.
michael@0 173 ;
michael@0 174 ; GLOBAL(void)
michael@0 175 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
michael@0 176 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
michael@0 177 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
michael@0 178 ;
michael@0 179 ; Argument registers as used below (presumably filled in by collect_args, which is defined outside this file):
michael@0 180 ; r10 = JDIMENSION image_width
michael@0 181 ; r11 = int max_v_samp_factor
michael@0 182 ; r12 = JDIMENSION v_samp_factor
michael@0 183 ; r13 = JDIMENSION width_blocks
michael@0 184 ; r14 = JSAMPARRAY input_data
michael@0 185 ; r15 = JSAMPARRAY output_data
michael@0 186 ; r10-r13 carry 32-bit values: only their low dwords are read, because the x86-64 ABIs leave the upper halves undefined.
michael@0 187 align 16
michael@0 188 global EXTN(jsimd_h2v2_downsample_sse2)
michael@0 189
michael@0 190 EXTN(jsimd_h2v2_downsample_sse2):
michael@0 191 push rbp
michael@0 192 mov rax,rsp ; NOTE(review): rax is not read in this function before collect_args; presumably the macro consumes it - confirm in jsimdext.inc
michael@0 193 mov rbp,rsp
michael@0 194 collect_args
michael@0 195
michael@0 196 mov ecx, r13d ; width_blocks (32-bit); the dword move zero-extends into rcx
michael@0 197 shl rcx,3 ; imul rcx,DCTSIZE (rcx = output_cols)
michael@0 198 jz near .return ; nothing to do when width_blocks == 0
michael@0 199
michael@0 200 mov edx, r10d ; rdx = image_width, zero-extended
michael@0 201
michael@0 202 ; -- expand_right_edge: replicate the last real sample of each input row up to output_cols*2 columns
michael@0 203
michael@0 204 push rcx ; keep output_cols for the downsample stage
michael@0 205 shl rcx,1 ; output_cols * 2
michael@0 206 sub rcx,rdx ; rcx = number of padding columns
michael@0 207 jle short .expand_end ; row is already wide enough
michael@0 208
michael@0 209 mov eax, r11d ; max_v_samp_factor (32-bit int) = number of rows to pad
michael@0 210 test eax,eax ; signed 32-bit test; a non-negative count is zero-extended in rax for the dec below
michael@0 211 jle short .expand_end
michael@0 212
michael@0 213 cld ; rep stosb must advance rdi upward
michael@0 214 mov rsi, r14 ; input_data
michael@0 215 .expandloop:
michael@0 216 push rax ; al is about to be overwritten with the fill value
michael@0 217 push rcx ; rep stosb consumes rcx
michael@0 218
michael@0 219 mov rdi, JSAMPROW [rsi]
michael@0 220 add rdi,rdx ; rdi = first column past the real image data
michael@0 221 mov al, JSAMPLE [rdi-1] ; al = last real sample of this row
michael@0 222
michael@0 223 rep stosb ; store rcx copies of al
michael@0 224
michael@0 225 pop rcx
michael@0 226 pop rax
michael@0 227
michael@0 228 add rsi, byte SIZEOF_JSAMPROW ; next row pointer
michael@0 229 dec rax
michael@0 230 jg short .expandloop
michael@0 231
michael@0 232 .expand_end:
michael@0 233 pop rcx ; output_cols
michael@0 234
michael@0 235 ; -- h2v2_downsample
michael@0 236
michael@0 237 mov eax, r12d ; rowctr = v_samp_factor (32-bit), zero-extended so the 64-bit dec below is exact
michael@0 238 test eax,eax
michael@0 239 jle near .return
michael@0 240
michael@0 241 mov edx, 0x00020001 ; bias pattern (only edx is consumed by movd)
michael@0 242 movd xmm7,edx
michael@0 243 pcmpeqw xmm6,xmm6 ; all ones
michael@0 244 pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
michael@0 245 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
michael@0 246
michael@0 247 mov rsi, r14 ; input_data
michael@0 248 mov rdi, r15 ; output_data
michael@0 249 .rowloop:
michael@0 250 push rcx ; output_cols is consumed by the column loop
michael@0 251 push rdi
michael@0 252 push rsi
michael@0 253
michael@0 254 mov rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW] ; inptr0
michael@0 255 mov rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW] ; inptr1
michael@0 256 mov rdi, JSAMPROW [rdi] ; outptr
michael@0 257
michael@0 258 cmp rcx, byte SIZEOF_XMMWORD
michael@0 259 jae short .columnloop
michael@0 260
michael@0 261 .columnloop_r8: ; fewer than SIZEOF_XMMWORD output columns remain
michael@0 262 movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
michael@0 263 movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
michael@0 264 pxor xmm2,xmm2 ; second vector of each input row is treated as zero
michael@0 265 pxor xmm3,xmm3
michael@0 266 mov rcx, SIZEOF_XMMWORD ; so the sub below leaves rcx == 0 and the loop ends
michael@0 267 jmp short .downsample
michael@0 268
michael@0 269 .columnloop:
michael@0 270 movdqa xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
michael@0 271 movdqa xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
michael@0 272 movdqa xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
michael@0 273 movdqa xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]
michael@0 274
michael@0 275 .downsample:
michael@0 276 movdqa xmm4,xmm0
michael@0 277 movdqa xmm5,xmm1
michael@0 278 pand xmm0,xmm6 ; even-indexed samples as words
michael@0 279 psrlw xmm4,BYTE_BIT ; odd-indexed samples as words
michael@0 280 pand xmm1,xmm6
michael@0 281 psrlw xmm5,BYTE_BIT
michael@0 282 paddw xmm0,xmm4 ; horizontal pair sums, row 0
michael@0 283 paddw xmm1,xmm5 ; horizontal pair sums, row 1
michael@0 284
michael@0 285 movdqa xmm4,xmm2
michael@0 286 movdqa xmm5,xmm3
michael@0 287 pand xmm2,xmm6
michael@0 288 psrlw xmm4,BYTE_BIT
michael@0 289 pand xmm3,xmm6
michael@0 290 psrlw xmm5,BYTE_BIT
michael@0 291 paddw xmm2,xmm4
michael@0 292 paddw xmm3,xmm5
michael@0 293
michael@0 294 paddw xmm0,xmm1 ; sum of each 2x2 group
michael@0 295 paddw xmm2,xmm3
michael@0 296 paddw xmm0,xmm7 ; add alternating 1/2 bias
michael@0 297 paddw xmm2,xmm7
michael@0 298 psrlw xmm0,2 ; divide by 4
michael@0 299 psrlw xmm2,2
michael@0 300
michael@0 301 packuswb xmm0,xmm2 ; words back to bytes
michael@0 302
michael@0 303 movdqa XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0
michael@0 304
michael@0 305 sub rcx, byte SIZEOF_XMMWORD ; outcol
michael@0 306 add rdx, byte 2*SIZEOF_XMMWORD ; inptr0
michael@0 307 add rsi, byte 2*SIZEOF_XMMWORD ; inptr1
michael@0 308 add rdi, byte 1*SIZEOF_XMMWORD ; outptr
michael@0 309 cmp rcx, byte SIZEOF_XMMWORD
michael@0 310 jae near .columnloop
michael@0 311 test rcx,rcx
michael@0 312 jnz near .columnloop_r8
michael@0 313
michael@0 314 pop rsi
michael@0 315 pop rdi
michael@0 316 pop rcx
michael@0 317
michael@0 318 add rsi, byte 2*SIZEOF_JSAMPROW ; input_data
michael@0 319 add rdi, byte 1*SIZEOF_JSAMPROW ; output_data
michael@0 320 dec rax ; rowctr
michael@0 321 jg near .rowloop
michael@0 322
michael@0 323 .return:
michael@0 324 uncollect_args
michael@0 325 pop rbp
michael@0 326 ret
michael@0 327
michael@0 328 ; For some reason, the OS X linker does not honor the request to align the
michael@0 329 ; segment unless we do this.
michael@0 330 align 16

mercurial