media/libjpeg/simd/jcsamss2.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 ;
michael@0 2 ; jcsamss2.asm - downsampling (SSE2)
michael@0 3 ;
michael@0 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
michael@0 5 ;
michael@0 6 ; Based on
michael@0 7 ; x86 SIMD extension for IJG JPEG library
michael@0 8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
michael@0 9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
michael@0 10 ;
michael@0 11 ; This file should be assembled with NASM (Netwide Assembler); it
michael@0 12 ; can *not* be assembled with Microsoft's MASM or any compatible
michael@0 13 ; assembler (including Borland's Turbo Assembler).
michael@0 14 ; NASM is available from http://nasm.sourceforge.net/ or
michael@0 15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
michael@0 16 ;
michael@0 17 ; [TAB8]
michael@0 18
michael@0 19 %include "jsimdext.inc"
michael@0 20
michael@0 21 ; --------------------------------------------------------------------------
michael@0 22 SECTION SEG_TEXT
michael@0 23 BITS 32
michael@0 24 ;
michael@0 25 ; Downsample pixel values of a single component.
michael@0 26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
michael@0 27 ; without smoothing. Each output sample is (a + b + bias) >> 1 for a pair of
michael@0 28 ; horizontally adjacent input samples a, b; the bias alternates 0, 1 along the row.
michael@0 29 ; GLOBAL(void)
michael@0 30 ; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
michael@0 31 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
michael@0 32 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
michael@0 33 ; Arguments are read from the stack through the ebp-relative offsets defined below.
michael@0 34
michael@0 35 %define img_width(b) (b)+8 ; JDIMENSION image_width
michael@0 36 %define max_v_samp(b) (b)+12 ; int max_v_samp_factor
michael@0 37 %define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
michael@0 38 %define width_blks(b) (b)+20 ; JDIMENSION width_blocks
michael@0 39 %define input_data(b) (b)+24 ; JSAMPARRAY input_data
michael@0 40 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
michael@0 41
michael@0 42 align 16
michael@0 43 global EXTN(jsimd_h2v1_downsample_sse2)
michael@0 44
michael@0 45 EXTN(jsimd_h2v1_downsample_sse2):
michael@0 46 push ebp ; save caller's ebp
michael@0 47 mov ebp,esp ; ebp = frame base used by the argument macros
michael@0 48 ; push ebx ; unused
michael@0 49 ; push ecx ; need not be preserved
michael@0 50 ; push edx ; need not be preserved
michael@0 51 push esi ; esi/edi are modified below; restored at .return
michael@0 52 push edi
michael@0 53
michael@0 54 mov ecx, JDIMENSION [width_blks(ebp)] ; ecx = width_blocks
michael@0 55 shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
michael@0 56 jz near .return ; output_cols == 0: nothing to do
michael@0 57
michael@0 58 mov edx, JDIMENSION [img_width(ebp)] ; edx = image_width
michael@0 59
michael@0 60 ; -- expand_right_edge: pad each input row up to output_cols*2 samples by repeating its last sample
michael@0 61
michael@0 62 push ecx ; save output_cols (popped at .expand_end)
michael@0 63 shl ecx,1 ; output_cols * 2
michael@0 64 sub ecx,edx ; ecx = output_cols*2 - image_width = number of padding samples
michael@0 65 jle short .expand_end ; no padding needed (signed compare)
michael@0 66
michael@0 67 mov eax, INT [max_v_samp(ebp)] ; eax = max_v_samp_factor, used as the count of rows to pad
michael@0 68 test eax,eax
michael@0 69 jle short .expand_end ; row count <= 0: skip padding
michael@0 70
michael@0 71 cld ; clear DF so rep stosb advances edi upward
michael@0 72 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
michael@0 73 alignx 16,7
michael@0 74 .expandloop:
michael@0 75 push eax ; al and ecx are overwritten by the fill below
michael@0 76 push ecx
michael@0 77
michael@0 78 mov edi, JSAMPROW [esi] ; edi = start of the current input row
michael@0 79 add edi,edx ; edi = first position past image_width
michael@0 80 mov al, JSAMPLE [edi-1] ; al = last valid sample of the row
michael@0 81
michael@0 82 rep stosb ; write al into the ecx padding positions
michael@0 83
michael@0 84 pop ecx ; restore padding count
michael@0 85 pop eax ; restore row counter
michael@0 86
michael@0 87 add esi, byte SIZEOF_JSAMPROW ; advance to the next row pointer
michael@0 88 dec eax
michael@0 89 jg short .expandloop ; loop while rows remain
michael@0 90
michael@0 91 .expand_end:
michael@0 92 pop ecx ; output_cols
michael@0 93
michael@0 94 ; -- h2v1_downsample
michael@0 95
michael@0 96 mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
michael@0 97 test eax,eax
michael@0 98 jle near .return ; no rows to process
michael@0 99
michael@0 100 mov edx, 0x00010000 ; bias pattern
michael@0 101 movd xmm7,edx
michael@0 102 pcmpeqw xmm6,xmm6 ; xmm6 = all ones
michael@0 103 pshufd xmm7,xmm7,0x00 ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
michael@0 104 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
michael@0 105
michael@0 106 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
michael@0 107 mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
michael@0 108 alignx 16,7
michael@0 109 .rowloop:
michael@0 110 push ecx ; keep output_cols and both row-array cursors across the column loop
michael@0 111 push edi
michael@0 112 push esi
michael@0 113
michael@0 114 mov esi, JSAMPROW [esi] ; inptr
michael@0 115 mov edi, JSAMPROW [edi] ; outptr
michael@0 116
michael@0 117 cmp ecx, byte SIZEOF_XMMWORD
michael@0 118 jae short .columnloop ; at least SIZEOF_XMMWORD output columns remain: full-width path
michael@0 119 alignx 16,7
michael@0 120
michael@0 121 .columnloop_r8: ; tail path: fewer than SIZEOF_XMMWORD output columns remain
michael@0 122 movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] ; load only the first input XMMWORD
michael@0 123 pxor xmm1,xmm1 ; second input XMMWORD is replaced by zeros
michael@0 124 mov ecx, SIZEOF_XMMWORD ; so the sub below leaves ecx = 0 and the loop ends
michael@0 125 jmp short .downsample
michael@0 126 alignx 16,7
michael@0 127
michael@0 128 .columnloop:
michael@0 129 movdqa xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD] ; two input XMMWORDs yield one output XMMWORD
michael@0 130 movdqa xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
michael@0 131
michael@0 132 .downsample:
michael@0 133 movdqa xmm2,xmm0 ; copies for extracting the odd-indexed samples
michael@0 134 movdqa xmm3,xmm1
michael@0 135
michael@0 136 pand xmm0,xmm6 ; xmm0 = even-indexed samples, one per word
michael@0 137 psrlw xmm2,BYTE_BIT ; xmm2 = odd-indexed samples, one per word
michael@0 138 pand xmm1,xmm6 ; same split for the second XMMWORD
michael@0 139 psrlw xmm3,BYTE_BIT
michael@0 140
michael@0 141 paddw xmm0,xmm2 ; sum of each horizontal pair
michael@0 142 paddw xmm1,xmm3
michael@0 143 paddw xmm0,xmm7 ; add the alternating 0/1 bias
michael@0 144 paddw xmm1,xmm7
michael@0 145 psrlw xmm0,1 ; divide by 2
michael@0 146 psrlw xmm1,1
michael@0 147
michael@0 148 packuswb xmm0,xmm1 ; pack the word results back to bytes
michael@0 149
michael@0 150 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 ; always stores a full XMMWORD; on the tail path its upper half is zero. NOTE(review): assumes the output row has room for it - confirm with caller
michael@0 151
michael@0 152 sub ecx, byte SIZEOF_XMMWORD ; outcol
michael@0 153 add esi, byte 2*SIZEOF_XMMWORD ; inptr
michael@0 154 add edi, byte 1*SIZEOF_XMMWORD ; outptr
michael@0 155 cmp ecx, byte SIZEOF_XMMWORD
michael@0 156 jae short .columnloop ; another full XMMWORD of output remains
michael@0 157 test ecx,ecx
michael@0 158 jnz short .columnloop_r8 ; leftover columns: take the tail path once
michael@0 159
michael@0 160 pop esi ; restore row-array cursors and output_cols
michael@0 161 pop edi
michael@0 162 pop ecx
michael@0 163
michael@0 164 add esi, byte SIZEOF_JSAMPROW ; input_data
michael@0 165 add edi, byte SIZEOF_JSAMPROW ; output_data
michael@0 166 dec eax ; rowctr
michael@0 167 jg near .rowloop ; loop while rows remain
michael@0 168
michael@0 169 .return:
michael@0 170 pop edi ; restore registers saved in the prologue
michael@0 171 pop esi
michael@0 172 ; pop edx ; need not be preserved
michael@0 173 ; pop ecx ; need not be preserved
michael@0 174 ; pop ebx ; unused
michael@0 175 pop ebp
michael@0 176 ret
michael@0 177
michael@0 178 ; --------------------------------------------------------------------------
michael@0 179 ;
michael@0 180 ; Downsample pixel values of a single component.
michael@0 181 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
michael@0 182 ; without smoothing. Each output sample is (a + b + c + d + bias) >> 2 for a
michael@0 183 ; 2x2 group of input samples from two consecutive rows; the bias alternates 1, 2 along the row.
michael@0 184 ; GLOBAL(void)
michael@0 185 ; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
michael@0 186 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
michael@0 187 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
michael@0 188 ; Arguments are read from the stack through the ebp-relative offsets defined below.
michael@0 189
michael@0 190 %define img_width(b) (b)+8 ; JDIMENSION image_width
michael@0 191 %define max_v_samp(b) (b)+12 ; int max_v_samp_factor
michael@0 192 %define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
michael@0 193 %define width_blks(b) (b)+20 ; JDIMENSION width_blocks
michael@0 194 %define input_data(b) (b)+24 ; JSAMPARRAY input_data
michael@0 195 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
michael@0 196
michael@0 197 align 16
michael@0 198 global EXTN(jsimd_h2v2_downsample_sse2)
michael@0 199
michael@0 200 EXTN(jsimd_h2v2_downsample_sse2):
michael@0 201 push ebp ; save caller's ebp
michael@0 202 mov ebp,esp ; ebp = frame base used by the argument macros
michael@0 203 ; push ebx ; unused
michael@0 204 ; push ecx ; need not be preserved
michael@0 205 ; push edx ; need not be preserved
michael@0 206 push esi ; esi/edi are modified below; restored at .return
michael@0 207 push edi
michael@0 208
michael@0 209 mov ecx, JDIMENSION [width_blks(ebp)] ; ecx = width_blocks
michael@0 210 shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
michael@0 211 jz near .return ; output_cols == 0: nothing to do
michael@0 212
michael@0 213 mov edx, JDIMENSION [img_width(ebp)] ; edx = image_width
michael@0 214
michael@0 215 ; -- expand_right_edge: pad each input row up to output_cols*2 samples by repeating its last sample
michael@0 216
michael@0 217 push ecx ; save output_cols (popped at .expand_end)
michael@0 218 shl ecx,1 ; output_cols * 2
michael@0 219 sub ecx,edx ; ecx = output_cols*2 - image_width = number of padding samples
michael@0 220 jle short .expand_end ; no padding needed (signed compare)
michael@0 221
michael@0 222 mov eax, INT [max_v_samp(ebp)] ; eax = max_v_samp_factor, used as the count of rows to pad
michael@0 223 test eax,eax
michael@0 224 jle short .expand_end ; row count <= 0: skip padding
michael@0 225
michael@0 226 cld ; clear DF so rep stosb advances edi upward
michael@0 227 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
michael@0 228 alignx 16,7
michael@0 229 .expandloop:
michael@0 230 push eax ; al and ecx are overwritten by the fill below
michael@0 231 push ecx
michael@0 232
michael@0 233 mov edi, JSAMPROW [esi] ; edi = start of the current input row
michael@0 234 add edi,edx ; edi = first position past image_width
michael@0 235 mov al, JSAMPLE [edi-1] ; al = last valid sample of the row
michael@0 236
michael@0 237 rep stosb ; write al into the ecx padding positions
michael@0 238
michael@0 239 pop ecx ; restore padding count
michael@0 240 pop eax ; restore row counter
michael@0 241
michael@0 242 add esi, byte SIZEOF_JSAMPROW ; advance to the next row pointer
michael@0 243 dec eax
michael@0 244 jg short .expandloop ; loop while rows remain
michael@0 245
michael@0 246 .expand_end:
michael@0 247 pop ecx ; output_cols
michael@0 248
michael@0 249 ; -- h2v2_downsample
michael@0 250
michael@0 251 mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
michael@0 252 test eax,eax
michael@0 253 jle near .return ; no rows to process
michael@0 254
michael@0 255 mov edx, 0x00020001 ; bias pattern
michael@0 256 movd xmm7,edx
michael@0 257 pcmpeqw xmm6,xmm6 ; xmm6 = all ones
michael@0 258 pshufd xmm7,xmm7,0x00 ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
michael@0 259 psrlw xmm6,BYTE_BIT ; xmm6={0xFF 0x00 0xFF 0x00 ..}
michael@0 260
michael@0 261 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
michael@0 262 mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
michael@0 263 alignx 16,7
michael@0 264 .rowloop:
michael@0 265 push ecx ; keep output_cols and both row-array cursors across the column loop
michael@0 266 push edi
michael@0 267 push esi
michael@0 268
michael@0 269 mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
michael@0 270 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
michael@0 271 mov edi, JSAMPROW [edi] ; outptr
michael@0 272
michael@0 273 cmp ecx, byte SIZEOF_XMMWORD
michael@0 274 jae short .columnloop ; at least SIZEOF_XMMWORD output columns remain: full-width path
michael@0 275 alignx 16,7
michael@0 276
michael@0 277 .columnloop_r8: ; tail path: fewer than SIZEOF_XMMWORD output columns remain
michael@0 278 movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] ; load only the first XMMWORD of each input row
michael@0 279 movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
michael@0 280 pxor xmm2,xmm2 ; second XMMWORD of each row is replaced by zeros
michael@0 281 pxor xmm3,xmm3
michael@0 282 mov ecx, SIZEOF_XMMWORD ; so the sub below leaves ecx = 0 and the loop ends
michael@0 283 jmp short .downsample
michael@0 284 alignx 16,7
michael@0 285
michael@0 286 .columnloop:
michael@0 287 movdqa xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD] ; xmm0/xmm2 come from inptr0, xmm1/xmm3 from inptr1
michael@0 288 movdqa xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
michael@0 289 movdqa xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
michael@0 290 movdqa xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
michael@0 291
michael@0 292 .downsample:
michael@0 293 movdqa xmm4,xmm0 ; copies for extracting the odd-indexed samples
michael@0 294 movdqa xmm5,xmm1
michael@0 295 pand xmm0,xmm6 ; even-indexed samples, one per word
michael@0 296 psrlw xmm4,BYTE_BIT ; odd-indexed samples, one per word
michael@0 297 pand xmm1,xmm6
michael@0 298 psrlw xmm5,BYTE_BIT
michael@0 299 paddw xmm0,xmm4 ; horizontal pair sums, row 0 (first XMMWORD)
michael@0 300 paddw xmm1,xmm5 ; horizontal pair sums, row 1 (first XMMWORD)
michael@0 301
michael@0 302 movdqa xmm4,xmm2 ; same split and pair sums for the second XMMWORD of each row
michael@0 303 movdqa xmm5,xmm3
michael@0 304 pand xmm2,xmm6
michael@0 305 psrlw xmm4,BYTE_BIT
michael@0 306 pand xmm3,xmm6
michael@0 307 psrlw xmm5,BYTE_BIT
michael@0 308 paddw xmm2,xmm4
michael@0 309 paddw xmm3,xmm5
michael@0 310
michael@0 311 paddw xmm0,xmm1 ; add the two rows: sum of each 2x2 group
michael@0 312 paddw xmm2,xmm3
michael@0 313 paddw xmm0,xmm7 ; add the alternating 1/2 bias
michael@0 314 paddw xmm2,xmm7
michael@0 315 psrlw xmm0,2 ; divide by 4
michael@0 316 psrlw xmm2,2
michael@0 317
michael@0 318 packuswb xmm0,xmm2 ; pack the word results back to bytes
michael@0 319
michael@0 320 movdqa XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0 ; always stores a full XMMWORD; on the tail path its upper half is zero. NOTE(review): assumes the output row has room for it - confirm with caller
michael@0 321
michael@0 322 sub ecx, byte SIZEOF_XMMWORD ; outcol
michael@0 323 add edx, byte 2*SIZEOF_XMMWORD ; inptr0
michael@0 324 add esi, byte 2*SIZEOF_XMMWORD ; inptr1
michael@0 325 add edi, byte 1*SIZEOF_XMMWORD ; outptr
michael@0 326 cmp ecx, byte SIZEOF_XMMWORD
michael@0 327 jae near .columnloop ; another full XMMWORD of output remains
michael@0 328 test ecx,ecx
michael@0 329 jnz near .columnloop_r8 ; leftover columns: take the tail path once
michael@0 330
michael@0 331 pop esi ; restore row-array cursors and output_cols
michael@0 332 pop edi
michael@0 333 pop ecx
michael@0 334
michael@0 335 add esi, byte 2*SIZEOF_JSAMPROW ; input_data
michael@0 336 add edi, byte 1*SIZEOF_JSAMPROW ; output_data
michael@0 337 dec eax ; rowctr
michael@0 338 jg near .rowloop ; loop while rows remain
michael@0 339
michael@0 340 .return:
michael@0 341 pop edi ; restore registers saved in the prologue
michael@0 342 pop esi
michael@0 343 ; pop edx ; need not be preserved
michael@0 344 ; pop ecx ; need not be preserved
michael@0 345 ; pop ebx ; unused
michael@0 346 pop ebp
michael@0 347 ret
michael@0 348
michael@0 349 ; For some reason, the OS X linker does not honor the request to align the
michael@0 350 ; segment unless we do this.
michael@0 351 align 16

mercurial