media/libjpeg/simd/jcsammmx.asm

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 ;
michael@0 2 ; jcsammmx.asm - downsampling (MMX)
michael@0 3 ;
michael@0 4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
michael@0 5 ;
michael@0 6 ; Based on
michael@0 7 ; x86 SIMD extension for IJG JPEG library
michael@0 8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
michael@0 9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
michael@0 10 ;
michael@0 11 ; This file should be assembled with NASM (Netwide Assembler); it
michael@0 12 ; can *not* be assembled with Microsoft's MASM or any compatible
michael@0 13 ; assembler (including Borland's Turbo Assembler).
michael@0 14 ; NASM is available from http://nasm.sourceforge.net/ or
michael@0 15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
michael@0 16 ;
michael@0 17 ; [TAB8]
michael@0 18
michael@0 19 %include "jsimdext.inc"
michael@0 20
michael@0 21 ; --------------------------------------------------------------------------
michael@0 22 SECTION SEG_TEXT ; code section; SEG_TEXT is not defined in this file (presumably supplied by jsimdext.inc)
michael@0 23 BITS 32 ; everything below is assembled as 32-bit code
michael@0 24 ;
michael@0 25 ; Downsample pixel values of a single component.
michael@0 26 ; This version handles the common case of 2:1 horizontal and 1:1 vertical,
michael@0 27 ; without smoothing.
michael@0 28 ;
michael@0 29 ; GLOBAL(void)
michael@0 30 ; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
michael@0 31 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
michael@0 32 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
michael@0 33 ;
michael@0 34
michael@0 35 %define img_width(b) (b)+8 ; JDIMENSION image_width
michael@0 36 %define max_v_samp(b) (b)+12 ; int max_v_samp_factor
michael@0 37 %define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
michael@0 38 %define width_blks(b) (b)+20 ; JDIMENSION width_blocks
michael@0 39 %define input_data(b) (b)+24 ; JSAMPARRAY input_data
michael@0 40 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
michael@0 41 ; The offsets above are used with b = ebp after the "push ebp / mov ebp,esp" prologue, so the first stack argument is at ebp+8.
michael@0 42 align 16 ; align the function entry point to 16 bytes
michael@0 43 global EXTN(jsimd_h2v1_downsample_mmx) ; export the symbol; EXTN is defined outside this file (presumably applies platform name decoration)
michael@0 44
michael@0 45 EXTN(jsimd_h2v1_downsample_mmx):
michael@0 46 push ebp ; set up a frame so the arguments are addressable via ebp
michael@0 47 mov ebp,esp
michael@0 48 ; push ebx ; unused
michael@0 49 ; push ecx ; need not be preserved
michael@0 50 ; push edx ; need not be preserved
michael@0 51 push esi ; esi and edi are modified below, so save them
michael@0 52 push edi
michael@0 53
michael@0 54 mov ecx, JDIMENSION [width_blks(ebp)] ; ecx = width_blocks
michael@0 55 shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
michael@0 56 jz near .return ; output_cols == 0: nothing to do
michael@0 57
michael@0 58 mov edx, JDIMENSION [img_width(ebp)] ; edx = image_width
michael@0 59
michael@0 60 ; -- expand_right_edge
michael@0 61
michael@0 62 push ecx ; keep output_cols; ecx is reused as the pad count
michael@0 63 shl ecx,1 ; output_cols * 2
michael@0 64 sub ecx,edx ; ecx = output_cols*2 - image_width = columns to pad
michael@0 65 jle short .expand_end ; no padding needed (signed compare)
michael@0 66
michael@0 67 mov eax, INT [max_v_samp(ebp)] ; eax = number of input rows to pad
michael@0 68 test eax,eax
michael@0 69 jle short .expand_end ; skip when max_v_samp_factor <= 0
michael@0 70
michael@0 71 cld ; make stosb advance edi upward
michael@0 72 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
michael@0 73 alignx 16,7
michael@0 74 .expandloop:
michael@0 75 push eax ; save row counter (al is overwritten below)
michael@0 76 push ecx ; save pad count (rep stosb consumes ecx)
michael@0 77
michael@0 78 mov edi, JSAMPROW [esi] ; edi = start of the current input row
michael@0 79 add edi,edx ; edi = address just past the last real sample
michael@0 80 mov al, JSAMPLE [edi-1] ; al = last real sample of the row
michael@0 81
michael@0 82 rep stosb ; replicate that sample ecx times to the right
michael@0 83
michael@0 84 pop ecx
michael@0 85 pop eax
michael@0 86
michael@0 87 add esi, byte SIZEOF_JSAMPROW ; advance to the next row pointer
michael@0 88 dec eax
michael@0 89 jg short .expandloop ; repeat while rows remain
michael@0 90
michael@0 91 .expand_end:
michael@0 92 pop ecx ; output_cols
michael@0 93
michael@0 94 ; -- h2v1_downsample
michael@0 95
michael@0 96 mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
michael@0 97 test eax,eax
michael@0 98 jle near .return ; no rows to process; no MMX instruction has run yet, so emms can be skipped
michael@0 99
michael@0 100 mov edx, 0x00010000 ; bias pattern
michael@0 101 movd mm7,edx ; low dword of mm7 = bias pattern
michael@0 102 pcmpeqw mm6,mm6 ; mm6 = all ones
michael@0 103 punpckldq mm7,mm7 ; mm7={0, 1, 0, 1}
michael@0 104 psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
michael@0 105
michael@0 106 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
michael@0 107 mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
michael@0 108 alignx 16,7
michael@0 109 .rowloop:
michael@0 110 push ecx ; save output_cols; ecx counts down in the column loop
michael@0 111 push edi ; save position in the output_data row array
michael@0 112 push esi ; save position in the input_data row array
michael@0 113
michael@0 114 mov esi, JSAMPROW [esi] ; inptr
michael@0 115 mov edi, JSAMPROW [edi] ; outptr
michael@0 116 alignx 16,7
michael@0 117 .columnloop:
michael@0 118
michael@0 119 movq mm0, MMWORD [esi+0*SIZEOF_MMWORD] ; load two consecutive MMWORDs of input samples
michael@0 120 movq mm1, MMWORD [esi+1*SIZEOF_MMWORD]
michael@0 121 movq mm2,mm0 ; copies used to extract the high byte of each word
michael@0 122 movq mm3,mm1
michael@0 123
michael@0 124 pand mm0,mm6 ; mm0 = low byte of each word (even-offset samples)
michael@0 125 psrlw mm2,BYTE_BIT ; mm2 = high byte of each word (odd-offset samples)
michael@0 126 pand mm1,mm6
michael@0 127 psrlw mm3,BYTE_BIT
michael@0 128
michael@0 129 paddw mm0,mm2 ; sum each horizontal pair of samples
michael@0 130 paddw mm1,mm3
michael@0 131 paddw mm0,mm7 ; add the alternating 0/1 bias
michael@0 132 paddw mm1,mm7
michael@0 133 psrlw mm0,1 ; divide by 2
michael@0 134 psrlw mm1,1
michael@0 135
michael@0 136 packuswb mm0,mm1 ; pack the word results back into bytes
michael@0 137
michael@0 138 movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 ; store one MMWORD of output samples
michael@0 139
michael@0 140 add esi, byte 2*SIZEOF_MMWORD ; inptr
michael@0 141 add edi, byte 1*SIZEOF_MMWORD ; outptr
michael@0 142 sub ecx, byte SIZEOF_MMWORD ; outcol
michael@0 143 jnz short .columnloop ; repeat until outcol reaches zero
michael@0 144
michael@0 145 pop esi ; restore row-array positions and output_cols
michael@0 146 pop edi
michael@0 147 pop ecx
michael@0 148
michael@0 149 add esi, byte SIZEOF_JSAMPROW ; input_data
michael@0 150 add edi, byte SIZEOF_JSAMPROW ; output_data
michael@0 151 dec eax ; rowctr
michael@0 152 jg short .rowloop ; repeat while rows remain
michael@0 153
michael@0 154 emms ; empty MMX state
michael@0 155
michael@0 156 .return:
michael@0 157 pop edi ; restore saved registers in reverse push order
michael@0 158 pop esi
michael@0 159 ; pop edx ; need not be preserved
michael@0 160 ; pop ecx ; need not be preserved
michael@0 161 ; pop ebx ; unused
michael@0 162 pop ebp
michael@0 163 ret
michael@0 164
michael@0 165 ; --------------------------------------------------------------------------
michael@0 166 ;
michael@0 167 ; Downsample pixel values of a single component.
michael@0 168 ; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
michael@0 169 ; without smoothing.
michael@0 170 ;
michael@0 171 ; GLOBAL(void)
michael@0 172 ; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
michael@0 173 ; JDIMENSION v_samp_factor, JDIMENSION width_blocks,
michael@0 174 ; JSAMPARRAY input_data, JSAMPARRAY output_data);
michael@0 175 ;
michael@0 176
michael@0 177 %define img_width(b) (b)+8 ; JDIMENSION image_width
michael@0 178 %define max_v_samp(b) (b)+12 ; int max_v_samp_factor
michael@0 179 %define v_samp(b) (b)+16 ; JDIMENSION v_samp_factor
michael@0 180 %define width_blks(b) (b)+20 ; JDIMENSION width_blocks
michael@0 181 %define input_data(b) (b)+24 ; JSAMPARRAY input_data
michael@0 182 %define output_data(b) (b)+28 ; JSAMPARRAY output_data
michael@0 183 ; The offsets above are used with b = ebp after the "push ebp / mov ebp,esp" prologue, so the first stack argument is at ebp+8.
michael@0 184 align 16 ; align the function entry point to 16 bytes
michael@0 185 global EXTN(jsimd_h2v2_downsample_mmx) ; export the symbol; EXTN is defined outside this file (presumably applies platform name decoration)
michael@0 186
michael@0 187 EXTN(jsimd_h2v2_downsample_mmx):
michael@0 188 push ebp ; set up a frame so the arguments are addressable via ebp
michael@0 189 mov ebp,esp
michael@0 190 ; push ebx ; unused
michael@0 191 ; push ecx ; need not be preserved
michael@0 192 ; push edx ; need not be preserved
michael@0 193 push esi ; esi and edi are modified below, so save them
michael@0 194 push edi
michael@0 195
michael@0 196 mov ecx, JDIMENSION [width_blks(ebp)] ; ecx = width_blocks
michael@0 197 shl ecx,3 ; imul ecx,DCTSIZE (ecx = output_cols)
michael@0 198 jz near .return ; output_cols == 0: nothing to do
michael@0 199
michael@0 200 mov edx, JDIMENSION [img_width(ebp)] ; edx = image_width
michael@0 201
michael@0 202 ; -- expand_right_edge
michael@0 203
michael@0 204 push ecx ; keep output_cols; ecx is reused as the pad count
michael@0 205 shl ecx,1 ; output_cols * 2
michael@0 206 sub ecx,edx ; ecx = output_cols*2 - image_width = columns to pad
michael@0 207 jle short .expand_end ; no padding needed (signed compare)
michael@0 208
michael@0 209 mov eax, INT [max_v_samp(ebp)] ; eax = number of input rows to pad
michael@0 210 test eax,eax
michael@0 211 jle short .expand_end ; skip when max_v_samp_factor <= 0
michael@0 212
michael@0 213 cld ; make stosb advance edi upward
michael@0 214 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
michael@0 215 alignx 16,7
michael@0 216 .expandloop:
michael@0 217 push eax ; save row counter (al is overwritten below)
michael@0 218 push ecx ; save pad count (rep stosb consumes ecx)
michael@0 219
michael@0 220 mov edi, JSAMPROW [esi] ; edi = start of the current input row
michael@0 221 add edi,edx ; edi = address just past the last real sample
michael@0 222 mov al, JSAMPLE [edi-1] ; al = last real sample of the row
michael@0 223
michael@0 224 rep stosb ; replicate that sample ecx times to the right
michael@0 225
michael@0 226 pop ecx
michael@0 227 pop eax
michael@0 228
michael@0 229 add esi, byte SIZEOF_JSAMPROW ; advance to the next row pointer
michael@0 230 dec eax
michael@0 231 jg short .expandloop ; repeat while rows remain
michael@0 232
michael@0 233 .expand_end:
michael@0 234 pop ecx ; output_cols
michael@0 235
michael@0 236 ; -- h2v2_downsample
michael@0 237
michael@0 238 mov eax, JDIMENSION [v_samp(ebp)] ; rowctr
michael@0 239 test eax,eax
michael@0 240 jle near .return ; no rows to process; no MMX instruction has run yet, so emms can be skipped
michael@0 241
michael@0 242 mov edx, 0x00020001 ; bias pattern
michael@0 243 movd mm7,edx ; low dword of mm7 = bias pattern
michael@0 244 pcmpeqw mm6,mm6 ; mm6 = all ones
michael@0 245 punpckldq mm7,mm7 ; mm7={1, 2, 1, 2}
michael@0 246 psrlw mm6,BYTE_BIT ; mm6={0xFF 0x00 0xFF 0x00 ..}
michael@0 247
michael@0 248 mov esi, JSAMPARRAY [input_data(ebp)] ; input_data
michael@0 249 mov edi, JSAMPARRAY [output_data(ebp)] ; output_data
michael@0 250 alignx 16,7
michael@0 251 .rowloop:
michael@0 252 push ecx ; save output_cols; ecx counts down in the column loop
michael@0 253 push edi ; save position in the output_data row array
michael@0 254 push esi ; save position in the input_data row array
michael@0 255
michael@0 256 mov edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW] ; inptr0
michael@0 257 mov esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW] ; inptr1
michael@0 258 mov edi, JSAMPROW [edi] ; outptr
michael@0 259 alignx 16,7
michael@0 260 .columnloop:
michael@0 261
michael@0 262 movq mm0, MMWORD [edx+0*SIZEOF_MMWORD] ; first MMWORD of the upper row
michael@0 263 movq mm1, MMWORD [esi+0*SIZEOF_MMWORD] ; first MMWORD of the lower row
michael@0 264 movq mm2, MMWORD [edx+1*SIZEOF_MMWORD] ; second MMWORD of the upper row
michael@0 265 movq mm3, MMWORD [esi+1*SIZEOF_MMWORD] ; second MMWORD of the lower row
michael@0 266
michael@0 267 movq mm4,mm0 ; copies used to extract the high byte of each word
michael@0 268 movq mm5,mm1
michael@0 269 pand mm0,mm6 ; low byte of each word (even-offset samples)
michael@0 270 psrlw mm4,BYTE_BIT ; high byte of each word (odd-offset samples)
michael@0 271 pand mm1,mm6
michael@0 272 psrlw mm5,BYTE_BIT
michael@0 273 paddw mm0,mm4 ; horizontal pair sums, upper row, first MMWORD
michael@0 274 paddw mm1,mm5 ; horizontal pair sums, lower row, first MMWORD
michael@0 275
michael@0 276 movq mm4,mm2 ; same split-and-sum for the second MMWORD of each row
michael@0 277 movq mm5,mm3
michael@0 278 pand mm2,mm6
michael@0 279 psrlw mm4,BYTE_BIT
michael@0 280 pand mm3,mm6
michael@0 281 psrlw mm5,BYTE_BIT
michael@0 282 paddw mm2,mm4
michael@0 283 paddw mm3,mm5
michael@0 284
michael@0 285 paddw mm0,mm1 ; add upper and lower rows: sum of each 2x2 group
michael@0 286 paddw mm2,mm3
michael@0 287 paddw mm0,mm7 ; add the alternating 1/2 bias
michael@0 288 paddw mm2,mm7
michael@0 289 psrlw mm0,2 ; divide by 4
michael@0 290 psrlw mm2,2
michael@0 291
michael@0 292 packuswb mm0,mm2 ; pack the word results back into bytes
michael@0 293
michael@0 294 movq MMWORD [edi+0*SIZEOF_MMWORD], mm0 ; store one MMWORD of output samples
michael@0 295
michael@0 296 add edx, byte 2*SIZEOF_MMWORD ; inptr0
michael@0 297 add esi, byte 2*SIZEOF_MMWORD ; inptr1
michael@0 298 add edi, byte 1*SIZEOF_MMWORD ; outptr
michael@0 299 sub ecx, byte SIZEOF_MMWORD ; outcol
michael@0 300 jnz near .columnloop ; repeat until outcol reaches zero
michael@0 301
michael@0 302 pop esi ; restore row-array positions and output_cols
michael@0 303 pop edi
michael@0 304 pop ecx
michael@0 305
michael@0 306 add esi, byte 2*SIZEOF_JSAMPROW ; input_data
michael@0 307 add edi, byte 1*SIZEOF_JSAMPROW ; output_data
michael@0 308 dec eax ; rowctr
michael@0 309 jg near .rowloop ; repeat while rows remain
michael@0 310
michael@0 311 emms ; empty MMX state
michael@0 312
michael@0 313 .return:
michael@0 314 pop edi ; restore saved registers in reverse push order
michael@0 315 pop esi
michael@0 316 ; pop edx ; need not be preserved
michael@0 317 ; pop ecx ; need not be preserved
michael@0 318 ; pop ebx ; unused
michael@0 319 pop ebp
michael@0 320 ret
michael@0 321
michael@0 322 ; For some reason, the OS X linker does not honor the request to align the
michael@0 323 ; segment unless we do this.
michael@0 324 align 16

mercurial