media/libjpeg/simd/jcqntmmx.asm

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Wed, 31 Dec 2014 06:09:35 +0100
changeset    0:6474c204b198
permissions  -rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.

;
; jcqntmmx.asm - sample data conversion and quantization (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_mmx (JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM * workspace);
;
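; For reference, the conversion above corresponds to this scalar sketch
; (hypothetical C, modelled on libjpeg's convsamp routine; the helper name
; convsamp_ref is illustrative only and not part of this file):
;
;   void convsamp_ref (JSAMPARRAY sample_data, JDIMENSION start_col,
;                      DCTELEM * workspace)
;   {
;     int row, col;
;     for (row = 0; row < DCTSIZE; row++) {
;       JSAMPROW elemptr = sample_data[row] + start_col;
;       for (col = 0; col < DCTSIZE; col++)        /* 0..255 -> -128..127 */
;         *workspace++ = (DCTELEM) GETJSAMPLE(elemptr[col]) - CENTERJSAMPLE;
;     }
;   }
;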

%define sample_data     ebp+8           ; JSAMPARRAY sample_data
%define start_col       ebp+12          ; JDIMENSION start_col
%define workspace       ebp+16          ; DCTELEM * workspace

        align   16
        global  EXTN(jsimd_convsamp_mmx)

EXTN(jsimd_convsamp_mmx):
        push    ebp
        mov     ebp,esp
        push    ebx
;       push    ecx                     ; need not be preserved
;       push    edx                     ; need not be preserved
        push    esi
        push    edi

        pxor    mm6,mm6                 ; mm6=(all 0's)
        pcmpeqw mm7,mm7
        psllw   mm7,7                   ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}
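                                        ; (0xFF80 is -128 = -CENTERJSAMPLE in
                                        ;  each signed word; adding it below
                                        ;  maps unsigned samples 0..255 to
                                        ;  signed -128..127)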

        mov     esi, JSAMPARRAY [sample_data]   ; (JSAMPROW *)
        mov     eax, JDIMENSION [start_col]
        mov     edi, POINTER [workspace]        ; (DCTELEM *)
        mov     ecx, DCTSIZE/4
        alignx  16,7
.convloop:
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; mm0=(01234567)
        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; mm1=(89ABCDEF)

        mov     ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]    ; mm2=(GHIJKLMN)
        movq    mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]    ; mm3=(OPQRSTUV)

        movq      mm4,mm0
        punpcklbw mm0,mm6               ; mm0=(0123)
        punpckhbw mm4,mm6               ; mm4=(4567)
        movq      mm5,mm1
        punpcklbw mm1,mm6               ; mm1=(89AB)
        punpckhbw mm5,mm6               ; mm5=(CDEF)

        paddw   mm0,mm7
        paddw   mm4,mm7
        paddw   mm1,mm7
        paddw   mm5,mm7

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5

        movq      mm0,mm2
        punpcklbw mm2,mm6               ; mm2=(GHIJ)
        punpckhbw mm0,mm6               ; mm0=(KLMN)
        movq      mm4,mm3
        punpcklbw mm3,mm6               ; mm3=(OPQR)
        punpckhbw mm4,mm6               ; mm4=(STUV)

        paddw   mm2,mm7
        paddw   mm0,mm7
        paddw   mm3,mm7
        paddw   mm4,mm7

        movq    MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
        movq    MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
        movq    MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
        movq    MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4

        add     esi, byte 4*SIZEOF_JSAMPROW
        add     edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
        dec     ecx
        jnz     short .convloop

        emms                            ; empty MMX state

        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
;       pop     ecx                     ; need not be preserved
        pop     ebx
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
; "How to optimize for the Pentium family of microprocessors"
; (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_mmx (JCOEFPTR coef_block, DCTELEM * divisors,
;                     DCTELEM * workspace);
;
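; Conceptually, each coefficient is divided by its quantization step with
; rounding, preserving the sign.  A scalar sketch of that operation
; (hypothetical C, modelled on libjpeg's quantize routine; note that the
; 'divisors' argument of this SIMD routine is really the packed
; RECIPROCAL/CORRECTION/SCALE/SHIFT table defined below, not a plain array
; of quantization steps, and 'quantval' is an illustrative name only):
;
;   for (i = 0; i < DCTSIZE2; i++) {
;     DCTELEM x = workspace[i], q = quantval[i];
;     DCTELEM ax = (x < 0) ? (DCTELEM) -x : x;
;     ax = (DCTELEM) ((ax + q / 2) / q);          /* round to nearest */
;     coef_block[i] = (JCOEF) ((x < 0) ? -ax : ax);
;   }
;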

%define RECIPROCAL(m,n,b)  MMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b)  MMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b)       MMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)
%define SHIFT(m,n,b)       MMBLOCK(DCTSIZE*3+(m),(n),(b),SIZEOF_DCTELEM)

%define coef_block      ebp+8           ; JCOEFPTR coef_block
%define divisors        ebp+12          ; DCTELEM * divisors
%define workspace       ebp+16          ; DCTELEM * workspace

        align   16
        global  EXTN(jsimd_quantize_mmx)

EXTN(jsimd_quantize_mmx):
        push    ebp
        mov     ebp,esp
;       push    ebx                     ; unused
;       push    ecx                     ; unused
;       push    edx                     ; need not be preserved
        push    esi
        push    edi

        mov     esi, POINTER [workspace]
        mov     edx, POINTER [divisors]
        mov     edi, JCOEFPTR [coef_block]
        mov     ah, 2
        alignx  16,7
.quantloop1:
        mov     al, DCTSIZE2/8/2
        alignx  16,7
.quantloop2:
        movq    mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
        movq    mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]

        movq    mm0,mm2
        movq    mm1,mm3

        psraw   mm2,(WORD_BIT-1)        ; mm2 = -1 if value < 0, 0 otherwise
        psraw   mm3,(WORD_BIT-1)

        pxor    mm0,mm2                 ; if (val < 0) val = -val;
        pxor    mm1,mm3                 ; (branchless abs: (val ^ mask) - mask)
        psubw   mm0,mm2
        psubw   mm1,mm3

;
; MMX is an annoyingly crappy instruction set. It has two
; misfeatures that are causing problems here:
;
; - All multiplications are signed.
;
; - The second operand for the shifts is not treated as packed.
;
;
; We work around the first problem by implementing this algorithm:
;
;   unsigned long unsigned_multiply(unsigned short x, unsigned short y)
;   {
;     enum { SHORT_BIT = 16 };
;     signed short sx = (signed short) x;
;     signed short sy = (signed short) y;
;     signed long sz;
;
;     sz = (long) sx * (long) sy;       /* signed multiply */
;
;     if (sx < 0) sz += (long) sy << SHORT_BIT;
;     if (sy < 0) sz += (long) sx << SHORT_BIT;
;
;     return (unsigned long) sz;
;   }
;
; (note that a negative sx adds _sy_ and vice versa)
;
; For the second problem, we replace the shift by a multiplication.
; Unfortunately that means we have to deal with the signed issue again.
;
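; (Concretely, a right shift of a 16-bit value by n can be rewritten as a
;  multiply-high; an illustrative identity only, not code from this file:
;
;    x >> n  ==  (unsigned short) (((unsigned long) x * (1UL << (16 - n))) >> 16)
;
;  which matches what pmulhw computes, assuming the SCALE entry of the
;  divisor table holds 2^(16 - n); the signedness fix-ups for that multiply
;  follow below.)
;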

        paddw   mm0, MMWORD [CORRECTION(0,0,edx)]   ; correction + roundfactor
        paddw   mm1, MMWORD [CORRECTION(0,1,edx)]

        movq    mm4,mm0                 ; store current value for later
        movq    mm5,mm1
        pmulhw  mm0, MMWORD [RECIPROCAL(0,0,edx)]   ; reciprocal
        pmulhw  mm1, MMWORD [RECIPROCAL(0,1,edx)]
        paddw   mm0,mm4                 ; reciprocal is always negative (MSB=1),
        paddw   mm1,mm5                 ; so we always need to add the initial value
                                        ; (input value is never negative as we
                                        ;  inverted it at the start of this routine)

                                        ; here it gets a bit tricky as both scale
                                        ; and mm0/mm1 can be negative
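                                        ; (the sequence below applies the
                                        ;  unsigned_multiply() correction from
                                        ;  the comment above to the high word:
                                        ;  whenever one operand's sign bit is
                                        ;  set, the other operand is added to
                                        ;  the pmulhw result)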
        movq    mm6, MMWORD [SCALE(0,0,edx)]    ; scale
        movq    mm7, MMWORD [SCALE(0,1,edx)]
        movq    mm4,mm0
        movq    mm5,mm1
        pmulhw  mm0,mm6
        pmulhw  mm1,mm7

        psraw   mm6,(WORD_BIT-1)        ; determine if scale is negative
        psraw   mm7,(WORD_BIT-1)

        pand    mm6,mm4                 ; and add input if it is
        pand    mm7,mm5
        paddw   mm0,mm6
        paddw   mm1,mm7

        psraw   mm4,(WORD_BIT-1)        ; then check if negative input
        psraw   mm5,(WORD_BIT-1)

        pand    mm4, MMWORD [SCALE(0,0,edx)]    ; and add scale if it is
        pand    mm5, MMWORD [SCALE(0,1,edx)]
        paddw   mm0,mm4
        paddw   mm1,mm5

        pxor    mm0,mm2                 ; val = -val (restore the original sign
        pxor    mm1,mm3                 ;  saved in the mm2/mm3 masks)
        psubw   mm0,mm2
        psubw   mm1,mm3

        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1

        add     esi, byte 8*SIZEOF_DCTELEM
        add     edx, byte 8*SIZEOF_DCTELEM
        add     edi, byte 8*SIZEOF_JCOEF
        dec     al
        jnz     near .quantloop2
        dec     ah
        jnz     near .quantloop1        ; to avoid branch misprediction
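                                        ; (al counts DCTSIZE2/8/2 = 4 inner
                                        ;  iterations of 8 coefficients each;
                                        ;  the ah = 2 outer passes together
                                        ;  cover all DCTSIZE2 coefficients)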

        emms                            ; empty MMX state

        pop     edi
        pop     esi
;       pop     edx                     ; need not be preserved
;       pop     ecx                     ; unused
;       pop     ebx                     ; unused
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
