media/libjpeg/simd/jf3dnflt.asm

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ;
     2 ; jf3dnflt.asm - floating-point FDCT (3DNow!)
     3 ;
     4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
     5 ;
     6 ; Based on
     7 ; x86 SIMD extension for IJG JPEG library
     8 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     9 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
    10 ;
    11 ; This file should be assembled with NASM (Netwide Assembler); it
    12 ; can *not* be assembled with Microsoft's MASM or any compatible
    13 ; assembler (including Borland's Turbo Assembler).
    14 ; NASM is available from http://nasm.sourceforge.net/ or
    15 ; http://sourceforge.net/project/showfiles.php?group_id=6208
    16 ;
    17 ; This file contains a floating-point implementation of the forward DCT
    18 ; (Discrete Cosine Transform). The following code is based directly on
    19 ; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
    20 ;
    21 ; [TAB8]
    23 %include "jsimdext.inc"
    24 %include "jdct.inc"
    26 ; --------------------------------------------------------------------------
    27 	SECTION	SEG_CONST
       ; Multiplier table for the floating-point FDCT in this file.
       ; Every constant is emitted twice ("times 2 dd") so that one 64-bit
       ; memory operand supplies the same single-precision value to both
       ; lanes of a pfmul.
       ; NOTE(review): SEG_CONST, alignz and EXTN are not defined in this file;
       ; presumably they come from jsimdext.inc -- confirm there.
    29 	alignz	16
    30 	global	EXTN(jconst_fdct_float_3dnow)
    32 EXTN(jconst_fdct_float_3dnow):
       ; PD_0_382: scales (tmp10 - tmp12) to form z5; numerically sin(pi/8).
    34 PD_0_382	times 2 dd  0.382683432365089771728460
       ; PD_0_707: scales the z1 and z3 terms; numerically 1/sqrt(2).
    35 PD_0_707	times 2 dd  0.707106781186547524400844
       ; PD_0_541: scales tmp10 on the way to z2; numerically sqrt(2)*sin(pi/8).
    36 PD_0_541	times 2 dd  0.541196100146196984399723
       ; PD_1_306: scales tmp12 on the way to z4; numerically sqrt(2)*cos(pi/8).
    37 PD_1_306	times 2 dd  1.306562964876376527856643
       ; Four 8-byte entries = 32 bytes, so the table already ends on a
       ; 16-byte boundary when it starts on one.
    39 	alignz	16
    41 ; --------------------------------------------------------------------------
    42 	SECTION	SEG_TEXT
    43 	BITS	32
    44 ;
    45 ; Perform the forward DCT on one block of samples.
    46 ;
    47 ; GLOBAL(void)
    48 ; jsimd_fdct_float_3dnow (FAST_FLOAT * data)
    49 ;
       ; Stack-frame helpers used by the routine that follows:
       ;
       ;   data(b)       Address of the 'data' argument relative to base b.
       ;                 The routine passes the stack pointer captured right
       ;                 after 'push ebp', so +8 skips the saved ebp and the
       ;                 return address (4 bytes each in 32-bit code).
       ;   original_ebp  The slot at the aligned ebp where the prologue stores
       ;                 that captured stack pointer.  The routine body does
       ;                 not reference this macro by name; its epilogue reads
       ;                 the slot with 'mov esp,ebp' / 'pop esp'.
       ;   wk(i)         i-th mmword scratch slot immediately below the aligned
       ;                 ebp; wk(0) is the lowest address, wk(WK_NUM-1) the
       ;                 highest.
       ;   WK_NUM        Number of scratch slots.  It is defined after wk(i)
       ;                 on purpose-neutral grounds: NASM expands single-line
       ;                 macros where they are used, so the forward reference
       ;                 resolves correctly.
       ;
    51 %define data(b)		(b)+8		; FAST_FLOAT * data
    53 %define original_ebp	ebp+0
    54 %define wk(i)		ebp-(WK_NUM-(i))*SIZEOF_MMWORD	; mmword wk[WK_NUM]
    55 %define WK_NUM		2
    57 	align	16
    58 	global	EXTN(jsimd_fdct_float_3dnow)
       ;
       ; jsimd_fdct_float_3dnow -- two-pass forward DCT computed in place with
       ; 3DNow! packed single-precision arithmetic (two floats per MMX register).
       ;
       ; In:    one pointer argument, read from the stack through data(eax).
       ; Out:   the block the pointer refers to is overwritten with the result.
       ; Uses:  eax, ecx, edx, mm0-mm7 and the stack slots wk(0)/wk(1);
       ;        ebx is saved and restored through pushpic/poppic.
       ;
       ; Each loop runs DCTSIZE/2 times and handles two rows (pass 1) or two
       ; columns (pass 2) per iteration.
       ; NOTE(review): the comments below assume MMBLOCK(r,c,base,size)
       ; addresses the c-th mmword of row r -- confirm against jsimdext.inc.
       ;
    60 EXTN(jsimd_fdct_float_3dnow):
    61 	push	ebp
    62 	mov	eax,esp				; eax = original ebp
       	; Reserve one dword, then round esp down to an mmword boundary; the
       	; reserved dword guarantees room for the saved eax after rounding.
    63 	sub	esp, byte 4
    64 	and	esp, byte (-SIZEOF_MMWORD)	; align to 64 bits
    65 	mov	[esp],eax
    66 	mov	ebp,esp				; ebp = aligned ebp
       	; Allocate the WK_NUM scratch mmwords below the aligned ebp.
    67 	lea	esp, [wk(0)]
    68 	pushpic	ebx
    69 ;	push	ecx		; need not be preserved
    70 ;	push	edx		; need not be preserved
    71 ;	push	esi		; unused
    72 ;	push	edi		; unused
       	; NOTE(review): get_GOT is defined outside this file; presumably it
       	; loads into ebx the base that GOTOFF(ebx,sym) needs in PIC builds.
    74 	get_GOT	ebx		; get GOT address
    76 	; ---- Pass 1: process rows.
       	; edx walks the block two rows at a time; ecx counts row pairs.
    78 	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
    79 	mov	ecx, DCTSIZE/2
    80 	alignx	16,7
    81 .rowloop:
    83 	movq	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    84 	movq	mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    85 	movq	mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
    86 	movq	mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]
    88 	; mm0=(00 01), mm1=(10 11), mm2=(06 07), mm3=(16 17)
       	; After the unpacks each register holds one column position for both
       	; rows of the pair (low lane = first row, high lane = second row).
    90 	movq      mm4,mm0		; transpose coefficients
    91 	punpckldq mm0,mm1		; mm0=(00 10)=data0
    92 	punpckhdq mm4,mm1		; mm4=(01 11)=data1
    93 	movq      mm5,mm2		; transpose coefficients
    94 	punpckldq mm2,mm3		; mm2=(06 16)=data6
    95 	punpckhdq mm5,mm3		; mm5=(07 17)=data7
    97 	movq	mm6,mm4
    98 	movq	mm7,mm0
    99 	pfsub	mm4,mm2			; mm4=data1-data6=tmp6
   100 	pfsub	mm0,mm5			; mm0=data0-data7=tmp7
   101 	pfadd	mm6,mm2			; mm6=data1+data6=tmp1
   102 	pfadd	mm7,mm5			; mm7=data0+data7=tmp0
   104 	movq	mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
   105 	movq	mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
   106 	movq	mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
   107 	movq	mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]
   109 	; mm1=(02 03), mm3=(12 13), mm2=(04 05), mm5=(14 15)
       	; Spill tmp6/tmp7 so mm4 and mm0 can be reused by the next transpose;
       	; they are reloaded at the start of the odd part.
   111 	movq	MMWORD [wk(0)], mm4	; wk(0)=tmp6
   112 	movq	MMWORD [wk(1)], mm0	; wk(1)=tmp7
   114 	movq      mm4,mm1		; transpose coefficients
   115 	punpckldq mm1,mm3		; mm1=(02 12)=data2
   116 	punpckhdq mm4,mm3		; mm4=(03 13)=data3
   117 	movq      mm0,mm2		; transpose coefficients
   118 	punpckldq mm2,mm5		; mm2=(04 14)=data4
   119 	punpckhdq mm0,mm5		; mm0=(05 15)=data5
   121 	movq	mm3,mm4
   122 	movq	mm5,mm1
   123 	pfadd	mm4,mm2			; mm4=data3+data4=tmp3
   124 	pfadd	mm1,mm0			; mm1=data2+data5=tmp2
   125 	pfsub	mm3,mm2			; mm3=data3-data4=tmp4
   126 	pfsub	mm5,mm0			; mm5=data2-data5=tmp5
   128 	; -- Even part
   130 	movq	mm2,mm7
   131 	movq	mm0,mm6
   132 	pfsub	mm7,mm4			; mm7=tmp13
   133 	pfsub	mm6,mm1			; mm6=tmp12
   134 	pfadd	mm2,mm4			; mm2=tmp10
   135 	pfadd	mm0,mm1			; mm0=tmp11
   137 	pfadd	mm6,mm7
   138 	pfmul	mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
   140 	movq	mm4,mm2
   141 	movq	mm1,mm7
   142 	pfsub	mm2,mm0			; mm2=data4
   143 	pfsub	mm7,mm6			; mm7=data6
   144 	pfadd	mm4,mm0			; mm4=data0
   145 	pfadd	mm1,mm6			; mm1=data2
       	; The results are stored without transposing back: even outputs go to
       	; the first row of the pair, each mmword holding (first row, second
       	; row) of one output.  Pass 2's loads expect exactly this layout.
   147 	movq	MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
   148 	movq	MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
   149 	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
   150 	movq	MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1
   152 	; -- Odd part
   154 	movq	mm0, MMWORD [wk(0)]	; mm0=tmp6
   155 	movq	mm6, MMWORD [wk(1)]	; mm6=tmp7
   157 	pfadd	mm3,mm5			; mm3=tmp10
   158 	pfadd	mm5,mm0			; mm5=tmp11
   159 	pfadd	mm0,mm6			; mm0=tmp12, mm6=tmp7
   161 	pfmul	mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
   163 	movq	mm2,mm3			; mm2=tmp10
   164 	pfsub	mm3,mm0
   165 	pfmul	mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
   166 	pfmul	mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
   167 	pfmul	mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
   168 	pfadd	mm2,mm3			; mm2=z2
   169 	pfadd	mm0,mm3			; mm0=z4
   171 	movq	mm7,mm6
   172 	pfsub	mm6,mm5			; mm6=z13
   173 	pfadd	mm7,mm5			; mm7=z11
   175 	movq	mm4,mm6
   176 	movq	mm1,mm7
   177 	pfsub	mm6,mm2			; mm6=data3
   178 	pfsub	mm7,mm0			; mm7=data7
   179 	pfadd	mm4,mm2			; mm4=data5
   180 	pfadd	mm1,mm0			; mm1=data1
       	; Odd outputs go to the second row of the pair, same packing as above.
   182 	movq	MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
   183 	movq	MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
   184 	movq	MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
   185 	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
   187 	add	edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT	; advance to the next pair of rows
   188 	dec	ecx
   189 	jnz	near .rowloop
   191 	; ---- Pass 2: process columns.
       	; eax still holds the stack pointer captured in the prologue; no
       	; instruction visible in pass 1 writes to it, so the argument can be
       	; re-read to rewind edx to the start of the block.
   193 	mov	edx, POINTER [data(eax)]	; (FAST_FLOAT *)
   194 	mov	ecx, DCTSIZE/2
   195 	alignx	16,7
   196 .columnloop:
   198 	movq	mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
   199 	movq	mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
   200 	movq	mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
   201 	movq	mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]
   203 	; mm0=(00 10), mm1=(01 11), mm2=(60 70), mm3=(61 71)
       	; The unpacks undo the packing left by pass 1: afterwards each register
       	; holds one row position for both columns of the pair.
   205 	movq      mm4,mm0		; transpose coefficients
   206 	punpckldq mm0,mm1		; mm0=(00 01)=data0
   207 	punpckhdq mm4,mm1		; mm4=(10 11)=data1
   208 	movq      mm5,mm2		; transpose coefficients
   209 	punpckldq mm2,mm3		; mm2=(60 61)=data6
   210 	punpckhdq mm5,mm3		; mm5=(70 71)=data7
   212 	movq	mm6,mm4
   213 	movq	mm7,mm0
   214 	pfsub	mm4,mm2			; mm4=data1-data6=tmp6
   215 	pfsub	mm0,mm5			; mm0=data0-data7=tmp7
   216 	pfadd	mm6,mm2			; mm6=data1+data6=tmp1
   217 	pfadd	mm7,mm5			; mm7=data0+data7=tmp0
   219 	movq	mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
   220 	movq	mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
   221 	movq	mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
   222 	movq	mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]
   224 	; mm1=(20 30), mm3=(21 31), mm2=(40 50), mm5=(41 51)
       	; Spill tmp6/tmp7 to free mm4 and mm0, as in pass 1.
   226 	movq	MMWORD [wk(0)], mm4	; wk(0)=tmp6
   227 	movq	MMWORD [wk(1)], mm0	; wk(1)=tmp7
   229 	movq      mm4,mm1		; transpose coefficients
   230 	punpckldq mm1,mm3		; mm1=(20 21)=data2
   231 	punpckhdq mm4,mm3		; mm4=(30 31)=data3
   232 	movq      mm0,mm2		; transpose coefficients
   233 	punpckldq mm2,mm5		; mm2=(40 41)=data4
   234 	punpckhdq mm0,mm5		; mm0=(50 51)=data5
   236 	movq	mm3,mm4
   237 	movq	mm5,mm1
   238 	pfadd	mm4,mm2			; mm4=data3+data4=tmp3
   239 	pfadd	mm1,mm0			; mm1=data2+data5=tmp2
   240 	pfsub	mm3,mm2			; mm3=data3-data4=tmp4
   241 	pfsub	mm5,mm0			; mm5=data2-data5=tmp5
   243 	; -- Even part
   245 	movq	mm2,mm7
   246 	movq	mm0,mm6
   247 	pfsub	mm7,mm4			; mm7=tmp13
   248 	pfsub	mm6,mm1			; mm6=tmp12
   249 	pfadd	mm2,mm4			; mm2=tmp10
   250 	pfadd	mm0,mm1			; mm0=tmp11
   252 	pfadd	mm6,mm7
   253 	pfmul	mm6,[GOTOFF(ebx,PD_0_707)] ; mm6=z1
   255 	movq	mm4,mm2
   256 	movq	mm1,mm7
   257 	pfsub	mm2,mm0			; mm2=data4
   258 	pfsub	mm7,mm6			; mm7=data6
   259 	pfadd	mm4,mm0			; mm4=data0
   260 	pfadd	mm1,mm6			; mm1=data2
       	; Each output n is written to row n of the current column pair.
   262 	movq	MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
   263 	movq	MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
   264 	movq	MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
   265 	movq	MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1
   267 	; -- Odd part
   269 	movq	mm0, MMWORD [wk(0)]	; mm0=tmp6
   270 	movq	mm6, MMWORD [wk(1)]	; mm6=tmp7
   272 	pfadd	mm3,mm5			; mm3=tmp10
   273 	pfadd	mm5,mm0			; mm5=tmp11
   274 	pfadd	mm0,mm6			; mm0=tmp12, mm6=tmp7
   276 	pfmul	mm5,[GOTOFF(ebx,PD_0_707)] ; mm5=z3
   278 	movq	mm2,mm3			; mm2=tmp10
   279 	pfsub	mm3,mm0
   280 	pfmul	mm3,[GOTOFF(ebx,PD_0_382)] ; mm3=z5
   281 	pfmul	mm2,[GOTOFF(ebx,PD_0_541)] ; mm2=MULTIPLY(tmp10,FIX_0_54119610)
   282 	pfmul	mm0,[GOTOFF(ebx,PD_1_306)] ; mm0=MULTIPLY(tmp12,FIX_1_30656296)
   283 	pfadd	mm2,mm3			; mm2=z2
   284 	pfadd	mm0,mm3			; mm0=z4
   286 	movq	mm7,mm6
   287 	pfsub	mm6,mm5			; mm6=z13
   288 	pfadd	mm7,mm5			; mm7=z11
   290 	movq	mm4,mm6
   291 	movq	mm1,mm7
   292 	pfsub	mm6,mm2			; mm6=data3
   293 	pfsub	mm7,mm0			; mm7=data7
   294 	pfadd	mm4,mm2			; mm4=data5
   295 	pfadd	mm1,mm0			; mm1=data1
   297 	movq	MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
   298 	movq	MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
   299 	movq	MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
   300 	movq	MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1
   302 	add	edx, byte 2*SIZEOF_FAST_FLOAT	; advance to the next pair of columns
   303 	dec	ecx
   304 	jnz	near .columnloop
       	; The MMX registers alias the x87 register stack, so the state is
       	; cleared before returning to the caller.
   306 	femms		; empty MMX/3DNow! state
   308 ;	pop	edi		; unused
   309 ;	pop	esi		; unused
   310 ;	pop	edx		; need not be preserved
   311 ;	pop	ecx		; need not be preserved
   312 	poppic	ebx
       	; Unwind the aligned frame: [ebp] holds the stack pointer saved by the
       	; prologue, so 'pop esp' reloads it and 'pop ebp' then restores the
       	; caller's ebp that was pushed on entry.
   313 	mov	esp,ebp		; esp <- aligned ebp
   314 	pop	esp		; esp <- original ebp
   315 	pop	ebp
   316 	ret
   318 ; For some reason, the OS X linker does not honor the request to align the
   319 ; segment unless we do this.
       ; The directive below pads the end of the text section up to the next
       ; 16-byte boundary; nothing in this file follows it.
   320 	align	16

mercurial