media/libjpeg/simd/jsimdext.inc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libjpeg/simd/jsimdext.inc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,376 @@
     1.4 +;
     1.5 +; jsimdext.inc - common declarations
     1.6 +;
     1.7 +; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
     1.8 +; Copyright 2010 D. R. Commander
     1.9 +;
    1.10 +; Based on
    1.11 +; x86 SIMD extension for IJG JPEG library - version 1.02
    1.12 +;
    1.13 +; Copyright (C) 1999-2006, MIYASAKA Masaru.
    1.14 +;
    1.15 +; This software is provided 'as-is', without any express or implied
    1.16 +; warranty.  In no event will the authors be held liable for any damages
    1.17 +; arising from the use of this software.
    1.18 +;
    1.19 +; Permission is granted to anyone to use this software for any purpose,
    1.20 +; including commercial applications, and to alter it and redistribute it
    1.21 +; freely, subject to the following restrictions:
    1.22 +;
    1.23 +; 1. The origin of this software must not be misrepresented; you must not
    1.24 +;    claim that you wrote the original software. If you use this software
    1.25 +;    in a product, an acknowledgment in the product documentation would be
    1.26 +;    appreciated but is not required.
    1.27 +; 2. Altered source versions must be plainly marked as such, and must not be
    1.28 +;    misrepresented as being the original software.
    1.29 +; 3. This notice may not be removed or altered from any source distribution.
    1.30 +;
    1.31 +; [TAB8]
    1.32 +
    1.33 +; ==========================================================================
    1.34 +;  System-dependent configurations
    1.35 +
    1.36 +%ifdef WIN32	; ----(nasm -fwin32 -DWIN32 ...)--------
    1.37 +; * Microsoft Visual C++
    1.38 +; * MinGW (Minimalist GNU for Windows)
    1.39 +; * CygWin
    1.40 +; * LCC-Win32
    1.41 +; EXTN and GOT_SYMBOL are not defined in this branch.
    1.42 +; -- segment definition --
    1.43 +; With YASM only align=16 is passed; otherwise public/use32/class are added.
    1.44 +%ifdef __YASM_VER__
    1.45 +%define SEG_TEXT    .text  align=16
    1.46 +%define SEG_CONST   .rdata align=16
    1.47 +%else
    1.48 +%define SEG_TEXT    .text  align=16 public use32 class=CODE
    1.49 +%define SEG_CONST   .rdata align=16 public use32 class=CONST
    1.50 +%endif
    1.51 +
    1.52 +%elifdef WIN64	; ----(nasm -fwin64 -DWIN64 ...)--------
    1.53 +; * Microsoft Visual C++
    1.54 +; GOT_SYMBOL is not defined in this branch; EXTN is the identity.
    1.55 +; -- segment definition --
    1.56 +; Same YASM/NASM split as the WIN32 branch, with use64 instead of use32.
    1.57 +%ifdef __YASM_VER__
    1.58 +%define SEG_TEXT    .text  align=16
    1.59 +%define SEG_CONST   .rdata align=16
    1.60 +%else
    1.61 +%define SEG_TEXT    .text  align=16 public use64 class=CODE
    1.62 +%define SEG_CONST   .rdata align=16 public use64 class=CONST
    1.63 +%endif
    1.64 +%define EXTN(name)  name			; foo() -> foo
    1.65 +
    1.66 +%elifdef OBJ32	; ----(nasm -fobj -DOBJ32 ...)----------
    1.67 +; * Borland C++ (Win32)
    1.68 +; EXTN and GOT_SYMBOL are not defined in this branch.
    1.69 +; -- segment definition --
    1.70 +; Constants are placed in .data (class=DATA) for this format.
    1.71 +%define SEG_TEXT    .text  align=16 public use32 class=CODE
    1.72 +%define SEG_CONST   .data  align=16 public use32 class=DATA
    1.73 +
    1.74 +%elifdef ELF	; ----(nasm -felf[64] -DELF ...)------------
    1.75 +; * Linux
    1.76 +; * *BSD family Unix using elf format
    1.77 +; * Unix System V, including Solaris x86, UnixWare and SCO Unix
    1.78 +
    1.79 +; mark stack as non-executable (this emits an empty .note.GNU-stack section)
    1.80 +section .note.GNU-stack noalloc noexec nowrite progbits
    1.81 +
    1.82 +; -- segment definition --
    1.83 +; x86-64 passes only progbits/align; 32-bit also spells out alloc/exec/nowrite.
    1.84 +%ifdef __x86_64__
    1.85 +%define SEG_TEXT    .text   progbits align=16
    1.86 +%define SEG_CONST   .rodata progbits align=16
    1.87 +%else
    1.88 +%define SEG_TEXT    .text   progbits alloc exec   nowrite align=16
    1.89 +%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
    1.90 +%endif
    1.91 +
    1.92 +; To make the code position-independent, append -DPIC to the commandline
    1.93 +; (GOT_SYMBOL is defined here, so a PIC define is kept by the PIC section).
    1.94 +%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_	; ELF supports PIC
    1.95 +%define EXTN(name)  name			; foo() -> foo
    1.96 +
    1.97 +%elifdef AOUT	; ----(nasm -faoutb/aout -DAOUT ...)----
    1.98 +; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
    1.99 +; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)
   1.100 +
   1.101 +; -- segment definition --
   1.102 +; No section attributes are given; constants share .data.
   1.103 +%define SEG_TEXT    .text
   1.104 +%define SEG_CONST   .data
   1.105 +
   1.106 +; To make the code position-independent, append -DPIC to the commandline
   1.107 +; (note the two leading underscores in the a.out GOT symbol).
   1.108 +%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_	; BSD-style a.out supports PIC
   1.109 +
   1.110 +%elifdef MACHO	; ----(nasm -fmacho -DMACHO ...)--------
   1.111 +; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
   1.112 +
   1.113 +; -- segment definition --
   1.114 +; The text alignment attribute is commented out inside the define below.
   1.115 +%define SEG_TEXT    .text  ;align=16	; nasm doesn't accept align=16. why?
   1.116 +%define SEG_CONST   .rodata align=16
   1.117 +
   1.118 +; The generation of position-independent code (PIC) is the default on Darwin.
   1.119 +; PIC is therefore defined here unconditionally, without needing -DPIC.
   1.120 +%define PIC
   1.121 +%define GOT_SYMBOL  _MACHO_PIC_		; Mach-O style code-relative addressing
   1.122 +
   1.123 +%else		; ----(Other case)----------------------
   1.124 +; Fallback when none of the format macros above is defined.
   1.125 +; -- segment definition --
   1.126 +; Plain .text/.data; EXTN and GOT_SYMBOL are not defined in this branch.
   1.127 +%define SEG_TEXT    .text
   1.128 +%define SEG_CONST   .data
   1.129 +
   1.130 +%endif	; ----------------------------------------------
   1.131 +
   1.132 +; ==========================================================================
   1.133 +
   1.134 +; --------------------------------------------------------------------------
   1.135 +;  Common types
   1.136 +;
   1.137 +%ifdef __x86_64__                              ; 64-bit build: pointers are qwords
   1.138 +%define POINTER                 qword           ; general pointer type
   1.139 +%define SIZEOF_POINTER          SIZEOF_QWORD    ; sizeof(POINTER)
   1.140 +%define POINTER_BIT             QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
   1.141 +%else                                           ; otherwise pointers are dwords
   1.142 +%define POINTER                 dword           ; general pointer type
   1.143 +%define SIZEOF_POINTER          SIZEOF_DWORD    ; sizeof(POINTER)
   1.144 +%define POINTER_BIT             DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
   1.145 +%endif
   1.146 +
   1.147 +%define INT                     dword           ; signed integer type
   1.148 +%define SIZEOF_INT              SIZEOF_DWORD    ; sizeof(INT)
   1.149 +%define INT_BIT                 DWORD_BIT       ; sizeof(INT)*BYTE_BIT
   1.150 +
   1.151 +%define FP32                    dword           ; IEEE754 single
   1.152 +%define SIZEOF_FP32             SIZEOF_DWORD    ; sizeof(FP32)
   1.153 +%define FP32_BIT                DWORD_BIT       ; sizeof(FP32)*BYTE_BIT
   1.154 +
   1.155 +%define MMWORD                  qword           ; int64  (MMX register)
   1.156 +%define SIZEOF_MMWORD           SIZEOF_QWORD    ; sizeof(MMWORD)
   1.157 +%define MMWORD_BIT              QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT
   1.158 +
   1.159 +; NASM is buggy and doesn't properly handle operand sizes for SSE
   1.160 +; instructions, so for now we have to define XMMWORD as blank.
   1.161 +%define XMMWORD                                 ; int128 (SSE register)
   1.162 +%define SIZEOF_XMMWORD          SIZEOF_OWORD    ; sizeof(XMMWORD)
   1.163 +%define XMMWORD_BIT             OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT
   1.164 +
   1.165 +; Similar hacks for when we load a dword or MMWORD into an xmm# register
   1.166 +%define XMM_DWORD                               ; blank: no size keyword is emitted
   1.167 +%define XMM_MMWORD                              ; blank: no size keyword is emitted
   1.168 +; The SIZEOF_*/*_BIT names used above are defined below; %define bodies expand at use.
   1.169 +%define SIZEOF_BYTE             1               ; sizeof(BYTE)
   1.170 +%define SIZEOF_WORD             2               ; sizeof(WORD)
   1.171 +%define SIZEOF_DWORD            4               ; sizeof(DWORD)
   1.172 +%define SIZEOF_QWORD            8               ; sizeof(QWORD)
   1.173 +%define SIZEOF_OWORD            16              ; sizeof(OWORD)
   1.174 +
   1.175 +%define BYTE_BIT                8               ; CHAR_BIT in C
   1.176 +%define WORD_BIT                16              ; sizeof(WORD)*BYTE_BIT
   1.177 +%define DWORD_BIT               32              ; sizeof(DWORD)*BYTE_BIT
   1.178 +%define QWORD_BIT               64              ; sizeof(QWORD)*BYTE_BIT
   1.179 +%define OWORD_BIT               128             ; sizeof(OWORD)*BYTE_BIT
   1.180 +
   1.181 +; --------------------------------------------------------------------------
   1.182 +;  External Symbol Name
   1.183 +;
   1.184 +%ifndef EXTN	; default, used only when EXTN has not been defined earlier
   1.185 +%define EXTN(name)   _ %+ name		; foo() -> _foo  (%+ pastes "_" onto name)
   1.186 +%endif
   1.187 +
   1.188 +; --------------------------------------------------------------------------
   1.189 +;  Macros for position-independent code (PIC) support
   1.190 +;
   1.191 +%ifndef GOT_SYMBOL
   1.192 +%undef PIC
   1.193 +%endif
   1.194 +; PIC stays defined past this point only if GOT_SYMBOL is defined as well.
   1.195 +%ifdef PIC ; -------------------------------------------
   1.196 +
   1.197 +%ifidn GOT_SYMBOL,_MACHO_PIC_ ; --------------------
   1.198 +
   1.199 +; At present, nasm doesn't seem to support PIC generation for Mach-O.
   1.200 +; The PIC support code below is a little tricky.
   1.201 +; Data is addressed relative to the const_base label instead of a real GOT.
   1.202 +	SECTION	SEG_CONST
   1.203 +const_base:		; reference label placed in SEG_CONST
   1.204 +
   1.205 +%define GOTOFF(got,sym) (got) + (sym) - const_base	; base register + offset of sym from const_base
   1.206 +
   1.207 +%imacro get_GOT	1
   1.208 +	; NOTE: this macro destroys the ecx register (ebp is saved and restored).
   1.209 +	call	%%geteip		; pushes the address of the following add
   1.210 +	add	ecx, byte (%%ref - $)	; ecx = run-time address of %%ref
   1.211 +	jmp	short %%adjust
   1.212 +%%geteip:
   1.213 +	mov	ecx, POINTER [esp]	; ecx = return address of the call above
   1.214 +	ret
   1.215 +%%adjust:
   1.216 +	push	ebp			; ebp is borrowed as a zero index register
   1.217 +	xor	ebp,ebp		; ebp = 0
   1.218 +%ifidni %1,ebx	; (%1 == ebx)
   1.219 +	; db 0x8D,0x9C + jmp near const_base =
   1.220 +	;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
   1.221 +	db	0x8D,0x9C		; 8D,9C
   1.222 +	jmp	near const_base		; E9,(const_base-%%ref)
   1.223 +%%ref:				; ebx = run-time address of const_base
   1.224 +%else  ; (%1 != ebx)
   1.225 +	; db 0x8D,0x8C + jmp near const_base =
   1.226 +	;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
   1.227 +	db	0x8D,0x8C		; 8D,8C
   1.228 +	jmp	near const_base		; E9,(const_base-%%ref)
   1.229 +%%ref:	mov	%1, ecx		; %1 = run-time address of const_base
   1.230 +%endif ; (%1 == ebx)
   1.231 +	pop	ebp			; restore the caller's ebp
   1.232 +%endmacro
   1.233 +
   1.234 +%else	; GOT_SYMBOL != _MACHO_PIC_ ----------------
   1.235 +; Other PIC formats: rely on the assembler's ..gotoff / ..gotpc relocations.
   1.236 +%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff
   1.237 +
   1.238 +%imacro get_GOT	1
   1.239 +	extern	GOT_SYMBOL
   1.240 +	call	%%geteip		; pushes the address of the following add
   1.241 +	add	%1, GOT_SYMBOL + $$ - $ wrt ..gotpc	; return address -> GOT address
   1.242 +	jmp	short %%done
   1.243 +%%geteip:
   1.244 +	mov	%1, POINTER [esp]	; %1 = return address of the call above
   1.245 +	ret
   1.246 +%%done:
   1.247 +%endmacro
   1.248 +
   1.249 +%endif	; GOT_SYMBOL == _MACHO_PIC_ ----------------
   1.250 +; In PIC builds the helper macros expand to real push/pop/mov instructions.
   1.251 +%imacro pushpic	1.nolist
   1.252 +	push	%1
   1.253 +%endmacro
   1.254 +%imacro poppic	1.nolist
   1.255 +	pop	%1
   1.256 +%endmacro
   1.257 +%imacro movpic	2.nolist
   1.258 +	mov	%1,%2
   1.259 +%endmacro
   1.260 +
   1.261 +%else	; !PIC -----------------------------------------
   1.262 +; Non-PIC: GOTOFF yields the plain symbol and the helper macros expand to nothing.
   1.263 +%define GOTOFF(got,sym) (sym)
   1.264 +
   1.265 +%imacro get_GOT	1.nolist
   1.266 +%endmacro
   1.267 +%imacro pushpic	1.nolist
   1.268 +%endmacro
   1.269 +%imacro poppic	1.nolist
   1.270 +%endmacro
   1.271 +%imacro movpic	2.nolist
   1.272 +%endmacro
   1.273 +
   1.274 +%endif	;  PIC -----------------------------------------
   1.275 +
   1.276 +; --------------------------------------------------------------------------
   1.277 +;  Align the next instruction on {2,4,8,16,..}-byte boundary.
   1.278 +;  ".balign n,,m" in GNU as
   1.279 +;  alignx n[,m]: pad to an n-byte boundary only if at most m bytes are needed (default m = 0xFFFF).
   1.280 +%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)	; non-zero mask if x <= y (16-bit operands), else 0
   1.281 +%define FILLB(b,n)  (($$-(b)) & ((n)-1))	; bytes from b up to the next n-byte boundary (n a power of 2)
   1.282 +; NOTE(review): fillers are the 32-bit encodings named below -- confirm before relying on them in 64-bit code.
   1.283 +%imacro alignx 1-2.nolist 0xFFFF
   1.284 +%%bs:	times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
   1.285 +	       db 0x90                               ; nop (gap of 16 or more bytes: filled with nops only)
   1.286 +	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
   1.287 +	       db 0x8D,0x9C,0x23,0x00,0x00,0x00,0x00 ; lea ebx,[ebx+0x00000000]  (7 bytes, when 9+ remain)
   1.288 +	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
   1.289 +	       db 0x8D,0xAC,0x25,0x00,0x00,0x00,0x00 ; lea ebp,[ebp+0x00000000]  (7 bytes)
   1.290 +	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
   1.291 +	       db 0x8D,0xAD,0x00,0x00,0x00,0x00      ; lea ebp,[ebp+0x00000000]  (6 bytes)
   1.292 +	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
   1.293 +	       db 0x8D,0x6C,0x25,0x00                ; lea ebp,[ebp+0x00]  (4 bytes)
   1.294 +	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
   1.295 +	       db 0x8D,0x6D,0x00                     ; lea ebp,[ebp+0x00]  (3 bytes)
   1.296 +	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
   1.297 +	       db 0x8B,0xED                          ; mov ebp,ebp  (2 bytes)
   1.298 +	times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
   1.299 +	       db 0x90                               ; nop  (last remaining byte)
   1.300 +%endmacro
   1.301 +
   1.302 +; Align the next data on {2,4,8,16,..}-byte boundary.
   1.303 +; alignz n: pad with zero bytes up to the next n-byte boundary.
   1.304 +%imacro alignz 1.nolist
   1.305 +	align %1, db 0		; filling zeros ("db 0" is the fill used by align)
   1.306 +%endmacro
   1.307 +
   1.308 +%ifdef __x86_64__	; argument-collection macros exist only for 64-bit builds
   1.309 +; collect_args leaves six values in r10..r15; uncollect_args undoes its stack changes.
   1.310 +%ifdef WIN64
   1.311 +; Win64: the register arguments arrive in rcx, rdx, r8, r9.
   1.312 +%imacro collect_args 0
   1.313 +	push r12		; r12-r15 are saved before being overwritten
   1.314 +	push r13
   1.315 +	push r14
   1.316 +	push r15
   1.317 +	mov r10, rcx		; r10..r13 = rcx, rdx, r8, r9
   1.318 +	mov r11, rdx
   1.319 +	mov r12, r8
   1.320 +	mov r13, r9
   1.321 +	mov r14, [rax+48]	; read relative to rax, which must be set up before this macro is used
   1.322 +	mov r15, [rax+56]	; presumably stack arguments 5 and 6 -- TODO confirm against callers
   1.323 +	push rsi		; rsi/rdi are callee-saved in the Microsoft x64 ABI
   1.324 +	push rdi
   1.325 +	sub     rsp, SIZEOF_XMMWORD	; make room for xmm6 (callee-saved on Win64)
   1.326 +	movaps  XMMWORD [rsp], xmm6	; movaps: rsp must be 16-byte aligned here
   1.327 +	sub     rsp, SIZEOF_XMMWORD	; make room for xmm7
   1.328 +	movaps  XMMWORD [rsp], xmm7
   1.329 +%endmacro
   1.330 +; Restores everything collect_args saved, in the reverse order.
   1.331 +%imacro uncollect_args 0
   1.332 +	movaps  xmm7, XMMWORD [rsp]
   1.333 +	add     rsp, SIZEOF_XMMWORD
   1.334 +	movaps  xmm6, XMMWORD [rsp]
   1.335 +	add     rsp, SIZEOF_XMMWORD
   1.336 +	pop rdi
   1.337 +	pop rsi
   1.338 +	pop r15
   1.339 +	pop r14
   1.340 +	pop r13
   1.341 +	pop r12
   1.342 +%endmacro
   1.343 +
   1.344 +%else	; !WIN64
   1.345 +; Otherwise the six values are copied from rdi, rsi, rdx, rcx, r8, r9.
   1.346 +%imacro collect_args 0
   1.347 +	push r10		; r10-r15 are all saved, then overwritten below
   1.348 +	push r11
   1.349 +	push r12
   1.350 +	push r13
   1.351 +	push r14
   1.352 +	push r15
   1.353 +	mov r10, rdi		; r10..r15 = rdi, rsi, rdx, rcx, r8, r9
   1.354 +	mov r11, rsi
   1.355 +	mov r12, rdx
   1.356 +	mov r13, rcx
   1.357 +	mov r14, r8
   1.358 +	mov r15, r9
   1.359 +%endmacro
   1.360 +; Pops r15..r10, the reverse of the pushes in collect_args.
   1.361 +%imacro uncollect_args 0
   1.362 +	pop r15
   1.363 +	pop r14
   1.364 +	pop r13
   1.365 +	pop r12
   1.366 +	pop r11
   1.367 +	pop r10
   1.368 +%endmacro
   1.369 +
   1.370 +%endif	; WIN64
   1.371 +
   1.372 +%endif	; __x86_64__
   1.373 +
   1.374 +; --------------------------------------------------------------------------
   1.375 +;  Defines picked up from the C headers
   1.376 +;
   1.377 +%include "jsimdcfg.inc"
   1.378 +
   1.379 +; --------------------------------------------------------------------------

mercurial