;
; jsimdext.inc - common declarations
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright 2010 D. R. Commander
;
; Based on
; x86 SIMD extension for IJG JPEG library - version 1.02
;
; Copyright (C) 1999-2006, MIYASAKA Masaru.
;
; This software is provided 'as-is', without any express or implied
; warranty. In no event will the authors be held liable for any damages
; arising from the use of this software.
;
; Permission is granted to anyone to use this software for any purpose,
; including commercial applications, and to alter it and redistribute it
; freely, subject to the following restrictions:
;
; 1. The origin of this software must not be misrepresented; you must not
;    claim that you wrote the original software. If you use this software
;    in a product, an acknowledgment in the product documentation would be
;    appreciated but is not required.
; 2. Altered source versions must be plainly marked as such, and must not be
;    misrepresented as being the original software.
; 3. This notice may not be removed or altered from any source distribution.
;
; [TAB8]

; ==========================================================================
;  System-dependent configurations
;
;  The object format is chosen by a symbol given on the assembler command
;  line (-DWIN32, -DWIN64, -DOBJ32, -DELF, -DAOUT or -DMACHO); the first
;  symbol found in the chain below wins.  Every branch defines:
;
;    SEG_TEXT   - section name and attributes for code
;    SEG_CONST  - section name and attributes for read-only constants
;
;  and, where the format calls for it:
;
;    EXTN(name) - mapping from a C identifier to its object-file symbol
;    GOT_SYMBOL - base symbol used by the PIC macros
;    PIC        - forces position-independent code generation

%ifdef WIN32    ; ----(nasm -fwin32 -DWIN32 ...)--------
; * Microsoft Visual C++
; * MinGW (Minimalist GNU for Windows)
; * CygWin
; * LCC-Win32

; -- segment definition --
;
%ifndef __YASM_VER__
%define SEG_TEXT    .text align=16 public use32 class=CODE
%define SEG_CONST   .rdata align=16 public use32 class=CONST
%else           ; YASM: alignment attribute only
%define SEG_TEXT    .text align=16
%define SEG_CONST   .rdata align=16
%endif

%elifdef WIN64  ; ----(nasm -fwin64 -DWIN64 ...)--------
; * Microsoft Visual C++

; -- segment definition --
;
%ifndef __YASM_VER__
%define SEG_TEXT    .text align=16 public use64 class=CODE
%define SEG_CONST   .rdata align=16 public use64 class=CONST
%else           ; YASM: alignment attribute only
%define SEG_TEXT    .text align=16
%define SEG_CONST   .rdata align=16
%endif
%define EXTN(name)  name                ; foo() -> foo

%elifdef OBJ32  ; ----(nasm -fobj -DOBJ32 ...)----------
; * Borland C++ (Win32)

; -- segment definition --
;
%define SEG_TEXT    .text align=16 public use32 class=CODE
%define SEG_CONST   .data align=16 public use32 class=DATA

%elifdef ELF    ; ----(nasm -felf[64] -DELF ...)------------
; * Linux
; * *BSD family Unix using elf format
; * Unix System V, including Solaris x86, UnixWare and SCO Unix

; mark stack as non-executable
section .note.GNU-stack noalloc noexec nowrite progbits

; -- segment definition --
;
%ifdef __x86_64__
%define SEG_TEXT    .text progbits align=16
%define SEG_CONST   .rodata progbits align=16
%else
%define SEG_TEXT    .text progbits alloc exec nowrite align=16
%define SEG_CONST   .rodata progbits alloc noexec nowrite align=16
%endif

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  _GLOBAL_OFFSET_TABLE_   ; ELF supports PIC
%define EXTN(name)  name                    ; foo() -> foo

%elifdef AOUT   ; ----(nasm -faoutb/aout -DAOUT ...)----
; * Older Linux using a.out format  (nasm -f aout -DAOUT ...)
; * *BSD family Unix using a.out format  (nasm -f aoutb -DAOUT ...)

; -- segment definition --
;
%define SEG_TEXT    .text
%define SEG_CONST   .data

; To make the code position-independent, append -DPIC to the commandline
;
%define GOT_SYMBOL  __GLOBAL_OFFSET_TABLE_  ; BSD-style a.out supports PIC

%elifdef MACHO  ; ----(nasm -fmacho -DMACHO ...)--------
; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)

; -- segment definition --
;
%define SEG_TEXT    .text       ;align=16 ; nasm doesn't accept align=16. why?
%define SEG_CONST   .rodata align=16

; The generation of position-independent code (PIC) is the default on Darwin.
;
%define PIC
%define GOT_SYMBOL  _MACHO_PIC_     ; Mach-O style code-relative addressing

%else           ; ----(Other case)----------------------

; -- segment definition --
;
%define SEG_TEXT    .text
%define SEG_CONST   .data

%endif          ; ----------------------------------------------

; ==========================================================================

; --------------------------------------------------------------------------
;  Common types
;
;  For every type used by the SIMD sources there are three symbols:
;    <TYPE>          operand-size keyword (may be empty, see XMMWORD)
;    SIZEOF_<TYPE>   size in bytes
;    <TYPE>_BIT      size in bits
;  The derived symbols are plain %define aliases of the primitive tables
;  below, so they are resolved only when they are finally expanded.
;

; Primitive sizes, in bytes.
%define SIZEOF_BYTE     1               ; sizeof(BYTE)
%define SIZEOF_WORD     2               ; sizeof(WORD)
%define SIZEOF_DWORD    4               ; sizeof(DWORD)
%define SIZEOF_QWORD    8               ; sizeof(QWORD)
%define SIZEOF_OWORD    16              ; sizeof(OWORD)

; Primitive widths, in bits.
%define BYTE_BIT        8               ; CHAR_BIT in C
%define WORD_BIT        16              ; sizeof(WORD)*BYTE_BIT
%define DWORD_BIT       32              ; sizeof(DWORD)*BYTE_BIT
%define QWORD_BIT       64              ; sizeof(QWORD)*BYTE_BIT
%define OWORD_BIT       128             ; sizeof(OWORD)*BYTE_BIT

; Pointers follow the target word size.
%ifdef __x86_64__
%define POINTER         qword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
%define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%else
%define POINTER         dword           ; general pointer type
%define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
%define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
%endif

%define INT             dword           ; signed integer type
%define SIZEOF_INT      SIZEOF_DWORD    ; sizeof(INT)
%define INT_BIT         DWORD_BIT       ; sizeof(INT)*BYTE_BIT

%define FP32            dword           ; IEEE754 single
%define SIZEOF_FP32     SIZEOF_DWORD    ; sizeof(FP32)
%define FP32_BIT        DWORD_BIT       ; sizeof(FP32)*BYTE_BIT

%define MMWORD          qword           ; int64  (MMX register)
%define SIZEOF_MMWORD   SIZEOF_QWORD    ; sizeof(MMWORD)
%define MMWORD_BIT      QWORD_BIT       ; sizeof(MMWORD)*BYTE_BIT

; NASM is buggy and doesn't properly handle operand sizes for SSE
; instructions, so for now we have to define XMMWORD as blank.
%define XMMWORD                         ; int128 (SSE register)
%define SIZEOF_XMMWORD  SIZEOF_OWORD    ; sizeof(XMMWORD)
%define XMMWORD_BIT     OWORD_BIT       ; sizeof(XMMWORD)*BYTE_BIT

; Similar hacks for when we load a dword or MMWORD into an xmm# register
%define XMM_DWORD
%define XMM_MMWORD

; --------------------------------------------------------------------------
;  External Symbol Name
;
;  EXTN(name) turns a C identifier into the symbol name used in the object
;  file.  If the platform section above has not defined it, fall back to
;  prefixing the name with a single underscore.
;
%ifndef EXTN
%define EXTN(name)  _ %+ name           ; foo() -> _foo
%endif

; --------------------------------------------------------------------------
;  Macros for position-independent code (PIC) support
;
;    get_GOT reg        load the PIC base address into reg
;    GOTOFF(got,sym)    address expression for sym, given the base in got
;    pushpic reg        push reg      (expands to nothing without PIC)
;    poppic  reg        pop  reg      (expands to nothing without PIC)
;    movpic  dst,src    mov  dst,src  (expands to nothing without PIC)
;
;  PIC is only honoured when the platform section defined GOT_SYMBOL;
;  otherwise any PIC request is dropped here.
;
%ifndef GOT_SYMBOL
%undef PIC
%endif

%ifdef PIC      ; -------------------------------------------

%ifidn GOT_SYMBOL,_MACHO_PIC_   ; --------------------

; At present, nasm doesn't seem to support PIC generation for Mach-O.
; The PIC support code below is a little tricky.

; The start of the constant section serves as the PIC base.
        SECTION SEG_CONST
const_base:

%define GOTOFF(got,sym) (got) + (sym) - const_base

; get_GOT reg
;   Leaves the run-time address of const_base in reg.
;   NOTE: this macro destroys the ecx register.
%imacro get_GOT 1
        call    %%geteip                ; ecx = address of the add below
        add     ecx, byte (%%ref - $)   ; ecx = run-time address of %%ref
        jmp     short %%adjust
%%geteip:
        mov     ecx, POINTER [esp]      ; fetch our own return address
        ret
%%adjust:
        push    ebp
        xor     ebp,ebp                 ; ebp = 0
%ifidni %1,ebx  ; (%1 == ebx)
        ; The two opcode bytes below swallow the E9 opcode of the jmp as
        ; their SIB byte and its rel32 operand as their displacement:
        ; db 0x8D,0x9C + jmp near const_base =
        ;   lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
        db      0x8D,0x9C               ; 8D,9C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:
%else           ; (%1 != ebx)
        ; Same trick, with ecx as the destination:
        ; db 0x8D,0x8C + jmp near const_base =
        ;   lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
        db      0x8D,0x8C               ; 8D,8C
        jmp     near const_base         ; E9,(const_base-%%ref)
%%ref:  mov     %1, ecx
%endif          ; (%1 == ebx)
        pop     ebp
%endmacro

%else           ; GOT_SYMBOL != _MACHO_PIC_ ----------------

%define GOTOFF(got,sym) (got) + (sym) wrt ..gotoff

; get_GOT reg
;   Reads the return address pushed by the call into reg, then adds the
;   ..gotpc-relative offset of GOT_SYMBOL to it.
%imacro get_GOT 1
        extern  GOT_SYMBOL
        call    %%geteip
        add     %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
        jmp     short %%done
%%geteip:
        mov     %1, POINTER [esp]       ; fetch our own return address
        ret
%%done:
%endmacro

%endif          ; GOT_SYMBOL == _MACHO_PIC_ ----------------

; Register save/restore/copy helpers that exist only in PIC builds.
%imacro pushpic 1.nolist
        push    %1
%endmacro
%imacro poppic 1.nolist
        pop     %1
%endmacro
%imacro movpic 2.nolist
        mov     %1,%2
%endmacro

%else           ; !PIC -----------------------------------------

; Without PIC a symbol is addressed directly and the helpers emit no code.
%define GOTOFF(got,sym) (sym)

%imacro get_GOT 1.nolist
%endmacro
%imacro pushpic 1.nolist
%endmacro
%imacro poppic 1.nolist
%endmacro
%imacro movpic 2.nolist
%endmacro

%endif          ; PIC -----------------------------------------

; --------------------------------------------------------------------------
;  Align the next instruction on {2,4,8,16,..}-byte boundary.
;  ".balign n,,m" in GNU as
;
;  Usage:  alignx boundary [, max_skip]
;    boundary  - power-of-two alignment, measured from the section start
;    max_skip  - optional; if more than this many padding bytes would be
;                needed, nothing is emitted (default 0xFFFF)
;
;  Padding of 16 bytes or more is emitted as single-byte nops; shorter
;  padding is assembled from at most one filler of each length 7,7,6,4,3,2,1.
;
;  MSKLE(x,y) evaluates to 0 when x > y and otherwise to a mask whose
;             low-order bits are all ones (x and y are taken modulo 0x10000).
;  FILLB(b,n) is the number of bytes from position b up to the next n-byte
;             boundary of the current section (0 if b is already aligned).
;
%define MSKLE(x,y)  (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
%define FILLB(b,n)  (($$-(b)) & ((n)-1))

; Filler byte sequences, one per length.  The padding may be executed, so it
; must not change any register.  In 64-bit mode a write to a 32-bit register
; clears the upper half of the full register, which rules out the
; "lea ebp,[ebp+0]" / "mov ebp,ebp" style fillers there; the multi-byte NOP
; (0F 1F /0) encodings of the same lengths are used instead.
%ifdef __x86_64__
%define ALIGNX_PAD7A    0x0F,0x1F,0x80,0x00,0x00,0x00,0x00  ; nop dword [rax+0x00000000]
%define ALIGNX_PAD7B    0x0F,0x1F,0x80,0x00,0x00,0x00,0x00  ; nop dword [rax+0x00000000]
%define ALIGNX_PAD6     0x66,0x0F,0x1F,0x44,0x00,0x00       ; nop word [rax+rax*1+0x00]
%define ALIGNX_PAD4     0x0F,0x1F,0x40,0x00                 ; nop dword [rax+0x00]
%define ALIGNX_PAD3     0x0F,0x1F,0x00                      ; nop dword [rax]
%define ALIGNX_PAD2     0x66,0x90                           ; 2-byte nop
%else
%define ALIGNX_PAD7A    0x8D,0x9C,0x23,0x00,0x00,0x00,0x00  ; lea ebx,[ebx+0x00000000]
%define ALIGNX_PAD7B    0x8D,0xAC,0x25,0x00,0x00,0x00,0x00  ; lea ebp,[ebp+0x00000000]
%define ALIGNX_PAD6     0x8D,0xAD,0x00,0x00,0x00,0x00       ; lea ebp,[ebp+0x00000000]
%define ALIGNX_PAD4     0x8D,0x6C,0x25,0x00                 ; lea ebp,[ebp+0x00]
%define ALIGNX_PAD3     0x8D,0x6D,0x00                      ; lea ebp,[ebp+0x00]
%define ALIGNX_PAD2     0x8B,0xED                           ; mov ebp,ebp
%endif

%imacro alignx 1-2.nolist 0xFFFF
%%bs:   times MSKLE(FILLB(%%bs,%1),%2) & MSKLE(16,FILLB($,%1)) & FILLB($,%1) \
               db 0x90                  ; nop (gap of 16 bytes or more)
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/9 \
               db ALIGNX_PAD7A          ; 7 bytes when 9..15 remain
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/7 \
               db ALIGNX_PAD7B          ; 7 bytes when 7..8 remain
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/6 \
               db ALIGNX_PAD6           ; 6 bytes
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/4 \
               db ALIGNX_PAD4           ; 4 bytes
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/3 \
               db ALIGNX_PAD3           ; 3 bytes
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/2 \
               db ALIGNX_PAD2           ; 2 bytes
        times MSKLE(FILLB(%%bs,%1),%2) & FILLB($,%1)/1 \
               db 0x90                  ; nop
%endmacro

; Align the next data on {2,4,8,16,..}-byte boundary.
;
; Usage:  alignz boundary
;   The gap up to the boundary is filled with zero bytes.
;
%imacro alignz 1.nolist
        align   %1, db 0                ; filling zeros
%endmacro

; --------------------------------------------------------------------------
;  Argument collection for 64-bit builds
;
;  collect_args copies the first six integer arguments of the enclosing
;  function into r10..r15 (in that order), so the function body can use the
;  same registers whichever calling convention is in effect.
;  uncollect_args undoes every stack change made by collect_args.
;
%ifdef __x86_64__

%ifdef WIN64

; collect_args (WIN64 variant)
;   arguments 1-4:  rcx, rdx, r8, r9      -> r10, r11, r12, r13
;   arguments 5-6:  [rax+48], [rax+56]    -> r14, r15
;   Saves r12-r15, rsi, rdi, xmm6 and xmm7 on the stack (80 bytes);
;   r10 and r11 are overwritten without being saved.
;   NOTE(review): rax must already be set up by the caller of this macro;
;   presumably it holds rsp as of just after "push rbp", which would make
;   +48/+56 the fifth and sixth stack arguments -- confirm against callers.
;   movaps requires rsp to be 16-byte aligned at both store points.
%imacro collect_args 0
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r10, rcx
        mov     r11, rdx
        mov     r12, r8
        mov     r13, r9
        mov     r14, [rax+48]
        mov     r15, [rax+56]
        push    rsi
        push    rdi
        sub     rsp, SIZEOF_XMMWORD
        movaps  XMMWORD [rsp], xmm6
        sub     rsp, SIZEOF_XMMWORD
        movaps  XMMWORD [rsp], xmm7
%endmacro

; uncollect_args (WIN64 variant)
;   Restores xmm7, xmm6, rdi, rsi and r15-r12, in the reverse of the order
;   in which collect_args saved them.
%imacro uncollect_args 0
        movaps  xmm7, XMMWORD [rsp]
        add     rsp, SIZEOF_XMMWORD
        movaps  xmm6, XMMWORD [rsp]
        add     rsp, SIZEOF_XMMWORD
        pop     rdi
        pop     rsi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
%endmacro

%else           ; !WIN64

; collect_args (non-WIN64 variant)
;   arguments 1-6:  rdi, rsi, rdx, rcx, r8, r9  -> r10 .. r15
;   Saves r10-r15 on the stack first (48 bytes).
%imacro collect_args 0
        push    r10
        push    r11
        push    r12
        push    r13
        push    r14
        push    r15
        mov     r10, rdi
        mov     r11, rsi
        mov     r12, rdx
        mov     r13, rcx
        mov     r14, r8
        mov     r15, r9
%endmacro

; uncollect_args (non-WIN64 variant)
;   Restores r15-r10, in the reverse of the order in which they were pushed.
%imacro uncollect_args 0
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     r11
        pop     r10
%endmacro

%endif          ; WIN64

%endif          ; __x86_64__

; --------------------------------------------------------------------------
;  Defines picked up from the C headers
;
%include "jsimdcfg.inc"

; --------------------------------------------------------------------------