; This Source Code Form is subject to the terms of the Mozilla Public
; License, v. 2.0. If a copy of the MPL was not distributed with this
; file, You can obtain one at http://mozilla.org/MPL/2.0/.

|
;
; This code was converted from mpi_amd64_gas.asm to MASM syntax for x64.
;

|
; ------------------------------------------------------------------------
;
; Implementation of s_mpv_mul_set_vec which exploits
; the 64X64->128 bit unsigned multiply instruction.
;
; ------------------------------------------------------------------------

|
; r = a * digit, where r and a are vectors of length len
; returns the carry digit
; r and a are 64-bit aligned.
;
; uint64_t
; s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
;

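; For reference, a rough C equivalent of the routine below (an illustrative
; sketch only, not part of this file; it assumes a compiler that provides an
; unsigned 128-bit type such as unsigned __int128):
;
;   uint64_t
;   s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
;   {
;       uint64_t carry = 0;
;       for (int i = 0; i < len; i++) {
;           unsigned __int128 p = (unsigned __int128)a[i] * digit + carry;
;           r[i]  = (uint64_t)p;         /* low 64 bits of the product    */
;           carry = (uint64_t)(p >> 64); /* high 64 bits become the carry */
;       }
;       return carry;
;   }
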
|
.CODE

s_mpv_mul_set_vec64 PROC

|
; Compatibility shim for the parameter registers:
;
; GAS (System V AMD64 ABI) and MASM (Microsoft x64 ABI) pass parameters in
; different registers, so move the incoming arguments into the registers
; that the original GAS code expects.
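;
;   argument   MASM (Microsoft x64)   GAS (System V AMD64)
;   r          rcx                    rdi
;   a          rdx                    rsi
;   len        r8d                    edx
;   digit      r9                     rcx
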
|
        push    rdi
        push    rsi

        mov     rdi, rcx                ; rdi = r
        mov     rsi, rdx                ; rsi = a
        mov     edx, r8d                ; rdx = len (zero-extends r8d)
        mov     rcx, r9                 ; rcx = digit

        xor     r9, r9                  ; cy = 0; also the return value if len == 0
        test    rdx, rdx
        jz      L17
        mov     r8, rdx                 ; r8 = len; rdx is clobbered by mul

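; Main loop: processes eight 64-bit words of a per iteration.  For each word,
; rdx:rax = a[i] * digit is computed with mul, the running carry in r9 is
; added in, the low half is stored to r[i], and the high half becomes the new
; carry.  The next word of a is pre-loaded into r11 because mul clobbers
; rdx:rax.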
|
L15:
        cmp     r8, 8
        jb      L16
        mov     rax, [rsi]              ; rax = a[0]
        mov     r11, [8+rsi]            ; pre-load a[1]
        mul     rcx                     ; rdx:rax = a[0] * digit
        add     rax, r9                 ; add the carry from the previous word
        adc     rdx, 0
        mov     [0+rdi], rax            ; r[0] = low half
        mov     r9, rdx                 ; carry = high half
        mov     rax, r11
        mov     r11, [16+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [8+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [24+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [16+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [32+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [24+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [40+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [32+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [48+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [40+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [56+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [48+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [56+rdi], rax
        mov     r9, rdx
        add     rsi, 64                 ; advance a by eight words
        add     rdi, 64                 ; advance r by eight words
        sub     r8, 8
        jz      L17
        jmp     L15

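; Tail: handles the remaining words (at most seven) one at a time.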
|
L16:
        mov     rax, [0+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [0+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [8+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [8+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [16+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [16+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [24+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [24+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [32+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [32+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [40+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [40+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17
        mov     rax, [48+rsi]
        mul     rcx
        add     rax, r9
        adc     rdx, 0
        mov     [48+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L17

|
L17:
        mov     rax, r9                 ; return the final carry
        pop     rsi
        pop     rdi
        ret

s_mpv_mul_set_vec64 ENDP


|
; ------------------------------------------------------------------------
;
; Implementation of s_mpv_mul_add_vec which exploits
; the 64X64->128 bit unsigned multiply instruction.
;
; ------------------------------------------------------------------------

|
; r += a * digit, where r and a are vectors of length len
; returns the carry digit
; r and a are 64-bit aligned.
;
; uint64_t
; s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
;

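; For reference, a rough C equivalent (again an illustrative sketch only,
; assuming an unsigned 128-bit type); it differs from the routine above in
; that the existing word r[i] is added into the product:
;
;   uint64_t
;   s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
;   {
;       uint64_t carry = 0;
;       for (int i = 0; i < len; i++) {
;           unsigned __int128 p =
;               (unsigned __int128)a[i] * digit + r[i] + carry;
;           r[i]  = (uint64_t)p;
;           carry = (uint64_t)(p >> 64);
;       }
;       return carry;
;   }
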
|
s_mpv_mul_add_vec64 PROC

|
; Compatibility shim for the parameter registers:
;
; GAS (System V AMD64 ABI) and MASM (Microsoft x64 ABI) pass parameters in
; different registers, so move the incoming arguments into the registers
; that the original GAS code expects.
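; (same mapping as in s_mpv_mul_set_vec64: rcx/rdx/r8d/r9 on the MASM side
; become rdi/rsi/edx/rcx for r, a, len, and digit respectively)
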
|
        push    rdi
        push    rsi

        mov     rdi, rcx                ; rdi = r
        mov     rsi, rdx                ; rsi = a
        mov     edx, r8d                ; rdx = len (zero-extends r8d)
        mov     rcx, r9                 ; rcx = digit

        xor     r9, r9                  ; cy = 0; also the return value if len == 0
        test    rdx, rdx
        jz      L27
        mov     r8, rdx                 ; r8 = len; rdx is clobbered by mul

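; Main loop: processes eight 64-bit words per iteration.  For each word,
; rdx:rax = a[i] * digit, then r[i] (pre-loaded into r10) and the running
; carry in r9 are added in; the low half is stored back to r[i] and the high
; half becomes the new carry.  The next words of a and r are pre-loaded into
; r11 and r10 to overlap the loads with the multiply.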
|
L25:
        cmp     r8, 8
        jb      L26
        mov     rax, [0+rsi]            ; rax = a[0]
        mov     r10, [0+rdi]            ; r10 = r[0]
        mov     r11, [8+rsi]            ; pre-load a[1]
        mul     rcx                     ; rdx:rax = a[0] * digit
        add     rax, r10                ; add the existing word r[0]
        adc     rdx, 0
        mov     r10, [8+rdi]            ; pre-load r[1]
        add     rax, r9                 ; add the carry from the previous word
        adc     rdx, 0
        mov     [0+rdi], rax            ; r[0] = low half
        mov     r9, rdx                 ; carry = high half
        mov     rax, r11
        mov     r11, [16+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [16+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [8+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [24+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [24+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [16+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [32+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [32+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [24+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [40+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [40+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [32+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [48+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [48+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [40+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mov     r11, [56+rsi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        mov     r10, [56+rdi]
        add     rax, r9
        adc     rdx, 0
        mov     [48+rdi], rax
        mov     r9, rdx
        mov     rax, r11
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [56+rdi], rax
        mov     r9, rdx
        add     rsi, 64                 ; advance a by eight words
        add     rdi, 64                 ; advance r by eight words
        sub     r8, 8
        jz      L27
        jmp     L25

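; Tail: handles the remaining words (at most seven) one at a time.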
|
L26:
        mov     rax, [0+rsi]
        mov     r10, [0+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [0+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [8+rsi]
        mov     r10, [8+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [8+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [16+rsi]
        mov     r10, [16+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [16+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [24+rsi]
        mov     r10, [24+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [24+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [32+rsi]
        mov     r10, [32+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [32+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [40+rsi]
        mov     r10, [40+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [40+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27
        mov     rax, [48+rsi]
        mov     r10, [48+rdi]
        mul     rcx
        add     rax, r10
        adc     rdx, 0
        add     rax, r9
        adc     rdx, 0
        mov     [48+rdi], rax
        mov     r9, rdx
        dec     r8
        jz      L27

|
L27:
        mov     rax, r9                 ; return the final carry

        pop     rsi
        pop     rdi
        ret

s_mpv_mul_add_vec64 ENDP

END