# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# ------------------------------------------------------------------------
#
# Implementation of s_mpv_mul_set_vec which exploits
# the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------

# r = a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
#
# uint64_t
# s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
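# For reference, a minimal C sketch of what this routine computes, based on
# the description above. It is illustrative only, not part of the original
# source; it assumes <stdint.h> and a compiler providing the __uint128_t
# extension (e.g. GCC or Clang on x86-64):
#
#     uint64_t
#     s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#     {
#         uint64_t cy = 0;                      /* running carry        */
#         for (int i = 0; i < len; i++) {
#             /* 64x64 -> 128 bit multiply, plus the incoming carry */
#             __uint128_t p = (__uint128_t)a[i] * digit + cy;
#             r[i] = (uint64_t)p;               /* low 64 bits          */
#             cy = (uint64_t)(p >> 64);         /* high 64 bits         */
#         }
#         return cy;                            /* final carry digit    */
#     }
#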
.text
.align 16
.globl s_mpv_mul_set_vec64
.type s_mpv_mul_set_vec64, @function
s_mpv_mul_set_vec64:

        xorq %rax, %rax           # if (len == 0) return (0)
        testq %rdx, %rdx
        jz .L17

        movq %rdx, %r8            # Use r8 for len; %rdx is used by mul
        xorq %r9, %r9             # cy = 0

.L15:
        cmpq $8, %r8              # 8 - len
        jb .L16
        movq 0(%rsi), %rax        # rax = a[0]
        movq 8(%rsi), %r11        # prefetch a[1]
        mulq %rcx                 # p = a[0] * digit
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 0(%rdi)        # r[0] = lo(p)
        movq %rdx, %r9            # cy = hi(p)

        movq %r11, %rax
        movq 16(%rsi), %r11       # prefetch a[2]
        mulq %rcx                 # p = a[1] * digit
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 8(%rdi)        # r[1] = lo(p)
        movq %rdx, %r9            # cy = hi(p)

        movq %r11, %rax
        movq 24(%rsi), %r11       # prefetch a[3]
        mulq %rcx                 # p = a[2] * digit
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 16(%rdi)       # r[2] = lo(p)
        movq %rdx, %r9            # cy = hi(p)

        movq %r11, %rax
        movq 32(%rsi), %r11       # prefetch a[4]
        mulq %rcx                 # p = a[3] * digit
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 24(%rdi)       # r[3] = lo(p)
        movq %rdx, %r9            # cy = hi(p)

        movq %r11, %rax
        movq 40(%rsi), %r11       # prefetch a[5]
        mulq %rcx                 # p = a[4] * digit
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 32(%rdi)       # r[4] = lo(p)
        movq %rdx, %r9            # cy = hi(p)

        movq %r11, %rax
        movq 48(%rsi), %r11       # prefetch a[6]
        mulq %rcx                 # p = a[5] * digit
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 40(%rdi)       # r[5] = lo(p)
        movq %rdx, %r9            # cy = hi(p)

        movq %r11, %rax
        movq 56(%rsi), %r11       # prefetch a[7]
        mulq %rcx                 # p = a[6] * digit
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 48(%rdi)       # r[6] = lo(p)
        movq %rdx, %r9            # cy = hi(p)

        movq %r11, %rax
        mulq %rcx                 # p = a[7] * digit
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 56(%rdi)       # r[7] = lo(p)
        movq %rdx, %r9            # cy = hi(p)

        addq $64, %rsi
        addq $64, %rdi
        subq $8, %r8

        jz .L17
        jmp .L15

.L16:
        movq 0(%rsi), %rax
        mulq %rcx                 # p = a[0] * digit
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 0(%rdi)        # r[0] = lo(p)
        movq %rdx, %r9            # cy = hi(p)
        decq %r8
        jz .L17

        movq 8(%rsi), %rax
        mulq %rcx                 # p = a[1] * digit
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 8(%rdi)        # r[1] = lo(p)
        movq %rdx, %r9            # cy = hi(p)
        decq %r8
        jz .L17

        movq 16(%rsi), %rax
        mulq %rcx                 # p = a[2] * digit
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 16(%rdi)       # r[2] = lo(p)
        movq %rdx, %r9            # cy = hi(p)
        decq %r8
        jz .L17

        movq 24(%rsi), %rax
        mulq %rcx                 # p = a[3] * digit
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 24(%rdi)       # r[3] = lo(p)
        movq %rdx, %r9            # cy = hi(p)
        decq %r8
        jz .L17

        movq 32(%rsi), %rax
        mulq %rcx                 # p = a[4] * digit
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 32(%rdi)       # r[4] = lo(p)
        movq %rdx, %r9            # cy = hi(p)
        decq %r8
        jz .L17

        movq 40(%rsi), %rax
        mulq %rcx                 # p = a[5] * digit
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 40(%rdi)       # r[5] = lo(p)
        movq %rdx, %r9            # cy = hi(p)
        decq %r8
        jz .L17

        movq 48(%rsi), %rax
        mulq %rcx                 # p = a[6] * digit
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 48(%rdi)       # r[6] = lo(p)
        movq %rdx, %r9            # cy = hi(p)
        decq %r8
        jz .L17

.L17:
        movq %r9, %rax
        ret

.size s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64

# ------------------------------------------------------------------------
#
# Implementation of s_mpv_mul_add_vec which exploits
# the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------

# r += a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
#
# uint64_t
# s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
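# For reference, a minimal C sketch of what this routine computes, based on
# the description above. It is illustrative only, not part of the original
# source; it assumes <stdint.h> and a compiler providing the __uint128_t
# extension (e.g. GCC or Clang on x86-64):
#
#     uint64_t
#     s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#     {
#         uint64_t cy = 0;                      /* running carry        */
#         for (int i = 0; i < len; i++) {
#             /* 64x64 -> 128 bit multiply, plus r[i] and the carry;  */
#             /* the full sum still fits in 128 bits                  */
#             __uint128_t p = (__uint128_t)a[i] * digit + r[i] + cy;
#             r[i] = (uint64_t)p;               /* low 64 bits          */
#             cy = (uint64_t)(p >> 64);         /* high 64 bits         */
#         }
#         return cy;                            /* final carry digit    */
#     }
#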
.text
.align 16
.globl s_mpv_mul_add_vec64
.type s_mpv_mul_add_vec64, @function
s_mpv_mul_add_vec64:

        xorq %rax, %rax           # if (len == 0) return (0)
        testq %rdx, %rdx
        jz .L27

        movq %rdx, %r8            # Use r8 for len; %rdx is used by mul
        xorq %r9, %r9             # cy = 0

.L25:
        cmpq $8, %r8              # 8 - len
        jb .L26
        movq 0(%rsi), %rax        # rax = a[0]
        movq 0(%rdi), %r10        # r10 = r[0]
        movq 8(%rsi), %r11        # prefetch a[1]
        mulq %rcx                 # p = a[0] * digit
        addq %r10, %rax
        adcq $0, %rdx             # p += r[0]
        movq 8(%rdi), %r10        # prefetch r[1]
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 0(%rdi)        # r[0] = lo(p)
        movq %rdx, %r9            # cy = hi(p)

        movq %r11, %rax
        movq 16(%rsi), %r11       # prefetch a[2]
        mulq %rcx                 # p = a[1] * digit
        addq %r10, %rax
        adcq $0, %rdx             # p += r[1]
        movq 16(%rdi), %r10       # prefetch r[2]
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 8(%rdi)        # r[1] = lo(p)
        movq %rdx, %r9            # cy = hi(p)

        movq %r11, %rax
        movq 24(%rsi), %r11       # prefetch a[3]
        mulq %rcx                 # p = a[2] * digit
        addq %r10, %rax
        adcq $0, %rdx             # p += r[2]
        movq 24(%rdi), %r10       # prefetch r[3]
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 16(%rdi)       # r[2] = lo(p)
        movq %rdx, %r9            # cy = hi(p)

        movq %r11, %rax
        movq 32(%rsi), %r11       # prefetch a[4]
        mulq %rcx                 # p = a[3] * digit
        addq %r10, %rax
        adcq $0, %rdx             # p += r[3]
        movq 32(%rdi), %r10       # prefetch r[4]
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 24(%rdi)       # r[3] = lo(p)
        movq %rdx, %r9            # cy = hi(p)

        movq %r11, %rax
        movq 40(%rsi), %r11       # prefetch a[5]
        mulq %rcx                 # p = a[4] * digit
        addq %r10, %rax
        adcq $0, %rdx             # p += r[4]
        movq 40(%rdi), %r10       # prefetch r[5]
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 32(%rdi)       # r[4] = lo(p)
        movq %rdx, %r9            # cy = hi(p)

        movq %r11, %rax
        movq 48(%rsi), %r11       # prefetch a[6]
        mulq %rcx                 # p = a[5] * digit
        addq %r10, %rax
        adcq $0, %rdx             # p += r[5]
        movq 48(%rdi), %r10       # prefetch r[6]
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 40(%rdi)       # r[5] = lo(p)
        movq %rdx, %r9            # cy = hi(p)

        movq %r11, %rax
        movq 56(%rsi), %r11       # prefetch a[7]
        mulq %rcx                 # p = a[6] * digit
        addq %r10, %rax
        adcq $0, %rdx             # p += r[6]
        movq 56(%rdi), %r10       # prefetch r[7]
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 48(%rdi)       # r[6] = lo(p)
        movq %rdx, %r9            # cy = hi(p)

        movq %r11, %rax
        mulq %rcx                 # p = a[7] * digit
        addq %r10, %rax
        adcq $0, %rdx             # p += r[7]
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 56(%rdi)       # r[7] = lo(p)
        movq %rdx, %r9            # cy = hi(p)

        addq $64, %rsi
        addq $64, %rdi
        subq $8, %r8

        jz .L27
        jmp .L25

.L26:
        movq 0(%rsi), %rax
        movq 0(%rdi), %r10
        mulq %rcx                 # p = a[0] * digit
        addq %r10, %rax
        adcq $0, %rdx             # p += r[0]
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 0(%rdi)        # r[0] = lo(p)
        movq %rdx, %r9            # cy = hi(p)
        decq %r8
        jz .L27

        movq 8(%rsi), %rax
        movq 8(%rdi), %r10
        mulq %rcx                 # p = a[1] * digit
        addq %r10, %rax
        adcq $0, %rdx             # p += r[1]
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 8(%rdi)        # r[1] = lo(p)
        movq %rdx, %r9            # cy = hi(p)
        decq %r8
        jz .L27

        movq 16(%rsi), %rax
        movq 16(%rdi), %r10
        mulq %rcx                 # p = a[2] * digit
        addq %r10, %rax
        adcq $0, %rdx             # p += r[2]
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 16(%rdi)       # r[2] = lo(p)
        movq %rdx, %r9            # cy = hi(p)
        decq %r8
        jz .L27

        movq 24(%rsi), %rax
        movq 24(%rdi), %r10
        mulq %rcx                 # p = a[3] * digit
        addq %r10, %rax
        adcq $0, %rdx             # p += r[3]
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 24(%rdi)       # r[3] = lo(p)
        movq %rdx, %r9            # cy = hi(p)
        decq %r8
        jz .L27

        movq 32(%rsi), %rax
        movq 32(%rdi), %r10
        mulq %rcx                 # p = a[4] * digit
        addq %r10, %rax
        adcq $0, %rdx             # p += r[4]
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 32(%rdi)       # r[4] = lo(p)
        movq %rdx, %r9            # cy = hi(p)
        decq %r8
        jz .L27

        movq 40(%rsi), %rax
        movq 40(%rdi), %r10
        mulq %rcx                 # p = a[5] * digit
        addq %r10, %rax
        adcq $0, %rdx             # p += r[5]
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 40(%rdi)       # r[5] = lo(p)
        movq %rdx, %r9            # cy = hi(p)
        decq %r8
        jz .L27

        movq 48(%rsi), %rax
        movq 48(%rdi), %r10
        mulq %rcx                 # p = a[6] * digit
        addq %r10, %rax
        adcq $0, %rdx             # p += r[6]
        addq %r9, %rax
        adcq $0, %rdx             # p += cy
        movq %rax, 48(%rdi)       # r[6] = lo(p)
        movq %rdx, %r9            # cy = hi(p)
        decq %r8
        jz .L27

.L27:
        movq %r9, %rax
        ret

.size s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64

# Magic indicating no need for an executable stack
.section .note.GNU-stack, "", @progbits
.previous