|
1 / This Source Code Form is subject to the terms of the Mozilla Public |
|
2 / License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3 / file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
4 |
|
5 |
|
6 / ------------------------------------------------------------------------ |
|
7 / |
|
/ Implementation of s_mpv_mul_set_vec64 which exploits
|
9 / the 64X64->128 bit unsigned multiply instruction. |
|
10 / |
|
11 / ------------------------------------------------------------------------ |
|
12 |
|
13 / r = a * digit, r and a are vectors of length len |
|
14 / returns the carry digit |
|
15 / r and a are 64 bit aligned. |
|
16 / |
|
17 / uint64_t |
|
18 / s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit) |
|
19 / |
|
20 |
|
21 .text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64: |
|
22 |
|
23 xorq %rax, %rax / if (len == 0) return (0) |
|
24 testq %rdx, %rdx |
|
25 jz .L17 |
|
26 |
|
27 movq %rdx, %r8 / Use r8 for len; %rdx is used by mul |
|
28 xorq %r9, %r9 / cy = 0 |
|
29 |
|
30 .L15: |
|
31 cmpq $8, %r8 / 8 - len |
|
32 jb .L16 |
|
33 movq 0(%rsi), %rax / rax = a[0] |
|
34 movq 8(%rsi), %r11 / prefetch a[1] |
|
35 mulq %rcx / p = a[0] * digit |
|
36 addq %r9, %rax |
|
37 adcq $0, %rdx / p += cy |
|
38 movq %rax, 0(%rdi) / r[0] = lo(p) |
|
39 movq %rdx, %r9 / cy = hi(p) |
|
40 |
|
41 movq %r11, %rax |
|
42 movq 16(%rsi), %r11 / prefetch a[2] |
|
43 mulq %rcx / p = a[1] * digit |
|
44 addq %r9, %rax |
|
45 adcq $0, %rdx / p += cy |
|
46 movq %rax, 8(%rdi) / r[1] = lo(p) |
|
47 movq %rdx, %r9 / cy = hi(p) |
|
48 |
|
49 movq %r11, %rax |
|
50 movq 24(%rsi), %r11 / prefetch a[3] |
|
51 mulq %rcx / p = a[2] * digit |
|
52 addq %r9, %rax |
|
53 adcq $0, %rdx / p += cy |
|
54 movq %rax, 16(%rdi) / r[2] = lo(p) |
|
55 movq %rdx, %r9 / cy = hi(p) |
|
56 |
|
57 movq %r11, %rax |
|
58 movq 32(%rsi), %r11 / prefetch a[4] |
|
59 mulq %rcx / p = a[3] * digit |
|
60 addq %r9, %rax |
|
61 adcq $0, %rdx / p += cy |
|
62 movq %rax, 24(%rdi) / r[3] = lo(p) |
|
63 movq %rdx, %r9 / cy = hi(p) |
|
64 |
|
65 movq %r11, %rax |
|
66 movq 40(%rsi), %r11 / prefetch a[5] |
|
67 mulq %rcx / p = a[4] * digit |
|
68 addq %r9, %rax |
|
69 adcq $0, %rdx / p += cy |
|
70 movq %rax, 32(%rdi) / r[4] = lo(p) |
|
71 movq %rdx, %r9 / cy = hi(p) |
|
72 |
|
73 movq %r11, %rax |
|
74 movq 48(%rsi), %r11 / prefetch a[6] |
|
75 mulq %rcx / p = a[5] * digit |
|
76 addq %r9, %rax |
|
77 adcq $0, %rdx / p += cy |
|
78 movq %rax, 40(%rdi) / r[5] = lo(p) |
|
79 movq %rdx, %r9 / cy = hi(p) |
|
80 |
|
81 movq %r11, %rax |
|
82 movq 56(%rsi), %r11 / prefetch a[7] |
|
83 mulq %rcx / p = a[6] * digit |
|
84 addq %r9, %rax |
|
85 adcq $0, %rdx / p += cy |
|
86 movq %rax, 48(%rdi) / r[6] = lo(p) |
|
87 movq %rdx, %r9 / cy = hi(p) |
|
88 |
|
89 movq %r11, %rax |
|
90 mulq %rcx / p = a[7] * digit |
|
91 addq %r9, %rax |
|
92 adcq $0, %rdx / p += cy |
|
93 movq %rax, 56(%rdi) / r[7] = lo(p) |
|
94 movq %rdx, %r9 / cy = hi(p) |
|
95 |
|
96 addq $64, %rsi |
|
97 addq $64, %rdi |
|
98 subq $8, %r8 |
|
99 |
|
100 jz .L17 |
|
101 jmp .L15 |
|
102 |
|
103 .L16: |
|
104 movq 0(%rsi), %rax |
|
105 mulq %rcx / p = a[0] * digit |
|
106 addq %r9, %rax |
|
107 adcq $0, %rdx / p += cy |
|
108 movq %rax, 0(%rdi) / r[0] = lo(p) |
|
109 movq %rdx, %r9 / cy = hi(p) |
|
110 decq %r8 |
|
111 jz .L17 |
|
112 |
|
113 movq 8(%rsi), %rax |
|
114 mulq %rcx / p = a[1] * digit |
|
115 addq %r9, %rax |
|
116 adcq $0, %rdx / p += cy |
|
117 movq %rax, 8(%rdi) / r[1] = lo(p) |
|
118 movq %rdx, %r9 / cy = hi(p) |
|
119 decq %r8 |
|
120 jz .L17 |
|
121 |
|
122 movq 16(%rsi), %rax |
|
123 mulq %rcx / p = a[2] * digit |
|
124 addq %r9, %rax |
|
125 adcq $0, %rdx / p += cy |
|
126 movq %rax, 16(%rdi) / r[2] = lo(p) |
|
127 movq %rdx, %r9 / cy = hi(p) |
|
128 decq %r8 |
|
129 jz .L17 |
|
130 |
|
131 movq 24(%rsi), %rax |
|
132 mulq %rcx / p = a[3] * digit |
|
133 addq %r9, %rax |
|
134 adcq $0, %rdx / p += cy |
|
135 movq %rax, 24(%rdi) / r[3] = lo(p) |
|
136 movq %rdx, %r9 / cy = hi(p) |
|
137 decq %r8 |
|
138 jz .L17 |
|
139 |
|
140 movq 32(%rsi), %rax |
|
141 mulq %rcx / p = a[4] * digit |
|
142 addq %r9, %rax |
|
143 adcq $0, %rdx / p += cy |
|
144 movq %rax, 32(%rdi) / r[4] = lo(p) |
|
145 movq %rdx, %r9 / cy = hi(p) |
|
146 decq %r8 |
|
147 jz .L17 |
|
148 |
|
149 movq 40(%rsi), %rax |
|
150 mulq %rcx / p = a[5] * digit |
|
151 addq %r9, %rax |
|
152 adcq $0, %rdx / p += cy |
|
153 movq %rax, 40(%rdi) / r[5] = lo(p) |
|
154 movq %rdx, %r9 / cy = hi(p) |
|
155 decq %r8 |
|
156 jz .L17 |
|
157 |
|
158 movq 48(%rsi), %rax |
|
159 mulq %rcx / p = a[6] * digit |
|
160 addq %r9, %rax |
|
161 adcq $0, %rdx / p += cy |
|
162 movq %rax, 48(%rdi) / r[6] = lo(p) |
|
163 movq %rdx, %r9 / cy = hi(p) |
|
164 decq %r8 |
|
165 jz .L17 |
|
166 |
|
167 |
|
168 .L17: |
|
169 movq %r9, %rax |
|
170 ret |
|
171 |
|
172 .size s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64 |
|
173 |
|
174 / ------------------------------------------------------------------------ |
|
175 / |
|
/ Implementation of s_mpv_mul_add_vec64 which exploits
|
177 / the 64X64->128 bit unsigned multiply instruction. |
|
178 / |
|
179 / ------------------------------------------------------------------------ |
|
180 |
|
181 / r += a * digit, r and a are vectors of length len |
|
182 / returns the carry digit |
|
183 / r and a are 64 bit aligned. |
|
184 / |
|
185 / uint64_t |
|
186 / s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit) |
|
187 / |
|
188 |
|
189 .text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64: |
|
190 |
|
191 xorq %rax, %rax / if (len == 0) return (0) |
|
192 testq %rdx, %rdx |
|
193 jz .L27 |
|
194 |
|
195 movq %rdx, %r8 / Use r8 for len; %rdx is used by mul |
|
196 xorq %r9, %r9 / cy = 0 |
|
197 |
|
198 .L25: |
|
199 cmpq $8, %r8 / 8 - len |
|
200 jb .L26 |
|
201 movq 0(%rsi), %rax / rax = a[0] |
|
202 movq 0(%rdi), %r10 / r10 = r[0] |
|
203 movq 8(%rsi), %r11 / prefetch a[1] |
|
204 mulq %rcx / p = a[0] * digit |
|
205 addq %r10, %rax |
|
206 adcq $0, %rdx / p += r[0] |
|
207 movq 8(%rdi), %r10 / prefetch r[1] |
|
208 addq %r9, %rax |
|
209 adcq $0, %rdx / p += cy |
|
210 movq %rax, 0(%rdi) / r[0] = lo(p) |
|
211 movq %rdx, %r9 / cy = hi(p) |
|
212 |
|
213 movq %r11, %rax |
|
214 movq 16(%rsi), %r11 / prefetch a[2] |
|
215 mulq %rcx / p = a[1] * digit |
|
216 addq %r10, %rax |
|
217 adcq $0, %rdx / p += r[1] |
|
218 movq 16(%rdi), %r10 / prefetch r[2] |
|
219 addq %r9, %rax |
|
220 adcq $0, %rdx / p += cy |
|
221 movq %rax, 8(%rdi) / r[1] = lo(p) |
|
222 movq %rdx, %r9 / cy = hi(p) |
|
223 |
|
224 movq %r11, %rax |
|
225 movq 24(%rsi), %r11 / prefetch a[3] |
|
226 mulq %rcx / p = a[2] * digit |
|
227 addq %r10, %rax |
|
228 adcq $0, %rdx / p += r[2] |
|
229 movq 24(%rdi), %r10 / prefetch r[3] |
|
230 addq %r9, %rax |
|
231 adcq $0, %rdx / p += cy |
|
232 movq %rax, 16(%rdi) / r[2] = lo(p) |
|
233 movq %rdx, %r9 / cy = hi(p) |
|
234 |
|
235 movq %r11, %rax |
|
236 movq 32(%rsi), %r11 / prefetch a[4] |
|
237 mulq %rcx / p = a[3] * digit |
|
238 addq %r10, %rax |
|
239 adcq $0, %rdx / p += r[3] |
|
240 movq 32(%rdi), %r10 / prefetch r[4] |
|
241 addq %r9, %rax |
|
242 adcq $0, %rdx / p += cy |
|
243 movq %rax, 24(%rdi) / r[3] = lo(p) |
|
244 movq %rdx, %r9 / cy = hi(p) |
|
245 |
|
246 movq %r11, %rax |
|
247 movq 40(%rsi), %r11 / prefetch a[5] |
|
248 mulq %rcx / p = a[4] * digit |
|
249 addq %r10, %rax |
|
250 adcq $0, %rdx / p += r[4] |
|
251 movq 40(%rdi), %r10 / prefetch r[5] |
|
252 addq %r9, %rax |
|
253 adcq $0, %rdx / p += cy |
|
254 movq %rax, 32(%rdi) / r[4] = lo(p) |
|
255 movq %rdx, %r9 / cy = hi(p) |
|
256 |
|
257 movq %r11, %rax |
|
258 movq 48(%rsi), %r11 / prefetch a[6] |
|
259 mulq %rcx / p = a[5] * digit |
|
260 addq %r10, %rax |
|
261 adcq $0, %rdx / p += r[5] |
|
262 movq 48(%rdi), %r10 / prefetch r[6] |
|
263 addq %r9, %rax |
|
264 adcq $0, %rdx / p += cy |
|
265 movq %rax, 40(%rdi) / r[5] = lo(p) |
|
266 movq %rdx, %r9 / cy = hi(p) |
|
267 |
|
268 movq %r11, %rax |
|
269 movq 56(%rsi), %r11 / prefetch a[7] |
|
270 mulq %rcx / p = a[6] * digit |
|
271 addq %r10, %rax |
|
272 adcq $0, %rdx / p += r[6] |
|
273 movq 56(%rdi), %r10 / prefetch r[7] |
|
274 addq %r9, %rax |
|
275 adcq $0, %rdx / p += cy |
|
276 movq %rax, 48(%rdi) / r[6] = lo(p) |
|
277 movq %rdx, %r9 / cy = hi(p) |
|
278 |
|
279 movq %r11, %rax |
|
280 mulq %rcx / p = a[7] * digit |
|
281 addq %r10, %rax |
|
282 adcq $0, %rdx / p += r[7] |
|
283 addq %r9, %rax |
|
284 adcq $0, %rdx / p += cy |
|
285 movq %rax, 56(%rdi) / r[7] = lo(p) |
|
286 movq %rdx, %r9 / cy = hi(p) |
|
287 |
|
288 addq $64, %rsi |
|
289 addq $64, %rdi |
|
290 subq $8, %r8 |
|
291 |
|
292 jz .L27 |
|
293 jmp .L25 |
|
294 |
|
295 .L26: |
|
296 movq 0(%rsi), %rax |
|
297 movq 0(%rdi), %r10 |
|
298 mulq %rcx / p = a[0] * digit |
|
299 addq %r10, %rax |
|
300 adcq $0, %rdx / p += r[0] |
|
301 addq %r9, %rax |
|
302 adcq $0, %rdx / p += cy |
|
303 movq %rax, 0(%rdi) / r[0] = lo(p) |
|
304 movq %rdx, %r9 / cy = hi(p) |
|
305 decq %r8 |
|
306 jz .L27 |
|
307 |
|
308 movq 8(%rsi), %rax |
|
309 movq 8(%rdi), %r10 |
|
310 mulq %rcx / p = a[1] * digit |
|
311 addq %r10, %rax |
|
312 adcq $0, %rdx / p += r[1] |
|
313 addq %r9, %rax |
|
314 adcq $0, %rdx / p += cy |
|
315 movq %rax, 8(%rdi) / r[1] = lo(p) |
|
316 movq %rdx, %r9 / cy = hi(p) |
|
317 decq %r8 |
|
318 jz .L27 |
|
319 |
|
320 movq 16(%rsi), %rax |
|
321 movq 16(%rdi), %r10 |
|
322 mulq %rcx / p = a[2] * digit |
|
323 addq %r10, %rax |
|
324 adcq $0, %rdx / p += r[2] |
|
325 addq %r9, %rax |
|
326 adcq $0, %rdx / p += cy |
|
327 movq %rax, 16(%rdi) / r[2] = lo(p) |
|
328 movq %rdx, %r9 / cy = hi(p) |
|
329 decq %r8 |
|
330 jz .L27 |
|
331 |
|
332 movq 24(%rsi), %rax |
|
333 movq 24(%rdi), %r10 |
|
334 mulq %rcx / p = a[3] * digit |
|
335 addq %r10, %rax |
|
336 adcq $0, %rdx / p += r[3] |
|
337 addq %r9, %rax |
|
338 adcq $0, %rdx / p += cy |
|
339 movq %rax, 24(%rdi) / r[3] = lo(p) |
|
340 movq %rdx, %r9 / cy = hi(p) |
|
341 decq %r8 |
|
342 jz .L27 |
|
343 |
|
344 movq 32(%rsi), %rax |
|
345 movq 32(%rdi), %r10 |
|
346 mulq %rcx / p = a[4] * digit |
|
347 addq %r10, %rax |
|
348 adcq $0, %rdx / p += r[4] |
|
349 addq %r9, %rax |
|
350 adcq $0, %rdx / p += cy |
|
351 movq %rax, 32(%rdi) / r[4] = lo(p) |
|
352 movq %rdx, %r9 / cy = hi(p) |
|
353 decq %r8 |
|
354 jz .L27 |
|
355 |
|
356 movq 40(%rsi), %rax |
|
357 movq 40(%rdi), %r10 |
|
358 mulq %rcx / p = a[5] * digit |
|
359 addq %r10, %rax |
|
360 adcq $0, %rdx / p += r[5] |
|
361 addq %r9, %rax |
|
362 adcq $0, %rdx / p += cy |
|
363 movq %rax, 40(%rdi) / r[5] = lo(p) |
|
364 movq %rdx, %r9 / cy = hi(p) |
|
365 decq %r8 |
|
366 jz .L27 |
|
367 |
|
368 movq 48(%rsi), %rax |
|
369 movq 48(%rdi), %r10 |
|
370 mulq %rcx / p = a[6] * digit |
|
371 addq %r10, %rax |
|
372 adcq $0, %rdx / p += r[6] |
|
373 addq %r9, %rax |
|
374 adcq $0, %rdx / p += cy |
|
375 movq %rax, 48(%rdi) / r[6] = lo(p) |
|
376 movq %rdx, %r9 / cy = hi(p) |
|
377 decq %r8 |
|
378 jz .L27 |
|
379 |
|
380 |
|
381 .L27: |
|
382 movq %r9, %rax |
|
383 ret |
|
384 |
|
385 .size s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64 |