Thu, 22 Jan 2015 13:21:57 +0100
Incorporate the changes requested in the Mozilla review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# ------------------------------------------------------------------------
#
# Implementation of s_mpv_mul_set_vec which exploits
# the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------

# r = a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
#
# uint64_t
# s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
21 .text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:
23 xorq %rax, %rax # if (len == 0) return (0)
24 testq %rdx, %rdx
25 jz .L17
27 movq %rdx, %r8 # Use r8 for len; %rdx is used by mul
28 xorq %r9, %r9 # cy = 0
30 .L15:
31 cmpq $8, %r8 # 8 - len
32 jb .L16
33 movq 0(%rsi), %rax # rax = a[0]
34 movq 8(%rsi), %r11 # prefetch a[1]
35 mulq %rcx # p = a[0] * digit
36 addq %r9, %rax
37 adcq $0, %rdx # p += cy
38 movq %rax, 0(%rdi) # r[0] = lo(p)
39 movq %rdx, %r9 # cy = hi(p)
41 movq %r11, %rax
42 movq 16(%rsi), %r11 # prefetch a[2]
43 mulq %rcx # p = a[1] * digit
44 addq %r9, %rax
45 adcq $0, %rdx # p += cy
46 movq %rax, 8(%rdi) # r[1] = lo(p)
47 movq %rdx, %r9 # cy = hi(p)
49 movq %r11, %rax
50 movq 24(%rsi), %r11 # prefetch a[3]
51 mulq %rcx # p = a[2] * digit
52 addq %r9, %rax
53 adcq $0, %rdx # p += cy
54 movq %rax, 16(%rdi) # r[2] = lo(p)
55 movq %rdx, %r9 # cy = hi(p)
57 movq %r11, %rax
58 movq 32(%rsi), %r11 # prefetch a[4]
59 mulq %rcx # p = a[3] * digit
60 addq %r9, %rax
61 adcq $0, %rdx # p += cy
62 movq %rax, 24(%rdi) # r[3] = lo(p)
63 movq %rdx, %r9 # cy = hi(p)
65 movq %r11, %rax
66 movq 40(%rsi), %r11 # prefetch a[5]
67 mulq %rcx # p = a[4] * digit
68 addq %r9, %rax
69 adcq $0, %rdx # p += cy
70 movq %rax, 32(%rdi) # r[4] = lo(p)
71 movq %rdx, %r9 # cy = hi(p)
73 movq %r11, %rax
74 movq 48(%rsi), %r11 # prefetch a[6]
75 mulq %rcx # p = a[5] * digit
76 addq %r9, %rax
77 adcq $0, %rdx # p += cy
78 movq %rax, 40(%rdi) # r[5] = lo(p)
79 movq %rdx, %r9 # cy = hi(p)
81 movq %r11, %rax
82 movq 56(%rsi), %r11 # prefetch a[7]
83 mulq %rcx # p = a[6] * digit
84 addq %r9, %rax
85 adcq $0, %rdx # p += cy
86 movq %rax, 48(%rdi) # r[6] = lo(p)
87 movq %rdx, %r9 # cy = hi(p)
89 movq %r11, %rax
90 mulq %rcx # p = a[7] * digit
91 addq %r9, %rax
92 adcq $0, %rdx # p += cy
93 movq %rax, 56(%rdi) # r[7] = lo(p)
94 movq %rdx, %r9 # cy = hi(p)
96 addq $64, %rsi
97 addq $64, %rdi
98 subq $8, %r8
100 jz .L17
101 jmp .L15
103 .L16:
104 movq 0(%rsi), %rax
105 mulq %rcx # p = a[0] * digit
106 addq %r9, %rax
107 adcq $0, %rdx # p += cy
108 movq %rax, 0(%rdi) # r[0] = lo(p)
109 movq %rdx, %r9 # cy = hi(p)
110 decq %r8
111 jz .L17
113 movq 8(%rsi), %rax
114 mulq %rcx # p = a[1] * digit
115 addq %r9, %rax
116 adcq $0, %rdx # p += cy
117 movq %rax, 8(%rdi) # r[1] = lo(p)
118 movq %rdx, %r9 # cy = hi(p)
119 decq %r8
120 jz .L17
122 movq 16(%rsi), %rax
123 mulq %rcx # p = a[2] * digit
124 addq %r9, %rax
125 adcq $0, %rdx # p += cy
126 movq %rax, 16(%rdi) # r[2] = lo(p)
127 movq %rdx, %r9 # cy = hi(p)
128 decq %r8
129 jz .L17
131 movq 24(%rsi), %rax
132 mulq %rcx # p = a[3] * digit
133 addq %r9, %rax
134 adcq $0, %rdx # p += cy
135 movq %rax, 24(%rdi) # r[3] = lo(p)
136 movq %rdx, %r9 # cy = hi(p)
137 decq %r8
138 jz .L17
140 movq 32(%rsi), %rax
141 mulq %rcx # p = a[4] * digit
142 addq %r9, %rax
143 adcq $0, %rdx # p += cy
144 movq %rax, 32(%rdi) # r[4] = lo(p)
145 movq %rdx, %r9 # cy = hi(p)
146 decq %r8
147 jz .L17
149 movq 40(%rsi), %rax
150 mulq %rcx # p = a[5] * digit
151 addq %r9, %rax
152 adcq $0, %rdx # p += cy
153 movq %rax, 40(%rdi) # r[5] = lo(p)
154 movq %rdx, %r9 # cy = hi(p)
155 decq %r8
156 jz .L17
158 movq 48(%rsi), %rax
159 mulq %rcx # p = a[6] * digit
160 addq %r9, %rax
161 adcq $0, %rdx # p += cy
162 movq %rax, 48(%rdi) # r[6] = lo(p)
163 movq %rdx, %r9 # cy = hi(p)
164 decq %r8
165 jz .L17
168 .L17:
169 movq %r9, %rax
170 ret
172 .size s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64
# ------------------------------------------------------------------------
#
# Implementation of s_mpv_mul_add_vec which exploits
# the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------

# r += a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
#
# uint64_t
# s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
#-----------------------------------------------------------------------
# uint64_t s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len,
#                              uint64_t digit)
#
# r[i] += a[i] * digit for i in [0, len); returns the final carry digit.
# Same 8x-unrolled structure as s_mpv_mul_set_vec64, with an extra
# one-element-ahead load of r[] into %r10 (a[] is prefetched in %r11).
# Each step folds in both r[i] and the running carry, so two adc chains
# are needed per element.
#
# ABI:     SysV AMD64
# In:      rdi = r, rsi = a, rdx = len, rcx = digit
# Out:     rax = carry
# Clobbers: rdx, r8, r9, r10, r11, flags (leaf; no stack use)
#
# NOTE(review): as in s_mpv_mul_set_vec64, len is declared int but
# tested as the full 64-bit %rdx -- confirm callers zero-extend it.
#-----------------------------------------------------------------------
	.text
	.align	16
	.globl	s_mpv_mul_add_vec64
	.type	s_mpv_mul_add_vec64, @function
s_mpv_mul_add_vec64:
	xorq	%rax, %rax		# if (len == 0) return (0)
	testq	%rdx, %rdx
	jz	.L27

	movq	%rdx, %r8		# Use r8 for len; %rdx is used by mul
	xorq	%r9, %r9		# cy = 0

.L25:
	cmpq	$8, %r8			# 8 - len
	jb	.L26
	movq	0(%rsi), %rax		# rax = a[0]
	movq	0(%rdi), %r10		# r10 = r[0]
	movq	8(%rsi), %r11		# prefetch a[1]
	mulq	%rcx			# p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[0]
	movq	8(%rdi), %r10		# prefetch r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 0(%rdi)		# r[0] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	16(%rsi), %r11		# prefetch a[2]
	mulq	%rcx			# p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[1]
	movq	16(%rdi), %r10		# prefetch r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 8(%rdi)		# r[1] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	24(%rsi), %r11		# prefetch a[3]
	mulq	%rcx			# p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[2]
	movq	24(%rdi), %r10		# prefetch r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 16(%rdi)		# r[2] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	32(%rsi), %r11		# prefetch a[4]
	mulq	%rcx			# p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[3]
	movq	32(%rdi), %r10		# prefetch r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 24(%rdi)		# r[3] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	40(%rsi), %r11		# prefetch a[5]
	mulq	%rcx			# p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[4]
	movq	40(%rdi), %r10		# prefetch r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 32(%rdi)		# r[4] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	48(%rsi), %r11		# prefetch a[6]
	mulq	%rcx			# p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[5]
	movq	48(%rdi), %r10		# prefetch r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 40(%rdi)		# r[5] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	movq	56(%rsi), %r11		# prefetch a[7]
	mulq	%rcx			# p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[6]
	movq	56(%rdi), %r10		# prefetch r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 48(%rdi)		# r[6] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	movq	%r11, %rax
	mulq	%rcx			# p = a[7] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[7]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 56(%rdi)		# r[7] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)

	addq	$64, %rsi		# a += 8
	addq	$64, %rdi		# r += 8
	subq	$8, %r8			# len -= 8
	jz	.L27
	jmp	.L25

	# Tail: handle the remaining 1..7 elements, one at a time.
.L26:
	movq	0(%rsi), %rax
	movq	0(%rdi), %r10
	mulq	%rcx			# p = a[0] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[0]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 0(%rdi)		# r[0] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	8(%rsi), %rax
	movq	8(%rdi), %r10
	mulq	%rcx			# p = a[1] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[1]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 8(%rdi)		# r[1] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	16(%rsi), %rax
	movq	16(%rdi), %r10
	mulq	%rcx			# p = a[2] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[2]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 16(%rdi)		# r[2] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	24(%rsi), %rax
	movq	24(%rdi), %r10
	mulq	%rcx			# p = a[3] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[3]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 24(%rdi)		# r[3] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	32(%rsi), %rax
	movq	32(%rdi), %r10
	mulq	%rcx			# p = a[4] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[4]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 32(%rdi)		# r[4] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	40(%rsi), %rax
	movq	40(%rdi), %r10
	mulq	%rcx			# p = a[5] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[5]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 40(%rdi)		# r[5] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27

	movq	48(%rsi), %rax
	movq	48(%rdi), %r10
	mulq	%rcx			# p = a[6] * digit
	addq	%r10, %rax
	adcq	$0, %rdx		# p += r[6]
	addq	%r9, %rax
	adcq	$0, %rdx		# p += cy
	movq	%rax, 48(%rdi)		# r[6] = lo(p)
	movq	%rdx, %r9		# cy = hi(p)
	decq	%r8
	jz	.L27		# redundant before fall-through; kept as in original

.L27:
	movq	%r9, %rax		# return cy
	ret

	.size	s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64
387 # Magic indicating no need for an executable stack
388 .section .note.GNU-stack, "", @progbits
389 .previous