|
1 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
2 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
4 |
|
5 /* Multiplication performance enhancements for sparc v8+vis CPUs. */ |
|
6 |
|
7 #include "mpi-priv.h" |
|
8 #include <stddef.h> |
|
9 #include <sys/systeminfo.h> |
|
10 #include <strings.h> |
|
11 |
|
12 /* In the functions below, */ |
|
13 /* vector y must be 8-byte aligned, and n must be even */ |
|
14 /* returns carry out of high order word of result */ |
|
15 /* maximum n is 256 */ |
|
16 |
|
/* vector x += vector y * scalar a; where y is of length n words. */
|
18 extern mp_digit mul_add_inp(mp_digit *x, const mp_digit *y, int n, mp_digit a); |
|
19 |
|
/* vector z = vector x + vector y * scalar a; where y is of length n words. */
|
21 extern mp_digit mul_add(mp_digit *z, const mp_digit *x, const mp_digit *y, |
|
22 int n, mp_digit a); |
|
23 |
|
24 /* v8 versions of these functions run on any Sparc v8 CPU. */ |
|
25 |
|
/*
 * This trick works on Sparc V8 CPUs with the Workshop compilers.
 *
 * MP_MUL_DxD(a, b, Phi, Plo): multiply the two digits a and b in
 * unsigned long long arithmetic and split the result: Plo receives the
 * product truncated to mp_digit, Phi receives the product shifted right
 * by MP_DIGIT_BIT.
 *
 * Assumes unsigned long long is at least 2 * MP_DIGIT_BIT bits wide
 * (TODO confirm against the mp_digit definition in mpi-priv.h).
 *
 * Every argument is parenthesized so expressions may be passed safely,
 * the body is wrapped in do { } while (0) so the macro behaves as a
 * single statement (usable in an unbraced if/else), and the temporary
 * has a deliberately unusual name so it cannot capture a caller variable.
 * Each argument is evaluated exactly once.
 */
#define MP_MUL_DxD(a, b, Phi, Plo)                                   \
    do {                                                             \
        unsigned long long mp_mul_dxd_product_ =                     \
            (unsigned long long)(a) * (b);                           \
        (Plo) = (mp_digit)mp_mul_dxd_product_;                       \
        (Phi) = (mp_digit)(mp_mul_dxd_product_ >> MP_DIGIT_BIT);     \
    } while (0)
|
31 |
|
32 /* c = a * b */ |
|
33 static void |
|
34 v8_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
|
35 { |
|
36 #if !defined(MP_NO_MP_WORD) |
|
37 mp_digit d = 0; |
|
38 |
|
39 /* Inner product: Digits of a */ |
|
40 while (a_len--) { |
|
41 mp_word w = ((mp_word)b * *a++) + d; |
|
42 *c++ = ACCUM(w); |
|
43 d = CARRYOUT(w); |
|
44 } |
|
45 *c = d; |
|
46 #else |
|
47 mp_digit carry = 0; |
|
48 while (a_len--) { |
|
49 mp_digit a_i = *a++; |
|
50 mp_digit a0b0, a1b1; |
|
51 |
|
52 MP_MUL_DxD(a_i, b, a1b1, a0b0); |
|
53 |
|
54 a0b0 += carry; |
|
55 if (a0b0 < carry) |
|
56 ++a1b1; |
|
57 *c++ = a0b0; |
|
58 carry = a1b1; |
|
59 } |
|
60 *c = carry; |
|
61 #endif |
|
62 } |
|
63 |
|
64 /* c += a * b */ |
|
65 static void |
|
66 v8_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
|
67 { |
|
68 #if !defined(MP_NO_MP_WORD) |
|
69 mp_digit d = 0; |
|
70 |
|
71 /* Inner product: Digits of a */ |
|
72 while (a_len--) { |
|
73 mp_word w = ((mp_word)b * *a++) + *c + d; |
|
74 *c++ = ACCUM(w); |
|
75 d = CARRYOUT(w); |
|
76 } |
|
77 *c = d; |
|
78 #else |
|
79 mp_digit carry = 0; |
|
80 while (a_len--) { |
|
81 mp_digit a_i = *a++; |
|
82 mp_digit a0b0, a1b1; |
|
83 |
|
84 MP_MUL_DxD(a_i, b, a1b1, a0b0); |
|
85 |
|
86 a0b0 += carry; |
|
87 if (a0b0 < carry) |
|
88 ++a1b1; |
|
89 a0b0 += a_i = *c; |
|
90 if (a0b0 < a_i) |
|
91 ++a1b1; |
|
92 *c++ = a0b0; |
|
93 carry = a1b1; |
|
94 } |
|
95 *c = carry; |
|
96 #endif |
|
97 } |
|
98 |
|
99 /* Presently, this is only used by the Montgomery arithmetic code. */ |
|
100 /* c += a * b */ |
|
101 static void |
|
102 v8_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
|
103 { |
|
104 #if !defined(MP_NO_MP_WORD) |
|
105 mp_digit d = 0; |
|
106 |
|
107 /* Inner product: Digits of a */ |
|
108 while (a_len--) { |
|
109 mp_word w = ((mp_word)b * *a++) + *c + d; |
|
110 *c++ = ACCUM(w); |
|
111 d = CARRYOUT(w); |
|
112 } |
|
113 |
|
114 while (d) { |
|
115 mp_word w = (mp_word)*c + d; |
|
116 *c++ = ACCUM(w); |
|
117 d = CARRYOUT(w); |
|
118 } |
|
119 #else |
|
120 mp_digit carry = 0; |
|
121 while (a_len--) { |
|
122 mp_digit a_i = *a++; |
|
123 mp_digit a0b0, a1b1; |
|
124 |
|
125 MP_MUL_DxD(a_i, b, a1b1, a0b0); |
|
126 |
|
127 a0b0 += carry; |
|
128 if (a0b0 < carry) |
|
129 ++a1b1; |
|
130 |
|
131 a0b0 += a_i = *c; |
|
132 if (a0b0 < a_i) |
|
133 ++a1b1; |
|
134 |
|
135 *c++ = a0b0; |
|
136 carry = a1b1; |
|
137 } |
|
138 while (carry) { |
|
139 mp_digit c_i = *c; |
|
140 carry += c_i; |
|
141 *c++ = carry; |
|
142 carry = carry < c_i; |
|
143 } |
|
144 #endif |
|
145 } |
|
146 |
|
147 /* These functions run only on v8plus+vis or v9+vis CPUs. */ |
|
148 |
|
149 /* c = a * b */ |
|
150 void |
|
151 s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
|
152 { |
|
153 mp_digit d; |
|
154 mp_digit x[258]; |
|
155 if (a_len <= 256) { |
|
156 if (a == c || ((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { |
|
157 mp_digit * px; |
|
158 px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x; |
|
159 memcpy(px, a, a_len * sizeof(*a)); |
|
160 a = px; |
|
161 if (a_len & 1) { |
|
162 px[a_len] = 0; |
|
163 } |
|
164 } |
|
165 s_mp_setz(c, a_len + 1); |
|
166 d = mul_add_inp(c, a, a_len, b); |
|
167 c[a_len] = d; |
|
168 } else { |
|
169 v8_mpv_mul_d(a, a_len, b, c); |
|
170 } |
|
171 } |
|
172 |
|
173 /* c += a * b, where a is a_len words long. */ |
|
174 void |
|
175 s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
|
176 { |
|
177 mp_digit d; |
|
178 mp_digit x[258]; |
|
179 if (a_len <= 256) { |
|
180 if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { |
|
181 mp_digit * px; |
|
182 px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x; |
|
183 memcpy(px, a, a_len * sizeof(*a)); |
|
184 a = px; |
|
185 if (a_len & 1) { |
|
186 px[a_len] = 0; |
|
187 } |
|
188 } |
|
189 d = mul_add_inp(c, a, a_len, b); |
|
190 c[a_len] = d; |
|
191 } else { |
|
192 v8_mpv_mul_d_add(a, a_len, b, c); |
|
193 } |
|
194 } |
|
195 |
|
196 /* c += a * b, where a is y words long. */ |
|
197 void |
|
198 s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
|
199 { |
|
200 mp_digit d; |
|
201 mp_digit x[258]; |
|
202 if (a_len <= 256) { |
|
203 if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { |
|
204 mp_digit * px; |
|
205 px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x; |
|
206 memcpy(px, a, a_len * sizeof(*a)); |
|
207 a = px; |
|
208 if (a_len & 1) { |
|
209 px[a_len] = 0; |
|
210 } |
|
211 } |
|
212 d = mul_add_inp(c, a, a_len, b); |
|
213 if (d) { |
|
214 c += a_len; |
|
215 do { |
|
216 mp_digit sum = d + *c; |
|
217 *c++ = sum; |
|
218 d = sum < d; |
|
219 } while (d); |
|
220 } |
|
221 } else { |
|
222 v8_mpv_mul_d_add_prop(a, a_len, b, c); |
|
223 } |
|
224 } |