Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 /* Multiplication performance enhancements for sparc v8+vis CPUs. */
7 #include "mpi-priv.h"
8 #include <stddef.h>
9 #include <sys/systeminfo.h>
10 #include <strings.h>
12 /* In the functions below, */
13 /* vector y must be 8-byte aligned, and n must be even */
14 /* returns carry out of high order word of result */
15 /* maximum n is 256 */
/* vector x += vector y * scalar a; where y is of length n words. */
18 extern mp_digit mul_add_inp(mp_digit *x, const mp_digit *y, int n, mp_digit a);
/* vector z = vector x + vector y * scalar a; where y is of length n words. */
21 extern mp_digit mul_add(mp_digit *z, const mp_digit *x, const mp_digit *y,
22 int n, mp_digit a);
24 /* v8 versions of these functions run on any Sparc v8 CPU. */
/*
 * MP_MUL_DxD(a, b, Phi, Plo)
 *
 * Multiply the digits a and b through an unsigned long long intermediate
 * and split the result: Plo receives the low digit, Phi receives the bits
 * above MP_DIGIT_BIT.
 *
 * This trick works on Sparc V8 CPUs with the Workshop compilers.
 *
 * Every argument is parenthesized so that expression arguments such as
 * `x + 1` multiply as a unit, and the body is wrapped in do { } while (0)
 * so the macro behaves as a single statement (safe in an unbraced
 * if/else).  The temporary has a macro-specific name so it cannot capture
 * a caller variable that happens to be called `product`.
 */
#define MP_MUL_DxD(a, b, Phi, Plo)                                   \
    do {                                                             \
        unsigned long long mp_mul_dxd_product_ =                     \
            (unsigned long long)(a) * (b);                           \
        (Plo) = (mp_digit)mp_mul_dxd_product_;                       \
        (Phi) = (mp_digit)(mp_mul_dxd_product_ >> MP_DIGIT_BIT);     \
    } while (0)
32 /* c = a * b */
33 static void
34 v8_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
35 {
36 #if !defined(MP_NO_MP_WORD)
37 mp_digit d = 0;
39 /* Inner product: Digits of a */
40 while (a_len--) {
41 mp_word w = ((mp_word)b * *a++) + d;
42 *c++ = ACCUM(w);
43 d = CARRYOUT(w);
44 }
45 *c = d;
46 #else
47 mp_digit carry = 0;
48 while (a_len--) {
49 mp_digit a_i = *a++;
50 mp_digit a0b0, a1b1;
52 MP_MUL_DxD(a_i, b, a1b1, a0b0);
54 a0b0 += carry;
55 if (a0b0 < carry)
56 ++a1b1;
57 *c++ = a0b0;
58 carry = a1b1;
59 }
60 *c = carry;
61 #endif
62 }
64 /* c += a * b */
65 static void
66 v8_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
67 {
68 #if !defined(MP_NO_MP_WORD)
69 mp_digit d = 0;
71 /* Inner product: Digits of a */
72 while (a_len--) {
73 mp_word w = ((mp_word)b * *a++) + *c + d;
74 *c++ = ACCUM(w);
75 d = CARRYOUT(w);
76 }
77 *c = d;
78 #else
79 mp_digit carry = 0;
80 while (a_len--) {
81 mp_digit a_i = *a++;
82 mp_digit a0b0, a1b1;
84 MP_MUL_DxD(a_i, b, a1b1, a0b0);
86 a0b0 += carry;
87 if (a0b0 < carry)
88 ++a1b1;
89 a0b0 += a_i = *c;
90 if (a0b0 < a_i)
91 ++a1b1;
92 *c++ = a0b0;
93 carry = a1b1;
94 }
95 *c = carry;
96 #endif
97 }
99 /* Presently, this is only used by the Montgomery arithmetic code. */
100 /* c += a * b */
101 static void
102 v8_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
103 {
104 #if !defined(MP_NO_MP_WORD)
105 mp_digit d = 0;
107 /* Inner product: Digits of a */
108 while (a_len--) {
109 mp_word w = ((mp_word)b * *a++) + *c + d;
110 *c++ = ACCUM(w);
111 d = CARRYOUT(w);
112 }
114 while (d) {
115 mp_word w = (mp_word)*c + d;
116 *c++ = ACCUM(w);
117 d = CARRYOUT(w);
118 }
119 #else
120 mp_digit carry = 0;
121 while (a_len--) {
122 mp_digit a_i = *a++;
123 mp_digit a0b0, a1b1;
125 MP_MUL_DxD(a_i, b, a1b1, a0b0);
127 a0b0 += carry;
128 if (a0b0 < carry)
129 ++a1b1;
131 a0b0 += a_i = *c;
132 if (a0b0 < a_i)
133 ++a1b1;
135 *c++ = a0b0;
136 carry = a1b1;
137 }
138 while (carry) {
139 mp_digit c_i = *c;
140 carry += c_i;
141 *c++ = carry;
142 carry = carry < c_i;
143 }
144 #endif
145 }
147 /* These functions run only on v8plus+vis or v9+vis CPUs. */
149 /* c = a * b */
150 void
151 s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
152 {
153 mp_digit d;
154 mp_digit x[258];
155 if (a_len <= 256) {
156 if (a == c || ((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) {
157 mp_digit * px;
158 px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x;
159 memcpy(px, a, a_len * sizeof(*a));
160 a = px;
161 if (a_len & 1) {
162 px[a_len] = 0;
163 }
164 }
165 s_mp_setz(c, a_len + 1);
166 d = mul_add_inp(c, a, a_len, b);
167 c[a_len] = d;
168 } else {
169 v8_mpv_mul_d(a, a_len, b, c);
170 }
171 }
173 /* c += a * b, where a is a_len words long. */
174 void
175 s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
176 {
177 mp_digit d;
178 mp_digit x[258];
179 if (a_len <= 256) {
180 if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) {
181 mp_digit * px;
182 px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x;
183 memcpy(px, a, a_len * sizeof(*a));
184 a = px;
185 if (a_len & 1) {
186 px[a_len] = 0;
187 }
188 }
189 d = mul_add_inp(c, a, a_len, b);
190 c[a_len] = d;
191 } else {
192 v8_mpv_mul_d_add(a, a_len, b, c);
193 }
194 }
196 /* c += a * b, where a is y words long. */
197 void
198 s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
199 {
200 mp_digit d;
201 mp_digit x[258];
202 if (a_len <= 256) {
203 if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) {
204 mp_digit * px;
205 px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x;
206 memcpy(px, a, a_len * sizeof(*a));
207 a = px;
208 if (a_len & 1) {
209 px[a_len] = 0;
210 }
211 }
212 d = mul_add_inp(c, a, a_len, b);
213 if (d) {
214 c += a_len;
215 do {
216 mp_digit sum = d + *c;
217 *c++ = sum;
218 d = sum < d;
219 } while (d);
220 }
221 } else {
222 v8_mpv_mul_d_add_prop(a, a_len, b, c);
223 }
224 }