|
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
4 |
|
5 #include "mpi-priv.h" |
|
6 #include <c_asm.h> |
|
7 |
|
8 |
|
/*
 * MP_MUL_DxD(a, b, Phi, Plo): multiply two digits into a double-width
 * product using the asm() intrinsic from <c_asm.h>.
 *
 *   Plo  receives the result of the `mulq` instruction  (low digit)
 *   Phi  receives the result of the `umulh` instruction (high digit)
 *
 * a and b each appear in both asm() invocations, so they are evaluated
 * twice: pass only side-effect-free expressions (never `*p++`).
 *
 * The body is wrapped in do { } while (0) so that the macro followed by a
 * semicolon forms exactly one statement wherever it is used.
 */
#define MP_MUL_DxD(a, b, Phi, Plo)                    \
    do {                                              \
        Plo = asm("mulq %a0, %a1, %v0", a, b);        \
        Phi = asm("umulh %a0, %a1, %v0", a, b);       \
    } while (0)
|
12 |
|
/* Hook expanded inside ONE_MUL between the carry addition and the store
   to *c.  It is deliberately empty for the plain multiply loop used by
   s_mpv_mul_d (c = a * b). */
#define CARRY_ADD /* nothing */
|
15 |
|
/*
 * ONE_MUL: process a single digit of the input vector.
 *
 * Expects these names in the enclosing scope: a, b, c, carry, a_i, a0b0,
 * a1b1.  It reads the next digit through a, multiplies it by b, adds the
 * incoming carry to the low half (bumping the high half when that addition
 * wraps), expands the CARRY_ADD hook, stores the low half through c and
 * leaves the high half in carry.  Both a and c are advanced by one digit.
 *
 * The expansion is a bare statement sequence, so it is used WITHOUT a
 * trailing semicolon.
 */
#define ONE_MUL                          \
    a_i = *a++;                          \
    MP_MUL_DxD(a_i, b, a1b1, a0b0);      \
    a0b0 += carry;                       \
    a1b1 += (a0b0 < carry);              \
    CARRY_ADD                            \
    *c++ = a0b0;                         \
    carry = a1b1;
|
25 |
|
/* Unrolling ladder built on ONE_MUL.  Each macro is a straight-line
   repetition of the one below it; the name gives the number of digits
   processed by one expansion. */

/* 4 x ONE_MUL */
#define FOUR_MUL ONE_MUL ONE_MUL ONE_MUL ONE_MUL

/* 4 x FOUR_MUL = 16 digits */
#define SIXTEEN_MUL FOUR_MUL FOUR_MUL FOUR_MUL FOUR_MUL

/* 2 x SIXTEEN_MUL = 32 digits */
#define THIRTYTWO_MUL SIXTEEN_MUL SIXTEEN_MUL

/* 4 x THIRTYTWO_MUL = 128 digits */
#define ONETWENTYEIGHT_MUL \
    THIRTYTWO_MUL THIRTYTWO_MUL THIRTYTWO_MUL THIRTYTWO_MUL
|
47 |
|
48 |
|
/*
 * EXPAND_256(CALL): body of a digit-vector multiply routine.
 *
 * Expects a, a_len, b and c to be in scope (the routine's parameters) and
 * itself declares carry, a_i, a0b0 and a1b1.  Because the expansion starts
 * with declarations it has to be placed where declarations are allowed.
 *
 * The low eight bits of a_len are tested one at a time; each set bit runs
 * an unrolled sequence of that many digits (1, 2, 4, ... 128), after which
 * those low bits are cleared from a_len.  Whatever is left is a multiple
 * of 256 and is handed to CALL(a, a_len, b, c, carry), whose return value
 * becomes the new carry; c is then stepped past the digits CALL handled.
 *
 * After the expansion, carry holds the outgoing carry and c points just
 * past the last digit written.  a_len has been modified.
 */
#define EXPAND_256(CALL)                            \
    mp_digit carry = 0;                             \
    mp_digit a_i;                                   \
    mp_digit a0b0, a1b1;                            \
    if (a_len & 0xFF) {                             \
        if (a_len & 0x01) {                         \
            ONE_MUL                                 \
        }                                           \
        if (a_len & 0x02) {                         \
            ONE_MUL                                 \
            ONE_MUL                                 \
        }                                           \
        if (a_len & 0x04) {                         \
            FOUR_MUL                                \
        }                                           \
        if (a_len & 0x08) {                         \
            FOUR_MUL                                \
            FOUR_MUL                                \
        }                                           \
        if (a_len & 0x10) {                         \
            SIXTEEN_MUL                             \
        }                                           \
        if (a_len & 0x20) {                         \
            THIRTYTWO_MUL                           \
        }                                           \
        if (a_len & 0x40) {                         \
            THIRTYTWO_MUL                           \
            THIRTYTWO_MUL                           \
        }                                           \
        if (a_len & 0x80) {                         \
            ONETWENTYEIGHT_MUL                      \
        }                                           \
        a_len &= (-256);                            \
    }                                               \
    if (a_len >= 256) {                             \
        carry = CALL(a, a_len, b, c, carry);        \
        c += a_len;                                 \
    }
|
87 |
|
/* FUNC_NAME(NAME): declarator shared by the 256-digit worker routines.
   Follow it with ';' for a prototype or with a braced body for a
   definition.  The worker takes the input vector, its length, the
   multiplier digit, the output vector and the incoming carry, and returns
   an mp_digit. */
#define FUNC_NAME(NAME)                                   \
    mp_digit NAME(const mp_digit *a, mp_size a_len,       \
                  mp_digit b, mp_digit *c, mp_digit carry)
|
93 |
|
/*
 * DECLARE_MUL_256(FNAME): define the worker routine FNAME with the
 * FUNC_NAME signature.
 *
 * Each loop pass runs two ONETWENTYEIGHT_MUL expansions (256 digits) and
 * then subtracts 256 from a_len; the loop ends only when a_len reaches
 * exactly zero, so a_len must be a multiple of 256 on entry (EXPAND_256
 * masks off the low eight bits before calling).  The final carry is
 * returned.  The per-digit work depends on the CARRY_ADD definition in
 * effect where this macro is expanded.
 */
#define DECLARE_MUL_256(FNAME)                    \
    FUNC_NAME(FNAME)                              \
    {                                             \
        mp_digit a_i;                             \
        mp_digit a0b0, a1b1;                      \
        for (; a_len != 0; a_len -= 256) {        \
            ONETWENTYEIGHT_MUL                    \
            ONETWENTYEIGHT_MUL                    \
        }                                         \
        return carry;                             \
    }
|
106 |
|
/* Unrolling the loop in s_mpv_mul_d appeared to slow down the (admittedly
   small) set of tests, i.e. timetest, used to measure performance.
   Defining DO_NOT_EXPAND therefore turns that optimization off and makes
   s_mpv_mul_d use a plain digit-at-a-time loop. */
#define DO_NOT_EXPAND 1

/* The worker is instantiated after the routine that calls it (this helps
   locality somewhat), so it needs a prototype up front.  Only required
   when the unrolled variant of s_mpv_mul_d is compiled in. */
#ifndef DO_NOT_EXPAND
FUNC_NAME(s_mpv_mul_d_MUL256);
#endif
|
117 |
|
118 /* c = a * b */ |
|
119 void s_mpv_mul_d(const mp_digit *a, mp_size a_len, |
|
120 mp_digit b, mp_digit *c) |
|
121 { |
|
122 #if defined(DO_NOT_EXPAND) |
|
123 mp_digit carry = 0; |
|
124 while (a_len--) { |
|
125 mp_digit a_i = *a++; |
|
126 mp_digit a0b0, a1b1; |
|
127 |
|
128 MP_MUL_DxD(a_i, b, a1b1, a0b0); |
|
129 |
|
130 a0b0 += carry; |
|
131 if (a0b0 < carry) |
|
132 ++a1b1; |
|
133 *c++ = a0b0; |
|
134 carry = a1b1; |
|
135 } |
|
136 #else |
|
137 EXPAND_256(s_mpv_mul_d_MUL256) |
|
138 #endif |
|
139 *c = carry; |
|
140 } |
|
141 |
|
142 #if !defined(DO_NOT_EXPAND) |
|
143 DECLARE_MUL_256(s_mpv_mul_d_MUL256) |
|
144 #endif |
|
145 |
|
#undef CARRY_ADD
/* Redefinition for the multiply-accumulate loops (s_mpv_mul_d_add and
   friends): add the digit currently stored at *c into the low half of the
   product, bumping the high half when that addition wraps.  a_i is reused
   as scratch for the loaded digit. */
#define CARRY_ADD                \
    a_i = *c;                    \
    a0b0 += a_i;                 \
    a1b1 += (a0b0 < a_i);
|
152 |
|
/* Prototype for the multiply-accumulate worker.  Its definition is placed
   between the two routines that call it (this helps locality somewhat),
   so the first caller needs this forward declaration. */
FUNC_NAME(s_mpv_mul_d_add_MUL256);
|
156 |
|
157 /* c += a * b */ |
|
158 void s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, |
|
159 mp_digit b, mp_digit *c) |
|
160 { |
|
161 EXPAND_256(s_mpv_mul_d_add_MUL256) |
|
162 *c = carry; |
|
163 } |
|
164 |
|
165 /* Instantiate multiply 256 routine here */ |
|
166 DECLARE_MUL_256(s_mpv_mul_d_add_MUL256) |
|
167 |
|
168 /* Presently, this is only used by the Montgomery arithmetic code. */ |
|
169 /* c += a * b */ |
|
170 void s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, |
|
171 mp_digit b, mp_digit *c) |
|
172 { |
|
173 EXPAND_256(s_mpv_mul_d_add_MUL256) |
|
174 while (carry) { |
|
175 mp_digit c_i = *c; |
|
176 carry += c_i; |
|
177 *c++ = carry; |
|
178 carry = carry < c_i; |
|
179 } |
|
180 } |
|
181 |