Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 2 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 3 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 4 | |
michael@0 | 5 | /* Multiplication performance enhancements for sparc v8+vis CPUs. */ |
michael@0 | 6 | |
michael@0 | 7 | #include "mpi-priv.h" |
michael@0 | 8 | #include <stddef.h> |
michael@0 | 9 | #include <sys/systeminfo.h> |
michael@0 | 10 | #include <strings.h> |
michael@0 | 11 | |
michael@0 | 12 | /* In the functions below, */ |
michael@0 | 13 | /* vector y must be 8-byte aligned, and n must be even */ |
michael@0 | 14 | /* returns carry out of high order word of result */ |
michael@0 | 15 | /* maximum n is 256 */ |
michael@0 | 16 | |
michael@0 | 17 | /* vector x += vector y * scaler a; where y is of length n words. */ |
michael@0 | 18 | extern mp_digit mul_add_inp(mp_digit *x, const mp_digit *y, int n, mp_digit a); |
michael@0 | 19 | |
michael@0 | 20 | /* vector z = vector x + vector y * scaler a; where y is of length n words. */ |
michael@0 | 21 | extern mp_digit mul_add(mp_digit *z, const mp_digit *x, const mp_digit *y, |
michael@0 | 22 | int n, mp_digit a); |
michael@0 | 23 | |
michael@0 | 24 | /* v8 versions of these functions run on any Sparc v8 CPU. */ |
michael@0 | 25 | |
/*
 * Digit-by-digit multiply: forms a * b in an unsigned long long, then
 * stores the low digit in Plo and the bits above MP_DIGIT_BIT in Phi.
 * This trick works on Sparc V8 CPUs with the Workshop compilers.
 *
 * Every macro argument is parenthesized so that expression operands
 * (e.g. "x + 1") are multiplied as a unit, and the body is wrapped in
 * do { } while (0) so the macro behaves as one statement, including in
 * an unbraced if/else.  The temporary has a deliberately unusual name
 * to avoid capturing a caller variable passed as an argument.
 */
#define MP_MUL_DxD(a, b, Phi, Plo)                                      \
    do {                                                                \
        unsigned long long mp_mul_dxd_prod_ =                           \
            (unsigned long long)(a) * (b);                              \
        (Plo) = (mp_digit)mp_mul_dxd_prod_;                             \
        (Phi) = (mp_digit)(mp_mul_dxd_prod_ >> MP_DIGIT_BIT);           \
    } while (0)
michael@0 | 31 | |
michael@0 | 32 | /* c = a * b */ |
michael@0 | 33 | static void |
michael@0 | 34 | v8_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
michael@0 | 35 | { |
michael@0 | 36 | #if !defined(MP_NO_MP_WORD) |
michael@0 | 37 | mp_digit d = 0; |
michael@0 | 38 | |
michael@0 | 39 | /* Inner product: Digits of a */ |
michael@0 | 40 | while (a_len--) { |
michael@0 | 41 | mp_word w = ((mp_word)b * *a++) + d; |
michael@0 | 42 | *c++ = ACCUM(w); |
michael@0 | 43 | d = CARRYOUT(w); |
michael@0 | 44 | } |
michael@0 | 45 | *c = d; |
michael@0 | 46 | #else |
michael@0 | 47 | mp_digit carry = 0; |
michael@0 | 48 | while (a_len--) { |
michael@0 | 49 | mp_digit a_i = *a++; |
michael@0 | 50 | mp_digit a0b0, a1b1; |
michael@0 | 51 | |
michael@0 | 52 | MP_MUL_DxD(a_i, b, a1b1, a0b0); |
michael@0 | 53 | |
michael@0 | 54 | a0b0 += carry; |
michael@0 | 55 | if (a0b0 < carry) |
michael@0 | 56 | ++a1b1; |
michael@0 | 57 | *c++ = a0b0; |
michael@0 | 58 | carry = a1b1; |
michael@0 | 59 | } |
michael@0 | 60 | *c = carry; |
michael@0 | 61 | #endif |
michael@0 | 62 | } |
michael@0 | 63 | |
michael@0 | 64 | /* c += a * b */ |
michael@0 | 65 | static void |
michael@0 | 66 | v8_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
michael@0 | 67 | { |
michael@0 | 68 | #if !defined(MP_NO_MP_WORD) |
michael@0 | 69 | mp_digit d = 0; |
michael@0 | 70 | |
michael@0 | 71 | /* Inner product: Digits of a */ |
michael@0 | 72 | while (a_len--) { |
michael@0 | 73 | mp_word w = ((mp_word)b * *a++) + *c + d; |
michael@0 | 74 | *c++ = ACCUM(w); |
michael@0 | 75 | d = CARRYOUT(w); |
michael@0 | 76 | } |
michael@0 | 77 | *c = d; |
michael@0 | 78 | #else |
michael@0 | 79 | mp_digit carry = 0; |
michael@0 | 80 | while (a_len--) { |
michael@0 | 81 | mp_digit a_i = *a++; |
michael@0 | 82 | mp_digit a0b0, a1b1; |
michael@0 | 83 | |
michael@0 | 84 | MP_MUL_DxD(a_i, b, a1b1, a0b0); |
michael@0 | 85 | |
michael@0 | 86 | a0b0 += carry; |
michael@0 | 87 | if (a0b0 < carry) |
michael@0 | 88 | ++a1b1; |
michael@0 | 89 | a0b0 += a_i = *c; |
michael@0 | 90 | if (a0b0 < a_i) |
michael@0 | 91 | ++a1b1; |
michael@0 | 92 | *c++ = a0b0; |
michael@0 | 93 | carry = a1b1; |
michael@0 | 94 | } |
michael@0 | 95 | *c = carry; |
michael@0 | 96 | #endif |
michael@0 | 97 | } |
michael@0 | 98 | |
michael@0 | 99 | /* Presently, this is only used by the Montgomery arithmetic code. */ |
michael@0 | 100 | /* c += a * b */ |
michael@0 | 101 | static void |
michael@0 | 102 | v8_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
michael@0 | 103 | { |
michael@0 | 104 | #if !defined(MP_NO_MP_WORD) |
michael@0 | 105 | mp_digit d = 0; |
michael@0 | 106 | |
michael@0 | 107 | /* Inner product: Digits of a */ |
michael@0 | 108 | while (a_len--) { |
michael@0 | 109 | mp_word w = ((mp_word)b * *a++) + *c + d; |
michael@0 | 110 | *c++ = ACCUM(w); |
michael@0 | 111 | d = CARRYOUT(w); |
michael@0 | 112 | } |
michael@0 | 113 | |
michael@0 | 114 | while (d) { |
michael@0 | 115 | mp_word w = (mp_word)*c + d; |
michael@0 | 116 | *c++ = ACCUM(w); |
michael@0 | 117 | d = CARRYOUT(w); |
michael@0 | 118 | } |
michael@0 | 119 | #else |
michael@0 | 120 | mp_digit carry = 0; |
michael@0 | 121 | while (a_len--) { |
michael@0 | 122 | mp_digit a_i = *a++; |
michael@0 | 123 | mp_digit a0b0, a1b1; |
michael@0 | 124 | |
michael@0 | 125 | MP_MUL_DxD(a_i, b, a1b1, a0b0); |
michael@0 | 126 | |
michael@0 | 127 | a0b0 += carry; |
michael@0 | 128 | if (a0b0 < carry) |
michael@0 | 129 | ++a1b1; |
michael@0 | 130 | |
michael@0 | 131 | a0b0 += a_i = *c; |
michael@0 | 132 | if (a0b0 < a_i) |
michael@0 | 133 | ++a1b1; |
michael@0 | 134 | |
michael@0 | 135 | *c++ = a0b0; |
michael@0 | 136 | carry = a1b1; |
michael@0 | 137 | } |
michael@0 | 138 | while (carry) { |
michael@0 | 139 | mp_digit c_i = *c; |
michael@0 | 140 | carry += c_i; |
michael@0 | 141 | *c++ = carry; |
michael@0 | 142 | carry = carry < c_i; |
michael@0 | 143 | } |
michael@0 | 144 | #endif |
michael@0 | 145 | } |
michael@0 | 146 | |
michael@0 | 147 | /* These functions run only on v8plus+vis or v9+vis CPUs. */ |
michael@0 | 148 | |
michael@0 | 149 | /* c = a * b */ |
michael@0 | 150 | void |
michael@0 | 151 | s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
michael@0 | 152 | { |
michael@0 | 153 | mp_digit d; |
michael@0 | 154 | mp_digit x[258]; |
michael@0 | 155 | if (a_len <= 256) { |
michael@0 | 156 | if (a == c || ((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { |
michael@0 | 157 | mp_digit * px; |
michael@0 | 158 | px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x; |
michael@0 | 159 | memcpy(px, a, a_len * sizeof(*a)); |
michael@0 | 160 | a = px; |
michael@0 | 161 | if (a_len & 1) { |
michael@0 | 162 | px[a_len] = 0; |
michael@0 | 163 | } |
michael@0 | 164 | } |
michael@0 | 165 | s_mp_setz(c, a_len + 1); |
michael@0 | 166 | d = mul_add_inp(c, a, a_len, b); |
michael@0 | 167 | c[a_len] = d; |
michael@0 | 168 | } else { |
michael@0 | 169 | v8_mpv_mul_d(a, a_len, b, c); |
michael@0 | 170 | } |
michael@0 | 171 | } |
michael@0 | 172 | |
michael@0 | 173 | /* c += a * b, where a is a_len words long. */ |
michael@0 | 174 | void |
michael@0 | 175 | s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
michael@0 | 176 | { |
michael@0 | 177 | mp_digit d; |
michael@0 | 178 | mp_digit x[258]; |
michael@0 | 179 | if (a_len <= 256) { |
michael@0 | 180 | if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { |
michael@0 | 181 | mp_digit * px; |
michael@0 | 182 | px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x; |
michael@0 | 183 | memcpy(px, a, a_len * sizeof(*a)); |
michael@0 | 184 | a = px; |
michael@0 | 185 | if (a_len & 1) { |
michael@0 | 186 | px[a_len] = 0; |
michael@0 | 187 | } |
michael@0 | 188 | } |
michael@0 | 189 | d = mul_add_inp(c, a, a_len, b); |
michael@0 | 190 | c[a_len] = d; |
michael@0 | 191 | } else { |
michael@0 | 192 | v8_mpv_mul_d_add(a, a_len, b, c); |
michael@0 | 193 | } |
michael@0 | 194 | } |
michael@0 | 195 | |
michael@0 | 196 | /* c += a * b, where a is y words long. */ |
michael@0 | 197 | void |
michael@0 | 198 | s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c) |
michael@0 | 199 | { |
michael@0 | 200 | mp_digit d; |
michael@0 | 201 | mp_digit x[258]; |
michael@0 | 202 | if (a_len <= 256) { |
michael@0 | 203 | if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) { |
michael@0 | 204 | mp_digit * px; |
michael@0 | 205 | px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x; |
michael@0 | 206 | memcpy(px, a, a_len * sizeof(*a)); |
michael@0 | 207 | a = px; |
michael@0 | 208 | if (a_len & 1) { |
michael@0 | 209 | px[a_len] = 0; |
michael@0 | 210 | } |
michael@0 | 211 | } |
michael@0 | 212 | d = mul_add_inp(c, a, a_len, b); |
michael@0 | 213 | if (d) { |
michael@0 | 214 | c += a_len; |
michael@0 | 215 | do { |
michael@0 | 216 | mp_digit sum = d + *c; |
michael@0 | 217 | *c++ = sum; |
michael@0 | 218 | d = sum < d; |
michael@0 | 219 | } while (d); |
michael@0 | 220 | } |
michael@0 | 221 | } else { |
michael@0 | 222 | v8_mpv_mul_d_add_prop(a, a_len, b, c); |
michael@0 | 223 | } |
michael@0 | 224 | } |