Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 | 4 | |
michael@0 | 5 | #include "mpi-priv.h" |
michael@0 | 6 | #include <c_asm.h> |
michael@0 | 7 | |
michael@0 | 8 | |
/*
 * MP_MUL_DxD(a, b, Phi, Plo): multiply digit a by digit b and store the
 * double-width product, high half in Phi and low half in Plo.
 *
 * The low half is produced by the Alpha "mulq" instruction and the high
 * half by "umulh", both issued through the asm() intrinsic declared in
 * <c_asm.h>.  Phi and Plo must be assignable.  a and b are each expanded
 * twice, so they must not have side effects.
 *
 * Wrapped in do { } while (0) so the macro behaves as one statement; the
 * invocation supplies the terminating semicolon.
 */
#define MP_MUL_DxD(a, b, Phi, Plo)                       \
    do {                                                 \
        (Plo) = asm("mulq %a0, %a1, %v0", (a), (b));     \
        (Phi) = asm("umulh %a0, %a1, %v0", (a), (b));    \
    } while (0)

/*
 * CARRY_ADD is spliced into ONE_MUL between the carry-in addition and the
 * store to *c.  This is empty for the loop in s_mpv_mul_d; it is redefined
 * further down in this file for the accumulating (c += a * b) routines.
 * Macros are expanded where they are used, so ONE_MUL picks up whichever
 * CARRY_ADD definition is current at the point of use.
 */
#define CARRY_ADD

/*
 * ONE_MUL: process one digit of the multiplicand.
 *
 * Expects these names in the enclosing scope:
 *   a, c          digit pointers; each is advanced by one
 *   b             the single-digit multiplier
 *   carry         carry in from the previous digit, carry out afterwards
 *   a_i, a0b0, a1b1   mp_digit scratch variables
 *
 * Computes a_i * b, adds carry to the low half (a wrap of the low half
 * increments the high half), runs CARRY_ADD, stores the low half through
 * c and leaves the high half in carry.
 *
 * This is a bare statement sequence, not a single statement: only use it
 * inside braces.
 */
#define ONE_MUL                          \
    a_i = *a++;                          \
    MP_MUL_DxD(a_i, b, a1b1, a0b0);      \
    a0b0 += carry;                       \
    if (a0b0 < carry)                    \
        ++a1b1;                          \
    CARRY_ADD                            \
    *c++ = a0b0;                         \
    carry = a1b1;

/* Straight-line repetitions of ONE_MUL: 4, 16, 32 and 128 digits. */
#define FOUR_MUL \
    ONE_MUL      \
    ONE_MUL      \
    ONE_MUL      \
    ONE_MUL

#define SIXTEEN_MUL \
    FOUR_MUL        \
    FOUR_MUL        \
    FOUR_MUL        \
    FOUR_MUL

#define THIRTYTWO_MUL \
    SIXTEEN_MUL       \
    SIXTEEN_MUL

#define ONETWENTYEIGHT_MUL \
    THIRTYTWO_MUL          \
    THIRTYTWO_MUL          \
    THIRTYTWO_MUL          \
    THIRTYTWO_MUL

/*
 * EXPAND_256(CALL): unrolled body shared by the vector-multiply routines.
 *
 * Expects a, a_len, b and c in the enclosing scope and DECLARES carry, a_i,
 * a0b0 and a1b1 there, so it must appear where declarations are allowed.
 *
 * The low eight bits of a_len select which straight-line runs of ONE_MUL
 * execute (1, 2, 4, 8, 16, 32, 64, 128 digits), which together process
 * a_len % 256 digits.  a_len is then rounded down to a multiple of 256 and,
 * if anything remains, CALL(a, a_len, b, c, carry) handles it and returns
 * the new carry.  CALL receives c by value, so c is advanced here
 * afterwards.
 *
 * On exit carry holds the final carry and c points just past the last
 * digit written.
 */
#define EXPAND_256(CALL)                         \
    mp_digit carry = 0;                          \
    mp_digit a_i;                                \
    mp_digit a0b0, a1b1;                         \
    if (a_len & 255) {                           \
        if (a_len & 1) {                         \
            ONE_MUL                              \
        }                                        \
        if (a_len & 2) {                         \
            ONE_MUL                              \
            ONE_MUL                              \
        }                                        \
        if (a_len & 4) {                         \
            FOUR_MUL                             \
        }                                        \
        if (a_len & 8) {                         \
            FOUR_MUL                             \
            FOUR_MUL                             \
        }                                        \
        if (a_len & 16) {                        \
            SIXTEEN_MUL                          \
        }                                        \
        if (a_len & 32) {                        \
            THIRTYTWO_MUL                        \
        }                                        \
        if (a_len & 64) {                        \
            THIRTYTWO_MUL                        \
            THIRTYTWO_MUL                        \
        }                                        \
        if (a_len & 128) {                       \
            ONETWENTYEIGHT_MUL                   \
        }                                        \
        a_len = a_len & (-256);                  \
    }                                            \
    if (a_len >= 256) {                          \
        carry = CALL(a, a_len, b, c, carry);     \
        c += a_len;                              \
    }

/*
 * FUNC_NAME(NAME): the signature shared by the 256-digit block helpers.
 *
 * Expands to the function declarator only -- no trailing semicolon and no
 * body -- so the same text serves for a forward declaration
 * (FUNC_NAME(x);) and for the definition emitted by DECLARE_MUL_256.
 */
#define FUNC_NAME(NAME)                               \
    mp_digit NAME(const mp_digit *a,                  \
                  mp_size a_len,                      \
                  mp_digit b, mp_digit *c,            \
                  mp_digit carry)

/*
 * DECLARE_MUL_256(FNAME): define a helper named FNAME, with the FUNC_NAME
 * signature, that processes its input in unrolled blocks of 256 digits
 * (two ONETWENTYEIGHT_MUL runs per iteration) and returns the final carry.
 *
 * The per-digit work is whatever ONE_MUL / CARRY_ADD expand to at the
 * point where this macro is instantiated.
 *
 * Callers pass a_len as a multiple of 256.  The loop is guarded with >=
 * rather than a non-zero test so that any other length stops cleanly
 * instead of wrapping a_len and running past the end of the arrays; a
 * remainder shorter than 256 digits is left unprocessed.
 */
#define DECLARE_MUL_256(FNAME)           \
    FUNC_NAME(FNAME)                     \
    {                                    \
        mp_digit a_i;                    \
        mp_digit a0b0, a1b1;             \
        while (a_len >= 256) {           \
            ONETWENTYEIGHT_MUL           \
            ONETWENTYEIGHT_MUL           \
            a_len -= 256;                \
        }                                \
        return carry;                    \
    }

/* Expanding the loop in s_mpv_mul_d appeared to slow down the
   (admittedly) small number of tests (i.e., timetest) used to
   measure performance, so this define disables that optimization.
   Only s_mpv_mul_d tests this symbol; the accumulating routines
   always use the expanded form. */
#define DO_NOT_EXPAND 1

/* Need forward declaration so it can be instantiated after
   the routine that uses it; this helps locality somewhat */
#if !defined(DO_NOT_EXPAND)
FUNC_NAME(s_mpv_mul_d_MUL256);
#endif

michael@0 | 118 | /* c = a * b */ |
michael@0 | 119 | void s_mpv_mul_d(const mp_digit *a, mp_size a_len, |
michael@0 | 120 | mp_digit b, mp_digit *c) |
michael@0 | 121 | { |
michael@0 | 122 | #if defined(DO_NOT_EXPAND) |
michael@0 | 123 | mp_digit carry = 0; |
michael@0 | 124 | while (a_len--) { |
michael@0 | 125 | mp_digit a_i = *a++; |
michael@0 | 126 | mp_digit a0b0, a1b1; |
michael@0 | 127 | |
michael@0 | 128 | MP_MUL_DxD(a_i, b, a1b1, a0b0); |
michael@0 | 129 | |
michael@0 | 130 | a0b0 += carry; |
michael@0 | 131 | if (a0b0 < carry) |
michael@0 | 132 | ++a1b1; |
michael@0 | 133 | *c++ = a0b0; |
michael@0 | 134 | carry = a1b1; |
michael@0 | 135 | } |
michael@0 | 136 | #else |
michael@0 | 137 | EXPAND_256(s_mpv_mul_d_MUL256) |
michael@0 | 138 | #endif |
michael@0 | 139 | *c = carry; |
michael@0 | 140 | } |
michael@0 | 141 | |
michael@0 | 142 | #if !defined(DO_NOT_EXPAND) |
michael@0 | 143 | DECLARE_MUL_256(s_mpv_mul_d_MUL256) |
michael@0 | 144 | #endif |
michael@0 | 145 | |
#undef CARRY_ADD
/*
 * This is redefined for the loop in s_mpv_mul_d_add.
 *
 * Adds the digit currently stored at *c into the low half of the product
 * (a0b0); if that addition wraps, the high half (a1b1) is incremented.
 * a_i is reused as scratch for the loaded digit, which is safe inside
 * ONE_MUL because the multiply has already consumed it.
 */
#define CARRY_ADD          \
    a_i = *c;              \
    a0b0 += a_i;           \
    if (a0b0 < a_i)        \
        ++a1b1;

/* Need forward declaration so it can be instantiated between the
   two routines that use it; this helps locality somewhat */
FUNC_NAME(s_mpv_mul_d_add_MUL256);

michael@0 | 157 | /* c += a * b */ |
michael@0 | 158 | void s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, |
michael@0 | 159 | mp_digit b, mp_digit *c) |
michael@0 | 160 | { |
michael@0 | 161 | EXPAND_256(s_mpv_mul_d_add_MUL256) |
michael@0 | 162 | *c = carry; |
michael@0 | 163 | } |
michael@0 | 164 | |
michael@0 | 165 | /* Instantiate multiply 256 routine here */ |
michael@0 | 166 | DECLARE_MUL_256(s_mpv_mul_d_add_MUL256) |
michael@0 | 167 | |
michael@0 | 168 | /* Presently, this is only used by the Montgomery arithmetic code. */ |
michael@0 | 169 | /* c += a * b */ |
michael@0 | 170 | void s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, |
michael@0 | 171 | mp_digit b, mp_digit *c) |
michael@0 | 172 | { |
michael@0 | 173 | EXPAND_256(s_mpv_mul_d_add_MUL256) |
michael@0 | 174 | while (carry) { |
michael@0 | 175 | mp_digit c_i = *c; |
michael@0 | 176 | carry += c_i; |
michael@0 | 177 | *c++ = carry; |
michael@0 | 178 | carry = carry < c_i; |
michael@0 | 179 | } |
michael@0 | 180 | } |
michael@0 | 181 |