security/nss/lib/freebl/mpi/mpi_sparc.c

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /* This Source Code Form is subject to the terms of the Mozilla Public
     2  * License, v. 2.0. If a copy of the MPL was not distributed with this
     3  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     5 /* Multiplication performance enhancements for sparc v8+vis CPUs. */
     7 #include "mpi-priv.h"
     8 #include <stddef.h>
     9 #include <sys/systeminfo.h>
    10 #include <strings.h>
    12 /* In the functions below, */
    13 /* vector y must be 8-byte aligned, and n must be even */
    14 /* returns carry out of high order word of result */
    15 /* maximum n is 256 */
    17 /* vector x += vector y * scalar a; where y is of length n words. */
    18 extern mp_digit mul_add_inp(mp_digit *x, const mp_digit *y, int n, mp_digit a);
    20 /* vector z = vector x + vector y * scalar a; where y is of length n words. */
    21 extern mp_digit mul_add(mp_digit *z, const mp_digit *x, const mp_digit *y, 
    22 			int n, mp_digit a);
    24 /* v8 versions of these functions run on any Sparc v8 CPU. */
    26 /* This trick works on Sparc V8 CPUs with the Workshop compilers. */
    27 #define MP_MUL_DxD(a, b, Phi, Plo) \
    28   { unsigned long long product = (unsigned long long)a * b; \
    29     Plo = (mp_digit)product; \
    30     Phi = (mp_digit)(product >> MP_DIGIT_BIT); }
    32 /* c = a * b */
    33 static void 
    34 v8_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
    35 {
    36 #if !defined(MP_NO_MP_WORD)
    37   mp_digit   d = 0;
    39   /* Inner product:  Digits of a */
    40   while (a_len--) {
    41     mp_word w = ((mp_word)b * *a++) + d;
    42     *c++ = ACCUM(w);
    43     d = CARRYOUT(w);
    44   }
    45   *c = d;
    46 #else
    47   mp_digit carry = 0;
    48   while (a_len--) {
    49     mp_digit a_i = *a++;
    50     mp_digit a0b0, a1b1;
    52     MP_MUL_DxD(a_i, b, a1b1, a0b0);
    54     a0b0 += carry;
    55     if (a0b0 < carry)
    56       ++a1b1;
    57     *c++ = a0b0;
    58     carry = a1b1;
    59   }
    60   *c = carry;
    61 #endif
    62 }
    64 /* c += a * b */
    65 static void 
    66 v8_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
    67 {
    68 #if !defined(MP_NO_MP_WORD)
    69   mp_digit   d = 0;
    71   /* Inner product:  Digits of a */
    72   while (a_len--) {
    73     mp_word w = ((mp_word)b * *a++) + *c + d;
    74     *c++ = ACCUM(w);
    75     d = CARRYOUT(w);
    76   }
    77   *c = d;
    78 #else
    79   mp_digit carry = 0;
    80   while (a_len--) {
    81     mp_digit a_i = *a++;
    82     mp_digit a0b0, a1b1;
    84     MP_MUL_DxD(a_i, b, a1b1, a0b0);
    86     a0b0 += carry;
    87     if (a0b0 < carry)
    88       ++a1b1;
    89     a0b0 += a_i = *c;
    90     if (a0b0 < a_i)
    91       ++a1b1;
    92     *c++ = a0b0;
    93     carry = a1b1;
    94   }
    95   *c = carry;
    96 #endif
    97 }
    99 /* Presently, this is only used by the Montgomery arithmetic code. */
   100 /* c += a * b */
   101 static void 
   102 v8_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
   103 {
   104 #if !defined(MP_NO_MP_WORD)
   105   mp_digit   d = 0;
   107   /* Inner product:  Digits of a */
   108   while (a_len--) {
   109     mp_word w = ((mp_word)b * *a++) + *c + d;
   110     *c++ = ACCUM(w);
   111     d = CARRYOUT(w);
   112   }
   114   while (d) {
   115     mp_word w = (mp_word)*c + d;
   116     *c++ = ACCUM(w);
   117     d = CARRYOUT(w);
   118   }
   119 #else
   120   mp_digit carry = 0;
   121   while (a_len--) {
   122     mp_digit a_i = *a++;
   123     mp_digit a0b0, a1b1;
   125     MP_MUL_DxD(a_i, b, a1b1, a0b0);
   127     a0b0 += carry;
   128     if (a0b0 < carry)
   129       ++a1b1;
   131     a0b0 += a_i = *c;
   132     if (a0b0 < a_i)
   133       ++a1b1;
   135     *c++ = a0b0;
   136     carry = a1b1;
   137   }
   138   while (carry) {
   139     mp_digit c_i = *c;
   140     carry += c_i;
   141     *c++ = carry;
   142     carry = carry < c_i;
   143   }
   144 #endif
   145 }
   147 /* These functions run only on v8plus+vis or v9+vis CPUs. */
   149 /* c = a * b */
   150 void 
   151 s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
   152 {
   153     mp_digit d;
   154     mp_digit x[258];
   155     if (a_len <= 256) {
   156 	if (a == c || ((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) {
   157 	    mp_digit * px;
   158 	    px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x;
   159 	    memcpy(px, a, a_len * sizeof(*a));
   160 	    a = px;
   161 	    if (a_len & 1) {
   162 		px[a_len] = 0;
   163 	    }
   164 	}
   165 	s_mp_setz(c, a_len + 1);
   166 	d = mul_add_inp(c, a, a_len, b);
   167 	c[a_len] = d;
   168     } else {
   169 	v8_mpv_mul_d(a, a_len, b, c);
   170     }
   171 }
   173 /* c += a * b, where a is a_len words long. */
   174 void     
   175 s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
   176 {
   177     mp_digit d;
   178     mp_digit x[258];
   179     if (a_len <= 256) {
   180 	if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) {
   181 	    mp_digit * px;
   182 	    px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x;
   183 	    memcpy(px, a, a_len * sizeof(*a));
   184 	    a = px;
   185 	    if (a_len & 1) {
   186 		px[a_len] = 0;
   187 	    }
   188 	}
   189 	d = mul_add_inp(c, a, a_len, b);
   190 	c[a_len] = d;
   191     } else {
   192 	v8_mpv_mul_d_add(a, a_len, b, c);
   193     }
   194 }
   196 /* c += a * b, where a is y words long. */
   197 void     
   198 s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
   199 {
   200     mp_digit d;
   201     mp_digit x[258];
   202     if (a_len <= 256) {
   203 	if (((ptrdiff_t)a & 0x7) != 0 || (a_len & 1) != 0) {
   204 	    mp_digit * px;
   205 	    px = (((ptrdiff_t)x & 0x7) != 0) ? x + 1 : x;
   206 	    memcpy(px, a, a_len * sizeof(*a));
   207 	    a = px;
   208 	    if (a_len & 1) {
   209 		px[a_len] = 0;
   210 	    }
   211 	}
   212 	d = mul_add_inp(c, a, a_len, b);
   213 	if (d) {
   214 	    c += a_len;
   215 	    do {
   216 		mp_digit sum = d + *c;
   217 		*c++ = sum;
   218 		d = sum < d;
   219 	    } while (d);
   220 	}
   221     } else {
   222 	v8_mpv_mul_d_add_prop(a, a_len, b, c);
   223     }
   224 }

mercurial