Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 #include "vis_proto.h"
7 /***************************************************************/
/*
 * Scalar type aliases used by the routines in this file.  The 64-bit
 * integer aliases are spelled with `long` when __sparcv9 is defined and
 * with `long long` otherwise.
 */
typedef signed int t_s32;
typedef unsigned t_u32;

#ifndef __sparcv9
typedef long long int t_s64;
typedef unsigned long long int t_u64;
#else
typedef long int t_s64;
typedef unsigned long int t_u64;
#endif

typedef double t_d64;
20 /***************************************************************/
22 typedef union {
23 t_d64 d64;
24 struct {
25 t_s32 i0;
26 t_s32 i1;
27 } i32s;
28 } d64_2_i32;
30 /***************************************************************/
/*
 * BUFF_SIZE: entry count of the scratch array that mul_add declares for its
 * generic (looped) paths; the large-multiplier path uses twice this.
 */
#define BUFF_SIZE  256

/*
 * A_BITS: multipliers below 2^A_BITS are handled with a single product per
 * digit; larger ones are split at this bit position.
 * A_MASK: selects the low A_BITS bits of the multiplier.
 */
#define A_BITS     19
#define A_MASK     ((1 << (A_BITS)) - 1)
37 /***************************************************************/
39 static t_u64 mask_cnst[] = {
40 0x8000000080000000ull
41 };
43 /***************************************************************/
/*
 * Declares the locals that the MUL_* and ADD_* macros operate on.  It must
 * be expanded where `y` (digit pointer) and `a` (multiplier) are in scope.
 *
 *   py    y viewed as an array of t_d64, i.e. two 32-bit digits per element
 *   mask  mask_cnst loaded as a t_d64
 *   ca    2^31 - 1 as a double
 *   da    the multiplier a as a double (callers may overwrite it)
 *   buff  scratch array of N 64-bit integers holding the products
 *   s     64-bit temporary for the running sum
 *   dy    overlay used to read back the two halves of a t_d64
 *
 * The expansion ends without a semicolon; the caller supplies it.
 */
#define DEF_VARS(N)                                 \
  t_d64 *py = (t_d64 *)y;                           \
  t_d64 mask = *(const t_d64 *)mask_cnst;           \
  t_d64 ca = (t_d64)((1u << 31) - 1);               \
  t_d64 da = (t_d64)a;                              \
  t_s64 buff[N];                                    \
  t_s64 s;                                          \
  d64_2_i32 dy
53 /***************************************************************/
/*
 * Multiplies the two 32-bit digits packed in py[i] by da and stores the
 * products, converted to 64-bit integers, in buff[2*i] and buff[2*i + 1].
 * Uses the locals declared by DEF_VARS.
 *
 * vis_fxnor is declared outside this file.  Assuming it is a bitwise XNOR
 * of its 64-bit operands, XNOR with the 0x80000000 mask turns an unsigned
 * 32-bit word w into the signed value (2^31 - 1) - w, so `ca - half`
 * yields w as a non-negative double.
 * NOTE(review): which digit of y ends up in i0 depends on byte order --
 * confirm for the target.
 */
#define MUL_U32_S64_2(i)                                        \
  dy.d64 = vis_fxnor(mask, py[(i)]);                            \
  buff[2 * (i)]     = (t_s64)((ca - (t_d64)dy.i32s.i0) * da);   \
  buff[2 * (i) + 1] = (t_s64)((ca - (t_d64)dy.i32s.i1) * da)
/*
 * Two-part variant of MUL_U32_S64_2 for a multiplier that has been split
 * into da and db.  Each of the two digits packed in py[i] is converted to a
 * double (d0, d1) and multiplied by both parts, filling four consecutive
 * scratch slots:
 *
 *   buff[4*i]     = d0 * da      buff[4*i + 1] = d0 * db
 *   buff[4*i + 2] = d1 * da      buff[4*i + 3] = d1 * db
 *
 * Needs d0, d1 and db in scope in addition to the DEF_VARS locals.
 * vis_fxnor is declared outside this file; the `ca - half` step assumes it
 * is a bitwise XNOR, which maps an unsigned word w to (2^31 - 1) - w.
 */
#define MUL_U32_S64_2_D(i)                  \
  dy.d64 = vis_fxnor(mask, py[(i)]);        \
  d0 = ca - (t_d64)dy.i32s.i0;              \
  buff[4 * (i)]     = (t_s64)(d0 * da);     \
  buff[4 * (i) + 1] = (t_s64)(d0 * db);     \
  d1 = ca - (t_d64)dy.i32s.i1;              \
  buff[4 * (i) + 2] = (t_s64)(d1 * da);     \
  buff[4 * (i) + 3] = (t_s64)(d1 * db)
69 /***************************************************************/
/*
 * One accumulate step for digit i: adds x[i] and the running carry c to the
 * product in buff[i], stores the sum in z[i] (truncated to z's element type
 * by the assignment) and keeps the sum shifted right by 32 as the next
 * carry.  Expects buff, s, x, z and c in scope.
 */
#define ADD_S64_U32(i)              \
  s = buff[(i)] + x[(i)] + c;       \
  z[(i)] = s;                       \
  c = s >> 32
/*
 * Accumulate step for digit i when the multiplier was split in two.  The
 * partial product in buff[2*i] is added to the one in buff[2*i + 1] shifted
 * left by A_BITS, then x[i] and the running carry uc are added.  The sum is
 * stored in z[i] (truncated by the assignment) and its bits above bit 31,
 * taken with an unsigned shift, become the next carry.
 * Expects buff, s, x, z and uc in scope.
 */
#define ADD_S64_U32_D(i)                                                    \
  s = buff[2 * (i)] + ((t_s64)buff[2 * (i) + 1] << A_BITS) + x[(i)] + uc;   \
  z[(i)] = s;                                                               \
  uc = (t_u64)s >> 32
81 /***************************************************************/
/*
 * Unrolled product step: expands MUL_U32_S64_2 for the four consecutive
 * indices i .. i+3.  Each expansion handles the two digits packed in one
 * py element, so eight digits are multiplied in total.
 */
#define MUL_U32_S64_8(i)          \
  MUL_U32_S64_2((i));             \
  MUL_U32_S64_2((i) + 1);         \
  MUL_U32_S64_2((i) + 2);         \
  MUL_U32_S64_2((i) + 3)
/*
 * Unrolled product step for a split multiplier: expands MUL_U32_S64_2_D for
 * the four consecutive indices i .. i+3, i.e. eight digits, each multiplied
 * by both multiplier parts.
 */
#define MUL_U32_S64_D_8(i)          \
  MUL_U32_S64_2_D((i));             \
  MUL_U32_S64_2_D((i) + 1);         \
  MUL_U32_S64_2_D((i) + 2);         \
  MUL_U32_S64_2_D((i) + 3)
95 /***************************************************************/
/*
 * Unrolled accumulate step: expands ADD_S64_U32 for the eight consecutive
 * digits i .. i+7, in ascending order so the carry flows from each digit
 * to the next.
 */
#define ADD_S64_U32_8(i)          \
  ADD_S64_U32((i));               \
  ADD_S64_U32((i) + 1);           \
  ADD_S64_U32((i) + 2);           \
  ADD_S64_U32((i) + 3);           \
  ADD_S64_U32((i) + 4);           \
  ADD_S64_U32((i) + 5);           \
  ADD_S64_U32((i) + 6);           \
  ADD_S64_U32((i) + 7)
/*
 * Unrolled accumulate step for a split multiplier: expands ADD_S64_U32_D
 * for the eight consecutive digits i .. i+7, in ascending order so the
 * carry flows from each digit to the next.
 */
#define ADD_S64_U32_D_8(i)          \
  ADD_S64_U32_D((i));               \
  ADD_S64_U32_D((i) + 1);           \
  ADD_S64_U32_D((i) + 2);           \
  ADD_S64_U32_D((i) + 3);           \
  ADD_S64_U32_D((i) + 4);           \
  ADD_S64_U32_D((i) + 5);           \
  ADD_S64_U32_D((i) + 6);           \
  ADD_S64_U32_D((i) + 7)
117 /***************************************************************/
119 t_u32 mul_add(t_u32 *z, t_u32 *x, t_u32 *y, int n, t_u32 a)
120 {
121 if (a < (1 << A_BITS)) {
123 if (n == 8) {
124 DEF_VARS(8);
125 t_s32 c = 0;
127 MUL_U32_S64_8(0);
128 ADD_S64_U32_8(0);
130 return c;
132 } else if (n == 16) {
133 DEF_VARS(16);
134 t_s32 c = 0;
136 MUL_U32_S64_8(0);
137 MUL_U32_S64_8(4);
138 ADD_S64_U32_8(0);
139 ADD_S64_U32_8(8);
141 return c;
143 } else {
144 DEF_VARS(BUFF_SIZE);
145 t_s32 i, c = 0;
147 #pragma pipeloop(0)
148 for (i = 0; i < (n+1)/2; i ++) {
149 MUL_U32_S64_2(i);
150 }
152 #pragma pipeloop(0)
153 for (i = 0; i < n; i ++) {
154 ADD_S64_U32(i);
155 }
157 return c;
159 }
160 } else {
162 if (n == 8) {
163 DEF_VARS(2*8);
164 t_d64 d0, d1, db;
165 t_u32 uc = 0;
167 da = (t_d64)(a & A_MASK);
168 db = (t_d64)(a >> A_BITS);
170 MUL_U32_S64_D_8(0);
171 ADD_S64_U32_D_8(0);
173 return uc;
175 } else if (n == 16) {
176 DEF_VARS(2*16);
177 t_d64 d0, d1, db;
178 t_u32 uc = 0;
180 da = (t_d64)(a & A_MASK);
181 db = (t_d64)(a >> A_BITS);
183 MUL_U32_S64_D_8(0);
184 MUL_U32_S64_D_8(4);
185 ADD_S64_U32_D_8(0);
186 ADD_S64_U32_D_8(8);
188 return uc;
190 } else {
191 DEF_VARS(2*BUFF_SIZE);
192 t_d64 d0, d1, db;
193 t_u32 i, uc = 0;
195 da = (t_d64)(a & A_MASK);
196 db = (t_d64)(a >> A_BITS);
198 #pragma pipeloop(0)
199 for (i = 0; i < (n+1)/2; i ++) {
200 MUL_U32_S64_2_D(i);
201 }
203 #pragma pipeloop(0)
204 for (i = 0; i < n; i ++) {
205 ADD_S64_U32_D(i);
206 }
208 return uc;
209 }
210 }
211 }
213 /***************************************************************/
215 t_u32 mul_add_inp(t_u32 *x, t_u32 *y, int n, t_u32 a)
216 {
217 return mul_add(x, x, y, n, a);
218 }
220 /***************************************************************/