|
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
4 |
|
5 #include "vis_proto.h" |
|
6 |
|
7 /***************************************************************/ |
|
8 |
|
/*
 * Fixed-width integer/floating aliases.  On 64-bit SPARC (__sparcv9)
 * "long" is already 64 bits, so it is used directly; other targets fall
 * back to "long long" for the 64-bit types.
 */
typedef int t_s32;
typedef unsigned int t_u32;
#if defined(__sparcv9)
typedef long t_s64;
typedef unsigned long t_u64;
#else
typedef long long t_s64;
typedef unsigned long long t_u64;
#endif
typedef double t_d64;
|
19 |
|
20 /***************************************************************/ |
|
21 |
|
/*
 * Reinterpret a 64-bit double as two 32-bit signed halves (i0 first,
 * i1 second in memory).
 * NOTE(review): the code pairs i0/i1 with the two words packed in one
 * 64-bit load — this word ordering assumes big-endian SPARC layout;
 * confirm before porting to a little-endian target.
 */
typedef union {
    t_d64 d64;
    struct {
        t_s32 i0;
        t_s32 i1;
    } i32s;
} d64_2_i32;
|
29 |
|
30 /***************************************************************/ |
|
31 |
|
/* Capacity (in 32-bit words) of the on-stack partial-product buffer used
 * by the generic loops in mul_add(); callers are presumably limited to
 * n <= BUFF_SIZE — TODO confirm caller contract. */
#define BUFF_SIZE 256

/* Split threshold for the multiplier 'a': a 32-bit word times a value
 * below 2^A_BITS needs at most 32+19 = 51 bits, so the product is exact
 * in a double's 53-bit mantissa.  Larger multipliers are split into a
 * low A_BITS part and a high part (see mul_add). */
#define A_BITS 19
#define A_MASK ((1 << A_BITS) - 1)
|
36 |
|
37 /***************************************************************/ |
|
38 |
|
/* 64-bit constant with the sign bit set in each 32-bit half.  XNOR-ing a
 * packed word pair with this (vis_fxnor) preserves each word's top bit
 * and complements its low 31 bits; see MUL_U32_S64_2 for how that is
 * undone arithmetically. */
static t_u64 mask_cnst[] = {
    0x8000000080000000ull
};
|
42 |
|
43 /***************************************************************/ |
|
44 |
|
/*
 * Common locals for the mul_add() variants:
 *   py   - y[] reinterpreted as doubles (two 32-bit words per element)
 *   mask - the packed sign-bit constant (see mask_cnst)
 *   ca   - 2^31 - 1, bias used to recover unsigned words (MUL_U32_S64_2)
 *   da   - multiplier 'a' as a double (re-assigned in the split regime)
 *   buff - N-entry 64-bit partial-product buffer, s - add/carry scratch
 *   dy   - double <-> 2 x 32-bit reinterpretation scratch
 */
#define DEF_VARS(N) \
    t_d64 *py = (t_d64*)y; \
    t_d64 mask = *((t_d64*)mask_cnst); \
    t_d64 ca = (1u << 31) - 1; \
    t_d64 da = (t_d64)a; \
    t_s64 buff[N], s; \
    d64_2_i32 dy
|
52 |
|
53 /***************************************************************/ |
|
54 |
|
/*
 * Multiply the two unsigned 32-bit words packed in py[i] by 'a' (da),
 * storing the 64-bit products in buff[2i] and buff[2i+1].
 *
 * vis_fxnor(mask, py[i]) is bitwise XNOR with the packed sign-bit mask:
 * each word keeps its top bit and has its low 31 bits complemented.
 * Reading the result as signed t_s32 v, (2^31 - 1) - v equals the
 * original unsigned word, now represented exactly as a double — this
 * sidesteps the lack of a direct unsigned-int-to-double VIS path.
 */
#define MUL_U32_S64_2(i) \
    dy.d64 = vis_fxnor(mask, py[i]); \
    buff[2*(i)  ] = (ca - (t_d64)dy.i32s.i0) * da; \
    buff[2*(i)+1] = (ca - (t_d64)dy.i32s.i1) * da

/*
 * Split-multiplier variant for a >= 2^A_BITS: each recovered word is
 * multiplied by both the low part (da = a & A_MASK) and the high part
 * (db = a >> A_BITS), giving four exact products per packed pair.
 * ADD_S64_U32_D later recombines buff[2k] + (buff[2k+1] << A_BITS).
 */
#define MUL_U32_S64_2_D(i) \
    dy.d64 = vis_fxnor(mask, py[i]); \
    d0 = ca - (t_d64)dy.i32s.i0; \
    d1 = ca - (t_d64)dy.i32s.i1; \
    buff[4*(i)  ] = (t_s64)(d0 * da); \
    buff[4*(i)+1] = (t_s64)(d0 * db); \
    buff[4*(i)+2] = (t_s64)(d1 * da); \
    buff[4*(i)+3] = (t_s64)(d1 * db)
|
68 |
|
69 /***************************************************************/ |
|
70 |
|
/* Accumulate one column: s = product + x[i] + carry; the low 32 bits go
 * to z[i], the high bits become the next carry.  NOTE(review): c is
 * signed, so s >> 32 is an arithmetic shift — relies on s being
 * non-negative in this regime; confirm against MUL_U32_S64_2's range. */
#define ADD_S64_U32(i) \
    s = buff[i] + x[i] + c; \
    z[i] = s; \
    c = (s >> 32)

/* Split-multiplier column add: recombine the low part buff[2i] with the
 * high part buff[2i+1] shifted up by A_BITS, then add x[i] and the
 * (unsigned) running carry. */
#define ADD_S64_U32_D(i) \
    s = buff[2*(i)] + (((t_s64)(buff[2*(i)+1])) << A_BITS) + x[i] + uc; \
    z[i] = s; \
    uc = ((t_u64)s >> 32)
|
80 |
|
81 /***************************************************************/ |
|
82 |
|
/* 4x unrolled MUL_U32_S64_2: processes py[i..i+3], i.e. eight 32-bit
 * words, filling buff[2i .. 2i+7]. */
#define MUL_U32_S64_8(i) \
    MUL_U32_S64_2(i); \
    MUL_U32_S64_2(i+1); \
    MUL_U32_S64_2(i+2); \
    MUL_U32_S64_2(i+3)

/* 4x unrolled MUL_U32_S64_2_D: eight words, filling buff[4i .. 4i+15]. */
#define MUL_U32_S64_D_8(i) \
    MUL_U32_S64_2_D(i); \
    MUL_U32_S64_2_D(i+1); \
    MUL_U32_S64_2_D(i+2); \
    MUL_U32_S64_2_D(i+3)
|
94 |
|
95 /***************************************************************/ |
|
96 |
|
/* 8x unrolled ADD_S64_U32 over columns i .. i+7. */
#define ADD_S64_U32_8(i) \
    ADD_S64_U32(i); \
    ADD_S64_U32(i+1); \
    ADD_S64_U32(i+2); \
    ADD_S64_U32(i+3); \
    ADD_S64_U32(i+4); \
    ADD_S64_U32(i+5); \
    ADD_S64_U32(i+6); \
    ADD_S64_U32(i+7)

/* 8x unrolled ADD_S64_U32_D over columns i .. i+7. */
#define ADD_S64_U32_D_8(i) \
    ADD_S64_U32_D(i); \
    ADD_S64_U32_D(i+1); \
    ADD_S64_U32_D(i+2); \
    ADD_S64_U32_D(i+3); \
    ADD_S64_U32_D(i+4); \
    ADD_S64_U32_D(i+5); \
    ADD_S64_U32_D(i+6); \
    ADD_S64_U32_D(i+7)
|
116 |
|
117 /***************************************************************/ |
|
118 |
|
/*
 * mul_add: bignum multiply-accumulate inner loop,
 *     z[i] = low32(x[i] + a * y[i] + carry),  i = 0 .. n-1,
 * returning the final carry word.  Products are computed in double
 * precision via VIS (see MUL_U32_S64_2) instead of integer multiply.
 *
 * Two regimes, chosen by the size of 'a':
 *  - a < 2^A_BITS: each 32x19-bit product is exact in a double, so one
 *    multiply per word suffices.
 *  - otherwise: 'a' is split into low (da) and high (db) A_BITS parts;
 *    the two partial products per word are recombined in ADD_S64_U32_D.
 * n == 8 and n == 16 get fully unrolled fast paths; other sizes use
 * loops (assumes n <= BUFF_SIZE and n > 0 — TODO confirm callers).
 * NOTE(review): py[i] reads words in pairs, so the loops rely on y[]
 * being readable up to an even word count ((n+1)/2 doubles).
 */
t_u32 mul_add(t_u32 *z, t_u32 *x, t_u32 *y, int n, t_u32 a)
{
    if (a < (1 << A_BITS)) {

        if (n == 8) {
            DEF_VARS(8);
            t_s32 c = 0;

            MUL_U32_S64_8(0);
            ADD_S64_U32_8(0);

            return c;

        } else if (n == 16) {
            DEF_VARS(16);
            t_s32 c = 0;

            /* py indices are in doubles: 0..3 cover words 0..7,
             * 4..7 cover words 8..15. */
            MUL_U32_S64_8(0);
            MUL_U32_S64_8(4);
            ADD_S64_U32_8(0);
            ADD_S64_U32_8(8);

            return c;

        } else {
            DEF_VARS(BUFF_SIZE);
            t_s32 i, c = 0;

            /* Multiply phase: (n+1)/2 packed pairs. */
#pragma pipeloop(0)
            for (i = 0; i < (n + 1) / 2; i++) {
                MUL_U32_S64_2(i);
            }

            /* Carry-propagating accumulate phase. */
#pragma pipeloop(0)
            for (i = 0; i < n; i++) {
                ADD_S64_U32(i);
            }

            return c;
        }
    } else {

        if (n == 8) {
            DEF_VARS(2 * 8);
            t_d64 d0, d1, db;
            t_u32 uc = 0;

            /* Split the multiplier so both partial products stay exact
             * in double precision. */
            da = (t_d64)(a & A_MASK);
            db = (t_d64)(a >> A_BITS);

            MUL_U32_S64_D_8(0);
            ADD_S64_U32_D_8(0);

            return uc;

        } else if (n == 16) {
            DEF_VARS(2 * 16);
            t_d64 d0, d1, db;
            t_u32 uc = 0;

            da = (t_d64)(a & A_MASK);
            db = (t_d64)(a >> A_BITS);

            MUL_U32_S64_D_8(0);
            MUL_U32_S64_D_8(4);
            ADD_S64_U32_D_8(0);
            ADD_S64_U32_D_8(8);

            return uc;

        } else {
            DEF_VARS(2 * BUFF_SIZE);
            t_d64 d0, d1, db;
            t_u32 i, uc = 0;

            da = (t_d64)(a & A_MASK);
            db = (t_d64)(a >> A_BITS);

            /* NOTE(review): i is unsigned, (n+1)/2 is signed int — fine
             * for the assumed n > 0, but would loop hugely for n < 0. */
#pragma pipeloop(0)
            for (i = 0; i < (n + 1) / 2; i++) {
                MUL_U32_S64_2_D(i);
            }

#pragma pipeloop(0)
            for (i = 0; i < n; i++) {
                ADD_S64_U32_D(i);
            }

            return uc;
        }
    }
}
|
212 |
|
213 /***************************************************************/ |
|
214 |
|
215 t_u32 mul_add_inp(t_u32 *x, t_u32 *y, int n, t_u32 a) |
|
216 { |
|
217 return mul_add(x, x, y, n, a); |
|
218 } |
|
219 |
|
220 /***************************************************************/ |