michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #include "vis_proto.h" michael@0: michael@0: /***************************************************************/ michael@0: michael@0: typedef int t_s32; michael@0: typedef unsigned int t_u32; michael@0: #if defined(__sparcv9) michael@0: typedef long t_s64; michael@0: typedef unsigned long t_u64; michael@0: #else michael@0: typedef long long t_s64; michael@0: typedef unsigned long long t_u64; michael@0: #endif michael@0: typedef double t_d64; michael@0: michael@0: /***************************************************************/ michael@0: michael@0: typedef union { michael@0: t_d64 d64; michael@0: struct { michael@0: t_s32 i0; michael@0: t_s32 i1; michael@0: } i32s; michael@0: } d64_2_i32; michael@0: michael@0: /***************************************************************/ michael@0: michael@0: #define BUFF_SIZE 256 michael@0: michael@0: #define A_BITS 19 michael@0: #define A_MASK ((1 << A_BITS) - 1) michael@0: michael@0: /***************************************************************/ michael@0: michael@0: static t_u64 mask_cnst[] = { michael@0: 0x8000000080000000ull michael@0: }; michael@0: michael@0: /***************************************************************/ michael@0: michael@0: #define DEF_VARS(N) \ michael@0: t_d64 *py = (t_d64*)y; \ michael@0: t_d64 mask = *((t_d64*)mask_cnst); \ michael@0: t_d64 ca = (1u << 31) - 1; \ michael@0: t_d64 da = (t_d64)a; \ michael@0: t_s64 buff[N], s; \ michael@0: d64_2_i32 dy michael@0: michael@0: /***************************************************************/ michael@0: michael@0: #define MUL_U32_S64_2(i) \ michael@0: dy.d64 = vis_fxnor(mask, py[i]); \ michael@0: buff[2*(i) ] = (ca - (t_d64)dy.i32s.i0) * da; \ michael@0: buff[2*(i)+1] = (ca - (t_d64)dy.i32s.i1) * da michael@0: michael@0: #define MUL_U32_S64_2_D(i) \ michael@0: dy.d64 = vis_fxnor(mask, py[i]); \ michael@0: d0 = ca - (t_d64)dy.i32s.i0; \ michael@0: d1 = ca - (t_d64)dy.i32s.i1; \ michael@0: buff[4*(i) ] = (t_s64)(d0 * da); \ michael@0: buff[4*(i)+1] = (t_s64)(d0 * db); \ michael@0: buff[4*(i)+2] = (t_s64)(d1 * da); \ michael@0: buff[4*(i)+3] = (t_s64)(d1 * db) michael@0: michael@0: /***************************************************************/ michael@0: michael@0: #define ADD_S64_U32(i) \ michael@0: s = buff[i] + x[i] + c; \ michael@0: z[i] = s; \ michael@0: c = (s >> 32) michael@0: michael@0: #define ADD_S64_U32_D(i) \ michael@0: s = buff[2*(i)] +(((t_s64)(buff[2*(i)+1]))<> 32) michael@0: michael@0: /***************************************************************/ michael@0: michael@0: #define MUL_U32_S64_8(i) \ michael@0: MUL_U32_S64_2(i); \ michael@0: MUL_U32_S64_2(i+1); \ michael@0: MUL_U32_S64_2(i+2); \ michael@0: MUL_U32_S64_2(i+3) michael@0: michael@0: #define MUL_U32_S64_D_8(i) \ michael@0: MUL_U32_S64_2_D(i); \ michael@0: MUL_U32_S64_2_D(i+1); \ michael@0: MUL_U32_S64_2_D(i+2); \ michael@0: MUL_U32_S64_2_D(i+3) michael@0: michael@0: /***************************************************************/ michael@0: michael@0: #define ADD_S64_U32_8(i) \ michael@0: ADD_S64_U32(i); \ michael@0: ADD_S64_U32(i+1); \ michael@0: ADD_S64_U32(i+2); \ michael@0: ADD_S64_U32(i+3); \ michael@0: ADD_S64_U32(i+4); \ michael@0: ADD_S64_U32(i+5); \ michael@0: ADD_S64_U32(i+6); \ michael@0: ADD_S64_U32(i+7) michael@0: michael@0: #define ADD_S64_U32_D_8(i) \ michael@0: ADD_S64_U32_D(i); \ michael@0: ADD_S64_U32_D(i+1); \ michael@0: ADD_S64_U32_D(i+2); \ michael@0: ADD_S64_U32_D(i+3); \ michael@0: ADD_S64_U32_D(i+4); \ michael@0: ADD_S64_U32_D(i+5); \ michael@0: ADD_S64_U32_D(i+6); \ michael@0: ADD_S64_U32_D(i+7) michael@0: michael@0: /***************************************************************/ michael@0: michael@0: t_u32 mul_add(t_u32 *z, t_u32 *x, t_u32 *y, int n, t_u32 a) michael@0: { michael@0: if (a < (1 << A_BITS)) { michael@0: michael@0: if (n == 8) { michael@0: DEF_VARS(8); michael@0: t_s32 c = 0; michael@0: michael@0: MUL_U32_S64_8(0); michael@0: ADD_S64_U32_8(0); michael@0: michael@0: return c; michael@0: michael@0: } else if (n == 16) { michael@0: DEF_VARS(16); michael@0: t_s32 c = 0; michael@0: michael@0: MUL_U32_S64_8(0); michael@0: MUL_U32_S64_8(4); michael@0: ADD_S64_U32_8(0); michael@0: ADD_S64_U32_8(8); michael@0: michael@0: return c; michael@0: michael@0: } else { michael@0: DEF_VARS(BUFF_SIZE); michael@0: t_s32 i, c = 0; michael@0: michael@0: #pragma pipeloop(0) michael@0: for (i = 0; i < (n+1)/2; i ++) { michael@0: MUL_U32_S64_2(i); michael@0: } michael@0: michael@0: #pragma pipeloop(0) michael@0: for (i = 0; i < n; i ++) { michael@0: ADD_S64_U32(i); michael@0: } michael@0: michael@0: return c; michael@0: michael@0: } michael@0: } else { michael@0: michael@0: if (n == 8) { michael@0: DEF_VARS(2*8); michael@0: t_d64 d0, d1, db; michael@0: t_u32 uc = 0; michael@0: michael@0: da = (t_d64)(a & A_MASK); michael@0: db = (t_d64)(a >> A_BITS); michael@0: michael@0: MUL_U32_S64_D_8(0); michael@0: ADD_S64_U32_D_8(0); michael@0: michael@0: return uc; michael@0: michael@0: } else if (n == 16) { michael@0: DEF_VARS(2*16); michael@0: t_d64 d0, d1, db; michael@0: t_u32 uc = 0; michael@0: michael@0: da = (t_d64)(a & A_MASK); michael@0: db = (t_d64)(a >> A_BITS); michael@0: michael@0: MUL_U32_S64_D_8(0); michael@0: MUL_U32_S64_D_8(4); michael@0: ADD_S64_U32_D_8(0); michael@0: ADD_S64_U32_D_8(8); michael@0: michael@0: return uc; michael@0: michael@0: } else { michael@0: DEF_VARS(2*BUFF_SIZE); michael@0: t_d64 d0, d1, db; michael@0: t_u32 i, uc = 0; michael@0: michael@0: da = (t_d64)(a & A_MASK); michael@0: db = (t_d64)(a >> A_BITS); michael@0: michael@0: #pragma pipeloop(0) michael@0: for (i = 0; i < (n+1)/2; i ++) { michael@0: MUL_U32_S64_2_D(i); michael@0: } michael@0: michael@0: #pragma pipeloop(0) michael@0: for (i = 0; i < n; i ++) { michael@0: ADD_S64_U32_D(i); michael@0: } michael@0: michael@0: return uc; michael@0: } michael@0: } michael@0: } michael@0: michael@0: /***************************************************************/ michael@0: michael@0: t_u32 mul_add_inp(t_u32 *x, t_u32 *y, int n, t_u32 a) michael@0: { michael@0: return mul_add(x, x, y, n, a); michael@0: } michael@0: michael@0: /***************************************************************/