1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/qcms/transform-altivec.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,269 @@ 1.4 +/* vim: set ts=8 sw=8 noexpandtab: */ 1.5 +// qcms 1.6 +// Copyright (C) 2009 Mozilla Corporation 1.7 +// Copyright (C) 1998-2007 Marti Maria 1.8 +// 1.9 +// Permission is hereby granted, free of charge, to any person obtaining 1.10 +// a copy of this software and associated documentation files (the "Software"), 1.11 +// to deal in the Software without restriction, including without limitation 1.12 +// the rights to use, copy, modify, merge, publish, distribute, sublicense, 1.13 +// and/or sell copies of the Software, and to permit persons to whom the Software 1.14 +// is furnished to do so, subject to the following conditions: 1.15 +// 1.16 +// The above copyright notice and this permission notice shall be included in 1.17 +// all copies or substantial portions of the Software. 1.18 +// 1.19 +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 1.20 +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 1.21 +// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 1.22 +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 1.23 +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 1.24 +// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 1.25 +// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 1.26 + 1.27 +#include <altivec.h> 1.28 + 1.29 +#include "qcmsint.h" 1.30 + 1.31 +#define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE) 1.32 +#define CLAMPMAXVAL (((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE) 1.33 +static const ALIGN float floatScaleX4 = FLOATSCALE; 1.34 +static const ALIGN float clampMaxValueX4 = CLAMPMAXVAL; 1.35 + 1.36 +inline vector float load_aligned_float(float *dataPtr) 1.37 +{ 1.38 + vector float data = vec_lde(0, dataPtr); 1.39 + vector unsigned char moveToStart = vec_lvsl(0, dataPtr); 1.40 + return vec_perm(data, data, moveToStart); 1.41 +} 1.42 + 1.43 +void qcms_transform_data_rgb_out_lut_altivec(qcms_transform *transform, 1.44 + unsigned char *src, 1.45 + unsigned char *dest, 1.46 + size_t length) 1.47 +{ 1.48 + unsigned int i; 1.49 + float (*mat)[4] = transform->matrix; 1.50 + char input_back[32]; 1.51 + /* Ensure we have a buffer that's 16 byte aligned regardless of the original 1.52 + * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32)) 1.53 + * because they don't work on stack variables. gcc 4.4 does do the right thing 1.54 + * on x86 but that's too new for us right now. For more info: gcc bug #16660 */ 1.55 + float const *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); 1.56 + /* share input and output locations to save having to keep the 1.57 + * locations in separate registers */ 1.58 + uint32_t const *output = (uint32_t*)input; 1.59 + 1.60 + /* deref *transform now to avoid it in loop */ 1.61 + const float *igtbl_r = transform->input_gamma_table_r; 1.62 + const float *igtbl_g = transform->input_gamma_table_g; 1.63 + const float *igtbl_b = transform->input_gamma_table_b; 1.64 + 1.65 + /* deref *transform now to avoid it in loop */ 1.66 + const uint8_t *otdata_r = &transform->output_table_r->data[0]; 1.67 + const uint8_t *otdata_g = &transform->output_table_g->data[0]; 1.68 + const uint8_t *otdata_b = &transform->output_table_b->data[0]; 1.69 + 1.70 + /* input matrix values never change */ 1.71 + const vector float mat0 = vec_ldl(0, (vector float*)mat[0]); 1.72 + const vector float mat1 = vec_ldl(0, (vector float*)mat[1]); 1.73 + const vector float mat2 = vec_ldl(0, (vector float*)mat[2]); 1.74 + 1.75 + /* these values don't change, either */ 1.76 + const vector float max = vec_splat(vec_lde(0, (float*)&clampMaxValueX4), 0); 1.77 + const vector float min = (vector float)vec_splat_u32(0); 1.78 + const vector float scale = vec_splat(vec_lde(0, (float*)&floatScaleX4), 0); 1.79 + 1.80 + /* working variables */ 1.81 + vector float vec_r, vec_g, vec_b, result; 1.82 + 1.83 + /* CYA */ 1.84 + if (!length) 1.85 + return; 1.86 + 1.87 + /* one pixel is handled outside of the loop */ 1.88 + length--; 1.89 + 1.90 + /* setup for transforming 1st pixel */ 1.91 + vec_r = load_aligned_float((float*)&igtbl_r[src[0]]); 1.92 + vec_g = load_aligned_float((float*)&igtbl_r[src[1]]); 1.93 + vec_b = load_aligned_float((float*)&igtbl_r[src[2]]); 1.94 + src += 3; 1.95 + 1.96 + /* transform all but final pixel */ 1.97 + 1.98 + for (i=0; i<length; i++) 1.99 + { 1.100 + /* position values from gamma tables */ 1.101 + vec_r = vec_splat(vec_r, 0); 1.102 + vec_g = vec_splat(vec_g, 0); 1.103 + vec_b = vec_splat(vec_b, 0); 1.104 + 1.105 + /* gamma * matrix */ 1.106 + vec_r = vec_madd(vec_r, mat0, min); 1.107 + vec_g = vec_madd(vec_g, mat1, min); 1.108 + vec_b = vec_madd(vec_b, mat2, min); 1.109 + 1.110 + /* crunch, crunch, crunch */ 1.111 + vec_r = vec_add(vec_r, vec_add(vec_g, vec_b)); 1.112 + vec_r = vec_max(min, vec_r); 1.113 + vec_r = vec_min(max, vec_r); 1.114 + result = vec_madd(vec_r, scale, min); 1.115 + 1.116 + /* store calc'd output tables indices */ 1.117 + vec_st(vec_ctu(vec_round(result), 0), 0, (vector unsigned int*)output); 1.118 + 1.119 + /* load for next loop while store completes */ 1.120 + vec_r = load_aligned_float((float*)&igtbl_r[src[0]]); 1.121 + vec_g = load_aligned_float((float*)&igtbl_r[src[1]]); 1.122 + vec_b = load_aligned_float((float*)&igtbl_r[src[2]]); 1.123 + src += 3; 1.124 + 1.125 + /* use calc'd indices to output RGB values */ 1.126 + dest[0] = otdata_r[output[0]]; 1.127 + dest[1] = otdata_g[output[1]]; 1.128 + dest[2] = otdata_b[output[2]]; 1.129 + dest += 3; 1.130 + } 1.131 + 1.132 + /* handle final (maybe only) pixel */ 1.133 + 1.134 + vec_r = vec_splat(vec_r, 0); 1.135 + vec_g = vec_splat(vec_g, 0); 1.136 + vec_b = vec_splat(vec_b, 0); 1.137 + 1.138 + vec_r = vec_madd(vec_r, mat0, min); 1.139 + vec_g = vec_madd(vec_g, mat1, min); 1.140 + vec_b = vec_madd(vec_b, mat2, min); 1.141 + 1.142 + vec_r = vec_add(vec_r, vec_add(vec_g, vec_b)); 1.143 + vec_r = vec_max(min, vec_r); 1.144 + vec_r = vec_min(max, vec_r); 1.145 + result = vec_madd(vec_r, scale, min); 1.146 + 1.147 + vec_st(vec_ctu(vec_round(result),0),0,(vector unsigned int*)output); 1.148 + 1.149 + dest[0] = otdata_r[output[0]]; 1.150 + dest[1] = otdata_g[output[1]]; 1.151 + dest[2] = otdata_b[output[2]]; 1.152 +} 1.153 + 1.154 +void qcms_transform_data_rgba_out_lut_altivec(qcms_transform *transform, 1.155 + unsigned char *src, 1.156 + unsigned char *dest, 1.157 + size_t length) 1.158 +{ 1.159 + unsigned int i; 1.160 + float (*mat)[4] = transform->matrix; 1.161 + char input_back[32]; 1.162 + /* Ensure we have a buffer that's 16 byte aligned regardless of the original 1.163 + * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32)) 1.164 + * because they don't work on stack variables. gcc 4.4 does do the right thing 1.165 + * on x86 but that's too new for us right now. For more info: gcc bug #16660 */ 1.166 + float const *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf); 1.167 + /* share input and output locations to save having to keep the 1.168 + * locations in separate registers */ 1.169 + uint32_t const *output = (uint32_t*)input; 1.170 + 1.171 + /* deref *transform now to avoid it in loop */ 1.172 + const float *igtbl_r = transform->input_gamma_table_r; 1.173 + const float *igtbl_g = transform->input_gamma_table_g; 1.174 + const float *igtbl_b = transform->input_gamma_table_b; 1.175 + 1.176 + /* deref *transform now to avoid it in loop */ 1.177 + const uint8_t *otdata_r = &transform->output_table_r->data[0]; 1.178 + const uint8_t *otdata_g = &transform->output_table_g->data[0]; 1.179 + const uint8_t *otdata_b = &transform->output_table_b->data[0]; 1.180 + 1.181 + /* input matrix values never change */ 1.182 + const vector float mat0 = vec_ldl(0, (vector float*)mat[0]); 1.183 + const vector float mat1 = vec_ldl(0, (vector float*)mat[1]); 1.184 + const vector float mat2 = vec_ldl(0, (vector float*)mat[2]); 1.185 + 1.186 + /* these values don't change, either */ 1.187 + const vector float max = vec_splat(vec_lde(0, (float*)&clampMaxValueX4), 0); 1.188 + const vector float min = (vector float)vec_splat_u32(0); 1.189 + const vector float scale = vec_splat(vec_lde(0, (float*)&floatScaleX4), 0); 1.190 + 1.191 + /* working variables */ 1.192 + vector float vec_r, vec_g, vec_b, result; 1.193 + unsigned char alpha; 1.194 + 1.195 + /* CYA */ 1.196 + if (!length) 1.197 + return; 1.198 + 1.199 + /* one pixel is handled outside of the loop */ 1.200 + length--; 1.201 + 1.202 + /* setup for transforming 1st pixel */ 1.203 + vec_r = load_aligned_float((float*)&igtbl_r[src[0]]); 1.204 + vec_g = load_aligned_float((float*)&igtbl_r[src[1]]); 1.205 + vec_b = load_aligned_float((float*)&igtbl_r[src[2]]); 1.206 + alpha = src[3]; 1.207 + src += 4; 1.208 + 1.209 + /* transform all but final pixel */ 1.210 + 1.211 + for (i=0; i<length; i++) 1.212 + { 1.213 + /* position values from gamma tables */ 1.214 + vec_r = vec_splat(vec_r, 0); 1.215 + vec_g = vec_splat(vec_g, 0); 1.216 + vec_b = vec_splat(vec_b, 0); 1.217 + 1.218 + /* gamma * matrix */ 1.219 + vec_r = vec_madd(vec_r, mat0, min); 1.220 + vec_g = vec_madd(vec_g, mat1, min); 1.221 + vec_b = vec_madd(vec_b, mat2, min); 1.222 + 1.223 + /* store alpha for this pixel; load alpha for next */ 1.224 + dest[3] = alpha; 1.225 + alpha = src[3]; 1.226 + 1.227 + /* crunch, crunch, crunch */ 1.228 + vec_r = vec_add(vec_r, vec_add(vec_g, vec_b)); 1.229 + vec_r = vec_max(min, vec_r); 1.230 + vec_r = vec_min(max, vec_r); 1.231 + result = vec_madd(vec_r, scale, min); 1.232 + 1.233 + /* store calc'd output tables indices */ 1.234 + vec_st(vec_ctu(vec_round(result), 0), 0, (vector unsigned int*)output); 1.235 + 1.236 + /* load gamma values for next loop while store completes */ 1.237 + vec_r = load_aligned_float((float*)&igtbl_r[src[0]]); 1.238 + vec_g = load_aligned_float((float*)&igtbl_r[src[1]]); 1.239 + vec_b = load_aligned_float((float*)&igtbl_r[src[2]]); 1.240 + src += 4; 1.241 + 1.242 + /* use calc'd indices to output RGB values */ 1.243 + dest[0] = otdata_r[output[0]]; 1.244 + dest[1] = otdata_g[output[1]]; 1.245 + dest[2] = otdata_b[output[2]]; 1.246 + dest += 4; 1.247 + } 1.248 + 1.249 + /* handle final (maybe only) pixel */ 1.250 + 1.251 + vec_r = vec_splat(vec_r, 0); 1.252 + vec_g = vec_splat(vec_g, 0); 1.253 + vec_b = vec_splat(vec_b, 0); 1.254 + 1.255 + vec_r = vec_madd(vec_r, mat0, min); 1.256 + vec_g = vec_madd(vec_g, mat1, min); 1.257 + vec_b = vec_madd(vec_b, mat2, min); 1.258 + 1.259 + dest[3] = alpha; 1.260 + 1.261 + vec_r = vec_add(vec_r, vec_add(vec_g, vec_b)); 1.262 + vec_r = vec_max(min, vec_r); 1.263 + vec_r = vec_min(max, vec_r); 1.264 + result = vec_madd(vec_r, scale, min); 1.265 + 1.266 + vec_st(vec_ctu(vec_round(result), 0), 0, (vector unsigned int*)output); 1.267 + 1.268 + dest[0] = otdata_r[output[0]]; 1.269 + dest[1] = otdata_g[output[1]]; 1.270 + dest[2] = otdata_b[output[2]]; 1.271 +} 1.272 +