gfx/qcms/transform-altivec.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/qcms/transform-altivec.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,269 @@
     1.4 +/* vim: set ts=8 sw=8 noexpandtab: */
     1.5 +//  qcms
     1.6 +//  Copyright (C) 2009 Mozilla Corporation
     1.7 +//  Copyright (C) 1998-2007 Marti Maria
     1.8 +//
     1.9 +// Permission is hereby granted, free of charge, to any person obtaining
    1.10 +// a copy of this software and associated documentation files (the "Software"),
    1.11 +// to deal in the Software without restriction, including without limitation
    1.12 +// the rights to use, copy, modify, merge, publish, distribute, sublicense,
    1.13 +// and/or sell copies of the Software, and to permit persons to whom the Software
    1.14 +// is furnished to do so, subject to the following conditions:
    1.15 +//
    1.16 +// The above copyright notice and this permission notice shall be included in
    1.17 +// all copies or substantial portions of the Software.
    1.18 +//
    1.19 +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    1.20 +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
    1.21 +// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
    1.22 +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
    1.23 +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
    1.24 +// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
    1.25 +// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    1.26 +
    1.27 +#include <altivec.h>
    1.28 +
    1.29 +#include "qcmsint.h"
    1.30 +
    1.31 +#define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE)
    1.32 +#define CLAMPMAXVAL (((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE)
    1.33 +static const ALIGN float floatScaleX4 = FLOATSCALE;
    1.34 +static const ALIGN float clampMaxValueX4 = CLAMPMAXVAL;
    1.35 +
    1.36 +inline vector float load_aligned_float(float *dataPtr)
    1.37 +{
    1.38 +	vector float data = vec_lde(0, dataPtr);
    1.39 +	vector unsigned char moveToStart = vec_lvsl(0, dataPtr);
    1.40 +	return vec_perm(data, data, moveToStart);
    1.41 +}
    1.42 +
    1.43 +void qcms_transform_data_rgb_out_lut_altivec(qcms_transform *transform,
    1.44 +                                             unsigned char *src,
    1.45 +                                             unsigned char *dest,
    1.46 +                                             size_t length)
    1.47 +{
    1.48 +	unsigned int i;
    1.49 +	float (*mat)[4] = transform->matrix;
    1.50 +	char input_back[32];
    1.51 +	/* Ensure we have a buffer that's 16 byte aligned regardless of the original
    1.52 +	 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
    1.53 +	 * because they don't work on stack variables. gcc 4.4 does do the right thing
    1.54 +	 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    1.55 +	float const *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
    1.56 +	/* share input and output locations to save having to keep the
    1.57 + 	 * locations in separate registers */
    1.58 +	uint32_t const *output = (uint32_t*)input;
    1.59 +
    1.60 +	/* deref *transform now to avoid it in loop */
    1.61 +	const float *igtbl_r = transform->input_gamma_table_r;
    1.62 +	const float *igtbl_g = transform->input_gamma_table_g;
    1.63 +	const float *igtbl_b = transform->input_gamma_table_b;
    1.64 +
    1.65 +	/* deref *transform now to avoid it in loop */
    1.66 +	const uint8_t *otdata_r = &transform->output_table_r->data[0];
    1.67 +	const uint8_t *otdata_g = &transform->output_table_g->data[0];
    1.68 +	const uint8_t *otdata_b = &transform->output_table_b->data[0];
    1.69 +
    1.70 +	/* input matrix values never change */
    1.71 +	const vector float mat0 = vec_ldl(0, (vector float*)mat[0]);
    1.72 +	const vector float mat1 = vec_ldl(0, (vector float*)mat[1]);
    1.73 +	const vector float mat2 = vec_ldl(0, (vector float*)mat[2]);
    1.74 +
    1.75 +	/* these values don't change, either */
    1.76 +	const vector float max = vec_splat(vec_lde(0, (float*)&clampMaxValueX4), 0);
    1.77 +	const vector float min = (vector float)vec_splat_u32(0);
    1.78 +	const vector float scale = vec_splat(vec_lde(0, (float*)&floatScaleX4), 0);
    1.79 +
    1.80 +	/* working variables */
    1.81 +	vector float vec_r, vec_g, vec_b, result;
    1.82 +
    1.83 +	/* CYA */
    1.84 +	if (!length)
    1.85 +		return;
    1.86 +
    1.87 +	/* one pixel is handled outside of the loop */
    1.88 +	length--;
    1.89 +
    1.90 +	/* setup for transforming 1st pixel */
    1.91 +	vec_r = load_aligned_float((float*)&igtbl_r[src[0]]);
    1.92 +	vec_g = load_aligned_float((float*)&igtbl_r[src[1]]);
    1.93 +	vec_b = load_aligned_float((float*)&igtbl_r[src[2]]);
    1.94 +	src += 3;
    1.95 +
    1.96 +	/* transform all but final pixel */
    1.97 +
    1.98 +	for (i=0; i<length; i++)
    1.99 +	{
   1.100 +		/* position values from gamma tables */
   1.101 +		vec_r = vec_splat(vec_r, 0);
   1.102 +		vec_g = vec_splat(vec_g, 0);
   1.103 +		vec_b = vec_splat(vec_b, 0);
   1.104 +
   1.105 +		/* gamma * matrix */
   1.106 +		vec_r = vec_madd(vec_r, mat0, min);
   1.107 +		vec_g = vec_madd(vec_g, mat1, min);
   1.108 +		vec_b = vec_madd(vec_b, mat2, min);
   1.109 +
   1.110 +		/* crunch, crunch, crunch */
   1.111 +		vec_r = vec_add(vec_r, vec_add(vec_g, vec_b));
   1.112 +		vec_r = vec_max(min, vec_r);
   1.113 +		vec_r = vec_min(max, vec_r);
   1.114 +		result = vec_madd(vec_r, scale, min);
   1.115 +
   1.116 +		/* store calc'd output tables indices */
   1.117 +		vec_st(vec_ctu(vec_round(result), 0), 0, (vector unsigned int*)output);
   1.118 +
   1.119 +		/* load for next loop while store completes */
   1.120 +		vec_r = load_aligned_float((float*)&igtbl_r[src[0]]);
   1.121 +		vec_g = load_aligned_float((float*)&igtbl_r[src[1]]);
   1.122 +		vec_b = load_aligned_float((float*)&igtbl_r[src[2]]);
   1.123 +		src += 3;
   1.124 +
   1.125 +		/* use calc'd indices to output RGB values */
   1.126 +		dest[0] = otdata_r[output[0]];
   1.127 +		dest[1] = otdata_g[output[1]];
   1.128 +		dest[2] = otdata_b[output[2]];
   1.129 +		dest += 3;
   1.130 +	}
   1.131 +
   1.132 +	/* handle final (maybe only) pixel */
   1.133 +
   1.134 +	vec_r = vec_splat(vec_r, 0);
   1.135 +	vec_g = vec_splat(vec_g, 0);
   1.136 +	vec_b = vec_splat(vec_b, 0);
   1.137 +
   1.138 +	vec_r = vec_madd(vec_r, mat0, min);
   1.139 +	vec_g = vec_madd(vec_g, mat1, min);
   1.140 +	vec_b = vec_madd(vec_b, mat2, min);
   1.141 +
   1.142 +	vec_r = vec_add(vec_r, vec_add(vec_g, vec_b));
   1.143 +	vec_r = vec_max(min, vec_r);
   1.144 +	vec_r = vec_min(max, vec_r);
   1.145 +	result = vec_madd(vec_r, scale, min);
   1.146 +
   1.147 +	vec_st(vec_ctu(vec_round(result),0),0,(vector unsigned int*)output);
   1.148 +
   1.149 +	dest[0] = otdata_r[output[0]];
   1.150 +	dest[1] = otdata_g[output[1]];
   1.151 +	dest[2] = otdata_b[output[2]];
   1.152 +}
   1.153 +
   1.154 +void qcms_transform_data_rgba_out_lut_altivec(qcms_transform *transform,
   1.155 +                                              unsigned char *src,
   1.156 +                                              unsigned char *dest,
   1.157 +                                              size_t length)
   1.158 +{
   1.159 +	unsigned int i;
   1.160 +	float (*mat)[4] = transform->matrix;
   1.161 +	char input_back[32];
   1.162 +	/* Ensure we have a buffer that's 16 byte aligned regardless of the original
   1.163 +	 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
   1.164 +	 * because they don't work on stack variables. gcc 4.4 does do the right thing
   1.165 +	 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
   1.166 +	float const *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
   1.167 +	/* share input and output locations to save having to keep the
   1.168 +	 * locations in separate registers */
   1.169 +	uint32_t const *output = (uint32_t*)input;
   1.170 +
   1.171 +	/* deref *transform now to avoid it in loop */
   1.172 +	const float *igtbl_r = transform->input_gamma_table_r;
   1.173 +	const float *igtbl_g = transform->input_gamma_table_g;
   1.174 +	const float *igtbl_b = transform->input_gamma_table_b;
   1.175 +
   1.176 +	/* deref *transform now to avoid it in loop */
   1.177 +	const uint8_t *otdata_r = &transform->output_table_r->data[0];
   1.178 +	const uint8_t *otdata_g = &transform->output_table_g->data[0];
   1.179 +	const uint8_t *otdata_b = &transform->output_table_b->data[0];
   1.180 +
   1.181 +	/* input matrix values never change */
   1.182 +	const vector float mat0 = vec_ldl(0, (vector float*)mat[0]);
   1.183 +	const vector float mat1 = vec_ldl(0, (vector float*)mat[1]);
   1.184 +	const vector float mat2 = vec_ldl(0, (vector float*)mat[2]);
   1.185 +
   1.186 +	/* these values don't change, either */
   1.187 +	const vector float max = vec_splat(vec_lde(0, (float*)&clampMaxValueX4), 0);
   1.188 +	const vector float min = (vector float)vec_splat_u32(0);
   1.189 +	const vector float scale = vec_splat(vec_lde(0, (float*)&floatScaleX4), 0);
   1.190 +
   1.191 +	/* working variables */
   1.192 +	vector float vec_r, vec_g, vec_b, result;
   1.193 +	unsigned char alpha;
   1.194 +
   1.195 +	/* CYA */
   1.196 +	if (!length)
   1.197 +		return;
   1.198 +
   1.199 +	/* one pixel is handled outside of the loop */
   1.200 +	length--;
   1.201 +
   1.202 +	/* setup for transforming 1st pixel */
   1.203 +	vec_r = load_aligned_float((float*)&igtbl_r[src[0]]);
   1.204 +	vec_g = load_aligned_float((float*)&igtbl_r[src[1]]);
   1.205 +	vec_b = load_aligned_float((float*)&igtbl_r[src[2]]);
   1.206 +	alpha = src[3];
   1.207 +	src += 4;
   1.208 +
   1.209 +	/* transform all but final pixel */
   1.210 +
   1.211 +	for (i=0; i<length; i++)
   1.212 +	{
   1.213 +		/* position values from gamma tables */
   1.214 +		vec_r = vec_splat(vec_r, 0);
   1.215 +		vec_g = vec_splat(vec_g, 0);
   1.216 +		vec_b = vec_splat(vec_b, 0);
   1.217 +
   1.218 +		/* gamma * matrix */
   1.219 +		vec_r = vec_madd(vec_r, mat0, min);
   1.220 +		vec_g = vec_madd(vec_g, mat1, min);
   1.221 +		vec_b = vec_madd(vec_b, mat2, min);
   1.222 +
   1.223 +		/* store alpha for this pixel; load alpha for next */
   1.224 +		dest[3] = alpha;
   1.225 +		alpha = src[3];
   1.226 +
   1.227 +		/* crunch, crunch, crunch */
   1.228 +		vec_r = vec_add(vec_r, vec_add(vec_g, vec_b));
   1.229 +		vec_r = vec_max(min, vec_r);
   1.230 +		vec_r = vec_min(max, vec_r);
   1.231 +		result = vec_madd(vec_r, scale, min);
   1.232 +
   1.233 +		/* store calc'd output tables indices */
   1.234 +		vec_st(vec_ctu(vec_round(result), 0), 0, (vector unsigned int*)output);
   1.235 +
   1.236 +		/* load gamma values for next loop while store completes */
   1.237 +		vec_r = load_aligned_float((float*)&igtbl_r[src[0]]);
   1.238 +		vec_g = load_aligned_float((float*)&igtbl_r[src[1]]);
   1.239 +		vec_b = load_aligned_float((float*)&igtbl_r[src[2]]);
   1.240 +		src += 4;
   1.241 +
   1.242 +		/* use calc'd indices to output RGB values */
   1.243 +		dest[0] = otdata_r[output[0]];
   1.244 +		dest[1] = otdata_g[output[1]];
   1.245 +		dest[2] = otdata_b[output[2]];
   1.246 +		dest += 4;
   1.247 +	}
   1.248 +
   1.249 +	/* handle final (maybe only) pixel */
   1.250 +
   1.251 +	vec_r = vec_splat(vec_r, 0);
   1.252 +	vec_g = vec_splat(vec_g, 0);
   1.253 +	vec_b = vec_splat(vec_b, 0);
   1.254 +
   1.255 +	vec_r = vec_madd(vec_r, mat0, min);
   1.256 +	vec_g = vec_madd(vec_g, mat1, min);
   1.257 +	vec_b = vec_madd(vec_b, mat2, min);
   1.258 +
   1.259 +	dest[3] = alpha;
   1.260 +
   1.261 +	vec_r = vec_add(vec_r, vec_add(vec_g, vec_b));
   1.262 +	vec_r = vec_max(min, vec_r);
   1.263 +	vec_r = vec_min(max, vec_r);
   1.264 +	result = vec_madd(vec_r, scale, min);
   1.265 +
   1.266 +	vec_st(vec_ctu(vec_round(result), 0), 0, (vector unsigned int*)output);
   1.267 +
   1.268 +	dest[0] = otdata_r[output[0]];
   1.269 +	dest[1] = otdata_g[output[1]];
   1.270 +	dest[2] = otdata_b[output[2]];
   1.271 +}
   1.272 +

mercurial