gfx/qcms/transform.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/qcms/transform.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1395 @@
     1.4 +/* vim: set ts=8 sw=8 noexpandtab: */
     1.5 +//  qcms
     1.6 +//  Copyright (C) 2009 Mozilla Corporation
     1.7 +//  Copyright (C) 1998-2007 Marti Maria
     1.8 +//
     1.9 +// Permission is hereby granted, free of charge, to any person obtaining 
    1.10 +// a copy of this software and associated documentation files (the "Software"), 
    1.11 +// to deal in the Software without restriction, including without limitation 
    1.12 +// the rights to use, copy, modify, merge, publish, distribute, sublicense, 
    1.13 +// and/or sell copies of the Software, and to permit persons to whom the Software 
    1.14 +// is furnished to do so, subject to the following conditions:
    1.15 +//
    1.16 +// The above copyright notice and this permission notice shall be included in 
    1.17 +// all copies or substantial portions of the Software.
    1.18 +//
    1.19 +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
    1.20 +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 
    1.21 +// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
    1.22 +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    1.23 +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 
    1.24 +// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
    1.25 +// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    1.26 +
    1.27 +#include <stdlib.h>
    1.28 +#include <math.h>
    1.29 +#include <assert.h>
    1.30 +#include <string.h> //memcpy
    1.31 +#include "qcmsint.h"
    1.32 +#include "chain.h"
    1.33 +#include "matrix.h"
    1.34 +#include "transform_util.h"
    1.35 +
    1.36 +/* for MSVC, GCC, Intel, and Sun compilers */
    1.37 +#if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(_M_AMD64) || defined(__x86_64__) || defined(__x86_64)
    1.38 +#define X86
    1.39 +#endif /* _M_IX86 || __i386__ || __i386 || _M_AMD64 || __x86_64__ || __x86_64 */
    1.40 +
    1.41 +/**
    1.42 + * AltiVec detection for PowerPC CPUs
    1.43 + * In case we have a method of detecting do the runtime detection.
    1.44 + * Otherwise statically choose the AltiVec path in case the compiler
    1.45 + * was told to build with AltiVec support.
    1.46 + */
    1.47 +#if (defined(__POWERPC__) || defined(__powerpc__))
    1.48 +#if defined(__linux__)
    1.49 +#include <unistd.h>
    1.50 +#include <fcntl.h>
    1.51 +#include <stdio.h>
    1.52 +#include <elf.h>
    1.53 +#include <linux/auxvec.h>
    1.54 +#include <asm/cputable.h>
    1.55 +#include <link.h>
    1.56 +
    1.57 +static inline qcms_bool have_altivec() {
    1.58 +	static int available = -1;
    1.59 +	int new_avail = 0;
    1.60 +        ElfW(auxv_t) auxv;
    1.61 +	ssize_t count;
    1.62 +	int fd, i;
    1.63 +
    1.64 +	if (available != -1)
    1.65 +		return (available != 0 ? true : false);
    1.66 +
    1.67 +	fd = open("/proc/self/auxv", O_RDONLY);
    1.68 +	if (fd < 0)
    1.69 +		goto out;
    1.70 +	do {
    1.71 +		count = read(fd, &auxv, sizeof(auxv));
    1.72 +		if (count < 0)
    1.73 +			goto out_close;
    1.74 +
    1.75 +		if (auxv.a_type == AT_HWCAP) {
    1.76 +			new_avail = !!(auxv.a_un.a_val & PPC_FEATURE_HAS_ALTIVEC);
    1.77 +			goto out_close;
    1.78 +		}
    1.79 +	} while (auxv.a_type != AT_NULL);
    1.80 +
    1.81 +out_close:
    1.82 +	close(fd);
    1.83 +out:
    1.84 +	available = new_avail;
    1.85 +	return (available != 0 ? true : false);
    1.86 +}
    1.87 +#elif defined(__APPLE__) && defined(__MACH__)
    1.88 +#include <sys/sysctl.h>
    1.89 +
    1.90 +/**
    1.91 + * rip-off from ffmpeg AltiVec detection code.
    1.92 + * this code also appears on Apple's AltiVec pages.
    1.93 + */
    1.94 +static inline qcms_bool have_altivec() {
    1.95 +	int sels[2] = {CTL_HW, HW_VECTORUNIT};
    1.96 +	static int available = -1;
    1.97 +	size_t len = sizeof(available);
    1.98 +	int err;
    1.99 +
   1.100 +	if (available != -1)
   1.101 +		return (available != 0 ? true : false);
   1.102 +
   1.103 +	err = sysctl(sels, 2, &available, &len, NULL, 0);
   1.104 +
   1.105 +	if (err == 0)
   1.106 +		if (available != 0)
   1.107 +			return true;
   1.108 +
   1.109 +	return false;
   1.110 +}
   1.111 +#elif defined(__ALTIVEC__) || defined(__APPLE_ALTIVEC__)
   1.112 +#define have_altivec() true
   1.113 +#else
   1.114 +#define have_altivec() false
   1.115 +#endif
   1.116 +#endif // (defined(__POWERPC__) || defined(__powerpc__))
   1.117 +
   1.118 +// Build a White point, primary chromas transfer matrix from RGB to CIE XYZ
   1.119 +// This is just an approximation, I am not handling all the non-linear
   1.120 +// aspects of the RGB to XYZ process, and assumming that the gamma correction
   1.121 +// has transitive property in the tranformation chain.
   1.122 +//
   1.123 +// the alghoritm:
   1.124 +//
   1.125 +//            - First I build the absolute conversion matrix using
   1.126 +//              primaries in XYZ. This matrix is next inverted
   1.127 +//            - Then I eval the source white point across this matrix
   1.128 +//              obtaining the coeficients of the transformation
   1.129 +//            - Then, I apply these coeficients to the original matrix
   1.130 +static struct matrix build_RGB_to_XYZ_transfer_matrix(qcms_CIE_xyY white, qcms_CIE_xyYTRIPLE primrs)
   1.131 +{
   1.132 +	struct matrix primaries;
   1.133 +	struct matrix primaries_invert;
   1.134 +	struct matrix result;
   1.135 +	struct vector white_point;
   1.136 +	struct vector coefs;
   1.137 +
   1.138 +	double xn, yn;
   1.139 +	double xr, yr;
   1.140 +	double xg, yg;
   1.141 +	double xb, yb;
   1.142 +
   1.143 +	xn = white.x;
   1.144 +	yn = white.y;
   1.145 +
   1.146 +	if (yn == 0.0)
   1.147 +		return matrix_invalid();
   1.148 +
   1.149 +	xr = primrs.red.x;
   1.150 +	yr = primrs.red.y;
   1.151 +	xg = primrs.green.x;
   1.152 +	yg = primrs.green.y;
   1.153 +	xb = primrs.blue.x;
   1.154 +	yb = primrs.blue.y;
   1.155 +
   1.156 +	primaries.m[0][0] = xr;
   1.157 +	primaries.m[0][1] = xg;
   1.158 +	primaries.m[0][2] = xb;
   1.159 +
   1.160 +	primaries.m[1][0] = yr;
   1.161 +	primaries.m[1][1] = yg;
   1.162 +	primaries.m[1][2] = yb;
   1.163 +
   1.164 +	primaries.m[2][0] = 1 - xr - yr;
   1.165 +	primaries.m[2][1] = 1 - xg - yg;
   1.166 +	primaries.m[2][2] = 1 - xb - yb;
   1.167 +	primaries.invalid = false;
   1.168 +
   1.169 +	white_point.v[0] = xn/yn;
   1.170 +	white_point.v[1] = 1.;
   1.171 +	white_point.v[2] = (1.0-xn-yn)/yn;
   1.172 +
   1.173 +	primaries_invert = matrix_invert(primaries);
   1.174 +
   1.175 +	coefs = matrix_eval(primaries_invert, white_point);
   1.176 +
   1.177 +	result.m[0][0] = coefs.v[0]*xr;
   1.178 +	result.m[0][1] = coefs.v[1]*xg;
   1.179 +	result.m[0][2] = coefs.v[2]*xb;
   1.180 +
   1.181 +	result.m[1][0] = coefs.v[0]*yr;
   1.182 +	result.m[1][1] = coefs.v[1]*yg;
   1.183 +	result.m[1][2] = coefs.v[2]*yb;
   1.184 +
   1.185 +	result.m[2][0] = coefs.v[0]*(1.-xr-yr);
   1.186 +	result.m[2][1] = coefs.v[1]*(1.-xg-yg);
   1.187 +	result.m[2][2] = coefs.v[2]*(1.-xb-yb);
   1.188 +	result.invalid = primaries_invert.invalid;
   1.189 +
   1.190 +	return result;
   1.191 +}
   1.192 +
   1.193 +struct CIE_XYZ {
   1.194 +	double X;
   1.195 +	double Y;
   1.196 +	double Z;
   1.197 +};
   1.198 +
   1.199 +/* CIE Illuminant D50 */
   1.200 +static const struct CIE_XYZ D50_XYZ = {
   1.201 +	0.9642,
   1.202 +	1.0000,
   1.203 +	0.8249
   1.204 +};
   1.205 +
   1.206 +/* from lcms: xyY2XYZ()
   1.207 + * corresponds to argyll: icmYxy2XYZ() */
   1.208 +static struct CIE_XYZ xyY2XYZ(qcms_CIE_xyY source)
   1.209 +{
   1.210 +	struct CIE_XYZ dest;
   1.211 +	dest.X = (source.x / source.y) * source.Y;
   1.212 +	dest.Y = source.Y;
   1.213 +	dest.Z = ((1 - source.x - source.y) / source.y) * source.Y;
   1.214 +	return dest;
   1.215 +}
   1.216 +
   1.217 +/* from lcms: ComputeChromaticAdaption */
   1.218 +// Compute chromatic adaption matrix using chad as cone matrix
   1.219 +static struct matrix
   1.220 +compute_chromatic_adaption(struct CIE_XYZ source_white_point,
   1.221 +                           struct CIE_XYZ dest_white_point,
   1.222 +                           struct matrix chad)
   1.223 +{
   1.224 +	struct matrix chad_inv;
   1.225 +	struct vector cone_source_XYZ, cone_source_rgb;
   1.226 +	struct vector cone_dest_XYZ, cone_dest_rgb;
   1.227 +	struct matrix cone, tmp;
   1.228 +
   1.229 +	tmp = chad;
   1.230 +	chad_inv = matrix_invert(tmp);
   1.231 +
   1.232 +	cone_source_XYZ.v[0] = source_white_point.X;
   1.233 +	cone_source_XYZ.v[1] = source_white_point.Y;
   1.234 +	cone_source_XYZ.v[2] = source_white_point.Z;
   1.235 +
   1.236 +	cone_dest_XYZ.v[0] = dest_white_point.X;
   1.237 +	cone_dest_XYZ.v[1] = dest_white_point.Y;
   1.238 +	cone_dest_XYZ.v[2] = dest_white_point.Z;
   1.239 +
   1.240 +	cone_source_rgb = matrix_eval(chad, cone_source_XYZ);
   1.241 +	cone_dest_rgb   = matrix_eval(chad, cone_dest_XYZ);
   1.242 +
   1.243 +	cone.m[0][0] = cone_dest_rgb.v[0]/cone_source_rgb.v[0];
   1.244 +	cone.m[0][1] = 0;
   1.245 +	cone.m[0][2] = 0;
   1.246 +	cone.m[1][0] = 0;
   1.247 +	cone.m[1][1] = cone_dest_rgb.v[1]/cone_source_rgb.v[1];
   1.248 +	cone.m[1][2] = 0;
   1.249 +	cone.m[2][0] = 0;
   1.250 +	cone.m[2][1] = 0;
   1.251 +	cone.m[2][2] = cone_dest_rgb.v[2]/cone_source_rgb.v[2];
   1.252 +	cone.invalid = false;
   1.253 +
   1.254 +	// Normalize
   1.255 +	return matrix_multiply(chad_inv, matrix_multiply(cone, chad));
   1.256 +}
   1.257 +
   1.258 +/* from lcms: cmsAdaptionMatrix */
   1.259 +// Returns the final chrmatic adaptation from illuminant FromIll to Illuminant ToIll
   1.260 +// Bradford is assumed
   1.261 +static struct matrix
   1.262 +adaption_matrix(struct CIE_XYZ source_illumination, struct CIE_XYZ target_illumination)
   1.263 +{
   1.264 +	struct matrix lam_rigg = {{ // Bradford matrix
   1.265 +	                         {  0.8951,  0.2664, -0.1614 },
   1.266 +	                         { -0.7502,  1.7135,  0.0367 },
   1.267 +	                         {  0.0389, -0.0685,  1.0296 }
   1.268 +	                         }};
   1.269 +	return compute_chromatic_adaption(source_illumination, target_illumination, lam_rigg);
   1.270 +}
   1.271 +
   1.272 +/* from lcms: cmsAdaptMatrixToD50 */
   1.273 +static struct matrix adapt_matrix_to_D50(struct matrix r, qcms_CIE_xyY source_white_pt)
   1.274 +{
   1.275 +	struct CIE_XYZ Dn;
   1.276 +	struct matrix Bradford;
   1.277 +
   1.278 +	if (source_white_pt.y == 0.0)
   1.279 +		return matrix_invalid();
   1.280 +
   1.281 +	Dn = xyY2XYZ(source_white_pt);
   1.282 +
   1.283 +	Bradford = adaption_matrix(Dn, D50_XYZ);
   1.284 +	return matrix_multiply(Bradford, r);
   1.285 +}
   1.286 +
   1.287 +qcms_bool set_rgb_colorants(qcms_profile *profile, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries)
   1.288 +{
   1.289 +	struct matrix colorants;
   1.290 +	colorants = build_RGB_to_XYZ_transfer_matrix(white_point, primaries);
   1.291 +	colorants = adapt_matrix_to_D50(colorants, white_point);
   1.292 +
   1.293 +	if (colorants.invalid)
   1.294 +		return false;
   1.295 +
   1.296 +	/* note: there's a transpose type of operation going on here */
   1.297 +	profile->redColorant.X = double_to_s15Fixed16Number(colorants.m[0][0]);
   1.298 +	profile->redColorant.Y = double_to_s15Fixed16Number(colorants.m[1][0]);
   1.299 +	profile->redColorant.Z = double_to_s15Fixed16Number(colorants.m[2][0]);
   1.300 +
   1.301 +	profile->greenColorant.X = double_to_s15Fixed16Number(colorants.m[0][1]);
   1.302 +	profile->greenColorant.Y = double_to_s15Fixed16Number(colorants.m[1][1]);
   1.303 +	profile->greenColorant.Z = double_to_s15Fixed16Number(colorants.m[2][1]);
   1.304 +
   1.305 +	profile->blueColorant.X = double_to_s15Fixed16Number(colorants.m[0][2]);
   1.306 +	profile->blueColorant.Y = double_to_s15Fixed16Number(colorants.m[1][2]);
   1.307 +	profile->blueColorant.Z = double_to_s15Fixed16Number(colorants.m[2][2]);
   1.308 +
   1.309 +	return true;
   1.310 +}
   1.311 +
   1.312 +qcms_bool get_rgb_colorants(struct matrix *colorants, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries)
   1.313 +{
   1.314 +	*colorants = build_RGB_to_XYZ_transfer_matrix(white_point, primaries);
   1.315 +	*colorants = adapt_matrix_to_D50(*colorants, white_point);
   1.316 +
   1.317 +	return (colorants->invalid ? true : false);
   1.318 +}
   1.319 +
   1.320 +#if 0
   1.321 +static void qcms_transform_data_rgb_out_pow(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   1.322 +{
   1.323 +	int i;
   1.324 +	float (*mat)[4] = transform->matrix;
   1.325 +	for (i=0; i<length; i++) {
   1.326 +		unsigned char device_r = *src++;
   1.327 +		unsigned char device_g = *src++;
   1.328 +		unsigned char device_b = *src++;
   1.329 +
   1.330 +		float linear_r = transform->input_gamma_table_r[device_r];
   1.331 +		float linear_g = transform->input_gamma_table_g[device_g];
   1.332 +		float linear_b = transform->input_gamma_table_b[device_b];
   1.333 +
   1.334 +		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
   1.335 +		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
   1.336 +		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
   1.337 +
   1.338 +		float out_device_r = pow(out_linear_r, transform->out_gamma_r);
   1.339 +		float out_device_g = pow(out_linear_g, transform->out_gamma_g);
   1.340 +		float out_device_b = pow(out_linear_b, transform->out_gamma_b);
   1.341 +
   1.342 +		dest[OUTPUT_R_INDEX] = clamp_u8(255*out_device_r);
   1.343 +		dest[OUTPUT_G_INDEX] = clamp_u8(255*out_device_g);
   1.344 +		dest[OUTPUT_B_INDEX] = clamp_u8(255*out_device_b);
   1.345 +		dest += RGB_OUTPUT_COMPONENTS;
   1.346 +	}
   1.347 +}
   1.348 +#endif
   1.349 +
   1.350 +static void qcms_transform_data_gray_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   1.351 +{
   1.352 +	unsigned int i;
   1.353 +	for (i = 0; i < length; i++) {
   1.354 +		float out_device_r, out_device_g, out_device_b;
   1.355 +		unsigned char device = *src++;
   1.356 +
   1.357 +		float linear = transform->input_gamma_table_gray[device];
   1.358 +
   1.359 +                out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
   1.360 +		out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
   1.361 +		out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
   1.362 +
   1.363 +		dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
   1.364 +		dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
   1.365 +		dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
   1.366 +		dest += RGB_OUTPUT_COMPONENTS;
   1.367 +	}
   1.368 +}
   1.369 +
   1.370 +/* Alpha is not corrected.
   1.371 +   A rationale for this is found in Alvy Ray's "Should Alpha Be Nonlinear If
   1.372 +   RGB Is?" Tech Memo 17 (December 14, 1998).
   1.373 +	See: ftp://ftp.alvyray.com/Acrobat/17_Nonln.pdf
   1.374 +*/
   1.375 +
   1.376 +static void qcms_transform_data_graya_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   1.377 +{
   1.378 +	unsigned int i;
   1.379 +	for (i = 0; i < length; i++) {
   1.380 +		float out_device_r, out_device_g, out_device_b;
   1.381 +		unsigned char device = *src++;
   1.382 +		unsigned char alpha = *src++;
   1.383 +
   1.384 +		float linear = transform->input_gamma_table_gray[device];
   1.385 +
   1.386 +                out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
   1.387 +		out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
   1.388 +		out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
   1.389 +
   1.390 +		dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
   1.391 +		dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
   1.392 +		dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
   1.393 +		dest[OUTPUT_A_INDEX] = alpha;
   1.394 +		dest += RGBA_OUTPUT_COMPONENTS;
   1.395 +	}
   1.396 +}
   1.397 +
   1.398 +
   1.399 +static void qcms_transform_data_gray_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   1.400 +{
   1.401 +	unsigned int i;
   1.402 +	for (i = 0; i < length; i++) {
   1.403 +		unsigned char device = *src++;
   1.404 +		uint16_t gray;
   1.405 +
   1.406 +		float linear = transform->input_gamma_table_gray[device];
   1.407 +
   1.408 +		/* we could round here... */
   1.409 +		gray = linear * PRECACHE_OUTPUT_MAX;
   1.410 +
   1.411 +		dest[OUTPUT_R_INDEX] = transform->output_table_r->data[gray];
   1.412 +		dest[OUTPUT_G_INDEX] = transform->output_table_g->data[gray];
   1.413 +		dest[OUTPUT_B_INDEX] = transform->output_table_b->data[gray];
   1.414 +		dest += RGB_OUTPUT_COMPONENTS;
   1.415 +	}
   1.416 +}
   1.417 +
   1.418 +static void qcms_transform_data_graya_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   1.419 +{
   1.420 +	unsigned int i;
   1.421 +	for (i = 0; i < length; i++) {
   1.422 +		unsigned char device = *src++;
   1.423 +		unsigned char alpha = *src++;
   1.424 +		uint16_t gray;
   1.425 +
   1.426 +		float linear = transform->input_gamma_table_gray[device];
   1.427 +
   1.428 +		/* we could round here... */
   1.429 +		gray = linear * PRECACHE_OUTPUT_MAX;
   1.430 +
   1.431 +		dest[OUTPUT_R_INDEX] = transform->output_table_r->data[gray];
   1.432 +		dest[OUTPUT_G_INDEX] = transform->output_table_g->data[gray];
   1.433 +		dest[OUTPUT_B_INDEX] = transform->output_table_b->data[gray];
   1.434 +		dest[OUTPUT_A_INDEX] = alpha;
   1.435 +		dest += RGBA_OUTPUT_COMPONENTS;
   1.436 +	}
   1.437 +}
   1.438 +
   1.439 +static void qcms_transform_data_rgb_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   1.440 +{
   1.441 +	unsigned int i;
   1.442 +	float (*mat)[4] = transform->matrix;
   1.443 +	for (i = 0; i < length; i++) {
   1.444 +		unsigned char device_r = *src++;
   1.445 +		unsigned char device_g = *src++;
   1.446 +		unsigned char device_b = *src++;
   1.447 +		uint16_t r, g, b;
   1.448 +
   1.449 +		float linear_r = transform->input_gamma_table_r[device_r];
   1.450 +		float linear_g = transform->input_gamma_table_g[device_g];
   1.451 +		float linear_b = transform->input_gamma_table_b[device_b];
   1.452 +
   1.453 +		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
   1.454 +		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
   1.455 +		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
   1.456 +
   1.457 +		out_linear_r = clamp_float(out_linear_r);
   1.458 +		out_linear_g = clamp_float(out_linear_g);
   1.459 +		out_linear_b = clamp_float(out_linear_b);
   1.460 +
   1.461 +		/* we could round here... */
   1.462 +		r = out_linear_r * PRECACHE_OUTPUT_MAX;
   1.463 +		g = out_linear_g * PRECACHE_OUTPUT_MAX;
   1.464 +		b = out_linear_b * PRECACHE_OUTPUT_MAX;
   1.465 +
   1.466 +		dest[OUTPUT_R_INDEX] = transform->output_table_r->data[r];
   1.467 +		dest[OUTPUT_G_INDEX] = transform->output_table_g->data[g];
   1.468 +		dest[OUTPUT_B_INDEX] = transform->output_table_b->data[b];
   1.469 +		dest += RGB_OUTPUT_COMPONENTS;
   1.470 +	}
   1.471 +}
   1.472 +
   1.473 +static void qcms_transform_data_rgba_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   1.474 +{
   1.475 +	unsigned int i;
   1.476 +	float (*mat)[4] = transform->matrix;
   1.477 +	for (i = 0; i < length; i++) {
   1.478 +		unsigned char device_r = *src++;
   1.479 +		unsigned char device_g = *src++;
   1.480 +		unsigned char device_b = *src++;
   1.481 +		unsigned char alpha = *src++;
   1.482 +		uint16_t r, g, b;
   1.483 +
   1.484 +		float linear_r = transform->input_gamma_table_r[device_r];
   1.485 +		float linear_g = transform->input_gamma_table_g[device_g];
   1.486 +		float linear_b = transform->input_gamma_table_b[device_b];
   1.487 +
   1.488 +		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
   1.489 +		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
   1.490 +		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
   1.491 +
   1.492 +		out_linear_r = clamp_float(out_linear_r);
   1.493 +		out_linear_g = clamp_float(out_linear_g);
   1.494 +		out_linear_b = clamp_float(out_linear_b);
   1.495 +
   1.496 +		/* we could round here... */
   1.497 +		r = out_linear_r * PRECACHE_OUTPUT_MAX;
   1.498 +		g = out_linear_g * PRECACHE_OUTPUT_MAX;
   1.499 +		b = out_linear_b * PRECACHE_OUTPUT_MAX;
   1.500 +
   1.501 +		dest[OUTPUT_R_INDEX] = transform->output_table_r->data[r];
   1.502 +		dest[OUTPUT_G_INDEX] = transform->output_table_g->data[g];
   1.503 +		dest[OUTPUT_B_INDEX] = transform->output_table_b->data[b];
   1.504 +		dest[OUTPUT_A_INDEX] = alpha;
   1.505 +		dest += RGBA_OUTPUT_COMPONENTS;
   1.506 +	}
   1.507 +}
   1.508 +
   1.509 +// Not used
   1.510 +/* 
   1.511 +static void qcms_transform_data_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
   1.512 +	unsigned int i;
   1.513 +	int xy_len = 1;
   1.514 +	int x_len = transform->grid_size;
   1.515 +	int len = x_len * x_len;
   1.516 +	float* r_table = transform->r_clut;
   1.517 +	float* g_table = transform->g_clut;
   1.518 +	float* b_table = transform->b_clut;
   1.519 +  
   1.520 +	for (i = 0; i < length; i++) {
   1.521 +		unsigned char in_r = *src++;
   1.522 +		unsigned char in_g = *src++;
   1.523 +		unsigned char in_b = *src++;
   1.524 +		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
   1.525 +
   1.526 +		int x = floorf(linear_r * (transform->grid_size-1));
   1.527 +		int y = floorf(linear_g * (transform->grid_size-1));
   1.528 +		int z = floorf(linear_b * (transform->grid_size-1));
   1.529 +		int x_n = ceilf(linear_r * (transform->grid_size-1));
   1.530 +		int y_n = ceilf(linear_g * (transform->grid_size-1));
   1.531 +		int z_n = ceilf(linear_b * (transform->grid_size-1));
   1.532 +		float x_d = linear_r * (transform->grid_size-1) - x; 
   1.533 +		float y_d = linear_g * (transform->grid_size-1) - y;
   1.534 +		float z_d = linear_b * (transform->grid_size-1) - z; 
   1.535 +
   1.536 +		float r_x1 = lerp(CLU(r_table,x,y,z), CLU(r_table,x_n,y,z), x_d);
   1.537 +		float r_x2 = lerp(CLU(r_table,x,y_n,z), CLU(r_table,x_n,y_n,z), x_d);
   1.538 +		float r_y1 = lerp(r_x1, r_x2, y_d);
   1.539 +		float r_x3 = lerp(CLU(r_table,x,y,z_n), CLU(r_table,x_n,y,z_n), x_d);
   1.540 +		float r_x4 = lerp(CLU(r_table,x,y_n,z_n), CLU(r_table,x_n,y_n,z_n), x_d);
   1.541 +		float r_y2 = lerp(r_x3, r_x4, y_d);
   1.542 +		float clut_r = lerp(r_y1, r_y2, z_d);
   1.543 +
   1.544 +		float g_x1 = lerp(CLU(g_table,x,y,z), CLU(g_table,x_n,y,z), x_d);
   1.545 +		float g_x2 = lerp(CLU(g_table,x,y_n,z), CLU(g_table,x_n,y_n,z), x_d);
   1.546 +		float g_y1 = lerp(g_x1, g_x2, y_d);
   1.547 +		float g_x3 = lerp(CLU(g_table,x,y,z_n), CLU(g_table,x_n,y,z_n), x_d);
   1.548 +		float g_x4 = lerp(CLU(g_table,x,y_n,z_n), CLU(g_table,x_n,y_n,z_n), x_d);
   1.549 +		float g_y2 = lerp(g_x3, g_x4, y_d);
   1.550 +		float clut_g = lerp(g_y1, g_y2, z_d);
   1.551 +
   1.552 +		float b_x1 = lerp(CLU(b_table,x,y,z), CLU(b_table,x_n,y,z), x_d);
   1.553 +		float b_x2 = lerp(CLU(b_table,x,y_n,z), CLU(b_table,x_n,y_n,z), x_d);
   1.554 +		float b_y1 = lerp(b_x1, b_x2, y_d);
   1.555 +		float b_x3 = lerp(CLU(b_table,x,y,z_n), CLU(b_table,x_n,y,z_n), x_d);
   1.556 +		float b_x4 = lerp(CLU(b_table,x,y_n,z_n), CLU(b_table,x_n,y_n,z_n), x_d);
   1.557 +		float b_y2 = lerp(b_x3, b_x4, y_d);
   1.558 +		float clut_b = lerp(b_y1, b_y2, z_d);
   1.559 +
   1.560 +		*dest++ = clamp_u8(clut_r*255.0f);
   1.561 +		*dest++ = clamp_u8(clut_g*255.0f);
   1.562 +		*dest++ = clamp_u8(clut_b*255.0f);
   1.563 +	}	
   1.564 +}
   1.565 +*/
   1.566 +
   1.567 +static int int_div_ceil(int value, int div) {
   1.568 +	return ((value  + div - 1) / div);
   1.569 +}
   1.570 +
   1.571 +// Using lcms' tetra interpolation algorithm.
   1.572 +static void qcms_transform_data_tetra_clut_rgba(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
   1.573 +	unsigned int i;
   1.574 +	int xy_len = 1;
   1.575 +	int x_len = transform->grid_size;
   1.576 +	int len = x_len * x_len;
   1.577 +	float* r_table = transform->r_clut;
   1.578 +	float* g_table = transform->g_clut;
   1.579 +	float* b_table = transform->b_clut;
   1.580 +	float c0_r, c1_r, c2_r, c3_r;
   1.581 +	float c0_g, c1_g, c2_g, c3_g;
   1.582 +	float c0_b, c1_b, c2_b, c3_b;
   1.583 +	float clut_r, clut_g, clut_b;
   1.584 +	for (i = 0; i < length; i++) {
   1.585 +		unsigned char in_r = *src++;
   1.586 +		unsigned char in_g = *src++;
   1.587 +		unsigned char in_b = *src++;
   1.588 +		unsigned char in_a = *src++;
   1.589 +		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
   1.590 +
   1.591 +		int x = in_r * (transform->grid_size-1) / 255;
   1.592 +		int y = in_g * (transform->grid_size-1) / 255;
   1.593 +		int z = in_b * (transform->grid_size-1) / 255;
   1.594 +		int x_n = int_div_ceil(in_r * (transform->grid_size-1), 255);
   1.595 +		int y_n = int_div_ceil(in_g * (transform->grid_size-1), 255);
   1.596 +		int z_n = int_div_ceil(in_b * (transform->grid_size-1), 255);
   1.597 +		float rx = linear_r * (transform->grid_size-1) - x; 
   1.598 +		float ry = linear_g * (transform->grid_size-1) - y;
   1.599 +		float rz = linear_b * (transform->grid_size-1) - z; 
   1.600 +
   1.601 +		c0_r = CLU(r_table, x, y, z);
   1.602 +		c0_g = CLU(g_table, x, y, z);
   1.603 +		c0_b = CLU(b_table, x, y, z);
   1.604 +
   1.605 +		if( rx >= ry ) {
   1.606 +			if (ry >= rz) { //rx >= ry && ry >= rz
   1.607 +				c1_r = CLU(r_table, x_n, y, z) - c0_r;
   1.608 +				c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
   1.609 +				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
   1.610 +				c1_g = CLU(g_table, x_n, y, z) - c0_g;
   1.611 +				c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
   1.612 +				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
   1.613 +				c1_b = CLU(b_table, x_n, y, z) - c0_b;
   1.614 +				c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
   1.615 +				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
   1.616 +			} else { 
   1.617 +				if (rx >= rz) { //rx >= rz && rz >= ry
   1.618 +					c1_r = CLU(r_table, x_n, y, z) - c0_r;
   1.619 +					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
   1.620 +					c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
   1.621 +					c1_g = CLU(g_table, x_n, y, z) - c0_g;
   1.622 +					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
   1.623 +					c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
   1.624 +					c1_b = CLU(b_table, x_n, y, z) - c0_b;
   1.625 +					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
   1.626 +					c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
   1.627 +				} else { //rz > rx && rx >= ry
   1.628 +					c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
   1.629 +					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
   1.630 +					c3_r = CLU(r_table, x, y, z_n) - c0_r;
   1.631 +					c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
   1.632 +					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
   1.633 +					c3_g = CLU(g_table, x, y, z_n) - c0_g;
   1.634 +					c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
   1.635 +					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
   1.636 +					c3_b = CLU(b_table, x, y, z_n) - c0_b;
   1.637 +				}
   1.638 +			}
   1.639 +		} else {
   1.640 +			if (rx >= rz) { //ry > rx && rx >= rz
   1.641 +				c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
   1.642 +				c2_r = CLU(r_table, x, y_n, z) - c0_r;
   1.643 +				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
   1.644 +				c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
   1.645 +				c2_g = CLU(g_table, x, y_n, z) - c0_g;
   1.646 +				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
   1.647 +				c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
   1.648 +				c2_b = CLU(b_table, x, y_n, z) - c0_b;
   1.649 +				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
   1.650 +			} else {
   1.651 +				if (ry >= rz) { //ry >= rz && rz > rx 
   1.652 +					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
   1.653 +					c2_r = CLU(r_table, x, y_n, z) - c0_r;
   1.654 +					c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
   1.655 +					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
   1.656 +					c2_g = CLU(g_table, x, y_n, z) - c0_g;
   1.657 +					c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
   1.658 +					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
   1.659 +					c2_b = CLU(b_table, x, y_n, z) - c0_b;
   1.660 +					c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
   1.661 +				} else { //rz > ry && ry > rx
   1.662 +					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
   1.663 +					c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
   1.664 +					c3_r = CLU(r_table, x, y, z_n) - c0_r;
   1.665 +					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
   1.666 +					c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
   1.667 +					c3_g = CLU(g_table, x, y, z_n) - c0_g;
   1.668 +					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
   1.669 +					c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
   1.670 +					c3_b = CLU(b_table, x, y, z_n) - c0_b;
   1.671 +				}
   1.672 +			}
   1.673 +		}
   1.674 +				
   1.675 +		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
   1.676 +		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
   1.677 +		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
   1.678 +
   1.679 +		dest[OUTPUT_R_INDEX] = clamp_u8(clut_r*255.0f);
   1.680 +		dest[OUTPUT_G_INDEX] = clamp_u8(clut_g*255.0f);
   1.681 +		dest[OUTPUT_B_INDEX] = clamp_u8(clut_b*255.0f);
   1.682 +		dest[OUTPUT_A_INDEX] = in_a;
   1.683 +		dest += RGBA_OUTPUT_COMPONENTS;
   1.684 +	}	
   1.685 +}
   1.686 +
   1.687 +// Using lcms' tetra interpolation code.
   1.688 +static void qcms_transform_data_tetra_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
   1.689 +	unsigned int i;
   1.690 +	int xy_len = 1;
   1.691 +	int x_len = transform->grid_size;
   1.692 +	int len = x_len * x_len;
   1.693 +	float* r_table = transform->r_clut;
   1.694 +	float* g_table = transform->g_clut;
   1.695 +	float* b_table = transform->b_clut;
   1.696 +	float c0_r, c1_r, c2_r, c3_r;
   1.697 +	float c0_g, c1_g, c2_g, c3_g;
   1.698 +	float c0_b, c1_b, c2_b, c3_b;
   1.699 +	float clut_r, clut_g, clut_b;
   1.700 +	for (i = 0; i < length; i++) {
   1.701 +		unsigned char in_r = *src++;
   1.702 +		unsigned char in_g = *src++;
   1.703 +		unsigned char in_b = *src++;
   1.704 +		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
   1.705 +
   1.706 +		int x = in_r * (transform->grid_size-1) / 255;
   1.707 +		int y = in_g * (transform->grid_size-1) / 255;
   1.708 +		int z = in_b * (transform->grid_size-1) / 255;
   1.709 +		int x_n = int_div_ceil(in_r * (transform->grid_size-1), 255);
   1.710 +		int y_n = int_div_ceil(in_g * (transform->grid_size-1), 255);
   1.711 +		int z_n = int_div_ceil(in_b * (transform->grid_size-1), 255);
   1.712 +		float rx = linear_r * (transform->grid_size-1) - x;
   1.713 +		float ry = linear_g * (transform->grid_size-1) - y;
   1.714 +		float rz = linear_b * (transform->grid_size-1) - z;
   1.715 +
   1.716 +		c0_r = CLU(r_table, x, y, z);
   1.717 +		c0_g = CLU(g_table, x, y, z);
   1.718 +		c0_b = CLU(b_table, x, y, z);
   1.719 +
   1.720 +		if( rx >= ry ) {
   1.721 +			if (ry >= rz) { //rx >= ry && ry >= rz
   1.722 +				c1_r = CLU(r_table, x_n, y, z) - c0_r;
   1.723 +				c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
   1.724 +				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
   1.725 +				c1_g = CLU(g_table, x_n, y, z) - c0_g;
   1.726 +				c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
   1.727 +				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
   1.728 +				c1_b = CLU(b_table, x_n, y, z) - c0_b;
   1.729 +				c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
   1.730 +				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
   1.731 +			} else { 
   1.732 +				if (rx >= rz) { //rx >= rz && rz >= ry
   1.733 +					c1_r = CLU(r_table, x_n, y, z) - c0_r;
   1.734 +					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
   1.735 +					c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
   1.736 +					c1_g = CLU(g_table, x_n, y, z) - c0_g;
   1.737 +					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
   1.738 +					c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
   1.739 +					c1_b = CLU(b_table, x_n, y, z) - c0_b;
   1.740 +					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
   1.741 +					c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
   1.742 +				} else { //rz > rx && rx >= ry
   1.743 +					c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
   1.744 +					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
   1.745 +					c3_r = CLU(r_table, x, y, z_n) - c0_r;
   1.746 +					c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
   1.747 +					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
   1.748 +					c3_g = CLU(g_table, x, y, z_n) - c0_g;
   1.749 +					c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
   1.750 +					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
   1.751 +					c3_b = CLU(b_table, x, y, z_n) - c0_b;
   1.752 +				}
   1.753 +			}
   1.754 +		} else {
   1.755 +			if (rx >= rz) { //ry > rx && rx >= rz
   1.756 +				c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
   1.757 +				c2_r = CLU(r_table, x, y_n, z) - c0_r;
   1.758 +				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
   1.759 +				c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
   1.760 +				c2_g = CLU(g_table, x, y_n, z) - c0_g;
   1.761 +				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
   1.762 +				c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
   1.763 +				c2_b = CLU(b_table, x, y_n, z) - c0_b;
   1.764 +				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
   1.765 +			} else {
   1.766 +				if (ry >= rz) { //ry >= rz && rz > rx 
   1.767 +					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
   1.768 +					c2_r = CLU(r_table, x, y_n, z) - c0_r;
   1.769 +					c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
   1.770 +					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
   1.771 +					c2_g = CLU(g_table, x, y_n, z) - c0_g;
   1.772 +					c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
   1.773 +					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
   1.774 +					c2_b = CLU(b_table, x, y_n, z) - c0_b;
   1.775 +					c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
   1.776 +				} else { //rz > ry && ry > rx
   1.777 +					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
   1.778 +					c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
   1.779 +					c3_r = CLU(r_table, x, y, z_n) - c0_r;
   1.780 +					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
   1.781 +					c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
   1.782 +					c3_g = CLU(g_table, x, y, z_n) - c0_g;
   1.783 +					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
   1.784 +					c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
   1.785 +					c3_b = CLU(b_table, x, y, z_n) - c0_b;
   1.786 +				}
   1.787 +			}
   1.788 +		}
   1.789 +				
   1.790 +		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
   1.791 +		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
   1.792 +		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
   1.793 +
   1.794 +		dest[OUTPUT_R_INDEX] = clamp_u8(clut_r*255.0f);
   1.795 +		dest[OUTPUT_G_INDEX] = clamp_u8(clut_g*255.0f);
   1.796 +		dest[OUTPUT_B_INDEX] = clamp_u8(clut_b*255.0f);
   1.797 +		dest += RGB_OUTPUT_COMPONENTS;
   1.798 +	}	
   1.799 +}
   1.800 +
   1.801 +static void qcms_transform_data_rgb_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   1.802 +{
   1.803 +	unsigned int i;
   1.804 +	float (*mat)[4] = transform->matrix;
   1.805 +	for (i = 0; i < length; i++) {
   1.806 +		unsigned char device_r = *src++;
   1.807 +		unsigned char device_g = *src++;
   1.808 +		unsigned char device_b = *src++;
   1.809 +		float out_device_r, out_device_g, out_device_b;
   1.810 +
   1.811 +		float linear_r = transform->input_gamma_table_r[device_r];
   1.812 +		float linear_g = transform->input_gamma_table_g[device_g];
   1.813 +		float linear_b = transform->input_gamma_table_b[device_b];
   1.814 +
   1.815 +		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
   1.816 +		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
   1.817 +		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
   1.818 +
   1.819 +		out_linear_r = clamp_float(out_linear_r);
   1.820 +		out_linear_g = clamp_float(out_linear_g);
   1.821 +		out_linear_b = clamp_float(out_linear_b);
   1.822 +
   1.823 +		out_device_r = lut_interp_linear(out_linear_r, 
   1.824 +				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
   1.825 +		out_device_g = lut_interp_linear(out_linear_g, 
   1.826 +				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
   1.827 +		out_device_b = lut_interp_linear(out_linear_b, 
   1.828 +				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
   1.829 +
   1.830 +		dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
   1.831 +		dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
   1.832 +		dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
   1.833 +		dest += RGB_OUTPUT_COMPONENTS;
   1.834 +	}
   1.835 +}
   1.836 +
   1.837 +static void qcms_transform_data_rgba_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   1.838 +{
   1.839 +	unsigned int i;
   1.840 +	float (*mat)[4] = transform->matrix;
   1.841 +	for (i = 0; i < length; i++) {
   1.842 +		unsigned char device_r = *src++;
   1.843 +		unsigned char device_g = *src++;
   1.844 +		unsigned char device_b = *src++;
   1.845 +		unsigned char alpha = *src++;
   1.846 +		float out_device_r, out_device_g, out_device_b;
   1.847 +
   1.848 +		float linear_r = transform->input_gamma_table_r[device_r];
   1.849 +		float linear_g = transform->input_gamma_table_g[device_g];
   1.850 +		float linear_b = transform->input_gamma_table_b[device_b];
   1.851 +
   1.852 +		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
   1.853 +		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
   1.854 +		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
   1.855 +
   1.856 +		out_linear_r = clamp_float(out_linear_r);
   1.857 +		out_linear_g = clamp_float(out_linear_g);
   1.858 +		out_linear_b = clamp_float(out_linear_b);
   1.859 +
   1.860 +		out_device_r = lut_interp_linear(out_linear_r, 
   1.861 +				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
   1.862 +		out_device_g = lut_interp_linear(out_linear_g, 
   1.863 +				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
   1.864 +		out_device_b = lut_interp_linear(out_linear_b, 
   1.865 +				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
   1.866 +
   1.867 +		dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
   1.868 +		dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
   1.869 +		dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
   1.870 +		dest[OUTPUT_A_INDEX] = alpha;
   1.871 +		dest += RGBA_OUTPUT_COMPONENTS;
   1.872 +	}
   1.873 +}
   1.874 +
/* Compiled out: RGB conversion that applies the input gamma tables and the
 * matrix but writes the linear result directly, without any output gamma
 * stage. Kept for reference only. */
#if 0
static void qcms_transform_data_rgb_out_linear(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
	int i;
	float (*mat)[4] = transform->matrix;
	for (i = 0; i < length; i++) {
		unsigned char device_r = *src++;
		unsigned char device_g = *src++;
		unsigned char device_b = *src++;

		float linear_r = transform->input_gamma_table_r[device_r];
		float linear_g = transform->input_gamma_table_g[device_g];
		float linear_b = transform->input_gamma_table_b[device_b];

		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;

		/* no output gamma: the matrix result is scaled straight to a byte */
		*dest++ = clamp_u8(out_linear_r*255);
		*dest++ = clamp_u8(out_linear_g*255);
		*dest++ = clamp_u8(out_linear_b*255);
	}
}
#endif
   1.899 +
/*
 * If users create and destroy objects on different threads, even if the same
 * objects aren't used on different threads at the same time, we can still run
 * into trouble with refcounts if they aren't atomic.
 *
 * This can lead to us prematurely deleting the precache if threads get unlucky
 * and write the wrong value to the ref count.
 */
/* Take an additional reference on p: bumps p->ref_count through
 * qcms_atomic_increment() and returns p so the call can be used inline in an
 * assignment. p must not be NULL. */
static struct precache_output *precache_reference(struct precache_output *p)
{
	qcms_atomic_increment(p->ref_count);
	return p;
}
   1.913 +
   1.914 +static struct precache_output *precache_create()
   1.915 +{
   1.916 +	struct precache_output *p = malloc(sizeof(struct precache_output));
   1.917 +	if (p)
   1.918 +		p->ref_count = 1;
   1.919 +	return p;
   1.920 +}
   1.921 +
   1.922 +void precache_release(struct precache_output *p)
   1.923 +{
   1.924 +	if (qcms_atomic_decrement(p->ref_count) == 0) {
   1.925 +		free(p);
   1.926 +	}
   1.927 +}
   1.928 +
   1.929 +#ifdef HAS_POSIX_MEMALIGN
   1.930 +static qcms_transform *transform_alloc(void)
   1.931 +{
   1.932 +	qcms_transform *t;
   1.933 +	if (!posix_memalign(&t, 16, sizeof(*t))) {
   1.934 +		return t;
   1.935 +	} else {
   1.936 +		return NULL;
   1.937 +	}
   1.938 +}
   1.939 +static void transform_free(qcms_transform *t)
   1.940 +{
   1.941 +	free(t);
   1.942 +}
   1.943 +#else
   1.944 +static qcms_transform *transform_alloc(void)
   1.945 +{
   1.946 +	/* transform needs to be aligned on a 16byte boundrary */
   1.947 +	char *original_block = calloc(sizeof(qcms_transform) + sizeof(void*) + 16, 1);
   1.948 +	/* make room for a pointer to the block returned by calloc */
   1.949 +	void *transform_start = original_block + sizeof(void*);
   1.950 +	/* align transform_start */
   1.951 +	qcms_transform *transform_aligned = (qcms_transform*)(((uintptr_t)transform_start + 15) & ~0xf);
   1.952 +
   1.953 +	/* store a pointer to the block returned by calloc so that we can free it later */
   1.954 +	void **(original_block_ptr) = (void**)transform_aligned;
   1.955 +	if (!original_block)
   1.956 +		return NULL;
   1.957 +	original_block_ptr--;
   1.958 +	*original_block_ptr = original_block;
   1.959 +
   1.960 +	return transform_aligned;
   1.961 +}
   1.962 +static void transform_free(qcms_transform *t)
   1.963 +{
   1.964 +	/* get at the pointer to the unaligned block returned by calloc */
   1.965 +	void **p = (void**)t;
   1.966 +	p--;
   1.967 +	free(*p);
   1.968 +}
   1.969 +#endif
   1.970 +
   1.971 +void qcms_transform_release(qcms_transform *t)
   1.972 +{
   1.973 +	/* ensure we only free the gamma tables once even if there are
   1.974 +	 * multiple references to the same data */
   1.975 +
   1.976 +	if (t->output_table_r)
   1.977 +		precache_release(t->output_table_r);
   1.978 +	if (t->output_table_g)
   1.979 +		precache_release(t->output_table_g);
   1.980 +	if (t->output_table_b)
   1.981 +		precache_release(t->output_table_b);
   1.982 +
   1.983 +	free(t->input_gamma_table_r);
   1.984 +	if (t->input_gamma_table_g != t->input_gamma_table_r)
   1.985 +		free(t->input_gamma_table_g);
   1.986 +	if (t->input_gamma_table_g != t->input_gamma_table_r &&
   1.987 +	    t->input_gamma_table_g != t->input_gamma_table_b)
   1.988 +		free(t->input_gamma_table_b);
   1.989 +
   1.990 +	free(t->input_gamma_table_gray);
   1.991 +
   1.992 +	free(t->output_gamma_lut_r);
   1.993 +	free(t->output_gamma_lut_g);
   1.994 +	free(t->output_gamma_lut_b);
   1.995 +
   1.996 +	transform_free(t);
   1.997 +}
   1.998 +
#ifdef X86
// Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
// mozilla/jpeg)
 // -------------------------------------------------------------------------
#if defined(_M_IX86) && defined(_MSC_VER)
#define HAS_CPUID
/* Get us a CPUID function. Avoid clobbering EBX because sometimes it's the PIC
   register - I'm not sure if that ever happens on windows, but cpuid isn't
   on the critical path so we just preserve the register to be safe and to be
   consistent with the non-windows version. */
/* Runs the CPUID instruction with fxn in EAX and stores the resulting
   EAX, EBX, ECX and EDX values through a, b, c and d. */
static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
       uint32_t a_, b_, c_, d_;
       __asm {
              xchg   ebx, esi
              mov    eax, fxn
              cpuid
              mov    a_, eax
              mov    b_, ebx
              mov    c_, ecx
              mov    d_, edx
              xchg   ebx, esi
       }
       *a = a_;
       *b = b_;
       *c = c_;
       *d = d_;
}
#elif (defined(__GNUC__) || defined(__SUNPRO_C)) && (defined(__i386__) || defined(__i386))
#define HAS_CPUID
/* Get us a CPUID function. We can't use ebx because it's the PIC register on
   some platforms, so we use ESI instead and save ebx to avoid clobbering it. */
/* Runs the CPUID instruction with fxn in EAX and stores the resulting
   EAX, EBX, ECX and EDX values through a, b, c and d. The EBX result is read
   back from ESI ("=S") because of the exchange around the instruction. */
static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {

	uint32_t a_, b_, c_, d_;
       __asm__ __volatile__ ("xchgl %%ebx, %%esi; cpuid; xchgl %%ebx, %%esi;" 
                             : "=a" (a_), "=S" (b_), "=c" (c_), "=d" (d_) : "a" (fxn));
	   *a = a_;
	   *b = b_;
	   *c = c_;
	   *d = d_;
}
#endif
  1.1041 +
  1.1042 +// -------------------------Runtime SSEx Detection-----------------------------
  1.1043 +
  1.1044 +/* MMX is always supported per
  1.1045 + *  Gecko v1.9.1 minimum CPU requirements */
  1.1046 +#define SSE1_EDX_MASK (1UL << 25)
  1.1047 +#define SSE2_EDX_MASK (1UL << 26)
  1.1048 +#define SSE3_ECX_MASK (1UL <<  0)
  1.1049 +
  1.1050 +static int sse_version_available(void)
  1.1051 +{
  1.1052 +#if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
  1.1053 +	/* we know at build time that 64-bit CPUs always have SSE2
  1.1054 +	 * this tells the compiler that non-SSE2 branches will never be
  1.1055 +	 * taken (i.e. OK to optimze away the SSE1 and non-SIMD code */
  1.1056 +	return 2;
  1.1057 +#elif defined(HAS_CPUID)
  1.1058 +	static int sse_version = -1;
  1.1059 +	uint32_t a, b, c, d;
  1.1060 +	uint32_t function = 0x00000001;
  1.1061 +
  1.1062 +	if (sse_version == -1) {
  1.1063 +		sse_version = 0;
  1.1064 +		cpuid(function, &a, &b, &c, &d);
  1.1065 +		if (c & SSE3_ECX_MASK)
  1.1066 +			sse_version = 3;
  1.1067 +		else if (d & SSE2_EDX_MASK)
  1.1068 +			sse_version = 2;
  1.1069 +		else if (d & SSE1_EDX_MASK)
  1.1070 +			sse_version = 1;
  1.1071 +	}
  1.1072 +
  1.1073 +	return sse_version;
  1.1074 +#else
  1.1075 +	return 0;
  1.1076 +#endif
  1.1077 +}
  1.1078 +#endif
  1.1079 +
  1.1080 +static const struct matrix bradford_matrix = {{	{ 0.8951f, 0.2664f,-0.1614f},
  1.1081 +						{-0.7502f, 1.7135f, 0.0367f},
  1.1082 +						{ 0.0389f,-0.0685f, 1.0296f}}, 
  1.1083 +						false};
  1.1084 +
  1.1085 +static const struct matrix bradford_matrix_inv = {{ { 0.9869929f,-0.1470543f, 0.1599627f},
  1.1086 +						    { 0.4323053f, 0.5183603f, 0.0492912f},
  1.1087 +						    {-0.0085287f, 0.0400428f, 0.9684867f}}, 
  1.1088 +						    false};
  1.1089 +
  1.1090 +// See ICCv4 E.3
  1.1091 +struct matrix compute_whitepoint_adaption(float X, float Y, float Z) {
  1.1092 +	float p = (0.96422f*bradford_matrix.m[0][0] + 1.000f*bradford_matrix.m[1][0] + 0.82521f*bradford_matrix.m[2][0]) /
  1.1093 +		  (X*bradford_matrix.m[0][0]      + Y*bradford_matrix.m[1][0]      + Z*bradford_matrix.m[2][0]     );
  1.1094 +	float y = (0.96422f*bradford_matrix.m[0][1] + 1.000f*bradford_matrix.m[1][1] + 0.82521f*bradford_matrix.m[2][1]) /
  1.1095 +		  (X*bradford_matrix.m[0][1]      + Y*bradford_matrix.m[1][1]      + Z*bradford_matrix.m[2][1]     );
  1.1096 +	float b = (0.96422f*bradford_matrix.m[0][2] + 1.000f*bradford_matrix.m[1][2] + 0.82521f*bradford_matrix.m[2][2]) /
  1.1097 +		  (X*bradford_matrix.m[0][2]      + Y*bradford_matrix.m[1][2]      + Z*bradford_matrix.m[2][2]     );
  1.1098 +	struct matrix white_adaption = {{ {p,0,0}, {0,y,0}, {0,0,b}}, false};
  1.1099 +	return matrix_multiply( bradford_matrix_inv, matrix_multiply(white_adaption, bradford_matrix) );
  1.1100 +}
  1.1101 +
  1.1102 +void qcms_profile_precache_output_transform(qcms_profile *profile)
  1.1103 +{
  1.1104 +	/* we only support precaching on rgb profiles */
  1.1105 +	if (profile->color_space != RGB_SIGNATURE)
  1.1106 +		return;
  1.1107 +
  1.1108 +	if (qcms_supports_iccv4) {
  1.1109 +		/* don't precache since we will use the B2A LUT */
  1.1110 +		if (profile->B2A0)
  1.1111 +			return;
  1.1112 +
  1.1113 +		/* don't precache since we will use the mBA LUT */
  1.1114 +		if (profile->mBA)
  1.1115 +			return;
  1.1116 +	}
  1.1117 +
  1.1118 +	/* don't precache if we do not have the TRC curves */
  1.1119 +	if (!profile->redTRC || !profile->greenTRC || !profile->blueTRC)
  1.1120 +		return;
  1.1121 +
  1.1122 +	if (!profile->output_table_r) {
  1.1123 +		profile->output_table_r = precache_create();
  1.1124 +		if (profile->output_table_r &&
  1.1125 +				!compute_precache(profile->redTRC, profile->output_table_r->data)) {
  1.1126 +			precache_release(profile->output_table_r);
  1.1127 +			profile->output_table_r = NULL;
  1.1128 +		}
  1.1129 +	}
  1.1130 +	if (!profile->output_table_g) {
  1.1131 +		profile->output_table_g = precache_create();
  1.1132 +		if (profile->output_table_g &&
  1.1133 +				!compute_precache(profile->greenTRC, profile->output_table_g->data)) {
  1.1134 +			precache_release(profile->output_table_g);
  1.1135 +			profile->output_table_g = NULL;
  1.1136 +		}
  1.1137 +	}
  1.1138 +	if (!profile->output_table_b) {
  1.1139 +		profile->output_table_b = precache_create();
  1.1140 +		if (profile->output_table_b &&
  1.1141 +				!compute_precache(profile->blueTRC, profile->output_table_b->data)) {
  1.1142 +			precache_release(profile->output_table_b);
  1.1143 +			profile->output_table_b = NULL;
  1.1144 +		}
  1.1145 +	}
  1.1146 +}
  1.1147 +
  1.1148 +/* Replace the current transformation with a LUT transformation using a given number of sample points */
  1.1149 +qcms_transform* qcms_transform_precacheLUT_float(qcms_transform *transform, qcms_profile *in, qcms_profile *out, 
  1.1150 +                                                 int samples, qcms_data_type in_type)
  1.1151 +{
  1.1152 +	/* The range between which 2 consecutive sample points can be used to interpolate */
  1.1153 +	uint16_t x,y,z;
  1.1154 +	uint32_t l;
  1.1155 +	uint32_t lutSize = 3 * samples * samples * samples;
  1.1156 +	float* src = NULL;
  1.1157 +	float* dest = NULL;
  1.1158 +	float* lut = NULL;
  1.1159 +
  1.1160 +	src = malloc(lutSize*sizeof(float));
  1.1161 +	dest = malloc(lutSize*sizeof(float));
  1.1162 +
  1.1163 +	if (src && dest) {
  1.1164 +		/* Prepare a list of points we want to sample */
  1.1165 +		l = 0;
  1.1166 +		for (x = 0; x < samples; x++) {
  1.1167 +			for (y = 0; y < samples; y++) {
  1.1168 +				for (z = 0; z < samples; z++) {
  1.1169 +					src[l++] = x / (float)(samples-1);
  1.1170 +					src[l++] = y / (float)(samples-1);
  1.1171 +					src[l++] = z / (float)(samples-1);
  1.1172 +				}
  1.1173 +			}
  1.1174 +		}
  1.1175 +
  1.1176 +		lut = qcms_chain_transform(in, out, src, dest, lutSize);
  1.1177 +		if (lut) {
  1.1178 +			transform->r_clut = &lut[0];
  1.1179 +			transform->g_clut = &lut[1];
  1.1180 +			transform->b_clut = &lut[2];
  1.1181 +			transform->grid_size = samples;
  1.1182 +			if (in_type == QCMS_DATA_RGBA_8) {
  1.1183 +				transform->transform_fn = qcms_transform_data_tetra_clut_rgba;
  1.1184 +			} else {
  1.1185 +				transform->transform_fn = qcms_transform_data_tetra_clut;
  1.1186 +			}
  1.1187 +		}
  1.1188 +	}
  1.1189 +
  1.1190 +
  1.1191 +	//XXX: qcms_modular_transform_data may return either the src or dest buffer. If so it must not be free-ed
  1.1192 +	if (src && lut != src) {
  1.1193 +		free(src);
  1.1194 +	}
  1.1195 +	if (dest && lut != dest) {
  1.1196 +		free(dest);
  1.1197 +	}
  1.1198 +
  1.1199 +	if (lut == NULL) {
  1.1200 +		return NULL;
  1.1201 +	}
  1.1202 +	return transform;
  1.1203 +}
  1.1204 +
  1.1205 +#define NO_MEM_TRANSFORM NULL
  1.1206 +
  1.1207 +qcms_transform* qcms_transform_create(
  1.1208 +		qcms_profile *in, qcms_data_type in_type,
  1.1209 +		qcms_profile *out, qcms_data_type out_type,
  1.1210 +		qcms_intent intent)
  1.1211 +{
  1.1212 +	bool precache = false;
  1.1213 +
  1.1214 +        qcms_transform *transform = transform_alloc();
  1.1215 +        if (!transform) {
  1.1216 +		return NULL;
  1.1217 +	}
  1.1218 +	if (out_type != QCMS_DATA_RGB_8 &&
  1.1219 +                out_type != QCMS_DATA_RGBA_8) {
  1.1220 +            assert(0 && "output type");
  1.1221 +	    transform_free(transform);
  1.1222 +            return NULL;
  1.1223 +        }
  1.1224 +
  1.1225 +	if (out->output_table_r &&
  1.1226 +			out->output_table_g &&
  1.1227 +			out->output_table_b) {
  1.1228 +		precache = true;
  1.1229 +	}
  1.1230 +
  1.1231 +	// This precache assumes RGB_SIGNATURE (fails on GRAY_SIGNATURE, for instance)
  1.1232 +	if (qcms_supports_iccv4 &&
  1.1233 +			(in_type == QCMS_DATA_RGB_8 || in_type == QCMS_DATA_RGBA_8) &&
  1.1234 +			(in->A2B0 || out->B2A0 || in->mAB || out->mAB))
  1.1235 +		{
  1.1236 +		// Precache the transformation to a CLUT 33x33x33 in size.
  1.1237 +		// 33 is used by many profiles and works well in pratice. 
  1.1238 +		// This evenly divides 256 into blocks of 8x8x8.
  1.1239 +		// TODO For transforming small data sets of about 200x200 or less
  1.1240 +		// precaching should be avoided.
  1.1241 +		qcms_transform *result = qcms_transform_precacheLUT_float(transform, in, out, 33, in_type);
  1.1242 +		if (!result) {
  1.1243 +            		assert(0 && "precacheLUT failed");
  1.1244 +			transform_free(transform);
  1.1245 +			return NULL;
  1.1246 +		}
  1.1247 +		return result;
  1.1248 +	}
  1.1249 +
  1.1250 +	if (precache) {
  1.1251 +		transform->output_table_r = precache_reference(out->output_table_r);
  1.1252 +		transform->output_table_g = precache_reference(out->output_table_g);
  1.1253 +		transform->output_table_b = precache_reference(out->output_table_b);
  1.1254 +	} else {
  1.1255 +		if (!out->redTRC || !out->greenTRC || !out->blueTRC) {
  1.1256 +			qcms_transform_release(transform);
  1.1257 +			return NO_MEM_TRANSFORM;
  1.1258 +		}
  1.1259 +		build_output_lut(out->redTRC, &transform->output_gamma_lut_r, &transform->output_gamma_lut_r_length);
  1.1260 +		build_output_lut(out->greenTRC, &transform->output_gamma_lut_g, &transform->output_gamma_lut_g_length);
  1.1261 +		build_output_lut(out->blueTRC, &transform->output_gamma_lut_b, &transform->output_gamma_lut_b_length);
  1.1262 +		if (!transform->output_gamma_lut_r || !transform->output_gamma_lut_g || !transform->output_gamma_lut_b) {
  1.1263 +			qcms_transform_release(transform);
  1.1264 +			return NO_MEM_TRANSFORM;
  1.1265 +		}
  1.1266 +	}
  1.1267 +
  1.1268 +        if (in->color_space == RGB_SIGNATURE) {
  1.1269 +		struct matrix in_matrix, out_matrix, result;
  1.1270 +
  1.1271 +		if (in_type != QCMS_DATA_RGB_8 &&
  1.1272 +                    in_type != QCMS_DATA_RGBA_8){
  1.1273 +                	assert(0 && "input type");
  1.1274 +			transform_free(transform);
  1.1275 +                	return NULL;
  1.1276 +            	}
  1.1277 +		if (precache) {
  1.1278 +#ifdef X86
  1.1279 +		    if (sse_version_available() >= 2) {
  1.1280 +			    if (in_type == QCMS_DATA_RGB_8)
  1.1281 +				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse2;
  1.1282 +			    else
  1.1283 +				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse2;
  1.1284 +
  1.1285 +#if !(defined(_MSC_VER) && defined(_M_AMD64))
  1.1286 +                    /* Microsoft Compiler for x64 doesn't support MMX.
  1.1287 +                     * SSE code uses MMX so that we disable on x64 */
  1.1288 +		    } else
  1.1289 +		    if (sse_version_available() >= 1) {
  1.1290 +			    if (in_type == QCMS_DATA_RGB_8)
  1.1291 +				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse1;
  1.1292 +			    else
  1.1293 +				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse1;
  1.1294 +#endif
  1.1295 +		    } else
  1.1296 +#endif
  1.1297 +#if (defined(__POWERPC__) || defined(__powerpc__))
  1.1298 +		    if (have_altivec()) {
  1.1299 +			    if (in_type == QCMS_DATA_RGB_8)
  1.1300 +				    transform->transform_fn = qcms_transform_data_rgb_out_lut_altivec;
  1.1301 +			    else
  1.1302 +				    transform->transform_fn = qcms_transform_data_rgba_out_lut_altivec;
  1.1303 +		    } else
  1.1304 +#endif
  1.1305 +			{
  1.1306 +				if (in_type == QCMS_DATA_RGB_8)
  1.1307 +					transform->transform_fn = qcms_transform_data_rgb_out_lut_precache;
  1.1308 +				else
  1.1309 +					transform->transform_fn = qcms_transform_data_rgba_out_lut_precache;
  1.1310 +			}
  1.1311 +		} else {
  1.1312 +			if (in_type == QCMS_DATA_RGB_8)
  1.1313 +				transform->transform_fn = qcms_transform_data_rgb_out_lut;
  1.1314 +			else
  1.1315 +				transform->transform_fn = qcms_transform_data_rgba_out_lut;
  1.1316 +		}
  1.1317 +
  1.1318 +		//XXX: avoid duplicating tables if we can
  1.1319 +		transform->input_gamma_table_r = build_input_gamma_table(in->redTRC);
  1.1320 +		transform->input_gamma_table_g = build_input_gamma_table(in->greenTRC);
  1.1321 +		transform->input_gamma_table_b = build_input_gamma_table(in->blueTRC);
  1.1322 +		if (!transform->input_gamma_table_r || !transform->input_gamma_table_g || !transform->input_gamma_table_b) {
  1.1323 +			qcms_transform_release(transform);
  1.1324 +			return NO_MEM_TRANSFORM;
  1.1325 +		}
  1.1326 +
  1.1327 +
  1.1328 +		/* build combined colorant matrix */
  1.1329 +		in_matrix = build_colorant_matrix(in);
  1.1330 +		out_matrix = build_colorant_matrix(out);
  1.1331 +		out_matrix = matrix_invert(out_matrix);
  1.1332 +		if (out_matrix.invalid) {
  1.1333 +			qcms_transform_release(transform);
  1.1334 +			return NULL;
  1.1335 +		}
  1.1336 +		result = matrix_multiply(out_matrix, in_matrix);
  1.1337 +
  1.1338 +		/* store the results in column major mode
  1.1339 +		 * this makes doing the multiplication with sse easier */
  1.1340 +		transform->matrix[0][0] = result.m[0][0];
  1.1341 +		transform->matrix[1][0] = result.m[0][1];
  1.1342 +		transform->matrix[2][0] = result.m[0][2];
  1.1343 +		transform->matrix[0][1] = result.m[1][0];
  1.1344 +		transform->matrix[1][1] = result.m[1][1];
  1.1345 +		transform->matrix[2][1] = result.m[1][2];
  1.1346 +		transform->matrix[0][2] = result.m[2][0];
  1.1347 +		transform->matrix[1][2] = result.m[2][1];
  1.1348 +		transform->matrix[2][2] = result.m[2][2];
  1.1349 +
  1.1350 +	} else if (in->color_space == GRAY_SIGNATURE) {
  1.1351 +		if (in_type != QCMS_DATA_GRAY_8 &&
  1.1352 +				in_type != QCMS_DATA_GRAYA_8){
  1.1353 +			assert(0 && "input type");
  1.1354 +			transform_free(transform);
  1.1355 +			return NULL;
  1.1356 +		}
  1.1357 +
  1.1358 +		transform->input_gamma_table_gray = build_input_gamma_table(in->grayTRC);
  1.1359 +		if (!transform->input_gamma_table_gray) {
  1.1360 +			qcms_transform_release(transform);
  1.1361 +			return NO_MEM_TRANSFORM;
  1.1362 +		}
  1.1363 +
  1.1364 +		if (precache) {
  1.1365 +			if (in_type == QCMS_DATA_GRAY_8) {
  1.1366 +				transform->transform_fn = qcms_transform_data_gray_out_precache;
  1.1367 +			} else {
  1.1368 +				transform->transform_fn = qcms_transform_data_graya_out_precache;
  1.1369 +			}
  1.1370 +		} else {
  1.1371 +			if (in_type == QCMS_DATA_GRAY_8) {
  1.1372 +				transform->transform_fn = qcms_transform_data_gray_out_lut;
  1.1373 +			} else {
  1.1374 +				transform->transform_fn = qcms_transform_data_graya_out_lut;
  1.1375 +			}
  1.1376 +		}
  1.1377 +	} else {
  1.1378 +		assert(0 && "unexpected colorspace");
  1.1379 +		transform_free(transform);
  1.1380 +		return NULL;
  1.1381 +	}
  1.1382 +	return transform;
  1.1383 +}
  1.1384 +
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
/* we need this to avoid crashes when gcc assumes the stack is 128bit aligned */
__attribute__((__force_align_arg_pointer__))
#endif
/* Convert `length` pixels from src into dest by calling the routine stored in
 * transform->transform_fn. The pixel layouts of src and dest are whatever
 * that routine expects. */
void qcms_transform_data(qcms_transform *transform, void *src, void *dest, size_t length)
{
	transform->transform_fn(transform, src, dest, length);
}
  1.1393 +
/* Global switch consulted by qcms_profile_precache_output_transform() and
 * qcms_transform_create() to decide whether the ICC v4 LUT paths are used.
 * It has no initializer, so it starts out false. */
qcms_bool qcms_supports_iccv4;
/* Turn ICC v4 support on. Nothing in this part of the file turns it off again. */
void qcms_enable_iccv4()
{
	qcms_supports_iccv4 = true;
}

mercurial