gfx/qcms/transform.c

Tue, 06 Jan 2015 21:39:09 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Tue, 06 Jan 2015 21:39:09 +0100
branch
TOR_BUG_9701
changeset 8
97036ab72558
permissions
-rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 /* vim: set ts=8 sw=8 noexpandtab: */
     2 //  qcms
     3 //  Copyright (C) 2009 Mozilla Corporation
     4 //  Copyright (C) 1998-2007 Marti Maria
     5 //
     6 // Permission is hereby granted, free of charge, to any person obtaining 
     7 // a copy of this software and associated documentation files (the "Software"), 
     8 // to deal in the Software without restriction, including without limitation 
     9 // the rights to use, copy, modify, merge, publish, distribute, sublicense, 
    10 // and/or sell copies of the Software, and to permit persons to whom the Software 
    11 // is furnished to do so, subject to the following conditions:
    12 //
    13 // The above copyright notice and this permission notice shall be included in 
    14 // all copies or substantial portions of the Software.
    15 //
    16 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
    17 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO 
    18 // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
    19 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 
    20 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 
    21 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
    22 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    24 #include <stdlib.h>
    25 #include <math.h>
    26 #include <assert.h>
    27 #include <string.h> //memcpy
    28 #include "qcmsint.h"
    29 #include "chain.h"
    30 #include "matrix.h"
    31 #include "transform_util.h"
    33 /* for MSVC, GCC, Intel, and Sun compilers */
    34 #if defined(_M_IX86) || defined(__i386__) || defined(__i386) || defined(_M_AMD64) || defined(__x86_64__) || defined(__x86_64)
    35 #define X86
    36 #endif /* _M_IX86 || __i386__ || __i386 || _M_AMD64 || __x86_64__ || __x86_64 */
    38 /**
    39  * AltiVec detection for PowerPC CPUs
    40  * In case we have a method of detecting do the runtime detection.
    41  * Otherwise statically choose the AltiVec path in case the compiler
    42  * was told to build with AltiVec support.
    43  */
    44 #if (defined(__POWERPC__) || defined(__powerpc__))
    45 #if defined(__linux__)
    46 #include <unistd.h>
    47 #include <fcntl.h>
    48 #include <stdio.h>
    49 #include <elf.h>
    50 #include <linux/auxvec.h>
    51 #include <asm/cputable.h>
    52 #include <link.h>
    54 static inline qcms_bool have_altivec() {
    55 	static int available = -1;
    56 	int new_avail = 0;
    57         ElfW(auxv_t) auxv;
    58 	ssize_t count;
    59 	int fd, i;
    61 	if (available != -1)
    62 		return (available != 0 ? true : false);
    64 	fd = open("/proc/self/auxv", O_RDONLY);
    65 	if (fd < 0)
    66 		goto out;
    67 	do {
    68 		count = read(fd, &auxv, sizeof(auxv));
    69 		if (count < 0)
    70 			goto out_close;
    72 		if (auxv.a_type == AT_HWCAP) {
    73 			new_avail = !!(auxv.a_un.a_val & PPC_FEATURE_HAS_ALTIVEC);
    74 			goto out_close;
    75 		}
    76 	} while (auxv.a_type != AT_NULL);
    78 out_close:
    79 	close(fd);
    80 out:
    81 	available = new_avail;
    82 	return (available != 0 ? true : false);
    83 }
    84 #elif defined(__APPLE__) && defined(__MACH__)
    85 #include <sys/sysctl.h>
    87 /**
    88  * rip-off from ffmpeg AltiVec detection code.
    89  * this code also appears on Apple's AltiVec pages.
    90  */
    91 static inline qcms_bool have_altivec() {
    92 	int sels[2] = {CTL_HW, HW_VECTORUNIT};
    93 	static int available = -1;
    94 	size_t len = sizeof(available);
    95 	int err;
    97 	if (available != -1)
    98 		return (available != 0 ? true : false);
   100 	err = sysctl(sels, 2, &available, &len, NULL, 0);
   102 	if (err == 0)
   103 		if (available != 0)
   104 			return true;
   106 	return false;
   107 }
   108 #elif defined(__ALTIVEC__) || defined(__APPLE_ALTIVEC__)
   109 #define have_altivec() true
   110 #else
   111 #define have_altivec() false
   112 #endif
   113 #endif // (defined(__POWERPC__) || defined(__powerpc__))
   115 // Build a White point, primary chromas transfer matrix from RGB to CIE XYZ
   116 // This is just an approximation, I am not handling all the non-linear
   117 // aspects of the RGB to XYZ process, and assumming that the gamma correction
   118 // has transitive property in the tranformation chain.
   119 //
   120 // the alghoritm:
   121 //
   122 //            - First I build the absolute conversion matrix using
   123 //              primaries in XYZ. This matrix is next inverted
   124 //            - Then I eval the source white point across this matrix
   125 //              obtaining the coeficients of the transformation
   126 //            - Then, I apply these coeficients to the original matrix
   127 static struct matrix build_RGB_to_XYZ_transfer_matrix(qcms_CIE_xyY white, qcms_CIE_xyYTRIPLE primrs)
   128 {
   129 	struct matrix primaries;
   130 	struct matrix primaries_invert;
   131 	struct matrix result;
   132 	struct vector white_point;
   133 	struct vector coefs;
   135 	double xn, yn;
   136 	double xr, yr;
   137 	double xg, yg;
   138 	double xb, yb;
   140 	xn = white.x;
   141 	yn = white.y;
   143 	if (yn == 0.0)
   144 		return matrix_invalid();
   146 	xr = primrs.red.x;
   147 	yr = primrs.red.y;
   148 	xg = primrs.green.x;
   149 	yg = primrs.green.y;
   150 	xb = primrs.blue.x;
   151 	yb = primrs.blue.y;
   153 	primaries.m[0][0] = xr;
   154 	primaries.m[0][1] = xg;
   155 	primaries.m[0][2] = xb;
   157 	primaries.m[1][0] = yr;
   158 	primaries.m[1][1] = yg;
   159 	primaries.m[1][2] = yb;
   161 	primaries.m[2][0] = 1 - xr - yr;
   162 	primaries.m[2][1] = 1 - xg - yg;
   163 	primaries.m[2][2] = 1 - xb - yb;
   164 	primaries.invalid = false;
   166 	white_point.v[0] = xn/yn;
   167 	white_point.v[1] = 1.;
   168 	white_point.v[2] = (1.0-xn-yn)/yn;
   170 	primaries_invert = matrix_invert(primaries);
   172 	coefs = matrix_eval(primaries_invert, white_point);
   174 	result.m[0][0] = coefs.v[0]*xr;
   175 	result.m[0][1] = coefs.v[1]*xg;
   176 	result.m[0][2] = coefs.v[2]*xb;
   178 	result.m[1][0] = coefs.v[0]*yr;
   179 	result.m[1][1] = coefs.v[1]*yg;
   180 	result.m[1][2] = coefs.v[2]*yb;
   182 	result.m[2][0] = coefs.v[0]*(1.-xr-yr);
   183 	result.m[2][1] = coefs.v[1]*(1.-xg-yg);
   184 	result.m[2][2] = coefs.v[2]*(1.-xb-yb);
   185 	result.invalid = primaries_invert.invalid;
   187 	return result;
   188 }
   190 struct CIE_XYZ {
   191 	double X;
   192 	double Y;
   193 	double Z;
   194 };
   196 /* CIE Illuminant D50 */
   197 static const struct CIE_XYZ D50_XYZ = {
   198 	0.9642,
   199 	1.0000,
   200 	0.8249
   201 };
   203 /* from lcms: xyY2XYZ()
   204  * corresponds to argyll: icmYxy2XYZ() */
   205 static struct CIE_XYZ xyY2XYZ(qcms_CIE_xyY source)
   206 {
   207 	struct CIE_XYZ dest;
   208 	dest.X = (source.x / source.y) * source.Y;
   209 	dest.Y = source.Y;
   210 	dest.Z = ((1 - source.x - source.y) / source.y) * source.Y;
   211 	return dest;
   212 }
   214 /* from lcms: ComputeChromaticAdaption */
   215 // Compute chromatic adaption matrix using chad as cone matrix
   216 static struct matrix
   217 compute_chromatic_adaption(struct CIE_XYZ source_white_point,
   218                            struct CIE_XYZ dest_white_point,
   219                            struct matrix chad)
   220 {
   221 	struct matrix chad_inv;
   222 	struct vector cone_source_XYZ, cone_source_rgb;
   223 	struct vector cone_dest_XYZ, cone_dest_rgb;
   224 	struct matrix cone, tmp;
   226 	tmp = chad;
   227 	chad_inv = matrix_invert(tmp);
   229 	cone_source_XYZ.v[0] = source_white_point.X;
   230 	cone_source_XYZ.v[1] = source_white_point.Y;
   231 	cone_source_XYZ.v[2] = source_white_point.Z;
   233 	cone_dest_XYZ.v[0] = dest_white_point.X;
   234 	cone_dest_XYZ.v[1] = dest_white_point.Y;
   235 	cone_dest_XYZ.v[2] = dest_white_point.Z;
   237 	cone_source_rgb = matrix_eval(chad, cone_source_XYZ);
   238 	cone_dest_rgb   = matrix_eval(chad, cone_dest_XYZ);
   240 	cone.m[0][0] = cone_dest_rgb.v[0]/cone_source_rgb.v[0];
   241 	cone.m[0][1] = 0;
   242 	cone.m[0][2] = 0;
   243 	cone.m[1][0] = 0;
   244 	cone.m[1][1] = cone_dest_rgb.v[1]/cone_source_rgb.v[1];
   245 	cone.m[1][2] = 0;
   246 	cone.m[2][0] = 0;
   247 	cone.m[2][1] = 0;
   248 	cone.m[2][2] = cone_dest_rgb.v[2]/cone_source_rgb.v[2];
   249 	cone.invalid = false;
   251 	// Normalize
   252 	return matrix_multiply(chad_inv, matrix_multiply(cone, chad));
   253 }
   255 /* from lcms: cmsAdaptionMatrix */
   256 // Returns the final chrmatic adaptation from illuminant FromIll to Illuminant ToIll
   257 // Bradford is assumed
   258 static struct matrix
   259 adaption_matrix(struct CIE_XYZ source_illumination, struct CIE_XYZ target_illumination)
   260 {
   261 	struct matrix lam_rigg = {{ // Bradford matrix
   262 	                         {  0.8951,  0.2664, -0.1614 },
   263 	                         { -0.7502,  1.7135,  0.0367 },
   264 	                         {  0.0389, -0.0685,  1.0296 }
   265 	                         }};
   266 	return compute_chromatic_adaption(source_illumination, target_illumination, lam_rigg);
   267 }
   269 /* from lcms: cmsAdaptMatrixToD50 */
   270 static struct matrix adapt_matrix_to_D50(struct matrix r, qcms_CIE_xyY source_white_pt)
   271 {
   272 	struct CIE_XYZ Dn;
   273 	struct matrix Bradford;
   275 	if (source_white_pt.y == 0.0)
   276 		return matrix_invalid();
   278 	Dn = xyY2XYZ(source_white_pt);
   280 	Bradford = adaption_matrix(Dn, D50_XYZ);
   281 	return matrix_multiply(Bradford, r);
   282 }
   284 qcms_bool set_rgb_colorants(qcms_profile *profile, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries)
   285 {
   286 	struct matrix colorants;
   287 	colorants = build_RGB_to_XYZ_transfer_matrix(white_point, primaries);
   288 	colorants = adapt_matrix_to_D50(colorants, white_point);
   290 	if (colorants.invalid)
   291 		return false;
   293 	/* note: there's a transpose type of operation going on here */
   294 	profile->redColorant.X = double_to_s15Fixed16Number(colorants.m[0][0]);
   295 	profile->redColorant.Y = double_to_s15Fixed16Number(colorants.m[1][0]);
   296 	profile->redColorant.Z = double_to_s15Fixed16Number(colorants.m[2][0]);
   298 	profile->greenColorant.X = double_to_s15Fixed16Number(colorants.m[0][1]);
   299 	profile->greenColorant.Y = double_to_s15Fixed16Number(colorants.m[1][1]);
   300 	profile->greenColorant.Z = double_to_s15Fixed16Number(colorants.m[2][1]);
   302 	profile->blueColorant.X = double_to_s15Fixed16Number(colorants.m[0][2]);
   303 	profile->blueColorant.Y = double_to_s15Fixed16Number(colorants.m[1][2]);
   304 	profile->blueColorant.Z = double_to_s15Fixed16Number(colorants.m[2][2]);
   306 	return true;
   307 }
   309 qcms_bool get_rgb_colorants(struct matrix *colorants, qcms_CIE_xyY white_point, qcms_CIE_xyYTRIPLE primaries)
   310 {
   311 	*colorants = build_RGB_to_XYZ_transfer_matrix(white_point, primaries);
   312 	*colorants = adapt_matrix_to_D50(*colorants, white_point);
   314 	return (colorants->invalid ? true : false);
   315 }
   317 #if 0
   318 static void qcms_transform_data_rgb_out_pow(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   319 {
   320 	int i;
   321 	float (*mat)[4] = transform->matrix;
   322 	for (i=0; i<length; i++) {
   323 		unsigned char device_r = *src++;
   324 		unsigned char device_g = *src++;
   325 		unsigned char device_b = *src++;
   327 		float linear_r = transform->input_gamma_table_r[device_r];
   328 		float linear_g = transform->input_gamma_table_g[device_g];
   329 		float linear_b = transform->input_gamma_table_b[device_b];
   331 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
   332 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
   333 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
   335 		float out_device_r = pow(out_linear_r, transform->out_gamma_r);
   336 		float out_device_g = pow(out_linear_g, transform->out_gamma_g);
   337 		float out_device_b = pow(out_linear_b, transform->out_gamma_b);
   339 		dest[OUTPUT_R_INDEX] = clamp_u8(255*out_device_r);
   340 		dest[OUTPUT_G_INDEX] = clamp_u8(255*out_device_g);
   341 		dest[OUTPUT_B_INDEX] = clamp_u8(255*out_device_b);
   342 		dest += RGB_OUTPUT_COMPONENTS;
   343 	}
   344 }
   345 #endif
   347 static void qcms_transform_data_gray_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   348 {
   349 	unsigned int i;
   350 	for (i = 0; i < length; i++) {
   351 		float out_device_r, out_device_g, out_device_b;
   352 		unsigned char device = *src++;
   354 		float linear = transform->input_gamma_table_gray[device];
   356                 out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
   357 		out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
   358 		out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
   360 		dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
   361 		dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
   362 		dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
   363 		dest += RGB_OUTPUT_COMPONENTS;
   364 	}
   365 }
   367 /* Alpha is not corrected.
   368    A rationale for this is found in Alvy Ray's "Should Alpha Be Nonlinear If
   369    RGB Is?" Tech Memo 17 (December 14, 1998).
   370 	See: ftp://ftp.alvyray.com/Acrobat/17_Nonln.pdf
   371 */
   373 static void qcms_transform_data_graya_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   374 {
   375 	unsigned int i;
   376 	for (i = 0; i < length; i++) {
   377 		float out_device_r, out_device_g, out_device_b;
   378 		unsigned char device = *src++;
   379 		unsigned char alpha = *src++;
   381 		float linear = transform->input_gamma_table_gray[device];
   383                 out_device_r = lut_interp_linear(linear, transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
   384 		out_device_g = lut_interp_linear(linear, transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
   385 		out_device_b = lut_interp_linear(linear, transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
   387 		dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
   388 		dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
   389 		dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
   390 		dest[OUTPUT_A_INDEX] = alpha;
   391 		dest += RGBA_OUTPUT_COMPONENTS;
   392 	}
   393 }
   396 static void qcms_transform_data_gray_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   397 {
   398 	unsigned int i;
   399 	for (i = 0; i < length; i++) {
   400 		unsigned char device = *src++;
   401 		uint16_t gray;
   403 		float linear = transform->input_gamma_table_gray[device];
   405 		/* we could round here... */
   406 		gray = linear * PRECACHE_OUTPUT_MAX;
   408 		dest[OUTPUT_R_INDEX] = transform->output_table_r->data[gray];
   409 		dest[OUTPUT_G_INDEX] = transform->output_table_g->data[gray];
   410 		dest[OUTPUT_B_INDEX] = transform->output_table_b->data[gray];
   411 		dest += RGB_OUTPUT_COMPONENTS;
   412 	}
   413 }
   415 static void qcms_transform_data_graya_out_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   416 {
   417 	unsigned int i;
   418 	for (i = 0; i < length; i++) {
   419 		unsigned char device = *src++;
   420 		unsigned char alpha = *src++;
   421 		uint16_t gray;
   423 		float linear = transform->input_gamma_table_gray[device];
   425 		/* we could round here... */
   426 		gray = linear * PRECACHE_OUTPUT_MAX;
   428 		dest[OUTPUT_R_INDEX] = transform->output_table_r->data[gray];
   429 		dest[OUTPUT_G_INDEX] = transform->output_table_g->data[gray];
   430 		dest[OUTPUT_B_INDEX] = transform->output_table_b->data[gray];
   431 		dest[OUTPUT_A_INDEX] = alpha;
   432 		dest += RGBA_OUTPUT_COMPONENTS;
   433 	}
   434 }
   436 static void qcms_transform_data_rgb_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   437 {
   438 	unsigned int i;
   439 	float (*mat)[4] = transform->matrix;
   440 	for (i = 0; i < length; i++) {
   441 		unsigned char device_r = *src++;
   442 		unsigned char device_g = *src++;
   443 		unsigned char device_b = *src++;
   444 		uint16_t r, g, b;
   446 		float linear_r = transform->input_gamma_table_r[device_r];
   447 		float linear_g = transform->input_gamma_table_g[device_g];
   448 		float linear_b = transform->input_gamma_table_b[device_b];
   450 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
   451 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
   452 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
   454 		out_linear_r = clamp_float(out_linear_r);
   455 		out_linear_g = clamp_float(out_linear_g);
   456 		out_linear_b = clamp_float(out_linear_b);
   458 		/* we could round here... */
   459 		r = out_linear_r * PRECACHE_OUTPUT_MAX;
   460 		g = out_linear_g * PRECACHE_OUTPUT_MAX;
   461 		b = out_linear_b * PRECACHE_OUTPUT_MAX;
   463 		dest[OUTPUT_R_INDEX] = transform->output_table_r->data[r];
   464 		dest[OUTPUT_G_INDEX] = transform->output_table_g->data[g];
   465 		dest[OUTPUT_B_INDEX] = transform->output_table_b->data[b];
   466 		dest += RGB_OUTPUT_COMPONENTS;
   467 	}
   468 }
   470 static void qcms_transform_data_rgba_out_lut_precache(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   471 {
   472 	unsigned int i;
   473 	float (*mat)[4] = transform->matrix;
   474 	for (i = 0; i < length; i++) {
   475 		unsigned char device_r = *src++;
   476 		unsigned char device_g = *src++;
   477 		unsigned char device_b = *src++;
   478 		unsigned char alpha = *src++;
   479 		uint16_t r, g, b;
   481 		float linear_r = transform->input_gamma_table_r[device_r];
   482 		float linear_g = transform->input_gamma_table_g[device_g];
   483 		float linear_b = transform->input_gamma_table_b[device_b];
   485 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
   486 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
   487 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
   489 		out_linear_r = clamp_float(out_linear_r);
   490 		out_linear_g = clamp_float(out_linear_g);
   491 		out_linear_b = clamp_float(out_linear_b);
   493 		/* we could round here... */
   494 		r = out_linear_r * PRECACHE_OUTPUT_MAX;
   495 		g = out_linear_g * PRECACHE_OUTPUT_MAX;
   496 		b = out_linear_b * PRECACHE_OUTPUT_MAX;
   498 		dest[OUTPUT_R_INDEX] = transform->output_table_r->data[r];
   499 		dest[OUTPUT_G_INDEX] = transform->output_table_g->data[g];
   500 		dest[OUTPUT_B_INDEX] = transform->output_table_b->data[b];
   501 		dest[OUTPUT_A_INDEX] = alpha;
   502 		dest += RGBA_OUTPUT_COMPONENTS;
   503 	}
   504 }
   506 // Not used
   507 /* 
   508 static void qcms_transform_data_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
   509 	unsigned int i;
   510 	int xy_len = 1;
   511 	int x_len = transform->grid_size;
   512 	int len = x_len * x_len;
   513 	float* r_table = transform->r_clut;
   514 	float* g_table = transform->g_clut;
   515 	float* b_table = transform->b_clut;
   517 	for (i = 0; i < length; i++) {
   518 		unsigned char in_r = *src++;
   519 		unsigned char in_g = *src++;
   520 		unsigned char in_b = *src++;
   521 		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
   523 		int x = floorf(linear_r * (transform->grid_size-1));
   524 		int y = floorf(linear_g * (transform->grid_size-1));
   525 		int z = floorf(linear_b * (transform->grid_size-1));
   526 		int x_n = ceilf(linear_r * (transform->grid_size-1));
   527 		int y_n = ceilf(linear_g * (transform->grid_size-1));
   528 		int z_n = ceilf(linear_b * (transform->grid_size-1));
   529 		float x_d = linear_r * (transform->grid_size-1) - x; 
   530 		float y_d = linear_g * (transform->grid_size-1) - y;
   531 		float z_d = linear_b * (transform->grid_size-1) - z; 
   533 		float r_x1 = lerp(CLU(r_table,x,y,z), CLU(r_table,x_n,y,z), x_d);
   534 		float r_x2 = lerp(CLU(r_table,x,y_n,z), CLU(r_table,x_n,y_n,z), x_d);
   535 		float r_y1 = lerp(r_x1, r_x2, y_d);
   536 		float r_x3 = lerp(CLU(r_table,x,y,z_n), CLU(r_table,x_n,y,z_n), x_d);
   537 		float r_x4 = lerp(CLU(r_table,x,y_n,z_n), CLU(r_table,x_n,y_n,z_n), x_d);
   538 		float r_y2 = lerp(r_x3, r_x4, y_d);
   539 		float clut_r = lerp(r_y1, r_y2, z_d);
   541 		float g_x1 = lerp(CLU(g_table,x,y,z), CLU(g_table,x_n,y,z), x_d);
   542 		float g_x2 = lerp(CLU(g_table,x,y_n,z), CLU(g_table,x_n,y_n,z), x_d);
   543 		float g_y1 = lerp(g_x1, g_x2, y_d);
   544 		float g_x3 = lerp(CLU(g_table,x,y,z_n), CLU(g_table,x_n,y,z_n), x_d);
   545 		float g_x4 = lerp(CLU(g_table,x,y_n,z_n), CLU(g_table,x_n,y_n,z_n), x_d);
   546 		float g_y2 = lerp(g_x3, g_x4, y_d);
   547 		float clut_g = lerp(g_y1, g_y2, z_d);
   549 		float b_x1 = lerp(CLU(b_table,x,y,z), CLU(b_table,x_n,y,z), x_d);
   550 		float b_x2 = lerp(CLU(b_table,x,y_n,z), CLU(b_table,x_n,y_n,z), x_d);
   551 		float b_y1 = lerp(b_x1, b_x2, y_d);
   552 		float b_x3 = lerp(CLU(b_table,x,y,z_n), CLU(b_table,x_n,y,z_n), x_d);
   553 		float b_x4 = lerp(CLU(b_table,x,y_n,z_n), CLU(b_table,x_n,y_n,z_n), x_d);
   554 		float b_y2 = lerp(b_x3, b_x4, y_d);
   555 		float clut_b = lerp(b_y1, b_y2, z_d);
   557 		*dest++ = clamp_u8(clut_r*255.0f);
   558 		*dest++ = clamp_u8(clut_g*255.0f);
   559 		*dest++ = clamp_u8(clut_b*255.0f);
   560 	}	
   561 }
   562 */
   564 static int int_div_ceil(int value, int div) {
   565 	return ((value  + div - 1) / div);
   566 }
   568 // Using lcms' tetra interpolation algorithm.
   569 static void qcms_transform_data_tetra_clut_rgba(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
   570 	unsigned int i;
   571 	int xy_len = 1;
   572 	int x_len = transform->grid_size;
   573 	int len = x_len * x_len;
   574 	float* r_table = transform->r_clut;
   575 	float* g_table = transform->g_clut;
   576 	float* b_table = transform->b_clut;
   577 	float c0_r, c1_r, c2_r, c3_r;
   578 	float c0_g, c1_g, c2_g, c3_g;
   579 	float c0_b, c1_b, c2_b, c3_b;
   580 	float clut_r, clut_g, clut_b;
   581 	for (i = 0; i < length; i++) {
   582 		unsigned char in_r = *src++;
   583 		unsigned char in_g = *src++;
   584 		unsigned char in_b = *src++;
   585 		unsigned char in_a = *src++;
   586 		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
   588 		int x = in_r * (transform->grid_size-1) / 255;
   589 		int y = in_g * (transform->grid_size-1) / 255;
   590 		int z = in_b * (transform->grid_size-1) / 255;
   591 		int x_n = int_div_ceil(in_r * (transform->grid_size-1), 255);
   592 		int y_n = int_div_ceil(in_g * (transform->grid_size-1), 255);
   593 		int z_n = int_div_ceil(in_b * (transform->grid_size-1), 255);
   594 		float rx = linear_r * (transform->grid_size-1) - x; 
   595 		float ry = linear_g * (transform->grid_size-1) - y;
   596 		float rz = linear_b * (transform->grid_size-1) - z; 
   598 		c0_r = CLU(r_table, x, y, z);
   599 		c0_g = CLU(g_table, x, y, z);
   600 		c0_b = CLU(b_table, x, y, z);
   602 		if( rx >= ry ) {
   603 			if (ry >= rz) { //rx >= ry && ry >= rz
   604 				c1_r = CLU(r_table, x_n, y, z) - c0_r;
   605 				c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
   606 				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
   607 				c1_g = CLU(g_table, x_n, y, z) - c0_g;
   608 				c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
   609 				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
   610 				c1_b = CLU(b_table, x_n, y, z) - c0_b;
   611 				c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
   612 				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
   613 			} else { 
   614 				if (rx >= rz) { //rx >= rz && rz >= ry
   615 					c1_r = CLU(r_table, x_n, y, z) - c0_r;
   616 					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
   617 					c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
   618 					c1_g = CLU(g_table, x_n, y, z) - c0_g;
   619 					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
   620 					c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
   621 					c1_b = CLU(b_table, x_n, y, z) - c0_b;
   622 					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
   623 					c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
   624 				} else { //rz > rx && rx >= ry
   625 					c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
   626 					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
   627 					c3_r = CLU(r_table, x, y, z_n) - c0_r;
   628 					c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
   629 					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
   630 					c3_g = CLU(g_table, x, y, z_n) - c0_g;
   631 					c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
   632 					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
   633 					c3_b = CLU(b_table, x, y, z_n) - c0_b;
   634 				}
   635 			}
   636 		} else {
   637 			if (rx >= rz) { //ry > rx && rx >= rz
   638 				c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
   639 				c2_r = CLU(r_table, x, y_n, z) - c0_r;
   640 				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
   641 				c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
   642 				c2_g = CLU(g_table, x, y_n, z) - c0_g;
   643 				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
   644 				c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
   645 				c2_b = CLU(b_table, x, y_n, z) - c0_b;
   646 				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
   647 			} else {
   648 				if (ry >= rz) { //ry >= rz && rz > rx 
   649 					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
   650 					c2_r = CLU(r_table, x, y_n, z) - c0_r;
   651 					c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
   652 					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
   653 					c2_g = CLU(g_table, x, y_n, z) - c0_g;
   654 					c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
   655 					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
   656 					c2_b = CLU(b_table, x, y_n, z) - c0_b;
   657 					c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
   658 				} else { //rz > ry && ry > rx
   659 					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
   660 					c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
   661 					c3_r = CLU(r_table, x, y, z_n) - c0_r;
   662 					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
   663 					c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
   664 					c3_g = CLU(g_table, x, y, z_n) - c0_g;
   665 					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
   666 					c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
   667 					c3_b = CLU(b_table, x, y, z_n) - c0_b;
   668 				}
   669 			}
   670 		}
   672 		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
   673 		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
   674 		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
   676 		dest[OUTPUT_R_INDEX] = clamp_u8(clut_r*255.0f);
   677 		dest[OUTPUT_G_INDEX] = clamp_u8(clut_g*255.0f);
   678 		dest[OUTPUT_B_INDEX] = clamp_u8(clut_b*255.0f);
   679 		dest[OUTPUT_A_INDEX] = in_a;
   680 		dest += RGBA_OUTPUT_COMPONENTS;
   681 	}	
   682 }
   684 // Using lcms' tetra interpolation code.
   685 static void qcms_transform_data_tetra_clut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) {
   686 	unsigned int i;
   687 	int xy_len = 1;
   688 	int x_len = transform->grid_size;
   689 	int len = x_len * x_len;
   690 	float* r_table = transform->r_clut;
   691 	float* g_table = transform->g_clut;
   692 	float* b_table = transform->b_clut;
   693 	float c0_r, c1_r, c2_r, c3_r;
   694 	float c0_g, c1_g, c2_g, c3_g;
   695 	float c0_b, c1_b, c2_b, c3_b;
   696 	float clut_r, clut_g, clut_b;
   697 	for (i = 0; i < length; i++) {
   698 		unsigned char in_r = *src++;
   699 		unsigned char in_g = *src++;
   700 		unsigned char in_b = *src++;
   701 		float linear_r = in_r/255.0f, linear_g=in_g/255.0f, linear_b = in_b/255.0f;
   703 		int x = in_r * (transform->grid_size-1) / 255;
   704 		int y = in_g * (transform->grid_size-1) / 255;
   705 		int z = in_b * (transform->grid_size-1) / 255;
   706 		int x_n = int_div_ceil(in_r * (transform->grid_size-1), 255);
   707 		int y_n = int_div_ceil(in_g * (transform->grid_size-1), 255);
   708 		int z_n = int_div_ceil(in_b * (transform->grid_size-1), 255);
   709 		float rx = linear_r * (transform->grid_size-1) - x;
   710 		float ry = linear_g * (transform->grid_size-1) - y;
   711 		float rz = linear_b * (transform->grid_size-1) - z;
   713 		c0_r = CLU(r_table, x, y, z);
   714 		c0_g = CLU(g_table, x, y, z);
   715 		c0_b = CLU(b_table, x, y, z);
   717 		if( rx >= ry ) {
   718 			if (ry >= rz) { //rx >= ry && ry >= rz
   719 				c1_r = CLU(r_table, x_n, y, z) - c0_r;
   720 				c2_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x_n, y, z);
   721 				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
   722 				c1_g = CLU(g_table, x_n, y, z) - c0_g;
   723 				c2_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x_n, y, z);
   724 				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
   725 				c1_b = CLU(b_table, x_n, y, z) - c0_b;
   726 				c2_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x_n, y, z);
   727 				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
   728 			} else { 
   729 				if (rx >= rz) { //rx >= rz && rz >= ry
   730 					c1_r = CLU(r_table, x_n, y, z) - c0_r;
   731 					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
   732 					c3_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x_n, y, z);
   733 					c1_g = CLU(g_table, x_n, y, z) - c0_g;
   734 					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
   735 					c3_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x_n, y, z);
   736 					c1_b = CLU(b_table, x_n, y, z) - c0_b;
   737 					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
   738 					c3_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x_n, y, z);
   739 				} else { //rz > rx && rx >= ry
   740 					c1_r = CLU(r_table, x_n, y, z_n) - CLU(r_table, x, y, z_n);
   741 					c2_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y, z_n);
   742 					c3_r = CLU(r_table, x, y, z_n) - c0_r;
   743 					c1_g = CLU(g_table, x_n, y, z_n) - CLU(g_table, x, y, z_n);
   744 					c2_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y, z_n);
   745 					c3_g = CLU(g_table, x, y, z_n) - c0_g;
   746 					c1_b = CLU(b_table, x_n, y, z_n) - CLU(b_table, x, y, z_n);
   747 					c2_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y, z_n);
   748 					c3_b = CLU(b_table, x, y, z_n) - c0_b;
   749 				}
   750 			}
   751 		} else {
   752 			if (rx >= rz) { //ry > rx && rx >= rz
   753 				c1_r = CLU(r_table, x_n, y_n, z) - CLU(r_table, x, y_n, z);
   754 				c2_r = CLU(r_table, x, y_n, z) - c0_r;
   755 				c3_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x_n, y_n, z);
   756 				c1_g = CLU(g_table, x_n, y_n, z) - CLU(g_table, x, y_n, z);
   757 				c2_g = CLU(g_table, x, y_n, z) - c0_g;
   758 				c3_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x_n, y_n, z);
   759 				c1_b = CLU(b_table, x_n, y_n, z) - CLU(b_table, x, y_n, z);
   760 				c2_b = CLU(b_table, x, y_n, z) - c0_b;
   761 				c3_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x_n, y_n, z);
   762 			} else {
   763 				if (ry >= rz) { //ry >= rz && rz > rx 
   764 					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
   765 					c2_r = CLU(r_table, x, y_n, z) - c0_r;
   766 					c3_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y_n, z);
   767 					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
   768 					c2_g = CLU(g_table, x, y_n, z) - c0_g;
   769 					c3_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y_n, z);
   770 					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
   771 					c2_b = CLU(b_table, x, y_n, z) - c0_b;
   772 					c3_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y_n, z);
   773 				} else { //rz > ry && ry > rx
   774 					c1_r = CLU(r_table, x_n, y_n, z_n) - CLU(r_table, x, y_n, z_n);
   775 					c2_r = CLU(r_table, x, y_n, z_n) - CLU(r_table, x, y, z_n);
   776 					c3_r = CLU(r_table, x, y, z_n) - c0_r;
   777 					c1_g = CLU(g_table, x_n, y_n, z_n) - CLU(g_table, x, y_n, z_n);
   778 					c2_g = CLU(g_table, x, y_n, z_n) - CLU(g_table, x, y, z_n);
   779 					c3_g = CLU(g_table, x, y, z_n) - c0_g;
   780 					c1_b = CLU(b_table, x_n, y_n, z_n) - CLU(b_table, x, y_n, z_n);
   781 					c2_b = CLU(b_table, x, y_n, z_n) - CLU(b_table, x, y, z_n);
   782 					c3_b = CLU(b_table, x, y, z_n) - c0_b;
   783 				}
   784 			}
   785 		}
   787 		clut_r = c0_r + c1_r*rx + c2_r*ry + c3_r*rz;
   788 		clut_g = c0_g + c1_g*rx + c2_g*ry + c3_g*rz;
   789 		clut_b = c0_b + c1_b*rx + c2_b*ry + c3_b*rz;
   791 		dest[OUTPUT_R_INDEX] = clamp_u8(clut_r*255.0f);
   792 		dest[OUTPUT_G_INDEX] = clamp_u8(clut_g*255.0f);
   793 		dest[OUTPUT_B_INDEX] = clamp_u8(clut_b*255.0f);
   794 		dest += RGB_OUTPUT_COMPONENTS;
   795 	}	
   796 }
   798 static void qcms_transform_data_rgb_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   799 {
   800 	unsigned int i;
   801 	float (*mat)[4] = transform->matrix;
   802 	for (i = 0; i < length; i++) {
   803 		unsigned char device_r = *src++;
   804 		unsigned char device_g = *src++;
   805 		unsigned char device_b = *src++;
   806 		float out_device_r, out_device_g, out_device_b;
   808 		float linear_r = transform->input_gamma_table_r[device_r];
   809 		float linear_g = transform->input_gamma_table_g[device_g];
   810 		float linear_b = transform->input_gamma_table_b[device_b];
   812 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
   813 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
   814 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
   816 		out_linear_r = clamp_float(out_linear_r);
   817 		out_linear_g = clamp_float(out_linear_g);
   818 		out_linear_b = clamp_float(out_linear_b);
   820 		out_device_r = lut_interp_linear(out_linear_r, 
   821 				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
   822 		out_device_g = lut_interp_linear(out_linear_g, 
   823 				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
   824 		out_device_b = lut_interp_linear(out_linear_b, 
   825 				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
   827 		dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
   828 		dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
   829 		dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
   830 		dest += RGB_OUTPUT_COMPONENTS;
   831 	}
   832 }
   834 static void qcms_transform_data_rgba_out_lut(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   835 {
   836 	unsigned int i;
   837 	float (*mat)[4] = transform->matrix;
   838 	for (i = 0; i < length; i++) {
   839 		unsigned char device_r = *src++;
   840 		unsigned char device_g = *src++;
   841 		unsigned char device_b = *src++;
   842 		unsigned char alpha = *src++;
   843 		float out_device_r, out_device_g, out_device_b;
   845 		float linear_r = transform->input_gamma_table_r[device_r];
   846 		float linear_g = transform->input_gamma_table_g[device_g];
   847 		float linear_b = transform->input_gamma_table_b[device_b];
   849 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
   850 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
   851 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
   853 		out_linear_r = clamp_float(out_linear_r);
   854 		out_linear_g = clamp_float(out_linear_g);
   855 		out_linear_b = clamp_float(out_linear_b);
   857 		out_device_r = lut_interp_linear(out_linear_r, 
   858 				transform->output_gamma_lut_r, transform->output_gamma_lut_r_length);
   859 		out_device_g = lut_interp_linear(out_linear_g, 
   860 				transform->output_gamma_lut_g, transform->output_gamma_lut_g_length);
   861 		out_device_b = lut_interp_linear(out_linear_b, 
   862 				transform->output_gamma_lut_b, transform->output_gamma_lut_b_length);
   864 		dest[OUTPUT_R_INDEX] = clamp_u8(out_device_r*255);
   865 		dest[OUTPUT_G_INDEX] = clamp_u8(out_device_g*255);
   866 		dest[OUTPUT_B_INDEX] = clamp_u8(out_device_b*255);
   867 		dest[OUTPUT_A_INDEX] = alpha;
   868 		dest += RGBA_OUTPUT_COMPONENTS;
   869 	}
   870 }
   872 #if 0
   873 static void qcms_transform_data_rgb_out_linear(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
   874 {
   875 	int i;
   876 	float (*mat)[4] = transform->matrix;
   877 	for (i = 0; i < length; i++) {
   878 		unsigned char device_r = *src++;
   879 		unsigned char device_g = *src++;
   880 		unsigned char device_b = *src++;
   882 		float linear_r = transform->input_gamma_table_r[device_r];
   883 		float linear_g = transform->input_gamma_table_g[device_g];
   884 		float linear_b = transform->input_gamma_table_b[device_b];
   886 		float out_linear_r = mat[0][0]*linear_r + mat[1][0]*linear_g + mat[2][0]*linear_b;
   887 		float out_linear_g = mat[0][1]*linear_r + mat[1][1]*linear_g + mat[2][1]*linear_b;
   888 		float out_linear_b = mat[0][2]*linear_r + mat[1][2]*linear_g + mat[2][2]*linear_b;
   890 		*dest++ = clamp_u8(out_linear_r*255);
   891 		*dest++ = clamp_u8(out_linear_g*255);
   892 		*dest++ = clamp_u8(out_linear_b*255);
   893 	}
   894 }
   895 #endif
   897 /*
   898  * If users create and destroy objects on different threads, even if the same
   899  * objects aren't used on different threads at the same time, we can still run
   900  * in to trouble with refcounts if they aren't atomic.
   901  *
   902  * This can lead to us prematurely deleting the precache if threads get unlucky
   903  * and write the wrong value to the ref count.
   904  */
   905 static struct precache_output *precache_reference(struct precache_output *p)
   906 {
   907 	qcms_atomic_increment(p->ref_count);
   908 	return p;
   909 }
   911 static struct precache_output *precache_create()
   912 {
   913 	struct precache_output *p = malloc(sizeof(struct precache_output));
   914 	if (p)
   915 		p->ref_count = 1;
   916 	return p;
   917 }
   919 void precache_release(struct precache_output *p)
   920 {
   921 	if (qcms_atomic_decrement(p->ref_count) == 0) {
   922 		free(p);
   923 	}
   924 }
   926 #ifdef HAS_POSIX_MEMALIGN
   927 static qcms_transform *transform_alloc(void)
   928 {
   929 	qcms_transform *t;
   930 	if (!posix_memalign(&t, 16, sizeof(*t))) {
   931 		return t;
   932 	} else {
   933 		return NULL;
   934 	}
   935 }
   936 static void transform_free(qcms_transform *t)
   937 {
   938 	free(t);
   939 }
   940 #else
   941 static qcms_transform *transform_alloc(void)
   942 {
   943 	/* transform needs to be aligned on a 16byte boundrary */
   944 	char *original_block = calloc(sizeof(qcms_transform) + sizeof(void*) + 16, 1);
   945 	/* make room for a pointer to the block returned by calloc */
   946 	void *transform_start = original_block + sizeof(void*);
   947 	/* align transform_start */
   948 	qcms_transform *transform_aligned = (qcms_transform*)(((uintptr_t)transform_start + 15) & ~0xf);
   950 	/* store a pointer to the block returned by calloc so that we can free it later */
   951 	void **(original_block_ptr) = (void**)transform_aligned;
   952 	if (!original_block)
   953 		return NULL;
   954 	original_block_ptr--;
   955 	*original_block_ptr = original_block;
   957 	return transform_aligned;
   958 }
   959 static void transform_free(qcms_transform *t)
   960 {
   961 	/* get at the pointer to the unaligned block returned by calloc */
   962 	void **p = (void**)t;
   963 	p--;
   964 	free(*p);
   965 }
   966 #endif
   968 void qcms_transform_release(qcms_transform *t)
   969 {
   970 	/* ensure we only free the gamma tables once even if there are
   971 	 * multiple references to the same data */
   973 	if (t->output_table_r)
   974 		precache_release(t->output_table_r);
   975 	if (t->output_table_g)
   976 		precache_release(t->output_table_g);
   977 	if (t->output_table_b)
   978 		precache_release(t->output_table_b);
   980 	free(t->input_gamma_table_r);
   981 	if (t->input_gamma_table_g != t->input_gamma_table_r)
   982 		free(t->input_gamma_table_g);
   983 	if (t->input_gamma_table_g != t->input_gamma_table_r &&
   984 	    t->input_gamma_table_g != t->input_gamma_table_b)
   985 		free(t->input_gamma_table_b);
   987 	free(t->input_gamma_table_gray);
   989 	free(t->output_gamma_lut_r);
   990 	free(t->output_gamma_lut_g);
   991 	free(t->output_gamma_lut_b);
   993 	transform_free(t);
   994 }
   996 #ifdef X86
   997 // Determine if we can build with SSE2 (this was partly copied from jmorecfg.h in
   998 // mozilla/jpeg)
   999  // -------------------------------------------------------------------------
  1000 #if defined(_M_IX86) && defined(_MSC_VER)
  1001 #define HAS_CPUID
  1002 /* Get us a CPUID function. Avoid clobbering EBX because sometimes it's the PIC
  1003    register - I'm not sure if that ever happens on windows, but cpuid isn't
  1004    on the critical path so we just preserve the register to be safe and to be
  1005    consistent with the non-windows version. */
  1006 static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
  1007        uint32_t a_, b_, c_, d_;
  1008        __asm {
  1009               xchg   ebx, esi
  1010               mov    eax, fxn
  1011               cpuid
  1012               mov    a_, eax
  1013               mov    b_, ebx
  1014               mov    c_, ecx
  1015               mov    d_, edx
  1016               xchg   ebx, esi
  1018        *a = a_;
  1019        *b = b_;
  1020        *c = c_;
  1021        *d = d_;
  1023 #elif (defined(__GNUC__) || defined(__SUNPRO_C)) && (defined(__i386__) || defined(__i386))
  1024 #define HAS_CPUID
  1025 /* Get us a CPUID function. We can't use ebx because it's the PIC register on
  1026    some platforms, so we use ESI instead and save ebx to avoid clobbering it. */
  1027 static void cpuid(uint32_t fxn, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
  1029 	uint32_t a_, b_, c_, d_;
  1030        __asm__ __volatile__ ("xchgl %%ebx, %%esi; cpuid; xchgl %%ebx, %%esi;" 
  1031                              : "=a" (a_), "=S" (b_), "=c" (c_), "=d" (d_) : "a" (fxn));
  1032 	   *a = a_;
  1033 	   *b = b_;
  1034 	   *c = c_;
  1035 	   *d = d_;
  1037 #endif
  1039 // -------------------------Runtime SSEx Detection-----------------------------
  1041 /* MMX is always supported per
  1042  *  Gecko v1.9.1 minimum CPU requirements */
  1043 #define SSE1_EDX_MASK (1UL << 25)
  1044 #define SSE2_EDX_MASK (1UL << 26)
  1045 #define SSE3_ECX_MASK (1UL <<  0)
  1047 static int sse_version_available(void)
  1049 #if defined(__x86_64__) || defined(__x86_64) || defined(_M_AMD64)
  1050 	/* we know at build time that 64-bit CPUs always have SSE2
  1051 	 * this tells the compiler that non-SSE2 branches will never be
  1052 	 * taken (i.e. OK to optimze away the SSE1 and non-SIMD code */
  1053 	return 2;
  1054 #elif defined(HAS_CPUID)
  1055 	static int sse_version = -1;
  1056 	uint32_t a, b, c, d;
  1057 	uint32_t function = 0x00000001;
  1059 	if (sse_version == -1) {
  1060 		sse_version = 0;
  1061 		cpuid(function, &a, &b, &c, &d);
  1062 		if (c & SSE3_ECX_MASK)
  1063 			sse_version = 3;
  1064 		else if (d & SSE2_EDX_MASK)
  1065 			sse_version = 2;
  1066 		else if (d & SSE1_EDX_MASK)
  1067 			sse_version = 1;
  1070 	return sse_version;
  1071 #else
  1072 	return 0;
  1073 #endif
  1075 #endif
  1077 static const struct matrix bradford_matrix = {{	{ 0.8951f, 0.2664f,-0.1614f},
  1078 						{-0.7502f, 1.7135f, 0.0367f},
  1079 						{ 0.0389f,-0.0685f, 1.0296f}}, 
  1080 						false};
  1082 static const struct matrix bradford_matrix_inv = {{ { 0.9869929f,-0.1470543f, 0.1599627f},
  1083 						    { 0.4323053f, 0.5183603f, 0.0492912f},
  1084 						    {-0.0085287f, 0.0400428f, 0.9684867f}}, 
  1085 						    false};
  1087 // See ICCv4 E.3
  1088 struct matrix compute_whitepoint_adaption(float X, float Y, float Z) {
  1089 	float p = (0.96422f*bradford_matrix.m[0][0] + 1.000f*bradford_matrix.m[1][0] + 0.82521f*bradford_matrix.m[2][0]) /
  1090 		  (X*bradford_matrix.m[0][0]      + Y*bradford_matrix.m[1][0]      + Z*bradford_matrix.m[2][0]     );
  1091 	float y = (0.96422f*bradford_matrix.m[0][1] + 1.000f*bradford_matrix.m[1][1] + 0.82521f*bradford_matrix.m[2][1]) /
  1092 		  (X*bradford_matrix.m[0][1]      + Y*bradford_matrix.m[1][1]      + Z*bradford_matrix.m[2][1]     );
  1093 	float b = (0.96422f*bradford_matrix.m[0][2] + 1.000f*bradford_matrix.m[1][2] + 0.82521f*bradford_matrix.m[2][2]) /
  1094 		  (X*bradford_matrix.m[0][2]      + Y*bradford_matrix.m[1][2]      + Z*bradford_matrix.m[2][2]     );
  1095 	struct matrix white_adaption = {{ {p,0,0}, {0,y,0}, {0,0,b}}, false};
  1096 	return matrix_multiply( bradford_matrix_inv, matrix_multiply(white_adaption, bradford_matrix) );
  1099 void qcms_profile_precache_output_transform(qcms_profile *profile)
  1101 	/* we only support precaching on rgb profiles */
  1102 	if (profile->color_space != RGB_SIGNATURE)
  1103 		return;
  1105 	if (qcms_supports_iccv4) {
  1106 		/* don't precache since we will use the B2A LUT */
  1107 		if (profile->B2A0)
  1108 			return;
  1110 		/* don't precache since we will use the mBA LUT */
  1111 		if (profile->mBA)
  1112 			return;
  1115 	/* don't precache if we do not have the TRC curves */
  1116 	if (!profile->redTRC || !profile->greenTRC || !profile->blueTRC)
  1117 		return;
  1119 	if (!profile->output_table_r) {
  1120 		profile->output_table_r = precache_create();
  1121 		if (profile->output_table_r &&
  1122 				!compute_precache(profile->redTRC, profile->output_table_r->data)) {
  1123 			precache_release(profile->output_table_r);
  1124 			profile->output_table_r = NULL;
  1127 	if (!profile->output_table_g) {
  1128 		profile->output_table_g = precache_create();
  1129 		if (profile->output_table_g &&
  1130 				!compute_precache(profile->greenTRC, profile->output_table_g->data)) {
  1131 			precache_release(profile->output_table_g);
  1132 			profile->output_table_g = NULL;
  1135 	if (!profile->output_table_b) {
  1136 		profile->output_table_b = precache_create();
  1137 		if (profile->output_table_b &&
  1138 				!compute_precache(profile->blueTRC, profile->output_table_b->data)) {
  1139 			precache_release(profile->output_table_b);
  1140 			profile->output_table_b = NULL;
  1145 /* Replace the current transformation with a LUT transformation using a given number of sample points */
  1146 qcms_transform* qcms_transform_precacheLUT_float(qcms_transform *transform, qcms_profile *in, qcms_profile *out, 
  1147                                                  int samples, qcms_data_type in_type)
  1149 	/* The range between which 2 consecutive sample points can be used to interpolate */
  1150 	uint16_t x,y,z;
  1151 	uint32_t l;
  1152 	uint32_t lutSize = 3 * samples * samples * samples;
  1153 	float* src = NULL;
  1154 	float* dest = NULL;
  1155 	float* lut = NULL;
  1157 	src = malloc(lutSize*sizeof(float));
  1158 	dest = malloc(lutSize*sizeof(float));
  1160 	if (src && dest) {
  1161 		/* Prepare a list of points we want to sample */
  1162 		l = 0;
  1163 		for (x = 0; x < samples; x++) {
  1164 			for (y = 0; y < samples; y++) {
  1165 				for (z = 0; z < samples; z++) {
  1166 					src[l++] = x / (float)(samples-1);
  1167 					src[l++] = y / (float)(samples-1);
  1168 					src[l++] = z / (float)(samples-1);
  1173 		lut = qcms_chain_transform(in, out, src, dest, lutSize);
  1174 		if (lut) {
  1175 			transform->r_clut = &lut[0];
  1176 			transform->g_clut = &lut[1];
  1177 			transform->b_clut = &lut[2];
  1178 			transform->grid_size = samples;
  1179 			if (in_type == QCMS_DATA_RGBA_8) {
  1180 				transform->transform_fn = qcms_transform_data_tetra_clut_rgba;
  1181 			} else {
  1182 				transform->transform_fn = qcms_transform_data_tetra_clut;
  1188 	//XXX: qcms_modular_transform_data may return either the src or dest buffer. If so it must not be free-ed
  1189 	if (src && lut != src) {
  1190 		free(src);
  1192 	if (dest && lut != dest) {
  1193 		free(dest);
  1196 	if (lut == NULL) {
  1197 		return NULL;
  1199 	return transform;
  1202 #define NO_MEM_TRANSFORM NULL
  1204 qcms_transform* qcms_transform_create(
  1205 		qcms_profile *in, qcms_data_type in_type,
  1206 		qcms_profile *out, qcms_data_type out_type,
  1207 		qcms_intent intent)
  1209 	bool precache = false;
  1211         qcms_transform *transform = transform_alloc();
  1212         if (!transform) {
  1213 		return NULL;
  1215 	if (out_type != QCMS_DATA_RGB_8 &&
  1216                 out_type != QCMS_DATA_RGBA_8) {
  1217             assert(0 && "output type");
  1218 	    transform_free(transform);
  1219             return NULL;
  1222 	if (out->output_table_r &&
  1223 			out->output_table_g &&
  1224 			out->output_table_b) {
  1225 		precache = true;
  1228 	// This precache assumes RGB_SIGNATURE (fails on GRAY_SIGNATURE, for instance)
  1229 	if (qcms_supports_iccv4 &&
  1230 			(in_type == QCMS_DATA_RGB_8 || in_type == QCMS_DATA_RGBA_8) &&
  1231 			(in->A2B0 || out->B2A0 || in->mAB || out->mAB))
  1233 		// Precache the transformation to a CLUT 33x33x33 in size.
  1234 		// 33 is used by many profiles and works well in pratice. 
  1235 		// This evenly divides 256 into blocks of 8x8x8.
  1236 		// TODO For transforming small data sets of about 200x200 or less
  1237 		// precaching should be avoided.
  1238 		qcms_transform *result = qcms_transform_precacheLUT_float(transform, in, out, 33, in_type);
  1239 		if (!result) {
  1240             		assert(0 && "precacheLUT failed");
  1241 			transform_free(transform);
  1242 			return NULL;
  1244 		return result;
  1247 	if (precache) {
  1248 		transform->output_table_r = precache_reference(out->output_table_r);
  1249 		transform->output_table_g = precache_reference(out->output_table_g);
  1250 		transform->output_table_b = precache_reference(out->output_table_b);
  1251 	} else {
  1252 		if (!out->redTRC || !out->greenTRC || !out->blueTRC) {
  1253 			qcms_transform_release(transform);
  1254 			return NO_MEM_TRANSFORM;
  1256 		build_output_lut(out->redTRC, &transform->output_gamma_lut_r, &transform->output_gamma_lut_r_length);
  1257 		build_output_lut(out->greenTRC, &transform->output_gamma_lut_g, &transform->output_gamma_lut_g_length);
  1258 		build_output_lut(out->blueTRC, &transform->output_gamma_lut_b, &transform->output_gamma_lut_b_length);
  1259 		if (!transform->output_gamma_lut_r || !transform->output_gamma_lut_g || !transform->output_gamma_lut_b) {
  1260 			qcms_transform_release(transform);
  1261 			return NO_MEM_TRANSFORM;
  1265         if (in->color_space == RGB_SIGNATURE) {
  1266 		struct matrix in_matrix, out_matrix, result;
  1268 		if (in_type != QCMS_DATA_RGB_8 &&
  1269                     in_type != QCMS_DATA_RGBA_8){
  1270                 	assert(0 && "input type");
  1271 			transform_free(transform);
  1272                 	return NULL;
  1274 		if (precache) {
  1275 #ifdef X86
  1276 		    if (sse_version_available() >= 2) {
  1277 			    if (in_type == QCMS_DATA_RGB_8)
  1278 				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse2;
  1279 			    else
  1280 				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse2;
  1282 #if !(defined(_MSC_VER) && defined(_M_AMD64))
  1283                     /* Microsoft Compiler for x64 doesn't support MMX.
  1284                      * SSE code uses MMX so that we disable on x64 */
  1285 		    } else
  1286 		    if (sse_version_available() >= 1) {
  1287 			    if (in_type == QCMS_DATA_RGB_8)
  1288 				    transform->transform_fn = qcms_transform_data_rgb_out_lut_sse1;
  1289 			    else
  1290 				    transform->transform_fn = qcms_transform_data_rgba_out_lut_sse1;
  1291 #endif
  1292 		    } else
  1293 #endif
  1294 #if (defined(__POWERPC__) || defined(__powerpc__))
  1295 		    if (have_altivec()) {
  1296 			    if (in_type == QCMS_DATA_RGB_8)
  1297 				    transform->transform_fn = qcms_transform_data_rgb_out_lut_altivec;
  1298 			    else
  1299 				    transform->transform_fn = qcms_transform_data_rgba_out_lut_altivec;
  1300 		    } else
  1301 #endif
  1303 				if (in_type == QCMS_DATA_RGB_8)
  1304 					transform->transform_fn = qcms_transform_data_rgb_out_lut_precache;
  1305 				else
  1306 					transform->transform_fn = qcms_transform_data_rgba_out_lut_precache;
  1308 		} else {
  1309 			if (in_type == QCMS_DATA_RGB_8)
  1310 				transform->transform_fn = qcms_transform_data_rgb_out_lut;
  1311 			else
  1312 				transform->transform_fn = qcms_transform_data_rgba_out_lut;
  1315 		//XXX: avoid duplicating tables if we can
  1316 		transform->input_gamma_table_r = build_input_gamma_table(in->redTRC);
  1317 		transform->input_gamma_table_g = build_input_gamma_table(in->greenTRC);
  1318 		transform->input_gamma_table_b = build_input_gamma_table(in->blueTRC);
  1319 		if (!transform->input_gamma_table_r || !transform->input_gamma_table_g || !transform->input_gamma_table_b) {
  1320 			qcms_transform_release(transform);
  1321 			return NO_MEM_TRANSFORM;
  1325 		/* build combined colorant matrix */
  1326 		in_matrix = build_colorant_matrix(in);
  1327 		out_matrix = build_colorant_matrix(out);
  1328 		out_matrix = matrix_invert(out_matrix);
  1329 		if (out_matrix.invalid) {
  1330 			qcms_transform_release(transform);
  1331 			return NULL;
  1333 		result = matrix_multiply(out_matrix, in_matrix);
  1335 		/* store the results in column major mode
  1336 		 * this makes doing the multiplication with sse easier */
  1337 		transform->matrix[0][0] = result.m[0][0];
  1338 		transform->matrix[1][0] = result.m[0][1];
  1339 		transform->matrix[2][0] = result.m[0][2];
  1340 		transform->matrix[0][1] = result.m[1][0];
  1341 		transform->matrix[1][1] = result.m[1][1];
  1342 		transform->matrix[2][1] = result.m[1][2];
  1343 		transform->matrix[0][2] = result.m[2][0];
  1344 		transform->matrix[1][2] = result.m[2][1];
  1345 		transform->matrix[2][2] = result.m[2][2];
  1347 	} else if (in->color_space == GRAY_SIGNATURE) {
  1348 		if (in_type != QCMS_DATA_GRAY_8 &&
  1349 				in_type != QCMS_DATA_GRAYA_8){
  1350 			assert(0 && "input type");
  1351 			transform_free(transform);
  1352 			return NULL;
  1355 		transform->input_gamma_table_gray = build_input_gamma_table(in->grayTRC);
  1356 		if (!transform->input_gamma_table_gray) {
  1357 			qcms_transform_release(transform);
  1358 			return NO_MEM_TRANSFORM;
  1361 		if (precache) {
  1362 			if (in_type == QCMS_DATA_GRAY_8) {
  1363 				transform->transform_fn = qcms_transform_data_gray_out_precache;
  1364 			} else {
  1365 				transform->transform_fn = qcms_transform_data_graya_out_precache;
  1367 		} else {
  1368 			if (in_type == QCMS_DATA_GRAY_8) {
  1369 				transform->transform_fn = qcms_transform_data_gray_out_lut;
  1370 			} else {
  1371 				transform->transform_fn = qcms_transform_data_graya_out_lut;
  1374 	} else {
  1375 		assert(0 && "unexpected colorspace");
  1376 		transform_free(transform);
  1377 		return NULL;
  1379 	return transform;
  1382 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
  1383 /* we need this to avoid crashes when gcc assumes the stack is 128bit aligned */
  1384 __attribute__((__force_align_arg_pointer__))
  1385 #endif
  1386 void qcms_transform_data(qcms_transform *transform, void *src, void *dest, size_t length)
  1388 	transform->transform_fn(transform, src, dest, length);
  1391 qcms_bool qcms_supports_iccv4;
  1392 void qcms_enable_iccv4()
  1394 	qcms_supports_iccv4 = true;

mercurial