/* vim: set ts=8 sw=8 noexpandtab: */
// qcms
// Copyright (C) 2009 Mozilla Corporation
// Copyright (C) 1998-2007 Marti Maria
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the "Software"),
// to deal in the Software without restriction, including without limitation
// the rights to use, copy, modify, merge, publish, distribute, sublicense,
// and/or sell copies of the Software, and to permit persons to whom the Software
// is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

#include <altivec.h>

#include "qcmsint.h"

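/* FLOATSCALE maps a gamma-corrected value in [0,1] onto an index into the
 * precached output tables; CLAMPMAXVAL is the largest input that still scales
 * to the last valid index (PRECACHE_OUTPUT_SIZE - 1), so results are clamped
 * to it before scaling. */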
#define FLOATSCALE (float)(PRECACHE_OUTPUT_SIZE)
#define CLAMPMAXVAL (((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE)
static const ALIGN float floatScaleX4 = FLOATSCALE;
static const ALIGN float clampMaxValueX4 = CLAMPMAXVAL;

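/* Load a single float from dataPtr, which need not be 16-byte aligned, and
 * rotate it into element 0 of the returned vector: vec_lde drops the value
 * into the lane selected by the address' low bits, and vec_lvsl/vec_perm
 * shift it to the front so the caller can vec_splat it across all lanes. */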
inline vector float load_aligned_float(float *dataPtr)
{
	vector float data = vec_lde(0, dataPtr);
	vector unsigned char moveToStart = vec_lvsl(0, dataPtr);
	return vec_perm(data, data, moveToStart);
}

void qcms_transform_data_rgb_out_lut_altivec(qcms_transform *transform,
                                             unsigned char *src,
                                             unsigned char *dest,
                                             size_t length)
{
	unsigned int i;
	float (*mat)[4] = transform->matrix;
	char input_back[32];
	/* Ensure we have a buffer that's 16 byte aligned regardless of the original
	 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
	 * because they don't work on stack variables. gcc 4.4 does do the right thing
	 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
	float const *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
	/* share input and output locations to save having to keep the
	 * locations in separate registers */
	uint32_t const *output = (uint32_t*)input;
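	/* The alignment above matters: vec_st ignores the low four bits of its
	 * effective address, so output must sit on a 16-byte boundary for the
	 * index store below to land in this buffer. */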

	/* deref *transform now to avoid it in loop */
	const float *igtbl_r = transform->input_gamma_table_r;
	const float *igtbl_g = transform->input_gamma_table_g;
	const float *igtbl_b = transform->input_gamma_table_b;

	/* deref *transform now to avoid it in loop */
	const uint8_t *otdata_r = &transform->output_table_r->data[0];
	const uint8_t *otdata_g = &transform->output_table_g->data[0];
	const uint8_t *otdata_b = &transform->output_table_b->data[0];

	/* input matrix values never change */
	const vector float mat0 = vec_ldl(0, (vector float*)mat[0]);
	const vector float mat1 = vec_ldl(0, (vector float*)mat[1]);
	const vector float mat2 = vec_ldl(0, (vector float*)mat[2]);

	/* these values don't change, either */
	const vector float max = vec_splat(vec_lde(0, (float*)&clampMaxValueX4), 0);
	const vector float min = (vector float)vec_splat_u32(0);
	const vector float scale = vec_splat(vec_lde(0, (float*)&floatScaleX4), 0);
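	/* min is all zeros; besides clamping, it serves as the zero addend for
	 * vec_madd, since classic AltiVec has no plain single-precision multiply. */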

	/* working variables */
	vector float vec_r, vec_g, vec_b, result;

	/* nothing to do for an empty buffer */
	if (!length)
		return;

	/* one pixel is handled outside of the loop */
	length--;

	/* setup for transforming 1st pixel */
	vec_r = load_aligned_float((float*)&igtbl_r[src[0]]);
	vec_g = load_aligned_float((float*)&igtbl_g[src[1]]);
	vec_b = load_aligned_float((float*)&igtbl_b[src[2]]);
	src += 3;

	/* transform all but final pixel */

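	/* The loop is software-pipelined: while the vector store of the current
	 * pixel's table indices completes, the gamma-table loads for the next
	 * pixel are issued, and the scalar output-table lookups then read the
	 * stored indices back. */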
	for (i=0; i<length; i++)
	{
		/* position values from gamma tables */
		vec_r = vec_splat(vec_r, 0);
		vec_g = vec_splat(vec_g, 0);
		vec_b = vec_splat(vec_b, 0);

		/* gamma * matrix */
		vec_r = vec_madd(vec_r, mat0, min);
		vec_g = vec_madd(vec_g, mat1, min);
		vec_b = vec_madd(vec_b, mat2, min);

		/* sum the matrix-row products, clamp to [0, max], scale to table indices */
		vec_r = vec_add(vec_r, vec_add(vec_g, vec_b));
		vec_r = vec_max(min, vec_r);
		vec_r = vec_min(max, vec_r);
		result = vec_madd(vec_r, scale, min);

		/* store calc'd output table indices */
		vec_st(vec_ctu(vec_round(result), 0), 0, (vector unsigned int*)output);

		/* load for next loop while store completes */
		vec_r = load_aligned_float((float*)&igtbl_r[src[0]]);
		vec_g = load_aligned_float((float*)&igtbl_g[src[1]]);
		vec_b = load_aligned_float((float*)&igtbl_b[src[2]]);
		src += 3;

		/* use calc'd indices to output RGB values */
		dest[0] = otdata_r[output[0]];
		dest[1] = otdata_g[output[1]];
		dest[2] = otdata_b[output[2]];
		dest += 3;
	}

	/* handle final (maybe only) pixel */

	vec_r = vec_splat(vec_r, 0);
	vec_g = vec_splat(vec_g, 0);
	vec_b = vec_splat(vec_b, 0);

	vec_r = vec_madd(vec_r, mat0, min);
	vec_g = vec_madd(vec_g, mat1, min);
	vec_b = vec_madd(vec_b, mat2, min);

	vec_r = vec_add(vec_r, vec_add(vec_g, vec_b));
	vec_r = vec_max(min, vec_r);
	vec_r = vec_min(max, vec_r);
	result = vec_madd(vec_r, scale, min);

	vec_st(vec_ctu(vec_round(result), 0), 0, (vector unsigned int*)output);

	dest[0] = otdata_r[output[0]];
	dest[1] = otdata_g[output[1]];
	dest[2] = otdata_b[output[2]];
}

void qcms_transform_data_rgba_out_lut_altivec(qcms_transform *transform,
                                              unsigned char *src,
                                              unsigned char *dest,
                                              size_t length)
{
	unsigned int i;
	float (*mat)[4] = transform->matrix;
	char input_back[32];
	/* Ensure we have a buffer that's 16 byte aligned regardless of the original
	 * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
	 * because they don't work on stack variables. gcc 4.4 does do the right thing
	 * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
	float const *input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
	/* share input and output locations to save having to keep the
	 * locations in separate registers */
	uint32_t const *output = (uint32_t*)input;

	/* deref *transform now to avoid it in loop */
	const float *igtbl_r = transform->input_gamma_table_r;
	const float *igtbl_g = transform->input_gamma_table_g;
	const float *igtbl_b = transform->input_gamma_table_b;

	/* deref *transform now to avoid it in loop */
	const uint8_t *otdata_r = &transform->output_table_r->data[0];
	const uint8_t *otdata_g = &transform->output_table_g->data[0];
	const uint8_t *otdata_b = &transform->output_table_b->data[0];

	/* input matrix values never change */
	const vector float mat0 = vec_ldl(0, (vector float*)mat[0]);
	const vector float mat1 = vec_ldl(0, (vector float*)mat[1]);
	const vector float mat2 = vec_ldl(0, (vector float*)mat[2]);

	/* these values don't change, either */
	const vector float max = vec_splat(vec_lde(0, (float*)&clampMaxValueX4), 0);
	const vector float min = (vector float)vec_splat_u32(0);
	const vector float scale = vec_splat(vec_lde(0, (float*)&floatScaleX4), 0);

	/* working variables */
	vector float vec_r, vec_g, vec_b, result;
	unsigned char alpha;
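	/* alpha is passed through unmodified; only R, G and B go through the
	 * gamma/matrix transform */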

	/* nothing to do for an empty buffer */
	if (!length)
		return;

	/* one pixel is handled outside of the loop */
	length--;

	/* setup for transforming 1st pixel */
	vec_r = load_aligned_float((float*)&igtbl_r[src[0]]);
	vec_g = load_aligned_float((float*)&igtbl_g[src[1]]);
	vec_b = load_aligned_float((float*)&igtbl_b[src[2]]);
	alpha = src[3];
	src += 4;

	/* transform all but final pixel */

	for (i=0; i<length; i++)
	{
		/* position values from gamma tables */
		vec_r = vec_splat(vec_r, 0);
		vec_g = vec_splat(vec_g, 0);
		vec_b = vec_splat(vec_b, 0);

		/* gamma * matrix */
		vec_r = vec_madd(vec_r, mat0, min);
		vec_g = vec_madd(vec_g, mat1, min);
		vec_b = vec_madd(vec_b, mat2, min);

		/* store alpha for this pixel; load alpha for next */
		dest[3] = alpha;
		alpha = src[3];

		/* sum the matrix-row products, clamp to [0, max], scale to table indices */
		vec_r = vec_add(vec_r, vec_add(vec_g, vec_b));
		vec_r = vec_max(min, vec_r);
		vec_r = vec_min(max, vec_r);
		result = vec_madd(vec_r, scale, min);

		/* store calc'd output table indices */
		vec_st(vec_ctu(vec_round(result), 0), 0, (vector unsigned int*)output);

		/* load gamma values for next loop while store completes */
		vec_r = load_aligned_float((float*)&igtbl_r[src[0]]);
		vec_g = load_aligned_float((float*)&igtbl_g[src[1]]);
		vec_b = load_aligned_float((float*)&igtbl_b[src[2]]);
		src += 4;

		/* use calc'd indices to output RGB values */
		dest[0] = otdata_r[output[0]];
		dest[1] = otdata_g[output[1]];
		dest[2] = otdata_b[output[2]];
		dest += 4;
	}

	/* handle final (maybe only) pixel */

	vec_r = vec_splat(vec_r, 0);
	vec_g = vec_splat(vec_g, 0);
	vec_b = vec_splat(vec_b, 0);

	vec_r = vec_madd(vec_r, mat0, min);
	vec_g = vec_madd(vec_g, mat1, min);
	vec_b = vec_madd(vec_b, mat2, min);

	dest[3] = alpha;

	vec_r = vec_add(vec_r, vec_add(vec_g, vec_b));
	vec_r = vec_max(min, vec_r);
	vec_r = vec_min(max, vec_r);
	result = vec_madd(vec_r, scale, min);

	vec_st(vec_ctu(vec_round(result), 0), 0, (vector unsigned int*)output);

	dest[0] = otdata_r[output[0]];
	dest[1] = otdata_g[output[1]];
	dest[2] = otdata_b[output[2]];
}