gfx/qcms/transform-sse2.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/qcms/transform-sse2.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,243 @@
     1.4 +#include <emmintrin.h>
     1.5 +
     1.6 +#include "qcmsint.h"
     1.7 +
     1.8 +/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
     1.9 +#define FLOATSCALE  ((float)(PRECACHE_OUTPUT_SIZE)) /* factor turning a clamped value into a table index */
    1.10 +#define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE ) /* upper clamp bound */
    1.11 +static const ALIGN float floatScaleX4[4] =
    1.12 +    { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
    1.13 +static const ALIGN float clampMaxValueX4[4] =
    1.14 +    { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};
    1.15 +
    1.16 +/* Convert `length` packed RGB pixels (3 bytes each at src) into dest, per pixel:
    1.17 + * input gamma tables -> matrix multiply -> clamp and scale -> output tables. */
    1.18 +void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
    1.19 +        unsigned char *src, unsigned char *dest, size_t length)
    1.20 +{
    1.21 +    size_t i; /* same type as length, so the counter cannot wrap before reaching it */
    1.22 +    float (*mat)[4] = transform->matrix;
    1.23 +    char input_back[32];
    1.24 +    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
    1.25 +     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
    1.26 +     * because they don't work on stack variables. gcc 4.4 does do the right thing
    1.27 +     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    1.28 +    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
    1.29 +    /* share input and output locations to save having to keep the
    1.30 +     * locations in separate registers */
    1.31 +    uint32_t const * output = (uint32_t*)input;
    1.32 +
    1.33 +    /* deref *transform now to avoid it in loop: input gamma tables, indexed by source byte */
    1.34 +    const float *igtbl_r = transform->input_gamma_table_r;
    1.35 +    const float *igtbl_g = transform->input_gamma_table_g;
    1.36 +    const float *igtbl_b = transform->input_gamma_table_b;
    1.37 +
    1.38 +    /* deref *transform now to avoid it in loop: output tables, indexed by converted result */
    1.39 +    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    1.40 +    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    1.41 +    const uint8_t *otdata_b = &transform->output_table_b->data[0];
    1.42 +
    1.43 +    /* input matrix values never change (aligned loads: rows must be 16-byte aligned) */
    1.44 +    const __m128 mat0  = _mm_load_ps(mat[0]);
    1.45 +    const __m128 mat1  = _mm_load_ps(mat[1]);
    1.46 +    const __m128 mat2  = _mm_load_ps(mat[2]);
    1.47 +
    1.48 +    /* these values don't change, either: clamp bounds and index scale factor */
    1.49 +    const __m128 max   = _mm_load_ps(clampMaxValueX4);
    1.50 +    const __m128 min   = _mm_setzero_ps();
    1.51 +    const __m128 scale = _mm_load_ps(floatScaleX4);
    1.52 +
    1.53 +    /* working variables */
    1.54 +    __m128 vec_r, vec_g, vec_b, result;
    1.55 +
    1.56 +    /* nothing to convert; also keeps the decrement below from wrapping */
    1.57 +    if (!length)
    1.58 +        return;
    1.59 +
    1.60 +    /* one pixel is handled outside of the loop */
    1.61 +    length--;
    1.62 +
    1.63 +    /* setup for transforming 1st pixel */
    1.64 +    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    1.65 +    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    1.66 +    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    1.67 +    src += 3;
    1.68 +
    1.69 +    /* transform all but final pixel; each pass also preloads the next pixel's
    1.70 +     * gamma values, which is why the last pixel is finished after the loop */
    1.71 +    for (i=0; i<length; i++)
    1.72 +    {
    1.73 +        /* position values from gamma tables: copy lane 0 into all four lanes */
    1.74 +        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    1.75 +        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    1.76 +        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
    1.77 +
    1.78 +        /* gamma * matrix */
    1.79 +        vec_r = _mm_mul_ps(vec_r, mat0);
    1.80 +        vec_g = _mm_mul_ps(vec_g, mat1);
    1.81 +        vec_b = _mm_mul_ps(vec_b, mat2);
    1.82 +
    1.83 +        /* sum the products, clamp to [0, CLAMPMAXVAL], scale up to table indices */
    1.84 +        vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    1.85 +        vec_r  = _mm_max_ps(vec_r, min); /* value first: NaN -> 0 (MAXPS returns 2nd operand) */
    1.86 +        vec_r  = _mm_min_ps(max, vec_r);
    1.87 +        result = _mm_mul_ps(vec_r, scale);
    1.88 +
    1.89 +        /* store calc'd output tables indices */
    1.90 +        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
    1.91 +
    1.92 +        /* load for next loop while store completes */
    1.93 +        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    1.94 +        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    1.95 +        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    1.96 +        src += 3;
    1.97 +
    1.98 +        /* use calc'd indices to output RGB values */
    1.99 +        dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
   1.100 +        dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
   1.101 +        dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
   1.102 +        dest += RGB_OUTPUT_COMPONENTS;
   1.103 +    }
   1.104 +
   1.105 +    /* handle final (maybe only) pixel: same steps as the loop body, minus the preload */
   1.106 +
   1.107 +    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
   1.108 +    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
   1.109 +    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
   1.110 +
   1.111 +    vec_r = _mm_mul_ps(vec_r, mat0);
   1.112 +    vec_g = _mm_mul_ps(vec_g, mat1);
   1.113 +    vec_b = _mm_mul_ps(vec_b, mat2);
   1.114 +
   1.115 +    vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
   1.116 +    vec_r  = _mm_max_ps(vec_r, min); /* value first: NaN -> 0 (MAXPS returns 2nd operand) */
   1.117 +    vec_r  = _mm_min_ps(max, vec_r);
   1.118 +    result = _mm_mul_ps(vec_r, scale);
   1.119 +
   1.120 +    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
   1.121 +
   1.122 +    dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
   1.123 +    dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
   1.124 +    dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
   1.125 +}
   1.126 +
   1.127 +/* Convert `length` packed RGBA pixels (4 bytes each at src) into dest, per pixel: input gamma
   1.128 + * tables -> matrix multiply -> clamp and scale -> output tables; alpha is copied unchanged. */
   1.129 +void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
   1.130 +        unsigned char *src, unsigned char *dest, size_t length)
   1.131 +{
   1.132 +    size_t i; /* same type as length, so the counter cannot wrap before reaching it */
   1.133 +    float (*mat)[4] = transform->matrix;
   1.134 +    char input_back[32];
   1.135 +    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
   1.136 +     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
   1.137 +     * because they don't work on stack variables. gcc 4.4 does do the right thing
   1.138 +     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
   1.139 +    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
   1.140 +    /* share input and output locations to save having to keep the
   1.141 +     * locations in separate registers */
   1.142 +    uint32_t const * output = (uint32_t*)input;
   1.143 +
   1.144 +    /* deref *transform now to avoid it in loop: input gamma tables, indexed by source byte */
   1.145 +    const float *igtbl_r = transform->input_gamma_table_r;
   1.146 +    const float *igtbl_g = transform->input_gamma_table_g;
   1.147 +    const float *igtbl_b = transform->input_gamma_table_b;
   1.148 +
   1.149 +    /* deref *transform now to avoid it in loop: output tables, indexed by converted result */
   1.150 +    const uint8_t *otdata_r = &transform->output_table_r->data[0];
   1.151 +    const uint8_t *otdata_g = &transform->output_table_g->data[0];
   1.152 +    const uint8_t *otdata_b = &transform->output_table_b->data[0];
   1.153 +
   1.154 +    /* input matrix values never change (aligned loads: rows must be 16-byte aligned) */
   1.155 +    const __m128 mat0  = _mm_load_ps(mat[0]);
   1.156 +    const __m128 mat1  = _mm_load_ps(mat[1]);
   1.157 +    const __m128 mat2  = _mm_load_ps(mat[2]);
   1.158 +
   1.159 +    /* these values don't change, either: clamp bounds and index scale factor */
   1.160 +    const __m128 max   = _mm_load_ps(clampMaxValueX4);
   1.161 +    const __m128 min   = _mm_setzero_ps();
   1.162 +    const __m128 scale = _mm_load_ps(floatScaleX4);
   1.163 +
   1.164 +    /* working variables */
   1.165 +    __m128 vec_r, vec_g, vec_b, result;
   1.166 +    unsigned char alpha;
   1.167 +
   1.168 +    /* nothing to convert; also keeps the decrement below from wrapping */
   1.169 +    if (!length)
   1.170 +        return;
   1.171 +
   1.172 +    /* one pixel is handled outside of the loop */
   1.173 +    length--;
   1.174 +
   1.175 +    /* setup for transforming 1st pixel */
   1.176 +    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
   1.177 +    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
   1.178 +    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
   1.179 +    alpha = src[3];
   1.180 +    src += 4;
   1.181 +
   1.182 +    /* transform all but final pixel; each pass also preloads the next pixel's gamma
   1.183 +     * values and alpha, which is why the last pixel is finished after the loop */
   1.184 +    for (i=0; i<length; i++)
   1.185 +    {
   1.186 +        /* position values from gamma tables: copy lane 0 into all four lanes */
   1.187 +        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
   1.188 +        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
   1.189 +        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
   1.190 +
   1.191 +        /* gamma * matrix */
   1.192 +        vec_r = _mm_mul_ps(vec_r, mat0);
   1.193 +        vec_g = _mm_mul_ps(vec_g, mat1);
   1.194 +        vec_b = _mm_mul_ps(vec_b, mat2);
   1.195 +
   1.196 +        /* store alpha for this pixel; load alpha for next (src already points at it) */
   1.197 +        dest[OUTPUT_A_INDEX] = alpha;
   1.198 +        alpha   = src[3];
   1.199 +
   1.200 +        /* sum the products, clamp to [0, CLAMPMAXVAL], scale up to table indices */
   1.201 +        vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
   1.202 +        vec_r  = _mm_max_ps(vec_r, min); /* value first: NaN -> 0 (MAXPS returns 2nd operand) */
   1.203 +        vec_r  = _mm_min_ps(max, vec_r);
   1.204 +        result = _mm_mul_ps(vec_r, scale);
   1.205 +
   1.206 +        /* store calc'd output tables indices */
   1.207 +        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
   1.208 +
   1.209 +        /* load gamma values for next loop while store completes */
   1.210 +        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
   1.211 +        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
   1.212 +        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
   1.213 +        src += 4;
   1.214 +
   1.215 +        /* use calc'd indices to output RGB values */
   1.216 +        dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
   1.217 +        dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
   1.218 +        dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
   1.219 +        dest += RGBA_OUTPUT_COMPONENTS;
   1.220 +    }
   1.221 +
   1.222 +    /* handle final (maybe only) pixel: same steps as the loop body, minus the preload */
   1.223 +
   1.224 +    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
   1.225 +    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
   1.226 +    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
   1.227 +
   1.228 +    vec_r = _mm_mul_ps(vec_r, mat0);
   1.229 +    vec_g = _mm_mul_ps(vec_g, mat1);
   1.230 +    vec_b = _mm_mul_ps(vec_b, mat2);
   1.231 +
   1.232 +    dest[OUTPUT_A_INDEX] = alpha;
   1.233 +
   1.234 +    vec_r  = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
   1.235 +    vec_r  = _mm_max_ps(vec_r, min); /* value first: NaN -> 0 (MAXPS returns 2nd operand) */
   1.236 +    vec_r  = _mm_min_ps(max, vec_r);
   1.237 +    result = _mm_mul_ps(vec_r, scale);
   1.238 +
   1.239 +    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));
   1.240 +
   1.241 +    dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
   1.242 +    dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
   1.243 +    dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
   1.244 +}
   1.245 +
   1.246 +

mercurial