#include <emmintrin.h>

#include "qcmsint.h"

/* pre-shuffled: just load these into XMM reg instead of load-scalar/shufps sequence */
#define FLOATSCALE  (float)(PRECACHE_OUTPUT_SIZE)
#define CLAMPMAXVAL ( ((float) (PRECACHE_OUTPUT_SIZE - 1)) / PRECACHE_OUTPUT_SIZE )
static const ALIGN float floatScaleX4[4] =
    { FLOATSCALE, FLOATSCALE, FLOATSCALE, FLOATSCALE};
static const ALIGN float clampMaxValueX4[4] =
    { CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL, CLAMPMAXVAL};

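/*
 * In outline, both transforms below run the same per-pixel pipeline:
 *   1. look up each 8-bit source channel in its input gamma table to get a
 *      float value,
 *   2. broadcast each value across an XMM register, multiply it by one row
 *      of transform->matrix, and sum the three products (a row vector times
 *      a 3x4 matrix),
 *   3. clamp the sums to [0, CLAMPMAXVAL] and scale by FLOATSCALE to turn
 *      them into integer indices,
 *   4. use those indices to fetch the final 8-bit values from the output
 *      tables.
 */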
void qcms_transform_data_rgb_out_lut_sse2(qcms_transform *transform,
                                          unsigned char *src,
                                          unsigned char *dest,
                                          size_t length)
{
    unsigned int i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
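    /* (&input_back[16] is at least 16 bytes from either end of the 32-byte
     * buffer, so masking the address with ~0xf rounds it down to a 16-byte
     * boundary while still leaving 16 usable bytes for the XMM store.) */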
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t const * output = (uint32_t*)input;

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;

    /* CYA */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;
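    /* (The loop below is software-pipelined: the gamma-table loads for the
     * next pixel are issued while the index store for the current pixel
     * completes, which is why the final pixel is finished after the loop.) */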

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    src += 3;

    /* transform all but final pixel */

    for (i=0; i<length; i++)
    {
        /* position values from gamma tables */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* crunch, crunch, crunch */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output tables indices */
        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

        /* load for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 3;

        /* use calc'd indices to output RGB values */
        dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
        dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
        dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
        dest += RGB_OUTPUT_COMPONENTS;
    }

    /* handle final (maybe only) pixel */

    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

    dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
    dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
    dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
}
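
/*
 * For reference, a minimal scalar sketch of what one iteration of the loop
 * above computes for a single pixel (illustrative only, not compiled;
 * idx[] is a hypothetical local, and lrintf() stands in for the
 * round-to-nearest behaviour of _mm_cvtps_epi32):
 *
 *   float r = igtbl_r[src[0]], g = igtbl_g[src[1]], b = igtbl_b[src[2]];
 *   uint32_t idx[3];
 *   int j;
 *   for (j = 0; j < 3; j++) {
 *       float v = r * mat[0][j] + g * mat[1][j] + b * mat[2][j];
 *       if (v < 0.0f)
 *           v = 0.0f;
 *       if (v > CLAMPMAXVAL)
 *           v = CLAMPMAXVAL;
 *       idx[j] = (uint32_t)lrintf(v * FLOATSCALE);
 *   }
 *   dest[OUTPUT_R_INDEX] = otdata_r[idx[0]];
 *   dest[OUTPUT_G_INDEX] = otdata_g[idx[1]];
 *   dest[OUTPUT_B_INDEX] = otdata_b[idx[2]];
 */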
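/*
 * The RGBA variant below follows the same pipeline as the RGB one above;
 * the only differences are the 4-byte source/destination stride and that
 * the alpha byte is copied through unmodified (no color correction is
 * applied to alpha).
 */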
void qcms_transform_data_rgba_out_lut_sse2(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
    unsigned int i;
    float (*mat)[4] = transform->matrix;
    char input_back[32];
    /* Ensure we have a buffer that's 16 byte aligned regardless of the original
     * stack alignment. We can't use __attribute__((aligned(16))) or __declspec(align(32))
     * because they don't work on stack variables. gcc 4.4 does do the right thing
     * on x86 but that's too new for us right now. For more info: gcc bug #16660 */
    float const * input = (float*)(((uintptr_t)&input_back[16]) & ~0xf);
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    uint32_t const * output = (uint32_t*)input;

    /* deref *transform now to avoid it in loop */
    const float *igtbl_r = transform->input_gamma_table_r;
    const float *igtbl_g = transform->input_gamma_table_g;
    const float *igtbl_b = transform->input_gamma_table_b;

    /* deref *transform now to avoid it in loop */
    const uint8_t *otdata_r = &transform->output_table_r->data[0];
    const uint8_t *otdata_g = &transform->output_table_g->data[0];
    const uint8_t *otdata_b = &transform->output_table_b->data[0];

    /* input matrix values never change */
    const __m128 mat0 = _mm_load_ps(mat[0]);
    const __m128 mat1 = _mm_load_ps(mat[1]);
    const __m128 mat2 = _mm_load_ps(mat[2]);

    /* these values don't change, either */
    const __m128 max = _mm_load_ps(clampMaxValueX4);
    const __m128 min = _mm_setzero_ps();
    const __m128 scale = _mm_load_ps(floatScaleX4);

    /* working variables */
    __m128 vec_r, vec_g, vec_b, result;
    unsigned char alpha;

    /* CYA */
    if (!length)
        return;

    /* one pixel is handled outside of the loop */
    length--;

    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&igtbl_r[src[0]]);
    vec_g = _mm_load_ss(&igtbl_g[src[1]]);
    vec_b = _mm_load_ss(&igtbl_b[src[2]]);
    alpha = src[3];
    src += 4;

    /* transform all but final pixel */

    for (i=0; i<length; i++)
    {
        /* position values from gamma tables */
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);

        /* store alpha for this pixel; load alpha for next */
        dest[OUTPUT_A_INDEX] = alpha;
        alpha = src[3];

        /* crunch, crunch, crunch */
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(min, vec_r);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);

        /* store calc'd output tables indices */
        _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

        /* load gamma values for next loop while store completes */
        vec_r = _mm_load_ss(&igtbl_r[src[0]]);
        vec_g = _mm_load_ss(&igtbl_g[src[1]]);
        vec_b = _mm_load_ss(&igtbl_b[src[2]]);
        src += 4;

        /* use calc'd indices to output RGB values */
        dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
        dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
        dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
        dest += RGBA_OUTPUT_COMPONENTS;
    }

    /* handle final (maybe only) pixel */

    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);

    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);

    dest[OUTPUT_A_INDEX] = alpha;

    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(min, vec_r);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);

    _mm_store_si128((__m128i*)output, _mm_cvtps_epi32(result));

    dest[OUTPUT_R_INDEX] = otdata_r[output[0]];
    dest[OUTPUT_G_INDEX] = otdata_g[output[1]];
    dest[OUTPUT_B_INDEX] = otdata_b[output[2]];
}
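
/*
 * Neither routine is called directly by library users: qcms selects one of
 * them as the transform callback when SSE2 is available, and callers go
 * through qcms_transform_data().  A minimal, hypothetical usage sketch of
 * the public API (profile data and pixel buffers are placeholders):
 *
 *   qcms_profile *in  = qcms_profile_from_memory(icc_data, icc_len);
 *   qcms_profile *out = qcms_profile_sRGB();
 *   qcms_transform *t = qcms_transform_create(in, QCMS_DATA_RGB_8,
 *                                             out, QCMS_DATA_RGB_8,
 *                                             QCMS_INTENT_PERCEPTUAL);
 *   qcms_transform_data(t, src_pixels, dst_pixels, pixel_count);
 *   qcms_transform_release(t);
 *   qcms_profile_release(in);
 *   qcms_profile_release(out);
 */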