1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/cairo/pixman-8888-over-565.patch Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,712 @@ 1.4 +changeset: 96613:3e003f0b8026 1.5 +tag: 2pass 1.6 +tag: qbase 1.7 +tag: qtip 1.8 +tag: tip 1.9 +user: Jeff Muizelaar <jmuizelaar@mozilla.com> 1.10 +date: Thu May 17 19:23:53 2012 -0400 1.11 +summary: Bug 757878. Add a fast path for 8888_over_565 with NEON. r=bgirard,joe 1.12 + 1.13 +diff --git a/gfx/cairo/libpixman/src/pixman-arm-common.h b/gfx/cairo/libpixman/src/pixman-arm-common.h 1.14 +--- a/gfx/cairo/libpixman/src/pixman-arm-common.h 1.15 ++++ b/gfx/cairo/libpixman/src/pixman-arm-common.h 1.16 +@@ -355,26 +355,26 @@ scaled_bilinear_scanline_##cputype##_##n 1.17 + if ((flags & SKIP_ZERO_SRC) && zero_src) \ 1.18 + return; \ 1.19 + pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \ 1.20 + dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \ 1.21 + } \ 1.22 + \ 1.23 + FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \ 1.24 + scaled_bilinear_scanline_##cputype##_##name##_##op, \ 1.25 +- src_type, uint32_t, dst_type, COVER, FLAG_NONE) \ 1.26 ++ NULL, src_type, uint32_t, dst_type, COVER, FLAG_NONE) \ 1.27 + FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op, \ 1.28 + scaled_bilinear_scanline_##cputype##_##name##_##op, \ 1.29 +- src_type, uint32_t, dst_type, NONE, FLAG_NONE) \ 1.30 ++ NULL, src_type, uint32_t, dst_type, NONE, FLAG_NONE) \ 1.31 + FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ 1.32 + scaled_bilinear_scanline_##cputype##_##name##_##op, \ 1.33 +- src_type, uint32_t, dst_type, PAD, FLAG_NONE) \ 1.34 ++ NULL, src_type, uint32_t, dst_type, PAD, FLAG_NONE) \ 1.35 + FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op, \ 1.36 + scaled_bilinear_scanline_##cputype##_##name##_##op, \ 1.37 +- src_type, uint32_t, dst_type, NORMAL, \ 1.38 ++ NULL, src_type, uint32_t, dst_type, NORMAL, \ 1.39 + FLAG_NONE) 1.40 + 1.41 + 1.42 + #define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST(flags, cputype, name, op, \ 1.43 + src_type, dst_type) \ 1.44 + void \ 1.45 + pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \ 1.46 + dst_type * dst, \ 1.47 +@@ -404,25 +404,25 @@ scaled_bilinear_scanline_##cputype##_##n 1.48 + if ((flags & SKIP_ZERO_SRC) && zero_src) \ 1.49 + return; \ 1.50 + pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \ 1.51 + dst, mask, src_top, src_bottom, wt, wb, vx, unit_x, w); \ 1.52 + } \ 1.53 + \ 1.54 + FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \ 1.55 + scaled_bilinear_scanline_##cputype##_##name##_##op, \ 1.56 +- src_type, uint8_t, dst_type, COVER, \ 1.57 ++ NULL, src_type, uint8_t, dst_type, COVER, \ 1.58 + FLAG_HAVE_NON_SOLID_MASK) \ 1.59 + FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op, \ 1.60 + scaled_bilinear_scanline_##cputype##_##name##_##op, \ 1.61 +- src_type, uint8_t, dst_type, NONE, \ 1.62 ++ NULL, src_type, uint8_t, dst_type, NONE, \ 1.63 + FLAG_HAVE_NON_SOLID_MASK) \ 1.64 + FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ 1.65 + scaled_bilinear_scanline_##cputype##_##name##_##op, \ 1.66 +- src_type, uint8_t, dst_type, PAD, \ 1.67 ++ NULL, src_type, uint8_t, dst_type, PAD, \ 1.68 + FLAG_HAVE_NON_SOLID_MASK) \ 1.69 + FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op, \ 1.70 + scaled_bilinear_scanline_##cputype##_##name##_##op, \ 1.71 +- src_type, uint8_t, dst_type, NORMAL, \ 1.72 ++ NULL, src_type, uint8_t, dst_type, NORMAL, \ 1.73 + FLAG_HAVE_NON_SOLID_MASK) 1.74 + 1.75 + 1.76 + #endif 1.77 +diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon.c b/gfx/cairo/libpixman/src/pixman-arm-neon.c 1.78 +--- a/gfx/cairo/libpixman/src/pixman-arm-neon.c 1.79 ++++ b/gfx/cairo/libpixman/src/pixman-arm-neon.c 1.80 +@@ -140,16 +140,33 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST 1.81 + PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC, 1.82 + uint32_t, uint16_t) 1.83 + PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC, 1.84 + uint16_t, uint32_t) 1.85 + PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC, 1.86 + uint16_t, uint16_t) 1.87 + PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, OVER, 1.88 + uint32_t, uint32_t) 1.89 ++static force_inline void 1.90 ++pixman_scaled_bilinear_scanline_8888_8888_SRC ( 1.91 ++ uint32_t * dst, 1.92 ++ const uint32_t * mask, 1.93 ++ const uint32_t * src_top, 1.94 ++ const uint32_t * src_bottom, 1.95 ++ int32_t w, 1.96 ++ int wt, 1.97 ++ int wb, 1.98 ++ pixman_fixed_t vx, 1.99 ++ pixman_fixed_t unit_x, 1.100 ++ pixman_fixed_t max_vx, 1.101 ++ pixman_bool_t zero_src) 1.102 ++{ 1.103 ++ pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top, src_bottom, wt, wb, vx, unit_x, w); 1.104 ++} 1.105 ++ 1.106 + PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, ADD, 1.107 + uint32_t, uint32_t) 1.108 + 1.109 + PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_8888, SRC, 1.110 + uint32_t, uint32_t) 1.111 + PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_0565, SRC, 1.112 + uint32_t, uint16_t) 1.113 + PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_x888, SRC, 1.114 +@@ -261,16 +278,38 @@ pixman_blt_neon (uint32_t *src_bits, 1.115 + (uint32_t *)(((char *) src_bits) + 1.116 + src_y * src_stride * 4 + src_x * 4), src_stride); 1.117 + return TRUE; 1.118 + default: 1.119 + return FALSE; 1.120 + } 1.121 + } 1.122 + 1.123 ++static inline void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width) 1.124 ++{ 1.125 ++ pixman_composite_over_8888_0565_asm_neon (width, 1, dst, 0, src, 0); 1.126 ++} 1.127 ++ 1.128 ++FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_cover_OVER, 1.129 ++ pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565, 1.130 ++ uint32_t, uint32_t, uint16_t, 1.131 ++ COVER, FLAG_NONE) 1.132 ++FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_pad_OVER, 1.133 ++ pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565, 1.134 ++ uint32_t, uint32_t, uint16_t, 1.135 ++ PAD, FLAG_NONE) 1.136 ++FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_none_OVER, 1.137 ++ pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565, 1.138 ++ uint32_t, uint32_t, uint16_t, 1.139 ++ NONE, FLAG_NONE) 1.140 ++FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_normal_OVER, 1.141 ++ pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565, 1.142 ++ uint32_t, uint32_t, uint16_t, 1.143 ++ NORMAL, FLAG_NONE) 1.144 ++ 1.145 + static const pixman_fast_path_t arm_neon_fast_paths[] = 1.146 + { 1.147 + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, neon_composite_src_0565_0565), 1.148 + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, neon_composite_src_0565_0565), 1.149 + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, neon_composite_src_8888_0565), 1.150 + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, neon_composite_src_8888_0565), 1.151 + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, neon_composite_src_8888_0565), 1.152 + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, neon_composite_src_8888_0565), 1.153 +@@ -414,16 +453,18 @@ static const pixman_fast_path_t arm_neon 1.154 + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_8_0565), 1.155 + 1.156 + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8_8888), 1.157 + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8_8888), 1.158 + 1.159 + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888), 1.160 + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888), 1.161 + 1.162 ++ SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565), 1.163 ++ 1.164 + { PIXMAN_OP_NONE }, 1.165 + }; 1.166 + 1.167 + static pixman_bool_t 1.168 + arm_neon_blt (pixman_implementation_t *imp, 1.169 + uint32_t * src_bits, 1.170 + uint32_t * dst_bits, 1.171 + int src_stride, 1.172 +diff --git a/gfx/cairo/libpixman/src/pixman-fast-path.c b/gfx/cairo/libpixman/src/pixman-fast-path.c 1.173 +--- a/gfx/cairo/libpixman/src/pixman-fast-path.c 1.174 ++++ b/gfx/cairo/libpixman/src/pixman-fast-path.c 1.175 +@@ -1356,63 +1356,63 @@ scaled_bilinear_scanline_565_565_SRC (ui 1.176 + vx += unit_x; 1.177 + *dst++ = d; 1.178 + } 1.179 + } 1.180 + 1.181 + #endif 1.182 + 1.183 + FAST_BILINEAR_MAINLOOP_COMMON (565_565_cover_SRC, 1.184 +- scaled_bilinear_scanline_565_565_SRC, 1.185 ++ scaled_bilinear_scanline_565_565_SRC, NULL, 1.186 + uint16_t, uint32_t, uint16_t, 1.187 + COVER, FLAG_NONE) 1.188 + FAST_BILINEAR_MAINLOOP_COMMON (565_565_pad_SRC, 1.189 +- scaled_bilinear_scanline_565_565_SRC, 1.190 ++ scaled_bilinear_scanline_565_565_SRC, NULL, 1.191 + uint16_t, uint32_t, uint16_t, 1.192 + PAD, FLAG_NONE) 1.193 + FAST_BILINEAR_MAINLOOP_COMMON (565_565_none_SRC, 1.194 +- scaled_bilinear_scanline_565_565_SRC, 1.195 ++ scaled_bilinear_scanline_565_565_SRC, NULL, 1.196 + uint16_t, uint32_t, uint16_t, 1.197 + NONE, FLAG_NONE) 1.198 + FAST_BILINEAR_MAINLOOP_COMMON (565_565_normal_SRC, 1.199 +- scaled_bilinear_scanline_565_565_SRC, 1.200 ++ scaled_bilinear_scanline_565_565_SRC, NULL, 1.201 + uint16_t, uint32_t, uint16_t, 1.202 + NORMAL, FLAG_NONE) 1.203 + 1.204 + FAST_BILINEAR_MAINLOOP_COMMON (8888_565_cover_OVER, 1.205 +- scaled_bilinear_scanline_8888_565_OVER, 1.206 ++ scaled_bilinear_scanline_8888_565_OVER, NULL, 1.207 + uint32_t, uint32_t, uint16_t, 1.208 + COVER, FLAG_NONE) 1.209 + FAST_BILINEAR_MAINLOOP_COMMON (8888_565_pad_OVER, 1.210 +- scaled_bilinear_scanline_8888_565_OVER, 1.211 ++ scaled_bilinear_scanline_8888_565_OVER, NULL, 1.212 + uint32_t, uint32_t, uint16_t, 1.213 + PAD, FLAG_NONE) 1.214 + FAST_BILINEAR_MAINLOOP_COMMON (8888_565_none_OVER, 1.215 +- scaled_bilinear_scanline_8888_565_OVER, 1.216 ++ scaled_bilinear_scanline_8888_565_OVER, NULL, 1.217 + uint32_t, uint32_t, uint16_t, 1.218 + NONE, FLAG_NONE) 1.219 + FAST_BILINEAR_MAINLOOP_COMMON (8888_565_normal_OVER, 1.220 +- scaled_bilinear_scanline_8888_565_OVER, 1.221 ++ scaled_bilinear_scanline_8888_565_OVER, NULL, 1.222 + uint32_t, uint32_t, uint16_t, 1.223 + NORMAL, FLAG_NONE) 1.224 + 1.225 + FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_cover_OVER, 1.226 +- scaled_bilinear_scanline_8888_8888_OVER, 1.227 ++ scaled_bilinear_scanline_8888_8888_OVER, NULL, 1.228 + uint32_t, uint32_t, uint32_t, 1.229 + COVER, FLAG_NONE) 1.230 + FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_pad_OVER, 1.231 +- scaled_bilinear_scanline_8888_8888_OVER, 1.232 ++ scaled_bilinear_scanline_8888_8888_OVER, NULL, 1.233 + uint32_t, uint32_t, uint32_t, 1.234 + PAD, FLAG_NONE) 1.235 + FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_none_OVER, 1.236 +- scaled_bilinear_scanline_8888_8888_OVER, 1.237 ++ scaled_bilinear_scanline_8888_8888_OVER, NULL, 1.238 + uint32_t, uint32_t, uint32_t, 1.239 + NONE, FLAG_NONE) 1.240 + FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_normal_OVER, 1.241 +- scaled_bilinear_scanline_8888_8888_OVER, 1.242 ++ scaled_bilinear_scanline_8888_8888_OVER, NULL, 1.243 + uint32_t, uint32_t, uint32_t, 1.244 + NORMAL, FLAG_NONE) 1.245 + 1.246 + #define REPEAT_MIN_WIDTH 32 1.247 + 1.248 + static void 1.249 + fast_composite_tiled_repeat (pixman_implementation_t *imp, 1.250 + pixman_composite_info_t *info) 1.251 +diff --git a/gfx/cairo/libpixman/src/pixman-inlines.h b/gfx/cairo/libpixman/src/pixman-inlines.h 1.252 +--- a/gfx/cairo/libpixman/src/pixman-inlines.h 1.253 ++++ b/gfx/cairo/libpixman/src/pixman-inlines.h 1.254 +@@ -816,18 +816,48 @@ bilinear_pad_repeat_get_scanline_bounds 1.255 + * 1.256 + * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256, 1.257 + * but sometimes it may be less than that for NONE repeat when handling 1.258 + * fuzzy antialiased top or bottom image edges. Also both top and 1.259 + * bottom weight variables are guaranteed to have value in 0-255 1.260 + * range and can fit into unsigned byte or be used with 8-bit SIMD 1.261 + * multiplication instructions. 1.262 + */ 1.263 +-#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t, \ 1.264 +- dst_type_t, repeat_mode, flags) \ 1.265 ++ 1.266 ++/* Replace a single "scanline_func" with "fetch_func" & "op_func" to allow optional 1.267 ++ * two stage processing (bilinear fetch to a temp buffer, followed by unscaled 1.268 ++ * combine), "op_func" may be NULL, in this case we keep old behavior. 1.269 ++ * This is ugly and gcc issues some warnings, but works. 1.270 ++ * 1.271 ++ * An advice: clang has much better error reporting than gcc for deeply nested macros. 1.272 ++ */ 1.273 ++ 1.274 ++#define scanline_func(dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 1.275 ++ scanline_buf, mask, src_top, src_bottom, width, \ 1.276 ++ weight_top, weight_bottom, vx, unit_x, max_vx, zero_src) \ 1.277 ++ do { \ 1.278 ++ if (op_func != NULL) \ 1.279 ++ { \ 1.280 ++ fetch_func ((void *)scanline_buf, (mask), (src_top), (src_bottom), (width), \ 1.281 ++ (weight_top), (weight_bottom), (vx), (unit_x), (max_vx), (zero_src)); \ 1.282 ++ ((void (*)(dst_type_t *, const mask_type_t *, const src_type_t *, int)) op_func)\ 1.283 ++ ((dst), (mask), (src_type_t *)scanline_buf, (width)); \ 1.284 ++ } \ 1.285 ++ else \ 1.286 ++ { \ 1.287 ++ fetch_func ((void*)(dst), (mask), (src_top), (src_bottom), (width), (weight_top), \ 1.288 ++ (weight_bottom), (vx), (unit_x), (max_vx), (zero_src)); \ 1.289 ++ } \ 1.290 ++ } while (0) 1.291 ++ 1.292 ++ 1.293 ++#define SCANLINE_BUFFER_LENGTH 3072 1.294 ++ 1.295 ++#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, fetch_func, op_func, src_type_t, \ 1.296 ++ mask_type_t, dst_type_t, repeat_mode, flags) \ 1.297 + static void \ 1.298 + fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp, \ 1.299 + pixman_composite_info_t *info) \ 1.300 + { \ 1.301 + PIXMAN_COMPOSITE_ARGS (info); \ 1.302 + dst_type_t *dst_line; \ 1.303 + mask_type_t *mask_line; \ 1.304 + src_type_t *src_first_line; \ 1.305 +@@ -842,16 +872,19 @@ fast_composite_scaled_bilinear ## scale_ 1.306 + mask_type_t solid_mask; \ 1.307 + const mask_type_t *mask = &solid_mask; \ 1.308 + int src_stride, mask_stride, dst_stride; \ 1.309 + \ 1.310 + int src_width; \ 1.311 + pixman_fixed_t src_width_fixed; \ 1.312 + int max_x; \ 1.313 + pixman_bool_t need_src_extension; \ 1.314 ++ \ 1.315 ++ uint64_t stack_scanline_buffer[SCANLINE_BUFFER_LENGTH]; \ 1.316 ++ uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer; \ 1.317 + \ 1.318 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1); \ 1.319 + if (flags & FLAG_HAVE_SOLID_MASK) \ 1.320 + { \ 1.321 + solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); \ 1.322 + mask_stride = 0; \ 1.323 + } \ 1.324 + else if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 1.325 +@@ -914,16 +947,24 @@ fast_composite_scaled_bilinear ## scale_ 1.326 + else \ 1.327 + { \ 1.328 + src_width = src_image->bits.width; \ 1.329 + need_src_extension = FALSE; \ 1.330 + } \ 1.331 + \ 1.332 + src_width_fixed = pixman_int_to_fixed (src_width); \ 1.333 + } \ 1.334 ++ \ 1.335 ++ if (op_func != NULL && width * sizeof(src_type_t) > sizeof(stack_scanline_buffer)) \ 1.336 ++ { \ 1.337 ++ scanline_buffer = pixman_malloc_ab (width, sizeof(src_type_t)); \ 1.338 ++ \ 1.339 ++ if (!scanline_buffer) \ 1.340 ++ return; \ 1.341 ++ } \ 1.342 + \ 1.343 + while (--height >= 0) \ 1.344 + { \ 1.345 + int weight1, weight2; \ 1.346 + dst = dst_line; \ 1.347 + dst_line += dst_stride; \ 1.348 + vx = v.vector[0]; \ 1.349 + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 1.350 +@@ -956,36 +997,39 @@ fast_composite_scaled_bilinear ## scale_ 1.351 + repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height); \ 1.352 + src1 = src_first_line + src_stride * y1; \ 1.353 + src2 = src_first_line + src_stride * y2; \ 1.354 + \ 1.355 + if (left_pad > 0) \ 1.356 + { \ 1.357 + buf1[0] = buf1[1] = src1[0]; \ 1.358 + buf2[0] = buf2[1] = src2[0]; \ 1.359 +- scanline_func (dst, mask, \ 1.360 +- buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE); \ 1.361 ++ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 1.362 ++ scanline_buffer, mask, buf1, buf2, left_pad, weight1, weight2, \ 1.363 ++ 0, 0, 0, FALSE); \ 1.364 + dst += left_pad; \ 1.365 + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 1.366 + mask += left_pad; \ 1.367 + } \ 1.368 + if (width > 0) \ 1.369 + { \ 1.370 +- scanline_func (dst, mask, \ 1.371 +- src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE); \ 1.372 ++ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 1.373 ++ scanline_buffer, mask, src1, src2, width, weight1, weight2, \ 1.374 ++ vx, unit_x, 0, FALSE); \ 1.375 + dst += width; \ 1.376 + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 1.377 + mask += width; \ 1.378 + } \ 1.379 + if (right_pad > 0) \ 1.380 + { \ 1.381 + buf1[0] = buf1[1] = src1[src_image->bits.width - 1]; \ 1.382 + buf2[0] = buf2[1] = src2[src_image->bits.width - 1]; \ 1.383 +- scanline_func (dst, mask, \ 1.384 +- buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE); \ 1.385 ++ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 1.386 ++ scanline_buffer, mask, buf1, buf2, right_pad, weight1, weight2, \ 1.387 ++ 0, 0, 0, FALSE); \ 1.388 + } \ 1.389 + } \ 1.390 + else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ 1.391 + { \ 1.392 + src_type_t *src1, *src2; \ 1.393 + src_type_t buf1[2]; \ 1.394 + src_type_t buf2[2]; \ 1.395 + /* handle top/bottom zero padding by just setting weights to 0 if needed */ \ 1.396 +@@ -1011,64 +1055,67 @@ fast_composite_scaled_bilinear ## scale_ 1.397 + } \ 1.398 + src1 = src_first_line + src_stride * y1; \ 1.399 + src2 = src_first_line + src_stride * y2; \ 1.400 + \ 1.401 + if (left_pad > 0) \ 1.402 + { \ 1.403 + buf1[0] = buf1[1] = 0; \ 1.404 + buf2[0] = buf2[1] = 0; \ 1.405 +- scanline_func (dst, mask, \ 1.406 +- buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE); \ 1.407 ++ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 1.408 ++ scanline_buffer, mask, buf1, buf2, left_pad, weight1, weight2, \ 1.409 ++ 0, 0, 0, TRUE); \ 1.410 + dst += left_pad; \ 1.411 + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 1.412 + mask += left_pad; \ 1.413 + } \ 1.414 + if (left_tz > 0) \ 1.415 + { \ 1.416 + buf1[0] = 0; \ 1.417 + buf1[1] = src1[0]; \ 1.418 + buf2[0] = 0; \ 1.419 + buf2[1] = src2[0]; \ 1.420 +- scanline_func (dst, mask, \ 1.421 +- buf1, buf2, left_tz, weight1, weight2, \ 1.422 ++ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 1.423 ++ scanline_buffer, mask, buf1, buf2, left_tz, weight1, weight2, \ 1.424 + pixman_fixed_frac (vx), unit_x, 0, FALSE); \ 1.425 + dst += left_tz; \ 1.426 + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 1.427 + mask += left_tz; \ 1.428 + vx += left_tz * unit_x; \ 1.429 + } \ 1.430 + if (width > 0) \ 1.431 + { \ 1.432 +- scanline_func (dst, mask, \ 1.433 +- src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE); \ 1.434 ++ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 1.435 ++ scanline_buffer, mask, src1, src2, width, weight1, weight2, \ 1.436 ++ vx, unit_x, 0, FALSE); \ 1.437 + dst += width; \ 1.438 + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 1.439 + mask += width; \ 1.440 + vx += width * unit_x; \ 1.441 + } \ 1.442 + if (right_tz > 0) \ 1.443 + { \ 1.444 + buf1[0] = src1[src_image->bits.width - 1]; \ 1.445 + buf1[1] = 0; \ 1.446 + buf2[0] = src2[src_image->bits.width - 1]; \ 1.447 + buf2[1] = 0; \ 1.448 +- scanline_func (dst, mask, \ 1.449 +- buf1, buf2, right_tz, weight1, weight2, \ 1.450 ++ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 1.451 ++ scanline_buffer, mask, buf1, buf2, right_tz, weight1, weight2, \ 1.452 + pixman_fixed_frac (vx), unit_x, 0, FALSE); \ 1.453 + dst += right_tz; \ 1.454 + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 1.455 + mask += right_tz; \ 1.456 + } \ 1.457 + if (right_pad > 0) \ 1.458 + { \ 1.459 + buf1[0] = buf1[1] = 0; \ 1.460 + buf2[0] = buf2[1] = 0; \ 1.461 +- scanline_func (dst, mask, \ 1.462 +- buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE); \ 1.463 ++ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 1.464 ++ scanline_buffer, mask, buf1, buf2, right_pad, weight1, weight2, \ 1.465 ++ 0, 0, 0, TRUE); \ 1.466 + } \ 1.467 + } \ 1.468 + else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ 1.469 + { \ 1.470 + int32_t num_pixels; \ 1.471 + int32_t width_remain; \ 1.472 + src_type_t * src_line_top; \ 1.473 + src_type_t * src_line_bottom; \ 1.474 +@@ -1120,17 +1167,18 @@ fast_composite_scaled_bilinear ## scale_ 1.475 + * vx is in range [0, src_width_fixed - pixman_fixed_e] \ 1.476 + * So we are safe from overflow. \ 1.477 + */ \ 1.478 + num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1; \ 1.479 + \ 1.480 + if (num_pixels > width_remain) \ 1.481 + num_pixels = width_remain; \ 1.482 + \ 1.483 +- scanline_func (dst, mask, buf1, buf2, num_pixels, \ 1.484 ++ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, \ 1.485 ++ dst, scanline_buffer, mask, buf1, buf2, num_pixels, \ 1.486 + weight1, weight2, pixman_fixed_frac(vx), \ 1.487 + unit_x, src_width_fixed, FALSE); \ 1.488 + \ 1.489 + width_remain -= num_pixels; \ 1.490 + vx += num_pixels * unit_x; \ 1.491 + dst += num_pixels; \ 1.492 + \ 1.493 + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 1.494 +@@ -1149,41 +1197,47 @@ fast_composite_scaled_bilinear ## scale_ 1.495 + * So we are safe from overflow here. \ 1.496 + */ \ 1.497 + num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e) \ 1.498 + / unit_x) + 1; \ 1.499 + \ 1.500 + if (num_pixels > width_remain) \ 1.501 + num_pixels = width_remain; \ 1.502 + \ 1.503 +- scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels, \ 1.504 +- weight1, weight2, vx, unit_x, src_width_fixed, FALSE); \ 1.505 ++ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, \ 1.506 ++ dst, scanline_buffer, mask, src_line_top, src_line_bottom, \ 1.507 ++ num_pixels, weight1, weight2, vx, unit_x, src_width_fixed, \ 1.508 ++ FALSE); \ 1.509 + \ 1.510 + width_remain -= num_pixels; \ 1.511 + vx += num_pixels * unit_x; \ 1.512 + dst += num_pixels; \ 1.513 + \ 1.514 + if (flags & FLAG_HAVE_NON_SOLID_MASK) \ 1.515 + mask += num_pixels; \ 1.516 + } \ 1.517 + } \ 1.518 + } \ 1.519 + else \ 1.520 + { \ 1.521 +- scanline_func (dst, mask, src_first_line + src_stride * y1, \ 1.522 ++ scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ 1.523 ++ scanline_buffer, mask, \ 1.524 ++ src_first_line + src_stride * y1, \ 1.525 + src_first_line + src_stride * y2, width, \ 1.526 + weight1, weight2, vx, unit_x, max_vx, FALSE); \ 1.527 + } \ 1.528 + } \ 1.529 ++ if (scanline_buffer != (uint8_t *) stack_scanline_buffer) \ 1.530 ++ free (scanline_buffer); \ 1.531 + } 1.532 + 1.533 + /* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ 1.534 +-#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, \ 1.535 ++#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, fetch_func, op_func, src_type_t, mask_type_t,\ 1.536 + dst_type_t, repeat_mode, flags) \ 1.537 +- FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\ 1.538 ++ FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, fetch_func, op_func, src_type_t, mask_type_t,\ 1.539 + dst_type_t, repeat_mode, flags) 1.540 + 1.541 + #define SCALED_BILINEAR_FLAGS \ 1.542 + (FAST_PATH_SCALE_TRANSFORM | \ 1.543 + FAST_PATH_NO_ALPHA_MAP | \ 1.544 + FAST_PATH_BILINEAR_FILTER | \ 1.545 + FAST_PATH_NO_ACCESSORS | \ 1.546 + FAST_PATH_NARROW_FORMAT) 1.547 +diff --git a/gfx/cairo/libpixman/src/pixman-sse2.c b/gfx/cairo/libpixman/src/pixman-sse2.c 1.548 +--- a/gfx/cairo/libpixman/src/pixman-sse2.c 1.549 ++++ b/gfx/cairo/libpixman/src/pixman-sse2.c 1.550 +@@ -5404,30 +5404,33 @@ scaled_bilinear_scanline_sse2_8888_8888_ 1.551 + if (w & 1) 1.552 + { 1.553 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 1.554 + *dst = pix1; 1.555 + } 1.556 + 1.557 + } 1.558 + 1.559 ++/* Add extra NULL argument to the existing bilinear fast paths to indicate 1.560 ++ * that we don't need two-pass processing */ 1.561 ++ 1.562 + FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, 1.563 +- scaled_bilinear_scanline_sse2_8888_8888_SRC, 1.564 ++ scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, 1.565 + uint32_t, uint32_t, uint32_t, 1.566 + COVER, FLAG_NONE) 1.567 + FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, 1.568 +- scaled_bilinear_scanline_sse2_8888_8888_SRC, 1.569 ++ scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, 1.570 + uint32_t, uint32_t, uint32_t, 1.571 + PAD, FLAG_NONE) 1.572 + FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, 1.573 +- scaled_bilinear_scanline_sse2_8888_8888_SRC, 1.574 ++ scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, 1.575 + uint32_t, uint32_t, uint32_t, 1.576 + NONE, FLAG_NONE) 1.577 + FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC, 1.578 +- scaled_bilinear_scanline_sse2_8888_8888_SRC, 1.579 ++ scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, 1.580 + uint32_t, uint32_t, uint32_t, 1.581 + NORMAL, FLAG_NONE) 1.582 + 1.583 + static force_inline void 1.584 + scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst, 1.585 + const uint32_t * mask, 1.586 + const uint32_t * src_top, 1.587 + const uint32_t * src_bottom, 1.588 +@@ -5505,32 +5508,66 @@ scaled_bilinear_scanline_sse2_8888_8888_ 1.589 + } 1.590 + 1.591 + w--; 1.592 + dst++; 1.593 + } 1.594 + } 1.595 + 1.596 + FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER, 1.597 +- scaled_bilinear_scanline_sse2_8888_8888_OVER, 1.598 ++ scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, 1.599 + uint32_t, uint32_t, uint32_t, 1.600 + COVER, FLAG_NONE) 1.601 + FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER, 1.602 +- scaled_bilinear_scanline_sse2_8888_8888_OVER, 1.603 ++ scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, 1.604 + uint32_t, uint32_t, uint32_t, 1.605 + PAD, FLAG_NONE) 1.606 + FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER, 1.607 +- scaled_bilinear_scanline_sse2_8888_8888_OVER, 1.608 ++ scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, 1.609 + uint32_t, uint32_t, uint32_t, 1.610 + NONE, FLAG_NONE) 1.611 + FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER, 1.612 +- scaled_bilinear_scanline_sse2_8888_8888_OVER, 1.613 ++ scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, 1.614 + uint32_t, uint32_t, uint32_t, 1.615 + NORMAL, FLAG_NONE) 1.616 + 1.617 ++ 1.618 ++/* An example of SSE2 two-stage bilinear_over_8888_0565 fast path, which is implemented 1.619 ++ as scaled_bilinear_scanline_sse2_8888_8888_SRC + op_bilinear_over_8888_0565 */ 1.620 ++ 1.621 ++void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width) 1.622 ++{ 1.623 ++ /* Note: this is not really fast and should be based on 8 pixel loop from sse2_composite_over_8888_0565 */ 1.624 ++ while (--width >= 0) 1.625 ++ { 1.626 ++ *dst = composite_over_8888_0565pixel (*src, *dst); 1.627 ++ src++; 1.628 ++ dst++; 1.629 ++ } 1.630 ++} 1.631 ++ 1.632 ++FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_cover_OVER, 1.633 ++ scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, 1.634 ++ uint32_t, uint32_t, uint16_t, 1.635 ++ COVER, FLAG_NONE) 1.636 ++FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER, 1.637 ++ scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, 1.638 ++ uint32_t, uint32_t, uint16_t, 1.639 ++ PAD, FLAG_NONE) 1.640 ++FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER, 1.641 ++ scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, 1.642 ++ uint32_t, uint32_t, uint16_t, 1.643 ++ NONE, FLAG_NONE) 1.644 ++FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER, 1.645 ++ scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, 1.646 ++ uint32_t, uint32_t, uint16_t, 1.647 ++ NORMAL, FLAG_NONE) 1.648 ++ 1.649 ++/*****************************/ 1.650 ++ 1.651 + static force_inline void 1.652 + scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst, 1.653 + const uint8_t * mask, 1.654 + const uint32_t * src_top, 1.655 + const uint32_t * src_bottom, 1.656 + int32_t w, 1.657 + int wt, 1.658 + int wb, 1.659 +@@ -5669,29 +5706,29 @@ scaled_bilinear_scanline_sse2_8888_8_888 1.660 + } 1.661 + 1.662 + w--; 1.663 + dst++; 1.664 + } 1.665 + } 1.666 + 1.667 + FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER, 1.668 +- scaled_bilinear_scanline_sse2_8888_8_8888_OVER, 1.669 ++ scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, 1.670 + uint32_t, uint8_t, uint32_t, 1.671 + COVER, FLAG_HAVE_NON_SOLID_MASK) 1.672 + FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER, 1.673 +- scaled_bilinear_scanline_sse2_8888_8_8888_OVER, 1.674 ++ scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, 1.675 + uint32_t, uint8_t, uint32_t, 1.676 + PAD, FLAG_HAVE_NON_SOLID_MASK) 1.677 + FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER, 1.678 +- scaled_bilinear_scanline_sse2_8888_8_8888_OVER, 1.679 ++ scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, 1.680 + uint32_t, uint8_t, uint32_t, 1.681 + NONE, FLAG_HAVE_NON_SOLID_MASK) 1.682 + FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER, 1.683 +- scaled_bilinear_scanline_sse2_8888_8_8888_OVER, 1.684 ++ scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, 1.685 + uint32_t, uint8_t, uint32_t, 1.686 + NORMAL, FLAG_HAVE_NON_SOLID_MASK) 1.687 + 1.688 + static const pixman_fast_path_t sse2_fast_paths[] = 1.689 + { 1.690 + /* PIXMAN_OP_OVER */ 1.691 + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565), 1.692 + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565), 1.693 +@@ -5808,16 +5845,21 @@ static const pixman_fast_path_t sse2_fas 1.694 + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), 1.695 + SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), 1.696 + 1.697 + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888), 1.698 + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888), 1.699 + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888), 1.700 + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888), 1.701 + 1.702 ++ /* and here the needed entries are added to the fast path table */ 1.703 ++ 1.704 ++ SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565), 1.705 ++ SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565), 1.706 ++ 1.707 + { PIXMAN_OP_NONE }, 1.708 + }; 1.709 + 1.710 + static pixman_bool_t 1.711 + sse2_blt (pixman_implementation_t *imp, 1.712 + uint32_t * src_bits, 1.713 + uint32_t * dst_bits, 1.714 + int src_stride, 1.715 +