michael@0: changeset: 96613:3e003f0b8026 michael@0: tag: 2pass michael@0: tag: qbase michael@0: tag: qtip michael@0: tag: tip michael@0: user: Jeff Muizelaar michael@0: date: Thu May 17 19:23:53 2012 -0400 michael@0: summary: Bug 757878. Add a fast path for 8888_over_565 with NEON. r=bgirard,joe michael@0: michael@0: diff --git a/gfx/cairo/libpixman/src/pixman-arm-common.h b/gfx/cairo/libpixman/src/pixman-arm-common.h michael@0: --- a/gfx/cairo/libpixman/src/pixman-arm-common.h michael@0: +++ b/gfx/cairo/libpixman/src/pixman-arm-common.h michael@0: @@ -355,26 +355,26 @@ scaled_bilinear_scanline_##cputype##_##n michael@0: if ((flags & SKIP_ZERO_SRC) && zero_src) \ michael@0: return; \ michael@0: pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \ michael@0: dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \ michael@0: } \ michael@0: \ michael@0: FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \ michael@0: scaled_bilinear_scanline_##cputype##_##name##_##op, \ michael@0: - src_type, uint32_t, dst_type, COVER, FLAG_NONE) \ michael@0: + NULL, src_type, uint32_t, dst_type, COVER, FLAG_NONE) \ michael@0: FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op, \ michael@0: scaled_bilinear_scanline_##cputype##_##name##_##op, \ michael@0: - src_type, uint32_t, dst_type, NONE, FLAG_NONE) \ michael@0: + NULL, src_type, uint32_t, dst_type, NONE, FLAG_NONE) \ michael@0: FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ michael@0: scaled_bilinear_scanline_##cputype##_##name##_##op, \ michael@0: - src_type, uint32_t, dst_type, PAD, FLAG_NONE) \ michael@0: + NULL, src_type, uint32_t, dst_type, PAD, FLAG_NONE) \ michael@0: FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op, \ michael@0: scaled_bilinear_scanline_##cputype##_##name##_##op, \ michael@0: - src_type, uint32_t, dst_type, NORMAL, \ michael@0: + NULL, src_type, uint32_t, dst_type, NORMAL, \ michael@0: FLAG_NONE) michael@0: michael@0: michael@0: #define 
PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST(flags, cputype, name, op, \ michael@0: src_type, dst_type) \ michael@0: void \ michael@0: pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \ michael@0: dst_type * dst, \ michael@0: @@ -404,25 +404,25 @@ scaled_bilinear_scanline_##cputype##_##n michael@0: if ((flags & SKIP_ZERO_SRC) && zero_src) \ michael@0: return; \ michael@0: pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \ michael@0: dst, mask, src_top, src_bottom, wt, wb, vx, unit_x, w); \ michael@0: } \ michael@0: \ michael@0: FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \ michael@0: scaled_bilinear_scanline_##cputype##_##name##_##op, \ michael@0: - src_type, uint8_t, dst_type, COVER, \ michael@0: + NULL, src_type, uint8_t, dst_type, COVER, \ michael@0: FLAG_HAVE_NON_SOLID_MASK) \ michael@0: FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op, \ michael@0: scaled_bilinear_scanline_##cputype##_##name##_##op, \ michael@0: - src_type, uint8_t, dst_type, NONE, \ michael@0: + NULL, src_type, uint8_t, dst_type, NONE, \ michael@0: FLAG_HAVE_NON_SOLID_MASK) \ michael@0: FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ michael@0: scaled_bilinear_scanline_##cputype##_##name##_##op, \ michael@0: - src_type, uint8_t, dst_type, PAD, \ michael@0: + NULL, src_type, uint8_t, dst_type, PAD, \ michael@0: FLAG_HAVE_NON_SOLID_MASK) \ michael@0: FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op, \ michael@0: scaled_bilinear_scanline_##cputype##_##name##_##op, \ michael@0: - src_type, uint8_t, dst_type, NORMAL, \ michael@0: + NULL, src_type, uint8_t, dst_type, NORMAL, \ michael@0: FLAG_HAVE_NON_SOLID_MASK) michael@0: michael@0: michael@0: #endif michael@0: diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon.c b/gfx/cairo/libpixman/src/pixman-arm-neon.c michael@0: --- a/gfx/cairo/libpixman/src/pixman-arm-neon.c michael@0: +++ b/gfx/cairo/libpixman/src/pixman-arm-neon.c michael@0: @@ -140,16 +140,33 @@ 
PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST michael@0: PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC, michael@0: uint32_t, uint16_t) michael@0: PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC, michael@0: uint16_t, uint32_t) michael@0: PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC, michael@0: uint16_t, uint16_t) michael@0: PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, OVER, michael@0: uint32_t, uint32_t) michael@0: +static force_inline void michael@0: +pixman_scaled_bilinear_scanline_8888_8888_SRC ( michael@0: + uint32_t * dst, michael@0: + const uint32_t * mask, michael@0: + const uint32_t * src_top, michael@0: + const uint32_t * src_bottom, michael@0: + int32_t w, michael@0: + int wt, michael@0: + int wb, michael@0: + pixman_fixed_t vx, michael@0: + pixman_fixed_t unit_x, michael@0: + pixman_fixed_t max_vx, michael@0: + pixman_bool_t zero_src) michael@0: +{ michael@0: + pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top, src_bottom, wt, wb, vx, unit_x, w); michael@0: +} michael@0: + michael@0: PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, ADD, michael@0: uint32_t, uint32_t) michael@0: michael@0: PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_8888, SRC, michael@0: uint32_t, uint32_t) michael@0: PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_0565, SRC, michael@0: uint32_t, uint16_t) michael@0: PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_x888, SRC, michael@0: @@ -261,16 +278,38 @@ pixman_blt_neon (uint32_t *src_bits, michael@0: (uint32_t *)(((char *) src_bits) + michael@0: src_y * src_stride * 4 + src_x * 4), src_stride); michael@0: return TRUE; michael@0: default: michael@0: return FALSE; michael@0: } michael@0: } michael@0: michael@0: +static inline void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width) michael@0: +{ michael@0: + 
pixman_composite_over_8888_0565_asm_neon (width, 1, dst, 0, src, 0); michael@0: +} michael@0: + michael@0: +FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_cover_OVER, michael@0: + pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565, michael@0: + uint32_t, uint32_t, uint16_t, michael@0: + COVER, FLAG_NONE) michael@0: +FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_pad_OVER, michael@0: + pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565, michael@0: + uint32_t, uint32_t, uint16_t, michael@0: + PAD, FLAG_NONE) michael@0: +FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_none_OVER, michael@0: + pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565, michael@0: + uint32_t, uint32_t, uint16_t, michael@0: + NONE, FLAG_NONE) michael@0: +FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_normal_OVER, michael@0: + pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565, michael@0: + uint32_t, uint32_t, uint16_t, michael@0: + NORMAL, FLAG_NONE) michael@0: + michael@0: static const pixman_fast_path_t arm_neon_fast_paths[] = michael@0: { michael@0: PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, neon_composite_src_0565_0565), michael@0: PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, neon_composite_src_0565_0565), michael@0: PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, neon_composite_src_8888_0565), michael@0: PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, neon_composite_src_8888_0565), michael@0: PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, neon_composite_src_8888_0565), michael@0: PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, neon_composite_src_8888_0565), michael@0: @@ -414,16 +453,18 @@ static const pixman_fast_path_t arm_neon michael@0: SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_8_0565), michael@0: michael@0: SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8_8888), michael@0: SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, 
x8r8g8b8, neon_8888_8_8888), michael@0: michael@0: SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888), michael@0: SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888), michael@0: michael@0: + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565), michael@0: + michael@0: { PIXMAN_OP_NONE }, michael@0: }; michael@0: michael@0: static pixman_bool_t michael@0: arm_neon_blt (pixman_implementation_t *imp, michael@0: uint32_t * src_bits, michael@0: uint32_t * dst_bits, michael@0: int src_stride, michael@0: diff --git a/gfx/cairo/libpixman/src/pixman-fast-path.c b/gfx/cairo/libpixman/src/pixman-fast-path.c michael@0: --- a/gfx/cairo/libpixman/src/pixman-fast-path.c michael@0: +++ b/gfx/cairo/libpixman/src/pixman-fast-path.c michael@0: @@ -1356,63 +1356,63 @@ scaled_bilinear_scanline_565_565_SRC (ui michael@0: vx += unit_x; michael@0: *dst++ = d; michael@0: } michael@0: } michael@0: michael@0: #endif michael@0: michael@0: FAST_BILINEAR_MAINLOOP_COMMON (565_565_cover_SRC, michael@0: - scaled_bilinear_scanline_565_565_SRC, michael@0: + scaled_bilinear_scanline_565_565_SRC, NULL, michael@0: uint16_t, uint32_t, uint16_t, michael@0: COVER, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (565_565_pad_SRC, michael@0: - scaled_bilinear_scanline_565_565_SRC, michael@0: + scaled_bilinear_scanline_565_565_SRC, NULL, michael@0: uint16_t, uint32_t, uint16_t, michael@0: PAD, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (565_565_none_SRC, michael@0: - scaled_bilinear_scanline_565_565_SRC, michael@0: + scaled_bilinear_scanline_565_565_SRC, NULL, michael@0: uint16_t, uint32_t, uint16_t, michael@0: NONE, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (565_565_normal_SRC, michael@0: - scaled_bilinear_scanline_565_565_SRC, michael@0: + scaled_bilinear_scanline_565_565_SRC, NULL, michael@0: uint16_t, uint32_t, uint16_t, michael@0: NORMAL, FLAG_NONE) michael@0: michael@0: FAST_BILINEAR_MAINLOOP_COMMON 
(8888_565_cover_OVER, michael@0: - scaled_bilinear_scanline_8888_565_OVER, michael@0: + scaled_bilinear_scanline_8888_565_OVER, NULL, michael@0: uint32_t, uint32_t, uint16_t, michael@0: COVER, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (8888_565_pad_OVER, michael@0: - scaled_bilinear_scanline_8888_565_OVER, michael@0: + scaled_bilinear_scanline_8888_565_OVER, NULL, michael@0: uint32_t, uint32_t, uint16_t, michael@0: PAD, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (8888_565_none_OVER, michael@0: - scaled_bilinear_scanline_8888_565_OVER, michael@0: + scaled_bilinear_scanline_8888_565_OVER, NULL, michael@0: uint32_t, uint32_t, uint16_t, michael@0: NONE, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (8888_565_normal_OVER, michael@0: - scaled_bilinear_scanline_8888_565_OVER, michael@0: + scaled_bilinear_scanline_8888_565_OVER, NULL, michael@0: uint32_t, uint32_t, uint16_t, michael@0: NORMAL, FLAG_NONE) michael@0: michael@0: FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_cover_OVER, michael@0: - scaled_bilinear_scanline_8888_8888_OVER, michael@0: + scaled_bilinear_scanline_8888_8888_OVER, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: COVER, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_pad_OVER, michael@0: - scaled_bilinear_scanline_8888_8888_OVER, michael@0: + scaled_bilinear_scanline_8888_8888_OVER, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: PAD, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_none_OVER, michael@0: - scaled_bilinear_scanline_8888_8888_OVER, michael@0: + scaled_bilinear_scanline_8888_8888_OVER, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: NONE, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_normal_OVER, michael@0: - scaled_bilinear_scanline_8888_8888_OVER, michael@0: + scaled_bilinear_scanline_8888_8888_OVER, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: NORMAL, FLAG_NONE) michael@0: michael@0: #define REPEAT_MIN_WIDTH 32 michael@0: 
michael@0: static void michael@0: fast_composite_tiled_repeat (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: diff --git a/gfx/cairo/libpixman/src/pixman-inlines.h b/gfx/cairo/libpixman/src/pixman-inlines.h michael@0: --- a/gfx/cairo/libpixman/src/pixman-inlines.h michael@0: +++ b/gfx/cairo/libpixman/src/pixman-inlines.h michael@0: @@ -816,18 +816,48 @@ bilinear_pad_repeat_get_scanline_bounds michael@0: * michael@0: * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256, michael@0: * but sometimes it may be less than that for NONE repeat when handling michael@0: * fuzzy antialiased top or bottom image edges. Also both top and michael@0: * bottom weight variables are guaranteed to have value in 0-255 michael@0: * range and can fit into unsigned byte or be used with 8-bit SIMD michael@0: * multiplication instructions. michael@0: */ michael@0: -#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t, \ michael@0: - dst_type_t, repeat_mode, flags) \ michael@0: + michael@0: +/* Replace a single "scanline_func" with "fetch_func" & "op_func" to allow optional michael@0: + * two-stage processing (bilinear fetch to a temp buffer, followed by unscaled michael@0: + * combine). "op_func" may be NULL, in which case we keep the old behavior. michael@0: + * This is ugly and gcc issues some warnings, but it works. michael@0: + * michael@0: + * Tip: clang has much better error reporting than gcc for deeply nested macros.
michael@0: + */ michael@0: + michael@0: +#define scanline_func(dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ michael@0: + scanline_buf, mask, src_top, src_bottom, width, \ michael@0: + weight_top, weight_bottom, vx, unit_x, max_vx, zero_src) \ michael@0: + do { \ michael@0: + if (op_func != NULL) \ michael@0: + { \ michael@0: + fetch_func ((void *)scanline_buf, (mask), (src_top), (src_bottom), (width), \ michael@0: + (weight_top), (weight_bottom), (vx), (unit_x), (max_vx), (zero_src)); \ michael@0: + ((void (*)(dst_type_t *, const mask_type_t *, const src_type_t *, int)) op_func)\ michael@0: + ((dst), (mask), (src_type_t *)scanline_buf, (width)); \ michael@0: + } \ michael@0: + else \ michael@0: + { \ michael@0: + fetch_func ((void*)(dst), (mask), (src_top), (src_bottom), (width), (weight_top), \ michael@0: + (weight_bottom), (vx), (unit_x), (max_vx), (zero_src)); \ michael@0: + } \ michael@0: + } while (0) michael@0: + michael@0: + michael@0: +#define SCANLINE_BUFFER_LENGTH 3072 michael@0: + michael@0: +#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, fetch_func, op_func, src_type_t, \ michael@0: + mask_type_t, dst_type_t, repeat_mode, flags) \ michael@0: static void \ michael@0: fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp, \ michael@0: pixman_composite_info_t *info) \ michael@0: { \ michael@0: PIXMAN_COMPOSITE_ARGS (info); \ michael@0: dst_type_t *dst_line; \ michael@0: mask_type_t *mask_line; \ michael@0: src_type_t *src_first_line; \ michael@0: @@ -842,16 +872,19 @@ fast_composite_scaled_bilinear ## scale_ michael@0: mask_type_t solid_mask; \ michael@0: const mask_type_t *mask = &solid_mask; \ michael@0: int src_stride, mask_stride, dst_stride; \ michael@0: \ michael@0: int src_width; \ michael@0: pixman_fixed_t src_width_fixed; \ michael@0: int max_x; \ michael@0: pixman_bool_t need_src_extension; \ michael@0: + \ michael@0: + uint64_t stack_scanline_buffer[SCANLINE_BUFFER_LENGTH]; \ michael@0: + 
uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer; \ michael@0: \ michael@0: PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1); \ michael@0: if (flags & FLAG_HAVE_SOLID_MASK) \ michael@0: { \ michael@0: solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); \ michael@0: mask_stride = 0; \ michael@0: } \ michael@0: else if (flags & FLAG_HAVE_NON_SOLID_MASK) \ michael@0: @@ -914,16 +947,24 @@ fast_composite_scaled_bilinear ## scale_ michael@0: else \ michael@0: { \ michael@0: src_width = src_image->bits.width; \ michael@0: need_src_extension = FALSE; \ michael@0: } \ michael@0: \ michael@0: src_width_fixed = pixman_int_to_fixed (src_width); \ michael@0: } \ michael@0: + \ michael@0: + if (op_func != NULL && width * sizeof(src_type_t) > sizeof(stack_scanline_buffer)) \ michael@0: + { \ michael@0: + scanline_buffer = pixman_malloc_ab (width, sizeof(src_type_t)); \ michael@0: + \ michael@0: + if (!scanline_buffer) \ michael@0: + return; \ michael@0: + } \ michael@0: \ michael@0: while (--height >= 0) \ michael@0: { \ michael@0: int weight1, weight2; \ michael@0: dst = dst_line; \ michael@0: dst_line += dst_stride; \ michael@0: vx = v.vector[0]; \ michael@0: if (flags & FLAG_HAVE_NON_SOLID_MASK) \ michael@0: @@ -956,36 +997,39 @@ fast_composite_scaled_bilinear ## scale_ michael@0: repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height); \ michael@0: src1 = src_first_line + src_stride * y1; \ michael@0: src2 = src_first_line + src_stride * y2; \ michael@0: \ michael@0: if (left_pad > 0) \ michael@0: { \ michael@0: buf1[0] = buf1[1] = src1[0]; \ michael@0: buf2[0] = buf2[1] = src2[0]; \ michael@0: - scanline_func (dst, mask, \ michael@0: - buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE); \ michael@0: + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ michael@0: + scanline_buffer, mask, buf1, buf2, left_pad, weight1, weight2, \ michael@0: + 0, 0, 0, FALSE); \ 
michael@0: dst += left_pad; \ michael@0: if (flags & FLAG_HAVE_NON_SOLID_MASK) \ michael@0: mask += left_pad; \ michael@0: } \ michael@0: if (width > 0) \ michael@0: { \ michael@0: - scanline_func (dst, mask, \ michael@0: - src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE); \ michael@0: + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ michael@0: + scanline_buffer, mask, src1, src2, width, weight1, weight2, \ michael@0: + vx, unit_x, 0, FALSE); \ michael@0: dst += width; \ michael@0: if (flags & FLAG_HAVE_NON_SOLID_MASK) \ michael@0: mask += width; \ michael@0: } \ michael@0: if (right_pad > 0) \ michael@0: { \ michael@0: buf1[0] = buf1[1] = src1[src_image->bits.width - 1]; \ michael@0: buf2[0] = buf2[1] = src2[src_image->bits.width - 1]; \ michael@0: - scanline_func (dst, mask, \ michael@0: - buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE); \ michael@0: + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ michael@0: + scanline_buffer, mask, buf1, buf2, right_pad, weight1, weight2, \ michael@0: + 0, 0, 0, FALSE); \ michael@0: } \ michael@0: } \ michael@0: else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ michael@0: { \ michael@0: src_type_t *src1, *src2; \ michael@0: src_type_t buf1[2]; \ michael@0: src_type_t buf2[2]; \ michael@0: /* handle top/bottom zero padding by just setting weights to 0 if needed */ \ michael@0: @@ -1011,64 +1055,67 @@ fast_composite_scaled_bilinear ## scale_ michael@0: } \ michael@0: src1 = src_first_line + src_stride * y1; \ michael@0: src2 = src_first_line + src_stride * y2; \ michael@0: \ michael@0: if (left_pad > 0) \ michael@0: { \ michael@0: buf1[0] = buf1[1] = 0; \ michael@0: buf2[0] = buf2[1] = 0; \ michael@0: - scanline_func (dst, mask, \ michael@0: - buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE); \ michael@0: + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ michael@0: + scanline_buffer, mask, buf1, 
buf2, left_pad, weight1, weight2, \ michael@0: + 0, 0, 0, TRUE); \ michael@0: dst += left_pad; \ michael@0: if (flags & FLAG_HAVE_NON_SOLID_MASK) \ michael@0: mask += left_pad; \ michael@0: } \ michael@0: if (left_tz > 0) \ michael@0: { \ michael@0: buf1[0] = 0; \ michael@0: buf1[1] = src1[0]; \ michael@0: buf2[0] = 0; \ michael@0: buf2[1] = src2[0]; \ michael@0: - scanline_func (dst, mask, \ michael@0: - buf1, buf2, left_tz, weight1, weight2, \ michael@0: + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ michael@0: + scanline_buffer, mask, buf1, buf2, left_tz, weight1, weight2, \ michael@0: pixman_fixed_frac (vx), unit_x, 0, FALSE); \ michael@0: dst += left_tz; \ michael@0: if (flags & FLAG_HAVE_NON_SOLID_MASK) \ michael@0: mask += left_tz; \ michael@0: vx += left_tz * unit_x; \ michael@0: } \ michael@0: if (width > 0) \ michael@0: { \ michael@0: - scanline_func (dst, mask, \ michael@0: - src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE); \ michael@0: + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ michael@0: + scanline_buffer, mask, src1, src2, width, weight1, weight2, \ michael@0: + vx, unit_x, 0, FALSE); \ michael@0: dst += width; \ michael@0: if (flags & FLAG_HAVE_NON_SOLID_MASK) \ michael@0: mask += width; \ michael@0: vx += width * unit_x; \ michael@0: } \ michael@0: if (right_tz > 0) \ michael@0: { \ michael@0: buf1[0] = src1[src_image->bits.width - 1]; \ michael@0: buf1[1] = 0; \ michael@0: buf2[0] = src2[src_image->bits.width - 1]; \ michael@0: buf2[1] = 0; \ michael@0: - scanline_func (dst, mask, \ michael@0: - buf1, buf2, right_tz, weight1, weight2, \ michael@0: + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ michael@0: + scanline_buffer, mask, buf1, buf2, right_tz, weight1, weight2, \ michael@0: pixman_fixed_frac (vx), unit_x, 0, FALSE); \ michael@0: dst += right_tz; \ michael@0: if (flags & FLAG_HAVE_NON_SOLID_MASK) \ michael@0: mask += 
right_tz; \ michael@0: } \ michael@0: if (right_pad > 0) \ michael@0: { \ michael@0: buf1[0] = buf1[1] = 0; \ michael@0: buf2[0] = buf2[1] = 0; \ michael@0: - scanline_func (dst, mask, \ michael@0: - buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE); \ michael@0: + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ michael@0: + scanline_buffer, mask, buf1, buf2, right_pad, weight1, weight2, \ michael@0: + 0, 0, 0, TRUE); \ michael@0: } \ michael@0: } \ michael@0: else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ michael@0: { \ michael@0: int32_t num_pixels; \ michael@0: int32_t width_remain; \ michael@0: src_type_t * src_line_top; \ michael@0: src_type_t * src_line_bottom; \ michael@0: @@ -1120,17 +1167,18 @@ fast_composite_scaled_bilinear ## scale_ michael@0: * vx is in range [0, src_width_fixed - pixman_fixed_e] \ michael@0: * So we are safe from overflow. \ michael@0: */ \ michael@0: num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1; \ michael@0: \ michael@0: if (num_pixels > width_remain) \ michael@0: num_pixels = width_remain; \ michael@0: \ michael@0: - scanline_func (dst, mask, buf1, buf2, num_pixels, \ michael@0: + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, \ michael@0: + dst, scanline_buffer, mask, buf1, buf2, num_pixels, \ michael@0: weight1, weight2, pixman_fixed_frac(vx), \ michael@0: unit_x, src_width_fixed, FALSE); \ michael@0: \ michael@0: width_remain -= num_pixels; \ michael@0: vx += num_pixels * unit_x; \ michael@0: dst += num_pixels; \ michael@0: \ michael@0: if (flags & FLAG_HAVE_NON_SOLID_MASK) \ michael@0: @@ -1149,41 +1197,47 @@ fast_composite_scaled_bilinear ## scale_ michael@0: * So we are safe from overflow here. 
\ michael@0: */ \ michael@0: num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e) \ michael@0: / unit_x) + 1; \ michael@0: \ michael@0: if (num_pixels > width_remain) \ michael@0: num_pixels = width_remain; \ michael@0: \ michael@0: - scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels, \ michael@0: - weight1, weight2, vx, unit_x, src_width_fixed, FALSE); \ michael@0: + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, \ michael@0: + dst, scanline_buffer, mask, src_line_top, src_line_bottom, \ michael@0: + num_pixels, weight1, weight2, vx, unit_x, src_width_fixed, \ michael@0: + FALSE); \ michael@0: \ michael@0: width_remain -= num_pixels; \ michael@0: vx += num_pixels * unit_x; \ michael@0: dst += num_pixels; \ michael@0: \ michael@0: if (flags & FLAG_HAVE_NON_SOLID_MASK) \ michael@0: mask += num_pixels; \ michael@0: } \ michael@0: } \ michael@0: } \ michael@0: else \ michael@0: { \ michael@0: - scanline_func (dst, mask, src_first_line + src_stride * y1, \ michael@0: + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ michael@0: + scanline_buffer, mask, \ michael@0: + src_first_line + src_stride * y1, \ michael@0: src_first_line + src_stride * y2, width, \ michael@0: weight1, weight2, vx, unit_x, max_vx, FALSE); \ michael@0: } \ michael@0: } \ michael@0: + if (scanline_buffer != (uint8_t *) stack_scanline_buffer) \ michael@0: + free (scanline_buffer); \ michael@0: } michael@0: michael@0: /* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ michael@0: -#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, \ michael@0: +#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, fetch_func, op_func, src_type_t, mask_type_t,\ michael@0: dst_type_t, repeat_mode, flags) \ michael@0: - FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\ michael@0: + 
FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, fetch_func, op_func, src_type_t, mask_type_t,\ michael@0: dst_type_t, repeat_mode, flags) michael@0: michael@0: #define SCALED_BILINEAR_FLAGS \ michael@0: (FAST_PATH_SCALE_TRANSFORM | \ michael@0: FAST_PATH_NO_ALPHA_MAP | \ michael@0: FAST_PATH_BILINEAR_FILTER | \ michael@0: FAST_PATH_NO_ACCESSORS | \ michael@0: FAST_PATH_NARROW_FORMAT) michael@0: diff --git a/gfx/cairo/libpixman/src/pixman-sse2.c b/gfx/cairo/libpixman/src/pixman-sse2.c michael@0: --- a/gfx/cairo/libpixman/src/pixman-sse2.c michael@0: +++ b/gfx/cairo/libpixman/src/pixman-sse2.c michael@0: @@ -5404,30 +5404,33 @@ scaled_bilinear_scanline_sse2_8888_8888_ michael@0: if (w & 1) michael@0: { michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); michael@0: *dst = pix1; michael@0: } michael@0: michael@0: } michael@0: michael@0: +/* Add extra NULL argument to the existing bilinear fast paths to indicate michael@0: + * that we don't need two-pass processing */ michael@0: + michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, michael@0: - scaled_bilinear_scanline_sse2_8888_8888_SRC, michael@0: + scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: COVER, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, michael@0: - scaled_bilinear_scanline_sse2_8888_8888_SRC, michael@0: + scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: PAD, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, michael@0: - scaled_bilinear_scanline_sse2_8888_8888_SRC, michael@0: + scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: NONE, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC, michael@0: - scaled_bilinear_scanline_sse2_8888_8888_SRC, michael@0: + scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, michael@0: uint32_t, uint32_t, uint32_t, 
michael@0: NORMAL, FLAG_NONE) michael@0: michael@0: static force_inline void michael@0: scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst, michael@0: const uint32_t * mask, michael@0: const uint32_t * src_top, michael@0: const uint32_t * src_bottom, michael@0: @@ -5505,32 +5508,66 @@ scaled_bilinear_scanline_sse2_8888_8888_ michael@0: } michael@0: michael@0: w--; michael@0: dst++; michael@0: } michael@0: } michael@0: michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER, michael@0: - scaled_bilinear_scanline_sse2_8888_8888_OVER, michael@0: + scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: COVER, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER, michael@0: - scaled_bilinear_scanline_sse2_8888_8888_OVER, michael@0: + scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: PAD, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER, michael@0: - scaled_bilinear_scanline_sse2_8888_8888_OVER, michael@0: + scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: NONE, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER, michael@0: - scaled_bilinear_scanline_sse2_8888_8888_OVER, michael@0: + scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: NORMAL, FLAG_NONE) michael@0: michael@0: + michael@0: +/* An example of SSE2 two-stage bilinear_over_8888_0565 fast path, which is implemented michael@0: + as scaled_bilinear_scanline_sse2_8888_8888_SRC + op_bilinear_over_8888_0565 */ michael@0: + michael@0: +void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width) michael@0: +{ michael@0: + /* Note: this is not really fast and should be based on 8 pixel loop from sse2_composite_over_8888_0565 */ michael@0: + while (--width >= 0) 
michael@0: + { michael@0: + *dst = composite_over_8888_0565pixel (*src, *dst); michael@0: + src++; michael@0: + dst++; michael@0: + } michael@0: +} michael@0: + michael@0: +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_cover_OVER, michael@0: + scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, michael@0: + uint32_t, uint32_t, uint16_t, michael@0: + COVER, FLAG_NONE) michael@0: +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER, michael@0: + scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, michael@0: + uint32_t, uint32_t, uint16_t, michael@0: + PAD, FLAG_NONE) michael@0: +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER, michael@0: + scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, michael@0: + uint32_t, uint32_t, uint16_t, michael@0: + NONE, FLAG_NONE) michael@0: +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER, michael@0: + scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, michael@0: + uint32_t, uint32_t, uint16_t, michael@0: + NORMAL, FLAG_NONE) michael@0: + michael@0: +/*****************************/ michael@0: + michael@0: static force_inline void michael@0: scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst, michael@0: const uint8_t * mask, michael@0: const uint32_t * src_top, michael@0: const uint32_t * src_bottom, michael@0: int32_t w, michael@0: int wt, michael@0: int wb, michael@0: @@ -5669,29 +5706,29 @@ scaled_bilinear_scanline_sse2_8888_8_888 michael@0: } michael@0: michael@0: w--; michael@0: dst++; michael@0: } michael@0: } michael@0: michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER, michael@0: - scaled_bilinear_scanline_sse2_8888_8_8888_OVER, michael@0: + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, michael@0: uint32_t, uint8_t, uint32_t, michael@0: COVER, FLAG_HAVE_NON_SOLID_MASK) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER, michael@0: - 
scaled_bilinear_scanline_sse2_8888_8_8888_OVER, michael@0: + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, michael@0: uint32_t, uint8_t, uint32_t, michael@0: PAD, FLAG_HAVE_NON_SOLID_MASK) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER, michael@0: - scaled_bilinear_scanline_sse2_8888_8_8888_OVER, michael@0: + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, michael@0: uint32_t, uint8_t, uint32_t, michael@0: NONE, FLAG_HAVE_NON_SOLID_MASK) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER, michael@0: - scaled_bilinear_scanline_sse2_8888_8_8888_OVER, michael@0: + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, michael@0: uint32_t, uint8_t, uint32_t, michael@0: NORMAL, FLAG_HAVE_NON_SOLID_MASK) michael@0: michael@0: static const pixman_fast_path_t sse2_fast_paths[] = michael@0: { michael@0: /* PIXMAN_OP_OVER */ michael@0: PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565), michael@0: PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565), michael@0: @@ -5808,16 +5845,21 @@ static const pixman_fast_path_t sse2_fas michael@0: SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), michael@0: SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), michael@0: michael@0: SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888), michael@0: SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888), michael@0: SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888), michael@0: SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888), michael@0: michael@0: + /* and here the needed entries are added to the fast path table */ michael@0: + michael@0: + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565), michael@0: + SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565), michael@0: + michael@0: { PIXMAN_OP_NONE }, michael@0: }; 
michael@0: michael@0: static pixman_bool_t michael@0: sse2_blt (pixman_implementation_t *imp, michael@0: uint32_t * src_bits, michael@0: uint32_t * dst_bits, michael@0: int src_stride, michael@0: