gfx/cairo/pixman-8888-over-565.patch

Tue, 06 Jan 2015 21:39:09 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Tue, 06 Jan 2015 21:39:09 +0100
branch
TOR_BUG_9701
changeset 8
97036ab72558
permissions
-rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate.
This solves Tor bug #9701, complying with the disk avoidance policy documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 changeset:   96613:3e003f0b8026
     2 tag:         2pass
     3 tag:         qbase
     4 tag:         qtip
     5 tag:         tip
     6 user:        Jeff Muizelaar <jmuizelaar@mozilla.com>
     7 date:        Thu May 17 19:23:53 2012 -0400
     8 summary:     Bug 757878. Add a fast path for 8888_over_565 with NEON. r=bgirard,joe
    10 diff --git a/gfx/cairo/libpixman/src/pixman-arm-common.h b/gfx/cairo/libpixman/src/pixman-arm-common.h
    11 --- a/gfx/cairo/libpixman/src/pixman-arm-common.h
    12 +++ b/gfx/cairo/libpixman/src/pixman-arm-common.h
    13 @@ -355,26 +355,26 @@ scaled_bilinear_scanline_##cputype##_##n
    14      if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
    15  	return;                                                               \
    16      pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
    17                              dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \
    18  }                                                                             \
    19                                                                                \
    20  FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
    21                         scaled_bilinear_scanline_##cputype##_##name##_##op,    \
    22 -                       src_type, uint32_t, dst_type, COVER, FLAG_NONE)        \
    23 +                       NULL, src_type, uint32_t, dst_type, COVER, FLAG_NONE)  \
    24  FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
    25                         scaled_bilinear_scanline_##cputype##_##name##_##op,    \
    26 -                       src_type, uint32_t, dst_type, NONE, FLAG_NONE)         \
    27 +                       NULL, src_type, uint32_t, dst_type, NONE, FLAG_NONE)   \
    28  FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
    29                         scaled_bilinear_scanline_##cputype##_##name##_##op,    \
    30 -                       src_type, uint32_t, dst_type, PAD, FLAG_NONE)          \
    31 +                       NULL, src_type, uint32_t, dst_type, PAD, FLAG_NONE)    \
    32  FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \
    33                         scaled_bilinear_scanline_##cputype##_##name##_##op,    \
    34 -                       src_type, uint32_t, dst_type, NORMAL,                  \
    35 +                       NULL, src_type, uint32_t, dst_type, NORMAL,            \
    36                         FLAG_NONE)
    39  #define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST(flags, cputype, name, op,  \
    40                                                  src_type, dst_type)           \
    41  void                                                                          \
    42  pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (               \
    43                                                  dst_type *       dst,         \
    44 @@ -404,25 +404,25 @@ scaled_bilinear_scanline_##cputype##_##n
    45      if ((flags & SKIP_ZERO_SRC) && zero_src)                                  \
    46  	return;                                                                   \
    47      pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype (           \
    48                        dst, mask, src_top, src_bottom, wt, wb, vx, unit_x, w); \
    49  }                                                                             \
    50                                                                                \
    51  FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op,                 \
    52                         scaled_bilinear_scanline_##cputype##_##name##_##op,    \
    53 -                       src_type, uint8_t, dst_type, COVER,                    \
    54 +                       NULL, src_type, uint8_t, dst_type, COVER,              \
    55                         FLAG_HAVE_NON_SOLID_MASK)                              \
    56  FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op,                  \
    57                         scaled_bilinear_scanline_##cputype##_##name##_##op,    \
    58 -                       src_type, uint8_t, dst_type, NONE,                     \
    59 +                       NULL, src_type, uint8_t, dst_type, NONE,               \
    60                         FLAG_HAVE_NON_SOLID_MASK)                              \
    61  FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op,                   \
    62                         scaled_bilinear_scanline_##cputype##_##name##_##op,    \
    63 -                       src_type, uint8_t, dst_type, PAD,                      \
    64 +                       NULL, src_type, uint8_t, dst_type, PAD,                \
    65                         FLAG_HAVE_NON_SOLID_MASK)                              \
    66  FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op,                \
    67                         scaled_bilinear_scanline_##cputype##_##name##_##op,    \
    68 -                       src_type, uint8_t, dst_type, NORMAL,                   \
    69 +                       NULL, src_type, uint8_t, dst_type, NORMAL,             \
    70                         FLAG_HAVE_NON_SOLID_MASK)
    73  #endif
    74 diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon.c b/gfx/cairo/libpixman/src/pixman-arm-neon.c
    75 --- a/gfx/cairo/libpixman/src/pixman-arm-neon.c
    76 +++ b/gfx/cairo/libpixman/src/pixman-arm-neon.c
    77 @@ -140,16 +140,33 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST 
    78  PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC,
    79                                           uint32_t, uint16_t)
    80  PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,
    81                                           uint16_t, uint32_t)
    82  PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,
    83                                           uint16_t, uint16_t)
    84  PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, OVER,
    85                                           uint32_t, uint32_t)
    86 +static force_inline void
    87 +pixman_scaled_bilinear_scanline_8888_8888_SRC (
    88 +                                                uint32_t *       dst,
    89 +                                                const uint32_t * mask,
    90 +                                                const uint32_t * src_top,
    91 +                                                const uint32_t * src_bottom,
    92 +                                                int32_t          w,
    93 +                                                int              wt,
    94 +                                                int              wb,
    95 +                                                pixman_fixed_t   vx,
    96 +                                                pixman_fixed_t   unit_x,
    97 +                                                pixman_fixed_t   max_vx,
    98 +                                                pixman_bool_t    zero_src)
    99 +{
   100 +    pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top, src_bottom, wt, wb, vx, unit_x, w);
   101 +}
   102 +
   103  PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, ADD,
   104                                           uint32_t, uint32_t)
   106  PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_8888, SRC,
   107                                              uint32_t, uint32_t)
   108  PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_0565, SRC,
   109                                              uint32_t, uint16_t)
   110  PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_x888, SRC,
   111 @@ -261,16 +278,38 @@ pixman_blt_neon (uint32_t *src_bits,
   112  		(uint32_t *)(((char *) src_bits) +
   113  		src_y * src_stride * 4 + src_x * 4), src_stride);
   114  	return TRUE;
   115      default:
   116  	return FALSE;
   117      }
   118  }
   120 +static inline void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width)
   121 +{
   122 +    pixman_composite_over_8888_0565_asm_neon (width, 1, dst, 0, src, 0);
   123 +}
   124 +
   125 +FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_cover_OVER,
   126 +			       pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565,
   127 +			       uint32_t, uint32_t, uint16_t,
   128 +			       COVER, FLAG_NONE)
   129 +FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_pad_OVER,
   130 +			       pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565,
   131 +			       uint32_t, uint32_t, uint16_t,
   132 +			       PAD, FLAG_NONE)
   133 +FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_none_OVER,
   134 +			       pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565,
   135 +			       uint32_t, uint32_t, uint16_t,
   136 +			       NONE, FLAG_NONE)
   137 +FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_normal_OVER,
   138 +			       pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565,
   139 +			       uint32_t, uint32_t, uint16_t,
   140 +			       NORMAL, FLAG_NONE)
   141 +
   142  static const pixman_fast_path_t arm_neon_fast_paths[] =
   143  {
   144      PIXMAN_STD_FAST_PATH (SRC,  r5g6b5,   null,     r5g6b5,   neon_composite_src_0565_0565),
   145      PIXMAN_STD_FAST_PATH (SRC,  b5g6r5,   null,     b5g6r5,   neon_composite_src_0565_0565),
   146      PIXMAN_STD_FAST_PATH (SRC,  a8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
   147      PIXMAN_STD_FAST_PATH (SRC,  x8r8g8b8, null,     r5g6b5,   neon_composite_src_8888_0565),
   148      PIXMAN_STD_FAST_PATH (SRC,  a8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
   149      PIXMAN_STD_FAST_PATH (SRC,  x8b8g8r8, null,     b5g6r5,   neon_composite_src_8888_0565),
   150 @@ -414,16 +453,18 @@ static const pixman_fast_path_t arm_neon
   151      SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_8_0565),
   153      SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
   154      SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
   156      SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
   157      SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
   159 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565),
   160 +
   161      { PIXMAN_OP_NONE },
   162  };
   164  static pixman_bool_t
   165  arm_neon_blt (pixman_implementation_t *imp,
   166                uint32_t *               src_bits,
   167                uint32_t *               dst_bits,
   168                int                      src_stride,
   169 diff --git a/gfx/cairo/libpixman/src/pixman-fast-path.c b/gfx/cairo/libpixman/src/pixman-fast-path.c
   170 --- a/gfx/cairo/libpixman/src/pixman-fast-path.c
   171 +++ b/gfx/cairo/libpixman/src/pixman-fast-path.c
   172 @@ -1356,63 +1356,63 @@ scaled_bilinear_scanline_565_565_SRC (ui
   173          vx += unit_x;
   174          *dst++ = d;
   175      }
   176  }
   178  #endif
   180  FAST_BILINEAR_MAINLOOP_COMMON (565_565_cover_SRC,
   181 -			       scaled_bilinear_scanline_565_565_SRC,
   182 +			       scaled_bilinear_scanline_565_565_SRC, NULL,
   183  			       uint16_t, uint32_t, uint16_t,
   184  			       COVER, FLAG_NONE)
   185  FAST_BILINEAR_MAINLOOP_COMMON (565_565_pad_SRC,
   186 -			       scaled_bilinear_scanline_565_565_SRC,
   187 +			       scaled_bilinear_scanline_565_565_SRC, NULL,
   188  			       uint16_t, uint32_t, uint16_t,
   189  			       PAD, FLAG_NONE)
   190  FAST_BILINEAR_MAINLOOP_COMMON (565_565_none_SRC,
   191 -			       scaled_bilinear_scanline_565_565_SRC,
   192 +			       scaled_bilinear_scanline_565_565_SRC, NULL,
   193  			       uint16_t, uint32_t, uint16_t,
   194  			       NONE, FLAG_NONE)
   195  FAST_BILINEAR_MAINLOOP_COMMON (565_565_normal_SRC,
   196 -			       scaled_bilinear_scanline_565_565_SRC,
   197 +			       scaled_bilinear_scanline_565_565_SRC, NULL,
   198  			       uint16_t, uint32_t, uint16_t,
   199  			       NORMAL, FLAG_NONE)
   201  FAST_BILINEAR_MAINLOOP_COMMON (8888_565_cover_OVER,
   202 -			       scaled_bilinear_scanline_8888_565_OVER,
   203 +			       scaled_bilinear_scanline_8888_565_OVER, NULL,
   204  			       uint32_t, uint32_t, uint16_t,
   205  			       COVER, FLAG_NONE)
   206  FAST_BILINEAR_MAINLOOP_COMMON (8888_565_pad_OVER,
   207 -			       scaled_bilinear_scanline_8888_565_OVER,
   208 +			       scaled_bilinear_scanline_8888_565_OVER, NULL,
   209  			       uint32_t, uint32_t, uint16_t,
   210  			       PAD, FLAG_NONE)
   211  FAST_BILINEAR_MAINLOOP_COMMON (8888_565_none_OVER,
   212 -			       scaled_bilinear_scanline_8888_565_OVER,
   213 +			       scaled_bilinear_scanline_8888_565_OVER, NULL,
   214  			       uint32_t, uint32_t, uint16_t,
   215  			       NONE, FLAG_NONE)
   216  FAST_BILINEAR_MAINLOOP_COMMON (8888_565_normal_OVER,
   217 -			       scaled_bilinear_scanline_8888_565_OVER,
   218 +			       scaled_bilinear_scanline_8888_565_OVER, NULL,
   219  			       uint32_t, uint32_t, uint16_t,
   220  			       NORMAL, FLAG_NONE)
   222  FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_cover_OVER,
   223 -			       scaled_bilinear_scanline_8888_8888_OVER,
   224 +			       scaled_bilinear_scanline_8888_8888_OVER, NULL,
   225  			       uint32_t, uint32_t, uint32_t,
   226  			       COVER, FLAG_NONE)
   227  FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_pad_OVER,
   228 -			       scaled_bilinear_scanline_8888_8888_OVER,
   229 +			       scaled_bilinear_scanline_8888_8888_OVER, NULL,
   230  			       uint32_t, uint32_t, uint32_t,
   231  			       PAD, FLAG_NONE)
   232  FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_none_OVER,
   233 -			       scaled_bilinear_scanline_8888_8888_OVER,
   234 +			       scaled_bilinear_scanline_8888_8888_OVER, NULL,
   235  			       uint32_t, uint32_t, uint32_t,
   236  			       NONE, FLAG_NONE)
   237  FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_normal_OVER,
   238 -			       scaled_bilinear_scanline_8888_8888_OVER,
   239 +			       scaled_bilinear_scanline_8888_8888_OVER, NULL,
   240  			       uint32_t, uint32_t, uint32_t,
   241  			       NORMAL, FLAG_NONE)
   243  #define REPEAT_MIN_WIDTH    32
   245  static void
   246  fast_composite_tiled_repeat (pixman_implementation_t *imp,
   247  			     pixman_composite_info_t *info)
   248 diff --git a/gfx/cairo/libpixman/src/pixman-inlines.h b/gfx/cairo/libpixman/src/pixman-inlines.h
   249 --- a/gfx/cairo/libpixman/src/pixman-inlines.h
   250 +++ b/gfx/cairo/libpixman/src/pixman-inlines.h
   251 @@ -816,18 +816,48 @@ bilinear_pad_repeat_get_scanline_bounds 
   252   *
   253   * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256,
   254   *       but sometimes it may be less than that for NONE repeat when handling
   255   *       fuzzy antialiased top or bottom image edges. Also both top and
   256   *       bottom weight variables are guaranteed to have value in 0-255
   257   *       range and can fit into unsigned byte or be used with 8-bit SIMD
   258   *       multiplication instructions.
   259   */
   260 -#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
   261 -				  dst_type_t, repeat_mode, flags)				\
   262 +
    263 +/* Replace a single "scanline_func" with "fetch_func" & "op_func" to allow optional
    264 + * two-stage processing (bilinear fetch to a temp buffer, followed by an unscaled
    265 + * combine). "op_func" may be NULL, in which case the old behavior is kept.
    266 + * This is ugly and gcc issues some warnings, but it works.
    267 + *
    268 + * A tip: clang has much better error reporting than gcc for deeply nested macros.
   269 + */
   270 +
   271 +#define	scanline_func(dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,            \
   272 +                      scanline_buf, mask, src_top, src_bottom, width,                           \
   273 +                      weight_top, weight_bottom, vx, unit_x, max_vx, zero_src)                  \
   274 + do {                                                                                           \
   275 +		if (op_func != NULL)								\
   276 +		{										\
   277 +		    fetch_func ((void *)scanline_buf, (mask), (src_top), (src_bottom), (width), \
   278 +                        (weight_top), (weight_bottom), (vx), (unit_x), (max_vx), (zero_src));   \
   279 +		    ((void (*)(dst_type_t *, const mask_type_t *, const src_type_t *, int)) op_func)\
   280 +			((dst), (mask), (src_type_t *)scanline_buf, (width));			\
   281 +		}										\
   282 +		else										\
   283 +		{										\
   284 +		    fetch_func ((void*)(dst), (mask), (src_top), (src_bottom), (width), (weight_top),  \
   285 +                                (weight_bottom), (vx), (unit_x), (max_vx), (zero_src));         \
   286 +		}                                                                               \
   287 +  } while (0)
   288 +
   289 +
   290 +#define SCANLINE_BUFFER_LENGTH 3072
   291 +
   292 +#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, fetch_func, op_func, src_type_t,		\
   293 +				  mask_type_t, dst_type_t, repeat_mode, flags)			\
   294  static void											\
   295  fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,		\
   296  						   pixman_composite_info_t *info)		\
   297  {												\
   298      PIXMAN_COMPOSITE_ARGS (info);								\
   299      dst_type_t *dst_line;									\
   300      mask_type_t *mask_line;									\
   301      src_type_t *src_first_line;									\
   302 @@ -842,16 +872,19 @@ fast_composite_scaled_bilinear ## scale_
   303      mask_type_t solid_mask;									\
   304      const mask_type_t *mask = &solid_mask;							\
   305      int src_stride, mask_stride, dst_stride;							\
   306  												\
   307      int src_width;										\
   308      pixman_fixed_t src_width_fixed;								\
   309      int max_x;											\
   310      pixman_bool_t need_src_extension;								\
   311 +                                                                                                \
   312 +    uint64_t stack_scanline_buffer[SCANLINE_BUFFER_LENGTH];                                     \
   313 +    uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer;                               \
   314  												\
   315      PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
   316      if (flags & FLAG_HAVE_SOLID_MASK)								\
   317      {												\
   318  	solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
   319  	mask_stride = 0;									\
   320      }												\
   321      else if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
   322 @@ -914,16 +947,24 @@ fast_composite_scaled_bilinear ## scale_
   323  	else											\
   324  	{											\
   325  	    src_width = src_image->bits.width;							\
   326  	    need_src_extension = FALSE;								\
   327  	}											\
   328  												\
   329  	src_width_fixed = pixman_int_to_fixed (src_width);					\
   330      }												\
   331 +                                                                                                \
   332 +    if (op_func != NULL && width * sizeof(src_type_t) > sizeof(stack_scanline_buffer))          \
   333 +    {                                                                                           \
   334 +	scanline_buffer = pixman_malloc_ab (width, sizeof(src_type_t));                         \
   335 +                                                                                                \
   336 +	if (!scanline_buffer)                                                                   \
   337 +	    return;                                                                             \
   338 +    }                                                                                           \
   339  												\
   340      while (--height >= 0)									\
   341      {												\
   342  	int weight1, weight2;									\
   343  	dst = dst_line;										\
   344  	dst_line += dst_stride;									\
   345  	vx = v.vector[0];									\
   346  	if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
   347 @@ -956,36 +997,39 @@ fast_composite_scaled_bilinear ## scale_
   348  	    repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height);				\
   349  	    src1 = src_first_line + src_stride * y1;						\
   350  	    src2 = src_first_line + src_stride * y2;						\
   351  												\
   352  	    if (left_pad > 0)									\
   353  	    {											\
   354  		buf1[0] = buf1[1] = src1[0];							\
   355  		buf2[0] = buf2[1] = src2[0];							\
   356 -		scanline_func (dst, mask,							\
   357 -			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE);		\
   358 +		scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,	\
   359 +			       scanline_buffer, mask, buf1, buf2, left_pad, weight1, weight2,   \
   360 +                               0, 0, 0, FALSE);	                                                \
   361  		dst += left_pad;								\
   362  		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
   363  		    mask += left_pad;								\
   364  	    }											\
   365  	    if (width > 0)									\
   366  	    {											\
   367 -		scanline_func (dst, mask,							\
   368 -			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
   369 +		scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,	\
   370 +			       scanline_buffer, mask, src1, src2, width, weight1, weight2,      \
   371 +                               vx, unit_x, 0, FALSE);                                           \
   372  		dst += width;									\
   373  		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
   374  		    mask += width;								\
   375  	    }											\
   376  	    if (right_pad > 0)									\
   377  	    {											\
   378  		buf1[0] = buf1[1] = src1[src_image->bits.width - 1];				\
   379  		buf2[0] = buf2[1] = src2[src_image->bits.width - 1];				\
   380 -		scanline_func (dst, mask,							\
   381 -			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE);	\
   382 +		scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,	\
   383 +			       scanline_buffer, mask, buf1, buf2, right_pad, weight1, weight2,  \
   384 +                               0, 0, 0, FALSE);                                                 \
   385  	    }											\
   386  	}											\
   387  	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
   388  	{											\
   389  	    src_type_t *src1, *src2;								\
   390  	    src_type_t buf1[2];									\
   391  	    src_type_t buf2[2];									\
   392  	    /* handle top/bottom zero padding by just setting weights to 0 if needed */		\
   393 @@ -1011,64 +1055,67 @@ fast_composite_scaled_bilinear ## scale_
   394  	    }											\
   395  	    src1 = src_first_line + src_stride * y1;						\
   396  	    src2 = src_first_line + src_stride * y2;						\
   397  												\
   398  	    if (left_pad > 0)									\
   399  	    {											\
   400  		buf1[0] = buf1[1] = 0;								\
   401  		buf2[0] = buf2[1] = 0;								\
   402 -		scanline_func (dst, mask,							\
   403 -			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE);		\
   404 +		scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,	\
   405 +			       scanline_buffer, mask, buf1, buf2, left_pad, weight1, weight2,   \
   406 +                               0, 0, 0, TRUE);	                                                \
   407  		dst += left_pad;								\
   408  		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
   409  		    mask += left_pad;								\
   410  	    }											\
   411  	    if (left_tz > 0)									\
   412  	    {											\
   413  		buf1[0] = 0;									\
   414  		buf1[1] = src1[0];								\
   415  		buf2[0] = 0;									\
   416  		buf2[1] = src2[0];								\
   417 -		scanline_func (dst, mask,							\
   418 -			       buf1, buf2, left_tz, weight1, weight2,				\
   419 +		scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,	\
   420 +			       scanline_buffer, mask, buf1, buf2, left_tz, weight1, weight2,	\
   421  			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
   422  		dst += left_tz;									\
   423  		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
   424  		    mask += left_tz;								\
   425  		vx += left_tz * unit_x;								\
   426  	    }											\
   427  	    if (width > 0)									\
   428  	    {											\
   429 -		scanline_func (dst, mask,							\
   430 -			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
   431 +		scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,	\
   432 +			       scanline_buffer, mask, src1, src2, width, weight1, weight2,      \
   433 +                               vx, unit_x, 0, FALSE);                                           \
   434  		dst += width;									\
   435  		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
   436  		    mask += width;								\
   437  		vx += width * unit_x;								\
   438  	    }											\
   439  	    if (right_tz > 0)									\
   440  	    {											\
   441  		buf1[0] = src1[src_image->bits.width - 1];					\
   442  		buf1[1] = 0;									\
   443  		buf2[0] = src2[src_image->bits.width - 1];					\
   444  		buf2[1] = 0;									\
   445 -		scanline_func (dst, mask,							\
   446 -			       buf1, buf2, right_tz, weight1, weight2,				\
   447 +		scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,	\
   448 +			       scanline_buffer, mask, buf1, buf2, right_tz, weight1, weight2,   \
   449  			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
   450  		dst += right_tz;								\
   451  		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
   452  		    mask += right_tz;								\
   453  	    }											\
   454  	    if (right_pad > 0)									\
   455  	    {											\
   456  		buf1[0] = buf1[1] = 0;								\
   457  		buf2[0] = buf2[1] = 0;								\
   458 -		scanline_func (dst, mask,							\
   459 -			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE);		\
   460 +		scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,	\
   461 +			       scanline_buffer, mask, buf1, buf2, right_pad, weight1, weight2,  \
   462 +                               0, 0, 0, TRUE);	                                                \
   463  	    }											\
   464  	}											\
   465  	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
   466  	{											\
   467  	    int32_t	    num_pixels;								\
   468  	    int32_t	    width_remain;							\
   469  	    src_type_t *    src_line_top;							\
   470  	    src_type_t *    src_line_bottom;							\
   471 @@ -1120,17 +1167,18 @@ fast_composite_scaled_bilinear ## scale_
   472  		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
   473  		     * So we are safe from overflow.						\
   474  		     */										\
   475  		    num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1;	\
   476  												\
   477  		    if (num_pixels > width_remain)						\
   478  			num_pixels = width_remain;						\
   479  												\
   480 -		    scanline_func (dst, mask, buf1, buf2, num_pixels,				\
   481 +		    scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func,    \
   482 +                                   dst, scanline_buffer, mask, buf1, buf2, num_pixels,          \
   483  				   weight1, weight2, pixman_fixed_frac(vx),			\
   484  				   unit_x, src_width_fixed, FALSE);				\
   485  												\
   486  		    width_remain -= num_pixels;							\
   487  		    vx += num_pixels * unit_x;							\
   488  		    dst += num_pixels;								\
   489  												\
   490  		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
   491 @@ -1149,41 +1197,47 @@ fast_composite_scaled_bilinear ## scale_
   492  		     * So we are safe from overflow here.					\
   493  		     */										\
   494  		    num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e)	\
   495  				  / unit_x) + 1;						\
   496  												\
   497  		    if (num_pixels > width_remain)						\
   498  			num_pixels = width_remain;						\
   499  												\
   500 -		    scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels,	\
   501 -				   weight1, weight2, vx, unit_x, src_width_fixed, FALSE);	\
   502 +		    scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func,    \
   503 +                                   dst, scanline_buffer, mask, src_line_top, src_line_bottom,   \
   504 +                                   num_pixels, weight1, weight2, vx, unit_x, src_width_fixed,   \
   505 +                                   FALSE);	                                                \
   506  												\
   507  		    width_remain -= num_pixels;							\
   508  		    vx += num_pixels * unit_x;							\
   509  		    dst += num_pixels;								\
   510  												\
   511  		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
   512  		        mask += num_pixels;							\
   513  		}										\
   514  	    }											\
   515  	}											\
   516  	else											\
   517  	{											\
   518 -	    scanline_func (dst, mask, src_first_line + src_stride * y1,				\
   519 +	    scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst,       \
   520 +                           scanline_buffer, mask,                                               \
   521 +                           src_first_line + src_stride * y1,					\
   522  			   src_first_line + src_stride * y2, width,				\
   523  			   weight1, weight2, vx, unit_x, max_vx, FALSE);			\
   524  	}											\
   525      }												\
   526 +    if (scanline_buffer != (uint8_t *) stack_scanline_buffer)                                   \
   527 +	free (scanline_buffer);                                                                 \
   528  }
   530  /* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */
   531 -#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
   532 +#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, fetch_func, op_func, src_type_t, mask_type_t,\
   533  				  dst_type_t, repeat_mode, flags)				\
   534 -	FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\
   535 +	FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, fetch_func, op_func, src_type_t, mask_type_t,\
   536  				  dst_type_t, repeat_mode, flags)
   538  #define SCALED_BILINEAR_FLAGS						\
   539      (FAST_PATH_SCALE_TRANSFORM	|					\
   540       FAST_PATH_NO_ALPHA_MAP	|					\
   541       FAST_PATH_BILINEAR_FILTER	|					\
   542       FAST_PATH_NO_ACCESSORS	|					\
   543       FAST_PATH_NARROW_FORMAT)
   544 diff --git a/gfx/cairo/libpixman/src/pixman-sse2.c b/gfx/cairo/libpixman/src/pixman-sse2.c
   545 --- a/gfx/cairo/libpixman/src/pixman-sse2.c
   546 +++ b/gfx/cairo/libpixman/src/pixman-sse2.c
   547 @@ -5404,30 +5404,33 @@ scaled_bilinear_scanline_sse2_8888_8888_
   548      if (w & 1)
   549      {
   550  	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
   551  	*dst = pix1;
   552      }
   554  }
   556 +/* The existing bilinear fast paths pass NULL as the extra op_func argument
   557 + * to indicate that they do not need two-pass processing. */
   558 +
   559  FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
   560 -			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
   561 +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
   562  			       uint32_t, uint32_t, uint32_t,
   563  			       COVER, FLAG_NONE)
   564  FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
   565 -			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
   566 +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
   567  			       uint32_t, uint32_t, uint32_t,
   568  			       PAD, FLAG_NONE)
   569  FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
   570 -			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
   571 +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
   572  			       uint32_t, uint32_t, uint32_t,
   573  			       NONE, FLAG_NONE)
   574  FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
   575 -			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
   576 +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
   577  			       uint32_t, uint32_t, uint32_t,
   578  			       NORMAL, FLAG_NONE)
   580  static force_inline void
   581  scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
   582  					      const uint32_t * mask,
   583  					      const uint32_t * src_top,
   584  					      const uint32_t * src_bottom,
   585 @@ -5505,32 +5508,66 @@ scaled_bilinear_scanline_sse2_8888_8888_
   586  	}
   588  	w--;
   589  	dst++;
   590      }
   591  }
   593  FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
   594 -			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
   595 +			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
   596  			       uint32_t, uint32_t, uint32_t,
   597  			       COVER, FLAG_NONE)
   598  FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
   599 -			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
   600 +			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
   601  			       uint32_t, uint32_t, uint32_t,
   602  			       PAD, FLAG_NONE)
   603  FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
   604 -			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
   605 +			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
   606  			       uint32_t, uint32_t, uint32_t,
   607  			       NONE, FLAG_NONE)
   608  FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
   609 -			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
   610 +			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
   611  			       uint32_t, uint32_t, uint32_t,
   612  			       NORMAL, FLAG_NONE)
   614 +
   615 +/* Example of a two-stage SSE2 fast path for bilinear-scaled OVER 8888 -> 0565: the fetch stage
   616 +   is scaled_bilinear_scanline_sse2_8888_8888_SRC and the op stage is op_bilinear_over_8888_0565. */
   617 +
   618 +static void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width)
   619 +{
   620 +    /* Second pass: composite `width` 32-bit src pixels OVER the 16-bit dst pixels; mask is unused. Scalar and slow: should be based on the 8 pixel loop from sse2_composite_over_8888_0565 */
   621 +    while (--width >= 0)
   622 +    {
   623 +	*dst = composite_over_8888_0565pixel (*src, *dst);
   624 +	src++;
   625 +	dst++;
   626 +    }
   627 +}
   628 +
   629 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_cover_OVER,
   630 +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
   631 +			       uint32_t, uint32_t, uint16_t,
   632 +			       COVER, FLAG_NONE)
   633 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER,
   634 +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
   635 +			       uint32_t, uint32_t, uint16_t,
   636 +			       PAD, FLAG_NONE)
   637 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER,
   638 +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
   639 +			       uint32_t, uint32_t, uint16_t,
   640 +			       NONE, FLAG_NONE)
   641 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER,
   642 +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
   643 +			       uint32_t, uint32_t, uint16_t,
   644 +			       NORMAL, FLAG_NONE)
   645 +
   646 +/*****************************/
   647 +
   648  static force_inline void
   649  scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
   650  						const uint8_t  * mask,
   651  						const uint32_t * src_top,
   652  						const uint32_t * src_bottom,
   653  						int32_t          w,
   654  						int              wt,
   655  						int              wb,
   656 @@ -5669,29 +5706,29 @@ scaled_bilinear_scanline_sse2_8888_8_888
   657  	}
   659  	w--;
   660  	dst++;
   661      }
   662  }
   664  FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
   665 -			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
   666 +			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
   667  			       uint32_t, uint8_t, uint32_t,
   668  			       COVER, FLAG_HAVE_NON_SOLID_MASK)
   669  FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
   670 -			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
   671 +			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
   672  			       uint32_t, uint8_t, uint32_t,
   673  			       PAD, FLAG_HAVE_NON_SOLID_MASK)
   674  FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
   675 -			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
   676 +			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
   677  			       uint32_t, uint8_t, uint32_t,
   678  			       NONE, FLAG_HAVE_NON_SOLID_MASK)
   679  FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
   680 -			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
   681 +			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
   682  			       uint32_t, uint8_t, uint32_t,
   683  			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
   685  static const pixman_fast_path_t sse2_fast_paths[] =
   686  {
   687      /* PIXMAN_OP_OVER */
   688      PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
   689      PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
   690 @@ -5808,16 +5845,21 @@ static const pixman_fast_path_t sse2_fas
   691      SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
   692      SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
   694      SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
   695      SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
   696      SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
   697      SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
   699 +    /* Register the two-stage bilinear OVER 8888 -> 0565 fast paths in the fast path table. */
   700 +
   701 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565),
   702 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565),
   703 +
   704      { PIXMAN_OP_NONE },
   705  };
   707  static pixman_bool_t
   708  sse2_blt (pixman_implementation_t *imp,
   709            uint32_t *               src_bits,
   710            uint32_t *               dst_bits,
   711            int                      src_stride,

mercurial