|
1 changeset: 96613:3e003f0b8026 |
|
2 tag: 2pass |
|
3 tag: qbase |
|
4 tag: qtip |
|
5 tag: tip |
|
6 user: Jeff Muizelaar <jmuizelaar@mozilla.com> |
|
7 date: Thu May 17 19:23:53 2012 -0400 |
|
8 summary: Bug 757878. Add a fast path for 8888_over_565 with NEON. r=bgirard,joe |
|
9 |
|
10 diff --git a/gfx/cairo/libpixman/src/pixman-arm-common.h b/gfx/cairo/libpixman/src/pixman-arm-common.h |
|
11 --- a/gfx/cairo/libpixman/src/pixman-arm-common.h |
|
12 +++ b/gfx/cairo/libpixman/src/pixman-arm-common.h |
|
13 @@ -355,26 +355,26 @@ scaled_bilinear_scanline_##cputype##_##n |
|
14 if ((flags & SKIP_ZERO_SRC) && zero_src) \ |
|
15 return; \ |
|
16 pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \ |
|
17 dst, src_top, src_bottom, wt, wb, vx, unit_x, w); \ |
|
18 } \ |
|
19 \ |
|
20 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \ |
|
21 scaled_bilinear_scanline_##cputype##_##name##_##op, \ |
|
22 - src_type, uint32_t, dst_type, COVER, FLAG_NONE) \ |
|
23 + NULL, src_type, uint32_t, dst_type, COVER, FLAG_NONE) \ |
|
24 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op, \ |
|
25 scaled_bilinear_scanline_##cputype##_##name##_##op, \ |
|
26 - src_type, uint32_t, dst_type, NONE, FLAG_NONE) \ |
|
27 + NULL, src_type, uint32_t, dst_type, NONE, FLAG_NONE) \ |
|
28 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ |
|
29 scaled_bilinear_scanline_##cputype##_##name##_##op, \ |
|
30 - src_type, uint32_t, dst_type, PAD, FLAG_NONE) \ |
|
31 + NULL, src_type, uint32_t, dst_type, PAD, FLAG_NONE) \ |
|
32 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op, \ |
|
33 scaled_bilinear_scanline_##cputype##_##name##_##op, \ |
|
34 - src_type, uint32_t, dst_type, NORMAL, \ |
|
35 + NULL, src_type, uint32_t, dst_type, NORMAL, \ |
|
36 FLAG_NONE) |
|
37 |
|
38 |
|
39 #define PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST(flags, cputype, name, op, \ |
|
40 src_type, dst_type) \ |
|
41 void \ |
|
42 pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \ |
|
43 dst_type * dst, \ |
|
44 @@ -404,25 +404,25 @@ scaled_bilinear_scanline_##cputype##_##n |
|
45 if ((flags & SKIP_ZERO_SRC) && zero_src) \ |
|
46 return; \ |
|
47 pixman_scaled_bilinear_scanline_##name##_##op##_asm_##cputype ( \ |
|
48 dst, mask, src_top, src_bottom, wt, wb, vx, unit_x, w); \ |
|
49 } \ |
|
50 \ |
|
51 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_cover_##op, \ |
|
52 scaled_bilinear_scanline_##cputype##_##name##_##op, \ |
|
53 - src_type, uint8_t, dst_type, COVER, \ |
|
54 + NULL, src_type, uint8_t, dst_type, COVER, \ |
|
55 FLAG_HAVE_NON_SOLID_MASK) \ |
|
56 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_none_##op, \ |
|
57 scaled_bilinear_scanline_##cputype##_##name##_##op, \ |
|
58 - src_type, uint8_t, dst_type, NONE, \ |
|
59 + NULL, src_type, uint8_t, dst_type, NONE, \ |
|
60 FLAG_HAVE_NON_SOLID_MASK) \ |
|
61 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_pad_##op, \ |
|
62 scaled_bilinear_scanline_##cputype##_##name##_##op, \ |
|
63 - src_type, uint8_t, dst_type, PAD, \ |
|
64 + NULL, src_type, uint8_t, dst_type, PAD, \ |
|
65 FLAG_HAVE_NON_SOLID_MASK) \ |
|
66 FAST_BILINEAR_MAINLOOP_COMMON (cputype##_##name##_normal_##op, \ |
|
67 scaled_bilinear_scanline_##cputype##_##name##_##op, \ |
|
68 - src_type, uint8_t, dst_type, NORMAL, \ |
|
69 + NULL, src_type, uint8_t, dst_type, NORMAL, \ |
|
70 FLAG_HAVE_NON_SOLID_MASK) |
|
71 |
|
72 |
|
73 #endif |
|
74 diff --git a/gfx/cairo/libpixman/src/pixman-arm-neon.c b/gfx/cairo/libpixman/src/pixman-arm-neon.c |
|
75 --- a/gfx/cairo/libpixman/src/pixman-arm-neon.c |
|
76 +++ b/gfx/cairo/libpixman/src/pixman-arm-neon.c |
|
77 @@ -140,16 +140,33 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST |
|
78 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 8888_0565, SRC, |
|
79 uint32_t, uint16_t) |
|
80 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC, |
|
81 uint16_t, uint32_t) |
|
82 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC, |
|
83 uint16_t, uint16_t) |
|
84 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, OVER, |
|
85 uint32_t, uint32_t) |
|
86 +static force_inline void |
|
87 +pixman_scaled_bilinear_scanline_8888_8888_SRC ( |
|
88 + uint32_t * dst, |
|
89 + const uint32_t * mask, |
|
90 + const uint32_t * src_top, |
|
91 + const uint32_t * src_bottom, |
|
92 + int32_t w, |
|
93 + int wt, |
|
94 + int wb, |
|
95 + pixman_fixed_t vx, |
|
96 + pixman_fixed_t unit_x, |
|
97 + pixman_fixed_t max_vx, |
|
98 + pixman_bool_t zero_src) |
|
99 +{ |
|
100 + pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_neon (dst, src_top, src_bottom, wt, wb, vx, unit_x, w); |
|
101 +} |
|
102 + |
|
103 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, ADD, |
|
104 uint32_t, uint32_t) |
|
105 |
|
106 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_8888, SRC, |
|
107 uint32_t, uint32_t) |
|
108 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_0565, SRC, |
|
109 uint32_t, uint16_t) |
|
110 PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_x888, SRC, |
|
111 @@ -261,16 +278,38 @@ pixman_blt_neon (uint32_t *src_bits, |
|
112 (uint32_t *)(((char *) src_bits) + |
|
113 src_y * src_stride * 4 + src_x * 4), src_stride); |
|
114 return TRUE; |
|
115 default: |
|
116 return FALSE; |
|
117 } |
|
118 } |
|
119 |
|
120 +static inline void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width) |
|
121 +{ |
|
122 + pixman_composite_over_8888_0565_asm_neon (width, 1, dst, 0, src, 0); |
|
123 +} |
|
124 + |
|
125 +FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_cover_OVER, |
|
126 + pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565, |
|
127 + uint32_t, uint32_t, uint16_t, |
|
128 + COVER, FLAG_NONE) |
|
129 +FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_pad_OVER, |
|
130 + pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565, |
|
131 + uint32_t, uint32_t, uint16_t, |
|
132 + PAD, FLAG_NONE) |
|
133 +FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_none_OVER, |
|
134 + pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565, |
|
135 + uint32_t, uint32_t, uint16_t, |
|
136 + NONE, FLAG_NONE) |
|
137 +FAST_BILINEAR_MAINLOOP_COMMON (neon_8888_0565_normal_OVER, |
|
138 + pixman_scaled_bilinear_scanline_8888_8888_SRC, op_bilinear_over_8888_0565, |
|
139 + uint32_t, uint32_t, uint16_t, |
|
140 + NORMAL, FLAG_NONE) |
|
141 + |
|
142 static const pixman_fast_path_t arm_neon_fast_paths[] = |
|
143 { |
|
144 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, neon_composite_src_0565_0565), |
|
145 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, neon_composite_src_0565_0565), |
|
146 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, neon_composite_src_8888_0565), |
|
147 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, neon_composite_src_8888_0565), |
|
148 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, neon_composite_src_8888_0565), |
|
149 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, neon_composite_src_8888_0565), |
|
150 @@ -414,16 +453,18 @@ static const pixman_fast_path_t arm_neon |
|
151 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_8_0565), |
|
152 |
|
153 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8_8888), |
|
154 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8_8888), |
|
155 |
|
156 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888), |
|
157 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888), |
|
158 |
|
159 + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, neon_8888_0565), |
|
160 + |
|
161 { PIXMAN_OP_NONE }, |
|
162 }; |
|
163 |
|
164 static pixman_bool_t |
|
165 arm_neon_blt (pixman_implementation_t *imp, |
|
166 uint32_t * src_bits, |
|
167 uint32_t * dst_bits, |
|
168 int src_stride, |
|
169 diff --git a/gfx/cairo/libpixman/src/pixman-fast-path.c b/gfx/cairo/libpixman/src/pixman-fast-path.c |
|
170 --- a/gfx/cairo/libpixman/src/pixman-fast-path.c |
|
171 +++ b/gfx/cairo/libpixman/src/pixman-fast-path.c |
|
172 @@ -1356,63 +1356,63 @@ scaled_bilinear_scanline_565_565_SRC (ui |
|
173 vx += unit_x; |
|
174 *dst++ = d; |
|
175 } |
|
176 } |
|
177 |
|
178 #endif |
|
179 |
|
180 FAST_BILINEAR_MAINLOOP_COMMON (565_565_cover_SRC, |
|
181 - scaled_bilinear_scanline_565_565_SRC, |
|
182 + scaled_bilinear_scanline_565_565_SRC, NULL, |
|
183 uint16_t, uint32_t, uint16_t, |
|
184 COVER, FLAG_NONE) |
|
185 FAST_BILINEAR_MAINLOOP_COMMON (565_565_pad_SRC, |
|
186 - scaled_bilinear_scanline_565_565_SRC, |
|
187 + scaled_bilinear_scanline_565_565_SRC, NULL, |
|
188 uint16_t, uint32_t, uint16_t, |
|
189 PAD, FLAG_NONE) |
|
190 FAST_BILINEAR_MAINLOOP_COMMON (565_565_none_SRC, |
|
191 - scaled_bilinear_scanline_565_565_SRC, |
|
192 + scaled_bilinear_scanline_565_565_SRC, NULL, |
|
193 uint16_t, uint32_t, uint16_t, |
|
194 NONE, FLAG_NONE) |
|
195 FAST_BILINEAR_MAINLOOP_COMMON (565_565_normal_SRC, |
|
196 - scaled_bilinear_scanline_565_565_SRC, |
|
197 + scaled_bilinear_scanline_565_565_SRC, NULL, |
|
198 uint16_t, uint32_t, uint16_t, |
|
199 NORMAL, FLAG_NONE) |
|
200 |
|
201 FAST_BILINEAR_MAINLOOP_COMMON (8888_565_cover_OVER, |
|
202 - scaled_bilinear_scanline_8888_565_OVER, |
|
203 + scaled_bilinear_scanline_8888_565_OVER, NULL, |
|
204 uint32_t, uint32_t, uint16_t, |
|
205 COVER, FLAG_NONE) |
|
206 FAST_BILINEAR_MAINLOOP_COMMON (8888_565_pad_OVER, |
|
207 - scaled_bilinear_scanline_8888_565_OVER, |
|
208 + scaled_bilinear_scanline_8888_565_OVER, NULL, |
|
209 uint32_t, uint32_t, uint16_t, |
|
210 PAD, FLAG_NONE) |
|
211 FAST_BILINEAR_MAINLOOP_COMMON (8888_565_none_OVER, |
|
212 - scaled_bilinear_scanline_8888_565_OVER, |
|
213 + scaled_bilinear_scanline_8888_565_OVER, NULL, |
|
214 uint32_t, uint32_t, uint16_t, |
|
215 NONE, FLAG_NONE) |
|
216 FAST_BILINEAR_MAINLOOP_COMMON (8888_565_normal_OVER, |
|
217 - scaled_bilinear_scanline_8888_565_OVER, |
|
218 + scaled_bilinear_scanline_8888_565_OVER, NULL, |
|
219 uint32_t, uint32_t, uint16_t, |
|
220 NORMAL, FLAG_NONE) |
|
221 |
|
222 FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_cover_OVER, |
|
223 - scaled_bilinear_scanline_8888_8888_OVER, |
|
224 + scaled_bilinear_scanline_8888_8888_OVER, NULL, |
|
225 uint32_t, uint32_t, uint32_t, |
|
226 COVER, FLAG_NONE) |
|
227 FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_pad_OVER, |
|
228 - scaled_bilinear_scanline_8888_8888_OVER, |
|
229 + scaled_bilinear_scanline_8888_8888_OVER, NULL, |
|
230 uint32_t, uint32_t, uint32_t, |
|
231 PAD, FLAG_NONE) |
|
232 FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_none_OVER, |
|
233 - scaled_bilinear_scanline_8888_8888_OVER, |
|
234 + scaled_bilinear_scanline_8888_8888_OVER, NULL, |
|
235 uint32_t, uint32_t, uint32_t, |
|
236 NONE, FLAG_NONE) |
|
237 FAST_BILINEAR_MAINLOOP_COMMON (8888_8888_normal_OVER, |
|
238 - scaled_bilinear_scanline_8888_8888_OVER, |
|
239 + scaled_bilinear_scanline_8888_8888_OVER, NULL, |
|
240 uint32_t, uint32_t, uint32_t, |
|
241 NORMAL, FLAG_NONE) |
|
242 |
|
243 #define REPEAT_MIN_WIDTH 32 |
|
244 |
|
245 static void |
|
246 fast_composite_tiled_repeat (pixman_implementation_t *imp, |
|
247 pixman_composite_info_t *info) |
|
248 diff --git a/gfx/cairo/libpixman/src/pixman-inlines.h b/gfx/cairo/libpixman/src/pixman-inlines.h |
|
249 --- a/gfx/cairo/libpixman/src/pixman-inlines.h |
|
250 +++ b/gfx/cairo/libpixman/src/pixman-inlines.h |
|
251 @@ -816,18 +816,48 @@ bilinear_pad_repeat_get_scanline_bounds |
|
252 * |
|
253 * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to 256, |
|
254 * but sometimes it may be less than that for NONE repeat when handling |
|
255 * fuzzy antialiased top or bottom image edges. Also both top and |
|
256 * bottom weight variables are guaranteed to have value in 0-255 |
|
257 * range and can fit into unsigned byte or be used with 8-bit SIMD |
|
258 * multiplication instructions. |
|
259 */ |
|
260 -#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t, \ |
|
261 - dst_type_t, repeat_mode, flags) \ |
|
262 + |
|
263 +/* Replace a single "scanline_func" with "fetch_func" & "op_func" to allow optional |
|
264 + * two stage processing (bilinear fetch to a temp buffer, followed by unscaled |
|
265 + * combine). "op_func" may be NULL, in which case we keep the old behavior. |
|
266 + * This is ugly and gcc issues some warnings, but it works. |
|
267 + * |
|
268 + * Tip: clang has much better error reporting than gcc for deeply nested macros. |
|
269 + */ |
|
270 + |
|
271 +#define scanline_func(dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ |
|
272 + scanline_buf, mask, src_top, src_bottom, width, \ |
|
273 + weight_top, weight_bottom, vx, unit_x, max_vx, zero_src) \ |
|
274 + do { \ |
|
275 + if (op_func != NULL) \ |
|
276 + { \ |
|
277 + fetch_func ((void *)scanline_buf, (mask), (src_top), (src_bottom), (width), \ |
|
278 + (weight_top), (weight_bottom), (vx), (unit_x), (max_vx), (zero_src)); \ |
|
279 + ((void (*)(dst_type_t *, const mask_type_t *, const src_type_t *, int)) op_func)\ |
|
280 + ((dst), (mask), (src_type_t *)scanline_buf, (width)); \ |
|
281 + } \ |
|
282 + else \ |
|
283 + { \ |
|
284 + fetch_func ((void*)(dst), (mask), (src_top), (src_bottom), (width), (weight_top), \ |
|
285 + (weight_bottom), (vx), (unit_x), (max_vx), (zero_src)); \ |
|
286 + } \ |
|
287 + } while (0) |
|
288 + |
|
289 + |
|
290 +#define SCANLINE_BUFFER_LENGTH 3072 |
|
291 + |
|
292 +#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, fetch_func, op_func, src_type_t, \ |
|
293 + mask_type_t, dst_type_t, repeat_mode, flags) \ |
|
294 static void \ |
|
295 fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp, \ |
|
296 pixman_composite_info_t *info) \ |
|
297 { \ |
|
298 PIXMAN_COMPOSITE_ARGS (info); \ |
|
299 dst_type_t *dst_line; \ |
|
300 mask_type_t *mask_line; \ |
|
301 src_type_t *src_first_line; \ |
|
302 @@ -842,16 +872,19 @@ fast_composite_scaled_bilinear ## scale_ |
|
303 mask_type_t solid_mask; \ |
|
304 const mask_type_t *mask = &solid_mask; \ |
|
305 int src_stride, mask_stride, dst_stride; \ |
|
306 \ |
|
307 int src_width; \ |
|
308 pixman_fixed_t src_width_fixed; \ |
|
309 int max_x; \ |
|
310 pixman_bool_t need_src_extension; \ |
|
311 + \ |
|
312 + uint64_t stack_scanline_buffer[SCANLINE_BUFFER_LENGTH]; \ |
|
313 + uint8_t *scanline_buffer = (uint8_t *) stack_scanline_buffer; \ |
|
314 \ |
|
315 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1); \ |
|
316 if (flags & FLAG_HAVE_SOLID_MASK) \ |
|
317 { \ |
|
318 solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); \ |
|
319 mask_stride = 0; \ |
|
320 } \ |
|
321 else if (flags & FLAG_HAVE_NON_SOLID_MASK) \ |
|
322 @@ -914,16 +947,24 @@ fast_composite_scaled_bilinear ## scale_ |
|
323 else \ |
|
324 { \ |
|
325 src_width = src_image->bits.width; \ |
|
326 need_src_extension = FALSE; \ |
|
327 } \ |
|
328 \ |
|
329 src_width_fixed = pixman_int_to_fixed (src_width); \ |
|
330 } \ |
|
331 + \ |
|
332 + if (op_func != NULL && width * sizeof(src_type_t) > sizeof(stack_scanline_buffer)) \ |
|
333 + { \ |
|
334 + scanline_buffer = pixman_malloc_ab (width, sizeof(src_type_t)); \ |
|
335 + \ |
|
336 + if (!scanline_buffer) \ |
|
337 + return; \ |
|
338 + } \ |
|
339 \ |
|
340 while (--height >= 0) \ |
|
341 { \ |
|
342 int weight1, weight2; \ |
|
343 dst = dst_line; \ |
|
344 dst_line += dst_stride; \ |
|
345 vx = v.vector[0]; \ |
|
346 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ |
|
347 @@ -956,36 +997,39 @@ fast_composite_scaled_bilinear ## scale_ |
|
348 repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height); \ |
|
349 src1 = src_first_line + src_stride * y1; \ |
|
350 src2 = src_first_line + src_stride * y2; \ |
|
351 \ |
|
352 if (left_pad > 0) \ |
|
353 { \ |
|
354 buf1[0] = buf1[1] = src1[0]; \ |
|
355 buf2[0] = buf2[1] = src2[0]; \ |
|
356 - scanline_func (dst, mask, \ |
|
357 - buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE); \ |
|
358 + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ |
|
359 + scanline_buffer, mask, buf1, buf2, left_pad, weight1, weight2, \ |
|
360 + 0, 0, 0, FALSE); \ |
|
361 dst += left_pad; \ |
|
362 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ |
|
363 mask += left_pad; \ |
|
364 } \ |
|
365 if (width > 0) \ |
|
366 { \ |
|
367 - scanline_func (dst, mask, \ |
|
368 - src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE); \ |
|
369 + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ |
|
370 + scanline_buffer, mask, src1, src2, width, weight1, weight2, \ |
|
371 + vx, unit_x, 0, FALSE); \ |
|
372 dst += width; \ |
|
373 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ |
|
374 mask += width; \ |
|
375 } \ |
|
376 if (right_pad > 0) \ |
|
377 { \ |
|
378 buf1[0] = buf1[1] = src1[src_image->bits.width - 1]; \ |
|
379 buf2[0] = buf2[1] = src2[src_image->bits.width - 1]; \ |
|
380 - scanline_func (dst, mask, \ |
|
381 - buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE); \ |
|
382 + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ |
|
383 + scanline_buffer, mask, buf1, buf2, right_pad, weight1, weight2, \ |
|
384 + 0, 0, 0, FALSE); \ |
|
385 } \ |
|
386 } \ |
|
387 else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE) \ |
|
388 { \ |
|
389 src_type_t *src1, *src2; \ |
|
390 src_type_t buf1[2]; \ |
|
391 src_type_t buf2[2]; \ |
|
392 /* handle top/bottom zero padding by just setting weights to 0 if needed */ \ |
|
393 @@ -1011,64 +1055,67 @@ fast_composite_scaled_bilinear ## scale_ |
|
394 } \ |
|
395 src1 = src_first_line + src_stride * y1; \ |
|
396 src2 = src_first_line + src_stride * y2; \ |
|
397 \ |
|
398 if (left_pad > 0) \ |
|
399 { \ |
|
400 buf1[0] = buf1[1] = 0; \ |
|
401 buf2[0] = buf2[1] = 0; \ |
|
402 - scanline_func (dst, mask, \ |
|
403 - buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE); \ |
|
404 + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ |
|
405 + scanline_buffer, mask, buf1, buf2, left_pad, weight1, weight2, \ |
|
406 + 0, 0, 0, TRUE); \ |
|
407 dst += left_pad; \ |
|
408 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ |
|
409 mask += left_pad; \ |
|
410 } \ |
|
411 if (left_tz > 0) \ |
|
412 { \ |
|
413 buf1[0] = 0; \ |
|
414 buf1[1] = src1[0]; \ |
|
415 buf2[0] = 0; \ |
|
416 buf2[1] = src2[0]; \ |
|
417 - scanline_func (dst, mask, \ |
|
418 - buf1, buf2, left_tz, weight1, weight2, \ |
|
419 + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ |
|
420 + scanline_buffer, mask, buf1, buf2, left_tz, weight1, weight2, \ |
|
421 pixman_fixed_frac (vx), unit_x, 0, FALSE); \ |
|
422 dst += left_tz; \ |
|
423 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ |
|
424 mask += left_tz; \ |
|
425 vx += left_tz * unit_x; \ |
|
426 } \ |
|
427 if (width > 0) \ |
|
428 { \ |
|
429 - scanline_func (dst, mask, \ |
|
430 - src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE); \ |
|
431 + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ |
|
432 + scanline_buffer, mask, src1, src2, width, weight1, weight2, \ |
|
433 + vx, unit_x, 0, FALSE); \ |
|
434 dst += width; \ |
|
435 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ |
|
436 mask += width; \ |
|
437 vx += width * unit_x; \ |
|
438 } \ |
|
439 if (right_tz > 0) \ |
|
440 { \ |
|
441 buf1[0] = src1[src_image->bits.width - 1]; \ |
|
442 buf1[1] = 0; \ |
|
443 buf2[0] = src2[src_image->bits.width - 1]; \ |
|
444 buf2[1] = 0; \ |
|
445 - scanline_func (dst, mask, \ |
|
446 - buf1, buf2, right_tz, weight1, weight2, \ |
|
447 + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ |
|
448 + scanline_buffer, mask, buf1, buf2, right_tz, weight1, weight2, \ |
|
449 pixman_fixed_frac (vx), unit_x, 0, FALSE); \ |
|
450 dst += right_tz; \ |
|
451 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ |
|
452 mask += right_tz; \ |
|
453 } \ |
|
454 if (right_pad > 0) \ |
|
455 { \ |
|
456 buf1[0] = buf1[1] = 0; \ |
|
457 buf2[0] = buf2[1] = 0; \ |
|
458 - scanline_func (dst, mask, \ |
|
459 - buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE); \ |
|
460 + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ |
|
461 + scanline_buffer, mask, buf1, buf2, right_pad, weight1, weight2, \ |
|
462 + 0, 0, 0, TRUE); \ |
|
463 } \ |
|
464 } \ |
|
465 else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL) \ |
|
466 { \ |
|
467 int32_t num_pixels; \ |
|
468 int32_t width_remain; \ |
|
469 src_type_t * src_line_top; \ |
|
470 src_type_t * src_line_bottom; \ |
|
471 @@ -1120,17 +1167,18 @@ fast_composite_scaled_bilinear ## scale_ |
|
472 * vx is in range [0, src_width_fixed - pixman_fixed_e] \ |
|
473 * So we are safe from overflow. \ |
|
474 */ \ |
|
475 num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1; \ |
|
476 \ |
|
477 if (num_pixels > width_remain) \ |
|
478 num_pixels = width_remain; \ |
|
479 \ |
|
480 - scanline_func (dst, mask, buf1, buf2, num_pixels, \ |
|
481 + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, \ |
|
482 + dst, scanline_buffer, mask, buf1, buf2, num_pixels, \ |
|
483 weight1, weight2, pixman_fixed_frac(vx), \ |
|
484 unit_x, src_width_fixed, FALSE); \ |
|
485 \ |
|
486 width_remain -= num_pixels; \ |
|
487 vx += num_pixels * unit_x; \ |
|
488 dst += num_pixels; \ |
|
489 \ |
|
490 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ |
|
491 @@ -1149,41 +1197,47 @@ fast_composite_scaled_bilinear ## scale_ |
|
492 * So we are safe from overflow here. \ |
|
493 */ \ |
|
494 num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e) \ |
|
495 / unit_x) + 1; \ |
|
496 \ |
|
497 if (num_pixels > width_remain) \ |
|
498 num_pixels = width_remain; \ |
|
499 \ |
|
500 - scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels, \ |
|
501 - weight1, weight2, vx, unit_x, src_width_fixed, FALSE); \ |
|
502 + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, \ |
|
503 + dst, scanline_buffer, mask, src_line_top, src_line_bottom, \ |
|
504 + num_pixels, weight1, weight2, vx, unit_x, src_width_fixed, \ |
|
505 + FALSE); \ |
|
506 \ |
|
507 width_remain -= num_pixels; \ |
|
508 vx += num_pixels * unit_x; \ |
|
509 dst += num_pixels; \ |
|
510 \ |
|
511 if (flags & FLAG_HAVE_NON_SOLID_MASK) \ |
|
512 mask += num_pixels; \ |
|
513 } \ |
|
514 } \ |
|
515 } \ |
|
516 else \ |
|
517 { \ |
|
518 - scanline_func (dst, mask, src_first_line + src_stride * y1, \ |
|
519 + scanline_func (dst_type_t, mask_type_t, src_type_t, fetch_func, op_func, dst, \ |
|
520 + scanline_buffer, mask, \ |
|
521 + src_first_line + src_stride * y1, \ |
|
522 src_first_line + src_stride * y2, width, \ |
|
523 weight1, weight2, vx, unit_x, max_vx, FALSE); \ |
|
524 } \ |
|
525 } \ |
|
526 + if (scanline_buffer != (uint8_t *) stack_scanline_buffer) \ |
|
527 + free (scanline_buffer); \ |
|
528 } |
|
529 |
|
530 /* A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764 */ |
|
531 -#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t, \ |
|
532 +#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, fetch_func, op_func, src_type_t, mask_type_t,\ |
|
533 dst_type_t, repeat_mode, flags) \ |
|
534 - FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\ |
|
535 + FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, fetch_func, op_func, src_type_t, mask_type_t,\ |
|
536 dst_type_t, repeat_mode, flags) |
|
537 |
|
538 #define SCALED_BILINEAR_FLAGS \ |
|
539 (FAST_PATH_SCALE_TRANSFORM | \ |
|
540 FAST_PATH_NO_ALPHA_MAP | \ |
|
541 FAST_PATH_BILINEAR_FILTER | \ |
|
542 FAST_PATH_NO_ACCESSORS | \ |
|
543 FAST_PATH_NARROW_FORMAT) |
|
544 diff --git a/gfx/cairo/libpixman/src/pixman-sse2.c b/gfx/cairo/libpixman/src/pixman-sse2.c |
|
545 --- a/gfx/cairo/libpixman/src/pixman-sse2.c |
|
546 +++ b/gfx/cairo/libpixman/src/pixman-sse2.c |
|
547 @@ -5404,30 +5404,33 @@ scaled_bilinear_scanline_sse2_8888_8888_ |
|
548 if (w & 1) |
|
549 { |
|
550 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
|
551 *dst = pix1; |
|
552 } |
|
553 |
|
554 } |
|
555 |
|
556 +/* Add extra NULL argument to the existing bilinear fast paths to indicate |
|
557 + * that we don't need two-pass processing */ |
|
558 + |
|
559 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, |
|
560 - scaled_bilinear_scanline_sse2_8888_8888_SRC, |
|
561 + scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, |
|
562 uint32_t, uint32_t, uint32_t, |
|
563 COVER, FLAG_NONE) |
|
564 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, |
|
565 - scaled_bilinear_scanline_sse2_8888_8888_SRC, |
|
566 + scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, |
|
567 uint32_t, uint32_t, uint32_t, |
|
568 PAD, FLAG_NONE) |
|
569 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, |
|
570 - scaled_bilinear_scanline_sse2_8888_8888_SRC, |
|
571 + scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, |
|
572 uint32_t, uint32_t, uint32_t, |
|
573 NONE, FLAG_NONE) |
|
574 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC, |
|
575 - scaled_bilinear_scanline_sse2_8888_8888_SRC, |
|
576 + scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, |
|
577 uint32_t, uint32_t, uint32_t, |
|
578 NORMAL, FLAG_NONE) |
|
579 |
|
580 static force_inline void |
|
581 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst, |
|
582 const uint32_t * mask, |
|
583 const uint32_t * src_top, |
|
584 const uint32_t * src_bottom, |
|
585 @@ -5505,32 +5508,66 @@ scaled_bilinear_scanline_sse2_8888_8888_ |
|
586 } |
|
587 |
|
588 w--; |
|
589 dst++; |
|
590 } |
|
591 } |
|
592 |
|
593 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER, |
|
594 - scaled_bilinear_scanline_sse2_8888_8888_OVER, |
|
595 + scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, |
|
596 uint32_t, uint32_t, uint32_t, |
|
597 COVER, FLAG_NONE) |
|
598 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER, |
|
599 - scaled_bilinear_scanline_sse2_8888_8888_OVER, |
|
600 + scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, |
|
601 uint32_t, uint32_t, uint32_t, |
|
602 PAD, FLAG_NONE) |
|
603 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER, |
|
604 - scaled_bilinear_scanline_sse2_8888_8888_OVER, |
|
605 + scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, |
|
606 uint32_t, uint32_t, uint32_t, |
|
607 NONE, FLAG_NONE) |
|
608 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER, |
|
609 - scaled_bilinear_scanline_sse2_8888_8888_OVER, |
|
610 + scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, |
|
611 uint32_t, uint32_t, uint32_t, |
|
612 NORMAL, FLAG_NONE) |
|
613 |
|
614 + |
|
615 +/* An example of SSE2 two-stage bilinear_over_8888_0565 fast path, which is implemented |
|
616 + as scaled_bilinear_scanline_sse2_8888_8888_SRC + op_bilinear_over_8888_0565 */ |
|
617 + |
|
618 +void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width) |
|
619 +{ |
|
620 + /* Note: this scalar loop is not really fast; it should be based on the 8-pixel loop from sse2_composite_over_8888_0565 */ |
|
621 + while (--width >= 0) |
|
622 + { |
|
623 + *dst = composite_over_8888_0565pixel (*src, *dst); |
|
624 + src++; |
|
625 + dst++; |
|
626 + } |
|
627 +} |
|
628 + |
|
629 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_cover_OVER, |
|
630 + scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, |
|
631 + uint32_t, uint32_t, uint16_t, |
|
632 + COVER, FLAG_NONE) |
|
633 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER, |
|
634 + scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, |
|
635 + uint32_t, uint32_t, uint16_t, |
|
636 + PAD, FLAG_NONE) |
|
637 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER, |
|
638 + scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, |
|
639 + uint32_t, uint32_t, uint16_t, |
|
640 + NONE, FLAG_NONE) |
|
641 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER, |
|
642 + scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, |
|
643 + uint32_t, uint32_t, uint16_t, |
|
644 + NORMAL, FLAG_NONE) |
|
645 + |
|
646 +/*****************************/ |
|
647 + |
|
648 static force_inline void |
|
649 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst, |
|
650 const uint8_t * mask, |
|
651 const uint32_t * src_top, |
|
652 const uint32_t * src_bottom, |
|
653 int32_t w, |
|
654 int wt, |
|
655 int wb, |
|
656 @@ -5669,29 +5706,29 @@ scaled_bilinear_scanline_sse2_8888_8_888 |
|
657 } |
|
658 |
|
659 w--; |
|
660 dst++; |
|
661 } |
|
662 } |
|
663 |
|
664 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER, |
|
665 - scaled_bilinear_scanline_sse2_8888_8_8888_OVER, |
|
666 + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, |
|
667 uint32_t, uint8_t, uint32_t, |
|
668 COVER, FLAG_HAVE_NON_SOLID_MASK) |
|
669 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER, |
|
670 - scaled_bilinear_scanline_sse2_8888_8_8888_OVER, |
|
671 + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, |
|
672 uint32_t, uint8_t, uint32_t, |
|
673 PAD, FLAG_HAVE_NON_SOLID_MASK) |
|
674 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER, |
|
675 - scaled_bilinear_scanline_sse2_8888_8_8888_OVER, |
|
676 + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, |
|
677 uint32_t, uint8_t, uint32_t, |
|
678 NONE, FLAG_HAVE_NON_SOLID_MASK) |
|
679 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER, |
|
680 - scaled_bilinear_scanline_sse2_8888_8_8888_OVER, |
|
681 + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, |
|
682 uint32_t, uint8_t, uint32_t, |
|
683 NORMAL, FLAG_HAVE_NON_SOLID_MASK) |
|
684 |
|
685 static const pixman_fast_path_t sse2_fast_paths[] = |
|
686 { |
|
687 /* PIXMAN_OP_OVER */ |
|
688 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565), |
|
689 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565), |
|
690 @@ -5808,16 +5845,21 @@ static const pixman_fast_path_t sse2_fas |
|
691 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), |
|
692 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), |
|
693 |
|
694 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888), |
|
695 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888), |
|
696 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888), |
|
697 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888), |
|
698 |
|
699 + /* and here the needed entries are added to the fast path table */ |
|
700 + |
|
701 + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565), |
|
702 + SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565), |
|
703 + |
|
704 { PIXMAN_OP_NONE }, |
|
705 }; |
|
706 |
|
707 static pixman_bool_t |
|
708 sse2_blt (pixman_implementation_t *imp, |
|
709 uint32_t * src_bits, |
|
710 uint32_t * dst_bits, |
|
711 int src_stride, |
|
712 |