|
1 diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp |
|
2 --- a/gfx/ycbcr/yuv_convert.cpp |
|
3 +++ b/gfx/ycbcr/yuv_convert.cpp |
|
4 @@ -6,145 +6,102 @@ |
|
5 // http://www.fourcc.org/yuv.php |
|
6 // The actual conversion is best described here |
|
7 // http://en.wikipedia.org/wiki/YUV |
|
8 // An article on optimizing YUV conversion using tables instead of multiplies |
|
9 // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf |
|
10 // |
|
11 // YV12 is a full plane of Y and a half height, half width chroma planes |
|
12 // YV16 is a full plane of Y and a full height, half width chroma planes |
|
13 +// YV24 is a full plane of Y and a full height, full width chroma planes |
|
14 // |
|
15 // ARGB pixel format is output, which on little endian is stored as BGRA. |
|
16 // The alpha is set to 255, allowing the application to use RGBA or RGB32. |
|
17 |
|
18 -#include "media/base/yuv_convert.h" |
|
19 +#include "yuv_convert.h" |
|
20 |
|
21 // Header for low level row functions. |
|
22 -#include "media/base/yuv_row.h" |
|
23 - |
|
24 -#if USE_MMX |
|
25 -#if defined(_MSC_VER) |
|
26 -#include <intrin.h> |
|
27 -#else |
|
28 -#include <mmintrin.h> |
|
29 -#endif |
|
30 -#endif |
|
31 - |
|
32 -#if USE_SSE2 |
|
33 -#include <emmintrin.h> |
|
34 -#endif |
|
35 - |
|
36 -namespace media { |
|
37 - |
|
38 +#include "yuv_row.h" |
|
39 +#include "mozilla/SSE.h" |
|
40 + |
|
41 +namespace mozilla { |
|
42 + |
|
43 +namespace gfx { |
|
44 + |
|
45 // 16.16 fixed point arithmetic |
|
46 const int kFractionBits = 16; |
|
47 const int kFractionMax = 1 << kFractionBits; |
|
48 const int kFractionMask = ((1 << kFractionBits) - 1); |
|
49 |
|
50 // Convert a frame of YUV to 32 bit ARGB. |
|
51 -void ConvertYUVToRGB32(const uint8* y_buf, |
|
52 - const uint8* u_buf, |
|
53 - const uint8* v_buf, |
|
54 - uint8* rgb_buf, |
|
55 - int width, |
|
56 - int height, |
|
57 - int y_pitch, |
|
58 - int uv_pitch, |
|
59 - int rgb_pitch, |
|
60 - YUVType yuv_type) { |
|
61 - unsigned int y_shift = yuv_type; |
|
62 - for (int y = 0; y < height; ++y) { |
|
63 - uint8* rgb_row = rgb_buf + y * rgb_pitch; |
|
64 - const uint8* y_ptr = y_buf + y * y_pitch; |
|
65 - const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch; |
|
66 - const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch; |
|
67 - |
|
68 - FastConvertYUVToRGB32Row(y_ptr, |
|
69 - u_ptr, |
|
70 - v_ptr, |
|
71 - rgb_row, |
|
72 - width); |
|
73 - } |
|
74 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf, |
|
75 + const uint8* u_buf, |
|
76 + const uint8* v_buf, |
|
77 + uint8* rgb_buf, |
|
78 + int pic_x, |
|
79 + int pic_y, |
|
80 + int pic_width, |
|
81 + int pic_height, |
|
82 + int y_pitch, |
|
83 + int uv_pitch, |
|
84 + int rgb_pitch, |
|
85 + YUVType yuv_type) { |
|
86 + unsigned int y_shift = yuv_type == YV12 ? 1 : 0; |
|
87 + unsigned int x_shift = yuv_type == YV24 ? 0 : 1; |
|
88 + // Test for SSE because the optimized code uses movntq, which is not part of MMX. |
|
89 + bool has_sse = supports_mmx() && supports_sse(); |
|
90 + // There is no optimized YV24 SSE routine so we check for this and |
|
91 + // fall back to the C code. |
|
92 + has_sse &= yuv_type != YV24; |
|
93 + bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0; |
|
94 + int x_width = odd_pic_x ? pic_width - 1 : pic_width; |
|
95 + |
|
96 + for (int y = pic_y; y < pic_height + pic_y; ++y) { |
|
97 + uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch; |
|
98 + const uint8* y_ptr = y_buf + y * y_pitch + pic_x; |
|
99 + const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift); |
|
100 + const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift); |
|
101 + |
|
102 + if (odd_pic_x) { |
|
103 + // Handle the single odd pixel manually and use the |
|
104 + // fast routines for the remaining pixels. |
|
105 + FastConvertYUVToRGB32Row_C(y_ptr++, |
|
106 + u_ptr++, |
|
107 + v_ptr++, |
|
108 + rgb_row, |
|
109 + 1, |
|
110 + x_shift); |
|
111 + rgb_row += 4; |
|
112 + } |
|
113 + |
|
114 + if (has_sse) { |
|
115 + FastConvertYUVToRGB32Row(y_ptr, |
|
116 + u_ptr, |
|
117 + v_ptr, |
|
118 + rgb_row, |
|
119 + x_width); |
|
120 + } |
|
121 + else { |
|
122 + FastConvertYUVToRGB32Row_C(y_ptr, |
|
123 + u_ptr, |
|
124 + v_ptr, |
|
125 + rgb_row, |
|
126 + x_width, |
|
127 + x_shift); |
|
128 + } |
|
129 + } |
|
130 |
|
131 // MMX used for FastConvertYUVToRGB32Row requires emms instruction. |
|
132 - EMMS(); |
|
133 -} |
|
134 - |
|
135 -#if USE_SSE2 |
|
136 -// FilterRows combines two rows of the image using linear interpolation. |
|
137 -// SSE2 version does 16 pixels at a time |
|
138 - |
|
139 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, |
|
140 - int source_width, int source_y_fraction) { |
|
141 - __m128i zero = _mm_setzero_si128(); |
|
142 - __m128i y1_fraction = _mm_set1_epi16(source_y_fraction); |
|
143 - __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction); |
|
144 - |
|
145 - const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr); |
|
146 - const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr); |
|
147 - __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf); |
|
148 - __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width); |
|
149 - |
|
150 - do { |
|
151 - __m128i y0 = _mm_loadu_si128(y0_ptr128); |
|
152 - __m128i y1 = _mm_loadu_si128(y1_ptr128); |
|
153 - __m128i y2 = _mm_unpackhi_epi8(y0, zero); |
|
154 - __m128i y3 = _mm_unpackhi_epi8(y1, zero); |
|
155 - y0 = _mm_unpacklo_epi8(y0, zero); |
|
156 - y1 = _mm_unpacklo_epi8(y1, zero); |
|
157 - y0 = _mm_mullo_epi16(y0, y0_fraction); |
|
158 - y1 = _mm_mullo_epi16(y1, y1_fraction); |
|
159 - y2 = _mm_mullo_epi16(y2, y0_fraction); |
|
160 - y3 = _mm_mullo_epi16(y3, y1_fraction); |
|
161 - y0 = _mm_add_epi16(y0, y1); |
|
162 - y2 = _mm_add_epi16(y2, y3); |
|
163 - y0 = _mm_srli_epi16(y0, 8); |
|
164 - y2 = _mm_srli_epi16(y2, 8); |
|
165 - y0 = _mm_packus_epi16(y0, y2); |
|
166 - *dest128++ = y0; |
|
167 - ++y0_ptr128; |
|
168 - ++y1_ptr128; |
|
169 - } while (dest128 < end128); |
|
170 -} |
|
171 -#elif USE_MMX |
|
172 -// MMX version does 8 pixels at a time |
|
173 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, |
|
174 - int source_width, int source_y_fraction) { |
|
175 - __m64 zero = _mm_setzero_si64(); |
|
176 - __m64 y1_fraction = _mm_set1_pi16(source_y_fraction); |
|
177 - __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction); |
|
178 - |
|
179 - const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr); |
|
180 - const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr); |
|
181 - __m64* dest64 = reinterpret_cast<__m64*>(ybuf); |
|
182 - __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width); |
|
183 - |
|
184 - do { |
|
185 - __m64 y0 = *y0_ptr64++; |
|
186 - __m64 y1 = *y1_ptr64++; |
|
187 - __m64 y2 = _mm_unpackhi_pi8(y0, zero); |
|
188 - __m64 y3 = _mm_unpackhi_pi8(y1, zero); |
|
189 - y0 = _mm_unpacklo_pi8(y0, zero); |
|
190 - y1 = _mm_unpacklo_pi8(y1, zero); |
|
191 - y0 = _mm_mullo_pi16(y0, y0_fraction); |
|
192 - y1 = _mm_mullo_pi16(y1, y1_fraction); |
|
193 - y2 = _mm_mullo_pi16(y2, y0_fraction); |
|
194 - y3 = _mm_mullo_pi16(y3, y1_fraction); |
|
195 - y0 = _mm_add_pi16(y0, y1); |
|
196 - y2 = _mm_add_pi16(y2, y3); |
|
197 - y0 = _mm_srli_pi16(y0, 8); |
|
198 - y2 = _mm_srli_pi16(y2, 8); |
|
199 - y0 = _mm_packs_pu16(y0, y2); |
|
200 - *dest64++ = y0; |
|
201 - } while (dest64 < end64); |
|
202 -} |
|
203 -#else // no MMX or SSE2 |
|
204 + if (has_sse) |
|
205 + EMMS(); |
|
206 +} |
|
207 + |
|
208 // C version does 8 at a time to mimic MMX code |
|
209 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, |
|
210 - int source_width, int source_y_fraction) { |
|
211 +static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, |
|
212 + int source_width, int source_y_fraction) { |
|
213 int y1_fraction = source_y_fraction; |
|
214 int y0_fraction = 256 - y1_fraction; |
|
215 uint8* end = ybuf + source_width; |
|
216 do { |
|
217 ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8; |
|
218 ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8; |
|
219 ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8; |
|
220 ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8; |
|
221 @@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons |
|
222 ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8; |
|
223 ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8; |
|
224 ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8; |
|
225 y0_ptr += 8; |
|
226 y1_ptr += 8; |
|
227 ybuf += 8; |
|
228 } while (ybuf < end); |
|
229 } |
|
230 -#endif |
|
231 + |
|
232 +#ifdef MOZILLA_MAY_SUPPORT_MMX |
|
233 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, |
|
234 + int source_width, int source_y_fraction); |
|
235 +#endif |
|
236 + |
|
237 +#ifdef MOZILLA_MAY_SUPPORT_SSE2 |
|
238 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, |
|
239 + int source_width, int source_y_fraction); |
|
240 +#endif |
|
241 + |
|
242 +static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr, |
|
243 + const uint8* y1_ptr, int source_width, |
|
244 + int source_y_fraction) { |
|
245 +#ifdef MOZILLA_MAY_SUPPORT_SSE2 |
|
246 + if (mozilla::supports_sse2()) { |
|
247 + FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); |
|
248 + return; |
|
249 + } |
|
250 +#endif |
|
251 + |
|
252 +#ifdef MOZILLA_MAY_SUPPORT_MMX |
|
253 + if (mozilla::supports_mmx()) { |
|
254 + FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); |
|
255 + return; |
|
256 + } |
|
257 +#endif |
|
258 + |
|
259 + FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); |
|
260 +} |
|
261 |
|
262 |
|
263 // Scale a frame of YUV to 32 bit ARGB. |
|
264 -void ScaleYUVToRGB32(const uint8* y_buf, |
|
265 - const uint8* u_buf, |
|
266 - const uint8* v_buf, |
|
267 - uint8* rgb_buf, |
|
268 - int source_width, |
|
269 - int source_height, |
|
270 - int width, |
|
271 - int height, |
|
272 - int y_pitch, |
|
273 - int uv_pitch, |
|
274 - int rgb_pitch, |
|
275 - YUVType yuv_type, |
|
276 - Rotate view_rotate, |
|
277 - ScaleFilter filter) { |
|
278 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf, |
|
279 + const uint8* u_buf, |
|
280 + const uint8* v_buf, |
|
281 + uint8* rgb_buf, |
|
282 + int source_width, |
|
283 + int source_height, |
|
284 + int width, |
|
285 + int height, |
|
286 + int y_pitch, |
|
287 + int uv_pitch, |
|
288 + int rgb_pitch, |
|
289 + YUVType yuv_type, |
|
290 + Rotate view_rotate, |
|
291 + ScaleFilter filter) { |
|
292 + bool has_mmx = supports_mmx(); |
|
293 + |
|
294 // 4096 allows 3 buffers to fit in 12k. |
|
295 // Helps performance on CPU with 16K L1 cache. |
|
296 // Large enough for 3830x2160 and 30" displays which are 2560x1600. |
|
297 const int kFilterBufferSize = 4096; |
|
298 // Disable filtering if the screen is too big (to avoid buffer overflows). |
|
299 // This should never happen to regular users: they don't have monitors |
|
300 // wider than 4096 pixels. |
|
301 // TODO(fbarchard): Allow rotated videos to filter. |
|
302 if (source_width > kFilterBufferSize || view_rotate) |
|
303 filter = FILTER_NONE; |
|
304 |
|
305 - unsigned int y_shift = yuv_type; |
|
306 + unsigned int y_shift = yuv_type == YV12 ? 1 : 0; |
|
307 // Diagram showing origin and direction of source sampling. |
|
308 // ->0 4<- |
|
309 // 7 3 |
|
310 // |
|
311 // 6 5 |
|
312 // ->1 2<- |
|
313 // Rotations that start at right side of image. |
|
314 if ((view_rotate == ROTATE_180) || |
|
315 @@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf, |
|
316 int source_uv_fraction = |
|
317 ((source_y_subpixel >> y_shift) & kFractionMask) >> 8; |
|
318 |
|
319 const uint8* y_ptr = y0_ptr; |
|
320 const uint8* u_ptr = u0_ptr; |
|
321 const uint8* v_ptr = v0_ptr; |
|
322 // Apply vertical filtering if necessary. |
|
323 // TODO(fbarchard): Remove memcpy when not necessary. |
|
324 - if (filter & media::FILTER_BILINEAR_V) { |
|
325 + if (filter & mozilla::gfx::FILTER_BILINEAR_V) { |
|
326 if (yscale_fixed != kFractionMax && |
|
327 source_y_fraction && ((source_y + 1) < source_height)) { |
|
328 FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); |
|
329 } else { |
|
330 memcpy(ybuf, y0_ptr, source_width); |
|
331 } |
|
332 y_ptr = ybuf; |
|
333 ybuf[source_width] = ybuf[source_width-1]; |
|
334 @@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf, |
|
335 u_ptr = ubuf; |
|
336 v_ptr = vbuf; |
|
337 ubuf[uv_source_width] = ubuf[uv_source_width - 1]; |
|
338 vbuf[uv_source_width] = vbuf[uv_source_width - 1]; |
|
339 } |
|
340 if (source_dx == kFractionMax) { // Not scaled |
|
341 FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, |
|
342 dest_pixel, width); |
|
343 - } else { |
|
344 - if (filter & FILTER_BILINEAR_H) { |
|
345 + } else if (filter & FILTER_BILINEAR_H) { |
|
346 LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, |
|
347 dest_pixel, width, source_dx); |
|
348 } else { |
|
349 // Specialized scalers and rotation. |
|
350 -#if USE_MMX && defined(_MSC_VER) |
|
351 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86) |
|
352 + if(mozilla::supports_sse()) { |
|
353 if (width == (source_width * 2)) { |
|
354 - DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, |
|
355 - dest_pixel, width); |
|
356 + DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, |
|
357 + dest_pixel, width); |
|
358 } else if ((source_dx & kFractionMask) == 0) { |
|
359 // Scaling by integer scale factor. ie half. |
|
360 - ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, |
|
361 - dest_pixel, width, |
|
362 - source_dx >> kFractionBits); |
|
363 + ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, |
|
364 + dest_pixel, width, |
|
365 + source_dx >> kFractionBits); |
|
366 } else if (source_dx_uv == source_dx) { // Not rotated. |
|
367 ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, |
|
368 dest_pixel, width, source_dx); |
|
369 } else { |
|
370 - RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, |
|
371 - dest_pixel, width, |
|
372 - source_dx >> kFractionBits, |
|
373 - source_dx_uv >> kFractionBits); |
|
374 + RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, |
|
375 + dest_pixel, width, |
|
376 + source_dx >> kFractionBits, |
|
377 + source_dx_uv >> kFractionBits); |
|
378 } |
|
379 + } |
|
380 + else { |
|
381 + ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr, |
|
382 + dest_pixel, width, source_dx); |
|
383 + } |
|
384 #else |
|
385 - ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, |
|
386 - dest_pixel, width, source_dx); |
|
387 -#endif |
|
388 - } |
|
389 + ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, |
|
390 + dest_pixel, width, source_dx); |
|
391 +#endif |
|
392 } |
|
393 } |
|
394 // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms. |
|
395 - EMMS(); |
|
396 -} |
|
397 - |
|
398 -} // namespace media |
|
399 + if (has_mmx) |
|
400 + EMMS(); |
|
401 +} |
|
402 + |
|
403 +} // namespace gfx |
|
404 +} // namespace mozilla |
|
405 diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h |
|
406 --- a/gfx/ycbcr/yuv_convert.h |
|
407 +++ b/gfx/ycbcr/yuv_convert.h |
|
408 @@ -1,72 +1,79 @@ |
|
409 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
|
410 // Use of this source code is governed by a BSD-style license that can be |
|
411 // found in the LICENSE file. |
|
412 |
|
413 #ifndef MEDIA_BASE_YUV_CONVERT_H_ |
|
414 #define MEDIA_BASE_YUV_CONVERT_H_ |
|
415 |
|
416 -#include "base/basictypes.h" |
|
417 - |
|
418 -namespace media { |
|
419 - |
|
420 +#include "chromium_types.h" |
|
421 +#include "gfxCore.h" |
|
422 + |
|
423 +namespace mozilla { |
|
424 + |
|
425 +namespace gfx { |
|
426 + |
|
427 // Type of YUV surface. |
|
428 // The value of these enums matter as they are used to shift vertical indices. |
|
429 enum YUVType { |
|
430 - YV16 = 0, // YV16 is half width and full height chroma channels. |
|
431 - YV12 = 1, // YV12 is half width and half height chroma channels. |
|
432 + YV12 = 0, // YV12 is half width and half height chroma channels. |
|
433 + YV16 = 1, // YV16 is half width and full height chroma channels. |
|
434 + YV24 = 2 // YV24 is full width and full height chroma channels. |
|
435 }; |
|
436 |
|
437 // Mirror means flip the image horizontally, as in looking in a mirror. |
|
438 // Rotate happens after mirroring. |
|
439 enum Rotate { |
|
440 ROTATE_0, // Rotation off. |
|
441 ROTATE_90, // Rotate clockwise. |
|
442 ROTATE_180, // Rotate upside down. |
|
443 ROTATE_270, // Rotate counter clockwise. |
|
444 MIRROR_ROTATE_0, // Mirror horizontally. |
|
445 MIRROR_ROTATE_90, // Mirror then Rotate clockwise. |
|
446 MIRROR_ROTATE_180, // Mirror vertically. |
|
447 - MIRROR_ROTATE_270, // Transpose. |
|
448 + MIRROR_ROTATE_270 // Transpose. |
|
449 }; |
|
450 |
|
451 // Filter affects how scaling looks. |
|
452 enum ScaleFilter { |
|
453 FILTER_NONE = 0, // No filter (point sampled). |
|
454 FILTER_BILINEAR_H = 1, // Bilinear horizontal filter. |
|
455 FILTER_BILINEAR_V = 2, // Bilinear vertical filter. |
|
456 - FILTER_BILINEAR = 3, // Bilinear filter. |
|
457 + FILTER_BILINEAR = 3 // Bilinear filter. |
|
458 }; |
|
459 |
|
460 // Convert a frame of YUV to 32 bit ARGB. |
|
461 // Pass in YV16/YV12 depending on source format |
|
462 -void ConvertYUVToRGB32(const uint8* yplane, |
|
463 - const uint8* uplane, |
|
464 - const uint8* vplane, |
|
465 - uint8* rgbframe, |
|
466 - int width, |
|
467 - int height, |
|
468 - int ystride, |
|
469 - int uvstride, |
|
470 - int rgbstride, |
|
471 - YUVType yuv_type); |
|
472 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane, |
|
473 + const uint8* uplane, |
|
474 + const uint8* vplane, |
|
475 + uint8* rgbframe, |
|
476 + int pic_x, |
|
477 + int pic_y, |
|
478 + int pic_width, |
|
479 + int pic_height, |
|
480 + int ystride, |
|
481 + int uvstride, |
|
482 + int rgbstride, |
|
483 + YUVType yuv_type); |
|
484 |
|
485 // Scale a frame of YUV to 32 bit ARGB. |
|
486 // Supports rotation and mirroring. |
|
487 -void ScaleYUVToRGB32(const uint8* yplane, |
|
488 - const uint8* uplane, |
|
489 - const uint8* vplane, |
|
490 - uint8* rgbframe, |
|
491 - int source_width, |
|
492 - int source_height, |
|
493 - int width, |
|
494 - int height, |
|
495 - int ystride, |
|
496 - int uvstride, |
|
497 - int rgbstride, |
|
498 - YUVType yuv_type, |
|
499 - Rotate view_rotate, |
|
500 - ScaleFilter filter); |
|
501 - |
|
502 -} // namespace media |
|
503 - |
|
504 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane, |
|
505 + const uint8* uplane, |
|
506 + const uint8* vplane, |
|
507 + uint8* rgbframe, |
|
508 + int source_width, |
|
509 + int source_height, |
|
510 + int width, |
|
511 + int height, |
|
512 + int ystride, |
|
513 + int uvstride, |
|
514 + int rgbstride, |
|
515 + YUVType yuv_type, |
|
516 + Rotate view_rotate, |
|
517 + ScaleFilter filter); |
|
518 + |
|
519 +} // namespace gfx |
|
520 +} // namespace mozilla |
|
521 + |
|
522 #endif // MEDIA_BASE_YUV_CONVERT_H_ |
|
523 diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp |
|
524 new file mode 100644 |
|
525 --- /dev/null |
|
526 +++ b/gfx/ycbcr/yuv_convert_mmx.cpp |
|
527 @@ -0,0 +1,45 @@ |
|
528 +// Copyright (c) 2010 The Chromium Authors. All rights reserved. |
|
529 +// Use of this source code is governed by a BSD-style license that can be |
|
530 +// found in the LICENSE file. |
|
531 + |
|
532 +#include <mmintrin.h> |
|
533 +#include "yuv_row.h" |
|
534 + |
|
535 +namespace mozilla { |
|
536 +namespace gfx { |
|
537 + |
|
538 +// FilterRows combines two rows of the image using linear interpolation. |
|
539 +// MMX version does 8 pixels at a time. |
|
540 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, |
|
541 + int source_width, int source_y_fraction) { |
|
542 + __m64 zero = _mm_setzero_si64(); |
|
543 + __m64 y1_fraction = _mm_set1_pi16(source_y_fraction); |
|
544 + __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction); |
|
545 + |
|
546 + const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr); |
|
547 + const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr); |
|
548 + __m64* dest64 = reinterpret_cast<__m64*>(ybuf); |
|
549 + __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width); |
|
550 + |
|
551 + do { |
|
552 + __m64 y0 = *y0_ptr64++; |
|
553 + __m64 y1 = *y1_ptr64++; |
|
554 + __m64 y2 = _mm_unpackhi_pi8(y0, zero); |
|
555 + __m64 y3 = _mm_unpackhi_pi8(y1, zero); |
|
556 + y0 = _mm_unpacklo_pi8(y0, zero); |
|
557 + y1 = _mm_unpacklo_pi8(y1, zero); |
|
558 + y0 = _mm_mullo_pi16(y0, y0_fraction); |
|
559 + y1 = _mm_mullo_pi16(y1, y1_fraction); |
|
560 + y2 = _mm_mullo_pi16(y2, y0_fraction); |
|
561 + y3 = _mm_mullo_pi16(y3, y1_fraction); |
|
562 + y0 = _mm_add_pi16(y0, y1); |
|
563 + y2 = _mm_add_pi16(y2, y3); |
|
564 + y0 = _mm_srli_pi16(y0, 8); |
|
565 + y2 = _mm_srli_pi16(y2, 8); |
|
566 + y0 = _mm_packs_pu16(y0, y2); |
|
567 + *dest64++ = y0; |
|
568 + } while (dest64 < end64); |
|
569 +} |
|
570 + |
|
571 +} |
|
572 +} |
|
573 diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp |
|
574 new file mode 100644 |
|
575 --- /dev/null |
|
576 +++ b/gfx/ycbcr/yuv_convert_sse2.cpp |
|
577 @@ -0,0 +1,47 @@ |
|
578 +// Copyright (c) 2010 The Chromium Authors. All rights reserved. |
|
579 +// Use of this source code is governed by a BSD-style license that can be |
|
580 +// found in the LICENSE file. |
|
581 + |
|
582 +#include <emmintrin.h> |
|
583 +#include "yuv_row.h" |
|
584 + |
|
585 +namespace mozilla { |
|
586 +namespace gfx { |
|
587 + |
|
588 +// FilterRows combines two rows of the image using linear interpolation. |
|
589 +// SSE2 version does 16 pixels at a time. |
|
590 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, |
|
591 + int source_width, int source_y_fraction) { |
|
592 + __m128i zero = _mm_setzero_si128(); |
|
593 + __m128i y1_fraction = _mm_set1_epi16(source_y_fraction); |
|
594 + __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction); |
|
595 + |
|
596 + const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr); |
|
597 + const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr); |
|
598 + __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf); |
|
599 + __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width); |
|
600 + |
|
601 + do { |
|
602 + __m128i y0 = _mm_loadu_si128(y0_ptr128); |
|
603 + __m128i y1 = _mm_loadu_si128(y1_ptr128); |
|
604 + __m128i y2 = _mm_unpackhi_epi8(y0, zero); |
|
605 + __m128i y3 = _mm_unpackhi_epi8(y1, zero); |
|
606 + y0 = _mm_unpacklo_epi8(y0, zero); |
|
607 + y1 = _mm_unpacklo_epi8(y1, zero); |
|
608 + y0 = _mm_mullo_epi16(y0, y0_fraction); |
|
609 + y1 = _mm_mullo_epi16(y1, y1_fraction); |
|
610 + y2 = _mm_mullo_epi16(y2, y0_fraction); |
|
611 + y3 = _mm_mullo_epi16(y3, y1_fraction); |
|
612 + y0 = _mm_add_epi16(y0, y1); |
|
613 + y2 = _mm_add_epi16(y2, y3); |
|
614 + y0 = _mm_srli_epi16(y0, 8); |
|
615 + y2 = _mm_srli_epi16(y2, 8); |
|
616 + y0 = _mm_packus_epi16(y0, y2); |
|
617 + *dest128++ = y0; |
|
618 + ++y0_ptr128; |
|
619 + ++y1_ptr128; |
|
620 + } while (dest128 < end128); |
|
621 +} |
|
622 + |
|
623 +} |
|
624 +} |
|
625 diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h |
|
626 --- a/gfx/ycbcr/yuv_row.h |
|
627 +++ b/gfx/ycbcr/yuv_row.h |
|
628 @@ -5,109 +5,133 @@ |
|
629 // yuv_row internal functions to handle YUV conversion and scaling to RGB. |
|
630 // These functions are used from both yuv_convert.cc and yuv_scale.cc. |
|
631 |
|
632 // TODO(fbarchard): Write function that can handle rotation and scaling. |
|
633 |
|
634 #ifndef MEDIA_BASE_YUV_ROW_H_ |
|
635 #define MEDIA_BASE_YUV_ROW_H_ |
|
636 |
|
637 -#include "base/basictypes.h" |
|
638 +#include "chromium_types.h" |
|
639 |
|
640 extern "C" { |
|
641 // Can only do 1x. |
|
642 // This is the second fastest of the scalers. |
|
643 void FastConvertYUVToRGB32Row(const uint8* y_buf, |
|
644 const uint8* u_buf, |
|
645 const uint8* v_buf, |
|
646 uint8* rgb_buf, |
|
647 int width); |
|
648 |
|
649 -// Can do 1x, half size or any scale down by an integer amount. |
|
650 -// Step can be negative (mirroring, rotate 180). |
|
651 -// This is the third fastest of the scalers. |
|
652 -void ConvertYUVToRGB32Row(const uint8* y_buf, |
|
653 - const uint8* u_buf, |
|
654 - const uint8* v_buf, |
|
655 - uint8* rgb_buf, |
|
656 - int width, |
|
657 - int step); |
|
658 - |
|
659 -// Rotate is like Convert, but applies different step to Y versus U and V. |
|
660 -// This allows rotation by 90 or 270, by stepping by stride. |
|
661 -// This is the forth fastest of the scalers. |
|
662 -void RotateConvertYUVToRGB32Row(const uint8* y_buf, |
|
663 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf, |
|
664 const uint8* u_buf, |
|
665 const uint8* v_buf, |
|
666 uint8* rgb_buf, |
|
667 int width, |
|
668 - int ystep, |
|
669 - int uvstep); |
|
670 + unsigned int x_shift); |
|
671 + |
|
672 +void FastConvertYUVToRGB32Row(const uint8* y_buf, |
|
673 + const uint8* u_buf, |
|
674 + const uint8* v_buf, |
|
675 + uint8* rgb_buf, |
|
676 + int width); |
|
677 + |
|
678 +// Can do 1x, half size or any scale down by an integer amount. |
|
679 +// Step can be negative (mirroring, rotate 180). |
|
680 +// This is the third fastest of the scalers. |
|
681 +// Only defined on Windows x86-32. |
|
682 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
|
683 + const uint8* u_buf, |
|
684 + const uint8* v_buf, |
|
685 + uint8* rgb_buf, |
|
686 + int width, |
|
687 + int step); |
|
688 + |
|
689 +// Rotate is like Convert, but applies different step to Y versus U and V. |
|
690 +// This allows rotation by 90 or 270, by stepping by stride. |
|
691 +// This is the fourth fastest of the scalers. |
|
692 +// Only defined on Windows x86-32. |
|
693 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
|
694 + const uint8* u_buf, |
|
695 + const uint8* v_buf, |
|
696 + uint8* rgb_buf, |
|
697 + int width, |
|
698 + int ystep, |
|
699 + int uvstep); |
|
700 |
|
701 // Doubler does 4 pixels at a time. Each pixel is replicated. |
|
702 // This is the fastest of the scalers. |
|
703 -void DoubleYUVToRGB32Row(const uint8* y_buf, |
|
704 - const uint8* u_buf, |
|
705 - const uint8* v_buf, |
|
706 - uint8* rgb_buf, |
|
707 - int width); |
|
708 +// Only defined on Windows x86-32. |
|
709 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf, |
|
710 + const uint8* u_buf, |
|
711 + const uint8* v_buf, |
|
712 + uint8* rgb_buf, |
|
713 + int width); |
|
714 |
|
715 // Handles arbitrary scaling up or down. |
|
716 // Mirroring is supported, but not 90 or 270 degree rotation. |
|
717 // Chroma is under sampled every 2 pixels for performance. |
|
718 void ScaleYUVToRGB32Row(const uint8* y_buf, |
|
719 const uint8* u_buf, |
|
720 const uint8* v_buf, |
|
721 uint8* rgb_buf, |
|
722 int width, |
|
723 int source_dx); |
|
724 |
|
725 +void ScaleYUVToRGB32Row(const uint8* y_buf, |
|
726 + const uint8* u_buf, |
|
727 + const uint8* v_buf, |
|
728 + uint8* rgb_buf, |
|
729 + int width, |
|
730 + int source_dx); |
|
731 + |
|
732 +void ScaleYUVToRGB32Row_C(const uint8* y_buf, |
|
733 + const uint8* u_buf, |
|
734 + const uint8* v_buf, |
|
735 + uint8* rgb_buf, |
|
736 + int width, |
|
737 + int source_dx); |
|
738 + |
|
739 // Handles arbitrary scaling up or down with bilinear filtering. |
|
740 // Mirroring is supported, but not 90 or 270 degree rotation. |
|
741 // Chroma is under sampled every 2 pixels for performance. |
|
742 // This is the slowest of the scalers. |
|
743 void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
744 const uint8* u_buf, |
|
745 const uint8* v_buf, |
|
746 uint8* rgb_buf, |
|
747 int width, |
|
748 int source_dx); |
|
749 |
|
750 +void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
751 + const uint8* u_buf, |
|
752 + const uint8* v_buf, |
|
753 + uint8* rgb_buf, |
|
754 + int width, |
|
755 + int source_dx); |
|
756 + |
|
757 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf, |
|
758 + const uint8* u_buf, |
|
759 + const uint8* v_buf, |
|
760 + uint8* rgb_buf, |
|
761 + int width, |
|
762 + int source_dx); |
|
763 + |
|
764 + |
|
765 #if defined(_MSC_VER) |
|
766 #define SIMD_ALIGNED(var) __declspec(align(16)) var |
|
767 #else |
|
768 #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) |
|
769 #endif |
|
770 extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]); |
|
771 |
|
772 -// Method to force C version. |
|
773 -//#define USE_MMX 0 |
|
774 -//#define USE_SSE2 0 |
|
775 - |
|
776 -#if !defined(USE_MMX) |
|
777 -// Windows, Mac and Linux/BSD use MMX |
|
778 -#if defined(__MMX__) || defined(_MSC_VER) |
|
779 -#define USE_MMX 1 |
|
780 -#else |
|
781 -#define USE_MMX 0 |
|
782 -#endif |
|
783 -#endif |
|
784 - |
|
785 -#if !defined(USE_SSE2) |
|
786 -#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2 |
|
787 -#define USE_SSE2 1 |
|
788 -#else |
|
789 -#define USE_SSE2 0 |
|
790 -#endif |
|
791 -#endif |
|
792 - |
|
793 // x64 uses MMX2 (SSE) so emms is not required. |
|
794 // Warning C4799: function has no EMMS instruction. |
|
795 // EMMS() is slow and should be called by the calling function once per image. |
|
796 -#if USE_MMX && !defined(ARCH_CPU_X86_64) |
|
797 +#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64) |
|
798 #if defined(_MSC_VER) |
|
799 #define EMMS() __asm emms |
|
800 #pragma warning(disable: 4799) |
|
801 #else |
|
802 #define EMMS() asm("emms") |
|
803 #endif |
|
804 #else |
|
805 #define EMMS() |
|
806 diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp |
|
807 --- a/gfx/ycbcr/yuv_row_c.cpp |
|
808 +++ b/gfx/ycbcr/yuv_row_c.cpp |
|
809 @@ -1,812 +1,18 @@ |
|
810 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
|
811 // Use of this source code is governed by a BSD-style license that can be |
|
812 // found in the LICENSE file. |
|
813 |
|
814 -#include "media/base/yuv_row.h" |
|
815 - |
|
816 -#ifdef _DEBUG |
|
817 -#include "base/logging.h" |
|
818 -#else |
|
819 +#include "yuv_row.h" |
|
820 + |
|
821 #define DCHECK(a) |
|
822 -#endif |
|
823 |
|
824 extern "C" { |
|
825 |
|
826 -#if USE_SSE2 && defined(ARCH_CPU_X86_64) |
|
827 - |
|
828 -// AMD64 ABI uses register paremters. |
|
829 -void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi |
|
830 - const uint8* u_buf, // rsi |
|
831 - const uint8* v_buf, // rdx |
|
832 - uint8* rgb_buf, // rcx |
|
833 - int width) { // r8 |
|
834 - asm( |
|
835 - "jmp convertend\n" |
|
836 -"convertloop:" |
|
837 - "movzb (%1),%%r10\n" |
|
838 - "add $0x1,%1\n" |
|
839 - "movzb (%2),%%r11\n" |
|
840 - "add $0x1,%2\n" |
|
841 - "movq 2048(%5,%%r10,8),%%xmm0\n" |
|
842 - "movzb (%0),%%r10\n" |
|
843 - "movq 4096(%5,%%r11,8),%%xmm1\n" |
|
844 - "movzb 0x1(%0),%%r11\n" |
|
845 - "paddsw %%xmm1,%%xmm0\n" |
|
846 - "movq (%5,%%r10,8),%%xmm2\n" |
|
847 - "add $0x2,%0\n" |
|
848 - "movq (%5,%%r11,8),%%xmm3\n" |
|
849 - "paddsw %%xmm0,%%xmm2\n" |
|
850 - "paddsw %%xmm0,%%xmm3\n" |
|
851 - "shufps $0x44,%%xmm3,%%xmm2\n" |
|
852 - "psraw $0x6,%%xmm2\n" |
|
853 - "packuswb %%xmm2,%%xmm2\n" |
|
854 - "movq %%xmm2,0x0(%3)\n" |
|
855 - "add $0x8,%3\n" |
|
856 -"convertend:" |
|
857 - "sub $0x2,%4\n" |
|
858 - "jns convertloop\n" |
|
859 - |
|
860 -"convertnext:" |
|
861 - "add $0x1,%4\n" |
|
862 - "js convertdone\n" |
|
863 - |
|
864 - "movzb (%1),%%r10\n" |
|
865 - "movq 2048(%5,%%r10,8),%%xmm0\n" |
|
866 - "movzb (%2),%%r10\n" |
|
867 - "movq 4096(%5,%%r10,8),%%xmm1\n" |
|
868 - "paddsw %%xmm1,%%xmm0\n" |
|
869 - "movzb (%0),%%r10\n" |
|
870 - "movq (%5,%%r10,8),%%xmm1\n" |
|
871 - "paddsw %%xmm0,%%xmm1\n" |
|
872 - "psraw $0x6,%%xmm1\n" |
|
873 - "packuswb %%xmm1,%%xmm1\n" |
|
874 - "movd %%xmm1,0x0(%3)\n" |
|
875 -"convertdone:" |
|
876 - : |
|
877 - : "r"(y_buf), // %0 |
|
878 - "r"(u_buf), // %1 |
|
879 - "r"(v_buf), // %2 |
|
880 - "r"(rgb_buf), // %3 |
|
881 - "r"(width), // %4 |
|
882 - "r" (kCoefficientsRgbY) // %5 |
|
883 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" |
|
884 -); |
|
885 -} |
|
886 - |
|
887 -void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi |
|
888 - const uint8* u_buf, // rsi |
|
889 - const uint8* v_buf, // rdx |
|
890 - uint8* rgb_buf, // rcx |
|
891 - int width, // r8 |
|
892 - int source_dx) { // r9 |
|
893 - asm( |
|
894 - "xor %%r11,%%r11\n" |
|
895 - "sub $0x2,%4\n" |
|
896 - "js scalenext\n" |
|
897 - |
|
898 -"scaleloop:" |
|
899 - "mov %%r11,%%r10\n" |
|
900 - "sar $0x11,%%r10\n" |
|
901 - "movzb (%1,%%r10,1),%%rax\n" |
|
902 - "movq 2048(%5,%%rax,8),%%xmm0\n" |
|
903 - "movzb (%2,%%r10,1),%%rax\n" |
|
904 - "movq 4096(%5,%%rax,8),%%xmm1\n" |
|
905 - "lea (%%r11,%6),%%r10\n" |
|
906 - "sar $0x10,%%r11\n" |
|
907 - "movzb (%0,%%r11,1),%%rax\n" |
|
908 - "paddsw %%xmm1,%%xmm0\n" |
|
909 - "movq (%5,%%rax,8),%%xmm1\n" |
|
910 - "lea (%%r10,%6),%%r11\n" |
|
911 - "sar $0x10,%%r10\n" |
|
912 - "movzb (%0,%%r10,1),%%rax\n" |
|
913 - "movq (%5,%%rax,8),%%xmm2\n" |
|
914 - "paddsw %%xmm0,%%xmm1\n" |
|
915 - "paddsw %%xmm0,%%xmm2\n" |
|
916 - "shufps $0x44,%%xmm2,%%xmm1\n" |
|
917 - "psraw $0x6,%%xmm1\n" |
|
918 - "packuswb %%xmm1,%%xmm1\n" |
|
919 - "movq %%xmm1,0x0(%3)\n" |
|
920 - "add $0x8,%3\n" |
|
921 - "sub $0x2,%4\n" |
|
922 - "jns scaleloop\n" |
|
923 - |
|
924 -"scalenext:" |
|
925 - "add $0x1,%4\n" |
|
926 - "js scaledone\n" |
|
927 - |
|
928 - "mov %%r11,%%r10\n" |
|
929 - "sar $0x11,%%r10\n" |
|
930 - "movzb (%1,%%r10,1),%%rax\n" |
|
931 - "movq 2048(%5,%%rax,8),%%xmm0\n" |
|
932 - "movzb (%2,%%r10,1),%%rax\n" |
|
933 - "movq 4096(%5,%%rax,8),%%xmm1\n" |
|
934 - "paddsw %%xmm1,%%xmm0\n" |
|
935 - "sar $0x10,%%r11\n" |
|
936 - "movzb (%0,%%r11,1),%%rax\n" |
|
937 - "movq (%5,%%rax,8),%%xmm1\n" |
|
938 - "paddsw %%xmm0,%%xmm1\n" |
|
939 - "psraw $0x6,%%xmm1\n" |
|
940 - "packuswb %%xmm1,%%xmm1\n" |
|
941 - "movd %%xmm1,0x0(%3)\n" |
|
942 - |
|
943 -"scaledone:" |
|
944 - : |
|
945 - : "r"(y_buf), // %0 |
|
946 - "r"(u_buf), // %1 |
|
947 - "r"(v_buf), // %2 |
|
948 - "r"(rgb_buf), // %3 |
|
949 - "r"(width), // %4 |
|
950 - "r" (kCoefficientsRgbY), // %5 |
|
951 - "r"(static_cast<long>(source_dx)) // %6 |
|
952 - : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2" |
|
953 -); |
|
954 -} |
|
955 - |
|
956 -void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
957 - const uint8* u_buf, |
|
958 - const uint8* v_buf, |
|
959 - uint8* rgb_buf, |
|
960 - int width, |
|
961 - int source_dx) { |
|
962 - asm( |
|
963 - "xor %%r11,%%r11\n" // x = 0 |
|
964 - "sub $0x2,%4\n" |
|
965 - "js .lscalenext\n" |
|
966 - "cmp $0x20000,%6\n" // if source_dx >= 2.0 |
|
967 - "jl .lscalehalf\n" |
|
968 - "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less |
|
969 -".lscalehalf:" |
|
970 - |
|
971 -".lscaleloop:" |
|
972 - "mov %%r11,%%r10\n" |
|
973 - "sar $0x11,%%r10\n" |
|
974 - |
|
975 - "movzb (%1, %%r10, 1), %%r13 \n" |
|
976 - "movzb 1(%1, %%r10, 1), %%r14 \n" |
|
977 - "mov %%r11, %%rax \n" |
|
978 - "and $0x1fffe, %%rax \n" |
|
979 - "imul %%rax, %%r14 \n" |
|
980 - "xor $0x1fffe, %%rax \n" |
|
981 - "imul %%rax, %%r13 \n" |
|
982 - "add %%r14, %%r13 \n" |
|
983 - "shr $17, %%r13 \n" |
|
984 - "movq 2048(%5,%%r13,8), %%xmm0\n" |
|
985 - |
|
986 - "movzb (%2, %%r10, 1), %%r13 \n" |
|
987 - "movzb 1(%2, %%r10, 1), %%r14 \n" |
|
988 - "mov %%r11, %%rax \n" |
|
989 - "and $0x1fffe, %%rax \n" |
|
990 - "imul %%rax, %%r14 \n" |
|
991 - "xor $0x1fffe, %%rax \n" |
|
992 - "imul %%rax, %%r13 \n" |
|
993 - "add %%r14, %%r13 \n" |
|
994 - "shr $17, %%r13 \n" |
|
995 - "movq 4096(%5,%%r13,8), %%xmm1\n" |
|
996 - |
|
997 - "mov %%r11, %%rax \n" |
|
998 - "lea (%%r11,%6),%%r10\n" |
|
999 - "sar $0x10,%%r11\n" |
|
1000 - "paddsw %%xmm1,%%xmm0\n" |
|
1001 - |
|
1002 - "movzb (%0, %%r11, 1), %%r13 \n" |
|
1003 - "movzb 1(%0, %%r11, 1), %%r14 \n" |
|
1004 - "and $0xffff, %%rax \n" |
|
1005 - "imul %%rax, %%r14 \n" |
|
1006 - "xor $0xffff, %%rax \n" |
|
1007 - "imul %%rax, %%r13 \n" |
|
1008 - "add %%r14, %%r13 \n" |
|
1009 - "shr $16, %%r13 \n" |
|
1010 - "movq (%5,%%r13,8),%%xmm1\n" |
|
1011 - |
|
1012 - "mov %%r10, %%rax \n" |
|
1013 - "lea (%%r10,%6),%%r11\n" |
|
1014 - "sar $0x10,%%r10\n" |
|
1015 - |
|
1016 - "movzb (%0,%%r10,1), %%r13 \n" |
|
1017 - "movzb 1(%0,%%r10,1), %%r14 \n" |
|
1018 - "and $0xffff, %%rax \n" |
|
1019 - "imul %%rax, %%r14 \n" |
|
1020 - "xor $0xffff, %%rax \n" |
|
1021 - "imul %%rax, %%r13 \n" |
|
1022 - "add %%r14, %%r13 \n" |
|
1023 - "shr $16, %%r13 \n" |
|
1024 - "movq (%5,%%r13,8),%%xmm2\n" |
|
1025 - |
|
1026 - "paddsw %%xmm0,%%xmm1\n" |
|
1027 - "paddsw %%xmm0,%%xmm2\n" |
|
1028 - "shufps $0x44,%%xmm2,%%xmm1\n" |
|
1029 - "psraw $0x6,%%xmm1\n" |
|
1030 - "packuswb %%xmm1,%%xmm1\n" |
|
1031 - "movq %%xmm1,0x0(%3)\n" |
|
1032 - "add $0x8,%3\n" |
|
1033 - "sub $0x2,%4\n" |
|
1034 - "jns .lscaleloop\n" |
|
1035 - |
|
1036 -".lscalenext:" |
|
1037 - "add $0x1,%4\n" |
|
1038 - "js .lscaledone\n" |
|
1039 - |
|
1040 - "mov %%r11,%%r10\n" |
|
1041 - "sar $0x11,%%r10\n" |
|
1042 - |
|
1043 - "movzb (%1,%%r10,1), %%r13 \n" |
|
1044 - "movq 2048(%5,%%r13,8),%%xmm0\n" |
|
1045 - |
|
1046 - "movzb (%2,%%r10,1), %%r13 \n" |
|
1047 - "movq 4096(%5,%%r13,8),%%xmm1\n" |
|
1048 - |
|
1049 - "paddsw %%xmm1,%%xmm0\n" |
|
1050 - "sar $0x10,%%r11\n" |
|
1051 - |
|
1052 - "movzb (%0,%%r11,1), %%r13 \n" |
|
1053 - "movq (%5,%%r13,8),%%xmm1\n" |
|
1054 - |
|
1055 - "paddsw %%xmm0,%%xmm1\n" |
|
1056 - "psraw $0x6,%%xmm1\n" |
|
1057 - "packuswb %%xmm1,%%xmm1\n" |
|
1058 - "movd %%xmm1,0x0(%3)\n" |
|
1059 - |
|
1060 -".lscaledone:" |
|
1061 - : |
|
1062 - : "r"(y_buf), // %0 |
|
1063 - "r"(u_buf), // %1 |
|
1064 - "r"(v_buf), // %2 |
|
1065 - "r"(rgb_buf), // %3 |
|
1066 - "r"(width), // %4 |
|
1067 - "r" (kCoefficientsRgbY), // %5 |
|
1068 - "r"(static_cast<long>(source_dx)) // %6 |
|
1069 - : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" |
|
1070 -); |
|
1071 -} |
|
1072 - |
|
1073 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__) |
|
1074 - |
|
1075 -// PIC version is slower because less registers are available, so |
|
1076 -// non-PIC is used on platforms where it is possible. |
|
1077 - |
|
1078 -void FastConvertYUVToRGB32Row(const uint8* y_buf, |
|
1079 - const uint8* u_buf, |
|
1080 - const uint8* v_buf, |
|
1081 - uint8* rgb_buf, |
|
1082 - int width); |
|
1083 - asm( |
|
1084 - ".text\n" |
|
1085 - ".global FastConvertYUVToRGB32Row\n" |
|
1086 -"FastConvertYUVToRGB32Row:\n" |
|
1087 - "pusha\n" |
|
1088 - "mov 0x24(%esp),%edx\n" |
|
1089 - "mov 0x28(%esp),%edi\n" |
|
1090 - "mov 0x2c(%esp),%esi\n" |
|
1091 - "mov 0x30(%esp),%ebp\n" |
|
1092 - "mov 0x34(%esp),%ecx\n" |
|
1093 - "jmp convertend\n" |
|
1094 - |
|
1095 -"convertloop:" |
|
1096 - "movzbl (%edi),%eax\n" |
|
1097 - "add $0x1,%edi\n" |
|
1098 - "movzbl (%esi),%ebx\n" |
|
1099 - "add $0x1,%esi\n" |
|
1100 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
|
1101 - "movzbl (%edx),%eax\n" |
|
1102 - "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" |
|
1103 - "movzbl 0x1(%edx),%ebx\n" |
|
1104 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n" |
|
1105 - "add $0x2,%edx\n" |
|
1106 - "movq kCoefficientsRgbY(,%ebx,8),%mm2\n" |
|
1107 - "paddsw %mm0,%mm1\n" |
|
1108 - "paddsw %mm0,%mm2\n" |
|
1109 - "psraw $0x6,%mm1\n" |
|
1110 - "psraw $0x6,%mm2\n" |
|
1111 - "packuswb %mm2,%mm1\n" |
|
1112 - "movntq %mm1,0x0(%ebp)\n" |
|
1113 - "add $0x8,%ebp\n" |
|
1114 -"convertend:" |
|
1115 - "sub $0x2,%ecx\n" |
|
1116 - "jns convertloop\n" |
|
1117 - |
|
1118 - "and $0x1,%ecx\n" |
|
1119 - "je convertdone\n" |
|
1120 - |
|
1121 - "movzbl (%edi),%eax\n" |
|
1122 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
|
1123 - "movzbl (%esi),%eax\n" |
|
1124 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" |
|
1125 - "movzbl (%edx),%eax\n" |
|
1126 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n" |
|
1127 - "paddsw %mm0,%mm1\n" |
|
1128 - "psraw $0x6,%mm1\n" |
|
1129 - "packuswb %mm1,%mm1\n" |
|
1130 - "movd %mm1,0x0(%ebp)\n" |
|
1131 -"convertdone:" |
|
1132 - "popa\n" |
|
1133 - "ret\n" |
|
1134 -); |
|
1135 - |
|
1136 - |
|
1137 -void ScaleYUVToRGB32Row(const uint8* y_buf, |
|
1138 - const uint8* u_buf, |
|
1139 - const uint8* v_buf, |
|
1140 - uint8* rgb_buf, |
|
1141 - int width, |
|
1142 - int source_dx); |
|
1143 - asm( |
|
1144 - ".text\n" |
|
1145 - ".global ScaleYUVToRGB32Row\n" |
|
1146 -"ScaleYUVToRGB32Row:\n" |
|
1147 - "pusha\n" |
|
1148 - "mov 0x24(%esp),%edx\n" |
|
1149 - "mov 0x28(%esp),%edi\n" |
|
1150 - "mov 0x2c(%esp),%esi\n" |
|
1151 - "mov 0x30(%esp),%ebp\n" |
|
1152 - "mov 0x34(%esp),%ecx\n" |
|
1153 - "xor %ebx,%ebx\n" |
|
1154 - "jmp scaleend\n" |
|
1155 - |
|
1156 -"scaleloop:" |
|
1157 - "mov %ebx,%eax\n" |
|
1158 - "sar $0x11,%eax\n" |
|
1159 - "movzbl (%edi,%eax,1),%eax\n" |
|
1160 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
|
1161 - "mov %ebx,%eax\n" |
|
1162 - "sar $0x11,%eax\n" |
|
1163 - "movzbl (%esi,%eax,1),%eax\n" |
|
1164 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" |
|
1165 - "mov %ebx,%eax\n" |
|
1166 - "add 0x38(%esp),%ebx\n" |
|
1167 - "sar $0x10,%eax\n" |
|
1168 - "movzbl (%edx,%eax,1),%eax\n" |
|
1169 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n" |
|
1170 - "mov %ebx,%eax\n" |
|
1171 - "add 0x38(%esp),%ebx\n" |
|
1172 - "sar $0x10,%eax\n" |
|
1173 - "movzbl (%edx,%eax,1),%eax\n" |
|
1174 - "movq kCoefficientsRgbY(,%eax,8),%mm2\n" |
|
1175 - "paddsw %mm0,%mm1\n" |
|
1176 - "paddsw %mm0,%mm2\n" |
|
1177 - "psraw $0x6,%mm1\n" |
|
1178 - "psraw $0x6,%mm2\n" |
|
1179 - "packuswb %mm2,%mm1\n" |
|
1180 - "movntq %mm1,0x0(%ebp)\n" |
|
1181 - "add $0x8,%ebp\n" |
|
1182 -"scaleend:" |
|
1183 - "sub $0x2,%ecx\n" |
|
1184 - "jns scaleloop\n" |
|
1185 - |
|
1186 - "and $0x1,%ecx\n" |
|
1187 - "je scaledone\n" |
|
1188 - |
|
1189 - "mov %ebx,%eax\n" |
|
1190 - "sar $0x11,%eax\n" |
|
1191 - "movzbl (%edi,%eax,1),%eax\n" |
|
1192 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
|
1193 - "mov %ebx,%eax\n" |
|
1194 - "sar $0x11,%eax\n" |
|
1195 - "movzbl (%esi,%eax,1),%eax\n" |
|
1196 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" |
|
1197 - "mov %ebx,%eax\n" |
|
1198 - "sar $0x10,%eax\n" |
|
1199 - "movzbl (%edx,%eax,1),%eax\n" |
|
1200 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n" |
|
1201 - "paddsw %mm0,%mm1\n" |
|
1202 - "psraw $0x6,%mm1\n" |
|
1203 - "packuswb %mm1,%mm1\n" |
|
1204 - "movd %mm1,0x0(%ebp)\n" |
|
1205 - |
|
1206 -"scaledone:" |
|
1207 - "popa\n" |
|
1208 - "ret\n" |
|
1209 -); |
|
1210 - |
|
1211 -void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
1212 - const uint8* u_buf, |
|
1213 - const uint8* v_buf, |
|
1214 - uint8* rgb_buf, |
|
1215 - int width, |
|
1216 - int source_dx); |
|
1217 - asm( |
|
1218 - ".text\n" |
|
1219 - ".global LinearScaleYUVToRGB32Row\n" |
|
1220 -"LinearScaleYUVToRGB32Row:\n" |
|
1221 - "pusha\n" |
|
1222 - "mov 0x24(%esp),%edx\n" |
|
1223 - "mov 0x28(%esp),%edi\n" |
|
1224 - "mov 0x30(%esp),%ebp\n" |
|
1225 - |
|
1226 - // source_width = width * source_dx + ebx |
|
1227 - "mov 0x34(%esp), %ecx\n" |
|
1228 - "imull 0x38(%esp), %ecx\n" |
|
1229 - "mov %ecx, 0x34(%esp)\n" |
|
1230 - |
|
1231 - "mov 0x38(%esp), %ecx\n" |
|
1232 - "xor %ebx,%ebx\n" // x = 0 |
|
1233 - "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 |
|
1234 - "jl .lscaleend\n" |
|
1235 - "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less |
|
1236 - "jmp .lscaleend\n" |
|
1237 - |
|
1238 -".lscaleloop:" |
|
1239 - "mov %ebx,%eax\n" |
|
1240 - "sar $0x11,%eax\n" |
|
1241 - |
|
1242 - "movzbl (%edi,%eax,1),%ecx\n" |
|
1243 - "movzbl 1(%edi,%eax,1),%esi\n" |
|
1244 - "mov %ebx,%eax\n" |
|
1245 - "andl $0x1fffe, %eax \n" |
|
1246 - "imul %eax, %esi \n" |
|
1247 - "xorl $0x1fffe, %eax \n" |
|
1248 - "imul %eax, %ecx \n" |
|
1249 - "addl %esi, %ecx \n" |
|
1250 - "shrl $17, %ecx \n" |
|
1251 - "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n" |
|
1252 - |
|
1253 - "mov 0x2c(%esp),%esi\n" |
|
1254 - "mov %ebx,%eax\n" |
|
1255 - "sar $0x11,%eax\n" |
|
1256 - |
|
1257 - "movzbl (%esi,%eax,1),%ecx\n" |
|
1258 - "movzbl 1(%esi,%eax,1),%esi\n" |
|
1259 - "mov %ebx,%eax\n" |
|
1260 - "andl $0x1fffe, %eax \n" |
|
1261 - "imul %eax, %esi \n" |
|
1262 - "xorl $0x1fffe, %eax \n" |
|
1263 - "imul %eax, %ecx \n" |
|
1264 - "addl %esi, %ecx \n" |
|
1265 - "shrl $17, %ecx \n" |
|
1266 - "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n" |
|
1267 - |
|
1268 - "mov %ebx,%eax\n" |
|
1269 - "sar $0x10,%eax\n" |
|
1270 - "movzbl (%edx,%eax,1),%ecx\n" |
|
1271 - "movzbl 1(%edx,%eax,1),%esi\n" |
|
1272 - "mov %ebx,%eax\n" |
|
1273 - "add 0x38(%esp),%ebx\n" |
|
1274 - "andl $0xffff, %eax \n" |
|
1275 - "imul %eax, %esi \n" |
|
1276 - "xorl $0xffff, %eax \n" |
|
1277 - "imul %eax, %ecx \n" |
|
1278 - "addl %esi, %ecx \n" |
|
1279 - "shrl $16, %ecx \n" |
|
1280 - "movq kCoefficientsRgbY(,%ecx,8),%mm1\n" |
|
1281 - |
|
1282 - "cmp 0x34(%esp), %ebx\n" |
|
1283 - "jge .lscalelastpixel\n" |
|
1284 - |
|
1285 - "mov %ebx,%eax\n" |
|
1286 - "sar $0x10,%eax\n" |
|
1287 - "movzbl (%edx,%eax,1),%ecx\n" |
|
1288 - "movzbl 1(%edx,%eax,1),%esi\n" |
|
1289 - "mov %ebx,%eax\n" |
|
1290 - "add 0x38(%esp),%ebx\n" |
|
1291 - "andl $0xffff, %eax \n" |
|
1292 - "imul %eax, %esi \n" |
|
1293 - "xorl $0xffff, %eax \n" |
|
1294 - "imul %eax, %ecx \n" |
|
1295 - "addl %esi, %ecx \n" |
|
1296 - "shrl $16, %ecx \n" |
|
1297 - "movq kCoefficientsRgbY(,%ecx,8),%mm2\n" |
|
1298 - |
|
1299 - "paddsw %mm0,%mm1\n" |
|
1300 - "paddsw %mm0,%mm2\n" |
|
1301 - "psraw $0x6,%mm1\n" |
|
1302 - "psraw $0x6,%mm2\n" |
|
1303 - "packuswb %mm2,%mm1\n" |
|
1304 - "movntq %mm1,0x0(%ebp)\n" |
|
1305 - "add $0x8,%ebp\n" |
|
1306 - |
|
1307 -".lscaleend:" |
|
1308 - "cmp 0x34(%esp), %ebx\n" |
|
1309 - "jl .lscaleloop\n" |
|
1310 - "popa\n" |
|
1311 - "ret\n" |
|
1312 - |
|
1313 -".lscalelastpixel:" |
|
1314 - "paddsw %mm0, %mm1\n" |
|
1315 - "psraw $6, %mm1\n" |
|
1316 - "packuswb %mm1, %mm1\n" |
|
1317 - "movd %mm1, (%ebp)\n" |
|
1318 - "popa\n" |
|
1319 - "ret\n" |
|
1320 -); |
|
1321 - |
|
1322 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__) |
|
1323 - |
|
1324 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf, |
|
1325 - const uint8* u_buf, |
|
1326 - const uint8* v_buf, |
|
1327 - uint8* rgb_buf, |
|
1328 - int width, |
|
1329 - int16 *kCoefficientsRgbY); |
|
1330 - asm( |
|
1331 - ".text\n" |
|
1332 -#if defined(OS_MACOSX) |
|
1333 -"_PICConvertYUVToRGB32Row:\n" |
|
1334 -#else |
|
1335 -"PICConvertYUVToRGB32Row:\n" |
|
1336 -#endif |
|
1337 - "pusha\n" |
|
1338 - "mov 0x24(%esp),%edx\n" |
|
1339 - "mov 0x28(%esp),%edi\n" |
|
1340 - "mov 0x2c(%esp),%esi\n" |
|
1341 - "mov 0x30(%esp),%ebp\n" |
|
1342 - "mov 0x38(%esp),%ecx\n" |
|
1343 - |
|
1344 - "jmp .Lconvertend\n" |
|
1345 - |
|
1346 -".Lconvertloop:" |
|
1347 - "movzbl (%edi),%eax\n" |
|
1348 - "add $0x1,%edi\n" |
|
1349 - "movzbl (%esi),%ebx\n" |
|
1350 - "add $0x1,%esi\n" |
|
1351 - "movq 2048(%ecx,%eax,8),%mm0\n" |
|
1352 - "movzbl (%edx),%eax\n" |
|
1353 - "paddsw 4096(%ecx,%ebx,8),%mm0\n" |
|
1354 - "movzbl 0x1(%edx),%ebx\n" |
|
1355 - "movq 0(%ecx,%eax,8),%mm1\n" |
|
1356 - "add $0x2,%edx\n" |
|
1357 - "movq 0(%ecx,%ebx,8),%mm2\n" |
|
1358 - "paddsw %mm0,%mm1\n" |
|
1359 - "paddsw %mm0,%mm2\n" |
|
1360 - "psraw $0x6,%mm1\n" |
|
1361 - "psraw $0x6,%mm2\n" |
|
1362 - "packuswb %mm2,%mm1\n" |
|
1363 - "movntq %mm1,0x0(%ebp)\n" |
|
1364 - "add $0x8,%ebp\n" |
|
1365 -".Lconvertend:" |
|
1366 - "subl $0x2,0x34(%esp)\n" |
|
1367 - "jns .Lconvertloop\n" |
|
1368 - |
|
1369 - "andl $0x1,0x34(%esp)\n" |
|
1370 - "je .Lconvertdone\n" |
|
1371 - |
|
1372 - "movzbl (%edi),%eax\n" |
|
1373 - "movq 2048(%ecx,%eax,8),%mm0\n" |
|
1374 - "movzbl (%esi),%eax\n" |
|
1375 - "paddsw 4096(%ecx,%eax,8),%mm0\n" |
|
1376 - "movzbl (%edx),%eax\n" |
|
1377 - "movq 0(%ecx,%eax,8),%mm1\n" |
|
1378 - "paddsw %mm0,%mm1\n" |
|
1379 - "psraw $0x6,%mm1\n" |
|
1380 - "packuswb %mm1,%mm1\n" |
|
1381 - "movd %mm1,0x0(%ebp)\n" |
|
1382 -".Lconvertdone:\n" |
|
1383 - "popa\n" |
|
1384 - "ret\n" |
|
1385 -); |
|
1386 - |
|
1387 -void FastConvertYUVToRGB32Row(const uint8* y_buf, |
|
1388 - const uint8* u_buf, |
|
1389 - const uint8* v_buf, |
|
1390 - uint8* rgb_buf, |
|
1391 - int width) { |
|
1392 - PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, |
|
1393 - &kCoefficientsRgbY[0][0]); |
|
1394 -} |
|
1395 - |
|
1396 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf, |
|
1397 - const uint8* u_buf, |
|
1398 - const uint8* v_buf, |
|
1399 - uint8* rgb_buf, |
|
1400 - int width, |
|
1401 - int source_dx, |
|
1402 - int16 *kCoefficientsRgbY); |
|
1403 - |
|
1404 - asm( |
|
1405 - ".text\n" |
|
1406 -#if defined(OS_MACOSX) |
|
1407 -"_PICScaleYUVToRGB32Row:\n" |
|
1408 -#else |
|
1409 -"PICScaleYUVToRGB32Row:\n" |
|
1410 -#endif |
|
1411 - "pusha\n" |
|
1412 - "mov 0x24(%esp),%edx\n" |
|
1413 - "mov 0x28(%esp),%edi\n" |
|
1414 - "mov 0x2c(%esp),%esi\n" |
|
1415 - "mov 0x30(%esp),%ebp\n" |
|
1416 - "mov 0x3c(%esp),%ecx\n" |
|
1417 - "xor %ebx,%ebx\n" |
|
1418 - "jmp Lscaleend\n" |
|
1419 - |
|
1420 -"Lscaleloop:" |
|
1421 - "mov %ebx,%eax\n" |
|
1422 - "sar $0x11,%eax\n" |
|
1423 - "movzbl (%edi,%eax,1),%eax\n" |
|
1424 - "movq 2048(%ecx,%eax,8),%mm0\n" |
|
1425 - "mov %ebx,%eax\n" |
|
1426 - "sar $0x11,%eax\n" |
|
1427 - "movzbl (%esi,%eax,1),%eax\n" |
|
1428 - "paddsw 4096(%ecx,%eax,8),%mm0\n" |
|
1429 - "mov %ebx,%eax\n" |
|
1430 - "add 0x38(%esp),%ebx\n" |
|
1431 - "sar $0x10,%eax\n" |
|
1432 - "movzbl (%edx,%eax,1),%eax\n" |
|
1433 - "movq 0(%ecx,%eax,8),%mm1\n" |
|
1434 - "mov %ebx,%eax\n" |
|
1435 - "add 0x38(%esp),%ebx\n" |
|
1436 - "sar $0x10,%eax\n" |
|
1437 - "movzbl (%edx,%eax,1),%eax\n" |
|
1438 - "movq 0(%ecx,%eax,8),%mm2\n" |
|
1439 - "paddsw %mm0,%mm1\n" |
|
1440 - "paddsw %mm0,%mm2\n" |
|
1441 - "psraw $0x6,%mm1\n" |
|
1442 - "psraw $0x6,%mm2\n" |
|
1443 - "packuswb %mm2,%mm1\n" |
|
1444 - "movntq %mm1,0x0(%ebp)\n" |
|
1445 - "add $0x8,%ebp\n" |
|
1446 -"Lscaleend:" |
|
1447 - "subl $0x2,0x34(%esp)\n" |
|
1448 - "jns Lscaleloop\n" |
|
1449 - |
|
1450 - "andl $0x1,0x34(%esp)\n" |
|
1451 - "je Lscaledone\n" |
|
1452 - |
|
1453 - "mov %ebx,%eax\n" |
|
1454 - "sar $0x11,%eax\n" |
|
1455 - "movzbl (%edi,%eax,1),%eax\n" |
|
1456 - "movq 2048(%ecx,%eax,8),%mm0\n" |
|
1457 - "mov %ebx,%eax\n" |
|
1458 - "sar $0x11,%eax\n" |
|
1459 - "movzbl (%esi,%eax,1),%eax\n" |
|
1460 - "paddsw 4096(%ecx,%eax,8),%mm0\n" |
|
1461 - "mov %ebx,%eax\n" |
|
1462 - "sar $0x10,%eax\n" |
|
1463 - "movzbl (%edx,%eax,1),%eax\n" |
|
1464 - "movq 0(%ecx,%eax,8),%mm1\n" |
|
1465 - "paddsw %mm0,%mm1\n" |
|
1466 - "psraw $0x6,%mm1\n" |
|
1467 - "packuswb %mm1,%mm1\n" |
|
1468 - "movd %mm1,0x0(%ebp)\n" |
|
1469 - |
|
1470 -"Lscaledone:" |
|
1471 - "popa\n" |
|
1472 - "ret\n" |
|
1473 -); |
|
1474 - |
|
1475 - |
|
1476 -void ScaleYUVToRGB32Row(const uint8* y_buf, |
|
1477 - const uint8* u_buf, |
|
1478 - const uint8* v_buf, |
|
1479 - uint8* rgb_buf, |
|
1480 - int width, |
|
1481 - int source_dx) { |
|
1482 - PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, |
|
1483 - &kCoefficientsRgbY[0][0]); |
|
1484 -} |
|
1485 - |
|
1486 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
1487 - const uint8* u_buf, |
|
1488 - const uint8* v_buf, |
|
1489 - uint8* rgb_buf, |
|
1490 - int width, |
|
1491 - int source_dx, |
|
1492 - int16 *kCoefficientsRgbY); |
|
1493 - asm( |
|
1494 - ".text\n" |
|
1495 -#if defined(OS_MACOSX) |
|
1496 -"_PICLinearScaleYUVToRGB32Row:\n" |
|
1497 -#else |
|
1498 -"PICLinearScaleYUVToRGB32Row:\n" |
|
1499 -#endif |
|
1500 - "pusha\n" |
|
1501 - "mov 0x24(%esp),%edx\n" |
|
1502 - "mov 0x30(%esp),%ebp\n" |
|
1503 - "mov 0x34(%esp),%ecx\n" |
|
1504 - "mov 0x3c(%esp),%edi\n" |
|
1505 - "xor %ebx,%ebx\n" |
|
1506 - |
|
1507 - // source_width = width * source_dx + ebx |
|
1508 - "mov 0x34(%esp), %ecx\n" |
|
1509 - "imull 0x38(%esp), %ecx\n" |
|
1510 - "mov %ecx, 0x34(%esp)\n" |
|
1511 - |
|
1512 - "mov 0x38(%esp), %ecx\n" |
|
1513 - "xor %ebx,%ebx\n" // x = 0 |
|
1514 - "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 |
|
1515 - "jl .lscaleend\n" |
|
1516 - "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less |
|
1517 - "jmp .lscaleend\n" |
|
1518 - |
|
1519 -".lscaleloop:" |
|
1520 - "mov 0x28(%esp),%esi\n" |
|
1521 - "mov %ebx,%eax\n" |
|
1522 - "sar $0x11,%eax\n" |
|
1523 - |
|
1524 - "movzbl (%esi,%eax,1),%ecx\n" |
|
1525 - "movzbl 1(%esi,%eax,1),%esi\n" |
|
1526 - "mov %ebx,%eax\n" |
|
1527 - "andl $0x1fffe, %eax \n" |
|
1528 - "imul %eax, %esi \n" |
|
1529 - "xorl $0x1fffe, %eax \n" |
|
1530 - "imul %eax, %ecx \n" |
|
1531 - "addl %esi, %ecx \n" |
|
1532 - "shrl $17, %ecx \n" |
|
1533 - "movq 2048(%edi,%ecx,8),%mm0\n" |
|
1534 - |
|
1535 - "mov 0x2c(%esp),%esi\n" |
|
1536 - "mov %ebx,%eax\n" |
|
1537 - "sar $0x11,%eax\n" |
|
1538 - |
|
1539 - "movzbl (%esi,%eax,1),%ecx\n" |
|
1540 - "movzbl 1(%esi,%eax,1),%esi\n" |
|
1541 - "mov %ebx,%eax\n" |
|
1542 - "andl $0x1fffe, %eax \n" |
|
1543 - "imul %eax, %esi \n" |
|
1544 - "xorl $0x1fffe, %eax \n" |
|
1545 - "imul %eax, %ecx \n" |
|
1546 - "addl %esi, %ecx \n" |
|
1547 - "shrl $17, %ecx \n" |
|
1548 - "paddsw 4096(%edi,%ecx,8),%mm0\n" |
|
1549 - |
|
1550 - "mov %ebx,%eax\n" |
|
1551 - "sar $0x10,%eax\n" |
|
1552 - "movzbl (%edx,%eax,1),%ecx\n" |
|
1553 - "movzbl 1(%edx,%eax,1),%esi\n" |
|
1554 - "mov %ebx,%eax\n" |
|
1555 - "add 0x38(%esp),%ebx\n" |
|
1556 - "andl $0xffff, %eax \n" |
|
1557 - "imul %eax, %esi \n" |
|
1558 - "xorl $0xffff, %eax \n" |
|
1559 - "imul %eax, %ecx \n" |
|
1560 - "addl %esi, %ecx \n" |
|
1561 - "shrl $16, %ecx \n" |
|
1562 - "movq (%edi,%ecx,8),%mm1\n" |
|
1563 - |
|
1564 - "cmp 0x34(%esp), %ebx\n" |
|
1565 - "jge .lscalelastpixel\n" |
|
1566 - |
|
1567 - "mov %ebx,%eax\n" |
|
1568 - "sar $0x10,%eax\n" |
|
1569 - "movzbl (%edx,%eax,1),%ecx\n" |
|
1570 - "movzbl 1(%edx,%eax,1),%esi\n" |
|
1571 - "mov %ebx,%eax\n" |
|
1572 - "add 0x38(%esp),%ebx\n" |
|
1573 - "andl $0xffff, %eax \n" |
|
1574 - "imul %eax, %esi \n" |
|
1575 - "xorl $0xffff, %eax \n" |
|
1576 - "imul %eax, %ecx \n" |
|
1577 - "addl %esi, %ecx \n" |
|
1578 - "shrl $16, %ecx \n" |
|
1579 - "movq (%edi,%ecx,8),%mm2\n" |
|
1580 - |
|
1581 - "paddsw %mm0,%mm1\n" |
|
1582 - "paddsw %mm0,%mm2\n" |
|
1583 - "psraw $0x6,%mm1\n" |
|
1584 - "psraw $0x6,%mm2\n" |
|
1585 - "packuswb %mm2,%mm1\n" |
|
1586 - "movntq %mm1,0x0(%ebp)\n" |
|
1587 - "add $0x8,%ebp\n" |
|
1588 - |
|
1589 -".lscaleend:" |
|
1590 - "cmp %ebx, 0x34(%esp)\n" |
|
1591 - "jg .lscaleloop\n" |
|
1592 - "popa\n" |
|
1593 - "ret\n" |
|
1594 - |
|
1595 -".lscalelastpixel:" |
|
1596 - "paddsw %mm0, %mm1\n" |
|
1597 - "psraw $6, %mm1\n" |
|
1598 - "packuswb %mm1, %mm1\n" |
|
1599 - "movd %mm1, (%ebp)\n" |
|
1600 - "popa\n" |
|
1601 - "ret\n" |
|
1602 -); |
|
1603 - |
|
1604 -void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
1605 - const uint8* u_buf, |
|
1606 - const uint8* v_buf, |
|
1607 - uint8* rgb_buf, |
|
1608 - int width, |
|
1609 - int source_dx) { |
|
1610 - PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, |
|
1611 - &kCoefficientsRgbY[0][0]); |
|
1612 -} |
|
1613 - |
|
1614 -#else // USE_MMX |
|
1615 - |
|
1616 // C reference code that mimic the YUV assembly. |
|
1617 #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) |
|
1618 #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ |
|
1619 (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) |
|
1620 |
|
1621 static inline void YuvPixel(uint8 y, |
|
1622 uint8 u, |
|
1623 uint8 v, |
|
1624 @@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y, |
|
1625 a >>= 6; |
|
1626 |
|
1627 *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) | |
|
1628 (packuswb(g) << 8) | |
|
1629 (packuswb(r) << 16) | |
|
1630 (packuswb(a) << 24); |
|
1631 } |
|
1632 |
|
1633 -void FastConvertYUVToRGB32Row(const uint8* y_buf, |
|
1634 - const uint8* u_buf, |
|
1635 - const uint8* v_buf, |
|
1636 - uint8* rgb_buf, |
|
1637 - int width) { |
|
1638 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf, |
|
1639 + const uint8* u_buf, |
|
1640 + const uint8* v_buf, |
|
1641 + uint8* rgb_buf, |
|
1642 + int width, |
|
1643 + unsigned int x_shift) { |
|
1644 for (int x = 0; x < width; x += 2) { |
|
1645 - uint8 u = u_buf[x >> 1]; |
|
1646 - uint8 v = v_buf[x >> 1]; |
|
1647 + uint8 u = u_buf[x >> x_shift]; |
|
1648 + uint8 v = v_buf[x >> x_shift]; |
|
1649 uint8 y0 = y_buf[x]; |
|
1650 YuvPixel(y0, u, v, rgb_buf); |
|
1651 if ((x + 1) < width) { |
|
1652 uint8 y1 = y_buf[x + 1]; |
|
1653 + if (x_shift == 0) { |
|
1654 + u = u_buf[x + 1]; |
|
1655 + v = v_buf[x + 1]; |
|
1656 + } |
|
1657 YuvPixel(y1, u, v, rgb_buf + 4); |
|
1658 } |
|
1659 rgb_buf += 8; // Advance 2 pixels. |
|
1660 } |
|
1661 } |
|
1662 |
|
1663 // 16.16 fixed point is used. A shift by 16 isolates the integer. |
|
1664 // A shift by 17 is used to further subsample the chrominence channels. |
|
1665 // & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits, |
|
1666 // for 1/65536 pixel accurate interpolation. |
|
1667 -void ScaleYUVToRGB32Row(const uint8* y_buf, |
|
1668 - const uint8* u_buf, |
|
1669 - const uint8* v_buf, |
|
1670 - uint8* rgb_buf, |
|
1671 - int width, |
|
1672 - int source_dx) { |
|
1673 +void ScaleYUVToRGB32Row_C(const uint8* y_buf, |
|
1674 + const uint8* u_buf, |
|
1675 + const uint8* v_buf, |
|
1676 + uint8* rgb_buf, |
|
1677 + int width, |
|
1678 + int source_dx) { |
|
1679 int x = 0; |
|
1680 for (int i = 0; i < width; i += 2) { |
|
1681 int y = y_buf[x >> 16]; |
|
1682 int u = u_buf[(x >> 17)]; |
|
1683 int v = v_buf[(x >> 17)]; |
|
1684 YuvPixel(y, u, v, rgb_buf); |
|
1685 x += source_dx; |
|
1686 if ((i + 1) < width) { |
|
1687 y = y_buf[x >> 16]; |
|
1688 YuvPixel(y, u, v, rgb_buf+4); |
|
1689 x += source_dx; |
|
1690 } |
|
1691 rgb_buf += 8; |
|
1692 } |
|
1693 } |
|
1694 |
|
1695 -void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
1696 - const uint8* u_buf, |
|
1697 - const uint8* v_buf, |
|
1698 - uint8* rgb_buf, |
|
1699 - int width, |
|
1700 - int source_dx) { |
|
1701 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf, |
|
1702 + const uint8* u_buf, |
|
1703 + const uint8* v_buf, |
|
1704 + uint8* rgb_buf, |
|
1705 + int width, |
|
1706 + int source_dx) { |
|
1707 int x = 0; |
|
1708 if (source_dx >= 0x20000) { |
|
1709 x = 32768; |
|
1710 } |
|
1711 for (int i = 0; i < width; i += 2) { |
|
1712 int y0 = y_buf[x >> 16]; |
|
1713 int y1 = y_buf[(x >> 16) + 1]; |
|
1714 int u0 = u_buf[(x >> 17)]; |
|
1715 @@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint |
|
1716 y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; |
|
1717 YuvPixel(y, u, v, rgb_buf+4); |
|
1718 x += source_dx; |
|
1719 } |
|
1720 rgb_buf += 8; |
|
1721 } |
|
1722 } |
|
1723 |
|
1724 -#endif // USE_MMX |
|
1725 } // extern "C" |
|
1726 |
|
1727 diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp |
|
1728 --- a/gfx/ycbcr/yuv_row_posix.cpp |
|
1729 +++ b/gfx/ycbcr/yuv_row_posix.cpp |
|
1730 @@ -1,33 +1,32 @@ |
|
1731 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
|
1732 // Use of this source code is governed by a BSD-style license that can be |
|
1733 // found in the LICENSE file. |
|
1734 |
|
1735 -#include "media/base/yuv_row.h" |
|
1736 - |
|
1737 -#ifdef _DEBUG |
|
1738 -#include "base/logging.h" |
|
1739 -#else |
|
1740 +#include "yuv_row.h" |
|
1741 +#include "mozilla/SSE.h" |
|
1742 + |
|
1743 #define DCHECK(a) |
|
1744 -#endif |
|
1745 |
|
1746 extern "C" { |
|
1747 |
|
1748 -#if USE_SSE2 && defined(ARCH_CPU_X86_64) |
|
1749 +#if defined(ARCH_CPU_X86_64) |
|
1750 + |
|
1751 +// We don't need CPUID guards here, since x86-64 implies SSE2. |
|
1752 |
|
1753 // AMD64 ABI uses register paremters. |
|
1754 void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi |
|
1755 const uint8* u_buf, // rsi |
|
1756 const uint8* v_buf, // rdx |
|
1757 uint8* rgb_buf, // rcx |
|
1758 int width) { // r8 |
|
1759 asm( |
|
1760 - "jmp convertend\n" |
|
1761 -"convertloop:" |
|
1762 + "jmp 1f\n" |
|
1763 +"0:" |
|
1764 "movzb (%1),%%r10\n" |
|
1765 "add $0x1,%1\n" |
|
1766 "movzb (%2),%%r11\n" |
|
1767 "add $0x1,%2\n" |
|
1768 "movq 2048(%5,%%r10,8),%%xmm0\n" |
|
1769 "movzb (%0),%%r10\n" |
|
1770 "movq 4096(%5,%%r11,8),%%xmm1\n" |
|
1771 "movzb 0x1(%0),%%r11\n" |
|
1772 @@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint |
|
1773 "movq (%5,%%r11,8),%%xmm3\n" |
|
1774 "paddsw %%xmm0,%%xmm2\n" |
|
1775 "paddsw %%xmm0,%%xmm3\n" |
|
1776 "shufps $0x44,%%xmm3,%%xmm2\n" |
|
1777 "psraw $0x6,%%xmm2\n" |
|
1778 "packuswb %%xmm2,%%xmm2\n" |
|
1779 "movq %%xmm2,0x0(%3)\n" |
|
1780 "add $0x8,%3\n" |
|
1781 -"convertend:" |
|
1782 +"1:" |
|
1783 "sub $0x2,%4\n" |
|
1784 - "jns convertloop\n" |
|
1785 - |
|
1786 -"convertnext:" |
|
1787 + "jns 0b\n" |
|
1788 + |
|
1789 +"2:" |
|
1790 "add $0x1,%4\n" |
|
1791 - "js convertdone\n" |
|
1792 + "js 3f\n" |
|
1793 |
|
1794 "movzb (%1),%%r10\n" |
|
1795 "movq 2048(%5,%%r10,8),%%xmm0\n" |
|
1796 "movzb (%2),%%r10\n" |
|
1797 "movq 4096(%5,%%r10,8),%%xmm1\n" |
|
1798 "paddsw %%xmm1,%%xmm0\n" |
|
1799 "movzb (%0),%%r10\n" |
|
1800 "movq (%5,%%r10,8),%%xmm1\n" |
|
1801 "paddsw %%xmm0,%%xmm1\n" |
|
1802 "psraw $0x6,%%xmm1\n" |
|
1803 "packuswb %%xmm1,%%xmm1\n" |
|
1804 "movd %%xmm1,0x0(%3)\n" |
|
1805 -"convertdone:" |
|
1806 +"3:" |
|
1807 : |
|
1808 : "r"(y_buf), // %0 |
|
1809 "r"(u_buf), // %1 |
|
1810 "r"(v_buf), // %2 |
|
1811 "r"(rgb_buf), // %3 |
|
1812 "r"(width), // %4 |
|
1813 "r" (kCoefficientsRgbY) // %5 |
|
1814 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" |
|
1815 @@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b |
|
1816 const uint8* u_buf, // rsi |
|
1817 const uint8* v_buf, // rdx |
|
1818 uint8* rgb_buf, // rcx |
|
1819 int width, // r8 |
|
1820 int source_dx) { // r9 |
|
1821 asm( |
|
1822 "xor %%r11,%%r11\n" |
|
1823 "sub $0x2,%4\n" |
|
1824 - "js scalenext\n" |
|
1825 - |
|
1826 -"scaleloop:" |
|
1827 + "js 1f\n" |
|
1828 + |
|
1829 +"0:" |
|
1830 "mov %%r11,%%r10\n" |
|
1831 "sar $0x11,%%r10\n" |
|
1832 "movzb (%1,%%r10,1),%%rax\n" |
|
1833 "movq 2048(%5,%%rax,8),%%xmm0\n" |
|
1834 "movzb (%2,%%r10,1),%%rax\n" |
|
1835 "movq 4096(%5,%%rax,8),%%xmm1\n" |
|
1836 "lea (%%r11,%6),%%r10\n" |
|
1837 "sar $0x10,%%r11\n" |
|
1838 @@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b |
|
1839 "paddsw %%xmm0,%%xmm1\n" |
|
1840 "paddsw %%xmm0,%%xmm2\n" |
|
1841 "shufps $0x44,%%xmm2,%%xmm1\n" |
|
1842 "psraw $0x6,%%xmm1\n" |
|
1843 "packuswb %%xmm1,%%xmm1\n" |
|
1844 "movq %%xmm1,0x0(%3)\n" |
|
1845 "add $0x8,%3\n" |
|
1846 "sub $0x2,%4\n" |
|
1847 - "jns scaleloop\n" |
|
1848 - |
|
1849 -"scalenext:" |
|
1850 + "jns 0b\n" |
|
1851 + |
|
1852 +"1:" |
|
1853 "add $0x1,%4\n" |
|
1854 - "js scaledone\n" |
|
1855 + "js 2f\n" |
|
1856 |
|
1857 "mov %%r11,%%r10\n" |
|
1858 "sar $0x11,%%r10\n" |
|
1859 "movzb (%1,%%r10,1),%%rax\n" |
|
1860 "movq 2048(%5,%%rax,8),%%xmm0\n" |
|
1861 "movzb (%2,%%r10,1),%%rax\n" |
|
1862 "movq 4096(%5,%%rax,8),%%xmm1\n" |
|
1863 "paddsw %%xmm1,%%xmm0\n" |
|
1864 "sar $0x10,%%r11\n" |
|
1865 "movzb (%0,%%r11,1),%%rax\n" |
|
1866 "movq (%5,%%rax,8),%%xmm1\n" |
|
1867 "paddsw %%xmm0,%%xmm1\n" |
|
1868 "psraw $0x6,%%xmm1\n" |
|
1869 "packuswb %%xmm1,%%xmm1\n" |
|
1870 "movd %%xmm1,0x0(%3)\n" |
|
1871 |
|
1872 -"scaledone:" |
|
1873 +"2:" |
|
1874 : |
|
1875 : "r"(y_buf), // %0 |
|
1876 "r"(u_buf), // %1 |
|
1877 "r"(v_buf), // %2 |
|
1878 "r"(rgb_buf), // %3 |
|
1879 "r"(width), // %4 |
|
1880 "r" (kCoefficientsRgbY), // %5 |
|
1881 "r"(static_cast<long>(source_dx)) // %6 |
|
1882 @@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint |
|
1883 const uint8* u_buf, |
|
1884 const uint8* v_buf, |
|
1885 uint8* rgb_buf, |
|
1886 int width, |
|
1887 int source_dx) { |
|
1888 asm( |
|
1889 "xor %%r11,%%r11\n" // x = 0 |
|
1890 "sub $0x2,%4\n" |
|
1891 - "js .lscalenext\n" |
|
1892 + "js 2f\n" |
|
1893 "cmp $0x20000,%6\n" // if source_dx >= 2.0 |
|
1894 - "jl .lscalehalf\n" |
|
1895 + "jl 0f\n" |
|
1896 "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less |
|
1897 -".lscalehalf:" |
|
1898 - |
|
1899 -".lscaleloop:" |
|
1900 +"0:" |
|
1901 + |
|
1902 +"1:" |
|
1903 "mov %%r11,%%r10\n" |
|
1904 "sar $0x11,%%r10\n" |
|
1905 |
|
1906 "movzb (%1, %%r10, 1), %%r13 \n" |
|
1907 "movzb 1(%1, %%r10, 1), %%r14 \n" |
|
1908 "mov %%r11, %%rax \n" |
|
1909 "and $0x1fffe, %%rax \n" |
|
1910 "imul %%rax, %%r14 \n" |
|
1911 @@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint |
|
1912 "paddsw %%xmm0,%%xmm1\n" |
|
1913 "paddsw %%xmm0,%%xmm2\n" |
|
1914 "shufps $0x44,%%xmm2,%%xmm1\n" |
|
1915 "psraw $0x6,%%xmm1\n" |
|
1916 "packuswb %%xmm1,%%xmm1\n" |
|
1917 "movq %%xmm1,0x0(%3)\n" |
|
1918 "add $0x8,%3\n" |
|
1919 "sub $0x2,%4\n" |
|
1920 - "jns .lscaleloop\n" |
|
1921 - |
|
1922 -".lscalenext:" |
|
1923 + "jns 1b\n" |
|
1924 + |
|
1925 +"2:" |
|
1926 "add $0x1,%4\n" |
|
1927 - "js .lscaledone\n" |
|
1928 + "js 3f\n" |
|
1929 |
|
1930 "mov %%r11,%%r10\n" |
|
1931 "sar $0x11,%%r10\n" |
|
1932 |
|
1933 "movzb (%1,%%r10,1), %%r13 \n" |
|
1934 "movq 2048(%5,%%r13,8),%%xmm0\n" |
|
1935 |
|
1936 "movzb (%2,%%r10,1), %%r13 \n" |
|
1937 @@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint |
|
1938 "movzb (%0,%%r11,1), %%r13 \n" |
|
1939 "movq (%5,%%r13,8),%%xmm1\n" |
|
1940 |
|
1941 "paddsw %%xmm0,%%xmm1\n" |
|
1942 "psraw $0x6,%%xmm1\n" |
|
1943 "packuswb %%xmm1,%%xmm1\n" |
|
1944 "movd %%xmm1,0x0(%3)\n" |
|
1945 |
|
1946 -".lscaledone:" |
|
1947 +"3:" |
|
1948 : |
|
1949 : "r"(y_buf), // %0 |
|
1950 "r"(u_buf), // %1 |
|
1951 "r"(v_buf), // %2 |
|
1952 "r"(rgb_buf), // %3 |
|
1953 "r"(width), // %4 |
|
1954 "r" (kCoefficientsRgbY), // %5 |
|
1955 "r"(static_cast<long>(source_dx)) // %6 |
|
1956 : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" |
|
1957 ); |
|
1958 } |
|
1959 |
|
1960 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__) |
|
1961 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__) |
|
1962 |
|
1963 // PIC version is slower because less registers are available, so |
|
1964 // non-PIC is used on platforms where it is possible. |
|
1965 - |
|
1966 -void FastConvertYUVToRGB32Row(const uint8* y_buf, |
|
1967 - const uint8* u_buf, |
|
1968 - const uint8* v_buf, |
|
1969 - uint8* rgb_buf, |
|
1970 - int width); |
|
1971 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
|
1972 + const uint8* u_buf, |
|
1973 + const uint8* v_buf, |
|
1974 + uint8* rgb_buf, |
|
1975 + int width); |
|
1976 asm( |
|
1977 ".text\n" |
|
1978 - ".global FastConvertYUVToRGB32Row\n" |
|
1979 -"FastConvertYUVToRGB32Row:\n" |
|
1980 + ".global FastConvertYUVToRGB32Row_SSE\n" |
|
1981 + ".type FastConvertYUVToRGB32Row_SSE, @function\n" |
|
1982 +"FastConvertYUVToRGB32Row_SSE:\n" |
|
1983 "pusha\n" |
|
1984 "mov 0x24(%esp),%edx\n" |
|
1985 "mov 0x28(%esp),%edi\n" |
|
1986 "mov 0x2c(%esp),%esi\n" |
|
1987 "mov 0x30(%esp),%ebp\n" |
|
1988 "mov 0x34(%esp),%ecx\n" |
|
1989 - "jmp convertend\n" |
|
1990 - |
|
1991 -"convertloop:" |
|
1992 + "jmp 1f\n" |
|
1993 + |
|
1994 +"0:" |
|
1995 "movzbl (%edi),%eax\n" |
|
1996 "add $0x1,%edi\n" |
|
1997 "movzbl (%esi),%ebx\n" |
|
1998 "add $0x1,%esi\n" |
|
1999 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
|
2000 "movzbl (%edx),%eax\n" |
|
2001 "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" |
|
2002 "movzbl 0x1(%edx),%ebx\n" |
|
2003 @@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint |
|
2004 "movq kCoefficientsRgbY(,%ebx,8),%mm2\n" |
|
2005 "paddsw %mm0,%mm1\n" |
|
2006 "paddsw %mm0,%mm2\n" |
|
2007 "psraw $0x6,%mm1\n" |
|
2008 "psraw $0x6,%mm2\n" |
|
2009 "packuswb %mm2,%mm1\n" |
|
2010 "movntq %mm1,0x0(%ebp)\n" |
|
2011 "add $0x8,%ebp\n" |
|
2012 -"convertend:" |
|
2013 +"1:" |
|
2014 "sub $0x2,%ecx\n" |
|
2015 - "jns convertloop\n" |
|
2016 + "jns 0b\n" |
|
2017 |
|
2018 "and $0x1,%ecx\n" |
|
2019 - "je convertdone\n" |
|
2020 + "je 2f\n" |
|
2021 |
|
2022 "movzbl (%edi),%eax\n" |
|
2023 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
|
2024 "movzbl (%esi),%eax\n" |
|
2025 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" |
|
2026 "movzbl (%edx),%eax\n" |
|
2027 "movq kCoefficientsRgbY(,%eax,8),%mm1\n" |
|
2028 "paddsw %mm0,%mm1\n" |
|
2029 "psraw $0x6,%mm1\n" |
|
2030 "packuswb %mm1,%mm1\n" |
|
2031 "movd %mm1,0x0(%ebp)\n" |
|
2032 -"convertdone:" |
|
2033 +"2:" |
|
2034 "popa\n" |
|
2035 "ret\n" |
|
2036 +#if !defined(XP_MACOSX) |
|
2037 + ".previous\n" |
|
2038 +#endif |
|
2039 ); |
|
2040 |
|
2041 - |
|
2042 -void ScaleYUVToRGB32Row(const uint8* y_buf, |
|
2043 - const uint8* u_buf, |
|
2044 - const uint8* v_buf, |
|
2045 - uint8* rgb_buf, |
|
2046 - int width, |
|
2047 - int source_dx); |
|
2048 +void FastConvertYUVToRGB32Row(const uint8* y_buf, |
|
2049 + const uint8* u_buf, |
|
2050 + const uint8* v_buf, |
|
2051 + uint8* rgb_buf, |
|
2052 + int width) |
|
2053 +{ |
|
2054 + if (mozilla::supports_sse()) { |
|
2055 + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); |
|
2056 + return; |
|
2057 + } |
|
2058 + |
|
2059 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); |
|
2060 +} |
|
2061 + |
|
2062 + |
|
2063 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
|
2064 + const uint8* u_buf, |
|
2065 + const uint8* v_buf, |
|
2066 + uint8* rgb_buf, |
|
2067 + int width, |
|
2068 + int source_dx); |
|
2069 asm( |
|
2070 ".text\n" |
|
2071 - ".global ScaleYUVToRGB32Row\n" |
|
2072 -"ScaleYUVToRGB32Row:\n" |
|
2073 + ".global ScaleYUVToRGB32Row_SSE\n" |
|
2074 + ".type ScaleYUVToRGB32Row_SSE, @function\n" |
|
2075 +"ScaleYUVToRGB32Row_SSE:\n" |
|
2076 "pusha\n" |
|
2077 "mov 0x24(%esp),%edx\n" |
|
2078 "mov 0x28(%esp),%edi\n" |
|
2079 "mov 0x2c(%esp),%esi\n" |
|
2080 "mov 0x30(%esp),%ebp\n" |
|
2081 "mov 0x34(%esp),%ecx\n" |
|
2082 "xor %ebx,%ebx\n" |
|
2083 - "jmp scaleend\n" |
|
2084 - |
|
2085 -"scaleloop:" |
|
2086 + "jmp 1f\n" |
|
2087 + |
|
2088 +"0:" |
|
2089 "mov %ebx,%eax\n" |
|
2090 "sar $0x11,%eax\n" |
|
2091 "movzbl (%edi,%eax,1),%eax\n" |
|
2092 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
|
2093 "mov %ebx,%eax\n" |
|
2094 "sar $0x11,%eax\n" |
|
2095 "movzbl (%esi,%eax,1),%eax\n" |
|
2096 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" |
|
2097 @@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b |
|
2098 "movq kCoefficientsRgbY(,%eax,8),%mm2\n" |
|
2099 "paddsw %mm0,%mm1\n" |
|
2100 "paddsw %mm0,%mm2\n" |
|
2101 "psraw $0x6,%mm1\n" |
|
2102 "psraw $0x6,%mm2\n" |
|
2103 "packuswb %mm2,%mm1\n" |
|
2104 "movntq %mm1,0x0(%ebp)\n" |
|
2105 "add $0x8,%ebp\n" |
|
2106 -"scaleend:" |
|
2107 +"1:" |
|
2108 "sub $0x2,%ecx\n" |
|
2109 - "jns scaleloop\n" |
|
2110 + "jns 0b\n" |
|
2111 |
|
2112 "and $0x1,%ecx\n" |
|
2113 - "je scaledone\n" |
|
2114 + "je 2f\n" |
|
2115 |
|
2116 "mov %ebx,%eax\n" |
|
2117 "sar $0x11,%eax\n" |
|
2118 "movzbl (%edi,%eax,1),%eax\n" |
|
2119 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
|
2120 "mov %ebx,%eax\n" |
|
2121 "sar $0x11,%eax\n" |
|
2122 "movzbl (%esi,%eax,1),%eax\n" |
|
2123 @@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b |
|
2124 "sar $0x10,%eax\n" |
|
2125 "movzbl (%edx,%eax,1),%eax\n" |
|
2126 "movq kCoefficientsRgbY(,%eax,8),%mm1\n" |
|
2127 "paddsw %mm0,%mm1\n" |
|
2128 "psraw $0x6,%mm1\n" |
|
2129 "packuswb %mm1,%mm1\n" |
|
2130 "movd %mm1,0x0(%ebp)\n" |
|
2131 |
|
2132 -"scaledone:" |
|
2133 +"2:" |
|
2134 "popa\n" |
|
2135 "ret\n" |
|
2136 +#if !defined(XP_MACOSX) |
|
2137 + ".previous\n" |
|
2138 +#endif |
|
2139 ); |
|
2140 |
|
2141 -void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
2142 - const uint8* u_buf, |
|
2143 - const uint8* v_buf, |
|
2144 - uint8* rgb_buf, |
|
2145 - int width, |
|
2146 - int source_dx); |
|
2147 +void ScaleYUVToRGB32Row(const uint8* y_buf, |
|
2148 + const uint8* u_buf, |
|
2149 + const uint8* v_buf, |
|
2150 + uint8* rgb_buf, |
|
2151 + int width, |
|
2152 + int source_dx) |
|
2153 +{ |
|
2154 + // Runtime dispatch: use the SSE row scaler when available, else the C one. |
|
2155 + if (mozilla::supports_sse()) { |
|
2156 + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
|
2157 + return; // row is done; do not also run the C fallback over it |
|
2158 + } |
|
2159 + |
|
2160 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
|
2161 +} |
|
2162 + |
|
2163 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
|
2164 + const uint8* u_buf, |
|
2165 + const uint8* v_buf, |
|
2166 + uint8* rgb_buf, |
|
2167 + int width, |
|
2168 + int source_dx); |
|
2169 asm( |
|
2170 ".text\n" |
|
2171 - ".global LinearScaleYUVToRGB32Row\n" |
|
2172 -"LinearScaleYUVToRGB32Row:\n" |
|
2173 + ".global LinearScaleYUVToRGB32Row_SSE\n" |
|
2174 + ".type LinearScaleYUVToRGB32Row_SSE, @function\n" |
|
2175 +"LinearScaleYUVToRGB32Row_SSE:\n" |
|
2176 "pusha\n" |
|
2177 "mov 0x24(%esp),%edx\n" |
|
2178 "mov 0x28(%esp),%edi\n" |
|
2179 "mov 0x30(%esp),%ebp\n" |
|
2180 |
|
2181 // source_width = width * source_dx + ebx |
|
2182 "mov 0x34(%esp), %ecx\n" |
|
2183 "imull 0x38(%esp), %ecx\n" |
|
2184 "mov %ecx, 0x34(%esp)\n" |
|
2185 |
|
2186 "mov 0x38(%esp), %ecx\n" |
|
2187 "xor %ebx,%ebx\n" // x = 0 |
|
2188 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 |
|
2189 - "jl .lscaleend\n" |
|
2190 + "jl 1f\n" |
|
2191 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less |
|
2192 - "jmp .lscaleend\n" |
|
2193 - |
|
2194 -".lscaleloop:" |
|
2195 - "mov %ebx,%eax\n" |
|
2196 - "sar $0x11,%eax\n" |
|
2197 + "jmp 1f\n" |
|
2198 + |
|
2199 +"0:" |
|
2200 + "mov %ebx,%eax\n" |
|
2201 + "sar $0x11,%eax\n" |
|
2202 |
|
2203 "movzbl (%edi,%eax,1),%ecx\n" |
|
2204 "movzbl 1(%edi,%eax,1),%esi\n" |
|
2205 "mov %ebx,%eax\n" |
|
2206 "andl $0x1fffe, %eax \n" |
|
2207 "imul %eax, %esi \n" |
|
2208 "xorl $0x1fffe, %eax \n" |
|
2209 "imul %eax, %ecx \n" |
|
2210 @@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint |
|
2211 "imul %eax, %esi \n" |
|
2212 "xorl $0xffff, %eax \n" |
|
2213 "imul %eax, %ecx \n" |
|
2214 "addl %esi, %ecx \n" |
|
2215 "shrl $16, %ecx \n" |
|
2216 "movq kCoefficientsRgbY(,%ecx,8),%mm1\n" |
|
2217 |
|
2218 "cmp 0x34(%esp), %ebx\n" |
|
2219 - "jge .lscalelastpixel\n" |
|
2220 + "jge 2f\n" |
|
2221 |
|
2222 "mov %ebx,%eax\n" |
|
2223 "sar $0x10,%eax\n" |
|
2224 "movzbl (%edx,%eax,1),%ecx\n" |
|
2225 "movzbl 1(%edx,%eax,1),%esi\n" |
|
2226 "mov %ebx,%eax\n" |
|
2227 "add 0x38(%esp),%ebx\n" |
|
2228 "andl $0xffff, %eax \n" |
|
2229 @@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint |
|
2230 "paddsw %mm0,%mm1\n" |
|
2231 "paddsw %mm0,%mm2\n" |
|
2232 "psraw $0x6,%mm1\n" |
|
2233 "psraw $0x6,%mm2\n" |
|
2234 "packuswb %mm2,%mm1\n" |
|
2235 "movntq %mm1,0x0(%ebp)\n" |
|
2236 "add $0x8,%ebp\n" |
|
2237 |
|
2238 -".lscaleend:" |
|
2239 +"1:" |
|
2240 "cmp 0x34(%esp), %ebx\n" |
|
2241 - "jl .lscaleloop\n" |
|
2242 + "jl 0b\n" |
|
2243 "popa\n" |
|
2244 "ret\n" |
|
2245 |
|
2246 -".lscalelastpixel:" |
|
2247 +"2:" |
|
2248 "paddsw %mm0, %mm1\n" |
|
2249 "psraw $6, %mm1\n" |
|
2250 "packuswb %mm1, %mm1\n" |
|
2251 "movd %mm1, (%ebp)\n" |
|
2252 "popa\n" |
|
2253 "ret\n" |
|
2254 +#if !defined(XP_MACOSX) |
|
2255 + ".previous\n" |
|
2256 +#endif |
|
2257 ); |
|
2258 |
|
2259 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__) |
|
2260 - |
|
2261 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf, |
|
2262 - const uint8* u_buf, |
|
2263 - const uint8* v_buf, |
|
2264 - uint8* rgb_buf, |
|
2265 - int width, |
|
2266 - int16 *kCoefficientsRgbY); |
|
2267 +void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
2268 + const uint8* u_buf, |
|
2269 + const uint8* v_buf, |
|
2270 + uint8* rgb_buf, |
|
2271 + int width, |
|
2272 + int source_dx) |
|
2273 +{ |
|
2274 + // Runtime dispatch: use the SSE row scaler when available, else the C one. |
|
2275 + if (mozilla::supports_sse()) { |
|
2276 + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, |
|
2277 + width, source_dx); |
|
2278 + return; // row is done; do not also run the C fallback over it |
|
2279 + } |
|
2280 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
|
2281 +} |
|
2282 + |
|
2283 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__) |
|
2284 + |
|
2285 +void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
|
2286 + const uint8* u_buf, |
|
2287 + const uint8* v_buf, |
|
2288 + uint8* rgb_buf, |
|
2289 + int width, |
|
2290 + int16 *kCoefficientsRgbY); |
|
2291 + |
|
2292 asm( |
|
2293 ".text\n" |
|
2294 -#if defined(OS_MACOSX) |
|
2295 -"_PICConvertYUVToRGB32Row:\n" |
|
2296 +#if defined(XP_MACOSX) |
|
2297 +"_PICConvertYUVToRGB32Row_SSE:\n" |
|
2298 #else |
|
2299 -"PICConvertYUVToRGB32Row:\n" |
|
2300 +"PICConvertYUVToRGB32Row_SSE:\n" |
|
2301 #endif |
|
2302 "pusha\n" |
|
2303 "mov 0x24(%esp),%edx\n" |
|
2304 "mov 0x28(%esp),%edi\n" |
|
2305 "mov 0x2c(%esp),%esi\n" |
|
2306 "mov 0x30(%esp),%ebp\n" |
|
2307 "mov 0x38(%esp),%ecx\n" |
|
2308 |
|
2309 - "jmp .Lconvertend\n" |
|
2310 - |
|
2311 -".Lconvertloop:" |
|
2312 + "jmp 1f\n" |
|
2313 + |
|
2314 +"0:" |
|
2315 "movzbl (%edi),%eax\n" |
|
2316 "add $0x1,%edi\n" |
|
2317 "movzbl (%esi),%ebx\n" |
|
2318 "add $0x1,%esi\n" |
|
2319 "movq 2048(%ecx,%eax,8),%mm0\n" |
|
2320 "movzbl (%edx),%eax\n" |
|
2321 "paddsw 4096(%ecx,%ebx,8),%mm0\n" |
|
2322 "movzbl 0x1(%edx),%ebx\n" |
|
2323 @@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons |
|
2324 "movq 0(%ecx,%ebx,8),%mm2\n" |
|
2325 "paddsw %mm0,%mm1\n" |
|
2326 "paddsw %mm0,%mm2\n" |
|
2327 "psraw $0x6,%mm1\n" |
|
2328 "psraw $0x6,%mm2\n" |
|
2329 "packuswb %mm2,%mm1\n" |
|
2330 "movntq %mm1,0x0(%ebp)\n" |
|
2331 "add $0x8,%ebp\n" |
|
2332 -".Lconvertend:" |
|
2333 +"1:" |
|
2334 "subl $0x2,0x34(%esp)\n" |
|
2335 - "jns .Lconvertloop\n" |
|
2336 + "jns 0b\n" |
|
2337 |
|
2338 "andl $0x1,0x34(%esp)\n" |
|
2339 - "je .Lconvertdone\n" |
|
2340 + "je 2f\n" |
|
2341 |
|
2342 "movzbl (%edi),%eax\n" |
|
2343 "movq 2048(%ecx,%eax,8),%mm0\n" |
|
2344 "movzbl (%esi),%eax\n" |
|
2345 "paddsw 4096(%ecx,%eax,8),%mm0\n" |
|
2346 "movzbl (%edx),%eax\n" |
|
2347 "movq 0(%ecx,%eax,8),%mm1\n" |
|
2348 "paddsw %mm0,%mm1\n" |
|
2349 "psraw $0x6,%mm1\n" |
|
2350 "packuswb %mm1,%mm1\n" |
|
2351 "movd %mm1,0x0(%ebp)\n" |
|
2352 -".Lconvertdone:\n" |
|
2353 +"2:" |
|
2354 "popa\n" |
|
2355 "ret\n" |
|
2356 +#if !defined(XP_MACOSX) |
|
2357 + ".previous\n" |
|
2358 +#endif |
|
2359 ); |
|
2360 |
|
2361 void FastConvertYUVToRGB32Row(const uint8* y_buf, |
|
2362 const uint8* u_buf, |
|
2363 const uint8* v_buf, |
|
2364 uint8* rgb_buf, |
|
2365 - int width) { |
|
2366 - PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, |
|
2367 - &kCoefficientsRgbY[0][0]); |
|
2368 -} |
|
2369 - |
|
2370 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf, |
|
2371 + int width) |
|
2372 +{ |
|
2373 + if (mozilla::supports_sse()) { |
|
2374 + PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, |
|
2375 + &kCoefficientsRgbY[0][0]); |
|
2376 + return; |
|
2377 + } |
|
2378 + |
|
2379 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); |
|
2380 +} |
|
2381 + |
|
2382 +void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
|
2383 const uint8* u_buf, |
|
2384 const uint8* v_buf, |
|
2385 uint8* rgb_buf, |
|
2386 int width, |
|
2387 int source_dx, |
|
2388 int16 *kCoefficientsRgbY); |
|
2389 |
|
2390 asm( |
|
2391 ".text\n" |
|
2392 -#if defined(OS_MACOSX) |
|
2393 -"_PICScaleYUVToRGB32Row:\n" |
|
2394 +#if defined(XP_MACOSX) |
|
2395 +"_PICScaleYUVToRGB32Row_SSE:\n" |
|
2396 #else |
|
2397 -"PICScaleYUVToRGB32Row:\n" |
|
2398 +"PICScaleYUVToRGB32Row_SSE:\n" |
|
2399 #endif |
|
2400 "pusha\n" |
|
2401 "mov 0x24(%esp),%edx\n" |
|
2402 "mov 0x28(%esp),%edi\n" |
|
2403 "mov 0x2c(%esp),%esi\n" |
|
2404 "mov 0x30(%esp),%ebp\n" |
|
2405 "mov 0x3c(%esp),%ecx\n" |
|
2406 "xor %ebx,%ebx\n" |
|
2407 - "jmp Lscaleend\n" |
|
2408 - |
|
2409 -"Lscaleloop:" |
|
2410 + "jmp 1f\n" |
|
2411 + |
|
2412 +"0:" |
|
2413 "mov %ebx,%eax\n" |
|
2414 "sar $0x11,%eax\n" |
|
2415 "movzbl (%edi,%eax,1),%eax\n" |
|
2416 "movq 2048(%ecx,%eax,8),%mm0\n" |
|
2417 "mov %ebx,%eax\n" |
|
2418 "sar $0x11,%eax\n" |
|
2419 "movzbl (%esi,%eax,1),%eax\n" |
|
2420 "paddsw 4096(%ecx,%eax,8),%mm0\n" |
|
2421 @@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const |
|
2422 "movq 0(%ecx,%eax,8),%mm2\n" |
|
2423 "paddsw %mm0,%mm1\n" |
|
2424 "paddsw %mm0,%mm2\n" |
|
2425 "psraw $0x6,%mm1\n" |
|
2426 "psraw $0x6,%mm2\n" |
|
2427 "packuswb %mm2,%mm1\n" |
|
2428 "movntq %mm1,0x0(%ebp)\n" |
|
2429 "add $0x8,%ebp\n" |
|
2430 -"Lscaleend:" |
|
2431 +"1:" |
|
2432 "subl $0x2,0x34(%esp)\n" |
|
2433 - "jns Lscaleloop\n" |
|
2434 + "jns 0b\n" |
|
2435 |
|
2436 "andl $0x1,0x34(%esp)\n" |
|
2437 - "je Lscaledone\n" |
|
2438 + "je 2f\n" |
|
2439 |
|
2440 "mov %ebx,%eax\n" |
|
2441 "sar $0x11,%eax\n" |
|
2442 "movzbl (%edi,%eax,1),%eax\n" |
|
2443 "movq 2048(%ecx,%eax,8),%mm0\n" |
|
2444 "mov %ebx,%eax\n" |
|
2445 "sar $0x11,%eax\n" |
|
2446 "movzbl (%esi,%eax,1),%eax\n" |
|
2447 @@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const |
|
2448 "sar $0x10,%eax\n" |
|
2449 "movzbl (%edx,%eax,1),%eax\n" |
|
2450 "movq 0(%ecx,%eax,8),%mm1\n" |
|
2451 "paddsw %mm0,%mm1\n" |
|
2452 "psraw $0x6,%mm1\n" |
|
2453 "packuswb %mm1,%mm1\n" |
|
2454 "movd %mm1,0x0(%ebp)\n" |
|
2455 |
|
2456 -"Lscaledone:" |
|
2457 +"2:" |
|
2458 "popa\n" |
|
2459 "ret\n" |
|
2460 +#if !defined(XP_MACOSX) |
|
2461 + ".previous\n" |
|
2462 +#endif |
|
2463 ); |
|
2464 |
|
2465 - |
|
2466 void ScaleYUVToRGB32Row(const uint8* y_buf, |
|
2467 const uint8* u_buf, |
|
2468 const uint8* v_buf, |
|
2469 uint8* rgb_buf, |
|
2470 int width, |
|
2471 - int source_dx) { |
|
2472 - PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, |
|
2473 - &kCoefficientsRgbY[0][0]); |
|
2474 -} |
|
2475 - |
|
2476 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
2477 - const uint8* u_buf, |
|
2478 - const uint8* v_buf, |
|
2479 - uint8* rgb_buf, |
|
2480 - int width, |
|
2481 - int source_dx, |
|
2482 - int16 *kCoefficientsRgbY); |
|
2483 + int source_dx) |
|
2484 +{ |
|
2485 + if (mozilla::supports_sse()) { |
|
2486 + PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, |
|
2487 + &kCoefficientsRgbY[0][0]); |
|
2488 + return; |
|
2489 + } |
|
2490 + |
|
2491 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
|
2492 +} |
|
2493 + |
|
2494 +void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
|
2495 + const uint8* u_buf, |
|
2496 + const uint8* v_buf, |
|
2497 + uint8* rgb_buf, |
|
2498 + int width, |
|
2499 + int source_dx, |
|
2500 + int16 *kCoefficientsRgbY); |
|
2501 + |
|
2502 asm( |
|
2503 ".text\n" |
|
2504 -#if defined(OS_MACOSX) |
|
2505 -"_PICLinearScaleYUVToRGB32Row:\n" |
|
2506 +#if defined(XP_MACOSX) |
|
2507 +"_PICLinearScaleYUVToRGB32Row_SSE:\n" |
|
2508 #else |
|
2509 -"PICLinearScaleYUVToRGB32Row:\n" |
|
2510 +"PICLinearScaleYUVToRGB32Row_SSE:\n" |
|
2511 #endif |
|
2512 "pusha\n" |
|
2513 "mov 0x24(%esp),%edx\n" |
|
2514 "mov 0x30(%esp),%ebp\n" |
|
2515 "mov 0x34(%esp),%ecx\n" |
|
2516 "mov 0x3c(%esp),%edi\n" |
|
2517 "xor %ebx,%ebx\n" |
|
2518 |
|
2519 // source_width = width * source_dx + ebx |
|
2520 "mov 0x34(%esp), %ecx\n" |
|
2521 "imull 0x38(%esp), %ecx\n" |
|
2522 "mov %ecx, 0x34(%esp)\n" |
|
2523 |
|
2524 "mov 0x38(%esp), %ecx\n" |
|
2525 "xor %ebx,%ebx\n" // x = 0 |
|
2526 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 |
|
2527 - "jl .lscaleend\n" |
|
2528 + "jl 1f\n" |
|
2529 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less |
|
2530 - "jmp .lscaleend\n" |
|
2531 - |
|
2532 -".lscaleloop:" |
|
2533 + "jmp 1f\n" |
|
2534 + |
|
2535 +"0:" |
|
2536 "mov 0x28(%esp),%esi\n" |
|
2537 "mov %ebx,%eax\n" |
|
2538 "sar $0x11,%eax\n" |
|
2539 |
|
2540 "movzbl (%esi,%eax,1),%ecx\n" |
|
2541 "movzbl 1(%esi,%eax,1),%esi\n" |
|
2542 "mov %ebx,%eax\n" |
|
2543 "andl $0x1fffe, %eax \n" |
|
2544 @@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u |
|
2545 "imul %eax, %esi \n" |
|
2546 "xorl $0xffff, %eax \n" |
|
2547 "imul %eax, %ecx \n" |
|
2548 "addl %esi, %ecx \n" |
|
2549 "shrl $16, %ecx \n" |
|
2550 "movq (%edi,%ecx,8),%mm1\n" |
|
2551 |
|
2552 "cmp 0x34(%esp), %ebx\n" |
|
2553 - "jge .lscalelastpixel\n" |
|
2554 + "jge 2f\n" |
|
2555 |
|
2556 "mov %ebx,%eax\n" |
|
2557 "sar $0x10,%eax\n" |
|
2558 "movzbl (%edx,%eax,1),%ecx\n" |
|
2559 "movzbl 1(%edx,%eax,1),%esi\n" |
|
2560 "mov %ebx,%eax\n" |
|
2561 "add 0x38(%esp),%ebx\n" |
|
2562 "andl $0xffff, %eax \n" |
|
2563 @@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u |
|
2564 "paddsw %mm0,%mm1\n" |
|
2565 "paddsw %mm0,%mm2\n" |
|
2566 "psraw $0x6,%mm1\n" |
|
2567 "psraw $0x6,%mm2\n" |
|
2568 "packuswb %mm2,%mm1\n" |
|
2569 "movntq %mm1,0x0(%ebp)\n" |
|
2570 "add $0x8,%ebp\n" |
|
2571 |
|
2572 -".lscaleend:" |
|
2573 +"1:" |
|
2574 "cmp %ebx, 0x34(%esp)\n" |
|
2575 - "jg .lscaleloop\n" |
|
2576 + "jg 0b\n" |
|
2577 "popa\n" |
|
2578 "ret\n" |
|
2579 |
|
2580 -".lscalelastpixel:" |
|
2581 +"2:" |
|
2582 "paddsw %mm0, %mm1\n" |
|
2583 "psraw $6, %mm1\n" |
|
2584 "packuswb %mm1, %mm1\n" |
|
2585 "movd %mm1, (%ebp)\n" |
|
2586 "popa\n" |
|
2587 "ret\n" |
|
2588 +#if !defined(XP_MACOSX) |
|
2589 + ".previous\n" |
|
2590 +#endif |
|
2591 ); |
|
2592 |
|
2593 + |
|
2594 void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
2595 - const uint8* u_buf, |
|
2596 - const uint8* v_buf, |
|
2597 - uint8* rgb_buf, |
|
2598 - int width, |
|
2599 - int source_dx) { |
|
2600 - PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, |
|
2601 - &kCoefficientsRgbY[0][0]); |
|
2602 -} |
|
2603 - |
|
2604 -#else // USE_MMX |
|
2605 - |
|
2606 -// C reference code that mimic the YUV assembly. |
|
2607 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) |
|
2608 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ |
|
2609 - (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) |
|
2610 - |
|
2611 -static inline void YuvPixel(uint8 y, |
|
2612 - uint8 u, |
|
2613 - uint8 v, |
|
2614 - uint8* rgb_buf) { |
|
2615 - |
|
2616 - int b = kCoefficientsRgbY[256+u][0]; |
|
2617 - int g = kCoefficientsRgbY[256+u][1]; |
|
2618 - int r = kCoefficientsRgbY[256+u][2]; |
|
2619 - int a = kCoefficientsRgbY[256+u][3]; |
|
2620 - |
|
2621 - b = paddsw(b, kCoefficientsRgbY[512+v][0]); |
|
2622 - g = paddsw(g, kCoefficientsRgbY[512+v][1]); |
|
2623 - r = paddsw(r, kCoefficientsRgbY[512+v][2]); |
|
2624 - a = paddsw(a, kCoefficientsRgbY[512+v][3]); |
|
2625 - |
|
2626 - b = paddsw(b, kCoefficientsRgbY[y][0]); |
|
2627 - g = paddsw(g, kCoefficientsRgbY[y][1]); |
|
2628 - r = paddsw(r, kCoefficientsRgbY[y][2]); |
|
2629 - a = paddsw(a, kCoefficientsRgbY[y][3]); |
|
2630 - |
|
2631 - b >>= 6; |
|
2632 - g >>= 6; |
|
2633 - r >>= 6; |
|
2634 - a >>= 6; |
|
2635 - |
|
2636 - *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) | |
|
2637 - (packuswb(g) << 8) | |
|
2638 - (packuswb(r) << 16) | |
|
2639 - (packuswb(a) << 24); |
|
2640 -} |
|
2641 - |
|
2642 + const uint8* u_buf, |
|
2643 + const uint8* v_buf, |
|
2644 + uint8* rgb_buf, |
|
2645 + int width, |
|
2646 + int source_dx) |
|
2647 +{ |
|
2648 + if (mozilla::supports_sse()) { |
|
2649 + PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, |
|
2650 + source_dx, &kCoefficientsRgbY[0][0]); |
|
2651 + return; |
|
2652 + } |
|
2653 + |
|
2654 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
|
2655 +} |
|
2656 +#else |
|
2657 void FastConvertYUVToRGB32Row(const uint8* y_buf, |
|
2658 const uint8* u_buf, |
|
2659 const uint8* v_buf, |
|
2660 uint8* rgb_buf, |
|
2661 int width) { |
|
2662 - for (int x = 0; x < width; x += 2) { |
|
2663 - uint8 u = u_buf[x >> 1]; |
|
2664 - uint8 v = v_buf[x >> 1]; |
|
2665 - uint8 y0 = y_buf[x]; |
|
2666 - YuvPixel(y0, u, v, rgb_buf); |
|
2667 - if ((x + 1) < width) { |
|
2668 - uint8 y1 = y_buf[x + 1]; |
|
2669 - YuvPixel(y1, u, v, rgb_buf + 4); |
|
2670 - } |
|
2671 - rgb_buf += 8; // Advance 2 pixels. |
|
2672 - } |
|
2673 -} |
|
2674 - |
|
2675 -// 16.16 fixed point is used. A shift by 16 isolates the integer. |
|
2676 -// A shift by 17 is used to further subsample the chrominence channels. |
|
2677 -// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits, |
|
2678 -// for 1/65536 pixel accurate interpolation. |
|
2679 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); |
|
2680 +} |
|
2681 + |
|
2682 void ScaleYUVToRGB32Row(const uint8* y_buf, |
|
2683 const uint8* u_buf, |
|
2684 const uint8* v_buf, |
|
2685 uint8* rgb_buf, |
|
2686 int width, |
|
2687 int source_dx) { |
|
2688 - int x = 0; |
|
2689 - for (int i = 0; i < width; i += 2) { |
|
2690 - int y = y_buf[x >> 16]; |
|
2691 - int u = u_buf[(x >> 17)]; |
|
2692 - int v = v_buf[(x >> 17)]; |
|
2693 - YuvPixel(y, u, v, rgb_buf); |
|
2694 - x += source_dx; |
|
2695 - if ((i + 1) < width) { |
|
2696 - y = y_buf[x >> 16]; |
|
2697 - YuvPixel(y, u, v, rgb_buf+4); |
|
2698 - x += source_dx; |
|
2699 - } |
|
2700 - rgb_buf += 8; |
|
2701 - } |
|
2702 -} |
|
2703 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
|
2704 +} |
|
2705 |
|
2706 void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
2707 const uint8* u_buf, |
|
2708 const uint8* v_buf, |
|
2709 uint8* rgb_buf, |
|
2710 int width, |
|
2711 int source_dx) { |
|
2712 - int x = 0; |
|
2713 - if (source_dx >= 0x20000) { |
|
2714 - x = 32768; |
|
2715 - } |
|
2716 - for (int i = 0; i < width; i += 2) { |
|
2717 - int y0 = y_buf[x >> 16]; |
|
2718 - int y1 = y_buf[(x >> 16) + 1]; |
|
2719 - int u0 = u_buf[(x >> 17)]; |
|
2720 - int u1 = u_buf[(x >> 17) + 1]; |
|
2721 - int v0 = v_buf[(x >> 17)]; |
|
2722 - int v1 = v_buf[(x >> 17) + 1]; |
|
2723 - int y_frac = (x & 65535); |
|
2724 - int uv_frac = ((x >> 1) & 65535); |
|
2725 - int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; |
|
2726 - int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16; |
|
2727 - int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16; |
|
2728 - YuvPixel(y, u, v, rgb_buf); |
|
2729 - x += source_dx; |
|
2730 - if ((i + 1) < width) { |
|
2731 - y0 = y_buf[x >> 16]; |
|
2732 - y1 = y_buf[(x >> 16) + 1]; |
|
2733 - y_frac = (x & 65535); |
|
2734 - y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; |
|
2735 - YuvPixel(y, u, v, rgb_buf+4); |
|
2736 - x += source_dx; |
|
2737 - } |
|
2738 - rgb_buf += 8; |
|
2739 - } |
|
2740 -} |
|
2741 - |
|
2742 -#endif // USE_MMX |
|
2743 -} // extern "C" |
|
2744 - |
|
2745 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
|
2746 +} |
|
2747 +#endif |
|
2748 + |
|
2749 +} |
|
2750 diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp |
|
2751 --- a/gfx/ycbcr/yuv_row_table.cpp |
|
2752 +++ b/gfx/ycbcr/yuv_row_table.cpp |
|
2753 @@ -1,13 +1,13 @@ |
|
2754 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
|
2755 // Use of this source code is governed by a BSD-style license that can be |
|
2756 // found in the LICENSE file. |
|
2757 |
|
2758 -#include "media/base/yuv_row.h" |
|
2759 +#include "yuv_row.h" |
|
2760 |
|
2761 extern "C" { |
|
2762 |
|
2763 #define RGBY(i) { \ |
|
2764 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ |
|
2765 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ |
|
2766 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ |
|
2767 0 \ |
|
2768 diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp |
|
2769 --- a/gfx/ycbcr/yuv_row_win.cpp |
|
2770 +++ b/gfx/ycbcr/yuv_row_win.cpp |
|
2771 @@ -1,26 +1,27 @@ |
|
2772 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
|
2773 // Use of this source code is governed by a BSD-style license that can be |
|
2774 // found in the LICENSE file. |
|
2775 |
|
2776 -#include "media/base/yuv_row.h" |
|
2777 +#include "yuv_row.h" |
|
2778 +#include "mozilla/SSE.h" |
|
2779 |
|
2780 #define kCoefficientsRgbU kCoefficientsRgbY + 2048 |
|
2781 #define kCoefficientsRgbV kCoefficientsRgbY + 4096 |
|
2782 |
|
2783 extern "C" { |
|
2784 |
|
2785 -#if USE_MMX |
|
2786 -__declspec(naked) |
|
2787 -void FastConvertYUVToRGB32Row(const uint8* y_buf, |
|
2788 - const uint8* u_buf, |
|
2789 - const uint8* v_buf, |
|
2790 - uint8* rgb_buf, |
|
2791 - int width) { |
|
2792 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) |
|
2793 +__declspec(naked) |
|
2794 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
|
2795 + const uint8* u_buf, |
|
2796 + const uint8* v_buf, |
|
2797 + uint8* rgb_buf, |
|
2798 + int width) { |
|
2799 __asm { |
|
2800 pushad |
|
2801 mov edx, [esp + 32 + 4] // Y |
|
2802 mov edi, [esp + 32 + 8] // U |
|
2803 mov esi, [esp + 32 + 12] // V |
|
2804 mov ebp, [esp + 32 + 16] // rgb |
|
2805 mov ecx, [esp + 32 + 20] // width |
|
2806 jmp convertend |
|
2807 @@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint |
|
2808 convertdone : |
|
2809 |
|
2810 popad |
|
2811 ret |
|
2812 } |
|
2813 } |
|
2814 |
|
2815 __declspec(naked) |
|
2816 -void ConvertYUVToRGB32Row(const uint8* y_buf, |
|
2817 - const uint8* u_buf, |
|
2818 - const uint8* v_buf, |
|
2819 - uint8* rgb_buf, |
|
2820 - int width, |
|
2821 - int step) { |
|
2822 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
|
2823 + const uint8* u_buf, |
|
2824 + const uint8* v_buf, |
|
2825 + uint8* rgb_buf, |
|
2826 + int width, |
|
2827 + int step) { |
|
2828 __asm { |
|
2829 pushad |
|
2830 mov edx, [esp + 32 + 4] // Y |
|
2831 mov edi, [esp + 32 + 8] // U |
|
2832 mov esi, [esp + 32 + 12] // V |
|
2833 mov ebp, [esp + 32 + 16] // rgb |
|
2834 mov ecx, [esp + 32 + 20] // width |
|
2835 mov ebx, [esp + 32 + 24] // step |
|
2836 @@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y |
|
2837 wdone : |
|
2838 |
|
2839 popad |
|
2840 ret |
|
2841 } |
|
2842 } |
|
2843 |
|
2844 __declspec(naked) |
|
2845 -void RotateConvertYUVToRGB32Row(const uint8* y_buf, |
|
2846 - const uint8* u_buf, |
|
2847 - const uint8* v_buf, |
|
2848 - uint8* rgb_buf, |
|
2849 - int width, |
|
2850 - int ystep, |
|
2851 - int uvstep) { |
|
2852 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
|
2853 + const uint8* u_buf, |
|
2854 + const uint8* v_buf, |
|
2855 + uint8* rgb_buf, |
|
2856 + int width, |
|
2857 + int ystep, |
|
2858 + int uvstep) { |
|
2859 __asm { |
|
2860 pushad |
|
2861 mov edx, [esp + 32 + 4] // Y |
|
2862 mov edi, [esp + 32 + 8] // U |
|
2863 mov esi, [esp + 32 + 12] // V |
|
2864 mov ebp, [esp + 32 + 16] // rgb |
|
2865 mov ecx, [esp + 32 + 20] // width |
|
2866 jmp wend |
|
2867 @@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui |
|
2868 wdone : |
|
2869 |
|
2870 popad |
|
2871 ret |
|
2872 } |
|
2873 } |
|
2874 |
|
2875 __declspec(naked) |
|
2876 -void DoubleYUVToRGB32Row(const uint8* y_buf, |
|
2877 - const uint8* u_buf, |
|
2878 - const uint8* v_buf, |
|
2879 - uint8* rgb_buf, |
|
2880 - int width) { |
|
2881 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf, |
|
2882 + const uint8* u_buf, |
|
2883 + const uint8* v_buf, |
|
2884 + uint8* rgb_buf, |
|
2885 + int width) { |
|
2886 __asm { |
|
2887 pushad |
|
2888 mov edx, [esp + 32 + 4] // Y |
|
2889 mov edi, [esp + 32 + 8] // U |
|
2890 mov esi, [esp + 32 + 12] // V |
|
2891 mov ebp, [esp + 32 + 16] // rgb |
|
2892 mov ecx, [esp + 32 + 20] // width |
|
2893 jmp wend |
|
2894 @@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_ |
|
2895 jns wloop1 |
|
2896 wdone : |
|
2897 popad |
|
2898 ret |
|
2899 } |
|
2900 } |
|
2901 |
|
2902 // This version does general purpose scaling by any amount, up or down. |
|
2903 -// The only thing it can not do it rotation by 90 or 270. |
|
2904 -// For performance the chroma is under sampled, reducing cost of a 3x |
|
2905 +// The only thing it cannot do is rotation by 90 or 270. |
|
2906 +// For performance the chroma is under-sampled, reducing cost of a 3x |
|
2907 // 1080p scale from 8.4 ms to 5.4 ms. |
|
2908 __declspec(naked) |
|
2909 -void ScaleYUVToRGB32Row(const uint8* y_buf, |
|
2910 - const uint8* u_buf, |
|
2911 - const uint8* v_buf, |
|
2912 - uint8* rgb_buf, |
|
2913 - int width, |
|
2914 - int source_dx) { |
|
2915 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
|
2916 + const uint8* u_buf, |
|
2917 + const uint8* v_buf, |
|
2918 + uint8* rgb_buf, |
|
2919 + int width, |
|
2920 + int source_dx) { |
|
2921 __asm { |
|
2922 pushad |
|
2923 mov edx, [esp + 32 + 4] // Y |
|
2924 mov edi, [esp + 32 + 8] // U |
|
2925 mov esi, [esp + 32 + 12] // V |
|
2926 mov ebp, [esp + 32 + 16] // rgb |
|
2927 mov ecx, [esp + 32 + 20] // width |
|
2928 xor ebx, ebx // x |
|
2929 @@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b |
|
2930 |
|
2931 scaledone : |
|
2932 popad |
|
2933 ret |
|
2934 } |
|
2935 } |
|
2936 |
|
2937 __declspec(naked) |
|
2938 -void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
2939 - const uint8* u_buf, |
|
2940 - const uint8* v_buf, |
|
2941 - uint8* rgb_buf, |
|
2942 - int width, |
|
2943 - int source_dx) { |
|
2944 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
|
2945 + const uint8* u_buf, |
|
2946 + const uint8* v_buf, |
|
2947 + uint8* rgb_buf, |
|
2948 + int width, |
|
2949 + int source_dx) { |
|
2950 __asm { |
|
2951 pushad |
|
2952 mov edx, [esp + 32 + 4] // Y |
|
2953 mov edi, [esp + 32 + 8] // U |
|
2954 // [esp + 32 + 12] // V |
|
2955 mov ebp, [esp + 32 + 16] // rgb |
|
2956 mov ecx, [esp + 32 + 20] // width |
|
2957 imul ecx, [esp + 32 + 24] // source_dx |
|
2958 @@ -438,152 +439,60 @@ lscalelastpixel: |
|
2959 paddsw mm1, mm0 |
|
2960 psraw mm1, 6 |
|
2961 packuswb mm1, mm1 |
|
2962 movd [ebp], mm1 |
|
2963 popad |
|
2964 ret |
|
2965 }; |
|
2966 } |
|
2967 -#else // USE_MMX |
|
2968 - |
|
2969 -// C reference code that mimic the YUV assembly. |
|
2970 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) |
|
2971 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ |
|
2972 - (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) |
|
2973 - |
|
2974 -static inline void YuvPixel(uint8 y, |
|
2975 - uint8 u, |
|
2976 - uint8 v, |
|
2977 - uint8* rgb_buf) { |
|
2978 - |
|
2979 - int b = kCoefficientsRgbY[256+u][0]; |
|
2980 - int g = kCoefficientsRgbY[256+u][1]; |
|
2981 - int r = kCoefficientsRgbY[256+u][2]; |
|
2982 - int a = kCoefficientsRgbY[256+u][3]; |
|
2983 - |
|
2984 - b = paddsw(b, kCoefficientsRgbY[512+v][0]); |
|
2985 - g = paddsw(g, kCoefficientsRgbY[512+v][1]); |
|
2986 - r = paddsw(r, kCoefficientsRgbY[512+v][2]); |
|
2987 - a = paddsw(a, kCoefficientsRgbY[512+v][3]); |
|
2988 - |
|
2989 - b = paddsw(b, kCoefficientsRgbY[y][0]); |
|
2990 - g = paddsw(g, kCoefficientsRgbY[y][1]); |
|
2991 - r = paddsw(r, kCoefficientsRgbY[y][2]); |
|
2992 - a = paddsw(a, kCoefficientsRgbY[y][3]); |
|
2993 - |
|
2994 - b >>= 6; |
|
2995 - g >>= 6; |
|
2996 - r >>= 6; |
|
2997 - a >>= 6; |
|
2998 - |
|
2999 - *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) | |
|
3000 - (packuswb(g) << 8) | |
|
3001 - (packuswb(r) << 16) | |
|
3002 - (packuswb(a) << 24); |
|
3003 -} |
|
3004 - |
|
3005 -#if TEST_MMX_YUV |
|
3006 -static inline void YuvPixel(uint8 y, |
|
3007 - uint8 u, |
|
3008 - uint8 v, |
|
3009 - uint8* rgb_buf) { |
|
3010 - |
|
3011 - __asm { |
|
3012 - movzx eax, u |
|
3013 - movq mm0, [kCoefficientsRgbY+2048 + 8 * eax] |
|
3014 - movzx eax, v |
|
3015 - paddsw mm0, [kCoefficientsRgbY+4096 + 8 * eax] |
|
3016 - movzx eax, y |
|
3017 - movq mm1, [kCoefficientsRgbY + 8 * eax] |
|
3018 - paddsw mm1, mm0 |
|
3019 - psraw mm1, 6 |
|
3020 - packuswb mm1, mm1 |
|
3021 - mov eax, rgb_buf |
|
3022 - movd [eax], mm1 |
|
3023 - emms |
|
3024 - } |
|
3025 -} |
|
3026 -#endif |
|
3027 +#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) |
|
3028 |
|
3029 void FastConvertYUVToRGB32Row(const uint8* y_buf, |
|
3030 const uint8* u_buf, |
|
3031 const uint8* v_buf, |
|
3032 uint8* rgb_buf, |
|
3033 int width) { |
|
3034 - for (int x = 0; x < width; x += 2) { |
|
3035 - uint8 u = u_buf[x >> 1]; |
|
3036 - uint8 v = v_buf[x >> 1]; |
|
3037 - uint8 y0 = y_buf[x]; |
|
3038 - YuvPixel(y0, u, v, rgb_buf); |
|
3039 - if ((x + 1) < width) { |
|
3040 - uint8 y1 = y_buf[x + 1]; |
|
3041 - YuvPixel(y1, u, v, rgb_buf + 4); |
|
3042 - } |
|
3043 - rgb_buf += 8; // Advance 2 pixels. |
|
3044 - } |
|
3045 -} |
|
3046 - |
|
3047 -// 16.16 fixed point is used. A shift by 16 isolates the integer. |
|
3048 -// A shift by 17 is used to further subsample the chrominence channels. |
|
3049 -// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits, |
|
3050 -// for 1/65536 pixel accurate interpolation. |
|
3051 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) |
|
3052 + if (mozilla::supports_sse()) { |
|
3053 + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); |
|
3054 + return; |
|
3055 + } |
|
3056 +#endif |
|
3057 + |
|
3058 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); |
|
3059 +} |
|
3060 + |
|
3061 void ScaleYUVToRGB32Row(const uint8* y_buf, |
|
3062 const uint8* u_buf, |
|
3063 const uint8* v_buf, |
|
3064 uint8* rgb_buf, |
|
3065 int width, |
|
3066 int source_dx) { |
|
3067 - int x = 0; |
|
3068 - for (int i = 0; i < width; i += 2) { |
|
3069 - int y = y_buf[x >> 16]; |
|
3070 - int u = u_buf[(x >> 17)]; |
|
3071 - int v = v_buf[(x >> 17)]; |
|
3072 - YuvPixel(y, u, v, rgb_buf); |
|
3073 - x += source_dx; |
|
3074 - if ((i + 1) < width) { |
|
3075 - y = y_buf[x >> 16]; |
|
3076 - YuvPixel(y, u, v, rgb_buf+4); |
|
3077 - x += source_dx; |
|
3078 - } |
|
3079 - rgb_buf += 8; |
|
3080 - } |
|
3081 -} |
|
3082 + |
|
3083 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) |
|
3084 + if (mozilla::supports_sse()) { |
|
3085 + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
|
3086 + return; |
|
3087 + } |
|
3088 +#endif |
|
3089 + |
|
3090 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
|
3091 +} |
|
3092 |
|
3093 void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
3094 const uint8* u_buf, |
|
3095 const uint8* v_buf, |
|
3096 uint8* rgb_buf, |
|
3097 int width, |
|
3098 int source_dx) { |
|
3099 - int x = 0; |
|
3100 - if (source_dx >= 0x20000) { |
|
3101 - x = 32768; |
|
3102 - } |
|
3103 - for (int i = 0; i < width; i += 2) { |
|
3104 - int y0 = y_buf[x >> 16]; |
|
3105 - int y1 = y_buf[(x >> 16) + 1]; |
|
3106 - int u0 = u_buf[(x >> 17)]; |
|
3107 - int u1 = u_buf[(x >> 17) + 1]; |
|
3108 - int v0 = v_buf[(x >> 17)]; |
|
3109 - int v1 = v_buf[(x >> 17) + 1]; |
|
3110 - int y_frac = (x & 65535); |
|
3111 - int uv_frac = ((x >> 1) & 65535); |
|
3112 - int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; |
|
3113 - int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16; |
|
3114 - int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16; |
|
3115 - YuvPixel(y, u, v, rgb_buf); |
|
3116 - x += source_dx; |
|
3117 - if ((i + 1) < width) { |
|
3118 - y0 = y_buf[x >> 16]; |
|
3119 - y1 = y_buf[(x >> 16) + 1]; |
|
3120 - y_frac = (x & 65535); |
|
3121 - y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; |
|
3122 - YuvPixel(y, u, v, rgb_buf+4); |
|
3123 - x += source_dx; |
|
3124 - } |
|
3125 - rgb_buf += 8; |
|
3126 - } |
|
3127 -} |
|
3128 - |
|
3129 -#endif // USE_MMX |
|
3130 -} // extern "C" |
|
3131 - |
|
3132 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) |
|
3133 + if (mozilla::supports_sse()) { |
|
3134 + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, |
|
3135 + source_dx); |
|
3136 + return; |
|
3137 + } |
|
3138 +#endif |
|
3139 + |
|
3140 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
|
3141 +} |
|
3142 + |
|
3143 +} // extern "C" |