1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/ycbcr/convert.patch Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,3143 @@ 1.4 +diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp 1.5 +--- a/gfx/ycbcr/yuv_convert.cpp 1.6 ++++ b/gfx/ycbcr/yuv_convert.cpp 1.7 +@@ -6,145 +6,102 @@ 1.8 + // http://www.fourcc.org/yuv.php 1.9 + // The actual conversion is best described here 1.10 + // http://en.wikipedia.org/wiki/YUV 1.11 + // An article on optimizing YUV conversion using tables instead of multiplies 1.12 + // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf 1.13 + // 1.14 + // YV12 is a full plane of Y and a half height, half width chroma planes 1.15 + // YV16 is a full plane of Y and a full height, half width chroma planes 1.16 ++// YV24 is a full plane of Y and a full height, full width chroma planes 1.17 + // 1.18 + // ARGB pixel format is output, which on little endian is stored as BGRA. 1.19 + // The alpha is set to 255, allowing the application to use RGBA or RGB32. 1.20 + 1.21 +-#include "media/base/yuv_convert.h" 1.22 ++#include "yuv_convert.h" 1.23 + 1.24 + // Header for low level row functions. 1.25 +-#include "media/base/yuv_row.h" 1.26 +- 1.27 +-#if USE_MMX 1.28 +-#if defined(_MSC_VER) 1.29 +-#include <intrin.h> 1.30 +-#else 1.31 +-#include <mmintrin.h> 1.32 +-#endif 1.33 +-#endif 1.34 +- 1.35 +-#if USE_SSE2 1.36 +-#include <emmintrin.h> 1.37 +-#endif 1.38 +- 1.39 +-namespace media { 1.40 +- 1.41 ++#include "yuv_row.h" 1.42 ++#include "mozilla/SSE.h" 1.43 ++ 1.44 ++namespace mozilla { 1.45 ++ 1.46 ++namespace gfx { 1.47 ++ 1.48 + // 16.16 fixed point arithmetic 1.49 + const int kFractionBits = 16; 1.50 + const int kFractionMax = 1 << kFractionBits; 1.51 + const int kFractionMask = ((1 << kFractionBits) - 1); 1.52 + 1.53 + // Convert a frame of YUV to 32 bit ARGB. 1.54 +-void ConvertYUVToRGB32(const uint8* y_buf, 1.55 +- const uint8* u_buf, 1.56 +- const uint8* v_buf, 1.57 +- uint8* rgb_buf, 1.58 +- int width, 1.59 +- int height, 1.60 +- int y_pitch, 1.61 +- int uv_pitch, 1.62 +- int rgb_pitch, 1.63 +- YUVType yuv_type) { 1.64 +- unsigned int y_shift = yuv_type; 1.65 +- for (int y = 0; y < height; ++y) { 1.66 +- uint8* rgb_row = rgb_buf + y * rgb_pitch; 1.67 +- const uint8* y_ptr = y_buf + y * y_pitch; 1.68 +- const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch; 1.69 +- const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch; 1.70 +- 1.71 +- FastConvertYUVToRGB32Row(y_ptr, 1.72 +- u_ptr, 1.73 +- v_ptr, 1.74 +- rgb_row, 1.75 +- width); 1.76 +- } 1.77 ++NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf, 1.78 ++ const uint8* u_buf, 1.79 ++ const uint8* v_buf, 1.80 ++ uint8* rgb_buf, 1.81 ++ int pic_x, 1.82 ++ int pic_y, 1.83 ++ int pic_width, 1.84 ++ int pic_height, 1.85 ++ int y_pitch, 1.86 ++ int uv_pitch, 1.87 ++ int rgb_pitch, 1.88 ++ YUVType yuv_type) { 1.89 ++ unsigned int y_shift = yuv_type == YV12 ? 1 : 0; 1.90 ++ unsigned int x_shift = yuv_type == YV24 ? 0 : 1; 1.91 ++ // Test for SSE because the optimized code uses movntq, which is not part of MMX. 1.92 ++ bool has_sse = supports_mmx() && supports_sse(); 1.93 ++ // There is no optimized YV24 SSE routine so we check for this and 1.94 ++ // fall back to the C code. 1.95 ++ has_sse &= yuv_type != YV24; 1.96 ++ bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0; 1.97 ++ int x_width = odd_pic_x ? 
pic_width - 1 : pic_width; 1.98 ++ 1.99 ++ for (int y = pic_y; y < pic_height + pic_y; ++y) { 1.100 ++ uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch; 1.101 ++ const uint8* y_ptr = y_buf + y * y_pitch + pic_x; 1.102 ++ const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift); 1.103 ++ const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift); 1.104 ++ 1.105 ++ if (odd_pic_x) { 1.106 ++ // Handle the single odd pixel manually and use the 1.107 ++ // fast routines for the remaining. 1.108 ++ FastConvertYUVToRGB32Row_C(y_ptr++, 1.109 ++ u_ptr++, 1.110 ++ v_ptr++, 1.111 ++ rgb_row, 1.112 ++ 1, 1.113 ++ x_shift); 1.114 ++ rgb_row += 4; 1.115 ++ } 1.116 ++ 1.117 ++ if (has_sse) { 1.118 ++ FastConvertYUVToRGB32Row(y_ptr, 1.119 ++ u_ptr, 1.120 ++ v_ptr, 1.121 ++ rgb_row, 1.122 ++ x_width); 1.123 ++ } 1.124 ++ else { 1.125 ++ FastConvertYUVToRGB32Row_C(y_ptr, 1.126 ++ u_ptr, 1.127 ++ v_ptr, 1.128 ++ rgb_row, 1.129 ++ x_width, 1.130 ++ x_shift); 1.131 ++ } 1.132 ++ } 1.133 + 1.134 + // MMX used for FastConvertYUVToRGB32Row requires emms instruction. 1.135 +- EMMS(); 1.136 +-} 1.137 +- 1.138 +-#if USE_SSE2 1.139 +-// FilterRows combines two rows of the image using linear interpolation. 1.140 +-// SSE2 version does 16 pixels at a time 1.141 +- 1.142 +-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, 1.143 +- int source_width, int source_y_fraction) { 1.144 +- __m128i zero = _mm_setzero_si128(); 1.145 +- __m128i y1_fraction = _mm_set1_epi16(source_y_fraction); 1.146 +- __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction); 1.147 +- 1.148 +- const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr); 1.149 +- const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr); 1.150 +- __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf); 1.151 +- __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width); 1.152 +- 1.153 +- do { 1.154 +- __m128i y0 = _mm_loadu_si128(y0_ptr128); 1.155 +- __m128i y1 = _mm_loadu_si128(y1_ptr128); 1.156 +- __m128i y2 = _mm_unpackhi_epi8(y0, zero); 1.157 +- __m128i y3 = _mm_unpackhi_epi8(y1, zero); 1.158 +- y0 = _mm_unpacklo_epi8(y0, zero); 1.159 +- y1 = _mm_unpacklo_epi8(y1, zero); 1.160 +- y0 = _mm_mullo_epi16(y0, y0_fraction); 1.161 +- y1 = _mm_mullo_epi16(y1, y1_fraction); 1.162 +- y2 = _mm_mullo_epi16(y2, y0_fraction); 1.163 +- y3 = _mm_mullo_epi16(y3, y1_fraction); 1.164 +- y0 = _mm_add_epi16(y0, y1); 1.165 +- y2 = _mm_add_epi16(y2, y3); 1.166 +- y0 = _mm_srli_epi16(y0, 8); 1.167 +- y2 = _mm_srli_epi16(y2, 8); 1.168 +- y0 = _mm_packus_epi16(y0, y2); 1.169 +- *dest128++ = y0; 1.170 +- ++y0_ptr128; 1.171 +- ++y1_ptr128; 1.172 +- } while (dest128 < end128); 1.173 +-} 1.174 +-#elif USE_MMX 1.175 +-// MMX version does 8 pixels at a time 1.176 +-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, 1.177 +- int source_width, int source_y_fraction) { 1.178 +- __m64 zero = _mm_setzero_si64(); 1.179 +- __m64 y1_fraction = _mm_set1_pi16(source_y_fraction); 1.180 +- __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction); 1.181 +- 1.182 +- const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr); 1.183 +- const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr); 1.184 +- __m64* dest64 = reinterpret_cast<__m64*>(ybuf); 1.185 +- __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width); 1.186 +- 1.187 +- do { 1.188 +- __m64 y0 = *y0_ptr64++; 1.189 +- __m64 y1 = *y1_ptr64++; 1.190 +- __m64 y2 = _mm_unpackhi_pi8(y0, zero); 1.191 +- __m64 y3 = 
_mm_unpackhi_pi8(y1, zero); 1.192 +- y0 = _mm_unpacklo_pi8(y0, zero); 1.193 +- y1 = _mm_unpacklo_pi8(y1, zero); 1.194 +- y0 = _mm_mullo_pi16(y0, y0_fraction); 1.195 +- y1 = _mm_mullo_pi16(y1, y1_fraction); 1.196 +- y2 = _mm_mullo_pi16(y2, y0_fraction); 1.197 +- y3 = _mm_mullo_pi16(y3, y1_fraction); 1.198 +- y0 = _mm_add_pi16(y0, y1); 1.199 +- y2 = _mm_add_pi16(y2, y3); 1.200 +- y0 = _mm_srli_pi16(y0, 8); 1.201 +- y2 = _mm_srli_pi16(y2, 8); 1.202 +- y0 = _mm_packs_pu16(y0, y2); 1.203 +- *dest64++ = y0; 1.204 +- } while (dest64 < end64); 1.205 +-} 1.206 +-#else // no MMX or SSE2 1.207 ++ if (has_sse) 1.208 ++ EMMS(); 1.209 ++} 1.210 ++ 1.211 + // C version does 8 at a time to mimic MMX code 1.212 +-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, 1.213 +- int source_width, int source_y_fraction) { 1.214 ++static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, 1.215 ++ int source_width, int source_y_fraction) { 1.216 + int y1_fraction = source_y_fraction; 1.217 + int y0_fraction = 256 - y1_fraction; 1.218 + uint8* end = ybuf + source_width; 1.219 + do { 1.220 + ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8; 1.221 + ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8; 1.222 + ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8; 1.223 + ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8; 1.224 +@@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons 1.225 + ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8; 1.226 + ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8; 1.227 + ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8; 1.228 + y0_ptr += 8; 1.229 + y1_ptr += 8; 1.230 + ybuf += 8; 1.231 + } while (ybuf < end); 1.232 + } 1.233 +-#endif 1.234 ++ 1.235 ++#ifdef MOZILLA_MAY_SUPPORT_MMX 1.236 ++void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, 1.237 ++ int source_width, int source_y_fraction); 1.238 ++#endif 1.239 ++ 1.240 ++#ifdef MOZILLA_MAY_SUPPORT_SSE2 1.241 ++void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, 1.242 ++ int source_width, int source_y_fraction); 1.243 ++#endif 1.244 ++ 1.245 ++static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr, 1.246 ++ const uint8* y1_ptr, int source_width, 1.247 ++ int source_y_fraction) { 1.248 ++#ifdef MOZILLA_MAY_SUPPORT_SSE2 1.249 ++ if (mozilla::supports_sse2()) { 1.250 ++ FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); 1.251 ++ return; 1.252 ++ } 1.253 ++#endif 1.254 ++ 1.255 ++#ifdef MOZILLA_MAY_SUPPORT_MMX 1.256 ++ if (mozilla::supports_mmx()) { 1.257 ++ FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); 1.258 ++ return; 1.259 ++ } 1.260 ++#endif 1.261 ++ 1.262 ++ FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); 1.263 ++} 1.264 + 1.265 + 1.266 + // Scale a frame of YUV to 32 bit ARGB. 
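Before the scaler, a brief illustration of the chroma indexing used by ConvertYCbCrToRGB32 above: x_shift and y_shift are just the horizontal and vertical chroma subsampling exponents of the YUVType (YV12/YV16/YV24, declared in yuv_convert.h later in this patch), and an odd pic_x on a subsampled format is handled by converting one pixel with the C row first. The sketch below is self-contained and not part of the patch; only the shift selection is taken from the code above.

#include <cstdio>

enum YUVType { YV12 = 0, YV16 = 1, YV24 = 2 };

int main() {
  // Same shift selection as ConvertYCbCrToRGB32 above.
  const YUVType types[] = {YV12, YV16, YV24};
  const int pic_x = 5, pic_y = 3;  // arbitrary odd crop origin
  for (YUVType t : types) {
    int y_shift = (t == YV12) ? 1 : 0;
    int x_shift = (t == YV24) ? 0 : 1;
    printf("type %d: luma (%d,%d) -> chroma (%d,%d)%s\n",
           t, pic_x, pic_y, pic_x >> x_shift, pic_y >> y_shift,
           (t != YV24 && (pic_x & 1)) ? "  [odd pic_x: first pixel done by the C row]"
                                      : "");
  }
  return 0;
}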
1.267 +-void ScaleYUVToRGB32(const uint8* y_buf, 1.268 +- const uint8* u_buf, 1.269 +- const uint8* v_buf, 1.270 +- uint8* rgb_buf, 1.271 +- int source_width, 1.272 +- int source_height, 1.273 +- int width, 1.274 +- int height, 1.275 +- int y_pitch, 1.276 +- int uv_pitch, 1.277 +- int rgb_pitch, 1.278 +- YUVType yuv_type, 1.279 +- Rotate view_rotate, 1.280 +- ScaleFilter filter) { 1.281 ++NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf, 1.282 ++ const uint8* u_buf, 1.283 ++ const uint8* v_buf, 1.284 ++ uint8* rgb_buf, 1.285 ++ int source_width, 1.286 ++ int source_height, 1.287 ++ int width, 1.288 ++ int height, 1.289 ++ int y_pitch, 1.290 ++ int uv_pitch, 1.291 ++ int rgb_pitch, 1.292 ++ YUVType yuv_type, 1.293 ++ Rotate view_rotate, 1.294 ++ ScaleFilter filter) { 1.295 ++ bool has_mmx = supports_mmx(); 1.296 ++ 1.297 + // 4096 allows 3 buffers to fit in 12k. 1.298 + // Helps performance on CPU with 16K L1 cache. 1.299 + // Large enough for 3830x2160 and 30" displays which are 2560x1600. 1.300 + const int kFilterBufferSize = 4096; 1.301 + // Disable filtering if the screen is too big (to avoid buffer overflows). 1.302 + // This should never happen to regular users: they don't have monitors 1.303 + // wider than 4096 pixels. 1.304 + // TODO(fbarchard): Allow rotated videos to filter. 1.305 + if (source_width > kFilterBufferSize || view_rotate) 1.306 + filter = FILTER_NONE; 1.307 + 1.308 +- unsigned int y_shift = yuv_type; 1.309 ++ unsigned int y_shift = yuv_type == YV12 ? 1 : 0; 1.310 + // Diagram showing origin and direction of source sampling. 1.311 + // ->0 4<- 1.312 + // 7 3 1.313 + // 1.314 + // 6 5 1.315 + // ->1 2<- 1.316 + // Rotations that start at right side of image. 1.317 + if ((view_rotate == ROTATE_180) || 1.318 +@@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf, 1.319 + int source_uv_fraction = 1.320 + ((source_y_subpixel >> y_shift) & kFractionMask) >> 8; 1.321 + 1.322 + const uint8* y_ptr = y0_ptr; 1.323 + const uint8* u_ptr = u0_ptr; 1.324 + const uint8* v_ptr = v0_ptr; 1.325 + // Apply vertical filtering if necessary. 1.326 + // TODO(fbarchard): Remove memcpy when not necessary. 1.327 +- if (filter & media::FILTER_BILINEAR_V) { 1.328 ++ if (filter & mozilla::gfx::FILTER_BILINEAR_V) { 1.329 + if (yscale_fixed != kFractionMax && 1.330 + source_y_fraction && ((source_y + 1) < source_height)) { 1.331 + FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); 1.332 + } else { 1.333 + memcpy(ybuf, y0_ptr, source_width); 1.334 + } 1.335 + y_ptr = ybuf; 1.336 + ybuf[source_width] = ybuf[source_width-1]; 1.337 +@@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf, 1.338 + u_ptr = ubuf; 1.339 + v_ptr = vbuf; 1.340 + ubuf[uv_source_width] = ubuf[uv_source_width - 1]; 1.341 + vbuf[uv_source_width] = vbuf[uv_source_width - 1]; 1.342 + } 1.343 + if (source_dx == kFractionMax) { // Not scaled 1.344 + FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, 1.345 + dest_pixel, width); 1.346 +- } else { 1.347 +- if (filter & FILTER_BILINEAR_H) { 1.348 ++ } else if (filter & FILTER_BILINEAR_H) { 1.349 + LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, 1.350 + dest_pixel, width, source_dx); 1.351 + } else { 1.352 + // Specialized scalers and rotation. 
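Before the specialized scalers below, a worked example of the 16.16 fixed-point stepping the branches above test: source_dx is the source step per destination pixel, kFractionMax (1 << 16) means 1:1, and a zero fractional part means an integer scale factor, which is why that branch can use the stepped ConvertYUVToRGB32Row_SSE. The expression that produces source_dx is outside this hunk, so the formula in the sketch is an assumption based on the surrounding constants.

#include <cstdio>

int main() {
  const int kFractionBits = 16;
  const int kFractionMax = 1 << kFractionBits;   // 1.0 in 16.16 fixed point
  const int kFractionMask = kFractionMax - 1;

  // Assumed form of the step: source pixels advanced per destination pixel.
  const int source_width = 640, dest_width = 320;
  const int source_dx = (source_width << kFractionBits) / dest_width;

  printf("source_dx = 0x%x (%s)\n", source_dx,
         source_dx == kFractionMax ? "1:1, fast path"
         : (source_dx & kFractionMask) == 0 ? "integer factor, stepping row"
                                            : "fractional, Scale/LinearScale row");

  // Walking the source: x >> 16 indexes luma, x >> 17 the half-rate chroma.
  int x = 0;
  for (int i = 0; i < 4; ++i) {
    printf("dest %d <- y[%d], uv[%d]\n", i, x >> 16, x >> 17);
    x += source_dx;
  }
  return 0;
}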
1.353 +-#if USE_MMX && defined(_MSC_VER) 1.354 ++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86) 1.355 ++ if(mozilla::supports_sse()) { 1.356 + if (width == (source_width * 2)) { 1.357 +- DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, 1.358 +- dest_pixel, width); 1.359 ++ DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, 1.360 ++ dest_pixel, width); 1.361 + } else if ((source_dx & kFractionMask) == 0) { 1.362 + // Scaling by integer scale factor. ie half. 1.363 +- ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, 1.364 +- dest_pixel, width, 1.365 +- source_dx >> kFractionBits); 1.366 ++ ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, 1.367 ++ dest_pixel, width, 1.368 ++ source_dx >> kFractionBits); 1.369 + } else if (source_dx_uv == source_dx) { // Not rotated. 1.370 + ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, 1.371 + dest_pixel, width, source_dx); 1.372 + } else { 1.373 +- RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, 1.374 +- dest_pixel, width, 1.375 +- source_dx >> kFractionBits, 1.376 +- source_dx_uv >> kFractionBits); 1.377 ++ RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, 1.378 ++ dest_pixel, width, 1.379 ++ source_dx >> kFractionBits, 1.380 ++ source_dx_uv >> kFractionBits); 1.381 + } 1.382 ++ } 1.383 ++ else { 1.384 ++ ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr, 1.385 ++ dest_pixel, width, source_dx); 1.386 ++ } 1.387 + #else 1.388 +- ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, 1.389 +- dest_pixel, width, source_dx); 1.390 +-#endif 1.391 +- } 1.392 ++ ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, 1.393 ++ dest_pixel, width, source_dx); 1.394 ++#endif 1.395 + } 1.396 + } 1.397 + // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms. 1.398 +- EMMS(); 1.399 +-} 1.400 +- 1.401 +-} // namespace media 1.402 ++ if (has_mmx) 1.403 ++ EMMS(); 1.404 ++} 1.405 ++ 1.406 ++} // namespace gfx 1.407 ++} // namespace mozilla 1.408 +diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h 1.409 +--- a/gfx/ycbcr/yuv_convert.h 1.410 ++++ b/gfx/ycbcr/yuv_convert.h 1.411 +@@ -1,72 +1,79 @@ 1.412 + // Copyright (c) 2010 The Chromium Authors. All rights reserved. 1.413 + // Use of this source code is governed by a BSD-style license that can be 1.414 + // found in the LICENSE file. 1.415 + 1.416 + #ifndef MEDIA_BASE_YUV_CONVERT_H_ 1.417 + #define MEDIA_BASE_YUV_CONVERT_H_ 1.418 + 1.419 +-#include "base/basictypes.h" 1.420 +- 1.421 +-namespace media { 1.422 +- 1.423 ++#include "chromium_types.h" 1.424 ++#include "gfxCore.h" 1.425 ++ 1.426 ++namespace mozilla { 1.427 ++ 1.428 ++namespace gfx { 1.429 ++ 1.430 + // Type of YUV surface. 1.431 + // The value of these enums matter as they are used to shift vertical indices. 1.432 + enum YUVType { 1.433 +- YV16 = 0, // YV16 is half width and full height chroma channels. 1.434 +- YV12 = 1, // YV12 is half width and half height chroma channels. 1.435 ++ YV12 = 0, // YV12 is half width and half height chroma channels. 1.436 ++ YV16 = 1, // YV16 is half width and full height chroma channels. 1.437 ++ YV24 = 2 // YV24 is full width and full height chroma channels. 1.438 + }; 1.439 + 1.440 + // Mirror means flip the image horizontally, as in looking in a mirror. 1.441 + // Rotate happens after mirroring. 1.442 + enum Rotate { 1.443 + ROTATE_0, // Rotation off. 1.444 + ROTATE_90, // Rotate clockwise. 1.445 + ROTATE_180, // Rotate upside down. 1.446 + ROTATE_270, // Rotate counter clockwise. 1.447 + MIRROR_ROTATE_0, // Mirror horizontally. 1.448 + MIRROR_ROTATE_90, // Mirror then Rotate clockwise. 1.449 + MIRROR_ROTATE_180, // Mirror vertically. 
1.450 +- MIRROR_ROTATE_270, // Transpose. 1.451 ++ MIRROR_ROTATE_270 // Transpose. 1.452 + }; 1.453 + 1.454 + // Filter affects how scaling looks. 1.455 + enum ScaleFilter { 1.456 + FILTER_NONE = 0, // No filter (point sampled). 1.457 + FILTER_BILINEAR_H = 1, // Bilinear horizontal filter. 1.458 + FILTER_BILINEAR_V = 2, // Bilinear vertical filter. 1.459 +- FILTER_BILINEAR = 3, // Bilinear filter. 1.460 ++ FILTER_BILINEAR = 3 // Bilinear filter. 1.461 + }; 1.462 + 1.463 + // Convert a frame of YUV to 32 bit ARGB. 1.464 + // Pass in YV16/YV12 depending on source format 1.465 +-void ConvertYUVToRGB32(const uint8* yplane, 1.466 +- const uint8* uplane, 1.467 +- const uint8* vplane, 1.468 +- uint8* rgbframe, 1.469 +- int width, 1.470 +- int height, 1.471 +- int ystride, 1.472 +- int uvstride, 1.473 +- int rgbstride, 1.474 +- YUVType yuv_type); 1.475 ++NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane, 1.476 ++ const uint8* uplane, 1.477 ++ const uint8* vplane, 1.478 ++ uint8* rgbframe, 1.479 ++ int pic_x, 1.480 ++ int pic_y, 1.481 ++ int pic_width, 1.482 ++ int pic_height, 1.483 ++ int ystride, 1.484 ++ int uvstride, 1.485 ++ int rgbstride, 1.486 ++ YUVType yuv_type); 1.487 + 1.488 + // Scale a frame of YUV to 32 bit ARGB. 1.489 + // Supports rotation and mirroring. 1.490 +-void ScaleYUVToRGB32(const uint8* yplane, 1.491 +- const uint8* uplane, 1.492 +- const uint8* vplane, 1.493 +- uint8* rgbframe, 1.494 +- int source_width, 1.495 +- int source_height, 1.496 +- int width, 1.497 +- int height, 1.498 +- int ystride, 1.499 +- int uvstride, 1.500 +- int rgbstride, 1.501 +- YUVType yuv_type, 1.502 +- Rotate view_rotate, 1.503 +- ScaleFilter filter); 1.504 +- 1.505 +-} // namespace media 1.506 +- 1.507 ++NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane, 1.508 ++ const uint8* uplane, 1.509 ++ const uint8* vplane, 1.510 ++ uint8* rgbframe, 1.511 ++ int source_width, 1.512 ++ int source_height, 1.513 ++ int width, 1.514 ++ int height, 1.515 ++ int ystride, 1.516 ++ int uvstride, 1.517 ++ int rgbstride, 1.518 ++ YUVType yuv_type, 1.519 ++ Rotate view_rotate, 1.520 ++ ScaleFilter filter); 1.521 ++ 1.522 ++} // namespace gfx 1.523 ++} // namespace mozilla 1.524 ++ 1.525 + #endif // MEDIA_BASE_YUV_CONVERT_H_ 1.526 +diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp 1.527 +new file mode 100644 1.528 +--- /dev/null 1.529 ++++ b/gfx/ycbcr/yuv_convert_mmx.cpp 1.530 +@@ -0,0 +1,45 @@ 1.531 ++// Copyright (c) 2010 The Chromium Authors. All rights reserved. 1.532 ++// Use of this source code is governed by a BSD-style license that can be 1.533 ++// found in the LICENSE file. 1.534 ++ 1.535 ++#include <mmintrin.h> 1.536 ++#include "yuv_row.h" 1.537 ++ 1.538 ++namespace mozilla { 1.539 ++namespace gfx { 1.540 ++ 1.541 ++// FilterRows combines two rows of the image using linear interpolation. 1.542 ++// MMX version does 8 pixels at a time. 
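The MMX routine below (and the SSE2 version in the next file) is an 8- or 16-byte-wide version of the same arithmetic as FilterRows_C in yuv_convert.cpp: unpack each byte to 16 bits, multiply by the two weights, add, shift right by 8, and pack back down. FilterRowsReference is a hypothetical scalar restatement, not part of the patch; the weights are source_y_fraction and 256 - source_y_fraction, exactly as in the SIMD code.

#include <cstdint>

// out[i] = (row0[i] * (256 - frac) + row1[i] * frac) >> 8, frac in [0, 256).
// With frac == 64 each output byte is 3/4 of row0 plus 1/4 of row1, i.e. a
// vertical bilinear sample a quarter of the way from row0 to row1.
static void FilterRowsReference(uint8_t* out, const uint8_t* row0,
                                const uint8_t* row1, int width, int frac) {
  const int inv = 256 - frac;
  for (int i = 0; i < width; ++i)
    out[i] = static_cast<uint8_t>((row0[i] * inv + row1[i] * frac) >> 8);
}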
1.543 ++void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, 1.544 ++ int source_width, int source_y_fraction) { 1.545 ++ __m64 zero = _mm_setzero_si64(); 1.546 ++ __m64 y1_fraction = _mm_set1_pi16(source_y_fraction); 1.547 ++ __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction); 1.548 ++ 1.549 ++ const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr); 1.550 ++ const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr); 1.551 ++ __m64* dest64 = reinterpret_cast<__m64*>(ybuf); 1.552 ++ __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width); 1.553 ++ 1.554 ++ do { 1.555 ++ __m64 y0 = *y0_ptr64++; 1.556 ++ __m64 y1 = *y1_ptr64++; 1.557 ++ __m64 y2 = _mm_unpackhi_pi8(y0, zero); 1.558 ++ __m64 y3 = _mm_unpackhi_pi8(y1, zero); 1.559 ++ y0 = _mm_unpacklo_pi8(y0, zero); 1.560 ++ y1 = _mm_unpacklo_pi8(y1, zero); 1.561 ++ y0 = _mm_mullo_pi16(y0, y0_fraction); 1.562 ++ y1 = _mm_mullo_pi16(y1, y1_fraction); 1.563 ++ y2 = _mm_mullo_pi16(y2, y0_fraction); 1.564 ++ y3 = _mm_mullo_pi16(y3, y1_fraction); 1.565 ++ y0 = _mm_add_pi16(y0, y1); 1.566 ++ y2 = _mm_add_pi16(y2, y3); 1.567 ++ y0 = _mm_srli_pi16(y0, 8); 1.568 ++ y2 = _mm_srli_pi16(y2, 8); 1.569 ++ y0 = _mm_packs_pu16(y0, y2); 1.570 ++ *dest64++ = y0; 1.571 ++ } while (dest64 < end64); 1.572 ++} 1.573 ++ 1.574 ++} 1.575 ++} 1.576 +diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp 1.577 +new file mode 100644 1.578 +--- /dev/null 1.579 ++++ b/gfx/ycbcr/yuv_convert_sse2.cpp 1.580 +@@ -0,0 +1,47 @@ 1.581 ++// Copyright (c) 2010 The Chromium Authors. All rights reserved. 1.582 ++// Use of this source code is governed by a BSD-style license that can be 1.583 ++// found in the LICENSE file. 1.584 ++ 1.585 ++#include <emmintrin.h> 1.586 ++#include "yuv_row.h" 1.587 ++ 1.588 ++namespace mozilla { 1.589 ++namespace gfx { 1.590 ++ 1.591 ++// FilterRows combines two rows of the image using linear interpolation. 1.592 ++// SSE2 version does 16 pixels at a time. 
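One caller-side obligation for the MMX routine above (and the MMX row converters): MMX registers alias the x87 floating-point stack, so emms must execute before floating-point code runs again. As yuv_row.h notes, EMMS is slow, so yuv_convert.cpp issues it once per image rather than once per row. A sketch of that caller pattern follows; FilterPlane_MMX is a hypothetical helper, not part of the patch.

#include <mmintrin.h>

typedef unsigned char uint8;

namespace mozilla {
namespace gfx {

// Defined in yuv_convert_mmx.cpp above.
void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
                    int source_width, int source_y_fraction);

// Hypothetical caller: vertically filter a whole plane, then clear the MMX
// state once, mirroring how yuv_convert.cpp calls EMMS() once per image.
void FilterPlane_MMX(uint8* dst, const uint8* src, int width, int height,
                     int pitch, int fraction) {
  for (int y = 0; y + 1 < height; ++y)
    FilterRows_MMX(dst + y * pitch, src + y * pitch,
                   src + (y + 1) * pitch, width, fraction);
  _mm_empty();  // emms: MMX registers alias the x87 stack.
}

} // namespace gfx
} // namespace mozilla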
1.593 ++void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, 1.594 ++ int source_width, int source_y_fraction) { 1.595 ++ __m128i zero = _mm_setzero_si128(); 1.596 ++ __m128i y1_fraction = _mm_set1_epi16(source_y_fraction); 1.597 ++ __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction); 1.598 ++ 1.599 ++ const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr); 1.600 ++ const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr); 1.601 ++ __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf); 1.602 ++ __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width); 1.603 ++ 1.604 ++ do { 1.605 ++ __m128i y0 = _mm_loadu_si128(y0_ptr128); 1.606 ++ __m128i y1 = _mm_loadu_si128(y1_ptr128); 1.607 ++ __m128i y2 = _mm_unpackhi_epi8(y0, zero); 1.608 ++ __m128i y3 = _mm_unpackhi_epi8(y1, zero); 1.609 ++ y0 = _mm_unpacklo_epi8(y0, zero); 1.610 ++ y1 = _mm_unpacklo_epi8(y1, zero); 1.611 ++ y0 = _mm_mullo_epi16(y0, y0_fraction); 1.612 ++ y1 = _mm_mullo_epi16(y1, y1_fraction); 1.613 ++ y2 = _mm_mullo_epi16(y2, y0_fraction); 1.614 ++ y3 = _mm_mullo_epi16(y3, y1_fraction); 1.615 ++ y0 = _mm_add_epi16(y0, y1); 1.616 ++ y2 = _mm_add_epi16(y2, y3); 1.617 ++ y0 = _mm_srli_epi16(y0, 8); 1.618 ++ y2 = _mm_srli_epi16(y2, 8); 1.619 ++ y0 = _mm_packus_epi16(y0, y2); 1.620 ++ *dest128++ = y0; 1.621 ++ ++y0_ptr128; 1.622 ++ ++y1_ptr128; 1.623 ++ } while (dest128 < end128); 1.624 ++} 1.625 ++ 1.626 ++} 1.627 ++} 1.628 +diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h 1.629 +--- a/gfx/ycbcr/yuv_row.h 1.630 ++++ b/gfx/ycbcr/yuv_row.h 1.631 +@@ -5,109 +5,133 @@ 1.632 + // yuv_row internal functions to handle YUV conversion and scaling to RGB. 1.633 + // These functions are used from both yuv_convert.cc and yuv_scale.cc. 1.634 + 1.635 + // TODO(fbarchard): Write function that can handle rotation and scaling. 1.636 + 1.637 + #ifndef MEDIA_BASE_YUV_ROW_H_ 1.638 + #define MEDIA_BASE_YUV_ROW_H_ 1.639 + 1.640 +-#include "base/basictypes.h" 1.641 ++#include "chromium_types.h" 1.642 + 1.643 + extern "C" { 1.644 + // Can only do 1x. 1.645 + // This is the second fastest of the scalers. 1.646 + void FastConvertYUVToRGB32Row(const uint8* y_buf, 1.647 + const uint8* u_buf, 1.648 + const uint8* v_buf, 1.649 + uint8* rgb_buf, 1.650 + int width); 1.651 + 1.652 +-// Can do 1x, half size or any scale down by an integer amount. 1.653 +-// Step can be negative (mirroring, rotate 180). 1.654 +-// This is the third fastest of the scalers. 1.655 +-void ConvertYUVToRGB32Row(const uint8* y_buf, 1.656 +- const uint8* u_buf, 1.657 +- const uint8* v_buf, 1.658 +- uint8* rgb_buf, 1.659 +- int width, 1.660 +- int step); 1.661 +- 1.662 +-// Rotate is like Convert, but applies different step to Y versus U and V. 1.663 +-// This allows rotation by 90 or 270, by stepping by stride. 1.664 +-// This is the forth fastest of the scalers. 1.665 +-void RotateConvertYUVToRGB32Row(const uint8* y_buf, 1.666 ++void FastConvertYUVToRGB32Row_C(const uint8* y_buf, 1.667 + const uint8* u_buf, 1.668 + const uint8* v_buf, 1.669 + uint8* rgb_buf, 1.670 + int width, 1.671 +- int ystep, 1.672 +- int uvstep); 1.673 ++ unsigned int x_shift); 1.674 ++ 1.675 ++void FastConvertYUVToRGB32Row(const uint8* y_buf, 1.676 ++ const uint8* u_buf, 1.677 ++ const uint8* v_buf, 1.678 ++ uint8* rgb_buf, 1.679 ++ int width); 1.680 ++ 1.681 ++// Can do 1x, half size or any scale down by an integer amount. 1.682 ++// Step can be negative (mirroring, rotate 180). 1.683 ++// This is the third fastest of the scalers. 
1.684 ++// Only defined on Windows x86-32. 1.685 ++void ConvertYUVToRGB32Row_SSE(const uint8* y_buf, 1.686 ++ const uint8* u_buf, 1.687 ++ const uint8* v_buf, 1.688 ++ uint8* rgb_buf, 1.689 ++ int width, 1.690 ++ int step); 1.691 ++ 1.692 ++// Rotate is like Convert, but applies different step to Y versus U and V. 1.693 ++// This allows rotation by 90 or 270, by stepping by stride. 1.694 ++// This is the forth fastest of the scalers. 1.695 ++// Only defined on Windows x86-32. 1.696 ++void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf, 1.697 ++ const uint8* u_buf, 1.698 ++ const uint8* v_buf, 1.699 ++ uint8* rgb_buf, 1.700 ++ int width, 1.701 ++ int ystep, 1.702 ++ int uvstep); 1.703 + 1.704 + // Doubler does 4 pixels at a time. Each pixel is replicated. 1.705 + // This is the fastest of the scalers. 1.706 +-void DoubleYUVToRGB32Row(const uint8* y_buf, 1.707 +- const uint8* u_buf, 1.708 +- const uint8* v_buf, 1.709 +- uint8* rgb_buf, 1.710 +- int width); 1.711 ++// Only defined on Windows x86-32. 1.712 ++void DoubleYUVToRGB32Row_SSE(const uint8* y_buf, 1.713 ++ const uint8* u_buf, 1.714 ++ const uint8* v_buf, 1.715 ++ uint8* rgb_buf, 1.716 ++ int width); 1.717 + 1.718 + // Handles arbitrary scaling up or down. 1.719 + // Mirroring is supported, but not 90 or 270 degree rotation. 1.720 + // Chroma is under sampled every 2 pixels for performance. 1.721 + void ScaleYUVToRGB32Row(const uint8* y_buf, 1.722 + const uint8* u_buf, 1.723 + const uint8* v_buf, 1.724 + uint8* rgb_buf, 1.725 + int width, 1.726 + int source_dx); 1.727 + 1.728 ++void ScaleYUVToRGB32Row(const uint8* y_buf, 1.729 ++ const uint8* u_buf, 1.730 ++ const uint8* v_buf, 1.731 ++ uint8* rgb_buf, 1.732 ++ int width, 1.733 ++ int source_dx); 1.734 ++ 1.735 ++void ScaleYUVToRGB32Row_C(const uint8* y_buf, 1.736 ++ const uint8* u_buf, 1.737 ++ const uint8* v_buf, 1.738 ++ uint8* rgb_buf, 1.739 ++ int width, 1.740 ++ int source_dx); 1.741 ++ 1.742 + // Handles arbitrary scaling up or down with bilinear filtering. 1.743 + // Mirroring is supported, but not 90 or 270 degree rotation. 1.744 + // Chroma is under sampled every 2 pixels for performance. 1.745 + // This is the slowest of the scalers. 1.746 + void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.747 + const uint8* u_buf, 1.748 + const uint8* v_buf, 1.749 + uint8* rgb_buf, 1.750 + int width, 1.751 + int source_dx); 1.752 + 1.753 ++void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.754 ++ const uint8* u_buf, 1.755 ++ const uint8* v_buf, 1.756 ++ uint8* rgb_buf, 1.757 ++ int width, 1.758 ++ int source_dx); 1.759 ++ 1.760 ++void LinearScaleYUVToRGB32Row_C(const uint8* y_buf, 1.761 ++ const uint8* u_buf, 1.762 ++ const uint8* v_buf, 1.763 ++ uint8* rgb_buf, 1.764 ++ int width, 1.765 ++ int source_dx); 1.766 ++ 1.767 ++ 1.768 + #if defined(_MSC_VER) 1.769 + #define SIMD_ALIGNED(var) __declspec(align(16)) var 1.770 + #else 1.771 + #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) 1.772 + #endif 1.773 + extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]); 1.774 + 1.775 +-// Method to force C version. 
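The int16 kCoefficientsRgbY[768][4] table declared above is what makes these rows table-driven (per the yuvrgb.pdf reference at the top of yuv_convert.cpp). The layout is inferred here from the row functions rather than stated in the patch: the +2048 and +4096 byte offsets in the assembly are 256 and 512 entries of 8 bytes, so entries 0-255 hold the Y contributions, 256-511 the U contributions, and 512-767 the V contributions, each a (B,G,R,A) vector with 6 fractional bits, hence the shift by 6 before packing. TablePixel below is a hypothetical scalar sketch of one lookup; the real rows also saturate the intermediate sums (paddsw), which is omitted.

#include <cstdint>

typedef short int16;
typedef unsigned char uint8;

// Declared SIMD_ALIGNED in yuv_row.h; alignment attribute omitted in this sketch.
extern int16 kCoefficientsRgbY[768][4];

static inline uint8 Clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

// Hypothetical scalar lookup: sum the three table rows per channel, then
// shift out the 6 fractional bits and clamp to 8 bits.
void TablePixel(uint8 y, uint8 u, uint8 v, uint8* bgra) {
  for (int c = 0; c < 4; ++c) {
    int sum = kCoefficientsRgbY[y][c] +
              kCoefficientsRgbY[256 + u][c] +
              kCoefficientsRgbY[512 + v][c];
    bgra[c] = Clamp255(sum >> 6);
  }
}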
1.776 +-//#define USE_MMX 0 1.777 +-//#define USE_SSE2 0 1.778 +- 1.779 +-#if !defined(USE_MMX) 1.780 +-// Windows, Mac and Linux/BSD use MMX 1.781 +-#if defined(__MMX__) || defined(_MSC_VER) 1.782 +-#define USE_MMX 1 1.783 +-#else 1.784 +-#define USE_MMX 0 1.785 +-#endif 1.786 +-#endif 1.787 +- 1.788 +-#if !defined(USE_SSE2) 1.789 +-#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2 1.790 +-#define USE_SSE2 1 1.791 +-#else 1.792 +-#define USE_SSE2 0 1.793 +-#endif 1.794 +-#endif 1.795 +- 1.796 + // x64 uses MMX2 (SSE) so emms is not required. 1.797 + // Warning C4799: function has no EMMS instruction. 1.798 + // EMMS() is slow and should be called by the calling function once per image. 1.799 +-#if USE_MMX && !defined(ARCH_CPU_X86_64) 1.800 ++#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64) 1.801 + #if defined(_MSC_VER) 1.802 + #define EMMS() __asm emms 1.803 + #pragma warning(disable: 4799) 1.804 + #else 1.805 + #define EMMS() asm("emms") 1.806 + #endif 1.807 + #else 1.808 + #define EMMS() 1.809 +diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp 1.810 +--- a/gfx/ycbcr/yuv_row_c.cpp 1.811 ++++ b/gfx/ycbcr/yuv_row_c.cpp 1.812 +@@ -1,812 +1,18 @@ 1.813 + // Copyright (c) 2010 The Chromium Authors. All rights reserved. 1.814 + // Use of this source code is governed by a BSD-style license that can be 1.815 + // found in the LICENSE file. 1.816 + 1.817 +-#include "media/base/yuv_row.h" 1.818 +- 1.819 +-#ifdef _DEBUG 1.820 +-#include "base/logging.h" 1.821 +-#else 1.822 ++#include "yuv_row.h" 1.823 ++ 1.824 + #define DCHECK(a) 1.825 +-#endif 1.826 + 1.827 + extern "C" { 1.828 + 1.829 +-#if USE_SSE2 && defined(ARCH_CPU_X86_64) 1.830 +- 1.831 +-// AMD64 ABI uses register paremters. 1.832 +-void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi 1.833 +- const uint8* u_buf, // rsi 1.834 +- const uint8* v_buf, // rdx 1.835 +- uint8* rgb_buf, // rcx 1.836 +- int width) { // r8 1.837 +- asm( 1.838 +- "jmp convertend\n" 1.839 +-"convertloop:" 1.840 +- "movzb (%1),%%r10\n" 1.841 +- "add $0x1,%1\n" 1.842 +- "movzb (%2),%%r11\n" 1.843 +- "add $0x1,%2\n" 1.844 +- "movq 2048(%5,%%r10,8),%%xmm0\n" 1.845 +- "movzb (%0),%%r10\n" 1.846 +- "movq 4096(%5,%%r11,8),%%xmm1\n" 1.847 +- "movzb 0x1(%0),%%r11\n" 1.848 +- "paddsw %%xmm1,%%xmm0\n" 1.849 +- "movq (%5,%%r10,8),%%xmm2\n" 1.850 +- "add $0x2,%0\n" 1.851 +- "movq (%5,%%r11,8),%%xmm3\n" 1.852 +- "paddsw %%xmm0,%%xmm2\n" 1.853 +- "paddsw %%xmm0,%%xmm3\n" 1.854 +- "shufps $0x44,%%xmm3,%%xmm2\n" 1.855 +- "psraw $0x6,%%xmm2\n" 1.856 +- "packuswb %%xmm2,%%xmm2\n" 1.857 +- "movq %%xmm2,0x0(%3)\n" 1.858 +- "add $0x8,%3\n" 1.859 +-"convertend:" 1.860 +- "sub $0x2,%4\n" 1.861 +- "jns convertloop\n" 1.862 +- 1.863 +-"convertnext:" 1.864 +- "add $0x1,%4\n" 1.865 +- "js convertdone\n" 1.866 +- 1.867 +- "movzb (%1),%%r10\n" 1.868 +- "movq 2048(%5,%%r10,8),%%xmm0\n" 1.869 +- "movzb (%2),%%r10\n" 1.870 +- "movq 4096(%5,%%r10,8),%%xmm1\n" 1.871 +- "paddsw %%xmm1,%%xmm0\n" 1.872 +- "movzb (%0),%%r10\n" 1.873 +- "movq (%5,%%r10,8),%%xmm1\n" 1.874 +- "paddsw %%xmm0,%%xmm1\n" 1.875 +- "psraw $0x6,%%xmm1\n" 1.876 +- "packuswb %%xmm1,%%xmm1\n" 1.877 +- "movd %%xmm1,0x0(%3)\n" 1.878 +-"convertdone:" 1.879 +- : 1.880 +- : "r"(y_buf), // %0 1.881 +- "r"(u_buf), // %1 1.882 +- "r"(v_buf), // %2 1.883 +- "r"(rgb_buf), // %3 1.884 +- "r"(width), // %4 1.885 +- "r" (kCoefficientsRgbY) // %5 1.886 +- : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" 1.887 +-); 1.888 +-} 1.889 +- 1.890 +-void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi 1.891 +- 
const uint8* u_buf, // rsi 1.892 +- const uint8* v_buf, // rdx 1.893 +- uint8* rgb_buf, // rcx 1.894 +- int width, // r8 1.895 +- int source_dx) { // r9 1.896 +- asm( 1.897 +- "xor %%r11,%%r11\n" 1.898 +- "sub $0x2,%4\n" 1.899 +- "js scalenext\n" 1.900 +- 1.901 +-"scaleloop:" 1.902 +- "mov %%r11,%%r10\n" 1.903 +- "sar $0x11,%%r10\n" 1.904 +- "movzb (%1,%%r10,1),%%rax\n" 1.905 +- "movq 2048(%5,%%rax,8),%%xmm0\n" 1.906 +- "movzb (%2,%%r10,1),%%rax\n" 1.907 +- "movq 4096(%5,%%rax,8),%%xmm1\n" 1.908 +- "lea (%%r11,%6),%%r10\n" 1.909 +- "sar $0x10,%%r11\n" 1.910 +- "movzb (%0,%%r11,1),%%rax\n" 1.911 +- "paddsw %%xmm1,%%xmm0\n" 1.912 +- "movq (%5,%%rax,8),%%xmm1\n" 1.913 +- "lea (%%r10,%6),%%r11\n" 1.914 +- "sar $0x10,%%r10\n" 1.915 +- "movzb (%0,%%r10,1),%%rax\n" 1.916 +- "movq (%5,%%rax,8),%%xmm2\n" 1.917 +- "paddsw %%xmm0,%%xmm1\n" 1.918 +- "paddsw %%xmm0,%%xmm2\n" 1.919 +- "shufps $0x44,%%xmm2,%%xmm1\n" 1.920 +- "psraw $0x6,%%xmm1\n" 1.921 +- "packuswb %%xmm1,%%xmm1\n" 1.922 +- "movq %%xmm1,0x0(%3)\n" 1.923 +- "add $0x8,%3\n" 1.924 +- "sub $0x2,%4\n" 1.925 +- "jns scaleloop\n" 1.926 +- 1.927 +-"scalenext:" 1.928 +- "add $0x1,%4\n" 1.929 +- "js scaledone\n" 1.930 +- 1.931 +- "mov %%r11,%%r10\n" 1.932 +- "sar $0x11,%%r10\n" 1.933 +- "movzb (%1,%%r10,1),%%rax\n" 1.934 +- "movq 2048(%5,%%rax,8),%%xmm0\n" 1.935 +- "movzb (%2,%%r10,1),%%rax\n" 1.936 +- "movq 4096(%5,%%rax,8),%%xmm1\n" 1.937 +- "paddsw %%xmm1,%%xmm0\n" 1.938 +- "sar $0x10,%%r11\n" 1.939 +- "movzb (%0,%%r11,1),%%rax\n" 1.940 +- "movq (%5,%%rax,8),%%xmm1\n" 1.941 +- "paddsw %%xmm0,%%xmm1\n" 1.942 +- "psraw $0x6,%%xmm1\n" 1.943 +- "packuswb %%xmm1,%%xmm1\n" 1.944 +- "movd %%xmm1,0x0(%3)\n" 1.945 +- 1.946 +-"scaledone:" 1.947 +- : 1.948 +- : "r"(y_buf), // %0 1.949 +- "r"(u_buf), // %1 1.950 +- "r"(v_buf), // %2 1.951 +- "r"(rgb_buf), // %3 1.952 +- "r"(width), // %4 1.953 +- "r" (kCoefficientsRgbY), // %5 1.954 +- "r"(static_cast<long>(source_dx)) // %6 1.955 +- : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2" 1.956 +-); 1.957 +-} 1.958 +- 1.959 +-void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.960 +- const uint8* u_buf, 1.961 +- const uint8* v_buf, 1.962 +- uint8* rgb_buf, 1.963 +- int width, 1.964 +- int source_dx) { 1.965 +- asm( 1.966 +- "xor %%r11,%%r11\n" // x = 0 1.967 +- "sub $0x2,%4\n" 1.968 +- "js .lscalenext\n" 1.969 +- "cmp $0x20000,%6\n" // if source_dx >= 2.0 1.970 +- "jl .lscalehalf\n" 1.971 +- "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less 1.972 +-".lscalehalf:" 1.973 +- 1.974 +-".lscaleloop:" 1.975 +- "mov %%r11,%%r10\n" 1.976 +- "sar $0x11,%%r10\n" 1.977 +- 1.978 +- "movzb (%1, %%r10, 1), %%r13 \n" 1.979 +- "movzb 1(%1, %%r10, 1), %%r14 \n" 1.980 +- "mov %%r11, %%rax \n" 1.981 +- "and $0x1fffe, %%rax \n" 1.982 +- "imul %%rax, %%r14 \n" 1.983 +- "xor $0x1fffe, %%rax \n" 1.984 +- "imul %%rax, %%r13 \n" 1.985 +- "add %%r14, %%r13 \n" 1.986 +- "shr $17, %%r13 \n" 1.987 +- "movq 2048(%5,%%r13,8), %%xmm0\n" 1.988 +- 1.989 +- "movzb (%2, %%r10, 1), %%r13 \n" 1.990 +- "movzb 1(%2, %%r10, 1), %%r14 \n" 1.991 +- "mov %%r11, %%rax \n" 1.992 +- "and $0x1fffe, %%rax \n" 1.993 +- "imul %%rax, %%r14 \n" 1.994 +- "xor $0x1fffe, %%rax \n" 1.995 +- "imul %%rax, %%r13 \n" 1.996 +- "add %%r14, %%r13 \n" 1.997 +- "shr $17, %%r13 \n" 1.998 +- "movq 4096(%5,%%r13,8), %%xmm1\n" 1.999 +- 1.1000 +- "mov %%r11, %%rax \n" 1.1001 +- "lea (%%r11,%6),%%r10\n" 1.1002 +- "sar $0x10,%%r11\n" 1.1003 +- "paddsw %%xmm1,%%xmm0\n" 1.1004 +- 1.1005 +- "movzb (%0, %%r11, 1), %%r13 \n" 1.1006 +- "movzb 1(%0, %%r11, 1), %%r14 \n" 1.1007 +- "and 
$0xffff, %%rax \n" 1.1008 +- "imul %%rax, %%r14 \n" 1.1009 +- "xor $0xffff, %%rax \n" 1.1010 +- "imul %%rax, %%r13 \n" 1.1011 +- "add %%r14, %%r13 \n" 1.1012 +- "shr $16, %%r13 \n" 1.1013 +- "movq (%5,%%r13,8),%%xmm1\n" 1.1014 +- 1.1015 +- "mov %%r10, %%rax \n" 1.1016 +- "lea (%%r10,%6),%%r11\n" 1.1017 +- "sar $0x10,%%r10\n" 1.1018 +- 1.1019 +- "movzb (%0,%%r10,1), %%r13 \n" 1.1020 +- "movzb 1(%0,%%r10,1), %%r14 \n" 1.1021 +- "and $0xffff, %%rax \n" 1.1022 +- "imul %%rax, %%r14 \n" 1.1023 +- "xor $0xffff, %%rax \n" 1.1024 +- "imul %%rax, %%r13 \n" 1.1025 +- "add %%r14, %%r13 \n" 1.1026 +- "shr $16, %%r13 \n" 1.1027 +- "movq (%5,%%r13,8),%%xmm2\n" 1.1028 +- 1.1029 +- "paddsw %%xmm0,%%xmm1\n" 1.1030 +- "paddsw %%xmm0,%%xmm2\n" 1.1031 +- "shufps $0x44,%%xmm2,%%xmm1\n" 1.1032 +- "psraw $0x6,%%xmm1\n" 1.1033 +- "packuswb %%xmm1,%%xmm1\n" 1.1034 +- "movq %%xmm1,0x0(%3)\n" 1.1035 +- "add $0x8,%3\n" 1.1036 +- "sub $0x2,%4\n" 1.1037 +- "jns .lscaleloop\n" 1.1038 +- 1.1039 +-".lscalenext:" 1.1040 +- "add $0x1,%4\n" 1.1041 +- "js .lscaledone\n" 1.1042 +- 1.1043 +- "mov %%r11,%%r10\n" 1.1044 +- "sar $0x11,%%r10\n" 1.1045 +- 1.1046 +- "movzb (%1,%%r10,1), %%r13 \n" 1.1047 +- "movq 2048(%5,%%r13,8),%%xmm0\n" 1.1048 +- 1.1049 +- "movzb (%2,%%r10,1), %%r13 \n" 1.1050 +- "movq 4096(%5,%%r13,8),%%xmm1\n" 1.1051 +- 1.1052 +- "paddsw %%xmm1,%%xmm0\n" 1.1053 +- "sar $0x10,%%r11\n" 1.1054 +- 1.1055 +- "movzb (%0,%%r11,1), %%r13 \n" 1.1056 +- "movq (%5,%%r13,8),%%xmm1\n" 1.1057 +- 1.1058 +- "paddsw %%xmm0,%%xmm1\n" 1.1059 +- "psraw $0x6,%%xmm1\n" 1.1060 +- "packuswb %%xmm1,%%xmm1\n" 1.1061 +- "movd %%xmm1,0x0(%3)\n" 1.1062 +- 1.1063 +-".lscaledone:" 1.1064 +- : 1.1065 +- : "r"(y_buf), // %0 1.1066 +- "r"(u_buf), // %1 1.1067 +- "r"(v_buf), // %2 1.1068 +- "r"(rgb_buf), // %3 1.1069 +- "r"(width), // %4 1.1070 +- "r" (kCoefficientsRgbY), // %5 1.1071 +- "r"(static_cast<long>(source_dx)) // %6 1.1072 +- : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" 1.1073 +-); 1.1074 +-} 1.1075 +- 1.1076 +-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__) 1.1077 +- 1.1078 +-// PIC version is slower because less registers are available, so 1.1079 +-// non-PIC is used on platforms where it is possible. 
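A detail that is easy to miss both in the deleted assembly above and in LinearScaleYUVToRGB32Row_C later in this file: the bilinear weight pair is formed as (frac, frac ^ 65535) rather than (frac, 65536 - frac); the chroma path does the same with the 17-bit mask 0x1fffe. The xor is the one's complement, so the two weights sum to 65535 instead of 65536 — off by at most one least-significant bit of the 8-bit result, and it lets the code reuse the register holding the fraction instead of computing a separate subtraction. The check below is a standalone sketch, not part of the patch.

#include <cassert>
#include <cstdio>

int main() {
  // y_frac is the low 16 bits of a 16.16 fixed-point position, as in
  // LinearScaleYUVToRGB32Row_C.
  for (int y0 = 0; y0 < 256; y0 += 51) {
    for (int y1 = 0; y1 < 256; y1 += 51) {
      for (int y_frac = 0; y_frac <= 65535; y_frac += 4369) {
        int approx = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
        int exact  = (y_frac * y1 + (65536 - y_frac) * y0) >> 16;
        assert(approx == exact || approx == exact - 1);
      }
    }
  }
  printf("xor-weight blend matches the exact blend to within 1 LSB\n");
  return 0;
}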
1.1080 +- 1.1081 +-void FastConvertYUVToRGB32Row(const uint8* y_buf, 1.1082 +- const uint8* u_buf, 1.1083 +- const uint8* v_buf, 1.1084 +- uint8* rgb_buf, 1.1085 +- int width); 1.1086 +- asm( 1.1087 +- ".text\n" 1.1088 +- ".global FastConvertYUVToRGB32Row\n" 1.1089 +-"FastConvertYUVToRGB32Row:\n" 1.1090 +- "pusha\n" 1.1091 +- "mov 0x24(%esp),%edx\n" 1.1092 +- "mov 0x28(%esp),%edi\n" 1.1093 +- "mov 0x2c(%esp),%esi\n" 1.1094 +- "mov 0x30(%esp),%ebp\n" 1.1095 +- "mov 0x34(%esp),%ecx\n" 1.1096 +- "jmp convertend\n" 1.1097 +- 1.1098 +-"convertloop:" 1.1099 +- "movzbl (%edi),%eax\n" 1.1100 +- "add $0x1,%edi\n" 1.1101 +- "movzbl (%esi),%ebx\n" 1.1102 +- "add $0x1,%esi\n" 1.1103 +- "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" 1.1104 +- "movzbl (%edx),%eax\n" 1.1105 +- "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" 1.1106 +- "movzbl 0x1(%edx),%ebx\n" 1.1107 +- "movq kCoefficientsRgbY(,%eax,8),%mm1\n" 1.1108 +- "add $0x2,%edx\n" 1.1109 +- "movq kCoefficientsRgbY(,%ebx,8),%mm2\n" 1.1110 +- "paddsw %mm0,%mm1\n" 1.1111 +- "paddsw %mm0,%mm2\n" 1.1112 +- "psraw $0x6,%mm1\n" 1.1113 +- "psraw $0x6,%mm2\n" 1.1114 +- "packuswb %mm2,%mm1\n" 1.1115 +- "movntq %mm1,0x0(%ebp)\n" 1.1116 +- "add $0x8,%ebp\n" 1.1117 +-"convertend:" 1.1118 +- "sub $0x2,%ecx\n" 1.1119 +- "jns convertloop\n" 1.1120 +- 1.1121 +- "and $0x1,%ecx\n" 1.1122 +- "je convertdone\n" 1.1123 +- 1.1124 +- "movzbl (%edi),%eax\n" 1.1125 +- "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" 1.1126 +- "movzbl (%esi),%eax\n" 1.1127 +- "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" 1.1128 +- "movzbl (%edx),%eax\n" 1.1129 +- "movq kCoefficientsRgbY(,%eax,8),%mm1\n" 1.1130 +- "paddsw %mm0,%mm1\n" 1.1131 +- "psraw $0x6,%mm1\n" 1.1132 +- "packuswb %mm1,%mm1\n" 1.1133 +- "movd %mm1,0x0(%ebp)\n" 1.1134 +-"convertdone:" 1.1135 +- "popa\n" 1.1136 +- "ret\n" 1.1137 +-); 1.1138 +- 1.1139 +- 1.1140 +-void ScaleYUVToRGB32Row(const uint8* y_buf, 1.1141 +- const uint8* u_buf, 1.1142 +- const uint8* v_buf, 1.1143 +- uint8* rgb_buf, 1.1144 +- int width, 1.1145 +- int source_dx); 1.1146 +- asm( 1.1147 +- ".text\n" 1.1148 +- ".global ScaleYUVToRGB32Row\n" 1.1149 +-"ScaleYUVToRGB32Row:\n" 1.1150 +- "pusha\n" 1.1151 +- "mov 0x24(%esp),%edx\n" 1.1152 +- "mov 0x28(%esp),%edi\n" 1.1153 +- "mov 0x2c(%esp),%esi\n" 1.1154 +- "mov 0x30(%esp),%ebp\n" 1.1155 +- "mov 0x34(%esp),%ecx\n" 1.1156 +- "xor %ebx,%ebx\n" 1.1157 +- "jmp scaleend\n" 1.1158 +- 1.1159 +-"scaleloop:" 1.1160 +- "mov %ebx,%eax\n" 1.1161 +- "sar $0x11,%eax\n" 1.1162 +- "movzbl (%edi,%eax,1),%eax\n" 1.1163 +- "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" 1.1164 +- "mov %ebx,%eax\n" 1.1165 +- "sar $0x11,%eax\n" 1.1166 +- "movzbl (%esi,%eax,1),%eax\n" 1.1167 +- "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" 1.1168 +- "mov %ebx,%eax\n" 1.1169 +- "add 0x38(%esp),%ebx\n" 1.1170 +- "sar $0x10,%eax\n" 1.1171 +- "movzbl (%edx,%eax,1),%eax\n" 1.1172 +- "movq kCoefficientsRgbY(,%eax,8),%mm1\n" 1.1173 +- "mov %ebx,%eax\n" 1.1174 +- "add 0x38(%esp),%ebx\n" 1.1175 +- "sar $0x10,%eax\n" 1.1176 +- "movzbl (%edx,%eax,1),%eax\n" 1.1177 +- "movq kCoefficientsRgbY(,%eax,8),%mm2\n" 1.1178 +- "paddsw %mm0,%mm1\n" 1.1179 +- "paddsw %mm0,%mm2\n" 1.1180 +- "psraw $0x6,%mm1\n" 1.1181 +- "psraw $0x6,%mm2\n" 1.1182 +- "packuswb %mm2,%mm1\n" 1.1183 +- "movntq %mm1,0x0(%ebp)\n" 1.1184 +- "add $0x8,%ebp\n" 1.1185 +-"scaleend:" 1.1186 +- "sub $0x2,%ecx\n" 1.1187 +- "jns scaleloop\n" 1.1188 +- 1.1189 +- "and $0x1,%ecx\n" 1.1190 +- "je scaledone\n" 1.1191 +- 1.1192 +- "mov %ebx,%eax\n" 1.1193 +- "sar $0x11,%eax\n" 1.1194 +- "movzbl 
(%edi,%eax,1),%eax\n" 1.1195 +- "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" 1.1196 +- "mov %ebx,%eax\n" 1.1197 +- "sar $0x11,%eax\n" 1.1198 +- "movzbl (%esi,%eax,1),%eax\n" 1.1199 +- "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" 1.1200 +- "mov %ebx,%eax\n" 1.1201 +- "sar $0x10,%eax\n" 1.1202 +- "movzbl (%edx,%eax,1),%eax\n" 1.1203 +- "movq kCoefficientsRgbY(,%eax,8),%mm1\n" 1.1204 +- "paddsw %mm0,%mm1\n" 1.1205 +- "psraw $0x6,%mm1\n" 1.1206 +- "packuswb %mm1,%mm1\n" 1.1207 +- "movd %mm1,0x0(%ebp)\n" 1.1208 +- 1.1209 +-"scaledone:" 1.1210 +- "popa\n" 1.1211 +- "ret\n" 1.1212 +-); 1.1213 +- 1.1214 +-void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.1215 +- const uint8* u_buf, 1.1216 +- const uint8* v_buf, 1.1217 +- uint8* rgb_buf, 1.1218 +- int width, 1.1219 +- int source_dx); 1.1220 +- asm( 1.1221 +- ".text\n" 1.1222 +- ".global LinearScaleYUVToRGB32Row\n" 1.1223 +-"LinearScaleYUVToRGB32Row:\n" 1.1224 +- "pusha\n" 1.1225 +- "mov 0x24(%esp),%edx\n" 1.1226 +- "mov 0x28(%esp),%edi\n" 1.1227 +- "mov 0x30(%esp),%ebp\n" 1.1228 +- 1.1229 +- // source_width = width * source_dx + ebx 1.1230 +- "mov 0x34(%esp), %ecx\n" 1.1231 +- "imull 0x38(%esp), %ecx\n" 1.1232 +- "mov %ecx, 0x34(%esp)\n" 1.1233 +- 1.1234 +- "mov 0x38(%esp), %ecx\n" 1.1235 +- "xor %ebx,%ebx\n" // x = 0 1.1236 +- "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 1.1237 +- "jl .lscaleend\n" 1.1238 +- "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less 1.1239 +- "jmp .lscaleend\n" 1.1240 +- 1.1241 +-".lscaleloop:" 1.1242 +- "mov %ebx,%eax\n" 1.1243 +- "sar $0x11,%eax\n" 1.1244 +- 1.1245 +- "movzbl (%edi,%eax,1),%ecx\n" 1.1246 +- "movzbl 1(%edi,%eax,1),%esi\n" 1.1247 +- "mov %ebx,%eax\n" 1.1248 +- "andl $0x1fffe, %eax \n" 1.1249 +- "imul %eax, %esi \n" 1.1250 +- "xorl $0x1fffe, %eax \n" 1.1251 +- "imul %eax, %ecx \n" 1.1252 +- "addl %esi, %ecx \n" 1.1253 +- "shrl $17, %ecx \n" 1.1254 +- "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n" 1.1255 +- 1.1256 +- "mov 0x2c(%esp),%esi\n" 1.1257 +- "mov %ebx,%eax\n" 1.1258 +- "sar $0x11,%eax\n" 1.1259 +- 1.1260 +- "movzbl (%esi,%eax,1),%ecx\n" 1.1261 +- "movzbl 1(%esi,%eax,1),%esi\n" 1.1262 +- "mov %ebx,%eax\n" 1.1263 +- "andl $0x1fffe, %eax \n" 1.1264 +- "imul %eax, %esi \n" 1.1265 +- "xorl $0x1fffe, %eax \n" 1.1266 +- "imul %eax, %ecx \n" 1.1267 +- "addl %esi, %ecx \n" 1.1268 +- "shrl $17, %ecx \n" 1.1269 +- "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n" 1.1270 +- 1.1271 +- "mov %ebx,%eax\n" 1.1272 +- "sar $0x10,%eax\n" 1.1273 +- "movzbl (%edx,%eax,1),%ecx\n" 1.1274 +- "movzbl 1(%edx,%eax,1),%esi\n" 1.1275 +- "mov %ebx,%eax\n" 1.1276 +- "add 0x38(%esp),%ebx\n" 1.1277 +- "andl $0xffff, %eax \n" 1.1278 +- "imul %eax, %esi \n" 1.1279 +- "xorl $0xffff, %eax \n" 1.1280 +- "imul %eax, %ecx \n" 1.1281 +- "addl %esi, %ecx \n" 1.1282 +- "shrl $16, %ecx \n" 1.1283 +- "movq kCoefficientsRgbY(,%ecx,8),%mm1\n" 1.1284 +- 1.1285 +- "cmp 0x34(%esp), %ebx\n" 1.1286 +- "jge .lscalelastpixel\n" 1.1287 +- 1.1288 +- "mov %ebx,%eax\n" 1.1289 +- "sar $0x10,%eax\n" 1.1290 +- "movzbl (%edx,%eax,1),%ecx\n" 1.1291 +- "movzbl 1(%edx,%eax,1),%esi\n" 1.1292 +- "mov %ebx,%eax\n" 1.1293 +- "add 0x38(%esp),%ebx\n" 1.1294 +- "andl $0xffff, %eax \n" 1.1295 +- "imul %eax, %esi \n" 1.1296 +- "xorl $0xffff, %eax \n" 1.1297 +- "imul %eax, %ecx \n" 1.1298 +- "addl %esi, %ecx \n" 1.1299 +- "shrl $16, %ecx \n" 1.1300 +- "movq kCoefficientsRgbY(,%ecx,8),%mm2\n" 1.1301 +- 1.1302 +- "paddsw %mm0,%mm1\n" 1.1303 +- "paddsw %mm0,%mm2\n" 1.1304 +- "psraw $0x6,%mm1\n" 1.1305 +- "psraw $0x6,%mm2\n" 1.1306 +- "packuswb %mm2,%mm1\n" 1.1307 +- "movntq 
%mm1,0x0(%ebp)\n" 1.1308 +- "add $0x8,%ebp\n" 1.1309 +- 1.1310 +-".lscaleend:" 1.1311 +- "cmp 0x34(%esp), %ebx\n" 1.1312 +- "jl .lscaleloop\n" 1.1313 +- "popa\n" 1.1314 +- "ret\n" 1.1315 +- 1.1316 +-".lscalelastpixel:" 1.1317 +- "paddsw %mm0, %mm1\n" 1.1318 +- "psraw $6, %mm1\n" 1.1319 +- "packuswb %mm1, %mm1\n" 1.1320 +- "movd %mm1, (%ebp)\n" 1.1321 +- "popa\n" 1.1322 +- "ret\n" 1.1323 +-); 1.1324 +- 1.1325 +-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__) 1.1326 +- 1.1327 +-extern void PICConvertYUVToRGB32Row(const uint8* y_buf, 1.1328 +- const uint8* u_buf, 1.1329 +- const uint8* v_buf, 1.1330 +- uint8* rgb_buf, 1.1331 +- int width, 1.1332 +- int16 *kCoefficientsRgbY); 1.1333 +- asm( 1.1334 +- ".text\n" 1.1335 +-#if defined(OS_MACOSX) 1.1336 +-"_PICConvertYUVToRGB32Row:\n" 1.1337 +-#else 1.1338 +-"PICConvertYUVToRGB32Row:\n" 1.1339 +-#endif 1.1340 +- "pusha\n" 1.1341 +- "mov 0x24(%esp),%edx\n" 1.1342 +- "mov 0x28(%esp),%edi\n" 1.1343 +- "mov 0x2c(%esp),%esi\n" 1.1344 +- "mov 0x30(%esp),%ebp\n" 1.1345 +- "mov 0x38(%esp),%ecx\n" 1.1346 +- 1.1347 +- "jmp .Lconvertend\n" 1.1348 +- 1.1349 +-".Lconvertloop:" 1.1350 +- "movzbl (%edi),%eax\n" 1.1351 +- "add $0x1,%edi\n" 1.1352 +- "movzbl (%esi),%ebx\n" 1.1353 +- "add $0x1,%esi\n" 1.1354 +- "movq 2048(%ecx,%eax,8),%mm0\n" 1.1355 +- "movzbl (%edx),%eax\n" 1.1356 +- "paddsw 4096(%ecx,%ebx,8),%mm0\n" 1.1357 +- "movzbl 0x1(%edx),%ebx\n" 1.1358 +- "movq 0(%ecx,%eax,8),%mm1\n" 1.1359 +- "add $0x2,%edx\n" 1.1360 +- "movq 0(%ecx,%ebx,8),%mm2\n" 1.1361 +- "paddsw %mm0,%mm1\n" 1.1362 +- "paddsw %mm0,%mm2\n" 1.1363 +- "psraw $0x6,%mm1\n" 1.1364 +- "psraw $0x6,%mm2\n" 1.1365 +- "packuswb %mm2,%mm1\n" 1.1366 +- "movntq %mm1,0x0(%ebp)\n" 1.1367 +- "add $0x8,%ebp\n" 1.1368 +-".Lconvertend:" 1.1369 +- "subl $0x2,0x34(%esp)\n" 1.1370 +- "jns .Lconvertloop\n" 1.1371 +- 1.1372 +- "andl $0x1,0x34(%esp)\n" 1.1373 +- "je .Lconvertdone\n" 1.1374 +- 1.1375 +- "movzbl (%edi),%eax\n" 1.1376 +- "movq 2048(%ecx,%eax,8),%mm0\n" 1.1377 +- "movzbl (%esi),%eax\n" 1.1378 +- "paddsw 4096(%ecx,%eax,8),%mm0\n" 1.1379 +- "movzbl (%edx),%eax\n" 1.1380 +- "movq 0(%ecx,%eax,8),%mm1\n" 1.1381 +- "paddsw %mm0,%mm1\n" 1.1382 +- "psraw $0x6,%mm1\n" 1.1383 +- "packuswb %mm1,%mm1\n" 1.1384 +- "movd %mm1,0x0(%ebp)\n" 1.1385 +-".Lconvertdone:\n" 1.1386 +- "popa\n" 1.1387 +- "ret\n" 1.1388 +-); 1.1389 +- 1.1390 +-void FastConvertYUVToRGB32Row(const uint8* y_buf, 1.1391 +- const uint8* u_buf, 1.1392 +- const uint8* v_buf, 1.1393 +- uint8* rgb_buf, 1.1394 +- int width) { 1.1395 +- PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, 1.1396 +- &kCoefficientsRgbY[0][0]); 1.1397 +-} 1.1398 +- 1.1399 +-extern void PICScaleYUVToRGB32Row(const uint8* y_buf, 1.1400 +- const uint8* u_buf, 1.1401 +- const uint8* v_buf, 1.1402 +- uint8* rgb_buf, 1.1403 +- int width, 1.1404 +- int source_dx, 1.1405 +- int16 *kCoefficientsRgbY); 1.1406 +- 1.1407 +- asm( 1.1408 +- ".text\n" 1.1409 +-#if defined(OS_MACOSX) 1.1410 +-"_PICScaleYUVToRGB32Row:\n" 1.1411 +-#else 1.1412 +-"PICScaleYUVToRGB32Row:\n" 1.1413 +-#endif 1.1414 +- "pusha\n" 1.1415 +- "mov 0x24(%esp),%edx\n" 1.1416 +- "mov 0x28(%esp),%edi\n" 1.1417 +- "mov 0x2c(%esp),%esi\n" 1.1418 +- "mov 0x30(%esp),%ebp\n" 1.1419 +- "mov 0x3c(%esp),%ecx\n" 1.1420 +- "xor %ebx,%ebx\n" 1.1421 +- "jmp Lscaleend\n" 1.1422 +- 1.1423 +-"Lscaleloop:" 1.1424 +- "mov %ebx,%eax\n" 1.1425 +- "sar $0x11,%eax\n" 1.1426 +- "movzbl (%edi,%eax,1),%eax\n" 1.1427 +- "movq 2048(%ecx,%eax,8),%mm0\n" 1.1428 +- "mov %ebx,%eax\n" 1.1429 +- "sar $0x11,%eax\n" 1.1430 +- 
"movzbl (%esi,%eax,1),%eax\n" 1.1431 +- "paddsw 4096(%ecx,%eax,8),%mm0\n" 1.1432 +- "mov %ebx,%eax\n" 1.1433 +- "add 0x38(%esp),%ebx\n" 1.1434 +- "sar $0x10,%eax\n" 1.1435 +- "movzbl (%edx,%eax,1),%eax\n" 1.1436 +- "movq 0(%ecx,%eax,8),%mm1\n" 1.1437 +- "mov %ebx,%eax\n" 1.1438 +- "add 0x38(%esp),%ebx\n" 1.1439 +- "sar $0x10,%eax\n" 1.1440 +- "movzbl (%edx,%eax,1),%eax\n" 1.1441 +- "movq 0(%ecx,%eax,8),%mm2\n" 1.1442 +- "paddsw %mm0,%mm1\n" 1.1443 +- "paddsw %mm0,%mm2\n" 1.1444 +- "psraw $0x6,%mm1\n" 1.1445 +- "psraw $0x6,%mm2\n" 1.1446 +- "packuswb %mm2,%mm1\n" 1.1447 +- "movntq %mm1,0x0(%ebp)\n" 1.1448 +- "add $0x8,%ebp\n" 1.1449 +-"Lscaleend:" 1.1450 +- "subl $0x2,0x34(%esp)\n" 1.1451 +- "jns Lscaleloop\n" 1.1452 +- 1.1453 +- "andl $0x1,0x34(%esp)\n" 1.1454 +- "je Lscaledone\n" 1.1455 +- 1.1456 +- "mov %ebx,%eax\n" 1.1457 +- "sar $0x11,%eax\n" 1.1458 +- "movzbl (%edi,%eax,1),%eax\n" 1.1459 +- "movq 2048(%ecx,%eax,8),%mm0\n" 1.1460 +- "mov %ebx,%eax\n" 1.1461 +- "sar $0x11,%eax\n" 1.1462 +- "movzbl (%esi,%eax,1),%eax\n" 1.1463 +- "paddsw 4096(%ecx,%eax,8),%mm0\n" 1.1464 +- "mov %ebx,%eax\n" 1.1465 +- "sar $0x10,%eax\n" 1.1466 +- "movzbl (%edx,%eax,1),%eax\n" 1.1467 +- "movq 0(%ecx,%eax,8),%mm1\n" 1.1468 +- "paddsw %mm0,%mm1\n" 1.1469 +- "psraw $0x6,%mm1\n" 1.1470 +- "packuswb %mm1,%mm1\n" 1.1471 +- "movd %mm1,0x0(%ebp)\n" 1.1472 +- 1.1473 +-"Lscaledone:" 1.1474 +- "popa\n" 1.1475 +- "ret\n" 1.1476 +-); 1.1477 +- 1.1478 +- 1.1479 +-void ScaleYUVToRGB32Row(const uint8* y_buf, 1.1480 +- const uint8* u_buf, 1.1481 +- const uint8* v_buf, 1.1482 +- uint8* rgb_buf, 1.1483 +- int width, 1.1484 +- int source_dx) { 1.1485 +- PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, 1.1486 +- &kCoefficientsRgbY[0][0]); 1.1487 +-} 1.1488 +- 1.1489 +-void PICLinearScaleYUVToRGB32Row(const uint8* y_buf, 1.1490 +- const uint8* u_buf, 1.1491 +- const uint8* v_buf, 1.1492 +- uint8* rgb_buf, 1.1493 +- int width, 1.1494 +- int source_dx, 1.1495 +- int16 *kCoefficientsRgbY); 1.1496 +- asm( 1.1497 +- ".text\n" 1.1498 +-#if defined(OS_MACOSX) 1.1499 +-"_PICLinearScaleYUVToRGB32Row:\n" 1.1500 +-#else 1.1501 +-"PICLinearScaleYUVToRGB32Row:\n" 1.1502 +-#endif 1.1503 +- "pusha\n" 1.1504 +- "mov 0x24(%esp),%edx\n" 1.1505 +- "mov 0x30(%esp),%ebp\n" 1.1506 +- "mov 0x34(%esp),%ecx\n" 1.1507 +- "mov 0x3c(%esp),%edi\n" 1.1508 +- "xor %ebx,%ebx\n" 1.1509 +- 1.1510 +- // source_width = width * source_dx + ebx 1.1511 +- "mov 0x34(%esp), %ecx\n" 1.1512 +- "imull 0x38(%esp), %ecx\n" 1.1513 +- "mov %ecx, 0x34(%esp)\n" 1.1514 +- 1.1515 +- "mov 0x38(%esp), %ecx\n" 1.1516 +- "xor %ebx,%ebx\n" // x = 0 1.1517 +- "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 1.1518 +- "jl .lscaleend\n" 1.1519 +- "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less 1.1520 +- "jmp .lscaleend\n" 1.1521 +- 1.1522 +-".lscaleloop:" 1.1523 +- "mov 0x28(%esp),%esi\n" 1.1524 +- "mov %ebx,%eax\n" 1.1525 +- "sar $0x11,%eax\n" 1.1526 +- 1.1527 +- "movzbl (%esi,%eax,1),%ecx\n" 1.1528 +- "movzbl 1(%esi,%eax,1),%esi\n" 1.1529 +- "mov %ebx,%eax\n" 1.1530 +- "andl $0x1fffe, %eax \n" 1.1531 +- "imul %eax, %esi \n" 1.1532 +- "xorl $0x1fffe, %eax \n" 1.1533 +- "imul %eax, %ecx \n" 1.1534 +- "addl %esi, %ecx \n" 1.1535 +- "shrl $17, %ecx \n" 1.1536 +- "movq 2048(%edi,%ecx,8),%mm0\n" 1.1537 +- 1.1538 +- "mov 0x2c(%esp),%esi\n" 1.1539 +- "mov %ebx,%eax\n" 1.1540 +- "sar $0x11,%eax\n" 1.1541 +- 1.1542 +- "movzbl (%esi,%eax,1),%ecx\n" 1.1543 +- "movzbl 1(%esi,%eax,1),%esi\n" 1.1544 +- "mov %ebx,%eax\n" 1.1545 +- "andl $0x1fffe, %eax \n" 1.1546 +- "imul %eax, %esi 
\n" 1.1547 +- "xorl $0x1fffe, %eax \n" 1.1548 +- "imul %eax, %ecx \n" 1.1549 +- "addl %esi, %ecx \n" 1.1550 +- "shrl $17, %ecx \n" 1.1551 +- "paddsw 4096(%edi,%ecx,8),%mm0\n" 1.1552 +- 1.1553 +- "mov %ebx,%eax\n" 1.1554 +- "sar $0x10,%eax\n" 1.1555 +- "movzbl (%edx,%eax,1),%ecx\n" 1.1556 +- "movzbl 1(%edx,%eax,1),%esi\n" 1.1557 +- "mov %ebx,%eax\n" 1.1558 +- "add 0x38(%esp),%ebx\n" 1.1559 +- "andl $0xffff, %eax \n" 1.1560 +- "imul %eax, %esi \n" 1.1561 +- "xorl $0xffff, %eax \n" 1.1562 +- "imul %eax, %ecx \n" 1.1563 +- "addl %esi, %ecx \n" 1.1564 +- "shrl $16, %ecx \n" 1.1565 +- "movq (%edi,%ecx,8),%mm1\n" 1.1566 +- 1.1567 +- "cmp 0x34(%esp), %ebx\n" 1.1568 +- "jge .lscalelastpixel\n" 1.1569 +- 1.1570 +- "mov %ebx,%eax\n" 1.1571 +- "sar $0x10,%eax\n" 1.1572 +- "movzbl (%edx,%eax,1),%ecx\n" 1.1573 +- "movzbl 1(%edx,%eax,1),%esi\n" 1.1574 +- "mov %ebx,%eax\n" 1.1575 +- "add 0x38(%esp),%ebx\n" 1.1576 +- "andl $0xffff, %eax \n" 1.1577 +- "imul %eax, %esi \n" 1.1578 +- "xorl $0xffff, %eax \n" 1.1579 +- "imul %eax, %ecx \n" 1.1580 +- "addl %esi, %ecx \n" 1.1581 +- "shrl $16, %ecx \n" 1.1582 +- "movq (%edi,%ecx,8),%mm2\n" 1.1583 +- 1.1584 +- "paddsw %mm0,%mm1\n" 1.1585 +- "paddsw %mm0,%mm2\n" 1.1586 +- "psraw $0x6,%mm1\n" 1.1587 +- "psraw $0x6,%mm2\n" 1.1588 +- "packuswb %mm2,%mm1\n" 1.1589 +- "movntq %mm1,0x0(%ebp)\n" 1.1590 +- "add $0x8,%ebp\n" 1.1591 +- 1.1592 +-".lscaleend:" 1.1593 +- "cmp %ebx, 0x34(%esp)\n" 1.1594 +- "jg .lscaleloop\n" 1.1595 +- "popa\n" 1.1596 +- "ret\n" 1.1597 +- 1.1598 +-".lscalelastpixel:" 1.1599 +- "paddsw %mm0, %mm1\n" 1.1600 +- "psraw $6, %mm1\n" 1.1601 +- "packuswb %mm1, %mm1\n" 1.1602 +- "movd %mm1, (%ebp)\n" 1.1603 +- "popa\n" 1.1604 +- "ret\n" 1.1605 +-); 1.1606 +- 1.1607 +-void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.1608 +- const uint8* u_buf, 1.1609 +- const uint8* v_buf, 1.1610 +- uint8* rgb_buf, 1.1611 +- int width, 1.1612 +- int source_dx) { 1.1613 +- PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, 1.1614 +- &kCoefficientsRgbY[0][0]); 1.1615 +-} 1.1616 +- 1.1617 +-#else // USE_MMX 1.1618 +- 1.1619 + // C reference code that mimic the YUV assembly. 1.1620 + #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) 1.1621 + #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ 1.1622 + (((x) + (y)) > 32767 ? 
32767 : ((x) + (y)))) 1.1623 + 1.1624 + static inline void YuvPixel(uint8 y, 1.1625 + uint8 u, 1.1626 + uint8 v, 1.1627 +@@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y, 1.1628 + a >>= 6; 1.1629 + 1.1630 + *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) | 1.1631 + (packuswb(g) << 8) | 1.1632 + (packuswb(r) << 16) | 1.1633 + (packuswb(a) << 24); 1.1634 + } 1.1635 + 1.1636 +-void FastConvertYUVToRGB32Row(const uint8* y_buf, 1.1637 +- const uint8* u_buf, 1.1638 +- const uint8* v_buf, 1.1639 +- uint8* rgb_buf, 1.1640 +- int width) { 1.1641 ++void FastConvertYUVToRGB32Row_C(const uint8* y_buf, 1.1642 ++ const uint8* u_buf, 1.1643 ++ const uint8* v_buf, 1.1644 ++ uint8* rgb_buf, 1.1645 ++ int width, 1.1646 ++ unsigned int x_shift) { 1.1647 + for (int x = 0; x < width; x += 2) { 1.1648 +- uint8 u = u_buf[x >> 1]; 1.1649 +- uint8 v = v_buf[x >> 1]; 1.1650 ++ uint8 u = u_buf[x >> x_shift]; 1.1651 ++ uint8 v = v_buf[x >> x_shift]; 1.1652 + uint8 y0 = y_buf[x]; 1.1653 + YuvPixel(y0, u, v, rgb_buf); 1.1654 + if ((x + 1) < width) { 1.1655 + uint8 y1 = y_buf[x + 1]; 1.1656 ++ if (x_shift == 0) { 1.1657 ++ u = u_buf[x + 1]; 1.1658 ++ v = v_buf[x + 1]; 1.1659 ++ } 1.1660 + YuvPixel(y1, u, v, rgb_buf + 4); 1.1661 + } 1.1662 + rgb_buf += 8; // Advance 2 pixels. 1.1663 + } 1.1664 + } 1.1665 + 1.1666 + // 16.16 fixed point is used. A shift by 16 isolates the integer. 1.1667 + // A shift by 17 is used to further subsample the chrominence channels. 1.1668 + // & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits, 1.1669 + // for 1/65536 pixel accurate interpolation. 1.1670 +-void ScaleYUVToRGB32Row(const uint8* y_buf, 1.1671 +- const uint8* u_buf, 1.1672 +- const uint8* v_buf, 1.1673 +- uint8* rgb_buf, 1.1674 +- int width, 1.1675 +- int source_dx) { 1.1676 ++void ScaleYUVToRGB32Row_C(const uint8* y_buf, 1.1677 ++ const uint8* u_buf, 1.1678 ++ const uint8* v_buf, 1.1679 ++ uint8* rgb_buf, 1.1680 ++ int width, 1.1681 ++ int source_dx) { 1.1682 + int x = 0; 1.1683 + for (int i = 0; i < width; i += 2) { 1.1684 + int y = y_buf[x >> 16]; 1.1685 + int u = u_buf[(x >> 17)]; 1.1686 + int v = v_buf[(x >> 17)]; 1.1687 + YuvPixel(y, u, v, rgb_buf); 1.1688 + x += source_dx; 1.1689 + if ((i + 1) < width) { 1.1690 + y = y_buf[x >> 16]; 1.1691 + YuvPixel(y, u, v, rgb_buf+4); 1.1692 + x += source_dx; 1.1693 + } 1.1694 + rgb_buf += 8; 1.1695 + } 1.1696 + } 1.1697 + 1.1698 +-void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.1699 +- const uint8* u_buf, 1.1700 +- const uint8* v_buf, 1.1701 +- uint8* rgb_buf, 1.1702 +- int width, 1.1703 +- int source_dx) { 1.1704 ++void LinearScaleYUVToRGB32Row_C(const uint8* y_buf, 1.1705 ++ const uint8* u_buf, 1.1706 ++ const uint8* v_buf, 1.1707 ++ uint8* rgb_buf, 1.1708 ++ int width, 1.1709 ++ int source_dx) { 1.1710 + int x = 0; 1.1711 + if (source_dx >= 0x20000) { 1.1712 + x = 32768; 1.1713 + } 1.1714 + for (int i = 0; i < width; i += 2) { 1.1715 + int y0 = y_buf[x >> 16]; 1.1716 + int y1 = y_buf[(x >> 16) + 1]; 1.1717 + int u0 = u_buf[(x >> 17)]; 1.1718 +@@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint 1.1719 + y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; 1.1720 + YuvPixel(y, u, v, rgb_buf+4); 1.1721 + x += source_dx; 1.1722 + } 1.1723 + rgb_buf += 8; 1.1724 + } 1.1725 + } 1.1726 + 1.1727 +-#endif // USE_MMX 1.1728 + } // extern "C" 1.1729 + 1.1730 +diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp 1.1731 +--- a/gfx/ycbcr/yuv_row_posix.cpp 1.1732 ++++ b/gfx/ycbcr/yuv_row_posix.cpp 1.1733 +@@ -1,33 +1,32 @@ 1.1734 + // 
Copyright (c) 2010 The Chromium Authors. All rights reserved. 1.1735 + // Use of this source code is governed by a BSD-style license that can be 1.1736 + // found in the LICENSE file. 1.1737 + 1.1738 +-#include "media/base/yuv_row.h" 1.1739 +- 1.1740 +-#ifdef _DEBUG 1.1741 +-#include "base/logging.h" 1.1742 +-#else 1.1743 ++#include "yuv_row.h" 1.1744 ++#include "mozilla/SSE.h" 1.1745 ++ 1.1746 + #define DCHECK(a) 1.1747 +-#endif 1.1748 + 1.1749 + extern "C" { 1.1750 + 1.1751 +-#if USE_SSE2 && defined(ARCH_CPU_X86_64) 1.1752 ++#if defined(ARCH_CPU_X86_64) 1.1753 ++ 1.1754 ++// We don't need CPUID guards here, since x86-64 implies SSE2. 1.1755 + 1.1756 + // AMD64 ABI uses register paremters. 1.1757 + void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi 1.1758 + const uint8* u_buf, // rsi 1.1759 + const uint8* v_buf, // rdx 1.1760 + uint8* rgb_buf, // rcx 1.1761 + int width) { // r8 1.1762 + asm( 1.1763 +- "jmp convertend\n" 1.1764 +-"convertloop:" 1.1765 ++ "jmp 1f\n" 1.1766 ++"0:" 1.1767 + "movzb (%1),%%r10\n" 1.1768 + "add $0x1,%1\n" 1.1769 + "movzb (%2),%%r11\n" 1.1770 + "add $0x1,%2\n" 1.1771 + "movq 2048(%5,%%r10,8),%%xmm0\n" 1.1772 + "movzb (%0),%%r10\n" 1.1773 + "movq 4096(%5,%%r11,8),%%xmm1\n" 1.1774 + "movzb 0x1(%0),%%r11\n" 1.1775 +@@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint 1.1776 + "movq (%5,%%r11,8),%%xmm3\n" 1.1777 + "paddsw %%xmm0,%%xmm2\n" 1.1778 + "paddsw %%xmm0,%%xmm3\n" 1.1779 + "shufps $0x44,%%xmm3,%%xmm2\n" 1.1780 + "psraw $0x6,%%xmm2\n" 1.1781 + "packuswb %%xmm2,%%xmm2\n" 1.1782 + "movq %%xmm2,0x0(%3)\n" 1.1783 + "add $0x8,%3\n" 1.1784 +-"convertend:" 1.1785 ++"1:" 1.1786 + "sub $0x2,%4\n" 1.1787 +- "jns convertloop\n" 1.1788 +- 1.1789 +-"convertnext:" 1.1790 ++ "jns 0b\n" 1.1791 ++ 1.1792 ++"2:" 1.1793 + "add $0x1,%4\n" 1.1794 +- "js convertdone\n" 1.1795 ++ "js 3f\n" 1.1796 + 1.1797 + "movzb (%1),%%r10\n" 1.1798 + "movq 2048(%5,%%r10,8),%%xmm0\n" 1.1799 + "movzb (%2),%%r10\n" 1.1800 + "movq 4096(%5,%%r10,8),%%xmm1\n" 1.1801 + "paddsw %%xmm1,%%xmm0\n" 1.1802 + "movzb (%0),%%r10\n" 1.1803 + "movq (%5,%%r10,8),%%xmm1\n" 1.1804 + "paddsw %%xmm0,%%xmm1\n" 1.1805 + "psraw $0x6,%%xmm1\n" 1.1806 + "packuswb %%xmm1,%%xmm1\n" 1.1807 + "movd %%xmm1,0x0(%3)\n" 1.1808 +-"convertdone:" 1.1809 ++"3:" 1.1810 + : 1.1811 + : "r"(y_buf), // %0 1.1812 + "r"(u_buf), // %1 1.1813 + "r"(v_buf), // %2 1.1814 + "r"(rgb_buf), // %3 1.1815 + "r"(width), // %4 1.1816 + "r" (kCoefficientsRgbY) // %5 1.1817 + : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" 1.1818 +@@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b 1.1819 + const uint8* u_buf, // rsi 1.1820 + const uint8* v_buf, // rdx 1.1821 + uint8* rgb_buf, // rcx 1.1822 + int width, // r8 1.1823 + int source_dx) { // r9 1.1824 + asm( 1.1825 + "xor %%r11,%%r11\n" 1.1826 + "sub $0x2,%4\n" 1.1827 +- "js scalenext\n" 1.1828 +- 1.1829 +-"scaleloop:" 1.1830 ++ "js 1f\n" 1.1831 ++ 1.1832 ++"0:" 1.1833 + "mov %%r11,%%r10\n" 1.1834 + "sar $0x11,%%r10\n" 1.1835 + "movzb (%1,%%r10,1),%%rax\n" 1.1836 + "movq 2048(%5,%%rax,8),%%xmm0\n" 1.1837 + "movzb (%2,%%r10,1),%%rax\n" 1.1838 + "movq 4096(%5,%%rax,8),%%xmm1\n" 1.1839 + "lea (%%r11,%6),%%r10\n" 1.1840 + "sar $0x10,%%r11\n" 1.1841 +@@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b 1.1842 + "paddsw %%xmm0,%%xmm1\n" 1.1843 + "paddsw %%xmm0,%%xmm2\n" 1.1844 + "shufps $0x44,%%xmm2,%%xmm1\n" 1.1845 + "psraw $0x6,%%xmm1\n" 1.1846 + "packuswb %%xmm1,%%xmm1\n" 1.1847 + "movq %%xmm1,0x0(%3)\n" 1.1848 + "add $0x8,%3\n" 1.1849 + "sub $0x2,%4\n" 1.1850 +- "jns 
scaleloop\n" 1.1851 +- 1.1852 +-"scalenext:" 1.1853 ++ "jns 0b\n" 1.1854 ++ 1.1855 ++"1:" 1.1856 + "add $0x1,%4\n" 1.1857 +- "js scaledone\n" 1.1858 ++ "js 2f\n" 1.1859 + 1.1860 + "mov %%r11,%%r10\n" 1.1861 + "sar $0x11,%%r10\n" 1.1862 + "movzb (%1,%%r10,1),%%rax\n" 1.1863 + "movq 2048(%5,%%rax,8),%%xmm0\n" 1.1864 + "movzb (%2,%%r10,1),%%rax\n" 1.1865 + "movq 4096(%5,%%rax,8),%%xmm1\n" 1.1866 + "paddsw %%xmm1,%%xmm0\n" 1.1867 + "sar $0x10,%%r11\n" 1.1868 + "movzb (%0,%%r11,1),%%rax\n" 1.1869 + "movq (%5,%%rax,8),%%xmm1\n" 1.1870 + "paddsw %%xmm0,%%xmm1\n" 1.1871 + "psraw $0x6,%%xmm1\n" 1.1872 + "packuswb %%xmm1,%%xmm1\n" 1.1873 + "movd %%xmm1,0x0(%3)\n" 1.1874 + 1.1875 +-"scaledone:" 1.1876 ++"2:" 1.1877 + : 1.1878 + : "r"(y_buf), // %0 1.1879 + "r"(u_buf), // %1 1.1880 + "r"(v_buf), // %2 1.1881 + "r"(rgb_buf), // %3 1.1882 + "r"(width), // %4 1.1883 + "r" (kCoefficientsRgbY), // %5 1.1884 + "r"(static_cast<long>(source_dx)) // %6 1.1885 +@@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint 1.1886 + const uint8* u_buf, 1.1887 + const uint8* v_buf, 1.1888 + uint8* rgb_buf, 1.1889 + int width, 1.1890 + int source_dx) { 1.1891 + asm( 1.1892 + "xor %%r11,%%r11\n" // x = 0 1.1893 + "sub $0x2,%4\n" 1.1894 +- "js .lscalenext\n" 1.1895 ++ "js 2f\n" 1.1896 + "cmp $0x20000,%6\n" // if source_dx >= 2.0 1.1897 +- "jl .lscalehalf\n" 1.1898 ++ "jl 0f\n" 1.1899 + "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less 1.1900 +-".lscalehalf:" 1.1901 +- 1.1902 +-".lscaleloop:" 1.1903 ++"0:" 1.1904 ++ 1.1905 ++"1:" 1.1906 + "mov %%r11,%%r10\n" 1.1907 + "sar $0x11,%%r10\n" 1.1908 + 1.1909 + "movzb (%1, %%r10, 1), %%r13 \n" 1.1910 + "movzb 1(%1, %%r10, 1), %%r14 \n" 1.1911 + "mov %%r11, %%rax \n" 1.1912 + "and $0x1fffe, %%rax \n" 1.1913 + "imul %%rax, %%r14 \n" 1.1914 +@@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint 1.1915 + "paddsw %%xmm0,%%xmm1\n" 1.1916 + "paddsw %%xmm0,%%xmm2\n" 1.1917 + "shufps $0x44,%%xmm2,%%xmm1\n" 1.1918 + "psraw $0x6,%%xmm1\n" 1.1919 + "packuswb %%xmm1,%%xmm1\n" 1.1920 + "movq %%xmm1,0x0(%3)\n" 1.1921 + "add $0x8,%3\n" 1.1922 + "sub $0x2,%4\n" 1.1923 +- "jns .lscaleloop\n" 1.1924 +- 1.1925 +-".lscalenext:" 1.1926 ++ "jns 1b\n" 1.1927 ++ 1.1928 ++"2:" 1.1929 + "add $0x1,%4\n" 1.1930 +- "js .lscaledone\n" 1.1931 ++ "js 3f\n" 1.1932 + 1.1933 + "mov %%r11,%%r10\n" 1.1934 + "sar $0x11,%%r10\n" 1.1935 + 1.1936 + "movzb (%1,%%r10,1), %%r13 \n" 1.1937 + "movq 2048(%5,%%r13,8),%%xmm0\n" 1.1938 + 1.1939 + "movzb (%2,%%r10,1), %%r13 \n" 1.1940 +@@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint 1.1941 + "movzb (%0,%%r11,1), %%r13 \n" 1.1942 + "movq (%5,%%r13,8),%%xmm1\n" 1.1943 + 1.1944 + "paddsw %%xmm0,%%xmm1\n" 1.1945 + "psraw $0x6,%%xmm1\n" 1.1946 + "packuswb %%xmm1,%%xmm1\n" 1.1947 + "movd %%xmm1,0x0(%3)\n" 1.1948 + 1.1949 +-".lscaledone:" 1.1950 ++"3:" 1.1951 + : 1.1952 + : "r"(y_buf), // %0 1.1953 + "r"(u_buf), // %1 1.1954 + "r"(v_buf), // %2 1.1955 + "r"(rgb_buf), // %3 1.1956 + "r"(width), // %4 1.1957 + "r" (kCoefficientsRgbY), // %5 1.1958 + "r"(static_cast<long>(source_dx)) // %6 1.1959 + : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" 1.1960 + ); 1.1961 + } 1.1962 + 1.1963 +-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__) 1.1964 ++#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__) 1.1965 + 1.1966 + // PIC version is slower because less registers are available, so 1.1967 + // non-PIC is used on platforms where it is possible. 
1.1968 +- 1.1969 +-void FastConvertYUVToRGB32Row(const uint8* y_buf, 1.1970 +- const uint8* u_buf, 1.1971 +- const uint8* v_buf, 1.1972 +- uint8* rgb_buf, 1.1973 +- int width); 1.1974 ++void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf, 1.1975 ++ const uint8* u_buf, 1.1976 ++ const uint8* v_buf, 1.1977 ++ uint8* rgb_buf, 1.1978 ++ int width); 1.1979 + asm( 1.1980 + ".text\n" 1.1981 +- ".global FastConvertYUVToRGB32Row\n" 1.1982 +-"FastConvertYUVToRGB32Row:\n" 1.1983 ++ ".global FastConvertYUVToRGB32Row_SSE\n" 1.1984 ++ ".type FastConvertYUVToRGB32Row_SSE, @function\n" 1.1985 ++"FastConvertYUVToRGB32Row_SSE:\n" 1.1986 + "pusha\n" 1.1987 + "mov 0x24(%esp),%edx\n" 1.1988 + "mov 0x28(%esp),%edi\n" 1.1989 + "mov 0x2c(%esp),%esi\n" 1.1990 + "mov 0x30(%esp),%ebp\n" 1.1991 + "mov 0x34(%esp),%ecx\n" 1.1992 +- "jmp convertend\n" 1.1993 +- 1.1994 +-"convertloop:" 1.1995 ++ "jmp 1f\n" 1.1996 ++ 1.1997 ++"0:" 1.1998 + "movzbl (%edi),%eax\n" 1.1999 + "add $0x1,%edi\n" 1.2000 + "movzbl (%esi),%ebx\n" 1.2001 + "add $0x1,%esi\n" 1.2002 + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" 1.2003 + "movzbl (%edx),%eax\n" 1.2004 + "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" 1.2005 + "movzbl 0x1(%edx),%ebx\n" 1.2006 +@@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint 1.2007 + "movq kCoefficientsRgbY(,%ebx,8),%mm2\n" 1.2008 + "paddsw %mm0,%mm1\n" 1.2009 + "paddsw %mm0,%mm2\n" 1.2010 + "psraw $0x6,%mm1\n" 1.2011 + "psraw $0x6,%mm2\n" 1.2012 + "packuswb %mm2,%mm1\n" 1.2013 + "movntq %mm1,0x0(%ebp)\n" 1.2014 + "add $0x8,%ebp\n" 1.2015 +-"convertend:" 1.2016 ++"1:" 1.2017 + "sub $0x2,%ecx\n" 1.2018 +- "jns convertloop\n" 1.2019 ++ "jns 0b\n" 1.2020 + 1.2021 + "and $0x1,%ecx\n" 1.2022 +- "je convertdone\n" 1.2023 ++ "je 2f\n" 1.2024 + 1.2025 + "movzbl (%edi),%eax\n" 1.2026 + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" 1.2027 + "movzbl (%esi),%eax\n" 1.2028 + "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" 1.2029 + "movzbl (%edx),%eax\n" 1.2030 + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" 1.2031 + "paddsw %mm0,%mm1\n" 1.2032 + "psraw $0x6,%mm1\n" 1.2033 + "packuswb %mm1,%mm1\n" 1.2034 + "movd %mm1,0x0(%ebp)\n" 1.2035 +-"convertdone:" 1.2036 ++"2:" 1.2037 + "popa\n" 1.2038 + "ret\n" 1.2039 ++#if !defined(XP_MACOSX) 1.2040 ++ ".previous\n" 1.2041 ++#endif 1.2042 + ); 1.2043 + 1.2044 +- 1.2045 +-void ScaleYUVToRGB32Row(const uint8* y_buf, 1.2046 +- const uint8* u_buf, 1.2047 +- const uint8* v_buf, 1.2048 +- uint8* rgb_buf, 1.2049 +- int width, 1.2050 +- int source_dx); 1.2051 ++void FastConvertYUVToRGB32Row(const uint8* y_buf, 1.2052 ++ const uint8* u_buf, 1.2053 ++ const uint8* v_buf, 1.2054 ++ uint8* rgb_buf, 1.2055 ++ int width) 1.2056 ++{ 1.2057 ++ if (mozilla::supports_sse()) { 1.2058 ++ FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); 1.2059 ++ return; 1.2060 ++ } 1.2061 ++ 1.2062 ++ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); 1.2063 ++} 1.2064 ++ 1.2065 ++ 1.2066 ++void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, 1.2067 ++ const uint8* u_buf, 1.2068 ++ const uint8* v_buf, 1.2069 ++ uint8* rgb_buf, 1.2070 ++ int width, 1.2071 ++ int source_dx); 1.2072 + asm( 1.2073 + ".text\n" 1.2074 +- ".global ScaleYUVToRGB32Row\n" 1.2075 +-"ScaleYUVToRGB32Row:\n" 1.2076 ++ ".global ScaleYUVToRGB32Row_SSE\n" 1.2077 ++ ".type ScaleYUVToRGB32Row_SSE, @function\n" 1.2078 ++"ScaleYUVToRGB32Row_SSE:\n" 1.2079 + "pusha\n" 1.2080 + "mov 0x24(%esp),%edx\n" 1.2081 + "mov 0x28(%esp),%edi\n" 1.2082 + "mov 0x2c(%esp),%esi\n" 1.2083 + "mov 0x30(%esp),%ebp\n" 1.2084 + "mov 
0x34(%esp),%ecx\n" 1.2085 + "xor %ebx,%ebx\n" 1.2086 +- "jmp scaleend\n" 1.2087 +- 1.2088 +-"scaleloop:" 1.2089 ++ "jmp 1f\n" 1.2090 ++ 1.2091 ++"0:" 1.2092 + "mov %ebx,%eax\n" 1.2093 + "sar $0x11,%eax\n" 1.2094 + "movzbl (%edi,%eax,1),%eax\n" 1.2095 + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" 1.2096 + "mov %ebx,%eax\n" 1.2097 + "sar $0x11,%eax\n" 1.2098 + "movzbl (%esi,%eax,1),%eax\n" 1.2099 + "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" 1.2100 +@@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b 1.2101 + "movq kCoefficientsRgbY(,%eax,8),%mm2\n" 1.2102 + "paddsw %mm0,%mm1\n" 1.2103 + "paddsw %mm0,%mm2\n" 1.2104 + "psraw $0x6,%mm1\n" 1.2105 + "psraw $0x6,%mm2\n" 1.2106 + "packuswb %mm2,%mm1\n" 1.2107 + "movntq %mm1,0x0(%ebp)\n" 1.2108 + "add $0x8,%ebp\n" 1.2109 +-"scaleend:" 1.2110 ++"1:" 1.2111 + "sub $0x2,%ecx\n" 1.2112 +- "jns scaleloop\n" 1.2113 ++ "jns 0b\n" 1.2114 + 1.2115 + "and $0x1,%ecx\n" 1.2116 +- "je scaledone\n" 1.2117 ++ "je 2f\n" 1.2118 + 1.2119 + "mov %ebx,%eax\n" 1.2120 + "sar $0x11,%eax\n" 1.2121 + "movzbl (%edi,%eax,1),%eax\n" 1.2122 + "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" 1.2123 + "mov %ebx,%eax\n" 1.2124 + "sar $0x11,%eax\n" 1.2125 + "movzbl (%esi,%eax,1),%eax\n" 1.2126 +@@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b 1.2127 + "sar $0x10,%eax\n" 1.2128 + "movzbl (%edx,%eax,1),%eax\n" 1.2129 + "movq kCoefficientsRgbY(,%eax,8),%mm1\n" 1.2130 + "paddsw %mm0,%mm1\n" 1.2131 + "psraw $0x6,%mm1\n" 1.2132 + "packuswb %mm1,%mm1\n" 1.2133 + "movd %mm1,0x0(%ebp)\n" 1.2134 + 1.2135 +-"scaledone:" 1.2136 ++"2:" 1.2137 + "popa\n" 1.2138 + "ret\n" 1.2139 ++#if !defined(XP_MACOSX) 1.2140 ++ ".previous\n" 1.2141 ++#endif 1.2142 + ); 1.2143 + 1.2144 +-void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.2145 +- const uint8* u_buf, 1.2146 +- const uint8* v_buf, 1.2147 +- uint8* rgb_buf, 1.2148 +- int width, 1.2149 +- int source_dx); 1.2150 ++void ScaleYUVToRGB32Row(const uint8* y_buf, 1.2151 ++ const uint8* u_buf, 1.2152 ++ const uint8* v_buf, 1.2153 ++ uint8* rgb_buf, 1.2154 ++ int width, 1.2155 ++ int source_dx) 1.2156 ++{ 1.2157 ++ if (mozilla::supports_sse()) { 1.2158 ++ ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, 1.2159 ++ width, source_dx); 1.2160 ++ } 1.2161 ++ 1.2162 ++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, 1.2163 ++ width, source_dx); 1.2164 ++} 1.2165 ++ 1.2166 ++void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, 1.2167 ++ const uint8* u_buf, 1.2168 ++ const uint8* v_buf, 1.2169 ++ uint8* rgb_buf, 1.2170 ++ int width, 1.2171 ++ int source_dx); 1.2172 + asm( 1.2173 + ".text\n" 1.2174 +- ".global LinearScaleYUVToRGB32Row\n" 1.2175 +-"LinearScaleYUVToRGB32Row:\n" 1.2176 ++ ".global LinearScaleYUVToRGB32Row_SSE\n" 1.2177 ++ ".type LinearScaleYUVToRGB32Row_SSE, @function\n" 1.2178 ++"LinearScaleYUVToRGB32Row_SSE:\n" 1.2179 + "pusha\n" 1.2180 + "mov 0x24(%esp),%edx\n" 1.2181 + "mov 0x28(%esp),%edi\n" 1.2182 + "mov 0x30(%esp),%ebp\n" 1.2183 + 1.2184 + // source_width = width * source_dx + ebx 1.2185 + "mov 0x34(%esp), %ecx\n" 1.2186 + "imull 0x38(%esp), %ecx\n" 1.2187 + "mov %ecx, 0x34(%esp)\n" 1.2188 + 1.2189 + "mov 0x38(%esp), %ecx\n" 1.2190 + "xor %ebx,%ebx\n" // x = 0 1.2191 + "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 1.2192 +- "jl .lscaleend\n" 1.2193 ++ "jl 1f\n" 1.2194 + "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less 1.2195 +- "jmp .lscaleend\n" 1.2196 +- 1.2197 +-".lscaleloop:" 1.2198 +- "mov %ebx,%eax\n" 1.2199 +- "sar $0x11,%eax\n" 1.2200 ++ "jmp 1f\n" 1.2201 ++ 1.2202 ++"0:" 1.2203 ++ "mov %ebx,%eax\n" 
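The non-PIC x86-32 wrappers introduced here pick an implementation per call: mozilla::supports_sse() gates the hand-written _SSE row, with the _C row as the fallback. Note that the ScaleYUVToRGB32Row wrapper just added (and the LinearScaleYUVToRGB32Row one that follows) does not return after the _SSE call the way the FastConvertYUVToRGB32Row wrapper above does, so the _C row runs afterwards and redraws the same output. The sketch below shows the dispatch shape with the early return, on the assumption that this matches the intent; the declarations are repeated so the snippet stands alone.

    typedef unsigned char uint8;
    extern "C" void ScaleYUVToRGB32Row_SSE(const uint8*, const uint8*, const uint8*,
                                           uint8*, int width, int source_dx);
    extern "C" void ScaleYUVToRGB32Row_C(const uint8*, const uint8*, const uint8*,
                                         uint8*, int width, int source_dx);
    namespace mozilla { bool supports_sse(); }   // provided by mozilla/SSE.h

    void ScaleYUVToRGB32Row(const uint8* y_buf, const uint8* u_buf,
                            const uint8* v_buf, uint8* rgb_buf,
                            int width, int source_dx) {
      if (mozilla::supports_sse()) {
        ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
        return;  // skip the C fallback once the SSE row has written the output
      }
      ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
    }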
1.2204 ++ "sar $0x11,%eax\n" 1.2205 + 1.2206 + "movzbl (%edi,%eax,1),%ecx\n" 1.2207 + "movzbl 1(%edi,%eax,1),%esi\n" 1.2208 + "mov %ebx,%eax\n" 1.2209 + "andl $0x1fffe, %eax \n" 1.2210 + "imul %eax, %esi \n" 1.2211 + "xorl $0x1fffe, %eax \n" 1.2212 + "imul %eax, %ecx \n" 1.2213 +@@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint 1.2214 + "imul %eax, %esi \n" 1.2215 + "xorl $0xffff, %eax \n" 1.2216 + "imul %eax, %ecx \n" 1.2217 + "addl %esi, %ecx \n" 1.2218 + "shrl $16, %ecx \n" 1.2219 + "movq kCoefficientsRgbY(,%ecx,8),%mm1\n" 1.2220 + 1.2221 + "cmp 0x34(%esp), %ebx\n" 1.2222 +- "jge .lscalelastpixel\n" 1.2223 ++ "jge 2f\n" 1.2224 + 1.2225 + "mov %ebx,%eax\n" 1.2226 + "sar $0x10,%eax\n" 1.2227 + "movzbl (%edx,%eax,1),%ecx\n" 1.2228 + "movzbl 1(%edx,%eax,1),%esi\n" 1.2229 + "mov %ebx,%eax\n" 1.2230 + "add 0x38(%esp),%ebx\n" 1.2231 + "andl $0xffff, %eax \n" 1.2232 +@@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint 1.2233 + "paddsw %mm0,%mm1\n" 1.2234 + "paddsw %mm0,%mm2\n" 1.2235 + "psraw $0x6,%mm1\n" 1.2236 + "psraw $0x6,%mm2\n" 1.2237 + "packuswb %mm2,%mm1\n" 1.2238 + "movntq %mm1,0x0(%ebp)\n" 1.2239 + "add $0x8,%ebp\n" 1.2240 + 1.2241 +-".lscaleend:" 1.2242 ++"1:" 1.2243 + "cmp 0x34(%esp), %ebx\n" 1.2244 +- "jl .lscaleloop\n" 1.2245 ++ "jl 0b\n" 1.2246 + "popa\n" 1.2247 + "ret\n" 1.2248 + 1.2249 +-".lscalelastpixel:" 1.2250 ++"2:" 1.2251 + "paddsw %mm0, %mm1\n" 1.2252 + "psraw $6, %mm1\n" 1.2253 + "packuswb %mm1, %mm1\n" 1.2254 + "movd %mm1, (%ebp)\n" 1.2255 + "popa\n" 1.2256 + "ret\n" 1.2257 ++#if !defined(XP_MACOSX) 1.2258 ++ ".previous\n" 1.2259 ++#endif 1.2260 + ); 1.2261 + 1.2262 +-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__) 1.2263 +- 1.2264 +-extern void PICConvertYUVToRGB32Row(const uint8* y_buf, 1.2265 +- const uint8* u_buf, 1.2266 +- const uint8* v_buf, 1.2267 +- uint8* rgb_buf, 1.2268 +- int width, 1.2269 +- int16 *kCoefficientsRgbY); 1.2270 ++void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.2271 ++ const uint8* u_buf, 1.2272 ++ const uint8* v_buf, 1.2273 ++ uint8* rgb_buf, 1.2274 ++ int width, 1.2275 ++ int source_dx) 1.2276 ++{ 1.2277 ++ if (mozilla::supports_sse()) { 1.2278 ++ LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, 1.2279 ++ width, source_dx); 1.2280 ++ } 1.2281 ++ 1.2282 ++ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, 1.2283 ++ width, source_dx); 1.2284 ++} 1.2285 ++ 1.2286 ++#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__) 1.2287 ++ 1.2288 ++void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf, 1.2289 ++ const uint8* u_buf, 1.2290 ++ const uint8* v_buf, 1.2291 ++ uint8* rgb_buf, 1.2292 ++ int width, 1.2293 ++ int16 *kCoefficientsRgbY); 1.2294 ++ 1.2295 + asm( 1.2296 + ".text\n" 1.2297 +-#if defined(OS_MACOSX) 1.2298 +-"_PICConvertYUVToRGB32Row:\n" 1.2299 ++#if defined(XP_MACOSX) 1.2300 ++"_PICConvertYUVToRGB32Row_SSE:\n" 1.2301 + #else 1.2302 +-"PICConvertYUVToRGB32Row:\n" 1.2303 ++"PICConvertYUVToRGB32Row_SSE:\n" 1.2304 + #endif 1.2305 + "pusha\n" 1.2306 + "mov 0x24(%esp),%edx\n" 1.2307 + "mov 0x28(%esp),%edi\n" 1.2308 + "mov 0x2c(%esp),%esi\n" 1.2309 + "mov 0x30(%esp),%ebp\n" 1.2310 + "mov 0x38(%esp),%ecx\n" 1.2311 + 1.2312 +- "jmp .Lconvertend\n" 1.2313 +- 1.2314 +-".Lconvertloop:" 1.2315 ++ "jmp 1f\n" 1.2316 ++ 1.2317 ++"0:" 1.2318 + "movzbl (%edi),%eax\n" 1.2319 + "add $0x1,%edi\n" 1.2320 + "movzbl (%esi),%ebx\n" 1.2321 + "add $0x1,%esi\n" 1.2322 + "movq 2048(%ecx,%eax,8),%mm0\n" 1.2323 + "movzbl (%edx),%eax\n" 1.2324 + "paddsw 4096(%ecx,%ebx,8),%mm0\n" 
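From this point on the file provides PIC-friendly variants: each PIC*_SSE row takes int16 *kCoefficientsRgbY as a trailing argument and indexes the table through a register (2048(%ecx,%eax,8), 4096(%ecx,%ebx,8)) instead of naming the global, because 32-bit position-independent code cannot embed the table's absolute address the way the non-PIC rows above do. The displacements are simply the start of the U and V blocks of the table; a small sketch of that arithmetic (the real table is kCoefficientsRgbY from yuv_row_table.cpp):

    typedef short int16;
    // One table entry holds four int16 values (the B, G, R, A contributions),
    // and the Y, U and V blocks are 256 entries each.
    static_assert(sizeof(int16[4]) == 8,          "one table entry is 8 bytes");
    static_assert(256 * sizeof(int16[4]) == 2048, "U block displacement used by the asm");
    static_assert(512 * sizeof(int16[4]) == 4096, "V block displacement used by the asm");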
1.2325 + "movzbl 0x1(%edx),%ebx\n" 1.2326 +@@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons 1.2327 + "movq 0(%ecx,%ebx,8),%mm2\n" 1.2328 + "paddsw %mm0,%mm1\n" 1.2329 + "paddsw %mm0,%mm2\n" 1.2330 + "psraw $0x6,%mm1\n" 1.2331 + "psraw $0x6,%mm2\n" 1.2332 + "packuswb %mm2,%mm1\n" 1.2333 + "movntq %mm1,0x0(%ebp)\n" 1.2334 + "add $0x8,%ebp\n" 1.2335 +-".Lconvertend:" 1.2336 ++"1:" 1.2337 + "subl $0x2,0x34(%esp)\n" 1.2338 +- "jns .Lconvertloop\n" 1.2339 ++ "jns 0b\n" 1.2340 + 1.2341 + "andl $0x1,0x34(%esp)\n" 1.2342 +- "je .Lconvertdone\n" 1.2343 ++ "je 2f\n" 1.2344 + 1.2345 + "movzbl (%edi),%eax\n" 1.2346 + "movq 2048(%ecx,%eax,8),%mm0\n" 1.2347 + "movzbl (%esi),%eax\n" 1.2348 + "paddsw 4096(%ecx,%eax,8),%mm0\n" 1.2349 + "movzbl (%edx),%eax\n" 1.2350 + "movq 0(%ecx,%eax,8),%mm1\n" 1.2351 + "paddsw %mm0,%mm1\n" 1.2352 + "psraw $0x6,%mm1\n" 1.2353 + "packuswb %mm1,%mm1\n" 1.2354 + "movd %mm1,0x0(%ebp)\n" 1.2355 +-".Lconvertdone:\n" 1.2356 ++"2:" 1.2357 + "popa\n" 1.2358 + "ret\n" 1.2359 ++#if !defined(XP_MACOSX) 1.2360 ++ ".previous\n" 1.2361 ++#endif 1.2362 + ); 1.2363 + 1.2364 + void FastConvertYUVToRGB32Row(const uint8* y_buf, 1.2365 + const uint8* u_buf, 1.2366 + const uint8* v_buf, 1.2367 + uint8* rgb_buf, 1.2368 +- int width) { 1.2369 +- PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, 1.2370 +- &kCoefficientsRgbY[0][0]); 1.2371 +-} 1.2372 +- 1.2373 +-extern void PICScaleYUVToRGB32Row(const uint8* y_buf, 1.2374 ++ int width) 1.2375 ++{ 1.2376 ++ if (mozilla::supports_sse()) { 1.2377 ++ PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, 1.2378 ++ &kCoefficientsRgbY[0][0]); 1.2379 ++ return; 1.2380 ++ } 1.2381 ++ 1.2382 ++ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); 1.2383 ++} 1.2384 ++ 1.2385 ++void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf, 1.2386 + const uint8* u_buf, 1.2387 + const uint8* v_buf, 1.2388 + uint8* rgb_buf, 1.2389 + int width, 1.2390 + int source_dx, 1.2391 + int16 *kCoefficientsRgbY); 1.2392 + 1.2393 + asm( 1.2394 + ".text\n" 1.2395 +-#if defined(OS_MACOSX) 1.2396 +-"_PICScaleYUVToRGB32Row:\n" 1.2397 ++#if defined(XP_MACOSX) 1.2398 ++"_PICScaleYUVToRGB32Row_SSE:\n" 1.2399 + #else 1.2400 +-"PICScaleYUVToRGB32Row:\n" 1.2401 ++"PICScaleYUVToRGB32Row_SSE:\n" 1.2402 + #endif 1.2403 + "pusha\n" 1.2404 + "mov 0x24(%esp),%edx\n" 1.2405 + "mov 0x28(%esp),%edi\n" 1.2406 + "mov 0x2c(%esp),%esi\n" 1.2407 + "mov 0x30(%esp),%ebp\n" 1.2408 + "mov 0x3c(%esp),%ecx\n" 1.2409 + "xor %ebx,%ebx\n" 1.2410 +- "jmp Lscaleend\n" 1.2411 +- 1.2412 +-"Lscaleloop:" 1.2413 ++ "jmp 1f\n" 1.2414 ++ 1.2415 ++"0:" 1.2416 + "mov %ebx,%eax\n" 1.2417 + "sar $0x11,%eax\n" 1.2418 + "movzbl (%edi,%eax,1),%eax\n" 1.2419 + "movq 2048(%ecx,%eax,8),%mm0\n" 1.2420 + "mov %ebx,%eax\n" 1.2421 + "sar $0x11,%eax\n" 1.2422 + "movzbl (%esi,%eax,1),%eax\n" 1.2423 + "paddsw 4096(%ecx,%eax,8),%mm0\n" 1.2424 +@@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const 1.2425 + "movq 0(%ecx,%eax,8),%mm2\n" 1.2426 + "paddsw %mm0,%mm1\n" 1.2427 + "paddsw %mm0,%mm2\n" 1.2428 + "psraw $0x6,%mm1\n" 1.2429 + "psraw $0x6,%mm2\n" 1.2430 + "packuswb %mm2,%mm1\n" 1.2431 + "movntq %mm1,0x0(%ebp)\n" 1.2432 + "add $0x8,%ebp\n" 1.2433 +-"Lscaleend:" 1.2434 ++"1:" 1.2435 + "subl $0x2,0x34(%esp)\n" 1.2436 +- "jns Lscaleloop\n" 1.2437 ++ "jns 0b\n" 1.2438 + 1.2439 + "andl $0x1,0x34(%esp)\n" 1.2440 +- "je Lscaledone\n" 1.2441 ++ "je 2f\n" 1.2442 + 1.2443 + "mov %ebx,%eax\n" 1.2444 + "sar $0x11,%eax\n" 1.2445 + "movzbl (%edi,%eax,1),%eax\n" 1.2446 + "movq 
2048(%ecx,%eax,8),%mm0\n" 1.2447 + "mov %ebx,%eax\n" 1.2448 + "sar $0x11,%eax\n" 1.2449 + "movzbl (%esi,%eax,1),%eax\n" 1.2450 +@@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const 1.2451 + "sar $0x10,%eax\n" 1.2452 + "movzbl (%edx,%eax,1),%eax\n" 1.2453 + "movq 0(%ecx,%eax,8),%mm1\n" 1.2454 + "paddsw %mm0,%mm1\n" 1.2455 + "psraw $0x6,%mm1\n" 1.2456 + "packuswb %mm1,%mm1\n" 1.2457 + "movd %mm1,0x0(%ebp)\n" 1.2458 + 1.2459 +-"Lscaledone:" 1.2460 ++"2:" 1.2461 + "popa\n" 1.2462 + "ret\n" 1.2463 ++#if !defined(XP_MACOSX) 1.2464 ++ ".previous\n" 1.2465 ++#endif 1.2466 + ); 1.2467 + 1.2468 +- 1.2469 + void ScaleYUVToRGB32Row(const uint8* y_buf, 1.2470 + const uint8* u_buf, 1.2471 + const uint8* v_buf, 1.2472 + uint8* rgb_buf, 1.2473 + int width, 1.2474 +- int source_dx) { 1.2475 +- PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, 1.2476 +- &kCoefficientsRgbY[0][0]); 1.2477 +-} 1.2478 +- 1.2479 +-void PICLinearScaleYUVToRGB32Row(const uint8* y_buf, 1.2480 +- const uint8* u_buf, 1.2481 +- const uint8* v_buf, 1.2482 +- uint8* rgb_buf, 1.2483 +- int width, 1.2484 +- int source_dx, 1.2485 +- int16 *kCoefficientsRgbY); 1.2486 ++ int source_dx) 1.2487 ++{ 1.2488 ++ if (mozilla::supports_sse()) { 1.2489 ++ PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, 1.2490 ++ &kCoefficientsRgbY[0][0]); 1.2491 ++ return; 1.2492 ++ } 1.2493 ++ 1.2494 ++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 1.2495 ++} 1.2496 ++ 1.2497 ++void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, 1.2498 ++ const uint8* u_buf, 1.2499 ++ const uint8* v_buf, 1.2500 ++ uint8* rgb_buf, 1.2501 ++ int width, 1.2502 ++ int source_dx, 1.2503 ++ int16 *kCoefficientsRgbY); 1.2504 ++ 1.2505 + asm( 1.2506 + ".text\n" 1.2507 +-#if defined(OS_MACOSX) 1.2508 +-"_PICLinearScaleYUVToRGB32Row:\n" 1.2509 ++#if defined(XP_MACOSX) 1.2510 ++"_PICLinearScaleYUVToRGB32Row_SSE:\n" 1.2511 + #else 1.2512 +-"PICLinearScaleYUVToRGB32Row:\n" 1.2513 ++"PICLinearScaleYUVToRGB32Row_SSE:\n" 1.2514 + #endif 1.2515 + "pusha\n" 1.2516 + "mov 0x24(%esp),%edx\n" 1.2517 + "mov 0x30(%esp),%ebp\n" 1.2518 + "mov 0x34(%esp),%ecx\n" 1.2519 + "mov 0x3c(%esp),%edi\n" 1.2520 + "xor %ebx,%ebx\n" 1.2521 + 1.2522 + // source_width = width * source_dx + ebx 1.2523 + "mov 0x34(%esp), %ecx\n" 1.2524 + "imull 0x38(%esp), %ecx\n" 1.2525 + "mov %ecx, 0x34(%esp)\n" 1.2526 + 1.2527 + "mov 0x38(%esp), %ecx\n" 1.2528 + "xor %ebx,%ebx\n" // x = 0 1.2529 + "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 1.2530 +- "jl .lscaleend\n" 1.2531 ++ "jl 1f\n" 1.2532 + "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less 1.2533 +- "jmp .lscaleend\n" 1.2534 +- 1.2535 +-".lscaleloop:" 1.2536 ++ "jmp 1f\n" 1.2537 ++ 1.2538 ++"0:" 1.2539 + "mov 0x28(%esp),%esi\n" 1.2540 + "mov %ebx,%eax\n" 1.2541 + "sar $0x11,%eax\n" 1.2542 + 1.2543 + "movzbl (%esi,%eax,1),%ecx\n" 1.2544 + "movzbl 1(%esi,%eax,1),%esi\n" 1.2545 + "mov %ebx,%eax\n" 1.2546 + "andl $0x1fffe, %eax \n" 1.2547 +@@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u 1.2548 + "imul %eax, %esi \n" 1.2549 + "xorl $0xffff, %eax \n" 1.2550 + "imul %eax, %ecx \n" 1.2551 + "addl %esi, %ecx \n" 1.2552 + "shrl $16, %ecx \n" 1.2553 + "movq (%edi,%ecx,8),%mm1\n" 1.2554 + 1.2555 + "cmp 0x34(%esp), %ebx\n" 1.2556 +- "jge .lscalelastpixel\n" 1.2557 ++ "jge 2f\n" 1.2558 + 1.2559 + "mov %ebx,%eax\n" 1.2560 + "sar $0x10,%eax\n" 1.2561 + "movzbl (%edx,%eax,1),%ecx\n" 1.2562 + "movzbl 1(%edx,%eax,1),%esi\n" 1.2563 + "mov %ebx,%eax\n" 1.2564 + "add 0x38(%esp),%ebx\n" 1.2565 + "andl 
$0xffff, %eax \n" 1.2566 +@@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u 1.2567 + "paddsw %mm0,%mm1\n" 1.2568 + "paddsw %mm0,%mm2\n" 1.2569 + "psraw $0x6,%mm1\n" 1.2570 + "psraw $0x6,%mm2\n" 1.2571 + "packuswb %mm2,%mm1\n" 1.2572 + "movntq %mm1,0x0(%ebp)\n" 1.2573 + "add $0x8,%ebp\n" 1.2574 + 1.2575 +-".lscaleend:" 1.2576 ++"1:" 1.2577 + "cmp %ebx, 0x34(%esp)\n" 1.2578 +- "jg .lscaleloop\n" 1.2579 ++ "jg 0b\n" 1.2580 + "popa\n" 1.2581 + "ret\n" 1.2582 + 1.2583 +-".lscalelastpixel:" 1.2584 ++"2:" 1.2585 + "paddsw %mm0, %mm1\n" 1.2586 + "psraw $6, %mm1\n" 1.2587 + "packuswb %mm1, %mm1\n" 1.2588 + "movd %mm1, (%ebp)\n" 1.2589 + "popa\n" 1.2590 + "ret\n" 1.2591 ++#if !defined(XP_MACOSX) 1.2592 ++ ".previous\n" 1.2593 ++#endif 1.2594 + ); 1.2595 + 1.2596 ++ 1.2597 + void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.2598 +- const uint8* u_buf, 1.2599 +- const uint8* v_buf, 1.2600 +- uint8* rgb_buf, 1.2601 +- int width, 1.2602 +- int source_dx) { 1.2603 +- PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, 1.2604 +- &kCoefficientsRgbY[0][0]); 1.2605 +-} 1.2606 +- 1.2607 +-#else // USE_MMX 1.2608 +- 1.2609 +-// C reference code that mimic the YUV assembly. 1.2610 +-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) 1.2611 +-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ 1.2612 +- (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) 1.2613 +- 1.2614 +-static inline void YuvPixel(uint8 y, 1.2615 +- uint8 u, 1.2616 +- uint8 v, 1.2617 +- uint8* rgb_buf) { 1.2618 +- 1.2619 +- int b = kCoefficientsRgbY[256+u][0]; 1.2620 +- int g = kCoefficientsRgbY[256+u][1]; 1.2621 +- int r = kCoefficientsRgbY[256+u][2]; 1.2622 +- int a = kCoefficientsRgbY[256+u][3]; 1.2623 +- 1.2624 +- b = paddsw(b, kCoefficientsRgbY[512+v][0]); 1.2625 +- g = paddsw(g, kCoefficientsRgbY[512+v][1]); 1.2626 +- r = paddsw(r, kCoefficientsRgbY[512+v][2]); 1.2627 +- a = paddsw(a, kCoefficientsRgbY[512+v][3]); 1.2628 +- 1.2629 +- b = paddsw(b, kCoefficientsRgbY[y][0]); 1.2630 +- g = paddsw(g, kCoefficientsRgbY[y][1]); 1.2631 +- r = paddsw(r, kCoefficientsRgbY[y][2]); 1.2632 +- a = paddsw(a, kCoefficientsRgbY[y][3]); 1.2633 +- 1.2634 +- b >>= 6; 1.2635 +- g >>= 6; 1.2636 +- r >>= 6; 1.2637 +- a >>= 6; 1.2638 +- 1.2639 +- *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) | 1.2640 +- (packuswb(g) << 8) | 1.2641 +- (packuswb(r) << 16) | 1.2642 +- (packuswb(a) << 24); 1.2643 +-} 1.2644 +- 1.2645 ++ const uint8* u_buf, 1.2646 ++ const uint8* v_buf, 1.2647 ++ uint8* rgb_buf, 1.2648 ++ int width, 1.2649 ++ int source_dx) 1.2650 ++{ 1.2651 ++ if (mozilla::supports_sse()) { 1.2652 ++ PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, 1.2653 ++ source_dx, &kCoefficientsRgbY[0][0]); 1.2654 ++ return; 1.2655 ++ } 1.2656 ++ 1.2657 ++ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 1.2658 ++} 1.2659 ++#else 1.2660 + void FastConvertYUVToRGB32Row(const uint8* y_buf, 1.2661 + const uint8* u_buf, 1.2662 + const uint8* v_buf, 1.2663 + uint8* rgb_buf, 1.2664 + int width) { 1.2665 +- for (int x = 0; x < width; x += 2) { 1.2666 +- uint8 u = u_buf[x >> 1]; 1.2667 +- uint8 v = v_buf[x >> 1]; 1.2668 +- uint8 y0 = y_buf[x]; 1.2669 +- YuvPixel(y0, u, v, rgb_buf); 1.2670 +- if ((x + 1) < width) { 1.2671 +- uint8 y1 = y_buf[x + 1]; 1.2672 +- YuvPixel(y1, u, v, rgb_buf + 4); 1.2673 +- } 1.2674 +- rgb_buf += 8; // Advance 2 pixels. 1.2675 +- } 1.2676 +-} 1.2677 +- 1.2678 +-// 16.16 fixed point is used. A shift by 16 isolates the integer. 
1.2679 +-// A shift by 17 is used to further subsample the chrominence channels. 1.2680 +-// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits, 1.2681 +-// for 1/65536 pixel accurate interpolation. 1.2682 ++ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); 1.2683 ++} 1.2684 ++ 1.2685 + void ScaleYUVToRGB32Row(const uint8* y_buf, 1.2686 + const uint8* u_buf, 1.2687 + const uint8* v_buf, 1.2688 + uint8* rgb_buf, 1.2689 + int width, 1.2690 + int source_dx) { 1.2691 +- int x = 0; 1.2692 +- for (int i = 0; i < width; i += 2) { 1.2693 +- int y = y_buf[x >> 16]; 1.2694 +- int u = u_buf[(x >> 17)]; 1.2695 +- int v = v_buf[(x >> 17)]; 1.2696 +- YuvPixel(y, u, v, rgb_buf); 1.2697 +- x += source_dx; 1.2698 +- if ((i + 1) < width) { 1.2699 +- y = y_buf[x >> 16]; 1.2700 +- YuvPixel(y, u, v, rgb_buf+4); 1.2701 +- x += source_dx; 1.2702 +- } 1.2703 +- rgb_buf += 8; 1.2704 +- } 1.2705 +-} 1.2706 ++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 1.2707 ++} 1.2708 + 1.2709 + void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.2710 + const uint8* u_buf, 1.2711 + const uint8* v_buf, 1.2712 + uint8* rgb_buf, 1.2713 + int width, 1.2714 + int source_dx) { 1.2715 +- int x = 0; 1.2716 +- if (source_dx >= 0x20000) { 1.2717 +- x = 32768; 1.2718 +- } 1.2719 +- for (int i = 0; i < width; i += 2) { 1.2720 +- int y0 = y_buf[x >> 16]; 1.2721 +- int y1 = y_buf[(x >> 16) + 1]; 1.2722 +- int u0 = u_buf[(x >> 17)]; 1.2723 +- int u1 = u_buf[(x >> 17) + 1]; 1.2724 +- int v0 = v_buf[(x >> 17)]; 1.2725 +- int v1 = v_buf[(x >> 17) + 1]; 1.2726 +- int y_frac = (x & 65535); 1.2727 +- int uv_frac = ((x >> 1) & 65535); 1.2728 +- int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; 1.2729 +- int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16; 1.2730 +- int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16; 1.2731 +- YuvPixel(y, u, v, rgb_buf); 1.2732 +- x += source_dx; 1.2733 +- if ((i + 1) < width) { 1.2734 +- y0 = y_buf[x >> 16]; 1.2735 +- y1 = y_buf[(x >> 16) + 1]; 1.2736 +- y_frac = (x & 65535); 1.2737 +- y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; 1.2738 +- YuvPixel(y, u, v, rgb_buf+4); 1.2739 +- x += source_dx; 1.2740 +- } 1.2741 +- rgb_buf += 8; 1.2742 +- } 1.2743 +-} 1.2744 +- 1.2745 +-#endif // USE_MMX 1.2746 +-} // extern "C" 1.2747 +- 1.2748 ++ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 1.2749 ++} 1.2750 ++#endif 1.2751 ++ 1.2752 ++} 1.2753 +diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp 1.2754 +--- a/gfx/ycbcr/yuv_row_table.cpp 1.2755 ++++ b/gfx/ycbcr/yuv_row_table.cpp 1.2756 +@@ -1,13 +1,13 @@ 1.2757 + // Copyright (c) 2010 The Chromium Authors. All rights reserved. 1.2758 + // Use of this source code is governed by a BSD-style license that can be 1.2759 + // found in the LICENSE file. 1.2760 + 1.2761 +-#include "media/base/yuv_row.h" 1.2762 ++#include "yuv_row.h" 1.2763 + 1.2764 + extern "C" { 1.2765 + 1.2766 + #define RGBY(i) { \ 1.2767 + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ 1.2768 + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ 1.2769 + static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ 1.2770 + 0 \ 1.2771 +diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp 1.2772 +--- a/gfx/ycbcr/yuv_row_win.cpp 1.2773 ++++ b/gfx/ycbcr/yuv_row_win.cpp 1.2774 +@@ -1,26 +1,27 @@ 1.2775 + // Copyright (c) 2010 The Chromium Authors. All rights reserved. 1.2776 + // Use of this source code is governed by a BSD-style license that can be 1.2777 + // found in the LICENSE file. 
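The yuv_row_table.cpp hunk above only retargets the include, but it shows how the table meshes with the row code: the RGBY macro scales each coefficient by 64, i.e. six fractional bits, and the row routines drop those bits again with the >> 6 / psraw $0x6 seen earlier before packing to bytes (1.164 is the usual BT.601 video-range luma scale, roughly 255/219). A tiny worked example; RGBY_B is a one-component stand-in defined here for illustration, not a name from the sources:

    #include <cstdio>
    typedef short int16;
    #define RGBY_B(i) static_cast<int16>(1.164 * 64 * ((i) - 16) + 0.5)
    int main() {
      int y = 235;                        // nominal white in video-range YCbCr
      std::printf("table entry %d -> pixel contribution %d\n",
                  RGBY_B(y), RGBY_B(y) >> 6);   // 16315 -> 254, close to full scale
      return 0;
    }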
1.2778 + 1.2779 +-#include "media/base/yuv_row.h" 1.2780 ++#include "yuv_row.h" 1.2781 ++#include "mozilla/SSE.h" 1.2782 + 1.2783 + #define kCoefficientsRgbU kCoefficientsRgbY + 2048 1.2784 + #define kCoefficientsRgbV kCoefficientsRgbY + 4096 1.2785 + 1.2786 + extern "C" { 1.2787 + 1.2788 +-#if USE_MMX 1.2789 +-__declspec(naked) 1.2790 +-void FastConvertYUVToRGB32Row(const uint8* y_buf, 1.2791 +- const uint8* u_buf, 1.2792 +- const uint8* v_buf, 1.2793 +- uint8* rgb_buf, 1.2794 +- int width) { 1.2795 ++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) 1.2796 ++__declspec(naked) 1.2797 ++void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf, 1.2798 ++ const uint8* u_buf, 1.2799 ++ const uint8* v_buf, 1.2800 ++ uint8* rgb_buf, 1.2801 ++ int width) { 1.2802 + __asm { 1.2803 + pushad 1.2804 + mov edx, [esp + 32 + 4] // Y 1.2805 + mov edi, [esp + 32 + 8] // U 1.2806 + mov esi, [esp + 32 + 12] // V 1.2807 + mov ebp, [esp + 32 + 16] // rgb 1.2808 + mov ecx, [esp + 32 + 20] // width 1.2809 + jmp convertend 1.2810 +@@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint 1.2811 + convertdone : 1.2812 + 1.2813 + popad 1.2814 + ret 1.2815 + } 1.2816 + } 1.2817 + 1.2818 + __declspec(naked) 1.2819 +-void ConvertYUVToRGB32Row(const uint8* y_buf, 1.2820 +- const uint8* u_buf, 1.2821 +- const uint8* v_buf, 1.2822 +- uint8* rgb_buf, 1.2823 +- int width, 1.2824 +- int step) { 1.2825 ++void ConvertYUVToRGB32Row_SSE(const uint8* y_buf, 1.2826 ++ const uint8* u_buf, 1.2827 ++ const uint8* v_buf, 1.2828 ++ uint8* rgb_buf, 1.2829 ++ int width, 1.2830 ++ int step) { 1.2831 + __asm { 1.2832 + pushad 1.2833 + mov edx, [esp + 32 + 4] // Y 1.2834 + mov edi, [esp + 32 + 8] // U 1.2835 + mov esi, [esp + 32 + 12] // V 1.2836 + mov ebp, [esp + 32 + 16] // rgb 1.2837 + mov ecx, [esp + 32 + 20] // width 1.2838 + mov ebx, [esp + 32 + 24] // step 1.2839 +@@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y 1.2840 + wdone : 1.2841 + 1.2842 + popad 1.2843 + ret 1.2844 + } 1.2845 + } 1.2846 + 1.2847 + __declspec(naked) 1.2848 +-void RotateConvertYUVToRGB32Row(const uint8* y_buf, 1.2849 +- const uint8* u_buf, 1.2850 +- const uint8* v_buf, 1.2851 +- uint8* rgb_buf, 1.2852 +- int width, 1.2853 +- int ystep, 1.2854 +- int uvstep) { 1.2855 ++void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf, 1.2856 ++ const uint8* u_buf, 1.2857 ++ const uint8* v_buf, 1.2858 ++ uint8* rgb_buf, 1.2859 ++ int width, 1.2860 ++ int ystep, 1.2861 ++ int uvstep) { 1.2862 + __asm { 1.2863 + pushad 1.2864 + mov edx, [esp + 32 + 4] // Y 1.2865 + mov edi, [esp + 32 + 8] // U 1.2866 + mov esi, [esp + 32 + 12] // V 1.2867 + mov ebp, [esp + 32 + 16] // rgb 1.2868 + mov ecx, [esp + 32 + 20] // width 1.2869 + jmp wend 1.2870 +@@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui 1.2871 + wdone : 1.2872 + 1.2873 + popad 1.2874 + ret 1.2875 + } 1.2876 + } 1.2877 + 1.2878 + __declspec(naked) 1.2879 +-void DoubleYUVToRGB32Row(const uint8* y_buf, 1.2880 +- const uint8* u_buf, 1.2881 +- const uint8* v_buf, 1.2882 +- uint8* rgb_buf, 1.2883 +- int width) { 1.2884 ++void DoubleYUVToRGB32Row_SSE(const uint8* y_buf, 1.2885 ++ const uint8* u_buf, 1.2886 ++ const uint8* v_buf, 1.2887 ++ uint8* rgb_buf, 1.2888 ++ int width) { 1.2889 + __asm { 1.2890 + pushad 1.2891 + mov edx, [esp + 32 + 4] // Y 1.2892 + mov edi, [esp + 32 + 8] // U 1.2893 + mov esi, [esp + 32 + 12] // V 1.2894 + mov ebp, [esp + 32 + 16] // rgb 1.2895 + mov ecx, [esp + 32 + 20] // width 1.2896 + jmp wend 1.2897 +@@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_ 
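A note on the MSVC rows renamed here: they remain __declspec(naked) __asm blocks, so the compiler emits no prologue and the code addresses its arguments relative to esp by hand. After the initial pushad (eight 32-bit registers, 32 bytes) the first cdecl argument sits just past the saved registers and the return address, which is where the recurring [esp + 32 + 4], [esp + 32 + 8], ... operands come from. A sketch of that arithmetic only (not code from the patch):

    enum {
      kPushadBytes       = 8 * 4,   // pushad saves eax..edi
      kReturnAddressSize = 4,
      kFirstArgOffset    = kPushadBytes + kReturnAddressSize   // == 36 == 32 + 4
    };
    static_assert(kFirstArgOffset == 36, "first stack argument after pushad");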
1.2898 + jns wloop1 1.2899 + wdone : 1.2900 + popad 1.2901 + ret 1.2902 + } 1.2903 + } 1.2904 + 1.2905 + // This version does general purpose scaling by any amount, up or down. 1.2906 +-// The only thing it can not do it rotation by 90 or 270. 1.2907 +-// For performance the chroma is under sampled, reducing cost of a 3x 1.2908 ++// The only thing it cannot do is rotation by 90 or 270. 1.2909 ++// For performance the chroma is under-sampled, reducing cost of a 3x 1.2910 + // 1080p scale from 8.4 ms to 5.4 ms. 1.2911 + __declspec(naked) 1.2912 +-void ScaleYUVToRGB32Row(const uint8* y_buf, 1.2913 +- const uint8* u_buf, 1.2914 +- const uint8* v_buf, 1.2915 +- uint8* rgb_buf, 1.2916 +- int width, 1.2917 +- int source_dx) { 1.2918 ++void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, 1.2919 ++ const uint8* u_buf, 1.2920 ++ const uint8* v_buf, 1.2921 ++ uint8* rgb_buf, 1.2922 ++ int width, 1.2923 ++ int source_dx) { 1.2924 + __asm { 1.2925 + pushad 1.2926 + mov edx, [esp + 32 + 4] // Y 1.2927 + mov edi, [esp + 32 + 8] // U 1.2928 + mov esi, [esp + 32 + 12] // V 1.2929 + mov ebp, [esp + 32 + 16] // rgb 1.2930 + mov ecx, [esp + 32 + 20] // width 1.2931 + xor ebx, ebx // x 1.2932 +@@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b 1.2933 + 1.2934 + scaledone : 1.2935 + popad 1.2936 + ret 1.2937 + } 1.2938 + } 1.2939 + 1.2940 + __declspec(naked) 1.2941 +-void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.2942 +- const uint8* u_buf, 1.2943 +- const uint8* v_buf, 1.2944 +- uint8* rgb_buf, 1.2945 +- int width, 1.2946 +- int source_dx) { 1.2947 ++void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, 1.2948 ++ const uint8* u_buf, 1.2949 ++ const uint8* v_buf, 1.2950 ++ uint8* rgb_buf, 1.2951 ++ int width, 1.2952 ++ int source_dx) { 1.2953 + __asm { 1.2954 + pushad 1.2955 + mov edx, [esp + 32 + 4] // Y 1.2956 + mov edi, [esp + 32 + 8] // U 1.2957 + // [esp + 32 + 12] // V 1.2958 + mov ebp, [esp + 32 + 16] // rgb 1.2959 + mov ecx, [esp + 32 + 20] // width 1.2960 + imul ecx, [esp + 32 + 24] // source_dx 1.2961 +@@ -438,152 +439,60 @@ lscalelastpixel: 1.2962 + paddsw mm1, mm0 1.2963 + psraw mm1, 6 1.2964 + packuswb mm1, mm1 1.2965 + movd [ebp], mm1 1.2966 + popad 1.2967 + ret 1.2968 + }; 1.2969 + } 1.2970 +-#else // USE_MMX 1.2971 +- 1.2972 +-// C reference code that mimic the YUV assembly. 1.2973 +-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) 1.2974 +-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ 1.2975 +- (((x) + (y)) > 32767 ? 
32767 : ((x) + (y)))) 1.2976 +- 1.2977 +-static inline void YuvPixel(uint8 y, 1.2978 +- uint8 u, 1.2979 +- uint8 v, 1.2980 +- uint8* rgb_buf) { 1.2981 +- 1.2982 +- int b = kCoefficientsRgbY[256+u][0]; 1.2983 +- int g = kCoefficientsRgbY[256+u][1]; 1.2984 +- int r = kCoefficientsRgbY[256+u][2]; 1.2985 +- int a = kCoefficientsRgbY[256+u][3]; 1.2986 +- 1.2987 +- b = paddsw(b, kCoefficientsRgbY[512+v][0]); 1.2988 +- g = paddsw(g, kCoefficientsRgbY[512+v][1]); 1.2989 +- r = paddsw(r, kCoefficientsRgbY[512+v][2]); 1.2990 +- a = paddsw(a, kCoefficientsRgbY[512+v][3]); 1.2991 +- 1.2992 +- b = paddsw(b, kCoefficientsRgbY[y][0]); 1.2993 +- g = paddsw(g, kCoefficientsRgbY[y][1]); 1.2994 +- r = paddsw(r, kCoefficientsRgbY[y][2]); 1.2995 +- a = paddsw(a, kCoefficientsRgbY[y][3]); 1.2996 +- 1.2997 +- b >>= 6; 1.2998 +- g >>= 6; 1.2999 +- r >>= 6; 1.3000 +- a >>= 6; 1.3001 +- 1.3002 +- *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) | 1.3003 +- (packuswb(g) << 8) | 1.3004 +- (packuswb(r) << 16) | 1.3005 +- (packuswb(a) << 24); 1.3006 +-} 1.3007 +- 1.3008 +-#if TEST_MMX_YUV 1.3009 +-static inline void YuvPixel(uint8 y, 1.3010 +- uint8 u, 1.3011 +- uint8 v, 1.3012 +- uint8* rgb_buf) { 1.3013 +- 1.3014 +- __asm { 1.3015 +- movzx eax, u 1.3016 +- movq mm0, [kCoefficientsRgbY+2048 + 8 * eax] 1.3017 +- movzx eax, v 1.3018 +- paddsw mm0, [kCoefficientsRgbY+4096 + 8 * eax] 1.3019 +- movzx eax, y 1.3020 +- movq mm1, [kCoefficientsRgbY + 8 * eax] 1.3021 +- paddsw mm1, mm0 1.3022 +- psraw mm1, 6 1.3023 +- packuswb mm1, mm1 1.3024 +- mov eax, rgb_buf 1.3025 +- movd [eax], mm1 1.3026 +- emms 1.3027 +- } 1.3028 +-} 1.3029 +-#endif 1.3030 ++#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) 1.3031 + 1.3032 + void FastConvertYUVToRGB32Row(const uint8* y_buf, 1.3033 + const uint8* u_buf, 1.3034 + const uint8* v_buf, 1.3035 + uint8* rgb_buf, 1.3036 + int width) { 1.3037 +- for (int x = 0; x < width; x += 2) { 1.3038 +- uint8 u = u_buf[x >> 1]; 1.3039 +- uint8 v = v_buf[x >> 1]; 1.3040 +- uint8 y0 = y_buf[x]; 1.3041 +- YuvPixel(y0, u, v, rgb_buf); 1.3042 +- if ((x + 1) < width) { 1.3043 +- uint8 y1 = y_buf[x + 1]; 1.3044 +- YuvPixel(y1, u, v, rgb_buf + 4); 1.3045 +- } 1.3046 +- rgb_buf += 8; // Advance 2 pixels. 1.3047 +- } 1.3048 +-} 1.3049 +- 1.3050 +-// 16.16 fixed point is used. A shift by 16 isolates the integer. 1.3051 +-// A shift by 17 is used to further subsample the chrominence channels. 1.3052 +-// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits, 1.3053 +-// for 1/65536 pixel accurate interpolation. 
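The scaling rows removed and re-dispatched around here all step a 16.16 fixed-point source position: source_dx is the per-output-pixel step, x >> 16 selects the source luma column, x >> 17 the half-width chroma column, and 0x20000 is exactly 2.0 (which is why the linear scaler starts at x = 32768, i.e. 0.5, when shrinking by half or more). A small worked example; deriving source_dx from a source/destination width ratio here is an assumption for illustration, since the row functions only ever see the ready-made step:

    #include <cstdio>
    int main() {
      int source_width = 320, dest_width = 200;
      int source_dx = (source_width << 16) / dest_width;   // about 1.6 in 16.16 fixed point
      int x = 0;
      for (int i = 0; i < 4; ++i) {
        std::printf("dest pixel %d -> luma column %d, chroma column %d\n",
                    i, x >> 16, x >> 17);
        x += source_dx;
      }
      return 0;
    }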
1.3054 ++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) 1.3055 ++ if (mozilla::supports_sse()) { 1.3056 ++ FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); 1.3057 ++ return; 1.3058 ++ } 1.3059 ++#endif 1.3060 ++ 1.3061 ++ FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); 1.3062 ++} 1.3063 ++ 1.3064 + void ScaleYUVToRGB32Row(const uint8* y_buf, 1.3065 + const uint8* u_buf, 1.3066 + const uint8* v_buf, 1.3067 + uint8* rgb_buf, 1.3068 + int width, 1.3069 + int source_dx) { 1.3070 +- int x = 0; 1.3071 +- for (int i = 0; i < width; i += 2) { 1.3072 +- int y = y_buf[x >> 16]; 1.3073 +- int u = u_buf[(x >> 17)]; 1.3074 +- int v = v_buf[(x >> 17)]; 1.3075 +- YuvPixel(y, u, v, rgb_buf); 1.3076 +- x += source_dx; 1.3077 +- if ((i + 1) < width) { 1.3078 +- y = y_buf[x >> 16]; 1.3079 +- YuvPixel(y, u, v, rgb_buf+4); 1.3080 +- x += source_dx; 1.3081 +- } 1.3082 +- rgb_buf += 8; 1.3083 +- } 1.3084 +-} 1.3085 ++ 1.3086 ++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) 1.3087 ++ if (mozilla::supports_sse()) { 1.3088 ++ ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 1.3089 ++ return; 1.3090 ++ } 1.3091 ++#endif 1.3092 ++ 1.3093 ++ ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 1.3094 ++} 1.3095 + 1.3096 + void LinearScaleYUVToRGB32Row(const uint8* y_buf, 1.3097 + const uint8* u_buf, 1.3098 + const uint8* v_buf, 1.3099 + uint8* rgb_buf, 1.3100 + int width, 1.3101 + int source_dx) { 1.3102 +- int x = 0; 1.3103 +- if (source_dx >= 0x20000) { 1.3104 +- x = 32768; 1.3105 +- } 1.3106 +- for (int i = 0; i < width; i += 2) { 1.3107 +- int y0 = y_buf[x >> 16]; 1.3108 +- int y1 = y_buf[(x >> 16) + 1]; 1.3109 +- int u0 = u_buf[(x >> 17)]; 1.3110 +- int u1 = u_buf[(x >> 17) + 1]; 1.3111 +- int v0 = v_buf[(x >> 17)]; 1.3112 +- int v1 = v_buf[(x >> 17) + 1]; 1.3113 +- int y_frac = (x & 65535); 1.3114 +- int uv_frac = ((x >> 1) & 65535); 1.3115 +- int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; 1.3116 +- int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16; 1.3117 +- int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16; 1.3118 +- YuvPixel(y, u, v, rgb_buf); 1.3119 +- x += source_dx; 1.3120 +- if ((i + 1) < width) { 1.3121 +- y0 = y_buf[x >> 16]; 1.3122 +- y1 = y_buf[(x >> 16) + 1]; 1.3123 +- y_frac = (x & 65535); 1.3124 +- y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; 1.3125 +- YuvPixel(y, u, v, rgb_buf+4); 1.3126 +- x += source_dx; 1.3127 +- } 1.3128 +- rgb_buf += 8; 1.3129 +- } 1.3130 +-} 1.3131 +- 1.3132 +-#endif // USE_MMX 1.3133 +-} // extern "C" 1.3134 +- 1.3135 ++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) 1.3136 ++ if (mozilla::supports_sse()) { 1.3137 ++ LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, 1.3138 ++ source_dx); 1.3139 ++ return; 1.3140 ++ } 1.3141 ++#endif 1.3142 ++ 1.3143 ++ LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 1.3144 ++} 1.3145 ++ 1.3146 ++} // extern "C"
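To close, a usage sketch of the portable row these hunks dispatch to, showing the x_shift parameter the wrappers thread through: the wrappers in these files pass x_shift = 1, which shares one Cb/Cr sample between each pair of output pixels, while x_shift = 0 reads a chroma sample per pixel. The declaration is repeated so the snippet stands alone; the sample values are arbitrary.

    typedef unsigned char uint8;
    extern "C" void FastConvertYUVToRGB32Row_C(const uint8* y_buf, const uint8* u_buf,
                                               const uint8* v_buf, uint8* rgb_buf,
                                               int width, unsigned int x_shift);

    void ConvertOneRowSketch() {
      uint8 y[4]   = {16, 81, 145, 235};
      uint8 u[4]   = {128, 90, 54, 34};    // only u[0..1] are read when x_shift == 1
      uint8 v[4]   = {128, 240, 110, 202};
      uint8 rgb[4 * 4];                    // four output bytes per pixel
      FastConvertYUVToRGB32Row_C(y, u, v, rgb, 4, 1);  // shared chroma per pixel pair
      FastConvertYUVToRGB32Row_C(y, u, v, rgb, 4, 0);  // full-resolution chroma
    }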