gfx/ycbcr/convert.patch

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/ycbcr/convert.patch	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,3143 @@
     1.4 +diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
     1.5 +--- a/gfx/ycbcr/yuv_convert.cpp
     1.6 ++++ b/gfx/ycbcr/yuv_convert.cpp
     1.7 +@@ -6,145 +6,102 @@
     1.8 + // http://www.fourcc.org/yuv.php
     1.9 + // The actual conversion is best described here
    1.10 + // http://en.wikipedia.org/wiki/YUV
    1.11 + // An article on optimizing YUV conversion using tables instead of multiplies
    1.12 + // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
    1.13 + //
    1.14 + // YV12 is a full plane of Y and a half height, half width chroma planes
    1.15 + // YV16 is a full plane of Y and a full height, half width chroma planes
    1.16 ++// YV24 is a full plane of Y and a full height, full width chroma planes
    1.17 + //
    1.18 + // ARGB pixel format is output, which on little endian is stored as BGRA.
    1.19 + // The alpha is set to 255, allowing the application to use RGBA or RGB32.
    1.20 + 
    1.21 +-#include "media/base/yuv_convert.h"
    1.22 ++#include "yuv_convert.h"
    1.23 + 
    1.24 + // Header for low level row functions.
    1.25 +-#include "media/base/yuv_row.h"
    1.26 +-
    1.27 +-#if USE_MMX
    1.28 +-#if defined(_MSC_VER)
    1.29 +-#include <intrin.h>
    1.30 +-#else
    1.31 +-#include <mmintrin.h>
    1.32 +-#endif
    1.33 +-#endif
    1.34 +-
    1.35 +-#if USE_SSE2
    1.36 +-#include <emmintrin.h>
    1.37 +-#endif
    1.38 +-
    1.39 +-namespace media {
    1.40 +-
    1.41 ++#include "yuv_row.h"
    1.42 ++#include "mozilla/SSE.h"
    1.43 ++
    1.44 ++namespace mozilla {
    1.45 ++
    1.46 ++namespace gfx {
    1.47 ++ 
    1.48 + // 16.16 fixed point arithmetic
    1.49 + const int kFractionBits = 16;
    1.50 + const int kFractionMax = 1 << kFractionBits;
    1.51 + const int kFractionMask = ((1 << kFractionBits) - 1);
    1.52 + 
    1.53 + // Convert a frame of YUV to 32 bit ARGB.
    1.54 +-void ConvertYUVToRGB32(const uint8* y_buf,
    1.55 +-                       const uint8* u_buf,
    1.56 +-                       const uint8* v_buf,
    1.57 +-                       uint8* rgb_buf,
    1.58 +-                       int width,
    1.59 +-                       int height,
    1.60 +-                       int y_pitch,
    1.61 +-                       int uv_pitch,
    1.62 +-                       int rgb_pitch,
    1.63 +-                       YUVType yuv_type) {
    1.64 +-  unsigned int y_shift = yuv_type;
    1.65 +-  for (int y = 0; y < height; ++y) {
    1.66 +-    uint8* rgb_row = rgb_buf + y * rgb_pitch;
    1.67 +-    const uint8* y_ptr = y_buf + y * y_pitch;
    1.68 +-    const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch;
    1.69 +-    const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch;
    1.70 +-
    1.71 +-    FastConvertYUVToRGB32Row(y_ptr,
    1.72 +-                             u_ptr,
    1.73 +-                             v_ptr,
    1.74 +-                             rgb_row,
    1.75 +-                             width);
    1.76 +-  }
    1.77 ++NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
    1.78 ++                                  const uint8* u_buf,
    1.79 ++                                  const uint8* v_buf,
    1.80 ++                                  uint8* rgb_buf,
    1.81 ++                                  int pic_x,
    1.82 ++                                  int pic_y,
    1.83 ++                                  int pic_width,
    1.84 ++                                  int pic_height,
    1.85 ++                                  int y_pitch,
    1.86 ++                                  int uv_pitch,
    1.87 ++                                  int rgb_pitch,
    1.88 ++                                  YUVType yuv_type) {
    1.89 ++  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
    1.90 ++  unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
    1.91 ++  // Test for SSE because the optimized code uses movntq, which is not part of MMX.
    1.92 ++  bool has_sse = supports_mmx() && supports_sse();
    1.93 ++  // There is no optimized YV24 SSE routine so we check for this and
    1.94 ++  // fall back to the C code.
    1.95 ++  has_sse &= yuv_type != YV24;
    1.96 ++  bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0;
    1.97 ++  int x_width = odd_pic_x ? pic_width - 1 : pic_width;
    1.98 ++
    1.99 ++  for (int y = pic_y; y < pic_height + pic_y; ++y) {
   1.100 ++    uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch;
   1.101 ++    const uint8* y_ptr = y_buf + y * y_pitch + pic_x;
   1.102 ++    const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
   1.103 ++    const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
   1.104 ++
   1.105 ++    if (odd_pic_x) {
   1.106 ++      // Handle the single odd pixel manually and use the
    1.107 ++      // fast routines for the remaining pixels.
   1.108 ++      FastConvertYUVToRGB32Row_C(y_ptr++,
   1.109 ++                                 u_ptr++,
   1.110 ++                                 v_ptr++,
   1.111 ++                                 rgb_row,
   1.112 ++                                 1,
   1.113 ++                                 x_shift);
   1.114 ++      rgb_row += 4;
   1.115 ++    }
   1.116 ++
   1.117 ++    if (has_sse) {
   1.118 ++      FastConvertYUVToRGB32Row(y_ptr,
   1.119 ++                               u_ptr,
   1.120 ++                               v_ptr,
   1.121 ++                               rgb_row,
   1.122 ++                               x_width);
   1.123 ++    }
   1.124 ++    else {
   1.125 ++      FastConvertYUVToRGB32Row_C(y_ptr,
   1.126 ++                                 u_ptr,
   1.127 ++                                 v_ptr,
   1.128 ++                                 rgb_row,
   1.129 ++                                 x_width,
   1.130 ++                                 x_shift);
   1.131 ++    }
   1.132 ++  }
   1.133 + 
   1.134 +   // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
   1.135 +-  EMMS();
   1.136 +-}
   1.137 +-
   1.138 +-#if USE_SSE2
   1.139 +-// FilterRows combines two rows of the image using linear interpolation.
   1.140 +-// SSE2 version does 16 pixels at a time
   1.141 +-
   1.142 +-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
   1.143 +-                       int source_width, int source_y_fraction) {
   1.144 +-  __m128i zero = _mm_setzero_si128();
   1.145 +-  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
   1.146 +-  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
   1.147 +-
   1.148 +-  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
   1.149 +-  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
   1.150 +-  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
   1.151 +-  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
   1.152 +-
   1.153 +-  do {
   1.154 +-    __m128i y0 = _mm_loadu_si128(y0_ptr128);
   1.155 +-    __m128i y1 = _mm_loadu_si128(y1_ptr128);
   1.156 +-    __m128i y2 = _mm_unpackhi_epi8(y0, zero);
   1.157 +-    __m128i y3 = _mm_unpackhi_epi8(y1, zero);
   1.158 +-    y0 = _mm_unpacklo_epi8(y0, zero);
   1.159 +-    y1 = _mm_unpacklo_epi8(y1, zero);
   1.160 +-    y0 = _mm_mullo_epi16(y0, y0_fraction);
   1.161 +-    y1 = _mm_mullo_epi16(y1, y1_fraction);
   1.162 +-    y2 = _mm_mullo_epi16(y2, y0_fraction);
   1.163 +-    y3 = _mm_mullo_epi16(y3, y1_fraction);
   1.164 +-    y0 = _mm_add_epi16(y0, y1);
   1.165 +-    y2 = _mm_add_epi16(y2, y3);
   1.166 +-    y0 = _mm_srli_epi16(y0, 8);
   1.167 +-    y2 = _mm_srli_epi16(y2, 8);
   1.168 +-    y0 = _mm_packus_epi16(y0, y2);
   1.169 +-    *dest128++ = y0;
   1.170 +-    ++y0_ptr128;
   1.171 +-    ++y1_ptr128;
   1.172 +-  } while (dest128 < end128);
   1.173 +-}
   1.174 +-#elif USE_MMX
   1.175 +-// MMX version does 8 pixels at a time
   1.176 +-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
   1.177 +-                       int source_width, int source_y_fraction) {
   1.178 +-  __m64 zero = _mm_setzero_si64();
   1.179 +-  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
   1.180 +-  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
   1.181 +-
   1.182 +-  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
   1.183 +-  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
   1.184 +-  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
   1.185 +-  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
   1.186 +-
   1.187 +-  do {
   1.188 +-    __m64 y0 = *y0_ptr64++;
   1.189 +-    __m64 y1 = *y1_ptr64++;
   1.190 +-    __m64 y2 = _mm_unpackhi_pi8(y0, zero);
   1.191 +-    __m64 y3 = _mm_unpackhi_pi8(y1, zero);
   1.192 +-    y0 = _mm_unpacklo_pi8(y0, zero);
   1.193 +-    y1 = _mm_unpacklo_pi8(y1, zero);
   1.194 +-    y0 = _mm_mullo_pi16(y0, y0_fraction);
   1.195 +-    y1 = _mm_mullo_pi16(y1, y1_fraction);
   1.196 +-    y2 = _mm_mullo_pi16(y2, y0_fraction);
   1.197 +-    y3 = _mm_mullo_pi16(y3, y1_fraction);
   1.198 +-    y0 = _mm_add_pi16(y0, y1);
   1.199 +-    y2 = _mm_add_pi16(y2, y3);
   1.200 +-    y0 = _mm_srli_pi16(y0, 8);
   1.201 +-    y2 = _mm_srli_pi16(y2, 8);
   1.202 +-    y0 = _mm_packs_pu16(y0, y2);
   1.203 +-    *dest64++ = y0;
   1.204 +-  } while (dest64 < end64);
   1.205 +-}
   1.206 +-#else  // no MMX or SSE2
   1.207 ++  if (has_sse)
   1.208 ++    EMMS();
   1.209 ++}
   1.210 ++
   1.211 + // C version does 8 at a time to mimic MMX code
   1.212 +-static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
   1.213 +-                       int source_width, int source_y_fraction) {
   1.214 ++static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
   1.215 ++                         int source_width, int source_y_fraction) {
   1.216 +   int y1_fraction = source_y_fraction;
   1.217 +   int y0_fraction = 256 - y1_fraction;
   1.218 +   uint8* end = ybuf + source_width;
   1.219 +   do {
   1.220 +     ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
   1.221 +     ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
   1.222 +     ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
   1.223 +     ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
   1.224 +@@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons
   1.225 +     ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
   1.226 +     ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
   1.227 +     ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
   1.228 +     y0_ptr += 8;
   1.229 +     y1_ptr += 8;
   1.230 +     ybuf += 8;
   1.231 +   } while (ybuf < end);
   1.232 + }
   1.233 +-#endif
   1.234 ++
   1.235 ++#ifdef MOZILLA_MAY_SUPPORT_MMX
   1.236 ++void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
   1.237 ++                    int source_width, int source_y_fraction);
   1.238 ++#endif
   1.239 ++
   1.240 ++#ifdef MOZILLA_MAY_SUPPORT_SSE2
   1.241 ++void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
   1.242 ++                     int source_width, int source_y_fraction);
   1.243 ++#endif
   1.244 ++
   1.245 ++static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
   1.246 ++                              const uint8* y1_ptr, int source_width,
   1.247 ++                              int source_y_fraction) {
   1.248 ++#ifdef MOZILLA_MAY_SUPPORT_SSE2
   1.249 ++  if (mozilla::supports_sse2()) {
   1.250 ++    FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
   1.251 ++    return;
   1.252 ++  }
   1.253 ++#endif
   1.254 ++
   1.255 ++#ifdef MOZILLA_MAY_SUPPORT_MMX
   1.256 ++  if (mozilla::supports_mmx()) {
   1.257 ++    FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
   1.258 ++    return;
   1.259 ++  }
   1.260 ++#endif
   1.261 ++
   1.262 ++  FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
   1.263 ++}
   1.264 + 
   1.265 + 
   1.266 + // Scale a frame of YUV to 32 bit ARGB.
   1.267 +-void ScaleYUVToRGB32(const uint8* y_buf,
   1.268 +-                     const uint8* u_buf,
   1.269 +-                     const uint8* v_buf,
   1.270 +-                     uint8* rgb_buf,
   1.271 +-                     int source_width,
   1.272 +-                     int source_height,
   1.273 +-                     int width,
   1.274 +-                     int height,
   1.275 +-                     int y_pitch,
   1.276 +-                     int uv_pitch,
   1.277 +-                     int rgb_pitch,
   1.278 +-                     YUVType yuv_type,
   1.279 +-                     Rotate view_rotate,
   1.280 +-                     ScaleFilter filter) {
   1.281 ++NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,
   1.282 ++                                const uint8* u_buf,
   1.283 ++                                const uint8* v_buf,
   1.284 ++                                uint8* rgb_buf,
   1.285 ++                                int source_width,
   1.286 ++                                int source_height,
   1.287 ++                                int width,
   1.288 ++                                int height,
   1.289 ++                                int y_pitch,
   1.290 ++                                int uv_pitch,
   1.291 ++                                int rgb_pitch,
   1.292 ++                                YUVType yuv_type,
   1.293 ++                                Rotate view_rotate,
   1.294 ++                                ScaleFilter filter) {
   1.295 ++  bool has_mmx = supports_mmx();
   1.296 ++
   1.297 +   // 4096 allows 3 buffers to fit in 12k.
   1.298 +   // Helps performance on CPU with 16K L1 cache.
   1.299 +   // Large enough for 3830x2160 and 30" displays which are 2560x1600.
   1.300 +   const int kFilterBufferSize = 4096;
   1.301 +   // Disable filtering if the screen is too big (to avoid buffer overflows).
   1.302 +   // This should never happen to regular users: they don't have monitors
   1.303 +   // wider than 4096 pixels.
   1.304 +   // TODO(fbarchard): Allow rotated videos to filter.
   1.305 +   if (source_width > kFilterBufferSize || view_rotate)
   1.306 +     filter = FILTER_NONE;
   1.307 + 
   1.308 +-  unsigned int y_shift = yuv_type;
   1.309 ++  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
   1.310 +   // Diagram showing origin and direction of source sampling.
   1.311 +   // ->0   4<-
   1.312 +   // 7       3
   1.313 +   //
   1.314 +   // 6       5
   1.315 +   // ->1   2<-
   1.316 +   // Rotations that start at right side of image.
   1.317 +   if ((view_rotate == ROTATE_180) ||
   1.318 +@@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
   1.319 +     int source_uv_fraction =
   1.320 +         ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
   1.321 + 
   1.322 +     const uint8* y_ptr = y0_ptr;
   1.323 +     const uint8* u_ptr = u0_ptr;
   1.324 +     const uint8* v_ptr = v0_ptr;
   1.325 +     // Apply vertical filtering if necessary.
   1.326 +     // TODO(fbarchard): Remove memcpy when not necessary.
   1.327 +-    if (filter & media::FILTER_BILINEAR_V) {
   1.328 ++    if (filter & mozilla::gfx::FILTER_BILINEAR_V) {
   1.329 +       if (yscale_fixed != kFractionMax &&
   1.330 +           source_y_fraction && ((source_y + 1) < source_height)) {
   1.331 +         FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
   1.332 +       } else {
   1.333 +         memcpy(ybuf, y0_ptr, source_width);
   1.334 +       }
   1.335 +       y_ptr = ybuf;
   1.336 +       ybuf[source_width] = ybuf[source_width-1];
   1.337 +@@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf,
   1.338 +       u_ptr = ubuf;
   1.339 +       v_ptr = vbuf;
   1.340 +       ubuf[uv_source_width] = ubuf[uv_source_width - 1];
   1.341 +       vbuf[uv_source_width] = vbuf[uv_source_width - 1];
   1.342 +     }
   1.343 +     if (source_dx == kFractionMax) {  // Not scaled
   1.344 +       FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
   1.345 +                                dest_pixel, width);
   1.346 +-    } else {
   1.347 +-      if (filter & FILTER_BILINEAR_H) {
   1.348 ++    } else if (filter & FILTER_BILINEAR_H) {
   1.349 +         LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
   1.350 +                                  dest_pixel, width, source_dx);
   1.351 +     } else {
   1.352 + // Specialized scalers and rotation.
   1.353 +-#if USE_MMX && defined(_MSC_VER)
   1.354 ++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86)
   1.355 ++      if(mozilla::supports_sse()) {
   1.356 +         if (width == (source_width * 2)) {
   1.357 +-          DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
   1.358 +-                              dest_pixel, width);
   1.359 ++          DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
   1.360 ++                                  dest_pixel, width);
   1.361 +         } else if ((source_dx & kFractionMask) == 0) {
   1.362 +           // Scaling by integer scale factor. ie half.
   1.363 +-          ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
   1.364 +-                               dest_pixel, width,
   1.365 +-                               source_dx >> kFractionBits);
   1.366 ++          ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
   1.367 ++                                   dest_pixel, width,
   1.368 ++                                   source_dx >> kFractionBits);
   1.369 +         } else if (source_dx_uv == source_dx) {  // Not rotated.
   1.370 +           ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
   1.371 +                              dest_pixel, width, source_dx);
   1.372 +         } else {
   1.373 +-          RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
   1.374 +-                                     dest_pixel, width,
   1.375 +-                                     source_dx >> kFractionBits,
   1.376 +-                                     source_dx_uv >> kFractionBits);
   1.377 ++          RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
   1.378 ++                                         dest_pixel, width,
   1.379 ++                                         source_dx >> kFractionBits,
   1.380 ++                                         source_dx_uv >> kFractionBits);
   1.381 +         }
   1.382 ++      }
   1.383 ++      else {
   1.384 ++        ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
   1.385 ++                             dest_pixel, width, source_dx);
   1.386 ++      }
   1.387 + #else
   1.388 +-        ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
   1.389 +-                           dest_pixel, width, source_dx);
   1.390 +-#endif
   1.391 +-      }
   1.392 ++      ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
   1.393 ++                         dest_pixel, width, source_dx);
   1.394 ++#endif
   1.395 +     }
   1.396 +   }
   1.397 +   // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
   1.398 +-  EMMS();
   1.399 +-}
   1.400 +-
   1.401 +-}  // namespace media
   1.402 ++  if (has_mmx)
   1.403 ++    EMMS();
   1.404 ++}
   1.405 ++
   1.406 ++}  // namespace gfx
   1.407 ++}  // namespace mozilla
   1.408 +diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
   1.409 +--- a/gfx/ycbcr/yuv_convert.h
   1.410 ++++ b/gfx/ycbcr/yuv_convert.h
   1.411 +@@ -1,72 +1,79 @@
   1.412 + // Copyright (c) 2010 The Chromium Authors. All rights reserved.
   1.413 + // Use of this source code is governed by a BSD-style license that can be
   1.414 + // found in the LICENSE file.
   1.415 + 
   1.416 + #ifndef MEDIA_BASE_YUV_CONVERT_H_
   1.417 + #define MEDIA_BASE_YUV_CONVERT_H_
   1.418 + 
   1.419 +-#include "base/basictypes.h"
   1.420 +-
   1.421 +-namespace media {
   1.422 +-
   1.423 ++#include "chromium_types.h"
   1.424 ++#include "gfxCore.h"
   1.425 ++
   1.426 ++namespace mozilla {
   1.427 ++
   1.428 ++namespace gfx {
   1.429 ++ 
   1.430 + // Type of YUV surface.
   1.431 + // The value of these enums matter as they are used to shift vertical indices.
   1.432 + enum YUVType {
   1.433 +-  YV16 = 0,           // YV16 is half width and full height chroma channels.
   1.434 +-  YV12 = 1,           // YV12 is half width and half height chroma channels.
   1.435 ++  YV12 = 0,           // YV12 is half width and half height chroma channels.
   1.436 ++  YV16 = 1,           // YV16 is half width and full height chroma channels.
   1.437 ++  YV24 = 2            // YV24 is full width and full height chroma channels.
   1.438 + };
   1.439 + 
   1.440 + // Mirror means flip the image horizontally, as in looking in a mirror.
   1.441 + // Rotate happens after mirroring.
   1.442 + enum Rotate {
   1.443 +   ROTATE_0,           // Rotation off.
   1.444 +   ROTATE_90,          // Rotate clockwise.
   1.445 +   ROTATE_180,         // Rotate upside down.
   1.446 +   ROTATE_270,         // Rotate counter clockwise.
   1.447 +   MIRROR_ROTATE_0,    // Mirror horizontally.
   1.448 +   MIRROR_ROTATE_90,   // Mirror then Rotate clockwise.
   1.449 +   MIRROR_ROTATE_180,  // Mirror vertically.
   1.450 +-  MIRROR_ROTATE_270,  // Transpose.
   1.451 ++  MIRROR_ROTATE_270   // Transpose.
   1.452 + };
   1.453 + 
   1.454 + // Filter affects how scaling looks.
   1.455 + enum ScaleFilter {
   1.456 +   FILTER_NONE = 0,        // No filter (point sampled).
   1.457 +   FILTER_BILINEAR_H = 1,  // Bilinear horizontal filter.
   1.458 +   FILTER_BILINEAR_V = 2,  // Bilinear vertical filter.
   1.459 +-  FILTER_BILINEAR = 3,    // Bilinear filter.
   1.460 ++  FILTER_BILINEAR = 3     // Bilinear filter.
   1.461 + };
   1.462 + 
   1.463 + // Convert a frame of YUV to 32 bit ARGB.
   1.464 + // Pass in YV16/YV12 depending on source format
   1.465 +-void ConvertYUVToRGB32(const uint8* yplane,
   1.466 +-                       const uint8* uplane,
   1.467 +-                       const uint8* vplane,
   1.468 +-                       uint8* rgbframe,
   1.469 +-                       int width,
   1.470 +-                       int height,
   1.471 +-                       int ystride,
   1.472 +-                       int uvstride,
   1.473 +-                       int rgbstride,
   1.474 +-                       YUVType yuv_type);
   1.475 ++NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
   1.476 ++                                  const uint8* uplane,
   1.477 ++                                  const uint8* vplane,
   1.478 ++                                  uint8* rgbframe,
   1.479 ++                                  int pic_x,
   1.480 ++                                  int pic_y,
   1.481 ++                                  int pic_width,
   1.482 ++                                  int pic_height,
   1.483 ++                                  int ystride,
   1.484 ++                                  int uvstride,
   1.485 ++                                  int rgbstride,
   1.486 ++                                  YUVType yuv_type);
   1.487 + 
   1.488 + // Scale a frame of YUV to 32 bit ARGB.
   1.489 + // Supports rotation and mirroring.
   1.490 +-void ScaleYUVToRGB32(const uint8* yplane,
   1.491 +-                     const uint8* uplane,
   1.492 +-                     const uint8* vplane,
   1.493 +-                     uint8* rgbframe,
   1.494 +-                     int source_width,
   1.495 +-                     int source_height,
   1.496 +-                     int width,
   1.497 +-                     int height,
   1.498 +-                     int ystride,
   1.499 +-                     int uvstride,
   1.500 +-                     int rgbstride,
   1.501 +-                     YUVType yuv_type,
   1.502 +-                     Rotate view_rotate,
   1.503 +-                     ScaleFilter filter);
   1.504 +-
   1.505 +-}  // namespace media
   1.506 +-
   1.507 ++NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane,
   1.508 ++                                const uint8* uplane,
   1.509 ++                                const uint8* vplane,
   1.510 ++                                uint8* rgbframe,
   1.511 ++                                int source_width,
   1.512 ++                                int source_height,
   1.513 ++                                int width,
   1.514 ++                                int height,
   1.515 ++                                int ystride,
   1.516 ++                                int uvstride,
   1.517 ++                                int rgbstride,
   1.518 ++                                YUVType yuv_type,
   1.519 ++                                Rotate view_rotate,
   1.520 ++                                ScaleFilter filter);
   1.521 ++
   1.522 ++}  // namespace gfx
   1.523 ++}  // namespace mozilla
   1.524 ++ 
   1.525 + #endif  // MEDIA_BASE_YUV_CONVERT_H_
   1.526 +diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp
   1.527 +new file mode 100644
   1.528 +--- /dev/null
   1.529 ++++ b/gfx/ycbcr/yuv_convert_mmx.cpp
   1.530 +@@ -0,0 +1,45 @@
   1.531 ++// Copyright (c) 2010 The Chromium Authors. All rights reserved.
   1.532 ++// Use of this source code is governed by a BSD-style license that can be
   1.533 ++// found in the LICENSE file.
   1.534 ++
   1.535 ++#include <mmintrin.h>
   1.536 ++#include "yuv_row.h"
   1.537 ++
   1.538 ++namespace mozilla {
   1.539 ++namespace gfx {
   1.540 ++
   1.541 ++// FilterRows combines two rows of the image using linear interpolation.
   1.542 ++// MMX version does 8 pixels at a time.
   1.543 ++void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
   1.544 ++                    int source_width, int source_y_fraction) {
   1.545 ++  __m64 zero = _mm_setzero_si64();
   1.546 ++  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
   1.547 ++  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
   1.548 ++
   1.549 ++  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
   1.550 ++  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
   1.551 ++  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
   1.552 ++  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
   1.553 ++
   1.554 ++  do {
   1.555 ++    __m64 y0 = *y0_ptr64++;
   1.556 ++    __m64 y1 = *y1_ptr64++;
   1.557 ++    __m64 y2 = _mm_unpackhi_pi8(y0, zero);
   1.558 ++    __m64 y3 = _mm_unpackhi_pi8(y1, zero);
   1.559 ++    y0 = _mm_unpacklo_pi8(y0, zero);
   1.560 ++    y1 = _mm_unpacklo_pi8(y1, zero);
   1.561 ++    y0 = _mm_mullo_pi16(y0, y0_fraction);
   1.562 ++    y1 = _mm_mullo_pi16(y1, y1_fraction);
   1.563 ++    y2 = _mm_mullo_pi16(y2, y0_fraction);
   1.564 ++    y3 = _mm_mullo_pi16(y3, y1_fraction);
   1.565 ++    y0 = _mm_add_pi16(y0, y1);
   1.566 ++    y2 = _mm_add_pi16(y2, y3);
   1.567 ++    y0 = _mm_srli_pi16(y0, 8);
   1.568 ++    y2 = _mm_srli_pi16(y2, 8);
   1.569 ++    y0 = _mm_packs_pu16(y0, y2);
   1.570 ++    *dest64++ = y0;
   1.571 ++  } while (dest64 < end64);
   1.572 ++}
   1.573 ++
   1.574 ++}
   1.575 ++}
   1.576 +diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp
   1.577 +new file mode 100644
   1.578 +--- /dev/null
   1.579 ++++ b/gfx/ycbcr/yuv_convert_sse2.cpp
   1.580 +@@ -0,0 +1,47 @@
   1.581 ++// Copyright (c) 2010 The Chromium Authors. All rights reserved.
   1.582 ++// Use of this source code is governed by a BSD-style license that can be
   1.583 ++// found in the LICENSE file.
   1.584 ++
   1.585 ++#include <emmintrin.h>
   1.586 ++#include "yuv_row.h"
   1.587 ++
   1.588 ++namespace mozilla {
   1.589 ++namespace gfx {
   1.590 ++
   1.591 ++// FilterRows combines two rows of the image using linear interpolation.
   1.592 ++// SSE2 version does 16 pixels at a time.
   1.593 ++void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
   1.594 ++                     int source_width, int source_y_fraction) {
   1.595 ++  __m128i zero = _mm_setzero_si128();
   1.596 ++  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
   1.597 ++  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
   1.598 ++
   1.599 ++  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
   1.600 ++  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
   1.601 ++  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
   1.602 ++  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
   1.603 ++
   1.604 ++  do {
   1.605 ++    __m128i y0 = _mm_loadu_si128(y0_ptr128);
   1.606 ++    __m128i y1 = _mm_loadu_si128(y1_ptr128);
   1.607 ++    __m128i y2 = _mm_unpackhi_epi8(y0, zero);
   1.608 ++    __m128i y3 = _mm_unpackhi_epi8(y1, zero);
   1.609 ++    y0 = _mm_unpacklo_epi8(y0, zero);
   1.610 ++    y1 = _mm_unpacklo_epi8(y1, zero);
   1.611 ++    y0 = _mm_mullo_epi16(y0, y0_fraction);
   1.612 ++    y1 = _mm_mullo_epi16(y1, y1_fraction);
   1.613 ++    y2 = _mm_mullo_epi16(y2, y0_fraction);
   1.614 ++    y3 = _mm_mullo_epi16(y3, y1_fraction);
   1.615 ++    y0 = _mm_add_epi16(y0, y1);
   1.616 ++    y2 = _mm_add_epi16(y2, y3);
   1.617 ++    y0 = _mm_srli_epi16(y0, 8);
   1.618 ++    y2 = _mm_srli_epi16(y2, 8);
   1.619 ++    y0 = _mm_packus_epi16(y0, y2);
   1.620 ++    *dest128++ = y0;
   1.621 ++    ++y0_ptr128;
   1.622 ++    ++y1_ptr128;
   1.623 ++  } while (dest128 < end128);
   1.624 ++}
   1.625 ++
   1.626 ++}
   1.627 ++}
   1.628 +diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
   1.629 +--- a/gfx/ycbcr/yuv_row.h
   1.630 ++++ b/gfx/ycbcr/yuv_row.h
   1.631 +@@ -5,109 +5,133 @@
   1.632 + // yuv_row internal functions to handle YUV conversion and scaling to RGB.
   1.633 + // These functions are used from both yuv_convert.cc and yuv_scale.cc.
   1.634 + 
   1.635 + // TODO(fbarchard): Write function that can handle rotation and scaling.
   1.636 + 
   1.637 + #ifndef MEDIA_BASE_YUV_ROW_H_
   1.638 + #define MEDIA_BASE_YUV_ROW_H_
   1.639 + 
   1.640 +-#include "base/basictypes.h"
   1.641 ++#include "chromium_types.h"
   1.642 + 
   1.643 + extern "C" {
   1.644 + // Can only do 1x.
   1.645 + // This is the second fastest of the scalers.
   1.646 + void FastConvertYUVToRGB32Row(const uint8* y_buf,
   1.647 +                               const uint8* u_buf,
   1.648 +                               const uint8* v_buf,
   1.649 +                               uint8* rgb_buf,
   1.650 +                               int width);
   1.651 + 
   1.652 +-// Can do 1x, half size or any scale down by an integer amount.
   1.653 +-// Step can be negative (mirroring, rotate 180).
   1.654 +-// This is the third fastest of the scalers.
   1.655 +-void ConvertYUVToRGB32Row(const uint8* y_buf,
   1.656 +-                          const uint8* u_buf,
   1.657 +-                          const uint8* v_buf,
   1.658 +-                          uint8* rgb_buf,
   1.659 +-                          int width,
   1.660 +-                          int step);
   1.661 +-
   1.662 +-// Rotate is like Convert, but applies different step to Y versus U and V.
   1.663 +-// This allows rotation by 90 or 270, by stepping by stride.
   1.664 +-// This is the forth fastest of the scalers.
   1.665 +-void RotateConvertYUVToRGB32Row(const uint8* y_buf,
   1.666 ++void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
   1.667 +                                 const uint8* u_buf,
   1.668 +                                 const uint8* v_buf,
   1.669 +                                 uint8* rgb_buf,
   1.670 +                                 int width,
   1.671 +-                                int ystep,
   1.672 +-                                int uvstep);
   1.673 ++                                unsigned int x_shift);
   1.674 ++
   1.675 ++void FastConvertYUVToRGB32Row(const uint8* y_buf,
   1.676 ++                              const uint8* u_buf,
   1.677 ++                              const uint8* v_buf,
   1.678 ++                              uint8* rgb_buf,
   1.679 ++                              int width);
   1.680 ++
   1.681 ++// Can do 1x, half size or any scale down by an integer amount.
   1.682 ++// Step can be negative (mirroring, rotate 180).
   1.683 ++// This is the third fastest of the scalers.
   1.684 ++// Only defined on Windows x86-32.
   1.685 ++void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
   1.686 ++                              const uint8* u_buf,
   1.687 ++                              const uint8* v_buf,
   1.688 ++                              uint8* rgb_buf,
   1.689 ++                              int width,
   1.690 ++                              int step);
   1.691 ++
   1.692 ++// Rotate is like Convert, but applies different step to Y versus U and V.
   1.693 ++// This allows rotation by 90 or 270, by stepping by stride.
   1.694 ++// This is the fourth fastest of the scalers.
   1.695 ++// Only defined on Windows x86-32.
   1.696 ++void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
   1.697 ++                                    const uint8* u_buf,
   1.698 ++                                    const uint8* v_buf,
   1.699 ++                                    uint8* rgb_buf,
   1.700 ++                                    int width,
   1.701 ++                                    int ystep,
   1.702 ++                                    int uvstep);
   1.703 + 
   1.704 + // Doubler does 4 pixels at a time.  Each pixel is replicated.
   1.705 + // This is the fastest of the scalers.
   1.706 +-void DoubleYUVToRGB32Row(const uint8* y_buf,
   1.707 +-                         const uint8* u_buf,
   1.708 +-                         const uint8* v_buf,
   1.709 +-                         uint8* rgb_buf,
   1.710 +-                         int width);
   1.711 ++// Only defined on Windows x86-32.
   1.712 ++void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
   1.713 ++                             const uint8* u_buf,
   1.714 ++                             const uint8* v_buf,
   1.715 ++                             uint8* rgb_buf,
   1.716 ++                             int width);
   1.717 + 
   1.718 + // Handles arbitrary scaling up or down.
   1.719 + // Mirroring is supported, but not 90 or 270 degree rotation.
   1.720 + // Chroma is under sampled every 2 pixels for performance.
   1.721 + void ScaleYUVToRGB32Row(const uint8* y_buf,
   1.722 +                         const uint8* u_buf,
   1.723 +                         const uint8* v_buf,
   1.724 +                         uint8* rgb_buf,
   1.725 +                         int width,
   1.726 +                         int source_dx);
   1.727 + 
   1.728 ++void ScaleYUVToRGB32Row(const uint8* y_buf,
   1.729 ++                        const uint8* u_buf,
   1.730 ++                        const uint8* v_buf,
   1.731 ++                        uint8* rgb_buf,
   1.732 ++                        int width,
   1.733 ++                        int source_dx);
   1.734 ++
   1.735 ++void ScaleYUVToRGB32Row_C(const uint8* y_buf,
   1.736 ++                          const uint8* u_buf,
   1.737 ++                          const uint8* v_buf,
   1.738 ++                          uint8* rgb_buf,
   1.739 ++                          int width,
   1.740 ++                          int source_dx);
   1.741 ++
   1.742 + // Handles arbitrary scaling up or down with bilinear filtering.
   1.743 + // Mirroring is supported, but not 90 or 270 degree rotation.
   1.744 + // Chroma is under sampled every 2 pixels for performance.
   1.745 + // This is the slowest of the scalers.
   1.746 + void LinearScaleYUVToRGB32Row(const uint8* y_buf,
   1.747 +                               const uint8* u_buf,
   1.748 +                               const uint8* v_buf,
   1.749 +                               uint8* rgb_buf,
   1.750 +                               int width,
   1.751 +                               int source_dx);
   1.752 + 
   1.753 ++void LinearScaleYUVToRGB32Row(const uint8* y_buf,
   1.754 ++                              const uint8* u_buf,
   1.755 ++                              const uint8* v_buf,
   1.756 ++                              uint8* rgb_buf,
   1.757 ++                              int width,
   1.758 ++                              int source_dx);
   1.759 ++
   1.760 ++void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
   1.761 ++                                const uint8* u_buf,
   1.762 ++                                const uint8* v_buf,
   1.763 ++                                uint8* rgb_buf,
   1.764 ++                                int width,
   1.765 ++                                int source_dx);
   1.766 ++
   1.767 ++
   1.768 + #if defined(_MSC_VER)
   1.769 + #define SIMD_ALIGNED(var) __declspec(align(16)) var
   1.770 + #else
   1.771 + #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
   1.772 + #endif
   1.773 + extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]);
   1.774 + 
   1.775 +-// Method to force C version.
   1.776 +-//#define USE_MMX 0
   1.777 +-//#define USE_SSE2 0
   1.778 +-
   1.779 +-#if !defined(USE_MMX)
   1.780 +-// Windows, Mac and Linux/BSD use MMX
   1.781 +-#if defined(__MMX__) || defined(_MSC_VER)
   1.782 +-#define USE_MMX 1
   1.783 +-#else
   1.784 +-#define USE_MMX 0
   1.785 +-#endif
   1.786 +-#endif
   1.787 +-
   1.788 +-#if !defined(USE_SSE2)
   1.789 +-#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
   1.790 +-#define USE_SSE2 1
   1.791 +-#else
   1.792 +-#define USE_SSE2 0
   1.793 +-#endif
   1.794 +-#endif
   1.795 +-
   1.796 + // x64 uses MMX2 (SSE) so emms is not required.
   1.797 + // Warning C4799: function has no EMMS instruction.
   1.798 + // EMMS() is slow and should be called by the calling function once per image.
   1.799 +-#if USE_MMX && !defined(ARCH_CPU_X86_64)
   1.800 ++#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
   1.801 + #if defined(_MSC_VER)
   1.802 + #define EMMS() __asm emms
   1.803 + #pragma warning(disable: 4799)
   1.804 + #else
   1.805 + #define EMMS() asm("emms")
   1.806 + #endif
   1.807 + #else
   1.808 + #define EMMS()
   1.809 +diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
   1.810 +--- a/gfx/ycbcr/yuv_row_c.cpp
   1.811 ++++ b/gfx/ycbcr/yuv_row_c.cpp
   1.812 +@@ -1,812 +1,18 @@
   1.813 + // Copyright (c) 2010 The Chromium Authors. All rights reserved.
   1.814 + // Use of this source code is governed by a BSD-style license that can be
   1.815 + // found in the LICENSE file.
   1.816 + 
   1.817 +-#include "media/base/yuv_row.h"
   1.818 +-
   1.819 +-#ifdef _DEBUG
   1.820 +-#include "base/logging.h"
   1.821 +-#else
   1.822 ++#include "yuv_row.h"
   1.823 ++
   1.824 + #define DCHECK(a)
   1.825 +-#endif
   1.826 + 
   1.827 + extern "C" {
   1.828 + 
   1.829 +-#if USE_SSE2 && defined(ARCH_CPU_X86_64)
   1.830 +-
   1.831 +-// AMD64 ABI uses register paremters.
   1.832 +-void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
   1.833 +-                              const uint8* u_buf,  // rsi
   1.834 +-                              const uint8* v_buf,  // rdx
   1.835 +-                              uint8* rgb_buf,      // rcx
   1.836 +-                              int width) {         // r8
   1.837 +-  asm(
   1.838 +-  "jmp    convertend\n"
   1.839 +-"convertloop:"
   1.840 +-  "movzb  (%1),%%r10\n"
   1.841 +-  "add    $0x1,%1\n"
   1.842 +-  "movzb  (%2),%%r11\n"
   1.843 +-  "add    $0x1,%2\n"
   1.844 +-  "movq   2048(%5,%%r10,8),%%xmm0\n"
   1.845 +-  "movzb  (%0),%%r10\n"
   1.846 +-  "movq   4096(%5,%%r11,8),%%xmm1\n"
   1.847 +-  "movzb  0x1(%0),%%r11\n"
   1.848 +-  "paddsw %%xmm1,%%xmm0\n"
   1.849 +-  "movq   (%5,%%r10,8),%%xmm2\n"
   1.850 +-  "add    $0x2,%0\n"
   1.851 +-  "movq   (%5,%%r11,8),%%xmm3\n"
   1.852 +-  "paddsw %%xmm0,%%xmm2\n"
   1.853 +-  "paddsw %%xmm0,%%xmm3\n"
   1.854 +-  "shufps $0x44,%%xmm3,%%xmm2\n"
   1.855 +-  "psraw  $0x6,%%xmm2\n"
   1.856 +-  "packuswb %%xmm2,%%xmm2\n"
   1.857 +-  "movq   %%xmm2,0x0(%3)\n"
   1.858 +-  "add    $0x8,%3\n"
   1.859 +-"convertend:"
   1.860 +-  "sub    $0x2,%4\n"
   1.861 +-  "jns    convertloop\n"
   1.862 +-
   1.863 +-"convertnext:"
   1.864 +-  "add    $0x1,%4\n"
   1.865 +-  "js     convertdone\n"
   1.866 +-
   1.867 +-  "movzb  (%1),%%r10\n"
   1.868 +-  "movq   2048(%5,%%r10,8),%%xmm0\n"
   1.869 +-  "movzb  (%2),%%r10\n"
   1.870 +-  "movq   4096(%5,%%r10,8),%%xmm1\n"
   1.871 +-  "paddsw %%xmm1,%%xmm0\n"
   1.872 +-  "movzb  (%0),%%r10\n"
   1.873 +-  "movq   (%5,%%r10,8),%%xmm1\n"
   1.874 +-  "paddsw %%xmm0,%%xmm1\n"
   1.875 +-  "psraw  $0x6,%%xmm1\n"
   1.876 +-  "packuswb %%xmm1,%%xmm1\n"
   1.877 +-  "movd   %%xmm1,0x0(%3)\n"
   1.878 +-"convertdone:"
   1.879 +-  :
   1.880 +-  : "r"(y_buf),  // %0
   1.881 +-    "r"(u_buf),  // %1
   1.882 +-    "r"(v_buf),  // %2
   1.883 +-    "r"(rgb_buf),  // %3
   1.884 +-    "r"(width),  // %4
   1.885 +-    "r" (kCoefficientsRgbY)  // %5
   1.886 +-  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
   1.887 +-);
   1.888 +-}
   1.889 +-
   1.890 +-void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi
   1.891 +-                        const uint8* u_buf,  // rsi
   1.892 +-                        const uint8* v_buf,  // rdx
   1.893 +-                        uint8* rgb_buf,      // rcx
   1.894 +-                        int width,           // r8
   1.895 +-                        int source_dx) {     // r9
   1.896 +-  asm(
   1.897 +-  "xor    %%r11,%%r11\n"
   1.898 +-  "sub    $0x2,%4\n"
   1.899 +-  "js     scalenext\n"
   1.900 +-
   1.901 +-"scaleloop:"
   1.902 +-  "mov    %%r11,%%r10\n"
   1.903 +-  "sar    $0x11,%%r10\n"
   1.904 +-  "movzb  (%1,%%r10,1),%%rax\n"
   1.905 +-  "movq   2048(%5,%%rax,8),%%xmm0\n"
   1.906 +-  "movzb  (%2,%%r10,1),%%rax\n"
   1.907 +-  "movq   4096(%5,%%rax,8),%%xmm1\n"
   1.908 +-  "lea    (%%r11,%6),%%r10\n"
   1.909 +-  "sar    $0x10,%%r11\n"
   1.910 +-  "movzb  (%0,%%r11,1),%%rax\n"
   1.911 +-  "paddsw %%xmm1,%%xmm0\n"
   1.912 +-  "movq   (%5,%%rax,8),%%xmm1\n"
   1.913 +-  "lea    (%%r10,%6),%%r11\n"
   1.914 +-  "sar    $0x10,%%r10\n"
   1.915 +-  "movzb  (%0,%%r10,1),%%rax\n"
   1.916 +-  "movq   (%5,%%rax,8),%%xmm2\n"
   1.917 +-  "paddsw %%xmm0,%%xmm1\n"
   1.918 +-  "paddsw %%xmm0,%%xmm2\n"
   1.919 +-  "shufps $0x44,%%xmm2,%%xmm1\n"
   1.920 +-  "psraw  $0x6,%%xmm1\n"
   1.921 +-  "packuswb %%xmm1,%%xmm1\n"
   1.922 +-  "movq   %%xmm1,0x0(%3)\n"
   1.923 +-  "add    $0x8,%3\n"
   1.924 +-  "sub    $0x2,%4\n"
   1.925 +-  "jns    scaleloop\n"
   1.926 +-
   1.927 +-"scalenext:"
   1.928 +-  "add    $0x1,%4\n"
   1.929 +-  "js     scaledone\n"
   1.930 +-
   1.931 +-  "mov    %%r11,%%r10\n"
   1.932 +-  "sar    $0x11,%%r10\n"
   1.933 +-  "movzb  (%1,%%r10,1),%%rax\n"
   1.934 +-  "movq   2048(%5,%%rax,8),%%xmm0\n"
   1.935 +-  "movzb  (%2,%%r10,1),%%rax\n"
   1.936 +-  "movq   4096(%5,%%rax,8),%%xmm1\n"
   1.937 +-  "paddsw %%xmm1,%%xmm0\n"
   1.938 +-  "sar    $0x10,%%r11\n"
   1.939 +-  "movzb  (%0,%%r11,1),%%rax\n"
   1.940 +-  "movq   (%5,%%rax,8),%%xmm1\n"
   1.941 +-  "paddsw %%xmm0,%%xmm1\n"
   1.942 +-  "psraw  $0x6,%%xmm1\n"
   1.943 +-  "packuswb %%xmm1,%%xmm1\n"
   1.944 +-  "movd   %%xmm1,0x0(%3)\n"
   1.945 +-
   1.946 +-"scaledone:"
   1.947 +-  :
   1.948 +-  : "r"(y_buf),  // %0
   1.949 +-    "r"(u_buf),  // %1
   1.950 +-    "r"(v_buf),  // %2
   1.951 +-    "r"(rgb_buf),  // %3
   1.952 +-    "r"(width),  // %4
   1.953 +-    "r" (kCoefficientsRgbY),  // %5
   1.954 +-    "r"(static_cast<long>(source_dx))  // %6
   1.955 +-  : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
   1.956 +-);
   1.957 +-}
   1.958 +-
   1.959 +-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
   1.960 +-                              const uint8* u_buf,
   1.961 +-                              const uint8* v_buf,
   1.962 +-                              uint8* rgb_buf,
   1.963 +-                              int width,
   1.964 +-                              int source_dx) {
   1.965 +-  asm(
   1.966 +-  "xor    %%r11,%%r11\n"   // x = 0
   1.967 +-  "sub    $0x2,%4\n"
   1.968 +-  "js     .lscalenext\n"
   1.969 +-  "cmp    $0x20000,%6\n"   // if source_dx >= 2.0
   1.970 +-  "jl     .lscalehalf\n"
   1.971 +-  "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
   1.972 +-".lscalehalf:"
   1.973 +-
   1.974 +-".lscaleloop:"
   1.975 +-  "mov    %%r11,%%r10\n"
   1.976 +-  "sar    $0x11,%%r10\n"
   1.977 +-
   1.978 +-  "movzb  (%1, %%r10, 1), %%r13 \n"
   1.979 +-  "movzb  1(%1, %%r10, 1), %%r14 \n"
   1.980 +-  "mov    %%r11, %%rax \n"
   1.981 +-  "and    $0x1fffe, %%rax \n"
   1.982 +-  "imul   %%rax, %%r14 \n"
   1.983 +-  "xor    $0x1fffe, %%rax \n"
   1.984 +-  "imul   %%rax, %%r13 \n"
   1.985 +-  "add    %%r14, %%r13 \n"
   1.986 +-  "shr    $17, %%r13 \n"
   1.987 +-  "movq   2048(%5,%%r13,8), %%xmm0\n"
   1.988 +-
   1.989 +-  "movzb  (%2, %%r10, 1), %%r13 \n"
   1.990 +-  "movzb  1(%2, %%r10, 1), %%r14 \n"
   1.991 +-  "mov    %%r11, %%rax \n"
   1.992 +-  "and    $0x1fffe, %%rax \n"
   1.993 +-  "imul   %%rax, %%r14 \n"
   1.994 +-  "xor    $0x1fffe, %%rax \n"
   1.995 +-  "imul   %%rax, %%r13 \n"
   1.996 +-  "add    %%r14, %%r13 \n"
   1.997 +-  "shr    $17, %%r13 \n"
   1.998 +-  "movq   4096(%5,%%r13,8), %%xmm1\n"
   1.999 +-
  1.1000 +-  "mov    %%r11, %%rax \n"
  1.1001 +-  "lea    (%%r11,%6),%%r10\n"
  1.1002 +-  "sar    $0x10,%%r11\n"
  1.1003 +-  "paddsw %%xmm1,%%xmm0\n"
  1.1004 +-
  1.1005 +-  "movzb  (%0, %%r11, 1), %%r13 \n"
  1.1006 +-  "movzb  1(%0, %%r11, 1), %%r14 \n"
  1.1007 +-  "and    $0xffff, %%rax \n"
  1.1008 +-  "imul   %%rax, %%r14 \n"
  1.1009 +-  "xor    $0xffff, %%rax \n"
  1.1010 +-  "imul   %%rax, %%r13 \n"
  1.1011 +-  "add    %%r14, %%r13 \n"
  1.1012 +-  "shr    $16, %%r13 \n"
  1.1013 +-  "movq   (%5,%%r13,8),%%xmm1\n"
  1.1014 +-
  1.1015 +-  "mov    %%r10, %%rax \n"
  1.1016 +-  "lea    (%%r10,%6),%%r11\n"
  1.1017 +-  "sar    $0x10,%%r10\n"
  1.1018 +-
  1.1019 +-  "movzb  (%0,%%r10,1), %%r13 \n"
  1.1020 +-  "movzb  1(%0,%%r10,1), %%r14 \n"
  1.1021 +-  "and    $0xffff, %%rax \n"
  1.1022 +-  "imul   %%rax, %%r14 \n"
  1.1023 +-  "xor    $0xffff, %%rax \n"
  1.1024 +-  "imul   %%rax, %%r13 \n"
  1.1025 +-  "add    %%r14, %%r13 \n"
  1.1026 +-  "shr    $16, %%r13 \n"
  1.1027 +-  "movq   (%5,%%r13,8),%%xmm2\n"
  1.1028 +-
  1.1029 +-  "paddsw %%xmm0,%%xmm1\n"
  1.1030 +-  "paddsw %%xmm0,%%xmm2\n"
  1.1031 +-  "shufps $0x44,%%xmm2,%%xmm1\n"
  1.1032 +-  "psraw  $0x6,%%xmm1\n"
  1.1033 +-  "packuswb %%xmm1,%%xmm1\n"
  1.1034 +-  "movq   %%xmm1,0x0(%3)\n"
  1.1035 +-  "add    $0x8,%3\n"
  1.1036 +-  "sub    $0x2,%4\n"
  1.1037 +-  "jns    .lscaleloop\n"
  1.1038 +-
  1.1039 +-".lscalenext:"
  1.1040 +-  "add    $0x1,%4\n"
  1.1041 +-  "js     .lscaledone\n"
  1.1042 +-
  1.1043 +-  "mov    %%r11,%%r10\n"
  1.1044 +-  "sar    $0x11,%%r10\n"
  1.1045 +-
  1.1046 +-  "movzb  (%1,%%r10,1), %%r13 \n"
  1.1047 +-  "movq   2048(%5,%%r13,8),%%xmm0\n"
  1.1048 +-
  1.1049 +-  "movzb  (%2,%%r10,1), %%r13 \n"
  1.1050 +-  "movq   4096(%5,%%r13,8),%%xmm1\n"
  1.1051 +-
  1.1052 +-  "paddsw %%xmm1,%%xmm0\n"
  1.1053 +-  "sar    $0x10,%%r11\n"
  1.1054 +-
  1.1055 +-  "movzb  (%0,%%r11,1), %%r13 \n"
  1.1056 +-  "movq   (%5,%%r13,8),%%xmm1\n"
  1.1057 +-
  1.1058 +-  "paddsw %%xmm0,%%xmm1\n"
  1.1059 +-  "psraw  $0x6,%%xmm1\n"
  1.1060 +-  "packuswb %%xmm1,%%xmm1\n"
  1.1061 +-  "movd   %%xmm1,0x0(%3)\n"
  1.1062 +-
  1.1063 +-".lscaledone:"
  1.1064 +-  :
  1.1065 +-  : "r"(y_buf),  // %0
  1.1066 +-    "r"(u_buf),  // %1
  1.1067 +-    "r"(v_buf),  // %2
  1.1068 +-    "r"(rgb_buf),  // %3
  1.1069 +-    "r"(width),  // %4
  1.1070 +-    "r" (kCoefficientsRgbY),  // %5
  1.1071 +-    "r"(static_cast<long>(source_dx))  // %6
  1.1072 +-  : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
  1.1073 +-);
  1.1074 +-}
  1.1075 +-
  1.1076 +-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
  1.1077 +-
  1.1078 +-// PIC version is slower because less registers are available, so
  1.1079 +-// non-PIC is used on platforms where it is possible.
  1.1080 +-
  1.1081 +-void FastConvertYUVToRGB32Row(const uint8* y_buf,
  1.1082 +-                              const uint8* u_buf,
  1.1083 +-                              const uint8* v_buf,
  1.1084 +-                              uint8* rgb_buf,
  1.1085 +-                              int width);
  1.1086 +-  asm(
  1.1087 +-  ".text\n"
  1.1088 +-  ".global FastConvertYUVToRGB32Row\n"
  1.1089 +-"FastConvertYUVToRGB32Row:\n"
  1.1090 +-  "pusha\n"
  1.1091 +-  "mov    0x24(%esp),%edx\n"
  1.1092 +-  "mov    0x28(%esp),%edi\n"
  1.1093 +-  "mov    0x2c(%esp),%esi\n"
  1.1094 +-  "mov    0x30(%esp),%ebp\n"
  1.1095 +-  "mov    0x34(%esp),%ecx\n"
  1.1096 +-  "jmp    convertend\n"
  1.1097 +-
  1.1098 +-"convertloop:"
  1.1099 +-  "movzbl (%edi),%eax\n"
  1.1100 +-  "add    $0x1,%edi\n"
  1.1101 +-  "movzbl (%esi),%ebx\n"
  1.1102 +-  "add    $0x1,%esi\n"
  1.1103 +-  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  1.1104 +-  "movzbl (%edx),%eax\n"
  1.1105 +-  "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
  1.1106 +-  "movzbl 0x1(%edx),%ebx\n"
  1.1107 +-  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
  1.1108 +-  "add    $0x2,%edx\n"
  1.1109 +-  "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
  1.1110 +-  "paddsw %mm0,%mm1\n"
  1.1111 +-  "paddsw %mm0,%mm2\n"
  1.1112 +-  "psraw  $0x6,%mm1\n"
  1.1113 +-  "psraw  $0x6,%mm2\n"
  1.1114 +-  "packuswb %mm2,%mm1\n"
  1.1115 +-  "movntq %mm1,0x0(%ebp)\n"
  1.1116 +-  "add    $0x8,%ebp\n"
  1.1117 +-"convertend:"
  1.1118 +-  "sub    $0x2,%ecx\n"
  1.1119 +-  "jns    convertloop\n"
  1.1120 +-
  1.1121 +-  "and    $0x1,%ecx\n"
  1.1122 +-  "je     convertdone\n"
  1.1123 +-
  1.1124 +-  "movzbl (%edi),%eax\n"
  1.1125 +-  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  1.1126 +-  "movzbl (%esi),%eax\n"
  1.1127 +-  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
  1.1128 +-  "movzbl (%edx),%eax\n"
  1.1129 +-  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
  1.1130 +-  "paddsw %mm0,%mm1\n"
  1.1131 +-  "psraw  $0x6,%mm1\n"
  1.1132 +-  "packuswb %mm1,%mm1\n"
  1.1133 +-  "movd   %mm1,0x0(%ebp)\n"
  1.1134 +-"convertdone:"
  1.1135 +-  "popa\n"
  1.1136 +-  "ret\n"
  1.1137 +-);
  1.1138 +-
  1.1139 +-
  1.1140 +-void ScaleYUVToRGB32Row(const uint8* y_buf,
  1.1141 +-                        const uint8* u_buf,
  1.1142 +-                        const uint8* v_buf,
  1.1143 +-                        uint8* rgb_buf,
  1.1144 +-                        int width,
  1.1145 +-                        int source_dx);
  1.1146 +-  asm(
  1.1147 +-  ".text\n"
  1.1148 +-  ".global ScaleYUVToRGB32Row\n"
  1.1149 +-"ScaleYUVToRGB32Row:\n"
  1.1150 +-  "pusha\n"
  1.1151 +-  "mov    0x24(%esp),%edx\n"
  1.1152 +-  "mov    0x28(%esp),%edi\n"
  1.1153 +-  "mov    0x2c(%esp),%esi\n"
  1.1154 +-  "mov    0x30(%esp),%ebp\n"
  1.1155 +-  "mov    0x34(%esp),%ecx\n"
  1.1156 +-  "xor    %ebx,%ebx\n"
  1.1157 +-  "jmp    scaleend\n"
  1.1158 +-
  1.1159 +-"scaleloop:"
  1.1160 +-  "mov    %ebx,%eax\n"
  1.1161 +-  "sar    $0x11,%eax\n"
  1.1162 +-  "movzbl (%edi,%eax,1),%eax\n"
  1.1163 +-  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  1.1164 +-  "mov    %ebx,%eax\n"
  1.1165 +-  "sar    $0x11,%eax\n"
  1.1166 +-  "movzbl (%esi,%eax,1),%eax\n"
  1.1167 +-  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
  1.1168 +-  "mov    %ebx,%eax\n"
  1.1169 +-  "add    0x38(%esp),%ebx\n"
  1.1170 +-  "sar    $0x10,%eax\n"
  1.1171 +-  "movzbl (%edx,%eax,1),%eax\n"
  1.1172 +-  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
  1.1173 +-  "mov    %ebx,%eax\n"
  1.1174 +-  "add    0x38(%esp),%ebx\n"
  1.1175 +-  "sar    $0x10,%eax\n"
  1.1176 +-  "movzbl (%edx,%eax,1),%eax\n"
  1.1177 +-  "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
  1.1178 +-  "paddsw %mm0,%mm1\n"
  1.1179 +-  "paddsw %mm0,%mm2\n"
  1.1180 +-  "psraw  $0x6,%mm1\n"
  1.1181 +-  "psraw  $0x6,%mm2\n"
  1.1182 +-  "packuswb %mm2,%mm1\n"
  1.1183 +-  "movntq %mm1,0x0(%ebp)\n"
  1.1184 +-  "add    $0x8,%ebp\n"
  1.1185 +-"scaleend:"
  1.1186 +-  "sub    $0x2,%ecx\n"
  1.1187 +-  "jns    scaleloop\n"
  1.1188 +-
  1.1189 +-  "and    $0x1,%ecx\n"
  1.1190 +-  "je     scaledone\n"
  1.1191 +-
  1.1192 +-  "mov    %ebx,%eax\n"
  1.1193 +-  "sar    $0x11,%eax\n"
  1.1194 +-  "movzbl (%edi,%eax,1),%eax\n"
  1.1195 +-  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  1.1196 +-  "mov    %ebx,%eax\n"
  1.1197 +-  "sar    $0x11,%eax\n"
  1.1198 +-  "movzbl (%esi,%eax,1),%eax\n"
  1.1199 +-  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
  1.1200 +-  "mov    %ebx,%eax\n"
  1.1201 +-  "sar    $0x10,%eax\n"
  1.1202 +-  "movzbl (%edx,%eax,1),%eax\n"
  1.1203 +-  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
  1.1204 +-  "paddsw %mm0,%mm1\n"
  1.1205 +-  "psraw  $0x6,%mm1\n"
  1.1206 +-  "packuswb %mm1,%mm1\n"
  1.1207 +-  "movd   %mm1,0x0(%ebp)\n"
  1.1208 +-
  1.1209 +-"scaledone:"
  1.1210 +-  "popa\n"
  1.1211 +-  "ret\n"
  1.1212 +-);
  1.1213 +-
  1.1214 +-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  1.1215 +-                              const uint8* u_buf,
  1.1216 +-                              const uint8* v_buf,
  1.1217 +-                              uint8* rgb_buf,
  1.1218 +-                              int width,
  1.1219 +-                              int source_dx);
  1.1220 +-  asm(
  1.1221 +-  ".text\n"
  1.1222 +-  ".global LinearScaleYUVToRGB32Row\n"
  1.1223 +-"LinearScaleYUVToRGB32Row:\n"
  1.1224 +-  "pusha\n"
  1.1225 +-  "mov    0x24(%esp),%edx\n"
  1.1226 +-  "mov    0x28(%esp),%edi\n"
  1.1227 +-  "mov    0x30(%esp),%ebp\n"
  1.1228 +-
  1.1229 +-  // source_width = width * source_dx + ebx
  1.1230 +-  "mov    0x34(%esp), %ecx\n"
  1.1231 +-  "imull  0x38(%esp), %ecx\n"
  1.1232 +-  "mov    %ecx, 0x34(%esp)\n"
  1.1233 +-
  1.1234 +-  "mov    0x38(%esp), %ecx\n"
  1.1235 +-  "xor    %ebx,%ebx\n"     // x = 0
  1.1236 +-  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
  1.1237 +-  "jl     .lscaleend\n"
  1.1238 +-  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
  1.1239 +-  "jmp    .lscaleend\n"
  1.1240 +-
  1.1241 +-".lscaleloop:"
  1.1242 +-  "mov    %ebx,%eax\n"
  1.1243 +-  "sar    $0x11,%eax\n"
  1.1244 +-
  1.1245 +-  "movzbl (%edi,%eax,1),%ecx\n"
  1.1246 +-  "movzbl 1(%edi,%eax,1),%esi\n"
  1.1247 +-  "mov    %ebx,%eax\n"
  1.1248 +-  "andl   $0x1fffe, %eax \n"
  1.1249 +-  "imul   %eax, %esi \n"
  1.1250 +-  "xorl   $0x1fffe, %eax \n"
  1.1251 +-  "imul   %eax, %ecx \n"
  1.1252 +-  "addl   %esi, %ecx \n"
  1.1253 +-  "shrl   $17, %ecx \n"
  1.1254 +-  "movq   kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
  1.1255 +-
  1.1256 +-  "mov    0x2c(%esp),%esi\n"
  1.1257 +-  "mov    %ebx,%eax\n"
  1.1258 +-  "sar    $0x11,%eax\n"
  1.1259 +-
  1.1260 +-  "movzbl (%esi,%eax,1),%ecx\n"
  1.1261 +-  "movzbl 1(%esi,%eax,1),%esi\n"
  1.1262 +-  "mov    %ebx,%eax\n"
  1.1263 +-  "andl   $0x1fffe, %eax \n"
  1.1264 +-  "imul   %eax, %esi \n"
  1.1265 +-  "xorl   $0x1fffe, %eax \n"
  1.1266 +-  "imul   %eax, %ecx \n"
  1.1267 +-  "addl   %esi, %ecx \n"
  1.1268 +-  "shrl   $17, %ecx \n"
  1.1269 +-  "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
  1.1270 +-
  1.1271 +-  "mov    %ebx,%eax\n"
  1.1272 +-  "sar    $0x10,%eax\n"
  1.1273 +-  "movzbl (%edx,%eax,1),%ecx\n"
  1.1274 +-  "movzbl 1(%edx,%eax,1),%esi\n"
  1.1275 +-  "mov    %ebx,%eax\n"
  1.1276 +-  "add    0x38(%esp),%ebx\n"
  1.1277 +-  "andl   $0xffff, %eax \n"
  1.1278 +-  "imul   %eax, %esi \n"
  1.1279 +-  "xorl   $0xffff, %eax \n"
  1.1280 +-  "imul   %eax, %ecx \n"
  1.1281 +-  "addl   %esi, %ecx \n"
  1.1282 +-  "shrl   $16, %ecx \n"
  1.1283 +-  "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
  1.1284 +-
  1.1285 +-  "cmp    0x34(%esp), %ebx\n"
  1.1286 +-  "jge    .lscalelastpixel\n"
  1.1287 +-
  1.1288 +-  "mov    %ebx,%eax\n"
  1.1289 +-  "sar    $0x10,%eax\n"
  1.1290 +-  "movzbl (%edx,%eax,1),%ecx\n"
  1.1291 +-  "movzbl 1(%edx,%eax,1),%esi\n"
  1.1292 +-  "mov    %ebx,%eax\n"
  1.1293 +-  "add    0x38(%esp),%ebx\n"
  1.1294 +-  "andl   $0xffff, %eax \n"
  1.1295 +-  "imul   %eax, %esi \n"
  1.1296 +-  "xorl   $0xffff, %eax \n"
  1.1297 +-  "imul   %eax, %ecx \n"
  1.1298 +-  "addl   %esi, %ecx \n"
  1.1299 +-  "shrl   $16, %ecx \n"
  1.1300 +-  "movq   kCoefficientsRgbY(,%ecx,8),%mm2\n"
  1.1301 +-
  1.1302 +-  "paddsw %mm0,%mm1\n"
  1.1303 +-  "paddsw %mm0,%mm2\n"
  1.1304 +-  "psraw  $0x6,%mm1\n"
  1.1305 +-  "psraw  $0x6,%mm2\n"
  1.1306 +-  "packuswb %mm2,%mm1\n"
  1.1307 +-  "movntq %mm1,0x0(%ebp)\n"
  1.1308 +-  "add    $0x8,%ebp\n"
  1.1309 +-
  1.1310 +-".lscaleend:"
  1.1311 +-  "cmp    0x34(%esp), %ebx\n"
  1.1312 +-  "jl     .lscaleloop\n"
  1.1313 +-  "popa\n"
  1.1314 +-  "ret\n"
  1.1315 +-
  1.1316 +-".lscalelastpixel:"
  1.1317 +-  "paddsw %mm0, %mm1\n"
  1.1318 +-  "psraw $6, %mm1\n"
  1.1319 +-  "packuswb %mm1, %mm1\n"
  1.1320 +-  "movd %mm1, (%ebp)\n"
  1.1321 +-  "popa\n"
  1.1322 +-  "ret\n"
  1.1323 +-);
  1.1324 +-
  1.1325 +-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
  1.1326 +-
  1.1327 +-extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
  1.1328 +-                                    const uint8* u_buf,
  1.1329 +-                                    const uint8* v_buf,
  1.1330 +-                                    uint8* rgb_buf,
  1.1331 +-                                    int width,
  1.1332 +-                                    int16 *kCoefficientsRgbY);
  1.1333 +-  asm(
  1.1334 +-  ".text\n"
  1.1335 +-#if defined(OS_MACOSX)
  1.1336 +-"_PICConvertYUVToRGB32Row:\n"
  1.1337 +-#else
  1.1338 +-"PICConvertYUVToRGB32Row:\n"
  1.1339 +-#endif
  1.1340 +-  "pusha\n"
  1.1341 +-  "mov    0x24(%esp),%edx\n"
  1.1342 +-  "mov    0x28(%esp),%edi\n"
  1.1343 +-  "mov    0x2c(%esp),%esi\n"
  1.1344 +-  "mov    0x30(%esp),%ebp\n"
  1.1345 +-  "mov    0x38(%esp),%ecx\n"
  1.1346 +-
  1.1347 +-  "jmp    .Lconvertend\n"
  1.1348 +-
  1.1349 +-".Lconvertloop:"
  1.1350 +-  "movzbl (%edi),%eax\n"
  1.1351 +-  "add    $0x1,%edi\n"
  1.1352 +-  "movzbl (%esi),%ebx\n"
  1.1353 +-  "add    $0x1,%esi\n"
  1.1354 +-  "movq   2048(%ecx,%eax,8),%mm0\n"
  1.1355 +-  "movzbl (%edx),%eax\n"
  1.1356 +-  "paddsw 4096(%ecx,%ebx,8),%mm0\n"
  1.1357 +-  "movzbl 0x1(%edx),%ebx\n"
  1.1358 +-  "movq   0(%ecx,%eax,8),%mm1\n"
  1.1359 +-  "add    $0x2,%edx\n"
  1.1360 +-  "movq   0(%ecx,%ebx,8),%mm2\n"
  1.1361 +-  "paddsw %mm0,%mm1\n"
  1.1362 +-  "paddsw %mm0,%mm2\n"
  1.1363 +-  "psraw  $0x6,%mm1\n"
  1.1364 +-  "psraw  $0x6,%mm2\n"
  1.1365 +-  "packuswb %mm2,%mm1\n"
  1.1366 +-  "movntq %mm1,0x0(%ebp)\n"
  1.1367 +-  "add    $0x8,%ebp\n"
  1.1368 +-".Lconvertend:"
  1.1369 +-  "subl   $0x2,0x34(%esp)\n"
  1.1370 +-  "jns    .Lconvertloop\n"
  1.1371 +-
  1.1372 +-  "andl   $0x1,0x34(%esp)\n"
  1.1373 +-  "je     .Lconvertdone\n"
  1.1374 +-
  1.1375 +-  "movzbl (%edi),%eax\n"
  1.1376 +-  "movq   2048(%ecx,%eax,8),%mm0\n"
  1.1377 +-  "movzbl (%esi),%eax\n"
  1.1378 +-  "paddsw 4096(%ecx,%eax,8),%mm0\n"
  1.1379 +-  "movzbl (%edx),%eax\n"
  1.1380 +-  "movq   0(%ecx,%eax,8),%mm1\n"
  1.1381 +-  "paddsw %mm0,%mm1\n"
  1.1382 +-  "psraw  $0x6,%mm1\n"
  1.1383 +-  "packuswb %mm1,%mm1\n"
  1.1384 +-  "movd   %mm1,0x0(%ebp)\n"
  1.1385 +-".Lconvertdone:\n"
  1.1386 +-  "popa\n"
  1.1387 +-  "ret\n"
  1.1388 +-);
  1.1389 +-
  1.1390 +-void FastConvertYUVToRGB32Row(const uint8* y_buf,
  1.1391 +-                              const uint8* u_buf,
  1.1392 +-                              const uint8* v_buf,
  1.1393 +-                              uint8* rgb_buf,
  1.1394 +-                              int width) {
  1.1395 +-  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
  1.1396 +-                          &kCoefficientsRgbY[0][0]);
  1.1397 +-}
  1.1398 +-
  1.1399 +-extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
  1.1400 +-                               const uint8* u_buf,
  1.1401 +-                               const uint8* v_buf,
  1.1402 +-                               uint8* rgb_buf,
  1.1403 +-                               int width,
  1.1404 +-                               int source_dx,
  1.1405 +-                               int16 *kCoefficientsRgbY);
  1.1406 +-
  1.1407 +-  asm(
  1.1408 +-  ".text\n"
  1.1409 +-#if defined(OS_MACOSX)
  1.1410 +-"_PICScaleYUVToRGB32Row:\n"
  1.1411 +-#else
  1.1412 +-"PICScaleYUVToRGB32Row:\n"
  1.1413 +-#endif
  1.1414 +-  "pusha\n"
  1.1415 +-  "mov    0x24(%esp),%edx\n"
  1.1416 +-  "mov    0x28(%esp),%edi\n"
  1.1417 +-  "mov    0x2c(%esp),%esi\n"
  1.1418 +-  "mov    0x30(%esp),%ebp\n"
  1.1419 +-  "mov    0x3c(%esp),%ecx\n"
  1.1420 +-  "xor    %ebx,%ebx\n"
  1.1421 +-  "jmp    Lscaleend\n"
  1.1422 +-
  1.1423 +-"Lscaleloop:"
  1.1424 +-  "mov    %ebx,%eax\n"
  1.1425 +-  "sar    $0x11,%eax\n"
  1.1426 +-  "movzbl (%edi,%eax,1),%eax\n"
  1.1427 +-  "movq   2048(%ecx,%eax,8),%mm0\n"
  1.1428 +-  "mov    %ebx,%eax\n"
  1.1429 +-  "sar    $0x11,%eax\n"
  1.1430 +-  "movzbl (%esi,%eax,1),%eax\n"
  1.1431 +-  "paddsw 4096(%ecx,%eax,8),%mm0\n"
  1.1432 +-  "mov    %ebx,%eax\n"
  1.1433 +-  "add    0x38(%esp),%ebx\n"
  1.1434 +-  "sar    $0x10,%eax\n"
  1.1435 +-  "movzbl (%edx,%eax,1),%eax\n"
  1.1436 +-  "movq   0(%ecx,%eax,8),%mm1\n"
  1.1437 +-  "mov    %ebx,%eax\n"
  1.1438 +-  "add    0x38(%esp),%ebx\n"
  1.1439 +-  "sar    $0x10,%eax\n"
  1.1440 +-  "movzbl (%edx,%eax,1),%eax\n"
  1.1441 +-  "movq   0(%ecx,%eax,8),%mm2\n"
  1.1442 +-  "paddsw %mm0,%mm1\n"
  1.1443 +-  "paddsw %mm0,%mm2\n"
  1.1444 +-  "psraw  $0x6,%mm1\n"
  1.1445 +-  "psraw  $0x6,%mm2\n"
  1.1446 +-  "packuswb %mm2,%mm1\n"
  1.1447 +-  "movntq %mm1,0x0(%ebp)\n"
  1.1448 +-  "add    $0x8,%ebp\n"
  1.1449 +-"Lscaleend:"
  1.1450 +-  "subl   $0x2,0x34(%esp)\n"
  1.1451 +-  "jns    Lscaleloop\n"
  1.1452 +-
  1.1453 +-  "andl   $0x1,0x34(%esp)\n"
  1.1454 +-  "je     Lscaledone\n"
  1.1455 +-
  1.1456 +-  "mov    %ebx,%eax\n"
  1.1457 +-  "sar    $0x11,%eax\n"
  1.1458 +-  "movzbl (%edi,%eax,1),%eax\n"
  1.1459 +-  "movq   2048(%ecx,%eax,8),%mm0\n"
  1.1460 +-  "mov    %ebx,%eax\n"
  1.1461 +-  "sar    $0x11,%eax\n"
  1.1462 +-  "movzbl (%esi,%eax,1),%eax\n"
  1.1463 +-  "paddsw 4096(%ecx,%eax,8),%mm0\n"
  1.1464 +-  "mov    %ebx,%eax\n"
  1.1465 +-  "sar    $0x10,%eax\n"
  1.1466 +-  "movzbl (%edx,%eax,1),%eax\n"
  1.1467 +-  "movq   0(%ecx,%eax,8),%mm1\n"
  1.1468 +-  "paddsw %mm0,%mm1\n"
  1.1469 +-  "psraw  $0x6,%mm1\n"
  1.1470 +-  "packuswb %mm1,%mm1\n"
  1.1471 +-  "movd   %mm1,0x0(%ebp)\n"
  1.1472 +-
  1.1473 +-"Lscaledone:"
  1.1474 +-  "popa\n"
  1.1475 +-  "ret\n"
  1.1476 +-);
  1.1477 +-
  1.1478 +-
  1.1479 +-void ScaleYUVToRGB32Row(const uint8* y_buf,
  1.1480 +-                        const uint8* u_buf,
  1.1481 +-                        const uint8* v_buf,
  1.1482 +-                        uint8* rgb_buf,
  1.1483 +-                        int width,
  1.1484 +-                        int source_dx) {
  1.1485 +-  PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
  1.1486 +-                        &kCoefficientsRgbY[0][0]);
  1.1487 +-}
  1.1488 +-
  1.1489 +-void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
  1.1490 +-                                 const uint8* u_buf,
  1.1491 +-                                 const uint8* v_buf,
  1.1492 +-                                 uint8* rgb_buf,
  1.1493 +-                                 int width,
  1.1494 +-                                 int source_dx,
  1.1495 +-                                 int16 *kCoefficientsRgbY);
  1.1496 +-  asm(
  1.1497 +-  ".text\n"
  1.1498 +-#if defined(OS_MACOSX)
  1.1499 +-"_PICLinearScaleYUVToRGB32Row:\n"
  1.1500 +-#else
  1.1501 +-"PICLinearScaleYUVToRGB32Row:\n"
  1.1502 +-#endif
  1.1503 +-  "pusha\n"
  1.1504 +-  "mov    0x24(%esp),%edx\n"
  1.1505 +-  "mov    0x30(%esp),%ebp\n"
  1.1506 +-  "mov    0x34(%esp),%ecx\n"
  1.1507 +-  "mov    0x3c(%esp),%edi\n"
  1.1508 +-  "xor    %ebx,%ebx\n"
  1.1509 +-
  1.1510 +-  // source_width = width * source_dx + ebx
  1.1511 +-  "mov    0x34(%esp), %ecx\n"
  1.1512 +-  "imull  0x38(%esp), %ecx\n"
  1.1513 +-  "mov    %ecx, 0x34(%esp)\n"
  1.1514 +-
  1.1515 +-  "mov    0x38(%esp), %ecx\n"
  1.1516 +-  "xor    %ebx,%ebx\n"     // x = 0
  1.1517 +-  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
  1.1518 +-  "jl     .lscaleend\n"
  1.1519 +-  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
  1.1520 +-  "jmp    .lscaleend\n"
  1.1521 +-
  1.1522 +-".lscaleloop:"
  1.1523 +-  "mov    0x28(%esp),%esi\n"
  1.1524 +-  "mov    %ebx,%eax\n"
  1.1525 +-  "sar    $0x11,%eax\n"
  1.1526 +-
  1.1527 +-  "movzbl (%esi,%eax,1),%ecx\n"
  1.1528 +-  "movzbl 1(%esi,%eax,1),%esi\n"
  1.1529 +-  "mov    %ebx,%eax\n"
  1.1530 +-  "andl   $0x1fffe, %eax \n"
  1.1531 +-  "imul   %eax, %esi \n"
  1.1532 +-  "xorl   $0x1fffe, %eax \n"
  1.1533 +-  "imul   %eax, %ecx \n"
  1.1534 +-  "addl   %esi, %ecx \n"
  1.1535 +-  "shrl   $17, %ecx \n"
  1.1536 +-  "movq   2048(%edi,%ecx,8),%mm0\n"
  1.1537 +-
  1.1538 +-  "mov    0x2c(%esp),%esi\n"
  1.1539 +-  "mov    %ebx,%eax\n"
  1.1540 +-  "sar    $0x11,%eax\n"
  1.1541 +-
  1.1542 +-  "movzbl (%esi,%eax,1),%ecx\n"
  1.1543 +-  "movzbl 1(%esi,%eax,1),%esi\n"
  1.1544 +-  "mov    %ebx,%eax\n"
  1.1545 +-  "andl   $0x1fffe, %eax \n"
  1.1546 +-  "imul   %eax, %esi \n"
  1.1547 +-  "xorl   $0x1fffe, %eax \n"
  1.1548 +-  "imul   %eax, %ecx \n"
  1.1549 +-  "addl   %esi, %ecx \n"
  1.1550 +-  "shrl   $17, %ecx \n"
  1.1551 +-  "paddsw 4096(%edi,%ecx,8),%mm0\n"
  1.1552 +-
  1.1553 +-  "mov    %ebx,%eax\n"
  1.1554 +-  "sar    $0x10,%eax\n"
  1.1555 +-  "movzbl (%edx,%eax,1),%ecx\n"
  1.1556 +-  "movzbl 1(%edx,%eax,1),%esi\n"
  1.1557 +-  "mov    %ebx,%eax\n"
  1.1558 +-  "add    0x38(%esp),%ebx\n"
  1.1559 +-  "andl   $0xffff, %eax \n"
  1.1560 +-  "imul   %eax, %esi \n"
  1.1561 +-  "xorl   $0xffff, %eax \n"
  1.1562 +-  "imul   %eax, %ecx \n"
  1.1563 +-  "addl   %esi, %ecx \n"
  1.1564 +-  "shrl   $16, %ecx \n"
  1.1565 +-  "movq   (%edi,%ecx,8),%mm1\n"
  1.1566 +-
  1.1567 +-  "cmp    0x34(%esp), %ebx\n"
  1.1568 +-  "jge    .lscalelastpixel\n"
  1.1569 +-
  1.1570 +-  "mov    %ebx,%eax\n"
  1.1571 +-  "sar    $0x10,%eax\n"
  1.1572 +-  "movzbl (%edx,%eax,1),%ecx\n"
  1.1573 +-  "movzbl 1(%edx,%eax,1),%esi\n"
  1.1574 +-  "mov    %ebx,%eax\n"
  1.1575 +-  "add    0x38(%esp),%ebx\n"
  1.1576 +-  "andl   $0xffff, %eax \n"
  1.1577 +-  "imul   %eax, %esi \n"
  1.1578 +-  "xorl   $0xffff, %eax \n"
  1.1579 +-  "imul   %eax, %ecx \n"
  1.1580 +-  "addl   %esi, %ecx \n"
  1.1581 +-  "shrl   $16, %ecx \n"
  1.1582 +-  "movq   (%edi,%ecx,8),%mm2\n"
  1.1583 +-
  1.1584 +-  "paddsw %mm0,%mm1\n"
  1.1585 +-  "paddsw %mm0,%mm2\n"
  1.1586 +-  "psraw  $0x6,%mm1\n"
  1.1587 +-  "psraw  $0x6,%mm2\n"
  1.1588 +-  "packuswb %mm2,%mm1\n"
  1.1589 +-  "movntq %mm1,0x0(%ebp)\n"
  1.1590 +-  "add    $0x8,%ebp\n"
  1.1591 +-
  1.1592 +-".lscaleend:"
  1.1593 +-  "cmp    %ebx, 0x34(%esp)\n"
  1.1594 +-  "jg     .lscaleloop\n"
  1.1595 +-  "popa\n"
  1.1596 +-  "ret\n"
  1.1597 +-
  1.1598 +-".lscalelastpixel:"
  1.1599 +-  "paddsw %mm0, %mm1\n"
  1.1600 +-  "psraw $6, %mm1\n"
  1.1601 +-  "packuswb %mm1, %mm1\n"
  1.1602 +-  "movd %mm1, (%ebp)\n"
  1.1603 +-  "popa\n"
  1.1604 +-  "ret\n"
  1.1605 +-);
  1.1606 +-
  1.1607 +-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  1.1608 +-                        const uint8* u_buf,
  1.1609 +-                        const uint8* v_buf,
  1.1610 +-                        uint8* rgb_buf,
  1.1611 +-                        int width,
  1.1612 +-                        int source_dx) {
  1.1613 +-  PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
  1.1614 +-                              &kCoefficientsRgbY[0][0]);
  1.1615 +-}
  1.1616 +-
  1.1617 +-#else  // USE_MMX
  1.1618 +-
  1.1619 + // C reference code that mimic the YUV assembly.
  1.1620 + #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
  1.1621 + #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
  1.1622 +     (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
  1.1623 + 
  1.1624 + static inline void YuvPixel(uint8 y,
  1.1625 +                             uint8 u,
  1.1626 +                             uint8 v,
  1.1627 +@@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y,
  1.1628 +   a >>= 6;
  1.1629 + 
  1.1630 +   *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
  1.1631 +                                         (packuswb(g) << 8) |
  1.1632 +                                         (packuswb(r) << 16) |
  1.1633 +                                         (packuswb(a) << 24);
  1.1634 + }
  1.1635 + 
  1.1636 +-void FastConvertYUVToRGB32Row(const uint8* y_buf,
  1.1637 +-                              const uint8* u_buf,
  1.1638 +-                              const uint8* v_buf,
  1.1639 +-                              uint8* rgb_buf,
  1.1640 +-                              int width) {
  1.1641 ++void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
  1.1642 ++                                const uint8* u_buf,
  1.1643 ++                                const uint8* v_buf,
  1.1644 ++                                uint8* rgb_buf,
  1.1645 ++                                int width,
  1.1646 ++                                unsigned int x_shift) {
  1.1647 +   for (int x = 0; x < width; x += 2) {
  1.1648 +-    uint8 u = u_buf[x >> 1];
  1.1649 +-    uint8 v = v_buf[x >> 1];
  1.1650 ++    uint8 u = u_buf[x >> x_shift];
  1.1651 ++    uint8 v = v_buf[x >> x_shift];
  1.1652 +     uint8 y0 = y_buf[x];
  1.1653 +     YuvPixel(y0, u, v, rgb_buf);
  1.1654 +     if ((x + 1) < width) {
  1.1655 +       uint8 y1 = y_buf[x + 1];
  1.1656 ++      if (x_shift == 0) {
  1.1657 ++        u = u_buf[x + 1];
  1.1658 ++        v = v_buf[x + 1];
  1.1659 ++      }
  1.1660 +       YuvPixel(y1, u, v, rgb_buf + 4);
  1.1661 +     }
  1.1662 +     rgb_buf += 8;  // Advance 2 pixels.
  1.1663 +   }
  1.1664 + }
  1.1665 + 
  1.1666 + // 16.16 fixed point is used.  A shift by 16 isolates the integer.
  1.1667 + // A shift by 17 is used to further subsample the chrominence channels.
  1.1668 + // & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
  1.1669 + // for 1/65536 pixel accurate interpolation.
  1.1670 +-void ScaleYUVToRGB32Row(const uint8* y_buf,
  1.1671 +-                        const uint8* u_buf,
  1.1672 +-                        const uint8* v_buf,
  1.1673 +-                        uint8* rgb_buf,
  1.1674 +-                        int width,
  1.1675 +-                        int source_dx) {
  1.1676 ++void ScaleYUVToRGB32Row_C(const uint8* y_buf,
  1.1677 ++                          const uint8* u_buf,
  1.1678 ++                          const uint8* v_buf,
  1.1679 ++                          uint8* rgb_buf,
  1.1680 ++                          int width,
  1.1681 ++                          int source_dx) {
  1.1682 +   int x = 0;
  1.1683 +   for (int i = 0; i < width; i += 2) {
  1.1684 +     int y = y_buf[x >> 16];
  1.1685 +     int u = u_buf[(x >> 17)];
  1.1686 +     int v = v_buf[(x >> 17)];
  1.1687 +     YuvPixel(y, u, v, rgb_buf);
  1.1688 +     x += source_dx;
  1.1689 +     if ((i + 1) < width) {
  1.1690 +       y = y_buf[x >> 16];
  1.1691 +       YuvPixel(y, u, v, rgb_buf+4);
  1.1692 +       x += source_dx;
  1.1693 +     }
  1.1694 +     rgb_buf += 8;
  1.1695 +   }
  1.1696 + }
  1.1697 + 
  1.1698 +-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  1.1699 +-                              const uint8* u_buf,
  1.1700 +-                              const uint8* v_buf,
  1.1701 +-                              uint8* rgb_buf,
  1.1702 +-                              int width,
  1.1703 +-                              int source_dx) {
  1.1704 ++void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
  1.1705 ++                                const uint8* u_buf,
  1.1706 ++                                const uint8* v_buf,
  1.1707 ++                                uint8* rgb_buf,
  1.1708 ++                                int width,
  1.1709 ++                                int source_dx) {
  1.1710 +   int x = 0;
  1.1711 +   if (source_dx >= 0x20000) {
  1.1712 +     x = 32768;
  1.1713 +   }
  1.1714 +   for (int i = 0; i < width; i += 2) {
  1.1715 +     int y0 = y_buf[x >> 16];
  1.1716 +     int y1 = y_buf[(x >> 16) + 1];
  1.1717 +     int u0 = u_buf[(x >> 17)];
  1.1718 +@@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint
  1.1719 +       y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
  1.1720 +       YuvPixel(y, u, v, rgb_buf+4);
  1.1721 +       x += source_dx;
  1.1722 +     }
  1.1723 +     rgb_buf += 8;
  1.1724 +   }
  1.1725 + }
  1.1726 + 
  1.1727 +-#endif  // USE_MMX
  1.1728 + }  // extern "C"
  1.1729 + 
  1.1730 +diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
  1.1731 +--- a/gfx/ycbcr/yuv_row_posix.cpp
  1.1732 ++++ b/gfx/ycbcr/yuv_row_posix.cpp
  1.1733 +@@ -1,33 +1,32 @@
  1.1734 + // Copyright (c) 2010 The Chromium Authors. All rights reserved.
  1.1735 + // Use of this source code is governed by a BSD-style license that can be
  1.1736 + // found in the LICENSE file.
  1.1737 + 
  1.1738 +-#include "media/base/yuv_row.h"
  1.1739 +-
  1.1740 +-#ifdef _DEBUG
  1.1741 +-#include "base/logging.h"
  1.1742 +-#else
  1.1743 ++#include "yuv_row.h"
  1.1744 ++#include "mozilla/SSE.h"
  1.1745 ++
  1.1746 + #define DCHECK(a)
  1.1747 +-#endif
  1.1748 + 
  1.1749 + extern "C" {
  1.1750 + 
  1.1751 +-#if USE_SSE2 && defined(ARCH_CPU_X86_64)
  1.1752 ++#if defined(ARCH_CPU_X86_64)
  1.1753 ++
  1.1754 ++// We don't need CPUID guards here, since x86-64 implies SSE2.
  1.1755 + 
  1.1756 + // AMD64 ABI uses register paremters.
  1.1757 + void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
  1.1758 +                               const uint8* u_buf,  // rsi
  1.1759 +                               const uint8* v_buf,  // rdx
  1.1760 +                               uint8* rgb_buf,      // rcx
  1.1761 +                               int width) {         // r8
  1.1762 +   asm(
  1.1763 +-  "jmp    convertend\n"
  1.1764 +-"convertloop:"
  1.1765 ++  "jmp    1f\n"
  1.1766 ++"0:"
  1.1767 +   "movzb  (%1),%%r10\n"
  1.1768 +   "add    $0x1,%1\n"
  1.1769 +   "movzb  (%2),%%r11\n"
  1.1770 +   "add    $0x1,%2\n"
  1.1771 +   "movq   2048(%5,%%r10,8),%%xmm0\n"
  1.1772 +   "movzb  (%0),%%r10\n"
  1.1773 +   "movq   4096(%5,%%r11,8),%%xmm1\n"
  1.1774 +   "movzb  0x1(%0),%%r11\n"
  1.1775 +@@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint
  1.1776 +   "movq   (%5,%%r11,8),%%xmm3\n"
  1.1777 +   "paddsw %%xmm0,%%xmm2\n"
  1.1778 +   "paddsw %%xmm0,%%xmm3\n"
  1.1779 +   "shufps $0x44,%%xmm3,%%xmm2\n"
  1.1780 +   "psraw  $0x6,%%xmm2\n"
  1.1781 +   "packuswb %%xmm2,%%xmm2\n"
  1.1782 +   "movq   %%xmm2,0x0(%3)\n"
  1.1783 +   "add    $0x8,%3\n"
  1.1784 +-"convertend:"
  1.1785 ++"1:"
  1.1786 +   "sub    $0x2,%4\n"
  1.1787 +-  "jns    convertloop\n"
  1.1788 +-
  1.1789 +-"convertnext:"
  1.1790 ++  "jns    0b\n"
  1.1791 ++
  1.1792 ++"2:"
  1.1793 +   "add    $0x1,%4\n"
  1.1794 +-  "js     convertdone\n"
  1.1795 ++  "js     3f\n"
  1.1796 + 
  1.1797 +   "movzb  (%1),%%r10\n"
  1.1798 +   "movq   2048(%5,%%r10,8),%%xmm0\n"
  1.1799 +   "movzb  (%2),%%r10\n"
  1.1800 +   "movq   4096(%5,%%r10,8),%%xmm1\n"
  1.1801 +   "paddsw %%xmm1,%%xmm0\n"
  1.1802 +   "movzb  (%0),%%r10\n"
  1.1803 +   "movq   (%5,%%r10,8),%%xmm1\n"
  1.1804 +   "paddsw %%xmm0,%%xmm1\n"
  1.1805 +   "psraw  $0x6,%%xmm1\n"
  1.1806 +   "packuswb %%xmm1,%%xmm1\n"
  1.1807 +   "movd   %%xmm1,0x0(%3)\n"
  1.1808 +-"convertdone:"
  1.1809 ++"3:"
  1.1810 +   :
  1.1811 +   : "r"(y_buf),  // %0
  1.1812 +     "r"(u_buf),  // %1
  1.1813 +     "r"(v_buf),  // %2
  1.1814 +     "r"(rgb_buf),  // %3
  1.1815 +     "r"(width),  // %4
  1.1816 +     "r" (kCoefficientsRgbY)  // %5
  1.1817 +   : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
  1.1818 +@@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b
  1.1819 +                         const uint8* u_buf,  // rsi
  1.1820 +                         const uint8* v_buf,  // rdx
  1.1821 +                         uint8* rgb_buf,      // rcx
  1.1822 +                         int width,           // r8
  1.1823 +                         int source_dx) {     // r9
  1.1824 +   asm(
  1.1825 +   "xor    %%r11,%%r11\n"
  1.1826 +   "sub    $0x2,%4\n"
  1.1827 +-  "js     scalenext\n"
  1.1828 +-
  1.1829 +-"scaleloop:"
  1.1830 ++  "js     1f\n"
  1.1831 ++
  1.1832 ++"0:"
  1.1833 +   "mov    %%r11,%%r10\n"
  1.1834 +   "sar    $0x11,%%r10\n"
  1.1835 +   "movzb  (%1,%%r10,1),%%rax\n"
  1.1836 +   "movq   2048(%5,%%rax,8),%%xmm0\n"
  1.1837 +   "movzb  (%2,%%r10,1),%%rax\n"
  1.1838 +   "movq   4096(%5,%%rax,8),%%xmm1\n"
  1.1839 +   "lea    (%%r11,%6),%%r10\n"
  1.1840 +   "sar    $0x10,%%r11\n"
  1.1841 +@@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b
  1.1842 +   "paddsw %%xmm0,%%xmm1\n"
  1.1843 +   "paddsw %%xmm0,%%xmm2\n"
  1.1844 +   "shufps $0x44,%%xmm2,%%xmm1\n"
  1.1845 +   "psraw  $0x6,%%xmm1\n"
  1.1846 +   "packuswb %%xmm1,%%xmm1\n"
  1.1847 +   "movq   %%xmm1,0x0(%3)\n"
  1.1848 +   "add    $0x8,%3\n"
  1.1849 +   "sub    $0x2,%4\n"
  1.1850 +-  "jns    scaleloop\n"
  1.1851 +-
  1.1852 +-"scalenext:"
  1.1853 ++  "jns    0b\n"
  1.1854 ++
  1.1855 ++"1:"
  1.1856 +   "add    $0x1,%4\n"
  1.1857 +-  "js     scaledone\n"
  1.1858 ++  "js     2f\n"
  1.1859 + 
  1.1860 +   "mov    %%r11,%%r10\n"
  1.1861 +   "sar    $0x11,%%r10\n"
  1.1862 +   "movzb  (%1,%%r10,1),%%rax\n"
  1.1863 +   "movq   2048(%5,%%rax,8),%%xmm0\n"
  1.1864 +   "movzb  (%2,%%r10,1),%%rax\n"
  1.1865 +   "movq   4096(%5,%%rax,8),%%xmm1\n"
  1.1866 +   "paddsw %%xmm1,%%xmm0\n"
  1.1867 +   "sar    $0x10,%%r11\n"
  1.1868 +   "movzb  (%0,%%r11,1),%%rax\n"
  1.1869 +   "movq   (%5,%%rax,8),%%xmm1\n"
  1.1870 +   "paddsw %%xmm0,%%xmm1\n"
  1.1871 +   "psraw  $0x6,%%xmm1\n"
  1.1872 +   "packuswb %%xmm1,%%xmm1\n"
  1.1873 +   "movd   %%xmm1,0x0(%3)\n"
  1.1874 + 
  1.1875 +-"scaledone:"
  1.1876 ++"2:"
  1.1877 +   :
  1.1878 +   : "r"(y_buf),  // %0
  1.1879 +     "r"(u_buf),  // %1
  1.1880 +     "r"(v_buf),  // %2
  1.1881 +     "r"(rgb_buf),  // %3
  1.1882 +     "r"(width),  // %4
  1.1883 +     "r" (kCoefficientsRgbY),  // %5
  1.1884 +     "r"(static_cast<long>(source_dx))  // %6
  1.1885 +@@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint
  1.1886 +                               const uint8* u_buf,
  1.1887 +                               const uint8* v_buf,
  1.1888 +                               uint8* rgb_buf,
  1.1889 +                               int width,
  1.1890 +                               int source_dx) {
  1.1891 +   asm(
  1.1892 +   "xor    %%r11,%%r11\n"   // x = 0
  1.1893 +   "sub    $0x2,%4\n"
  1.1894 +-  "js     .lscalenext\n"
  1.1895 ++  "js     2f\n"
  1.1896 +   "cmp    $0x20000,%6\n"   // if source_dx >= 2.0
  1.1897 +-  "jl     .lscalehalf\n"
  1.1898 ++  "jl     0f\n"
  1.1899 +   "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
  1.1900 +-".lscalehalf:"
  1.1901 +-
  1.1902 +-".lscaleloop:"
  1.1903 ++"0:"
  1.1904 ++
  1.1905 ++"1:"
  1.1906 +   "mov    %%r11,%%r10\n"
  1.1907 +   "sar    $0x11,%%r10\n"
  1.1908 + 
  1.1909 +   "movzb  (%1, %%r10, 1), %%r13 \n"
  1.1910 +   "movzb  1(%1, %%r10, 1), %%r14 \n"
  1.1911 +   "mov    %%r11, %%rax \n"
  1.1912 +   "and    $0x1fffe, %%rax \n"
  1.1913 +   "imul   %%rax, %%r14 \n"
  1.1914 +@@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint
  1.1915 +   "paddsw %%xmm0,%%xmm1\n"
  1.1916 +   "paddsw %%xmm0,%%xmm2\n"
  1.1917 +   "shufps $0x44,%%xmm2,%%xmm1\n"
  1.1918 +   "psraw  $0x6,%%xmm1\n"
  1.1919 +   "packuswb %%xmm1,%%xmm1\n"
  1.1920 +   "movq   %%xmm1,0x0(%3)\n"
  1.1921 +   "add    $0x8,%3\n"
  1.1922 +   "sub    $0x2,%4\n"
  1.1923 +-  "jns    .lscaleloop\n"
  1.1924 +-
  1.1925 +-".lscalenext:"
  1.1926 ++  "jns    1b\n"
  1.1927 ++
  1.1928 ++"2:"
  1.1929 +   "add    $0x1,%4\n"
  1.1930 +-  "js     .lscaledone\n"
  1.1931 ++  "js     3f\n"
  1.1932 + 
  1.1933 +   "mov    %%r11,%%r10\n"
  1.1934 +   "sar    $0x11,%%r10\n"
  1.1935 + 
  1.1936 +   "movzb  (%1,%%r10,1), %%r13 \n"
  1.1937 +   "movq   2048(%5,%%r13,8),%%xmm0\n"
  1.1938 + 
  1.1939 +   "movzb  (%2,%%r10,1), %%r13 \n"
  1.1940 +@@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint
  1.1941 +   "movzb  (%0,%%r11,1), %%r13 \n"
  1.1942 +   "movq   (%5,%%r13,8),%%xmm1\n"
  1.1943 + 
  1.1944 +   "paddsw %%xmm0,%%xmm1\n"
  1.1945 +   "psraw  $0x6,%%xmm1\n"
  1.1946 +   "packuswb %%xmm1,%%xmm1\n"
  1.1947 +   "movd   %%xmm1,0x0(%3)\n"
  1.1948 + 
  1.1949 +-".lscaledone:"
  1.1950 ++"3:"
  1.1951 +   :
  1.1952 +   : "r"(y_buf),  // %0
  1.1953 +     "r"(u_buf),  // %1
  1.1954 +     "r"(v_buf),  // %2
  1.1955 +     "r"(rgb_buf),  // %3
  1.1956 +     "r"(width),  // %4
  1.1957 +     "r" (kCoefficientsRgbY),  // %5
  1.1958 +     "r"(static_cast<long>(source_dx))  // %6
  1.1959 +   : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
  1.1960 + );
  1.1961 + }
  1.1962 + 
  1.1963 +-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
  1.1964 ++#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
  1.1965 + 
  1.1966 + // PIC version is slower because less registers are available, so
  1.1967 + // non-PIC is used on platforms where it is possible.
  1.1968 +-
  1.1969 +-void FastConvertYUVToRGB32Row(const uint8* y_buf,
  1.1970 +-                              const uint8* u_buf,
  1.1971 +-                              const uint8* v_buf,
  1.1972 +-                              uint8* rgb_buf,
  1.1973 +-                              int width);
  1.1974 ++void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
  1.1975 ++                                  const uint8* u_buf,
  1.1976 ++                                  const uint8* v_buf,
  1.1977 ++                                  uint8* rgb_buf,
  1.1978 ++                                  int width);
  1.1979 +   asm(
  1.1980 +   ".text\n"
  1.1981 +-  ".global FastConvertYUVToRGB32Row\n"
  1.1982 +-"FastConvertYUVToRGB32Row:\n"
  1.1983 ++  ".global FastConvertYUVToRGB32Row_SSE\n"
  1.1984 ++  ".type FastConvertYUVToRGB32Row_SSE, @function\n"
  1.1985 ++"FastConvertYUVToRGB32Row_SSE:\n"
  1.1986 +   "pusha\n"
  1.1987 +   "mov    0x24(%esp),%edx\n"
  1.1988 +   "mov    0x28(%esp),%edi\n"
  1.1989 +   "mov    0x2c(%esp),%esi\n"
  1.1990 +   "mov    0x30(%esp),%ebp\n"
  1.1991 +   "mov    0x34(%esp),%ecx\n"
  1.1992 +-  "jmp    convertend\n"
  1.1993 +-
  1.1994 +-"convertloop:"
  1.1995 ++  "jmp    1f\n"
  1.1996 ++
  1.1997 ++"0:"
  1.1998 +   "movzbl (%edi),%eax\n"
  1.1999 +   "add    $0x1,%edi\n"
  1.2000 +   "movzbl (%esi),%ebx\n"
  1.2001 +   "add    $0x1,%esi\n"
  1.2002 +   "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  1.2003 +   "movzbl (%edx),%eax\n"
  1.2004 +   "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
  1.2005 +   "movzbl 0x1(%edx),%ebx\n"
  1.2006 +@@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint
  1.2007 +   "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
  1.2008 +   "paddsw %mm0,%mm1\n"
  1.2009 +   "paddsw %mm0,%mm2\n"
  1.2010 +   "psraw  $0x6,%mm1\n"
  1.2011 +   "psraw  $0x6,%mm2\n"
  1.2012 +   "packuswb %mm2,%mm1\n"
  1.2013 +   "movntq %mm1,0x0(%ebp)\n"
  1.2014 +   "add    $0x8,%ebp\n"
  1.2015 +-"convertend:"
  1.2016 ++"1:"
  1.2017 +   "sub    $0x2,%ecx\n"
  1.2018 +-  "jns    convertloop\n"
  1.2019 ++  "jns    0b\n"
  1.2020 + 
  1.2021 +   "and    $0x1,%ecx\n"
  1.2022 +-  "je     convertdone\n"
  1.2023 ++  "je     2f\n"
  1.2024 + 
  1.2025 +   "movzbl (%edi),%eax\n"
  1.2026 +   "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  1.2027 +   "movzbl (%esi),%eax\n"
  1.2028 +   "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
  1.2029 +   "movzbl (%edx),%eax\n"
  1.2030 +   "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
  1.2031 +   "paddsw %mm0,%mm1\n"
  1.2032 +   "psraw  $0x6,%mm1\n"
  1.2033 +   "packuswb %mm1,%mm1\n"
  1.2034 +   "movd   %mm1,0x0(%ebp)\n"
  1.2035 +-"convertdone:"
  1.2036 ++"2:"
  1.2037 +   "popa\n"
  1.2038 +   "ret\n"
  1.2039 ++#if !defined(XP_MACOSX)
  1.2040 ++  ".previous\n"
  1.2041 ++#endif
  1.2042 + );
  1.2043 + 
  1.2044 +-
  1.2045 +-void ScaleYUVToRGB32Row(const uint8* y_buf,
  1.2046 +-                        const uint8* u_buf,
  1.2047 +-                        const uint8* v_buf,
  1.2048 +-                        uint8* rgb_buf,
  1.2049 +-                        int width,
  1.2050 +-                        int source_dx);
  1.2051 ++void FastConvertYUVToRGB32Row(const uint8* y_buf,
  1.2052 ++                              const uint8* u_buf,
  1.2053 ++                              const uint8* v_buf,
  1.2054 ++                              uint8* rgb_buf,
  1.2055 ++                              int width)
  1.2056 ++{
  1.2057 ++  if (mozilla::supports_sse()) {
  1.2058 ++    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
  1.2059 ++    return;
  1.2060 ++  }
  1.2061 ++
  1.2062 ++  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
  1.2063 ++}
  1.2064 ++
  1.2065 ++
  1.2066 ++void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
  1.2067 ++                            const uint8* u_buf,
  1.2068 ++                            const uint8* v_buf,
  1.2069 ++                            uint8* rgb_buf,
  1.2070 ++                            int width,
  1.2071 ++                            int source_dx);
  1.2072 +   asm(
  1.2073 +   ".text\n"
  1.2074 +-  ".global ScaleYUVToRGB32Row\n"
  1.2075 +-"ScaleYUVToRGB32Row:\n"
  1.2076 ++  ".global ScaleYUVToRGB32Row_SSE\n"
  1.2077 ++  ".type ScaleYUVToRGB32Row_SSE, @function\n"
  1.2078 ++"ScaleYUVToRGB32Row_SSE:\n"
  1.2079 +   "pusha\n"
  1.2080 +   "mov    0x24(%esp),%edx\n"
  1.2081 +   "mov    0x28(%esp),%edi\n"
  1.2082 +   "mov    0x2c(%esp),%esi\n"
  1.2083 +   "mov    0x30(%esp),%ebp\n"
  1.2084 +   "mov    0x34(%esp),%ecx\n"
  1.2085 +   "xor    %ebx,%ebx\n"
  1.2086 +-  "jmp    scaleend\n"
  1.2087 +-
  1.2088 +-"scaleloop:"
  1.2089 ++  "jmp    1f\n"
  1.2090 ++
  1.2091 ++"0:"
  1.2092 +   "mov    %ebx,%eax\n"
  1.2093 +   "sar    $0x11,%eax\n"
  1.2094 +   "movzbl (%edi,%eax,1),%eax\n"
  1.2095 +   "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  1.2096 +   "mov    %ebx,%eax\n"
  1.2097 +   "sar    $0x11,%eax\n"
  1.2098 +   "movzbl (%esi,%eax,1),%eax\n"
  1.2099 +   "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
  1.2100 +@@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
  1.2101 +   "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
  1.2102 +   "paddsw %mm0,%mm1\n"
  1.2103 +   "paddsw %mm0,%mm2\n"
  1.2104 +   "psraw  $0x6,%mm1\n"
  1.2105 +   "psraw  $0x6,%mm2\n"
  1.2106 +   "packuswb %mm2,%mm1\n"
  1.2107 +   "movntq %mm1,0x0(%ebp)\n"
  1.2108 +   "add    $0x8,%ebp\n"
  1.2109 +-"scaleend:"
  1.2110 ++"1:"
  1.2111 +   "sub    $0x2,%ecx\n"
  1.2112 +-  "jns    scaleloop\n"
  1.2113 ++  "jns    0b\n"
  1.2114 + 
  1.2115 +   "and    $0x1,%ecx\n"
  1.2116 +-  "je     scaledone\n"
  1.2117 ++  "je     2f\n"
  1.2118 + 
  1.2119 +   "mov    %ebx,%eax\n"
  1.2120 +   "sar    $0x11,%eax\n"
  1.2121 +   "movzbl (%edi,%eax,1),%eax\n"
  1.2122 +   "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  1.2123 +   "mov    %ebx,%eax\n"
  1.2124 +   "sar    $0x11,%eax\n"
  1.2125 +   "movzbl (%esi,%eax,1),%eax\n"
  1.2126 +@@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b
  1.2127 +   "sar    $0x10,%eax\n"
  1.2128 +   "movzbl (%edx,%eax,1),%eax\n"
  1.2129 +   "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
  1.2130 +   "paddsw %mm0,%mm1\n"
  1.2131 +   "psraw  $0x6,%mm1\n"
  1.2132 +   "packuswb %mm1,%mm1\n"
  1.2133 +   "movd   %mm1,0x0(%ebp)\n"
  1.2134 + 
  1.2135 +-"scaledone:"
  1.2136 ++"2:"
  1.2137 +   "popa\n"
  1.2138 +   "ret\n"
  1.2139 ++#if !defined(XP_MACOSX)
  1.2140 ++  ".previous\n"
  1.2141 ++#endif
  1.2142 + );
  1.2143 + 
  1.2144 +-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  1.2145 +-                              const uint8* u_buf,
  1.2146 +-                              const uint8* v_buf,
  1.2147 +-                              uint8* rgb_buf,
  1.2148 +-                              int width,
  1.2149 +-                              int source_dx);
  1.2150 ++void ScaleYUVToRGB32Row(const uint8* y_buf,
  1.2151 ++                        const uint8* u_buf,
  1.2152 ++                        const uint8* v_buf,
  1.2153 ++                        uint8* rgb_buf,
  1.2154 ++                        int width,
  1.2155 ++                        int source_dx)
  1.2156 ++{
  1.2157 ++  if (mozilla::supports_sse()) {
  1.2158 ++    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
  1.2159 ++                           width, source_dx);
  1.2160 ++    return;  // Row already converted by the SSE path; skip the C fallback.
  1.2161 ++  }
  1.2162 ++
  1.2163 ++  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
  1.2164 ++}
  1.2165 ++
  1.2166 ++void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
  1.2167 ++                                  const uint8* u_buf,
  1.2168 ++                                  const uint8* v_buf,
  1.2169 ++                                  uint8* rgb_buf,
  1.2170 ++                                  int width,
  1.2171 ++                                  int source_dx);
  1.2172 +   asm(
  1.2173 +   ".text\n"
  1.2174 +-  ".global LinearScaleYUVToRGB32Row\n"
  1.2175 +-"LinearScaleYUVToRGB32Row:\n"
  1.2176 ++  ".global LinearScaleYUVToRGB32Row_SSE\n"
  1.2177 ++  ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
  1.2178 ++"LinearScaleYUVToRGB32Row_SSE:\n"
  1.2179 +   "pusha\n"
  1.2180 +   "mov    0x24(%esp),%edx\n"
  1.2181 +   "mov    0x28(%esp),%edi\n"
  1.2182 +   "mov    0x30(%esp),%ebp\n"
  1.2183 + 
  1.2184 +   // source_width = width * source_dx + ebx
  1.2185 +   "mov    0x34(%esp), %ecx\n"
  1.2186 +   "imull  0x38(%esp), %ecx\n"
  1.2187 +   "mov    %ecx, 0x34(%esp)\n"
  1.2188 + 
  1.2189 +   "mov    0x38(%esp), %ecx\n"
  1.2190 +   "xor    %ebx,%ebx\n"     // x = 0
  1.2191 +   "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
  1.2192 +-  "jl     .lscaleend\n"
  1.2193 ++  "jl     1f\n"
  1.2194 +   "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
  1.2195 +-  "jmp    .lscaleend\n"
  1.2196 +-
  1.2197 +-".lscaleloop:"
  1.2198 +-  "mov    %ebx,%eax\n"
  1.2199 +-  "sar    $0x11,%eax\n"
  1.2200 ++  "jmp    1f\n"
  1.2201 ++
  1.2202 ++"0:"
  1.2203 ++  "mov    %ebx,%eax\n"
  1.2204 ++  "sar    $0x11,%eax\n"
  1.2205 + 
  1.2206 +   "movzbl (%edi,%eax,1),%ecx\n"
  1.2207 +   "movzbl 1(%edi,%eax,1),%esi\n"
  1.2208 +   "mov    %ebx,%eax\n"
  1.2209 +   "andl   $0x1fffe, %eax \n"
  1.2210 +   "imul   %eax, %esi \n"
  1.2211 +   "xorl   $0x1fffe, %eax \n"
  1.2212 +   "imul   %eax, %ecx \n"
  1.2213 +@@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint
  1.2214 +   "imul   %eax, %esi \n"
  1.2215 +   "xorl   $0xffff, %eax \n"
  1.2216 +   "imul   %eax, %ecx \n"
  1.2217 +   "addl   %esi, %ecx \n"
  1.2218 +   "shrl   $16, %ecx \n"
  1.2219 +   "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
  1.2220 + 
  1.2221 +   "cmp    0x34(%esp), %ebx\n"
  1.2222 +-  "jge    .lscalelastpixel\n"
  1.2223 ++  "jge    2f\n"
  1.2224 + 
  1.2225 +   "mov    %ebx,%eax\n"
  1.2226 +   "sar    $0x10,%eax\n"
  1.2227 +   "movzbl (%edx,%eax,1),%ecx\n"
  1.2228 +   "movzbl 1(%edx,%eax,1),%esi\n"
  1.2229 +   "mov    %ebx,%eax\n"
  1.2230 +   "add    0x38(%esp),%ebx\n"
  1.2231 +   "andl   $0xffff, %eax \n"
  1.2232 +@@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint
  1.2233 +   "paddsw %mm0,%mm1\n"
  1.2234 +   "paddsw %mm0,%mm2\n"
  1.2235 +   "psraw  $0x6,%mm1\n"
  1.2236 +   "psraw  $0x6,%mm2\n"
  1.2237 +   "packuswb %mm2,%mm1\n"
  1.2238 +   "movntq %mm1,0x0(%ebp)\n"
  1.2239 +   "add    $0x8,%ebp\n"
  1.2240 + 
  1.2241 +-".lscaleend:"
  1.2242 ++"1:"
  1.2243 +   "cmp    0x34(%esp), %ebx\n"
  1.2244 +-  "jl     .lscaleloop\n"
  1.2245 ++  "jl     0b\n"
  1.2246 +   "popa\n"
  1.2247 +   "ret\n"
  1.2248 + 
  1.2249 +-".lscalelastpixel:"
  1.2250 ++"2:"
  1.2251 +   "paddsw %mm0, %mm1\n"
  1.2252 +   "psraw $6, %mm1\n"
  1.2253 +   "packuswb %mm1, %mm1\n"
  1.2254 +   "movd %mm1, (%ebp)\n"
  1.2255 +   "popa\n"
  1.2256 +   "ret\n"
  1.2257 ++#if !defined(XP_MACOSX)
  1.2258 ++  ".previous\n"
  1.2259 ++#endif
  1.2260 + );
  1.2261 + 
  1.2262 +-#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
  1.2263 +-
  1.2264 +-extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
  1.2265 +-                                    const uint8* u_buf,
  1.2266 +-                                    const uint8* v_buf,
  1.2267 +-                                    uint8* rgb_buf,
  1.2268 +-                                    int width,
  1.2269 +-                                    int16 *kCoefficientsRgbY);
  1.2270 ++void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  1.2271 ++                              const uint8* u_buf,
  1.2272 ++                              const uint8* v_buf,
  1.2273 ++                              uint8* rgb_buf,
  1.2274 ++                              int width,
  1.2275 ++                              int source_dx)
  1.2276 ++{
  1.2277 ++  if (mozilla::supports_sse()) {
  1.2278 ++    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
  1.2279 ++                                 width, source_dx);
  1.2280 ++    return;  // Row already converted by the SSE path; skip the C fallback.
  1.2281 ++  }
  1.2282 ++
  1.2283 ++  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
  1.2284 ++}
  1.2285 ++
  1.2286 ++#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
  1.2287 ++
  1.2288 ++void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
  1.2289 ++                                 const uint8* u_buf,
  1.2290 ++                                 const uint8* v_buf,
  1.2291 ++                                 uint8* rgb_buf,
  1.2292 ++                                 int width,
  1.2293 ++                                 int16 *kCoefficientsRgbY);
  1.2294 ++
  1.2295 +   asm(
  1.2296 +   ".text\n"
  1.2297 +-#if defined(OS_MACOSX)
  1.2298 +-"_PICConvertYUVToRGB32Row:\n"
  1.2299 ++#if defined(XP_MACOSX)
  1.2300 ++"_PICConvertYUVToRGB32Row_SSE:\n"
  1.2301 + #else
  1.2302 +-"PICConvertYUVToRGB32Row:\n"
  1.2303 ++"PICConvertYUVToRGB32Row_SSE:\n"
  1.2304 + #endif
  1.2305 +   "pusha\n"
  1.2306 +   "mov    0x24(%esp),%edx\n"
  1.2307 +   "mov    0x28(%esp),%edi\n"
  1.2308 +   "mov    0x2c(%esp),%esi\n"
  1.2309 +   "mov    0x30(%esp),%ebp\n"
  1.2310 +   "mov    0x38(%esp),%ecx\n"
  1.2311 + 
  1.2312 +-  "jmp    .Lconvertend\n"
  1.2313 +-
  1.2314 +-".Lconvertloop:"
  1.2315 ++  "jmp    1f\n"
  1.2316 ++
  1.2317 ++"0:"
  1.2318 +   "movzbl (%edi),%eax\n"
  1.2319 +   "add    $0x1,%edi\n"
  1.2320 +   "movzbl (%esi),%ebx\n"
  1.2321 +   "add    $0x1,%esi\n"
  1.2322 +   "movq   2048(%ecx,%eax,8),%mm0\n"
  1.2323 +   "movzbl (%edx),%eax\n"
  1.2324 +   "paddsw 4096(%ecx,%ebx,8),%mm0\n"
  1.2325 +   "movzbl 0x1(%edx),%ebx\n"
  1.2326 +@@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons
  1.2327 +   "movq   0(%ecx,%ebx,8),%mm2\n"
  1.2328 +   "paddsw %mm0,%mm1\n"
  1.2329 +   "paddsw %mm0,%mm2\n"
  1.2330 +   "psraw  $0x6,%mm1\n"
  1.2331 +   "psraw  $0x6,%mm2\n"
  1.2332 +   "packuswb %mm2,%mm1\n"
  1.2333 +   "movntq %mm1,0x0(%ebp)\n"
  1.2334 +   "add    $0x8,%ebp\n"
  1.2335 +-".Lconvertend:"
  1.2336 ++"1:"
  1.2337 +   "subl   $0x2,0x34(%esp)\n"
  1.2338 +-  "jns    .Lconvertloop\n"
  1.2339 ++  "jns    0b\n"
  1.2340 + 
  1.2341 +   "andl   $0x1,0x34(%esp)\n"
  1.2342 +-  "je     .Lconvertdone\n"
  1.2343 ++  "je     2f\n"
  1.2344 + 
  1.2345 +   "movzbl (%edi),%eax\n"
  1.2346 +   "movq   2048(%ecx,%eax,8),%mm0\n"
  1.2347 +   "movzbl (%esi),%eax\n"
  1.2348 +   "paddsw 4096(%ecx,%eax,8),%mm0\n"
  1.2349 +   "movzbl (%edx),%eax\n"
  1.2350 +   "movq   0(%ecx,%eax,8),%mm1\n"
  1.2351 +   "paddsw %mm0,%mm1\n"
  1.2352 +   "psraw  $0x6,%mm1\n"
  1.2353 +   "packuswb %mm1,%mm1\n"
  1.2354 +   "movd   %mm1,0x0(%ebp)\n"
  1.2355 +-".Lconvertdone:\n"
  1.2356 ++"2:"
  1.2357 +   "popa\n"
  1.2358 +   "ret\n"
  1.2359 ++#if !defined(XP_MACOSX)
  1.2360 ++  ".previous\n"
  1.2361 ++#endif
  1.2362 + );
  1.2363 + 
  1.2364 + void FastConvertYUVToRGB32Row(const uint8* y_buf,
  1.2365 +                               const uint8* u_buf,
  1.2366 +                               const uint8* v_buf,
  1.2367 +                               uint8* rgb_buf,
  1.2368 +-                              int width) {
  1.2369 +-  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
  1.2370 +-                          &kCoefficientsRgbY[0][0]);
  1.2371 +-}
  1.2372 +-
  1.2373 +-extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
  1.2374 ++                              int width)
  1.2375 ++{
  1.2376 ++  if (mozilla::supports_sse()) {
  1.2377 ++    PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
  1.2378 ++                                &kCoefficientsRgbY[0][0]);
  1.2379 ++    return;
  1.2380 ++  }
  1.2381 ++
  1.2382 ++  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
  1.2383 ++}
  1.2384 ++
  1.2385 ++void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
  1.2386 +                                const uint8* u_buf,
  1.2387 +                                const uint8* v_buf,
  1.2388 +                                uint8* rgb_buf,
  1.2389 +                                int width,
  1.2390 +                                int source_dx,
  1.2391 +                                int16 *kCoefficientsRgbY);
  1.2392 + 
  1.2393 +   asm(
  1.2394 +   ".text\n"
  1.2395 +-#if defined(OS_MACOSX)
  1.2396 +-"_PICScaleYUVToRGB32Row:\n"
  1.2397 ++#if defined(XP_MACOSX)
  1.2398 ++"_PICScaleYUVToRGB32Row_SSE:\n"
  1.2399 + #else
  1.2400 +-"PICScaleYUVToRGB32Row:\n"
  1.2401 ++"PICScaleYUVToRGB32Row_SSE:\n"
  1.2402 + #endif
  1.2403 +   "pusha\n"
  1.2404 +   "mov    0x24(%esp),%edx\n"
  1.2405 +   "mov    0x28(%esp),%edi\n"
  1.2406 +   "mov    0x2c(%esp),%esi\n"
  1.2407 +   "mov    0x30(%esp),%ebp\n"
  1.2408 +   "mov    0x3c(%esp),%ecx\n"
  1.2409 +   "xor    %ebx,%ebx\n"
  1.2410 +-  "jmp    Lscaleend\n"
  1.2411 +-
  1.2412 +-"Lscaleloop:"
  1.2413 ++  "jmp    1f\n"
  1.2414 ++
  1.2415 ++"0:"
  1.2416 +   "mov    %ebx,%eax\n"
  1.2417 +   "sar    $0x11,%eax\n"
  1.2418 +   "movzbl (%edi,%eax,1),%eax\n"
  1.2419 +   "movq   2048(%ecx,%eax,8),%mm0\n"
  1.2420 +   "mov    %ebx,%eax\n"
  1.2421 +   "sar    $0x11,%eax\n"
  1.2422 +   "movzbl (%esi,%eax,1),%eax\n"
  1.2423 +   "paddsw 4096(%ecx,%eax,8),%mm0\n"
  1.2424 +@@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const 
  1.2425 +   "movq   0(%ecx,%eax,8),%mm2\n"
  1.2426 +   "paddsw %mm0,%mm1\n"
  1.2427 +   "paddsw %mm0,%mm2\n"
  1.2428 +   "psraw  $0x6,%mm1\n"
  1.2429 +   "psraw  $0x6,%mm2\n"
  1.2430 +   "packuswb %mm2,%mm1\n"
  1.2431 +   "movntq %mm1,0x0(%ebp)\n"
  1.2432 +   "add    $0x8,%ebp\n"
  1.2433 +-"Lscaleend:"
  1.2434 ++"1:"
  1.2435 +   "subl   $0x2,0x34(%esp)\n"
  1.2436 +-  "jns    Lscaleloop\n"
  1.2437 ++  "jns    0b\n"
  1.2438 + 
  1.2439 +   "andl   $0x1,0x34(%esp)\n"
  1.2440 +-  "je     Lscaledone\n"
  1.2441 ++  "je     2f\n"
  1.2442 + 
  1.2443 +   "mov    %ebx,%eax\n"
  1.2444 +   "sar    $0x11,%eax\n"
  1.2445 +   "movzbl (%edi,%eax,1),%eax\n"
  1.2446 +   "movq   2048(%ecx,%eax,8),%mm0\n"
  1.2447 +   "mov    %ebx,%eax\n"
  1.2448 +   "sar    $0x11,%eax\n"
  1.2449 +   "movzbl (%esi,%eax,1),%eax\n"
  1.2450 +@@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const 
  1.2451 +   "sar    $0x10,%eax\n"
  1.2452 +   "movzbl (%edx,%eax,1),%eax\n"
  1.2453 +   "movq   0(%ecx,%eax,8),%mm1\n"
  1.2454 +   "paddsw %mm0,%mm1\n"
  1.2455 +   "psraw  $0x6,%mm1\n"
  1.2456 +   "packuswb %mm1,%mm1\n"
  1.2457 +   "movd   %mm1,0x0(%ebp)\n"
  1.2458 + 
  1.2459 +-"Lscaledone:"
  1.2460 ++"2:"
  1.2461 +   "popa\n"
  1.2462 +   "ret\n"
  1.2463 ++#if !defined(XP_MACOSX)
  1.2464 ++  ".previous\n"
  1.2465 ++#endif
  1.2466 + );
  1.2467 + 
  1.2468 +-
  1.2469 + void ScaleYUVToRGB32Row(const uint8* y_buf,
  1.2470 +                         const uint8* u_buf,
  1.2471 +                         const uint8* v_buf,
  1.2472 +                         uint8* rgb_buf,
  1.2473 +                         int width,
  1.2474 +-                        int source_dx) {
  1.2475 +-  PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
  1.2476 +-                        &kCoefficientsRgbY[0][0]);
  1.2477 +-}
  1.2478 +-
  1.2479 +-void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
  1.2480 +-                                 const uint8* u_buf,
  1.2481 +-                                 const uint8* v_buf,
  1.2482 +-                                 uint8* rgb_buf,
  1.2483 +-                                 int width,
  1.2484 +-                                 int source_dx,
  1.2485 +-                                 int16 *kCoefficientsRgbY);
  1.2486 ++                        int source_dx)
  1.2487 ++{
  1.2488 ++  if (mozilla::supports_sse()) {
  1.2489 ++    PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
  1.2490 ++                              &kCoefficientsRgbY[0][0]);
  1.2491 ++    return;
  1.2492 ++  }
  1.2493 ++
  1.2494 ++  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
  1.2495 ++}
  1.2496 ++
  1.2497 ++void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
  1.2498 ++                                     const uint8* u_buf,
  1.2499 ++                                     const uint8* v_buf,
  1.2500 ++                                     uint8* rgb_buf,
  1.2501 ++                                     int width,
  1.2502 ++                                     int source_dx,
  1.2503 ++                                     int16 *kCoefficientsRgbY);
  1.2504 ++
  1.2505 +   asm(
  1.2506 +   ".text\n"
  1.2507 +-#if defined(OS_MACOSX)
  1.2508 +-"_PICLinearScaleYUVToRGB32Row:\n"
  1.2509 ++#if defined(XP_MACOSX)
  1.2510 ++"_PICLinearScaleYUVToRGB32Row_SSE:\n"
  1.2511 + #else
  1.2512 +-"PICLinearScaleYUVToRGB32Row:\n"
  1.2513 ++"PICLinearScaleYUVToRGB32Row_SSE:\n"
  1.2514 + #endif
  1.2515 +   "pusha\n"
  1.2516 +   "mov    0x24(%esp),%edx\n"
  1.2517 +   "mov    0x30(%esp),%ebp\n"
  1.2518 +   "mov    0x34(%esp),%ecx\n"
  1.2519 +   "mov    0x3c(%esp),%edi\n"
  1.2520 +   "xor    %ebx,%ebx\n"
  1.2521 + 
  1.2522 +   // source_width = width * source_dx + ebx
  1.2523 +   "mov    0x34(%esp), %ecx\n"
  1.2524 +   "imull  0x38(%esp), %ecx\n"
  1.2525 +   "mov    %ecx, 0x34(%esp)\n"
  1.2526 + 
  1.2527 +   "mov    0x38(%esp), %ecx\n"
  1.2528 +   "xor    %ebx,%ebx\n"     // x = 0
  1.2529 +   "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
  1.2530 +-  "jl     .lscaleend\n"
  1.2531 ++  "jl     1f\n"
  1.2532 +   "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
  1.2533 +-  "jmp    .lscaleend\n"
  1.2534 +-
  1.2535 +-".lscaleloop:"
  1.2536 ++  "jmp    1f\n"
  1.2537 ++
  1.2538 ++"0:"
  1.2539 +   "mov    0x28(%esp),%esi\n"
  1.2540 +   "mov    %ebx,%eax\n"
  1.2541 +   "sar    $0x11,%eax\n"
  1.2542 + 
  1.2543 +   "movzbl (%esi,%eax,1),%ecx\n"
  1.2544 +   "movzbl 1(%esi,%eax,1),%esi\n"
  1.2545 +   "mov    %ebx,%eax\n"
  1.2546 +   "andl   $0x1fffe, %eax \n"
  1.2547 +@@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u
  1.2548 +   "imul   %eax, %esi \n"
  1.2549 +   "xorl   $0xffff, %eax \n"
  1.2550 +   "imul   %eax, %ecx \n"
  1.2551 +   "addl   %esi, %ecx \n"
  1.2552 +   "shrl   $16, %ecx \n"
  1.2553 +   "movq   (%edi,%ecx,8),%mm1\n"
  1.2554 + 
  1.2555 +   "cmp    0x34(%esp), %ebx\n"
  1.2556 +-  "jge    .lscalelastpixel\n"
  1.2557 ++  "jge    2f\n"
  1.2558 + 
  1.2559 +   "mov    %ebx,%eax\n"
  1.2560 +   "sar    $0x10,%eax\n"
  1.2561 +   "movzbl (%edx,%eax,1),%ecx\n"
  1.2562 +   "movzbl 1(%edx,%eax,1),%esi\n"
  1.2563 +   "mov    %ebx,%eax\n"
  1.2564 +   "add    0x38(%esp),%ebx\n"
  1.2565 +   "andl   $0xffff, %eax \n"
  1.2566 +@@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u
  1.2567 +   "paddsw %mm0,%mm1\n"
  1.2568 +   "paddsw %mm0,%mm2\n"
  1.2569 +   "psraw  $0x6,%mm1\n"
  1.2570 +   "psraw  $0x6,%mm2\n"
  1.2571 +   "packuswb %mm2,%mm1\n"
  1.2572 +   "movntq %mm1,0x0(%ebp)\n"
  1.2573 +   "add    $0x8,%ebp\n"
  1.2574 + 
  1.2575 +-".lscaleend:"
  1.2576 ++"1:"
  1.2577 +   "cmp    %ebx, 0x34(%esp)\n"
  1.2578 +-  "jg     .lscaleloop\n"
  1.2579 ++  "jg     0b\n"
  1.2580 +   "popa\n"
  1.2581 +   "ret\n"
  1.2582 + 
  1.2583 +-".lscalelastpixel:"
  1.2584 ++"2:"
  1.2585 +   "paddsw %mm0, %mm1\n"
  1.2586 +   "psraw $6, %mm1\n"
  1.2587 +   "packuswb %mm1, %mm1\n"
  1.2588 +   "movd %mm1, (%ebp)\n"
  1.2589 +   "popa\n"
  1.2590 +   "ret\n"
  1.2591 ++#if !defined(XP_MACOSX)
  1.2592 ++  ".previous\n"
  1.2593 ++#endif
  1.2594 + );
  1.2595 + 
  1.2596 ++
  1.2597 + void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  1.2598 +-                        const uint8* u_buf,
  1.2599 +-                        const uint8* v_buf,
  1.2600 +-                        uint8* rgb_buf,
  1.2601 +-                        int width,
  1.2602 +-                        int source_dx) {
  1.2603 +-  PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
  1.2604 +-                              &kCoefficientsRgbY[0][0]);
  1.2605 +-}
  1.2606 +-
  1.2607 +-#else  // USE_MMX
  1.2608 +-
  1.2609 +-// C reference code that mimic the YUV assembly.
  1.2610 +-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
  1.2611 +-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
  1.2612 +-    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
  1.2613 +-
  1.2614 +-static inline void YuvPixel(uint8 y,
  1.2615 +-                            uint8 u,
  1.2616 +-                            uint8 v,
  1.2617 +-                            uint8* rgb_buf) {
  1.2618 +-
  1.2619 +-  int b = kCoefficientsRgbY[256+u][0];
  1.2620 +-  int g = kCoefficientsRgbY[256+u][1];
  1.2621 +-  int r = kCoefficientsRgbY[256+u][2];
  1.2622 +-  int a = kCoefficientsRgbY[256+u][3];
  1.2623 +-
  1.2624 +-  b = paddsw(b, kCoefficientsRgbY[512+v][0]);
  1.2625 +-  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
  1.2626 +-  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
  1.2627 +-  a = paddsw(a, kCoefficientsRgbY[512+v][3]);
  1.2628 +-
  1.2629 +-  b = paddsw(b, kCoefficientsRgbY[y][0]);
  1.2630 +-  g = paddsw(g, kCoefficientsRgbY[y][1]);
  1.2631 +-  r = paddsw(r, kCoefficientsRgbY[y][2]);
  1.2632 +-  a = paddsw(a, kCoefficientsRgbY[y][3]);
  1.2633 +-
  1.2634 +-  b >>= 6;
  1.2635 +-  g >>= 6;
  1.2636 +-  r >>= 6;
  1.2637 +-  a >>= 6;
  1.2638 +-
  1.2639 +-  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
  1.2640 +-                                        (packuswb(g) << 8) |
  1.2641 +-                                        (packuswb(r) << 16) |
  1.2642 +-                                        (packuswb(a) << 24);
  1.2643 +-}
  1.2644 +-
  1.2645 ++                              const uint8* u_buf,
  1.2646 ++                              const uint8* v_buf,
  1.2647 ++                              uint8* rgb_buf,
  1.2648 ++                              int width,
  1.2649 ++                              int source_dx)
  1.2650 ++{
  1.2651 ++  if (mozilla::supports_sse()) {
  1.2652 ++    PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
  1.2653 ++                                    source_dx, &kCoefficientsRgbY[0][0]);
  1.2654 ++    return;
  1.2655 ++  }
  1.2656 ++
  1.2657 ++  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
  1.2658 ++}
  1.2659 ++#else
  1.2660 + void FastConvertYUVToRGB32Row(const uint8* y_buf,
  1.2661 +                               const uint8* u_buf,
  1.2662 +                               const uint8* v_buf,
  1.2663 +                               uint8* rgb_buf,
  1.2664 +                               int width) {
  1.2665 +-  for (int x = 0; x < width; x += 2) {
  1.2666 +-    uint8 u = u_buf[x >> 1];
  1.2667 +-    uint8 v = v_buf[x >> 1];
  1.2668 +-    uint8 y0 = y_buf[x];
  1.2669 +-    YuvPixel(y0, u, v, rgb_buf);
  1.2670 +-    if ((x + 1) < width) {
  1.2671 +-      uint8 y1 = y_buf[x + 1];
  1.2672 +-      YuvPixel(y1, u, v, rgb_buf + 4);
  1.2673 +-    }
  1.2674 +-    rgb_buf += 8;  // Advance 2 pixels.
  1.2675 +-  }
  1.2676 +-}
  1.2677 +-
  1.2678 +-// 16.16 fixed point is used.  A shift by 16 isolates the integer.
  1.2679 +-// A shift by 17 is used to further subsample the chrominence channels.
  1.2680 +-// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
  1.2681 +-// for 1/65536 pixel accurate interpolation.
  1.2682 ++  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
  1.2683 ++}
  1.2684 ++
  1.2685 + void ScaleYUVToRGB32Row(const uint8* y_buf,
  1.2686 +                         const uint8* u_buf,
  1.2687 +                         const uint8* v_buf,
  1.2688 +                         uint8* rgb_buf,
  1.2689 +                         int width,
  1.2690 +                         int source_dx) {
  1.2691 +-  int x = 0;
  1.2692 +-  for (int i = 0; i < width; i += 2) {
  1.2693 +-    int y = y_buf[x >> 16];
  1.2694 +-    int u = u_buf[(x >> 17)];
  1.2695 +-    int v = v_buf[(x >> 17)];
  1.2696 +-    YuvPixel(y, u, v, rgb_buf);
  1.2697 +-    x += source_dx;
  1.2698 +-    if ((i + 1) < width) {
  1.2699 +-      y = y_buf[x >> 16];
  1.2700 +-      YuvPixel(y, u, v, rgb_buf+4);
  1.2701 +-      x += source_dx;
  1.2702 +-    }
  1.2703 +-    rgb_buf += 8;
  1.2704 +-  }
  1.2705 +-}
  1.2706 ++  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
  1.2707 ++}
  1.2708 + 
  1.2709 + void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  1.2710 +                               const uint8* u_buf,
  1.2711 +                               const uint8* v_buf,
  1.2712 +                               uint8* rgb_buf,
  1.2713 +                               int width,
  1.2714 +                               int source_dx) {
  1.2715 +-  int x = 0;
  1.2716 +-  if (source_dx >= 0x20000) {
  1.2717 +-    x = 32768;
  1.2718 +-  }
  1.2719 +-  for (int i = 0; i < width; i += 2) {
  1.2720 +-    int y0 = y_buf[x >> 16];
  1.2721 +-    int y1 = y_buf[(x >> 16) + 1];
  1.2722 +-    int u0 = u_buf[(x >> 17)];
  1.2723 +-    int u1 = u_buf[(x >> 17) + 1];
  1.2724 +-    int v0 = v_buf[(x >> 17)];
  1.2725 +-    int v1 = v_buf[(x >> 17) + 1];
  1.2726 +-    int y_frac = (x & 65535);
  1.2727 +-    int uv_frac = ((x >> 1) & 65535);
  1.2728 +-    int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
  1.2729 +-    int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
  1.2730 +-    int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
  1.2731 +-    YuvPixel(y, u, v, rgb_buf);
  1.2732 +-    x += source_dx;
  1.2733 +-    if ((i + 1) < width) {
  1.2734 +-      y0 = y_buf[x >> 16];
  1.2735 +-      y1 = y_buf[(x >> 16) + 1];
  1.2736 +-      y_frac = (x & 65535);
  1.2737 +-      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
  1.2738 +-      YuvPixel(y, u, v, rgb_buf+4);
  1.2739 +-      x += source_dx;
  1.2740 +-    }
  1.2741 +-    rgb_buf += 8;
  1.2742 +-  }
  1.2743 +-}
  1.2744 +-
  1.2745 +-#endif  // USE_MMX
  1.2746 +-}  // extern "C"
  1.2747 +-
  1.2748 ++  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
  1.2749 ++}
  1.2750 ++#endif
  1.2751 ++
  1.2752 ++}
  1.2753 +diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
  1.2754 +--- a/gfx/ycbcr/yuv_row_table.cpp
  1.2755 ++++ b/gfx/ycbcr/yuv_row_table.cpp
  1.2756 +@@ -1,13 +1,13 @@
  1.2757 + // Copyright (c) 2010 The Chromium Authors. All rights reserved.
  1.2758 + // Use of this source code is governed by a BSD-style license that can be
  1.2759 + // found in the LICENSE file.
  1.2760 + 
  1.2761 +-#include "media/base/yuv_row.h"
  1.2762 ++#include "yuv_row.h"
  1.2763 + 
  1.2764 + extern "C" {
  1.2765 + 
  1.2766 + #define RGBY(i) { \
  1.2767 +   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
  1.2768 +   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
  1.2769 +   static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
  1.2770 +   0 \
  1.2771 +diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
  1.2772 +--- a/gfx/ycbcr/yuv_row_win.cpp
  1.2773 ++++ b/gfx/ycbcr/yuv_row_win.cpp
  1.2774 +@@ -1,26 +1,27 @@
  1.2775 + // Copyright (c) 2010 The Chromium Authors. All rights reserved.
  1.2776 + // Use of this source code is governed by a BSD-style license that can be
  1.2777 + // found in the LICENSE file.
  1.2778 + 
  1.2779 +-#include "media/base/yuv_row.h"
  1.2780 ++#include "yuv_row.h"
  1.2781 ++#include "mozilla/SSE.h"
  1.2782 + 
  1.2783 + #define kCoefficientsRgbU kCoefficientsRgbY + 2048
  1.2784 + #define kCoefficientsRgbV kCoefficientsRgbY + 4096
  1.2785 + 
  1.2786 + extern "C" {
  1.2787 + 
  1.2788 +-#if USE_MMX
  1.2789 +-__declspec(naked)
  1.2790 +-void FastConvertYUVToRGB32Row(const uint8* y_buf,
  1.2791 +-                              const uint8* u_buf,
  1.2792 +-                              const uint8* v_buf,
  1.2793 +-                              uint8* rgb_buf,
  1.2794 +-                              int width) {
  1.2795 ++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
  1.2796 ++__declspec(naked)
  1.2797 ++void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
  1.2798 ++                                  const uint8* u_buf,
  1.2799 ++                                  const uint8* v_buf,
  1.2800 ++                                  uint8* rgb_buf,
  1.2801 ++                                  int width) {
  1.2802 +   __asm {
  1.2803 +     pushad
  1.2804 +     mov       edx, [esp + 32 + 4]   // Y
  1.2805 +     mov       edi, [esp + 32 + 8]   // U
  1.2806 +     mov       esi, [esp + 32 + 12]  // V
  1.2807 +     mov       ebp, [esp + 32 + 16]  // rgb
  1.2808 +     mov       ecx, [esp + 32 + 20]  // width
  1.2809 +     jmp       convertend
  1.2810 +@@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint
  1.2811 +  convertdone :
  1.2812 + 
  1.2813 +     popad
  1.2814 +     ret
  1.2815 +   }
  1.2816 + }
  1.2817 + 
  1.2818 + __declspec(naked)
  1.2819 +-void ConvertYUVToRGB32Row(const uint8* y_buf,
  1.2820 +-                          const uint8* u_buf,
  1.2821 +-                          const uint8* v_buf,
  1.2822 +-                          uint8* rgb_buf,
  1.2823 +-                          int width,
  1.2824 +-                          int step) {
  1.2825 ++void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
  1.2826 ++                              const uint8* u_buf,
  1.2827 ++                              const uint8* v_buf,
  1.2828 ++                              uint8* rgb_buf,
  1.2829 ++                              int width,
  1.2830 ++                              int step) {
  1.2831 +   __asm {
  1.2832 +     pushad
  1.2833 +     mov       edx, [esp + 32 + 4]   // Y
  1.2834 +     mov       edi, [esp + 32 + 8]   // U
  1.2835 +     mov       esi, [esp + 32 + 12]  // V
  1.2836 +     mov       ebp, [esp + 32 + 16]  // rgb
  1.2837 +     mov       ecx, [esp + 32 + 20]  // width
  1.2838 +     mov       ebx, [esp + 32 + 24]  // step
  1.2839 +@@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y
  1.2840 +  wdone :
  1.2841 + 
  1.2842 +     popad
  1.2843 +     ret
  1.2844 +   }
  1.2845 + }
  1.2846 + 
  1.2847 + __declspec(naked)
  1.2848 +-void RotateConvertYUVToRGB32Row(const uint8* y_buf,
  1.2849 +-                                const uint8* u_buf,
  1.2850 +-                                const uint8* v_buf,
  1.2851 +-                                uint8* rgb_buf,
  1.2852 +-                                int width,
  1.2853 +-                                int ystep,
  1.2854 +-                                int uvstep) {
  1.2855 ++void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
  1.2856 ++                                    const uint8* u_buf,
  1.2857 ++                                    const uint8* v_buf,
  1.2858 ++                                    uint8* rgb_buf,
  1.2859 ++                                    int width,
  1.2860 ++                                    int ystep,
  1.2861 ++                                    int uvstep) {
  1.2862 +   __asm {
  1.2863 +     pushad
  1.2864 +     mov       edx, [esp + 32 + 4]   // Y
  1.2865 +     mov       edi, [esp + 32 + 8]   // U
  1.2866 +     mov       esi, [esp + 32 + 12]  // V
  1.2867 +     mov       ebp, [esp + 32 + 16]  // rgb
  1.2868 +     mov       ecx, [esp + 32 + 20]  // width
  1.2869 +     jmp       wend
  1.2870 +@@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui
  1.2871 +  wdone :
  1.2872 + 
  1.2873 +     popad
  1.2874 +     ret
  1.2875 +   }
  1.2876 + }
  1.2877 + 
  1.2878 + __declspec(naked)
  1.2879 +-void DoubleYUVToRGB32Row(const uint8* y_buf,
  1.2880 +-                         const uint8* u_buf,
  1.2881 +-                         const uint8* v_buf,
  1.2882 +-                         uint8* rgb_buf,
  1.2883 +-                         int width) {
  1.2884 ++void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
  1.2885 ++                             const uint8* u_buf,
  1.2886 ++                             const uint8* v_buf,
  1.2887 ++                             uint8* rgb_buf,
  1.2888 ++                             int width) {
  1.2889 +   __asm {
  1.2890 +     pushad
  1.2891 +     mov       edx, [esp + 32 + 4]   // Y
  1.2892 +     mov       edi, [esp + 32 + 8]   // U
  1.2893 +     mov       esi, [esp + 32 + 12]  // V
  1.2894 +     mov       ebp, [esp + 32 + 16]  // rgb
  1.2895 +     mov       ecx, [esp + 32 + 20]  // width
  1.2896 +     jmp       wend
  1.2897 +@@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_
  1.2898 +     jns       wloop1
  1.2899 +  wdone :
  1.2900 +     popad
  1.2901 +     ret
  1.2902 +   }
  1.2903 + }
  1.2904 + 
  1.2905 + // This version does general purpose scaling by any amount, up or down.
  1.2906 +-// The only thing it can not do it rotation by 90 or 270.
  1.2907 +-// For performance the chroma is under sampled, reducing cost of a 3x
  1.2908 ++// The only thing it cannot do is rotation by 90 or 270.
  1.2909 ++// For performance the chroma is under-sampled, reducing cost of a 3x
  1.2910 + // 1080p scale from 8.4 ms to 5.4 ms.
  1.2911 + __declspec(naked)
  1.2912 +-void ScaleYUVToRGB32Row(const uint8* y_buf,
  1.2913 +-                        const uint8* u_buf,
  1.2914 +-                        const uint8* v_buf,
  1.2915 +-                        uint8* rgb_buf,
  1.2916 +-                        int width,
  1.2917 +-                        int source_dx) {
  1.2918 ++void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
  1.2919 ++                            const uint8* u_buf,
  1.2920 ++                            const uint8* v_buf,
  1.2921 ++                            uint8* rgb_buf,
  1.2922 ++                            int width,
  1.2923 ++                            int source_dx) {
  1.2924 +   __asm {
  1.2925 +     pushad
  1.2926 +     mov       edx, [esp + 32 + 4]   // Y
  1.2927 +     mov       edi, [esp + 32 + 8]   // U
  1.2928 +     mov       esi, [esp + 32 + 12]  // V
  1.2929 +     mov       ebp, [esp + 32 + 16]  // rgb
  1.2930 +     mov       ecx, [esp + 32 + 20]  // width
  1.2931 +     xor       ebx, ebx              // x
  1.2932 +@@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
  1.2933 + 
  1.2934 +  scaledone :
  1.2935 +     popad
  1.2936 +     ret
  1.2937 +   }
  1.2938 + }
  1.2939 + 
  1.2940 + __declspec(naked)
  1.2941 +-void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  1.2942 +-                              const uint8* u_buf,
  1.2943 +-                              const uint8* v_buf,
  1.2944 +-                              uint8* rgb_buf,
  1.2945 +-                              int width,
  1.2946 +-                              int source_dx) {
  1.2947 ++void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
  1.2948 ++                                  const uint8* u_buf,
  1.2949 ++                                  const uint8* v_buf,
  1.2950 ++                                  uint8* rgb_buf,
  1.2951 ++                                  int width,
  1.2952 ++                                  int source_dx) {
  1.2953 +   __asm {
  1.2954 +     pushad
  1.2955 +     mov       edx, [esp + 32 + 4]  // Y
  1.2956 +     mov       edi, [esp + 32 + 8]  // U
  1.2957 +                 // [esp + 32 + 12] // V
  1.2958 +     mov       ebp, [esp + 32 + 16] // rgb
  1.2959 +     mov       ecx, [esp + 32 + 20] // width
  1.2960 +     imul      ecx, [esp + 32 + 24] // source_dx
  1.2961 +@@ -438,152 +439,60 @@ lscalelastpixel:
  1.2962 +     paddsw    mm1, mm0
  1.2963 +     psraw     mm1, 6
  1.2964 +     packuswb  mm1, mm1
  1.2965 +     movd      [ebp], mm1
  1.2966 +     popad
  1.2967 +     ret
  1.2968 +   };
  1.2969 + }
  1.2970 +-#else  // USE_MMX
  1.2971 +-
  1.2972 +-// C reference code that mimic the YUV assembly.
  1.2973 +-#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
  1.2974 +-#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
  1.2975 +-    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
  1.2976 +-
  1.2977 +-static inline void YuvPixel(uint8 y,
  1.2978 +-                            uint8 u,
  1.2979 +-                            uint8 v,
  1.2980 +-                            uint8* rgb_buf) {
  1.2981 +-
  1.2982 +-  int b = kCoefficientsRgbY[256+u][0];
  1.2983 +-  int g = kCoefficientsRgbY[256+u][1];
  1.2984 +-  int r = kCoefficientsRgbY[256+u][2];
  1.2985 +-  int a = kCoefficientsRgbY[256+u][3];
  1.2986 +-
  1.2987 +-  b = paddsw(b, kCoefficientsRgbY[512+v][0]);
  1.2988 +-  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
  1.2989 +-  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
  1.2990 +-  a = paddsw(a, kCoefficientsRgbY[512+v][3]);
  1.2991 +-
  1.2992 +-  b = paddsw(b, kCoefficientsRgbY[y][0]);
  1.2993 +-  g = paddsw(g, kCoefficientsRgbY[y][1]);
  1.2994 +-  r = paddsw(r, kCoefficientsRgbY[y][2]);
  1.2995 +-  a = paddsw(a, kCoefficientsRgbY[y][3]);
  1.2996 +-
  1.2997 +-  b >>= 6;
  1.2998 +-  g >>= 6;
  1.2999 +-  r >>= 6;
  1.3000 +-  a >>= 6;
  1.3001 +-
  1.3002 +-  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
  1.3003 +-                                        (packuswb(g) << 8) |
  1.3004 +-                                        (packuswb(r) << 16) |
  1.3005 +-                                        (packuswb(a) << 24);
  1.3006 +-}
  1.3007 +-
  1.3008 +-#if TEST_MMX_YUV
  1.3009 +-static inline void YuvPixel(uint8 y,
  1.3010 +-                            uint8 u,
  1.3011 +-                            uint8 v,
  1.3012 +-                            uint8* rgb_buf) {
  1.3013 +-
  1.3014 +-  __asm {
  1.3015 +-    movzx     eax, u
  1.3016 +-    movq      mm0, [kCoefficientsRgbY+2048 + 8 * eax]
  1.3017 +-    movzx     eax, v
  1.3018 +-    paddsw    mm0, [kCoefficientsRgbY+4096 + 8 * eax]
  1.3019 +-    movzx     eax, y
  1.3020 +-    movq      mm1, [kCoefficientsRgbY + 8 * eax]
  1.3021 +-    paddsw    mm1, mm0
  1.3022 +-    psraw     mm1, 6
  1.3023 +-    packuswb  mm1, mm1
  1.3024 +-    mov       eax, rgb_buf
  1.3025 +-    movd      [eax], mm1
  1.3026 +-    emms
  1.3027 +-  }
  1.3028 +-}
  1.3029 +-#endif
  1.3030 ++#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
  1.3031 + 
  1.3032 + void FastConvertYUVToRGB32Row(const uint8* y_buf,
  1.3033 +                               const uint8* u_buf,
  1.3034 +                               const uint8* v_buf,
  1.3035 +                               uint8* rgb_buf,
  1.3036 +                               int width) {
  1.3037 +-  for (int x = 0; x < width; x += 2) {
  1.3038 +-    uint8 u = u_buf[x >> 1];
  1.3039 +-    uint8 v = v_buf[x >> 1];
  1.3040 +-    uint8 y0 = y_buf[x];
  1.3041 +-    YuvPixel(y0, u, v, rgb_buf);
  1.3042 +-    if ((x + 1) < width) {
  1.3043 +-      uint8 y1 = y_buf[x + 1];
  1.3044 +-      YuvPixel(y1, u, v, rgb_buf + 4);
  1.3045 +-    }
  1.3046 +-    rgb_buf += 8;  // Advance 2 pixels.
  1.3047 +-  }
  1.3048 +-}
  1.3049 +-
  1.3050 +-// 16.16 fixed point is used.  A shift by 16 isolates the integer.
  1.3051 +-// A shift by 17 is used to further subsample the chrominence channels.
  1.3052 +-// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
  1.3053 +-// for 1/65536 pixel accurate interpolation.
  1.3054 ++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
  1.3055 ++  if (mozilla::supports_sse()) {
  1.3056 ++    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
  1.3057 ++    return;
  1.3058 ++  }
  1.3059 ++#endif
  1.3060 ++
  1.3061 ++  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
  1.3062 ++}
  1.3063 ++
  1.3064 + void ScaleYUVToRGB32Row(const uint8* y_buf,
  1.3065 +                         const uint8* u_buf,
  1.3066 +                         const uint8* v_buf,
  1.3067 +                         uint8* rgb_buf,
  1.3068 +                         int width,
  1.3069 +                         int source_dx) {
  1.3070 +-  int x = 0;
  1.3071 +-  for (int i = 0; i < width; i += 2) {
  1.3072 +-    int y = y_buf[x >> 16];
  1.3073 +-    int u = u_buf[(x >> 17)];
  1.3074 +-    int v = v_buf[(x >> 17)];
  1.3075 +-    YuvPixel(y, u, v, rgb_buf);
  1.3076 +-    x += source_dx;
  1.3077 +-    if ((i + 1) < width) {
  1.3078 +-      y = y_buf[x >> 16];
  1.3079 +-      YuvPixel(y, u, v, rgb_buf+4);
  1.3080 +-      x += source_dx;
  1.3081 +-    }
  1.3082 +-    rgb_buf += 8;
  1.3083 +-  }
  1.3084 +-}
  1.3085 ++
  1.3086 ++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
  1.3087 ++  if (mozilla::supports_sse()) {
  1.3088 ++    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
  1.3089 ++    return;
  1.3090 ++  }
  1.3091 ++#endif
  1.3092 ++
  1.3093 ++  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
  1.3094 ++}
  1.3095 + 
  1.3096 + void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  1.3097 +                               const uint8* u_buf,
  1.3098 +                               const uint8* v_buf,
  1.3099 +                               uint8* rgb_buf,
  1.3100 +                               int width,
  1.3101 +                               int source_dx) {
  1.3102 +-  int x = 0;
  1.3103 +-  if (source_dx >= 0x20000) {
  1.3104 +-    x = 32768;
  1.3105 +-  }
  1.3106 +-  for (int i = 0; i < width; i += 2) {
  1.3107 +-    int y0 = y_buf[x >> 16];
  1.3108 +-    int y1 = y_buf[(x >> 16) + 1];
  1.3109 +-    int u0 = u_buf[(x >> 17)];
  1.3110 +-    int u1 = u_buf[(x >> 17) + 1];
  1.3111 +-    int v0 = v_buf[(x >> 17)];
  1.3112 +-    int v1 = v_buf[(x >> 17) + 1];
  1.3113 +-    int y_frac = (x & 65535);
  1.3114 +-    int uv_frac = ((x >> 1) & 65535);
  1.3115 +-    int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
  1.3116 +-    int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
  1.3117 +-    int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
  1.3118 +-    YuvPixel(y, u, v, rgb_buf);
  1.3119 +-    x += source_dx;
  1.3120 +-    if ((i + 1) < width) {
  1.3121 +-      y0 = y_buf[x >> 16];
  1.3122 +-      y1 = y_buf[(x >> 16) + 1];
  1.3123 +-      y_frac = (x & 65535);
  1.3124 +-      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
  1.3125 +-      YuvPixel(y, u, v, rgb_buf+4);
  1.3126 +-      x += source_dx;
  1.3127 +-    }
  1.3128 +-    rgb_buf += 8;
  1.3129 +-  }
  1.3130 +-}
  1.3131 +-
  1.3132 +-#endif  // USE_MMX
  1.3133 +-}  // extern "C"
  1.3134 +-
  1.3135 ++#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
  1.3136 ++  if (mozilla::supports_sse()) {
  1.3137 ++    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
  1.3138 ++                                 source_dx);
  1.3139 ++    return;
  1.3140 ++  }
  1.3141 ++#endif
  1.3142 ++
  1.3143 ++  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
  1.3144 ++}
  1.3145 ++
  1.3146 ++} // extern "C"

mercurial