gfx/ycbcr/convert.patch

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
     2 --- a/gfx/ycbcr/yuv_convert.cpp
     3 +++ b/gfx/ycbcr/yuv_convert.cpp
     4 @@ -6,145 +6,102 @@
     5  // http://www.fourcc.org/yuv.php
     6  // The actual conversion is best described here
     7  // http://en.wikipedia.org/wiki/YUV
     8  // An article on optimizing YUV conversion using tables instead of multiplies
     9  // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
    10  //
    11  // YV12 is a full plane of Y and a half height, half width chroma planes
    12  // YV16 is a full plane of Y and a full height, half width chroma planes
    13 +// YV24 is a full plane of Y and a full height, full width chroma planes
    14  //
    15  // ARGB pixel format is output, which on little endian is stored as BGRA.
    16  // The alpha is set to 255, allowing the application to use RGBA or RGB32.
    18 -#include "media/base/yuv_convert.h"
    19 +#include "yuv_convert.h"
    21  // Header for low level row functions.
    22 -#include "media/base/yuv_row.h"
    23 -
    24 -#if USE_MMX
    25 -#if defined(_MSC_VER)
    26 -#include <intrin.h>
    27 -#else
    28 -#include <mmintrin.h>
    29 -#endif
    30 -#endif
    31 -
    32 -#if USE_SSE2
    33 -#include <emmintrin.h>
    34 -#endif
    35 -
    36 -namespace media {
    37 -
    38 +#include "yuv_row.h"
    39 +#include "mozilla/SSE.h"
    40 +
    41 +namespace mozilla {
    42 +
    43 +namespace gfx {
    44 + 
    45  // 16.16 fixed point arithmetic
    46  const int kFractionBits = 16;
    47  const int kFractionMax = 1 << kFractionBits;
    48  const int kFractionMask = ((1 << kFractionBits) - 1);
    50  // Convert a frame of YUV to 32 bit ARGB.
    51 -void ConvertYUVToRGB32(const uint8* y_buf,
    52 -                       const uint8* u_buf,
    53 -                       const uint8* v_buf,
    54 -                       uint8* rgb_buf,
    55 -                       int width,
    56 -                       int height,
    57 -                       int y_pitch,
    58 -                       int uv_pitch,
    59 -                       int rgb_pitch,
    60 -                       YUVType yuv_type) {
    61 -  unsigned int y_shift = yuv_type;
    62 -  for (int y = 0; y < height; ++y) {
    63 -    uint8* rgb_row = rgb_buf + y * rgb_pitch;
    64 -    const uint8* y_ptr = y_buf + y * y_pitch;
    65 -    const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch;
    66 -    const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch;
    67 -
    68 -    FastConvertYUVToRGB32Row(y_ptr,
    69 -                             u_ptr,
    70 -                             v_ptr,
    71 -                             rgb_row,
    72 -                             width);
    73 -  }
    74 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
    75 +                                  const uint8* u_buf,
    76 +                                  const uint8* v_buf,
    77 +                                  uint8* rgb_buf,
    78 +                                  int pic_x,
    79 +                                  int pic_y,
    80 +                                  int pic_width,
    81 +                                  int pic_height,
    82 +                                  int y_pitch,
    83 +                                  int uv_pitch,
    84 +                                  int rgb_pitch,
    85 +                                  YUVType yuv_type) {
    86 +  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
    87 +  unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
    88 +  // Test for SSE because the optimized code uses movntq, which is not part of MMX.
    89 +  bool has_sse = supports_mmx() && supports_sse();
    90 +  // There is no optimized YV24 SSE routine so we check for this and
    91 +  // fall back to the C code.
    92 +  has_sse &= yuv_type != YV24;
    93 +  bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0;
    94 +  int x_width = odd_pic_x ? pic_width - 1 : pic_width;
    95 +
    96 +  for (int y = pic_y; y < pic_height + pic_y; ++y) {
    97 +    uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch;
    98 +    const uint8* y_ptr = y_buf + y * y_pitch + pic_x;
    99 +    const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
   100 +    const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
   101 +
   102 +    if (odd_pic_x) {
   103 +      // Handle the single odd pixel manually and use the
   104 +      // fast routines for the remaining.
   105 +      FastConvertYUVToRGB32Row_C(y_ptr++,
   106 +                                 u_ptr++,
   107 +                                 v_ptr++,
   108 +                                 rgb_row,
   109 +                                 1,
   110 +                                 x_shift);
   111 +      rgb_row += 4;
   112 +    }
   113 +
   114 +    if (has_sse) {
   115 +      FastConvertYUVToRGB32Row(y_ptr,
   116 +                               u_ptr,
   117 +                               v_ptr,
   118 +                               rgb_row,
   119 +                               x_width);
   120 +    }
   121 +    else {
   122 +      FastConvertYUVToRGB32Row_C(y_ptr,
   123 +                                 u_ptr,
   124 +                                 v_ptr,
   125 +                                 rgb_row,
   126 +                                 x_width,
   127 +                                 x_shift);
   128 +    }
   129 +  }
   131    // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
   132 -  EMMS();
   133 -}
   134 -
   135 -#if USE_SSE2
   136 -// FilterRows combines two rows of the image using linear interpolation.
   137 -// SSE2 version does 16 pixels at a time
   138 -
   139 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
   140 -                       int source_width, int source_y_fraction) {
   141 -  __m128i zero = _mm_setzero_si128();
   142 -  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
   143 -  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
   144 -
   145 -  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
   146 -  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
   147 -  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
   148 -  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
   149 -
   150 -  do {
   151 -    __m128i y0 = _mm_loadu_si128(y0_ptr128);
   152 -    __m128i y1 = _mm_loadu_si128(y1_ptr128);
   153 -    __m128i y2 = _mm_unpackhi_epi8(y0, zero);
   154 -    __m128i y3 = _mm_unpackhi_epi8(y1, zero);
   155 -    y0 = _mm_unpacklo_epi8(y0, zero);
   156 -    y1 = _mm_unpacklo_epi8(y1, zero);
   157 -    y0 = _mm_mullo_epi16(y0, y0_fraction);
   158 -    y1 = _mm_mullo_epi16(y1, y1_fraction);
   159 -    y2 = _mm_mullo_epi16(y2, y0_fraction);
   160 -    y3 = _mm_mullo_epi16(y3, y1_fraction);
   161 -    y0 = _mm_add_epi16(y0, y1);
   162 -    y2 = _mm_add_epi16(y2, y3);
   163 -    y0 = _mm_srli_epi16(y0, 8);
   164 -    y2 = _mm_srli_epi16(y2, 8);
   165 -    y0 = _mm_packus_epi16(y0, y2);
   166 -    *dest128++ = y0;
   167 -    ++y0_ptr128;
   168 -    ++y1_ptr128;
   169 -  } while (dest128 < end128);
   170 -}
   171 -#elif USE_MMX
   172 -// MMX version does 8 pixels at a time
   173 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
   174 -                       int source_width, int source_y_fraction) {
   175 -  __m64 zero = _mm_setzero_si64();
   176 -  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
   177 -  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
   178 -
   179 -  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
   180 -  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
   181 -  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
   182 -  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
   183 -
   184 -  do {
   185 -    __m64 y0 = *y0_ptr64++;
   186 -    __m64 y1 = *y1_ptr64++;
   187 -    __m64 y2 = _mm_unpackhi_pi8(y0, zero);
   188 -    __m64 y3 = _mm_unpackhi_pi8(y1, zero);
   189 -    y0 = _mm_unpacklo_pi8(y0, zero);
   190 -    y1 = _mm_unpacklo_pi8(y1, zero);
   191 -    y0 = _mm_mullo_pi16(y0, y0_fraction);
   192 -    y1 = _mm_mullo_pi16(y1, y1_fraction);
   193 -    y2 = _mm_mullo_pi16(y2, y0_fraction);
   194 -    y3 = _mm_mullo_pi16(y3, y1_fraction);
   195 -    y0 = _mm_add_pi16(y0, y1);
   196 -    y2 = _mm_add_pi16(y2, y3);
   197 -    y0 = _mm_srli_pi16(y0, 8);
   198 -    y2 = _mm_srli_pi16(y2, 8);
   199 -    y0 = _mm_packs_pu16(y0, y2);
   200 -    *dest64++ = y0;
   201 -  } while (dest64 < end64);
   202 -}
   203 -#else  // no MMX or SSE2
   204 +  if (has_sse)
   205 +    EMMS();
   206 +}
   207 +
   208  // C version does 8 at a time to mimic MMX code
   209 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
   210 -                       int source_width, int source_y_fraction) {
   211 +static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
   212 +                         int source_width, int source_y_fraction) {
   213    int y1_fraction = source_y_fraction;
   214    int y0_fraction = 256 - y1_fraction;
   215    uint8* end = ybuf + source_width;
   216    do {
   217      ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
   218      ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
   219      ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
   220      ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
   221 @@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons
   222      ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
   223      ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
   224      ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
   225      y0_ptr += 8;
   226      y1_ptr += 8;
   227      ybuf += 8;
   228    } while (ybuf < end);
   229  }
   230 -#endif
   231 +
   232 +#ifdef MOZILLA_MAY_SUPPORT_MMX
   233 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
   234 +                    int source_width, int source_y_fraction);
   235 +#endif
   236 +
   237 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
   238 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
   239 +                     int source_width, int source_y_fraction);
   240 +#endif
   241 +
   242 +static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
   243 +                              const uint8* y1_ptr, int source_width,
   244 +                              int source_y_fraction) {
   245 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
   246 +  if (mozilla::supports_sse2()) {
   247 +    FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
   248 +    return;
   249 +  }
   250 +#endif
   251 +
   252 +#ifdef MOZILLA_MAY_SUPPORT_MMX
   253 +  if (mozilla::supports_mmx()) {
   254 +    FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
   255 +    return;
   256 +  }
   257 +#endif
   258 +
   259 +  FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
   260 +}
   263  // Scale a frame of YUV to 32 bit ARGB.
   264 -void ScaleYUVToRGB32(const uint8* y_buf,
   265 -                     const uint8* u_buf,
   266 -                     const uint8* v_buf,
   267 -                     uint8* rgb_buf,
   268 -                     int source_width,
   269 -                     int source_height,
   270 -                     int width,
   271 -                     int height,
   272 -                     int y_pitch,
   273 -                     int uv_pitch,
   274 -                     int rgb_pitch,
   275 -                     YUVType yuv_type,
   276 -                     Rotate view_rotate,
   277 -                     ScaleFilter filter) {
   278 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,
   279 +                                const uint8* u_buf,
   280 +                                const uint8* v_buf,
   281 +                                uint8* rgb_buf,
   282 +                                int source_width,
   283 +                                int source_height,
   284 +                                int width,
   285 +                                int height,
   286 +                                int y_pitch,
   287 +                                int uv_pitch,
   288 +                                int rgb_pitch,
   289 +                                YUVType yuv_type,
   290 +                                Rotate view_rotate,
   291 +                                ScaleFilter filter) {
   292 +  bool has_mmx = supports_mmx();
   293 +
   294    // 4096 allows 3 buffers to fit in 12k.
   295    // Helps performance on CPU with 16K L1 cache.
   296    // Large enough for 3830x2160 and 30" displays which are 2560x1600.
   297    const int kFilterBufferSize = 4096;
   298    // Disable filtering if the screen is too big (to avoid buffer overflows).
   299    // This should never happen to regular users: they don't have monitors
   300    // wider than 4096 pixels.
   301    // TODO(fbarchard): Allow rotated videos to filter.
   302    if (source_width > kFilterBufferSize || view_rotate)
   303      filter = FILTER_NONE;
   305 -  unsigned int y_shift = yuv_type;
   306 +  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
   307    // Diagram showing origin and direction of source sampling.
   308    // ->0   4<-
   309    // 7       3
   310    //
   311    // 6       5
   312    // ->1   2<-
   313    // Rotations that start at right side of image.
   314    if ((view_rotate == ROTATE_180) ||
   315 @@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
   316      int source_uv_fraction =
   317          ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
   319      const uint8* y_ptr = y0_ptr;
   320      const uint8* u_ptr = u0_ptr;
   321      const uint8* v_ptr = v0_ptr;
   322      // Apply vertical filtering if necessary.
   323      // TODO(fbarchard): Remove memcpy when not necessary.
   324 -    if (filter & media::FILTER_BILINEAR_V) {
   325 +    if (filter & mozilla::gfx::FILTER_BILINEAR_V) {
   326        if (yscale_fixed != kFractionMax &&
   327            source_y_fraction && ((source_y + 1) < source_height)) {
   328          FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
   329        } else {
   330          memcpy(ybuf, y0_ptr, source_width);
   331        }
   332        y_ptr = ybuf;
   333        ybuf[source_width] = ybuf[source_width-1];
   334 @@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf,
   335        u_ptr = ubuf;
   336        v_ptr = vbuf;
   337        ubuf[uv_source_width] = ubuf[uv_source_width - 1];
   338        vbuf[uv_source_width] = vbuf[uv_source_width - 1];
   339      }
   340      if (source_dx == kFractionMax) {  // Not scaled
   341        FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
   342                                 dest_pixel, width);
   343 -    } else {
   344 -      if (filter & FILTER_BILINEAR_H) {
   345 +    } else if (filter & FILTER_BILINEAR_H) {
   346          LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
   347                                   dest_pixel, width, source_dx);
   348      } else {
   349  // Specialized scalers and rotation.
   350 -#if USE_MMX && defined(_MSC_VER)
   351 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86)
   352 +      if(mozilla::supports_sse()) {
   353          if (width == (source_width * 2)) {
   354 -          DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
   355 -                              dest_pixel, width);
   356 +          DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
   357 +                                  dest_pixel, width);
   358          } else if ((source_dx & kFractionMask) == 0) {
   359            // Scaling by integer scale factor. ie half.
   360 -          ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
   361 -                               dest_pixel, width,
   362 -                               source_dx >> kFractionBits);
   363 +          ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
   364 +                                   dest_pixel, width,
   365 +                                   source_dx >> kFractionBits);
   366          } else if (source_dx_uv == source_dx) {  // Not rotated.
   367            ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
   368                               dest_pixel, width, source_dx);
   369          } else {
   370 -          RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
   371 -                                     dest_pixel, width,
   372 -                                     source_dx >> kFractionBits,
   373 -                                     source_dx_uv >> kFractionBits);
   374 +          RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
   375 +                                         dest_pixel, width,
   376 +                                         source_dx >> kFractionBits,
   377 +                                         source_dx_uv >> kFractionBits);
   378          }
   379 +      }
   380 +      else {
   381 +        ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
   382 +                             dest_pixel, width, source_dx);
   383 +      }
   384  #else
   385 -        ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
   386 -                           dest_pixel, width, source_dx);
   387 -#endif
   388 -      }
   389 +      ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
   390 +                         dest_pixel, width, source_dx);
   391 +#endif
   392      }
   393    }
   394    // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
   395 -  EMMS();
   396 -}
   397 -
   398 -}  // namespace media
   399 +  if (has_mmx)
   400 +    EMMS();
   401 +}
   402 +
   403 +}  // namespace gfx
   404 +}  // namespace mozilla
   405 diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
   406 --- a/gfx/ycbcr/yuv_convert.h
   407 +++ b/gfx/ycbcr/yuv_convert.h
   408 @@ -1,72 +1,79 @@
   409  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
   410  // Use of this source code is governed by a BSD-style license that can be
   411  // found in the LICENSE file.
   413  #ifndef MEDIA_BASE_YUV_CONVERT_H_
   414  #define MEDIA_BASE_YUV_CONVERT_H_
   416 -#include "base/basictypes.h"
   417 -
   418 -namespace media {
   419 -
   420 +#include "chromium_types.h"
   421 +#include "gfxCore.h"
   422 +
   423 +namespace mozilla {
   424 +
   425 +namespace gfx {
   426 + 
   427  // Type of YUV surface.
   428  // The value of these enums matter as they are used to shift vertical indices.
   429  enum YUVType {
   430 -  YV16 = 0,           // YV16 is half width and full height chroma channels.
   431 -  YV12 = 1,           // YV12 is half width and half height chroma channels.
   432 +  YV12 = 0,           // YV12 is half width and half height chroma channels.
   433 +  YV16 = 1,           // YV16 is half width and full height chroma channels.
   434 +  YV24 = 2            // YV24 is full width and full height chroma channels.
   435  };
   437  // Mirror means flip the image horizontally, as in looking in a mirror.
   438  // Rotate happens after mirroring.
   439  enum Rotate {
   440    ROTATE_0,           // Rotation off.
   441    ROTATE_90,          // Rotate clockwise.
   442    ROTATE_180,         // Rotate upside down.
   443    ROTATE_270,         // Rotate counter clockwise.
   444    MIRROR_ROTATE_0,    // Mirror horizontally.
   445    MIRROR_ROTATE_90,   // Mirror then Rotate clockwise.
   446    MIRROR_ROTATE_180,  // Mirror vertically.
   447 -  MIRROR_ROTATE_270,  // Transpose.
   448 +  MIRROR_ROTATE_270   // Transpose.
   449  };
   451  // Filter affects how scaling looks.
   452  enum ScaleFilter {
   453    FILTER_NONE = 0,        // No filter (point sampled).
   454    FILTER_BILINEAR_H = 1,  // Bilinear horizontal filter.
   455    FILTER_BILINEAR_V = 2,  // Bilinear vertical filter.
   456 -  FILTER_BILINEAR = 3,    // Bilinear filter.
   457 +  FILTER_BILINEAR = 3     // Bilinear filter.
   458  };
   460  // Convert a frame of YUV to 32 bit ARGB.
   461  // Pass in YV16/YV12 depending on source format
   462 -void ConvertYUVToRGB32(const uint8* yplane,
   463 -                       const uint8* uplane,
   464 -                       const uint8* vplane,
   465 -                       uint8* rgbframe,
   466 -                       int width,
   467 -                       int height,
   468 -                       int ystride,
   469 -                       int uvstride,
   470 -                       int rgbstride,
   471 -                       YUVType yuv_type);
   472 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
   473 +                                  const uint8* uplane,
   474 +                                  const uint8* vplane,
   475 +                                  uint8* rgbframe,
   476 +                                  int pic_x,
   477 +                                  int pic_y,
   478 +                                  int pic_width,
   479 +                                  int pic_height,
   480 +                                  int ystride,
   481 +                                  int uvstride,
   482 +                                  int rgbstride,
   483 +                                  YUVType yuv_type);
   485  // Scale a frame of YUV to 32 bit ARGB.
   486  // Supports rotation and mirroring.
   487 -void ScaleYUVToRGB32(const uint8* yplane,
   488 -                     const uint8* uplane,
   489 -                     const uint8* vplane,
   490 -                     uint8* rgbframe,
   491 -                     int source_width,
   492 -                     int source_height,
   493 -                     int width,
   494 -                     int height,
   495 -                     int ystride,
   496 -                     int uvstride,
   497 -                     int rgbstride,
   498 -                     YUVType yuv_type,
   499 -                     Rotate view_rotate,
   500 -                     ScaleFilter filter);
   501 -
   502 -}  // namespace media
   503 -
   504 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane,
   505 +                                const uint8* uplane,
   506 +                                const uint8* vplane,
   507 +                                uint8* rgbframe,
   508 +                                int source_width,
   509 +                                int source_height,
   510 +                                int width,
   511 +                                int height,
   512 +                                int ystride,
   513 +                                int uvstride,
   514 +                                int rgbstride,
   515 +                                YUVType yuv_type,
   516 +                                Rotate view_rotate,
   517 +                                ScaleFilter filter);
   518 +
   519 +}  // namespace gfx
   520 +}  // namespace mozilla
   521 + 
   522  #endif  // MEDIA_BASE_YUV_CONVERT_H_
   523 diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp
   524 new file mode 100644
   525 --- /dev/null
   526 +++ b/gfx/ycbcr/yuv_convert_mmx.cpp
   527 @@ -0,0 +1,45 @@
   528 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
   529 +// Use of this source code is governed by a BSD-style license that can be
   530 +// found in the LICENSE file.
   531 +
   532 +#include <mmintrin.h>
   533 +#include "yuv_row.h"
   534 +
   535 +namespace mozilla {
   536 +namespace gfx {
   537 +
   538 +// FilterRows combines two rows of the image using linear interpolation.
   539 +// MMX version does 8 pixels at a time.
   540 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
   541 +                    int source_width, int source_y_fraction) {
   542 +  __m64 zero = _mm_setzero_si64();
   543 +  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
   544 +  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
   545 +
   546 +  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
   547 +  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
   548 +  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
   549 +  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
   550 +
   551 +  do {
   552 +    __m64 y0 = *y0_ptr64++;
   553 +    __m64 y1 = *y1_ptr64++;
   554 +    __m64 y2 = _mm_unpackhi_pi8(y0, zero);
   555 +    __m64 y3 = _mm_unpackhi_pi8(y1, zero);
   556 +    y0 = _mm_unpacklo_pi8(y0, zero);
   557 +    y1 = _mm_unpacklo_pi8(y1, zero);
   558 +    y0 = _mm_mullo_pi16(y0, y0_fraction);
   559 +    y1 = _mm_mullo_pi16(y1, y1_fraction);
   560 +    y2 = _mm_mullo_pi16(y2, y0_fraction);
   561 +    y3 = _mm_mullo_pi16(y3, y1_fraction);
   562 +    y0 = _mm_add_pi16(y0, y1);
   563 +    y2 = _mm_add_pi16(y2, y3);
   564 +    y0 = _mm_srli_pi16(y0, 8);
   565 +    y2 = _mm_srli_pi16(y2, 8);
   566 +    y0 = _mm_packs_pu16(y0, y2);
   567 +    *dest64++ = y0;
   568 +  } while (dest64 < end64);
   569 +}
   570 +
   571 +}
   572 +}
   573 diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp
   574 new file mode 100644
   575 --- /dev/null
   576 +++ b/gfx/ycbcr/yuv_convert_sse2.cpp
   577 @@ -0,0 +1,47 @@
   578 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
   579 +// Use of this source code is governed by a BSD-style license that can be
   580 +// found in the LICENSE file.
   581 +
   582 +#include <emmintrin.h>
   583 +#include "yuv_row.h"
   584 +
   585 +namespace mozilla {
   586 +namespace gfx {
   587 +
   588 +// FilterRows combines two rows of the image using linear interpolation.
   589 +// SSE2 version does 16 pixels at a time.
   590 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
   591 +                     int source_width, int source_y_fraction) {
   592 +  __m128i zero = _mm_setzero_si128();
   593 +  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
   594 +  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
   595 +
   596 +  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
   597 +  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
   598 +  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
   599 +  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
   600 +
   601 +  do {
   602 +    __m128i y0 = _mm_loadu_si128(y0_ptr128);
   603 +    __m128i y1 = _mm_loadu_si128(y1_ptr128);
   604 +    __m128i y2 = _mm_unpackhi_epi8(y0, zero);
   605 +    __m128i y3 = _mm_unpackhi_epi8(y1, zero);
   606 +    y0 = _mm_unpacklo_epi8(y0, zero);
   607 +    y1 = _mm_unpacklo_epi8(y1, zero);
   608 +    y0 = _mm_mullo_epi16(y0, y0_fraction);
   609 +    y1 = _mm_mullo_epi16(y1, y1_fraction);
   610 +    y2 = _mm_mullo_epi16(y2, y0_fraction);
   611 +    y3 = _mm_mullo_epi16(y3, y1_fraction);
   612 +    y0 = _mm_add_epi16(y0, y1);
   613 +    y2 = _mm_add_epi16(y2, y3);
   614 +    y0 = _mm_srli_epi16(y0, 8);
   615 +    y2 = _mm_srli_epi16(y2, 8);
   616 +    y0 = _mm_packus_epi16(y0, y2);
   617 +    *dest128++ = y0;
   618 +    ++y0_ptr128;
   619 +    ++y1_ptr128;
   620 +  } while (dest128 < end128);
   621 +}
   622 +
   623 +}
   624 +}
   625 diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
   626 --- a/gfx/ycbcr/yuv_row.h
   627 +++ b/gfx/ycbcr/yuv_row.h
   628 @@ -5,109 +5,133 @@
   629  // yuv_row internal functions to handle YUV conversion and scaling to RGB.
   630  // These functions are used from both yuv_convert.cc and yuv_scale.cc.
   632  // TODO(fbarchard): Write function that can handle rotation and scaling.
   634  #ifndef MEDIA_BASE_YUV_ROW_H_
   635  #define MEDIA_BASE_YUV_ROW_H_
   637 -#include "base/basictypes.h"
   638 +#include "chromium_types.h"
   640  extern "C" {
   641  // Can only do 1x.
   642  // This is the second fastest of the scalers.
   643  void FastConvertYUVToRGB32Row(const uint8* y_buf,
   644                                const uint8* u_buf,
   645                                const uint8* v_buf,
   646                                uint8* rgb_buf,
   647                                int width);
   649 -// Can do 1x, half size or any scale down by an integer amount.
   650 -// Step can be negative (mirroring, rotate 180).
   651 -// This is the third fastest of the scalers.
   652 -void ConvertYUVToRGB32Row(const uint8* y_buf,
   653 -                          const uint8* u_buf,
   654 -                          const uint8* v_buf,
   655 -                          uint8* rgb_buf,
   656 -                          int width,
   657 -                          int step);
   658 -
   659 -// Rotate is like Convert, but applies different step to Y versus U and V.
   660 -// This allows rotation by 90 or 270, by stepping by stride.
   661 -// This is the forth fastest of the scalers.
   662 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,
   663 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
   664                                  const uint8* u_buf,
   665                                  const uint8* v_buf,
   666                                  uint8* rgb_buf,
   667                                  int width,
   668 -                                int ystep,
   669 -                                int uvstep);
   670 +                                unsigned int x_shift);
   671 +
   672 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
   673 +                              const uint8* u_buf,
   674 +                              const uint8* v_buf,
   675 +                              uint8* rgb_buf,
   676 +                              int width);
   677 +
   678 +// Can do 1x, half size or any scale down by an integer amount.
   679 +// Step can be negative (mirroring, rotate 180).
   680 +// This is the third fastest of the scalers.
   681 +// Only defined on Windows x86-32.
   682 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
   683 +                              const uint8* u_buf,
   684 +                              const uint8* v_buf,
   685 +                              uint8* rgb_buf,
   686 +                              int width,
   687 +                              int step);
   688 +
   689 +// Rotate is like Convert, but applies different step to Y versus U and V.
   690 +// This allows rotation by 90 or 270, by stepping by stride.
    691 +// This is the fourth fastest of the scalers.
   692 +// Only defined on Windows x86-32.
   693 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
   694 +                                    const uint8* u_buf,
   695 +                                    const uint8* v_buf,
   696 +                                    uint8* rgb_buf,
   697 +                                    int width,
   698 +                                    int ystep,
   699 +                                    int uvstep);
   701  // Doubler does 4 pixels at a time.  Each pixel is replicated.
   702  // This is the fastest of the scalers.
   703 -void DoubleYUVToRGB32Row(const uint8* y_buf,
   704 -                         const uint8* u_buf,
   705 -                         const uint8* v_buf,
   706 -                         uint8* rgb_buf,
   707 -                         int width);
   708 +// Only defined on Windows x86-32.
   709 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
   710 +                             const uint8* u_buf,
   711 +                             const uint8* v_buf,
   712 +                             uint8* rgb_buf,
   713 +                             int width);
   715  // Handles arbitrary scaling up or down.
   716  // Mirroring is supported, but not 90 or 270 degree rotation.
   717  // Chroma is under sampled every 2 pixels for performance.
   718  void ScaleYUVToRGB32Row(const uint8* y_buf,
   719                          const uint8* u_buf,
   720                          const uint8* v_buf,
   721                          uint8* rgb_buf,
   722                          int width,
   723                          int source_dx);
   725 +void ScaleYUVToRGB32Row(const uint8* y_buf,
   726 +                        const uint8* u_buf,
   727 +                        const uint8* v_buf,
   728 +                        uint8* rgb_buf,
   729 +                        int width,
   730 +                        int source_dx);
   731 +
   732 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,
   733 +                          const uint8* u_buf,
   734 +                          const uint8* v_buf,
   735 +                          uint8* rgb_buf,
   736 +                          int width,
   737 +                          int source_dx);
   738 +
   739  // Handles arbitrary scaling up or down with bilinear filtering.
   740  // Mirroring is supported, but not 90 or 270 degree rotation.
   741  // Chroma is under sampled every 2 pixels for performance.
   742  // This is the slowest of the scalers.
   743  void LinearScaleYUVToRGB32Row(const uint8* y_buf,
   744                                const uint8* u_buf,
   745                                const uint8* v_buf,
   746                                uint8* rgb_buf,
   747                                int width,
   748                                int source_dx);
   750 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,
   751 +                              const uint8* u_buf,
   752 +                              const uint8* v_buf,
   753 +                              uint8* rgb_buf,
   754 +                              int width,
   755 +                              int source_dx);
   756 +
   757 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
   758 +                                const uint8* u_buf,
   759 +                                const uint8* v_buf,
   760 +                                uint8* rgb_buf,
   761 +                                int width,
   762 +                                int source_dx);
   763 +
   764 +
   765  #if defined(_MSC_VER)
   766  #define SIMD_ALIGNED(var) __declspec(align(16)) var
   767  #else
   768  #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
   769  #endif
   770  extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]);
   772 -// Method to force C version.
   773 -//#define USE_MMX 0
   774 -//#define USE_SSE2 0
   775 -
   776 -#if !defined(USE_MMX)
   777 -// Windows, Mac and Linux/BSD use MMX
   778 -#if defined(__MMX__) || defined(_MSC_VER)
   779 -#define USE_MMX 1
   780 -#else
   781 -#define USE_MMX 0
   782 -#endif
   783 -#endif
   784 -
   785 -#if !defined(USE_SSE2)
   786 -#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
   787 -#define USE_SSE2 1
   788 -#else
   789 -#define USE_SSE2 0
   790 -#endif
   791 -#endif
   792 -
   793  // x64 uses MMX2 (SSE) so emms is not required.
   794  // Warning C4799: function has no EMMS instruction.
   795  // EMMS() is slow and should be called by the calling function once per image.
   796 -#if USE_MMX && !defined(ARCH_CPU_X86_64)
   797 +#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
   798  #if defined(_MSC_VER)
   799  #define EMMS() __asm emms
   800  #pragma warning(disable: 4799)
   801  #else
   802  #define EMMS() asm("emms")
   803  #endif
   804  #else
   805  #define EMMS()
   806 diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
   807 --- a/gfx/ycbcr/yuv_row_c.cpp
   808 +++ b/gfx/ycbcr/yuv_row_c.cpp
   809 @@ -1,812 +1,18 @@
   810  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
   811  // Use of this source code is governed by a BSD-style license that can be
   812  // found in the LICENSE file.
   814 -#include "media/base/yuv_row.h"
   815 -
   816 -#ifdef _DEBUG
   817 -#include "base/logging.h"
   818 -#else
   819 +#include "yuv_row.h"
   820 +
   821  #define DCHECK(a)
   822 -#endif
   824  extern "C" {
   826 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
   827 -
   828 -// AMD64 ABI uses register paremters.
   829 -void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
   830 -                              const uint8* u_buf,  // rsi
   831 -                              const uint8* v_buf,  // rdx
   832 -                              uint8* rgb_buf,      // rcx
   833 -                              int width) {         // r8
   834 -  asm(
   835 -  "jmp    convertend\n"
   836 -"convertloop:"
   837 -  "movzb  (%1),%%r10\n"
   838 -  "add    $0x1,%1\n"
   839 -  "movzb  (%2),%%r11\n"
   840 -  "add    $0x1,%2\n"
   841 -  "movq   2048(%5,%%r10,8),%%xmm0\n"
   842 -  "movzb  (%0),%%r10\n"
   843 -  "movq   4096(%5,%%r11,8),%%xmm1\n"
   844 -  "movzb  0x1(%0),%%r11\n"
   845 -  "paddsw %%xmm1,%%xmm0\n"
   846 -  "movq   (%5,%%r10,8),%%xmm2\n"
   847 -  "add    $0x2,%0\n"
   848 -  "movq   (%5,%%r11,8),%%xmm3\n"
   849 -  "paddsw %%xmm0,%%xmm2\n"
   850 -  "paddsw %%xmm0,%%xmm3\n"
   851 -  "shufps $0x44,%%xmm3,%%xmm2\n"
   852 -  "psraw  $0x6,%%xmm2\n"
   853 -  "packuswb %%xmm2,%%xmm2\n"
   854 -  "movq   %%xmm2,0x0(%3)\n"
   855 -  "add    $0x8,%3\n"
   856 -"convertend:"
   857 -  "sub    $0x2,%4\n"
   858 -  "jns    convertloop\n"
   859 -
   860 -"convertnext:"
   861 -  "add    $0x1,%4\n"
   862 -  "js     convertdone\n"
   863 -
   864 -  "movzb  (%1),%%r10\n"
   865 -  "movq   2048(%5,%%r10,8),%%xmm0\n"
   866 -  "movzb  (%2),%%r10\n"
   867 -  "movq   4096(%5,%%r10,8),%%xmm1\n"
   868 -  "paddsw %%xmm1,%%xmm0\n"
   869 -  "movzb  (%0),%%r10\n"
   870 -  "movq   (%5,%%r10,8),%%xmm1\n"
   871 -  "paddsw %%xmm0,%%xmm1\n"
   872 -  "psraw  $0x6,%%xmm1\n"
   873 -  "packuswb %%xmm1,%%xmm1\n"
   874 -  "movd   %%xmm1,0x0(%3)\n"
   875 -"convertdone:"
   876 -  :
   877 -  : "r"(y_buf),  // %0
   878 -    "r"(u_buf),  // %1
   879 -    "r"(v_buf),  // %2
   880 -    "r"(rgb_buf),  // %3
   881 -    "r"(width),  // %4
   882 -    "r" (kCoefficientsRgbY)  // %5
   883 -  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
   884 -);
   885 -}
   886 -
   887 -void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi
   888 -                        const uint8* u_buf,  // rsi
   889 -                        const uint8* v_buf,  // rdx
   890 -                        uint8* rgb_buf,      // rcx
   891 -                        int width,           // r8
   892 -                        int source_dx) {     // r9
   893 -  asm(
   894 -  "xor    %%r11,%%r11\n"
   895 -  "sub    $0x2,%4\n"
   896 -  "js     scalenext\n"
   897 -
   898 -"scaleloop:"
   899 -  "mov    %%r11,%%r10\n"
   900 -  "sar    $0x11,%%r10\n"
   901 -  "movzb  (%1,%%r10,1),%%rax\n"
   902 -  "movq   2048(%5,%%rax,8),%%xmm0\n"
   903 -  "movzb  (%2,%%r10,1),%%rax\n"
   904 -  "movq   4096(%5,%%rax,8),%%xmm1\n"
   905 -  "lea    (%%r11,%6),%%r10\n"
   906 -  "sar    $0x10,%%r11\n"
   907 -  "movzb  (%0,%%r11,1),%%rax\n"
   908 -  "paddsw %%xmm1,%%xmm0\n"
   909 -  "movq   (%5,%%rax,8),%%xmm1\n"
   910 -  "lea    (%%r10,%6),%%r11\n"
   911 -  "sar    $0x10,%%r10\n"
   912 -  "movzb  (%0,%%r10,1),%%rax\n"
   913 -  "movq   (%5,%%rax,8),%%xmm2\n"
   914 -  "paddsw %%xmm0,%%xmm1\n"
   915 -  "paddsw %%xmm0,%%xmm2\n"
   916 -  "shufps $0x44,%%xmm2,%%xmm1\n"
   917 -  "psraw  $0x6,%%xmm1\n"
   918 -  "packuswb %%xmm1,%%xmm1\n"
   919 -  "movq   %%xmm1,0x0(%3)\n"
   920 -  "add    $0x8,%3\n"
   921 -  "sub    $0x2,%4\n"
   922 -  "jns    scaleloop\n"
   923 -
   924 -"scalenext:"
   925 -  "add    $0x1,%4\n"
   926 -  "js     scaledone\n"
   927 -
   928 -  "mov    %%r11,%%r10\n"
   929 -  "sar    $0x11,%%r10\n"
   930 -  "movzb  (%1,%%r10,1),%%rax\n"
   931 -  "movq   2048(%5,%%rax,8),%%xmm0\n"
   932 -  "movzb  (%2,%%r10,1),%%rax\n"
   933 -  "movq   4096(%5,%%rax,8),%%xmm1\n"
   934 -  "paddsw %%xmm1,%%xmm0\n"
   935 -  "sar    $0x10,%%r11\n"
   936 -  "movzb  (%0,%%r11,1),%%rax\n"
   937 -  "movq   (%5,%%rax,8),%%xmm1\n"
   938 -  "paddsw %%xmm0,%%xmm1\n"
   939 -  "psraw  $0x6,%%xmm1\n"
   940 -  "packuswb %%xmm1,%%xmm1\n"
   941 -  "movd   %%xmm1,0x0(%3)\n"
   942 -
   943 -"scaledone:"
   944 -  :
   945 -  : "r"(y_buf),  // %0
   946 -    "r"(u_buf),  // %1
   947 -    "r"(v_buf),  // %2
   948 -    "r"(rgb_buf),  // %3
   949 -    "r"(width),  // %4
   950 -    "r" (kCoefficientsRgbY),  // %5
   951 -    "r"(static_cast<long>(source_dx))  // %6
   952 -  : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
   953 -);
   954 -}
   955 -
   956 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
   957 -                              const uint8* u_buf,
   958 -                              const uint8* v_buf,
   959 -                              uint8* rgb_buf,
   960 -                              int width,
   961 -                              int source_dx) {
   962 -  asm(
   963 -  "xor    %%r11,%%r11\n"   // x = 0
   964 -  "sub    $0x2,%4\n"
   965 -  "js     .lscalenext\n"
   966 -  "cmp    $0x20000,%6\n"   // if source_dx >= 2.0
   967 -  "jl     .lscalehalf\n"
   968 -  "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
   969 -".lscalehalf:"
   970 -
   971 -".lscaleloop:"
   972 -  "mov    %%r11,%%r10\n"
   973 -  "sar    $0x11,%%r10\n"
   974 -
   975 -  "movzb  (%1, %%r10, 1), %%r13 \n"
   976 -  "movzb  1(%1, %%r10, 1), %%r14 \n"
   977 -  "mov    %%r11, %%rax \n"
   978 -  "and    $0x1fffe, %%rax \n"
   979 -  "imul   %%rax, %%r14 \n"
   980 -  "xor    $0x1fffe, %%rax \n"
   981 -  "imul   %%rax, %%r13 \n"
   982 -  "add    %%r14, %%r13 \n"
   983 -  "shr    $17, %%r13 \n"
   984 -  "movq   2048(%5,%%r13,8), %%xmm0\n"
   985 -
   986 -  "movzb  (%2, %%r10, 1), %%r13 \n"
   987 -  "movzb  1(%2, %%r10, 1), %%r14 \n"
   988 -  "mov    %%r11, %%rax \n"
   989 -  "and    $0x1fffe, %%rax \n"
   990 -  "imul   %%rax, %%r14 \n"
   991 -  "xor    $0x1fffe, %%rax \n"
   992 -  "imul   %%rax, %%r13 \n"
   993 -  "add    %%r14, %%r13 \n"
   994 -  "shr    $17, %%r13 \n"
   995 -  "movq   4096(%5,%%r13,8), %%xmm1\n"
   996 -
   997 -  "mov    %%r11, %%rax \n"
   998 -  "lea    (%%r11,%6),%%r10\n"
   999 -  "sar    $0x10,%%r11\n"
  1000 -  "paddsw %%xmm1,%%xmm0\n"
  1002 -  "movzb  (%0, %%r11, 1), %%r13 \n"
  1003 -  "movzb  1(%0, %%r11, 1), %%r14 \n"
  1004 -  "and    $0xffff, %%rax \n"
  1005 -  "imul   %%rax, %%r14 \n"
  1006 -  "xor    $0xffff, %%rax \n"
  1007 -  "imul   %%rax, %%r13 \n"
  1008 -  "add    %%r14, %%r13 \n"
  1009 -  "shr    $16, %%r13 \n"
  1010 -  "movq   (%5,%%r13,8),%%xmm1\n"
  1012 -  "mov    %%r10, %%rax \n"
  1013 -  "lea    (%%r10,%6),%%r11\n"
  1014 -  "sar    $0x10,%%r10\n"
  1016 -  "movzb  (%0,%%r10,1), %%r13 \n"
  1017 -  "movzb  1(%0,%%r10,1), %%r14 \n"
  1018 -  "and    $0xffff, %%rax \n"
  1019 -  "imul   %%rax, %%r14 \n"
  1020 -  "xor    $0xffff, %%rax \n"
  1021 -  "imul   %%rax, %%r13 \n"
  1022 -  "add    %%r14, %%r13 \n"
  1023 -  "shr    $16, %%r13 \n"
  1024 -  "movq   (%5,%%r13,8),%%xmm2\n"
  1026 -  "paddsw %%xmm0,%%xmm1\n"
  1027 -  "paddsw %%xmm0,%%xmm2\n"
  1028 -  "shufps $0x44,%%xmm2,%%xmm1\n"
  1029 -  "psraw  $0x6,%%xmm1\n"
  1030 -  "packuswb %%xmm1,%%xmm1\n"
  1031 -  "movq   %%xmm1,0x0(%3)\n"
  1032 -  "add    $0x8,%3\n"
  1033 -  "sub    $0x2,%4\n"
  1034 -  "jns    .lscaleloop\n"
  1036 -".lscalenext:"
  1037 -  "add    $0x1,%4\n"
  1038 -  "js     .lscaledone\n"
  1040 -  "mov    %%r11,%%r10\n"
  1041 -  "sar    $0x11,%%r10\n"
  1043 -  "movzb  (%1,%%r10,1), %%r13 \n"
  1044 -  "movq   2048(%5,%%r13,8),%%xmm0\n"
  1046 -  "movzb  (%2,%%r10,1), %%r13 \n"
  1047 -  "movq   4096(%5,%%r13,8),%%xmm1\n"
  1049 -  "paddsw %%xmm1,%%xmm0\n"
  1050 -  "sar    $0x10,%%r11\n"
  1052 -  "movzb  (%0,%%r11,1), %%r13 \n"
  1053 -  "movq   (%5,%%r13,8),%%xmm1\n"
  1055 -  "paddsw %%xmm0,%%xmm1\n"
  1056 -  "psraw  $0x6,%%xmm1\n"
  1057 -  "packuswb %%xmm1,%%xmm1\n"
  1058 -  "movd   %%xmm1,0x0(%3)\n"
  1060 -".lscaledone:"
  1061 -  :
  1062 -  : "r"(y_buf),  // %0
  1063 -    "r"(u_buf),  // %1
  1064 -    "r"(v_buf),  // %2
  1065 -    "r"(rgb_buf),  // %3
  1066 -    "r"(width),  // %4
  1067 -    "r" (kCoefficientsRgbY),  // %5
  1068 -    "r"(static_cast<long>(source_dx))  // %6
  1069 -  : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
  1070 -);
  1071 -}
  1073 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
  1075 -// PIC version is slower because less registers are available, so
  1076 -// non-PIC is used on platforms where it is possible.
  1078 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
  1079 -                              const uint8* u_buf,
  1080 -                              const uint8* v_buf,
  1081 -                              uint8* rgb_buf,
  1082 -                              int width);
  1083 -  asm(
  1084 -  ".text\n"
  1085 -  ".global FastConvertYUVToRGB32Row\n"
  1086 -"FastConvertYUVToRGB32Row:\n"
  1087 -  "pusha\n"
  1088 -  "mov    0x24(%esp),%edx\n"
  1089 -  "mov    0x28(%esp),%edi\n"
  1090 -  "mov    0x2c(%esp),%esi\n"
  1091 -  "mov    0x30(%esp),%ebp\n"
  1092 -  "mov    0x34(%esp),%ecx\n"
  1093 -  "jmp    convertend\n"
  1095 -"convertloop:"
  1096 -  "movzbl (%edi),%eax\n"
  1097 -  "add    $0x1,%edi\n"
  1098 -  "movzbl (%esi),%ebx\n"
  1099 -  "add    $0x1,%esi\n"
  1100 -  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  1101 -  "movzbl (%edx),%eax\n"
  1102 -  "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
  1103 -  "movzbl 0x1(%edx),%ebx\n"
  1104 -  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
  1105 -  "add    $0x2,%edx\n"
  1106 -  "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
  1107 -  "paddsw %mm0,%mm1\n"
  1108 -  "paddsw %mm0,%mm2\n"
  1109 -  "psraw  $0x6,%mm1\n"
  1110 -  "psraw  $0x6,%mm2\n"
  1111 -  "packuswb %mm2,%mm1\n"
  1112 -  "movntq %mm1,0x0(%ebp)\n"
  1113 -  "add    $0x8,%ebp\n"
  1114 -"convertend:"
  1115 -  "sub    $0x2,%ecx\n"
  1116 -  "jns    convertloop\n"
  1118 -  "and    $0x1,%ecx\n"
  1119 -  "je     convertdone\n"
  1121 -  "movzbl (%edi),%eax\n"
  1122 -  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  1123 -  "movzbl (%esi),%eax\n"
  1124 -  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
  1125 -  "movzbl (%edx),%eax\n"
  1126 -  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
  1127 -  "paddsw %mm0,%mm1\n"
  1128 -  "psraw  $0x6,%mm1\n"
  1129 -  "packuswb %mm1,%mm1\n"
  1130 -  "movd   %mm1,0x0(%ebp)\n"
  1131 -"convertdone:"
  1132 -  "popa\n"
  1133 -  "ret\n"
  1134 -);
  1137 -void ScaleYUVToRGB32Row(const uint8* y_buf,
  1138 -                        const uint8* u_buf,
  1139 -                        const uint8* v_buf,
  1140 -                        uint8* rgb_buf,
  1141 -                        int width,
  1142 -                        int source_dx);
  1143 -  asm(
  1144 -  ".text\n"
  1145 -  ".global ScaleYUVToRGB32Row\n"
  1146 -"ScaleYUVToRGB32Row:\n"
  1147 -  "pusha\n"
  1148 -  "mov    0x24(%esp),%edx\n"
  1149 -  "mov    0x28(%esp),%edi\n"
  1150 -  "mov    0x2c(%esp),%esi\n"
  1151 -  "mov    0x30(%esp),%ebp\n"
  1152 -  "mov    0x34(%esp),%ecx\n"
  1153 -  "xor    %ebx,%ebx\n"
  1154 -  "jmp    scaleend\n"
  1156 -"scaleloop:"
  1157 -  "mov    %ebx,%eax\n"
  1158 -  "sar    $0x11,%eax\n"
  1159 -  "movzbl (%edi,%eax,1),%eax\n"
  1160 -  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  1161 -  "mov    %ebx,%eax\n"
  1162 -  "sar    $0x11,%eax\n"
  1163 -  "movzbl (%esi,%eax,1),%eax\n"
  1164 -  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
  1165 -  "mov    %ebx,%eax\n"
  1166 -  "add    0x38(%esp),%ebx\n"
  1167 -  "sar    $0x10,%eax\n"
  1168 -  "movzbl (%edx,%eax,1),%eax\n"
  1169 -  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
  1170 -  "mov    %ebx,%eax\n"
  1171 -  "add    0x38(%esp),%ebx\n"
  1172 -  "sar    $0x10,%eax\n"
  1173 -  "movzbl (%edx,%eax,1),%eax\n"
  1174 -  "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
  1175 -  "paddsw %mm0,%mm1\n"
  1176 -  "paddsw %mm0,%mm2\n"
  1177 -  "psraw  $0x6,%mm1\n"
  1178 -  "psraw  $0x6,%mm2\n"
  1179 -  "packuswb %mm2,%mm1\n"
  1180 -  "movntq %mm1,0x0(%ebp)\n"
  1181 -  "add    $0x8,%ebp\n"
  1182 -"scaleend:"
  1183 -  "sub    $0x2,%ecx\n"
  1184 -  "jns    scaleloop\n"
  1186 -  "and    $0x1,%ecx\n"
  1187 -  "je     scaledone\n"
  1189 -  "mov    %ebx,%eax\n"
  1190 -  "sar    $0x11,%eax\n"
  1191 -  "movzbl (%edi,%eax,1),%eax\n"
  1192 -  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  1193 -  "mov    %ebx,%eax\n"
  1194 -  "sar    $0x11,%eax\n"
  1195 -  "movzbl (%esi,%eax,1),%eax\n"
  1196 -  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
  1197 -  "mov    %ebx,%eax\n"
  1198 -  "sar    $0x10,%eax\n"
  1199 -  "movzbl (%edx,%eax,1),%eax\n"
  1200 -  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
  1201 -  "paddsw %mm0,%mm1\n"
  1202 -  "psraw  $0x6,%mm1\n"
  1203 -  "packuswb %mm1,%mm1\n"
  1204 -  "movd   %mm1,0x0(%ebp)\n"
  1206 -"scaledone:"
  1207 -  "popa\n"
  1208 -  "ret\n"
  1209 -);
  1211 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  1212 -                              const uint8* u_buf,
  1213 -                              const uint8* v_buf,
  1214 -                              uint8* rgb_buf,
  1215 -                              int width,
  1216 -                              int source_dx);
  1217 -  asm(
  1218 -  ".text\n"
  1219 -  ".global LinearScaleYUVToRGB32Row\n"
  1220 -"LinearScaleYUVToRGB32Row:\n"
  1221 -  "pusha\n"
  1222 -  "mov    0x24(%esp),%edx\n"
  1223 -  "mov    0x28(%esp),%edi\n"
  1224 -  "mov    0x30(%esp),%ebp\n"
  1226 -  // source_width = width * source_dx + ebx
  1227 -  "mov    0x34(%esp), %ecx\n"
  1228 -  "imull  0x38(%esp), %ecx\n"
  1229 -  "mov    %ecx, 0x34(%esp)\n"
  1231 -  "mov    0x38(%esp), %ecx\n"
  1232 -  "xor    %ebx,%ebx\n"     // x = 0
  1233 -  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
  1234 -  "jl     .lscaleend\n"
  1235 -  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
  1236 -  "jmp    .lscaleend\n"
  1238 -".lscaleloop:"
  1239 -  "mov    %ebx,%eax\n"
  1240 -  "sar    $0x11,%eax\n"
  1242 -  "movzbl (%edi,%eax,1),%ecx\n"
  1243 -  "movzbl 1(%edi,%eax,1),%esi\n"
  1244 -  "mov    %ebx,%eax\n"
  1245 -  "andl   $0x1fffe, %eax \n"
  1246 -  "imul   %eax, %esi \n"
  1247 -  "xorl   $0x1fffe, %eax \n"
  1248 -  "imul   %eax, %ecx \n"
  1249 -  "addl   %esi, %ecx \n"
  1250 -  "shrl   $17, %ecx \n"
  1251 -  "movq   kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
  1253 -  "mov    0x2c(%esp),%esi\n"
  1254 -  "mov    %ebx,%eax\n"
  1255 -  "sar    $0x11,%eax\n"
  1257 -  "movzbl (%esi,%eax,1),%ecx\n"
  1258 -  "movzbl 1(%esi,%eax,1),%esi\n"
  1259 -  "mov    %ebx,%eax\n"
  1260 -  "andl   $0x1fffe, %eax \n"
  1261 -  "imul   %eax, %esi \n"
  1262 -  "xorl   $0x1fffe, %eax \n"
  1263 -  "imul   %eax, %ecx \n"
  1264 -  "addl   %esi, %ecx \n"
  1265 -  "shrl   $17, %ecx \n"
  1266 -  "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
  1268 -  "mov    %ebx,%eax\n"
  1269 -  "sar    $0x10,%eax\n"
  1270 -  "movzbl (%edx,%eax,1),%ecx\n"
  1271 -  "movzbl 1(%edx,%eax,1),%esi\n"
  1272 -  "mov    %ebx,%eax\n"
  1273 -  "add    0x38(%esp),%ebx\n"
  1274 -  "andl   $0xffff, %eax \n"
  1275 -  "imul   %eax, %esi \n"
  1276 -  "xorl   $0xffff, %eax \n"
  1277 -  "imul   %eax, %ecx \n"
  1278 -  "addl   %esi, %ecx \n"
  1279 -  "shrl   $16, %ecx \n"
  1280 -  "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
  1282 -  "cmp    0x34(%esp), %ebx\n"
  1283 -  "jge    .lscalelastpixel\n"
  1285 -  "mov    %ebx,%eax\n"
  1286 -  "sar    $0x10,%eax\n"
  1287 -  "movzbl (%edx,%eax,1),%ecx\n"
  1288 -  "movzbl 1(%edx,%eax,1),%esi\n"
  1289 -  "mov    %ebx,%eax\n"
  1290 -  "add    0x38(%esp),%ebx\n"
  1291 -  "andl   $0xffff, %eax \n"
  1292 -  "imul   %eax, %esi \n"
  1293 -  "xorl   $0xffff, %eax \n"
  1294 -  "imul   %eax, %ecx \n"
  1295 -  "addl   %esi, %ecx \n"
  1296 -  "shrl   $16, %ecx \n"
  1297 -  "movq   kCoefficientsRgbY(,%ecx,8),%mm2\n"
  1299 -  "paddsw %mm0,%mm1\n"
  1300 -  "paddsw %mm0,%mm2\n"
  1301 -  "psraw  $0x6,%mm1\n"
  1302 -  "psraw  $0x6,%mm2\n"
  1303 -  "packuswb %mm2,%mm1\n"
  1304 -  "movntq %mm1,0x0(%ebp)\n"
  1305 -  "add    $0x8,%ebp\n"
  1307 -".lscaleend:"
  1308 -  "cmp    0x34(%esp), %ebx\n"
  1309 -  "jl     .lscaleloop\n"
  1310 -  "popa\n"
  1311 -  "ret\n"
  1313 -".lscalelastpixel:"
  1314 -  "paddsw %mm0, %mm1\n"
  1315 -  "psraw $6, %mm1\n"
  1316 -  "packuswb %mm1, %mm1\n"
  1317 -  "movd %mm1, (%ebp)\n"
  1318 -  "popa\n"
  1319 -  "ret\n"
  1320 -);
  1322 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
  1324 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
  1325 -                                    const uint8* u_buf,
  1326 -                                    const uint8* v_buf,
  1327 -                                    uint8* rgb_buf,
  1328 -                                    int width,
  1329 -                                    int16 *kCoefficientsRgbY);
  1330 -  asm(
  1331 -  ".text\n"
  1332 -#if defined(OS_MACOSX)
  1333 -"_PICConvertYUVToRGB32Row:\n"
  1334 -#else
  1335 -"PICConvertYUVToRGB32Row:\n"
  1336 -#endif
  1337 -  "pusha\n"
  1338 -  "mov    0x24(%esp),%edx\n"
  1339 -  "mov    0x28(%esp),%edi\n"
  1340 -  "mov    0x2c(%esp),%esi\n"
  1341 -  "mov    0x30(%esp),%ebp\n"
  1342 -  "mov    0x38(%esp),%ecx\n"
  1344 -  "jmp    .Lconvertend\n"
  1346 -".Lconvertloop:"
  1347 -  "movzbl (%edi),%eax\n"
  1348 -  "add    $0x1,%edi\n"
  1349 -  "movzbl (%esi),%ebx\n"
  1350 -  "add    $0x1,%esi\n"
  1351 -  "movq   2048(%ecx,%eax,8),%mm0\n"
  1352 -  "movzbl (%edx),%eax\n"
  1353 -  "paddsw 4096(%ecx,%ebx,8),%mm0\n"
  1354 -  "movzbl 0x1(%edx),%ebx\n"
  1355 -  "movq   0(%ecx,%eax,8),%mm1\n"
  1356 -  "add    $0x2,%edx\n"
  1357 -  "movq   0(%ecx,%ebx,8),%mm2\n"
  1358 -  "paddsw %mm0,%mm1\n"
  1359 -  "paddsw %mm0,%mm2\n"
  1360 -  "psraw  $0x6,%mm1\n"
  1361 -  "psraw  $0x6,%mm2\n"
  1362 -  "packuswb %mm2,%mm1\n"
  1363 -  "movntq %mm1,0x0(%ebp)\n"
  1364 -  "add    $0x8,%ebp\n"
  1365 -".Lconvertend:"
  1366 -  "subl   $0x2,0x34(%esp)\n"
  1367 -  "jns    .Lconvertloop\n"
  1369 -  "andl   $0x1,0x34(%esp)\n"
  1370 -  "je     .Lconvertdone\n"
  1372 -  "movzbl (%edi),%eax\n"
  1373 -  "movq   2048(%ecx,%eax,8),%mm0\n"
  1374 -  "movzbl (%esi),%eax\n"
  1375 -  "paddsw 4096(%ecx,%eax,8),%mm0\n"
  1376 -  "movzbl (%edx),%eax\n"
  1377 -  "movq   0(%ecx,%eax,8),%mm1\n"
  1378 -  "paddsw %mm0,%mm1\n"
  1379 -  "psraw  $0x6,%mm1\n"
  1380 -  "packuswb %mm1,%mm1\n"
  1381 -  "movd   %mm1,0x0(%ebp)\n"
  1382 -".Lconvertdone:\n"
  1383 -  "popa\n"
  1384 -  "ret\n"
  1385 -);
  1387 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
  1388 -                              const uint8* u_buf,
  1389 -                              const uint8* v_buf,
  1390 -                              uint8* rgb_buf,
  1391 -                              int width) {
  1392 -  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
  1393 -                          &kCoefficientsRgbY[0][0]);
  1394 -}
  1396 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
  1397 -                               const uint8* u_buf,
  1398 -                               const uint8* v_buf,
  1399 -                               uint8* rgb_buf,
  1400 -                               int width,
  1401 -                               int source_dx,
  1402 -                               int16 *kCoefficientsRgbY);
  1404 -  asm(
  1405 -  ".text\n"
  1406 -#if defined(OS_MACOSX)
  1407 -"_PICScaleYUVToRGB32Row:\n"
  1408 -#else
  1409 -"PICScaleYUVToRGB32Row:\n"
  1410 -#endif
  1411 -  "pusha\n"
  1412 -  "mov    0x24(%esp),%edx\n"
  1413 -  "mov    0x28(%esp),%edi\n"
  1414 -  "mov    0x2c(%esp),%esi\n"
  1415 -  "mov    0x30(%esp),%ebp\n"
  1416 -  "mov    0x3c(%esp),%ecx\n"
  1417 -  "xor    %ebx,%ebx\n"
  1418 -  "jmp    Lscaleend\n"
  1420 -"Lscaleloop:"
  1421 -  "mov    %ebx,%eax\n"
  1422 -  "sar    $0x11,%eax\n"
  1423 -  "movzbl (%edi,%eax,1),%eax\n"
  1424 -  "movq   2048(%ecx,%eax,8),%mm0\n"
  1425 -  "mov    %ebx,%eax\n"
  1426 -  "sar    $0x11,%eax\n"
  1427 -  "movzbl (%esi,%eax,1),%eax\n"
  1428 -  "paddsw 4096(%ecx,%eax,8),%mm0\n"
  1429 -  "mov    %ebx,%eax\n"
  1430 -  "add    0x38(%esp),%ebx\n"
  1431 -  "sar    $0x10,%eax\n"
  1432 -  "movzbl (%edx,%eax,1),%eax\n"
  1433 -  "movq   0(%ecx,%eax,8),%mm1\n"
  1434 -  "mov    %ebx,%eax\n"
  1435 -  "add    0x38(%esp),%ebx\n"
  1436 -  "sar    $0x10,%eax\n"
  1437 -  "movzbl (%edx,%eax,1),%eax\n"
  1438 -  "movq   0(%ecx,%eax,8),%mm2\n"
  1439 -  "paddsw %mm0,%mm1\n"
  1440 -  "paddsw %mm0,%mm2\n"
  1441 -  "psraw  $0x6,%mm1\n"
  1442 -  "psraw  $0x6,%mm2\n"
  1443 -  "packuswb %mm2,%mm1\n"
  1444 -  "movntq %mm1,0x0(%ebp)\n"
  1445 -  "add    $0x8,%ebp\n"
  1446 -"Lscaleend:"
  1447 -  "subl   $0x2,0x34(%esp)\n"
  1448 -  "jns    Lscaleloop\n"
  1450 -  "andl   $0x1,0x34(%esp)\n"
  1451 -  "je     Lscaledone\n"
  1453 -  "mov    %ebx,%eax\n"
  1454 -  "sar    $0x11,%eax\n"
  1455 -  "movzbl (%edi,%eax,1),%eax\n"
  1456 -  "movq   2048(%ecx,%eax,8),%mm0\n"
  1457 -  "mov    %ebx,%eax\n"
  1458 -  "sar    $0x11,%eax\n"
  1459 -  "movzbl (%esi,%eax,1),%eax\n"
  1460 -  "paddsw 4096(%ecx,%eax,8),%mm0\n"
  1461 -  "mov    %ebx,%eax\n"
  1462 -  "sar    $0x10,%eax\n"
  1463 -  "movzbl (%edx,%eax,1),%eax\n"
  1464 -  "movq   0(%ecx,%eax,8),%mm1\n"
  1465 -  "paddsw %mm0,%mm1\n"
  1466 -  "psraw  $0x6,%mm1\n"
  1467 -  "packuswb %mm1,%mm1\n"
  1468 -  "movd   %mm1,0x0(%ebp)\n"
  1470 -"Lscaledone:"
  1471 -  "popa\n"
  1472 -  "ret\n"
  1473 -);
  1476 -void ScaleYUVToRGB32Row(const uint8* y_buf,
  1477 -                        const uint8* u_buf,
  1478 -                        const uint8* v_buf,
  1479 -                        uint8* rgb_buf,
  1480 -                        int width,
  1481 -                        int source_dx) {
  1482 -  PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
  1483 -                        &kCoefficientsRgbY[0][0]);
  1484 -}
  1486 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
  1487 -                                 const uint8* u_buf,
  1488 -                                 const uint8* v_buf,
  1489 -                                 uint8* rgb_buf,
  1490 -                                 int width,
  1491 -                                 int source_dx,
  1492 -                                 int16 *kCoefficientsRgbY);
  1493 -  asm(
  1494 -  ".text\n"
  1495 -#if defined(OS_MACOSX)
  1496 -"_PICLinearScaleYUVToRGB32Row:\n"
  1497 -#else
  1498 -"PICLinearScaleYUVToRGB32Row:\n"
  1499 -#endif
  1500 -  "pusha\n"
  1501 -  "mov    0x24(%esp),%edx\n"
  1502 -  "mov    0x30(%esp),%ebp\n"
  1503 -  "mov    0x34(%esp),%ecx\n"
  1504 -  "mov    0x3c(%esp),%edi\n"
  1505 -  "xor    %ebx,%ebx\n"
  1507 -  // source_width = width * source_dx + ebx
  1508 -  "mov    0x34(%esp), %ecx\n"
  1509 -  "imull  0x38(%esp), %ecx\n"
  1510 -  "mov    %ecx, 0x34(%esp)\n"
  1512 -  "mov    0x38(%esp), %ecx\n"
  1513 -  "xor    %ebx,%ebx\n"     // x = 0
  1514 -  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
  1515 -  "jl     .lscaleend\n"
  1516 -  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
  1517 -  "jmp    .lscaleend\n"
  1519 -".lscaleloop:"
  1520 -  "mov    0x28(%esp),%esi\n"
  1521 -  "mov    %ebx,%eax\n"
  1522 -  "sar    $0x11,%eax\n"
  1524 -  "movzbl (%esi,%eax,1),%ecx\n"
  1525 -  "movzbl 1(%esi,%eax,1),%esi\n"
  1526 -  "mov    %ebx,%eax\n"
  1527 -  "andl   $0x1fffe, %eax \n"
  1528 -  "imul   %eax, %esi \n"
  1529 -  "xorl   $0x1fffe, %eax \n"
  1530 -  "imul   %eax, %ecx \n"
  1531 -  "addl   %esi, %ecx \n"
  1532 -  "shrl   $17, %ecx \n"
  1533 -  "movq   2048(%edi,%ecx,8),%mm0\n"
  1535 -  "mov    0x2c(%esp),%esi\n"
  1536 -  "mov    %ebx,%eax\n"
  1537 -  "sar    $0x11,%eax\n"
  1539 -  "movzbl (%esi,%eax,1),%ecx\n"
  1540 -  "movzbl 1(%esi,%eax,1),%esi\n"
  1541 -  "mov    %ebx,%eax\n"
  1542 -  "andl   $0x1fffe, %eax \n"
  1543 -  "imul   %eax, %esi \n"
  1544 -  "xorl   $0x1fffe, %eax \n"
  1545 -  "imul   %eax, %ecx \n"
  1546 -  "addl   %esi, %ecx \n"
  1547 -  "shrl   $17, %ecx \n"
  1548 -  "paddsw 4096(%edi,%ecx,8),%mm0\n"
  1550 -  "mov    %ebx,%eax\n"
  1551 -  "sar    $0x10,%eax\n"
  1552 -  "movzbl (%edx,%eax,1),%ecx\n"
  1553 -  "movzbl 1(%edx,%eax,1),%esi\n"
  1554 -  "mov    %ebx,%eax\n"
  1555 -  "add    0x38(%esp),%ebx\n"
  1556 -  "andl   $0xffff, %eax \n"
  1557 -  "imul   %eax, %esi \n"
  1558 -  "xorl   $0xffff, %eax \n"
  1559 -  "imul   %eax, %ecx \n"
  1560 -  "addl   %esi, %ecx \n"
  1561 -  "shrl   $16, %ecx \n"
  1562 -  "movq   (%edi,%ecx,8),%mm1\n"
  1564 -  "cmp    0x34(%esp), %ebx\n"
  1565 -  "jge    .lscalelastpixel\n"
  1567 -  "mov    %ebx,%eax\n"
  1568 -  "sar    $0x10,%eax\n"
  1569 -  "movzbl (%edx,%eax,1),%ecx\n"
  1570 -  "movzbl 1(%edx,%eax,1),%esi\n"
  1571 -  "mov    %ebx,%eax\n"
  1572 -  "add    0x38(%esp),%ebx\n"
  1573 -  "andl   $0xffff, %eax \n"
  1574 -  "imul   %eax, %esi \n"
  1575 -  "xorl   $0xffff, %eax \n"
  1576 -  "imul   %eax, %ecx \n"
  1577 -  "addl   %esi, %ecx \n"
  1578 -  "shrl   $16, %ecx \n"
  1579 -  "movq   (%edi,%ecx,8),%mm2\n"
  1581 -  "paddsw %mm0,%mm1\n"
  1582 -  "paddsw %mm0,%mm2\n"
  1583 -  "psraw  $0x6,%mm1\n"
  1584 -  "psraw  $0x6,%mm2\n"
  1585 -  "packuswb %mm2,%mm1\n"
  1586 -  "movntq %mm1,0x0(%ebp)\n"
  1587 -  "add    $0x8,%ebp\n"
  1589 -".lscaleend:"
  1590 -  "cmp    %ebx, 0x34(%esp)\n"
  1591 -  "jg     .lscaleloop\n"
  1592 -  "popa\n"
  1593 -  "ret\n"
  1595 -".lscalelastpixel:"
  1596 -  "paddsw %mm0, %mm1\n"
  1597 -  "psraw $6, %mm1\n"
  1598 -  "packuswb %mm1, %mm1\n"
  1599 -  "movd %mm1, (%ebp)\n"
  1600 -  "popa\n"
  1601 -  "ret\n"
  1602 -);
  1604 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  1605 -                        const uint8* u_buf,
  1606 -                        const uint8* v_buf,
  1607 -                        uint8* rgb_buf,
  1608 -                        int width,
  1609 -                        int source_dx) {
  1610 -  PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
  1611 -                              &kCoefficientsRgbY[0][0]);
  1612 -}
  1614 -#else  // USE_MMX
  1616  // C reference code that mimic the YUV assembly.
  1617  #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
  1618  #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
  1619      (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
  1621  static inline void YuvPixel(uint8 y,
  1622                              uint8 u,
  1623                              uint8 v,
  1624 @@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y,
  1625    a >>= 6;
  1627    *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
  1628                                          (packuswb(g) << 8) |
  1629                                          (packuswb(r) << 16) |
  1630                                          (packuswb(a) << 24);
  1633 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
  1634 -                              const uint8* u_buf,
  1635 -                              const uint8* v_buf,
  1636 -                              uint8* rgb_buf,
  1637 -                              int width) {
  1638 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
  1639 +                                const uint8* u_buf,
  1640 +                                const uint8* v_buf,
  1641 +                                uint8* rgb_buf,
  1642 +                                int width,
  1643 +                                unsigned int x_shift) {
  1644    for (int x = 0; x < width; x += 2) {
  1645 -    uint8 u = u_buf[x >> 1];
  1646 -    uint8 v = v_buf[x >> 1];
  1647 +    uint8 u = u_buf[x >> x_shift];
  1648 +    uint8 v = v_buf[x >> x_shift];
  1649      uint8 y0 = y_buf[x];
  1650      YuvPixel(y0, u, v, rgb_buf);
  1651      if ((x + 1) < width) {
  1652        uint8 y1 = y_buf[x + 1];
  1653 +      if (x_shift == 0) {
  1654 +        u = u_buf[x + 1];
  1655 +        v = v_buf[x + 1];
  1656 +      }
  1657        YuvPixel(y1, u, v, rgb_buf + 4);
  1659      rgb_buf += 8;  // Advance 2 pixels.
  1663  // 16.16 fixed point is used.  A shift by 16 isolates the integer.
  1664  // A shift by 17 is used to further subsample the chrominence channels.
  1665  // & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
  1666  // for 1/65536 pixel accurate interpolation.
  1667 -void ScaleYUVToRGB32Row(const uint8* y_buf,
  1668 -                        const uint8* u_buf,
  1669 -                        const uint8* v_buf,
  1670 -                        uint8* rgb_buf,
  1671 -                        int width,
  1672 -                        int source_dx) {
  1673 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,
  1674 +                          const uint8* u_buf,
  1675 +                          const uint8* v_buf,
  1676 +                          uint8* rgb_buf,
  1677 +                          int width,
  1678 +                          int source_dx) {
  1679    int x = 0;
  1680    for (int i = 0; i < width; i += 2) {
  1681      int y = y_buf[x >> 16];
  1682      int u = u_buf[(x >> 17)];
  1683      int v = v_buf[(x >> 17)];
  1684      YuvPixel(y, u, v, rgb_buf);
  1685      x += source_dx;
  1686      if ((i + 1) < width) {
  1687        y = y_buf[x >> 16];
  1688        YuvPixel(y, u, v, rgb_buf+4);
  1689        x += source_dx;
  1691      rgb_buf += 8;
  1695 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  1696 -                              const uint8* u_buf,
  1697 -                              const uint8* v_buf,
  1698 -                              uint8* rgb_buf,
  1699 -                              int width,
  1700 -                              int source_dx) {
  1701 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
  1702 +                                const uint8* u_buf,
  1703 +                                const uint8* v_buf,
  1704 +                                uint8* rgb_buf,
  1705 +                                int width,
  1706 +                                int source_dx) {
  1707    int x = 0;
  1708    if (source_dx >= 0x20000) {
  1709      x = 32768;
  1711    for (int i = 0; i < width; i += 2) {
  1712      int y0 = y_buf[x >> 16];
  1713      int y1 = y_buf[(x >> 16) + 1];
  1714      int u0 = u_buf[(x >> 17)];
  1715 @@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint
  1716        y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
  1717        YuvPixel(y, u, v, rgb_buf+4);
  1718        x += source_dx;
  1720      rgb_buf += 8;
  1724 -#endif  // USE_MMX
  1725  }  // extern "C"
  1727 diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
  1728 --- a/gfx/ycbcr/yuv_row_posix.cpp
  1729 +++ b/gfx/ycbcr/yuv_row_posix.cpp
  1730 @@ -1,33 +1,32 @@
  1731  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
  1732  // Use of this source code is governed by a BSD-style license that can be
  1733  // found in the LICENSE file.
  1735 -#include "media/base/yuv_row.h"
  1737 -#ifdef _DEBUG
  1738 -#include "base/logging.h"
  1739 -#else
  1740 +#include "yuv_row.h"
  1741 +#include "mozilla/SSE.h"
  1743  #define DCHECK(a)
  1744 -#endif
  1746  extern "C" {
  1748 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
  1749 +#if defined(ARCH_CPU_X86_64)
  1751 +// We don't need CPUID guards here, since x86-64 implies SSE2.
  1753  // AMD64 ABI uses register paremters.
  1754  void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
  1755                                const uint8* u_buf,  // rsi
  1756                                const uint8* v_buf,  // rdx
  1757                                uint8* rgb_buf,      // rcx
  1758                                int width) {         // r8
  1759    asm(
  1760 -  "jmp    convertend\n"
  1761 -"convertloop:"
  1762 +  "jmp    1f\n"
  1763 +"0:"
  1764    "movzb  (%1),%%r10\n"
  1765    "add    $0x1,%1\n"
  1766    "movzb  (%2),%%r11\n"
  1767    "add    $0x1,%2\n"
  1768    "movq   2048(%5,%%r10,8),%%xmm0\n"
  1769    "movzb  (%0),%%r10\n"
  1770    "movq   4096(%5,%%r11,8),%%xmm1\n"
  1771    "movzb  0x1(%0),%%r11\n"
  1772 @@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint
  1773    "movq   (%5,%%r11,8),%%xmm3\n"
  1774    "paddsw %%xmm0,%%xmm2\n"
  1775    "paddsw %%xmm0,%%xmm3\n"
  1776    "shufps $0x44,%%xmm3,%%xmm2\n"
  1777    "psraw  $0x6,%%xmm2\n"
  1778    "packuswb %%xmm2,%%xmm2\n"
  1779    "movq   %%xmm2,0x0(%3)\n"
  1780    "add    $0x8,%3\n"
  1781 -"convertend:"
  1782 +"1:"
  1783    "sub    $0x2,%4\n"
  1784 -  "jns    convertloop\n"
  1786 -"convertnext:"
  1787 +  "jns    0b\n"
  1789 +"2:"
  1790    "add    $0x1,%4\n"
  1791 -  "js     convertdone\n"
  1792 +  "js     3f\n"
  1794    "movzb  (%1),%%r10\n"
  1795    "movq   2048(%5,%%r10,8),%%xmm0\n"
  1796    "movzb  (%2),%%r10\n"
  1797    "movq   4096(%5,%%r10,8),%%xmm1\n"
  1798    "paddsw %%xmm1,%%xmm0\n"
  1799    "movzb  (%0),%%r10\n"
  1800    "movq   (%5,%%r10,8),%%xmm1\n"
  1801    "paddsw %%xmm0,%%xmm1\n"
  1802    "psraw  $0x6,%%xmm1\n"
  1803    "packuswb %%xmm1,%%xmm1\n"
  1804    "movd   %%xmm1,0x0(%3)\n"
  1805 -"convertdone:"
  1806 +"3:"
  1808    : "r"(y_buf),  // %0
  1809      "r"(u_buf),  // %1
  1810      "r"(v_buf),  // %2
  1811      "r"(rgb_buf),  // %3
  1812      "r"(width),  // %4
  1813      "r" (kCoefficientsRgbY)  // %5
  1814    : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
  1815 @@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b
  1816                          const uint8* u_buf,  // rsi
  1817                          const uint8* v_buf,  // rdx
  1818                          uint8* rgb_buf,      // rcx
  1819                          int width,           // r8
  1820                          int source_dx) {     // r9
  1821    asm(
  1822    "xor    %%r11,%%r11\n"
  1823    "sub    $0x2,%4\n"
  1824 -  "js     scalenext\n"
  1826 -"scaleloop:"
  1827 +  "js     1f\n"
  1829 +"0:"
  1830    "mov    %%r11,%%r10\n"
  1831    "sar    $0x11,%%r10\n"
  1832    "movzb  (%1,%%r10,1),%%rax\n"
  1833    "movq   2048(%5,%%rax,8),%%xmm0\n"
  1834    "movzb  (%2,%%r10,1),%%rax\n"
  1835    "movq   4096(%5,%%rax,8),%%xmm1\n"
  1836    "lea    (%%r11,%6),%%r10\n"
  1837    "sar    $0x10,%%r11\n"
  1838 @@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b
  1839    "paddsw %%xmm0,%%xmm1\n"
  1840    "paddsw %%xmm0,%%xmm2\n"
  1841    "shufps $0x44,%%xmm2,%%xmm1\n"
  1842    "psraw  $0x6,%%xmm1\n"
  1843    "packuswb %%xmm1,%%xmm1\n"
  1844    "movq   %%xmm1,0x0(%3)\n"
  1845    "add    $0x8,%3\n"
  1846    "sub    $0x2,%4\n"
  1847 -  "jns    scaleloop\n"
  1849 -"scalenext:"
  1850 +  "jns    0b\n"
  1852 +"1:"
  1853    "add    $0x1,%4\n"
  1854 -  "js     scaledone\n"
  1855 +  "js     2f\n"
  1857    "mov    %%r11,%%r10\n"
  1858    "sar    $0x11,%%r10\n"
  1859    "movzb  (%1,%%r10,1),%%rax\n"
  1860    "movq   2048(%5,%%rax,8),%%xmm0\n"
  1861    "movzb  (%2,%%r10,1),%%rax\n"
  1862    "movq   4096(%5,%%rax,8),%%xmm1\n"
  1863    "paddsw %%xmm1,%%xmm0\n"
  1864    "sar    $0x10,%%r11\n"
  1865    "movzb  (%0,%%r11,1),%%rax\n"
  1866    "movq   (%5,%%rax,8),%%xmm1\n"
  1867    "paddsw %%xmm0,%%xmm1\n"
  1868    "psraw  $0x6,%%xmm1\n"
  1869    "packuswb %%xmm1,%%xmm1\n"
  1870    "movd   %%xmm1,0x0(%3)\n"
  1872 -"scaledone:"
  1873 +"2:"
  1875    : "r"(y_buf),  // %0
  1876      "r"(u_buf),  // %1
  1877      "r"(v_buf),  // %2
  1878      "r"(rgb_buf),  // %3
  1879      "r"(width),  // %4
  1880      "r" (kCoefficientsRgbY),  // %5
  1881      "r"(static_cast<long>(source_dx))  // %6
  1882 @@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint
  1883                                const uint8* u_buf,
  1884                                const uint8* v_buf,
  1885                                uint8* rgb_buf,
  1886                                int width,
  1887                                int source_dx) {
  1888    asm(
  1889    "xor    %%r11,%%r11\n"   // x = 0
  1890    "sub    $0x2,%4\n"
  1891 -  "js     .lscalenext\n"
  1892 +  "js     2f\n"
  1893    "cmp    $0x20000,%6\n"   // if source_dx >= 2.0
  1894 -  "jl     .lscalehalf\n"
  1895 +  "jl     0f\n"
  1896    "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
  1897 -".lscalehalf:"
  1899 -".lscaleloop:"
  1900 +"0:"
  1902 +"1:"
  1903    "mov    %%r11,%%r10\n"
  1904    "sar    $0x11,%%r10\n"
  1906    "movzb  (%1, %%r10, 1), %%r13 \n"
  1907    "movzb  1(%1, %%r10, 1), %%r14 \n"
  1908    "mov    %%r11, %%rax \n"
  1909    "and    $0x1fffe, %%rax \n"
  1910    "imul   %%rax, %%r14 \n"
  1911 @@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint
  1912    "paddsw %%xmm0,%%xmm1\n"
  1913    "paddsw %%xmm0,%%xmm2\n"
  1914    "shufps $0x44,%%xmm2,%%xmm1\n"
  1915    "psraw  $0x6,%%xmm1\n"
  1916    "packuswb %%xmm1,%%xmm1\n"
  1917    "movq   %%xmm1,0x0(%3)\n"
  1918    "add    $0x8,%3\n"
  1919    "sub    $0x2,%4\n"
  1920 -  "jns    .lscaleloop\n"
  1922 -".lscalenext:"
  1923 +  "jns    1b\n"
  1925 +"2:"
  1926    "add    $0x1,%4\n"
  1927 -  "js     .lscaledone\n"
  1928 +  "js     3f\n"
  1930    "mov    %%r11,%%r10\n"
  1931    "sar    $0x11,%%r10\n"
  1933    "movzb  (%1,%%r10,1), %%r13 \n"
  1934    "movq   2048(%5,%%r13,8),%%xmm0\n"
  1936    "movzb  (%2,%%r10,1), %%r13 \n"
  1937 @@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint
  1938    "movzb  (%0,%%r11,1), %%r13 \n"
  1939    "movq   (%5,%%r13,8),%%xmm1\n"
  1941    "paddsw %%xmm0,%%xmm1\n"
  1942    "psraw  $0x6,%%xmm1\n"
  1943    "packuswb %%xmm1,%%xmm1\n"
  1944    "movd   %%xmm1,0x0(%3)\n"
  1946 -".lscaledone:"
  1947 +"3:"
  1949    : "r"(y_buf),  // %0
  1950      "r"(u_buf),  // %1
  1951      "r"(v_buf),  // %2
  1952      "r"(rgb_buf),  // %3
  1953      "r"(width),  // %4
  1954      "r" (kCoefficientsRgbY),  // %5
  1955      "r"(static_cast<long>(source_dx))  // %6
  1956    : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
  1957  );
  1960 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
  1961 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
  1963  // PIC version is slower because less registers are available, so
  1964  // non-PIC is used on platforms where it is possible.
  1966 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
  1967 -                              const uint8* u_buf,
  1968 -                              const uint8* v_buf,
  1969 -                              uint8* rgb_buf,
  1970 -                              int width);
  1971 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
  1972 +                                  const uint8* u_buf,
  1973 +                                  const uint8* v_buf,
  1974 +                                  uint8* rgb_buf,
  1975 +                                  int width);
  1976    asm(
  1977    ".text\n"
  1978 -  ".global FastConvertYUVToRGB32Row\n"
  1979 -"FastConvertYUVToRGB32Row:\n"
  1980 +  ".global FastConvertYUVToRGB32Row_SSE\n"
  1981 +  ".type FastConvertYUVToRGB32Row_SSE, @function\n"
  1982 +"FastConvertYUVToRGB32Row_SSE:\n"
  1983    "pusha\n"
  1984    "mov    0x24(%esp),%edx\n"
  1985    "mov    0x28(%esp),%edi\n"
  1986    "mov    0x2c(%esp),%esi\n"
  1987    "mov    0x30(%esp),%ebp\n"
  1988    "mov    0x34(%esp),%ecx\n"
  1989 -  "jmp    convertend\n"
  1991 -"convertloop:"
  1992 +  "jmp    1f\n"
  1994 +"0:"
  1995    "movzbl (%edi),%eax\n"
  1996    "add    $0x1,%edi\n"
  1997    "movzbl (%esi),%ebx\n"
  1998    "add    $0x1,%esi\n"
  1999    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  2000    "movzbl (%edx),%eax\n"
  2001    "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
  2002    "movzbl 0x1(%edx),%ebx\n"
  2003 @@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint
  2004    "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
  2005    "paddsw %mm0,%mm1\n"
  2006    "paddsw %mm0,%mm2\n"
  2007    "psraw  $0x6,%mm1\n"
  2008    "psraw  $0x6,%mm2\n"
  2009    "packuswb %mm2,%mm1\n"
  2010    "movntq %mm1,0x0(%ebp)\n"
  2011    "add    $0x8,%ebp\n"
  2012 -"convertend:"
  2013 +"1:"
  2014    "sub    $0x2,%ecx\n"
  2015 -  "jns    convertloop\n"
  2016 +  "jns    0b\n"
  2018    "and    $0x1,%ecx\n"
  2019 -  "je     convertdone\n"
  2020 +  "je     2f\n"
  2022    "movzbl (%edi),%eax\n"
  2023    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  2024    "movzbl (%esi),%eax\n"
  2025    "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
  2026    "movzbl (%edx),%eax\n"
  2027    "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
  2028    "paddsw %mm0,%mm1\n"
  2029    "psraw  $0x6,%mm1\n"
  2030    "packuswb %mm1,%mm1\n"
  2031    "movd   %mm1,0x0(%ebp)\n"
  2032 -"convertdone:"
  2033 +"2:"
  2034    "popa\n"
  2035    "ret\n"
  2036 +#if !defined(XP_MACOSX)
  2037 +  ".previous\n"
  2038 +#endif
  2039  );
  2042 -void ScaleYUVToRGB32Row(const uint8* y_buf,
  2043 -                        const uint8* u_buf,
  2044 -                        const uint8* v_buf,
  2045 -                        uint8* rgb_buf,
  2046 -                        int width,
  2047 -                        int source_dx);
  2048 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
  2049 +                              const uint8* u_buf,
  2050 +                              const uint8* v_buf,
  2051 +                              uint8* rgb_buf,
  2052 +                              int width)
  2053 +{
  2054 +  if (mozilla::supports_sse()) {  // Runtime check: this branch is built when SSE *may* be available.
  2055 +    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
  2056 +    return;  // The assembly routine has written the row; skip the C fallback.
  2057 +  }
  2059 +  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);  // x_shift 1: u/v read at x >> 1.
  2060 +}
  2063 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
  2064 +                            const uint8* u_buf,
  2065 +                            const uint8* v_buf,
  2066 +                            uint8* rgb_buf,
  2067 +                            int width,
  2068 +                            int source_dx);
  2069    asm(
  2070    ".text\n"
  2071 -  ".global ScaleYUVToRGB32Row\n"
  2072 -"ScaleYUVToRGB32Row:\n"
  2073 +  ".global ScaleYUVToRGB32Row_SSE\n"
  2074 +  ".type ScaleYUVToRGB32Row_SSE, @function\n"
  2075 +"ScaleYUVToRGB32Row_SSE:\n"
  2076    "pusha\n"
  2077    "mov    0x24(%esp),%edx\n"
  2078    "mov    0x28(%esp),%edi\n"
  2079    "mov    0x2c(%esp),%esi\n"
  2080    "mov    0x30(%esp),%ebp\n"
  2081    "mov    0x34(%esp),%ecx\n"
  2082    "xor    %ebx,%ebx\n"
  2083 -  "jmp    scaleend\n"
  2085 -"scaleloop:"
  2086 +  "jmp    1f\n"
  2088 +"0:"
  2089    "mov    %ebx,%eax\n"
  2090    "sar    $0x11,%eax\n"
  2091    "movzbl (%edi,%eax,1),%eax\n"
  2092    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  2093    "mov    %ebx,%eax\n"
  2094    "sar    $0x11,%eax\n"
  2095    "movzbl (%esi,%eax,1),%eax\n"
  2096    "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
  2097 @@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
  2098    "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
  2099    "paddsw %mm0,%mm1\n"
  2100    "paddsw %mm0,%mm2\n"
  2101    "psraw  $0x6,%mm1\n"
  2102    "psraw  $0x6,%mm2\n"
  2103    "packuswb %mm2,%mm1\n"
  2104    "movntq %mm1,0x0(%ebp)\n"
  2105    "add    $0x8,%ebp\n"
  2106 -"scaleend:"
  2107 +"1:"
  2108    "sub    $0x2,%ecx\n"
  2109 -  "jns    scaleloop\n"
  2110 +  "jns    0b\n"
  2112    "and    $0x1,%ecx\n"
  2113 -  "je     scaledone\n"
  2114 +  "je     2f\n"
  2116    "mov    %ebx,%eax\n"
  2117    "sar    $0x11,%eax\n"
  2118    "movzbl (%edi,%eax,1),%eax\n"
  2119    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  2120    "mov    %ebx,%eax\n"
  2121    "sar    $0x11,%eax\n"
  2122    "movzbl (%esi,%eax,1),%eax\n"
  2123 @@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b
  2124    "sar    $0x10,%eax\n"
  2125    "movzbl (%edx,%eax,1),%eax\n"
  2126    "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
  2127    "paddsw %mm0,%mm1\n"
  2128    "psraw  $0x6,%mm1\n"
  2129    "packuswb %mm1,%mm1\n"
  2130    "movd   %mm1,0x0(%ebp)\n"
  2132 -"scaledone:"
  2133 +"2:"
  2134    "popa\n"
  2135    "ret\n"
  2136 +#if !defined(XP_MACOSX)
  2137 +  ".previous\n"
  2138 +#endif
  2139  );
  2141 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  2142 -                              const uint8* u_buf,
  2143 -                              const uint8* v_buf,
  2144 -                              uint8* rgb_buf,
  2145 -                              int width,
  2146 -                              int source_dx);
  2147 +void ScaleYUVToRGB32Row(const uint8* y_buf,
  2148 +                        const uint8* u_buf,
  2149 +                        const uint8* v_buf,
  2150 +                        uint8* rgb_buf,
  2151 +                        int width,
  2152 +                        int source_dx)
  2153 +{
  2154 +  if (mozilla::supports_sse()) {  // Runtime check: this branch is built when SSE *may* be available.
  2155 +    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
  2156 +                           width, source_dx);
  2157 +    return;  // The assembly routine has written the row; do not redo it in C.
  2158 +  }
  2159 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,  // Portable fallback when SSE is absent.
  2160 +                       width, source_dx);
  2161 +}
  2163 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
  2164 +                                  const uint8* u_buf,
  2165 +                                  const uint8* v_buf,
  2166 +                                  uint8* rgb_buf,
  2167 +                                  int width,
  2168 +                                  int source_dx);
  2169    asm(
  2170    ".text\n"
  2171 -  ".global LinearScaleYUVToRGB32Row\n"
  2172 -"LinearScaleYUVToRGB32Row:\n"
  2173 +  ".global LinearScaleYUVToRGB32Row_SSE\n"
  2174 +  ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
  2175 +"LinearScaleYUVToRGB32Row_SSE:\n"
  2176    "pusha\n"
  2177    "mov    0x24(%esp),%edx\n"
  2178    "mov    0x28(%esp),%edi\n"
  2179    "mov    0x30(%esp),%ebp\n"
  2181    // source_width = width * source_dx + ebx
  2182    "mov    0x34(%esp), %ecx\n"
  2183    "imull  0x38(%esp), %ecx\n"
  2184    "mov    %ecx, 0x34(%esp)\n"
  2186    "mov    0x38(%esp), %ecx\n"
  2187    "xor    %ebx,%ebx\n"     // x = 0
  2188    "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
  2189 -  "jl     .lscaleend\n"
  2190 +  "jl     1f\n"
  2191    "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
  2192 -  "jmp    .lscaleend\n"
  2194 -".lscaleloop:"
  2195 -  "mov    %ebx,%eax\n"
  2196 -  "sar    $0x11,%eax\n"
  2197 +  "jmp    1f\n"
  2199 +"0:"
  2200 +  "mov    %ebx,%eax\n"
  2201 +  "sar    $0x11,%eax\n"
  2203    "movzbl (%edi,%eax,1),%ecx\n"
  2204    "movzbl 1(%edi,%eax,1),%esi\n"
  2205    "mov    %ebx,%eax\n"
  2206    "andl   $0x1fffe, %eax \n"
  2207    "imul   %eax, %esi \n"
  2208    "xorl   $0x1fffe, %eax \n"
  2209    "imul   %eax, %ecx \n"
  2210 @@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint
  2211    "imul   %eax, %esi \n"
  2212    "xorl   $0xffff, %eax \n"
  2213    "imul   %eax, %ecx \n"
  2214    "addl   %esi, %ecx \n"
  2215    "shrl   $16, %ecx \n"
  2216    "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
  2218    "cmp    0x34(%esp), %ebx\n"
  2219 -  "jge    .lscalelastpixel\n"
  2220 +  "jge    2f\n"
  2222    "mov    %ebx,%eax\n"
  2223    "sar    $0x10,%eax\n"
  2224    "movzbl (%edx,%eax,1),%ecx\n"
  2225    "movzbl 1(%edx,%eax,1),%esi\n"
  2226    "mov    %ebx,%eax\n"
  2227    "add    0x38(%esp),%ebx\n"
  2228    "andl   $0xffff, %eax \n"
  2229 @@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint
  2230    "paddsw %mm0,%mm1\n"
  2231    "paddsw %mm0,%mm2\n"
  2232    "psraw  $0x6,%mm1\n"
  2233    "psraw  $0x6,%mm2\n"
  2234    "packuswb %mm2,%mm1\n"
  2235    "movntq %mm1,0x0(%ebp)\n"
  2236    "add    $0x8,%ebp\n"
  2238 -".lscaleend:"
  2239 +"1:"
  2240    "cmp    0x34(%esp), %ebx\n"
  2241 -  "jl     .lscaleloop\n"
  2242 +  "jl     0b\n"
  2243    "popa\n"
  2244    "ret\n"
  2246 -".lscalelastpixel:"
  2247 +"2:"
  2248    "paddsw %mm0, %mm1\n"
  2249    "psraw $6, %mm1\n"
  2250    "packuswb %mm1, %mm1\n"
  2251    "movd %mm1, (%ebp)\n"
  2252    "popa\n"
  2253    "ret\n"
  2254 +#if !defined(XP_MACOSX)
  2255 +  ".previous\n"
  2256 +#endif
  2257  );
  2259 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
  2261 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
  2262 -                                    const uint8* u_buf,
  2263 -                                    const uint8* v_buf,
  2264 -                                    uint8* rgb_buf,
  2265 -                                    int width,
  2266 -                                    int16 *kCoefficientsRgbY);
  2267 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  2268 +                              const uint8* u_buf,
  2269 +                              const uint8* v_buf,
  2270 +                              uint8* rgb_buf,
  2271 +                              int width,
  2272 +                              int source_dx)
  2273 +{
  2274 +  if (mozilla::supports_sse()) {  // Runtime check: this branch is built when SSE *may* be available.
  2275 +    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
  2276 +                                 width, source_dx);
  2277 +    return;  // The assembly routine has written the row; do not redo it in C.
  2278 +  }
  2279 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,  // Portable fallback when SSE is absent.
  2280 +                             width, source_dx);
  2281 +}
  2283 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
  2285 +void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
  2286 +                                 const uint8* u_buf,
  2287 +                                 const uint8* v_buf,
  2288 +                                 uint8* rgb_buf,
  2289 +                                 int width,
  2290 +                                 int16 *kCoefficientsRgbY);
  2292    asm(
  2293    ".text\n"
  2294 -#if defined(OS_MACOSX)
  2295 -"_PICConvertYUVToRGB32Row:\n"
  2296 +#if defined(XP_MACOSX)
  2297 +"_PICConvertYUVToRGB32Row_SSE:\n"
  2298  #else
  2299 -"PICConvertYUVToRGB32Row:\n"
  2300 +"PICConvertYUVToRGB32Row_SSE:\n"
  2301  #endif
  2302    "pusha\n"
  2303    "mov    0x24(%esp),%edx\n"
  2304    "mov    0x28(%esp),%edi\n"
  2305    "mov    0x2c(%esp),%esi\n"
  2306    "mov    0x30(%esp),%ebp\n"
  2307    "mov    0x38(%esp),%ecx\n"
  2309 -  "jmp    .Lconvertend\n"
  2311 -".Lconvertloop:"
  2312 +  "jmp    1f\n"
  2314 +"0:"
  2315    "movzbl (%edi),%eax\n"
  2316    "add    $0x1,%edi\n"
  2317    "movzbl (%esi),%ebx\n"
  2318    "add    $0x1,%esi\n"
  2319    "movq   2048(%ecx,%eax,8),%mm0\n"
  2320    "movzbl (%edx),%eax\n"
  2321    "paddsw 4096(%ecx,%ebx,8),%mm0\n"
  2322    "movzbl 0x1(%edx),%ebx\n"
  2323 @@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons
  2324    "movq   0(%ecx,%ebx,8),%mm2\n"
  2325    "paddsw %mm0,%mm1\n"
  2326    "paddsw %mm0,%mm2\n"
  2327    "psraw  $0x6,%mm1\n"
  2328    "psraw  $0x6,%mm2\n"
  2329    "packuswb %mm2,%mm1\n"
  2330    "movntq %mm1,0x0(%ebp)\n"
  2331    "add    $0x8,%ebp\n"
  2332 -".Lconvertend:"
  2333 +"1:"
  2334    "subl   $0x2,0x34(%esp)\n"
  2335 -  "jns    .Lconvertloop\n"
  2336 +  "jns    0b\n"
  2338    "andl   $0x1,0x34(%esp)\n"
  2339 -  "je     .Lconvertdone\n"
  2340 +  "je     2f\n"
  2342    "movzbl (%edi),%eax\n"
  2343    "movq   2048(%ecx,%eax,8),%mm0\n"
  2344    "movzbl (%esi),%eax\n"
  2345    "paddsw 4096(%ecx,%eax,8),%mm0\n"
  2346    "movzbl (%edx),%eax\n"
  2347    "movq   0(%ecx,%eax,8),%mm1\n"
  2348    "paddsw %mm0,%mm1\n"
  2349    "psraw  $0x6,%mm1\n"
  2350    "packuswb %mm1,%mm1\n"
  2351    "movd   %mm1,0x0(%ebp)\n"
  2352 -".Lconvertdone:\n"
  2353 +"2:"
  2354    "popa\n"
  2355    "ret\n"
  2356 +#if !defined(XP_MACOSX)
  2357 +  ".previous\n"
  2358 +#endif
  2359  );
  2361  void FastConvertYUVToRGB32Row(const uint8* y_buf,
  2362                                const uint8* u_buf,
  2363                                const uint8* v_buf,
  2364                                uint8* rgb_buf,
  2365 -                              int width) {
  2366 -  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
  2367 -                          &kCoefficientsRgbY[0][0]);
  2368 -}
  2370 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
  2371 +                              int width)
  2372 +{
  2373 +  if (mozilla::supports_sse()) {
  2374 +    PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
  2375 +                                &kCoefficientsRgbY[0][0]);
  2376 +    return;
  2377 +  }
  2379 +  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
  2380 +}
  2382 +void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
  2383                                 const uint8* u_buf,
  2384                                 const uint8* v_buf,
  2385                                 uint8* rgb_buf,
  2386                                 int width,
  2387                                 int source_dx,
  2388                                 int16 *kCoefficientsRgbY);
  2390    asm(
  2391    ".text\n"
  2392 -#if defined(OS_MACOSX)
  2393 -"_PICScaleYUVToRGB32Row:\n"
  2394 +#if defined(XP_MACOSX)
  2395 +"_PICScaleYUVToRGB32Row_SSE:\n"
  2396  #else
  2397 -"PICScaleYUVToRGB32Row:\n"
  2398 +"PICScaleYUVToRGB32Row_SSE:\n"
  2399  #endif
  2400    "pusha\n"
  2401    "mov    0x24(%esp),%edx\n"
  2402    "mov    0x28(%esp),%edi\n"
  2403    "mov    0x2c(%esp),%esi\n"
  2404    "mov    0x30(%esp),%ebp\n"
  2405    "mov    0x3c(%esp),%ecx\n"
  2406    "xor    %ebx,%ebx\n"
  2407 -  "jmp    Lscaleend\n"
  2409 -"Lscaleloop:"
  2410 +  "jmp    1f\n"
  2412 +"0:"
  2413    "mov    %ebx,%eax\n"
  2414    "sar    $0x11,%eax\n"
  2415    "movzbl (%edi,%eax,1),%eax\n"
  2416    "movq   2048(%ecx,%eax,8),%mm0\n"
  2417    "mov    %ebx,%eax\n"
  2418    "sar    $0x11,%eax\n"
  2419    "movzbl (%esi,%eax,1),%eax\n"
  2420    "paddsw 4096(%ecx,%eax,8),%mm0\n"
  2421 @@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const 
  2422    "movq   0(%ecx,%eax,8),%mm2\n"
  2423    "paddsw %mm0,%mm1\n"
  2424    "paddsw %mm0,%mm2\n"
  2425    "psraw  $0x6,%mm1\n"
  2426    "psraw  $0x6,%mm2\n"
  2427    "packuswb %mm2,%mm1\n"
  2428    "movntq %mm1,0x0(%ebp)\n"
  2429    "add    $0x8,%ebp\n"
  2430 -"Lscaleend:"
  2431 +"1:"
  2432    "subl   $0x2,0x34(%esp)\n"
  2433 -  "jns    Lscaleloop\n"
  2434 +  "jns    0b\n"
  2436    "andl   $0x1,0x34(%esp)\n"
  2437 -  "je     Lscaledone\n"
  2438 +  "je     2f\n"
  2440    "mov    %ebx,%eax\n"
  2441    "sar    $0x11,%eax\n"
  2442    "movzbl (%edi,%eax,1),%eax\n"
  2443    "movq   2048(%ecx,%eax,8),%mm0\n"
  2444    "mov    %ebx,%eax\n"
  2445    "sar    $0x11,%eax\n"
  2446    "movzbl (%esi,%eax,1),%eax\n"
  2447 @@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const 
  2448    "sar    $0x10,%eax\n"
  2449    "movzbl (%edx,%eax,1),%eax\n"
  2450    "movq   0(%ecx,%eax,8),%mm1\n"
  2451    "paddsw %mm0,%mm1\n"
  2452    "psraw  $0x6,%mm1\n"
  2453    "packuswb %mm1,%mm1\n"
  2454    "movd   %mm1,0x0(%ebp)\n"
  2456 -"Lscaledone:"
  2457 +"2:"
  2458    "popa\n"
  2459    "ret\n"
  2460 +#if !defined(XP_MACOSX)
  2461 +  ".previous\n"
  2462 +#endif
  2463  );
  2466  void ScaleYUVToRGB32Row(const uint8* y_buf,
  2467                          const uint8* u_buf,
  2468                          const uint8* v_buf,
  2469                          uint8* rgb_buf,
  2470                          int width,
  2471 -                        int source_dx) {
  2472 -  PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
  2473 -                        &kCoefficientsRgbY[0][0]);
  2474 -}
  2476 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
  2477 -                                 const uint8* u_buf,
  2478 -                                 const uint8* v_buf,
  2479 -                                 uint8* rgb_buf,
  2480 -                                 int width,
  2481 -                                 int source_dx,
  2482 -                                 int16 *kCoefficientsRgbY);
  2483 +                        int source_dx)
  2484 +{
  2485 +  if (mozilla::supports_sse()) {
  2486 +    PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
  2487 +                              &kCoefficientsRgbY[0][0]);
  2488 +    return;
  2489 +  }
  2491 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
  2492 +}
  2494 +void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
  2495 +                                     const uint8* u_buf,
  2496 +                                     const uint8* v_buf,
  2497 +                                     uint8* rgb_buf,
  2498 +                                     int width,
  2499 +                                     int source_dx,
  2500 +                                     int16 *kCoefficientsRgbY);
  2502    asm(
  2503    ".text\n"
  2504 -#if defined(OS_MACOSX)
  2505 -"_PICLinearScaleYUVToRGB32Row:\n"
  2506 +#if defined(XP_MACOSX)
  2507 +"_PICLinearScaleYUVToRGB32Row_SSE:\n"
  2508  #else
  2509 -"PICLinearScaleYUVToRGB32Row:\n"
  2510 +"PICLinearScaleYUVToRGB32Row_SSE:\n"
  2511  #endif
  2512    "pusha\n"
  2513    "mov    0x24(%esp),%edx\n"
  2514    "mov    0x30(%esp),%ebp\n"
  2515    "mov    0x34(%esp),%ecx\n"
  2516    "mov    0x3c(%esp),%edi\n"
  2517    "xor    %ebx,%ebx\n"
  2519    // source_width = width * source_dx + ebx
  2520    "mov    0x34(%esp), %ecx\n"
  2521    "imull  0x38(%esp), %ecx\n"
  2522    "mov    %ecx, 0x34(%esp)\n"
  2524    "mov    0x38(%esp), %ecx\n"
  2525    "xor    %ebx,%ebx\n"     // x = 0
  2526    "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
  2527 -  "jl     .lscaleend\n"
  2528 +  "jl     1f\n"
  2529    "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
  2530 -  "jmp    .lscaleend\n"
  2532 -".lscaleloop:"
  2533 +  "jmp    1f\n"
  2535 +"0:"
  2536    "mov    0x28(%esp),%esi\n"
  2537    "mov    %ebx,%eax\n"
  2538    "sar    $0x11,%eax\n"
  2540    "movzbl (%esi,%eax,1),%ecx\n"
  2541    "movzbl 1(%esi,%eax,1),%esi\n"
  2542    "mov    %ebx,%eax\n"
  2543    "andl   $0x1fffe, %eax \n"
  2544 @@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u
  2545    "imul   %eax, %esi \n"
  2546    "xorl   $0xffff, %eax \n"
  2547    "imul   %eax, %ecx \n"
  2548    "addl   %esi, %ecx \n"
  2549    "shrl   $16, %ecx \n"
  2550    "movq   (%edi,%ecx,8),%mm1\n"
  2552    "cmp    0x34(%esp), %ebx\n"
  2553 -  "jge    .lscalelastpixel\n"
  2554 +  "jge    2f\n"
  2556    "mov    %ebx,%eax\n"
  2557    "sar    $0x10,%eax\n"
  2558    "movzbl (%edx,%eax,1),%ecx\n"
  2559    "movzbl 1(%edx,%eax,1),%esi\n"
  2560    "mov    %ebx,%eax\n"
  2561    "add    0x38(%esp),%ebx\n"
  2562    "andl   $0xffff, %eax \n"
  2563 @@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u
  2564    "paddsw %mm0,%mm1\n"
  2565    "paddsw %mm0,%mm2\n"
  2566    "psraw  $0x6,%mm1\n"
  2567    "psraw  $0x6,%mm2\n"
  2568    "packuswb %mm2,%mm1\n"
  2569    "movntq %mm1,0x0(%ebp)\n"
  2570    "add    $0x8,%ebp\n"
  2572 -".lscaleend:"
  2573 +"1:"
  2574    "cmp    %ebx, 0x34(%esp)\n"
  2575 -  "jg     .lscaleloop\n"
  2576 +  "jg     0b\n"
  2577    "popa\n"
  2578    "ret\n"
  2580 -".lscalelastpixel:"
  2581 +"2:"
  2582    "paddsw %mm0, %mm1\n"
  2583    "psraw $6, %mm1\n"
  2584    "packuswb %mm1, %mm1\n"
  2585    "movd %mm1, (%ebp)\n"
  2586    "popa\n"
  2587    "ret\n"
  2588 +#if !defined(XP_MACOSX)
  2589 +  ".previous\n"
  2590 +#endif
  2591  );
  2594  void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  2595 -                        const uint8* u_buf,
  2596 -                        const uint8* v_buf,
  2597 -                        uint8* rgb_buf,
  2598 -                        int width,
  2599 -                        int source_dx) {
  2600 -  PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
  2601 -                              &kCoefficientsRgbY[0][0]);
  2602 -}
  2604 -#else  // USE_MMX
  2606 -// C reference code that mimic the YUV assembly.
  2607 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
  2608 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
  2609 -    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
  2611 -static inline void YuvPixel(uint8 y,
  2612 -                            uint8 u,
  2613 -                            uint8 v,
  2614 -                            uint8* rgb_buf) {
  2616 -  int b = kCoefficientsRgbY[256+u][0];
  2617 -  int g = kCoefficientsRgbY[256+u][1];
  2618 -  int r = kCoefficientsRgbY[256+u][2];
  2619 -  int a = kCoefficientsRgbY[256+u][3];
  2621 -  b = paddsw(b, kCoefficientsRgbY[512+v][0]);
  2622 -  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
  2623 -  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
  2624 -  a = paddsw(a, kCoefficientsRgbY[512+v][3]);
  2626 -  b = paddsw(b, kCoefficientsRgbY[y][0]);
  2627 -  g = paddsw(g, kCoefficientsRgbY[y][1]);
  2628 -  r = paddsw(r, kCoefficientsRgbY[y][2]);
  2629 -  a = paddsw(a, kCoefficientsRgbY[y][3]);
  2631 -  b >>= 6;
  2632 -  g >>= 6;
  2633 -  r >>= 6;
  2634 -  a >>= 6;
  2636 -  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
  2637 -                                        (packuswb(g) << 8) |
  2638 -                                        (packuswb(r) << 16) |
  2639 -                                        (packuswb(a) << 24);
  2640 -}
  2642 +                              const uint8* u_buf,
  2643 +                              const uint8* v_buf,
  2644 +                              uint8* rgb_buf,
  2645 +                              int width,
  2646 +                              int source_dx)
  2647 +{
  2648 +  if (mozilla::supports_sse()) {
  2649 +    PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
  2650 +                                    source_dx, &kCoefficientsRgbY[0][0]);
  2651 +    return;
  2652 +  }
  2654 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
  2655 +}
  2656 +#else
  2657  void FastConvertYUVToRGB32Row(const uint8* y_buf,
  2658                                const uint8* u_buf,
  2659                                const uint8* v_buf,
  2660                                uint8* rgb_buf,
  2661                                int width) {
  2662 -  for (int x = 0; x < width; x += 2) {
  2663 -    uint8 u = u_buf[x >> 1];
  2664 -    uint8 v = v_buf[x >> 1];
  2665 -    uint8 y0 = y_buf[x];
  2666 -    YuvPixel(y0, u, v, rgb_buf);
  2667 -    if ((x + 1) < width) {
  2668 -      uint8 y1 = y_buf[x + 1];
  2669 -      YuvPixel(y1, u, v, rgb_buf + 4);
  2670 -    }
  2671 -    rgb_buf += 8;  // Advance 2 pixels.
  2672 -  }
  2673 -}
  2675 -// 16.16 fixed point is used.  A shift by 16 isolates the integer.
  2676 -// A shift by 17 is used to further subsample the chrominence channels.
  2677 -// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
  2678 -// for 1/65536 pixel accurate interpolation.
  2679 +  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
  2680 +}
  2682  void ScaleYUVToRGB32Row(const uint8* y_buf,
  2683                          const uint8* u_buf,
  2684                          const uint8* v_buf,
  2685                          uint8* rgb_buf,
  2686                          int width,
  2687                          int source_dx) {
  2688 -  int x = 0;
  2689 -  for (int i = 0; i < width; i += 2) {
  2690 -    int y = y_buf[x >> 16];
  2691 -    int u = u_buf[(x >> 17)];
  2692 -    int v = v_buf[(x >> 17)];
  2693 -    YuvPixel(y, u, v, rgb_buf);
  2694 -    x += source_dx;
  2695 -    if ((i + 1) < width) {
  2696 -      y = y_buf[x >> 16];
  2697 -      YuvPixel(y, u, v, rgb_buf+4);
  2698 -      x += source_dx;
  2699 -    }
  2700 -    rgb_buf += 8;
  2701 -  }
  2702 -}
  2703 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
  2704 +}
  2706  void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  2707                                const uint8* u_buf,
  2708                                const uint8* v_buf,
  2709                                uint8* rgb_buf,
  2710                                int width,
  2711                                int source_dx) {
  2712 -  int x = 0;
  2713 -  if (source_dx >= 0x20000) {
  2714 -    x = 32768;
  2715 -  }
  2716 -  for (int i = 0; i < width; i += 2) {
  2717 -    int y0 = y_buf[x >> 16];
  2718 -    int y1 = y_buf[(x >> 16) + 1];
  2719 -    int u0 = u_buf[(x >> 17)];
  2720 -    int u1 = u_buf[(x >> 17) + 1];
  2721 -    int v0 = v_buf[(x >> 17)];
  2722 -    int v1 = v_buf[(x >> 17) + 1];
  2723 -    int y_frac = (x & 65535);
  2724 -    int uv_frac = ((x >> 1) & 65535);
  2725 -    int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
  2726 -    int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
  2727 -    int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
  2728 -    YuvPixel(y, u, v, rgb_buf);
  2729 -    x += source_dx;
  2730 -    if ((i + 1) < width) {
  2731 -      y0 = y_buf[x >> 16];
  2732 -      y1 = y_buf[(x >> 16) + 1];
  2733 -      y_frac = (x & 65535);
  2734 -      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
  2735 -      YuvPixel(y, u, v, rgb_buf+4);
  2736 -      x += source_dx;
  2737 -    }
  2738 -    rgb_buf += 8;
  2739 -  }
  2740 -}
  2742 -#endif  // USE_MMX
  2743 -}  // extern "C"
  2745 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
  2746 +}
  2747 +#endif
  2749 +}
  2750 diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
  2751 --- a/gfx/ycbcr/yuv_row_table.cpp
  2752 +++ b/gfx/ycbcr/yuv_row_table.cpp
  2753 @@ -1,13 +1,13 @@
  2754  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
  2755  // Use of this source code is governed by a BSD-style license that can be
  2756  // found in the LICENSE file.
  2758 -#include "media/base/yuv_row.h"
  2759 +#include "yuv_row.h"
  2761  extern "C" {
  2763  #define RGBY(i) { \
  2764    static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
  2765    static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
  2766    static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
  2767    0 \
  2768 diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
  2769 --- a/gfx/ycbcr/yuv_row_win.cpp
  2770 +++ b/gfx/ycbcr/yuv_row_win.cpp
  2771 @@ -1,26 +1,27 @@
  2772  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
  2773  // Use of this source code is governed by a BSD-style license that can be
  2774  // found in the LICENSE file.
  2776 -#include "media/base/yuv_row.h"
  2777 +#include "yuv_row.h"
  2778 +#include "mozilla/SSE.h"
  2780  #define kCoefficientsRgbU kCoefficientsRgbY + 2048
  2781  #define kCoefficientsRgbV kCoefficientsRgbY + 4096
  2783  extern "C" {
  2785 -#if USE_MMX
  2786 -__declspec(naked)
  2787 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
  2788 -                              const uint8* u_buf,
  2789 -                              const uint8* v_buf,
  2790 -                              uint8* rgb_buf,
  2791 -                              int width) {
  2792 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
  2793 +__declspec(naked)
  2794 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
  2795 +                                  const uint8* u_buf,
  2796 +                                  const uint8* v_buf,
  2797 +                                  uint8* rgb_buf,
  2798 +                                  int width) {
  2799    __asm {
  2800      pushad
  2801      mov       edx, [esp + 32 + 4]   // Y
  2802      mov       edi, [esp + 32 + 8]   // U
  2803      mov       esi, [esp + 32 + 12]  // V
  2804      mov       ebp, [esp + 32 + 16]  // rgb
  2805      mov       ecx, [esp + 32 + 20]  // width
  2806      jmp       convertend
  2807 @@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint
  2808   convertdone :
  2810      popad
  2811      ret
  2815  __declspec(naked)
  2816 -void ConvertYUVToRGB32Row(const uint8* y_buf,
  2817 -                          const uint8* u_buf,
  2818 -                          const uint8* v_buf,
  2819 -                          uint8* rgb_buf,
  2820 -                          int width,
  2821 -                          int step) {
  2822 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
  2823 +                              const uint8* u_buf,
  2824 +                              const uint8* v_buf,
  2825 +                              uint8* rgb_buf,
  2826 +                              int width,
  2827 +                              int step) {
  2828    __asm {
  2829      pushad
  2830      mov       edx, [esp + 32 + 4]   // Y
  2831      mov       edi, [esp + 32 + 8]   // U
  2832      mov       esi, [esp + 32 + 12]  // V
  2833      mov       ebp, [esp + 32 + 16]  // rgb
  2834      mov       ecx, [esp + 32 + 20]  // width
  2835      mov       ebx, [esp + 32 + 24]  // step
  2836 @@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y
  2837   wdone :
  2839      popad
  2840      ret
  2844  __declspec(naked)
  2845 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,
  2846 -                                const uint8* u_buf,
  2847 -                                const uint8* v_buf,
  2848 -                                uint8* rgb_buf,
  2849 -                                int width,
  2850 -                                int ystep,
  2851 -                                int uvstep) {
  2852 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
  2853 +                                    const uint8* u_buf,
  2854 +                                    const uint8* v_buf,
  2855 +                                    uint8* rgb_buf,
  2856 +                                    int width,
  2857 +                                    int ystep,
  2858 +                                    int uvstep) {
  2859    __asm {
  2860      pushad
  2861      mov       edx, [esp + 32 + 4]   // Y
  2862      mov       edi, [esp + 32 + 8]   // U
  2863      mov       esi, [esp + 32 + 12]  // V
  2864      mov       ebp, [esp + 32 + 16]  // rgb
  2865      mov       ecx, [esp + 32 + 20]  // width
  2866      jmp       wend
  2867 @@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui
  2868   wdone :
  2870      popad
  2871      ret
  2875  __declspec(naked)
  2876 -void DoubleYUVToRGB32Row(const uint8* y_buf,
  2877 -                         const uint8* u_buf,
  2878 -                         const uint8* v_buf,
  2879 -                         uint8* rgb_buf,
  2880 -                         int width) {
  2881 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
  2882 +                             const uint8* u_buf,
  2883 +                             const uint8* v_buf,
  2884 +                             uint8* rgb_buf,
  2885 +                             int width) {
  2886    __asm {
  2887      pushad
  2888      mov       edx, [esp + 32 + 4]   // Y
  2889      mov       edi, [esp + 32 + 8]   // U
  2890      mov       esi, [esp + 32 + 12]  // V
  2891      mov       ebp, [esp + 32 + 16]  // rgb
  2892      mov       ecx, [esp + 32 + 20]  // width
  2893      jmp       wend
  2894 @@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_
  2895      jns       wloop1
  2896   wdone :
  2897      popad
  2898      ret
  2902  // This version does general purpose scaling by any amount, up or down.
  2903 -// The only thing it can not do it rotation by 90 or 270.
  2904 -// For performance the chroma is under sampled, reducing cost of a 3x
  2905 +// The only thing it cannot do is rotation by 90 or 270.
  2906 +// For performance the chroma is under-sampled, reducing cost of a 3x
  2907  // 1080p scale from 8.4 ms to 5.4 ms.
  2908  __declspec(naked)
  2909 -void ScaleYUVToRGB32Row(const uint8* y_buf,
  2910 -                        const uint8* u_buf,
  2911 -                        const uint8* v_buf,
  2912 -                        uint8* rgb_buf,
  2913 -                        int width,
  2914 -                        int source_dx) {
  2915 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
  2916 +                            const uint8* u_buf,
  2917 +                            const uint8* v_buf,
  2918 +                            uint8* rgb_buf,
  2919 +                            int width,
  2920 +                            int source_dx) {
  2921    __asm {
  2922      pushad
  2923      mov       edx, [esp + 32 + 4]   // Y
  2924      mov       edi, [esp + 32 + 8]   // U
  2925      mov       esi, [esp + 32 + 12]  // V
  2926      mov       ebp, [esp + 32 + 16]  // rgb
  2927      mov       ecx, [esp + 32 + 20]  // width
  2928      xor       ebx, ebx              // x
  2929 @@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
  2931   scaledone :
  2932      popad
  2933      ret
  2937  __declspec(naked)
  2938 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  2939 -                              const uint8* u_buf,
  2940 -                              const uint8* v_buf,
  2941 -                              uint8* rgb_buf,
  2942 -                              int width,
  2943 -                              int source_dx) {
  2944 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
  2945 +                                  const uint8* u_buf,
  2946 +                                  const uint8* v_buf,
  2947 +                                  uint8* rgb_buf,
  2948 +                                  int width,
  2949 +                                  int source_dx) {
  2950    __asm {
  2951      pushad
  2952      mov       edx, [esp + 32 + 4]  // Y
  2953      mov       edi, [esp + 32 + 8]  // U
  2954                  // [esp + 32 + 12] // V
  2955      mov       ebp, [esp + 32 + 16] // rgb
  2956      mov       ecx, [esp + 32 + 20] // width
  2957      imul      ecx, [esp + 32 + 24] // source_dx
  2958 @@ -438,152 +439,60 @@ lscalelastpixel:
  2959      paddsw    mm1, mm0
  2960      psraw     mm1, 6
  2961      packuswb  mm1, mm1
  2962      movd      [ebp], mm1
  2963      popad
  2964      ret
  2965    };
  2967 -#else  // USE_MMX
  2969 -// C reference code that mimic the YUV assembly.
  2970 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
  2971 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
  2972 -    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
  2974 -static inline void YuvPixel(uint8 y,
  2975 -                            uint8 u,
  2976 -                            uint8 v,
  2977 -                            uint8* rgb_buf) {
  2979 -  int b = kCoefficientsRgbY[256+u][0];
  2980 -  int g = kCoefficientsRgbY[256+u][1];
  2981 -  int r = kCoefficientsRgbY[256+u][2];
  2982 -  int a = kCoefficientsRgbY[256+u][3];
  2984 -  b = paddsw(b, kCoefficientsRgbY[512+v][0]);
  2985 -  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
  2986 -  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
  2987 -  a = paddsw(a, kCoefficientsRgbY[512+v][3]);
  2989 -  b = paddsw(b, kCoefficientsRgbY[y][0]);
  2990 -  g = paddsw(g, kCoefficientsRgbY[y][1]);
  2991 -  r = paddsw(r, kCoefficientsRgbY[y][2]);
  2992 -  a = paddsw(a, kCoefficientsRgbY[y][3]);
  2994 -  b >>= 6;
  2995 -  g >>= 6;
  2996 -  r >>= 6;
  2997 -  a >>= 6;
  2999 -  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
  3000 -                                        (packuswb(g) << 8) |
  3001 -                                        (packuswb(r) << 16) |
  3002 -                                        (packuswb(a) << 24);
  3003 -}
  3005 -#if TEST_MMX_YUV
  3006 -static inline void YuvPixel(uint8 y,
  3007 -                            uint8 u,
  3008 -                            uint8 v,
  3009 -                            uint8* rgb_buf) {
  3011 -  __asm {
  3012 -    movzx     eax, u
  3013 -    movq      mm0, [kCoefficientsRgbY+2048 + 8 * eax]
  3014 -    movzx     eax, v
  3015 -    paddsw    mm0, [kCoefficientsRgbY+4096 + 8 * eax]
  3016 -    movzx     eax, y
  3017 -    movq      mm1, [kCoefficientsRgbY + 8 * eax]
  3018 -    paddsw    mm1, mm0
  3019 -    psraw     mm1, 6
  3020 -    packuswb  mm1, mm1
  3021 -    mov       eax, rgb_buf
  3022 -    movd      [eax], mm1
  3023 -    emms
  3024 -  }
  3025 -}
  3026 -#endif
  3027 +#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
  3029  void FastConvertYUVToRGB32Row(const uint8* y_buf,
  3030                                const uint8* u_buf,
  3031                                const uint8* v_buf,
  3032                                uint8* rgb_buf,
  3033                                int width) {
  3034 -  for (int x = 0; x < width; x += 2) {
  3035 -    uint8 u = u_buf[x >> 1];
  3036 -    uint8 v = v_buf[x >> 1];
  3037 -    uint8 y0 = y_buf[x];
  3038 -    YuvPixel(y0, u, v, rgb_buf);
  3039 -    if ((x + 1) < width) {
  3040 -      uint8 y1 = y_buf[x + 1];
  3041 -      YuvPixel(y1, u, v, rgb_buf + 4);
  3042 -    }
  3043 -    rgb_buf += 8;  // Advance 2 pixels.
  3044 -  }
  3045 -}
  3047 -// 16.16 fixed point is used.  A shift by 16 isolates the integer.
  3048 -// A shift by 17 is used to further subsample the chrominence channels.
  3049 -// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
  3050 -// for 1/65536 pixel accurate interpolation.
  3051 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
  3052 +  if (mozilla::supports_sse()) {
  3053 +    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
  3054 +    return;
  3055 +  }
  3056 +#endif
  3058 +  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
  3059 +}
  3061  void ScaleYUVToRGB32Row(const uint8* y_buf,
  3062                          const uint8* u_buf,
  3063                          const uint8* v_buf,
  3064                          uint8* rgb_buf,
  3065                          int width,
  3066                          int source_dx) {
  3067 -  int x = 0;
  3068 -  for (int i = 0; i < width; i += 2) {
  3069 -    int y = y_buf[x >> 16];
  3070 -    int u = u_buf[(x >> 17)];
  3071 -    int v = v_buf[(x >> 17)];
  3072 -    YuvPixel(y, u, v, rgb_buf);
  3073 -    x += source_dx;
  3074 -    if ((i + 1) < width) {
  3075 -      y = y_buf[x >> 16];
  3076 -      YuvPixel(y, u, v, rgb_buf+4);
  3077 -      x += source_dx;
  3078 -    }
  3079 -    rgb_buf += 8;
  3080 -  }
  3081 -}
  3083 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
  3084 +  if (mozilla::supports_sse()) {
  3085 +    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
  3086 +    return;
  3087 +  }
  3088 +#endif
  3090 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
  3091 +}
  3093  void LinearScaleYUVToRGB32Row(const uint8* y_buf,
  3094                                const uint8* u_buf,
  3095                                const uint8* v_buf,
  3096                                uint8* rgb_buf,
  3097                                int width,
  3098                                int source_dx) {
  3099 -  int x = 0;
  3100 -  if (source_dx >= 0x20000) {
  3101 -    x = 32768;
  3102 -  }
  3103 -  for (int i = 0; i < width; i += 2) {
  3104 -    int y0 = y_buf[x >> 16];
  3105 -    int y1 = y_buf[(x >> 16) + 1];
  3106 -    int u0 = u_buf[(x >> 17)];
  3107 -    int u1 = u_buf[(x >> 17) + 1];
  3108 -    int v0 = v_buf[(x >> 17)];
  3109 -    int v1 = v_buf[(x >> 17) + 1];
  3110 -    int y_frac = (x & 65535);
  3111 -    int uv_frac = ((x >> 1) & 65535);
  3112 -    int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
  3113 -    int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
  3114 -    int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
  3115 -    YuvPixel(y, u, v, rgb_buf);
  3116 -    x += source_dx;
  3117 -    if ((i + 1) < width) {
  3118 -      y0 = y_buf[x >> 16];
  3119 -      y1 = y_buf[(x >> 16) + 1];
  3120 -      y_frac = (x & 65535);
  3121 -      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
  3122 -      YuvPixel(y, u, v, rgb_buf+4);
  3123 -      x += source_dx;
  3124 -    }
  3125 -    rgb_buf += 8;
  3126 -  }
  3127 -}
  3129 -#endif  // USE_MMX
  3130 -}  // extern "C"
  3132 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
  3133 +  if (mozilla::supports_sse()) {
  3134 +    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
  3135 +                                 source_dx);
  3136 +    return;
  3137 +  }
  3138 +#endif
  3140 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
  3141 +}
  3143 +} // extern "C"

mercurial