The Tor Browser: gfx/ycbcr/convert.patch@6474c204b198

Cloned from upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
(revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f) for hacking purposes.

     1 diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp

     2 --- a/gfx/ycbcr/yuv_convert.cpp

     3 +++ b/gfx/ycbcr/yuv_convert.cpp

     4 @@ -6,145 +6,102 @@

     5  // http://www.fourcc.org/yuv.php

     6  // The actual conversion is best described here

     7  // http://en.wikipedia.org/wiki/YUV

     8  // An article on optimizing YUV conversion using tables instead of multiplies

     9  // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf

    10  //

    11  // YV12 is a full plane of Y and a half height, half width chroma planes

    12  // YV16 is a full plane of Y and a full height, half width chroma planes

    13 +// YV24 is a full plane of Y and a full height, full width chroma planes

    14  //

    15  // ARGB pixel format is output, which on little endian is stored as BGRA.

    16  // The alpha is set to 255, allowing the application to use RGBA or RGB32.

    18 -#include "media/base/yuv_convert.h"

    19 +#include "yuv_convert.h"

    21  // Header for low level row functions.

    22 -#include "media/base/yuv_row.h"

    23 -

    24 -#if USE_MMX

    25 -#if defined(_MSC_VER)

    26 -#include <intrin.h>

    27 -#else

    28 -#include <mmintrin.h>

    29 -#endif

    30 -#endif

    31 -

    32 -#if USE_SSE2

    33 -#include <emmintrin.h>

    34 -#endif

    35 -

    36 -namespace media {

    37 -

    38 +#include "yuv_row.h"

    39 +#include "mozilla/SSE.h"

    40 +

    41 +namespace mozilla {

    42 +

    43 +namespace gfx {

    44 +

    45  // 16.16 fixed point arithmetic

    46  const int kFractionBits = 16;

    47  const int kFractionMax = 1 << kFractionBits;

    48  const int kFractionMask = ((1 << kFractionBits) - 1);

    50  // Convert a frame of YUV to 32 bit ARGB.

    51 -void ConvertYUVToRGB32(const uint8* y_buf,

    52 -                       const uint8* u_buf,

    53 -                       const uint8* v_buf,

    54 -                       uint8* rgb_buf,

    55 -                       int width,

    56 -                       int height,

    57 -                       int y_pitch,

    58 -                       int uv_pitch,

    59 -                       int rgb_pitch,

    60 -                       YUVType yuv_type) {

    61 -  unsigned int y_shift = yuv_type;

    62 -  for (int y = 0; y < height; ++y) {

    63 -    uint8* rgb_row = rgb_buf + y * rgb_pitch;

    64 -    const uint8* y_ptr = y_buf + y * y_pitch;

    65 -    const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch;

    66 -    const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch;

    67 -

    68 -    FastConvertYUVToRGB32Row(y_ptr,

    69 -                             u_ptr,

    70 -                             v_ptr,

    71 -                             rgb_row,

    72 -                             width);

    73 -  }

    74 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,

    75 +                                  const uint8* u_buf,

    76 +                                  const uint8* v_buf,

    77 +                                  uint8* rgb_buf,

    78 +                                  int pic_x,

    79 +                                  int pic_y,

    80 +                                  int pic_width,

    81 +                                  int pic_height,

    82 +                                  int y_pitch,

    83 +                                  int uv_pitch,

    84 +                                  int rgb_pitch,

    85 +                                  YUVType yuv_type) {

    86 +  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;

    87 +  unsigned int x_shift = yuv_type == YV24 ? 0 : 1;

    88 +  // Test for SSE because the optimized code uses movntq, which is not part of MMX.

    89 +  bool has_sse = supports_mmx() && supports_sse();

    90 +  // There is no optimized YV24 SSE routine so we check for this and

    91 +  // fall back to the C code.

    92 +  has_sse &= yuv_type != YV24;

    93 +  bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0;

    94 +  int x_width = odd_pic_x ? pic_width - 1 : pic_width;

    95 +

    96 +  for (int y = pic_y; y < pic_height + pic_y; ++y) {

    97 +    uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch;

    98 +    const uint8* y_ptr = y_buf + y * y_pitch + pic_x;

    99 +    const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);

   100 +    const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);

   101 +

   102 +    if (odd_pic_x) {

   103 +      // Handle the single odd pixel manually and use the

   104 +      // fast routines for the remaining pixels.

   105 +      FastConvertYUVToRGB32Row_C(y_ptr++,

   106 +                                 u_ptr++,

   107 +                                 v_ptr++,

   108 +                                 rgb_row,

   109 +                                 1,

   110 +                                 x_shift);

   111 +      rgb_row += 4;

   112 +    }

   113 +

   114 +    if (has_sse) {

   115 +      FastConvertYUVToRGB32Row(y_ptr,

   116 +                               u_ptr,

   117 +                               v_ptr,

   118 +                               rgb_row,

   119 +                               x_width);

   120 +    }

   121 +    else {

   122 +      FastConvertYUVToRGB32Row_C(y_ptr,

   123 +                                 u_ptr,

   124 +                                 v_ptr,

   125 +                                 rgb_row,

   126 +                                 x_width,

   127 +                                 x_shift);

   128 +    }

   129 +  }

   131    // MMX used for FastConvertYUVToRGB32Row requires emms instruction.

   132 -  EMMS();

   133 -}

   134 -

   135 -#if USE_SSE2

   136 -// FilterRows combines two rows of the image using linear interpolation.

   137 -// SSE2 version does 16 pixels at a time

   138 -

   139 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,

   140 -                       int source_width, int source_y_fraction) {

   141 -  __m128i zero = _mm_setzero_si128();

   142 -  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);

   143 -  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);

   144 -

   145 -  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);

   146 -  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);

   147 -  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);

   148 -  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);

   149 -

   150 -  do {

   151 -    __m128i y0 = _mm_loadu_si128(y0_ptr128);

   152 -    __m128i y1 = _mm_loadu_si128(y1_ptr128);

   153 -    __m128i y2 = _mm_unpackhi_epi8(y0, zero);

   154 -    __m128i y3 = _mm_unpackhi_epi8(y1, zero);

   155 -    y0 = _mm_unpacklo_epi8(y0, zero);

   156 -    y1 = _mm_unpacklo_epi8(y1, zero);

   157 -    y0 = _mm_mullo_epi16(y0, y0_fraction);

   158 -    y1 = _mm_mullo_epi16(y1, y1_fraction);

   159 -    y2 = _mm_mullo_epi16(y2, y0_fraction);

   160 -    y3 = _mm_mullo_epi16(y3, y1_fraction);

   161 -    y0 = _mm_add_epi16(y0, y1);

   162 -    y2 = _mm_add_epi16(y2, y3);

   163 -    y0 = _mm_srli_epi16(y0, 8);

   164 -    y2 = _mm_srli_epi16(y2, 8);

   165 -    y0 = _mm_packus_epi16(y0, y2);

   166 -    *dest128++ = y0;

   167 -    ++y0_ptr128;

   168 -    ++y1_ptr128;

   169 -  } while (dest128 < end128);

   170 -}

   171 -#elif USE_MMX

   172 -// MMX version does 8 pixels at a time

   173 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,

   174 -                       int source_width, int source_y_fraction) {

   175 -  __m64 zero = _mm_setzero_si64();

   176 -  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);

   177 -  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);

   178 -

   179 -  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);

   180 -  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);

   181 -  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);

   182 -  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);

   183 -

   184 -  do {

   185 -    __m64 y0 = *y0_ptr64++;

   186 -    __m64 y1 = *y1_ptr64++;

   187 -    __m64 y2 = _mm_unpackhi_pi8(y0, zero);

   188 -    __m64 y3 = _mm_unpackhi_pi8(y1, zero);

   189 -    y0 = _mm_unpacklo_pi8(y0, zero);

   190 -    y1 = _mm_unpacklo_pi8(y1, zero);

   191 -    y0 = _mm_mullo_pi16(y0, y0_fraction);

   192 -    y1 = _mm_mullo_pi16(y1, y1_fraction);

   193 -    y2 = _mm_mullo_pi16(y2, y0_fraction);

   194 -    y3 = _mm_mullo_pi16(y3, y1_fraction);

   195 -    y0 = _mm_add_pi16(y0, y1);

   196 -    y2 = _mm_add_pi16(y2, y3);

   197 -    y0 = _mm_srli_pi16(y0, 8);

   198 -    y2 = _mm_srli_pi16(y2, 8);

   199 -    y0 = _mm_packs_pu16(y0, y2);

   200 -    *dest64++ = y0;

   201 -  } while (dest64 < end64);

   202 -}

   203 -#else  // no MMX or SSE2

   204 +  if (has_sse)

   205 +    EMMS();

   206 +}

   207 +

   208  // C version does 8 at a time to mimic MMX code

   209 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,

   210 -                       int source_width, int source_y_fraction) {

   211 +static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,

   212 +                         int source_width, int source_y_fraction) {

   213    int y1_fraction = source_y_fraction;

   214    int y0_fraction = 256 - y1_fraction;

   215    uint8* end = ybuf + source_width;

   216    do {

   217      ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;

   218      ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;

   219      ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;

   220      ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;

   221 @@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons

   222      ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;

   223      ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;

   224      ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;

   225      y0_ptr += 8;

   226      y1_ptr += 8;

   227      ybuf += 8;

   228    } while (ybuf < end);

   229  }

   230 -#endif

   231 +

   232 +#ifdef MOZILLA_MAY_SUPPORT_MMX

   233 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,

   234 +                    int source_width, int source_y_fraction);

   235 +#endif

   236 +

   237 +#ifdef MOZILLA_MAY_SUPPORT_SSE2

   238 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,

   239 +                     int source_width, int source_y_fraction);

   240 +#endif

   241 +

   242 +static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,

   243 +                              const uint8* y1_ptr, int source_width,

   244 +                              int source_y_fraction) {

   245 +#ifdef MOZILLA_MAY_SUPPORT_SSE2

   246 +  if (mozilla::supports_sse2()) {

   247 +    FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);

   248 +    return;

   249 +  }

   250 +#endif

   251 +

   252 +#ifdef MOZILLA_MAY_SUPPORT_MMX

   253 +  if (mozilla::supports_mmx()) {

   254 +    FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);

   255 +    return;

   256 +  }

   257 +#endif

   258 +

   259 +  FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);

   260 +}

   263  // Scale a frame of YUV to 32 bit ARGB.

   264 -void ScaleYUVToRGB32(const uint8* y_buf,

   265 -                     const uint8* u_buf,

   266 -                     const uint8* v_buf,

   267 -                     uint8* rgb_buf,

   268 -                     int source_width,

   269 -                     int source_height,

   270 -                     int width,

   271 -                     int height,

   272 -                     int y_pitch,

   273 -                     int uv_pitch,

   274 -                     int rgb_pitch,

   275 -                     YUVType yuv_type,

   276 -                     Rotate view_rotate,

   277 -                     ScaleFilter filter) {

   278 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,

   279 +                                const uint8* u_buf,

   280 +                                const uint8* v_buf,

   281 +                                uint8* rgb_buf,

   282 +                                int source_width,

   283 +                                int source_height,

   284 +                                int width,

   285 +                                int height,

   286 +                                int y_pitch,

   287 +                                int uv_pitch,

   288 +                                int rgb_pitch,

   289 +                                YUVType yuv_type,

   290 +                                Rotate view_rotate,

   291 +                                ScaleFilter filter) {

   292 +  bool has_mmx = supports_mmx();

   293 +

   294    // 4096 allows 3 buffers to fit in 12k.

   295    // Helps performance on CPU with 16K L1 cache.

   296    // Large enough for 3830x2160 and 30" displays which are 2560x1600.

   297    const int kFilterBufferSize = 4096;

   298    // Disable filtering if the screen is too big (to avoid buffer overflows).

   299    // This should never happen to regular users: they don't have monitors

   300    // wider than 4096 pixels.

   301    // TODO(fbarchard): Allow rotated videos to filter.

   302    if (source_width > kFilterBufferSize || view_rotate)

   303      filter = FILTER_NONE;

   305 -  unsigned int y_shift = yuv_type;

   306 +  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;

   307    // Diagram showing origin and direction of source sampling.

   308    // ->0   4<-

   309    // 7       3

   310    //

   311    // 6       5

   312    // ->1   2<-

   313    // Rotations that start at right side of image.

   314    if ((view_rotate == ROTATE_180) ||

   315 @@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,

   316      int source_uv_fraction =

   317          ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;

   319      const uint8* y_ptr = y0_ptr;

   320      const uint8* u_ptr = u0_ptr;

   321      const uint8* v_ptr = v0_ptr;

   322      // Apply vertical filtering if necessary.

   323      // TODO(fbarchard): Remove memcpy when not necessary.

   324 -    if (filter & media::FILTER_BILINEAR_V) {

   325 +    if (filter & mozilla::gfx::FILTER_BILINEAR_V) {

   326        if (yscale_fixed != kFractionMax &&

   327            source_y_fraction && ((source_y + 1) < source_height)) {

   328          FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);

   329        } else {

   330          memcpy(ybuf, y0_ptr, source_width);

   331        }

   332        y_ptr = ybuf;

   333        ybuf[source_width] = ybuf[source_width-1];

   334 @@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf,

   335        u_ptr = ubuf;

   336        v_ptr = vbuf;

   337        ubuf[uv_source_width] = ubuf[uv_source_width - 1];

   338        vbuf[uv_source_width] = vbuf[uv_source_width - 1];

   339      }

   340      if (source_dx == kFractionMax) {  // Not scaled

   341        FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,

   342                                 dest_pixel, width);

   343 -    } else {

   344 -      if (filter & FILTER_BILINEAR_H) {

   345 +    } else if (filter & FILTER_BILINEAR_H) {

   346          LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,

   347                                   dest_pixel, width, source_dx);

   348      } else {

   349  // Specialized scalers and rotation.

   350 -#if USE_MMX && defined(_MSC_VER)

   351 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86)

   352 +      if(mozilla::supports_sse()) {

   353          if (width == (source_width * 2)) {

   354 -          DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,

   355 -                              dest_pixel, width);

   356 +          DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,

   357 +                                  dest_pixel, width);

   358          } else if ((source_dx & kFractionMask) == 0) {

   359            // Scaling by integer scale factor. ie half.

   360 -          ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,

   361 -                               dest_pixel, width,

   362 -                               source_dx >> kFractionBits);

   363 +          ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,

   364 +                                   dest_pixel, width,

   365 +                                   source_dx >> kFractionBits);

   366          } else if (source_dx_uv == source_dx) {  // Not rotated.

   367            ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,

   368                               dest_pixel, width, source_dx);

   369          } else {

   370 -          RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,

   371 -                                     dest_pixel, width,

   372 -                                     source_dx >> kFractionBits,

   373 -                                     source_dx_uv >> kFractionBits);

   374 +          RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,

   375 +                                         dest_pixel, width,

   376 +                                         source_dx >> kFractionBits,

   377 +                                         source_dx_uv >> kFractionBits);

   378          }

   379 +      }

   380 +      else {

   381 +        ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,

   382 +                             dest_pixel, width, source_dx);

   383 +      }

   384  #else

   385 -        ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,

   386 -                           dest_pixel, width, source_dx);

   387 -#endif

   388 -      }

   389 +      ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,

   390 +                         dest_pixel, width, source_dx);

   391 +#endif

   392      }

   393    }

   394    // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.

   395 -  EMMS();

   396 -}

   397 -

   398 -}  // namespace media

   399 +  if (has_mmx)

   400 +    EMMS();

   401 +}

   402 +

   403 +}  // namespace gfx

   404 +}  // namespace mozilla

   405 diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h

   406 --- a/gfx/ycbcr/yuv_convert.h

   407 +++ b/gfx/ycbcr/yuv_convert.h

   408 @@ -1,72 +1,79 @@

   409  // Copyright (c) 2010 The Chromium Authors. All rights reserved.

   410  // Use of this source code is governed by a BSD-style license that can be

   411  // found in the LICENSE file.

   413  #ifndef MEDIA_BASE_YUV_CONVERT_H_

   414  #define MEDIA_BASE_YUV_CONVERT_H_

   416 -#include "base/basictypes.h"

   417 -

   418 -namespace media {

   419 -

   420 +#include "chromium_types.h"

   421 +#include "gfxCore.h"

   422 +

   423 +namespace mozilla {

   424 +

   425 +namespace gfx {

   426 +

   427  // Type of YUV surface.

   428  // The value of these enums matter as they are used to shift vertical indices.

   429  enum YUVType {

   430 -  YV16 = 0,           // YV16 is half width and full height chroma channels.

   431 -  YV12 = 1,           // YV12 is half width and half height chroma channels.

   432 +  YV12 = 0,           // YV12 is half width and half height chroma channels.

   433 +  YV16 = 1,           // YV16 is half width and full height chroma channels.

   434 +  YV24 = 2            // YV24 is full width and full height chroma channels.

   435  };

   437  // Mirror means flip the image horizontally, as in looking in a mirror.

   438  // Rotate happens after mirroring.

   439  enum Rotate {

   440    ROTATE_0,           // Rotation off.

   441    ROTATE_90,          // Rotate clockwise.

   442    ROTATE_180,         // Rotate upside down.

   443    ROTATE_270,         // Rotate counter clockwise.

   444    MIRROR_ROTATE_0,    // Mirror horizontally.

   445    MIRROR_ROTATE_90,   // Mirror then Rotate clockwise.

   446    MIRROR_ROTATE_180,  // Mirror vertically.

   447 -  MIRROR_ROTATE_270,  // Transpose.

   448 +  MIRROR_ROTATE_270   // Transpose.

   449  };

   451  // Filter affects how scaling looks.

   452  enum ScaleFilter {

   453    FILTER_NONE = 0,        // No filter (point sampled).

   454    FILTER_BILINEAR_H = 1,  // Bilinear horizontal filter.

   455    FILTER_BILINEAR_V = 2,  // Bilinear vertical filter.

   456 -  FILTER_BILINEAR = 3,    // Bilinear filter.

   457 +  FILTER_BILINEAR = 3     // Bilinear filter.

   458  };

   460  // Convert a frame of YUV to 32 bit ARGB.

   461  // Pass in YV16/YV12 depending on source format

   462 -void ConvertYUVToRGB32(const uint8* yplane,

   463 -                       const uint8* uplane,

   464 -                       const uint8* vplane,

   465 -                       uint8* rgbframe,

   466 -                       int width,

   467 -                       int height,

   468 -                       int ystride,

   469 -                       int uvstride,

   470 -                       int rgbstride,

   471 -                       YUVType yuv_type);

   472 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,

   473 +                                  const uint8* uplane,

   474 +                                  const uint8* vplane,

   475 +                                  uint8* rgbframe,

   476 +                                  int pic_x,

   477 +                                  int pic_y,

   478 +                                  int pic_width,

   479 +                                  int pic_height,

   480 +                                  int ystride,

   481 +                                  int uvstride,

   482 +                                  int rgbstride,

   483 +                                  YUVType yuv_type);

   485  // Scale a frame of YUV to 32 bit ARGB.

   486  // Supports rotation and mirroring.

   487 -void ScaleYUVToRGB32(const uint8* yplane,

   488 -                     const uint8* uplane,

   489 -                     const uint8* vplane,

   490 -                     uint8* rgbframe,

   491 -                     int source_width,

   492 -                     int source_height,

   493 -                     int width,

   494 -                     int height,

   495 -                     int ystride,

   496 -                     int uvstride,

   497 -                     int rgbstride,

   498 -                     YUVType yuv_type,

   499 -                     Rotate view_rotate,

   500 -                     ScaleFilter filter);

   501 -

   502 -}  // namespace media

   503 -

   504 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane,

   505 +                                const uint8* uplane,

   506 +                                const uint8* vplane,

   507 +                                uint8* rgbframe,

   508 +                                int source_width,

   509 +                                int source_height,

   510 +                                int width,

   511 +                                int height,

   512 +                                int ystride,

   513 +                                int uvstride,

   514 +                                int rgbstride,

   515 +                                YUVType yuv_type,

   516 +                                Rotate view_rotate,

   517 +                                ScaleFilter filter);

   518 +

   519 +}  // namespace gfx

   520 +}  // namespace mozilla

   521 +

   522  #endif  // MEDIA_BASE_YUV_CONVERT_H_

   523 diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp

   524 new file mode 100644

   525 --- /dev/null

   526 +++ b/gfx/ycbcr/yuv_convert_mmx.cpp

   527 @@ -0,0 +1,45 @@

   528 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.

   529 +// Use of this source code is governed by a BSD-style license that can be

   530 +// found in the LICENSE file.

   531 +

   532 +#include <mmintrin.h>

   533 +#include "yuv_row.h"

   534 +

   535 +namespace mozilla {

   536 +namespace gfx {

   537 +

   538 +// FilterRows combines two rows of the image using linear interpolation.

   539 +// MMX version does 8 pixels at a time.

   540 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,

   541 +                    int source_width, int source_y_fraction) {

   542 +  __m64 zero = _mm_setzero_si64();

   543 +  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);

   544 +  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);

   545 +

   546 +  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);

   547 +  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);

   548 +  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);

   549 +  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);

   550 +

   551 +  do {

   552 +    __m64 y0 = *y0_ptr64++;

   553 +    __m64 y1 = *y1_ptr64++;

   554 +    __m64 y2 = _mm_unpackhi_pi8(y0, zero);

   555 +    __m64 y3 = _mm_unpackhi_pi8(y1, zero);

   556 +    y0 = _mm_unpacklo_pi8(y0, zero);

   557 +    y1 = _mm_unpacklo_pi8(y1, zero);

   558 +    y0 = _mm_mullo_pi16(y0, y0_fraction);

   559 +    y1 = _mm_mullo_pi16(y1, y1_fraction);

   560 +    y2 = _mm_mullo_pi16(y2, y0_fraction);

   561 +    y3 = _mm_mullo_pi16(y3, y1_fraction);

   562 +    y0 = _mm_add_pi16(y0, y1);

   563 +    y2 = _mm_add_pi16(y2, y3);

   564 +    y0 = _mm_srli_pi16(y0, 8);

   565 +    y2 = _mm_srli_pi16(y2, 8);

   566 +    y0 = _mm_packs_pu16(y0, y2);

   567 +    *dest64++ = y0;

   568 +  } while (dest64 < end64);

   569 +}

   570 +

   571 +}

   572 +}

   573 diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp

   574 new file mode 100644

   575 --- /dev/null

   576 +++ b/gfx/ycbcr/yuv_convert_sse2.cpp

   577 @@ -0,0 +1,47 @@

   578 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.

   579 +// Use of this source code is governed by a BSD-style license that can be

   580 +// found in the LICENSE file.

   581 +

   582 +#include <emmintrin.h>

   583 +#include "yuv_row.h"

   584 +

   585 +namespace mozilla {

   586 +namespace gfx {

   587 +

   588 +// FilterRows combines two rows of the image using linear interpolation.

   589 +// SSE2 version does 16 pixels at a time.

   590 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,

   591 +                     int source_width, int source_y_fraction) {

   592 +  __m128i zero = _mm_setzero_si128();

   593 +  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);

   594 +  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);

   595 +

   596 +  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);

   597 +  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);

   598 +  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);

   599 +  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);

   600 +

   601 +  do {

   602 +    __m128i y0 = _mm_loadu_si128(y0_ptr128);

   603 +    __m128i y1 = _mm_loadu_si128(y1_ptr128);

   604 +    __m128i y2 = _mm_unpackhi_epi8(y0, zero);

   605 +    __m128i y3 = _mm_unpackhi_epi8(y1, zero);

   606 +    y0 = _mm_unpacklo_epi8(y0, zero);

   607 +    y1 = _mm_unpacklo_epi8(y1, zero);

   608 +    y0 = _mm_mullo_epi16(y0, y0_fraction);

   609 +    y1 = _mm_mullo_epi16(y1, y1_fraction);

   610 +    y2 = _mm_mullo_epi16(y2, y0_fraction);

   611 +    y3 = _mm_mullo_epi16(y3, y1_fraction);

   612 +    y0 = _mm_add_epi16(y0, y1);

   613 +    y2 = _mm_add_epi16(y2, y3);

   614 +    y0 = _mm_srli_epi16(y0, 8);

   615 +    y2 = _mm_srli_epi16(y2, 8);

   616 +    y0 = _mm_packus_epi16(y0, y2);

   617 +    *dest128++ = y0;

   618 +    ++y0_ptr128;

   619 +    ++y1_ptr128;

   620 +  } while (dest128 < end128);

   621 +}

   622 +

   623 +}

   624 +}

   625 diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h

   626 --- a/gfx/ycbcr/yuv_row.h

   627 +++ b/gfx/ycbcr/yuv_row.h

   628 @@ -5,109 +5,133 @@

   629  // yuv_row internal functions to handle YUV conversion and scaling to RGB.

   630  // These functions are used from both yuv_convert.cc and yuv_scale.cc.

   632  // TODO(fbarchard): Write function that can handle rotation and scaling.

   634  #ifndef MEDIA_BASE_YUV_ROW_H_

   635  #define MEDIA_BASE_YUV_ROW_H_

   637 -#include "base/basictypes.h"

   638 +#include "chromium_types.h"

   640  extern "C" {

   641  // Can only do 1x.

   642  // This is the second fastest of the scalers.

   643  void FastConvertYUVToRGB32Row(const uint8* y_buf,

   644                                const uint8* u_buf,

   645                                const uint8* v_buf,

   646                                uint8* rgb_buf,

   647                                int width);

   649 -// Can do 1x, half size or any scale down by an integer amount.

   650 -// Step can be negative (mirroring, rotate 180).

   651 -// This is the third fastest of the scalers.

   652 -void ConvertYUVToRGB32Row(const uint8* y_buf,

   653 -                          const uint8* u_buf,

   654 -                          const uint8* v_buf,

   655 -                          uint8* rgb_buf,

   656 -                          int width,

   657 -                          int step);

   658 -

   659 -// Rotate is like Convert, but applies different step to Y versus U and V.

   660 -// This allows rotation by 90 or 270, by stepping by stride.

   661 -// This is the forth fastest of the scalers.

   662 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,

   663 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,

   664                                  const uint8* u_buf,

   665                                  const uint8* v_buf,

   666                                  uint8* rgb_buf,

   667                                  int width,

   668 -                                int ystep,

   669 -                                int uvstep);

   670 +                                unsigned int x_shift);

   671 +

   672 +void FastConvertYUVToRGB32Row(const uint8* y_buf,

   673 +                              const uint8* u_buf,

   674 +                              const uint8* v_buf,

   675 +                              uint8* rgb_buf,

   676 +                              int width);

   677 +

   678 +// Can do 1x, half size or any scale down by an integer amount.

   679 +// Step can be negative (mirroring, rotate 180).

   680 +// This is the third fastest of the scalers.

   681 +// Only defined on Windows x86-32.

   682 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,

   683 +                              const uint8* u_buf,

   684 +                              const uint8* v_buf,

   685 +                              uint8* rgb_buf,

   686 +                              int width,

   687 +                              int step);

   688 +

   689 +// Rotate is like Convert, but applies different step to Y versus U and V.

   690 +// This allows rotation by 90 or 270, by stepping by stride.

   691 +// This is the fourth fastest of the scalers.

   692 +// Only defined on Windows x86-32.

   693 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,

   694 +                                    const uint8* u_buf,

   695 +                                    const uint8* v_buf,

   696 +                                    uint8* rgb_buf,

   697 +                                    int width,

   698 +                                    int ystep,

   699 +                                    int uvstep);

   701  // Doubler does 4 pixels at a time.  Each pixel is replicated.

   702  // This is the fastest of the scalers.

   703 -void DoubleYUVToRGB32Row(const uint8* y_buf,

   704 -                         const uint8* u_buf,

   705 -                         const uint8* v_buf,

   706 -                         uint8* rgb_buf,

   707 -                         int width);

   708 +// Only defined on Windows x86-32.

   709 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,

   710 +                             const uint8* u_buf,

   711 +                             const uint8* v_buf,

   712 +                             uint8* rgb_buf,

   713 +                             int width);

   715  // Handles arbitrary scaling up or down.

   716  // Mirroring is supported, but not 90 or 270 degree rotation.

   717  // Chroma is under sampled every 2 pixels for performance.

   718  void ScaleYUVToRGB32Row(const uint8* y_buf,

   719                          const uint8* u_buf,

   720                          const uint8* v_buf,

   721                          uint8* rgb_buf,

   722                          int width,

   723                          int source_dx);

   725 +void ScaleYUVToRGB32Row(const uint8* y_buf,

   726 +                        const uint8* u_buf,

   727 +                        const uint8* v_buf,

   728 +                        uint8* rgb_buf,

   729 +                        int width,

   730 +                        int source_dx);

   731 +

   732 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,

   733 +                          const uint8* u_buf,

   734 +                          const uint8* v_buf,

   735 +                          uint8* rgb_buf,

   736 +                          int width,

   737 +                          int source_dx);

   738 +

   739  // Handles arbitrary scaling up or down with bilinear filtering.

   740  // Mirroring is supported, but not 90 or 270 degree rotation.

   741  // Chroma is under sampled every 2 pixels for performance.

   742  // This is the slowest of the scalers.

   743  void LinearScaleYUVToRGB32Row(const uint8* y_buf,

   744                                const uint8* u_buf,

   745                                const uint8* v_buf,

   746                                uint8* rgb_buf,

   747                                int width,

   748                                int source_dx);

   750 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,

   751 +                              const uint8* u_buf,

   752 +                              const uint8* v_buf,

   753 +                              uint8* rgb_buf,

   754 +                              int width,

   755 +                              int source_dx);

   756 +

   757 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,

   758 +                                const uint8* u_buf,

   759 +                                const uint8* v_buf,

   760 +                                uint8* rgb_buf,

   761 +                                int width,

   762 +                                int source_dx);

   763 +

   764 +

   765  #if defined(_MSC_VER)

   766  #define SIMD_ALIGNED(var) __declspec(align(16)) var

   767  #else

   768  #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))

   769  #endif

   770  extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]);

   772 -// Method to force C version.

   773 -//#define USE_MMX 0

   774 -//#define USE_SSE2 0

   775 -

   776 -#if !defined(USE_MMX)

   777 -// Windows, Mac and Linux/BSD use MMX

   778 -#if defined(__MMX__) || defined(_MSC_VER)

   779 -#define USE_MMX 1

   780 -#else

   781 -#define USE_MMX 0

   782 -#endif

   783 -#endif

   784 -

   785 -#if !defined(USE_SSE2)

   786 -#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2

   787 -#define USE_SSE2 1

   788 -#else

   789 -#define USE_SSE2 0

   790 -#endif

   791 -#endif

   792 -

   793  // x64 uses MMX2 (SSE) so emms is not required.

   794  // Warning C4799: function has no EMMS instruction.

   795  // EMMS() is slow and should be called by the calling function once per image.

   796 -#if USE_MMX && !defined(ARCH_CPU_X86_64)

   797 +#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)

   798  #if defined(_MSC_VER)

   799  #define EMMS() __asm emms

   800  #pragma warning(disable: 4799)

   801  #else

   802  #define EMMS() asm("emms")

   803  #endif

   804  #else

   805  #define EMMS()

   806 diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp

   807 --- a/gfx/ycbcr/yuv_row_c.cpp

   808 +++ b/gfx/ycbcr/yuv_row_c.cpp

   809 @@ -1,812 +1,18 @@

   810  // Copyright (c) 2010 The Chromium Authors. All rights reserved.

   811  // Use of this source code is governed by a BSD-style license that can be

   812  // found in the LICENSE file.

   814 -#include "media/base/yuv_row.h"

   815 -

   816 -#ifdef _DEBUG

   817 -#include "base/logging.h"

   818 -#else

   819 +#include "yuv_row.h"

   820 +

   821  #define DCHECK(a)

   822 -#endif

   824  extern "C" {

   826 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)

   827 -

   828 -// AMD64 ABI uses register paremters.

   829 -void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi

   830 -                              const uint8* u_buf,  // rsi

   831 -                              const uint8* v_buf,  // rdx

   832 -                              uint8* rgb_buf,      // rcx

   833 -                              int width) {         // r8

   834 -  asm(

   835 -  "jmp    convertend\n"

   836 -"convertloop:"

   837 -  "movzb  (%1),%%r10\n"

   838 -  "add    $0x1,%1\n"

   839 -  "movzb  (%2),%%r11\n"

   840 -  "add    $0x1,%2\n"

   841 -  "movq   2048(%5,%%r10,8),%%xmm0\n"

   842 -  "movzb  (%0),%%r10\n"

   843 -  "movq   4096(%5,%%r11,8),%%xmm1\n"

   844 -  "movzb  0x1(%0),%%r11\n"

   845 -  "paddsw %%xmm1,%%xmm0\n"

   846 -  "movq   (%5,%%r10,8),%%xmm2\n"

   847 -  "add    $0x2,%0\n"

   848 -  "movq   (%5,%%r11,8),%%xmm3\n"

   849 -  "paddsw %%xmm0,%%xmm2\n"

   850 -  "paddsw %%xmm0,%%xmm3\n"

   851 -  "shufps $0x44,%%xmm3,%%xmm2\n"

   852 -  "psraw  $0x6,%%xmm2\n"

   853 -  "packuswb %%xmm2,%%xmm2\n"

   854 -  "movq   %%xmm2,0x0(%3)\n"

   855 -  "add    $0x8,%3\n"

   856 -"convertend:"

   857 -  "sub    $0x2,%4\n"

   858 -  "jns    convertloop\n"

   859 -

   860 -"convertnext:"

   861 -  "add    $0x1,%4\n"

   862 -  "js     convertdone\n"

   863 -

   864 -  "movzb  (%1),%%r10\n"

   865 -  "movq   2048(%5,%%r10,8),%%xmm0\n"

   866 -  "movzb  (%2),%%r10\n"

   867 -  "movq   4096(%5,%%r10,8),%%xmm1\n"

   868 -  "paddsw %%xmm1,%%xmm0\n"

   869 -  "movzb  (%0),%%r10\n"

   870 -  "movq   (%5,%%r10,8),%%xmm1\n"

   871 -  "paddsw %%xmm0,%%xmm1\n"

   872 -  "psraw  $0x6,%%xmm1\n"

   873 -  "packuswb %%xmm1,%%xmm1\n"

   874 -  "movd   %%xmm1,0x0(%3)\n"

   875 -"convertdone:"

   876 -  :

   877 -  : "r"(y_buf),  // %0

   878 -    "r"(u_buf),  // %1

   879 -    "r"(v_buf),  // %2

   880 -    "r"(rgb_buf),  // %3

   881 -    "r"(width),  // %4

   882 -    "r" (kCoefficientsRgbY)  // %5

   883 -  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"

   884 -);

   885 -}

   886 -

   887 -void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi

   888 -                        const uint8* u_buf,  // rsi

   889 -                        const uint8* v_buf,  // rdx

   890 -                        uint8* rgb_buf,      // rcx

   891 -                        int width,           // r8

   892 -                        int source_dx) {     // r9

   893 -  asm(

   894 -  "xor    %%r11,%%r11\n"

   895 -  "sub    $0x2,%4\n"

   896 -  "js     scalenext\n"

   897 -

   898 -"scaleloop:"

   899 -  "mov    %%r11,%%r10\n"

   900 -  "sar    $0x11,%%r10\n"

   901 -  "movzb  (%1,%%r10,1),%%rax\n"

   902 -  "movq   2048(%5,%%rax,8),%%xmm0\n"

   903 -  "movzb  (%2,%%r10,1),%%rax\n"

   904 -  "movq   4096(%5,%%rax,8),%%xmm1\n"

   905 -  "lea    (%%r11,%6),%%r10\n"

   906 -  "sar    $0x10,%%r11\n"

   907 -  "movzb  (%0,%%r11,1),%%rax\n"

   908 -  "paddsw %%xmm1,%%xmm0\n"

   909 -  "movq   (%5,%%rax,8),%%xmm1\n"

   910 -  "lea    (%%r10,%6),%%r11\n"

   911 -  "sar    $0x10,%%r10\n"

   912 -  "movzb  (%0,%%r10,1),%%rax\n"

   913 -  "movq   (%5,%%rax,8),%%xmm2\n"

   914 -  "paddsw %%xmm0,%%xmm1\n"

   915 -  "paddsw %%xmm0,%%xmm2\n"

   916 -  "shufps $0x44,%%xmm2,%%xmm1\n"

   917 -  "psraw  $0x6,%%xmm1\n"

   918 -  "packuswb %%xmm1,%%xmm1\n"

   919 -  "movq   %%xmm1,0x0(%3)\n"

   920 -  "add    $0x8,%3\n"

   921 -  "sub    $0x2,%4\n"

   922 -  "jns    scaleloop\n"

   923 -

   924 -"scalenext:"

   925 -  "add    $0x1,%4\n"

   926 -  "js     scaledone\n"

   927 -

   928 -  "mov    %%r11,%%r10\n"

   929 -  "sar    $0x11,%%r10\n"

   930 -  "movzb  (%1,%%r10,1),%%rax\n"

   931 -  "movq   2048(%5,%%rax,8),%%xmm0\n"

   932 -  "movzb  (%2,%%r10,1),%%rax\n"

   933 -  "movq   4096(%5,%%rax,8),%%xmm1\n"

   934 -  "paddsw %%xmm1,%%xmm0\n"

   935 -  "sar    $0x10,%%r11\n"

   936 -  "movzb  (%0,%%r11,1),%%rax\n"

   937 -  "movq   (%5,%%rax,8),%%xmm1\n"

   938 -  "paddsw %%xmm0,%%xmm1\n"

   939 -  "psraw  $0x6,%%xmm1\n"

   940 -  "packuswb %%xmm1,%%xmm1\n"

   941 -  "movd   %%xmm1,0x0(%3)\n"

   942 -

   943 -"scaledone:"

   944 -  :

   945 -  : "r"(y_buf),  // %0

   946 -    "r"(u_buf),  // %1

   947 -    "r"(v_buf),  // %2

   948 -    "r"(rgb_buf),  // %3

   949 -    "r"(width),  // %4

   950 -    "r" (kCoefficientsRgbY),  // %5

   951 -    "r"(static_cast<long>(source_dx))  // %6

   952 -  : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"

   953 -);

   954 -}

   955 -

   956 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,

   957 -                              const uint8* u_buf,

   958 -                              const uint8* v_buf,

   959 -                              uint8* rgb_buf,

   960 -                              int width,

   961 -                              int source_dx) {

   962 -  asm(

   963 -  "xor    %%r11,%%r11\n"   // x = 0

   964 -  "sub    $0x2,%4\n"

   965 -  "js     .lscalenext\n"

   966 -  "cmp    $0x20000,%6\n"   // if source_dx >= 2.0

   967 -  "jl     .lscalehalf\n"

   968 -  "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less

   969 -".lscalehalf:"

   970 -

   971 -".lscaleloop:"

   972 -  "mov    %%r11,%%r10\n"

   973 -  "sar    $0x11,%%r10\n"

   974 -

   975 -  "movzb  (%1, %%r10, 1), %%r13 \n"

   976 -  "movzb  1(%1, %%r10, 1), %%r14 \n"

   977 -  "mov    %%r11, %%rax \n"

   978 -  "and    $0x1fffe, %%rax \n"

   979 -  "imul   %%rax, %%r14 \n"

   980 -  "xor    $0x1fffe, %%rax \n"

   981 -  "imul   %%rax, %%r13 \n"

   982 -  "add    %%r14, %%r13 \n"

   983 -  "shr    $17, %%r13 \n"

   984 -  "movq   2048(%5,%%r13,8), %%xmm0\n"

   985 -

   986 -  "movzb  (%2, %%r10, 1), %%r13 \n"

   987 -  "movzb  1(%2, %%r10, 1), %%r14 \n"

   988 -  "mov    %%r11, %%rax \n"

   989 -  "and    $0x1fffe, %%rax \n"

   990 -  "imul   %%rax, %%r14 \n"

   991 -  "xor    $0x1fffe, %%rax \n"

   992 -  "imul   %%rax, %%r13 \n"

   993 -  "add    %%r14, %%r13 \n"

   994 -  "shr    $17, %%r13 \n"

   995 -  "movq   4096(%5,%%r13,8), %%xmm1\n"

   996 -

   997 -  "mov    %%r11, %%rax \n"

   998 -  "lea    (%%r11,%6),%%r10\n"

   999 -  "sar    $0x10,%%r11\n"

  1000 -  "paddsw %%xmm1,%%xmm0\n"

  1001 -

  1002 -  "movzb  (%0, %%r11, 1), %%r13 \n"

  1003 -  "movzb  1(%0, %%r11, 1), %%r14 \n"

  1004 -  "and    $0xffff, %%rax \n"

  1005 -  "imul   %%rax, %%r14 \n"

  1006 -  "xor    $0xffff, %%rax \n"

  1007 -  "imul   %%rax, %%r13 \n"

  1008 -  "add    %%r14, %%r13 \n"

  1009 -  "shr    $16, %%r13 \n"

  1010 -  "movq   (%5,%%r13,8),%%xmm1\n"

  1011 -

  1012 -  "mov    %%r10, %%rax \n"

  1013 -  "lea    (%%r10,%6),%%r11\n"

  1014 -  "sar    $0x10,%%r10\n"

  1015 -

  1016 -  "movzb  (%0,%%r10,1), %%r13 \n"

  1017 -  "movzb  1(%0,%%r10,1), %%r14 \n"

  1018 -  "and    $0xffff, %%rax \n"

  1019 -  "imul   %%rax, %%r14 \n"

  1020 -  "xor    $0xffff, %%rax \n"

  1021 -  "imul   %%rax, %%r13 \n"

  1022 -  "add    %%r14, %%r13 \n"

  1023 -  "shr    $16, %%r13 \n"

  1024 -  "movq   (%5,%%r13,8),%%xmm2\n"

  1025 -

  1026 -  "paddsw %%xmm0,%%xmm1\n"

  1027 -  "paddsw %%xmm0,%%xmm2\n"

  1028 -  "shufps $0x44,%%xmm2,%%xmm1\n"

  1029 -  "psraw  $0x6,%%xmm1\n"

  1030 -  "packuswb %%xmm1,%%xmm1\n"

  1031 -  "movq   %%xmm1,0x0(%3)\n"

  1032 -  "add    $0x8,%3\n"

  1033 -  "sub    $0x2,%4\n"

  1034 -  "jns    .lscaleloop\n"

  1035 -

  1036 -".lscalenext:"

  1037 -  "add    $0x1,%4\n"

  1038 -  "js     .lscaledone\n"

  1039 -

  1040 -  "mov    %%r11,%%r10\n"

  1041 -  "sar    $0x11,%%r10\n"

  1042 -

  1043 -  "movzb  (%1,%%r10,1), %%r13 \n"

  1044 -  "movq   2048(%5,%%r13,8),%%xmm0\n"

  1045 -

  1046 -  "movzb  (%2,%%r10,1), %%r13 \n"

  1047 -  "movq   4096(%5,%%r13,8),%%xmm1\n"

  1048 -

  1049 -  "paddsw %%xmm1,%%xmm0\n"

  1050 -  "sar    $0x10,%%r11\n"

  1051 -

  1052 -  "movzb  (%0,%%r11,1), %%r13 \n"

  1053 -  "movq   (%5,%%r13,8),%%xmm1\n"

  1054 -

  1055 -  "paddsw %%xmm0,%%xmm1\n"

  1056 -  "psraw  $0x6,%%xmm1\n"

  1057 -  "packuswb %%xmm1,%%xmm1\n"

  1058 -  "movd   %%xmm1,0x0(%3)\n"

  1059 -

  1060 -".lscaledone:"

  1061 -  :

  1062 -  : "r"(y_buf),  // %0

  1063 -    "r"(u_buf),  // %1

  1064 -    "r"(v_buf),  // %2

  1065 -    "r"(rgb_buf),  // %3

  1066 -    "r"(width),  // %4

  1067 -    "r" (kCoefficientsRgbY),  // %5

  1068 -    "r"(static_cast<long>(source_dx))  // %6

  1069 -  : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"

  1070 -);

  1071 -}

  1072 -

  1073 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)

  1074 -

  1075 -// PIC version is slower because less registers are available, so

  1076 -// non-PIC is used on platforms where it is possible.

  1077 -

  1078 -void FastConvertYUVToRGB32Row(const uint8* y_buf,

  1079 -                              const uint8* u_buf,

  1080 -                              const uint8* v_buf,

  1081 -                              uint8* rgb_buf,

  1082 -                              int width);

  1083 -  asm(

  1084 -  ".text\n"

  1085 -  ".global FastConvertYUVToRGB32Row\n"

  1086 -"FastConvertYUVToRGB32Row:\n"

  1087 -  "pusha\n"

  1088 -  "mov    0x24(%esp),%edx\n"

  1089 -  "mov    0x28(%esp),%edi\n"

  1090 -  "mov    0x2c(%esp),%esi\n"

  1091 -  "mov    0x30(%esp),%ebp\n"

  1092 -  "mov    0x34(%esp),%ecx\n"

  1093 -  "jmp    convertend\n"

  1094 -

  1095 -"convertloop:"

  1096 -  "movzbl (%edi),%eax\n"

  1097 -  "add    $0x1,%edi\n"

  1098 -  "movzbl (%esi),%ebx\n"

  1099 -  "add    $0x1,%esi\n"

  1100 -  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"

  1101 -  "movzbl (%edx),%eax\n"

  1102 -  "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"

  1103 -  "movzbl 0x1(%edx),%ebx\n"

  1104 -  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"

  1105 -  "add    $0x2,%edx\n"

  1106 -  "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"

  1107 -  "paddsw %mm0,%mm1\n"

  1108 -  "paddsw %mm0,%mm2\n"

  1109 -  "psraw  $0x6,%mm1\n"

  1110 -  "psraw  $0x6,%mm2\n"

  1111 -  "packuswb %mm2,%mm1\n"

  1112 -  "movntq %mm1,0x0(%ebp)\n"

  1113 -  "add    $0x8,%ebp\n"

  1114 -"convertend:"

  1115 -  "sub    $0x2,%ecx\n"

  1116 -  "jns    convertloop\n"

  1117 -

  1118 -  "and    $0x1,%ecx\n"

  1119 -  "je     convertdone\n"

  1120 -

  1121 -  "movzbl (%edi),%eax\n"

  1122 -  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"

  1123 -  "movzbl (%esi),%eax\n"

  1124 -  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"

  1125 -  "movzbl (%edx),%eax\n"

  1126 -  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"

  1127 -  "paddsw %mm0,%mm1\n"

  1128 -  "psraw  $0x6,%mm1\n"

  1129 -  "packuswb %mm1,%mm1\n"

  1130 -  "movd   %mm1,0x0(%ebp)\n"

  1131 -"convertdone:"

  1132 -  "popa\n"

  1133 -  "ret\n"

  1134 -);

  1135 -

  1136 -

  1137 -void ScaleYUVToRGB32Row(const uint8* y_buf,

  1138 -                        const uint8* u_buf,

  1139 -                        const uint8* v_buf,

  1140 -                        uint8* rgb_buf,

  1141 -                        int width,

  1142 -                        int source_dx);

  1143 -  asm(

  1144 -  ".text\n"

  1145 -  ".global ScaleYUVToRGB32Row\n"

  1146 -"ScaleYUVToRGB32Row:\n"

  1147 -  "pusha\n"

  1148 -  "mov    0x24(%esp),%edx\n"

  1149 -  "mov    0x28(%esp),%edi\n"

  1150 -  "mov    0x2c(%esp),%esi\n"

  1151 -  "mov    0x30(%esp),%ebp\n"

  1152 -  "mov    0x34(%esp),%ecx\n"

  1153 -  "xor    %ebx,%ebx\n"

  1154 -  "jmp    scaleend\n"

  1155 -

  1156 -"scaleloop:"

  1157 -  "mov    %ebx,%eax\n"

  1158 -  "sar    $0x11,%eax\n"

  1159 -  "movzbl (%edi,%eax,1),%eax\n"

  1160 -  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"

  1161 -  "mov    %ebx,%eax\n"

  1162 -  "sar    $0x11,%eax\n"

  1163 -  "movzbl (%esi,%eax,1),%eax\n"

  1164 -  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"

  1165 -  "mov    %ebx,%eax\n"

  1166 -  "add    0x38(%esp),%ebx\n"

  1167 -  "sar    $0x10,%eax\n"

  1168 -  "movzbl (%edx,%eax,1),%eax\n"

  1169 -  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"

  1170 -  "mov    %ebx,%eax\n"

  1171 -  "add    0x38(%esp),%ebx\n"

  1172 -  "sar    $0x10,%eax\n"

  1173 -  "movzbl (%edx,%eax,1),%eax\n"

  1174 -  "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"

  1175 -  "paddsw %mm0,%mm1\n"

  1176 -  "paddsw %mm0,%mm2\n"

  1177 -  "psraw  $0x6,%mm1\n"

  1178 -  "psraw  $0x6,%mm2\n"

  1179 -  "packuswb %mm2,%mm1\n"

  1180 -  "movntq %mm1,0x0(%ebp)\n"

  1181 -  "add    $0x8,%ebp\n"

  1182 -"scaleend:"

  1183 -  "sub    $0x2,%ecx\n"

  1184 -  "jns    scaleloop\n"

  1185 -

  1186 -  "and    $0x1,%ecx\n"

  1187 -  "je     scaledone\n"

  1188 -

  1189 -  "mov    %ebx,%eax\n"

  1190 -  "sar    $0x11,%eax\n"

  1191 -  "movzbl (%edi,%eax,1),%eax\n"

  1192 -  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"

  1193 -  "mov    %ebx,%eax\n"

  1194 -  "sar    $0x11,%eax\n"

  1195 -  "movzbl (%esi,%eax,1),%eax\n"

  1196 -  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"

  1197 -  "mov    %ebx,%eax\n"

  1198 -  "sar    $0x10,%eax\n"

  1199 -  "movzbl (%edx,%eax,1),%eax\n"

  1200 -  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"

  1201 -  "paddsw %mm0,%mm1\n"

  1202 -  "psraw  $0x6,%mm1\n"

  1203 -  "packuswb %mm1,%mm1\n"

  1204 -  "movd   %mm1,0x0(%ebp)\n"

  1205 -

  1206 -"scaledone:"

  1207 -  "popa\n"

  1208 -  "ret\n"

  1209 -);

  1210 -

  1211 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,

  1212 -                              const uint8* u_buf,

  1213 -                              const uint8* v_buf,

  1214 -                              uint8* rgb_buf,

  1215 -                              int width,

  1216 -                              int source_dx);

  1217 -  asm(

  1218 -  ".text\n"

  1219 -  ".global LinearScaleYUVToRGB32Row\n"

  1220 -"LinearScaleYUVToRGB32Row:\n"

  1221 -  "pusha\n"

  1222 -  "mov    0x24(%esp),%edx\n"

  1223 -  "mov    0x28(%esp),%edi\n"

  1224 -  "mov    0x30(%esp),%ebp\n"

  1225 -

  1226 -  // source_width = width * source_dx + ebx

  1227 -  "mov    0x34(%esp), %ecx\n"

  1228 -  "imull  0x38(%esp), %ecx\n"

  1229 -  "mov    %ecx, 0x34(%esp)\n"

  1230 -

  1231 -  "mov    0x38(%esp), %ecx\n"

  1232 -  "xor    %ebx,%ebx\n"     // x = 0

  1233 -  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0

  1234 -  "jl     .lscaleend\n"

  1235 -  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less

  1236 -  "jmp    .lscaleend\n"

  1237 -

  1238 -".lscaleloop:"

  1239 -  "mov    %ebx,%eax\n"

  1240 -  "sar    $0x11,%eax\n"

  1241 -

  1242 -  "movzbl (%edi,%eax,1),%ecx\n"

  1243 -  "movzbl 1(%edi,%eax,1),%esi\n"

  1244 -  "mov    %ebx,%eax\n"

  1245 -  "andl   $0x1fffe, %eax \n"

  1246 -  "imul   %eax, %esi \n"

  1247 -  "xorl   $0x1fffe, %eax \n"

  1248 -  "imul   %eax, %ecx \n"

  1249 -  "addl   %esi, %ecx \n"

  1250 -  "shrl   $17, %ecx \n"

  1251 -  "movq   kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"

  1252 -

  1253 -  "mov    0x2c(%esp),%esi\n"

  1254 -  "mov    %ebx,%eax\n"

  1255 -  "sar    $0x11,%eax\n"

  1256 -

  1257 -  "movzbl (%esi,%eax,1),%ecx\n"

  1258 -  "movzbl 1(%esi,%eax,1),%esi\n"

  1259 -  "mov    %ebx,%eax\n"

  1260 -  "andl   $0x1fffe, %eax \n"

  1261 -  "imul   %eax, %esi \n"

  1262 -  "xorl   $0x1fffe, %eax \n"

  1263 -  "imul   %eax, %ecx \n"

  1264 -  "addl   %esi, %ecx \n"

  1265 -  "shrl   $17, %ecx \n"

  1266 -  "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"

  1267 -

  1268 -  "mov    %ebx,%eax\n"

  1269 -  "sar    $0x10,%eax\n"

  1270 -  "movzbl (%edx,%eax,1),%ecx\n"

  1271 -  "movzbl 1(%edx,%eax,1),%esi\n"

  1272 -  "mov    %ebx,%eax\n"

  1273 -  "add    0x38(%esp),%ebx\n"

  1274 -  "andl   $0xffff, %eax \n"

  1275 -  "imul   %eax, %esi \n"

  1276 -  "xorl   $0xffff, %eax \n"

  1277 -  "imul   %eax, %ecx \n"

  1278 -  "addl   %esi, %ecx \n"

  1279 -  "shrl   $16, %ecx \n"

  1280 -  "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"

  1281 -

  1282 -  "cmp    0x34(%esp), %ebx\n"

  1283 -  "jge    .lscalelastpixel\n"

  1284 -

  1285 -  "mov    %ebx,%eax\n"

  1286 -  "sar    $0x10,%eax\n"

  1287 -  "movzbl (%edx,%eax,1),%ecx\n"

  1288 -  "movzbl 1(%edx,%eax,1),%esi\n"

  1289 -  "mov    %ebx,%eax\n"

  1290 -  "add    0x38(%esp),%ebx\n"

  1291 -  "andl   $0xffff, %eax \n"

  1292 -  "imul   %eax, %esi \n"

  1293 -  "xorl   $0xffff, %eax \n"

  1294 -  "imul   %eax, %ecx \n"

  1295 -  "addl   %esi, %ecx \n"

  1296 -  "shrl   $16, %ecx \n"

  1297 -  "movq   kCoefficientsRgbY(,%ecx,8),%mm2\n"

  1298 -

  1299 -  "paddsw %mm0,%mm1\n"

  1300 -  "paddsw %mm0,%mm2\n"

  1301 -  "psraw  $0x6,%mm1\n"

  1302 -  "psraw  $0x6,%mm2\n"

  1303 -  "packuswb %mm2,%mm1\n"

  1304 -  "movntq %mm1,0x0(%ebp)\n"

  1305 -  "add    $0x8,%ebp\n"

  1306 -

  1307 -".lscaleend:"

  1308 -  "cmp    0x34(%esp), %ebx\n"

  1309 -  "jl     .lscaleloop\n"

  1310 -  "popa\n"

  1311 -  "ret\n"

  1312 -

  1313 -".lscalelastpixel:"

  1314 -  "paddsw %mm0, %mm1\n"

  1315 -  "psraw $6, %mm1\n"

  1316 -  "packuswb %mm1, %mm1\n"

  1317 -  "movd %mm1, (%ebp)\n"

  1318 -  "popa\n"

  1319 -  "ret\n"

  1320 -);

  1321 -

  1322 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)

  1323 -

  1324 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,

  1325 -                                    const uint8* u_buf,

  1326 -                                    const uint8* v_buf,

  1327 -                                    uint8* rgb_buf,

  1328 -                                    int width,

  1329 -                                    int16 *kCoefficientsRgbY);

  1330 -  asm(

  1331 -  ".text\n"

  1332 -#if defined(OS_MACOSX)

  1333 -"_PICConvertYUVToRGB32Row:\n"

  1334 -#else

  1335 -"PICConvertYUVToRGB32Row:\n"

  1336 -#endif

  1337 -  "pusha\n"

  1338 -  "mov    0x24(%esp),%edx\n"

  1339 -  "mov    0x28(%esp),%edi\n"

  1340 -  "mov    0x2c(%esp),%esi\n"

  1341 -  "mov    0x30(%esp),%ebp\n"

  1342 -  "mov    0x38(%esp),%ecx\n"

  1343 -

  1344 -  "jmp    .Lconvertend\n"

  1345 -

  1346 -".Lconvertloop:"

  1347 -  "movzbl (%edi),%eax\n"

  1348 -  "add    $0x1,%edi\n"

  1349 -  "movzbl (%esi),%ebx\n"

  1350 -  "add    $0x1,%esi\n"

  1351 -  "movq   2048(%ecx,%eax,8),%mm0\n"

  1352 -  "movzbl (%edx),%eax\n"

  1353 -  "paddsw 4096(%ecx,%ebx,8),%mm0\n"

  1354 -  "movzbl 0x1(%edx),%ebx\n"

  1355 -  "movq   0(%ecx,%eax,8),%mm1\n"

  1356 -  "add    $0x2,%edx\n"

  1357 -  "movq   0(%ecx,%ebx,8),%mm2\n"

  1358 -  "paddsw %mm0,%mm1\n"

  1359 -  "paddsw %mm0,%mm2\n"

  1360 -  "psraw  $0x6,%mm1\n"

  1361 -  "psraw  $0x6,%mm2\n"

  1362 -  "packuswb %mm2,%mm1\n"

  1363 -  "movntq %mm1,0x0(%ebp)\n"

  1364 -  "add    $0x8,%ebp\n"

  1365 -".Lconvertend:"

  1366 -  "subl   $0x2,0x34(%esp)\n"

  1367 -  "jns    .Lconvertloop\n"

  1368 -

  1369 -  "andl   $0x1,0x34(%esp)\n"

  1370 -  "je     .Lconvertdone\n"

  1371 -

  1372 -  "movzbl (%edi),%eax\n"

  1373 -  "movq   2048(%ecx,%eax,8),%mm0\n"

  1374 -  "movzbl (%esi),%eax\n"

  1375 -  "paddsw 4096(%ecx,%eax,8),%mm0\n"

  1376 -  "movzbl (%edx),%eax\n"

  1377 -  "movq   0(%ecx,%eax,8),%mm1\n"

  1378 -  "paddsw %mm0,%mm1\n"

  1379 -  "psraw  $0x6,%mm1\n"

  1380 -  "packuswb %mm1,%mm1\n"

  1381 -  "movd   %mm1,0x0(%ebp)\n"

  1382 -".Lconvertdone:\n"

  1383 -  "popa\n"

  1384 -  "ret\n"

  1385 -);

  1386 -

  1387 -void FastConvertYUVToRGB32Row(const uint8* y_buf,

  1388 -                              const uint8* u_buf,

  1389 -                              const uint8* v_buf,

  1390 -                              uint8* rgb_buf,

  1391 -                              int width) {

  1392 -  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,

  1393 -                          &kCoefficientsRgbY[0][0]);

  1394 -}

  1395 -

  1396 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,

  1397 -                               const uint8* u_buf,

  1398 -                               const uint8* v_buf,

  1399 -                               uint8* rgb_buf,

  1400 -                               int width,

  1401 -                               int source_dx,

  1402 -                               int16 *kCoefficientsRgbY);

  1403 -

  1404 -  asm(

  1405 -  ".text\n"

  1406 -#if defined(OS_MACOSX)

  1407 -"_PICScaleYUVToRGB32Row:\n"

  1408 -#else

  1409 -"PICScaleYUVToRGB32Row:\n"

  1410 -#endif

  1411 -  "pusha\n"

  1412 -  "mov    0x24(%esp),%edx\n"

  1413 -  "mov    0x28(%esp),%edi\n"

  1414 -  "mov    0x2c(%esp),%esi\n"

  1415 -  "mov    0x30(%esp),%ebp\n"

  1416 -  "mov    0x3c(%esp),%ecx\n"

  1417 -  "xor    %ebx,%ebx\n"

  1418 -  "jmp    Lscaleend\n"

  1419 -

  1420 -"Lscaleloop:"

  1421 -  "mov    %ebx,%eax\n"

  1422 -  "sar    $0x11,%eax\n"

  1423 -  "movzbl (%edi,%eax,1),%eax\n"

  1424 -  "movq   2048(%ecx,%eax,8),%mm0\n"

  1425 -  "mov    %ebx,%eax\n"

  1426 -  "sar    $0x11,%eax\n"

  1427 -  "movzbl (%esi,%eax,1),%eax\n"

  1428 -  "paddsw 4096(%ecx,%eax,8),%mm0\n"

  1429 -  "mov    %ebx,%eax\n"

  1430 -  "add    0x38(%esp),%ebx\n"

  1431 -  "sar    $0x10,%eax\n"

  1432 -  "movzbl (%edx,%eax,1),%eax\n"

  1433 -  "movq   0(%ecx,%eax,8),%mm1\n"

  1434 -  "mov    %ebx,%eax\n"

  1435 -  "add    0x38(%esp),%ebx\n"

  1436 -  "sar    $0x10,%eax\n"

  1437 -  "movzbl (%edx,%eax,1),%eax\n"

  1438 -  "movq   0(%ecx,%eax,8),%mm2\n"

  1439 -  "paddsw %mm0,%mm1\n"

  1440 -  "paddsw %mm0,%mm2\n"

  1441 -  "psraw  $0x6,%mm1\n"

  1442 -  "psraw  $0x6,%mm2\n"

  1443 -  "packuswb %mm2,%mm1\n"

  1444 -  "movntq %mm1,0x0(%ebp)\n"

  1445 -  "add    $0x8,%ebp\n"

  1446 -"Lscaleend:"

  1447 -  "subl   $0x2,0x34(%esp)\n"

  1448 -  "jns    Lscaleloop\n"

  1449 -

  1450 -  "andl   $0x1,0x34(%esp)\n"

  1451 -  "je     Lscaledone\n"

  1452 -

  1453 -  "mov    %ebx,%eax\n"

  1454 -  "sar    $0x11,%eax\n"

  1455 -  "movzbl (%edi,%eax,1),%eax\n"

  1456 -  "movq   2048(%ecx,%eax,8),%mm0\n"

  1457 -  "mov    %ebx,%eax\n"

  1458 -  "sar    $0x11,%eax\n"

  1459 -  "movzbl (%esi,%eax,1),%eax\n"

  1460 -  "paddsw 4096(%ecx,%eax,8),%mm0\n"

  1461 -  "mov    %ebx,%eax\n"

  1462 -  "sar    $0x10,%eax\n"

  1463 -  "movzbl (%edx,%eax,1),%eax\n"

  1464 -  "movq   0(%ecx,%eax,8),%mm1\n"

  1465 -  "paddsw %mm0,%mm1\n"

  1466 -  "psraw  $0x6,%mm1\n"

  1467 -  "packuswb %mm1,%mm1\n"

  1468 -  "movd   %mm1,0x0(%ebp)\n"

  1469 -

  1470 -"Lscaledone:"

  1471 -  "popa\n"

  1472 -  "ret\n"

  1473 -);

  1474 -

  1475 -

  1476 -void ScaleYUVToRGB32Row(const uint8* y_buf,

  1477 -                        const uint8* u_buf,

  1478 -                        const uint8* v_buf,

  1479 -                        uint8* rgb_buf,

  1480 -                        int width,

  1481 -                        int source_dx) {

  1482 -  PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,

  1483 -                        &kCoefficientsRgbY[0][0]);

  1484 -}

  1485 -

  1486 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,

  1487 -                                 const uint8* u_buf,

  1488 -                                 const uint8* v_buf,

  1489 -                                 uint8* rgb_buf,

  1490 -                                 int width,

  1491 -                                 int source_dx,

  1492 -                                 int16 *kCoefficientsRgbY);

  1493 -  asm(

  1494 -  ".text\n"

  1495 -#if defined(OS_MACOSX)

  1496 -"_PICLinearScaleYUVToRGB32Row:\n"

  1497 -#else

  1498 -"PICLinearScaleYUVToRGB32Row:\n"

  1499 -#endif

  1500 -  "pusha\n"

  1501 -  "mov    0x24(%esp),%edx\n"

  1502 -  "mov    0x30(%esp),%ebp\n"

  1503 -  "mov    0x34(%esp),%ecx\n"

  1504 -  "mov    0x3c(%esp),%edi\n"

  1505 -  "xor    %ebx,%ebx\n"

  1506 -

  1507 -  // source_width = width * source_dx + ebx

  1508 -  "mov    0x34(%esp), %ecx\n"

  1509 -  "imull  0x38(%esp), %ecx\n"

  1510 -  "mov    %ecx, 0x34(%esp)\n"

  1511 -

  1512 -  "mov    0x38(%esp), %ecx\n"

  1513 -  "xor    %ebx,%ebx\n"     // x = 0

  1514 -  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0

  1515 -  "jl     .lscaleend\n"

  1516 -  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less

  1517 -  "jmp    .lscaleend\n"

  1518 -

  1519 -".lscaleloop:"

  1520 -  "mov    0x28(%esp),%esi\n"

  1521 -  "mov    %ebx,%eax\n"

  1522 -  "sar    $0x11,%eax\n"

  1523 -

  1524 -  "movzbl (%esi,%eax,1),%ecx\n"

  1525 -  "movzbl 1(%esi,%eax,1),%esi\n"

  1526 -  "mov    %ebx,%eax\n"

  1527 -  "andl   $0x1fffe, %eax \n"

  1528 -  "imul   %eax, %esi \n"

  1529 -  "xorl   $0x1fffe, %eax \n"

  1530 -  "imul   %eax, %ecx \n"

  1531 -  "addl   %esi, %ecx \n"

  1532 -  "shrl   $17, %ecx \n"

  1533 -  "movq   2048(%edi,%ecx,8),%mm0\n"

  1534 -

  1535 -  "mov    0x2c(%esp),%esi\n"

  1536 -  "mov    %ebx,%eax\n"

  1537 -  "sar    $0x11,%eax\n"

  1538 -

  1539 -  "movzbl (%esi,%eax,1),%ecx\n"

  1540 -  "movzbl 1(%esi,%eax,1),%esi\n"

  1541 -  "mov    %ebx,%eax\n"

  1542 -  "andl   $0x1fffe, %eax \n"

  1543 -  "imul   %eax, %esi \n"

  1544 -  "xorl   $0x1fffe, %eax \n"

  1545 -  "imul   %eax, %ecx \n"

  1546 -  "addl   %esi, %ecx \n"

  1547 -  "shrl   $17, %ecx \n"

  1548 -  "paddsw 4096(%edi,%ecx,8),%mm0\n"

  1549 -

  1550 -  "mov    %ebx,%eax\n"

  1551 -  "sar    $0x10,%eax\n"

  1552 -  "movzbl (%edx,%eax,1),%ecx\n"

  1553 -  "movzbl 1(%edx,%eax,1),%esi\n"

  1554 -  "mov    %ebx,%eax\n"

  1555 -  "add    0x38(%esp),%ebx\n"

  1556 -  "andl   $0xffff, %eax \n"

  1557 -  "imul   %eax, %esi \n"

  1558 -  "xorl   $0xffff, %eax \n"

  1559 -  "imul   %eax, %ecx \n"

  1560 -  "addl   %esi, %ecx \n"

  1561 -  "shrl   $16, %ecx \n"

  1562 -  "movq   (%edi,%ecx,8),%mm1\n"

  1563 -

  1564 -  "cmp    0x34(%esp), %ebx\n"

  1565 -  "jge    .lscalelastpixel\n"

  1566 -

  1567 -  "mov    %ebx,%eax\n"

  1568 -  "sar    $0x10,%eax\n"

  1569 -  "movzbl (%edx,%eax,1),%ecx\n"

  1570 -  "movzbl 1(%edx,%eax,1),%esi\n"

  1571 -  "mov    %ebx,%eax\n"

  1572 -  "add    0x38(%esp),%ebx\n"

  1573 -  "andl   $0xffff, %eax \n"

  1574 -  "imul   %eax, %esi \n"

  1575 -  "xorl   $0xffff, %eax \n"

  1576 -  "imul   %eax, %ecx \n"

  1577 -  "addl   %esi, %ecx \n"

  1578 -  "shrl   $16, %ecx \n"

  1579 -  "movq   (%edi,%ecx,8),%mm2\n"

  1580 -

  1581 -  "paddsw %mm0,%mm1\n"

  1582 -  "paddsw %mm0,%mm2\n"

  1583 -  "psraw  $0x6,%mm1\n"

  1584 -  "psraw  $0x6,%mm2\n"

  1585 -  "packuswb %mm2,%mm1\n"

  1586 -  "movntq %mm1,0x0(%ebp)\n"

  1587 -  "add    $0x8,%ebp\n"

  1588 -

  1589 -".lscaleend:"

  1590 -  "cmp    %ebx, 0x34(%esp)\n"

  1591 -  "jg     .lscaleloop\n"

  1592 -  "popa\n"

  1593 -  "ret\n"

  1594 -

  1595 -".lscalelastpixel:"

  1596 -  "paddsw %mm0, %mm1\n"

  1597 -  "psraw $6, %mm1\n"

  1598 -  "packuswb %mm1, %mm1\n"

  1599 -  "movd %mm1, (%ebp)\n"

  1600 -  "popa\n"

  1601 -  "ret\n"

  1602 -);

  1603 -

  1604 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,

  1605 -                        const uint8* u_buf,

  1606 -                        const uint8* v_buf,

  1607 -                        uint8* rgb_buf,

  1608 -                        int width,

  1609 -                        int source_dx) {

  1610 -  PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,

  1611 -                              &kCoefficientsRgbY[0][0]);

  1612 -}

  1613 -

  1614 -#else  // USE_MMX

  1615 -

  1616  // C reference code that mimic the YUV assembly.

  1617  #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))

  1618  #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \

  1619      (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))

  1621  static inline void YuvPixel(uint8 y,

  1622                              uint8 u,

  1623                              uint8 v,

  1624 @@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y,

  1625    a >>= 6;

  1627    *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |

  1628                                          (packuswb(g) << 8) |

  1629                                          (packuswb(r) << 16) |

  1630                                          (packuswb(a) << 24);

  1631  }

  1633 -void FastConvertYUVToRGB32Row(const uint8* y_buf,

  1634 -                              const uint8* u_buf,

  1635 -                              const uint8* v_buf,

  1636 -                              uint8* rgb_buf,

  1637 -                              int width) {

  1638 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,

  1639 +                                const uint8* u_buf,

  1640 +                                const uint8* v_buf,

  1641 +                                uint8* rgb_buf,

  1642 +                                int width,

  1643 +                                unsigned int x_shift) {  // chroma index is x >> x_shift

  1644    for (int x = 0; x < width; x += 2) {

  1645 -    uint8 u = u_buf[x >> 1];

  1646 -    uint8 v = v_buf[x >> 1];

  1647 +    uint8 u = u_buf[x >> x_shift];  // x_shift == 1 reproduces the old "x >> 1" lookup

  1648 +    uint8 v = v_buf[x >> x_shift];

  1649      uint8 y0 = y_buf[x];

  1650      YuvPixel(y0, u, v, rgb_buf);

  1651      if ((x + 1) < width) {

  1652        uint8 y1 = y_buf[x + 1];

  1653 +      if (x_shift == 0) {  // chroma not subsampled: odd pixel has its own U/V sample

  1654 +        u = u_buf[x + 1];

  1655 +        v = v_buf[x + 1];

  1656 +      }

  1657        YuvPixel(y1, u, v, rgb_buf + 4);

  1658      }

  1659      rgb_buf += 8;  // Advance 2 pixels.

  1660    }

  1661  }

  1663  // 16.16 fixed point is used.  A shift by 16 isolates the integer.

  1664  // A shift by 17 is used to further subsample the chrominence channels.

  1665  // & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,

  1666  // for 1/65536 pixel accurate interpolation.

  1667 -void ScaleYUVToRGB32Row(const uint8* y_buf,

  1668 -                        const uint8* u_buf,

  1669 -                        const uint8* v_buf,

  1670 -                        uint8* rgb_buf,

  1671 -                        int width,

  1672 -                        int source_dx) {

  1673 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,

  1674 +                          const uint8* u_buf,

  1675 +                          const uint8* v_buf,

  1676 +                          uint8* rgb_buf,

  1677 +                          int width,

  1678 +                          int source_dx) {  // added to x per output pixel; x >> 16 indexes Y, x >> 17 indexes U/V

  1679    int x = 0;

  1680    for (int i = 0; i < width; i += 2) {

  1681      int y = y_buf[x >> 16];

  1682      int u = u_buf[(x >> 17)];

  1683      int v = v_buf[(x >> 17)];

  1684      YuvPixel(y, u, v, rgb_buf);

  1685      x += source_dx;

  1686      if ((i + 1) < width) {

  1687        y = y_buf[x >> 16];

  1688        YuvPixel(y, u, v, rgb_buf+4);

  1689        x += source_dx;

  1690      }

  1691      rgb_buf += 8;

  1692    }

  1693  }

  1695 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,

  1696 -                              const uint8* u_buf,

  1697 -                              const uint8* v_buf,

  1698 -                              uint8* rgb_buf,

  1699 -                              int width,

  1700 -                              int source_dx) {

  1701 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,

  1702 +                                const uint8* u_buf,

  1703 +                                const uint8* v_buf,

  1704 +                                uint8* rgb_buf,

  1705 +                                int width,

  1706 +                                int source_dx) {

  1707    int x = 0;

  1708    if (source_dx >= 0x20000) {

  1709      x = 32768;

  1710    }

  1711    for (int i = 0; i < width; i += 2) {

  1712      int y0 = y_buf[x >> 16];

  1713      int y1 = y_buf[(x >> 16) + 1];

  1714      int u0 = u_buf[(x >> 17)];

  1715 @@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint

  1716        y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;

  1717        YuvPixel(y, u, v, rgb_buf+4);

  1718        x += source_dx;

  1719      }

  1720      rgb_buf += 8;

  1721    }

  1722  }

  1724 -#endif  // USE_MMX

  1725  }  // extern "C"

  1727 diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp

  1728 --- a/gfx/ycbcr/yuv_row_posix.cpp

  1729 +++ b/gfx/ycbcr/yuv_row_posix.cpp

  1730 @@ -1,33 +1,32 @@

  1731  // Copyright (c) 2010 The Chromium Authors. All rights reserved.

  1732  // Use of this source code is governed by a BSD-style license that can be

  1733  // found in the LICENSE file.

  1735 -#include "media/base/yuv_row.h"

  1736 -

  1737 -#ifdef _DEBUG

  1738 -#include "base/logging.h"

  1739 -#else

  1740 +#include "yuv_row.h"

  1741 +#include "mozilla/SSE.h"

  1742 +

  1743  #define DCHECK(a)

  1744 -#endif

  1746  extern "C" {

  1748 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)

  1749 +#if defined(ARCH_CPU_X86_64)

  1750 +

  1751 +// We don't need CPUID guards here, since x86-64 implies SSE2.

  1753  // AMD64 ABI uses register paremters.

  1754  void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi

  1755                                const uint8* u_buf,  // rsi

  1756                                const uint8* v_buf,  // rdx

  1757                                uint8* rgb_buf,      // rcx

  1758                                int width) {         // r8

  1759    asm(

  1760 -  "jmp    convertend\n"

  1761 -"convertloop:"

  1762 +  "jmp    1f\n"

  1763 +"0:"

  1764    "movzb  (%1),%%r10\n"

  1765    "add    $0x1,%1\n"

  1766    "movzb  (%2),%%r11\n"

  1767    "add    $0x1,%2\n"

  1768    "movq   2048(%5,%%r10,8),%%xmm0\n"

  1769    "movzb  (%0),%%r10\n"

  1770    "movq   4096(%5,%%r11,8),%%xmm1\n"

  1771    "movzb  0x1(%0),%%r11\n"

  1772 @@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint

  1773    "movq   (%5,%%r11,8),%%xmm3\n"

  1774    "paddsw %%xmm0,%%xmm2\n"

  1775    "paddsw %%xmm0,%%xmm3\n"

  1776    "shufps $0x44,%%xmm3,%%xmm2\n"

  1777    "psraw  $0x6,%%xmm2\n"

  1778    "packuswb %%xmm2,%%xmm2\n"

  1779    "movq   %%xmm2,0x0(%3)\n"

  1780    "add    $0x8,%3\n"

  1781 -"convertend:"

  1782 +"1:"

  1783    "sub    $0x2,%4\n"

  1784 -  "jns    convertloop\n"

  1785 -

  1786 -"convertnext:"

  1787 +  "jns    0b\n"

  1788 +

  1789 +"2:"

  1790    "add    $0x1,%4\n"

  1791 -  "js     convertdone\n"

  1792 +  "js     3f\n"

  1794    "movzb  (%1),%%r10\n"

  1795    "movq   2048(%5,%%r10,8),%%xmm0\n"

  1796    "movzb  (%2),%%r10\n"

  1797    "movq   4096(%5,%%r10,8),%%xmm1\n"

  1798    "paddsw %%xmm1,%%xmm0\n"

  1799    "movzb  (%0),%%r10\n"

  1800    "movq   (%5,%%r10,8),%%xmm1\n"

  1801    "paddsw %%xmm0,%%xmm1\n"

  1802    "psraw  $0x6,%%xmm1\n"

  1803    "packuswb %%xmm1,%%xmm1\n"

  1804    "movd   %%xmm1,0x0(%3)\n"

  1805 -"convertdone:"

  1806 +"3:"

  1807    :

  1808    : "r"(y_buf),  // %0

  1809      "r"(u_buf),  // %1

  1810      "r"(v_buf),  // %2

  1811      "r"(rgb_buf),  // %3

  1812      "r"(width),  // %4

  1813      "r" (kCoefficientsRgbY)  // %5

  1814    : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"

  1815 @@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b

  1816                          const uint8* u_buf,  // rsi

  1817                          const uint8* v_buf,  // rdx

  1818                          uint8* rgb_buf,      // rcx

  1819                          int width,           // r8

  1820                          int source_dx) {     // r9

  1821    asm(

  1822    "xor    %%r11,%%r11\n"

  1823    "sub    $0x2,%4\n"

  1824 -  "js     scalenext\n"

  1825 -

  1826 -"scaleloop:"

  1827 +  "js     1f\n"

  1828 +

  1829 +"0:"

  1830    "mov    %%r11,%%r10\n"

  1831    "sar    $0x11,%%r10\n"

  1832    "movzb  (%1,%%r10,1),%%rax\n"

  1833    "movq   2048(%5,%%rax,8),%%xmm0\n"

  1834    "movzb  (%2,%%r10,1),%%rax\n"

  1835    "movq   4096(%5,%%rax,8),%%xmm1\n"

  1836    "lea    (%%r11,%6),%%r10\n"

  1837    "sar    $0x10,%%r11\n"

  1838 @@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b

  1839    "paddsw %%xmm0,%%xmm1\n"

  1840    "paddsw %%xmm0,%%xmm2\n"

  1841    "shufps $0x44,%%xmm2,%%xmm1\n"

  1842    "psraw  $0x6,%%xmm1\n"

  1843    "packuswb %%xmm1,%%xmm1\n"

  1844    "movq   %%xmm1,0x0(%3)\n"

  1845    "add    $0x8,%3\n"

  1846    "sub    $0x2,%4\n"

  1847 -  "jns    scaleloop\n"

  1848 -

  1849 -"scalenext:"

  1850 +  "jns    0b\n"

  1851 +

  1852 +"1:"

  1853    "add    $0x1,%4\n"

  1854 -  "js     scaledone\n"

  1855 +  "js     2f\n"

  1857    "mov    %%r11,%%r10\n"

  1858    "sar    $0x11,%%r10\n"

  1859    "movzb  (%1,%%r10,1),%%rax\n"

  1860    "movq   2048(%5,%%rax,8),%%xmm0\n"

  1861    "movzb  (%2,%%r10,1),%%rax\n"

  1862    "movq   4096(%5,%%rax,8),%%xmm1\n"

  1863    "paddsw %%xmm1,%%xmm0\n"

  1864    "sar    $0x10,%%r11\n"

  1865    "movzb  (%0,%%r11,1),%%rax\n"

  1866    "movq   (%5,%%rax,8),%%xmm1\n"

  1867    "paddsw %%xmm0,%%xmm1\n"

  1868    "psraw  $0x6,%%xmm1\n"

  1869    "packuswb %%xmm1,%%xmm1\n"

  1870    "movd   %%xmm1,0x0(%3)\n"

  1872 -"scaledone:"

  1873 +"2:"

  1874    :

  1875    : "r"(y_buf),  // %0

  1876      "r"(u_buf),  // %1

  1877      "r"(v_buf),  // %2

  1878      "r"(rgb_buf),  // %3

  1879      "r"(width),  // %4

  1880      "r" (kCoefficientsRgbY),  // %5

  1881      "r"(static_cast<long>(source_dx))  // %6

  1882 @@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint

  1883                                const uint8* u_buf,

  1884                                const uint8* v_buf,

  1885                                uint8* rgb_buf,

  1886                                int width,

  1887                                int source_dx) {

  1888    asm(

  1889    "xor    %%r11,%%r11\n"   // x = 0

  1890    "sub    $0x2,%4\n"

  1891 -  "js     .lscalenext\n"

  1892 +  "js     2f\n"

  1893    "cmp    $0x20000,%6\n"   // if source_dx >= 2.0

  1894 -  "jl     .lscalehalf\n"

  1895 +  "jl     0f\n"

  1896    "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less

  1897 -".lscalehalf:"

  1898 -

  1899 -".lscaleloop:"

  1900 +"0:"

  1901 +

  1902 +"1:"

  1903    "mov    %%r11,%%r10\n"

  1904    "sar    $0x11,%%r10\n"

  1906    "movzb  (%1, %%r10, 1), %%r13 \n"

  1907    "movzb  1(%1, %%r10, 1), %%r14 \n"

  1908    "mov    %%r11, %%rax \n"

  1909    "and    $0x1fffe, %%rax \n"

  1910    "imul   %%rax, %%r14 \n"

  1911 @@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint

  1912    "paddsw %%xmm0,%%xmm1\n"

  1913    "paddsw %%xmm0,%%xmm2\n"

  1914    "shufps $0x44,%%xmm2,%%xmm1\n"

  1915    "psraw  $0x6,%%xmm1\n"

  1916    "packuswb %%xmm1,%%xmm1\n"

  1917    "movq   %%xmm1,0x0(%3)\n"

  1918    "add    $0x8,%3\n"

  1919    "sub    $0x2,%4\n"

  1920 -  "jns    .lscaleloop\n"

  1921 -

  1922 -".lscalenext:"

  1923 +  "jns    1b\n"

  1924 +

  1925 +"2:"

  1926    "add    $0x1,%4\n"

  1927 -  "js     .lscaledone\n"

  1928 +  "js     3f\n"

  1930    "mov    %%r11,%%r10\n"

  1931    "sar    $0x11,%%r10\n"

  1933    "movzb  (%1,%%r10,1), %%r13 \n"

  1934    "movq   2048(%5,%%r13,8),%%xmm0\n"

  1936    "movzb  (%2,%%r10,1), %%r13 \n"

  1937 @@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint

  1938    "movzb  (%0,%%r11,1), %%r13 \n"

  1939    "movq   (%5,%%r13,8),%%xmm1\n"

  1941    "paddsw %%xmm0,%%xmm1\n"

  1942    "psraw  $0x6,%%xmm1\n"

  1943    "packuswb %%xmm1,%%xmm1\n"

  1944    "movd   %%xmm1,0x0(%3)\n"

  1946 -".lscaledone:"

  1947 +"3:"

  1948    :

  1949    : "r"(y_buf),  // %0

  1950      "r"(u_buf),  // %1

  1951      "r"(v_buf),  // %2

  1952      "r"(rgb_buf),  // %3

  1953      "r"(width),  // %4

  1954      "r" (kCoefficientsRgbY),  // %5

  1955      "r"(static_cast<long>(source_dx))  // %6

  1956    : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"

  1957  );

  1958  }

  1960 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)

  1961 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)

  1963  // PIC version is slower because less registers are available, so

  1964  // non-PIC is used on platforms where it is possible.

  1965 -

  1966 -void FastConvertYUVToRGB32Row(const uint8* y_buf,

  1967 -                              const uint8* u_buf,

  1968 -                              const uint8* v_buf,

  1969 -                              uint8* rgb_buf,

  1970 -                              int width);

  1971 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,

  1972 +                                  const uint8* u_buf,

  1973 +                                  const uint8* v_buf,

  1974 +                                  uint8* rgb_buf,

  1975 +                                  int width);

  1976    asm(

  1977    ".text\n"

  1978 -  ".global FastConvertYUVToRGB32Row\n"

  1979 -"FastConvertYUVToRGB32Row:\n"

  1980 +  ".global FastConvertYUVToRGB32Row_SSE\n"

  1981 +  ".type FastConvertYUVToRGB32Row_SSE, @function\n"

  1982 +"FastConvertYUVToRGB32Row_SSE:\n"

  1983    "pusha\n"

  1984    "mov    0x24(%esp),%edx\n"

  1985    "mov    0x28(%esp),%edi\n"

  1986    "mov    0x2c(%esp),%esi\n"

  1987    "mov    0x30(%esp),%ebp\n"

  1988    "mov    0x34(%esp),%ecx\n"

  1989 -  "jmp    convertend\n"

  1990 -

  1991 -"convertloop:"

  1992 +  "jmp    1f\n"

  1993 +

  1994 +"0:"

  1995    "movzbl (%edi),%eax\n"

  1996    "add    $0x1,%edi\n"

  1997    "movzbl (%esi),%ebx\n"

  1998    "add    $0x1,%esi\n"

  1999    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"

  2000    "movzbl (%edx),%eax\n"

  2001    "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"

  2002    "movzbl 0x1(%edx),%ebx\n"

  2003 @@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint

  2004    "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"

  2005    "paddsw %mm0,%mm1\n"

  2006    "paddsw %mm0,%mm2\n"

  2007    "psraw  $0x6,%mm1\n"

  2008    "psraw  $0x6,%mm2\n"

  2009    "packuswb %mm2,%mm1\n"

  2010    "movntq %mm1,0x0(%ebp)\n"

  2011    "add    $0x8,%ebp\n"

  2012 -"convertend:"

  2013 +"1:"

  2014    "sub    $0x2,%ecx\n"

  2015 -  "jns    convertloop\n"

  2016 +  "jns    0b\n"

  2018    "and    $0x1,%ecx\n"

  2019 -  "je     convertdone\n"

  2020 +  "je     2f\n"

  2022    "movzbl (%edi),%eax\n"

  2023    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"

  2024    "movzbl (%esi),%eax\n"

  2025    "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"

  2026    "movzbl (%edx),%eax\n"

  2027    "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"

  2028    "paddsw %mm0,%mm1\n"

  2029    "psraw  $0x6,%mm1\n"

  2030    "packuswb %mm1,%mm1\n"

  2031    "movd   %mm1,0x0(%ebp)\n"

  2032 -"convertdone:"

  2033 +"2:"

  2034    "popa\n"

  2035    "ret\n"

  2036 +#if !defined(XP_MACOSX)

  2037 +  ".previous\n"

  2038 +#endif

  2039  );

  2041 -

  2042 -void ScaleYUVToRGB32Row(const uint8* y_buf,

  2043 -                        const uint8* u_buf,

  2044 -                        const uint8* v_buf,

  2045 -                        uint8* rgb_buf,

  2046 -                        int width,

  2047 -                        int source_dx);

  2048 +void FastConvertYUVToRGB32Row(const uint8* y_buf,

  2049 +                              const uint8* u_buf,

  2050 +                              const uint8* v_buf,

  2051 +                              uint8* rgb_buf,

  2052 +                              int width)

  2053 +{  // Dispatcher: use the asm _SSE row routine when supports_sse() is true, else the C version.

  2054 +  if (mozilla::supports_sse()) {

  2055 +    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);

  2056 +    return;  // SSE path done; skip the C fallback below.

  2057 +  }

  2058 +

  2059 +  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);  // x_shift = 1: one U/V sample per two pixels

  2060 +}

  2061 +

  2062 +

  2063 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,

  2064 +                            const uint8* u_buf,

  2065 +                            const uint8* v_buf,

  2066 +                            uint8* rgb_buf,

  2067 +                            int width,

  2068 +                            int source_dx);

  2069    asm(

  2070    ".text\n"

  2071 -  ".global ScaleYUVToRGB32Row\n"

  2072 -"ScaleYUVToRGB32Row:\n"

  2073 +  ".global ScaleYUVToRGB32Row_SSE\n"

  2074 +  ".type ScaleYUVToRGB32Row_SSE, @function\n"

  2075 +"ScaleYUVToRGB32Row_SSE:\n"

  2076    "pusha\n"

  2077    "mov    0x24(%esp),%edx\n"

  2078    "mov    0x28(%esp),%edi\n"

  2079    "mov    0x2c(%esp),%esi\n"

  2080    "mov    0x30(%esp),%ebp\n"

  2081    "mov    0x34(%esp),%ecx\n"

  2082    "xor    %ebx,%ebx\n"

  2083 -  "jmp    scaleend\n"

  2084 -

  2085 -"scaleloop:"

  2086 +  "jmp    1f\n"

  2087 +

  2088 +"0:"

  2089    "mov    %ebx,%eax\n"

  2090    "sar    $0x11,%eax\n"

  2091    "movzbl (%edi,%eax,1),%eax\n"

  2092    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"

  2093    "mov    %ebx,%eax\n"

  2094    "sar    $0x11,%eax\n"

  2095    "movzbl (%esi,%eax,1),%eax\n"

  2096    "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"

  2097 @@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b

  2098    "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"

  2099    "paddsw %mm0,%mm1\n"

  2100    "paddsw %mm0,%mm2\n"

  2101    "psraw  $0x6,%mm1\n"

  2102    "psraw  $0x6,%mm2\n"

  2103    "packuswb %mm2,%mm1\n"

  2104    "movntq %mm1,0x0(%ebp)\n"

  2105    "add    $0x8,%ebp\n"

  2106 -"scaleend:"

  2107 +"1:"

  2108    "sub    $0x2,%ecx\n"

  2109 -  "jns    scaleloop\n"

  2110 +  "jns    0b\n"

  2112    "and    $0x1,%ecx\n"

  2113 -  "je     scaledone\n"

  2114 +  "je     2f\n"

  2116    "mov    %ebx,%eax\n"

  2117    "sar    $0x11,%eax\n"

  2118    "movzbl (%edi,%eax,1),%eax\n"

  2119    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"

  2120    "mov    %ebx,%eax\n"

  2121    "sar    $0x11,%eax\n"

  2122    "movzbl (%esi,%eax,1),%eax\n"

  2123 @@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b

  2124    "sar    $0x10,%eax\n"

  2125    "movzbl (%edx,%eax,1),%eax\n"

  2126    "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"

  2127    "paddsw %mm0,%mm1\n"

  2128    "psraw  $0x6,%mm1\n"

  2129    "packuswb %mm1,%mm1\n"

  2130    "movd   %mm1,0x0(%ebp)\n"

  2132 -"scaledone:"

  2133 +"2:"

  2134    "popa\n"

  2135    "ret\n"

  2136 +#if !defined(XP_MACOSX)

  2137 +  ".previous\n"

  2138 +#endif

  2139  );

  2141 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,

  2142 -                              const uint8* u_buf,

  2143 -                              const uint8* v_buf,

  2144 -                              uint8* rgb_buf,

  2145 -                              int width,

  2146 -                              int source_dx);

  2147 +void ScaleYUVToRGB32Row(const uint8* y_buf,

  2148 +                        const uint8* u_buf,

  2149 +                        const uint8* v_buf,

  2150 +                        uint8* rgb_buf,

  2151 +                        int width,

  2152 +                        int source_dx)

  2153 +{  // Dispatcher: use the asm _SSE row routine when supports_sse() is true, else the C version.

  2154 +  if (mozilla::supports_sse()) {

  2155 +    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,

  2156 +                           width, source_dx);

  2157 +    return;  // Row already converted; do not fall through and redo it in C.

  2158 +  }

  2159 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,

  2160 +                       width, source_dx);

  2161 +}

  2162 +

  2163 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,

  2164 +                                  const uint8* u_buf,

  2165 +                                  const uint8* v_buf,

  2166 +                                  uint8* rgb_buf,

  2167 +                                  int width,

  2168 +                                  int source_dx);

  2169    asm(

  2170    ".text\n"

  2171 -  ".global LinearScaleYUVToRGB32Row\n"

  2172 -"LinearScaleYUVToRGB32Row:\n"

  2173 +  ".global LinearScaleYUVToRGB32Row_SSE\n"

  2174 +  ".type LinearScaleYUVToRGB32Row_SSE, @function\n"

  2175 +"LinearScaleYUVToRGB32Row_SSE:\n"

  2176    "pusha\n"

  2177    "mov    0x24(%esp),%edx\n"

  2178    "mov    0x28(%esp),%edi\n"

  2179    "mov    0x30(%esp),%ebp\n"

  2181    // source_width = width * source_dx + ebx

  2182    "mov    0x34(%esp), %ecx\n"

  2183    "imull  0x38(%esp), %ecx\n"

  2184    "mov    %ecx, 0x34(%esp)\n"

  2186    "mov    0x38(%esp), %ecx\n"

  2187    "xor    %ebx,%ebx\n"     // x = 0

  2188    "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0

  2189 -  "jl     .lscaleend\n"

  2190 +  "jl     1f\n"

  2191    "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less

  2192 -  "jmp    .lscaleend\n"

  2193 -

  2194 -".lscaleloop:"

  2195 -  "mov    %ebx,%eax\n"

  2196 -  "sar    $0x11,%eax\n"

  2197 +  "jmp    1f\n"

  2198 +

  2199 +"0:"

  2200 +  "mov    %ebx,%eax\n"

  2201 +  "sar    $0x11,%eax\n"

  2203    "movzbl (%edi,%eax,1),%ecx\n"

  2204    "movzbl 1(%edi,%eax,1),%esi\n"

  2205    "mov    %ebx,%eax\n"

  2206    "andl   $0x1fffe, %eax \n"

  2207    "imul   %eax, %esi \n"

  2208    "xorl   $0x1fffe, %eax \n"

  2209    "imul   %eax, %ecx \n"

  2210 @@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint

  2211    "imul   %eax, %esi \n"

  2212    "xorl   $0xffff, %eax \n"

  2213    "imul   %eax, %ecx \n"

  2214    "addl   %esi, %ecx \n"

  2215    "shrl   $16, %ecx \n"

  2216    "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"

  2218    "cmp    0x34(%esp), %ebx\n"

  2219 -  "jge    .lscalelastpixel\n"

  2220 +  "jge    2f\n"

  2222    "mov    %ebx,%eax\n"

  2223    "sar    $0x10,%eax\n"

  2224    "movzbl (%edx,%eax,1),%ecx\n"

  2225    "movzbl 1(%edx,%eax,1),%esi\n"

  2226    "mov    %ebx,%eax\n"

  2227    "add    0x38(%esp),%ebx\n"

  2228    "andl   $0xffff, %eax \n"

  2229 @@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint

  2230    "paddsw %mm0,%mm1\n"

  2231    "paddsw %mm0,%mm2\n"

  2232    "psraw  $0x6,%mm1\n"

  2233    "psraw  $0x6,%mm2\n"

  2234    "packuswb %mm2,%mm1\n"

  2235    "movntq %mm1,0x0(%ebp)\n"

  2236    "add    $0x8,%ebp\n"

  2238 -".lscaleend:"

  2239 +"1:"

  2240    "cmp    0x34(%esp), %ebx\n"

  2241 -  "jl     .lscaleloop\n"

  2242 +  "jl     0b\n"

  2243    "popa\n"

  2244    "ret\n"

  2246 -".lscalelastpixel:"

  2247 +"2:"

  2248    "paddsw %mm0, %mm1\n"

  2249    "psraw $6, %mm1\n"

  2250    "packuswb %mm1, %mm1\n"

  2251    "movd %mm1, (%ebp)\n"

  2252    "popa\n"

  2253    "ret\n"

  2254 +#if !defined(XP_MACOSX)

  2255 +  ".previous\n"

  2256 +#endif

  2257  );

  2259 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)

  2260 -

  2261 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,

  2262 -                                    const uint8* u_buf,

  2263 -                                    const uint8* v_buf,

  2264 -                                    uint8* rgb_buf,

  2265 -                                    int width,

  2266 -                                    int16 *kCoefficientsRgbY);

  2267 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,

  2268 +                              const uint8* u_buf,

  2269 +                              const uint8* v_buf,

  2270 +                              uint8* rgb_buf,

  2271 +                              int width,

  2272 +                              int source_dx)

  2273 +{  // Dispatcher: use the asm _SSE row routine when supports_sse() is true, else the C version.

  2274 +  if (mozilla::supports_sse()) {

  2275 +    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,

  2276 +                                 width, source_dx);

  2277 +    return;  // Row already converted; do not fall through and redo it in C.

  2278 +  }

  2279 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,

  2280 +                             width, source_dx);

  2281 +}

  2282 +

  2283 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)

  2284 +

  2285 +void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,

  2286 +                                 const uint8* u_buf,

  2287 +                                 const uint8* v_buf,

  2288 +                                 uint8* rgb_buf,

  2289 +                                 int width,

  2290 +                                 int16 *kCoefficientsRgbY);

  2291 +

  2292    asm(

  2293    ".text\n"

  2294 -#if defined(OS_MACOSX)

  2295 -"_PICConvertYUVToRGB32Row:\n"

  2296 +#if defined(XP_MACOSX)

  2297 +"_PICConvertYUVToRGB32Row_SSE:\n"

  2298  #else

  2299 -"PICConvertYUVToRGB32Row:\n"

  2300 +"PICConvertYUVToRGB32Row_SSE:\n"

  2301  #endif

  2302    "pusha\n"

  2303    "mov    0x24(%esp),%edx\n"

  2304    "mov    0x28(%esp),%edi\n"

  2305    "mov    0x2c(%esp),%esi\n"

  2306    "mov    0x30(%esp),%ebp\n"

  2307    "mov    0x38(%esp),%ecx\n"

  2309 -  "jmp    .Lconvertend\n"

  2310 -

  2311 -".Lconvertloop:"

  2312 +  "jmp    1f\n"

  2313 +

  2314 +"0:"

  2315    "movzbl (%edi),%eax\n"

  2316    "add    $0x1,%edi\n"

  2317    "movzbl (%esi),%ebx\n"

  2318    "add    $0x1,%esi\n"

  2319    "movq   2048(%ecx,%eax,8),%mm0\n"

  2320    "movzbl (%edx),%eax\n"

  2321    "paddsw 4096(%ecx,%ebx,8),%mm0\n"

  2322    "movzbl 0x1(%edx),%ebx\n"

  2323 @@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons

  2324    "movq   0(%ecx,%ebx,8),%mm2\n"

  2325    "paddsw %mm0,%mm1\n"

  2326    "paddsw %mm0,%mm2\n"

  2327    "psraw  $0x6,%mm1\n"

  2328    "psraw  $0x6,%mm2\n"

  2329    "packuswb %mm2,%mm1\n"

  2330    "movntq %mm1,0x0(%ebp)\n"

  2331    "add    $0x8,%ebp\n"

  2332 -".Lconvertend:"

  2333 +"1:"

  2334    "subl   $0x2,0x34(%esp)\n"

  2335 -  "jns    .Lconvertloop\n"

  2336 +  "jns    0b\n"

  2338    "andl   $0x1,0x34(%esp)\n"

  2339 -  "je     .Lconvertdone\n"

  2340 +  "je     2f\n"

  2342    "movzbl (%edi),%eax\n"

  2343    "movq   2048(%ecx,%eax,8),%mm0\n"

  2344    "movzbl (%esi),%eax\n"

  2345    "paddsw 4096(%ecx,%eax,8),%mm0\n"

  2346    "movzbl (%edx),%eax\n"

  2347    "movq   0(%ecx,%eax,8),%mm1\n"

  2348    "paddsw %mm0,%mm1\n"

  2349    "psraw  $0x6,%mm1\n"

  2350    "packuswb %mm1,%mm1\n"

  2351    "movd   %mm1,0x0(%ebp)\n"

  2352 -".Lconvertdone:\n"

  2353 +"2:"

  2354    "popa\n"

  2355    "ret\n"

  2356 +#if !defined(XP_MACOSX)

  2357 +  ".previous\n"

  2358 +#endif

  2359  );

  2361  void FastConvertYUVToRGB32Row(const uint8* y_buf,

  2362                                const uint8* u_buf,

  2363                                const uint8* v_buf,

  2364                                uint8* rgb_buf,

  2365 -                              int width) {

  2366 -  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,

  2367 -                          &kCoefficientsRgbY[0][0]);

  2368 -}

  2369 -

  2370 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,

  2371 +                              int width)

  2372 +{  // Dispatcher: use the PIC asm _SSE row routine when supports_sse() is true, else the C version.

  2373 +  if (mozilla::supports_sse()) {

  2374 +    PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,

  2375 +                                &kCoefficientsRgbY[0][0]);  // table address is passed in; the asm loads it into %ecx

  2376 +    return;  // SSE path done; skip the C fallback below.

  2377 +  }

  2378 +

  2379 +  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);  // x_shift = 1: one U/V sample per two pixels

  2380 +}

  2381 +

  2382 +void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,

  2383                                 const uint8* u_buf,

  2384                                 const uint8* v_buf,

  2385                                 uint8* rgb_buf,

  2386                                 int width,

  2387                                 int source_dx,

  2388                                 int16 *kCoefficientsRgbY);

  2390    asm(

  2391    ".text\n"

  2392 -#if defined(OS_MACOSX)

  2393 -"_PICScaleYUVToRGB32Row:\n"

  2394 +#if defined(XP_MACOSX)

  2395 +"_PICScaleYUVToRGB32Row_SSE:\n"

  2396  #else

  2397 -"PICScaleYUVToRGB32Row:\n"

  2398 +"PICScaleYUVToRGB32Row_SSE:\n"

  2399  #endif

  2400    "pusha\n"

  2401    "mov    0x24(%esp),%edx\n"

  2402    "mov    0x28(%esp),%edi\n"

  2403    "mov    0x2c(%esp),%esi\n"

  2404    "mov    0x30(%esp),%ebp\n"

  2405    "mov    0x3c(%esp),%ecx\n"

  2406    "xor    %ebx,%ebx\n"

  2407 -  "jmp    Lscaleend\n"

  2408 -

  2409 -"Lscaleloop:"

  2410 +  "jmp    1f\n"

  2411 +

  2412 +"0:"

  2413    "mov    %ebx,%eax\n"

  2414    "sar    $0x11,%eax\n"

  2415    "movzbl (%edi,%eax,1),%eax\n"

  2416    "movq   2048(%ecx,%eax,8),%mm0\n"

  2417    "mov    %ebx,%eax\n"

  2418    "sar    $0x11,%eax\n"

  2419    "movzbl (%esi,%eax,1),%eax\n"

  2420    "paddsw 4096(%ecx,%eax,8),%mm0\n"

  2421 @@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const

  2422    "movq   0(%ecx,%eax,8),%mm2\n"

  2423    "paddsw %mm0,%mm1\n"

  2424    "paddsw %mm0,%mm2\n"

  2425    "psraw  $0x6,%mm1\n"

  2426    "psraw  $0x6,%mm2\n"

  2427    "packuswb %mm2,%mm1\n"

  2428    "movntq %mm1,0x0(%ebp)\n"

  2429    "add    $0x8,%ebp\n"

  2430 -"Lscaleend:"

  2431 +"1:"

  2432    "subl   $0x2,0x34(%esp)\n"

  2433 -  "jns    Lscaleloop\n"

  2434 +  "jns    0b\n"

  2436    "andl   $0x1,0x34(%esp)\n"

  2437 -  "je     Lscaledone\n"

  2438 +  "je     2f\n"

  2440    "mov    %ebx,%eax\n"

  2441    "sar    $0x11,%eax\n"

  2442    "movzbl (%edi,%eax,1),%eax\n"

  2443    "movq   2048(%ecx,%eax,8),%mm0\n"

  2444    "mov    %ebx,%eax\n"

  2445    "sar    $0x11,%eax\n"

  2446    "movzbl (%esi,%eax,1),%eax\n"

  2447 @@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const

  2448    "sar    $0x10,%eax\n"

  2449    "movzbl (%edx,%eax,1),%eax\n"

  2450    "movq   0(%ecx,%eax,8),%mm1\n"

  2451    "paddsw %mm0,%mm1\n"

  2452    "psraw  $0x6,%mm1\n"

  2453    "packuswb %mm1,%mm1\n"

  2454    "movd   %mm1,0x0(%ebp)\n"

  2456 -"Lscaledone:"

  2457 +"2:"

  2458    "popa\n"

  2459    "ret\n"

  2460 +#if !defined(XP_MACOSX)

  2461 +  ".previous\n"

  2462 +#endif

  2463  );

  2465 -

  2466  void ScaleYUVToRGB32Row(const uint8* y_buf,

  2467                          const uint8* u_buf,

  2468                          const uint8* v_buf,

  2469                          uint8* rgb_buf,

  2470                          int width,

  2471 -                        int source_dx) {

  2472 -  PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,

  2473 -                        &kCoefficientsRgbY[0][0]);

  2474 -}

  2475 -

  2476 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,

  2477 -                                 const uint8* u_buf,

  2478 -                                 const uint8* v_buf,

  2479 -                                 uint8* rgb_buf,

  2480 -                                 int width,

  2481 -                                 int source_dx,

  2482 -                                 int16 *kCoefficientsRgbY);

  2483 +                        int source_dx)

  2484 +{  // Dispatcher: use the PIC asm _SSE row routine when supports_sse() is true, else the C version.

  2485 +  if (mozilla::supports_sse()) {

  2486 +    PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,

  2487 +                              &kCoefficientsRgbY[0][0]);  // table address is passed in; the asm loads it into %ecx

  2488 +    return;  // SSE path done; skip the C fallback below.

  2489 +  }

  2490 +

  2491 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);

  2492 +}

  2493 +

  2494 +void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,

  2495 +                                     const uint8* u_buf,

  2496 +                                     const uint8* v_buf,

  2497 +                                     uint8* rgb_buf,

  2498 +                                     int width,

  2499 +                                     int source_dx,

  2500 +                                     int16 *kCoefficientsRgbY);

  2501 +

  2502    asm(

  2503    ".text\n"

  2504 -#if defined(OS_MACOSX)

  2505 -"_PICLinearScaleYUVToRGB32Row:\n"

  2506 +#if defined(XP_MACOSX)

  2507 +"_PICLinearScaleYUVToRGB32Row_SSE:\n"

  2508  #else

  2509 -"PICLinearScaleYUVToRGB32Row:\n"

  2510 +"PICLinearScaleYUVToRGB32Row_SSE:\n"

  2511  #endif

  2512    "pusha\n"

  2513    "mov    0x24(%esp),%edx\n"

  2514    "mov    0x30(%esp),%ebp\n"

  2515    "mov    0x34(%esp),%ecx\n"

  2516    "mov    0x3c(%esp),%edi\n"

  2517    "xor    %ebx,%ebx\n"

  2519    // source_width = width * source_dx + ebx

  2520    "mov    0x34(%esp), %ecx\n"

  2521    "imull  0x38(%esp), %ecx\n"

  2522    "mov    %ecx, 0x34(%esp)\n"

  2524    "mov    0x38(%esp), %ecx\n"

  2525    "xor    %ebx,%ebx\n"     // x = 0

  2526    "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0

  2527 -  "jl     .lscaleend\n"

  2528 +  "jl     1f\n"

  2529    "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less

  2530 -  "jmp    .lscaleend\n"

  2531 -

  2532 -".lscaleloop:"

  2533 +  "jmp    1f\n"

  2534 +

  2535 +"0:"

  2536    "mov    0x28(%esp),%esi\n"

  2537    "mov    %ebx,%eax\n"

  2538    "sar    $0x11,%eax\n"

  2540    "movzbl (%esi,%eax,1),%ecx\n"

  2541    "movzbl 1(%esi,%eax,1),%esi\n"

  2542    "mov    %ebx,%eax\n"

  2543    "andl   $0x1fffe, %eax \n"

  2544 @@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u

  2545    "imul   %eax, %esi \n"

  2546    "xorl   $0xffff, %eax \n"

  2547    "imul   %eax, %ecx \n"

  2548    "addl   %esi, %ecx \n"

  2549    "shrl   $16, %ecx \n"

  2550    "movq   (%edi,%ecx,8),%mm1\n"

  2552    "cmp    0x34(%esp), %ebx\n"

  2553 -  "jge    .lscalelastpixel\n"

  2554 +  "jge    2f\n"

  2556    "mov    %ebx,%eax\n"

  2557    "sar    $0x10,%eax\n"

  2558    "movzbl (%edx,%eax,1),%ecx\n"

  2559    "movzbl 1(%edx,%eax,1),%esi\n"

  2560    "mov    %ebx,%eax\n"

  2561    "add    0x38(%esp),%ebx\n"

  2562    "andl   $0xffff, %eax \n"

  2563 @@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u

  2564    "paddsw %mm0,%mm1\n"

  2565    "paddsw %mm0,%mm2\n"

  2566    "psraw  $0x6,%mm1\n"

  2567    "psraw  $0x6,%mm2\n"

  2568    "packuswb %mm2,%mm1\n"

  2569    "movntq %mm1,0x0(%ebp)\n"

  2570    "add    $0x8,%ebp\n"

  2572 -".lscaleend:"

  2573 +"1:"

  2574    "cmp    %ebx, 0x34(%esp)\n"

  2575 -  "jg     .lscaleloop\n"

  2576 +  "jg     0b\n"

  2577    "popa\n"

  2578    "ret\n"

  2580 -".lscalelastpixel:"

  2581 +"2:"

  2582    "paddsw %mm0, %mm1\n"

  2583    "psraw $6, %mm1\n"

  2584    "packuswb %mm1, %mm1\n"

  2585    "movd %mm1, (%ebp)\n"

  2586    "popa\n"

  2587    "ret\n"

  2588 +#if !defined(XP_MACOSX)

  2589 +  ".previous\n"

  2590 +#endif

  2591  );

  2593 +

  2594  void LinearScaleYUVToRGB32Row(const uint8* y_buf,

  2595 -                        const uint8* u_buf,

  2596 -                        const uint8* v_buf,

  2597 -                        uint8* rgb_buf,

  2598 -                        int width,

  2599 -                        int source_dx) {

  2600 -  PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,

  2601 -                              &kCoefficientsRgbY[0][0]);

  2602 -}

  2603 -

  2604 -#else  // USE_MMX

  2605 -

  2606 -// C reference code that mimic the YUV assembly.

  2607 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))

  2608 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \

  2609 -    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))

  2610 -

  2611 -static inline void YuvPixel(uint8 y,

  2612 -                            uint8 u,

  2613 -                            uint8 v,

  2614 -                            uint8* rgb_buf) {

  2615 -

  2616 -  int b = kCoefficientsRgbY[256+u][0];

  2617 -  int g = kCoefficientsRgbY[256+u][1];

  2618 -  int r = kCoefficientsRgbY[256+u][2];

  2619 -  int a = kCoefficientsRgbY[256+u][3];

  2620 -

  2621 -  b = paddsw(b, kCoefficientsRgbY[512+v][0]);

  2622 -  g = paddsw(g, kCoefficientsRgbY[512+v][1]);

  2623 -  r = paddsw(r, kCoefficientsRgbY[512+v][2]);

  2624 -  a = paddsw(a, kCoefficientsRgbY[512+v][3]);

  2625 -

  2626 -  b = paddsw(b, kCoefficientsRgbY[y][0]);

  2627 -  g = paddsw(g, kCoefficientsRgbY[y][1]);

  2628 -  r = paddsw(r, kCoefficientsRgbY[y][2]);

  2629 -  a = paddsw(a, kCoefficientsRgbY[y][3]);

  2630 -

  2631 -  b >>= 6;

  2632 -  g >>= 6;

  2633 -  r >>= 6;

  2634 -  a >>= 6;

  2635 -

  2636 -  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |

  2637 -                                        (packuswb(g) << 8) |

  2638 -                                        (packuswb(r) << 16) |

  2639 -                                        (packuswb(a) << 24);

  2640 -}

  2641 -

  2642 +                              const uint8* u_buf,

  2643 +                              const uint8* v_buf,

  2644 +                              uint8* rgb_buf,

  2645 +                              int width,

  2646 +                              int source_dx)

  2647 +{

  2648 +  if (mozilla::supports_sse()) {

  2649 +    PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,

  2650 +                                    source_dx, &kCoefficientsRgbY[0][0]);

  2651 +    return;

  2652 +  }

  2653 +

  2654 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);

  2655 +}

  2656 +#else

  2657  void FastConvertYUVToRGB32Row(const uint8* y_buf,

  2658                                const uint8* u_buf,

  2659                                const uint8* v_buf,

  2660                                uint8* rgb_buf,

  2661                                int width) {

  2662 -  for (int x = 0; x < width; x += 2) {

  2663 -    uint8 u = u_buf[x >> 1];

  2664 -    uint8 v = v_buf[x >> 1];

  2665 -    uint8 y0 = y_buf[x];

  2666 -    YuvPixel(y0, u, v, rgb_buf);

  2667 -    if ((x + 1) < width) {

  2668 -      uint8 y1 = y_buf[x + 1];

  2669 -      YuvPixel(y1, u, v, rgb_buf + 4);

  2670 -    }

  2671 -    rgb_buf += 8;  // Advance 2 pixels.

  2672 -  }

  2673 -}

  2674 -

  2675 -// 16.16 fixed point is used.  A shift by 16 isolates the integer.

  2676 -// A shift by 17 is used to further subsample the chrominence channels.

  2677 -// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,

  2678 -// for 1/65536 pixel accurate interpolation.

  2679 +  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);

  2680 +}

  2681 +

  2682  void ScaleYUVToRGB32Row(const uint8* y_buf,

  2683                          const uint8* u_buf,

  2684                          const uint8* v_buf,

  2685                          uint8* rgb_buf,

  2686                          int width,

  2687                          int source_dx) {

  2688 -  int x = 0;

  2689 -  for (int i = 0; i < width; i += 2) {

  2690 -    int y = y_buf[x >> 16];

  2691 -    int u = u_buf[(x >> 17)];

  2692 -    int v = v_buf[(x >> 17)];

  2693 -    YuvPixel(y, u, v, rgb_buf);

  2694 -    x += source_dx;

  2695 -    if ((i + 1) < width) {

  2696 -      y = y_buf[x >> 16];

  2697 -      YuvPixel(y, u, v, rgb_buf+4);

  2698 -      x += source_dx;

  2699 -    }

  2700 -    rgb_buf += 8;

  2701 -  }

  2702 -}

  2703 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);

  2704 +}

  2706  void LinearScaleYUVToRGB32Row(const uint8* y_buf,

  2707                                const uint8* u_buf,

  2708                                const uint8* v_buf,

  2709                                uint8* rgb_buf,

  2710                                int width,

  2711                                int source_dx) {

  2712 -  int x = 0;

  2713 -  if (source_dx >= 0x20000) {

  2714 -    x = 32768;

  2715 -  }

  2716 -  for (int i = 0; i < width; i += 2) {

  2717 -    int y0 = y_buf[x >> 16];

  2718 -    int y1 = y_buf[(x >> 16) + 1];

  2719 -    int u0 = u_buf[(x >> 17)];

  2720 -    int u1 = u_buf[(x >> 17) + 1];

  2721 -    int v0 = v_buf[(x >> 17)];

  2722 -    int v1 = v_buf[(x >> 17) + 1];

  2723 -    int y_frac = (x & 65535);

  2724 -    int uv_frac = ((x >> 1) & 65535);

  2725 -    int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;

  2726 -    int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;

  2727 -    int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;

  2728 -    YuvPixel(y, u, v, rgb_buf);

  2729 -    x += source_dx;

  2730 -    if ((i + 1) < width) {

  2731 -      y0 = y_buf[x >> 16];

  2732 -      y1 = y_buf[(x >> 16) + 1];

  2733 -      y_frac = (x & 65535);

  2734 -      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;

  2735 -      YuvPixel(y, u, v, rgb_buf+4);

  2736 -      x += source_dx;

  2737 -    }

  2738 -    rgb_buf += 8;

  2739 -  }

  2740 -}

  2741 -

  2742 -#endif  // USE_MMX

  2743 -}  // extern "C"

  2744 -

  2745 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);

  2746 +}

  2747 +#endif

  2748 +

  2749 +}

  2750 diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp

  2751 --- a/gfx/ycbcr/yuv_row_table.cpp

  2752 +++ b/gfx/ycbcr/yuv_row_table.cpp

  2753 @@ -1,13 +1,13 @@

  2754  // Copyright (c) 2010 The Chromium Authors. All rights reserved.

  2755  // Use of this source code is governed by a BSD-style license that can be

  2756  // found in the LICENSE file.

  2758 -#include "media/base/yuv_row.h"

  2759 +#include "yuv_row.h"

  2761  extern "C" {

  2763  #define RGBY(i) { \

  2764    static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \

  2765    static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \

  2766    static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \

  2767    0 \

  2768 diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp

  2769 --- a/gfx/ycbcr/yuv_row_win.cpp

  2770 +++ b/gfx/ycbcr/yuv_row_win.cpp

  2771 @@ -1,26 +1,27 @@

  2772  // Copyright (c) 2010 The Chromium Authors. All rights reserved.

  2773  // Use of this source code is governed by a BSD-style license that can be

  2774  // found in the LICENSE file.

  2776 -#include "media/base/yuv_row.h"

  2777 +#include "yuv_row.h"

  2778 +#include "mozilla/SSE.h"

  2780  #define kCoefficientsRgbU kCoefficientsRgbY + 2048

  2781  #define kCoefficientsRgbV kCoefficientsRgbY + 4096

  2783  extern "C" {

  2785 -#if USE_MMX

  2786 -__declspec(naked)

  2787 -void FastConvertYUVToRGB32Row(const uint8* y_buf,

  2788 -                              const uint8* u_buf,

  2789 -                              const uint8* v_buf,

  2790 -                              uint8* rgb_buf,

  2791 -                              int width) {

  2792 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)

  2793 +__declspec(naked)

  2794 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,

  2795 +                                  const uint8* u_buf,

  2796 +                                  const uint8* v_buf,

  2797 +                                  uint8* rgb_buf,

  2798 +                                  int width) {

  2799    __asm {

  2800      pushad

  2801      mov       edx, [esp + 32 + 4]   // Y

  2802      mov       edi, [esp + 32 + 8]   // U

  2803      mov       esi, [esp + 32 + 12]  // V

  2804      mov       ebp, [esp + 32 + 16]  // rgb

  2805      mov       ecx, [esp + 32 + 20]  // width

  2806      jmp       convertend

  2807 @@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint

  2808   convertdone :

  2810      popad

  2811      ret

  2812    }

  2813  }

  2815  __declspec(naked)

  2816 -void ConvertYUVToRGB32Row(const uint8* y_buf,

  2817 -                          const uint8* u_buf,

  2818 -                          const uint8* v_buf,

  2819 -                          uint8* rgb_buf,

  2820 -                          int width,

  2821 -                          int step) {

  2822 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,

  2823 +                              const uint8* u_buf,

  2824 +                              const uint8* v_buf,

  2825 +                              uint8* rgb_buf,

  2826 +                              int width,

  2827 +                              int step) {

  2828    __asm {

  2829      pushad

  2830      mov       edx, [esp + 32 + 4]   // Y

  2831      mov       edi, [esp + 32 + 8]   // U

  2832      mov       esi, [esp + 32 + 12]  // V

  2833      mov       ebp, [esp + 32 + 16]  // rgb

  2834      mov       ecx, [esp + 32 + 20]  // width

  2835      mov       ebx, [esp + 32 + 24]  // step

  2836 @@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y

  2837   wdone :

  2839      popad

  2840      ret

  2841    }

  2842  }

  2844  __declspec(naked)

  2845 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,

  2846 -                                const uint8* u_buf,

  2847 -                                const uint8* v_buf,

  2848 -                                uint8* rgb_buf,

  2849 -                                int width,

  2850 -                                int ystep,

  2851 -                                int uvstep) {

  2852 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,

  2853 +                                    const uint8* u_buf,

  2854 +                                    const uint8* v_buf,

  2855 +                                    uint8* rgb_buf,

  2856 +                                    int width,

  2857 +                                    int ystep,

  2858 +                                    int uvstep) {

  2859    __asm {

  2860      pushad

  2861      mov       edx, [esp + 32 + 4]   // Y

  2862      mov       edi, [esp + 32 + 8]   // U

  2863      mov       esi, [esp + 32 + 12]  // V

  2864      mov       ebp, [esp + 32 + 16]  // rgb

  2865      mov       ecx, [esp + 32 + 20]  // width

  2866      jmp       wend

  2867 @@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui

  2868   wdone :

  2870      popad

  2871      ret

  2872    }

  2873  }

  2875  __declspec(naked)

  2876 -void DoubleYUVToRGB32Row(const uint8* y_buf,

  2877 -                         const uint8* u_buf,

  2878 -                         const uint8* v_buf,

  2879 -                         uint8* rgb_buf,

  2880 -                         int width) {

  2881 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,

  2882 +                             const uint8* u_buf,

  2883 +                             const uint8* v_buf,

  2884 +                             uint8* rgb_buf,

  2885 +                             int width) {

  2886    __asm {

  2887      pushad

  2888      mov       edx, [esp + 32 + 4]   // Y

  2889      mov       edi, [esp + 32 + 8]   // U

  2890      mov       esi, [esp + 32 + 12]  // V

  2891      mov       ebp, [esp + 32 + 16]  // rgb

  2892      mov       ecx, [esp + 32 + 20]  // width

  2893      jmp       wend

  2894 @@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_

  2895      jns       wloop1

  2896   wdone :

  2897      popad

  2898      ret

  2899    }

  2900  }

  2902  // This version does general purpose scaling by any amount, up or down.

  2903 -// The only thing it can not do it rotation by 90 or 270.

  2904 -// For performance the chroma is under sampled, reducing cost of a 3x

  2905 +// The only thing it cannot do is rotation by 90 or 270.

  2906 +// For performance the chroma is under-sampled, reducing cost of a 3x

  2907  // 1080p scale from 8.4 ms to 5.4 ms.

  2908  __declspec(naked)

  2909 -void ScaleYUVToRGB32Row(const uint8* y_buf,

  2910 -                        const uint8* u_buf,

  2911 -                        const uint8* v_buf,

  2912 -                        uint8* rgb_buf,

  2913 -                        int width,

  2914 -                        int source_dx) {

  2915 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,

  2916 +                            const uint8* u_buf,

  2917 +                            const uint8* v_buf,

  2918 +                            uint8* rgb_buf,

  2919 +                            int width,

  2920 +                            int source_dx) {

  2921    __asm {

  2922      pushad

  2923      mov       edx, [esp + 32 + 4]   // Y

  2924      mov       edi, [esp + 32 + 8]   // U

  2925      mov       esi, [esp + 32 + 12]  // V

  2926      mov       ebp, [esp + 32 + 16]  // rgb

  2927      mov       ecx, [esp + 32 + 20]  // width

  2928      xor       ebx, ebx              // x

  2929 @@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b

  2931   scaledone :

  2932      popad

  2933      ret

  2934    }

  2935  }

  2937  __declspec(naked)

  2938 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,

  2939 -                              const uint8* u_buf,

  2940 -                              const uint8* v_buf,

  2941 -                              uint8* rgb_buf,

  2942 -                              int width,

  2943 -                              int source_dx) {

  2944 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,

  2945 +                                  const uint8* u_buf,

  2946 +                                  const uint8* v_buf,

  2947 +                                  uint8* rgb_buf,

  2948 +                                  int width,

  2949 +                                  int source_dx) {

  2950    __asm {

  2951      pushad

  2952      mov       edx, [esp + 32 + 4]  // Y

  2953      mov       edi, [esp + 32 + 8]  // U

  2954                  // [esp + 32 + 12] // V

  2955      mov       ebp, [esp + 32 + 16] // rgb

  2956      mov       ecx, [esp + 32 + 20] // width

  2957      imul      ecx, [esp + 32 + 24] // source_dx

  2958 @@ -438,152 +439,60 @@ lscalelastpixel:

  2959      paddsw    mm1, mm0

  2960      psraw     mm1, 6

  2961      packuswb  mm1, mm1

  2962      movd      [ebp], mm1

  2963      popad

  2964      ret

  2965    };

  2966  }

  2967 -#else  // USE_MMX

  2968 -

  2969 -// C reference code that mimic the YUV assembly.

  2970 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))

  2971 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \

  2972 -    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))

  2973 -

  2974 -static inline void YuvPixel(uint8 y,

  2975 -                            uint8 u,

  2976 -                            uint8 v,

  2977 -                            uint8* rgb_buf) {

  2978 -

  2979 -  int b = kCoefficientsRgbY[256+u][0];

  2980 -  int g = kCoefficientsRgbY[256+u][1];

  2981 -  int r = kCoefficientsRgbY[256+u][2];

  2982 -  int a = kCoefficientsRgbY[256+u][3];

  2983 -

  2984 -  b = paddsw(b, kCoefficientsRgbY[512+v][0]);

  2985 -  g = paddsw(g, kCoefficientsRgbY[512+v][1]);

  2986 -  r = paddsw(r, kCoefficientsRgbY[512+v][2]);

  2987 -  a = paddsw(a, kCoefficientsRgbY[512+v][3]);

  2988 -

  2989 -  b = paddsw(b, kCoefficientsRgbY[y][0]);

  2990 -  g = paddsw(g, kCoefficientsRgbY[y][1]);

  2991 -  r = paddsw(r, kCoefficientsRgbY[y][2]);

  2992 -  a = paddsw(a, kCoefficientsRgbY[y][3]);

  2993 -

  2994 -  b >>= 6;

  2995 -  g >>= 6;

  2996 -  r >>= 6;

  2997 -  a >>= 6;

  2998 -

  2999 -  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |

  3000 -                                        (packuswb(g) << 8) |

  3001 -                                        (packuswb(r) << 16) |

  3002 -                                        (packuswb(a) << 24);

  3003 -}

  3004 -

  3005 -#if TEST_MMX_YUV

  3006 -static inline void YuvPixel(uint8 y,

  3007 -                            uint8 u,

  3008 -                            uint8 v,

  3009 -                            uint8* rgb_buf) {

  3010 -

  3011 -  __asm {

  3012 -    movzx     eax, u

  3013 -    movq      mm0, [kCoefficientsRgbY+2048 + 8 * eax]

  3014 -    movzx     eax, v

  3015 -    paddsw    mm0, [kCoefficientsRgbY+4096 + 8 * eax]

  3016 -    movzx     eax, y

  3017 -    movq      mm1, [kCoefficientsRgbY + 8 * eax]

  3018 -    paddsw    mm1, mm0

  3019 -    psraw     mm1, 6

  3020 -    packuswb  mm1, mm1

  3021 -    mov       eax, rgb_buf

  3022 -    movd      [eax], mm1

  3023 -    emms

  3024 -  }

  3025 -}

  3026 -#endif

  3027 +#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)

  3029  void FastConvertYUVToRGB32Row(const uint8* y_buf,

  3030                                const uint8* u_buf,

  3031                                const uint8* v_buf,

  3032                                uint8* rgb_buf,

  3033                                int width) {

  3034 -  for (int x = 0; x < width; x += 2) {

  3035 -    uint8 u = u_buf[x >> 1];

  3036 -    uint8 v = v_buf[x >> 1];

  3037 -    uint8 y0 = y_buf[x];

  3038 -    YuvPixel(y0, u, v, rgb_buf);

  3039 -    if ((x + 1) < width) {

  3040 -      uint8 y1 = y_buf[x + 1];

  3041 -      YuvPixel(y1, u, v, rgb_buf + 4);

  3042 -    }

  3043 -    rgb_buf += 8;  // Advance 2 pixels.

  3044 -  }

  3045 -}

  3046 -

  3047 -// 16.16 fixed point is used.  A shift by 16 isolates the integer.

  3048 -// A shift by 17 is used to further subsample the chrominence channels.

  3049 -// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,

  3050 -// for 1/65536 pixel accurate interpolation.

  3051 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)

  3052 +  if (mozilla::supports_sse()) {

  3053 +    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);

  3054 +    return;

  3055 +  }

  3056 +#endif

  3057 +

  3058 +  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);

  3059 +}

  3060 +

  3061  void ScaleYUVToRGB32Row(const uint8* y_buf,

  3062                          const uint8* u_buf,

  3063                          const uint8* v_buf,

  3064                          uint8* rgb_buf,

  3065                          int width,

  3066                          int source_dx) {

  3067 -  int x = 0;

  3068 -  for (int i = 0; i < width; i += 2) {

  3069 -    int y = y_buf[x >> 16];

  3070 -    int u = u_buf[(x >> 17)];

  3071 -    int v = v_buf[(x >> 17)];

  3072 -    YuvPixel(y, u, v, rgb_buf);

  3073 -    x += source_dx;

  3074 -    if ((i + 1) < width) {

  3075 -      y = y_buf[x >> 16];

  3076 -      YuvPixel(y, u, v, rgb_buf+4);

  3077 -      x += source_dx;

  3078 -    }

  3079 -    rgb_buf += 8;

  3080 -  }

  3081 -}

  3082 +

  3083 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)

  3084 +  if (mozilla::supports_sse()) {

  3085 +    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);

  3086 +    return;

  3087 +  }

  3088 +#endif

  3089 +

  3090 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);

  3091 +}

  3093  void LinearScaleYUVToRGB32Row(const uint8* y_buf,

  3094                                const uint8* u_buf,

  3095                                const uint8* v_buf,

  3096                                uint8* rgb_buf,

  3097                                int width,

  3098                                int source_dx) {

  3099 -  int x = 0;

  3100 -  if (source_dx >= 0x20000) {

  3101 -    x = 32768;

  3102 -  }

  3103 -  for (int i = 0; i < width; i += 2) {

  3104 -    int y0 = y_buf[x >> 16];

  3105 -    int y1 = y_buf[(x >> 16) + 1];

  3106 -    int u0 = u_buf[(x >> 17)];

  3107 -    int u1 = u_buf[(x >> 17) + 1];

  3108 -    int v0 = v_buf[(x >> 17)];

  3109 -    int v1 = v_buf[(x >> 17) + 1];

  3110 -    int y_frac = (x & 65535);

  3111 -    int uv_frac = ((x >> 1) & 65535);

  3112 -    int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;

  3113 -    int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;

  3114 -    int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;

  3115 -    YuvPixel(y, u, v, rgb_buf);

  3116 -    x += source_dx;

  3117 -    if ((i + 1) < width) {

  3118 -      y0 = y_buf[x >> 16];

  3119 -      y1 = y_buf[(x >> 16) + 1];

  3120 -      y_frac = (x & 65535);

  3121 -      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;

  3122 -      YuvPixel(y, u, v, rgb_buf+4);

  3123 -      x += source_dx;

  3124 -    }

  3125 -    rgb_buf += 8;

  3126 -  }

  3127 -}

  3128 -

  3129 -#endif  // USE_MMX

  3130 -}  // extern "C"

  3131 -

  3132 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)

  3133 +  if (mozilla::supports_sse()) {

  3134 +    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,

  3135 +                                 source_dx);

  3136 +    return;

  3137 +  }

  3138 +#endif

  3139 +

  3140 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);

  3141 +}

  3142 +

  3143 +} // extern "C"

The Tor Browser / file revision

gfx/ycbcr/convert.patch@6474c204b198

gfx/ycbcr/convert.patch