gfx/ycbcr/convert.patch

Tue, 06 Jan 2015 21:39:09 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Tue, 06 Jan 2015 21:39:09 +0100
branch
TOR_BUG_9701
changeset 8
97036ab72558
permissions
-rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate.
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
michael@0 2 --- a/gfx/ycbcr/yuv_convert.cpp
michael@0 3 +++ b/gfx/ycbcr/yuv_convert.cpp
michael@0 4 @@ -6,145 +6,102 @@
michael@0 5 // http://www.fourcc.org/yuv.php
michael@0 6 // The actual conversion is best described here
michael@0 7 // http://en.wikipedia.org/wiki/YUV
michael@0 8 // An article on optimizing YUV conversion using tables instead of multiplies
michael@0 9 // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
michael@0 10 //
michael@0 11 // YV12 is a full plane of Y and a half height, half width chroma planes
michael@0 12 // YV16 is a full plane of Y and a full height, half width chroma planes
michael@0 13 +// YV24 is a full plane of Y and a full height, full width chroma planes
michael@0 14 //
michael@0 15 // ARGB pixel format is output, which on little endian is stored as BGRA.
michael@0 16 // The alpha is set to 255, allowing the application to use RGBA or RGB32.
michael@0 17
michael@0 18 -#include "media/base/yuv_convert.h"
michael@0 19 +#include "yuv_convert.h"
michael@0 20
michael@0 21 // Header for low level row functions.
michael@0 22 -#include "media/base/yuv_row.h"
michael@0 23 -
michael@0 24 -#if USE_MMX
michael@0 25 -#if defined(_MSC_VER)
michael@0 26 -#include <intrin.h>
michael@0 27 -#else
michael@0 28 -#include <mmintrin.h>
michael@0 29 -#endif
michael@0 30 -#endif
michael@0 31 -
michael@0 32 -#if USE_SSE2
michael@0 33 -#include <emmintrin.h>
michael@0 34 -#endif
michael@0 35 -
michael@0 36 -namespace media {
michael@0 37 -
michael@0 38 +#include "yuv_row.h"
michael@0 39 +#include "mozilla/SSE.h"
michael@0 40 +
michael@0 41 +namespace mozilla {
michael@0 42 +
michael@0 43 +namespace gfx {
michael@0 44 +
michael@0 45 // 16.16 fixed point arithmetic
michael@0 46 const int kFractionBits = 16;
michael@0 47 const int kFractionMax = 1 << kFractionBits;
michael@0 48 const int kFractionMask = ((1 << kFractionBits) - 1);
michael@0 49
michael@0 50 // Convert a frame of YUV to 32 bit ARGB.
michael@0 51 -void ConvertYUVToRGB32(const uint8* y_buf,
michael@0 52 - const uint8* u_buf,
michael@0 53 - const uint8* v_buf,
michael@0 54 - uint8* rgb_buf,
michael@0 55 - int width,
michael@0 56 - int height,
michael@0 57 - int y_pitch,
michael@0 58 - int uv_pitch,
michael@0 59 - int rgb_pitch,
michael@0 60 - YUVType yuv_type) {
michael@0 61 - unsigned int y_shift = yuv_type;
michael@0 62 - for (int y = 0; y < height; ++y) {
michael@0 63 - uint8* rgb_row = rgb_buf + y * rgb_pitch;
michael@0 64 - const uint8* y_ptr = y_buf + y * y_pitch;
michael@0 65 - const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch;
michael@0 66 - const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch;
michael@0 67 -
michael@0 68 - FastConvertYUVToRGB32Row(y_ptr,
michael@0 69 - u_ptr,
michael@0 70 - v_ptr,
michael@0 71 - rgb_row,
michael@0 72 - width);
michael@0 73 - }
michael@0 74 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
michael@0 75 + const uint8* u_buf,
michael@0 76 + const uint8* v_buf,
michael@0 77 + uint8* rgb_buf,
michael@0 78 + int pic_x,
michael@0 79 + int pic_y,
michael@0 80 + int pic_width,
michael@0 81 + int pic_height,
michael@0 82 + int y_pitch,
michael@0 83 + int uv_pitch,
michael@0 84 + int rgb_pitch,
michael@0 85 + YUVType yuv_type) {
michael@0 86 + unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
michael@0 87 + unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
michael@0 88 + // Test for SSE because the optimized code uses movntq, which is not part of MMX.
michael@0 89 + bool has_sse = supports_mmx() && supports_sse();
michael@0 90 + // There is no optimized YV24 SSE routine so we check for this and
michael@0 91 + // fall back to the C code.
michael@0 92 + has_sse &= yuv_type != YV24;
michael@0 93 + bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0;
michael@0 94 + int x_width = odd_pic_x ? pic_width - 1 : pic_width;
michael@0 95 +
michael@0 96 + for (int y = pic_y; y < pic_height + pic_y; ++y) {
michael@0 97 + uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch;
michael@0 98 + const uint8* y_ptr = y_buf + y * y_pitch + pic_x;
michael@0 99 + const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
michael@0 100 + const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
michael@0 101 +
michael@0 102 + if (odd_pic_x) {
michael@0 103 + // Handle the single odd pixel manually and use the
michael@0 104 + // fast routines for the remaining pixels.
michael@0 105 + FastConvertYUVToRGB32Row_C(y_ptr++,
michael@0 106 + u_ptr++,
michael@0 107 + v_ptr++,
michael@0 108 + rgb_row,
michael@0 109 + 1,
michael@0 110 + x_shift);
michael@0 111 + rgb_row += 4;
michael@0 112 + }
michael@0 113 +
michael@0 114 + if (has_sse) {
michael@0 115 + FastConvertYUVToRGB32Row(y_ptr,
michael@0 116 + u_ptr,
michael@0 117 + v_ptr,
michael@0 118 + rgb_row,
michael@0 119 + x_width);
michael@0 120 + }
michael@0 121 + else {
michael@0 122 + FastConvertYUVToRGB32Row_C(y_ptr,
michael@0 123 + u_ptr,
michael@0 124 + v_ptr,
michael@0 125 + rgb_row,
michael@0 126 + x_width,
michael@0 127 + x_shift);
michael@0 128 + }
michael@0 129 + }
michael@0 130
michael@0 131 // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
michael@0 132 - EMMS();
michael@0 133 -}
michael@0 134 -
michael@0 135 -#if USE_SSE2
michael@0 136 -// FilterRows combines two rows of the image using linear interpolation.
michael@0 137 -// SSE2 version does 16 pixels at a time
michael@0 138 -
michael@0 139 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
michael@0 140 - int source_width, int source_y_fraction) {
michael@0 141 - __m128i zero = _mm_setzero_si128();
michael@0 142 - __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
michael@0 143 - __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
michael@0 144 -
michael@0 145 - const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
michael@0 146 - const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
michael@0 147 - __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
michael@0 148 - __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
michael@0 149 -
michael@0 150 - do {
michael@0 151 - __m128i y0 = _mm_loadu_si128(y0_ptr128);
michael@0 152 - __m128i y1 = _mm_loadu_si128(y1_ptr128);
michael@0 153 - __m128i y2 = _mm_unpackhi_epi8(y0, zero);
michael@0 154 - __m128i y3 = _mm_unpackhi_epi8(y1, zero);
michael@0 155 - y0 = _mm_unpacklo_epi8(y0, zero);
michael@0 156 - y1 = _mm_unpacklo_epi8(y1, zero);
michael@0 157 - y0 = _mm_mullo_epi16(y0, y0_fraction);
michael@0 158 - y1 = _mm_mullo_epi16(y1, y1_fraction);
michael@0 159 - y2 = _mm_mullo_epi16(y2, y0_fraction);
michael@0 160 - y3 = _mm_mullo_epi16(y3, y1_fraction);
michael@0 161 - y0 = _mm_add_epi16(y0, y1);
michael@0 162 - y2 = _mm_add_epi16(y2, y3);
michael@0 163 - y0 = _mm_srli_epi16(y0, 8);
michael@0 164 - y2 = _mm_srli_epi16(y2, 8);
michael@0 165 - y0 = _mm_packus_epi16(y0, y2);
michael@0 166 - *dest128++ = y0;
michael@0 167 - ++y0_ptr128;
michael@0 168 - ++y1_ptr128;
michael@0 169 - } while (dest128 < end128);
michael@0 170 -}
michael@0 171 -#elif USE_MMX
michael@0 172 -// MMX version does 8 pixels at a time
michael@0 173 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
michael@0 174 - int source_width, int source_y_fraction) {
michael@0 175 - __m64 zero = _mm_setzero_si64();
michael@0 176 - __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
michael@0 177 - __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
michael@0 178 -
michael@0 179 - const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
michael@0 180 - const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
michael@0 181 - __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
michael@0 182 - __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
michael@0 183 -
michael@0 184 - do {
michael@0 185 - __m64 y0 = *y0_ptr64++;
michael@0 186 - __m64 y1 = *y1_ptr64++;
michael@0 187 - __m64 y2 = _mm_unpackhi_pi8(y0, zero);
michael@0 188 - __m64 y3 = _mm_unpackhi_pi8(y1, zero);
michael@0 189 - y0 = _mm_unpacklo_pi8(y0, zero);
michael@0 190 - y1 = _mm_unpacklo_pi8(y1, zero);
michael@0 191 - y0 = _mm_mullo_pi16(y0, y0_fraction);
michael@0 192 - y1 = _mm_mullo_pi16(y1, y1_fraction);
michael@0 193 - y2 = _mm_mullo_pi16(y2, y0_fraction);
michael@0 194 - y3 = _mm_mullo_pi16(y3, y1_fraction);
michael@0 195 - y0 = _mm_add_pi16(y0, y1);
michael@0 196 - y2 = _mm_add_pi16(y2, y3);
michael@0 197 - y0 = _mm_srli_pi16(y0, 8);
michael@0 198 - y2 = _mm_srli_pi16(y2, 8);
michael@0 199 - y0 = _mm_packs_pu16(y0, y2);
michael@0 200 - *dest64++ = y0;
michael@0 201 - } while (dest64 < end64);
michael@0 202 -}
michael@0 203 -#else // no MMX or SSE2
michael@0 204 + if (has_sse)
michael@0 205 + EMMS();
michael@0 206 +}
michael@0 207 +
michael@0 208 // C version does 8 at a time to mimic MMX code
michael@0 209 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
michael@0 210 - int source_width, int source_y_fraction) {
michael@0 211 +static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
michael@0 212 + int source_width, int source_y_fraction) {
michael@0 213 int y1_fraction = source_y_fraction;
michael@0 214 int y0_fraction = 256 - y1_fraction;
michael@0 215 uint8* end = ybuf + source_width;
michael@0 216 do {
michael@0 217 ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
michael@0 218 ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
michael@0 219 ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
michael@0 220 ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
michael@0 221 @@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons
michael@0 222 ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
michael@0 223 ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
michael@0 224 ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
michael@0 225 y0_ptr += 8;
michael@0 226 y1_ptr += 8;
michael@0 227 ybuf += 8;
michael@0 228 } while (ybuf < end);
michael@0 229 }
michael@0 230 -#endif
michael@0 231 +
michael@0 232 +#ifdef MOZILLA_MAY_SUPPORT_MMX
michael@0 233 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
michael@0 234 + int source_width, int source_y_fraction);
michael@0 235 +#endif
michael@0 236 +
michael@0 237 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
michael@0 238 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
michael@0 239 + int source_width, int source_y_fraction);
michael@0 240 +#endif
michael@0 241 +
michael@0 242 +static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
michael@0 243 + const uint8* y1_ptr, int source_width,
michael@0 244 + int source_y_fraction) {
michael@0 245 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
michael@0 246 + if (mozilla::supports_sse2()) {
michael@0 247 + FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
michael@0 248 + return;
michael@0 249 + }
michael@0 250 +#endif
michael@0 251 +
michael@0 252 +#ifdef MOZILLA_MAY_SUPPORT_MMX
michael@0 253 + if (mozilla::supports_mmx()) {
michael@0 254 + FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
michael@0 255 + return;
michael@0 256 + }
michael@0 257 +#endif
michael@0 258 +
michael@0 259 + FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
michael@0 260 +}
michael@0 261
michael@0 262
michael@0 263 // Scale a frame of YUV to 32 bit ARGB.
michael@0 264 -void ScaleYUVToRGB32(const uint8* y_buf,
michael@0 265 - const uint8* u_buf,
michael@0 266 - const uint8* v_buf,
michael@0 267 - uint8* rgb_buf,
michael@0 268 - int source_width,
michael@0 269 - int source_height,
michael@0 270 - int width,
michael@0 271 - int height,
michael@0 272 - int y_pitch,
michael@0 273 - int uv_pitch,
michael@0 274 - int rgb_pitch,
michael@0 275 - YUVType yuv_type,
michael@0 276 - Rotate view_rotate,
michael@0 277 - ScaleFilter filter) {
michael@0 278 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,
michael@0 279 + const uint8* u_buf,
michael@0 280 + const uint8* v_buf,
michael@0 281 + uint8* rgb_buf,
michael@0 282 + int source_width,
michael@0 283 + int source_height,
michael@0 284 + int width,
michael@0 285 + int height,
michael@0 286 + int y_pitch,
michael@0 287 + int uv_pitch,
michael@0 288 + int rgb_pitch,
michael@0 289 + YUVType yuv_type,
michael@0 290 + Rotate view_rotate,
michael@0 291 + ScaleFilter filter) {
michael@0 292 + bool has_mmx = supports_mmx();
michael@0 293 +
michael@0 294 // 4096 allows 3 buffers to fit in 12k.
michael@0 295 // Helps performance on CPU with 16K L1 cache.
michael@0 296 // Large enough for 3830x2160 and 30" displays which are 2560x1600.
michael@0 297 const int kFilterBufferSize = 4096;
michael@0 298 // Disable filtering if the screen is too big (to avoid buffer overflows).
michael@0 299 // This should never happen to regular users: they don't have monitors
michael@0 300 // wider than 4096 pixels.
michael@0 301 // TODO(fbarchard): Allow rotated videos to filter.
michael@0 302 if (source_width > kFilterBufferSize || view_rotate)
michael@0 303 filter = FILTER_NONE;
michael@0 304
michael@0 305 - unsigned int y_shift = yuv_type;
michael@0 306 + unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
michael@0 307 // Diagram showing origin and direction of source sampling.
michael@0 308 // ->0 4<-
michael@0 309 // 7 3
michael@0 310 //
michael@0 311 // 6 5
michael@0 312 // ->1 2<-
michael@0 313 // Rotations that start at right side of image.
michael@0 314 if ((view_rotate == ROTATE_180) ||
michael@0 315 @@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
michael@0 316 int source_uv_fraction =
michael@0 317 ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
michael@0 318
michael@0 319 const uint8* y_ptr = y0_ptr;
michael@0 320 const uint8* u_ptr = u0_ptr;
michael@0 321 const uint8* v_ptr = v0_ptr;
michael@0 322 // Apply vertical filtering if necessary.
michael@0 323 // TODO(fbarchard): Remove memcpy when not necessary.
michael@0 324 - if (filter & media::FILTER_BILINEAR_V) {
michael@0 325 + if (filter & mozilla::gfx::FILTER_BILINEAR_V) {
michael@0 326 if (yscale_fixed != kFractionMax &&
michael@0 327 source_y_fraction && ((source_y + 1) < source_height)) {
michael@0 328 FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
michael@0 329 } else {
michael@0 330 memcpy(ybuf, y0_ptr, source_width);
michael@0 331 }
michael@0 332 y_ptr = ybuf;
michael@0 333 ybuf[source_width] = ybuf[source_width-1];
michael@0 334 @@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf,
michael@0 335 u_ptr = ubuf;
michael@0 336 v_ptr = vbuf;
michael@0 337 ubuf[uv_source_width] = ubuf[uv_source_width - 1];
michael@0 338 vbuf[uv_source_width] = vbuf[uv_source_width - 1];
michael@0 339 }
michael@0 340 if (source_dx == kFractionMax) { // Not scaled
michael@0 341 FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
michael@0 342 dest_pixel, width);
michael@0 343 - } else {
michael@0 344 - if (filter & FILTER_BILINEAR_H) {
michael@0 345 + } else if (filter & FILTER_BILINEAR_H) {
michael@0 346 LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
michael@0 347 dest_pixel, width, source_dx);
michael@0 348 } else {
michael@0 349 // Specialized scalers and rotation.
michael@0 350 -#if USE_MMX && defined(_MSC_VER)
michael@0 351 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86)
michael@0 352 + if(mozilla::supports_sse()) {
michael@0 353 if (width == (source_width * 2)) {
michael@0 354 - DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
michael@0 355 - dest_pixel, width);
michael@0 356 + DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
michael@0 357 + dest_pixel, width);
michael@0 358 } else if ((source_dx & kFractionMask) == 0) {
michael@0 359 // Scaling by integer scale factor. ie half.
michael@0 360 - ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
michael@0 361 - dest_pixel, width,
michael@0 362 - source_dx >> kFractionBits);
michael@0 363 + ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
michael@0 364 + dest_pixel, width,
michael@0 365 + source_dx >> kFractionBits);
michael@0 366 } else if (source_dx_uv == source_dx) { // Not rotated.
michael@0 367 ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
michael@0 368 dest_pixel, width, source_dx);
michael@0 369 } else {
michael@0 370 - RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
michael@0 371 - dest_pixel, width,
michael@0 372 - source_dx >> kFractionBits,
michael@0 373 - source_dx_uv >> kFractionBits);
michael@0 374 + RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
michael@0 375 + dest_pixel, width,
michael@0 376 + source_dx >> kFractionBits,
michael@0 377 + source_dx_uv >> kFractionBits);
michael@0 378 }
michael@0 379 + }
michael@0 380 + else {
michael@0 381 + ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
michael@0 382 + dest_pixel, width, source_dx);
michael@0 383 + }
michael@0 384 #else
michael@0 385 - ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
michael@0 386 - dest_pixel, width, source_dx);
michael@0 387 -#endif
michael@0 388 - }
michael@0 389 + ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
michael@0 390 + dest_pixel, width, source_dx);
michael@0 391 +#endif
michael@0 392 }
michael@0 393 }
michael@0 394 // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
michael@0 395 - EMMS();
michael@0 396 -}
michael@0 397 -
michael@0 398 -} // namespace media
michael@0 399 + if (has_mmx)
michael@0 400 + EMMS();
michael@0 401 +}
michael@0 402 +
michael@0 403 +} // namespace gfx
michael@0 404 +} // namespace mozilla
michael@0 405 diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
michael@0 406 --- a/gfx/ycbcr/yuv_convert.h
michael@0 407 +++ b/gfx/ycbcr/yuv_convert.h
michael@0 408 @@ -1,72 +1,79 @@
michael@0 409 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
michael@0 410 // Use of this source code is governed by a BSD-style license that can be
michael@0 411 // found in the LICENSE file.
michael@0 412
michael@0 413 #ifndef MEDIA_BASE_YUV_CONVERT_H_
michael@0 414 #define MEDIA_BASE_YUV_CONVERT_H_
michael@0 415
michael@0 416 -#include "base/basictypes.h"
michael@0 417 -
michael@0 418 -namespace media {
michael@0 419 -
michael@0 420 +#include "chromium_types.h"
michael@0 421 +#include "gfxCore.h"
michael@0 422 +
michael@0 423 +namespace mozilla {
michael@0 424 +
michael@0 425 +namespace gfx {
michael@0 426 +
michael@0 427 // Type of YUV surface.
michael@0 428 // The value of these enums matter as they are used to shift vertical indices.
michael@0 429 enum YUVType {
michael@0 430 - YV16 = 0, // YV16 is half width and full height chroma channels.
michael@0 431 - YV12 = 1, // YV12 is half width and half height chroma channels.
michael@0 432 + YV12 = 0, // YV12 is half width and half height chroma channels.
michael@0 433 + YV16 = 1, // YV16 is half width and full height chroma channels.
michael@0 434 + YV24 = 2 // YV24 is full width and full height chroma channels.
michael@0 435 };
michael@0 436
michael@0 437 // Mirror means flip the image horizontally, as in looking in a mirror.
michael@0 438 // Rotate happens after mirroring.
michael@0 439 enum Rotate {
michael@0 440 ROTATE_0, // Rotation off.
michael@0 441 ROTATE_90, // Rotate clockwise.
michael@0 442 ROTATE_180, // Rotate upside down.
michael@0 443 ROTATE_270, // Rotate counter clockwise.
michael@0 444 MIRROR_ROTATE_0, // Mirror horizontally.
michael@0 445 MIRROR_ROTATE_90, // Mirror then Rotate clockwise.
michael@0 446 MIRROR_ROTATE_180, // Mirror vertically.
michael@0 447 - MIRROR_ROTATE_270, // Transpose.
michael@0 448 + MIRROR_ROTATE_270 // Transpose.
michael@0 449 };
michael@0 450
michael@0 451 // Filter affects how scaling looks.
michael@0 452 enum ScaleFilter {
michael@0 453 FILTER_NONE = 0, // No filter (point sampled).
michael@0 454 FILTER_BILINEAR_H = 1, // Bilinear horizontal filter.
michael@0 455 FILTER_BILINEAR_V = 2, // Bilinear vertical filter.
michael@0 456 - FILTER_BILINEAR = 3, // Bilinear filter.
michael@0 457 + FILTER_BILINEAR = 3 // Bilinear filter.
michael@0 458 };
michael@0 459
michael@0 460 // Convert a frame of YUV to 32 bit ARGB.
michael@0 461 // Pass in YV16/YV12 depending on source format
michael@0 462 -void ConvertYUVToRGB32(const uint8* yplane,
michael@0 463 - const uint8* uplane,
michael@0 464 - const uint8* vplane,
michael@0 465 - uint8* rgbframe,
michael@0 466 - int width,
michael@0 467 - int height,
michael@0 468 - int ystride,
michael@0 469 - int uvstride,
michael@0 470 - int rgbstride,
michael@0 471 - YUVType yuv_type);
michael@0 472 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
michael@0 473 + const uint8* uplane,
michael@0 474 + const uint8* vplane,
michael@0 475 + uint8* rgbframe,
michael@0 476 + int pic_x,
michael@0 477 + int pic_y,
michael@0 478 + int pic_width,
michael@0 479 + int pic_height,
michael@0 480 + int ystride,
michael@0 481 + int uvstride,
michael@0 482 + int rgbstride,
michael@0 483 + YUVType yuv_type);
michael@0 484
michael@0 485 // Scale a frame of YUV to 32 bit ARGB.
michael@0 486 // Supports rotation and mirroring.
michael@0 487 -void ScaleYUVToRGB32(const uint8* yplane,
michael@0 488 - const uint8* uplane,
michael@0 489 - const uint8* vplane,
michael@0 490 - uint8* rgbframe,
michael@0 491 - int source_width,
michael@0 492 - int source_height,
michael@0 493 - int width,
michael@0 494 - int height,
michael@0 495 - int ystride,
michael@0 496 - int uvstride,
michael@0 497 - int rgbstride,
michael@0 498 - YUVType yuv_type,
michael@0 499 - Rotate view_rotate,
michael@0 500 - ScaleFilter filter);
michael@0 501 -
michael@0 502 -} // namespace media
michael@0 503 -
michael@0 504 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane,
michael@0 505 + const uint8* uplane,
michael@0 506 + const uint8* vplane,
michael@0 507 + uint8* rgbframe,
michael@0 508 + int source_width,
michael@0 509 + int source_height,
michael@0 510 + int width,
michael@0 511 + int height,
michael@0 512 + int ystride,
michael@0 513 + int uvstride,
michael@0 514 + int rgbstride,
michael@0 515 + YUVType yuv_type,
michael@0 516 + Rotate view_rotate,
michael@0 517 + ScaleFilter filter);
michael@0 518 +
michael@0 519 +} // namespace gfx
michael@0 520 +} // namespace mozilla
michael@0 521 +
michael@0 522 #endif // MEDIA_BASE_YUV_CONVERT_H_
michael@0 523 diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp
michael@0 524 new file mode 100644
michael@0 525 --- /dev/null
michael@0 526 +++ b/gfx/ycbcr/yuv_convert_mmx.cpp
michael@0 527 @@ -0,0 +1,45 @@
michael@0 528 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
michael@0 529 +// Use of this source code is governed by a BSD-style license that can be
michael@0 530 +// found in the LICENSE file.
michael@0 531 +
michael@0 532 +#include <mmintrin.h>
michael@0 533 +#include "yuv_row.h"
michael@0 534 +
michael@0 535 +namespace mozilla {
michael@0 536 +namespace gfx {
michael@0 537 +
michael@0 538 +// FilterRows combines two rows of the image using linear interpolation.
michael@0 539 +// MMX version does 8 pixels at a time.
michael@0 540 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
michael@0 541 + int source_width, int source_y_fraction) {
michael@0 542 + __m64 zero = _mm_setzero_si64();
michael@0 543 + __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
michael@0 544 + __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
michael@0 545 +
michael@0 546 + const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
michael@0 547 + const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
michael@0 548 + __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
michael@0 549 + __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
michael@0 550 +
michael@0 551 + do {
michael@0 552 + __m64 y0 = *y0_ptr64++;
michael@0 553 + __m64 y1 = *y1_ptr64++;
michael@0 554 + __m64 y2 = _mm_unpackhi_pi8(y0, zero);
michael@0 555 + __m64 y3 = _mm_unpackhi_pi8(y1, zero);
michael@0 556 + y0 = _mm_unpacklo_pi8(y0, zero);
michael@0 557 + y1 = _mm_unpacklo_pi8(y1, zero);
michael@0 558 + y0 = _mm_mullo_pi16(y0, y0_fraction);
michael@0 559 + y1 = _mm_mullo_pi16(y1, y1_fraction);
michael@0 560 + y2 = _mm_mullo_pi16(y2, y0_fraction);
michael@0 561 + y3 = _mm_mullo_pi16(y3, y1_fraction);
michael@0 562 + y0 = _mm_add_pi16(y0, y1);
michael@0 563 + y2 = _mm_add_pi16(y2, y3);
michael@0 564 + y0 = _mm_srli_pi16(y0, 8);
michael@0 565 + y2 = _mm_srli_pi16(y2, 8);
michael@0 566 + y0 = _mm_packs_pu16(y0, y2);
michael@0 567 + *dest64++ = y0;
michael@0 568 + } while (dest64 < end64);
michael@0 569 +}
michael@0 570 +
michael@0 571 +}
michael@0 572 +}
michael@0 573 diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp
michael@0 574 new file mode 100644
michael@0 575 --- /dev/null
michael@0 576 +++ b/gfx/ycbcr/yuv_convert_sse2.cpp
michael@0 577 @@ -0,0 +1,47 @@
michael@0 578 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
michael@0 579 +// Use of this source code is governed by a BSD-style license that can be
michael@0 580 +// found in the LICENSE file.
michael@0 581 +
michael@0 582 +#include <emmintrin.h>
michael@0 583 +#include "yuv_row.h"
michael@0 584 +
michael@0 585 +namespace mozilla {
michael@0 586 +namespace gfx {
michael@0 587 +
michael@0 588 +// FilterRows combines two rows of the image using linear interpolation.
michael@0 589 +// SSE2 version does 16 pixels at a time.
michael@0 590 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
michael@0 591 + int source_width, int source_y_fraction) {
michael@0 592 + __m128i zero = _mm_setzero_si128();
michael@0 593 + __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
michael@0 594 + __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
michael@0 595 +
michael@0 596 + const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
michael@0 597 + const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
michael@0 598 + __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
michael@0 599 + __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
michael@0 600 +
michael@0 601 + do {
michael@0 602 + __m128i y0 = _mm_loadu_si128(y0_ptr128);
michael@0 603 + __m128i y1 = _mm_loadu_si128(y1_ptr128);
michael@0 604 + __m128i y2 = _mm_unpackhi_epi8(y0, zero);
michael@0 605 + __m128i y3 = _mm_unpackhi_epi8(y1, zero);
michael@0 606 + y0 = _mm_unpacklo_epi8(y0, zero);
michael@0 607 + y1 = _mm_unpacklo_epi8(y1, zero);
michael@0 608 + y0 = _mm_mullo_epi16(y0, y0_fraction);
michael@0 609 + y1 = _mm_mullo_epi16(y1, y1_fraction);
michael@0 610 + y2 = _mm_mullo_epi16(y2, y0_fraction);
michael@0 611 + y3 = _mm_mullo_epi16(y3, y1_fraction);
michael@0 612 + y0 = _mm_add_epi16(y0, y1);
michael@0 613 + y2 = _mm_add_epi16(y2, y3);
michael@0 614 + y0 = _mm_srli_epi16(y0, 8);
michael@0 615 + y2 = _mm_srli_epi16(y2, 8);
michael@0 616 + y0 = _mm_packus_epi16(y0, y2);
michael@0 617 + *dest128++ = y0;
michael@0 618 + ++y0_ptr128;
michael@0 619 + ++y1_ptr128;
michael@0 620 + } while (dest128 < end128);
michael@0 621 +}
michael@0 622 +
michael@0 623 +}
michael@0 624 +}
michael@0 625 diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
michael@0 626 --- a/gfx/ycbcr/yuv_row.h
michael@0 627 +++ b/gfx/ycbcr/yuv_row.h
michael@0 628 @@ -5,109 +5,133 @@
michael@0 629 // yuv_row internal functions to handle YUV conversion and scaling to RGB.
michael@0 630 // These functions are used from both yuv_convert.cc and yuv_scale.cc.
michael@0 631
michael@0 632 // TODO(fbarchard): Write function that can handle rotation and scaling.
michael@0 633
michael@0 634 #ifndef MEDIA_BASE_YUV_ROW_H_
michael@0 635 #define MEDIA_BASE_YUV_ROW_H_
michael@0 636
michael@0 637 -#include "base/basictypes.h"
michael@0 638 +#include "chromium_types.h"
michael@0 639
michael@0 640 extern "C" {
michael@0 641 // Can only do 1x.
michael@0 642 // This is the second fastest of the scalers.
michael@0 643 void FastConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 644 const uint8* u_buf,
michael@0 645 const uint8* v_buf,
michael@0 646 uint8* rgb_buf,
michael@0 647 int width);
michael@0 648
michael@0 649 -// Can do 1x, half size or any scale down by an integer amount.
michael@0 650 -// Step can be negative (mirroring, rotate 180).
michael@0 651 -// This is the third fastest of the scalers.
michael@0 652 -void ConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 653 - const uint8* u_buf,
michael@0 654 - const uint8* v_buf,
michael@0 655 - uint8* rgb_buf,
michael@0 656 - int width,
michael@0 657 - int step);
michael@0 658 -
michael@0 659 -// Rotate is like Convert, but applies different step to Y versus U and V.
michael@0 660 -// This allows rotation by 90 or 270, by stepping by stride.
michael@0 661 -// This is the forth fastest of the scalers.
michael@0 662 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 663 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
michael@0 664 const uint8* u_buf,
michael@0 665 const uint8* v_buf,
michael@0 666 uint8* rgb_buf,
michael@0 667 int width,
michael@0 668 - int ystep,
michael@0 669 - int uvstep);
michael@0 670 + unsigned int x_shift);
michael@0 671 +
michael@0 672 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 673 + const uint8* u_buf,
michael@0 674 + const uint8* v_buf,
michael@0 675 + uint8* rgb_buf,
michael@0 676 + int width);
michael@0 677 +
michael@0 678 +// Can do 1x, half size or any scale down by an integer amount.
michael@0 679 +// Step can be negative (mirroring, rotate 180).
michael@0 680 +// This is the third fastest of the scalers.
michael@0 681 +// Only defined on Windows x86-32.
michael@0 682 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
michael@0 683 + const uint8* u_buf,
michael@0 684 + const uint8* v_buf,
michael@0 685 + uint8* rgb_buf,
michael@0 686 + int width,
michael@0 687 + int step);
michael@0 688 +
michael@0 689 +// Rotate is like Convert, but applies different step to Y versus U and V.
michael@0 690 +// This allows rotation by 90 or 270, by stepping by stride.
michael@0 691 +// This is the fourth fastest of the scalers.
michael@0 692 +// Only defined on Windows x86-32.
michael@0 693 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
michael@0 694 + const uint8* u_buf,
michael@0 695 + const uint8* v_buf,
michael@0 696 + uint8* rgb_buf,
michael@0 697 + int width,
michael@0 698 + int ystep,
michael@0 699 + int uvstep);
michael@0 700
michael@0 701 // Doubler does 4 pixels at a time. Each pixel is replicated.
michael@0 702 // This is the fastest of the scalers.
michael@0 703 -void DoubleYUVToRGB32Row(const uint8* y_buf,
michael@0 704 - const uint8* u_buf,
michael@0 705 - const uint8* v_buf,
michael@0 706 - uint8* rgb_buf,
michael@0 707 - int width);
michael@0 708 +// Only defined on Windows x86-32.
michael@0 709 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
michael@0 710 + const uint8* u_buf,
michael@0 711 + const uint8* v_buf,
michael@0 712 + uint8* rgb_buf,
michael@0 713 + int width);
michael@0 714
michael@0 715 // Handles arbitrary scaling up or down.
michael@0 716 // Mirroring is supported, but not 90 or 270 degree rotation.
michael@0 717 // Chroma is under sampled every 2 pixels for performance.
michael@0 718 void ScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 719 const uint8* u_buf,
michael@0 720 const uint8* v_buf,
michael@0 721 uint8* rgb_buf,
michael@0 722 int width,
michael@0 723 int source_dx);
michael@0 724
michael@0 725 +void ScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 726 + const uint8* u_buf,
michael@0 727 + const uint8* v_buf,
michael@0 728 + uint8* rgb_buf,
michael@0 729 + int width,
michael@0 730 + int source_dx);
michael@0 731 +
michael@0 732 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,
michael@0 733 + const uint8* u_buf,
michael@0 734 + const uint8* v_buf,
michael@0 735 + uint8* rgb_buf,
michael@0 736 + int width,
michael@0 737 + int source_dx);
michael@0 738 +
michael@0 739 // Handles arbitrary scaling up or down with bilinear filtering.
michael@0 740 // Mirroring is supported, but not 90 or 270 degree rotation.
michael@0 741 // Chroma is under sampled every 2 pixels for performance.
michael@0 742 // This is the slowest of the scalers.
michael@0 743 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 744 const uint8* u_buf,
michael@0 745 const uint8* v_buf,
michael@0 746 uint8* rgb_buf,
michael@0 747 int width,
michael@0 748 int source_dx);
michael@0 749
michael@0 750 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 751 + const uint8* u_buf,
michael@0 752 + const uint8* v_buf,
michael@0 753 + uint8* rgb_buf,
michael@0 754 + int width,
michael@0 755 + int source_dx);
michael@0 756 +
michael@0 757 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
michael@0 758 + const uint8* u_buf,
michael@0 759 + const uint8* v_buf,
michael@0 760 + uint8* rgb_buf,
michael@0 761 + int width,
michael@0 762 + int source_dx);
michael@0 763 +
michael@0 764 +
michael@0 765 #if defined(_MSC_VER)
michael@0 766 #define SIMD_ALIGNED(var) __declspec(align(16)) var
michael@0 767 #else
michael@0 768 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
michael@0 769 #endif
michael@0 770 extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]);
michael@0 771
michael@0 772 -// Method to force C version.
michael@0 773 -//#define USE_MMX 0
michael@0 774 -//#define USE_SSE2 0
michael@0 775 -
michael@0 776 -#if !defined(USE_MMX)
michael@0 777 -// Windows, Mac and Linux/BSD use MMX
michael@0 778 -#if defined(__MMX__) || defined(_MSC_VER)
michael@0 779 -#define USE_MMX 1
michael@0 780 -#else
michael@0 781 -#define USE_MMX 0
michael@0 782 -#endif
michael@0 783 -#endif
michael@0 784 -
michael@0 785 -#if !defined(USE_SSE2)
michael@0 786 -#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
michael@0 787 -#define USE_SSE2 1
michael@0 788 -#else
michael@0 789 -#define USE_SSE2 0
michael@0 790 -#endif
michael@0 791 -#endif
michael@0 792 -
michael@0 793 // x64 uses MMX2 (SSE) so emms is not required.
michael@0 794 // Warning C4799: function has no EMMS instruction.
michael@0 795 // EMMS() is slow and should be called by the calling function once per image.
michael@0 796 -#if USE_MMX && !defined(ARCH_CPU_X86_64)
michael@0 797 +#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
michael@0 798 #if defined(_MSC_VER)
michael@0 799 #define EMMS() __asm emms
michael@0 800 #pragma warning(disable: 4799)
michael@0 801 #else
michael@0 802 #define EMMS() asm("emms")
michael@0 803 #endif
michael@0 804 #else
michael@0 805 #define EMMS()
michael@0 806 diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
michael@0 807 --- a/gfx/ycbcr/yuv_row_c.cpp
michael@0 808 +++ b/gfx/ycbcr/yuv_row_c.cpp
michael@0 809 @@ -1,812 +1,18 @@
michael@0 810 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
michael@0 811 // Use of this source code is governed by a BSD-style license that can be
michael@0 812 // found in the LICENSE file.
michael@0 813
michael@0 814 -#include "media/base/yuv_row.h"
michael@0 815 -
michael@0 816 -#ifdef _DEBUG
michael@0 817 -#include "base/logging.h"
michael@0 818 -#else
michael@0 819 +#include "yuv_row.h"
michael@0 820 +
michael@0 821 #define DCHECK(a)
michael@0 822 -#endif
michael@0 823
michael@0 824 extern "C" {
michael@0 825
michael@0 826 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
michael@0 827 -
michael@0 828 -// AMD64 ABI uses register paremters.
michael@0 829 -void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
michael@0 830 - const uint8* u_buf, // rsi
michael@0 831 - const uint8* v_buf, // rdx
michael@0 832 - uint8* rgb_buf, // rcx
michael@0 833 - int width) { // r8
michael@0 834 - asm(
michael@0 835 - "jmp convertend\n"
michael@0 836 -"convertloop:"
michael@0 837 - "movzb (%1),%%r10\n"
michael@0 838 - "add $0x1,%1\n"
michael@0 839 - "movzb (%2),%%r11\n"
michael@0 840 - "add $0x1,%2\n"
michael@0 841 - "movq 2048(%5,%%r10,8),%%xmm0\n"
michael@0 842 - "movzb (%0),%%r10\n"
michael@0 843 - "movq 4096(%5,%%r11,8),%%xmm1\n"
michael@0 844 - "movzb 0x1(%0),%%r11\n"
michael@0 845 - "paddsw %%xmm1,%%xmm0\n"
michael@0 846 - "movq (%5,%%r10,8),%%xmm2\n"
michael@0 847 - "add $0x2,%0\n"
michael@0 848 - "movq (%5,%%r11,8),%%xmm3\n"
michael@0 849 - "paddsw %%xmm0,%%xmm2\n"
michael@0 850 - "paddsw %%xmm0,%%xmm3\n"
michael@0 851 - "shufps $0x44,%%xmm3,%%xmm2\n"
michael@0 852 - "psraw $0x6,%%xmm2\n"
michael@0 853 - "packuswb %%xmm2,%%xmm2\n"
michael@0 854 - "movq %%xmm2,0x0(%3)\n"
michael@0 855 - "add $0x8,%3\n"
michael@0 856 -"convertend:"
michael@0 857 - "sub $0x2,%4\n"
michael@0 858 - "jns convertloop\n"
michael@0 859 -
michael@0 860 -"convertnext:"
michael@0 861 - "add $0x1,%4\n"
michael@0 862 - "js convertdone\n"
michael@0 863 -
michael@0 864 - "movzb (%1),%%r10\n"
michael@0 865 - "movq 2048(%5,%%r10,8),%%xmm0\n"
michael@0 866 - "movzb (%2),%%r10\n"
michael@0 867 - "movq 4096(%5,%%r10,8),%%xmm1\n"
michael@0 868 - "paddsw %%xmm1,%%xmm0\n"
michael@0 869 - "movzb (%0),%%r10\n"
michael@0 870 - "movq (%5,%%r10,8),%%xmm1\n"
michael@0 871 - "paddsw %%xmm0,%%xmm1\n"
michael@0 872 - "psraw $0x6,%%xmm1\n"
michael@0 873 - "packuswb %%xmm1,%%xmm1\n"
michael@0 874 - "movd %%xmm1,0x0(%3)\n"
michael@0 875 -"convertdone:"
michael@0 876 - :
michael@0 877 - : "r"(y_buf), // %0
michael@0 878 - "r"(u_buf), // %1
michael@0 879 - "r"(v_buf), // %2
michael@0 880 - "r"(rgb_buf), // %3
michael@0 881 - "r"(width), // %4
michael@0 882 - "r" (kCoefficientsRgbY) // %5
michael@0 883 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
michael@0 884 -);
michael@0 885 -}
michael@0 886 -
michael@0 887 -void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi
michael@0 888 - const uint8* u_buf, // rsi
michael@0 889 - const uint8* v_buf, // rdx
michael@0 890 - uint8* rgb_buf, // rcx
michael@0 891 - int width, // r8
michael@0 892 - int source_dx) { // r9
michael@0 893 - asm(
michael@0 894 - "xor %%r11,%%r11\n"
michael@0 895 - "sub $0x2,%4\n"
michael@0 896 - "js scalenext\n"
michael@0 897 -
michael@0 898 -"scaleloop:"
michael@0 899 - "mov %%r11,%%r10\n"
michael@0 900 - "sar $0x11,%%r10\n"
michael@0 901 - "movzb (%1,%%r10,1),%%rax\n"
michael@0 902 - "movq 2048(%5,%%rax,8),%%xmm0\n"
michael@0 903 - "movzb (%2,%%r10,1),%%rax\n"
michael@0 904 - "movq 4096(%5,%%rax,8),%%xmm1\n"
michael@0 905 - "lea (%%r11,%6),%%r10\n"
michael@0 906 - "sar $0x10,%%r11\n"
michael@0 907 - "movzb (%0,%%r11,1),%%rax\n"
michael@0 908 - "paddsw %%xmm1,%%xmm0\n"
michael@0 909 - "movq (%5,%%rax,8),%%xmm1\n"
michael@0 910 - "lea (%%r10,%6),%%r11\n"
michael@0 911 - "sar $0x10,%%r10\n"
michael@0 912 - "movzb (%0,%%r10,1),%%rax\n"
michael@0 913 - "movq (%5,%%rax,8),%%xmm2\n"
michael@0 914 - "paddsw %%xmm0,%%xmm1\n"
michael@0 915 - "paddsw %%xmm0,%%xmm2\n"
michael@0 916 - "shufps $0x44,%%xmm2,%%xmm1\n"
michael@0 917 - "psraw $0x6,%%xmm1\n"
michael@0 918 - "packuswb %%xmm1,%%xmm1\n"
michael@0 919 - "movq %%xmm1,0x0(%3)\n"
michael@0 920 - "add $0x8,%3\n"
michael@0 921 - "sub $0x2,%4\n"
michael@0 922 - "jns scaleloop\n"
michael@0 923 -
michael@0 924 -"scalenext:"
michael@0 925 - "add $0x1,%4\n"
michael@0 926 - "js scaledone\n"
michael@0 927 -
michael@0 928 - "mov %%r11,%%r10\n"
michael@0 929 - "sar $0x11,%%r10\n"
michael@0 930 - "movzb (%1,%%r10,1),%%rax\n"
michael@0 931 - "movq 2048(%5,%%rax,8),%%xmm0\n"
michael@0 932 - "movzb (%2,%%r10,1),%%rax\n"
michael@0 933 - "movq 4096(%5,%%rax,8),%%xmm1\n"
michael@0 934 - "paddsw %%xmm1,%%xmm0\n"
michael@0 935 - "sar $0x10,%%r11\n"
michael@0 936 - "movzb (%0,%%r11,1),%%rax\n"
michael@0 937 - "movq (%5,%%rax,8),%%xmm1\n"
michael@0 938 - "paddsw %%xmm0,%%xmm1\n"
michael@0 939 - "psraw $0x6,%%xmm1\n"
michael@0 940 - "packuswb %%xmm1,%%xmm1\n"
michael@0 941 - "movd %%xmm1,0x0(%3)\n"
michael@0 942 -
michael@0 943 -"scaledone:"
michael@0 944 - :
michael@0 945 - : "r"(y_buf), // %0
michael@0 946 - "r"(u_buf), // %1
michael@0 947 - "r"(v_buf), // %2
michael@0 948 - "r"(rgb_buf), // %3
michael@0 949 - "r"(width), // %4
michael@0 950 - "r" (kCoefficientsRgbY), // %5
michael@0 951 - "r"(static_cast<long>(source_dx)) // %6
michael@0 952 - : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
michael@0 953 -);
michael@0 954 -}
michael@0 955 -
michael@0 956 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 957 - const uint8* u_buf,
michael@0 958 - const uint8* v_buf,
michael@0 959 - uint8* rgb_buf,
michael@0 960 - int width,
michael@0 961 - int source_dx) {
michael@0 962 - asm(
michael@0 963 - "xor %%r11,%%r11\n" // x = 0
michael@0 964 - "sub $0x2,%4\n"
michael@0 965 - "js .lscalenext\n"
michael@0 966 - "cmp $0x20000,%6\n" // if source_dx >= 2.0
michael@0 967 - "jl .lscalehalf\n"
michael@0 968 - "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
michael@0 969 -".lscalehalf:"
michael@0 970 -
michael@0 971 -".lscaleloop:"
michael@0 972 - "mov %%r11,%%r10\n"
michael@0 973 - "sar $0x11,%%r10\n"
michael@0 974 -
michael@0 975 - "movzb (%1, %%r10, 1), %%r13 \n"
michael@0 976 - "movzb 1(%1, %%r10, 1), %%r14 \n"
michael@0 977 - "mov %%r11, %%rax \n"
michael@0 978 - "and $0x1fffe, %%rax \n"
michael@0 979 - "imul %%rax, %%r14 \n"
michael@0 980 - "xor $0x1fffe, %%rax \n"
michael@0 981 - "imul %%rax, %%r13 \n"
michael@0 982 - "add %%r14, %%r13 \n"
michael@0 983 - "shr $17, %%r13 \n"
michael@0 984 - "movq 2048(%5,%%r13,8), %%xmm0\n"
michael@0 985 -
michael@0 986 - "movzb (%2, %%r10, 1), %%r13 \n"
michael@0 987 - "movzb 1(%2, %%r10, 1), %%r14 \n"
michael@0 988 - "mov %%r11, %%rax \n"
michael@0 989 - "and $0x1fffe, %%rax \n"
michael@0 990 - "imul %%rax, %%r14 \n"
michael@0 991 - "xor $0x1fffe, %%rax \n"
michael@0 992 - "imul %%rax, %%r13 \n"
michael@0 993 - "add %%r14, %%r13 \n"
michael@0 994 - "shr $17, %%r13 \n"
michael@0 995 - "movq 4096(%5,%%r13,8), %%xmm1\n"
michael@0 996 -
michael@0 997 - "mov %%r11, %%rax \n"
michael@0 998 - "lea (%%r11,%6),%%r10\n"
michael@0 999 - "sar $0x10,%%r11\n"
michael@0 1000 - "paddsw %%xmm1,%%xmm0\n"
michael@0 1001 -
michael@0 1002 - "movzb (%0, %%r11, 1), %%r13 \n"
michael@0 1003 - "movzb 1(%0, %%r11, 1), %%r14 \n"
michael@0 1004 - "and $0xffff, %%rax \n"
michael@0 1005 - "imul %%rax, %%r14 \n"
michael@0 1006 - "xor $0xffff, %%rax \n"
michael@0 1007 - "imul %%rax, %%r13 \n"
michael@0 1008 - "add %%r14, %%r13 \n"
michael@0 1009 - "shr $16, %%r13 \n"
michael@0 1010 - "movq (%5,%%r13,8),%%xmm1\n"
michael@0 1011 -
michael@0 1012 - "mov %%r10, %%rax \n"
michael@0 1013 - "lea (%%r10,%6),%%r11\n"
michael@0 1014 - "sar $0x10,%%r10\n"
michael@0 1015 -
michael@0 1016 - "movzb (%0,%%r10,1), %%r13 \n"
michael@0 1017 - "movzb 1(%0,%%r10,1), %%r14 \n"
michael@0 1018 - "and $0xffff, %%rax \n"
michael@0 1019 - "imul %%rax, %%r14 \n"
michael@0 1020 - "xor $0xffff, %%rax \n"
michael@0 1021 - "imul %%rax, %%r13 \n"
michael@0 1022 - "add %%r14, %%r13 \n"
michael@0 1023 - "shr $16, %%r13 \n"
michael@0 1024 - "movq (%5,%%r13,8),%%xmm2\n"
michael@0 1025 -
michael@0 1026 - "paddsw %%xmm0,%%xmm1\n"
michael@0 1027 - "paddsw %%xmm0,%%xmm2\n"
michael@0 1028 - "shufps $0x44,%%xmm2,%%xmm1\n"
michael@0 1029 - "psraw $0x6,%%xmm1\n"
michael@0 1030 - "packuswb %%xmm1,%%xmm1\n"
michael@0 1031 - "movq %%xmm1,0x0(%3)\n"
michael@0 1032 - "add $0x8,%3\n"
michael@0 1033 - "sub $0x2,%4\n"
michael@0 1034 - "jns .lscaleloop\n"
michael@0 1035 -
michael@0 1036 -".lscalenext:"
michael@0 1037 - "add $0x1,%4\n"
michael@0 1038 - "js .lscaledone\n"
michael@0 1039 -
michael@0 1040 - "mov %%r11,%%r10\n"
michael@0 1041 - "sar $0x11,%%r10\n"
michael@0 1042 -
michael@0 1043 - "movzb (%1,%%r10,1), %%r13 \n"
michael@0 1044 - "movq 2048(%5,%%r13,8),%%xmm0\n"
michael@0 1045 -
michael@0 1046 - "movzb (%2,%%r10,1), %%r13 \n"
michael@0 1047 - "movq 4096(%5,%%r13,8),%%xmm1\n"
michael@0 1048 -
michael@0 1049 - "paddsw %%xmm1,%%xmm0\n"
michael@0 1050 - "sar $0x10,%%r11\n"
michael@0 1051 -
michael@0 1052 - "movzb (%0,%%r11,1), %%r13 \n"
michael@0 1053 - "movq (%5,%%r13,8),%%xmm1\n"
michael@0 1054 -
michael@0 1055 - "paddsw %%xmm0,%%xmm1\n"
michael@0 1056 - "psraw $0x6,%%xmm1\n"
michael@0 1057 - "packuswb %%xmm1,%%xmm1\n"
michael@0 1058 - "movd %%xmm1,0x0(%3)\n"
michael@0 1059 -
michael@0 1060 -".lscaledone:"
michael@0 1061 - :
michael@0 1062 - : "r"(y_buf), // %0
michael@0 1063 - "r"(u_buf), // %1
michael@0 1064 - "r"(v_buf), // %2
michael@0 1065 - "r"(rgb_buf), // %3
michael@0 1066 - "r"(width), // %4
michael@0 1067 - "r" (kCoefficientsRgbY), // %5
michael@0 1068 - "r"(static_cast<long>(source_dx)) // %6
michael@0 1069 - : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
michael@0 1070 -);
michael@0 1071 -}
michael@0 1072 -
michael@0 1073 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
michael@0 1074 -
michael@0 1075 -// PIC version is slower because less registers are available, so
michael@0 1076 -// non-PIC is used on platforms where it is possible.
michael@0 1077 -
michael@0 1078 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 1079 - const uint8* u_buf,
michael@0 1080 - const uint8* v_buf,
michael@0 1081 - uint8* rgb_buf,
michael@0 1082 - int width);
michael@0 1083 - asm(
michael@0 1084 - ".text\n"
michael@0 1085 - ".global FastConvertYUVToRGB32Row\n"
michael@0 1086 -"FastConvertYUVToRGB32Row:\n"
michael@0 1087 - "pusha\n"
michael@0 1088 - "mov 0x24(%esp),%edx\n"
michael@0 1089 - "mov 0x28(%esp),%edi\n"
michael@0 1090 - "mov 0x2c(%esp),%esi\n"
michael@0 1091 - "mov 0x30(%esp),%ebp\n"
michael@0 1092 - "mov 0x34(%esp),%ecx\n"
michael@0 1093 - "jmp convertend\n"
michael@0 1094 -
michael@0 1095 -"convertloop:"
michael@0 1096 - "movzbl (%edi),%eax\n"
michael@0 1097 - "add $0x1,%edi\n"
michael@0 1098 - "movzbl (%esi),%ebx\n"
michael@0 1099 - "add $0x1,%esi\n"
michael@0 1100 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
michael@0 1101 - "movzbl (%edx),%eax\n"
michael@0 1102 - "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
michael@0 1103 - "movzbl 0x1(%edx),%ebx\n"
michael@0 1104 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
michael@0 1105 - "add $0x2,%edx\n"
michael@0 1106 - "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
michael@0 1107 - "paddsw %mm0,%mm1\n"
michael@0 1108 - "paddsw %mm0,%mm2\n"
michael@0 1109 - "psraw $0x6,%mm1\n"
michael@0 1110 - "psraw $0x6,%mm2\n"
michael@0 1111 - "packuswb %mm2,%mm1\n"
michael@0 1112 - "movntq %mm1,0x0(%ebp)\n"
michael@0 1113 - "add $0x8,%ebp\n"
michael@0 1114 -"convertend:"
michael@0 1115 - "sub $0x2,%ecx\n"
michael@0 1116 - "jns convertloop\n"
michael@0 1117 -
michael@0 1118 - "and $0x1,%ecx\n"
michael@0 1119 - "je convertdone\n"
michael@0 1120 -
michael@0 1121 - "movzbl (%edi),%eax\n"
michael@0 1122 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
michael@0 1123 - "movzbl (%esi),%eax\n"
michael@0 1124 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
michael@0 1125 - "movzbl (%edx),%eax\n"
michael@0 1126 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
michael@0 1127 - "paddsw %mm0,%mm1\n"
michael@0 1128 - "psraw $0x6,%mm1\n"
michael@0 1129 - "packuswb %mm1,%mm1\n"
michael@0 1130 - "movd %mm1,0x0(%ebp)\n"
michael@0 1131 -"convertdone:"
michael@0 1132 - "popa\n"
michael@0 1133 - "ret\n"
michael@0 1134 -);
michael@0 1135 -
michael@0 1136 -
michael@0 1137 -void ScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 1138 - const uint8* u_buf,
michael@0 1139 - const uint8* v_buf,
michael@0 1140 - uint8* rgb_buf,
michael@0 1141 - int width,
michael@0 1142 - int source_dx);
michael@0 1143 - asm(
michael@0 1144 - ".text\n"
michael@0 1145 - ".global ScaleYUVToRGB32Row\n"
michael@0 1146 -"ScaleYUVToRGB32Row:\n"
michael@0 1147 - "pusha\n"
michael@0 1148 - "mov 0x24(%esp),%edx\n"
michael@0 1149 - "mov 0x28(%esp),%edi\n"
michael@0 1150 - "mov 0x2c(%esp),%esi\n"
michael@0 1151 - "mov 0x30(%esp),%ebp\n"
michael@0 1152 - "mov 0x34(%esp),%ecx\n"
michael@0 1153 - "xor %ebx,%ebx\n"
michael@0 1154 - "jmp scaleend\n"
michael@0 1155 -
michael@0 1156 -"scaleloop:"
michael@0 1157 - "mov %ebx,%eax\n"
michael@0 1158 - "sar $0x11,%eax\n"
michael@0 1159 - "movzbl (%edi,%eax,1),%eax\n"
michael@0 1160 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
michael@0 1161 - "mov %ebx,%eax\n"
michael@0 1162 - "sar $0x11,%eax\n"
michael@0 1163 - "movzbl (%esi,%eax,1),%eax\n"
michael@0 1164 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
michael@0 1165 - "mov %ebx,%eax\n"
michael@0 1166 - "add 0x38(%esp),%ebx\n"
michael@0 1167 - "sar $0x10,%eax\n"
michael@0 1168 - "movzbl (%edx,%eax,1),%eax\n"
michael@0 1169 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
michael@0 1170 - "mov %ebx,%eax\n"
michael@0 1171 - "add 0x38(%esp),%ebx\n"
michael@0 1172 - "sar $0x10,%eax\n"
michael@0 1173 - "movzbl (%edx,%eax,1),%eax\n"
michael@0 1174 - "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
michael@0 1175 - "paddsw %mm0,%mm1\n"
michael@0 1176 - "paddsw %mm0,%mm2\n"
michael@0 1177 - "psraw $0x6,%mm1\n"
michael@0 1178 - "psraw $0x6,%mm2\n"
michael@0 1179 - "packuswb %mm2,%mm1\n"
michael@0 1180 - "movntq %mm1,0x0(%ebp)\n"
michael@0 1181 - "add $0x8,%ebp\n"
michael@0 1182 -"scaleend:"
michael@0 1183 - "sub $0x2,%ecx\n"
michael@0 1184 - "jns scaleloop\n"
michael@0 1185 -
michael@0 1186 - "and $0x1,%ecx\n"
michael@0 1187 - "je scaledone\n"
michael@0 1188 -
michael@0 1189 - "mov %ebx,%eax\n"
michael@0 1190 - "sar $0x11,%eax\n"
michael@0 1191 - "movzbl (%edi,%eax,1),%eax\n"
michael@0 1192 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
michael@0 1193 - "mov %ebx,%eax\n"
michael@0 1194 - "sar $0x11,%eax\n"
michael@0 1195 - "movzbl (%esi,%eax,1),%eax\n"
michael@0 1196 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
michael@0 1197 - "mov %ebx,%eax\n"
michael@0 1198 - "sar $0x10,%eax\n"
michael@0 1199 - "movzbl (%edx,%eax,1),%eax\n"
michael@0 1200 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
michael@0 1201 - "paddsw %mm0,%mm1\n"
michael@0 1202 - "psraw $0x6,%mm1\n"
michael@0 1203 - "packuswb %mm1,%mm1\n"
michael@0 1204 - "movd %mm1,0x0(%ebp)\n"
michael@0 1205 -
michael@0 1206 -"scaledone:"
michael@0 1207 - "popa\n"
michael@0 1208 - "ret\n"
michael@0 1209 -);
michael@0 1210 -
michael@0 1211 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 1212 - const uint8* u_buf,
michael@0 1213 - const uint8* v_buf,
michael@0 1214 - uint8* rgb_buf,
michael@0 1215 - int width,
michael@0 1216 - int source_dx);
michael@0 1217 - asm(
michael@0 1218 - ".text\n"
michael@0 1219 - ".global LinearScaleYUVToRGB32Row\n"
michael@0 1220 -"LinearScaleYUVToRGB32Row:\n"
michael@0 1221 - "pusha\n"
michael@0 1222 - "mov 0x24(%esp),%edx\n"
michael@0 1223 - "mov 0x28(%esp),%edi\n"
michael@0 1224 - "mov 0x30(%esp),%ebp\n"
michael@0 1225 -
michael@0 1226 - // source_width = width * source_dx + ebx
michael@0 1227 - "mov 0x34(%esp), %ecx\n"
michael@0 1228 - "imull 0x38(%esp), %ecx\n"
michael@0 1229 - "mov %ecx, 0x34(%esp)\n"
michael@0 1230 -
michael@0 1231 - "mov 0x38(%esp), %ecx\n"
michael@0 1232 - "xor %ebx,%ebx\n" // x = 0
michael@0 1233 - "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
michael@0 1234 - "jl .lscaleend\n"
michael@0 1235 - "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
michael@0 1236 - "jmp .lscaleend\n"
michael@0 1237 -
michael@0 1238 -".lscaleloop:"
michael@0 1239 - "mov %ebx,%eax\n"
michael@0 1240 - "sar $0x11,%eax\n"
michael@0 1241 -
michael@0 1242 - "movzbl (%edi,%eax,1),%ecx\n"
michael@0 1243 - "movzbl 1(%edi,%eax,1),%esi\n"
michael@0 1244 - "mov %ebx,%eax\n"
michael@0 1245 - "andl $0x1fffe, %eax \n"
michael@0 1246 - "imul %eax, %esi \n"
michael@0 1247 - "xorl $0x1fffe, %eax \n"
michael@0 1248 - "imul %eax, %ecx \n"
michael@0 1249 - "addl %esi, %ecx \n"
michael@0 1250 - "shrl $17, %ecx \n"
michael@0 1251 - "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
michael@0 1252 -
michael@0 1253 - "mov 0x2c(%esp),%esi\n"
michael@0 1254 - "mov %ebx,%eax\n"
michael@0 1255 - "sar $0x11,%eax\n"
michael@0 1256 -
michael@0 1257 - "movzbl (%esi,%eax,1),%ecx\n"
michael@0 1258 - "movzbl 1(%esi,%eax,1),%esi\n"
michael@0 1259 - "mov %ebx,%eax\n"
michael@0 1260 - "andl $0x1fffe, %eax \n"
michael@0 1261 - "imul %eax, %esi \n"
michael@0 1262 - "xorl $0x1fffe, %eax \n"
michael@0 1263 - "imul %eax, %ecx \n"
michael@0 1264 - "addl %esi, %ecx \n"
michael@0 1265 - "shrl $17, %ecx \n"
michael@0 1266 - "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
michael@0 1267 -
michael@0 1268 - "mov %ebx,%eax\n"
michael@0 1269 - "sar $0x10,%eax\n"
michael@0 1270 - "movzbl (%edx,%eax,1),%ecx\n"
michael@0 1271 - "movzbl 1(%edx,%eax,1),%esi\n"
michael@0 1272 - "mov %ebx,%eax\n"
michael@0 1273 - "add 0x38(%esp),%ebx\n"
michael@0 1274 - "andl $0xffff, %eax \n"
michael@0 1275 - "imul %eax, %esi \n"
michael@0 1276 - "xorl $0xffff, %eax \n"
michael@0 1277 - "imul %eax, %ecx \n"
michael@0 1278 - "addl %esi, %ecx \n"
michael@0 1279 - "shrl $16, %ecx \n"
michael@0 1280 - "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
michael@0 1281 -
michael@0 1282 - "cmp 0x34(%esp), %ebx\n"
michael@0 1283 - "jge .lscalelastpixel\n"
michael@0 1284 -
michael@0 1285 - "mov %ebx,%eax\n"
michael@0 1286 - "sar $0x10,%eax\n"
michael@0 1287 - "movzbl (%edx,%eax,1),%ecx\n"
michael@0 1288 - "movzbl 1(%edx,%eax,1),%esi\n"
michael@0 1289 - "mov %ebx,%eax\n"
michael@0 1290 - "add 0x38(%esp),%ebx\n"
michael@0 1291 - "andl $0xffff, %eax \n"
michael@0 1292 - "imul %eax, %esi \n"
michael@0 1293 - "xorl $0xffff, %eax \n"
michael@0 1294 - "imul %eax, %ecx \n"
michael@0 1295 - "addl %esi, %ecx \n"
michael@0 1296 - "shrl $16, %ecx \n"
michael@0 1297 - "movq kCoefficientsRgbY(,%ecx,8),%mm2\n"
michael@0 1298 -
michael@0 1299 - "paddsw %mm0,%mm1\n"
michael@0 1300 - "paddsw %mm0,%mm2\n"
michael@0 1301 - "psraw $0x6,%mm1\n"
michael@0 1302 - "psraw $0x6,%mm2\n"
michael@0 1303 - "packuswb %mm2,%mm1\n"
michael@0 1304 - "movntq %mm1,0x0(%ebp)\n"
michael@0 1305 - "add $0x8,%ebp\n"
michael@0 1306 -
michael@0 1307 -".lscaleend:"
michael@0 1308 - "cmp 0x34(%esp), %ebx\n"
michael@0 1309 - "jl .lscaleloop\n"
michael@0 1310 - "popa\n"
michael@0 1311 - "ret\n"
michael@0 1312 -
michael@0 1313 -".lscalelastpixel:"
michael@0 1314 - "paddsw %mm0, %mm1\n"
michael@0 1315 - "psraw $6, %mm1\n"
michael@0 1316 - "packuswb %mm1, %mm1\n"
michael@0 1317 - "movd %mm1, (%ebp)\n"
michael@0 1318 - "popa\n"
michael@0 1319 - "ret\n"
michael@0 1320 -);
michael@0 1321 -
michael@0 1322 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
michael@0 1323 -
michael@0 1324 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 1325 - const uint8* u_buf,
michael@0 1326 - const uint8* v_buf,
michael@0 1327 - uint8* rgb_buf,
michael@0 1328 - int width,
michael@0 1329 - int16 *kCoefficientsRgbY);
michael@0 1330 - asm(
michael@0 1331 - ".text\n"
michael@0 1332 -#if defined(OS_MACOSX)
michael@0 1333 -"_PICConvertYUVToRGB32Row:\n"
michael@0 1334 -#else
michael@0 1335 -"PICConvertYUVToRGB32Row:\n"
michael@0 1336 -#endif
michael@0 1337 - "pusha\n"
michael@0 1338 - "mov 0x24(%esp),%edx\n"
michael@0 1339 - "mov 0x28(%esp),%edi\n"
michael@0 1340 - "mov 0x2c(%esp),%esi\n"
michael@0 1341 - "mov 0x30(%esp),%ebp\n"
michael@0 1342 - "mov 0x38(%esp),%ecx\n"
michael@0 1343 -
michael@0 1344 - "jmp .Lconvertend\n"
michael@0 1345 -
michael@0 1346 -".Lconvertloop:"
michael@0 1347 - "movzbl (%edi),%eax\n"
michael@0 1348 - "add $0x1,%edi\n"
michael@0 1349 - "movzbl (%esi),%ebx\n"
michael@0 1350 - "add $0x1,%esi\n"
michael@0 1351 - "movq 2048(%ecx,%eax,8),%mm0\n"
michael@0 1352 - "movzbl (%edx),%eax\n"
michael@0 1353 - "paddsw 4096(%ecx,%ebx,8),%mm0\n"
michael@0 1354 - "movzbl 0x1(%edx),%ebx\n"
michael@0 1355 - "movq 0(%ecx,%eax,8),%mm1\n"
michael@0 1356 - "add $0x2,%edx\n"
michael@0 1357 - "movq 0(%ecx,%ebx,8),%mm2\n"
michael@0 1358 - "paddsw %mm0,%mm1\n"
michael@0 1359 - "paddsw %mm0,%mm2\n"
michael@0 1360 - "psraw $0x6,%mm1\n"
michael@0 1361 - "psraw $0x6,%mm2\n"
michael@0 1362 - "packuswb %mm2,%mm1\n"
michael@0 1363 - "movntq %mm1,0x0(%ebp)\n"
michael@0 1364 - "add $0x8,%ebp\n"
michael@0 1365 -".Lconvertend:"
michael@0 1366 - "subl $0x2,0x34(%esp)\n"
michael@0 1367 - "jns .Lconvertloop\n"
michael@0 1368 -
michael@0 1369 - "andl $0x1,0x34(%esp)\n"
michael@0 1370 - "je .Lconvertdone\n"
michael@0 1371 -
michael@0 1372 - "movzbl (%edi),%eax\n"
michael@0 1373 - "movq 2048(%ecx,%eax,8),%mm0\n"
michael@0 1374 - "movzbl (%esi),%eax\n"
michael@0 1375 - "paddsw 4096(%ecx,%eax,8),%mm0\n"
michael@0 1376 - "movzbl (%edx),%eax\n"
michael@0 1377 - "movq 0(%ecx,%eax,8),%mm1\n"
michael@0 1378 - "paddsw %mm0,%mm1\n"
michael@0 1379 - "psraw $0x6,%mm1\n"
michael@0 1380 - "packuswb %mm1,%mm1\n"
michael@0 1381 - "movd %mm1,0x0(%ebp)\n"
michael@0 1382 -".Lconvertdone:\n"
michael@0 1383 - "popa\n"
michael@0 1384 - "ret\n"
michael@0 1385 -);
michael@0 1386 -
michael@0 1387 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 1388 - const uint8* u_buf,
michael@0 1389 - const uint8* v_buf,
michael@0 1390 - uint8* rgb_buf,
michael@0 1391 - int width) {
michael@0 1392 - PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
michael@0 1393 - &kCoefficientsRgbY[0][0]);
michael@0 1394 -}
michael@0 1395 -
michael@0 1396 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 1397 - const uint8* u_buf,
michael@0 1398 - const uint8* v_buf,
michael@0 1399 - uint8* rgb_buf,
michael@0 1400 - int width,
michael@0 1401 - int source_dx,
michael@0 1402 - int16 *kCoefficientsRgbY);
michael@0 1403 -
michael@0 1404 - asm(
michael@0 1405 - ".text\n"
michael@0 1406 -#if defined(OS_MACOSX)
michael@0 1407 -"_PICScaleYUVToRGB32Row:\n"
michael@0 1408 -#else
michael@0 1409 -"PICScaleYUVToRGB32Row:\n"
michael@0 1410 -#endif
michael@0 1411 - "pusha\n"
michael@0 1412 - "mov 0x24(%esp),%edx\n"
michael@0 1413 - "mov 0x28(%esp),%edi\n"
michael@0 1414 - "mov 0x2c(%esp),%esi\n"
michael@0 1415 - "mov 0x30(%esp),%ebp\n"
michael@0 1416 - "mov 0x3c(%esp),%ecx\n"
michael@0 1417 - "xor %ebx,%ebx\n"
michael@0 1418 - "jmp Lscaleend\n"
michael@0 1419 -
michael@0 1420 -"Lscaleloop:"
michael@0 1421 - "mov %ebx,%eax\n"
michael@0 1422 - "sar $0x11,%eax\n"
michael@0 1423 - "movzbl (%edi,%eax,1),%eax\n"
michael@0 1424 - "movq 2048(%ecx,%eax,8),%mm0\n"
michael@0 1425 - "mov %ebx,%eax\n"
michael@0 1426 - "sar $0x11,%eax\n"
michael@0 1427 - "movzbl (%esi,%eax,1),%eax\n"
michael@0 1428 - "paddsw 4096(%ecx,%eax,8),%mm0\n"
michael@0 1429 - "mov %ebx,%eax\n"
michael@0 1430 - "add 0x38(%esp),%ebx\n"
michael@0 1431 - "sar $0x10,%eax\n"
michael@0 1432 - "movzbl (%edx,%eax,1),%eax\n"
michael@0 1433 - "movq 0(%ecx,%eax,8),%mm1\n"
michael@0 1434 - "mov %ebx,%eax\n"
michael@0 1435 - "add 0x38(%esp),%ebx\n"
michael@0 1436 - "sar $0x10,%eax\n"
michael@0 1437 - "movzbl (%edx,%eax,1),%eax\n"
michael@0 1438 - "movq 0(%ecx,%eax,8),%mm2\n"
michael@0 1439 - "paddsw %mm0,%mm1\n"
michael@0 1440 - "paddsw %mm0,%mm2\n"
michael@0 1441 - "psraw $0x6,%mm1\n"
michael@0 1442 - "psraw $0x6,%mm2\n"
michael@0 1443 - "packuswb %mm2,%mm1\n"
michael@0 1444 - "movntq %mm1,0x0(%ebp)\n"
michael@0 1445 - "add $0x8,%ebp\n"
michael@0 1446 -"Lscaleend:"
michael@0 1447 - "subl $0x2,0x34(%esp)\n"
michael@0 1448 - "jns Lscaleloop\n"
michael@0 1449 -
michael@0 1450 - "andl $0x1,0x34(%esp)\n"
michael@0 1451 - "je Lscaledone\n"
michael@0 1452 -
michael@0 1453 - "mov %ebx,%eax\n"
michael@0 1454 - "sar $0x11,%eax\n"
michael@0 1455 - "movzbl (%edi,%eax,1),%eax\n"
michael@0 1456 - "movq 2048(%ecx,%eax,8),%mm0\n"
michael@0 1457 - "mov %ebx,%eax\n"
michael@0 1458 - "sar $0x11,%eax\n"
michael@0 1459 - "movzbl (%esi,%eax,1),%eax\n"
michael@0 1460 - "paddsw 4096(%ecx,%eax,8),%mm0\n"
michael@0 1461 - "mov %ebx,%eax\n"
michael@0 1462 - "sar $0x10,%eax\n"
michael@0 1463 - "movzbl (%edx,%eax,1),%eax\n"
michael@0 1464 - "movq 0(%ecx,%eax,8),%mm1\n"
michael@0 1465 - "paddsw %mm0,%mm1\n"
michael@0 1466 - "psraw $0x6,%mm1\n"
michael@0 1467 - "packuswb %mm1,%mm1\n"
michael@0 1468 - "movd %mm1,0x0(%ebp)\n"
michael@0 1469 -
michael@0 1470 -"Lscaledone:"
michael@0 1471 - "popa\n"
michael@0 1472 - "ret\n"
michael@0 1473 -);
michael@0 1474 -
michael@0 1475 -
michael@0 1476 -void ScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 1477 - const uint8* u_buf,
michael@0 1478 - const uint8* v_buf,
michael@0 1479 - uint8* rgb_buf,
michael@0 1480 - int width,
michael@0 1481 - int source_dx) {
michael@0 1482 - PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
michael@0 1483 - &kCoefficientsRgbY[0][0]);
michael@0 1484 -}
michael@0 1485 -
michael@0 1486 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 1487 - const uint8* u_buf,
michael@0 1488 - const uint8* v_buf,
michael@0 1489 - uint8* rgb_buf,
michael@0 1490 - int width,
michael@0 1491 - int source_dx,
michael@0 1492 - int16 *kCoefficientsRgbY);
michael@0 1493 - asm(
michael@0 1494 - ".text\n"
michael@0 1495 -#if defined(OS_MACOSX)
michael@0 1496 -"_PICLinearScaleYUVToRGB32Row:\n"
michael@0 1497 -#else
michael@0 1498 -"PICLinearScaleYUVToRGB32Row:\n"
michael@0 1499 -#endif
michael@0 1500 - "pusha\n"
michael@0 1501 - "mov 0x24(%esp),%edx\n"
michael@0 1502 - "mov 0x30(%esp),%ebp\n"
michael@0 1503 - "mov 0x34(%esp),%ecx\n"
michael@0 1504 - "mov 0x3c(%esp),%edi\n"
michael@0 1505 - "xor %ebx,%ebx\n"
michael@0 1506 -
michael@0 1507 - // source_width = width * source_dx + ebx
michael@0 1508 - "mov 0x34(%esp), %ecx\n"
michael@0 1509 - "imull 0x38(%esp), %ecx\n"
michael@0 1510 - "mov %ecx, 0x34(%esp)\n"
michael@0 1511 -
michael@0 1512 - "mov 0x38(%esp), %ecx\n"
michael@0 1513 - "xor %ebx,%ebx\n" // x = 0
michael@0 1514 - "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
michael@0 1515 - "jl .lscaleend\n"
michael@0 1516 - "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
michael@0 1517 - "jmp .lscaleend\n"
michael@0 1518 -
michael@0 1519 -".lscaleloop:"
michael@0 1520 - "mov 0x28(%esp),%esi\n"
michael@0 1521 - "mov %ebx,%eax\n"
michael@0 1522 - "sar $0x11,%eax\n"
michael@0 1523 -
michael@0 1524 - "movzbl (%esi,%eax,1),%ecx\n"
michael@0 1525 - "movzbl 1(%esi,%eax,1),%esi\n"
michael@0 1526 - "mov %ebx,%eax\n"
michael@0 1527 - "andl $0x1fffe, %eax \n"
michael@0 1528 - "imul %eax, %esi \n"
michael@0 1529 - "xorl $0x1fffe, %eax \n"
michael@0 1530 - "imul %eax, %ecx \n"
michael@0 1531 - "addl %esi, %ecx \n"
michael@0 1532 - "shrl $17, %ecx \n"
michael@0 1533 - "movq 2048(%edi,%ecx,8),%mm0\n"
michael@0 1534 -
michael@0 1535 - "mov 0x2c(%esp),%esi\n"
michael@0 1536 - "mov %ebx,%eax\n"
michael@0 1537 - "sar $0x11,%eax\n"
michael@0 1538 -
michael@0 1539 - "movzbl (%esi,%eax,1),%ecx\n"
michael@0 1540 - "movzbl 1(%esi,%eax,1),%esi\n"
michael@0 1541 - "mov %ebx,%eax\n"
michael@0 1542 - "andl $0x1fffe, %eax \n"
michael@0 1543 - "imul %eax, %esi \n"
michael@0 1544 - "xorl $0x1fffe, %eax \n"
michael@0 1545 - "imul %eax, %ecx \n"
michael@0 1546 - "addl %esi, %ecx \n"
michael@0 1547 - "shrl $17, %ecx \n"
michael@0 1548 - "paddsw 4096(%edi,%ecx,8),%mm0\n"
michael@0 1549 -
michael@0 1550 - "mov %ebx,%eax\n"
michael@0 1551 - "sar $0x10,%eax\n"
michael@0 1552 - "movzbl (%edx,%eax,1),%ecx\n"
michael@0 1553 - "movzbl 1(%edx,%eax,1),%esi\n"
michael@0 1554 - "mov %ebx,%eax\n"
michael@0 1555 - "add 0x38(%esp),%ebx\n"
michael@0 1556 - "andl $0xffff, %eax \n"
michael@0 1557 - "imul %eax, %esi \n"
michael@0 1558 - "xorl $0xffff, %eax \n"
michael@0 1559 - "imul %eax, %ecx \n"
michael@0 1560 - "addl %esi, %ecx \n"
michael@0 1561 - "shrl $16, %ecx \n"
michael@0 1562 - "movq (%edi,%ecx,8),%mm1\n"
michael@0 1563 -
michael@0 1564 - "cmp 0x34(%esp), %ebx\n"
michael@0 1565 - "jge .lscalelastpixel\n"
michael@0 1566 -
michael@0 1567 - "mov %ebx,%eax\n"
michael@0 1568 - "sar $0x10,%eax\n"
michael@0 1569 - "movzbl (%edx,%eax,1),%ecx\n"
michael@0 1570 - "movzbl 1(%edx,%eax,1),%esi\n"
michael@0 1571 - "mov %ebx,%eax\n"
michael@0 1572 - "add 0x38(%esp),%ebx\n"
michael@0 1573 - "andl $0xffff, %eax \n"
michael@0 1574 - "imul %eax, %esi \n"
michael@0 1575 - "xorl $0xffff, %eax \n"
michael@0 1576 - "imul %eax, %ecx \n"
michael@0 1577 - "addl %esi, %ecx \n"
michael@0 1578 - "shrl $16, %ecx \n"
michael@0 1579 - "movq (%edi,%ecx,8),%mm2\n"
michael@0 1580 -
michael@0 1581 - "paddsw %mm0,%mm1\n"
michael@0 1582 - "paddsw %mm0,%mm2\n"
michael@0 1583 - "psraw $0x6,%mm1\n"
michael@0 1584 - "psraw $0x6,%mm2\n"
michael@0 1585 - "packuswb %mm2,%mm1\n"
michael@0 1586 - "movntq %mm1,0x0(%ebp)\n"
michael@0 1587 - "add $0x8,%ebp\n"
michael@0 1588 -
michael@0 1589 -".lscaleend:"
michael@0 1590 - "cmp %ebx, 0x34(%esp)\n"
michael@0 1591 - "jg .lscaleloop\n"
michael@0 1592 - "popa\n"
michael@0 1593 - "ret\n"
michael@0 1594 -
michael@0 1595 -".lscalelastpixel:"
michael@0 1596 - "paddsw %mm0, %mm1\n"
michael@0 1597 - "psraw $6, %mm1\n"
michael@0 1598 - "packuswb %mm1, %mm1\n"
michael@0 1599 - "movd %mm1, (%ebp)\n"
michael@0 1600 - "popa\n"
michael@0 1601 - "ret\n"
michael@0 1602 -);
michael@0 1603 -
michael@0 1604 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 1605 - const uint8* u_buf,
michael@0 1606 - const uint8* v_buf,
michael@0 1607 - uint8* rgb_buf,
michael@0 1608 - int width,
michael@0 1609 - int source_dx) {
michael@0 1610 - PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
michael@0 1611 - &kCoefficientsRgbY[0][0]);
michael@0 1612 -}
michael@0 1613 -
michael@0 1614 -#else // USE_MMX
michael@0 1615 -
michael@0 1616 // C reference code that mimic the YUV assembly.
michael@0 1617 #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
michael@0 1618 #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
michael@0 1619 (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
michael@0 1620
michael@0 1621 static inline void YuvPixel(uint8 y,
michael@0 1622 uint8 u,
michael@0 1623 uint8 v,
michael@0 1624 @@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y,
michael@0 1625 a >>= 6;
michael@0 1626
michael@0 1627 *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
michael@0 1628 (packuswb(g) << 8) |
michael@0 1629 (packuswb(r) << 16) |
michael@0 1630 (packuswb(a) << 24);
michael@0 1631 }
michael@0 1632
michael@0 1633 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 1634 - const uint8* u_buf,
michael@0 1635 - const uint8* v_buf,
michael@0 1636 - uint8* rgb_buf,
michael@0 1637 - int width) {
michael@0 1638 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
michael@0 1639 + const uint8* u_buf,
michael@0 1640 + const uint8* v_buf,
michael@0 1641 + uint8* rgb_buf,
michael@0 1642 + int width,
michael@0 1643 + unsigned int x_shift) { // chroma index is x >> x_shift; 0 means one U/V sample per pixel
michael@0 1644 for (int x = 0; x < width; x += 2) {
michael@0 1645 - uint8 u = u_buf[x >> 1];
michael@0 1646 - uint8 v = v_buf[x >> 1];
michael@0 1647 + uint8 u = u_buf[x >> x_shift];
michael@0 1648 + uint8 v = v_buf[x >> x_shift];
michael@0 1649 uint8 y0 = y_buf[x];
michael@0 1650 YuvPixel(y0, u, v, rgb_buf);
michael@0 1651 if ((x + 1) < width) {
michael@0 1652 uint8 y1 = y_buf[x + 1];
michael@0 1653 + if (x_shift == 0) { // full-width chroma: the odd pixel has its own U/V sample
michael@0 1654 + u = u_buf[x + 1];
michael@0 1655 + v = v_buf[x + 1];
michael@0 1656 + }
michael@0 1657 YuvPixel(y1, u, v, rgb_buf + 4);
michael@0 1658 }
michael@0 1659 rgb_buf += 8; // Advance 2 pixels.
michael@0 1660 }
michael@0 1661 }
michael@0 1662
michael@0 1663 // 16.16 fixed point is used. A shift by 16 isolates the integer.
michael@0 1664 // A shift by 17 is used to further subsample the chrominence channels.
michael@0 1665 // & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
michael@0 1666 // for 1/65536 pixel accurate interpolation.
michael@0 1667 -void ScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 1668 - const uint8* u_buf,
michael@0 1669 - const uint8* v_buf,
michael@0 1670 - uint8* rgb_buf,
michael@0 1671 - int width,
michael@0 1672 - int source_dx) {
michael@0 1673 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,
michael@0 1674 + const uint8* u_buf,
michael@0 1675 + const uint8* v_buf,
michael@0 1676 + uint8* rgb_buf,
michael@0 1677 + int width,
michael@0 1678 + int source_dx) { // source_dx: 16.16 fixed-point step added to x per output pixel
michael@0 1679 int x = 0;
michael@0 1680 for (int i = 0; i < width; i += 2) {
michael@0 1681 int y = y_buf[x >> 16];
michael@0 1682 int u = u_buf[(x >> 17)];
michael@0 1683 int v = v_buf[(x >> 17)];
michael@0 1684 YuvPixel(y, u, v, rgb_buf);
michael@0 1685 x += source_dx;
michael@0 1686 if ((i + 1) < width) {
michael@0 1687 y = y_buf[x >> 16];
michael@0 1688 YuvPixel(y, u, v, rgb_buf+4);
michael@0 1689 x += source_dx;
michael@0 1690 }
michael@0 1691 rgb_buf += 8;
michael@0 1692 }
michael@0 1693 }
michael@0 1694
michael@0 1695 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 1696 - const uint8* u_buf,
michael@0 1697 - const uint8* v_buf,
michael@0 1698 - uint8* rgb_buf,
michael@0 1699 - int width,
michael@0 1700 - int source_dx) {
michael@0 1701 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
michael@0 1702 + const uint8* u_buf,
michael@0 1703 + const uint8* v_buf,
michael@0 1704 + uint8* rgb_buf,
michael@0 1705 + int width,
michael@0 1706 + int source_dx) {
michael@0 1707 int x = 0;
michael@0 1708 if (source_dx >= 0x20000) {
michael@0 1709 x = 32768;
michael@0 1710 }
michael@0 1711 for (int i = 0; i < width; i += 2) {
michael@0 1712 int y0 = y_buf[x >> 16];
michael@0 1713 int y1 = y_buf[(x >> 16) + 1];
michael@0 1714 int u0 = u_buf[(x >> 17)];
michael@0 1715 @@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint
michael@0 1716 y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
michael@0 1717 YuvPixel(y, u, v, rgb_buf+4);
michael@0 1718 x += source_dx;
michael@0 1719 }
michael@0 1720 rgb_buf += 8;
michael@0 1721 }
michael@0 1722 }
michael@0 1723
michael@0 1724 -#endif // USE_MMX
michael@0 1725 } // extern "C"
michael@0 1726
michael@0 1727 diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
michael@0 1728 --- a/gfx/ycbcr/yuv_row_posix.cpp
michael@0 1729 +++ b/gfx/ycbcr/yuv_row_posix.cpp
michael@0 1730 @@ -1,33 +1,32 @@
michael@0 1731 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
michael@0 1732 // Use of this source code is governed by a BSD-style license that can be
michael@0 1733 // found in the LICENSE file.
michael@0 1734
michael@0 1735 -#include "media/base/yuv_row.h"
michael@0 1736 -
michael@0 1737 -#ifdef _DEBUG
michael@0 1738 -#include "base/logging.h"
michael@0 1739 -#else
michael@0 1740 +#include "yuv_row.h"
michael@0 1741 +#include "mozilla/SSE.h"
michael@0 1742 +
michael@0 1743 #define DCHECK(a)
michael@0 1744 -#endif
michael@0 1745
michael@0 1746 extern "C" {
michael@0 1747
michael@0 1748 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
michael@0 1749 +#if defined(ARCH_CPU_X86_64)
michael@0 1750 +
michael@0 1751 +// We don't need CPUID guards here, since x86-64 implies SSE2.
michael@0 1752
michael@0 1753 // AMD64 ABI uses register paremters.
michael@0 1754 void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
michael@0 1755 const uint8* u_buf, // rsi
michael@0 1756 const uint8* v_buf, // rdx
michael@0 1757 uint8* rgb_buf, // rcx
michael@0 1758 int width) { // r8
michael@0 1759 asm(
michael@0 1760 - "jmp convertend\n"
michael@0 1761 -"convertloop:"
michael@0 1762 + "jmp 1f\n"
michael@0 1763 +"0:"
michael@0 1764 "movzb (%1),%%r10\n"
michael@0 1765 "add $0x1,%1\n"
michael@0 1766 "movzb (%2),%%r11\n"
michael@0 1767 "add $0x1,%2\n"
michael@0 1768 "movq 2048(%5,%%r10,8),%%xmm0\n"
michael@0 1769 "movzb (%0),%%r10\n"
michael@0 1770 "movq 4096(%5,%%r11,8),%%xmm1\n"
michael@0 1771 "movzb 0x1(%0),%%r11\n"
michael@0 1772 @@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint
michael@0 1773 "movq (%5,%%r11,8),%%xmm3\n"
michael@0 1774 "paddsw %%xmm0,%%xmm2\n"
michael@0 1775 "paddsw %%xmm0,%%xmm3\n"
michael@0 1776 "shufps $0x44,%%xmm3,%%xmm2\n"
michael@0 1777 "psraw $0x6,%%xmm2\n"
michael@0 1778 "packuswb %%xmm2,%%xmm2\n"
michael@0 1779 "movq %%xmm2,0x0(%3)\n"
michael@0 1780 "add $0x8,%3\n"
michael@0 1781 -"convertend:"
michael@0 1782 +"1:"
michael@0 1783 "sub $0x2,%4\n"
michael@0 1784 - "jns convertloop\n"
michael@0 1785 -
michael@0 1786 -"convertnext:"
michael@0 1787 + "jns 0b\n"
michael@0 1788 +
michael@0 1789 +"2:"
michael@0 1790 "add $0x1,%4\n"
michael@0 1791 - "js convertdone\n"
michael@0 1792 + "js 3f\n"
michael@0 1793
michael@0 1794 "movzb (%1),%%r10\n"
michael@0 1795 "movq 2048(%5,%%r10,8),%%xmm0\n"
michael@0 1796 "movzb (%2),%%r10\n"
michael@0 1797 "movq 4096(%5,%%r10,8),%%xmm1\n"
michael@0 1798 "paddsw %%xmm1,%%xmm0\n"
michael@0 1799 "movzb (%0),%%r10\n"
michael@0 1800 "movq (%5,%%r10,8),%%xmm1\n"
michael@0 1801 "paddsw %%xmm0,%%xmm1\n"
michael@0 1802 "psraw $0x6,%%xmm1\n"
michael@0 1803 "packuswb %%xmm1,%%xmm1\n"
michael@0 1804 "movd %%xmm1,0x0(%3)\n"
michael@0 1805 -"convertdone:"
michael@0 1806 +"3:"
michael@0 1807 :
michael@0 1808 : "r"(y_buf), // %0
michael@0 1809 "r"(u_buf), // %1
michael@0 1810 "r"(v_buf), // %2
michael@0 1811 "r"(rgb_buf), // %3
michael@0 1812 "r"(width), // %4
michael@0 1813 "r" (kCoefficientsRgbY) // %5
michael@0 1814 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
michael@0 1815 @@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b
michael@0 1816 const uint8* u_buf, // rsi
michael@0 1817 const uint8* v_buf, // rdx
michael@0 1818 uint8* rgb_buf, // rcx
michael@0 1819 int width, // r8
michael@0 1820 int source_dx) { // r9
michael@0 1821 asm(
michael@0 1822 "xor %%r11,%%r11\n"
michael@0 1823 "sub $0x2,%4\n"
michael@0 1824 - "js scalenext\n"
michael@0 1825 -
michael@0 1826 -"scaleloop:"
michael@0 1827 + "js 1f\n"
michael@0 1828 +
michael@0 1829 +"0:"
michael@0 1830 "mov %%r11,%%r10\n"
michael@0 1831 "sar $0x11,%%r10\n"
michael@0 1832 "movzb (%1,%%r10,1),%%rax\n"
michael@0 1833 "movq 2048(%5,%%rax,8),%%xmm0\n"
michael@0 1834 "movzb (%2,%%r10,1),%%rax\n"
michael@0 1835 "movq 4096(%5,%%rax,8),%%xmm1\n"
michael@0 1836 "lea (%%r11,%6),%%r10\n"
michael@0 1837 "sar $0x10,%%r11\n"
michael@0 1838 @@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b
michael@0 1839 "paddsw %%xmm0,%%xmm1\n"
michael@0 1840 "paddsw %%xmm0,%%xmm2\n"
michael@0 1841 "shufps $0x44,%%xmm2,%%xmm1\n"
michael@0 1842 "psraw $0x6,%%xmm1\n"
michael@0 1843 "packuswb %%xmm1,%%xmm1\n"
michael@0 1844 "movq %%xmm1,0x0(%3)\n"
michael@0 1845 "add $0x8,%3\n"
michael@0 1846 "sub $0x2,%4\n"
michael@0 1847 - "jns scaleloop\n"
michael@0 1848 -
michael@0 1849 -"scalenext:"
michael@0 1850 + "jns 0b\n"
michael@0 1851 +
michael@0 1852 +"1:"
michael@0 1853 "add $0x1,%4\n"
michael@0 1854 - "js scaledone\n"
michael@0 1855 + "js 2f\n"
michael@0 1856
michael@0 1857 "mov %%r11,%%r10\n"
michael@0 1858 "sar $0x11,%%r10\n"
michael@0 1859 "movzb (%1,%%r10,1),%%rax\n"
michael@0 1860 "movq 2048(%5,%%rax,8),%%xmm0\n"
michael@0 1861 "movzb (%2,%%r10,1),%%rax\n"
michael@0 1862 "movq 4096(%5,%%rax,8),%%xmm1\n"
michael@0 1863 "paddsw %%xmm1,%%xmm0\n"
michael@0 1864 "sar $0x10,%%r11\n"
michael@0 1865 "movzb (%0,%%r11,1),%%rax\n"
michael@0 1866 "movq (%5,%%rax,8),%%xmm1\n"
michael@0 1867 "paddsw %%xmm0,%%xmm1\n"
michael@0 1868 "psraw $0x6,%%xmm1\n"
michael@0 1869 "packuswb %%xmm1,%%xmm1\n"
michael@0 1870 "movd %%xmm1,0x0(%3)\n"
michael@0 1871
michael@0 1872 -"scaledone:"
michael@0 1873 +"2:"
michael@0 1874 :
michael@0 1875 : "r"(y_buf), // %0
michael@0 1876 "r"(u_buf), // %1
michael@0 1877 "r"(v_buf), // %2
michael@0 1878 "r"(rgb_buf), // %3
michael@0 1879 "r"(width), // %4
michael@0 1880 "r" (kCoefficientsRgbY), // %5
michael@0 1881 "r"(static_cast<long>(source_dx)) // %6
michael@0 1882 @@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint
michael@0 1883 const uint8* u_buf,
michael@0 1884 const uint8* v_buf,
michael@0 1885 uint8* rgb_buf,
michael@0 1886 int width,
michael@0 1887 int source_dx) {
michael@0 1888 asm(
michael@0 1889 "xor %%r11,%%r11\n" // x = 0
michael@0 1890 "sub $0x2,%4\n"
michael@0 1891 - "js .lscalenext\n"
michael@0 1892 + "js 2f\n"
michael@0 1893 "cmp $0x20000,%6\n" // if source_dx >= 2.0
michael@0 1894 - "jl .lscalehalf\n"
michael@0 1895 + "jl 0f\n"
michael@0 1896 "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
michael@0 1897 -".lscalehalf:"
michael@0 1898 -
michael@0 1899 -".lscaleloop:"
michael@0 1900 +"0:"
michael@0 1901 +
michael@0 1902 +"1:"
michael@0 1903 "mov %%r11,%%r10\n"
michael@0 1904 "sar $0x11,%%r10\n"
michael@0 1905
michael@0 1906 "movzb (%1, %%r10, 1), %%r13 \n"
michael@0 1907 "movzb 1(%1, %%r10, 1), %%r14 \n"
michael@0 1908 "mov %%r11, %%rax \n"
michael@0 1909 "and $0x1fffe, %%rax \n"
michael@0 1910 "imul %%rax, %%r14 \n"
michael@0 1911 @@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint
michael@0 1912 "paddsw %%xmm0,%%xmm1\n"
michael@0 1913 "paddsw %%xmm0,%%xmm2\n"
michael@0 1914 "shufps $0x44,%%xmm2,%%xmm1\n"
michael@0 1915 "psraw $0x6,%%xmm1\n"
michael@0 1916 "packuswb %%xmm1,%%xmm1\n"
michael@0 1917 "movq %%xmm1,0x0(%3)\n"
michael@0 1918 "add $0x8,%3\n"
michael@0 1919 "sub $0x2,%4\n"
michael@0 1920 - "jns .lscaleloop\n"
michael@0 1921 -
michael@0 1922 -".lscalenext:"
michael@0 1923 + "jns 1b\n"
michael@0 1924 +
michael@0 1925 +"2:"
michael@0 1926 "add $0x1,%4\n"
michael@0 1927 - "js .lscaledone\n"
michael@0 1928 + "js 3f\n"
michael@0 1929
michael@0 1930 "mov %%r11,%%r10\n"
michael@0 1931 "sar $0x11,%%r10\n"
michael@0 1932
michael@0 1933 "movzb (%1,%%r10,1), %%r13 \n"
michael@0 1934 "movq 2048(%5,%%r13,8),%%xmm0\n"
michael@0 1935
michael@0 1936 "movzb (%2,%%r10,1), %%r13 \n"
michael@0 1937 @@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint
michael@0 1938 "movzb (%0,%%r11,1), %%r13 \n"
michael@0 1939 "movq (%5,%%r13,8),%%xmm1\n"
michael@0 1940
michael@0 1941 "paddsw %%xmm0,%%xmm1\n"
michael@0 1942 "psraw $0x6,%%xmm1\n"
michael@0 1943 "packuswb %%xmm1,%%xmm1\n"
michael@0 1944 "movd %%xmm1,0x0(%3)\n"
michael@0 1945
michael@0 1946 -".lscaledone:"
michael@0 1947 +"3:"
michael@0 1948 :
michael@0 1949 : "r"(y_buf), // %0
michael@0 1950 "r"(u_buf), // %1
michael@0 1951 "r"(v_buf), // %2
michael@0 1952 "r"(rgb_buf), // %3
michael@0 1953 "r"(width), // %4
michael@0 1954 "r" (kCoefficientsRgbY), // %5
michael@0 1955 "r"(static_cast<long>(source_dx)) // %6
michael@0 1956 : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
michael@0 1957 );
michael@0 1958 }
michael@0 1959
michael@0 1960 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
michael@0 1961 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
michael@0 1962
michael@0 1963 // PIC version is slower because less registers are available, so
michael@0 1964 // non-PIC is used on platforms where it is possible.
michael@0 1965 -
michael@0 1966 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 1967 - const uint8* u_buf,
michael@0 1968 - const uint8* v_buf,
michael@0 1969 - uint8* rgb_buf,
michael@0 1970 - int width);
michael@0 1971 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
michael@0 1972 + const uint8* u_buf,
michael@0 1973 + const uint8* v_buf,
michael@0 1974 + uint8* rgb_buf,
michael@0 1975 + int width);
michael@0 1976 asm(
michael@0 1977 ".text\n"
michael@0 1978 - ".global FastConvertYUVToRGB32Row\n"
michael@0 1979 -"FastConvertYUVToRGB32Row:\n"
michael@0 1980 + ".global FastConvertYUVToRGB32Row_SSE\n"
michael@0 1981 + ".type FastConvertYUVToRGB32Row_SSE, @function\n"
michael@0 1982 +"FastConvertYUVToRGB32Row_SSE:\n"
michael@0 1983 "pusha\n"
michael@0 1984 "mov 0x24(%esp),%edx\n"
michael@0 1985 "mov 0x28(%esp),%edi\n"
michael@0 1986 "mov 0x2c(%esp),%esi\n"
michael@0 1987 "mov 0x30(%esp),%ebp\n"
michael@0 1988 "mov 0x34(%esp),%ecx\n"
michael@0 1989 - "jmp convertend\n"
michael@0 1990 -
michael@0 1991 -"convertloop:"
michael@0 1992 + "jmp 1f\n"
michael@0 1993 +
michael@0 1994 +"0:"
michael@0 1995 "movzbl (%edi),%eax\n"
michael@0 1996 "add $0x1,%edi\n"
michael@0 1997 "movzbl (%esi),%ebx\n"
michael@0 1998 "add $0x1,%esi\n"
michael@0 1999 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
michael@0 2000 "movzbl (%edx),%eax\n"
michael@0 2001 "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
michael@0 2002 "movzbl 0x1(%edx),%ebx\n"
michael@0 2003 @@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint
michael@0 2004 "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
michael@0 2005 "paddsw %mm0,%mm1\n"
michael@0 2006 "paddsw %mm0,%mm2\n"
michael@0 2007 "psraw $0x6,%mm1\n"
michael@0 2008 "psraw $0x6,%mm2\n"
michael@0 2009 "packuswb %mm2,%mm1\n"
michael@0 2010 "movntq %mm1,0x0(%ebp)\n"
michael@0 2011 "add $0x8,%ebp\n"
michael@0 2012 -"convertend:"
michael@0 2013 +"1:"
michael@0 2014 "sub $0x2,%ecx\n"
michael@0 2015 - "jns convertloop\n"
michael@0 2016 + "jns 0b\n"
michael@0 2017
michael@0 2018 "and $0x1,%ecx\n"
michael@0 2019 - "je convertdone\n"
michael@0 2020 + "je 2f\n"
michael@0 2021
michael@0 2022 "movzbl (%edi),%eax\n"
michael@0 2023 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
michael@0 2024 "movzbl (%esi),%eax\n"
michael@0 2025 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
michael@0 2026 "movzbl (%edx),%eax\n"
michael@0 2027 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
michael@0 2028 "paddsw %mm0,%mm1\n"
michael@0 2029 "psraw $0x6,%mm1\n"
michael@0 2030 "packuswb %mm1,%mm1\n"
michael@0 2031 "movd %mm1,0x0(%ebp)\n"
michael@0 2032 -"convertdone:"
michael@0 2033 +"2:"
michael@0 2034 "popa\n"
michael@0 2035 "ret\n"
michael@0 2036 +#if !defined(XP_MACOSX)
michael@0 2037 + ".previous\n"
michael@0 2038 +#endif
michael@0 2039 );
michael@0 2040
michael@0 2041 -
michael@0 2042 -void ScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 2043 - const uint8* u_buf,
michael@0 2044 - const uint8* v_buf,
michael@0 2045 - uint8* rgb_buf,
michael@0 2046 - int width,
michael@0 2047 - int source_dx);
michael@0 2048 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 2049 + const uint8* u_buf,
michael@0 2050 + const uint8* v_buf,
michael@0 2051 + uint8* rgb_buf,
michael@0 2052 + int width)
michael@0 2053 +{
michael@0 2054 + if (mozilla::supports_sse()) { // checked on every call; picks the assembly routine
michael@0 2055 + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
michael@0 2056 + return;
michael@0 2057 + }
michael@0 2058 +
michael@0 2059 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); // x_shift = 1: one U/V sample per two pixels
michael@0 2060 +}
michael@0 2061 +
michael@0 2062 +
michael@0 2063 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
michael@0 2064 + const uint8* u_buf,
michael@0 2065 + const uint8* v_buf,
michael@0 2066 + uint8* rgb_buf,
michael@0 2067 + int width,
michael@0 2068 + int source_dx);
michael@0 2069 asm(
michael@0 2070 ".text\n"
michael@0 2071 - ".global ScaleYUVToRGB32Row\n"
michael@0 2072 -"ScaleYUVToRGB32Row:\n"
michael@0 2073 + ".global ScaleYUVToRGB32Row_SSE\n"
michael@0 2074 + ".type ScaleYUVToRGB32Row_SSE, @function\n"
michael@0 2075 +"ScaleYUVToRGB32Row_SSE:\n"
michael@0 2076 "pusha\n"
michael@0 2077 "mov 0x24(%esp),%edx\n"
michael@0 2078 "mov 0x28(%esp),%edi\n"
michael@0 2079 "mov 0x2c(%esp),%esi\n"
michael@0 2080 "mov 0x30(%esp),%ebp\n"
michael@0 2081 "mov 0x34(%esp),%ecx\n"
michael@0 2082 "xor %ebx,%ebx\n"
michael@0 2083 - "jmp scaleend\n"
michael@0 2084 -
michael@0 2085 -"scaleloop:"
michael@0 2086 + "jmp 1f\n"
michael@0 2087 +
michael@0 2088 +"0:"
michael@0 2089 "mov %ebx,%eax\n"
michael@0 2090 "sar $0x11,%eax\n"
michael@0 2091 "movzbl (%edi,%eax,1),%eax\n"
michael@0 2092 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
michael@0 2093 "mov %ebx,%eax\n"
michael@0 2094 "sar $0x11,%eax\n"
michael@0 2095 "movzbl (%esi,%eax,1),%eax\n"
michael@0 2096 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
michael@0 2097 @@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
michael@0 2098 "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
michael@0 2099 "paddsw %mm0,%mm1\n"
michael@0 2100 "paddsw %mm0,%mm2\n"
michael@0 2101 "psraw $0x6,%mm1\n"
michael@0 2102 "psraw $0x6,%mm2\n"
michael@0 2103 "packuswb %mm2,%mm1\n"
michael@0 2104 "movntq %mm1,0x0(%ebp)\n"
michael@0 2105 "add $0x8,%ebp\n"
michael@0 2106 -"scaleend:"
michael@0 2107 +"1:"
michael@0 2108 "sub $0x2,%ecx\n"
michael@0 2109 - "jns scaleloop\n"
michael@0 2110 + "jns 0b\n"
michael@0 2111
michael@0 2112 "and $0x1,%ecx\n"
michael@0 2113 - "je scaledone\n"
michael@0 2114 + "je 2f\n"
michael@0 2115
michael@0 2116 "mov %ebx,%eax\n"
michael@0 2117 "sar $0x11,%eax\n"
michael@0 2118 "movzbl (%edi,%eax,1),%eax\n"
michael@0 2119 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
michael@0 2120 "mov %ebx,%eax\n"
michael@0 2121 "sar $0x11,%eax\n"
michael@0 2122 "movzbl (%esi,%eax,1),%eax\n"
michael@0 2123 @@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b
michael@0 2124 "sar $0x10,%eax\n"
michael@0 2125 "movzbl (%edx,%eax,1),%eax\n"
michael@0 2126 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
michael@0 2127 "paddsw %mm0,%mm1\n"
michael@0 2128 "psraw $0x6,%mm1\n"
michael@0 2129 "packuswb %mm1,%mm1\n"
michael@0 2130 "movd %mm1,0x0(%ebp)\n"
michael@0 2131
michael@0 2132 -"scaledone:"
michael@0 2133 +"2:"
michael@0 2134 "popa\n"
michael@0 2135 "ret\n"
michael@0 2136 +#if !defined(XP_MACOSX)
michael@0 2137 + ".previous\n"
michael@0 2138 +#endif
michael@0 2139 );
michael@0 2140
michael@0 2141 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 2142 - const uint8* u_buf,
michael@0 2143 - const uint8* v_buf,
michael@0 2144 - uint8* rgb_buf,
michael@0 2145 - int width,
michael@0 2146 - int source_dx);
michael@0 2147 +void ScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 2148 +                        const uint8* u_buf,
michael@0 2149 +                        const uint8* v_buf,
michael@0 2150 +                        uint8* rgb_buf,
michael@0 2151 +                        int width,
michael@0 2152 +                        int source_dx)
michael@0 2153 +{
michael@0 2154 +  // Dispatch per call: assembly routine when the CPU reports SSE, else C.
michael@0 2155 +  if (mozilla::supports_sse()) {
michael@0 2156 +    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
michael@0 2157 +    return;  // Do not fall through and convert the same row again in C.
michael@0 2158 +  }
michael@0 2159 +
michael@0 2160 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
michael@0 2161 +}
michael@0 2162 +
michael@0 2163 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
michael@0 2164 + const uint8* u_buf,
michael@0 2165 + const uint8* v_buf,
michael@0 2166 + uint8* rgb_buf,
michael@0 2167 + int width,
michael@0 2168 + int source_dx);
michael@0 2169 asm(
michael@0 2170 ".text\n"
michael@0 2171 - ".global LinearScaleYUVToRGB32Row\n"
michael@0 2172 -"LinearScaleYUVToRGB32Row:\n"
michael@0 2173 + ".global LinearScaleYUVToRGB32Row_SSE\n"
michael@0 2174 + ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
michael@0 2175 +"LinearScaleYUVToRGB32Row_SSE:\n"
michael@0 2176 "pusha\n"
michael@0 2177 "mov 0x24(%esp),%edx\n"
michael@0 2178 "mov 0x28(%esp),%edi\n"
michael@0 2179 "mov 0x30(%esp),%ebp\n"
michael@0 2180
michael@0 2181 // source_width = width * source_dx + ebx
michael@0 2182 "mov 0x34(%esp), %ecx\n"
michael@0 2183 "imull 0x38(%esp), %ecx\n"
michael@0 2184 "mov %ecx, 0x34(%esp)\n"
michael@0 2185
michael@0 2186 "mov 0x38(%esp), %ecx\n"
michael@0 2187 "xor %ebx,%ebx\n" // x = 0
michael@0 2188 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
michael@0 2189 - "jl .lscaleend\n"
michael@0 2190 + "jl 1f\n"
michael@0 2191 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
michael@0 2192 - "jmp .lscaleend\n"
michael@0 2193 -
michael@0 2194 -".lscaleloop:"
michael@0 2195 - "mov %ebx,%eax\n"
michael@0 2196 - "sar $0x11,%eax\n"
michael@0 2197 + "jmp 1f\n"
michael@0 2198 +
michael@0 2199 +"0:"
michael@0 2200 + "mov %ebx,%eax\n"
michael@0 2201 + "sar $0x11,%eax\n"
michael@0 2202
michael@0 2203 "movzbl (%edi,%eax,1),%ecx\n"
michael@0 2204 "movzbl 1(%edi,%eax,1),%esi\n"
michael@0 2205 "mov %ebx,%eax\n"
michael@0 2206 "andl $0x1fffe, %eax \n"
michael@0 2207 "imul %eax, %esi \n"
michael@0 2208 "xorl $0x1fffe, %eax \n"
michael@0 2209 "imul %eax, %ecx \n"
michael@0 2210 @@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint
michael@0 2211 "imul %eax, %esi \n"
michael@0 2212 "xorl $0xffff, %eax \n"
michael@0 2213 "imul %eax, %ecx \n"
michael@0 2214 "addl %esi, %ecx \n"
michael@0 2215 "shrl $16, %ecx \n"
michael@0 2216 "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
michael@0 2217
michael@0 2218 "cmp 0x34(%esp), %ebx\n"
michael@0 2219 - "jge .lscalelastpixel\n"
michael@0 2220 + "jge 2f\n"
michael@0 2221
michael@0 2222 "mov %ebx,%eax\n"
michael@0 2223 "sar $0x10,%eax\n"
michael@0 2224 "movzbl (%edx,%eax,1),%ecx\n"
michael@0 2225 "movzbl 1(%edx,%eax,1),%esi\n"
michael@0 2226 "mov %ebx,%eax\n"
michael@0 2227 "add 0x38(%esp),%ebx\n"
michael@0 2228 "andl $0xffff, %eax \n"
michael@0 2229 @@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint
michael@0 2230 "paddsw %mm0,%mm1\n"
michael@0 2231 "paddsw %mm0,%mm2\n"
michael@0 2232 "psraw $0x6,%mm1\n"
michael@0 2233 "psraw $0x6,%mm2\n"
michael@0 2234 "packuswb %mm2,%mm1\n"
michael@0 2235 "movntq %mm1,0x0(%ebp)\n"
michael@0 2236 "add $0x8,%ebp\n"
michael@0 2237
michael@0 2238 -".lscaleend:"
michael@0 2239 +"1:"
michael@0 2240 "cmp 0x34(%esp), %ebx\n"
michael@0 2241 - "jl .lscaleloop\n"
michael@0 2242 + "jl 0b\n"
michael@0 2243 "popa\n"
michael@0 2244 "ret\n"
michael@0 2245
michael@0 2246 -".lscalelastpixel:"
michael@0 2247 +"2:"
michael@0 2248 "paddsw %mm0, %mm1\n"
michael@0 2249 "psraw $6, %mm1\n"
michael@0 2250 "packuswb %mm1, %mm1\n"
michael@0 2251 "movd %mm1, (%ebp)\n"
michael@0 2252 "popa\n"
michael@0 2253 "ret\n"
michael@0 2254 +#if !defined(XP_MACOSX)
michael@0 2255 + ".previous\n"
michael@0 2256 +#endif
michael@0 2257 );
michael@0 2258
michael@0 2259 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
michael@0 2260 -
michael@0 2261 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 2262 - const uint8* u_buf,
michael@0 2263 - const uint8* v_buf,
michael@0 2264 - uint8* rgb_buf,
michael@0 2265 - int width,
michael@0 2266 - int16 *kCoefficientsRgbY);
michael@0 2267 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 2268 +                              const uint8* u_buf,
michael@0 2269 +                              const uint8* v_buf,
michael@0 2270 +                              uint8* rgb_buf,
michael@0 2271 +                              int width,
michael@0 2272 +                              int source_dx)
michael@0 2273 +{
michael@0 2274 +  if (mozilla::supports_sse()) {  // Checked per call; picks the assembly routine.
michael@0 2275 +    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
michael@0 2276 +                                 width, source_dx);
michael@0 2277 +    return;  // Do not fall through and convert the same row again in C.
michael@0 2278 +  }
michael@0 2279 +
michael@0 2280 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
michael@0 2281 +}
michael@0 2282 +
michael@0 2283 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
michael@0 2284 +
michael@0 2285 +void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
michael@0 2286 + const uint8* u_buf,
michael@0 2287 + const uint8* v_buf,
michael@0 2288 + uint8* rgb_buf,
michael@0 2289 + int width,
michael@0 2290 + int16 *kCoefficientsRgbY);
michael@0 2291 +
michael@0 2292 asm(
michael@0 2293 ".text\n"
michael@0 2294 -#if defined(OS_MACOSX)
michael@0 2295 -"_PICConvertYUVToRGB32Row:\n"
michael@0 2296 +#if defined(XP_MACOSX)
michael@0 2297 +"_PICConvertYUVToRGB32Row_SSE:\n"
michael@0 2298 #else
michael@0 2299 -"PICConvertYUVToRGB32Row:\n"
michael@0 2300 +"PICConvertYUVToRGB32Row_SSE:\n"
michael@0 2301 #endif
michael@0 2302 "pusha\n"
michael@0 2303 "mov 0x24(%esp),%edx\n"
michael@0 2304 "mov 0x28(%esp),%edi\n"
michael@0 2305 "mov 0x2c(%esp),%esi\n"
michael@0 2306 "mov 0x30(%esp),%ebp\n"
michael@0 2307 "mov 0x38(%esp),%ecx\n"
michael@0 2308
michael@0 2309 - "jmp .Lconvertend\n"
michael@0 2310 -
michael@0 2311 -".Lconvertloop:"
michael@0 2312 + "jmp 1f\n"
michael@0 2313 +
michael@0 2314 +"0:"
michael@0 2315 "movzbl (%edi),%eax\n"
michael@0 2316 "add $0x1,%edi\n"
michael@0 2317 "movzbl (%esi),%ebx\n"
michael@0 2318 "add $0x1,%esi\n"
michael@0 2319 "movq 2048(%ecx,%eax,8),%mm0\n"
michael@0 2320 "movzbl (%edx),%eax\n"
michael@0 2321 "paddsw 4096(%ecx,%ebx,8),%mm0\n"
michael@0 2322 "movzbl 0x1(%edx),%ebx\n"
michael@0 2323 @@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons
michael@0 2324 "movq 0(%ecx,%ebx,8),%mm2\n"
michael@0 2325 "paddsw %mm0,%mm1\n"
michael@0 2326 "paddsw %mm0,%mm2\n"
michael@0 2327 "psraw $0x6,%mm1\n"
michael@0 2328 "psraw $0x6,%mm2\n"
michael@0 2329 "packuswb %mm2,%mm1\n"
michael@0 2330 "movntq %mm1,0x0(%ebp)\n"
michael@0 2331 "add $0x8,%ebp\n"
michael@0 2332 -".Lconvertend:"
michael@0 2333 +"1:"
michael@0 2334 "subl $0x2,0x34(%esp)\n"
michael@0 2335 - "jns .Lconvertloop\n"
michael@0 2336 + "jns 0b\n"
michael@0 2337
michael@0 2338 "andl $0x1,0x34(%esp)\n"
michael@0 2339 - "je .Lconvertdone\n"
michael@0 2340 + "je 2f\n"
michael@0 2341
michael@0 2342 "movzbl (%edi),%eax\n"
michael@0 2343 "movq 2048(%ecx,%eax,8),%mm0\n"
michael@0 2344 "movzbl (%esi),%eax\n"
michael@0 2345 "paddsw 4096(%ecx,%eax,8),%mm0\n"
michael@0 2346 "movzbl (%edx),%eax\n"
michael@0 2347 "movq 0(%ecx,%eax,8),%mm1\n"
michael@0 2348 "paddsw %mm0,%mm1\n"
michael@0 2349 "psraw $0x6,%mm1\n"
michael@0 2350 "packuswb %mm1,%mm1\n"
michael@0 2351 "movd %mm1,0x0(%ebp)\n"
michael@0 2352 -".Lconvertdone:\n"
michael@0 2353 +"2:"
michael@0 2354 "popa\n"
michael@0 2355 "ret\n"
michael@0 2356 +#if !defined(XP_MACOSX)
michael@0 2357 + ".previous\n"
michael@0 2358 +#endif
michael@0 2359 );
michael@0 2360
michael@0 2361 void FastConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 2362 const uint8* u_buf,
michael@0 2363 const uint8* v_buf,
michael@0 2364 uint8* rgb_buf,
michael@0 2365 - int width) {
michael@0 2366 - PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
michael@0 2367 - &kCoefficientsRgbY[0][0]);
michael@0 2368 -}
michael@0 2369 -
michael@0 2370 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 2371 + int width)
michael@0 2372 +{
michael@0 2373 + if (mozilla::supports_sse()) { // checked on every call; picks the assembly routine
michael@0 2374 + PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
michael@0 2375 + &kCoefficientsRgbY[0][0]); // table address is passed in; the asm indexes it through %ecx
michael@0 2376 + return;
michael@0 2377 + }
michael@0 2378 +
michael@0 2379 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); // x_shift = 1: one U/V sample per two pixels
michael@0 2380 +}
michael@0 2381 +
michael@0 2382 +void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
michael@0 2383 const uint8* u_buf,
michael@0 2384 const uint8* v_buf,
michael@0 2385 uint8* rgb_buf,
michael@0 2386 int width,
michael@0 2387 int source_dx,
michael@0 2388 int16 *kCoefficientsRgbY);
michael@0 2389
michael@0 2390 asm(
michael@0 2391 ".text\n"
michael@0 2392 -#if defined(OS_MACOSX)
michael@0 2393 -"_PICScaleYUVToRGB32Row:\n"
michael@0 2394 +#if defined(XP_MACOSX)
michael@0 2395 +"_PICScaleYUVToRGB32Row_SSE:\n"
michael@0 2396 #else
michael@0 2397 -"PICScaleYUVToRGB32Row:\n"
michael@0 2398 +"PICScaleYUVToRGB32Row_SSE:\n"
michael@0 2399 #endif
michael@0 2400 "pusha\n"
michael@0 2401 "mov 0x24(%esp),%edx\n"
michael@0 2402 "mov 0x28(%esp),%edi\n"
michael@0 2403 "mov 0x2c(%esp),%esi\n"
michael@0 2404 "mov 0x30(%esp),%ebp\n"
michael@0 2405 "mov 0x3c(%esp),%ecx\n"
michael@0 2406 "xor %ebx,%ebx\n"
michael@0 2407 - "jmp Lscaleend\n"
michael@0 2408 -
michael@0 2409 -"Lscaleloop:"
michael@0 2410 + "jmp 1f\n"
michael@0 2411 +
michael@0 2412 +"0:"
michael@0 2413 "mov %ebx,%eax\n"
michael@0 2414 "sar $0x11,%eax\n"
michael@0 2415 "movzbl (%edi,%eax,1),%eax\n"
michael@0 2416 "movq 2048(%ecx,%eax,8),%mm0\n"
michael@0 2417 "mov %ebx,%eax\n"
michael@0 2418 "sar $0x11,%eax\n"
michael@0 2419 "movzbl (%esi,%eax,1),%eax\n"
michael@0 2420 "paddsw 4096(%ecx,%eax,8),%mm0\n"
michael@0 2421 @@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const
michael@0 2422 "movq 0(%ecx,%eax,8),%mm2\n"
michael@0 2423 "paddsw %mm0,%mm1\n"
michael@0 2424 "paddsw %mm0,%mm2\n"
michael@0 2425 "psraw $0x6,%mm1\n"
michael@0 2426 "psraw $0x6,%mm2\n"
michael@0 2427 "packuswb %mm2,%mm1\n"
michael@0 2428 "movntq %mm1,0x0(%ebp)\n"
michael@0 2429 "add $0x8,%ebp\n"
michael@0 2430 -"Lscaleend:"
michael@0 2431 +"1:"
michael@0 2432 "subl $0x2,0x34(%esp)\n"
michael@0 2433 - "jns Lscaleloop\n"
michael@0 2434 + "jns 0b\n"
michael@0 2435
michael@0 2436 "andl $0x1,0x34(%esp)\n"
michael@0 2437 - "je Lscaledone\n"
michael@0 2438 + "je 2f\n"
michael@0 2439
michael@0 2440 "mov %ebx,%eax\n"
michael@0 2441 "sar $0x11,%eax\n"
michael@0 2442 "movzbl (%edi,%eax,1),%eax\n"
michael@0 2443 "movq 2048(%ecx,%eax,8),%mm0\n"
michael@0 2444 "mov %ebx,%eax\n"
michael@0 2445 "sar $0x11,%eax\n"
michael@0 2446 "movzbl (%esi,%eax,1),%eax\n"
michael@0 2447 @@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const
michael@0 2448 "sar $0x10,%eax\n"
michael@0 2449 "movzbl (%edx,%eax,1),%eax\n"
michael@0 2450 "movq 0(%ecx,%eax,8),%mm1\n"
michael@0 2451 "paddsw %mm0,%mm1\n"
michael@0 2452 "psraw $0x6,%mm1\n"
michael@0 2453 "packuswb %mm1,%mm1\n"
michael@0 2454 "movd %mm1,0x0(%ebp)\n"
michael@0 2455
michael@0 2456 -"Lscaledone:"
michael@0 2457 +"2:"
michael@0 2458 "popa\n"
michael@0 2459 "ret\n"
michael@0 2460 +#if !defined(XP_MACOSX)
michael@0 2461 + ".previous\n"
michael@0 2462 +#endif
michael@0 2463 );
michael@0 2464
michael@0 2465 -
michael@0 2466 void ScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 2467 const uint8* u_buf,
michael@0 2468 const uint8* v_buf,
michael@0 2469 uint8* rgb_buf,
michael@0 2470 int width,
michael@0 2471 - int source_dx) {
michael@0 2472 - PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
michael@0 2473 - &kCoefficientsRgbY[0][0]);
michael@0 2474 -}
michael@0 2475 -
michael@0 2476 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 2477 - const uint8* u_buf,
michael@0 2478 - const uint8* v_buf,
michael@0 2479 - uint8* rgb_buf,
michael@0 2480 - int width,
michael@0 2481 - int source_dx,
michael@0 2482 - int16 *kCoefficientsRgbY);
michael@0 2483 + int source_dx)
michael@0 2484 +{
michael@0 2485 + if (mozilla::supports_sse()) {
michael@0 2486 + PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
michael@0 2487 + &kCoefficientsRgbY[0][0]);
michael@0 2488 + return;
michael@0 2489 + }
michael@0 2490 +
michael@0 2491 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
michael@0 2492 +}
michael@0 2493 +
michael@0 2494 +void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
michael@0 2495 + const uint8* u_buf,
michael@0 2496 + const uint8* v_buf,
michael@0 2497 + uint8* rgb_buf,
michael@0 2498 + int width,
michael@0 2499 + int source_dx,
michael@0 2500 + int16 *kCoefficientsRgbY);
michael@0 2501 +
michael@0 2502 asm(
michael@0 2503 ".text\n"
michael@0 2504 -#if defined(OS_MACOSX)
michael@0 2505 -"_PICLinearScaleYUVToRGB32Row:\n"
michael@0 2506 +#if defined(XP_MACOSX)
michael@0 2507 +"_PICLinearScaleYUVToRGB32Row_SSE:\n"
michael@0 2508 #else
michael@0 2509 -"PICLinearScaleYUVToRGB32Row:\n"
michael@0 2510 +"PICLinearScaleYUVToRGB32Row_SSE:\n"
michael@0 2511 #endif
michael@0 2512 "pusha\n"
michael@0 2513 "mov 0x24(%esp),%edx\n"
michael@0 2514 "mov 0x30(%esp),%ebp\n"
michael@0 2515 "mov 0x34(%esp),%ecx\n"
michael@0 2516 "mov 0x3c(%esp),%edi\n"
michael@0 2517 "xor %ebx,%ebx\n"
michael@0 2518
michael@0 2519 // source_width = width * source_dx + ebx
michael@0 2520 "mov 0x34(%esp), %ecx\n"
michael@0 2521 "imull 0x38(%esp), %ecx\n"
michael@0 2522 "mov %ecx, 0x34(%esp)\n"
michael@0 2523
michael@0 2524 "mov 0x38(%esp), %ecx\n"
michael@0 2525 "xor %ebx,%ebx\n" // x = 0
michael@0 2526 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
michael@0 2527 - "jl .lscaleend\n"
michael@0 2528 + "jl 1f\n"
michael@0 2529 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
michael@0 2530 - "jmp .lscaleend\n"
michael@0 2531 -
michael@0 2532 -".lscaleloop:"
michael@0 2533 + "jmp 1f\n"
michael@0 2534 +
michael@0 2535 +"0:"
michael@0 2536 "mov 0x28(%esp),%esi\n"
michael@0 2537 "mov %ebx,%eax\n"
michael@0 2538 "sar $0x11,%eax\n"
michael@0 2539
michael@0 2540 "movzbl (%esi,%eax,1),%ecx\n"
michael@0 2541 "movzbl 1(%esi,%eax,1),%esi\n"
michael@0 2542 "mov %ebx,%eax\n"
michael@0 2543 "andl $0x1fffe, %eax \n"
michael@0 2544 @@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u
michael@0 2545 "imul %eax, %esi \n"
michael@0 2546 "xorl $0xffff, %eax \n"
michael@0 2547 "imul %eax, %ecx \n"
michael@0 2548 "addl %esi, %ecx \n"
michael@0 2549 "shrl $16, %ecx \n"
michael@0 2550 "movq (%edi,%ecx,8),%mm1\n"
michael@0 2551
michael@0 2552 "cmp 0x34(%esp), %ebx\n"
michael@0 2553 - "jge .lscalelastpixel\n"
michael@0 2554 + "jge 2f\n"
michael@0 2555
michael@0 2556 "mov %ebx,%eax\n"
michael@0 2557 "sar $0x10,%eax\n"
michael@0 2558 "movzbl (%edx,%eax,1),%ecx\n"
michael@0 2559 "movzbl 1(%edx,%eax,1),%esi\n"
michael@0 2560 "mov %ebx,%eax\n"
michael@0 2561 "add 0x38(%esp),%ebx\n"
michael@0 2562 "andl $0xffff, %eax \n"
michael@0 2563 @@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u
michael@0 2564 "paddsw %mm0,%mm1\n"
michael@0 2565 "paddsw %mm0,%mm2\n"
michael@0 2566 "psraw $0x6,%mm1\n"
michael@0 2567 "psraw $0x6,%mm2\n"
michael@0 2568 "packuswb %mm2,%mm1\n"
michael@0 2569 "movntq %mm1,0x0(%ebp)\n"
michael@0 2570 "add $0x8,%ebp\n"
michael@0 2571
michael@0 2572 -".lscaleend:"
michael@0 2573 +"1:"
michael@0 2574 "cmp %ebx, 0x34(%esp)\n"
michael@0 2575 - "jg .lscaleloop\n"
michael@0 2576 + "jg 0b\n"
michael@0 2577 "popa\n"
michael@0 2578 "ret\n"
michael@0 2579
michael@0 2580 -".lscalelastpixel:"
michael@0 2581 +"2:"
michael@0 2582 "paddsw %mm0, %mm1\n"
michael@0 2583 "psraw $6, %mm1\n"
michael@0 2584 "packuswb %mm1, %mm1\n"
michael@0 2585 "movd %mm1, (%ebp)\n"
michael@0 2586 "popa\n"
michael@0 2587 "ret\n"
michael@0 2588 +#if !defined(XP_MACOSX)
michael@0 2589 + ".previous\n"
michael@0 2590 +#endif
michael@0 2591 );
michael@0 2592
michael@0 2593 +
michael@0 2594 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 2595 - const uint8* u_buf,
michael@0 2596 - const uint8* v_buf,
michael@0 2597 - uint8* rgb_buf,
michael@0 2598 - int width,
michael@0 2599 - int source_dx) {
michael@0 2600 - PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
michael@0 2601 - &kCoefficientsRgbY[0][0]);
michael@0 2602 -}
michael@0 2603 -
michael@0 2604 -#else // USE_MMX
michael@0 2605 -
michael@0 2606 -// C reference code that mimic the YUV assembly.
michael@0 2607 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
michael@0 2608 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
michael@0 2609 - (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
michael@0 2610 -
michael@0 2611 -static inline void YuvPixel(uint8 y,
michael@0 2612 - uint8 u,
michael@0 2613 - uint8 v,
michael@0 2614 - uint8* rgb_buf) {
michael@0 2615 -
michael@0 2616 - int b = kCoefficientsRgbY[256+u][0];
michael@0 2617 - int g = kCoefficientsRgbY[256+u][1];
michael@0 2618 - int r = kCoefficientsRgbY[256+u][2];
michael@0 2619 - int a = kCoefficientsRgbY[256+u][3];
michael@0 2620 -
michael@0 2621 - b = paddsw(b, kCoefficientsRgbY[512+v][0]);
michael@0 2622 - g = paddsw(g, kCoefficientsRgbY[512+v][1]);
michael@0 2623 - r = paddsw(r, kCoefficientsRgbY[512+v][2]);
michael@0 2624 - a = paddsw(a, kCoefficientsRgbY[512+v][3]);
michael@0 2625 -
michael@0 2626 - b = paddsw(b, kCoefficientsRgbY[y][0]);
michael@0 2627 - g = paddsw(g, kCoefficientsRgbY[y][1]);
michael@0 2628 - r = paddsw(r, kCoefficientsRgbY[y][2]);
michael@0 2629 - a = paddsw(a, kCoefficientsRgbY[y][3]);
michael@0 2630 -
michael@0 2631 - b >>= 6;
michael@0 2632 - g >>= 6;
michael@0 2633 - r >>= 6;
michael@0 2634 - a >>= 6;
michael@0 2635 -
michael@0 2636 - *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
michael@0 2637 - (packuswb(g) << 8) |
michael@0 2638 - (packuswb(r) << 16) |
michael@0 2639 - (packuswb(a) << 24);
michael@0 2640 -}
michael@0 2641 -
michael@0 2642 + const uint8* u_buf,
michael@0 2643 + const uint8* v_buf,
michael@0 2644 + uint8* rgb_buf,
michael@0 2645 + int width,
michael@0 2646 + int source_dx)
michael@0 2647 +{
michael@0 2648 + if (mozilla::supports_sse()) {
michael@0 2649 + PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
michael@0 2650 + source_dx, &kCoefficientsRgbY[0][0]);
michael@0 2651 + return;
michael@0 2652 + }
michael@0 2653 +
michael@0 2654 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
michael@0 2655 +}
michael@0 2656 +#else
michael@0 2657 void FastConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 2658 const uint8* u_buf,
michael@0 2659 const uint8* v_buf,
michael@0 2660 uint8* rgb_buf,
michael@0 2661 int width) {
michael@0 2662 - for (int x = 0; x < width; x += 2) {
michael@0 2663 - uint8 u = u_buf[x >> 1];
michael@0 2664 - uint8 v = v_buf[x >> 1];
michael@0 2665 - uint8 y0 = y_buf[x];
michael@0 2666 - YuvPixel(y0, u, v, rgb_buf);
michael@0 2667 - if ((x + 1) < width) {
michael@0 2668 - uint8 y1 = y_buf[x + 1];
michael@0 2669 - YuvPixel(y1, u, v, rgb_buf + 4);
michael@0 2670 - }
michael@0 2671 - rgb_buf += 8; // Advance 2 pixels.
michael@0 2672 - }
michael@0 2673 -}
michael@0 2674 -
michael@0 2675 -// 16.16 fixed point is used. A shift by 16 isolates the integer.
michael@0 2676 -// A shift by 17 is used to further subsample the chrominence channels.
michael@0 2677 -// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
michael@0 2678 -// for 1/65536 pixel accurate interpolation.
michael@0 2679 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
michael@0 2680 +}
michael@0 2681 +
michael@0 2682 void ScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 2683 const uint8* u_buf,
michael@0 2684 const uint8* v_buf,
michael@0 2685 uint8* rgb_buf,
michael@0 2686 int width,
michael@0 2687 int source_dx) {
michael@0 2688 - int x = 0;
michael@0 2689 - for (int i = 0; i < width; i += 2) {
michael@0 2690 - int y = y_buf[x >> 16];
michael@0 2691 - int u = u_buf[(x >> 17)];
michael@0 2692 - int v = v_buf[(x >> 17)];
michael@0 2693 - YuvPixel(y, u, v, rgb_buf);
michael@0 2694 - x += source_dx;
michael@0 2695 - if ((i + 1) < width) {
michael@0 2696 - y = y_buf[x >> 16];
michael@0 2697 - YuvPixel(y, u, v, rgb_buf+4);
michael@0 2698 - x += source_dx;
michael@0 2699 - }
michael@0 2700 - rgb_buf += 8;
michael@0 2701 - }
michael@0 2702 -}
michael@0 2703 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
michael@0 2704 +}
michael@0 2705
michael@0 2706 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 2707 const uint8* u_buf,
michael@0 2708 const uint8* v_buf,
michael@0 2709 uint8* rgb_buf,
michael@0 2710 int width,
michael@0 2711 int source_dx) {
michael@0 2712 - int x = 0;
michael@0 2713 - if (source_dx >= 0x20000) {
michael@0 2714 - x = 32768;
michael@0 2715 - }
michael@0 2716 - for (int i = 0; i < width; i += 2) {
michael@0 2717 - int y0 = y_buf[x >> 16];
michael@0 2718 - int y1 = y_buf[(x >> 16) + 1];
michael@0 2719 - int u0 = u_buf[(x >> 17)];
michael@0 2720 - int u1 = u_buf[(x >> 17) + 1];
michael@0 2721 - int v0 = v_buf[(x >> 17)];
michael@0 2722 - int v1 = v_buf[(x >> 17) + 1];
michael@0 2723 - int y_frac = (x & 65535);
michael@0 2724 - int uv_frac = ((x >> 1) & 65535);
michael@0 2725 - int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
michael@0 2726 - int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
michael@0 2727 - int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
michael@0 2728 - YuvPixel(y, u, v, rgb_buf);
michael@0 2729 - x += source_dx;
michael@0 2730 - if ((i + 1) < width) {
michael@0 2731 - y0 = y_buf[x >> 16];
michael@0 2732 - y1 = y_buf[(x >> 16) + 1];
michael@0 2733 - y_frac = (x & 65535);
michael@0 2734 - y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
michael@0 2735 - YuvPixel(y, u, v, rgb_buf+4);
michael@0 2736 - x += source_dx;
michael@0 2737 - }
michael@0 2738 - rgb_buf += 8;
michael@0 2739 - }
michael@0 2740 -}
michael@0 2741 -
michael@0 2742 -#endif // USE_MMX
michael@0 2743 -} // extern "C"
michael@0 2744 -
michael@0 2745 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
michael@0 2746 +}
michael@0 2747 +#endif
michael@0 2748 +
michael@0 2749 +}
michael@0 2750 diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
michael@0 2751 --- a/gfx/ycbcr/yuv_row_table.cpp
michael@0 2752 +++ b/gfx/ycbcr/yuv_row_table.cpp
michael@0 2753 @@ -1,13 +1,13 @@
michael@0 2754 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
michael@0 2755 // Use of this source code is governed by a BSD-style license that can be
michael@0 2756 // found in the LICENSE file.
michael@0 2757
michael@0 2758 -#include "media/base/yuv_row.h"
michael@0 2759 +#include "yuv_row.h"
michael@0 2760
michael@0 2761 extern "C" {
michael@0 2762
michael@0 2763 #define RGBY(i) { \
michael@0 2764 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
michael@0 2765 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
michael@0 2766 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
michael@0 2767 0 \
michael@0 2768 diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
michael@0 2769 --- a/gfx/ycbcr/yuv_row_win.cpp
michael@0 2770 +++ b/gfx/ycbcr/yuv_row_win.cpp
michael@0 2771 @@ -1,26 +1,27 @@
michael@0 2772 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
michael@0 2773 // Use of this source code is governed by a BSD-style license that can be
michael@0 2774 // found in the LICENSE file.
michael@0 2775
michael@0 2776 -#include "media/base/yuv_row.h"
michael@0 2777 +#include "yuv_row.h"
michael@0 2778 +#include "mozilla/SSE.h"
michael@0 2779
michael@0 2780 #define kCoefficientsRgbU kCoefficientsRgbY + 2048
michael@0 2781 #define kCoefficientsRgbV kCoefficientsRgbY + 4096
michael@0 2782
michael@0 2783 extern "C" {
michael@0 2784
michael@0 2785 -#if USE_MMX
michael@0 2786 -__declspec(naked)
michael@0 2787 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 2788 - const uint8* u_buf,
michael@0 2789 - const uint8* v_buf,
michael@0 2790 - uint8* rgb_buf,
michael@0 2791 - int width) {
michael@0 2792 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
michael@0 2793 +__declspec(naked)
michael@0 2794 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
michael@0 2795 + const uint8* u_buf,
michael@0 2796 + const uint8* v_buf,
michael@0 2797 + uint8* rgb_buf,
michael@0 2798 + int width) {
michael@0 2799 __asm {
michael@0 2800 pushad
michael@0 2801 mov edx, [esp + 32 + 4] // Y
michael@0 2802 mov edi, [esp + 32 + 8] // U
michael@0 2803 mov esi, [esp + 32 + 12] // V
michael@0 2804 mov ebp, [esp + 32 + 16] // rgb
michael@0 2805 mov ecx, [esp + 32 + 20] // width
michael@0 2806 jmp convertend
michael@0 2807 @@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint
michael@0 2808 convertdone :
michael@0 2809
michael@0 2810 popad
michael@0 2811 ret
michael@0 2812 }
michael@0 2813 }
michael@0 2814
michael@0 2815 __declspec(naked)
michael@0 2816 -void ConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 2817 - const uint8* u_buf,
michael@0 2818 - const uint8* v_buf,
michael@0 2819 - uint8* rgb_buf,
michael@0 2820 - int width,
michael@0 2821 - int step) {
michael@0 2822 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
michael@0 2823 + const uint8* u_buf,
michael@0 2824 + const uint8* v_buf,
michael@0 2825 + uint8* rgb_buf,
michael@0 2826 + int width,
michael@0 2827 + int step) {
michael@0 2828 __asm {
michael@0 2829 pushad
michael@0 2830 mov edx, [esp + 32 + 4] // Y
michael@0 2831 mov edi, [esp + 32 + 8] // U
michael@0 2832 mov esi, [esp + 32 + 12] // V
michael@0 2833 mov ebp, [esp + 32 + 16] // rgb
michael@0 2834 mov ecx, [esp + 32 + 20] // width
michael@0 2835 mov ebx, [esp + 32 + 24] // step
michael@0 2836 @@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y
michael@0 2837 wdone :
michael@0 2838
michael@0 2839 popad
michael@0 2840 ret
michael@0 2841 }
michael@0 2842 }
michael@0 2843
michael@0 2844 __declspec(naked)
michael@0 2845 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 2846 - const uint8* u_buf,
michael@0 2847 - const uint8* v_buf,
michael@0 2848 - uint8* rgb_buf,
michael@0 2849 - int width,
michael@0 2850 - int ystep,
michael@0 2851 - int uvstep) {
michael@0 2852 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
michael@0 2853 + const uint8* u_buf,
michael@0 2854 + const uint8* v_buf,
michael@0 2855 + uint8* rgb_buf,
michael@0 2856 + int width,
michael@0 2857 + int ystep,
michael@0 2858 + int uvstep) {
michael@0 2859 __asm {
michael@0 2860 pushad
michael@0 2861 mov edx, [esp + 32 + 4] // Y
michael@0 2862 mov edi, [esp + 32 + 8] // U
michael@0 2863 mov esi, [esp + 32 + 12] // V
michael@0 2864 mov ebp, [esp + 32 + 16] // rgb
michael@0 2865 mov ecx, [esp + 32 + 20] // width
michael@0 2866 jmp wend
michael@0 2867 @@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui
michael@0 2868 wdone :
michael@0 2869
michael@0 2870 popad
michael@0 2871 ret
michael@0 2872 }
michael@0 2873 }
michael@0 2874
michael@0 2875 __declspec(naked)
michael@0 2876 -void DoubleYUVToRGB32Row(const uint8* y_buf,
michael@0 2877 - const uint8* u_buf,
michael@0 2878 - const uint8* v_buf,
michael@0 2879 - uint8* rgb_buf,
michael@0 2880 - int width) {
michael@0 2881 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
michael@0 2882 + const uint8* u_buf,
michael@0 2883 + const uint8* v_buf,
michael@0 2884 + uint8* rgb_buf,
michael@0 2885 + int width) {
michael@0 2886 __asm {
michael@0 2887 pushad
michael@0 2888 mov edx, [esp + 32 + 4] // Y
michael@0 2889 mov edi, [esp + 32 + 8] // U
michael@0 2890 mov esi, [esp + 32 + 12] // V
michael@0 2891 mov ebp, [esp + 32 + 16] // rgb
michael@0 2892 mov ecx, [esp + 32 + 20] // width
michael@0 2893 jmp wend
michael@0 2894 @@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_
michael@0 2895 jns wloop1
michael@0 2896 wdone :
michael@0 2897 popad
michael@0 2898 ret
michael@0 2899 }
michael@0 2900 }
michael@0 2901
michael@0 2902 // This version does general purpose scaling by any amount, up or down.
michael@0 2903 -// The only thing it can not do it rotation by 90 or 270.
michael@0 2904 -// For performance the chroma is under sampled, reducing cost of a 3x
michael@0 2905 +// The only thing it cannot do is rotation by 90 or 270.
michael@0 2906 +// For performance the chroma is under-sampled, reducing cost of a 3x
michael@0 2907 // 1080p scale from 8.4 ms to 5.4 ms.
michael@0 2908 __declspec(naked)
michael@0 2909 -void ScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 2910 - const uint8* u_buf,
michael@0 2911 - const uint8* v_buf,
michael@0 2912 - uint8* rgb_buf,
michael@0 2913 - int width,
michael@0 2914 - int source_dx) {
michael@0 2915 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
michael@0 2916 + const uint8* u_buf,
michael@0 2917 + const uint8* v_buf,
michael@0 2918 + uint8* rgb_buf,
michael@0 2919 + int width,
michael@0 2920 + int source_dx) {
michael@0 2921 __asm {
michael@0 2922 pushad
michael@0 2923 mov edx, [esp + 32 + 4] // Y
michael@0 2924 mov edi, [esp + 32 + 8] // U
michael@0 2925 mov esi, [esp + 32 + 12] // V
michael@0 2926 mov ebp, [esp + 32 + 16] // rgb
michael@0 2927 mov ecx, [esp + 32 + 20] // width
michael@0 2928 xor ebx, ebx // x
michael@0 2929 @@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
michael@0 2930
michael@0 2931 scaledone :
michael@0 2932 popad
michael@0 2933 ret
michael@0 2934 }
michael@0 2935 }
michael@0 2936
michael@0 2937 __declspec(naked)
michael@0 2938 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 2939 - const uint8* u_buf,
michael@0 2940 - const uint8* v_buf,
michael@0 2941 - uint8* rgb_buf,
michael@0 2942 - int width,
michael@0 2943 - int source_dx) {
michael@0 2944 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
michael@0 2945 + const uint8* u_buf,
michael@0 2946 + const uint8* v_buf,
michael@0 2947 + uint8* rgb_buf,
michael@0 2948 + int width,
michael@0 2949 + int source_dx) {
michael@0 2950 __asm {
michael@0 2951 pushad
michael@0 2952 mov edx, [esp + 32 + 4] // Y
michael@0 2953 mov edi, [esp + 32 + 8] // U
michael@0 2954 // [esp + 32 + 12] // V
michael@0 2955 mov ebp, [esp + 32 + 16] // rgb
michael@0 2956 mov ecx, [esp + 32 + 20] // width
michael@0 2957 imul ecx, [esp + 32 + 24] // source_dx
michael@0 2958 @@ -438,152 +439,60 @@ lscalelastpixel:
michael@0 2959 paddsw mm1, mm0
michael@0 2960 psraw mm1, 6
michael@0 2961 packuswb mm1, mm1
michael@0 2962 movd [ebp], mm1
michael@0 2963 popad
michael@0 2964 ret
michael@0 2965 };
michael@0 2966 }
michael@0 2967 -#else // USE_MMX
michael@0 2968 -
michael@0 2969 -// C reference code that mimic the YUV assembly.
michael@0 2970 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
michael@0 2971 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
michael@0 2972 - (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
michael@0 2973 -
michael@0 2974 -static inline void YuvPixel(uint8 y,
michael@0 2975 - uint8 u,
michael@0 2976 - uint8 v,
michael@0 2977 - uint8* rgb_buf) {
michael@0 2978 -
michael@0 2979 - int b = kCoefficientsRgbY[256+u][0];
michael@0 2980 - int g = kCoefficientsRgbY[256+u][1];
michael@0 2981 - int r = kCoefficientsRgbY[256+u][2];
michael@0 2982 - int a = kCoefficientsRgbY[256+u][3];
michael@0 2983 -
michael@0 2984 - b = paddsw(b, kCoefficientsRgbY[512+v][0]);
michael@0 2985 - g = paddsw(g, kCoefficientsRgbY[512+v][1]);
michael@0 2986 - r = paddsw(r, kCoefficientsRgbY[512+v][2]);
michael@0 2987 - a = paddsw(a, kCoefficientsRgbY[512+v][3]);
michael@0 2988 -
michael@0 2989 - b = paddsw(b, kCoefficientsRgbY[y][0]);
michael@0 2990 - g = paddsw(g, kCoefficientsRgbY[y][1]);
michael@0 2991 - r = paddsw(r, kCoefficientsRgbY[y][2]);
michael@0 2992 - a = paddsw(a, kCoefficientsRgbY[y][3]);
michael@0 2993 -
michael@0 2994 - b >>= 6;
michael@0 2995 - g >>= 6;
michael@0 2996 - r >>= 6;
michael@0 2997 - a >>= 6;
michael@0 2998 -
michael@0 2999 - *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
michael@0 3000 - (packuswb(g) << 8) |
michael@0 3001 - (packuswb(r) << 16) |
michael@0 3002 - (packuswb(a) << 24);
michael@0 3003 -}
michael@0 3004 -
michael@0 3005 -#if TEST_MMX_YUV
michael@0 3006 -static inline void YuvPixel(uint8 y,
michael@0 3007 - uint8 u,
michael@0 3008 - uint8 v,
michael@0 3009 - uint8* rgb_buf) {
michael@0 3010 -
michael@0 3011 - __asm {
michael@0 3012 - movzx eax, u
michael@0 3013 - movq mm0, [kCoefficientsRgbY+2048 + 8 * eax]
michael@0 3014 - movzx eax, v
michael@0 3015 - paddsw mm0, [kCoefficientsRgbY+4096 + 8 * eax]
michael@0 3016 - movzx eax, y
michael@0 3017 - movq mm1, [kCoefficientsRgbY + 8 * eax]
michael@0 3018 - paddsw mm1, mm0
michael@0 3019 - psraw mm1, 6
michael@0 3020 - packuswb mm1, mm1
michael@0 3021 - mov eax, rgb_buf
michael@0 3022 - movd [eax], mm1
michael@0 3023 - emms
michael@0 3024 - }
michael@0 3025 -}
michael@0 3026 -#endif
michael@0 3027 +#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
michael@0 3028
michael@0 3029 void FastConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 3030 const uint8* u_buf,
michael@0 3031 const uint8* v_buf,
michael@0 3032 uint8* rgb_buf,
michael@0 3033 int width) {
michael@0 3034 - for (int x = 0; x < width; x += 2) {
michael@0 3035 - uint8 u = u_buf[x >> 1];
michael@0 3036 - uint8 v = v_buf[x >> 1];
michael@0 3037 - uint8 y0 = y_buf[x];
michael@0 3038 - YuvPixel(y0, u, v, rgb_buf);
michael@0 3039 - if ((x + 1) < width) {
michael@0 3040 - uint8 y1 = y_buf[x + 1];
michael@0 3041 - YuvPixel(y1, u, v, rgb_buf + 4);
michael@0 3042 - }
michael@0 3043 - rgb_buf += 8; // Advance 2 pixels.
michael@0 3044 - }
michael@0 3045 -}
michael@0 3046 -
michael@0 3047 -// 16.16 fixed point is used. A shift by 16 isolates the integer.
michael@0 3048 -// A shift by 17 is used to further subsample the chrominence channels.
michael@0 3049 -// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
michael@0 3050 -// for 1/65536 pixel accurate interpolation.
michael@0 3051 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
michael@0 3052 + if (mozilla::supports_sse()) {
michael@0 3053 + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
michael@0 3054 + return;
michael@0 3055 + }
michael@0 3056 +#endif
michael@0 3057 +
michael@0 3058 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
michael@0 3059 +}
michael@0 3060 +
michael@0 3061 void ScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 3062 const uint8* u_buf,
michael@0 3063 const uint8* v_buf,
michael@0 3064 uint8* rgb_buf,
michael@0 3065 int width,
michael@0 3066 int source_dx) {
michael@0 3067 - int x = 0;
michael@0 3068 - for (int i = 0; i < width; i += 2) {
michael@0 3069 - int y = y_buf[x >> 16];
michael@0 3070 - int u = u_buf[(x >> 17)];
michael@0 3071 - int v = v_buf[(x >> 17)];
michael@0 3072 - YuvPixel(y, u, v, rgb_buf);
michael@0 3073 - x += source_dx;
michael@0 3074 - if ((i + 1) < width) {
michael@0 3075 - y = y_buf[x >> 16];
michael@0 3076 - YuvPixel(y, u, v, rgb_buf+4);
michael@0 3077 - x += source_dx;
michael@0 3078 - }
michael@0 3079 - rgb_buf += 8;
michael@0 3080 - }
michael@0 3081 -}
michael@0 3082 +
michael@0 3083 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
michael@0 3084 + if (mozilla::supports_sse()) {
michael@0 3085 + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
michael@0 3086 + return;
michael@0 3087 + }
michael@0 3088 +#endif
michael@0 3089 +
michael@0 3090 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
michael@0 3091 +}
michael@0 3092
michael@0 3093 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 3094 const uint8* u_buf,
michael@0 3095 const uint8* v_buf,
michael@0 3096 uint8* rgb_buf,
michael@0 3097 int width,
michael@0 3098 int source_dx) {
michael@0 3099 - int x = 0;
michael@0 3100 - if (source_dx >= 0x20000) {
michael@0 3101 - x = 32768;
michael@0 3102 - }
michael@0 3103 - for (int i = 0; i < width; i += 2) {
michael@0 3104 - int y0 = y_buf[x >> 16];
michael@0 3105 - int y1 = y_buf[(x >> 16) + 1];
michael@0 3106 - int u0 = u_buf[(x >> 17)];
michael@0 3107 - int u1 = u_buf[(x >> 17) + 1];
michael@0 3108 - int v0 = v_buf[(x >> 17)];
michael@0 3109 - int v1 = v_buf[(x >> 17) + 1];
michael@0 3110 - int y_frac = (x & 65535);
michael@0 3111 - int uv_frac = ((x >> 1) & 65535);
michael@0 3112 - int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
michael@0 3113 - int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
michael@0 3114 - int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
michael@0 3115 - YuvPixel(y, u, v, rgb_buf);
michael@0 3116 - x += source_dx;
michael@0 3117 - if ((i + 1) < width) {
michael@0 3118 - y0 = y_buf[x >> 16];
michael@0 3119 - y1 = y_buf[(x >> 16) + 1];
michael@0 3120 - y_frac = (x & 65535);
michael@0 3121 - y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
michael@0 3122 - YuvPixel(y, u, v, rgb_buf+4);
michael@0 3123 - x += source_dx;
michael@0 3124 - }
michael@0 3125 - rgb_buf += 8;
michael@0 3126 - }
michael@0 3127 -}
michael@0 3128 -
michael@0 3129 -#endif // USE_MMX
michael@0 3130 -} // extern "C"
michael@0 3131 -
michael@0 3132 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
michael@0 3133 + if (mozilla::supports_sse()) {
michael@0 3134 + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
michael@0 3135 + source_dx);
michael@0 3136 + return;
michael@0 3137 + }
michael@0 3138 +#endif
michael@0 3139 +
michael@0 3140 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
michael@0 3141 +}
michael@0 3142 +
michael@0 3143 +} // extern "C"

mercurial