Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp |
michael@0 | 2 | --- a/gfx/ycbcr/yuv_convert.cpp |
michael@0 | 3 | +++ b/gfx/ycbcr/yuv_convert.cpp |
michael@0 | 4 | @@ -6,145 +6,102 @@ |
michael@0 | 5 | // http://www.fourcc.org/yuv.php |
michael@0 | 6 | // The actual conversion is best described here |
michael@0 | 7 | // http://en.wikipedia.org/wiki/YUV |
michael@0 | 8 | // An article on optimizing YUV conversion using tables instead of multiplies |
michael@0 | 9 | // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf |
michael@0 | 10 | // |
michael@0 | 11 | // YV12 is a full plane of Y and a half height, half width chroma planes |
michael@0 | 12 | // YV16 is a full plane of Y and a full height, half width chroma planes |
michael@0 | 13 | +// YV24 is a full plane of Y and a full height, full width chroma planes |
michael@0 | 14 | // |
michael@0 | 15 | // ARGB pixel format is output, which on little endian is stored as BGRA. |
michael@0 | 16 | // The alpha is set to 255, allowing the application to use RGBA or RGB32. |
michael@0 | 17 | |
michael@0 | 18 | -#include "media/base/yuv_convert.h" |
michael@0 | 19 | +#include "yuv_convert.h" |
michael@0 | 20 | |
michael@0 | 21 | // Header for low level row functions. |
michael@0 | 22 | -#include "media/base/yuv_row.h" |
michael@0 | 23 | - |
michael@0 | 24 | -#if USE_MMX |
michael@0 | 25 | -#if defined(_MSC_VER) |
michael@0 | 26 | -#include <intrin.h> |
michael@0 | 27 | -#else |
michael@0 | 28 | -#include <mmintrin.h> |
michael@0 | 29 | -#endif |
michael@0 | 30 | -#endif |
michael@0 | 31 | - |
michael@0 | 32 | -#if USE_SSE2 |
michael@0 | 33 | -#include <emmintrin.h> |
michael@0 | 34 | -#endif |
michael@0 | 35 | - |
michael@0 | 36 | -namespace media { |
michael@0 | 37 | - |
michael@0 | 38 | +#include "yuv_row.h" |
michael@0 | 39 | +#include "mozilla/SSE.h" |
michael@0 | 40 | + |
michael@0 | 41 | +namespace mozilla { |
michael@0 | 42 | + |
michael@0 | 43 | +namespace gfx { |
michael@0 | 44 | + |
michael@0 | 45 | // 16.16 fixed point arithmetic |
michael@0 | 46 | const int kFractionBits = 16; |
michael@0 | 47 | const int kFractionMax = 1 << kFractionBits; |
michael@0 | 48 | const int kFractionMask = ((1 << kFractionBits) - 1); |
michael@0 | 49 | |
michael@0 | 50 | // Convert a frame of YUV to 32 bit ARGB. |
michael@0 | 51 | -void ConvertYUVToRGB32(const uint8* y_buf, |
michael@0 | 52 | - const uint8* u_buf, |
michael@0 | 53 | - const uint8* v_buf, |
michael@0 | 54 | - uint8* rgb_buf, |
michael@0 | 55 | - int width, |
michael@0 | 56 | - int height, |
michael@0 | 57 | - int y_pitch, |
michael@0 | 58 | - int uv_pitch, |
michael@0 | 59 | - int rgb_pitch, |
michael@0 | 60 | - YUVType yuv_type) { |
michael@0 | 61 | - unsigned int y_shift = yuv_type; |
michael@0 | 62 | - for (int y = 0; y < height; ++y) { |
michael@0 | 63 | - uint8* rgb_row = rgb_buf + y * rgb_pitch; |
michael@0 | 64 | - const uint8* y_ptr = y_buf + y * y_pitch; |
michael@0 | 65 | - const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch; |
michael@0 | 66 | - const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch; |
michael@0 | 67 | - |
michael@0 | 68 | - FastConvertYUVToRGB32Row(y_ptr, |
michael@0 | 69 | - u_ptr, |
michael@0 | 70 | - v_ptr, |
michael@0 | 71 | - rgb_row, |
michael@0 | 72 | - width); |
michael@0 | 73 | - } |
michael@0 | 74 | +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf, |
michael@0 | 75 | + const uint8* u_buf, |
michael@0 | 76 | + const uint8* v_buf, |
michael@0 | 77 | + uint8* rgb_buf, |
michael@0 | 78 | + int pic_x, |
michael@0 | 79 | + int pic_y, |
michael@0 | 80 | + int pic_width, |
michael@0 | 81 | + int pic_height, |
michael@0 | 82 | + int y_pitch, |
michael@0 | 83 | + int uv_pitch, |
michael@0 | 84 | + int rgb_pitch, |
michael@0 | 85 | + YUVType yuv_type) { |
michael@0 | 86 | + unsigned int y_shift = yuv_type == YV12 ? 1 : 0; |
michael@0 | 87 | + unsigned int x_shift = yuv_type == YV24 ? 0 : 1; |
michael@0 | 88 | + // Test for SSE because the optimized code uses movntq, which is not part of MMX. |
michael@0 | 89 | + bool has_sse = supports_mmx() && supports_sse(); |
michael@0 | 90 | + // There is no optimized YV24 SSE routine so we check for this and |
michael@0 | 91 | + // fall back to the C code. |
michael@0 | 92 | + has_sse &= yuv_type != YV24; |
michael@0 | 93 | + bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0; |
michael@0 | 94 | + int x_width = odd_pic_x ? pic_width - 1 : pic_width; |
michael@0 | 95 | + |
michael@0 | 96 | + for (int y = pic_y; y < pic_height + pic_y; ++y) { |
michael@0 | 97 | + uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch; |
michael@0 | 98 | + const uint8* y_ptr = y_buf + y * y_pitch + pic_x; |
michael@0 | 99 | + const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift); |
michael@0 | 100 | + const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift); |
michael@0 | 101 | + |
michael@0 | 102 | + if (odd_pic_x) { |
michael@0 | 103 | + // Handle the single odd pixel manually and use the |
michael@0 | 104 | + // fast routines for the remaining. |
michael@0 | 105 | + FastConvertYUVToRGB32Row_C(y_ptr++, |
michael@0 | 106 | + u_ptr++, |
michael@0 | 107 | + v_ptr++, |
michael@0 | 108 | + rgb_row, |
michael@0 | 109 | + 1, |
michael@0 | 110 | + x_shift); |
michael@0 | 111 | + rgb_row += 4; |
michael@0 | 112 | + } |
michael@0 | 113 | + |
michael@0 | 114 | + if (has_sse) { |
michael@0 | 115 | + FastConvertYUVToRGB32Row(y_ptr, |
michael@0 | 116 | + u_ptr, |
michael@0 | 117 | + v_ptr, |
michael@0 | 118 | + rgb_row, |
michael@0 | 119 | + x_width); |
michael@0 | 120 | + } |
michael@0 | 121 | + else { |
michael@0 | 122 | + FastConvertYUVToRGB32Row_C(y_ptr, |
michael@0 | 123 | + u_ptr, |
michael@0 | 124 | + v_ptr, |
michael@0 | 125 | + rgb_row, |
michael@0 | 126 | + x_width, |
michael@0 | 127 | + x_shift); |
michael@0 | 128 | + } |
michael@0 | 129 | + } |
michael@0 | 130 | |
michael@0 | 131 | // MMX used for FastConvertYUVToRGB32Row requires emms instruction. |
michael@0 | 132 | - EMMS(); |
michael@0 | 133 | -} |
michael@0 | 134 | - |
michael@0 | 135 | -#if USE_SSE2 |
michael@0 | 136 | -// FilterRows combines two rows of the image using linear interpolation. |
michael@0 | 137 | -// SSE2 version does 16 pixels at a time |
michael@0 | 138 | - |
michael@0 | 139 | -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, |
michael@0 | 140 | - int source_width, int source_y_fraction) { |
michael@0 | 141 | - __m128i zero = _mm_setzero_si128(); |
michael@0 | 142 | - __m128i y1_fraction = _mm_set1_epi16(source_y_fraction); |
michael@0 | 143 | - __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction); |
michael@0 | 144 | - |
michael@0 | 145 | - const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr); |
michael@0 | 146 | - const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr); |
michael@0 | 147 | - __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf); |
michael@0 | 148 | - __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width); |
michael@0 | 149 | - |
michael@0 | 150 | - do { |
michael@0 | 151 | - __m128i y0 = _mm_loadu_si128(y0_ptr128); |
michael@0 | 152 | - __m128i y1 = _mm_loadu_si128(y1_ptr128); |
michael@0 | 153 | - __m128i y2 = _mm_unpackhi_epi8(y0, zero); |
michael@0 | 154 | - __m128i y3 = _mm_unpackhi_epi8(y1, zero); |
michael@0 | 155 | - y0 = _mm_unpacklo_epi8(y0, zero); |
michael@0 | 156 | - y1 = _mm_unpacklo_epi8(y1, zero); |
michael@0 | 157 | - y0 = _mm_mullo_epi16(y0, y0_fraction); |
michael@0 | 158 | - y1 = _mm_mullo_epi16(y1, y1_fraction); |
michael@0 | 159 | - y2 = _mm_mullo_epi16(y2, y0_fraction); |
michael@0 | 160 | - y3 = _mm_mullo_epi16(y3, y1_fraction); |
michael@0 | 161 | - y0 = _mm_add_epi16(y0, y1); |
michael@0 | 162 | - y2 = _mm_add_epi16(y2, y3); |
michael@0 | 163 | - y0 = _mm_srli_epi16(y0, 8); |
michael@0 | 164 | - y2 = _mm_srli_epi16(y2, 8); |
michael@0 | 165 | - y0 = _mm_packus_epi16(y0, y2); |
michael@0 | 166 | - *dest128++ = y0; |
michael@0 | 167 | - ++y0_ptr128; |
michael@0 | 168 | - ++y1_ptr128; |
michael@0 | 169 | - } while (dest128 < end128); |
michael@0 | 170 | -} |
michael@0 | 171 | -#elif USE_MMX |
michael@0 | 172 | -// MMX version does 8 pixels at a time |
michael@0 | 173 | -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, |
michael@0 | 174 | - int source_width, int source_y_fraction) { |
michael@0 | 175 | - __m64 zero = _mm_setzero_si64(); |
michael@0 | 176 | - __m64 y1_fraction = _mm_set1_pi16(source_y_fraction); |
michael@0 | 177 | - __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction); |
michael@0 | 178 | - |
michael@0 | 179 | - const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr); |
michael@0 | 180 | - const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr); |
michael@0 | 181 | - __m64* dest64 = reinterpret_cast<__m64*>(ybuf); |
michael@0 | 182 | - __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width); |
michael@0 | 183 | - |
michael@0 | 184 | - do { |
michael@0 | 185 | - __m64 y0 = *y0_ptr64++; |
michael@0 | 186 | - __m64 y1 = *y1_ptr64++; |
michael@0 | 187 | - __m64 y2 = _mm_unpackhi_pi8(y0, zero); |
michael@0 | 188 | - __m64 y3 = _mm_unpackhi_pi8(y1, zero); |
michael@0 | 189 | - y0 = _mm_unpacklo_pi8(y0, zero); |
michael@0 | 190 | - y1 = _mm_unpacklo_pi8(y1, zero); |
michael@0 | 191 | - y0 = _mm_mullo_pi16(y0, y0_fraction); |
michael@0 | 192 | - y1 = _mm_mullo_pi16(y1, y1_fraction); |
michael@0 | 193 | - y2 = _mm_mullo_pi16(y2, y0_fraction); |
michael@0 | 194 | - y3 = _mm_mullo_pi16(y3, y1_fraction); |
michael@0 | 195 | - y0 = _mm_add_pi16(y0, y1); |
michael@0 | 196 | - y2 = _mm_add_pi16(y2, y3); |
michael@0 | 197 | - y0 = _mm_srli_pi16(y0, 8); |
michael@0 | 198 | - y2 = _mm_srli_pi16(y2, 8); |
michael@0 | 199 | - y0 = _mm_packs_pu16(y0, y2); |
michael@0 | 200 | - *dest64++ = y0; |
michael@0 | 201 | - } while (dest64 < end64); |
michael@0 | 202 | -} |
michael@0 | 203 | -#else // no MMX or SSE2 |
michael@0 | 204 | + if (has_sse) |
michael@0 | 205 | + EMMS(); |
michael@0 | 206 | +} |
michael@0 | 207 | + |
michael@0 | 208 | // C version does 8 at a time to mimic MMX code |
michael@0 | 209 | -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, |
michael@0 | 210 | - int source_width, int source_y_fraction) { |
michael@0 | 211 | +static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, |
michael@0 | 212 | + int source_width, int source_y_fraction) { |
michael@0 | 213 | int y1_fraction = source_y_fraction; |
michael@0 | 214 | int y0_fraction = 256 - y1_fraction; |
michael@0 | 215 | uint8* end = ybuf + source_width; |
michael@0 | 216 | do { |
michael@0 | 217 | ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8; |
michael@0 | 218 | ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8; |
michael@0 | 219 | ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8; |
michael@0 | 220 | ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8; |
michael@0 | 221 | @@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons |
michael@0 | 222 | ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8; |
michael@0 | 223 | ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8; |
michael@0 | 224 | ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8; |
michael@0 | 225 | y0_ptr += 8; |
michael@0 | 226 | y1_ptr += 8; |
michael@0 | 227 | ybuf += 8; |
michael@0 | 228 | } while (ybuf < end); |
michael@0 | 229 | } |
michael@0 | 230 | -#endif |
michael@0 | 231 | + |
michael@0 | 232 | +#ifdef MOZILLA_MAY_SUPPORT_MMX |
michael@0 | 233 | +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, |
michael@0 | 234 | + int source_width, int source_y_fraction); |
michael@0 | 235 | +#endif |
michael@0 | 236 | + |
michael@0 | 237 | +#ifdef MOZILLA_MAY_SUPPORT_SSE2 |
michael@0 | 238 | +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, |
michael@0 | 239 | + int source_width, int source_y_fraction); |
michael@0 | 240 | +#endif |
michael@0 | 241 | + |
michael@0 | 242 | +static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr, |
michael@0 | 243 | + const uint8* y1_ptr, int source_width, |
michael@0 | 244 | + int source_y_fraction) { |
michael@0 | 245 | +#ifdef MOZILLA_MAY_SUPPORT_SSE2 |
michael@0 | 246 | + if (mozilla::supports_sse2()) { |
michael@0 | 247 | + FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); |
michael@0 | 248 | + return; |
michael@0 | 249 | + } |
michael@0 | 250 | +#endif |
michael@0 | 251 | + |
michael@0 | 252 | +#ifdef MOZILLA_MAY_SUPPORT_MMX |
michael@0 | 253 | + if (mozilla::supports_mmx()) { |
michael@0 | 254 | + FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); |
michael@0 | 255 | + return; |
michael@0 | 256 | + } |
michael@0 | 257 | +#endif |
michael@0 | 258 | + |
michael@0 | 259 | + FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); |
michael@0 | 260 | +} |
michael@0 | 261 | |
michael@0 | 262 | |
michael@0 | 263 | // Scale a frame of YUV to 32 bit ARGB. |
michael@0 | 264 | -void ScaleYUVToRGB32(const uint8* y_buf, |
michael@0 | 265 | - const uint8* u_buf, |
michael@0 | 266 | - const uint8* v_buf, |
michael@0 | 267 | - uint8* rgb_buf, |
michael@0 | 268 | - int source_width, |
michael@0 | 269 | - int source_height, |
michael@0 | 270 | - int width, |
michael@0 | 271 | - int height, |
michael@0 | 272 | - int y_pitch, |
michael@0 | 273 | - int uv_pitch, |
michael@0 | 274 | - int rgb_pitch, |
michael@0 | 275 | - YUVType yuv_type, |
michael@0 | 276 | - Rotate view_rotate, |
michael@0 | 277 | - ScaleFilter filter) { |
michael@0 | 278 | +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf, |
michael@0 | 279 | + const uint8* u_buf, |
michael@0 | 280 | + const uint8* v_buf, |
michael@0 | 281 | + uint8* rgb_buf, |
michael@0 | 282 | + int source_width, |
michael@0 | 283 | + int source_height, |
michael@0 | 284 | + int width, |
michael@0 | 285 | + int height, |
michael@0 | 286 | + int y_pitch, |
michael@0 | 287 | + int uv_pitch, |
michael@0 | 288 | + int rgb_pitch, |
michael@0 | 289 | + YUVType yuv_type, |
michael@0 | 290 | + Rotate view_rotate, |
michael@0 | 291 | + ScaleFilter filter) { |
michael@0 | 292 | + bool has_mmx = supports_mmx(); |
michael@0 | 293 | + |
michael@0 | 294 | // 4096 allows 3 buffers to fit in 12k. |
michael@0 | 295 | // Helps performance on CPU with 16K L1 cache. |
michael@0 | 296 | // Large enough for 3830x2160 and 30" displays which are 2560x1600. |
michael@0 | 297 | const int kFilterBufferSize = 4096; |
michael@0 | 298 | // Disable filtering if the screen is too big (to avoid buffer overflows). |
michael@0 | 299 | // This should never happen to regular users: they don't have monitors |
michael@0 | 300 | // wider than 4096 pixels. |
michael@0 | 301 | // TODO(fbarchard): Allow rotated videos to filter. |
michael@0 | 302 | if (source_width > kFilterBufferSize || view_rotate) |
michael@0 | 303 | filter = FILTER_NONE; |
michael@0 | 304 | |
michael@0 | 305 | - unsigned int y_shift = yuv_type; |
michael@0 | 306 | + unsigned int y_shift = yuv_type == YV12 ? 1 : 0; |
michael@0 | 307 | // Diagram showing origin and direction of source sampling. |
michael@0 | 308 | // ->0 4<- |
michael@0 | 309 | // 7 3 |
michael@0 | 310 | // |
michael@0 | 311 | // 6 5 |
michael@0 | 312 | // ->1 2<- |
michael@0 | 313 | // Rotations that start at right side of image. |
michael@0 | 314 | if ((view_rotate == ROTATE_180) || |
michael@0 | 315 | @@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf, |
michael@0 | 316 | int source_uv_fraction = |
michael@0 | 317 | ((source_y_subpixel >> y_shift) & kFractionMask) >> 8; |
michael@0 | 318 | |
michael@0 | 319 | const uint8* y_ptr = y0_ptr; |
michael@0 | 320 | const uint8* u_ptr = u0_ptr; |
michael@0 | 321 | const uint8* v_ptr = v0_ptr; |
michael@0 | 322 | // Apply vertical filtering if necessary. |
michael@0 | 323 | // TODO(fbarchard): Remove memcpy when not necessary. |
michael@0 | 324 | - if (filter & media::FILTER_BILINEAR_V) { |
michael@0 | 325 | + if (filter & mozilla::gfx::FILTER_BILINEAR_V) { |
michael@0 | 326 | if (yscale_fixed != kFractionMax && |
michael@0 | 327 | source_y_fraction && ((source_y + 1) < source_height)) { |
michael@0 | 328 | FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction); |
michael@0 | 329 | } else { |
michael@0 | 330 | memcpy(ybuf, y0_ptr, source_width); |
michael@0 | 331 | } |
michael@0 | 332 | y_ptr = ybuf; |
michael@0 | 333 | ybuf[source_width] = ybuf[source_width-1]; |
michael@0 | 334 | @@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf, |
michael@0 | 335 | u_ptr = ubuf; |
michael@0 | 336 | v_ptr = vbuf; |
michael@0 | 337 | ubuf[uv_source_width] = ubuf[uv_source_width - 1]; |
michael@0 | 338 | vbuf[uv_source_width] = vbuf[uv_source_width - 1]; |
michael@0 | 339 | } |
michael@0 | 340 | if (source_dx == kFractionMax) { // Not scaled |
michael@0 | 341 | FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, |
michael@0 | 342 | dest_pixel, width); |
michael@0 | 343 | - } else { |
michael@0 | 344 | - if (filter & FILTER_BILINEAR_H) { |
michael@0 | 345 | + } else if (filter & FILTER_BILINEAR_H) { |
michael@0 | 346 | LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, |
michael@0 | 347 | dest_pixel, width, source_dx); |
michael@0 | 348 | } else { |
michael@0 | 349 | // Specialized scalers and rotation. |
michael@0 | 350 | -#if USE_MMX && defined(_MSC_VER) |
michael@0 | 351 | +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86) |
michael@0 | 352 | + if(mozilla::supports_sse()) { |
michael@0 | 353 | if (width == (source_width * 2)) { |
michael@0 | 354 | - DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, |
michael@0 | 355 | - dest_pixel, width); |
michael@0 | 356 | + DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, |
michael@0 | 357 | + dest_pixel, width); |
michael@0 | 358 | } else if ((source_dx & kFractionMask) == 0) { |
michael@0 | 359 | // Scaling by integer scale factor. ie half. |
michael@0 | 360 | - ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, |
michael@0 | 361 | - dest_pixel, width, |
michael@0 | 362 | - source_dx >> kFractionBits); |
michael@0 | 363 | + ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, |
michael@0 | 364 | + dest_pixel, width, |
michael@0 | 365 | + source_dx >> kFractionBits); |
michael@0 | 366 | } else if (source_dx_uv == source_dx) { // Not rotated. |
michael@0 | 367 | ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, |
michael@0 | 368 | dest_pixel, width, source_dx); |
michael@0 | 369 | } else { |
michael@0 | 370 | - RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr, |
michael@0 | 371 | - dest_pixel, width, |
michael@0 | 372 | - source_dx >> kFractionBits, |
michael@0 | 373 | - source_dx_uv >> kFractionBits); |
michael@0 | 374 | + RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr, |
michael@0 | 375 | + dest_pixel, width, |
michael@0 | 376 | + source_dx >> kFractionBits, |
michael@0 | 377 | + source_dx_uv >> kFractionBits); |
michael@0 | 378 | } |
michael@0 | 379 | + } |
michael@0 | 380 | + else { |
michael@0 | 381 | + ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr, |
michael@0 | 382 | + dest_pixel, width, source_dx); |
michael@0 | 383 | + } |
michael@0 | 384 | #else |
michael@0 | 385 | - ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, |
michael@0 | 386 | - dest_pixel, width, source_dx); |
michael@0 | 387 | -#endif |
michael@0 | 388 | - } |
michael@0 | 389 | + ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr, |
michael@0 | 390 | + dest_pixel, width, source_dx); |
michael@0 | 391 | +#endif |
michael@0 | 392 | } |
michael@0 | 393 | } |
michael@0 | 394 | // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms. |
michael@0 | 395 | - EMMS(); |
michael@0 | 396 | -} |
michael@0 | 397 | - |
michael@0 | 398 | -} // namespace media |
michael@0 | 399 | + if (has_mmx) |
michael@0 | 400 | + EMMS(); |
michael@0 | 401 | +} |
michael@0 | 402 | + |
michael@0 | 403 | +} // namespace gfx |
michael@0 | 404 | +} // namespace mozilla |
michael@0 | 405 | diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h |
michael@0 | 406 | --- a/gfx/ycbcr/yuv_convert.h |
michael@0 | 407 | +++ b/gfx/ycbcr/yuv_convert.h |
michael@0 | 408 | @@ -1,72 +1,79 @@ |
michael@0 | 409 | // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
michael@0 | 410 | // Use of this source code is governed by a BSD-style license that can be |
michael@0 | 411 | // found in the LICENSE file. |
michael@0 | 412 | |
michael@0 | 413 | #ifndef MEDIA_BASE_YUV_CONVERT_H_ |
michael@0 | 414 | #define MEDIA_BASE_YUV_CONVERT_H_ |
michael@0 | 415 | |
michael@0 | 416 | -#include "base/basictypes.h" |
michael@0 | 417 | - |
michael@0 | 418 | -namespace media { |
michael@0 | 419 | - |
michael@0 | 420 | +#include "chromium_types.h" |
michael@0 | 421 | +#include "gfxCore.h" |
michael@0 | 422 | + |
michael@0 | 423 | +namespace mozilla { |
michael@0 | 424 | + |
michael@0 | 425 | +namespace gfx { |
michael@0 | 426 | + |
michael@0 | 427 | // Type of YUV surface. |
michael@0 | 428 | // The value of these enums matter as they are used to shift vertical indices. |
michael@0 | 429 | enum YUVType { |
michael@0 | 430 | - YV16 = 0, // YV16 is half width and full height chroma channels. |
michael@0 | 431 | - YV12 = 1, // YV12 is half width and half height chroma channels. |
michael@0 | 432 | + YV12 = 0, // YV12 is half width and half height chroma channels. |
michael@0 | 433 | + YV16 = 1, // YV16 is half width and full height chroma channels. |
michael@0 | 434 | + YV24 = 2 // YV24 is full width and full height chroma channels. |
michael@0 | 435 | }; |
michael@0 | 436 | |
michael@0 | 437 | // Mirror means flip the image horizontally, as in looking in a mirror. |
michael@0 | 438 | // Rotate happens after mirroring. |
michael@0 | 439 | enum Rotate { |
michael@0 | 440 | ROTATE_0, // Rotation off. |
michael@0 | 441 | ROTATE_90, // Rotate clockwise. |
michael@0 | 442 | ROTATE_180, // Rotate upside down. |
michael@0 | 443 | ROTATE_270, // Rotate counter clockwise. |
michael@0 | 444 | MIRROR_ROTATE_0, // Mirror horizontally. |
michael@0 | 445 | MIRROR_ROTATE_90, // Mirror then Rotate clockwise. |
michael@0 | 446 | MIRROR_ROTATE_180, // Mirror vertically. |
michael@0 | 447 | - MIRROR_ROTATE_270, // Transpose. |
michael@0 | 448 | + MIRROR_ROTATE_270 // Transpose. |
michael@0 | 449 | }; |
michael@0 | 450 | |
michael@0 | 451 | // Filter affects how scaling looks. |
michael@0 | 452 | enum ScaleFilter { |
michael@0 | 453 | FILTER_NONE = 0, // No filter (point sampled). |
michael@0 | 454 | FILTER_BILINEAR_H = 1, // Bilinear horizontal filter. |
michael@0 | 455 | FILTER_BILINEAR_V = 2, // Bilinear vertical filter. |
michael@0 | 456 | - FILTER_BILINEAR = 3, // Bilinear filter. |
michael@0 | 457 | + FILTER_BILINEAR = 3 // Bilinear filter. |
michael@0 | 458 | }; |
michael@0 | 459 | |
michael@0 | 460 | // Convert a frame of YUV to 32 bit ARGB. |
michael@0 | 461 | // Pass in YV16/YV12 depending on source format |
michael@0 | 462 | -void ConvertYUVToRGB32(const uint8* yplane, |
michael@0 | 463 | - const uint8* uplane, |
michael@0 | 464 | - const uint8* vplane, |
michael@0 | 465 | - uint8* rgbframe, |
michael@0 | 466 | - int width, |
michael@0 | 467 | - int height, |
michael@0 | 468 | - int ystride, |
michael@0 | 469 | - int uvstride, |
michael@0 | 470 | - int rgbstride, |
michael@0 | 471 | - YUVType yuv_type); |
michael@0 | 472 | +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane, |
michael@0 | 473 | + const uint8* uplane, |
michael@0 | 474 | + const uint8* vplane, |
michael@0 | 475 | + uint8* rgbframe, |
michael@0 | 476 | + int pic_x, |
michael@0 | 477 | + int pic_y, |
michael@0 | 478 | + int pic_width, |
michael@0 | 479 | + int pic_height, |
michael@0 | 480 | + int ystride, |
michael@0 | 481 | + int uvstride, |
michael@0 | 482 | + int rgbstride, |
michael@0 | 483 | + YUVType yuv_type); |
michael@0 | 484 | |
michael@0 | 485 | // Scale a frame of YUV to 32 bit ARGB. |
michael@0 | 486 | // Supports rotation and mirroring. |
michael@0 | 487 | -void ScaleYUVToRGB32(const uint8* yplane, |
michael@0 | 488 | - const uint8* uplane, |
michael@0 | 489 | - const uint8* vplane, |
michael@0 | 490 | - uint8* rgbframe, |
michael@0 | 491 | - int source_width, |
michael@0 | 492 | - int source_height, |
michael@0 | 493 | - int width, |
michael@0 | 494 | - int height, |
michael@0 | 495 | - int ystride, |
michael@0 | 496 | - int uvstride, |
michael@0 | 497 | - int rgbstride, |
michael@0 | 498 | - YUVType yuv_type, |
michael@0 | 499 | - Rotate view_rotate, |
michael@0 | 500 | - ScaleFilter filter); |
michael@0 | 501 | - |
michael@0 | 502 | -} // namespace media |
michael@0 | 503 | - |
michael@0 | 504 | +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane, |
michael@0 | 505 | + const uint8* uplane, |
michael@0 | 506 | + const uint8* vplane, |
michael@0 | 507 | + uint8* rgbframe, |
michael@0 | 508 | + int source_width, |
michael@0 | 509 | + int source_height, |
michael@0 | 510 | + int width, |
michael@0 | 511 | + int height, |
michael@0 | 512 | + int ystride, |
michael@0 | 513 | + int uvstride, |
michael@0 | 514 | + int rgbstride, |
michael@0 | 515 | + YUVType yuv_type, |
michael@0 | 516 | + Rotate view_rotate, |
michael@0 | 517 | + ScaleFilter filter); |
michael@0 | 518 | + |
michael@0 | 519 | +} // namespace gfx |
michael@0 | 520 | +} // namespace mozilla |
michael@0 | 521 | + |
michael@0 | 522 | #endif // MEDIA_BASE_YUV_CONVERT_H_ |
michael@0 | 523 | diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp |
michael@0 | 524 | new file mode 100644 |
michael@0 | 525 | --- /dev/null |
michael@0 | 526 | +++ b/gfx/ycbcr/yuv_convert_mmx.cpp |
michael@0 | 527 | @@ -0,0 +1,45 @@ |
michael@0 | 528 | +// Copyright (c) 2010 The Chromium Authors. All rights reserved. |
michael@0 | 529 | +// Use of this source code is governed by a BSD-style license that can be |
michael@0 | 530 | +// found in the LICENSE file. |
michael@0 | 531 | + |
michael@0 | 532 | +#include <mmintrin.h> |
michael@0 | 533 | +#include "yuv_row.h" |
michael@0 | 534 | + |
michael@0 | 535 | +namespace mozilla { |
michael@0 | 536 | +namespace gfx { |
michael@0 | 537 | + |
michael@0 | 538 | +// FilterRows combines two rows of the image using linear interpolation. |
michael@0 | 539 | +// MMX version does 8 pixels at a time. |
michael@0 | 540 | +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, |
michael@0 | 541 | + int source_width, int source_y_fraction) { |
michael@0 | 542 | + __m64 zero = _mm_setzero_si64(); |
michael@0 | 543 | + __m64 y1_fraction = _mm_set1_pi16(source_y_fraction); |
michael@0 | 544 | + __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction); |
michael@0 | 545 | + |
michael@0 | 546 | + const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr); |
michael@0 | 547 | + const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr); |
michael@0 | 548 | + __m64* dest64 = reinterpret_cast<__m64*>(ybuf); |
michael@0 | 549 | + __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width); |
michael@0 | 550 | + |
michael@0 | 551 | + do { |
michael@0 | 552 | + __m64 y0 = *y0_ptr64++; |
michael@0 | 553 | + __m64 y1 = *y1_ptr64++; |
michael@0 | 554 | + __m64 y2 = _mm_unpackhi_pi8(y0, zero); |
michael@0 | 555 | + __m64 y3 = _mm_unpackhi_pi8(y1, zero); |
michael@0 | 556 | + y0 = _mm_unpacklo_pi8(y0, zero); |
michael@0 | 557 | + y1 = _mm_unpacklo_pi8(y1, zero); |
michael@0 | 558 | + y0 = _mm_mullo_pi16(y0, y0_fraction); |
michael@0 | 559 | + y1 = _mm_mullo_pi16(y1, y1_fraction); |
michael@0 | 560 | + y2 = _mm_mullo_pi16(y2, y0_fraction); |
michael@0 | 561 | + y3 = _mm_mullo_pi16(y3, y1_fraction); |
michael@0 | 562 | + y0 = _mm_add_pi16(y0, y1); |
michael@0 | 563 | + y2 = _mm_add_pi16(y2, y3); |
michael@0 | 564 | + y0 = _mm_srli_pi16(y0, 8); |
michael@0 | 565 | + y2 = _mm_srli_pi16(y2, 8); |
michael@0 | 566 | + y0 = _mm_packs_pu16(y0, y2); |
michael@0 | 567 | + *dest64++ = y0; |
michael@0 | 568 | + } while (dest64 < end64); |
michael@0 | 569 | +} |
michael@0 | 570 | + |
michael@0 | 571 | +} |
michael@0 | 572 | +} |
michael@0 | 573 | diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp |
michael@0 | 574 | new file mode 100644 |
michael@0 | 575 | --- /dev/null |
michael@0 | 576 | +++ b/gfx/ycbcr/yuv_convert_sse2.cpp |
michael@0 | 577 | @@ -0,0 +1,47 @@ |
michael@0 | 578 | +// Copyright (c) 2010 The Chromium Authors. All rights reserved. |
michael@0 | 579 | +// Use of this source code is governed by a BSD-style license that can be |
michael@0 | 580 | +// found in the LICENSE file. |
michael@0 | 581 | + |
michael@0 | 582 | +#include <emmintrin.h> |
michael@0 | 583 | +#include "yuv_row.h" |
michael@0 | 584 | + |
michael@0 | 585 | +namespace mozilla { |
michael@0 | 586 | +namespace gfx { |
michael@0 | 587 | + |
michael@0 | 588 | +// FilterRows combines two rows of the image using linear interpolation. |
michael@0 | 589 | +// SSE2 version does 16 pixels at a time. |
michael@0 | 590 | +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, |
michael@0 | 591 | + int source_width, int source_y_fraction) { |
michael@0 | 592 | + __m128i zero = _mm_setzero_si128(); |
michael@0 | 593 | + __m128i y1_fraction = _mm_set1_epi16(source_y_fraction); |
michael@0 | 594 | + __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction); |
michael@0 | 595 | + |
michael@0 | 596 | + const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr); |
michael@0 | 597 | + const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr); |
michael@0 | 598 | + __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf); |
michael@0 | 599 | + __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width); |
michael@0 | 600 | + |
michael@0 | 601 | + do { |
michael@0 | 602 | + __m128i y0 = _mm_loadu_si128(y0_ptr128); |
michael@0 | 603 | + __m128i y1 = _mm_loadu_si128(y1_ptr128); |
michael@0 | 604 | + __m128i y2 = _mm_unpackhi_epi8(y0, zero); |
michael@0 | 605 | + __m128i y3 = _mm_unpackhi_epi8(y1, zero); |
michael@0 | 606 | + y0 = _mm_unpacklo_epi8(y0, zero); |
michael@0 | 607 | + y1 = _mm_unpacklo_epi8(y1, zero); |
michael@0 | 608 | + y0 = _mm_mullo_epi16(y0, y0_fraction); |
michael@0 | 609 | + y1 = _mm_mullo_epi16(y1, y1_fraction); |
michael@0 | 610 | + y2 = _mm_mullo_epi16(y2, y0_fraction); |
michael@0 | 611 | + y3 = _mm_mullo_epi16(y3, y1_fraction); |
michael@0 | 612 | + y0 = _mm_add_epi16(y0, y1); |
michael@0 | 613 | + y2 = _mm_add_epi16(y2, y3); |
michael@0 | 614 | + y0 = _mm_srli_epi16(y0, 8); |
michael@0 | 615 | + y2 = _mm_srli_epi16(y2, 8); |
michael@0 | 616 | + y0 = _mm_packus_epi16(y0, y2); |
michael@0 | 617 | + *dest128++ = y0; |
michael@0 | 618 | + ++y0_ptr128; |
michael@0 | 619 | + ++y1_ptr128; |
michael@0 | 620 | + } while (dest128 < end128); |
michael@0 | 621 | +} |
michael@0 | 622 | + |
michael@0 | 623 | +} |
michael@0 | 624 | +} |
michael@0 | 625 | diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h |
michael@0 | 626 | --- a/gfx/ycbcr/yuv_row.h |
michael@0 | 627 | +++ b/gfx/ycbcr/yuv_row.h |
michael@0 | 628 | @@ -5,109 +5,133 @@ |
michael@0 | 629 | // yuv_row internal functions to handle YUV conversion and scaling to RGB. |
michael@0 | 630 | // These functions are used from both yuv_convert.cc and yuv_scale.cc. |
michael@0 | 631 | |
michael@0 | 632 | // TODO(fbarchard): Write function that can handle rotation and scaling. |
michael@0 | 633 | |
michael@0 | 634 | #ifndef MEDIA_BASE_YUV_ROW_H_ |
michael@0 | 635 | #define MEDIA_BASE_YUV_ROW_H_ |
michael@0 | 636 | |
michael@0 | 637 | -#include "base/basictypes.h" |
michael@0 | 638 | +#include "chromium_types.h" |
michael@0 | 639 | |
michael@0 | 640 | extern "C" { |
michael@0 | 641 | // Can only do 1x. |
michael@0 | 642 | // This is the second fastest of the scalers. |
michael@0 | 643 | void FastConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 644 | const uint8* u_buf, |
michael@0 | 645 | const uint8* v_buf, |
michael@0 | 646 | uint8* rgb_buf, |
michael@0 | 647 | int width); |
michael@0 | 648 | |
michael@0 | 649 | -// Can do 1x, half size or any scale down by an integer amount. |
michael@0 | 650 | -// Step can be negative (mirroring, rotate 180). |
michael@0 | 651 | -// This is the third fastest of the scalers. |
michael@0 | 652 | -void ConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 653 | - const uint8* u_buf, |
michael@0 | 654 | - const uint8* v_buf, |
michael@0 | 655 | - uint8* rgb_buf, |
michael@0 | 656 | - int width, |
michael@0 | 657 | - int step); |
michael@0 | 658 | - |
michael@0 | 659 | -// Rotate is like Convert, but applies different step to Y versus U and V. |
michael@0 | 660 | -// This allows rotation by 90 or 270, by stepping by stride. |
michael@0 | 661 | -// This is the forth fastest of the scalers. |
michael@0 | 662 | -void RotateConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 663 | +void FastConvertYUVToRGB32Row_C(const uint8* y_buf, |
michael@0 | 664 | const uint8* u_buf, |
michael@0 | 665 | const uint8* v_buf, |
michael@0 | 666 | uint8* rgb_buf, |
michael@0 | 667 | int width, |
michael@0 | 668 | - int ystep, |
michael@0 | 669 | - int uvstep); |
michael@0 | 670 | + unsigned int x_shift); |
michael@0 | 671 | + |
michael@0 | 672 | +void FastConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 673 | + const uint8* u_buf, |
michael@0 | 674 | + const uint8* v_buf, |
michael@0 | 675 | + uint8* rgb_buf, |
michael@0 | 676 | + int width); |
michael@0 | 677 | + |
michael@0 | 678 | +// Can do 1x, half size or any scale down by an integer amount. |
michael@0 | 679 | +// Step can be negative (mirroring, rotate 180). |
michael@0 | 680 | +// This is the third fastest of the scalers. |
michael@0 | 681 | +// Only defined on Windows x86-32. |
michael@0 | 682 | +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
michael@0 | 683 | + const uint8* u_buf, |
michael@0 | 684 | + const uint8* v_buf, |
michael@0 | 685 | + uint8* rgb_buf, |
michael@0 | 686 | + int width, |
michael@0 | 687 | + int step); |
michael@0 | 688 | + |
michael@0 | 689 | +// Rotate is like Convert, but applies different step to Y versus U and V. |
michael@0 | 690 | +// This allows rotation by 90 or 270, by stepping by stride. |
michael@0 | 691 | +// This is the fourth fastest of the scalers. |
michael@0 | 692 | +// Only defined on Windows x86-32. |
michael@0 | 693 | +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
michael@0 | 694 | + const uint8* u_buf, |
michael@0 | 695 | + const uint8* v_buf, |
michael@0 | 696 | + uint8* rgb_buf, |
michael@0 | 697 | + int width, |
michael@0 | 698 | + int ystep, |
michael@0 | 699 | + int uvstep); |
michael@0 | 700 | |
michael@0 | 701 | // Doubler does 4 pixels at a time. Each pixel is replicated. |
michael@0 | 702 | // This is the fastest of the scalers. |
michael@0 | 703 | -void DoubleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 704 | - const uint8* u_buf, |
michael@0 | 705 | - const uint8* v_buf, |
michael@0 | 706 | - uint8* rgb_buf, |
michael@0 | 707 | - int width); |
michael@0 | 708 | +// Only defined on Windows x86-32. |
michael@0 | 709 | +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf, |
michael@0 | 710 | + const uint8* u_buf, |
michael@0 | 711 | + const uint8* v_buf, |
michael@0 | 712 | + uint8* rgb_buf, |
michael@0 | 713 | + int width); |
michael@0 | 714 | |
michael@0 | 715 | // Handles arbitrary scaling up or down. |
michael@0 | 716 | // Mirroring is supported, but not 90 or 270 degree rotation. |
michael@0 | 717 | // Chroma is under sampled every 2 pixels for performance. |
michael@0 | 718 | void ScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 719 | const uint8* u_buf, |
michael@0 | 720 | const uint8* v_buf, |
michael@0 | 721 | uint8* rgb_buf, |
michael@0 | 722 | int width, |
michael@0 | 723 | int source_dx); |
michael@0 | 724 | |
michael@0 | 725 | +void ScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 726 | + const uint8* u_buf, |
michael@0 | 727 | + const uint8* v_buf, |
michael@0 | 728 | + uint8* rgb_buf, |
michael@0 | 729 | + int width, |
michael@0 | 730 | + int source_dx); |
michael@0 | 731 | + |
michael@0 | 732 | +void ScaleYUVToRGB32Row_C(const uint8* y_buf, |
michael@0 | 733 | + const uint8* u_buf, |
michael@0 | 734 | + const uint8* v_buf, |
michael@0 | 735 | + uint8* rgb_buf, |
michael@0 | 736 | + int width, |
michael@0 | 737 | + int source_dx); |
michael@0 | 738 | + |
michael@0 | 739 | // Handles arbitrary scaling up or down with bilinear filtering. |
michael@0 | 740 | // Mirroring is supported, but not 90 or 270 degree rotation. |
michael@0 | 741 | // Chroma is under sampled every 2 pixels for performance. |
michael@0 | 742 | // This is the slowest of the scalers. |
michael@0 | 743 | void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 744 | const uint8* u_buf, |
michael@0 | 745 | const uint8* v_buf, |
michael@0 | 746 | uint8* rgb_buf, |
michael@0 | 747 | int width, |
michael@0 | 748 | int source_dx); |
michael@0 | 749 | |
michael@0 | 750 | +void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 751 | + const uint8* u_buf, |
michael@0 | 752 | + const uint8* v_buf, |
michael@0 | 753 | + uint8* rgb_buf, |
michael@0 | 754 | + int width, |
michael@0 | 755 | + int source_dx); |
michael@0 | 756 | + |
michael@0 | 757 | +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf, |
michael@0 | 758 | + const uint8* u_buf, |
michael@0 | 759 | + const uint8* v_buf, |
michael@0 | 760 | + uint8* rgb_buf, |
michael@0 | 761 | + int width, |
michael@0 | 762 | + int source_dx); |
michael@0 | 763 | + |
michael@0 | 764 | + |
michael@0 | 765 | #if defined(_MSC_VER) |
michael@0 | 766 | #define SIMD_ALIGNED(var) __declspec(align(16)) var |
michael@0 | 767 | #else |
michael@0 | 768 | #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) |
michael@0 | 769 | #endif |
michael@0 | 770 | extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]); |
michael@0 | 771 | |
michael@0 | 772 | -// Method to force C version. |
michael@0 | 773 | -//#define USE_MMX 0 |
michael@0 | 774 | -//#define USE_SSE2 0 |
michael@0 | 775 | - |
michael@0 | 776 | -#if !defined(USE_MMX) |
michael@0 | 777 | -// Windows, Mac and Linux/BSD use MMX |
michael@0 | 778 | -#if defined(__MMX__) || defined(_MSC_VER) |
michael@0 | 779 | -#define USE_MMX 1 |
michael@0 | 780 | -#else |
michael@0 | 781 | -#define USE_MMX 0 |
michael@0 | 782 | -#endif |
michael@0 | 783 | -#endif |
michael@0 | 784 | - |
michael@0 | 785 | -#if !defined(USE_SSE2) |
michael@0 | 786 | -#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2 |
michael@0 | 787 | -#define USE_SSE2 1 |
michael@0 | 788 | -#else |
michael@0 | 789 | -#define USE_SSE2 0 |
michael@0 | 790 | -#endif |
michael@0 | 791 | -#endif |
michael@0 | 792 | - |
michael@0 | 793 | // x64 uses MMX2 (SSE) so emms is not required. |
michael@0 | 794 | // Warning C4799: function has no EMMS instruction. |
michael@0 | 795 | // EMMS() is slow and should be called by the calling function once per image. |
michael@0 | 796 | -#if USE_MMX && !defined(ARCH_CPU_X86_64) |
michael@0 | 797 | +#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64) |
michael@0 | 798 | #if defined(_MSC_VER) |
michael@0 | 799 | #define EMMS() __asm emms |
michael@0 | 800 | #pragma warning(disable: 4799) |
michael@0 | 801 | #else |
michael@0 | 802 | #define EMMS() asm("emms") |
michael@0 | 803 | #endif |
michael@0 | 804 | #else |
michael@0 | 805 | #define EMMS() |
michael@0 | 806 | diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp |
michael@0 | 807 | --- a/gfx/ycbcr/yuv_row_c.cpp |
michael@0 | 808 | +++ b/gfx/ycbcr/yuv_row_c.cpp |
michael@0 | 809 | @@ -1,812 +1,18 @@ |
michael@0 | 810 | // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
michael@0 | 811 | // Use of this source code is governed by a BSD-style license that can be |
michael@0 | 812 | // found in the LICENSE file. |
michael@0 | 813 | |
michael@0 | 814 | -#include "media/base/yuv_row.h" |
michael@0 | 815 | - |
michael@0 | 816 | -#ifdef _DEBUG |
michael@0 | 817 | -#include "base/logging.h" |
michael@0 | 818 | -#else |
michael@0 | 819 | +#include "yuv_row.h" |
michael@0 | 820 | + |
michael@0 | 821 | #define DCHECK(a) |
michael@0 | 822 | -#endif |
michael@0 | 823 | |
michael@0 | 824 | extern "C" { |
michael@0 | 825 | |
michael@0 | 826 | -#if USE_SSE2 && defined(ARCH_CPU_X86_64) |
michael@0 | 827 | - |
michael@0 | 828 | -// AMD64 ABI uses register paremters. |
michael@0 | 829 | -void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi |
michael@0 | 830 | - const uint8* u_buf, // rsi |
michael@0 | 831 | - const uint8* v_buf, // rdx |
michael@0 | 832 | - uint8* rgb_buf, // rcx |
michael@0 | 833 | - int width) { // r8 |
michael@0 | 834 | - asm( |
michael@0 | 835 | - "jmp convertend\n" |
michael@0 | 836 | -"convertloop:" |
michael@0 | 837 | - "movzb (%1),%%r10\n" |
michael@0 | 838 | - "add $0x1,%1\n" |
michael@0 | 839 | - "movzb (%2),%%r11\n" |
michael@0 | 840 | - "add $0x1,%2\n" |
michael@0 | 841 | - "movq 2048(%5,%%r10,8),%%xmm0\n" |
michael@0 | 842 | - "movzb (%0),%%r10\n" |
michael@0 | 843 | - "movq 4096(%5,%%r11,8),%%xmm1\n" |
michael@0 | 844 | - "movzb 0x1(%0),%%r11\n" |
michael@0 | 845 | - "paddsw %%xmm1,%%xmm0\n" |
michael@0 | 846 | - "movq (%5,%%r10,8),%%xmm2\n" |
michael@0 | 847 | - "add $0x2,%0\n" |
michael@0 | 848 | - "movq (%5,%%r11,8),%%xmm3\n" |
michael@0 | 849 | - "paddsw %%xmm0,%%xmm2\n" |
michael@0 | 850 | - "paddsw %%xmm0,%%xmm3\n" |
michael@0 | 851 | - "shufps $0x44,%%xmm3,%%xmm2\n" |
michael@0 | 852 | - "psraw $0x6,%%xmm2\n" |
michael@0 | 853 | - "packuswb %%xmm2,%%xmm2\n" |
michael@0 | 854 | - "movq %%xmm2,0x0(%3)\n" |
michael@0 | 855 | - "add $0x8,%3\n" |
michael@0 | 856 | -"convertend:" |
michael@0 | 857 | - "sub $0x2,%4\n" |
michael@0 | 858 | - "jns convertloop\n" |
michael@0 | 859 | - |
michael@0 | 860 | -"convertnext:" |
michael@0 | 861 | - "add $0x1,%4\n" |
michael@0 | 862 | - "js convertdone\n" |
michael@0 | 863 | - |
michael@0 | 864 | - "movzb (%1),%%r10\n" |
michael@0 | 865 | - "movq 2048(%5,%%r10,8),%%xmm0\n" |
michael@0 | 866 | - "movzb (%2),%%r10\n" |
michael@0 | 867 | - "movq 4096(%5,%%r10,8),%%xmm1\n" |
michael@0 | 868 | - "paddsw %%xmm1,%%xmm0\n" |
michael@0 | 869 | - "movzb (%0),%%r10\n" |
michael@0 | 870 | - "movq (%5,%%r10,8),%%xmm1\n" |
michael@0 | 871 | - "paddsw %%xmm0,%%xmm1\n" |
michael@0 | 872 | - "psraw $0x6,%%xmm1\n" |
michael@0 | 873 | - "packuswb %%xmm1,%%xmm1\n" |
michael@0 | 874 | - "movd %%xmm1,0x0(%3)\n" |
michael@0 | 875 | -"convertdone:" |
michael@0 | 876 | - : |
michael@0 | 877 | - : "r"(y_buf), // %0 |
michael@0 | 878 | - "r"(u_buf), // %1 |
michael@0 | 879 | - "r"(v_buf), // %2 |
michael@0 | 880 | - "r"(rgb_buf), // %3 |
michael@0 | 881 | - "r"(width), // %4 |
michael@0 | 882 | - "r" (kCoefficientsRgbY) // %5 |
michael@0 | 883 | - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" |
michael@0 | 884 | -); |
michael@0 | 885 | -} |
michael@0 | 886 | - |
michael@0 | 887 | -void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi |
michael@0 | 888 | - const uint8* u_buf, // rsi |
michael@0 | 889 | - const uint8* v_buf, // rdx |
michael@0 | 890 | - uint8* rgb_buf, // rcx |
michael@0 | 891 | - int width, // r8 |
michael@0 | 892 | - int source_dx) { // r9 |
michael@0 | 893 | - asm( |
michael@0 | 894 | - "xor %%r11,%%r11\n" |
michael@0 | 895 | - "sub $0x2,%4\n" |
michael@0 | 896 | - "js scalenext\n" |
michael@0 | 897 | - |
michael@0 | 898 | -"scaleloop:" |
michael@0 | 899 | - "mov %%r11,%%r10\n" |
michael@0 | 900 | - "sar $0x11,%%r10\n" |
michael@0 | 901 | - "movzb (%1,%%r10,1),%%rax\n" |
michael@0 | 902 | - "movq 2048(%5,%%rax,8),%%xmm0\n" |
michael@0 | 903 | - "movzb (%2,%%r10,1),%%rax\n" |
michael@0 | 904 | - "movq 4096(%5,%%rax,8),%%xmm1\n" |
michael@0 | 905 | - "lea (%%r11,%6),%%r10\n" |
michael@0 | 906 | - "sar $0x10,%%r11\n" |
michael@0 | 907 | - "movzb (%0,%%r11,1),%%rax\n" |
michael@0 | 908 | - "paddsw %%xmm1,%%xmm0\n" |
michael@0 | 909 | - "movq (%5,%%rax,8),%%xmm1\n" |
michael@0 | 910 | - "lea (%%r10,%6),%%r11\n" |
michael@0 | 911 | - "sar $0x10,%%r10\n" |
michael@0 | 912 | - "movzb (%0,%%r10,1),%%rax\n" |
michael@0 | 913 | - "movq (%5,%%rax,8),%%xmm2\n" |
michael@0 | 914 | - "paddsw %%xmm0,%%xmm1\n" |
michael@0 | 915 | - "paddsw %%xmm0,%%xmm2\n" |
michael@0 | 916 | - "shufps $0x44,%%xmm2,%%xmm1\n" |
michael@0 | 917 | - "psraw $0x6,%%xmm1\n" |
michael@0 | 918 | - "packuswb %%xmm1,%%xmm1\n" |
michael@0 | 919 | - "movq %%xmm1,0x0(%3)\n" |
michael@0 | 920 | - "add $0x8,%3\n" |
michael@0 | 921 | - "sub $0x2,%4\n" |
michael@0 | 922 | - "jns scaleloop\n" |
michael@0 | 923 | - |
michael@0 | 924 | -"scalenext:" |
michael@0 | 925 | - "add $0x1,%4\n" |
michael@0 | 926 | - "js scaledone\n" |
michael@0 | 927 | - |
michael@0 | 928 | - "mov %%r11,%%r10\n" |
michael@0 | 929 | - "sar $0x11,%%r10\n" |
michael@0 | 930 | - "movzb (%1,%%r10,1),%%rax\n" |
michael@0 | 931 | - "movq 2048(%5,%%rax,8),%%xmm0\n" |
michael@0 | 932 | - "movzb (%2,%%r10,1),%%rax\n" |
michael@0 | 933 | - "movq 4096(%5,%%rax,8),%%xmm1\n" |
michael@0 | 934 | - "paddsw %%xmm1,%%xmm0\n" |
michael@0 | 935 | - "sar $0x10,%%r11\n" |
michael@0 | 936 | - "movzb (%0,%%r11,1),%%rax\n" |
michael@0 | 937 | - "movq (%5,%%rax,8),%%xmm1\n" |
michael@0 | 938 | - "paddsw %%xmm0,%%xmm1\n" |
michael@0 | 939 | - "psraw $0x6,%%xmm1\n" |
michael@0 | 940 | - "packuswb %%xmm1,%%xmm1\n" |
michael@0 | 941 | - "movd %%xmm1,0x0(%3)\n" |
michael@0 | 942 | - |
michael@0 | 943 | -"scaledone:" |
michael@0 | 944 | - : |
michael@0 | 945 | - : "r"(y_buf), // %0 |
michael@0 | 946 | - "r"(u_buf), // %1 |
michael@0 | 947 | - "r"(v_buf), // %2 |
michael@0 | 948 | - "r"(rgb_buf), // %3 |
michael@0 | 949 | - "r"(width), // %4 |
michael@0 | 950 | - "r" (kCoefficientsRgbY), // %5 |
michael@0 | 951 | - "r"(static_cast<long>(source_dx)) // %6 |
michael@0 | 952 | - : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2" |
michael@0 | 953 | -); |
michael@0 | 954 | -} |
michael@0 | 955 | - |
michael@0 | 956 | -void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 957 | - const uint8* u_buf, |
michael@0 | 958 | - const uint8* v_buf, |
michael@0 | 959 | - uint8* rgb_buf, |
michael@0 | 960 | - int width, |
michael@0 | 961 | - int source_dx) { |
michael@0 | 962 | - asm( |
michael@0 | 963 | - "xor %%r11,%%r11\n" // x = 0 |
michael@0 | 964 | - "sub $0x2,%4\n" |
michael@0 | 965 | - "js .lscalenext\n" |
michael@0 | 966 | - "cmp $0x20000,%6\n" // if source_dx >= 2.0 |
michael@0 | 967 | - "jl .lscalehalf\n" |
michael@0 | 968 | - "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less |
michael@0 | 969 | -".lscalehalf:" |
michael@0 | 970 | - |
michael@0 | 971 | -".lscaleloop:" |
michael@0 | 972 | - "mov %%r11,%%r10\n" |
michael@0 | 973 | - "sar $0x11,%%r10\n" |
michael@0 | 974 | - |
michael@0 | 975 | - "movzb (%1, %%r10, 1), %%r13 \n" |
michael@0 | 976 | - "movzb 1(%1, %%r10, 1), %%r14 \n" |
michael@0 | 977 | - "mov %%r11, %%rax \n" |
michael@0 | 978 | - "and $0x1fffe, %%rax \n" |
michael@0 | 979 | - "imul %%rax, %%r14 \n" |
michael@0 | 980 | - "xor $0x1fffe, %%rax \n" |
michael@0 | 981 | - "imul %%rax, %%r13 \n" |
michael@0 | 982 | - "add %%r14, %%r13 \n" |
michael@0 | 983 | - "shr $17, %%r13 \n" |
michael@0 | 984 | - "movq 2048(%5,%%r13,8), %%xmm0\n" |
michael@0 | 985 | - |
michael@0 | 986 | - "movzb (%2, %%r10, 1), %%r13 \n" |
michael@0 | 987 | - "movzb 1(%2, %%r10, 1), %%r14 \n" |
michael@0 | 988 | - "mov %%r11, %%rax \n" |
michael@0 | 989 | - "and $0x1fffe, %%rax \n" |
michael@0 | 990 | - "imul %%rax, %%r14 \n" |
michael@0 | 991 | - "xor $0x1fffe, %%rax \n" |
michael@0 | 992 | - "imul %%rax, %%r13 \n" |
michael@0 | 993 | - "add %%r14, %%r13 \n" |
michael@0 | 994 | - "shr $17, %%r13 \n" |
michael@0 | 995 | - "movq 4096(%5,%%r13,8), %%xmm1\n" |
michael@0 | 996 | - |
michael@0 | 997 | - "mov %%r11, %%rax \n" |
michael@0 | 998 | - "lea (%%r11,%6),%%r10\n" |
michael@0 | 999 | - "sar $0x10,%%r11\n" |
michael@0 | 1000 | - "paddsw %%xmm1,%%xmm0\n" |
michael@0 | 1001 | - |
michael@0 | 1002 | - "movzb (%0, %%r11, 1), %%r13 \n" |
michael@0 | 1003 | - "movzb 1(%0, %%r11, 1), %%r14 \n" |
michael@0 | 1004 | - "and $0xffff, %%rax \n" |
michael@0 | 1005 | - "imul %%rax, %%r14 \n" |
michael@0 | 1006 | - "xor $0xffff, %%rax \n" |
michael@0 | 1007 | - "imul %%rax, %%r13 \n" |
michael@0 | 1008 | - "add %%r14, %%r13 \n" |
michael@0 | 1009 | - "shr $16, %%r13 \n" |
michael@0 | 1010 | - "movq (%5,%%r13,8),%%xmm1\n" |
michael@0 | 1011 | - |
michael@0 | 1012 | - "mov %%r10, %%rax \n" |
michael@0 | 1013 | - "lea (%%r10,%6),%%r11\n" |
michael@0 | 1014 | - "sar $0x10,%%r10\n" |
michael@0 | 1015 | - |
michael@0 | 1016 | - "movzb (%0,%%r10,1), %%r13 \n" |
michael@0 | 1017 | - "movzb 1(%0,%%r10,1), %%r14 \n" |
michael@0 | 1018 | - "and $0xffff, %%rax \n" |
michael@0 | 1019 | - "imul %%rax, %%r14 \n" |
michael@0 | 1020 | - "xor $0xffff, %%rax \n" |
michael@0 | 1021 | - "imul %%rax, %%r13 \n" |
michael@0 | 1022 | - "add %%r14, %%r13 \n" |
michael@0 | 1023 | - "shr $16, %%r13 \n" |
michael@0 | 1024 | - "movq (%5,%%r13,8),%%xmm2\n" |
michael@0 | 1025 | - |
michael@0 | 1026 | - "paddsw %%xmm0,%%xmm1\n" |
michael@0 | 1027 | - "paddsw %%xmm0,%%xmm2\n" |
michael@0 | 1028 | - "shufps $0x44,%%xmm2,%%xmm1\n" |
michael@0 | 1029 | - "psraw $0x6,%%xmm1\n" |
michael@0 | 1030 | - "packuswb %%xmm1,%%xmm1\n" |
michael@0 | 1031 | - "movq %%xmm1,0x0(%3)\n" |
michael@0 | 1032 | - "add $0x8,%3\n" |
michael@0 | 1033 | - "sub $0x2,%4\n" |
michael@0 | 1034 | - "jns .lscaleloop\n" |
michael@0 | 1035 | - |
michael@0 | 1036 | -".lscalenext:" |
michael@0 | 1037 | - "add $0x1,%4\n" |
michael@0 | 1038 | - "js .lscaledone\n" |
michael@0 | 1039 | - |
michael@0 | 1040 | - "mov %%r11,%%r10\n" |
michael@0 | 1041 | - "sar $0x11,%%r10\n" |
michael@0 | 1042 | - |
michael@0 | 1043 | - "movzb (%1,%%r10,1), %%r13 \n" |
michael@0 | 1044 | - "movq 2048(%5,%%r13,8),%%xmm0\n" |
michael@0 | 1045 | - |
michael@0 | 1046 | - "movzb (%2,%%r10,1), %%r13 \n" |
michael@0 | 1047 | - "movq 4096(%5,%%r13,8),%%xmm1\n" |
michael@0 | 1048 | - |
michael@0 | 1049 | - "paddsw %%xmm1,%%xmm0\n" |
michael@0 | 1050 | - "sar $0x10,%%r11\n" |
michael@0 | 1051 | - |
michael@0 | 1052 | - "movzb (%0,%%r11,1), %%r13 \n" |
michael@0 | 1053 | - "movq (%5,%%r13,8),%%xmm1\n" |
michael@0 | 1054 | - |
michael@0 | 1055 | - "paddsw %%xmm0,%%xmm1\n" |
michael@0 | 1056 | - "psraw $0x6,%%xmm1\n" |
michael@0 | 1057 | - "packuswb %%xmm1,%%xmm1\n" |
michael@0 | 1058 | - "movd %%xmm1,0x0(%3)\n" |
michael@0 | 1059 | - |
michael@0 | 1060 | -".lscaledone:" |
michael@0 | 1061 | - : |
michael@0 | 1062 | - : "r"(y_buf), // %0 |
michael@0 | 1063 | - "r"(u_buf), // %1 |
michael@0 | 1064 | - "r"(v_buf), // %2 |
michael@0 | 1065 | - "r"(rgb_buf), // %3 |
michael@0 | 1066 | - "r"(width), // %4 |
michael@0 | 1067 | - "r" (kCoefficientsRgbY), // %5 |
michael@0 | 1068 | - "r"(static_cast<long>(source_dx)) // %6 |
michael@0 | 1069 | - : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" |
michael@0 | 1070 | -); |
michael@0 | 1071 | -} |
michael@0 | 1072 | - |
michael@0 | 1073 | -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__) |
michael@0 | 1074 | - |
michael@0 | 1075 | -// PIC version is slower because less registers are available, so |
michael@0 | 1076 | -// non-PIC is used on platforms where it is possible. |
michael@0 | 1077 | - |
michael@0 | 1078 | -void FastConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 1079 | - const uint8* u_buf, |
michael@0 | 1080 | - const uint8* v_buf, |
michael@0 | 1081 | - uint8* rgb_buf, |
michael@0 | 1082 | - int width); |
michael@0 | 1083 | - asm( |
michael@0 | 1084 | - ".text\n" |
michael@0 | 1085 | - ".global FastConvertYUVToRGB32Row\n" |
michael@0 | 1086 | -"FastConvertYUVToRGB32Row:\n" |
michael@0 | 1087 | - "pusha\n" |
michael@0 | 1088 | - "mov 0x24(%esp),%edx\n" |
michael@0 | 1089 | - "mov 0x28(%esp),%edi\n" |
michael@0 | 1090 | - "mov 0x2c(%esp),%esi\n" |
michael@0 | 1091 | - "mov 0x30(%esp),%ebp\n" |
michael@0 | 1092 | - "mov 0x34(%esp),%ecx\n" |
michael@0 | 1093 | - "jmp convertend\n" |
michael@0 | 1094 | - |
michael@0 | 1095 | -"convertloop:" |
michael@0 | 1096 | - "movzbl (%edi),%eax\n" |
michael@0 | 1097 | - "add $0x1,%edi\n" |
michael@0 | 1098 | - "movzbl (%esi),%ebx\n" |
michael@0 | 1099 | - "add $0x1,%esi\n" |
michael@0 | 1100 | - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
michael@0 | 1101 | - "movzbl (%edx),%eax\n" |
michael@0 | 1102 | - "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" |
michael@0 | 1103 | - "movzbl 0x1(%edx),%ebx\n" |
michael@0 | 1104 | - "movq kCoefficientsRgbY(,%eax,8),%mm1\n" |
michael@0 | 1105 | - "add $0x2,%edx\n" |
michael@0 | 1106 | - "movq kCoefficientsRgbY(,%ebx,8),%mm2\n" |
michael@0 | 1107 | - "paddsw %mm0,%mm1\n" |
michael@0 | 1108 | - "paddsw %mm0,%mm2\n" |
michael@0 | 1109 | - "psraw $0x6,%mm1\n" |
michael@0 | 1110 | - "psraw $0x6,%mm2\n" |
michael@0 | 1111 | - "packuswb %mm2,%mm1\n" |
michael@0 | 1112 | - "movntq %mm1,0x0(%ebp)\n" |
michael@0 | 1113 | - "add $0x8,%ebp\n" |
michael@0 | 1114 | -"convertend:" |
michael@0 | 1115 | - "sub $0x2,%ecx\n" |
michael@0 | 1116 | - "jns convertloop\n" |
michael@0 | 1117 | - |
michael@0 | 1118 | - "and $0x1,%ecx\n" |
michael@0 | 1119 | - "je convertdone\n" |
michael@0 | 1120 | - |
michael@0 | 1121 | - "movzbl (%edi),%eax\n" |
michael@0 | 1122 | - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
michael@0 | 1123 | - "movzbl (%esi),%eax\n" |
michael@0 | 1124 | - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" |
michael@0 | 1125 | - "movzbl (%edx),%eax\n" |
michael@0 | 1126 | - "movq kCoefficientsRgbY(,%eax,8),%mm1\n" |
michael@0 | 1127 | - "paddsw %mm0,%mm1\n" |
michael@0 | 1128 | - "psraw $0x6,%mm1\n" |
michael@0 | 1129 | - "packuswb %mm1,%mm1\n" |
michael@0 | 1130 | - "movd %mm1,0x0(%ebp)\n" |
michael@0 | 1131 | -"convertdone:" |
michael@0 | 1132 | - "popa\n" |
michael@0 | 1133 | - "ret\n" |
michael@0 | 1134 | -); |
michael@0 | 1135 | - |
michael@0 | 1136 | - |
michael@0 | 1137 | -void ScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 1138 | - const uint8* u_buf, |
michael@0 | 1139 | - const uint8* v_buf, |
michael@0 | 1140 | - uint8* rgb_buf, |
michael@0 | 1141 | - int width, |
michael@0 | 1142 | - int source_dx); |
michael@0 | 1143 | - asm( |
michael@0 | 1144 | - ".text\n" |
michael@0 | 1145 | - ".global ScaleYUVToRGB32Row\n" |
michael@0 | 1146 | -"ScaleYUVToRGB32Row:\n" |
michael@0 | 1147 | - "pusha\n" |
michael@0 | 1148 | - "mov 0x24(%esp),%edx\n" |
michael@0 | 1149 | - "mov 0x28(%esp),%edi\n" |
michael@0 | 1150 | - "mov 0x2c(%esp),%esi\n" |
michael@0 | 1151 | - "mov 0x30(%esp),%ebp\n" |
michael@0 | 1152 | - "mov 0x34(%esp),%ecx\n" |
michael@0 | 1153 | - "xor %ebx,%ebx\n" |
michael@0 | 1154 | - "jmp scaleend\n" |
michael@0 | 1155 | - |
michael@0 | 1156 | -"scaleloop:" |
michael@0 | 1157 | - "mov %ebx,%eax\n" |
michael@0 | 1158 | - "sar $0x11,%eax\n" |
michael@0 | 1159 | - "movzbl (%edi,%eax,1),%eax\n" |
michael@0 | 1160 | - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
michael@0 | 1161 | - "mov %ebx,%eax\n" |
michael@0 | 1162 | - "sar $0x11,%eax\n" |
michael@0 | 1163 | - "movzbl (%esi,%eax,1),%eax\n" |
michael@0 | 1164 | - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" |
michael@0 | 1165 | - "mov %ebx,%eax\n" |
michael@0 | 1166 | - "add 0x38(%esp),%ebx\n" |
michael@0 | 1167 | - "sar $0x10,%eax\n" |
michael@0 | 1168 | - "movzbl (%edx,%eax,1),%eax\n" |
michael@0 | 1169 | - "movq kCoefficientsRgbY(,%eax,8),%mm1\n" |
michael@0 | 1170 | - "mov %ebx,%eax\n" |
michael@0 | 1171 | - "add 0x38(%esp),%ebx\n" |
michael@0 | 1172 | - "sar $0x10,%eax\n" |
michael@0 | 1173 | - "movzbl (%edx,%eax,1),%eax\n" |
michael@0 | 1174 | - "movq kCoefficientsRgbY(,%eax,8),%mm2\n" |
michael@0 | 1175 | - "paddsw %mm0,%mm1\n" |
michael@0 | 1176 | - "paddsw %mm0,%mm2\n" |
michael@0 | 1177 | - "psraw $0x6,%mm1\n" |
michael@0 | 1178 | - "psraw $0x6,%mm2\n" |
michael@0 | 1179 | - "packuswb %mm2,%mm1\n" |
michael@0 | 1180 | - "movntq %mm1,0x0(%ebp)\n" |
michael@0 | 1181 | - "add $0x8,%ebp\n" |
michael@0 | 1182 | -"scaleend:" |
michael@0 | 1183 | - "sub $0x2,%ecx\n" |
michael@0 | 1184 | - "jns scaleloop\n" |
michael@0 | 1185 | - |
michael@0 | 1186 | - "and $0x1,%ecx\n" |
michael@0 | 1187 | - "je scaledone\n" |
michael@0 | 1188 | - |
michael@0 | 1189 | - "mov %ebx,%eax\n" |
michael@0 | 1190 | - "sar $0x11,%eax\n" |
michael@0 | 1191 | - "movzbl (%edi,%eax,1),%eax\n" |
michael@0 | 1192 | - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
michael@0 | 1193 | - "mov %ebx,%eax\n" |
michael@0 | 1194 | - "sar $0x11,%eax\n" |
michael@0 | 1195 | - "movzbl (%esi,%eax,1),%eax\n" |
michael@0 | 1196 | - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" |
michael@0 | 1197 | - "mov %ebx,%eax\n" |
michael@0 | 1198 | - "sar $0x10,%eax\n" |
michael@0 | 1199 | - "movzbl (%edx,%eax,1),%eax\n" |
michael@0 | 1200 | - "movq kCoefficientsRgbY(,%eax,8),%mm1\n" |
michael@0 | 1201 | - "paddsw %mm0,%mm1\n" |
michael@0 | 1202 | - "psraw $0x6,%mm1\n" |
michael@0 | 1203 | - "packuswb %mm1,%mm1\n" |
michael@0 | 1204 | - "movd %mm1,0x0(%ebp)\n" |
michael@0 | 1205 | - |
michael@0 | 1206 | -"scaledone:" |
michael@0 | 1207 | - "popa\n" |
michael@0 | 1208 | - "ret\n" |
michael@0 | 1209 | -); |
michael@0 | 1210 | - |
michael@0 | 1211 | -void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 1212 | - const uint8* u_buf, |
michael@0 | 1213 | - const uint8* v_buf, |
michael@0 | 1214 | - uint8* rgb_buf, |
michael@0 | 1215 | - int width, |
michael@0 | 1216 | - int source_dx); |
michael@0 | 1217 | - asm( |
michael@0 | 1218 | - ".text\n" |
michael@0 | 1219 | - ".global LinearScaleYUVToRGB32Row\n" |
michael@0 | 1220 | -"LinearScaleYUVToRGB32Row:\n" |
michael@0 | 1221 | - "pusha\n" |
michael@0 | 1222 | - "mov 0x24(%esp),%edx\n" |
michael@0 | 1223 | - "mov 0x28(%esp),%edi\n" |
michael@0 | 1224 | - "mov 0x30(%esp),%ebp\n" |
michael@0 | 1225 | - |
michael@0 | 1226 | - // source_width = width * source_dx + ebx |
michael@0 | 1227 | - "mov 0x34(%esp), %ecx\n" |
michael@0 | 1228 | - "imull 0x38(%esp), %ecx\n" |
michael@0 | 1229 | - "mov %ecx, 0x34(%esp)\n" |
michael@0 | 1230 | - |
michael@0 | 1231 | - "mov 0x38(%esp), %ecx\n" |
michael@0 | 1232 | - "xor %ebx,%ebx\n" // x = 0 |
michael@0 | 1233 | - "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 |
michael@0 | 1234 | - "jl .lscaleend\n" |
michael@0 | 1235 | - "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less |
michael@0 | 1236 | - "jmp .lscaleend\n" |
michael@0 | 1237 | - |
michael@0 | 1238 | -".lscaleloop:" |
michael@0 | 1239 | - "mov %ebx,%eax\n" |
michael@0 | 1240 | - "sar $0x11,%eax\n" |
michael@0 | 1241 | - |
michael@0 | 1242 | - "movzbl (%edi,%eax,1),%ecx\n" |
michael@0 | 1243 | - "movzbl 1(%edi,%eax,1),%esi\n" |
michael@0 | 1244 | - "mov %ebx,%eax\n" |
michael@0 | 1245 | - "andl $0x1fffe, %eax \n" |
michael@0 | 1246 | - "imul %eax, %esi \n" |
michael@0 | 1247 | - "xorl $0x1fffe, %eax \n" |
michael@0 | 1248 | - "imul %eax, %ecx \n" |
michael@0 | 1249 | - "addl %esi, %ecx \n" |
michael@0 | 1250 | - "shrl $17, %ecx \n" |
michael@0 | 1251 | - "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n" |
michael@0 | 1252 | - |
michael@0 | 1253 | - "mov 0x2c(%esp),%esi\n" |
michael@0 | 1254 | - "mov %ebx,%eax\n" |
michael@0 | 1255 | - "sar $0x11,%eax\n" |
michael@0 | 1256 | - |
michael@0 | 1257 | - "movzbl (%esi,%eax,1),%ecx\n" |
michael@0 | 1258 | - "movzbl 1(%esi,%eax,1),%esi\n" |
michael@0 | 1259 | - "mov %ebx,%eax\n" |
michael@0 | 1260 | - "andl $0x1fffe, %eax \n" |
michael@0 | 1261 | - "imul %eax, %esi \n" |
michael@0 | 1262 | - "xorl $0x1fffe, %eax \n" |
michael@0 | 1263 | - "imul %eax, %ecx \n" |
michael@0 | 1264 | - "addl %esi, %ecx \n" |
michael@0 | 1265 | - "shrl $17, %ecx \n" |
michael@0 | 1266 | - "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n" |
michael@0 | 1267 | - |
michael@0 | 1268 | - "mov %ebx,%eax\n" |
michael@0 | 1269 | - "sar $0x10,%eax\n" |
michael@0 | 1270 | - "movzbl (%edx,%eax,1),%ecx\n" |
michael@0 | 1271 | - "movzbl 1(%edx,%eax,1),%esi\n" |
michael@0 | 1272 | - "mov %ebx,%eax\n" |
michael@0 | 1273 | - "add 0x38(%esp),%ebx\n" |
michael@0 | 1274 | - "andl $0xffff, %eax \n" |
michael@0 | 1275 | - "imul %eax, %esi \n" |
michael@0 | 1276 | - "xorl $0xffff, %eax \n" |
michael@0 | 1277 | - "imul %eax, %ecx \n" |
michael@0 | 1278 | - "addl %esi, %ecx \n" |
michael@0 | 1279 | - "shrl $16, %ecx \n" |
michael@0 | 1280 | - "movq kCoefficientsRgbY(,%ecx,8),%mm1\n" |
michael@0 | 1281 | - |
michael@0 | 1282 | - "cmp 0x34(%esp), %ebx\n" |
michael@0 | 1283 | - "jge .lscalelastpixel\n" |
michael@0 | 1284 | - |
michael@0 | 1285 | - "mov %ebx,%eax\n" |
michael@0 | 1286 | - "sar $0x10,%eax\n" |
michael@0 | 1287 | - "movzbl (%edx,%eax,1),%ecx\n" |
michael@0 | 1288 | - "movzbl 1(%edx,%eax,1),%esi\n" |
michael@0 | 1289 | - "mov %ebx,%eax\n" |
michael@0 | 1290 | - "add 0x38(%esp),%ebx\n" |
michael@0 | 1291 | - "andl $0xffff, %eax \n" |
michael@0 | 1292 | - "imul %eax, %esi \n" |
michael@0 | 1293 | - "xorl $0xffff, %eax \n" |
michael@0 | 1294 | - "imul %eax, %ecx \n" |
michael@0 | 1295 | - "addl %esi, %ecx \n" |
michael@0 | 1296 | - "shrl $16, %ecx \n" |
michael@0 | 1297 | - "movq kCoefficientsRgbY(,%ecx,8),%mm2\n" |
michael@0 | 1298 | - |
michael@0 | 1299 | - "paddsw %mm0,%mm1\n" |
michael@0 | 1300 | - "paddsw %mm0,%mm2\n" |
michael@0 | 1301 | - "psraw $0x6,%mm1\n" |
michael@0 | 1302 | - "psraw $0x6,%mm2\n" |
michael@0 | 1303 | - "packuswb %mm2,%mm1\n" |
michael@0 | 1304 | - "movntq %mm1,0x0(%ebp)\n" |
michael@0 | 1305 | - "add $0x8,%ebp\n" |
michael@0 | 1306 | - |
michael@0 | 1307 | -".lscaleend:" |
michael@0 | 1308 | - "cmp 0x34(%esp), %ebx\n" |
michael@0 | 1309 | - "jl .lscaleloop\n" |
michael@0 | 1310 | - "popa\n" |
michael@0 | 1311 | - "ret\n" |
michael@0 | 1312 | - |
michael@0 | 1313 | -".lscalelastpixel:" |
michael@0 | 1314 | - "paddsw %mm0, %mm1\n" |
michael@0 | 1315 | - "psraw $6, %mm1\n" |
michael@0 | 1316 | - "packuswb %mm1, %mm1\n" |
michael@0 | 1317 | - "movd %mm1, (%ebp)\n" |
michael@0 | 1318 | - "popa\n" |
michael@0 | 1319 | - "ret\n" |
michael@0 | 1320 | -); |
michael@0 | 1321 | - |
michael@0 | 1322 | -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__) |
michael@0 | 1323 | - |
michael@0 | 1324 | -extern void PICConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 1325 | - const uint8* u_buf, |
michael@0 | 1326 | - const uint8* v_buf, |
michael@0 | 1327 | - uint8* rgb_buf, |
michael@0 | 1328 | - int width, |
michael@0 | 1329 | - int16 *kCoefficientsRgbY); |
michael@0 | 1330 | - asm( |
michael@0 | 1331 | - ".text\n" |
michael@0 | 1332 | -#if defined(OS_MACOSX) |
michael@0 | 1333 | -"_PICConvertYUVToRGB32Row:\n" |
michael@0 | 1334 | -#else |
michael@0 | 1335 | -"PICConvertYUVToRGB32Row:\n" |
michael@0 | 1336 | -#endif |
michael@0 | 1337 | - "pusha\n" |
michael@0 | 1338 | - "mov 0x24(%esp),%edx\n" |
michael@0 | 1339 | - "mov 0x28(%esp),%edi\n" |
michael@0 | 1340 | - "mov 0x2c(%esp),%esi\n" |
michael@0 | 1341 | - "mov 0x30(%esp),%ebp\n" |
michael@0 | 1342 | - "mov 0x38(%esp),%ecx\n" |
michael@0 | 1343 | - |
michael@0 | 1344 | - "jmp .Lconvertend\n" |
michael@0 | 1345 | - |
michael@0 | 1346 | -".Lconvertloop:" |
michael@0 | 1347 | - "movzbl (%edi),%eax\n" |
michael@0 | 1348 | - "add $0x1,%edi\n" |
michael@0 | 1349 | - "movzbl (%esi),%ebx\n" |
michael@0 | 1350 | - "add $0x1,%esi\n" |
michael@0 | 1351 | - "movq 2048(%ecx,%eax,8),%mm0\n" |
michael@0 | 1352 | - "movzbl (%edx),%eax\n" |
michael@0 | 1353 | - "paddsw 4096(%ecx,%ebx,8),%mm0\n" |
michael@0 | 1354 | - "movzbl 0x1(%edx),%ebx\n" |
michael@0 | 1355 | - "movq 0(%ecx,%eax,8),%mm1\n" |
michael@0 | 1356 | - "add $0x2,%edx\n" |
michael@0 | 1357 | - "movq 0(%ecx,%ebx,8),%mm2\n" |
michael@0 | 1358 | - "paddsw %mm0,%mm1\n" |
michael@0 | 1359 | - "paddsw %mm0,%mm2\n" |
michael@0 | 1360 | - "psraw $0x6,%mm1\n" |
michael@0 | 1361 | - "psraw $0x6,%mm2\n" |
michael@0 | 1362 | - "packuswb %mm2,%mm1\n" |
michael@0 | 1363 | - "movntq %mm1,0x0(%ebp)\n" |
michael@0 | 1364 | - "add $0x8,%ebp\n" |
michael@0 | 1365 | -".Lconvertend:" |
michael@0 | 1366 | - "subl $0x2,0x34(%esp)\n" |
michael@0 | 1367 | - "jns .Lconvertloop\n" |
michael@0 | 1368 | - |
michael@0 | 1369 | - "andl $0x1,0x34(%esp)\n" |
michael@0 | 1370 | - "je .Lconvertdone\n" |
michael@0 | 1371 | - |
michael@0 | 1372 | - "movzbl (%edi),%eax\n" |
michael@0 | 1373 | - "movq 2048(%ecx,%eax,8),%mm0\n" |
michael@0 | 1374 | - "movzbl (%esi),%eax\n" |
michael@0 | 1375 | - "paddsw 4096(%ecx,%eax,8),%mm0\n" |
michael@0 | 1376 | - "movzbl (%edx),%eax\n" |
michael@0 | 1377 | - "movq 0(%ecx,%eax,8),%mm1\n" |
michael@0 | 1378 | - "paddsw %mm0,%mm1\n" |
michael@0 | 1379 | - "psraw $0x6,%mm1\n" |
michael@0 | 1380 | - "packuswb %mm1,%mm1\n" |
michael@0 | 1381 | - "movd %mm1,0x0(%ebp)\n" |
michael@0 | 1382 | -".Lconvertdone:\n" |
michael@0 | 1383 | - "popa\n" |
michael@0 | 1384 | - "ret\n" |
michael@0 | 1385 | -); |
michael@0 | 1386 | - |
michael@0 | 1387 | -void FastConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 1388 | - const uint8* u_buf, |
michael@0 | 1389 | - const uint8* v_buf, |
michael@0 | 1390 | - uint8* rgb_buf, |
michael@0 | 1391 | - int width) { |
michael@0 | 1392 | - PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, |
michael@0 | 1393 | - &kCoefficientsRgbY[0][0]); |
michael@0 | 1394 | -} |
michael@0 | 1395 | - |
michael@0 | 1396 | -extern void PICScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 1397 | - const uint8* u_buf, |
michael@0 | 1398 | - const uint8* v_buf, |
michael@0 | 1399 | - uint8* rgb_buf, |
michael@0 | 1400 | - int width, |
michael@0 | 1401 | - int source_dx, |
michael@0 | 1402 | - int16 *kCoefficientsRgbY); |
michael@0 | 1403 | - |
michael@0 | 1404 | - asm( |
michael@0 | 1405 | - ".text\n" |
michael@0 | 1406 | -#if defined(OS_MACOSX) |
michael@0 | 1407 | -"_PICScaleYUVToRGB32Row:\n" |
michael@0 | 1408 | -#else |
michael@0 | 1409 | -"PICScaleYUVToRGB32Row:\n" |
michael@0 | 1410 | -#endif |
michael@0 | 1411 | - "pusha\n" |
michael@0 | 1412 | - "mov 0x24(%esp),%edx\n" |
michael@0 | 1413 | - "mov 0x28(%esp),%edi\n" |
michael@0 | 1414 | - "mov 0x2c(%esp),%esi\n" |
michael@0 | 1415 | - "mov 0x30(%esp),%ebp\n" |
michael@0 | 1416 | - "mov 0x3c(%esp),%ecx\n" |
michael@0 | 1417 | - "xor %ebx,%ebx\n" |
michael@0 | 1418 | - "jmp Lscaleend\n" |
michael@0 | 1419 | - |
michael@0 | 1420 | -"Lscaleloop:" |
michael@0 | 1421 | - "mov %ebx,%eax\n" |
michael@0 | 1422 | - "sar $0x11,%eax\n" |
michael@0 | 1423 | - "movzbl (%edi,%eax,1),%eax\n" |
michael@0 | 1424 | - "movq 2048(%ecx,%eax,8),%mm0\n" |
michael@0 | 1425 | - "mov %ebx,%eax\n" |
michael@0 | 1426 | - "sar $0x11,%eax\n" |
michael@0 | 1427 | - "movzbl (%esi,%eax,1),%eax\n" |
michael@0 | 1428 | - "paddsw 4096(%ecx,%eax,8),%mm0\n" |
michael@0 | 1429 | - "mov %ebx,%eax\n" |
michael@0 | 1430 | - "add 0x38(%esp),%ebx\n" |
michael@0 | 1431 | - "sar $0x10,%eax\n" |
michael@0 | 1432 | - "movzbl (%edx,%eax,1),%eax\n" |
michael@0 | 1433 | - "movq 0(%ecx,%eax,8),%mm1\n" |
michael@0 | 1434 | - "mov %ebx,%eax\n" |
michael@0 | 1435 | - "add 0x38(%esp),%ebx\n" |
michael@0 | 1436 | - "sar $0x10,%eax\n" |
michael@0 | 1437 | - "movzbl (%edx,%eax,1),%eax\n" |
michael@0 | 1438 | - "movq 0(%ecx,%eax,8),%mm2\n" |
michael@0 | 1439 | - "paddsw %mm0,%mm1\n" |
michael@0 | 1440 | - "paddsw %mm0,%mm2\n" |
michael@0 | 1441 | - "psraw $0x6,%mm1\n" |
michael@0 | 1442 | - "psraw $0x6,%mm2\n" |
michael@0 | 1443 | - "packuswb %mm2,%mm1\n" |
michael@0 | 1444 | - "movntq %mm1,0x0(%ebp)\n" |
michael@0 | 1445 | - "add $0x8,%ebp\n" |
michael@0 | 1446 | -"Lscaleend:" |
michael@0 | 1447 | - "subl $0x2,0x34(%esp)\n" |
michael@0 | 1448 | - "jns Lscaleloop\n" |
michael@0 | 1449 | - |
michael@0 | 1450 | - "andl $0x1,0x34(%esp)\n" |
michael@0 | 1451 | - "je Lscaledone\n" |
michael@0 | 1452 | - |
michael@0 | 1453 | - "mov %ebx,%eax\n" |
michael@0 | 1454 | - "sar $0x11,%eax\n" |
michael@0 | 1455 | - "movzbl (%edi,%eax,1),%eax\n" |
michael@0 | 1456 | - "movq 2048(%ecx,%eax,8),%mm0\n" |
michael@0 | 1457 | - "mov %ebx,%eax\n" |
michael@0 | 1458 | - "sar $0x11,%eax\n" |
michael@0 | 1459 | - "movzbl (%esi,%eax,1),%eax\n" |
michael@0 | 1460 | - "paddsw 4096(%ecx,%eax,8),%mm0\n" |
michael@0 | 1461 | - "mov %ebx,%eax\n" |
michael@0 | 1462 | - "sar $0x10,%eax\n" |
michael@0 | 1463 | - "movzbl (%edx,%eax,1),%eax\n" |
michael@0 | 1464 | - "movq 0(%ecx,%eax,8),%mm1\n" |
michael@0 | 1465 | - "paddsw %mm0,%mm1\n" |
michael@0 | 1466 | - "psraw $0x6,%mm1\n" |
michael@0 | 1467 | - "packuswb %mm1,%mm1\n" |
michael@0 | 1468 | - "movd %mm1,0x0(%ebp)\n" |
michael@0 | 1469 | - |
michael@0 | 1470 | -"Lscaledone:" |
michael@0 | 1471 | - "popa\n" |
michael@0 | 1472 | - "ret\n" |
michael@0 | 1473 | -); |
michael@0 | 1474 | - |
michael@0 | 1475 | - |
michael@0 | 1476 | -void ScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 1477 | - const uint8* u_buf, |
michael@0 | 1478 | - const uint8* v_buf, |
michael@0 | 1479 | - uint8* rgb_buf, |
michael@0 | 1480 | - int width, |
michael@0 | 1481 | - int source_dx) { |
michael@0 | 1482 | - PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, |
michael@0 | 1483 | - &kCoefficientsRgbY[0][0]); |
michael@0 | 1484 | -} |
michael@0 | 1485 | - |
michael@0 | 1486 | -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 1487 | - const uint8* u_buf, |
michael@0 | 1488 | - const uint8* v_buf, |
michael@0 | 1489 | - uint8* rgb_buf, |
michael@0 | 1490 | - int width, |
michael@0 | 1491 | - int source_dx, |
michael@0 | 1492 | - int16 *kCoefficientsRgbY); |
michael@0 | 1493 | - asm( |
michael@0 | 1494 | - ".text\n" |
michael@0 | 1495 | -#if defined(OS_MACOSX) |
michael@0 | 1496 | -"_PICLinearScaleYUVToRGB32Row:\n" |
michael@0 | 1497 | -#else |
michael@0 | 1498 | -"PICLinearScaleYUVToRGB32Row:\n" |
michael@0 | 1499 | -#endif |
michael@0 | 1500 | - "pusha\n" |
michael@0 | 1501 | - "mov 0x24(%esp),%edx\n" |
michael@0 | 1502 | - "mov 0x30(%esp),%ebp\n" |
michael@0 | 1503 | - "mov 0x34(%esp),%ecx\n" |
michael@0 | 1504 | - "mov 0x3c(%esp),%edi\n" |
michael@0 | 1505 | - "xor %ebx,%ebx\n" |
michael@0 | 1506 | - |
michael@0 | 1507 | - // source_width = width * source_dx + ebx |
michael@0 | 1508 | - "mov 0x34(%esp), %ecx\n" |
michael@0 | 1509 | - "imull 0x38(%esp), %ecx\n" |
michael@0 | 1510 | - "mov %ecx, 0x34(%esp)\n" |
michael@0 | 1511 | - |
michael@0 | 1512 | - "mov 0x38(%esp), %ecx\n" |
michael@0 | 1513 | - "xor %ebx,%ebx\n" // x = 0 |
michael@0 | 1514 | - "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 |
michael@0 | 1515 | - "jl .lscaleend\n" |
michael@0 | 1516 | - "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less |
michael@0 | 1517 | - "jmp .lscaleend\n" |
michael@0 | 1518 | - |
michael@0 | 1519 | -".lscaleloop:" |
michael@0 | 1520 | - "mov 0x28(%esp),%esi\n" |
michael@0 | 1521 | - "mov %ebx,%eax\n" |
michael@0 | 1522 | - "sar $0x11,%eax\n" |
michael@0 | 1523 | - |
michael@0 | 1524 | - "movzbl (%esi,%eax,1),%ecx\n" |
michael@0 | 1525 | - "movzbl 1(%esi,%eax,1),%esi\n" |
michael@0 | 1526 | - "mov %ebx,%eax\n" |
michael@0 | 1527 | - "andl $0x1fffe, %eax \n" |
michael@0 | 1528 | - "imul %eax, %esi \n" |
michael@0 | 1529 | - "xorl $0x1fffe, %eax \n" |
michael@0 | 1530 | - "imul %eax, %ecx \n" |
michael@0 | 1531 | - "addl %esi, %ecx \n" |
michael@0 | 1532 | - "shrl $17, %ecx \n" |
michael@0 | 1533 | - "movq 2048(%edi,%ecx,8),%mm0\n" |
michael@0 | 1534 | - |
michael@0 | 1535 | - "mov 0x2c(%esp),%esi\n" |
michael@0 | 1536 | - "mov %ebx,%eax\n" |
michael@0 | 1537 | - "sar $0x11,%eax\n" |
michael@0 | 1538 | - |
michael@0 | 1539 | - "movzbl (%esi,%eax,1),%ecx\n" |
michael@0 | 1540 | - "movzbl 1(%esi,%eax,1),%esi\n" |
michael@0 | 1541 | - "mov %ebx,%eax\n" |
michael@0 | 1542 | - "andl $0x1fffe, %eax \n" |
michael@0 | 1543 | - "imul %eax, %esi \n" |
michael@0 | 1544 | - "xorl $0x1fffe, %eax \n" |
michael@0 | 1545 | - "imul %eax, %ecx \n" |
michael@0 | 1546 | - "addl %esi, %ecx \n" |
michael@0 | 1547 | - "shrl $17, %ecx \n" |
michael@0 | 1548 | - "paddsw 4096(%edi,%ecx,8),%mm0\n" |
michael@0 | 1549 | - |
michael@0 | 1550 | - "mov %ebx,%eax\n" |
michael@0 | 1551 | - "sar $0x10,%eax\n" |
michael@0 | 1552 | - "movzbl (%edx,%eax,1),%ecx\n" |
michael@0 | 1553 | - "movzbl 1(%edx,%eax,1),%esi\n" |
michael@0 | 1554 | - "mov %ebx,%eax\n" |
michael@0 | 1555 | - "add 0x38(%esp),%ebx\n" |
michael@0 | 1556 | - "andl $0xffff, %eax \n" |
michael@0 | 1557 | - "imul %eax, %esi \n" |
michael@0 | 1558 | - "xorl $0xffff, %eax \n" |
michael@0 | 1559 | - "imul %eax, %ecx \n" |
michael@0 | 1560 | - "addl %esi, %ecx \n" |
michael@0 | 1561 | - "shrl $16, %ecx \n" |
michael@0 | 1562 | - "movq (%edi,%ecx,8),%mm1\n" |
michael@0 | 1563 | - |
michael@0 | 1564 | - "cmp 0x34(%esp), %ebx\n" |
michael@0 | 1565 | - "jge .lscalelastpixel\n" |
michael@0 | 1566 | - |
michael@0 | 1567 | - "mov %ebx,%eax\n" |
michael@0 | 1568 | - "sar $0x10,%eax\n" |
michael@0 | 1569 | - "movzbl (%edx,%eax,1),%ecx\n" |
michael@0 | 1570 | - "movzbl 1(%edx,%eax,1),%esi\n" |
michael@0 | 1571 | - "mov %ebx,%eax\n" |
michael@0 | 1572 | - "add 0x38(%esp),%ebx\n" |
michael@0 | 1573 | - "andl $0xffff, %eax \n" |
michael@0 | 1574 | - "imul %eax, %esi \n" |
michael@0 | 1575 | - "xorl $0xffff, %eax \n" |
michael@0 | 1576 | - "imul %eax, %ecx \n" |
michael@0 | 1577 | - "addl %esi, %ecx \n" |
michael@0 | 1578 | - "shrl $16, %ecx \n" |
michael@0 | 1579 | - "movq (%edi,%ecx,8),%mm2\n" |
michael@0 | 1580 | - |
michael@0 | 1581 | - "paddsw %mm0,%mm1\n" |
michael@0 | 1582 | - "paddsw %mm0,%mm2\n" |
michael@0 | 1583 | - "psraw $0x6,%mm1\n" |
michael@0 | 1584 | - "psraw $0x6,%mm2\n" |
michael@0 | 1585 | - "packuswb %mm2,%mm1\n" |
michael@0 | 1586 | - "movntq %mm1,0x0(%ebp)\n" |
michael@0 | 1587 | - "add $0x8,%ebp\n" |
michael@0 | 1588 | - |
michael@0 | 1589 | -".lscaleend:" |
michael@0 | 1590 | - "cmp %ebx, 0x34(%esp)\n" |
michael@0 | 1591 | - "jg .lscaleloop\n" |
michael@0 | 1592 | - "popa\n" |
michael@0 | 1593 | - "ret\n" |
michael@0 | 1594 | - |
michael@0 | 1595 | -".lscalelastpixel:" |
michael@0 | 1596 | - "paddsw %mm0, %mm1\n" |
michael@0 | 1597 | - "psraw $6, %mm1\n" |
michael@0 | 1598 | - "packuswb %mm1, %mm1\n" |
michael@0 | 1599 | - "movd %mm1, (%ebp)\n" |
michael@0 | 1600 | - "popa\n" |
michael@0 | 1601 | - "ret\n" |
michael@0 | 1602 | -); |
michael@0 | 1603 | - |
michael@0 | 1604 | -void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 1605 | - const uint8* u_buf, |
michael@0 | 1606 | - const uint8* v_buf, |
michael@0 | 1607 | - uint8* rgb_buf, |
michael@0 | 1608 | - int width, |
michael@0 | 1609 | - int source_dx) { |
michael@0 | 1610 | - PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, |
michael@0 | 1611 | - &kCoefficientsRgbY[0][0]); |
michael@0 | 1612 | -} |
michael@0 | 1613 | - |
michael@0 | 1614 | -#else // USE_MMX |
michael@0 | 1615 | - |
michael@0 | 1616 | // C reference code that mimic the YUV assembly. |
michael@0 | 1617 | #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) |
michael@0 | 1618 | #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ |
michael@0 | 1619 | (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) |
michael@0 | 1620 | |
michael@0 | 1621 | static inline void YuvPixel(uint8 y, |
michael@0 | 1622 | uint8 u, |
michael@0 | 1623 | uint8 v, |
michael@0 | 1624 | @@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y, |
michael@0 | 1625 | a >>= 6; |
michael@0 | 1626 | |
michael@0 | 1627 | *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) | |
michael@0 | 1628 | (packuswb(g) << 8) | |
michael@0 | 1629 | (packuswb(r) << 16) | |
michael@0 | 1630 | (packuswb(a) << 24); |
michael@0 | 1631 | } |
michael@0 | 1632 | |
michael@0 | 1633 | -void FastConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 1634 | - const uint8* u_buf, |
michael@0 | 1635 | - const uint8* v_buf, |
michael@0 | 1636 | - uint8* rgb_buf, |
michael@0 | 1637 | - int width) { |
michael@0 | 1638 | +void FastConvertYUVToRGB32Row_C(const uint8* y_buf, |
michael@0 | 1639 | + const uint8* u_buf, |
michael@0 | 1640 | + const uint8* v_buf, |
michael@0 | 1641 | + uint8* rgb_buf, |
michael@0 | 1642 | + int width, |
michael@0 | 1643 | + unsigned int x_shift) { |
michael@0 | 1644 | for (int x = 0; x < width; x += 2) { |
michael@0 | 1645 | - uint8 u = u_buf[x >> 1]; |
michael@0 | 1646 | - uint8 v = v_buf[x >> 1]; |
michael@0 | 1647 | + uint8 u = u_buf[x >> x_shift]; |
michael@0 | 1648 | + uint8 v = v_buf[x >> x_shift]; |
michael@0 | 1649 | uint8 y0 = y_buf[x]; |
michael@0 | 1650 | YuvPixel(y0, u, v, rgb_buf); |
michael@0 | 1651 | if ((x + 1) < width) { |
michael@0 | 1652 | uint8 y1 = y_buf[x + 1]; |
michael@0 | 1653 | + if (x_shift == 0) { |
michael@0 | 1654 | + u = u_buf[x + 1]; |
michael@0 | 1655 | + v = v_buf[x + 1]; |
michael@0 | 1656 | + } |
michael@0 | 1657 | YuvPixel(y1, u, v, rgb_buf + 4); |
michael@0 | 1658 | } |
michael@0 | 1659 | rgb_buf += 8; // Advance 2 pixels. |
michael@0 | 1660 | } |
michael@0 | 1661 | } |
michael@0 | 1662 | |
michael@0 | 1663 | // 16.16 fixed point is used. A shift by 16 isolates the integer. |
michael@0 | 1664 | // A shift by 17 is used to further subsample the chrominence channels. |
michael@0 | 1665 | // & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits, |
michael@0 | 1666 | // for 1/65536 pixel accurate interpolation. |
michael@0 | 1667 | -void ScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 1668 | - const uint8* u_buf, |
michael@0 | 1669 | - const uint8* v_buf, |
michael@0 | 1670 | - uint8* rgb_buf, |
michael@0 | 1671 | - int width, |
michael@0 | 1672 | - int source_dx) { |
michael@0 | 1673 | +void ScaleYUVToRGB32Row_C(const uint8* y_buf, |
michael@0 | 1674 | + const uint8* u_buf, |
michael@0 | 1675 | + const uint8* v_buf, |
michael@0 | 1676 | + uint8* rgb_buf, |
michael@0 | 1677 | + int width, |
michael@0 | 1678 | + int source_dx) { |
michael@0 | 1679 | int x = 0; |
michael@0 | 1680 | for (int i = 0; i < width; i += 2) { |
michael@0 | 1681 | int y = y_buf[x >> 16]; |
michael@0 | 1682 | int u = u_buf[(x >> 17)]; |
michael@0 | 1683 | int v = v_buf[(x >> 17)]; |
michael@0 | 1684 | YuvPixel(y, u, v, rgb_buf); |
michael@0 | 1685 | x += source_dx; |
michael@0 | 1686 | if ((i + 1) < width) { |
michael@0 | 1687 | y = y_buf[x >> 16]; |
michael@0 | 1688 | YuvPixel(y, u, v, rgb_buf+4); |
michael@0 | 1689 | x += source_dx; |
michael@0 | 1690 | } |
michael@0 | 1691 | rgb_buf += 8; |
michael@0 | 1692 | } |
michael@0 | 1693 | } |
michael@0 | 1694 | |
michael@0 | 1695 | -void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 1696 | - const uint8* u_buf, |
michael@0 | 1697 | - const uint8* v_buf, |
michael@0 | 1698 | - uint8* rgb_buf, |
michael@0 | 1699 | - int width, |
michael@0 | 1700 | - int source_dx) { |
michael@0 | 1701 | +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf, |
michael@0 | 1702 | + const uint8* u_buf, |
michael@0 | 1703 | + const uint8* v_buf, |
michael@0 | 1704 | + uint8* rgb_buf, |
michael@0 | 1705 | + int width, |
michael@0 | 1706 | + int source_dx) { |
michael@0 | 1707 | int x = 0; |
michael@0 | 1708 | if (source_dx >= 0x20000) { |
michael@0 | 1709 | x = 32768; |
michael@0 | 1710 | } |
michael@0 | 1711 | for (int i = 0; i < width; i += 2) { |
michael@0 | 1712 | int y0 = y_buf[x >> 16]; |
michael@0 | 1713 | int y1 = y_buf[(x >> 16) + 1]; |
michael@0 | 1714 | int u0 = u_buf[(x >> 17)]; |
michael@0 | 1715 | @@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint |
michael@0 | 1716 | y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; |
michael@0 | 1717 | YuvPixel(y, u, v, rgb_buf+4); |
michael@0 | 1718 | x += source_dx; |
michael@0 | 1719 | } |
michael@0 | 1720 | rgb_buf += 8; |
michael@0 | 1721 | } |
michael@0 | 1722 | } |
michael@0 | 1723 | |
michael@0 | 1724 | -#endif // USE_MMX |
michael@0 | 1725 | } // extern "C" |
michael@0 | 1726 | |
michael@0 | 1727 | diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp |
michael@0 | 1728 | --- a/gfx/ycbcr/yuv_row_posix.cpp |
michael@0 | 1729 | +++ b/gfx/ycbcr/yuv_row_posix.cpp |
michael@0 | 1730 | @@ -1,33 +1,32 @@ |
michael@0 | 1731 | // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
michael@0 | 1732 | // Use of this source code is governed by a BSD-style license that can be |
michael@0 | 1733 | // found in the LICENSE file. |
michael@0 | 1734 | |
michael@0 | 1735 | -#include "media/base/yuv_row.h" |
michael@0 | 1736 | - |
michael@0 | 1737 | -#ifdef _DEBUG |
michael@0 | 1738 | -#include "base/logging.h" |
michael@0 | 1739 | -#else |
michael@0 | 1740 | +#include "yuv_row.h" |
michael@0 | 1741 | +#include "mozilla/SSE.h" |
michael@0 | 1742 | + |
michael@0 | 1743 | #define DCHECK(a) |
michael@0 | 1744 | -#endif |
michael@0 | 1745 | |
michael@0 | 1746 | extern "C" { |
michael@0 | 1747 | |
michael@0 | 1748 | -#if USE_SSE2 && defined(ARCH_CPU_X86_64) |
michael@0 | 1749 | +#if defined(ARCH_CPU_X86_64) |
michael@0 | 1750 | + |
michael@0 | 1751 | +// We don't need CPUID guards here, since x86-64 implies SSE2. |
michael@0 | 1752 | |
michael@0 | 1753 | // AMD64 ABI uses register paremters. |
michael@0 | 1754 | void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi |
michael@0 | 1755 | const uint8* u_buf, // rsi |
michael@0 | 1756 | const uint8* v_buf, // rdx |
michael@0 | 1757 | uint8* rgb_buf, // rcx |
michael@0 | 1758 | int width) { // r8 |
michael@0 | 1759 | asm( |
michael@0 | 1760 | - "jmp convertend\n" |
michael@0 | 1761 | -"convertloop:" |
michael@0 | 1762 | + "jmp 1f\n" |
michael@0 | 1763 | +"0:" |
michael@0 | 1764 | "movzb (%1),%%r10\n" |
michael@0 | 1765 | "add $0x1,%1\n" |
michael@0 | 1766 | "movzb (%2),%%r11\n" |
michael@0 | 1767 | "add $0x1,%2\n" |
michael@0 | 1768 | "movq 2048(%5,%%r10,8),%%xmm0\n" |
michael@0 | 1769 | "movzb (%0),%%r10\n" |
michael@0 | 1770 | "movq 4096(%5,%%r11,8),%%xmm1\n" |
michael@0 | 1771 | "movzb 0x1(%0),%%r11\n" |
michael@0 | 1772 | @@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint |
michael@0 | 1773 | "movq (%5,%%r11,8),%%xmm3\n" |
michael@0 | 1774 | "paddsw %%xmm0,%%xmm2\n" |
michael@0 | 1775 | "paddsw %%xmm0,%%xmm3\n" |
michael@0 | 1776 | "shufps $0x44,%%xmm3,%%xmm2\n" |
michael@0 | 1777 | "psraw $0x6,%%xmm2\n" |
michael@0 | 1778 | "packuswb %%xmm2,%%xmm2\n" |
michael@0 | 1779 | "movq %%xmm2,0x0(%3)\n" |
michael@0 | 1780 | "add $0x8,%3\n" |
michael@0 | 1781 | -"convertend:" |
michael@0 | 1782 | +"1:" |
michael@0 | 1783 | "sub $0x2,%4\n" |
michael@0 | 1784 | - "jns convertloop\n" |
michael@0 | 1785 | - |
michael@0 | 1786 | -"convertnext:" |
michael@0 | 1787 | + "jns 0b\n" |
michael@0 | 1788 | + |
michael@0 | 1789 | +"2:" |
michael@0 | 1790 | "add $0x1,%4\n" |
michael@0 | 1791 | - "js convertdone\n" |
michael@0 | 1792 | + "js 3f\n" |
michael@0 | 1793 | |
michael@0 | 1794 | "movzb (%1),%%r10\n" |
michael@0 | 1795 | "movq 2048(%5,%%r10,8),%%xmm0\n" |
michael@0 | 1796 | "movzb (%2),%%r10\n" |
michael@0 | 1797 | "movq 4096(%5,%%r10,8),%%xmm1\n" |
michael@0 | 1798 | "paddsw %%xmm1,%%xmm0\n" |
michael@0 | 1799 | "movzb (%0),%%r10\n" |
michael@0 | 1800 | "movq (%5,%%r10,8),%%xmm1\n" |
michael@0 | 1801 | "paddsw %%xmm0,%%xmm1\n" |
michael@0 | 1802 | "psraw $0x6,%%xmm1\n" |
michael@0 | 1803 | "packuswb %%xmm1,%%xmm1\n" |
michael@0 | 1804 | "movd %%xmm1,0x0(%3)\n" |
michael@0 | 1805 | -"convertdone:" |
michael@0 | 1806 | +"3:" |
michael@0 | 1807 | : |
michael@0 | 1808 | : "r"(y_buf), // %0 |
michael@0 | 1809 | "r"(u_buf), // %1 |
michael@0 | 1810 | "r"(v_buf), // %2 |
michael@0 | 1811 | "r"(rgb_buf), // %3 |
michael@0 | 1812 | "r"(width), // %4 |
michael@0 | 1813 | "r" (kCoefficientsRgbY) // %5 |
michael@0 | 1814 | : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" |
michael@0 | 1815 | @@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b |
michael@0 | 1816 | const uint8* u_buf, // rsi |
michael@0 | 1817 | const uint8* v_buf, // rdx |
michael@0 | 1818 | uint8* rgb_buf, // rcx |
michael@0 | 1819 | int width, // r8 |
michael@0 | 1820 | int source_dx) { // r9 |
michael@0 | 1821 | asm( |
michael@0 | 1822 | "xor %%r11,%%r11\n" |
michael@0 | 1823 | "sub $0x2,%4\n" |
michael@0 | 1824 | - "js scalenext\n" |
michael@0 | 1825 | - |
michael@0 | 1826 | -"scaleloop:" |
michael@0 | 1827 | + "js 1f\n" |
michael@0 | 1828 | + |
michael@0 | 1829 | +"0:" |
michael@0 | 1830 | "mov %%r11,%%r10\n" |
michael@0 | 1831 | "sar $0x11,%%r10\n" |
michael@0 | 1832 | "movzb (%1,%%r10,1),%%rax\n" |
michael@0 | 1833 | "movq 2048(%5,%%rax,8),%%xmm0\n" |
michael@0 | 1834 | "movzb (%2,%%r10,1),%%rax\n" |
michael@0 | 1835 | "movq 4096(%5,%%rax,8),%%xmm1\n" |
michael@0 | 1836 | "lea (%%r11,%6),%%r10\n" |
michael@0 | 1837 | "sar $0x10,%%r11\n" |
michael@0 | 1838 | @@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b |
michael@0 | 1839 | "paddsw %%xmm0,%%xmm1\n" |
michael@0 | 1840 | "paddsw %%xmm0,%%xmm2\n" |
michael@0 | 1841 | "shufps $0x44,%%xmm2,%%xmm1\n" |
michael@0 | 1842 | "psraw $0x6,%%xmm1\n" |
michael@0 | 1843 | "packuswb %%xmm1,%%xmm1\n" |
michael@0 | 1844 | "movq %%xmm1,0x0(%3)\n" |
michael@0 | 1845 | "add $0x8,%3\n" |
michael@0 | 1846 | "sub $0x2,%4\n" |
michael@0 | 1847 | - "jns scaleloop\n" |
michael@0 | 1848 | - |
michael@0 | 1849 | -"scalenext:" |
michael@0 | 1850 | + "jns 0b\n" |
michael@0 | 1851 | + |
michael@0 | 1852 | +"1:" |
michael@0 | 1853 | "add $0x1,%4\n" |
michael@0 | 1854 | - "js scaledone\n" |
michael@0 | 1855 | + "js 2f\n" |
michael@0 | 1856 | |
michael@0 | 1857 | "mov %%r11,%%r10\n" |
michael@0 | 1858 | "sar $0x11,%%r10\n" |
michael@0 | 1859 | "movzb (%1,%%r10,1),%%rax\n" |
michael@0 | 1860 | "movq 2048(%5,%%rax,8),%%xmm0\n" |
michael@0 | 1861 | "movzb (%2,%%r10,1),%%rax\n" |
michael@0 | 1862 | "movq 4096(%5,%%rax,8),%%xmm1\n" |
michael@0 | 1863 | "paddsw %%xmm1,%%xmm0\n" |
michael@0 | 1864 | "sar $0x10,%%r11\n" |
michael@0 | 1865 | "movzb (%0,%%r11,1),%%rax\n" |
michael@0 | 1866 | "movq (%5,%%rax,8),%%xmm1\n" |
michael@0 | 1867 | "paddsw %%xmm0,%%xmm1\n" |
michael@0 | 1868 | "psraw $0x6,%%xmm1\n" |
michael@0 | 1869 | "packuswb %%xmm1,%%xmm1\n" |
michael@0 | 1870 | "movd %%xmm1,0x0(%3)\n" |
michael@0 | 1871 | |
michael@0 | 1872 | -"scaledone:" |
michael@0 | 1873 | +"2:" |
michael@0 | 1874 | : |
michael@0 | 1875 | : "r"(y_buf), // %0 |
michael@0 | 1876 | "r"(u_buf), // %1 |
michael@0 | 1877 | "r"(v_buf), // %2 |
michael@0 | 1878 | "r"(rgb_buf), // %3 |
michael@0 | 1879 | "r"(width), // %4 |
michael@0 | 1880 | "r" (kCoefficientsRgbY), // %5 |
michael@0 | 1881 | "r"(static_cast<long>(source_dx)) // %6 |
michael@0 | 1882 | @@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint |
michael@0 | 1883 | const uint8* u_buf, |
michael@0 | 1884 | const uint8* v_buf, |
michael@0 | 1885 | uint8* rgb_buf, |
michael@0 | 1886 | int width, |
michael@0 | 1887 | int source_dx) { |
michael@0 | 1888 | asm( |
michael@0 | 1889 | "xor %%r11,%%r11\n" // x = 0 |
michael@0 | 1890 | "sub $0x2,%4\n" |
michael@0 | 1891 | - "js .lscalenext\n" |
michael@0 | 1892 | + "js 2f\n" |
michael@0 | 1893 | "cmp $0x20000,%6\n" // if source_dx >= 2.0 |
michael@0 | 1894 | - "jl .lscalehalf\n" |
michael@0 | 1895 | + "jl 0f\n" |
michael@0 | 1896 | "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less |
michael@0 | 1897 | -".lscalehalf:" |
michael@0 | 1898 | - |
michael@0 | 1899 | -".lscaleloop:" |
michael@0 | 1900 | +"0:" |
michael@0 | 1901 | + |
michael@0 | 1902 | +"1:" |
michael@0 | 1903 | "mov %%r11,%%r10\n" |
michael@0 | 1904 | "sar $0x11,%%r10\n" |
michael@0 | 1905 | |
michael@0 | 1906 | "movzb (%1, %%r10, 1), %%r13 \n" |
michael@0 | 1907 | "movzb 1(%1, %%r10, 1), %%r14 \n" |
michael@0 | 1908 | "mov %%r11, %%rax \n" |
michael@0 | 1909 | "and $0x1fffe, %%rax \n" |
michael@0 | 1910 | "imul %%rax, %%r14 \n" |
michael@0 | 1911 | @@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint |
michael@0 | 1912 | "paddsw %%xmm0,%%xmm1\n" |
michael@0 | 1913 | "paddsw %%xmm0,%%xmm2\n" |
michael@0 | 1914 | "shufps $0x44,%%xmm2,%%xmm1\n" |
michael@0 | 1915 | "psraw $0x6,%%xmm1\n" |
michael@0 | 1916 | "packuswb %%xmm1,%%xmm1\n" |
michael@0 | 1917 | "movq %%xmm1,0x0(%3)\n" |
michael@0 | 1918 | "add $0x8,%3\n" |
michael@0 | 1919 | "sub $0x2,%4\n" |
michael@0 | 1920 | - "jns .lscaleloop\n" |
michael@0 | 1921 | - |
michael@0 | 1922 | -".lscalenext:" |
michael@0 | 1923 | + "jns 1b\n" |
michael@0 | 1924 | + |
michael@0 | 1925 | +"2:" |
michael@0 | 1926 | "add $0x1,%4\n" |
michael@0 | 1927 | - "js .lscaledone\n" |
michael@0 | 1928 | + "js 3f\n" |
michael@0 | 1929 | |
michael@0 | 1930 | "mov %%r11,%%r10\n" |
michael@0 | 1931 | "sar $0x11,%%r10\n" |
michael@0 | 1932 | |
michael@0 | 1933 | "movzb (%1,%%r10,1), %%r13 \n" |
michael@0 | 1934 | "movq 2048(%5,%%r13,8),%%xmm0\n" |
michael@0 | 1935 | |
michael@0 | 1936 | "movzb (%2,%%r10,1), %%r13 \n" |
michael@0 | 1937 | @@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint |
michael@0 | 1938 | "movzb (%0,%%r11,1), %%r13 \n" |
michael@0 | 1939 | "movq (%5,%%r13,8),%%xmm1\n" |
michael@0 | 1940 | |
michael@0 | 1941 | "paddsw %%xmm0,%%xmm1\n" |
michael@0 | 1942 | "psraw $0x6,%%xmm1\n" |
michael@0 | 1943 | "packuswb %%xmm1,%%xmm1\n" |
michael@0 | 1944 | "movd %%xmm1,0x0(%3)\n" |
michael@0 | 1945 | |
michael@0 | 1946 | -".lscaledone:" |
michael@0 | 1947 | +"3:" |
michael@0 | 1948 | : |
michael@0 | 1949 | : "r"(y_buf), // %0 |
michael@0 | 1950 | "r"(u_buf), // %1 |
michael@0 | 1951 | "r"(v_buf), // %2 |
michael@0 | 1952 | "r"(rgb_buf), // %3 |
michael@0 | 1953 | "r"(width), // %4 |
michael@0 | 1954 | "r" (kCoefficientsRgbY), // %5 |
michael@0 | 1955 | "r"(static_cast<long>(source_dx)) // %6 |
michael@0 | 1956 | : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" |
michael@0 | 1957 | ); |
michael@0 | 1958 | } |
michael@0 | 1959 | |
michael@0 | 1960 | -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__) |
michael@0 | 1961 | +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__) |
michael@0 | 1962 | |
michael@0 | 1963 | // PIC version is slower because less registers are available, so |
michael@0 | 1964 | // non-PIC is used on platforms where it is possible. |
michael@0 | 1965 | - |
michael@0 | 1966 | -void FastConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 1967 | - const uint8* u_buf, |
michael@0 | 1968 | - const uint8* v_buf, |
michael@0 | 1969 | - uint8* rgb_buf, |
michael@0 | 1970 | - int width); |
michael@0 | 1971 | +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
michael@0 | 1972 | + const uint8* u_buf, |
michael@0 | 1973 | + const uint8* v_buf, |
michael@0 | 1974 | + uint8* rgb_buf, |
michael@0 | 1975 | + int width); |
michael@0 | 1976 | asm( |
michael@0 | 1977 | ".text\n" |
michael@0 | 1978 | - ".global FastConvertYUVToRGB32Row\n" |
michael@0 | 1979 | -"FastConvertYUVToRGB32Row:\n" |
michael@0 | 1980 | + ".global FastConvertYUVToRGB32Row_SSE\n" |
michael@0 | 1981 | + ".type FastConvertYUVToRGB32Row_SSE, @function\n" |
michael@0 | 1982 | +"FastConvertYUVToRGB32Row_SSE:\n" |
michael@0 | 1983 | "pusha\n" |
michael@0 | 1984 | "mov 0x24(%esp),%edx\n" |
michael@0 | 1985 | "mov 0x28(%esp),%edi\n" |
michael@0 | 1986 | "mov 0x2c(%esp),%esi\n" |
michael@0 | 1987 | "mov 0x30(%esp),%ebp\n" |
michael@0 | 1988 | "mov 0x34(%esp),%ecx\n" |
michael@0 | 1989 | - "jmp convertend\n" |
michael@0 | 1990 | - |
michael@0 | 1991 | -"convertloop:" |
michael@0 | 1992 | + "jmp 1f\n" |
michael@0 | 1993 | + |
michael@0 | 1994 | +"0:" |
michael@0 | 1995 | "movzbl (%edi),%eax\n" |
michael@0 | 1996 | "add $0x1,%edi\n" |
michael@0 | 1997 | "movzbl (%esi),%ebx\n" |
michael@0 | 1998 | "add $0x1,%esi\n" |
michael@0 | 1999 | "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
michael@0 | 2000 | "movzbl (%edx),%eax\n" |
michael@0 | 2001 | "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" |
michael@0 | 2002 | "movzbl 0x1(%edx),%ebx\n" |
michael@0 | 2003 | @@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint |
michael@0 | 2004 | "movq kCoefficientsRgbY(,%ebx,8),%mm2\n" |
michael@0 | 2005 | "paddsw %mm0,%mm1\n" |
michael@0 | 2006 | "paddsw %mm0,%mm2\n" |
michael@0 | 2007 | "psraw $0x6,%mm1\n" |
michael@0 | 2008 | "psraw $0x6,%mm2\n" |
michael@0 | 2009 | "packuswb %mm2,%mm1\n" |
michael@0 | 2010 | "movntq %mm1,0x0(%ebp)\n" |
michael@0 | 2011 | "add $0x8,%ebp\n" |
michael@0 | 2012 | -"convertend:" |
michael@0 | 2013 | +"1:" |
michael@0 | 2014 | "sub $0x2,%ecx\n" |
michael@0 | 2015 | - "jns convertloop\n" |
michael@0 | 2016 | + "jns 0b\n" |
michael@0 | 2017 | |
michael@0 | 2018 | "and $0x1,%ecx\n" |
michael@0 | 2019 | - "je convertdone\n" |
michael@0 | 2020 | + "je 2f\n" |
michael@0 | 2021 | |
michael@0 | 2022 | "movzbl (%edi),%eax\n" |
michael@0 | 2023 | "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
michael@0 | 2024 | "movzbl (%esi),%eax\n" |
michael@0 | 2025 | "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" |
michael@0 | 2026 | "movzbl (%edx),%eax\n" |
michael@0 | 2027 | "movq kCoefficientsRgbY(,%eax,8),%mm1\n" |
michael@0 | 2028 | "paddsw %mm0,%mm1\n" |
michael@0 | 2029 | "psraw $0x6,%mm1\n" |
michael@0 | 2030 | "packuswb %mm1,%mm1\n" |
michael@0 | 2031 | "movd %mm1,0x0(%ebp)\n" |
michael@0 | 2032 | -"convertdone:" |
michael@0 | 2033 | +"2:" |
michael@0 | 2034 | "popa\n" |
michael@0 | 2035 | "ret\n" |
michael@0 | 2036 | +#if !defined(XP_MACOSX) |
michael@0 | 2037 | + ".previous\n" |
michael@0 | 2038 | +#endif |
michael@0 | 2039 | ); |
michael@0 | 2040 | |
michael@0 | 2041 | - |
michael@0 | 2042 | -void ScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2043 | - const uint8* u_buf, |
michael@0 | 2044 | - const uint8* v_buf, |
michael@0 | 2045 | - uint8* rgb_buf, |
michael@0 | 2046 | - int width, |
michael@0 | 2047 | - int source_dx); |
michael@0 | 2048 | +void FastConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2049 | + const uint8* u_buf, |
michael@0 | 2050 | + const uint8* v_buf, |
michael@0 | 2051 | + uint8* rgb_buf, |
michael@0 | 2052 | + int width) |
michael@0 | 2053 | +{ |
michael@0 | 2054 | + if (mozilla::supports_sse()) { |
michael@0 | 2055 | + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); |
michael@0 | 2056 | + return; |
michael@0 | 2057 | + } |
michael@0 | 2058 | + |
michael@0 | 2059 | + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); |
michael@0 | 2060 | +} |
michael@0 | 2061 | + |
michael@0 | 2062 | + |
michael@0 | 2063 | +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
michael@0 | 2064 | + const uint8* u_buf, |
michael@0 | 2065 | + const uint8* v_buf, |
michael@0 | 2066 | + uint8* rgb_buf, |
michael@0 | 2067 | + int width, |
michael@0 | 2068 | + int source_dx); |
michael@0 | 2069 | asm( |
michael@0 | 2070 | ".text\n" |
michael@0 | 2071 | - ".global ScaleYUVToRGB32Row\n" |
michael@0 | 2072 | -"ScaleYUVToRGB32Row:\n" |
michael@0 | 2073 | + ".global ScaleYUVToRGB32Row_SSE\n" |
michael@0 | 2074 | + ".type ScaleYUVToRGB32Row_SSE, @function\n" |
michael@0 | 2075 | +"ScaleYUVToRGB32Row_SSE:\n" |
michael@0 | 2076 | "pusha\n" |
michael@0 | 2077 | "mov 0x24(%esp),%edx\n" |
michael@0 | 2078 | "mov 0x28(%esp),%edi\n" |
michael@0 | 2079 | "mov 0x2c(%esp),%esi\n" |
michael@0 | 2080 | "mov 0x30(%esp),%ebp\n" |
michael@0 | 2081 | "mov 0x34(%esp),%ecx\n" |
michael@0 | 2082 | "xor %ebx,%ebx\n" |
michael@0 | 2083 | - "jmp scaleend\n" |
michael@0 | 2084 | - |
michael@0 | 2085 | -"scaleloop:" |
michael@0 | 2086 | + "jmp 1f\n" |
michael@0 | 2087 | + |
michael@0 | 2088 | +"0:" |
michael@0 | 2089 | "mov %ebx,%eax\n" |
michael@0 | 2090 | "sar $0x11,%eax\n" |
michael@0 | 2091 | "movzbl (%edi,%eax,1),%eax\n" |
michael@0 | 2092 | "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
michael@0 | 2093 | "mov %ebx,%eax\n" |
michael@0 | 2094 | "sar $0x11,%eax\n" |
michael@0 | 2095 | "movzbl (%esi,%eax,1),%eax\n" |
michael@0 | 2096 | "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" |
michael@0 | 2097 | @@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b |
michael@0 | 2098 | "movq kCoefficientsRgbY(,%eax,8),%mm2\n" |
michael@0 | 2099 | "paddsw %mm0,%mm1\n" |
michael@0 | 2100 | "paddsw %mm0,%mm2\n" |
michael@0 | 2101 | "psraw $0x6,%mm1\n" |
michael@0 | 2102 | "psraw $0x6,%mm2\n" |
michael@0 | 2103 | "packuswb %mm2,%mm1\n" |
michael@0 | 2104 | "movntq %mm1,0x0(%ebp)\n" |
michael@0 | 2105 | "add $0x8,%ebp\n" |
michael@0 | 2106 | -"scaleend:" |
michael@0 | 2107 | +"1:" |
michael@0 | 2108 | "sub $0x2,%ecx\n" |
michael@0 | 2109 | - "jns scaleloop\n" |
michael@0 | 2110 | + "jns 0b\n" |
michael@0 | 2111 | |
michael@0 | 2112 | "and $0x1,%ecx\n" |
michael@0 | 2113 | - "je scaledone\n" |
michael@0 | 2114 | + "je 2f\n" |
michael@0 | 2115 | |
michael@0 | 2116 | "mov %ebx,%eax\n" |
michael@0 | 2117 | "sar $0x11,%eax\n" |
michael@0 | 2118 | "movzbl (%edi,%eax,1),%eax\n" |
michael@0 | 2119 | "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" |
michael@0 | 2120 | "mov %ebx,%eax\n" |
michael@0 | 2121 | "sar $0x11,%eax\n" |
michael@0 | 2122 | "movzbl (%esi,%eax,1),%eax\n" |
michael@0 | 2123 | @@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b |
michael@0 | 2124 | "sar $0x10,%eax\n" |
michael@0 | 2125 | "movzbl (%edx,%eax,1),%eax\n" |
michael@0 | 2126 | "movq kCoefficientsRgbY(,%eax,8),%mm1\n" |
michael@0 | 2127 | "paddsw %mm0,%mm1\n" |
michael@0 | 2128 | "psraw $0x6,%mm1\n" |
michael@0 | 2129 | "packuswb %mm1,%mm1\n" |
michael@0 | 2130 | "movd %mm1,0x0(%ebp)\n" |
michael@0 | 2131 | |
michael@0 | 2132 | -"scaledone:" |
michael@0 | 2133 | +"2:" |
michael@0 | 2134 | "popa\n" |
michael@0 | 2135 | "ret\n" |
michael@0 | 2136 | +#if !defined(XP_MACOSX) |
michael@0 | 2137 | + ".previous\n" |
michael@0 | 2138 | +#endif |
michael@0 | 2139 | ); |
michael@0 | 2140 | |
michael@0 | 2141 | -void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2142 | - const uint8* u_buf, |
michael@0 | 2143 | - const uint8* v_buf, |
michael@0 | 2144 | - uint8* rgb_buf, |
michael@0 | 2145 | - int width, |
michael@0 | 2146 | - int source_dx); |
michael@0 | 2147 | +void ScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2148 | + const uint8* u_buf, |
michael@0 | 2149 | + const uint8* v_buf, |
michael@0 | 2150 | + uint8* rgb_buf, |
michael@0 | 2151 | + int width, |
michael@0 | 2152 | + int source_dx) |
michael@0 | 2153 | +{ |
michael@0 | 2154 | + if (mozilla::supports_sse()) { |
michael@0 | 2155 | + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, |
michael@0 | 2156 | + width, source_dx); |
michael@0 | 2157 | + return;
michael@0 | 2157 | + }
michael@0 | 2158 | + |
michael@0 | 2159 | + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, |
michael@0 | 2160 | + width, source_dx); |
michael@0 | 2161 | +} |
michael@0 | 2162 | + |
michael@0 | 2163 | +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
michael@0 | 2164 | + const uint8* u_buf, |
michael@0 | 2165 | + const uint8* v_buf, |
michael@0 | 2166 | + uint8* rgb_buf, |
michael@0 | 2167 | + int width, |
michael@0 | 2168 | + int source_dx); |
michael@0 | 2169 | asm( |
michael@0 | 2170 | ".text\n" |
michael@0 | 2171 | - ".global LinearScaleYUVToRGB32Row\n" |
michael@0 | 2172 | -"LinearScaleYUVToRGB32Row:\n" |
michael@0 | 2173 | + ".global LinearScaleYUVToRGB32Row_SSE\n" |
michael@0 | 2174 | + ".type LinearScaleYUVToRGB32Row_SSE, @function\n" |
michael@0 | 2175 | +"LinearScaleYUVToRGB32Row_SSE:\n" |
michael@0 | 2176 | "pusha\n" |
michael@0 | 2177 | "mov 0x24(%esp),%edx\n" |
michael@0 | 2178 | "mov 0x28(%esp),%edi\n" |
michael@0 | 2179 | "mov 0x30(%esp),%ebp\n" |
michael@0 | 2180 | |
michael@0 | 2181 | // source_width = width * source_dx + ebx |
michael@0 | 2182 | "mov 0x34(%esp), %ecx\n" |
michael@0 | 2183 | "imull 0x38(%esp), %ecx\n" |
michael@0 | 2184 | "mov %ecx, 0x34(%esp)\n" |
michael@0 | 2185 | |
michael@0 | 2186 | "mov 0x38(%esp), %ecx\n" |
michael@0 | 2187 | "xor %ebx,%ebx\n" // x = 0 |
michael@0 | 2188 | "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 |
michael@0 | 2189 | - "jl .lscaleend\n" |
michael@0 | 2190 | + "jl 1f\n" |
michael@0 | 2191 | "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less |
michael@0 | 2192 | - "jmp .lscaleend\n" |
michael@0 | 2193 | - |
michael@0 | 2194 | -".lscaleloop:" |
michael@0 | 2195 | - "mov %ebx,%eax\n" |
michael@0 | 2196 | - "sar $0x11,%eax\n" |
michael@0 | 2197 | + "jmp 1f\n" |
michael@0 | 2198 | + |
michael@0 | 2199 | +"0:" |
michael@0 | 2200 | + "mov %ebx,%eax\n" |
michael@0 | 2201 | + "sar $0x11,%eax\n" |
michael@0 | 2202 | |
michael@0 | 2203 | "movzbl (%edi,%eax,1),%ecx\n" |
michael@0 | 2204 | "movzbl 1(%edi,%eax,1),%esi\n" |
michael@0 | 2205 | "mov %ebx,%eax\n" |
michael@0 | 2206 | "andl $0x1fffe, %eax \n" |
michael@0 | 2207 | "imul %eax, %esi \n" |
michael@0 | 2208 | "xorl $0x1fffe, %eax \n" |
michael@0 | 2209 | "imul %eax, %ecx \n" |
michael@0 | 2210 | @@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint |
michael@0 | 2211 | "imul %eax, %esi \n" |
michael@0 | 2212 | "xorl $0xffff, %eax \n" |
michael@0 | 2213 | "imul %eax, %ecx \n" |
michael@0 | 2214 | "addl %esi, %ecx \n" |
michael@0 | 2215 | "shrl $16, %ecx \n" |
michael@0 | 2216 | "movq kCoefficientsRgbY(,%ecx,8),%mm1\n" |
michael@0 | 2217 | |
michael@0 | 2218 | "cmp 0x34(%esp), %ebx\n" |
michael@0 | 2219 | - "jge .lscalelastpixel\n" |
michael@0 | 2220 | + "jge 2f\n" |
michael@0 | 2221 | |
michael@0 | 2222 | "mov %ebx,%eax\n" |
michael@0 | 2223 | "sar $0x10,%eax\n" |
michael@0 | 2224 | "movzbl (%edx,%eax,1),%ecx\n" |
michael@0 | 2225 | "movzbl 1(%edx,%eax,1),%esi\n" |
michael@0 | 2226 | "mov %ebx,%eax\n" |
michael@0 | 2227 | "add 0x38(%esp),%ebx\n" |
michael@0 | 2228 | "andl $0xffff, %eax \n" |
michael@0 | 2229 | @@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint |
michael@0 | 2230 | "paddsw %mm0,%mm1\n" |
michael@0 | 2231 | "paddsw %mm0,%mm2\n" |
michael@0 | 2232 | "psraw $0x6,%mm1\n" |
michael@0 | 2233 | "psraw $0x6,%mm2\n" |
michael@0 | 2234 | "packuswb %mm2,%mm1\n" |
michael@0 | 2235 | "movntq %mm1,0x0(%ebp)\n" |
michael@0 | 2236 | "add $0x8,%ebp\n" |
michael@0 | 2237 | |
michael@0 | 2238 | -".lscaleend:" |
michael@0 | 2239 | +"1:" |
michael@0 | 2240 | "cmp 0x34(%esp), %ebx\n" |
michael@0 | 2241 | - "jl .lscaleloop\n" |
michael@0 | 2242 | + "jl 0b\n" |
michael@0 | 2243 | "popa\n" |
michael@0 | 2244 | "ret\n" |
michael@0 | 2245 | |
michael@0 | 2246 | -".lscalelastpixel:" |
michael@0 | 2247 | +"2:" |
michael@0 | 2248 | "paddsw %mm0, %mm1\n" |
michael@0 | 2249 | "psraw $6, %mm1\n" |
michael@0 | 2250 | "packuswb %mm1, %mm1\n" |
michael@0 | 2251 | "movd %mm1, (%ebp)\n" |
michael@0 | 2252 | "popa\n" |
michael@0 | 2253 | "ret\n" |
michael@0 | 2254 | +#if !defined(XP_MACOSX) |
michael@0 | 2255 | + ".previous\n" |
michael@0 | 2256 | +#endif |
michael@0 | 2257 | ); |
michael@0 | 2258 | |
michael@0 | 2259 | -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__) |
michael@0 | 2260 | - |
michael@0 | 2261 | -extern void PICConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2262 | - const uint8* u_buf, |
michael@0 | 2263 | - const uint8* v_buf, |
michael@0 | 2264 | - uint8* rgb_buf, |
michael@0 | 2265 | - int width, |
michael@0 | 2266 | - int16 *kCoefficientsRgbY); |
michael@0 | 2267 | +void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2268 | + const uint8* u_buf, |
michael@0 | 2269 | + const uint8* v_buf, |
michael@0 | 2270 | + uint8* rgb_buf, |
michael@0 | 2271 | + int width, |
michael@0 | 2272 | + int source_dx) |
michael@0 | 2273 | +{ |
michael@0 | 2274 | + if (mozilla::supports_sse()) { |
michael@0 | 2275 | + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, |
michael@0 | 2276 | + width, source_dx); |
michael@0 | 2277 | + return;
michael@0 | 2277 | + }
michael@0 | 2278 | + |
michael@0 | 2279 | + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, |
michael@0 | 2280 | + width, source_dx); |
michael@0 | 2281 | +} |
michael@0 | 2282 | + |
michael@0 | 2283 | +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__) |
michael@0 | 2284 | + |
michael@0 | 2285 | +void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
michael@0 | 2286 | + const uint8* u_buf, |
michael@0 | 2287 | + const uint8* v_buf, |
michael@0 | 2288 | + uint8* rgb_buf, |
michael@0 | 2289 | + int width, |
michael@0 | 2290 | + int16 *kCoefficientsRgbY); |
michael@0 | 2291 | + |
michael@0 | 2292 | asm( |
michael@0 | 2293 | ".text\n" |
michael@0 | 2294 | -#if defined(OS_MACOSX) |
michael@0 | 2295 | -"_PICConvertYUVToRGB32Row:\n" |
michael@0 | 2296 | +#if defined(XP_MACOSX) |
michael@0 | 2297 | +"_PICConvertYUVToRGB32Row_SSE:\n" |
michael@0 | 2298 | #else |
michael@0 | 2299 | -"PICConvertYUVToRGB32Row:\n" |
michael@0 | 2300 | +"PICConvertYUVToRGB32Row_SSE:\n" |
michael@0 | 2301 | #endif |
michael@0 | 2302 | "pusha\n" |
michael@0 | 2303 | "mov 0x24(%esp),%edx\n" |
michael@0 | 2304 | "mov 0x28(%esp),%edi\n" |
michael@0 | 2305 | "mov 0x2c(%esp),%esi\n" |
michael@0 | 2306 | "mov 0x30(%esp),%ebp\n" |
michael@0 | 2307 | "mov 0x38(%esp),%ecx\n" |
michael@0 | 2308 | |
michael@0 | 2309 | - "jmp .Lconvertend\n" |
michael@0 | 2310 | - |
michael@0 | 2311 | -".Lconvertloop:" |
michael@0 | 2312 | + "jmp 1f\n" |
michael@0 | 2313 | + |
michael@0 | 2314 | +"0:" |
michael@0 | 2315 | "movzbl (%edi),%eax\n" |
michael@0 | 2316 | "add $0x1,%edi\n" |
michael@0 | 2317 | "movzbl (%esi),%ebx\n" |
michael@0 | 2318 | "add $0x1,%esi\n" |
michael@0 | 2319 | "movq 2048(%ecx,%eax,8),%mm0\n" |
michael@0 | 2320 | "movzbl (%edx),%eax\n" |
michael@0 | 2321 | "paddsw 4096(%ecx,%ebx,8),%mm0\n" |
michael@0 | 2322 | "movzbl 0x1(%edx),%ebx\n" |
michael@0 | 2323 | @@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons |
michael@0 | 2324 | "movq 0(%ecx,%ebx,8),%mm2\n" |
michael@0 | 2325 | "paddsw %mm0,%mm1\n" |
michael@0 | 2326 | "paddsw %mm0,%mm2\n" |
michael@0 | 2327 | "psraw $0x6,%mm1\n" |
michael@0 | 2328 | "psraw $0x6,%mm2\n" |
michael@0 | 2329 | "packuswb %mm2,%mm1\n" |
michael@0 | 2330 | "movntq %mm1,0x0(%ebp)\n" |
michael@0 | 2331 | "add $0x8,%ebp\n" |
michael@0 | 2332 | -".Lconvertend:" |
michael@0 | 2333 | +"1:" |
michael@0 | 2334 | "subl $0x2,0x34(%esp)\n" |
michael@0 | 2335 | - "jns .Lconvertloop\n" |
michael@0 | 2336 | + "jns 0b\n" |
michael@0 | 2337 | |
michael@0 | 2338 | "andl $0x1,0x34(%esp)\n" |
michael@0 | 2339 | - "je .Lconvertdone\n" |
michael@0 | 2340 | + "je 2f\n" |
michael@0 | 2341 | |
michael@0 | 2342 | "movzbl (%edi),%eax\n" |
michael@0 | 2343 | "movq 2048(%ecx,%eax,8),%mm0\n" |
michael@0 | 2344 | "movzbl (%esi),%eax\n" |
michael@0 | 2345 | "paddsw 4096(%ecx,%eax,8),%mm0\n" |
michael@0 | 2346 | "movzbl (%edx),%eax\n" |
michael@0 | 2347 | "movq 0(%ecx,%eax,8),%mm1\n" |
michael@0 | 2348 | "paddsw %mm0,%mm1\n" |
michael@0 | 2349 | "psraw $0x6,%mm1\n" |
michael@0 | 2350 | "packuswb %mm1,%mm1\n" |
michael@0 | 2351 | "movd %mm1,0x0(%ebp)\n" |
michael@0 | 2352 | -".Lconvertdone:\n" |
michael@0 | 2353 | +"2:" |
michael@0 | 2354 | "popa\n" |
michael@0 | 2355 | "ret\n" |
michael@0 | 2356 | +#if !defined(XP_MACOSX) |
michael@0 | 2357 | + ".previous\n" |
michael@0 | 2358 | +#endif |
michael@0 | 2359 | ); |
michael@0 | 2360 | |
michael@0 | 2361 | void FastConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2362 | const uint8* u_buf, |
michael@0 | 2363 | const uint8* v_buf, |
michael@0 | 2364 | uint8* rgb_buf, |
michael@0 | 2365 | - int width) { |
michael@0 | 2366 | - PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, |
michael@0 | 2367 | - &kCoefficientsRgbY[0][0]); |
michael@0 | 2368 | -} |
michael@0 | 2369 | - |
michael@0 | 2370 | -extern void PICScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2371 | + int width) |
michael@0 | 2372 | +{ |
michael@0 | 2373 | + if (mozilla::supports_sse()) { |
michael@0 | 2374 | + PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, |
michael@0 | 2375 | + &kCoefficientsRgbY[0][0]); |
michael@0 | 2376 | + return; |
michael@0 | 2377 | + } |
michael@0 | 2378 | + |
michael@0 | 2379 | + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); |
michael@0 | 2380 | +} |
michael@0 | 2381 | + |
michael@0 | 2382 | +void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
michael@0 | 2383 | const uint8* u_buf, |
michael@0 | 2384 | const uint8* v_buf, |
michael@0 | 2385 | uint8* rgb_buf, |
michael@0 | 2386 | int width, |
michael@0 | 2387 | int source_dx, |
michael@0 | 2388 | int16 *kCoefficientsRgbY); |
michael@0 | 2389 | |
michael@0 | 2390 | asm( |
michael@0 | 2391 | ".text\n" |
michael@0 | 2392 | -#if defined(OS_MACOSX) |
michael@0 | 2393 | -"_PICScaleYUVToRGB32Row:\n" |
michael@0 | 2394 | +#if defined(XP_MACOSX) |
michael@0 | 2395 | +"_PICScaleYUVToRGB32Row_SSE:\n" |
michael@0 | 2396 | #else |
michael@0 | 2397 | -"PICScaleYUVToRGB32Row:\n" |
michael@0 | 2398 | +"PICScaleYUVToRGB32Row_SSE:\n" |
michael@0 | 2399 | #endif |
michael@0 | 2400 | "pusha\n" |
michael@0 | 2401 | "mov 0x24(%esp),%edx\n" |
michael@0 | 2402 | "mov 0x28(%esp),%edi\n" |
michael@0 | 2403 | "mov 0x2c(%esp),%esi\n" |
michael@0 | 2404 | "mov 0x30(%esp),%ebp\n" |
michael@0 | 2405 | "mov 0x3c(%esp),%ecx\n" |
michael@0 | 2406 | "xor %ebx,%ebx\n" |
michael@0 | 2407 | - "jmp Lscaleend\n" |
michael@0 | 2408 | - |
michael@0 | 2409 | -"Lscaleloop:" |
michael@0 | 2410 | + "jmp 1f\n" |
michael@0 | 2411 | + |
michael@0 | 2412 | +"0:" |
michael@0 | 2413 | "mov %ebx,%eax\n" |
michael@0 | 2414 | "sar $0x11,%eax\n" |
michael@0 | 2415 | "movzbl (%edi,%eax,1),%eax\n" |
michael@0 | 2416 | "movq 2048(%ecx,%eax,8),%mm0\n" |
michael@0 | 2417 | "mov %ebx,%eax\n" |
michael@0 | 2418 | "sar $0x11,%eax\n" |
michael@0 | 2419 | "movzbl (%esi,%eax,1),%eax\n" |
michael@0 | 2420 | "paddsw 4096(%ecx,%eax,8),%mm0\n" |
michael@0 | 2421 | @@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const |
michael@0 | 2422 | "movq 0(%ecx,%eax,8),%mm2\n" |
michael@0 | 2423 | "paddsw %mm0,%mm1\n" |
michael@0 | 2424 | "paddsw %mm0,%mm2\n" |
michael@0 | 2425 | "psraw $0x6,%mm1\n" |
michael@0 | 2426 | "psraw $0x6,%mm2\n" |
michael@0 | 2427 | "packuswb %mm2,%mm1\n" |
michael@0 | 2428 | "movntq %mm1,0x0(%ebp)\n" |
michael@0 | 2429 | "add $0x8,%ebp\n" |
michael@0 | 2430 | -"Lscaleend:" |
michael@0 | 2431 | +"1:" |
michael@0 | 2432 | "subl $0x2,0x34(%esp)\n" |
michael@0 | 2433 | - "jns Lscaleloop\n" |
michael@0 | 2434 | + "jns 0b\n" |
michael@0 | 2435 | |
michael@0 | 2436 | "andl $0x1,0x34(%esp)\n" |
michael@0 | 2437 | - "je Lscaledone\n" |
michael@0 | 2438 | + "je 2f\n" |
michael@0 | 2439 | |
michael@0 | 2440 | "mov %ebx,%eax\n" |
michael@0 | 2441 | "sar $0x11,%eax\n" |
michael@0 | 2442 | "movzbl (%edi,%eax,1),%eax\n" |
michael@0 | 2443 | "movq 2048(%ecx,%eax,8),%mm0\n" |
michael@0 | 2444 | "mov %ebx,%eax\n" |
michael@0 | 2445 | "sar $0x11,%eax\n" |
michael@0 | 2446 | "movzbl (%esi,%eax,1),%eax\n" |
michael@0 | 2447 | @@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const |
michael@0 | 2448 | "sar $0x10,%eax\n" |
michael@0 | 2449 | "movzbl (%edx,%eax,1),%eax\n" |
michael@0 | 2450 | "movq 0(%ecx,%eax,8),%mm1\n" |
michael@0 | 2451 | "paddsw %mm0,%mm1\n" |
michael@0 | 2452 | "psraw $0x6,%mm1\n" |
michael@0 | 2453 | "packuswb %mm1,%mm1\n" |
michael@0 | 2454 | "movd %mm1,0x0(%ebp)\n" |
michael@0 | 2455 | |
michael@0 | 2456 | -"Lscaledone:" |
michael@0 | 2457 | +"2:" |
michael@0 | 2458 | "popa\n" |
michael@0 | 2459 | "ret\n" |
michael@0 | 2460 | +#if !defined(XP_MACOSX) |
michael@0 | 2461 | + ".previous\n" |
michael@0 | 2462 | +#endif |
michael@0 | 2463 | ); |
michael@0 | 2464 | |
michael@0 | 2465 | - |
michael@0 | 2466 | void ScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2467 | const uint8* u_buf, |
michael@0 | 2468 | const uint8* v_buf, |
michael@0 | 2469 | uint8* rgb_buf, |
michael@0 | 2470 | int width, |
michael@0 | 2471 | - int source_dx) { |
michael@0 | 2472 | - PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, |
michael@0 | 2473 | - &kCoefficientsRgbY[0][0]); |
michael@0 | 2474 | -} |
michael@0 | 2475 | - |
michael@0 | 2476 | -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2477 | - const uint8* u_buf, |
michael@0 | 2478 | - const uint8* v_buf, |
michael@0 | 2479 | - uint8* rgb_buf, |
michael@0 | 2480 | - int width, |
michael@0 | 2481 | - int source_dx, |
michael@0 | 2482 | - int16 *kCoefficientsRgbY); |
michael@0 | 2483 | + int source_dx) |
michael@0 | 2484 | +{ |
michael@0 | 2485 | + if (mozilla::supports_sse()) { |
michael@0 | 2486 | + PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, |
michael@0 | 2487 | + &kCoefficientsRgbY[0][0]); |
michael@0 | 2488 | + return; |
michael@0 | 2489 | + } |
michael@0 | 2490 | + |
michael@0 | 2491 | + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
michael@0 | 2492 | +} |
michael@0 | 2493 | + |
michael@0 | 2494 | +void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
michael@0 | 2495 | + const uint8* u_buf, |
michael@0 | 2496 | + const uint8* v_buf, |
michael@0 | 2497 | + uint8* rgb_buf, |
michael@0 | 2498 | + int width, |
michael@0 | 2499 | + int source_dx, |
michael@0 | 2500 | + int16 *kCoefficientsRgbY); |
michael@0 | 2501 | + |
michael@0 | 2502 | asm( |
michael@0 | 2503 | ".text\n" |
michael@0 | 2504 | -#if defined(OS_MACOSX) |
michael@0 | 2505 | -"_PICLinearScaleYUVToRGB32Row:\n" |
michael@0 | 2506 | +#if defined(XP_MACOSX) |
michael@0 | 2507 | +"_PICLinearScaleYUVToRGB32Row_SSE:\n" |
michael@0 | 2508 | #else |
michael@0 | 2509 | -"PICLinearScaleYUVToRGB32Row:\n" |
michael@0 | 2510 | +"PICLinearScaleYUVToRGB32Row_SSE:\n" |
michael@0 | 2511 | #endif |
michael@0 | 2512 | "pusha\n" |
michael@0 | 2513 | "mov 0x24(%esp),%edx\n" |
michael@0 | 2514 | "mov 0x30(%esp),%ebp\n" |
michael@0 | 2515 | "mov 0x34(%esp),%ecx\n" |
michael@0 | 2516 | "mov 0x3c(%esp),%edi\n" |
michael@0 | 2517 | "xor %ebx,%ebx\n" |
michael@0 | 2518 | |
michael@0 | 2519 | // source_width = width * source_dx + ebx |
michael@0 | 2520 | "mov 0x34(%esp), %ecx\n" |
michael@0 | 2521 | "imull 0x38(%esp), %ecx\n" |
michael@0 | 2522 | "mov %ecx, 0x34(%esp)\n" |
michael@0 | 2523 | |
michael@0 | 2524 | "mov 0x38(%esp), %ecx\n" |
michael@0 | 2525 | "xor %ebx,%ebx\n" // x = 0 |
michael@0 | 2526 | "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 |
michael@0 | 2527 | - "jl .lscaleend\n" |
michael@0 | 2528 | + "jl 1f\n" |
michael@0 | 2529 | "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less |
michael@0 | 2530 | - "jmp .lscaleend\n" |
michael@0 | 2531 | - |
michael@0 | 2532 | -".lscaleloop:" |
michael@0 | 2533 | + "jmp 1f\n" |
michael@0 | 2534 | + |
michael@0 | 2535 | +"0:" |
michael@0 | 2536 | "mov 0x28(%esp),%esi\n" |
michael@0 | 2537 | "mov %ebx,%eax\n" |
michael@0 | 2538 | "sar $0x11,%eax\n" |
michael@0 | 2539 | |
michael@0 | 2540 | "movzbl (%esi,%eax,1),%ecx\n" |
michael@0 | 2541 | "movzbl 1(%esi,%eax,1),%esi\n" |
michael@0 | 2542 | "mov %ebx,%eax\n" |
michael@0 | 2543 | "andl $0x1fffe, %eax \n" |
michael@0 | 2544 | @@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u |
michael@0 | 2545 | "imul %eax, %esi \n" |
michael@0 | 2546 | "xorl $0xffff, %eax \n" |
michael@0 | 2547 | "imul %eax, %ecx \n" |
michael@0 | 2548 | "addl %esi, %ecx \n" |
michael@0 | 2549 | "shrl $16, %ecx \n" |
michael@0 | 2550 | "movq (%edi,%ecx,8),%mm1\n" |
michael@0 | 2551 | |
michael@0 | 2552 | "cmp 0x34(%esp), %ebx\n" |
michael@0 | 2553 | - "jge .lscalelastpixel\n" |
michael@0 | 2554 | + "jge 2f\n" |
michael@0 | 2555 | |
michael@0 | 2556 | "mov %ebx,%eax\n" |
michael@0 | 2557 | "sar $0x10,%eax\n" |
michael@0 | 2558 | "movzbl (%edx,%eax,1),%ecx\n" |
michael@0 | 2559 | "movzbl 1(%edx,%eax,1),%esi\n" |
michael@0 | 2560 | "mov %ebx,%eax\n" |
michael@0 | 2561 | "add 0x38(%esp),%ebx\n" |
michael@0 | 2562 | "andl $0xffff, %eax \n" |
michael@0 | 2563 | @@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u |
michael@0 | 2564 | "paddsw %mm0,%mm1\n" |
michael@0 | 2565 | "paddsw %mm0,%mm2\n" |
michael@0 | 2566 | "psraw $0x6,%mm1\n" |
michael@0 | 2567 | "psraw $0x6,%mm2\n" |
michael@0 | 2568 | "packuswb %mm2,%mm1\n" |
michael@0 | 2569 | "movntq %mm1,0x0(%ebp)\n" |
michael@0 | 2570 | "add $0x8,%ebp\n" |
michael@0 | 2571 | |
michael@0 | 2572 | -".lscaleend:" |
michael@0 | 2573 | +"1:" |
michael@0 | 2574 | "cmp %ebx, 0x34(%esp)\n" |
michael@0 | 2575 | - "jg .lscaleloop\n" |
michael@0 | 2576 | + "jg 0b\n" |
michael@0 | 2577 | "popa\n" |
michael@0 | 2578 | "ret\n" |
michael@0 | 2579 | |
michael@0 | 2580 | -".lscalelastpixel:" |
michael@0 | 2581 | +"2:" |
michael@0 | 2582 | "paddsw %mm0, %mm1\n" |
michael@0 | 2583 | "psraw $6, %mm1\n" |
michael@0 | 2584 | "packuswb %mm1, %mm1\n" |
michael@0 | 2585 | "movd %mm1, (%ebp)\n" |
michael@0 | 2586 | "popa\n" |
michael@0 | 2587 | "ret\n" |
michael@0 | 2588 | +#if !defined(XP_MACOSX) |
michael@0 | 2589 | + ".previous\n" |
michael@0 | 2590 | +#endif |
michael@0 | 2591 | ); |
michael@0 | 2592 | |
michael@0 | 2593 | + |
michael@0 | 2594 | void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2595 | - const uint8* u_buf, |
michael@0 | 2596 | - const uint8* v_buf, |
michael@0 | 2597 | - uint8* rgb_buf, |
michael@0 | 2598 | - int width, |
michael@0 | 2599 | - int source_dx) { |
michael@0 | 2600 | - PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, |
michael@0 | 2601 | - &kCoefficientsRgbY[0][0]); |
michael@0 | 2602 | -} |
michael@0 | 2603 | - |
michael@0 | 2604 | -#else // USE_MMX |
michael@0 | 2605 | - |
michael@0 | 2606 | -// C reference code that mimic the YUV assembly. |
michael@0 | 2607 | -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) |
michael@0 | 2608 | -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ |
michael@0 | 2609 | - (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) |
michael@0 | 2610 | - |
michael@0 | 2611 | -static inline void YuvPixel(uint8 y, |
michael@0 | 2612 | - uint8 u, |
michael@0 | 2613 | - uint8 v, |
michael@0 | 2614 | - uint8* rgb_buf) { |
michael@0 | 2615 | - |
michael@0 | 2616 | - int b = kCoefficientsRgbY[256+u][0]; |
michael@0 | 2617 | - int g = kCoefficientsRgbY[256+u][1]; |
michael@0 | 2618 | - int r = kCoefficientsRgbY[256+u][2]; |
michael@0 | 2619 | - int a = kCoefficientsRgbY[256+u][3]; |
michael@0 | 2620 | - |
michael@0 | 2621 | - b = paddsw(b, kCoefficientsRgbY[512+v][0]); |
michael@0 | 2622 | - g = paddsw(g, kCoefficientsRgbY[512+v][1]); |
michael@0 | 2623 | - r = paddsw(r, kCoefficientsRgbY[512+v][2]); |
michael@0 | 2624 | - a = paddsw(a, kCoefficientsRgbY[512+v][3]); |
michael@0 | 2625 | - |
michael@0 | 2626 | - b = paddsw(b, kCoefficientsRgbY[y][0]); |
michael@0 | 2627 | - g = paddsw(g, kCoefficientsRgbY[y][1]); |
michael@0 | 2628 | - r = paddsw(r, kCoefficientsRgbY[y][2]); |
michael@0 | 2629 | - a = paddsw(a, kCoefficientsRgbY[y][3]); |
michael@0 | 2630 | - |
michael@0 | 2631 | - b >>= 6; |
michael@0 | 2632 | - g >>= 6; |
michael@0 | 2633 | - r >>= 6; |
michael@0 | 2634 | - a >>= 6; |
michael@0 | 2635 | - |
michael@0 | 2636 | - *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) | |
michael@0 | 2637 | - (packuswb(g) << 8) | |
michael@0 | 2638 | - (packuswb(r) << 16) | |
michael@0 | 2639 | - (packuswb(a) << 24); |
michael@0 | 2640 | -} |
michael@0 | 2641 | - |
michael@0 | 2642 | + const uint8* u_buf, |
michael@0 | 2643 | + const uint8* v_buf, |
michael@0 | 2644 | + uint8* rgb_buf, |
michael@0 | 2645 | + int width, |
michael@0 | 2646 | + int source_dx) |
michael@0 | 2647 | +{ |
michael@0 | 2648 | + if (mozilla::supports_sse()) { |
michael@0 | 2649 | + PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, |
michael@0 | 2650 | + source_dx, &kCoefficientsRgbY[0][0]); |
michael@0 | 2651 | + return; |
michael@0 | 2652 | + } |
michael@0 | 2653 | + |
michael@0 | 2654 | + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
michael@0 | 2655 | +} |
michael@0 | 2656 | +#else |
michael@0 | 2657 | void FastConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2658 | const uint8* u_buf, |
michael@0 | 2659 | const uint8* v_buf, |
michael@0 | 2660 | uint8* rgb_buf, |
michael@0 | 2661 | int width) { |
michael@0 | 2662 | - for (int x = 0; x < width; x += 2) { |
michael@0 | 2663 | - uint8 u = u_buf[x >> 1]; |
michael@0 | 2664 | - uint8 v = v_buf[x >> 1]; |
michael@0 | 2665 | - uint8 y0 = y_buf[x]; |
michael@0 | 2666 | - YuvPixel(y0, u, v, rgb_buf); |
michael@0 | 2667 | - if ((x + 1) < width) { |
michael@0 | 2668 | - uint8 y1 = y_buf[x + 1]; |
michael@0 | 2669 | - YuvPixel(y1, u, v, rgb_buf + 4); |
michael@0 | 2670 | - } |
michael@0 | 2671 | - rgb_buf += 8; // Advance 2 pixels. |
michael@0 | 2672 | - } |
michael@0 | 2673 | -} |
michael@0 | 2674 | - |
michael@0 | 2675 | -// 16.16 fixed point is used. A shift by 16 isolates the integer. |
michael@0 | 2676 | -// A shift by 17 is used to further subsample the chrominence channels. |
michael@0 | 2677 | -// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits, |
michael@0 | 2678 | -// for 1/65536 pixel accurate interpolation. |
michael@0 | 2679 | + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); |
michael@0 | 2680 | +} |
michael@0 | 2681 | + |
michael@0 | 2682 | void ScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2683 | const uint8* u_buf, |
michael@0 | 2684 | const uint8* v_buf, |
michael@0 | 2685 | uint8* rgb_buf, |
michael@0 | 2686 | int width, |
michael@0 | 2687 | int source_dx) { |
michael@0 | 2688 | - int x = 0; |
michael@0 | 2689 | - for (int i = 0; i < width; i += 2) { |
michael@0 | 2690 | - int y = y_buf[x >> 16]; |
michael@0 | 2691 | - int u = u_buf[(x >> 17)]; |
michael@0 | 2692 | - int v = v_buf[(x >> 17)]; |
michael@0 | 2693 | - YuvPixel(y, u, v, rgb_buf); |
michael@0 | 2694 | - x += source_dx; |
michael@0 | 2695 | - if ((i + 1) < width) { |
michael@0 | 2696 | - y = y_buf[x >> 16]; |
michael@0 | 2697 | - YuvPixel(y, u, v, rgb_buf+4); |
michael@0 | 2698 | - x += source_dx; |
michael@0 | 2699 | - } |
michael@0 | 2700 | - rgb_buf += 8; |
michael@0 | 2701 | - } |
michael@0 | 2702 | -} |
michael@0 | 2703 | + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
michael@0 | 2704 | +} |
michael@0 | 2705 | |
michael@0 | 2706 | void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2707 | const uint8* u_buf, |
michael@0 | 2708 | const uint8* v_buf, |
michael@0 | 2709 | uint8* rgb_buf, |
michael@0 | 2710 | int width, |
michael@0 | 2711 | int source_dx) { |
michael@0 | 2712 | - int x = 0; |
michael@0 | 2713 | - if (source_dx >= 0x20000) { |
michael@0 | 2714 | - x = 32768; |
michael@0 | 2715 | - } |
michael@0 | 2716 | - for (int i = 0; i < width; i += 2) { |
michael@0 | 2717 | - int y0 = y_buf[x >> 16]; |
michael@0 | 2718 | - int y1 = y_buf[(x >> 16) + 1]; |
michael@0 | 2719 | - int u0 = u_buf[(x >> 17)]; |
michael@0 | 2720 | - int u1 = u_buf[(x >> 17) + 1]; |
michael@0 | 2721 | - int v0 = v_buf[(x >> 17)]; |
michael@0 | 2722 | - int v1 = v_buf[(x >> 17) + 1]; |
michael@0 | 2723 | - int y_frac = (x & 65535); |
michael@0 | 2724 | - int uv_frac = ((x >> 1) & 65535); |
michael@0 | 2725 | - int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; |
michael@0 | 2726 | - int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16; |
michael@0 | 2727 | - int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16; |
michael@0 | 2728 | - YuvPixel(y, u, v, rgb_buf); |
michael@0 | 2729 | - x += source_dx; |
michael@0 | 2730 | - if ((i + 1) < width) { |
michael@0 | 2731 | - y0 = y_buf[x >> 16]; |
michael@0 | 2732 | - y1 = y_buf[(x >> 16) + 1]; |
michael@0 | 2733 | - y_frac = (x & 65535); |
michael@0 | 2734 | - y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; |
michael@0 | 2735 | - YuvPixel(y, u, v, rgb_buf+4); |
michael@0 | 2736 | - x += source_dx; |
michael@0 | 2737 | - } |
michael@0 | 2738 | - rgb_buf += 8; |
michael@0 | 2739 | - } |
michael@0 | 2740 | -} |
michael@0 | 2741 | - |
michael@0 | 2742 | -#endif // USE_MMX |
michael@0 | 2743 | -} // extern "C" |
michael@0 | 2744 | - |
michael@0 | 2745 | + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
michael@0 | 2746 | +} |
michael@0 | 2747 | +#endif |
michael@0 | 2748 | + |
michael@0 | 2749 | +} |
michael@0 | 2750 | diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp |
michael@0 | 2751 | --- a/gfx/ycbcr/yuv_row_table.cpp |
michael@0 | 2752 | +++ b/gfx/ycbcr/yuv_row_table.cpp |
michael@0 | 2753 | @@ -1,13 +1,13 @@ |
michael@0 | 2754 | // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
michael@0 | 2755 | // Use of this source code is governed by a BSD-style license that can be |
michael@0 | 2756 | // found in the LICENSE file. |
michael@0 | 2757 | |
michael@0 | 2758 | -#include "media/base/yuv_row.h" |
michael@0 | 2759 | +#include "yuv_row.h" |
michael@0 | 2760 | |
michael@0 | 2761 | extern "C" { |
michael@0 | 2762 | |
michael@0 | 2763 | #define RGBY(i) { \ |
michael@0 | 2764 | static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ |
michael@0 | 2765 | static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ |
michael@0 | 2766 | static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \ |
michael@0 | 2767 | 0 \ |
michael@0 | 2768 | diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp |
michael@0 | 2769 | --- a/gfx/ycbcr/yuv_row_win.cpp |
michael@0 | 2770 | +++ b/gfx/ycbcr/yuv_row_win.cpp |
michael@0 | 2771 | @@ -1,26 +1,27 @@ |
michael@0 | 2772 | // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
michael@0 | 2773 | // Use of this source code is governed by a BSD-style license that can be |
michael@0 | 2774 | // found in the LICENSE file. |
michael@0 | 2775 | |
michael@0 | 2776 | -#include "media/base/yuv_row.h" |
michael@0 | 2777 | +#include "yuv_row.h" |
michael@0 | 2778 | +#include "mozilla/SSE.h" |
michael@0 | 2779 | |
michael@0 | 2780 | #define kCoefficientsRgbU kCoefficientsRgbY + 2048 |
michael@0 | 2781 | #define kCoefficientsRgbV kCoefficientsRgbY + 4096 |
michael@0 | 2782 | |
michael@0 | 2783 | extern "C" { |
michael@0 | 2784 | |
michael@0 | 2785 | -#if USE_MMX |
michael@0 | 2786 | -__declspec(naked) |
michael@0 | 2787 | -void FastConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2788 | - const uint8* u_buf, |
michael@0 | 2789 | - const uint8* v_buf, |
michael@0 | 2790 | - uint8* rgb_buf, |
michael@0 | 2791 | - int width) { |
michael@0 | 2792 | +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) |
michael@0 | 2793 | +__declspec(naked) |
michael@0 | 2794 | +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
michael@0 | 2795 | + const uint8* u_buf, |
michael@0 | 2796 | + const uint8* v_buf, |
michael@0 | 2797 | + uint8* rgb_buf, |
michael@0 | 2798 | + int width) { |
michael@0 | 2799 | __asm { |
michael@0 | 2800 | pushad |
michael@0 | 2801 | mov edx, [esp + 32 + 4] // Y |
michael@0 | 2802 | mov edi, [esp + 32 + 8] // U |
michael@0 | 2803 | mov esi, [esp + 32 + 12] // V |
michael@0 | 2804 | mov ebp, [esp + 32 + 16] // rgb |
michael@0 | 2805 | mov ecx, [esp + 32 + 20] // width |
michael@0 | 2806 | jmp convertend |
michael@0 | 2807 | @@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint |
michael@0 | 2808 | convertdone : |
michael@0 | 2809 | |
michael@0 | 2810 | popad |
michael@0 | 2811 | ret |
michael@0 | 2812 | } |
michael@0 | 2813 | } |
michael@0 | 2814 | |
michael@0 | 2815 | __declspec(naked) |
michael@0 | 2816 | -void ConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2817 | - const uint8* u_buf, |
michael@0 | 2818 | - const uint8* v_buf, |
michael@0 | 2819 | - uint8* rgb_buf, |
michael@0 | 2820 | - int width, |
michael@0 | 2821 | - int step) { |
michael@0 | 2822 | +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
michael@0 | 2823 | + const uint8* u_buf, |
michael@0 | 2824 | + const uint8* v_buf, |
michael@0 | 2825 | + uint8* rgb_buf, |
michael@0 | 2826 | + int width, |
michael@0 | 2827 | + int step) { |
michael@0 | 2828 | __asm { |
michael@0 | 2829 | pushad |
michael@0 | 2830 | mov edx, [esp + 32 + 4] // Y |
michael@0 | 2831 | mov edi, [esp + 32 + 8] // U |
michael@0 | 2832 | mov esi, [esp + 32 + 12] // V |
michael@0 | 2833 | mov ebp, [esp + 32 + 16] // rgb |
michael@0 | 2834 | mov ecx, [esp + 32 + 20] // width |
michael@0 | 2835 | mov ebx, [esp + 32 + 24] // step |
michael@0 | 2836 | @@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y |
michael@0 | 2837 | wdone : |
michael@0 | 2838 | |
michael@0 | 2839 | popad |
michael@0 | 2840 | ret |
michael@0 | 2841 | } |
michael@0 | 2842 | } |
michael@0 | 2843 | |
michael@0 | 2844 | __declspec(naked) |
michael@0 | 2845 | -void RotateConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2846 | - const uint8* u_buf, |
michael@0 | 2847 | - const uint8* v_buf, |
michael@0 | 2848 | - uint8* rgb_buf, |
michael@0 | 2849 | - int width, |
michael@0 | 2850 | - int ystep, |
michael@0 | 2851 | - int uvstep) { |
michael@0 | 2852 | +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf, |
michael@0 | 2853 | + const uint8* u_buf, |
michael@0 | 2854 | + const uint8* v_buf, |
michael@0 | 2855 | + uint8* rgb_buf, |
michael@0 | 2856 | + int width, |
michael@0 | 2857 | + int ystep, |
michael@0 | 2858 | + int uvstep) { |
michael@0 | 2859 | __asm { |
michael@0 | 2860 | pushad |
michael@0 | 2861 | mov edx, [esp + 32 + 4] // Y |
michael@0 | 2862 | mov edi, [esp + 32 + 8] // U |
michael@0 | 2863 | mov esi, [esp + 32 + 12] // V |
michael@0 | 2864 | mov ebp, [esp + 32 + 16] // rgb |
michael@0 | 2865 | mov ecx, [esp + 32 + 20] // width |
michael@0 | 2866 | jmp wend |
michael@0 | 2867 | @@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui |
michael@0 | 2868 | wdone : |
michael@0 | 2869 | |
michael@0 | 2870 | popad |
michael@0 | 2871 | ret |
michael@0 | 2872 | } |
michael@0 | 2873 | } |
michael@0 | 2874 | |
michael@0 | 2875 | __declspec(naked) |
michael@0 | 2876 | -void DoubleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2877 | - const uint8* u_buf, |
michael@0 | 2878 | - const uint8* v_buf, |
michael@0 | 2879 | - uint8* rgb_buf, |
michael@0 | 2880 | - int width) { |
michael@0 | 2881 | +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf, |
michael@0 | 2882 | + const uint8* u_buf, |
michael@0 | 2883 | + const uint8* v_buf, |
michael@0 | 2884 | + uint8* rgb_buf, |
michael@0 | 2885 | + int width) { |
michael@0 | 2886 | __asm { |
michael@0 | 2887 | pushad |
michael@0 | 2888 | mov edx, [esp + 32 + 4] // Y |
michael@0 | 2889 | mov edi, [esp + 32 + 8] // U |
michael@0 | 2890 | mov esi, [esp + 32 + 12] // V |
michael@0 | 2891 | mov ebp, [esp + 32 + 16] // rgb |
michael@0 | 2892 | mov ecx, [esp + 32 + 20] // width |
michael@0 | 2893 | jmp wend |
michael@0 | 2894 | @@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_ |
michael@0 | 2895 | jns wloop1 |
michael@0 | 2896 | wdone : |
michael@0 | 2897 | popad |
michael@0 | 2898 | ret |
michael@0 | 2899 | } |
michael@0 | 2900 | } |
michael@0 | 2901 | |
michael@0 | 2902 | // This version does general purpose scaling by any amount, up or down. |
michael@0 | 2903 | -// The only thing it can not do it rotation by 90 or 270. |
michael@0 | 2904 | -// For performance the chroma is under sampled, reducing cost of a 3x |
michael@0 | 2905 | +// The only thing it cannot do is rotation by 90 or 270. |
michael@0 | 2906 | +// For performance the chroma is under-sampled, reducing cost of a 3x |
michael@0 | 2907 | // 1080p scale from 8.4 ms to 5.4 ms. |
michael@0 | 2908 | __declspec(naked) |
michael@0 | 2909 | -void ScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2910 | - const uint8* u_buf, |
michael@0 | 2911 | - const uint8* v_buf, |
michael@0 | 2912 | - uint8* rgb_buf, |
michael@0 | 2913 | - int width, |
michael@0 | 2914 | - int source_dx) { |
michael@0 | 2915 | +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
michael@0 | 2916 | + const uint8* u_buf, |
michael@0 | 2917 | + const uint8* v_buf, |
michael@0 | 2918 | + uint8* rgb_buf, |
michael@0 | 2919 | + int width, |
michael@0 | 2920 | + int source_dx) { |
michael@0 | 2921 | __asm { |
michael@0 | 2922 | pushad |
michael@0 | 2923 | mov edx, [esp + 32 + 4] // Y |
michael@0 | 2924 | mov edi, [esp + 32 + 8] // U |
michael@0 | 2925 | mov esi, [esp + 32 + 12] // V |
michael@0 | 2926 | mov ebp, [esp + 32 + 16] // rgb |
michael@0 | 2927 | mov ecx, [esp + 32 + 20] // width |
michael@0 | 2928 | xor ebx, ebx // x |
michael@0 | 2929 | @@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b |
michael@0 | 2930 | |
michael@0 | 2931 | scaledone : |
michael@0 | 2932 | popad |
michael@0 | 2933 | ret |
michael@0 | 2934 | } |
michael@0 | 2935 | } |
michael@0 | 2936 | |
michael@0 | 2937 | __declspec(naked) |
michael@0 | 2938 | -void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 2939 | - const uint8* u_buf, |
michael@0 | 2940 | - const uint8* v_buf, |
michael@0 | 2941 | - uint8* rgb_buf, |
michael@0 | 2942 | - int width, |
michael@0 | 2943 | - int source_dx) { |
michael@0 | 2944 | +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf, |
michael@0 | 2945 | + const uint8* u_buf, |
michael@0 | 2946 | + const uint8* v_buf, |
michael@0 | 2947 | + uint8* rgb_buf, |
michael@0 | 2948 | + int width, |
michael@0 | 2949 | + int source_dx) { |
michael@0 | 2950 | __asm { |
michael@0 | 2951 | pushad |
michael@0 | 2952 | mov edx, [esp + 32 + 4] // Y |
michael@0 | 2953 | mov edi, [esp + 32 + 8] // U |
michael@0 | 2954 | // [esp + 32 + 12] // V |
michael@0 | 2955 | mov ebp, [esp + 32 + 16] // rgb |
michael@0 | 2956 | mov ecx, [esp + 32 + 20] // width |
michael@0 | 2957 | imul ecx, [esp + 32 + 24] // source_dx |
michael@0 | 2958 | @@ -438,152 +439,60 @@ lscalelastpixel: |
michael@0 | 2959 | paddsw mm1, mm0 |
michael@0 | 2960 | psraw mm1, 6 |
michael@0 | 2961 | packuswb mm1, mm1 |
michael@0 | 2962 | movd [ebp], mm1 |
michael@0 | 2963 | popad |
michael@0 | 2964 | ret |
michael@0 | 2965 | }; |
michael@0 | 2966 | } |
michael@0 | 2967 | -#else // USE_MMX |
michael@0 | 2968 | - |
michael@0 | 2969 | -// C reference code that mimic the YUV assembly. |
michael@0 | 2970 | -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x))) |
michael@0 | 2971 | -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \ |
michael@0 | 2972 | - (((x) + (y)) > 32767 ? 32767 : ((x) + (y)))) |
michael@0 | 2973 | - |
michael@0 | 2974 | -static inline void YuvPixel(uint8 y, |
michael@0 | 2975 | - uint8 u, |
michael@0 | 2976 | - uint8 v, |
michael@0 | 2977 | - uint8* rgb_buf) { |
michael@0 | 2978 | - |
michael@0 | 2979 | - int b = kCoefficientsRgbY[256+u][0]; |
michael@0 | 2980 | - int g = kCoefficientsRgbY[256+u][1]; |
michael@0 | 2981 | - int r = kCoefficientsRgbY[256+u][2]; |
michael@0 | 2982 | - int a = kCoefficientsRgbY[256+u][3]; |
michael@0 | 2983 | - |
michael@0 | 2984 | - b = paddsw(b, kCoefficientsRgbY[512+v][0]); |
michael@0 | 2985 | - g = paddsw(g, kCoefficientsRgbY[512+v][1]); |
michael@0 | 2986 | - r = paddsw(r, kCoefficientsRgbY[512+v][2]); |
michael@0 | 2987 | - a = paddsw(a, kCoefficientsRgbY[512+v][3]); |
michael@0 | 2988 | - |
michael@0 | 2989 | - b = paddsw(b, kCoefficientsRgbY[y][0]); |
michael@0 | 2990 | - g = paddsw(g, kCoefficientsRgbY[y][1]); |
michael@0 | 2991 | - r = paddsw(r, kCoefficientsRgbY[y][2]); |
michael@0 | 2992 | - a = paddsw(a, kCoefficientsRgbY[y][3]); |
michael@0 | 2993 | - |
michael@0 | 2994 | - b >>= 6; |
michael@0 | 2995 | - g >>= 6; |
michael@0 | 2996 | - r >>= 6; |
michael@0 | 2997 | - a >>= 6; |
michael@0 | 2998 | - |
michael@0 | 2999 | - *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) | |
michael@0 | 3000 | - (packuswb(g) << 8) | |
michael@0 | 3001 | - (packuswb(r) << 16) | |
michael@0 | 3002 | - (packuswb(a) << 24); |
michael@0 | 3003 | -} |
michael@0 | 3004 | - |
michael@0 | 3005 | -#if TEST_MMX_YUV |
michael@0 | 3006 | -static inline void YuvPixel(uint8 y, |
michael@0 | 3007 | - uint8 u, |
michael@0 | 3008 | - uint8 v, |
michael@0 | 3009 | - uint8* rgb_buf) { |
michael@0 | 3010 | - |
michael@0 | 3011 | - __asm { |
michael@0 | 3012 | - movzx eax, u |
michael@0 | 3013 | - movq mm0, [kCoefficientsRgbY+2048 + 8 * eax] |
michael@0 | 3014 | - movzx eax, v |
michael@0 | 3015 | - paddsw mm0, [kCoefficientsRgbY+4096 + 8 * eax] |
michael@0 | 3016 | - movzx eax, y |
michael@0 | 3017 | - movq mm1, [kCoefficientsRgbY + 8 * eax] |
michael@0 | 3018 | - paddsw mm1, mm0 |
michael@0 | 3019 | - psraw mm1, 6 |
michael@0 | 3020 | - packuswb mm1, mm1 |
michael@0 | 3021 | - mov eax, rgb_buf |
michael@0 | 3022 | - movd [eax], mm1 |
michael@0 | 3023 | - emms |
michael@0 | 3024 | - } |
michael@0 | 3025 | -} |
michael@0 | 3026 | -#endif |
michael@0 | 3027 | +#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) |
michael@0 | 3028 | |
michael@0 | 3029 | void FastConvertYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 3030 | const uint8* u_buf, |
michael@0 | 3031 | const uint8* v_buf, |
michael@0 | 3032 | uint8* rgb_buf, |
michael@0 | 3033 | int width) { |
michael@0 | 3034 | - for (int x = 0; x < width; x += 2) { |
michael@0 | 3035 | - uint8 u = u_buf[x >> 1]; |
michael@0 | 3036 | - uint8 v = v_buf[x >> 1]; |
michael@0 | 3037 | - uint8 y0 = y_buf[x]; |
michael@0 | 3038 | - YuvPixel(y0, u, v, rgb_buf); |
michael@0 | 3039 | - if ((x + 1) < width) { |
michael@0 | 3040 | - uint8 y1 = y_buf[x + 1]; |
michael@0 | 3041 | - YuvPixel(y1, u, v, rgb_buf + 4); |
michael@0 | 3042 | - } |
michael@0 | 3043 | - rgb_buf += 8; // Advance 2 pixels. |
michael@0 | 3044 | - } |
michael@0 | 3045 | -} |
michael@0 | 3046 | - |
michael@0 | 3047 | -// 16.16 fixed point is used. A shift by 16 isolates the integer. |
michael@0 | 3048 | -// A shift by 17 is used to further subsample the chrominence channels. |
michael@0 | 3049 | -// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits, |
michael@0 | 3050 | -// for 1/65536 pixel accurate interpolation. |
michael@0 | 3051 | +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) |
michael@0 | 3052 | + if (mozilla::supports_sse()) { |
michael@0 | 3053 | + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); |
michael@0 | 3054 | + return; |
michael@0 | 3055 | + } |
michael@0 | 3056 | +#endif |
michael@0 | 3057 | + |
michael@0 | 3058 | + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); |
michael@0 | 3059 | +} |
michael@0 | 3060 | + |
michael@0 | 3061 | void ScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 3062 | const uint8* u_buf, |
michael@0 | 3063 | const uint8* v_buf, |
michael@0 | 3064 | uint8* rgb_buf, |
michael@0 | 3065 | int width, |
michael@0 | 3066 | int source_dx) { |
michael@0 | 3067 | - int x = 0; |
michael@0 | 3068 | - for (int i = 0; i < width; i += 2) { |
michael@0 | 3069 | - int y = y_buf[x >> 16]; |
michael@0 | 3070 | - int u = u_buf[(x >> 17)]; |
michael@0 | 3071 | - int v = v_buf[(x >> 17)]; |
michael@0 | 3072 | - YuvPixel(y, u, v, rgb_buf); |
michael@0 | 3073 | - x += source_dx; |
michael@0 | 3074 | - if ((i + 1) < width) { |
michael@0 | 3075 | - y = y_buf[x >> 16]; |
michael@0 | 3076 | - YuvPixel(y, u, v, rgb_buf+4); |
michael@0 | 3077 | - x += source_dx; |
michael@0 | 3078 | - } |
michael@0 | 3079 | - rgb_buf += 8; |
michael@0 | 3080 | - } |
michael@0 | 3081 | -} |
michael@0 | 3082 | + |
michael@0 | 3083 | +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) |
michael@0 | 3084 | + if (mozilla::supports_sse()) { |
michael@0 | 3085 | + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
michael@0 | 3086 | + return; |
michael@0 | 3087 | + } |
michael@0 | 3088 | +#endif |
michael@0 | 3089 | + |
michael@0 | 3090 | + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
michael@0 | 3091 | +} |
michael@0 | 3092 | |
michael@0 | 3093 | void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
michael@0 | 3094 | const uint8* u_buf, |
michael@0 | 3095 | const uint8* v_buf, |
michael@0 | 3096 | uint8* rgb_buf, |
michael@0 | 3097 | int width, |
michael@0 | 3098 | int source_dx) { |
michael@0 | 3099 | - int x = 0; |
michael@0 | 3100 | - if (source_dx >= 0x20000) { |
michael@0 | 3101 | - x = 32768; |
michael@0 | 3102 | - } |
michael@0 | 3103 | - for (int i = 0; i < width; i += 2) { |
michael@0 | 3104 | - int y0 = y_buf[x >> 16]; |
michael@0 | 3105 | - int y1 = y_buf[(x >> 16) + 1]; |
michael@0 | 3106 | - int u0 = u_buf[(x >> 17)]; |
michael@0 | 3107 | - int u1 = u_buf[(x >> 17) + 1]; |
michael@0 | 3108 | - int v0 = v_buf[(x >> 17)]; |
michael@0 | 3109 | - int v1 = v_buf[(x >> 17) + 1]; |
michael@0 | 3110 | - int y_frac = (x & 65535); |
michael@0 | 3111 | - int uv_frac = ((x >> 1) & 65535); |
michael@0 | 3112 | - int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; |
michael@0 | 3113 | - int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16; |
michael@0 | 3114 | - int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16; |
michael@0 | 3115 | - YuvPixel(y, u, v, rgb_buf); |
michael@0 | 3116 | - x += source_dx; |
michael@0 | 3117 | - if ((i + 1) < width) { |
michael@0 | 3118 | - y0 = y_buf[x >> 16]; |
michael@0 | 3119 | - y1 = y_buf[(x >> 16) + 1]; |
michael@0 | 3120 | - y_frac = (x & 65535); |
michael@0 | 3121 | - y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16; |
michael@0 | 3122 | - YuvPixel(y, u, v, rgb_buf+4); |
michael@0 | 3123 | - x += source_dx; |
michael@0 | 3124 | - } |
michael@0 | 3125 | - rgb_buf += 8; |
michael@0 | 3126 | - } |
michael@0 | 3127 | -} |
michael@0 | 3128 | - |
michael@0 | 3129 | -#endif // USE_MMX |
michael@0 | 3130 | -} // extern "C" |
michael@0 | 3131 | - |
michael@0 | 3132 | +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86) |
michael@0 | 3133 | + if (mozilla::supports_sse()) { |
michael@0 | 3134 | + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, |
michael@0 | 3135 | + source_dx); |
michael@0 | 3136 | + return; |
michael@0 | 3137 | + } |
michael@0 | 3138 | +#endif |
michael@0 | 3139 | + |
michael@0 | 3140 | + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
michael@0 | 3141 | +} |
michael@0 | 3142 | + |
michael@0 | 3143 | +} // extern "C" |