michael@0: // Copyright (c) 2010 The Chromium Authors. All rights reserved. michael@0: // Use of this source code is governed by a BSD-style license that can be michael@0: // found in the LICENSE file. michael@0: michael@0: #include "yuv_row.h" michael@0: michael@0: extern "C" { michael@0: michael@0: // x64 compiler doesn't support MMX and inline assembler. Use SSE2 intrinsics. michael@0: michael@0: #define kCoefficientsRgbU (reinterpret_cast(kCoefficientsRgbY) + 2048) michael@0: #define kCoefficientsRgbV (reinterpret_cast(kCoefficientsRgbY) + 4096) michael@0: michael@0: #include michael@0: michael@0: static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* rgb_buf, michael@0: int width) { michael@0: __m128i xmm0, xmmY1, xmmY2; michael@0: __m128 xmmY; michael@0: michael@0: while (width >= 2) { michael@0: xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)), michael@0: _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++))); michael@0: michael@0: xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * *y_buf++)); michael@0: xmmY1 = _mm_adds_epi16(xmmY1, xmm0); michael@0: michael@0: xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * *y_buf++)); michael@0: xmmY2 = _mm_adds_epi16(xmmY2, xmm0); michael@0: michael@0: xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), michael@0: 0x44); michael@0: xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); michael@0: xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); michael@0: michael@0: _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); michael@0: rgb_buf += 8; michael@0: width -= 2; michael@0: } michael@0: michael@0: if (width) { michael@0: xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)), michael@0: _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf))); michael@0: xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * *y_buf)); michael@0: xmmY1 = _mm_adds_epi16(xmmY1, xmm0); michael@0: xmmY1 = _mm_srai_epi16(xmmY1, 6); michael@0: xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); michael@0: *reinterpret_cast(rgb_buf) = _mm_cvtsi128_si32(xmmY1); michael@0: } michael@0: } michael@0: michael@0: static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* rgb_buf, michael@0: int width, michael@0: int source_dx) { michael@0: __m128i xmm0, xmmY1, xmmY2; michael@0: __m128 xmmY; michael@0: uint8 u, v, y; michael@0: int x = 0; michael@0: michael@0: while (width >= 2) { michael@0: u = u_buf[x >> 17]; michael@0: v = v_buf[x >> 17]; michael@0: y = y_buf[x >> 16]; michael@0: x += source_dx; michael@0: michael@0: xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), michael@0: _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); michael@0: xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * y)); michael@0: xmmY1 = _mm_adds_epi16(xmmY1, xmm0); michael@0: michael@0: y = y_buf[x >> 16]; michael@0: x += source_dx; michael@0: michael@0: xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * y)); michael@0: xmmY2 = _mm_adds_epi16(xmmY2, xmm0); michael@0: michael@0: xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), michael@0: 0x44); michael@0: xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); michael@0: xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); michael@0: michael@0: _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); michael@0: rgb_buf += 8; michael@0: width -= 2; michael@0: } michael@0: michael@0: if (width) { michael@0: u = u_buf[x >> 17]; michael@0: v = v_buf[x >> 17]; michael@0: y = y_buf[x >> 16]; michael@0: michael@0: xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), michael@0: _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); michael@0: xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * y)); michael@0: xmmY1 = _mm_adds_epi16(xmmY1, xmm0); michael@0: xmmY1 = _mm_srai_epi16(xmmY1, 6); michael@0: xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); michael@0: *reinterpret_cast(rgb_buf) = _mm_cvtsi128_si32(xmmY1); michael@0: } michael@0: } michael@0: michael@0: static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* rgb_buf, michael@0: int width, michael@0: int source_dx) { michael@0: __m128i xmm0, xmmY1, xmmY2; michael@0: __m128 xmmY; michael@0: uint8 u0, u1, v0, v1, y0, y1; michael@0: uint32 uv_frac, y_frac, u, v, y; michael@0: int x = 0; michael@0: michael@0: if (source_dx >= 0x20000) { michael@0: x = 32768; michael@0: } michael@0: michael@0: while(width >= 2) { michael@0: u0 = u_buf[x >> 17]; michael@0: u1 = u_buf[(x >> 17) + 1]; michael@0: v0 = v_buf[x >> 17]; michael@0: v1 = v_buf[(x >> 17) + 1]; michael@0: y0 = y_buf[x >> 16]; michael@0: y1 = y_buf[(x >> 16) + 1]; michael@0: uv_frac = (x & 0x1fffe); michael@0: y_frac = (x & 0xffff); michael@0: u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17; michael@0: v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17; michael@0: y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; michael@0: x += source_dx; michael@0: michael@0: xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), michael@0: _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); michael@0: xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * y)); michael@0: xmmY1 = _mm_adds_epi16(xmmY1, xmm0); michael@0: michael@0: y0 = y_buf[x >> 16]; michael@0: y1 = y_buf[(x >> 16) + 1]; michael@0: y_frac = (x & 0xffff); michael@0: y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; michael@0: x += source_dx; michael@0: michael@0: xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * y)); michael@0: xmmY2 = _mm_adds_epi16(xmmY2, xmm0); michael@0: michael@0: xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), michael@0: 0x44); michael@0: xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); michael@0: xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); michael@0: michael@0: _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); michael@0: rgb_buf += 8; michael@0: width -= 2; michael@0: } michael@0: michael@0: if (width) { michael@0: u = u_buf[x >> 17]; michael@0: v = v_buf[x >> 17]; michael@0: y = y_buf[x >> 16]; michael@0: michael@0: xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), michael@0: _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); michael@0: xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast(kCoefficientsRgbY) + 8 * y)); michael@0: michael@0: xmmY1 = _mm_adds_epi16(xmmY1, xmm0); michael@0: xmmY1 = _mm_srai_epi16(xmmY1, 6); michael@0: xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); michael@0: *reinterpret_cast(rgb_buf) = _mm_cvtsi128_si32(xmmY1); michael@0: } michael@0: } michael@0: michael@0: void FastConvertYUVToRGB32Row(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* rgb_buf, michael@0: int width) { michael@0: FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width); michael@0: } michael@0: michael@0: void ScaleYUVToRGB32Row(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* rgb_buf, michael@0: int width, michael@0: int source_dx) { michael@0: ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); michael@0: } michael@0: michael@0: void LinearScaleYUVToRGB32Row(const uint8* y_buf, michael@0: const uint8* u_buf, michael@0: const uint8* v_buf, michael@0: uint8* rgb_buf, michael@0: int width, michael@0: int source_dx) { michael@0: LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, michael@0: source_dx); michael@0: } michael@0: michael@0: } // extern "C"