michael@0: // Copyright (c) 2010 The Chromium Authors. All rights reserved. michael@0: // Use of this source code is governed by a BSD-style license that can be michael@0: // found in the LICENSE file. michael@0: michael@0: #include michael@0: #include "yuv_row.h" michael@0: michael@0: namespace mozilla { michael@0: namespace gfx { michael@0: michael@0: // FilterRows combines two rows of the image using linear interpolation. michael@0: // SSE2 version does 16 pixels at a time. michael@0: void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr, michael@0: int source_width, int source_y_fraction) { michael@0: __m128i zero = _mm_setzero_si128(); michael@0: __m128i y1_fraction = _mm_set1_epi16(source_y_fraction); michael@0: __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction); michael@0: michael@0: const __m128i* y0_ptr128 = reinterpret_cast(y0_ptr); michael@0: const __m128i* y1_ptr128 = reinterpret_cast(y1_ptr); michael@0: __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf); michael@0: __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width); michael@0: michael@0: do { michael@0: __m128i y0 = _mm_loadu_si128(y0_ptr128); michael@0: __m128i y1 = _mm_loadu_si128(y1_ptr128); michael@0: __m128i y2 = _mm_unpackhi_epi8(y0, zero); michael@0: __m128i y3 = _mm_unpackhi_epi8(y1, zero); michael@0: y0 = _mm_unpacklo_epi8(y0, zero); michael@0: y1 = _mm_unpacklo_epi8(y1, zero); michael@0: y0 = _mm_mullo_epi16(y0, y0_fraction); michael@0: y1 = _mm_mullo_epi16(y1, y1_fraction); michael@0: y2 = _mm_mullo_epi16(y2, y0_fraction); michael@0: y3 = _mm_mullo_epi16(y3, y1_fraction); michael@0: y0 = _mm_add_epi16(y0, y1); michael@0: y2 = _mm_add_epi16(y2, y3); michael@0: y0 = _mm_srli_epi16(y0, 8); michael@0: y2 = _mm_srli_epi16(y2, 8); michael@0: y0 = _mm_packus_epi16(y0, y2); michael@0: *dest128++ = y0; michael@0: ++y0_ptr128; michael@0: ++y1_ptr128; michael@0: } while (dest128 < end128); michael@0: } michael@0: michael@0: } michael@0: }