gfx/ycbcr/yuv_row_win64.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
michael@0 2 // Use of this source code is governed by a BSD-style license that can be
michael@0 3 // found in the LICENSE file.
michael@0 4
michael@0 5 #include "yuv_row.h"
michael@0 6
michael@0 7 extern "C" {
michael@0 8
michael@0 9 // x64 compiler doesn't support MMX and inline assembler. Use SSE2 intrinsics.
michael@0 10
// Addresses of the U and V coefficient tables, expressed as byte offsets
// (2048 and 4096) from the start of kCoefficientsRgbY. The code below indexes
// all three tables in 8-byte steps, one entry per sample value.
michael@0 11 #define kCoefficientsRgbU (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 2048)
michael@0 12 #define kCoefficientsRgbV (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 4096)
michael@0 13
michael@0 14 #include <emmintrin.h>
michael@0 15
michael@0 16 static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf,
michael@0 17 const uint8* u_buf,
michael@0 18 const uint8* v_buf,
michael@0 19 uint8* rgb_buf,
michael@0 20 int width) {
michael@0 21 __m128i xmm0, xmmY1, xmmY2;
michael@0 22 __m128 xmmY;
michael@0 23
michael@0 24 while (width >= 2) {
michael@0 25 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
michael@0 26 _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));
michael@0 27
michael@0 28 xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
michael@0 29 xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
michael@0 30
michael@0 31 xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
michael@0 32 xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
michael@0 33
michael@0 34 xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
michael@0 35 0x44);
michael@0 36 xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
michael@0 37 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
michael@0 38
michael@0 39 _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
michael@0 40 rgb_buf += 8;
michael@0 41 width -= 2;
michael@0 42 }
michael@0 43
michael@0 44 if (width) {
michael@0 45 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
michael@0 46 _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
michael@0 47 xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf));
michael@0 48 xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
michael@0 49 xmmY1 = _mm_srai_epi16(xmmY1, 6);
michael@0 50 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
michael@0 51 *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
michael@0 52 }
michael@0 53 }
michael@0 54
michael@0 55 static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
michael@0 56 const uint8* u_buf,
michael@0 57 const uint8* v_buf,
michael@0 58 uint8* rgb_buf,
michael@0 59 int width,
michael@0 60 int source_dx) {
michael@0 61 __m128i xmm0, xmmY1, xmmY2;
michael@0 62 __m128 xmmY;
michael@0 63 uint8 u, v, y;
michael@0 64 int x = 0;
michael@0 65
michael@0 66 while (width >= 2) {
michael@0 67 u = u_buf[x >> 17];
michael@0 68 v = v_buf[x >> 17];
michael@0 69 y = y_buf[x >> 16];
michael@0 70 x += source_dx;
michael@0 71
michael@0 72 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
michael@0 73 _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
michael@0 74 xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
michael@0 75 xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
michael@0 76
michael@0 77 y = y_buf[x >> 16];
michael@0 78 x += source_dx;
michael@0 79
michael@0 80 xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
michael@0 81 xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
michael@0 82
michael@0 83 xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
michael@0 84 0x44);
michael@0 85 xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
michael@0 86 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
michael@0 87
michael@0 88 _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
michael@0 89 rgb_buf += 8;
michael@0 90 width -= 2;
michael@0 91 }
michael@0 92
michael@0 93 if (width) {
michael@0 94 u = u_buf[x >> 17];
michael@0 95 v = v_buf[x >> 17];
michael@0 96 y = y_buf[x >> 16];
michael@0 97
michael@0 98 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
michael@0 99 _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
michael@0 100 xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
michael@0 101 xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
michael@0 102 xmmY1 = _mm_srai_epi16(xmmY1, 6);
michael@0 103 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
michael@0 104 *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
michael@0 105 }
michael@0 106 }
michael@0 107
michael@0 108 static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
michael@0 109 const uint8* u_buf,
michael@0 110 const uint8* v_buf,
michael@0 111 uint8* rgb_buf,
michael@0 112 int width,
michael@0 113 int source_dx) {
michael@0 114 __m128i xmm0, xmmY1, xmmY2;
michael@0 115 __m128 xmmY;
michael@0 116 uint8 u0, u1, v0, v1, y0, y1;
michael@0 117 uint32 uv_frac, y_frac, u, v, y;
michael@0 118 int x = 0;
michael@0 119
michael@0 120 if (source_dx >= 0x20000) {
michael@0 121 x = 32768;
michael@0 122 }
michael@0 123
michael@0 124 while(width >= 2) {
michael@0 125 u0 = u_buf[x >> 17];
michael@0 126 u1 = u_buf[(x >> 17) + 1];
michael@0 127 v0 = v_buf[x >> 17];
michael@0 128 v1 = v_buf[(x >> 17) + 1];
michael@0 129 y0 = y_buf[x >> 16];
michael@0 130 y1 = y_buf[(x >> 16) + 1];
michael@0 131 uv_frac = (x & 0x1fffe);
michael@0 132 y_frac = (x & 0xffff);
michael@0 133 u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;
michael@0 134 v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
michael@0 135 y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
michael@0 136 x += source_dx;
michael@0 137
michael@0 138 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
michael@0 139 _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
michael@0 140 xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
michael@0 141 xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
michael@0 142
michael@0 143 y0 = y_buf[x >> 16];
michael@0 144 y1 = y_buf[(x >> 16) + 1];
michael@0 145 y_frac = (x & 0xffff);
michael@0 146 y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
michael@0 147 x += source_dx;
michael@0 148
michael@0 149 xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
michael@0 150 xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
michael@0 151
michael@0 152 xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
michael@0 153 0x44);
michael@0 154 xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
michael@0 155 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
michael@0 156
michael@0 157 _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
michael@0 158 rgb_buf += 8;
michael@0 159 width -= 2;
michael@0 160 }
michael@0 161
michael@0 162 if (width) {
michael@0 163 u = u_buf[x >> 17];
michael@0 164 v = v_buf[x >> 17];
michael@0 165 y = y_buf[x >> 16];
michael@0 166
michael@0 167 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
michael@0 168 _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
michael@0 169 xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
michael@0 170
michael@0 171 xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
michael@0 172 xmmY1 = _mm_srai_epi16(xmmY1, 6);
michael@0 173 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
michael@0 174 *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
michael@0 175 }
michael@0 176 }
michael@0 177
michael@0 178 void FastConvertYUVToRGB32Row(const uint8* y_buf,
michael@0 179 const uint8* u_buf,
michael@0 180 const uint8* v_buf,
michael@0 181 uint8* rgb_buf,
michael@0 182 int width) {
michael@0 183 FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);
michael@0 184 }
michael@0 185
michael@0 186 void ScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 187 const uint8* u_buf,
michael@0 188 const uint8* v_buf,
michael@0 189 uint8* rgb_buf,
michael@0 190 int width,
michael@0 191 int source_dx) {
michael@0 192 ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
michael@0 193 }
michael@0 194
michael@0 195 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
michael@0 196 const uint8* u_buf,
michael@0 197 const uint8* v_buf,
michael@0 198 uint8* rgb_buf,
michael@0 199 int width,
michael@0 200 int source_dx) {
michael@0 201 LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,
michael@0 202 source_dx);
michael@0 203 }
michael@0 204
michael@0 205 } // extern "C"

mercurial