|
1 diff --git a/gfx/ycbcr/yuv_row_win64.cpp b/gfx/ycbcr/yuv_row_win64.cpp |
|
2 new file mode 100644 |
|
3 --- /dev/null |
|
4 +++ b/gfx/ycbcr/yuv_row_win64.cpp |
|
5 @@ -0,0 +1,205 @@ |
|
6 +// Copyright (c) 2010 The Chromium Authors. All rights reserved. |
|
7 +// Use of this source code is governed by a BSD-style license that can be |
|
8 +// found in the LICENSE file. |
|
9 + |
|
10 +#include "yuv_row.h" |
|
11 + |
|
12 +extern "C" { |
|
13 + |
|
// The x64 compiler supports neither MMX nor inline assembler, so these
// routines are written with SSE2 intrinsics instead.
|
15 + |
|
// Byte addresses of the U and V sections of the coefficient table that starts
// at kCoefficientsRgbY (declared elsewhere). The routines in this file index
// each section with an 8-bit sample and an 8-byte stride, so a section spans
// 8 * 256 bytes: U begins one section in, V two sections in.
#define kCoefficientsRgbU (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * 256)
#define kCoefficientsRgbV (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * 512)
|
18 + |
|
19 +#include <emmintrin.h> |
|
20 + |
|
21 +static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf, |
|
22 + const uint8* u_buf, |
|
23 + const uint8* v_buf, |
|
24 + uint8* rgb_buf, |
|
25 + int width) { |
|
26 + __m128i xmm0, xmmY1, xmmY2; |
|
27 + __m128 xmmY; |
|
28 + |
|
29 + while (width >= 2) { |
|
30 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)), |
|
31 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++))); |
|
32 + |
|
33 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++)); |
|
34 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); |
|
35 + |
|
36 + xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++)); |
|
37 + xmmY2 = _mm_adds_epi16(xmmY2, xmm0); |
|
38 + |
|
39 + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), |
|
40 + 0x44); |
|
41 + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); |
|
42 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); |
|
43 + |
|
44 + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); |
|
45 + rgb_buf += 8; |
|
46 + width -= 2; |
|
47 + } |
|
48 + |
|
49 + if (width) { |
|
50 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)), |
|
51 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf))); |
|
52 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf)); |
|
53 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); |
|
54 + xmmY1 = _mm_srai_epi16(xmmY1, 6); |
|
55 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); |
|
56 + *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); |
|
57 + } |
|
58 +} |
|
59 + |
|
60 +static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf, |
|
61 + const uint8* u_buf, |
|
62 + const uint8* v_buf, |
|
63 + uint8* rgb_buf, |
|
64 + int width, |
|
65 + int source_dx) { |
|
66 + __m128i xmm0, xmmY1, xmmY2; |
|
67 + __m128 xmmY; |
|
68 + uint8 u, v, y; |
|
69 + int x = 0; |
|
70 + |
|
71 + while (width >= 2) { |
|
72 + u = u_buf[x >> 17]; |
|
73 + v = v_buf[x >> 17]; |
|
74 + y = y_buf[x >> 16]; |
|
75 + x += source_dx; |
|
76 + |
|
77 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), |
|
78 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); |
|
79 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); |
|
80 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); |
|
81 + |
|
82 + y = y_buf[x >> 16]; |
|
83 + x += source_dx; |
|
84 + |
|
85 + xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); |
|
86 + xmmY2 = _mm_adds_epi16(xmmY2, xmm0); |
|
87 + |
|
88 + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), |
|
89 + 0x44); |
|
90 + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); |
|
91 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); |
|
92 + |
|
93 + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); |
|
94 + rgb_buf += 8; |
|
95 + width -= 2; |
|
96 + } |
|
97 + |
|
98 + if (width) { |
|
99 + u = u_buf[x >> 17]; |
|
100 + v = v_buf[x >> 17]; |
|
101 + y = y_buf[x >> 16]; |
|
102 + |
|
103 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), |
|
104 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); |
|
105 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); |
|
106 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); |
|
107 + xmmY1 = _mm_srai_epi16(xmmY1, 6); |
|
108 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); |
|
109 + *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); |
|
110 + } |
|
111 +} |
|
112 + |
|
113 +static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf, |
|
114 + const uint8* u_buf, |
|
115 + const uint8* v_buf, |
|
116 + uint8* rgb_buf, |
|
117 + int width, |
|
118 + int source_dx) { |
|
119 + __m128i xmm0, xmmY1, xmmY2; |
|
120 + __m128 xmmY; |
|
121 + uint8 u0, u1, v0, v1, y0, y1; |
|
122 + uint32 uv_frac, y_frac, u, v, y; |
|
123 + int x = 0; |
|
124 + |
|
125 + if (source_dx >= 0x20000) { |
|
126 + x = 32768; |
|
127 + } |
|
128 + |
|
129 + while(width >= 2) { |
|
130 + u0 = u_buf[x >> 17]; |
|
131 + u1 = u_buf[(x >> 17) + 1]; |
|
132 + v0 = v_buf[x >> 17]; |
|
133 + v1 = v_buf[(x >> 17) + 1]; |
|
134 + y0 = y_buf[x >> 16]; |
|
135 + y1 = y_buf[(x >> 16) + 1]; |
|
136 + uv_frac = (x & 0x1fffe); |
|
137 + y_frac = (x & 0xffff); |
|
138 + u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17; |
|
139 + v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17; |
|
140 + y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; |
|
141 + x += source_dx; |
|
142 + |
|
143 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), |
|
144 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); |
|
145 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); |
|
146 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); |
|
147 + |
|
148 + y0 = y_buf[x >> 16]; |
|
149 + y1 = y_buf[(x >> 16) + 1]; |
|
150 + y_frac = (x & 0xffff); |
|
151 + y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; |
|
152 + x += source_dx; |
|
153 + |
|
154 + xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); |
|
155 + xmmY2 = _mm_adds_epi16(xmmY2, xmm0); |
|
156 + |
|
157 + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), |
|
158 + 0x44); |
|
159 + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); |
|
160 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); |
|
161 + |
|
162 + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); |
|
163 + rgb_buf += 8; |
|
164 + width -= 2; |
|
165 + } |
|
166 + |
|
167 + if (width) { |
|
168 + u = u_buf[x >> 17]; |
|
169 + v = v_buf[x >> 17]; |
|
170 + y = y_buf[x >> 16]; |
|
171 + |
|
172 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), |
|
173 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); |
|
174 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); |
|
175 + |
|
176 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0); |
|
177 + xmmY1 = _mm_srai_epi16(xmmY1, 6); |
|
178 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); |
|
179 + *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); |
|
180 + } |
|
181 +} |
|
182 + |
|
183 +void FastConvertYUVToRGB32Row(const uint8* y_buf, |
|
184 + const uint8* u_buf, |
|
185 + const uint8* v_buf, |
|
186 + uint8* rgb_buf, |
|
187 + int width) { |
|
188 + FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width); |
|
189 +} |
|
190 + |
|
191 +void ScaleYUVToRGB32Row(const uint8* y_buf, |
|
192 + const uint8* u_buf, |
|
193 + const uint8* v_buf, |
|
194 + uint8* rgb_buf, |
|
195 + int width, |
|
196 + int source_dx) { |
|
197 + ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
|
198 +} |
|
199 + |
|
200 +void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
201 + const uint8* u_buf, |
|
202 + const uint8* v_buf, |
|
203 + uint8* rgb_buf, |
|
204 + int width, |
|
205 + int source_dx) { |
|
206 + LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, |
|
207 + source_dx); |
|
208 +} |
|
209 + |
|
210 +} // extern "C" |