gfx/ycbcr/win64.patch

branch
TOR_BUG_9701
changeset 15
b8a032363ba2
equal deleted inserted replaced
-1:000000000000 0:83c199b0c501
1 diff --git a/gfx/ycbcr/yuv_row_win64.cpp b/gfx/ycbcr/yuv_row_win64.cpp
2 new file mode 100644
3 --- /dev/null
4 +++ b/gfx/ycbcr/yuv_row_win64.cpp
5 @@ -0,0 +1,205 @@
6 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
7 +// Use of this source code is governed by a BSD-style license that can be
8 +// found in the LICENSE file.
9 +
10 +#include "yuv_row.h"
11 +
12 +extern "C" {
13 +
14 +// x64 compiler doesn't support MMX and inline assembler. Use SSE2 intrinsics.
15 +
16 +#define kCoefficientsRgbU (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 2048) // U table: 2048 bytes past the Y table (256 entries * 8 bytes, matching the `8 * value` indexing below).
17 +#define kCoefficientsRgbV (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 4096) // V table: 4096 bytes past the Y table, i.e. directly after the U table.
18 +
19 +#include <emmintrin.h>
20 +
21 +static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf, // Converts `width` pixels of one row from Y/U/V samples to 4-byte output pixels via table lookups.
22 + const uint8* u_buf, // One U sample is consumed per pair of Y samples.
23 + const uint8* v_buf, // One V sample is consumed per pair of Y samples.
24 + uint8* rgb_buf, // Output; 4 bytes are written per pixel.
25 + int width) {
26 + __m128i xmm0, xmmY1, xmmY2;
27 + __m128 xmmY;
28 +
29 + while (width >= 2) { // Main loop: two output pixels per iteration.
30 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
31 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++))); // xmm0 = saturating 16-bit sum of the 8-byte U and V table entries, shared by both pixels.
32 +
33 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++)); // 8-byte Y table entry for the first pixel.
34 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
35 +
36 + xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++)); // 8-byte Y table entry for the second pixel.
37 + xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
38 +
39 + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), // Mask 0x44: low 64 bits of xmmY1 followed by low 64 bits of xmmY2.
40 + 0x44);
41 + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); // Arithmetic right shift by 6 on every 16-bit lane.
42 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); // Narrow 16-bit lanes to bytes with unsigned saturation.
43 +
44 + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); // Store the low 8 bytes: two 4-byte pixels.
45 + rgb_buf += 8;
46 + width -= 2;
47 + }
48 +
49 + if (width) { // Odd width: convert the single remaining pixel; the buffer pointers are not advanced here.
50 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
51 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
52 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf));
53 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
54 + xmmY1 = _mm_srai_epi16(xmmY1, 6);
55 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
56 + *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); // Write the low 32 bits as one pixel.
57 + }
58 +}
59 +
60 +static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf, // Converts one row while scaling horizontally by picking the source sample at each stepped position (no interpolation).
61 + const uint8* u_buf,
62 + const uint8* v_buf,
63 + uint8* rgb_buf, // Output; 4 bytes are written per pixel.
64 + int width, // Number of output pixels.
65 + int source_dx) { // Step added to x per output pixel; x >> 16 is the Y index, so this is a 16.16 fixed-point step.
66 + __m128i xmm0, xmmY1, xmmY2;
67 + __m128 xmmY;
68 + uint8 u, v, y;
69 + int x = 0; // Current source position; the integer part is x >> 16.
70 +
71 + while (width >= 2) { // Main loop: two output pixels per iteration.
72 + u = u_buf[x >> 17]; // Chroma index is half the luma index (x >> 17 versus x >> 16).
73 + v = v_buf[x >> 17];
74 + y = y_buf[x >> 16];
75 + x += source_dx;
76 +
77 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
78 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); // Chroma contribution sampled at the first pixel's position and reused for the second pixel.
79 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
80 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
81 +
82 + y = y_buf[x >> 16]; // Luma for the second pixel of the pair.
83 + x += source_dx;
84 +
85 + xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
86 + xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
87 +
88 + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), // Mask 0x44: low 64 bits of xmmY1 followed by low 64 bits of xmmY2.
89 + 0x44);
90 + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); // Arithmetic right shift by 6 on every 16-bit lane.
91 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); // Narrow 16-bit lanes to bytes with unsigned saturation.
92 +
93 + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); // Store the low 8 bytes: two 4-byte pixels.
94 + rgb_buf += 8;
95 + width -= 2;
96 + }
97 +
98 + if (width) { // Odd width: convert the single remaining pixel at the current x.
99 + u = u_buf[x >> 17];
100 + v = v_buf[x >> 17];
101 + y = y_buf[x >> 16];
102 +
103 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
104 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
105 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
106 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
107 + xmmY1 = _mm_srai_epi16(xmmY1, 6);
108 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
109 + *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); // Write the low 32 bits as one pixel.
110 + }
111 +}
112 +
113 +static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf, // Converts one row while scaling horizontally, linearly blending each sample with its right-hand neighbour before the table lookup.
114 + const uint8* u_buf,
115 + const uint8* v_buf,
116 + uint8* rgb_buf, // Output; 4 bytes are written per pixel.
117 + int width, // Number of output pixels.
118 + int source_dx) { // Step added to x per output pixel; x >> 16 is the Y index, so this is a 16.16 fixed-point step.
119 + __m128i xmm0, xmmY1, xmmY2;
120 + __m128 xmmY;
121 + uint8 u0, u1, v0, v1, y0, y1;
122 + uint32 uv_frac, y_frac, u, v, y;
123 + int x = 0;
124 +
125 + if (source_dx >= 0x20000) { // Step of at least two source pixels.
126 + x = 32768; // Start half a source pixel in (0.5 in 16.16 fixed point).
127 + }
128 +
129 + while(width >= 2) { // Main loop: two output pixels per iteration.
130 + u0 = u_buf[x >> 17];
131 + u1 = u_buf[(x >> 17) + 1]; // Neighbour sample at index + 1. NOTE(review): presumably callers keep one sample past the last index readable - confirm.
132 + v0 = v_buf[x >> 17];
133 + v1 = v_buf[(x >> 17) + 1];
134 + y0 = y_buf[x >> 16];
135 + y1 = y_buf[(x >> 16) + 1];
136 + uv_frac = (x & 0x1fffe); // Low 17 bits of x with bit 0 cleared: position between the two chroma samples.
137 + y_frac = (x & 0xffff); // Low 16 bits of x: position between the two luma samples.
138 + u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17; // Weighted blend; the two weights sum to 0x1fffe.
139 + v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
140 + y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; // Weighted blend; the two weights sum to 0xffff.
141 + x += source_dx;
142 +
143 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
144 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); // Chroma contribution computed at the first pixel's position and reused for the second pixel.
145 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
146 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
147 +
148 + y0 = y_buf[x >> 16]; // Second pixel of the pair: only luma is re-sampled and blended.
149 + y1 = y_buf[(x >> 16) + 1];
150 + y_frac = (x & 0xffff);
151 + y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
152 + x += source_dx;
153 +
154 + xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
155 + xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
156 +
157 + xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), // Mask 0x44: low 64 bits of xmmY1 followed by low 64 bits of xmmY2.
158 + 0x44);
159 + xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); // Arithmetic right shift by 6 on every 16-bit lane.
160 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); // Narrow 16-bit lanes to bytes with unsigned saturation.
161 +
162 + _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); // Store the low 8 bytes: two 4-byte pixels.
163 + rgb_buf += 8;
164 + width -= 2;
165 + }
166 +
167 + if (width) { // Odd width: the last pixel is taken directly from the samples at x, with no blending and no index + 1 read.
168 + u = u_buf[x >> 17];
169 + v = v_buf[x >> 17];
170 + y = y_buf[x >> 16];
171 +
172 + xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
173 + _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
174 + xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
175 +
176 + xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
177 + xmmY1 = _mm_srai_epi16(xmmY1, 6);
178 + xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
179 + *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); // Write the low 32 bits as one pixel.
180 + }
181 +}
182 +
183 +void FastConvertYUVToRGB32Row(const uint8* y_buf, // Non-static entry point inside the extern "C" block; does no work of its own.
184 + const uint8* u_buf,
185 + const uint8* v_buf,
186 + uint8* rgb_buf,
187 + int width) {
188 + FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width); // Forward all arguments unchanged to the SSE2 implementation.
189 +}
190 +
191 +void ScaleYUVToRGB32Row(const uint8* y_buf, // Non-static entry point inside the extern "C" block; does no work of its own.
192 + const uint8* u_buf,
193 + const uint8* v_buf,
194 + uint8* rgb_buf,
195 + int width,
196 + int source_dx) {
197 + ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); // Forward all arguments unchanged to the SSE2 implementation.
198 +}
199 +
200 +void LinearScaleYUVToRGB32Row(const uint8* y_buf, // Non-static entry point inside the extern "C" block; does no work of its own.
201 + const uint8* u_buf,
202 + const uint8* v_buf,
203 + uint8* rgb_buf,
204 + int width,
205 + int source_dx) {
206 + LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, // Forward all arguments unchanged to the SSE2 implementation.
207 + source_dx);
208 +}
209 +
210 +} // extern "C"

mercurial