|
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. |
|
2 // Use of this source code is governed by a BSD-style license that can be |
|
3 // found in the LICENSE file. |
|
4 |
|
5 #include "yuv_row.h" |
|
6 |
|
7 extern "C" { |
|
8 |
|
9 // x64 compiler doesn't support MMX and inline assembler. Use SSE2 intrinsics. |
|
10 |
|
// Byte addresses of the U and V coefficient tables, expressed relative to
// kCoefficientsRgbY. Code in this file indexes each table with an 8-bit
// sample value and a stride of 8 bytes, so one table spans 256 * 8 bytes.
// NOTE(review): presumably the Y, U and V tables are stored back to back in
// kCoefficientsRgbY - confirm against its definition.
#define kCoefficientsRgbU \
  (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 256 * 8)
#define kCoefficientsRgbV \
  (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 2 * 256 * 8)
|
13 |
|
14 #include <emmintrin.h> |
|
15 |
|
16 static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf, |
|
17 const uint8* u_buf, |
|
18 const uint8* v_buf, |
|
19 uint8* rgb_buf, |
|
20 int width) { |
|
21 __m128i xmm0, xmmY1, xmmY2; |
|
22 __m128 xmmY; |
|
23 |
|
24 while (width >= 2) { |
|
25 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)), |
|
26 _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++))); |
|
27 |
|
28 xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++)); |
|
29 xmmY1 = _mm_adds_epi16(xmmY1, xmm0); |
|
30 |
|
31 xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++)); |
|
32 xmmY2 = _mm_adds_epi16(xmmY2, xmm0); |
|
33 |
|
34 xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), |
|
35 0x44); |
|
36 xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); |
|
37 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); |
|
38 |
|
39 _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); |
|
40 rgb_buf += 8; |
|
41 width -= 2; |
|
42 } |
|
43 |
|
44 if (width) { |
|
45 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)), |
|
46 _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf))); |
|
47 xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf)); |
|
48 xmmY1 = _mm_adds_epi16(xmmY1, xmm0); |
|
49 xmmY1 = _mm_srai_epi16(xmmY1, 6); |
|
50 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); |
|
51 *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); |
|
52 } |
|
53 } |
|
54 |
|
55 static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf, |
|
56 const uint8* u_buf, |
|
57 const uint8* v_buf, |
|
58 uint8* rgb_buf, |
|
59 int width, |
|
60 int source_dx) { |
|
61 __m128i xmm0, xmmY1, xmmY2; |
|
62 __m128 xmmY; |
|
63 uint8 u, v, y; |
|
64 int x = 0; |
|
65 |
|
66 while (width >= 2) { |
|
67 u = u_buf[x >> 17]; |
|
68 v = v_buf[x >> 17]; |
|
69 y = y_buf[x >> 16]; |
|
70 x += source_dx; |
|
71 |
|
72 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), |
|
73 _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); |
|
74 xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); |
|
75 xmmY1 = _mm_adds_epi16(xmmY1, xmm0); |
|
76 |
|
77 y = y_buf[x >> 16]; |
|
78 x += source_dx; |
|
79 |
|
80 xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); |
|
81 xmmY2 = _mm_adds_epi16(xmmY2, xmm0); |
|
82 |
|
83 xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), |
|
84 0x44); |
|
85 xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); |
|
86 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); |
|
87 |
|
88 _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); |
|
89 rgb_buf += 8; |
|
90 width -= 2; |
|
91 } |
|
92 |
|
93 if (width) { |
|
94 u = u_buf[x >> 17]; |
|
95 v = v_buf[x >> 17]; |
|
96 y = y_buf[x >> 16]; |
|
97 |
|
98 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), |
|
99 _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); |
|
100 xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); |
|
101 xmmY1 = _mm_adds_epi16(xmmY1, xmm0); |
|
102 xmmY1 = _mm_srai_epi16(xmmY1, 6); |
|
103 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); |
|
104 *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); |
|
105 } |
|
106 } |
|
107 |
|
108 static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf, |
|
109 const uint8* u_buf, |
|
110 const uint8* v_buf, |
|
111 uint8* rgb_buf, |
|
112 int width, |
|
113 int source_dx) { |
|
114 __m128i xmm0, xmmY1, xmmY2; |
|
115 __m128 xmmY; |
|
116 uint8 u0, u1, v0, v1, y0, y1; |
|
117 uint32 uv_frac, y_frac, u, v, y; |
|
118 int x = 0; |
|
119 |
|
120 if (source_dx >= 0x20000) { |
|
121 x = 32768; |
|
122 } |
|
123 |
|
124 while(width >= 2) { |
|
125 u0 = u_buf[x >> 17]; |
|
126 u1 = u_buf[(x >> 17) + 1]; |
|
127 v0 = v_buf[x >> 17]; |
|
128 v1 = v_buf[(x >> 17) + 1]; |
|
129 y0 = y_buf[x >> 16]; |
|
130 y1 = y_buf[(x >> 16) + 1]; |
|
131 uv_frac = (x & 0x1fffe); |
|
132 y_frac = (x & 0xffff); |
|
133 u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17; |
|
134 v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17; |
|
135 y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; |
|
136 x += source_dx; |
|
137 |
|
138 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), |
|
139 _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); |
|
140 xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); |
|
141 xmmY1 = _mm_adds_epi16(xmmY1, xmm0); |
|
142 |
|
143 y0 = y_buf[x >> 16]; |
|
144 y1 = y_buf[(x >> 16) + 1]; |
|
145 y_frac = (x & 0xffff); |
|
146 y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; |
|
147 x += source_dx; |
|
148 |
|
149 xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); |
|
150 xmmY2 = _mm_adds_epi16(xmmY2, xmm0); |
|
151 |
|
152 xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), |
|
153 0x44); |
|
154 xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); |
|
155 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); |
|
156 |
|
157 _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); |
|
158 rgb_buf += 8; |
|
159 width -= 2; |
|
160 } |
|
161 |
|
162 if (width) { |
|
163 u = u_buf[x >> 17]; |
|
164 v = v_buf[x >> 17]; |
|
165 y = y_buf[x >> 16]; |
|
166 |
|
167 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), |
|
168 _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); |
|
169 xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); |
|
170 |
|
171 xmmY1 = _mm_adds_epi16(xmmY1, xmm0); |
|
172 xmmY1 = _mm_srai_epi16(xmmY1, 6); |
|
173 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); |
|
174 *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); |
|
175 } |
|
176 } |
|
177 |
|
178 void FastConvertYUVToRGB32Row(const uint8* y_buf, |
|
179 const uint8* u_buf, |
|
180 const uint8* v_buf, |
|
181 uint8* rgb_buf, |
|
182 int width) { |
|
183 FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width); |
|
184 } |
|
185 |
|
186 void ScaleYUVToRGB32Row(const uint8* y_buf, |
|
187 const uint8* u_buf, |
|
188 const uint8* v_buf, |
|
189 uint8* rgb_buf, |
|
190 int width, |
|
191 int source_dx) { |
|
192 ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); |
|
193 } |
|
194 |
|
195 void LinearScaleYUVToRGB32Row(const uint8* y_buf, |
|
196 const uint8* u_buf, |
|
197 const uint8* v_buf, |
|
198 uint8* rgb_buf, |
|
199 int width, |
|
200 int source_dx) { |
|
201 LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, |
|
202 source_dx); |
|
203 } |
|
204 |
|
205 } // extern "C" |