/*
 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for Visual C x86.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 box filters: added before the >> 2.
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
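
// Note on the shuffle tables above: pshufb treats each byte of its control
// vector as a source index, and any index with the high bit set (the 128
// entries) produces zero. kShuf38a, for example, gathers source bytes
// 0,3,6,8,11,14 into the low 6 bytes of the result and zeroes the rest.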

// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrlw      xmm0, 8               // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}
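
// For reference, a scalar sketch of the kernel above (the _Ref name is
// illustrative; cf. the C fallbacks in scale.cc). src_stride is unused,
// matching the asm:
//   static void ScaleRowDown2_Ref(const uint8* src_ptr, uint8* dst_ptr,
//                                 int dst_width) {
//     for (int x = 0; x < dst_width; ++x) {
//       dst_ptr[x] = src_ptr[x * 2 + 1];  // keep odd pixels, as psrlw 8 does
//     }
//   }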

// Blends 32x1 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}
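
// The column averaging above is an idiom used throughout this file:
// psrlw 8 leaves the odd byte of each word, pand with the 0x00ff mask keeps
// the even byte, and pavgw combines them with rounding, i.e.
//   dst = (odd + even + 1) >> 1
// packuswb then narrows the 16 words back down to bytes.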

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        esi
    ret
  }
}
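
// The 2x2 box is computed as nested rounding averages: pavgb blends the two
// rows, then the column pass blends neighbors, with
//   avg(x, y) = (x + y + 1) >> 1
// The result differs from the exact (a + b + c + d + 2) >> 2 by at most 1,
// which is what lets the Box kernels stay in byte/word pavg instructions.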

// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: none. Uses unaligned loads and stores (movdqu).
__declspec(naked) __declspec(align(16))
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
                                  ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrlw      xmm0, 8               // isolate odd pixels.
    psrlw      xmm1, 8
    packuswb   xmm0, xmm1
    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 32x1 rectangle to 16x1.
// Alignment requirement: none. Uses unaligned loads and stores (movdqu).
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 32x2 rectangle to 16x1.
// Alignment requirement: none. Uses unaligned loads and stores (movdqu).
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
    psrlw      xmm5, 8

    align      4
  wloop:
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    movdqu     xmm2, [eax + esi]
    movdqu     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm5
    pand       xmm3, xmm5
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    sub        ecx, 16
    movdqu     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        esi
    ret
  }
}

// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff0000
    psrld      xmm5, 24
    pslld      xmm5, 16

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    packuswb   xmm0, xmm1
    psrlw      xmm0, 8
    packuswb   xmm0, xmm0
    sub        ecx, 8
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         wloop

    ret
  }
}
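
// Scalar sketch of the point sampler above (illustrative name). The
// 0x00ff0000 mask keeps byte 2 of each 4-byte group, so:
//   static void ScaleRowDown4_Ref(const uint8* src_ptr, uint8* dst_ptr,
//                                 int dst_width) {
//     for (int x = 0; x < dst_width; ++x) {
//       dst_ptr[x] = src_ptr[x * 4 + 2];
//     }
//   }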

// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]    // src_ptr
    mov        esi, [esp + 8 + 8]    // src_stride
    mov        edx, [esp + 8 + 12]   // dst_ptr
    mov        ecx, [esp + 8 + 16]   // dst_width
    lea        edi, [esi + esi * 2]  // src_stride * 3
    pcmpeqb    xmm7, xmm7            // generate mask 0x00ff00ff
    psrlw      xmm7, 8

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, [eax + esi * 2]
    movdqa     xmm3, [eax + esi * 2 + 16]
    movdqa     xmm4, [eax + edi]
    movdqa     xmm5, [eax + edi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm2, xmm4
    pavgb      xmm3, xmm5
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3

    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
    psrlw      xmm0, 8
    movdqa     xmm3, xmm1
    psrlw      xmm1, 8
    pand       xmm2, xmm7
    pand       xmm3, xmm7
    pavgw      xmm0, xmm2
    pavgw      xmm1, xmm3
    packuswb   xmm0, xmm1

    movdqa     xmm2, xmm0            // average columns (16 to 8 pixels)
    psrlw      xmm0, 8
    pand       xmm2, xmm7
    pavgw      xmm0, xmm2
    packuswb   xmm0, xmm0

    sub        ecx, 8
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    jg         wloop

    pop        edi
    pop        esi
    ret
  }
}

// Point samples 32 pixels to 24 pixels.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Note that movdqa+palignr may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm3, kShuf0
    movdqa     xmm4, kShuf1
    movdqa     xmm5, kShuf2

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     xmm2, xmm1
    palignr    xmm1, xmm0, 8
    pshufb     xmm0, xmm3
    pshufb     xmm1, xmm4
    pshufb     xmm2, xmm5
    movq       qword ptr [edx], xmm0
    movq       qword ptr [edx + 8], xmm1
    movq       qword ptr [edx + 16], xmm2
    lea        edx, [edx + 24]
    sub        ecx, 24
    jg         wloop

    ret
  }
}
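
// Scalar sketch of the 3/4 point sampler above (illustrative name): each
// group of 4 source pixels yields 3, dropping pixel 2, exactly the pattern
// kShuf0/kShuf1/kShuf2 encode:
//   static void ScaleRowDown34_Ref(const uint8* src_ptr, uint8* dst_ptr,
//                                  int dst_width) {
//     for (int x = 0; x < dst_width; x += 3) {
//       dst_ptr[x + 0] = src_ptr[0];
//       dst_ptr[x + 1] = src_ptr[1];
//       dst_ptr[x + 2] = src_ptr[3];
//       src_ptr += 4;
//     }
//   }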

// Blends 32x2 rectangle to 24x1.
// Produces three 8 byte values. For each 8 bytes, 16 bytes are read.
// Then shuffled to do the scaling.

// Register usage:
// xmm0 src_row 0
// xmm1 src_row 1
// xmm2 shuf 0
// xmm3 shuf 1
// xmm4 shuf 2
// xmm5 madd 0
// xmm6 madd 1
// xmm7 kRound34

// Note that movdqa+palignr may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

    align      4
  wloop:
    movdqa     xmm0, [eax]           // pixels 0..7
    movdqa     xmm1, [eax + esi]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqa     xmm0, [eax + 16]      // pixels 16..23
    movdqa     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    sub        ecx, 24
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    jg         wloop

    pop        esi
    ret
  }
}
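
// The filter arithmetic above: pshufb pairs each output pixel's two source
// bytes, pmaddubsw applies the (3,1), (2,2) or (1,3) taps from
// kMadd01/kMadd11/kMadd21, and kRound34 plus psrlw 2 finish the rounded
// divide, e.g. for a (3,1) tap:
//   dst = (3 * a + 1 * b + 2) >> 2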

// Note that movdqa+palignr may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShuf01
    movdqa     xmm3, kShuf11
    movdqa     xmm4, kShuf21
    movdqa     xmm5, kMadd01
    movdqa     xmm6, kMadd11
    movdqa     xmm7, kRound34

    align      4
  wloop:
    movdqa     xmm0, [eax]           // pixels 0..7
    movdqa     xmm1, [eax + esi]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm2
    pmaddubsw  xmm0, xmm5
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx], xmm0
    movdqu     xmm0, [eax + 8]       // pixels 8..15
    movdqu     xmm1, [eax + esi + 8]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm3
    pmaddubsw  xmm0, xmm6
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    movq       qword ptr [edx + 8], xmm0
    movdqa     xmm0, [eax + 16]      // pixels 16..23
    movdqa     xmm1, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm1, xmm0
    pavgb      xmm0, xmm1
    pshufb     xmm0, xmm4
    movdqa     xmm1, kMadd21
    pmaddubsw  xmm0, xmm1
    paddsw     xmm0, xmm7
    psrlw      xmm0, 2
    packuswb   xmm0, xmm0
    sub        ecx, 24
    movq       qword ptr [edx + 16], xmm0
    lea        edx, [edx + 24]
    jg         wloop

    pop        esi
    ret
  }
}

// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_ptr
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_ptr
    mov        ecx, [esp + 16]       // dst_width
    movdqa     xmm4, kShuf38a
    movdqa     xmm5, kShuf38b

    align      4
  xloop:
    movdqa     xmm0, [eax]           // 16 pixels -> 0,1,2,3,4,5
    movdqa     xmm1, [eax + 16]      // 16 pixels -> 6,7,8,9,10,11
    lea        eax, [eax + 32]
    pshufb     xmm0, xmm4
    pshufb     xmm1, xmm5
    paddusb    xmm0, xmm1

    sub        ecx, 12
    movq       qword ptr [edx], xmm0  // write 12 pixels
    movhlps    xmm1, xmm0
    movd       [edx + 8], xmm1
    lea        edx, [edx + 12]
    jg         xloop

    ret
  }
}

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShufAc
    movdqa     xmm3, kShufAc3
    movdqa     xmm4, kScaleAc33
    pxor       xmm5, xmm5

    align      4
  xloop:
    movdqa     xmm0, [eax]           // sum up 3 rows into xmm0/1
    movdqa     xmm6, [eax + esi]
    movhlps    xmm1, xmm0
    movhlps    xmm7, xmm6
    punpcklbw  xmm0, xmm5
    punpcklbw  xmm1, xmm5
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7
    movdqa     xmm6, [eax + esi * 2]
    lea        eax, [eax + 16]
    movhlps    xmm7, xmm6
    punpcklbw  xmm6, xmm5
    punpcklbw  xmm7, xmm5
    paddusw    xmm0, xmm6
    paddusw    xmm1, xmm7

    movdqa     xmm6, xmm0            // 8 pixels -> 0,1,2 of xmm6
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    psrldq     xmm0, 2
    paddusw    xmm6, xmm0
    pshufb     xmm6, xmm2

    movdqa     xmm7, xmm1            // 8 pixels -> 3,4,5 of xmm6
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    psrldq     xmm1, 2
    paddusw    xmm7, xmm1
    pshufb     xmm7, xmm3
    paddusw    xmm6, xmm7

    pmulhuw    xmm6, xmm4            // divide by 9,9,6, 9,9,6
    packuswb   xmm6, xmm6

    sub        ecx, 6
    movd       [edx], xmm6           // write 6 pixels
    psrlq      xmm6, 16
    movd       [edx + 2], xmm6
    lea        edx, [edx + 6]
    jg         xloop

    pop        esi
    ret
  }
}
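
// The divide above uses a fixed-point reciprocal: kScaleAc33 holds
// 65536 / 9 (and 65536 / 6), so pmulhuw computes
//   (sum * (65536 / 9)) >> 16 ~= sum / 9
// slightly low, since 65536 / 9 itself truncates.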

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked) __declspec(align(16))
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_ptr
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_ptr
    mov        ecx, [esp + 4 + 16]   // dst_width
    movdqa     xmm2, kShufAb0
    movdqa     xmm3, kShufAb1
    movdqa     xmm4, kShufAb2
    movdqa     xmm5, kScaleAb2

    align      4
  xloop:
    movdqa     xmm0, [eax]           // average 2 rows into xmm0
    pavgb      xmm0, [eax + esi]
    lea        eax, [eax + 16]

    movdqa     xmm1, xmm0            // 16 pixels -> 0,1,2,3,4,5 of xmm1
    pshufb     xmm1, xmm2
    movdqa     xmm6, xmm0
    pshufb     xmm6, xmm3
    paddusw    xmm1, xmm6
    pshufb     xmm0, xmm4
    paddusw    xmm1, xmm0

    pmulhuw    xmm1, xmm5            // divide by 3,3,2, 3,3,2
    packuswb   xmm1, xmm1

    sub        ecx, 6
    movd       [edx], xmm1           // write 6 pixels
    psrlq      xmm1, 16
    movd       [edx + 2], xmm1
    lea        edx, [edx + 6]
    jg         xloop

    pop        esi
    ret
  }
}

// Reads 16xN bytes and produces 16 shorts at a time.
// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB.
__declspec(naked) __declspec(align(16))
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width,
                       int src_height) {
  __asm {
    push       esi
    push       edi
    push       ebx
    push       ebp
    mov        esi, [esp + 16 + 4]   // src_ptr
    mov        edx, [esp + 16 + 8]   // src_stride
    mov        edi, [esp + 16 + 12]  // dst_ptr
    mov        ecx, [esp + 16 + 16]  // src_width
    mov        ebx, [esp + 16 + 20]  // src_height
    pxor       xmm4, xmm4
    dec        ebx

    align      4
  xloop:
    // first row
    movdqa     xmm0, [esi]
    lea        eax, [esi + edx]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm4
    punpckhbw  xmm1, xmm4
    lea        esi, [esi + 16]
    mov        ebp, ebx
    test       ebp, ebp
    je         ydone

    // sum remaining rows
    align      4
  yloop:
    movdqa     xmm2, [eax]           // read 16 pixels
    lea        eax, [eax + edx]      // advance to next row
    movdqa     xmm3, xmm2
    punpcklbw  xmm2, xmm4
    punpckhbw  xmm3, xmm4
    paddusw    xmm0, xmm2            // sum 16 words
    paddusw    xmm1, xmm3
    sub        ebp, 1
    jg         yloop

    align      4
  ydone:
    movdqa     [edi], xmm0
    movdqa     [edi + 16], xmm1
    lea        edi, [edi + 32]

    sub        ecx, 16
    jg         xloop

    pop        ebp
    pop        ebx
    pop        edi
    pop        esi
    ret
  }
}
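
// Scalar sketch of the row accumulator above (illustrative name). Note the
// asm uses paddusw, which saturates at 65535 rather than wrapping:
//   static void ScaleAddRows_Ref(const uint8* src_ptr, ptrdiff_t src_stride,
//                                uint16* dst_ptr, int src_width,
//                                int src_height) {
//     for (int x = 0; x < src_width; ++x) {
//       int sum = 0;
//       for (int y = 0; y < src_height; ++y) {
//         sum += src_ptr[x + y * src_stride];
//       }
//       dst_ptr[x] = (uint16)(sum);
//     }
//   }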

// Bilinear column filtering. SSSE3 version.
// TODO(fbarchard): Port to Neon
// The 2-source-pixel loads below use
//   movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
// rather than the xor ebx, ebx / mov bx pair once needed to work around
// this drmemory bug:
// https://code.google.com/p/drmemory/issues/detail?id=1396

__declspec(naked) __declspec(align(16))
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        edi, [esp + 12 + 4]   // dst_ptr
    mov        esi, [esp + 12 + 8]   // src_ptr
    mov        ecx, [esp + 12 + 12]  // dst_width
    movd       xmm2, [esp + 12 + 16] // x
    movd       xmm3, [esp + 12 + 20] // dx
    mov        eax, 0x04040000       // shuffle to line up fractions with pixel.
    movd       xmm5, eax
    pcmpeqb    xmm6, xmm6            // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1          // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2            // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0            // x0 x1
    punpckldq  xmm3, xmm3            // dx dx
    paddd      xmm3, xmm3            // dx * 2, dx * 2
    pextrw     edx, xmm2, 3          // get x1 integer. preroll

    // 2 Pixel loop.
    align      4
  xloop2:
    movdqa     xmm1, xmm2            // x0, x1 fractions.
    paddd      xmm2, xmm3            // x += dx
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm1, 9               // 7 bit fractions.
    movzx      ebx, word ptr [esi + edx]  // 2 source x1 pixels
    movd       xmm4, ebx
    pshufb     xmm1, xmm5            // 0011
    punpcklwd  xmm0, xmm4
    pxor       xmm1, xmm6            // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1            // 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1          // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3          // get x1 integer. next iteration.
    psrlw      xmm0, 7               // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0            // 8 bits, 2 pixels.
    movd       ebx, xmm0
    mov        [edi], bx
    lea        edi, [edi + 2]
    sub        ecx, 2                // 2 pixels
    jge        xloop2

    align      4
  xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    movzx      ebx, word ptr [esi + eax]  // 2 source x0 pixels
    movd       xmm0, ebx
    psrlw      xmm2, 9               // 7 bit fractions.
    pshufb     xmm2, xmm5            // 0011
    pxor       xmm2, xmm6            // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2            // 16 bit
    psrlw      xmm0, 7               // 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0            // 8 bits
    movd       ebx, xmm0
    mov        [edi], bl

    align      4
  xloop99:

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}
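
// Scalar sketch of the 7-bit bilinear math in the loop above (illustrative
// name; x and dx are 16.16 fixed point):
//   static void ScaleFilterCols_Ref(uint8* dst_ptr, const uint8* src_ptr,
//                                   int dst_width, int x, int dx) {
//     for (int j = 0; j < dst_width; ++j) {
//       int xi = x >> 16;
//       int f = (x >> 9) & 0x7f;  // top 7 bits of the fraction
//       dst_ptr[j] = (uint8)((src_ptr[xi] * (f ^ 0x7f) +
//                             src_ptr[xi + 1] * f) >> 7);
//       x += dx;
//     }
//   }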

// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]        // dst_ptr
    mov        eax, [esp + 8]        // src_ptr
    mov        ecx, [esp + 12]       // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm0
    punpckhbw  xmm1, xmm1
    sub        ecx, 32
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         wloop

    ret
  }
}

// Reads 8 pixels, throws half away and writes 4 odd pixels (1, 3, 5, 7).
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    shufps     xmm0, xmm1, 0xdd      // odd pixels
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 8x1 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    mov        eax, [esp + 4]        // src_argb
                                     // src_stride ignored
    mov        edx, [esp + 12]       // dst_argb
    mov        ecx, [esp + 16]       // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    movdqa     xmm2, xmm0
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    ret
  }
}

// Blends 8x2 rectangle to 4x1.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]    // src_argb
    mov        esi, [esp + 4 + 8]    // src_stride
    mov        edx, [esp + 4 + 12]   // dst_argb
    mov        ecx, [esp + 4 + 16]   // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        esi
    ret
  }
}

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       edi
    mov        eax, [esp + 8 + 4]    // src_argb
                                     // src_stride ignored
    mov        ebx, [esp + 8 + 12]   // src_stepx
    mov        edx, [esp + 8 + 16]   // dst_argb
    mov        ecx, [esp + 8 + 20]   // dst_width
    lea        ebx, [ebx * 4]        // src_stepx in bytes (4 bytes per pixel)
    lea        edi, [ebx + ebx * 2]  // src_stepx * 3 in bytes

    align      4
  wloop:
    movd       xmm0, [eax]
    movd       xmm1, [eax + ebx]
    punpckldq  xmm0, xmm1
    movd       xmm2, [eax + ebx * 2]
    movd       xmm3, [eax + edi]
    lea        eax, [eax + ebx * 4]
    punpckldq  xmm2, xmm3
    punpcklqdq xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        edi
    pop        ebx
    ret
  }
}
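
// Scalar sketch of the even sampler above (illustrative name): one whole
// ARGB dword is copied every src_stepx pixels.
//   static void ScaleARGBRowDownEven_Ref(const uint8* src_argb, int src_stepx,
//                                        uint8* dst_argb, int dst_width) {
//     const uint32* src = (const uint32*)(src_argb);
//     uint32* dst = (uint32*)(dst_argb);
//     for (int j = 0; j < dst_width; ++j) {
//       dst[j] = src[j * src_stepx];
//     }
//   }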

// Blends four 2x2 blocks to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  __asm {
    push       ebx
    push       esi
    push       edi
    mov        eax, [esp + 12 + 4]   // src_argb
    mov        esi, [esp + 12 + 8]   // src_stride
    mov        ebx, [esp + 12 + 12]  // src_stepx
    mov        edx, [esp + 12 + 16]  // dst_argb
    mov        ecx, [esp + 12 + 20]  // dst_width
    lea        esi, [eax + esi]      // row1 pointer
    lea        ebx, [ebx * 4]        // src_stepx in bytes (4 bytes per pixel)
    lea        edi, [ebx + ebx * 2]  // src_stepx * 3 in bytes

    align      4
  wloop:
    movq       xmm0, qword ptr [eax]  // row0 4 pairs
    movhps     xmm0, qword ptr [eax + ebx]
    movq       xmm1, qword ptr [eax + ebx * 2]
    movhps     xmm1, qword ptr [eax + edi]
    lea        eax, [eax + ebx * 4]
    movq       xmm2, qword ptr [esi]  // row1 4 pairs
    movhps     xmm2, qword ptr [esi + ebx]
    movq       xmm3, qword ptr [esi + ebx * 2]
    movhps     xmm3, qword ptr [esi + edi]
    lea        esi, [esi + ebx * 4]
    pavgb      xmm0, xmm2            // average rows
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
    shufps     xmm0, xmm1, 0x88      // even pixels
    shufps     xmm2, xmm1, 0xdd      // odd pixels
    pavgb      xmm0, xmm2
    sub        ecx, 4
    movdqa     [edx], xmm0
    lea        edx, [edx + 16]
    jg         wloop

    pop        edi
    pop        esi
    pop        ebx
    ret
  }
}

// Column scaling unfiltered. SSE2 version.
__declspec(naked) __declspec(align(16))
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  __asm {
    push       edi
    push       esi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx

    pshufd     xmm2, xmm2, 0         // x0 x0 x0 x0
    pshufd     xmm0, xmm3, 0x11      // dx 0 dx 0
    paddd      xmm2, xmm0
    paddd      xmm3, xmm3            // 0, 0, 0, dx * 2
    pshufd     xmm0, xmm3, 0x05      // dx * 2, dx * 2, 0, 0
    paddd      xmm2, xmm0            // x3 x2 x1 x0
    paddd      xmm3, xmm3            // 0, 0, 0, dx * 4
    pshufd     xmm3, xmm3, 0         // dx * 4, dx * 4, dx * 4, dx * 4

    pextrw     eax, xmm2, 1          // get x0 integer.
    pextrw     edx, xmm2, 3          // get x1 integer.

    cmp        ecx, 0
    jle        xloop99
    sub        ecx, 4
    jl         xloop49

    // 4 Pixel loop.
    align      4
  xloop4:
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixel
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixel
    pextrw     eax, xmm2, 5          // get x2 integer.
    pextrw     edx, xmm2, 7          // get x3 integer.
    paddd      xmm2, xmm3            // x += dx
    punpckldq  xmm0, xmm1            // x0 x1

    movd       xmm1, [esi + eax * 4]  // 1 source x2 pixel
    movd       xmm4, [esi + edx * 4]  // 1 source x3 pixel
    pextrw     eax, xmm2, 1          // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3          // get x1 integer. next iteration.
    punpckldq  xmm1, xmm4            // x2 x3
    punpcklqdq xmm0, xmm1            // x0 x1 x2 x3
    sub        ecx, 4                // 4 pixels
    movdqu     [edi], xmm0
    lea        edi, [edi + 16]
    jge        xloop4

    align      4
  xloop49:
    test       ecx, 2
    je         xloop29

    // 2 pixel remainder.
    movd       xmm0, [esi + eax * 4]  // 1 source x0 pixel
    movd       xmm1, [esi + edx * 4]  // 1 source x1 pixel
    pextrw     eax, xmm2, 5          // get x2 integer.
    punpckldq  xmm0, xmm1            // x0 x1

    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]

  xloop29:
    test       ecx, 1
    je         xloop99

    // 1 pixel remainder.
    movd       xmm0, [esi + eax * 4]  // 1 source x2 pixel
    movd       dword ptr [edi], xmm0
    align      4
  xloop99:

    pop        esi
    pop        edi
    ret
  }
}
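
// Scalar sketch of the unfiltered column sampler above (illustrative name):
//   static void ScaleARGBCols_Ref(uint8* dst_argb, const uint8* src_argb,
//                                 int dst_width, int x, int dx) {
//     const uint32* src = (const uint32*)(src_argb);
//     uint32* dst = (uint32*)(dst_argb);
//     for (int j = 0; j < dst_width; ++j) {
//       dst[j] = src[x >> 16];  // one ARGB pixel per movd
//       x += dx;
//     }
//   }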

// Bilinear row filtering combines 2x1 -> 1x1. SSSE3 version.
// TODO(fbarchard): Port to Neon

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

__declspec(naked) __declspec(align(16))
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  __asm {
    push       esi
    push       edi
    mov        edi, [esp + 8 + 4]    // dst_argb
    mov        esi, [esp + 8 + 8]    // src_argb
    mov        ecx, [esp + 8 + 12]   // dst_width
    movd       xmm2, [esp + 8 + 16]  // x
    movd       xmm3, [esp + 8 + 20]  // dx
    movdqa     xmm4, kShuffleColARGB
    movdqa     xmm5, kShuffleFractions
    pcmpeqb    xmm6, xmm6            // generate 0x007f for inverting fraction.
    psrlw      xmm6, 9
    pextrw     eax, xmm2, 1          // get x0 integer. preroll
    sub        ecx, 2
    jl         xloop29

    movdqa     xmm0, xmm2            // x1 = x0 + dx
    paddd      xmm0, xmm3
    punpckldq  xmm2, xmm0            // x0 x1
    punpckldq  xmm3, xmm3            // dx dx
    paddd      xmm3, xmm3            // dx * 2, dx * 2
    pextrw     edx, xmm2, 3          // get x1 integer. preroll

    // 2 Pixel loop.
    align      4
  xloop2:
    movdqa     xmm1, xmm2            // x0, x1 fractions.
    paddd      xmm2, xmm3            // x += dx
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    psrlw      xmm1, 9               // 7 bit fractions.
    movhps     xmm0, qword ptr [esi + edx * 4]  // 2 source x1 pixels
    pshufb     xmm1, xmm5            // 0000000011111111
    pshufb     xmm0, xmm4            // arrange pixels into pairs
    pxor       xmm1, xmm6            // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm1            // argb_argb 16 bit, 2 pixels.
    pextrw     eax, xmm2, 1          // get x0 integer. next iteration.
    pextrw     edx, xmm2, 3          // get x1 integer. next iteration.
    psrlw      xmm0, 7               // argb 8.7 fixed point to low 8 bits.
    packuswb   xmm0, xmm0            // argb_argb 8 bits, 2 pixels.
    movq       qword ptr [edi], xmm0
    lea        edi, [edi + 8]
    sub        ecx, 2                // 2 pixels
    jge        xloop2

    align      4
  xloop29:

    add        ecx, 2 - 1
    jl         xloop99

    // 1 pixel remainder
    psrlw      xmm2, 9               // 7 bit fractions.
    movq       xmm0, qword ptr [esi + eax * 4]  // 2 source x0 pixels
    pshufb     xmm2, xmm5            // 00000000
    pshufb     xmm0, xmm4            // arrange pixels into pairs
    pxor       xmm2, xmm6            // 0..7f and 7f..0
    pmaddubsw  xmm0, xmm2            // argb 16 bit, 1 pixel.
    psrlw      xmm0, 7
    packuswb   xmm0, xmm0            // argb 8 bits, 1 pixel.
    movd       [edi], xmm0

    align      4
  xloop99:

    pop        edi
    pop        esi
    ret
  }
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
__declspec(naked) __declspec(align(16))
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  __asm {
    mov        edx, [esp + 4]        // dst_argb
    mov        eax, [esp + 8]        // src_argb
    mov        ecx, [esp + 12]       // dst_width

    align      4
  wloop:
    movdqa     xmm0, [eax]
    lea        eax, [eax + 16]
    movdqa     xmm1, xmm0
    punpckldq  xmm0, xmm0
    punpckhdq  xmm1, xmm1
    sub        ecx, 8
    movdqa     [edx], xmm0
    movdqa     [edx + 16], xmm1
    lea        edx, [edx + 32]
    jg         wloop

    ret
  }
}

// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]        // num
    cdq                              // extend num to 64 bits
    shld       edx, eax, 16          // 32.16
    shl        eax, 16
    idiv       dword ptr [esp + 8]
    ret
  }
}
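
// C equivalent of the shld/idiv sequence above (a sketch; int64 as in
// libyuv/basic_types.h):
//   static int FixedDiv_Ref(int num, int div) {
//     return (int)((((int64)(num)) << 16) / div);
//   }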

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv1_X86(int num, int div) {
  __asm {
    mov        eax, [esp + 4]        // num
    mov        ecx, [esp + 8]        // denom
    cdq                              // extend num to 64 bits
    shld       edx, eax, 16          // 32.16
    shl        eax, 16
    sub        eax, 0x00010001
    sbb        edx, 0
    sub        ecx, 1
    idiv       ecx
    ret
  }
}
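
// C equivalent of the biased divide above (a sketch): subtracting 0x00010001
// turns (num << 16) into ((num - 1) << 16) - 1 before dividing by div - 1:
//   static int FixedDiv1_Ref(int num, int div) {
//     return (int)(((((int64)(num)) << 16) - 0x00010001) / (div - 1));
//   }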

#endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif