/*
 *  Copyright 2013 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

// Offsets for source bytes 0 to 9
static uvec8 kShuf0 =
  { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
static uvec8 kShuf1 =
  { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf2 =
  { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };

// Offsets for source bytes 0 to 10
static uvec8 kShuf01 =
  { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };

// Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
static uvec8 kShuf11 =
  { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };

// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
static uvec8 kShuf21 =
  { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };

// Coefficients for source bytes 0 to 10
static uvec8 kMadd01 =
  { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };

// Coefficients for source bytes 10 to 21
static uvec8 kMadd11 =
  { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };

// Coefficients for source bytes 21 to 31
static uvec8 kMadd21 =
  { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };

// Rounding constant for the 3/4 filters (round to nearest before >> 2).
static vec16 kRound34 =
  { 2, 2, 2, 2, 2, 2, 2, 2 };

static uvec8 kShuf38a =
  { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

static uvec8 kShuf38b =
  { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 0,1,2
static uvec8 kShufAc =
  { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };

// Arrange words 0,3,6 into 3,4,5
static uvec8 kShufAc3 =
  { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };

// Scaling values for boxes of 3x3 and 2x3
static uvec16 kScaleAc33 =
  { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };

// Arrange first value for pixels 0,1,2,3,4,5
static uvec8 kShufAb0 =
  { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };

// Arrange second value for pixels 0,1,2,3,4,5
static uvec8 kShufAb1 =
  { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };

// Arrange third value for pixels 0,1,2,3,4,5
static uvec8 kShufAb2 =
  { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };

// Scaling values for boxes of 3x2 and 2x2
static uvec16 kScaleAb2 =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
// objdump -D yuvscaler.obj >yuvscaler.txt

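// The loops below all follow the same shape: load 16 or 32 source bytes,
// combine them in xmm registers, store the narrowed result and count
// dst_width down to zero.  As a rough scalar sketch of the first one
// (illustrative only, not compiled; the helper name is hypothetical),
// ScaleRowDown2_SSE2 keeps every second source pixel:
#if 0
static void ScaleRowDown2_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                   int dst_width) {
  // psrlw $8 + packuswb keep the high byte of each 16-bit pair,
  // i.e. the odd-numbered source pixel.
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];
  }
}
#endif
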
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                              uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm5,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm2)  // movdqa (%0,%3,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,3,1,xmm3)  // movdqa 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm5,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

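// Scalar sketch of the 2x2 box filter above (illustrative only, not
// compiled; the helper name is hypothetical).  The SSE2 loop reaches the
// same value with pavgb/pavgw, whose cascaded round-to-nearest steps can
// differ from this single rounded sum by at most one.
#if 0
static void ScaleRowDown2Box_C_Sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}
#endif
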
void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint8* dst_ptr, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "psrlw $0x8,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm5,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrlw $0x8,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqu,0x00,0,3,1,xmm2)  // movdqu (%0,%3,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqu,0x10,0,3,1,xmm3)  // movdqu 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm5,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                        uint8* dst_ptr, int dst_width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "psrld $0x18,%%xmm5 \n"
    "pslld $0x10,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pand %%xmm5,%%xmm0 \n"
    "pand %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "psrlw $0x8,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

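// Scalar sketch (illustrative only, not compiled): the 0x00FF0000 mask and
// the two packs in ScaleRowDown4_SSE2 keep byte 2 of every 4 source bytes.
#if 0
static void ScaleRowDown4_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                   int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 4 + 2];
  }
}
#endif
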
void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                           uint8* dst_ptr, int dst_width) {
  intptr_t stridex3 = 0;
  asm volatile (
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psrlw $0x8,%%xmm7 \n"
    "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    MEMOPREG(movdqa,0x00,0,4,1,xmm2)  // movdqa (%0,%4,1),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,4,1,xmm3)  // movdqa 0x10(%0,%4,1),%%xmm3
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    MEMOPREG(movdqa,0x00,0,4,2,xmm2)  // movdqa (%0,%4,2),%%xmm2
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,4,2,xmm3)  // movdqa 0x10(%0,%4,2),%%xmm3
    MEMOPREG(movdqa,0x00,0,3,1,xmm4)  // movdqa (%0,%3,1),%%xmm4
    MEMOPREG(movdqa,0x10,0,3,1,xmm5)  // movdqa 0x10(%0,%3,1),%%xmm5
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm4,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm5,%%xmm3 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm3 \n"
    "psrlw $0x8,%%xmm1 \n"
    "pand %%xmm7,%%xmm2 \n"
    "pand %%xmm7,%%xmm3 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "pavgw %%xmm3,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psrlw $0x8,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "pavgw %%xmm2,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(stridex3)    // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
#endif
  );
}

void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm3 \n"
    "movdqa %1,%%xmm4 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kShuf0),  // %0
    "m"(kShuf1),  // %1
    "m"(kShuf2)   // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "palignr $0x8,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm3,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
    "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

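// Scalar sketch (illustrative only, not compiled): the three shuffles in
// ScaleRowDown34_SSSE3 keep bytes 0, 1 and 3 of every 4 source bytes,
// writing 24 output pixels per 32 input pixels.
#if 0
static void ScaleRowDown34_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                    int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[1];
    dst_ptr[x + 2] = src_ptr[3];
    src_ptr += 4;
  }
}
#endif
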
void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"  // kShuf01
    "movdqa %1,%%xmm3 \n"  // kShuf11
    "movdqa %2,%%xmm4 \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa %0,%%xmm5 \n"  // kMadd01
    "movdqa %1,%%xmm0 \n"  // kMadd11
    "movdqa %2,%%xmm1 \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm6 \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm7)  // movdqa (%0,%3),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm5,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)  // movdqu 0x8(%0,%3),%%xmm7
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "pmaddubsw %%xmm0,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x10,0,3,1,xmm7)  // movdqa 0x10(%0,%3),%%xmm7
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm4,%%xmm6 \n"
    "pmaddubsw %4,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"  // kShuf01
    "movdqa %1,%%xmm3 \n"  // kShuf11
    "movdqa %2,%%xmm4 \n"  // kShuf21
  :
  : "m"(kShuf01),  // %0
    "m"(kShuf11),  // %1
    "m"(kShuf21)   // %2
  );
  asm volatile (
    "movdqa %0,%%xmm5 \n"  // kMadd01
    "movdqa %1,%%xmm0 \n"  // kMadd11
    "movdqa %2,%%xmm1 \n"  // kRound34
  :
  : "m"(kMadd01),  // %0
    "m"(kMadd11),  // %1
    "m"(kRound34)  // %2
  );

  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm6 \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm7)  // movdqa (%0,%3,1),%%xmm7
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm5,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS(1) " \n"
    "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
    MEMOPREG(movdqu,0x8,0,3,1,xmm7)  // movdqu 0x8(%0,%3,1),%%xmm7
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "pmaddubsw %%xmm0,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
    MEMOPREG(movdqa,0x10,0,3,1,xmm7)  // movdqa 0x10(%0,%3,1),%%xmm7
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm6 \n"
    "pshufb %%xmm4,%%xmm6 \n"
    "pmaddubsw %4,%%xmm6 \n"
    "paddsw %%xmm1,%%xmm6 \n"
    "psrlw $0x2,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x18,1) ",%1 \n"
    "sub $0x18,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride)),  // %3
    "m"(kMadd21)     // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

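// What the two filtered 3/4 loops above compute per group of 4 source
// pixels a,b,c,d (sketch of the arithmetic, not build code): the kShufNN
// tables pair the neighbours, the kMaddNN tables weight them and kRound34
// rounds the >> 2:
//   dst0 = (3*a + 1*b + 2) >> 2
//   dst1 = (2*b + 2*c + 2) >> 2
//   dst2 = (1*c + 3*d + 2) >> 2
// The *_1_Box variant first averages the two source rows with pavgb; the
// *_0_Box variant weights the rows roughly 3:1 toward the src_ptr row via
// the double pavgb.
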
void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
                          uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %3,%%xmm4 \n"
    "movdqa %4,%%xmm5 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "paddusb %%xmm1,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "movhlps %%xmm0,%%xmm1 \n"
    "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
    "lea " MEMLEA(0xc,1) ",%1 \n"
    "sub $0xc,%2 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "m"(kShuf38a),   // %3
    "m"(kShuf38b)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm4", "xmm5"
#endif
  );
}

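// Scalar sketch (illustrative only, not compiled): the two shuffles in
// ScaleRowDown38_SSSE3 pick bytes 0, 3 and 6 of every 8 source bytes,
// i.e. 12 output pixels per 32 input pixels.
#if 0
static void ScaleRowDown38_C_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                    int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst_ptr[x + 0] = src_ptr[0];
    dst_ptr[x + 1] = src_ptr[3];
    dst_ptr[x + 2] = src_ptr[6];
    src_ptr += 8;
  }
}
#endif
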
void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm4 \n"
    "movdqa %3,%%xmm5 \n"
  :
  : "m"(kShufAb0),  // %0
    "m"(kShufAb1),  // %1
    "m"(kShufAb2),  // %2
    "m"(kScaleAb2)  // %3
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(pavgb,0x00,0,3,1,xmm0)  // pavgb (%0,%3,1),%%xmm0
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "pshufb %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm6 \n"
    "pshufb %%xmm3,%%xmm6 \n"
    "paddusw %%xmm6,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm1 \n"
    "sub $0x6,%2 \n"
    "movd %%xmm1," MEMACCESS(1) " \n"
    "psrlq $0x10,%%xmm1 \n"
    "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
    "lea " MEMLEA(0x6,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
  asm volatile (
    "movdqa %0,%%xmm2 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm4 \n"
    "pxor %%xmm5,%%xmm5 \n"
  :
  : "m"(kShufAc),    // %0
    "m"(kShufAc3),   // %1
    "m"(kScaleAc33)  // %2
  );
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movdqa,0x00,0,3,1,xmm6)  // movdqa (%0,%3,1),%%xmm6
    "movhlps %%xmm0,%%xmm1 \n"
    "movhlps %%xmm6,%%xmm7 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm6 \n"
    "punpcklbw %%xmm5,%%xmm7 \n"
    "paddusw %%xmm6,%%xmm0 \n"
    "paddusw %%xmm7,%%xmm1 \n"
    MEMOPREG(movdqa,0x00,0,3,2,xmm6)  // movdqa (%0,%3,2),%%xmm6
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movhlps %%xmm6,%%xmm7 \n"
    "punpcklbw %%xmm5,%%xmm6 \n"
    "punpcklbw %%xmm5,%%xmm7 \n"
    "paddusw %%xmm6,%%xmm0 \n"
    "paddusw %%xmm7,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm6 \n"
    "psrldq $0x2,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm6 \n"
    "psrldq $0x2,%%xmm0 \n"
    "paddusw %%xmm0,%%xmm6 \n"
    "pshufb %%xmm2,%%xmm6 \n"
    "movdqa %%xmm1,%%xmm7 \n"
    "psrldq $0x2,%%xmm1 \n"
    "paddusw %%xmm1,%%xmm7 \n"
    "psrldq $0x2,%%xmm1 \n"
    "paddusw %%xmm1,%%xmm7 \n"
    "pshufb %%xmm3,%%xmm7 \n"
    "paddusw %%xmm7,%%xmm6 \n"
    "pmulhuw %%xmm4,%%xmm6 \n"
    "packuswb %%xmm6,%%xmm6 \n"
    "sub $0x6,%2 \n"
    "movd %%xmm6," MEMACCESS(1) " \n"
    "psrlq $0x10,%%xmm6 \n"
    "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
    "lea " MEMLEA(0x6,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_ptr),   // %0
    "+r"(dst_ptr),   // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

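// Both 38_Box filters divide their box sums with a reciprocal multiply:
// pmulhuw by 65536/9 (or 65536/6, 65536/3, 65536/2) returns the high 16
// bits of sum * (65536/n), which approximates sum / n without a divide.
// For example, a full 3x3 box sum s becomes (s * 7281) >> 16, i.e. roughly
// s / 9 (sketch of the arithmetic, not build code).
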
void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                       uint16* dst_ptr, int src_width, int src_height) {
  int tmp_height = 0;
  intptr_t tmp_src = 0;
  asm volatile (
    "pxor %%xmm4,%%xmm4 \n"
    "sub $0x1,%5 \n"

    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "mov %0,%3 \n"
    "add %6,%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm4,%%xmm0 \n"
    "punpckhbw %%xmm4,%%xmm1 \n"
    "mov %5,%2 \n"
    "test %2,%2 \n"
    "je 3f \n"

    LABELALIGN
    "2: \n"
    "movdqa " MEMACCESS(0) ",%%xmm2 \n"
    "add %6,%0 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "punpcklbw %%xmm4,%%xmm2 \n"
    "punpckhbw %%xmm4,%%xmm3 \n"
    "paddusw %%xmm2,%%xmm0 \n"
    "paddusw %%xmm3,%%xmm1 \n"
    "sub $0x1,%2 \n"
    "jg 2b \n"

    LABELALIGN
    "3: \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x10,3) ",%0 \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_ptr),     // %0
    "+r"(dst_ptr),     // %1
    "+r"(tmp_height),  // %2
    "+r"(tmp_src),     // %3
    "+r"(src_width),   // %4
    "+rm"(src_height)  // %5
  : "rm"((intptr_t)(src_stride))  // %6
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

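// Scalar sketch of ScaleAddRows_SSE2 (illustrative only, not compiled):
// accumulate src_height rows into 16-bit column sums, saturating the same
// way paddusw does.
#if 0
static void ScaleAddRows_C_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint16* dst_ptr, int src_width,
                                  int src_height) {
  for (int x = 0; x < src_width; ++x) {
    int sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += src_ptr[x + y * src_stride];
    }
    dst_ptr[x] = (uint16)(sum > 65535 ? 65535 : sum);
  }
}
#endif
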
// Bilinear column filtering. SSSE3 version.
void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                           int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
  asm volatile (
    "movd %6,%%xmm2 \n"
    "movd %7,%%xmm3 \n"
    "movl $0x04040000,%k2 \n"
    "movd %k2,%%xmm5 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x9,%%xmm6 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "subl $0x2,%5 \n"
    "jl 29f \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "punpckldq %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm3,%%xmm3 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"

    LABELALIGN
    "2: \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)  // movzwl (%1,%3,1),%k2
    "movd %k2,%%xmm0 \n"
    "psrlw $0x9,%%xmm1 \n"
    BUNDLEALIGN
    MEMOPARG(movzwl,0x00,1,4,1,k2)  // movzwl (%1,%4,1),%k2
    "movd %k2,%%xmm4 \n"
    "pshufb %%xmm5,%%xmm1 \n"
    "punpcklwd %%xmm4,%%xmm0 \n"
    "pxor %%xmm6,%%xmm1 \n"
    "pmaddubsw %%xmm1,%%xmm0 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k2 \n"
    "mov %w2," MEMACCESS(0) " \n"
    "lea " MEMLEA(0x2,0) ",%0 \n"
    "sub $0x2,%5 \n"
    "jge 2b \n"

    LABELALIGN
    "29: \n"
    "addl $0x1,%5 \n"
    "jl 99f \n"
    MEMOPARG(movzwl,0x00,1,3,1,k2)  // movzwl (%1,%3,1),%k2
    "movd %k2,%%xmm0 \n"
    "psrlw $0x9,%%xmm2 \n"
    "pshufb %%xmm5,%%xmm2 \n"
    "pxor %%xmm6,%%xmm2 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0,%k2 \n"
    "mov %b2," MEMACCESS(0) " \n"
    "99: \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+a"(temp_pixel),  // %2
    "+r"(x0),          // %3
    "+r"(x1),          // %4
    "+rm"(dst_width)   // %5
  : "rm"(x),   // %6
    "rm"(dx)   // %7
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

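// Scalar sketch of the bilinear column filter above (illustrative only,
// not compiled): x and dx are 16.16 fixed point source positions.  The
// SSSE3 loop uses only the top 7 fraction bits (f and f ^ 0x7f as
// pmaddubsw weights), so its rounding can differ slightly from this
// full-precision form.
#if 0
static void ScaleFilterCols_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     int dst_width, int x, int dx) {
  for (int j = 0; j < dst_width; ++j) {
    int xi = x >> 16;
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8)(a + (((b - a) * (x & 0xffff)) >> 16));
    x += dx;
  }
}
#endif
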
// Reads 16 pixels, duplicates them and writes 32 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                       int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(1) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "punpckhbw %%xmm1,%%xmm1 \n"
    "sub $0x20,%2 \n"
    "movdqa %%xmm0," MEMACCESS(0) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "jg 1b \n"

  : "+r"(dst_ptr),   // %0
    "+r"(src_ptr),   // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
                            ptrdiff_t src_stride,
                            uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "shufps $0xdd,%%xmm1,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride,
                                  uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}

void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                               ptrdiff_t src_stride,
                               uint8* dst_argb, int dst_width) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,3,1,xmm2)  // movdqa (%0,%3,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,3,1,xmm3)  // movdqa 0x10(%0,%3,1),%%xmm3
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "sub $0x4,%2 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(dst_width)  // %2
  : "r"((intptr_t)(src_stride))  // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
                               int src_stepx,
                               uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12 = 0;
  asm volatile (
    "lea " MEMLEA3(0x00,1,4) ",%1 \n"
    "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
    LABELALIGN
    "1: \n"
    "movd " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)  // movd (%0,%1,1),%%xmm1
    "punpckldq %%xmm1,%%xmm0 \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,0,1,2,xmm2)  // movd (%0,%1,2),%%xmm2
    MEMOPREG(movd,0x00,0,4,1,xmm3)  // movd (%0,%4,1),%%xmm3
    "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
    "punpckldq %%xmm3,%%xmm2 \n"
    "punpcklqdq %%xmm2,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),      // %0
    "+r"(src_stepx_x4),  // %1
    "+r"(dst_argb),      // %2
    "+r"(dst_width),     // %3
    "+r"(src_stepx_x12)  // %4
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

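// Scalar sketch of ScaleARGBRowDownEven_SSE2 (illustrative only, not
// compiled): copy every src_stepx-th ARGB pixel.
#if 0
static void ScaleARGBRowDownEven_C_Sketch(const uint8* src_argb,
                                          int src_stepx,
                                          uint8* dst_argb, int dst_width) {
  const uint32* src = (const uint32*)(src_argb);
  uint32* dst = (uint32*)(dst_argb);
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src[x * src_stepx];
  }
}
#endif
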
// Blends four 2x2 to 4x1.
// Alignment requirement: dst_argb 16 byte aligned.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride, int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
  intptr_t src_stepx_x12 = 0;
  intptr_t row1 = (intptr_t)(src_stride);
  asm volatile (
    "lea " MEMLEA3(0x00,1,4) ",%1 \n"
    "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
    "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"

    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movhps,0x00,0,1,1,xmm0)  // movhps (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1)    // movq (%0,%1,2),%%xmm1
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,0,4,1,xmm1)  // movhps (%0,%4,1),%%xmm1
    "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
    "movq " MEMACCESS(5) ",%%xmm2 \n"
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,5,1,1,xmm2)  // movhps (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3)    // movq (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3)  // movhps (%5,%4,1),%%xmm3
    "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+rm"(dst_width),     // %3
    "+r"(src_stepx_x12),  // %4
    "+r"(row1)            // %5
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0;
  asm volatile (
    "movd %5,%%xmm2 \n"
    "movd %6,%%xmm3 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    "pshufd $0x11,%%xmm3,%%xmm0 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pshufd $0x5,%%xmm3,%%xmm0 \n"
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pextrw $0x1,%%xmm2,%k0 \n"
    "pextrw $0x3,%%xmm2,%k1 \n"
    "cmp $0x0,%4 \n"
    "jl 99f \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"

    LABELALIGN
    "40: \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)  // movd (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)  // movd (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "pextrw $0x7,%%xmm2,%k1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    MEMOPREG(movd,0x00,3,0,4,xmm1)  // movd (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4)  // movd (%3,%1,4),%%xmm4
    "pextrw $0x1,%%xmm2,%k0 \n"
    "pextrw $0x3,%%xmm2,%k1 \n"
    "punpckldq %%xmm4,%%xmm1 \n"
    "punpcklqdq %%xmm1,%%xmm0 \n"
    "sub $0x4,%4 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "jge 40b \n"

    "49: \n"
    "test $0x2,%4 \n"
    "je 29f \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,3,0,4,xmm0)  // movd (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)  // movd (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x8,2) ",%2 \n"
    "29: \n"
    "test $0x1,%4 \n"
    "je 99f \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)  // movd (%3,%0,4),%%xmm0
    "movd %%xmm0," MEMACCESS(2) " \n"
    "99: \n"
  : "+a"(x0),        // %0
    "+d"(x1),        // %1
    "+r"(dst_argb),  // %2
    "+r"(src_argb),  // %3
    "+r"(dst_width)  // %4
  : "rm"(x),   // %5
    "rm"(dx)   // %6
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(1) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpckldq %%xmm0,%%xmm0 \n"
    "punpckhdq %%xmm1,%%xmm1 \n"
    "sub $0x8,%2 \n"
    "movdqa %%xmm0," MEMACCESS(0) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "jg 1b \n"

  : "+r"(dst_argb),  // %0
    "+r"(src_argb),  // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

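// Sketch of the blend performed below (not build code): kShuffleColARGB
// lines each channel of the pixel at xi up against the same channel of the
// pixel at xi + 1, kShuffleFractions broadcasts the top 7 fraction bits f
// of x, and pmaddubsw forms
//   src[xi] * (f ^ 0x7f) + src[xi + 1] * f
// per channel; psrlw $7 then scales the sum back to 8 bits.
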
// Bilinear column filtering of ARGB, combining 4x2 -> 4x1. SSSE3 version.
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0;
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm5 \n"
  :
  : "m"(kShuffleColARGB),   // %0
    "m"(kShuffleFractions)  // %1
  );

  asm volatile (
    "movd %5,%%xmm2 \n"
    "movd %6,%%xmm3 \n"
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x9,%%xmm6 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "sub $0x2,%2 \n"
    "jl 29f \n"
    "movdqa %%xmm2,%%xmm0 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "punpckldq %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm3,%%xmm3 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"

    LABELALIGN
    "2: \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    MEMOPREG(movq,0x00,1,3,4,xmm0)  // movq (%1,%3,4),%%xmm0
    "psrlw $0x9,%%xmm1 \n"
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,1,4,4,xmm0)  // movhps (%1,%4,4),%%xmm0
    "pshufb %%xmm5,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pxor %%xmm6,%%xmm1 \n"
    "pmaddubsw %%xmm1,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "pextrw $0x1,%%xmm2,%k3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(0) " \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "sub $0x2,%2 \n"
    "jge 2b \n"

    LABELALIGN
    "29: \n"
    "add $0x1,%2 \n"
    "jl 99f \n"
    "psrlw $0x9,%%xmm2 \n"
    BUNDLEALIGN
    MEMOPREG(movq,0x00,1,3,4,xmm0)  // movq (%1,%3,4),%%xmm0
    "pshufb %%xmm5,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pxor %%xmm6,%%xmm2 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0," MEMACCESS(0) " \n"

    LABELALIGN
    "99: \n"
  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+rm"(dst_width),  // %2
    "+r"(x0),          // %3
    "+r"(x1)           // %4
  : "rm"(x),   // %5
    "rm"(dx)   // %6
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
  asm volatile (
    "cdq \n"
    "shld $0x10,%%eax,%%edx \n"
    "shl $0x10,%%eax \n"
    "idiv %1 \n"
    "mov %0, %%eax \n"
  : "+a"(num)  // %0
  : "c"(div)   // %1
  : "memory", "cc", "edx"
  );
  return num;
}

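// Portable sketch of the same 16.16 fixed point division (illustrative
// only, not compiled; assumes the quotient fits in 32 bits): the shld/shl
// pair forms num << 16 in edx:eax before the signed divide.
#if 0
static int FixedDiv_C_Sketch(int num, int div) {
  return (int)(((long long)(num) << 16) / div);
}
#endif
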
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
  asm volatile (
    "cdq \n"
    "shld $0x10,%%eax,%%edx \n"
    "shl $0x10,%%eax \n"
    "sub $0x10001,%%eax \n"
    "sbb $0x0,%%edx \n"
    "sub $0x1,%1 \n"
    "idiv %1 \n"
    "mov %0, %%eax \n"
  : "+a"(num)  // %0
  : "c"(div)   // %1
  : "memory", "cc", "edx"
  );
  return num;
}

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif