/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
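// vtbl shuffle indices: result byte i is taken from source byte
// kVTbl4x4Transpose[i], so {0, 4, 8, 12, ...} gathers the columns of a
// 4x4 byte block. Used by the 4x8 residual path below.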
static uvec8 kVTbl4x4Transpose =
  { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

void TransposeWx8_NEON(const uint8* src, int src_stride,
                       uint8* dst, int dst_stride,
                       int width) {
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allows for this
    "sub %4, #8 \n"

    // handle 8x8 blocks. this should be the majority of the plane
    ".p2align 2 \n"
    "1: \n"
    "mov r9, %0 \n"

    "vld1.8 {d0}, [r9], %1 \n"
    "vld1.8 {d1}, [r9], %1 \n"
    "vld1.8 {d2}, [r9], %1 \n"
    "vld1.8 {d3}, [r9], %1 \n"
    "vld1.8 {d4}, [r9], %1 \n"
    "vld1.8 {d5}, [r9], %1 \n"
    "vld1.8 {d6}, [r9], %1 \n"
    "vld1.8 {d7}, [r9] \n"

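    // transpose the 8x8 block in registers: vtrn.8/.16/.32 exchange 1-, 2-
    // and 4-byte sub-blocks between register pairs, then vrev16.8 swaps the
    // bytes within each 16-bit lane to finish the transpose.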
47 "vtrn.8 d1, d0 \n" |
|
48 "vtrn.8 d3, d2 \n" |
|
49 "vtrn.8 d5, d4 \n" |
|
50 "vtrn.8 d7, d6 \n" |
|
51 |
|
52 "vtrn.16 d1, d3 \n" |
|
53 "vtrn.16 d0, d2 \n" |
|
54 "vtrn.16 d5, d7 \n" |
|
55 "vtrn.16 d4, d6 \n" |
|
56 |
|
57 "vtrn.32 d1, d5 \n" |
|
58 "vtrn.32 d0, d4 \n" |
|
59 "vtrn.32 d3, d7 \n" |
|
60 "vtrn.32 d2, d6 \n" |
|
61 |
|
62 "vrev16.8 q0, q0 \n" |
|
63 "vrev16.8 q1, q1 \n" |
|
64 "vrev16.8 q2, q2 \n" |
|
65 "vrev16.8 q3, q3 \n" |
|
66 |
|
67 "mov r9, %2 \n" |
|
68 |
|
69 "vst1.8 {d1}, [r9], %3 \n" |
|
70 "vst1.8 {d0}, [r9], %3 \n" |
|
71 "vst1.8 {d3}, [r9], %3 \n" |
|
72 "vst1.8 {d2}, [r9], %3 \n" |
|
73 "vst1.8 {d5}, [r9], %3 \n" |
|
74 "vst1.8 {d4}, [r9], %3 \n" |
|
75 "vst1.8 {d7}, [r9], %3 \n" |
|
76 "vst1.8 {d6}, [r9] \n" |
|
77 |
|
78 "add %0, #8 \n" // src += 8 |
|
79 "add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride |
|
80 "subs %4, #8 \n" // w -= 8 |
|
81 "bge 1b \n" |
|
82 |
|
83 // add 8 back to counter. if the result is 0 there are |
|
84 // no residuals. |
|
85 "adds %4, #8 \n" |
|
86 "beq 4f \n" |
|
87 |
|
88 // some residual, so between 1 and 7 lines left to transpose |
|
89 "cmp %4, #2 \n" |
|
90 "blt 3f \n" |
|
91 |
|
92 "cmp %4, #4 \n" |
|
93 "blt 2f \n" |
|
94 |
|
95 // 4x8 block |
|
96 "mov r9, %0 \n" |
|
97 "vld1.32 {d0[0]}, [r9], %1 \n" |
|
98 "vld1.32 {d0[1]}, [r9], %1 \n" |
|
99 "vld1.32 {d1[0]}, [r9], %1 \n" |
|
100 "vld1.32 {d1[1]}, [r9], %1 \n" |
|
101 "vld1.32 {d2[0]}, [r9], %1 \n" |
|
102 "vld1.32 {d2[1]}, [r9], %1 \n" |
|
103 "vld1.32 {d3[0]}, [r9], %1 \n" |
|
104 "vld1.32 {d3[1]}, [r9] \n" |
|
105 |
|
106 "mov r9, %2 \n" |
|
107 |
|
108 "vld1.8 {q3}, [%5] \n" |
|
109 |
|
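    // q3 now holds kVTbl4x4Transpose. Each vtbl.8 gathers one transposed
    // 4x4 sub-block: d4/d5 become the transposed top four rows, d0/d1 the
    // transposed bottom four rows.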
110 "vtbl.8 d4, {d0, d1}, d6 \n" |
|
111 "vtbl.8 d5, {d0, d1}, d7 \n" |
|
112 "vtbl.8 d0, {d2, d3}, d6 \n" |
|
113 "vtbl.8 d1, {d2, d3}, d7 \n" |
|
114 |
|
115 // TODO(frkoenig): Rework shuffle above to |
|
116 // write out with 4 instead of 8 writes. |
|
117 "vst1.32 {d4[0]}, [r9], %3 \n" |
|
118 "vst1.32 {d4[1]}, [r9], %3 \n" |
|
119 "vst1.32 {d5[0]}, [r9], %3 \n" |
|
120 "vst1.32 {d5[1]}, [r9] \n" |
|
121 |
|
122 "add r9, %2, #4 \n" |
|
123 "vst1.32 {d0[0]}, [r9], %3 \n" |
|
124 "vst1.32 {d0[1]}, [r9], %3 \n" |
|
125 "vst1.32 {d1[0]}, [r9], %3 \n" |
|
126 "vst1.32 {d1[1]}, [r9] \n" |
|
127 |
|
128 "add %0, #4 \n" // src += 4 |
|
129 "add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride |
|
130 "subs %4, #4 \n" // w -= 4 |
|
131 "beq 4f \n" |
|
132 |
|
133 // some residual, check to see if it includes a 2x8 block, |
|
134 // or less |
|
135 "cmp %4, #2 \n" |
|
136 "blt 3f \n" |
|
137 |
|
138 // 2x8 block |
|
139 "2: \n" |
|
140 "mov r9, %0 \n" |
|
141 "vld1.16 {d0[0]}, [r9], %1 \n" |
|
142 "vld1.16 {d1[0]}, [r9], %1 \n" |
|
143 "vld1.16 {d0[1]}, [r9], %1 \n" |
|
144 "vld1.16 {d1[1]}, [r9], %1 \n" |
|
145 "vld1.16 {d0[2]}, [r9], %1 \n" |
|
146 "vld1.16 {d1[2]}, [r9], %1 \n" |
|
147 "vld1.16 {d0[3]}, [r9], %1 \n" |
|
148 "vld1.16 {d1[3]}, [r9] \n" |
|
149 |
|
150 "vtrn.8 d0, d1 \n" |
|
151 |
|
152 "mov r9, %2 \n" |
|
153 |
|
154 "vst1.64 {d0}, [r9], %3 \n" |
|
155 "vst1.64 {d1}, [r9] \n" |
|
156 |
|
157 "add %0, #2 \n" // src += 2 |
|
158 "add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride |
|
159 "subs %4, #2 \n" // w -= 2 |
|
160 "beq 4f \n" |
|
161 |
|
162 // 1x8 block |
|
163 "3: \n" |
|
164 "vld1.8 {d0[0]}, [%0], %1 \n" |
|
165 "vld1.8 {d0[1]}, [%0], %1 \n" |
|
166 "vld1.8 {d0[2]}, [%0], %1 \n" |
|
167 "vld1.8 {d0[3]}, [%0], %1 \n" |
|
168 "vld1.8 {d0[4]}, [%0], %1 \n" |
|
169 "vld1.8 {d0[5]}, [%0], %1 \n" |
|
170 "vld1.8 {d0[6]}, [%0], %1 \n" |
|
171 "vld1.8 {d0[7]}, [%0] \n" |
|
172 |
|
173 "vst1.64 {d0}, [%2] \n" |
|
174 |
|
175 "4: \n" |
|
176 |
|
177 : "+r"(src), // %0 |
|
178 "+r"(src_stride), // %1 |
|
179 "+r"(dst), // %2 |
|
180 "+r"(dst_stride), // %3 |
|
181 "+r"(width) // %4 |
|
182 : "r"(&kVTbl4x4Transpose) // %5 |
|
183 : "memory", "cc", "r9", "q0", "q1", "q2", "q3" |
|
184 ); |
|
185 } |
|
186 |
|
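// vtbl shuffle indices that interleave the two source D registers byte by
// byte ({d0[0], d1[0], d0[1], d1[1], ...}); used by the 4x8 residual path
// in TransposeUVWx8_NEON below.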
static uvec8 kVTbl4x4TransposeDi =
  { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };

void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                         uint8* dst_a, int dst_stride_a,
                         uint8* dst_b, int dst_stride_b,
                         int width) {
  asm volatile (
    // loops are on blocks of 8. loop will stop when
    // counter gets to or below 0. starting the counter
    // at w-8 allows for this
    "sub %6, #8 \n"

    // handle 8x8 blocks. this should be the majority of the plane
    ".p2align 2 \n"
    "1: \n"
    "mov r9, %0 \n"

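    // vld2.8 de-interleaves each row of the UV plane: even bytes (U) land in
    // the first register of each pair, odd bytes (V) in the second.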
205 "vld2.8 {d0, d1}, [r9], %1 \n" |
|
206 "vld2.8 {d2, d3}, [r9], %1 \n" |
|
207 "vld2.8 {d4, d5}, [r9], %1 \n" |
|
208 "vld2.8 {d6, d7}, [r9], %1 \n" |
|
209 "vld2.8 {d16, d17}, [r9], %1 \n" |
|
210 "vld2.8 {d18, d19}, [r9], %1 \n" |
|
211 "vld2.8 {d20, d21}, [r9], %1 \n" |
|
212 "vld2.8 {d22, d23}, [r9] \n" |
|
213 |
|
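    // same in-register transpose as TransposeWx8_NEON, but on Q registers so
    // the U bytes (low D register of each pair) and the V bytes (high D
    // register) are transposed in one pass.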
214 "vtrn.8 q1, q0 \n" |
|
215 "vtrn.8 q3, q2 \n" |
|
216 "vtrn.8 q9, q8 \n" |
|
217 "vtrn.8 q11, q10 \n" |
|
218 |
|
219 "vtrn.16 q1, q3 \n" |
|
220 "vtrn.16 q0, q2 \n" |
|
221 "vtrn.16 q9, q11 \n" |
|
222 "vtrn.16 q8, q10 \n" |
|
223 |
|
224 "vtrn.32 q1, q9 \n" |
|
225 "vtrn.32 q0, q8 \n" |
|
226 "vtrn.32 q3, q11 \n" |
|
227 "vtrn.32 q2, q10 \n" |
|
228 |
|
229 "vrev16.8 q0, q0 \n" |
|
230 "vrev16.8 q1, q1 \n" |
|
231 "vrev16.8 q2, q2 \n" |
|
232 "vrev16.8 q3, q3 \n" |
|
233 "vrev16.8 q8, q8 \n" |
|
234 "vrev16.8 q9, q9 \n" |
|
235 "vrev16.8 q10, q10 \n" |
|
236 "vrev16.8 q11, q11 \n" |
|
237 |
|
238 "mov r9, %2 \n" |
|
239 |
|
240 "vst1.8 {d2}, [r9], %3 \n" |
|
241 "vst1.8 {d0}, [r9], %3 \n" |
|
242 "vst1.8 {d6}, [r9], %3 \n" |
|
243 "vst1.8 {d4}, [r9], %3 \n" |
|
244 "vst1.8 {d18}, [r9], %3 \n" |
|
245 "vst1.8 {d16}, [r9], %3 \n" |
|
246 "vst1.8 {d22}, [r9], %3 \n" |
|
247 "vst1.8 {d20}, [r9] \n" |
|
248 |
|
249 "mov r9, %4 \n" |
|
250 |
|
251 "vst1.8 {d3}, [r9], %5 \n" |
|
252 "vst1.8 {d1}, [r9], %5 \n" |
|
253 "vst1.8 {d7}, [r9], %5 \n" |
|
254 "vst1.8 {d5}, [r9], %5 \n" |
|
255 "vst1.8 {d19}, [r9], %5 \n" |
|
256 "vst1.8 {d17}, [r9], %5 \n" |
|
257 "vst1.8 {d23}, [r9], %5 \n" |
|
258 "vst1.8 {d21}, [r9] \n" |
|
259 |
|
260 "add %0, #8*2 \n" // src += 8*2 |
|
261 "add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a |
|
262 "add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b |
|
263 "subs %6, #8 \n" // w -= 8 |
|
264 "bge 1b \n" |
|
265 |
|
266 // add 8 back to counter. if the result is 0 there are |
|
267 // no residuals. |
|
268 "adds %6, #8 \n" |
|
269 "beq 4f \n" |
|
270 |
|
271 // some residual, so between 1 and 7 lines left to transpose |
|
272 "cmp %6, #2 \n" |
|
273 "blt 3f \n" |
|
274 |
|
275 "cmp %6, #4 \n" |
|
276 "blt 2f \n" |
|
277 |
|
278 //TODO(frkoenig): Clean this up |
|
279 // 4x8 block |
|
280 "mov r9, %0 \n" |
|
281 "vld1.64 {d0}, [r9], %1 \n" |
|
282 "vld1.64 {d1}, [r9], %1 \n" |
|
283 "vld1.64 {d2}, [r9], %1 \n" |
|
284 "vld1.64 {d3}, [r9], %1 \n" |
|
285 "vld1.64 {d4}, [r9], %1 \n" |
|
286 "vld1.64 {d5}, [r9], %1 \n" |
|
287 "vld1.64 {d6}, [r9], %1 \n" |
|
288 "vld1.64 {d7}, [r9] \n" |
|
289 |
|
290 "vld1.8 {q15}, [%7] \n" |
|
291 |
|
292 "vtrn.8 q0, q1 \n" |
|
293 "vtrn.8 q2, q3 \n" |
|
294 |
|
295 "vtbl.8 d16, {d0, d1}, d30 \n" |
|
296 "vtbl.8 d17, {d0, d1}, d31 \n" |
|
297 "vtbl.8 d18, {d2, d3}, d30 \n" |
|
298 "vtbl.8 d19, {d2, d3}, d31 \n" |
|
299 "vtbl.8 d20, {d4, d5}, d30 \n" |
|
300 "vtbl.8 d21, {d4, d5}, d31 \n" |
|
301 "vtbl.8 d22, {d6, d7}, d30 \n" |
|
302 "vtbl.8 d23, {d6, d7}, d31 \n" |
|
303 |
|
304 "mov r9, %2 \n" |
|
305 |
|
306 "vst1.32 {d16[0]}, [r9], %3 \n" |
|
307 "vst1.32 {d16[1]}, [r9], %3 \n" |
|
308 "vst1.32 {d17[0]}, [r9], %3 \n" |
|
309 "vst1.32 {d17[1]}, [r9], %3 \n" |
|
310 |
|
311 "add r9, %2, #4 \n" |
|
312 "vst1.32 {d20[0]}, [r9], %3 \n" |
|
313 "vst1.32 {d20[1]}, [r9], %3 \n" |
|
314 "vst1.32 {d21[0]}, [r9], %3 \n" |
|
315 "vst1.32 {d21[1]}, [r9] \n" |
|
316 |
|
317 "mov r9, %4 \n" |
|
318 |
|
319 "vst1.32 {d18[0]}, [r9], %5 \n" |
|
320 "vst1.32 {d18[1]}, [r9], %5 \n" |
|
321 "vst1.32 {d19[0]}, [r9], %5 \n" |
|
322 "vst1.32 {d19[1]}, [r9], %5 \n" |
|
323 |
|
324 "add r9, %4, #4 \n" |
|
325 "vst1.32 {d22[0]}, [r9], %5 \n" |
|
326 "vst1.32 {d22[1]}, [r9], %5 \n" |
|
327 "vst1.32 {d23[0]}, [r9], %5 \n" |
|
328 "vst1.32 {d23[1]}, [r9] \n" |
|
329 |
|
330 "add %0, #4*2 \n" // src += 4 * 2 |
|
331 "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a |
|
332 "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b |
|
333 "subs %6, #4 \n" // w -= 4 |
|
334 "beq 4f \n" |
|
335 |
|
336 // some residual, check to see if it includes a 2x8 block, |
|
337 // or less |
|
338 "cmp %6, #2 \n" |
|
339 "blt 3f \n" |
|
340 |
|
341 // 2x8 block |
|
342 "2: \n" |
|
343 "mov r9, %0 \n" |
|
344 "vld2.16 {d0[0], d2[0]}, [r9], %1 \n" |
|
345 "vld2.16 {d1[0], d3[0]}, [r9], %1 \n" |
|
346 "vld2.16 {d0[1], d2[1]}, [r9], %1 \n" |
|
347 "vld2.16 {d1[1], d3[1]}, [r9], %1 \n" |
|
348 "vld2.16 {d0[2], d2[2]}, [r9], %1 \n" |
|
349 "vld2.16 {d1[2], d3[2]}, [r9], %1 \n" |
|
350 "vld2.16 {d0[3], d2[3]}, [r9], %1 \n" |
|
351 "vld2.16 {d1[3], d3[3]}, [r9] \n" |
|
352 |
|
353 "vtrn.8 d0, d1 \n" |
|
354 "vtrn.8 d2, d3 \n" |
|
355 |
|
356 "mov r9, %2 \n" |
|
357 |
|
358 "vst1.64 {d0}, [r9], %3 \n" |
|
359 "vst1.64 {d2}, [r9] \n" |
|
360 |
|
361 "mov r9, %4 \n" |
|
362 |
|
363 "vst1.64 {d1}, [r9], %5 \n" |
|
364 "vst1.64 {d3}, [r9] \n" |
|
365 |
|
366 "add %0, #2*2 \n" // src += 2 * 2 |
|
367 "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a |
|
368 "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b |
|
369 "subs %6, #2 \n" // w -= 2 |
|
370 "beq 4f \n" |
|
371 |
|
372 // 1x8 block |
|
373 "3: \n" |
|
374 "vld2.8 {d0[0], d1[0]}, [%0], %1 \n" |
|
375 "vld2.8 {d0[1], d1[1]}, [%0], %1 \n" |
|
376 "vld2.8 {d0[2], d1[2]}, [%0], %1 \n" |
|
377 "vld2.8 {d0[3], d1[3]}, [%0], %1 \n" |
|
378 "vld2.8 {d0[4], d1[4]}, [%0], %1 \n" |
|
379 "vld2.8 {d0[5], d1[5]}, [%0], %1 \n" |
|
380 "vld2.8 {d0[6], d1[6]}, [%0], %1 \n" |
|
381 "vld2.8 {d0[7], d1[7]}, [%0] \n" |
|
382 |
|
383 "vst1.64 {d0}, [%2] \n" |
|
384 "vst1.64 {d1}, [%4] \n" |
|
385 |
|
386 "4: \n" |
|
387 |
|
388 : "+r"(src), // %0 |
|
389 "+r"(src_stride), // %1 |
|
390 "+r"(dst_a), // %2 |
|
391 "+r"(dst_stride_a), // %3 |
|
392 "+r"(dst_b), // %4 |
|
393 "+r"(dst_stride_b), // %5 |
|
394 "+r"(width) // %6 |
|
395 : "r"(&kVTbl4x4TransposeDi) // %7 |
|
396 : "memory", "cc", "r9", |
|
397 "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" |
|
398 ); |
|
399 } |
|
400 #endif |
|
401 |
|
402 #ifdef __cplusplus |
|
403 } // extern "C" |
|
404 } // namespace libyuv |
|
405 #endif |