media/libyuv/source/rotate_neon.cc

branch
TOR_BUG_9701
changeset 15
b8a032363ba2
equal deleted inserted replaced
-1:000000000000 0:112d6db20ea7
1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #include "libyuv/basic_types.h"
14
15 #ifdef __cplusplus
16 namespace libyuv {
17 extern "C" {
18 #endif
19
20 #if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
// vtbl index table for a 4x4 byte transpose: output byte (i*4 + j) is
// taken from input byte (j*4 + i), i.e. it gathers every 4th byte.
// Used by the 4x8 residual path of TransposeWx8_NEON.
static uvec8 kVTbl4x4Transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
23
// Transposes a |width| x 8 block of bytes: dst[x][y] = src[y][x].
// Operand map: %0 = src, %1 = src_stride, %2 = dst, %3 = dst_stride,
// %4 = width, %5 = &kVTbl4x4Transpose.  r9 is a scratch row pointer.
// The main loop handles 8x8 tiles; leftover columns fall through to
// 4x8, 2x8 and 1x8 tail cases.
void TransposeWx8_NEON(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
asm volatile (
// loops are on blocks of 8. loop will stop when
// counter gets to or below 0. starting the counter
// at w-8 allow for this
"sub %4, #8 \n"

// handle 8x8 blocks. this should be the majority of the plane
".p2align 2 \n"
"1: \n"
"mov r9, %0 \n"

// Load 8 rows of 8 bytes, advancing r9 by src_stride per row.
"vld1.8 {d0}, [r9], %1 \n"
"vld1.8 {d1}, [r9], %1 \n"
"vld1.8 {d2}, [r9], %1 \n"
"vld1.8 {d3}, [r9], %1 \n"
"vld1.8 {d4}, [r9], %1 \n"
"vld1.8 {d5}, [r9], %1 \n"
"vld1.8 {d6}, [r9], %1 \n"
"vld1.8 {d7}, [r9] \n"

// 8x8 in-register transpose: interleave at byte, halfword and word
// granularity, then vrev16 fixes up the byte order within pairs.
"vtrn.8 d1, d0 \n"
"vtrn.8 d3, d2 \n"
"vtrn.8 d5, d4 \n"
"vtrn.8 d7, d6 \n"

"vtrn.16 d1, d3 \n"
"vtrn.16 d0, d2 \n"
"vtrn.16 d5, d7 \n"
"vtrn.16 d4, d6 \n"

"vtrn.32 d1, d5 \n"
"vtrn.32 d0, d4 \n"
"vtrn.32 d3, d7 \n"
"vtrn.32 d2, d6 \n"

"vrev16.8 q0, q0 \n"
"vrev16.8 q1, q1 \n"
"vrev16.8 q2, q2 \n"
"vrev16.8 q3, q3 \n"

"mov r9, %2 \n"

// Store 8 transposed rows, advancing r9 by dst_stride per row.
"vst1.8 {d1}, [r9], %3 \n"
"vst1.8 {d0}, [r9], %3 \n"
"vst1.8 {d3}, [r9], %3 \n"
"vst1.8 {d2}, [r9], %3 \n"
"vst1.8 {d5}, [r9], %3 \n"
"vst1.8 {d4}, [r9], %3 \n"
"vst1.8 {d7}, [r9], %3 \n"
"vst1.8 {d6}, [r9] \n"

"add %0, #8 \n" // src += 8
"add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride
"subs %4, #8 \n" // w -= 8
"bge 1b \n"

// add 8 back to counter. if the result is 0 there are
// no residuals.
"adds %4, #8 \n"
"beq 4f \n"

// some residual, so between 1 and 7 lines left to transpose
"cmp %4, #2 \n"
"blt 3f \n"

"cmp %4, #4 \n"
"blt 2f \n"

// 4x8 block
"mov r9, %0 \n"
"vld1.32 {d0[0]}, [r9], %1 \n"
"vld1.32 {d0[1]}, [r9], %1 \n"
"vld1.32 {d1[0]}, [r9], %1 \n"
"vld1.32 {d1[1]}, [r9], %1 \n"
"vld1.32 {d2[0]}, [r9], %1 \n"
"vld1.32 {d2[1]}, [r9], %1 \n"
"vld1.32 {d3[0]}, [r9], %1 \n"
"vld1.32 {d3[1]}, [r9] \n"

"mov r9, %2 \n"

// q3 (d6/d7) holds the kVTbl4x4Transpose shuffle indices.
"vld1.8 {q3}, [%5] \n"

"vtbl.8 d4, {d0, d1}, d6 \n"
"vtbl.8 d5, {d0, d1}, d7 \n"
"vtbl.8 d0, {d2, d3}, d6 \n"
"vtbl.8 d1, {d2, d3}, d7 \n"

// TODO(frkoenig): Rework shuffle above to
// write out with 4 instead of 8 writes.
"vst1.32 {d4[0]}, [r9], %3 \n"
"vst1.32 {d4[1]}, [r9], %3 \n"
"vst1.32 {d5[0]}, [r9], %3 \n"
"vst1.32 {d5[1]}, [r9] \n"

"add r9, %2, #4 \n"
"vst1.32 {d0[0]}, [r9], %3 \n"
"vst1.32 {d0[1]}, [r9], %3 \n"
"vst1.32 {d1[0]}, [r9], %3 \n"
"vst1.32 {d1[1]}, [r9] \n"

"add %0, #4 \n" // src += 4
"add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride
"subs %4, #4 \n" // w -= 4
"beq 4f \n"

// some residual, check to see if it includes a 2x8 block,
// or less
"cmp %4, #2 \n"
"blt 3f \n"

// 2x8 block
"2: \n"
"mov r9, %0 \n"
"vld1.16 {d0[0]}, [r9], %1 \n"
"vld1.16 {d1[0]}, [r9], %1 \n"
"vld1.16 {d0[1]}, [r9], %1 \n"
"vld1.16 {d1[1]}, [r9], %1 \n"
"vld1.16 {d0[2]}, [r9], %1 \n"
"vld1.16 {d1[2]}, [r9], %1 \n"
"vld1.16 {d0[3]}, [r9], %1 \n"
"vld1.16 {d1[3]}, [r9] \n"

"vtrn.8 d0, d1 \n"

"mov r9, %2 \n"

"vst1.64 {d0}, [r9], %3 \n"
"vst1.64 {d1}, [r9] \n"

"add %0, #2 \n" // src += 2
"add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride
"subs %4, #2 \n" // w -= 2
"beq 4f \n"

// 1x8 block
"3: \n"
"vld1.8 {d0[0]}, [%0], %1 \n"
"vld1.8 {d0[1]}, [%0], %1 \n"
"vld1.8 {d0[2]}, [%0], %1 \n"
"vld1.8 {d0[3]}, [%0], %1 \n"
"vld1.8 {d0[4]}, [%0], %1 \n"
"vld1.8 {d0[5]}, [%0], %1 \n"
"vld1.8 {d0[6]}, [%0], %1 \n"
"vld1.8 {d0[7]}, [%0] \n"

"vst1.64 {d0}, [%2] \n"

"4: \n"

: "+r"(src), // %0
"+r"(src_stride), // %1
"+r"(dst), // %2
"+r"(dst_stride), // %3
"+r"(width) // %4
: "r"(&kVTbl4x4Transpose) // %5
: "memory", "cc", "r9", "q0", "q1", "q2", "q3"
);
}
186
// vtbl index table that zips the two halves of a 16-byte vector:
// output interleaves byte i of the low d register with byte i of the
// high one.  Used by the 4x8 residual path of TransposeUVWx8_NEON.
static uvec8 kVTbl4x4TransposeDi =
{ 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
189
190 void TransposeUVWx8_NEON(const uint8* src, int src_stride,
191 uint8* dst_a, int dst_stride_a,
192 uint8* dst_b, int dst_stride_b,
193 int width) {
194 asm volatile (
195 // loops are on blocks of 8. loop will stop when
196 // counter gets to or below 0. starting the counter
197 // at w-8 allow for this
198 "sub %6, #8 \n"
199
200 // handle 8x8 blocks. this should be the majority of the plane
201 ".p2align 2 \n"
202 "1: \n"
203 "mov r9, %0 \n"
204
205 "vld2.8 {d0, d1}, [r9], %1 \n"
206 "vld2.8 {d2, d3}, [r9], %1 \n"
207 "vld2.8 {d4, d5}, [r9], %1 \n"
208 "vld2.8 {d6, d7}, [r9], %1 \n"
209 "vld2.8 {d16, d17}, [r9], %1 \n"
210 "vld2.8 {d18, d19}, [r9], %1 \n"
211 "vld2.8 {d20, d21}, [r9], %1 \n"
212 "vld2.8 {d22, d23}, [r9] \n"
213
214 "vtrn.8 q1, q0 \n"
215 "vtrn.8 q3, q2 \n"
216 "vtrn.8 q9, q8 \n"
217 "vtrn.8 q11, q10 \n"
218
219 "vtrn.16 q1, q3 \n"
220 "vtrn.16 q0, q2 \n"
221 "vtrn.16 q9, q11 \n"
222 "vtrn.16 q8, q10 \n"
223
224 "vtrn.32 q1, q9 \n"
225 "vtrn.32 q0, q8 \n"
226 "vtrn.32 q3, q11 \n"
227 "vtrn.32 q2, q10 \n"
228
229 "vrev16.8 q0, q0 \n"
230 "vrev16.8 q1, q1 \n"
231 "vrev16.8 q2, q2 \n"
232 "vrev16.8 q3, q3 \n"
233 "vrev16.8 q8, q8 \n"
234 "vrev16.8 q9, q9 \n"
235 "vrev16.8 q10, q10 \n"
236 "vrev16.8 q11, q11 \n"
237
238 "mov r9, %2 \n"
239
240 "vst1.8 {d2}, [r9], %3 \n"
241 "vst1.8 {d0}, [r9], %3 \n"
242 "vst1.8 {d6}, [r9], %3 \n"
243 "vst1.8 {d4}, [r9], %3 \n"
244 "vst1.8 {d18}, [r9], %3 \n"
245 "vst1.8 {d16}, [r9], %3 \n"
246 "vst1.8 {d22}, [r9], %3 \n"
247 "vst1.8 {d20}, [r9] \n"
248
249 "mov r9, %4 \n"
250
251 "vst1.8 {d3}, [r9], %5 \n"
252 "vst1.8 {d1}, [r9], %5 \n"
253 "vst1.8 {d7}, [r9], %5 \n"
254 "vst1.8 {d5}, [r9], %5 \n"
255 "vst1.8 {d19}, [r9], %5 \n"
256 "vst1.8 {d17}, [r9], %5 \n"
257 "vst1.8 {d23}, [r9], %5 \n"
258 "vst1.8 {d21}, [r9] \n"
259
260 "add %0, #8*2 \n" // src += 8*2
261 "add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a
262 "add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b
263 "subs %6, #8 \n" // w -= 8
264 "bge 1b \n"
265
266 // add 8 back to counter. if the result is 0 there are
267 // no residuals.
268 "adds %6, #8 \n"
269 "beq 4f \n"
270
271 // some residual, so between 1 and 7 lines left to transpose
272 "cmp %6, #2 \n"
273 "blt 3f \n"
274
275 "cmp %6, #4 \n"
276 "blt 2f \n"
277
278 //TODO(frkoenig): Clean this up
279 // 4x8 block
280 "mov r9, %0 \n"
281 "vld1.64 {d0}, [r9], %1 \n"
282 "vld1.64 {d1}, [r9], %1 \n"
283 "vld1.64 {d2}, [r9], %1 \n"
284 "vld1.64 {d3}, [r9], %1 \n"
285 "vld1.64 {d4}, [r9], %1 \n"
286 "vld1.64 {d5}, [r9], %1 \n"
287 "vld1.64 {d6}, [r9], %1 \n"
288 "vld1.64 {d7}, [r9] \n"
289
290 "vld1.8 {q15}, [%7] \n"
291
292 "vtrn.8 q0, q1 \n"
293 "vtrn.8 q2, q3 \n"
294
295 "vtbl.8 d16, {d0, d1}, d30 \n"
296 "vtbl.8 d17, {d0, d1}, d31 \n"
297 "vtbl.8 d18, {d2, d3}, d30 \n"
298 "vtbl.8 d19, {d2, d3}, d31 \n"
299 "vtbl.8 d20, {d4, d5}, d30 \n"
300 "vtbl.8 d21, {d4, d5}, d31 \n"
301 "vtbl.8 d22, {d6, d7}, d30 \n"
302 "vtbl.8 d23, {d6, d7}, d31 \n"
303
304 "mov r9, %2 \n"
305
306 "vst1.32 {d16[0]}, [r9], %3 \n"
307 "vst1.32 {d16[1]}, [r9], %3 \n"
308 "vst1.32 {d17[0]}, [r9], %3 \n"
309 "vst1.32 {d17[1]}, [r9], %3 \n"
310
311 "add r9, %2, #4 \n"
312 "vst1.32 {d20[0]}, [r9], %3 \n"
313 "vst1.32 {d20[1]}, [r9], %3 \n"
314 "vst1.32 {d21[0]}, [r9], %3 \n"
315 "vst1.32 {d21[1]}, [r9] \n"
316
317 "mov r9, %4 \n"
318
319 "vst1.32 {d18[0]}, [r9], %5 \n"
320 "vst1.32 {d18[1]}, [r9], %5 \n"
321 "vst1.32 {d19[0]}, [r9], %5 \n"
322 "vst1.32 {d19[1]}, [r9], %5 \n"
323
324 "add r9, %4, #4 \n"
325 "vst1.32 {d22[0]}, [r9], %5 \n"
326 "vst1.32 {d22[1]}, [r9], %5 \n"
327 "vst1.32 {d23[0]}, [r9], %5 \n"
328 "vst1.32 {d23[1]}, [r9] \n"
329
330 "add %0, #4*2 \n" // src += 4 * 2
331 "add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a
332 "add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b
333 "subs %6, #4 \n" // w -= 4
334 "beq 4f \n"
335
336 // some residual, check to see if it includes a 2x8 block,
337 // or less
338 "cmp %6, #2 \n"
339 "blt 3f \n"
340
341 // 2x8 block
342 "2: \n"
343 "mov r9, %0 \n"
344 "vld2.16 {d0[0], d2[0]}, [r9], %1 \n"
345 "vld2.16 {d1[0], d3[0]}, [r9], %1 \n"
346 "vld2.16 {d0[1], d2[1]}, [r9], %1 \n"
347 "vld2.16 {d1[1], d3[1]}, [r9], %1 \n"
348 "vld2.16 {d0[2], d2[2]}, [r9], %1 \n"
349 "vld2.16 {d1[2], d3[2]}, [r9], %1 \n"
350 "vld2.16 {d0[3], d2[3]}, [r9], %1 \n"
351 "vld2.16 {d1[3], d3[3]}, [r9] \n"
352
353 "vtrn.8 d0, d1 \n"
354 "vtrn.8 d2, d3 \n"
355
356 "mov r9, %2 \n"
357
358 "vst1.64 {d0}, [r9], %3 \n"
359 "vst1.64 {d2}, [r9] \n"
360
361 "mov r9, %4 \n"
362
363 "vst1.64 {d1}, [r9], %5 \n"
364 "vst1.64 {d3}, [r9] \n"
365
366 "add %0, #2*2 \n" // src += 2 * 2
367 "add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a
368 "add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b
369 "subs %6, #2 \n" // w -= 2
370 "beq 4f \n"
371
372 // 1x8 block
373 "3: \n"
374 "vld2.8 {d0[0], d1[0]}, [%0], %1 \n"
375 "vld2.8 {d0[1], d1[1]}, [%0], %1 \n"
376 "vld2.8 {d0[2], d1[2]}, [%0], %1 \n"
377 "vld2.8 {d0[3], d1[3]}, [%0], %1 \n"
378 "vld2.8 {d0[4], d1[4]}, [%0], %1 \n"
379 "vld2.8 {d0[5], d1[5]}, [%0], %1 \n"
380 "vld2.8 {d0[6], d1[6]}, [%0], %1 \n"
381 "vld2.8 {d0[7], d1[7]}, [%0] \n"
382
383 "vst1.64 {d0}, [%2] \n"
384 "vst1.64 {d1}, [%4] \n"
385
386 "4: \n"
387
388 : "+r"(src), // %0
389 "+r"(src_stride), // %1
390 "+r"(dst_a), // %2
391 "+r"(dst_stride_a), // %3
392 "+r"(dst_b), // %4
393 "+r"(dst_stride_b), // %5
394 "+r"(width) // %6
395 : "r"(&kVTbl4x4TransposeDi) // %7
396 : "memory", "cc", "r9",
397 "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
398 );
399 }
400 #endif
401
402 #ifdef __cplusplus
403 } // extern "C"
404 } // namespace libyuv
405 #endif

mercurial