/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_MIPS) && \
    defined(__mips_dsp) && (__mips_dsp_rev >= 2)

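// Transposes an 8-row by 'width'-column block of bytes: column i of src
// becomes row i of dst. Each pass gathers eight bytes down one column with
// lbu/lbux, packs them four-per-word with precr.qb.ph, and stores one
// 8-byte dst row. The "1:" loop stores with sw when dst and dst_stride are
// word aligned; otherwise the "11:" loop uses swr/swl pairs. Roughly the
// scalar equivalent (cf. the portable TransposeWx8_C):
//
//   for (int i = 0; i < width; ++i)
//     for (int j = 0; j < 8; ++j)
//       dst[i * dst_stride + j] = src[j * src_stride + i];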
|
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
                             uint8* dst, int dst_stride,
                             int width) {
  __asm__ __volatile__ (
    ".set push                                     \n"
    ".set noreorder                                \n"
    "sll            $t2, %[src_stride], 0x1        \n"  // src_stride x 2
    "sll            $t4, %[src_stride], 0x2        \n"  // src_stride x 4
    "sll            $t9, %[src_stride], 0x3        \n"  // src_stride x 8
    "addu           $t3, $t2, %[src_stride]        \n"
    "addu           $t5, $t4, %[src_stride]        \n"
    "addu           $t6, $t2, $t4                  \n"
    "andi           $t0, %[dst], 0x3               \n"
    "andi           $t1, %[dst_stride], 0x3        \n"
    "or             $t0, $t0, $t1                  \n"
    "bnez           $t0, 11f                       \n"
    " subu          $t7, $t9, %[src_stride]        \n"
    // dst + dst_stride word aligned
    "1:                                            \n"
    "lbu            $t0, 0(%[src])                 \n"
    "lbux           $t1, %[src_stride](%[src])     \n"
    "lbux           $t8, $t2(%[src])               \n"
    "lbux           $t9, $t3(%[src])               \n"
    "sll            $t1, $t1, 16                   \n"
    "sll            $t9, $t9, 16                   \n"
    "or             $t0, $t0, $t1                  \n"
    "or             $t8, $t8, $t9                  \n"
    "precr.qb.ph    $s0, $t8, $t0                  \n"
|
51 "lbux $t0, $t4(%[src]) \n" |
|
52 "lbux $t1, $t5(%[src]) \n" |
|
53 "lbux $t8, $t6(%[src]) \n" |
|
54 "lbux $t9, $t7(%[src]) \n" |
|
55 "sll $t1, $t1, 16 \n" |
|
56 "sll $t9, $t9, 16 \n" |
|
57 "or $t0, $t0, $t1 \n" |
|
58 "or $t8, $t8, $t9 \n" |
|
59 "precr.qb.ph $s1, $t8, $t0 \n" |
|
60 "sw $s0, 0(%[dst]) \n" |
|
61 "addiu %[width], -1 \n" |
|
62 "addiu %[src], 1 \n" |
|
63 "sw $s1, 4(%[dst]) \n" |
|
64 "bnez %[width], 1b \n" |
|
65 " addu %[dst], %[dst], %[dst_stride] \n" |
|
66 "b 2f \n" |
|
67 //dst + dst_stride unaligned |
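    // The swr/swl pairs below emulate unaligned word stores.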
|
68 "11: \n" |
|
69 "lbu $t0, 0(%[src]) \n" |
|
70 "lbux $t1, %[src_stride](%[src]) \n" |
|
71 "lbux $t8, $t2(%[src]) \n" |
|
72 "lbux $t9, $t3(%[src]) \n" |
|
73 "sll $t1, $t1, 16 \n" |
|
74 "sll $t9, $t9, 16 \n" |
|
75 "or $t0, $t0, $t1 \n" |
|
76 "or $t8, $t8, $t9 \n" |
|
77 "precr.qb.ph $s0, $t8, $t0 \n" |
|
78 "lbux $t0, $t4(%[src]) \n" |
|
79 "lbux $t1, $t5(%[src]) \n" |
|
80 "lbux $t8, $t6(%[src]) \n" |
|
81 "lbux $t9, $t7(%[src]) \n" |
|
82 "sll $t1, $t1, 16 \n" |
|
83 "sll $t9, $t9, 16 \n" |
|
84 "or $t0, $t0, $t1 \n" |
|
85 "or $t8, $t8, $t9 \n" |
|
86 "precr.qb.ph $s1, $t8, $t0 \n" |
|
87 "swr $s0, 0(%[dst]) \n" |
|
88 "swl $s0, 3(%[dst]) \n" |
|
89 "addiu %[width], -1 \n" |
|
90 "addiu %[src], 1 \n" |
|
91 "swr $s1, 4(%[dst]) \n" |
|
92 "swl $s1, 7(%[dst]) \n" |
|
93 "bnez %[width], 11b \n" |
|
94 "addu %[dst], %[dst], %[dst_stride] \n" |
|
95 "2: \n" |
|
96 ".set pop \n" |
|
97 :[src] "+r" (src), |
|
98 [dst] "+r" (dst), |
|
99 [width] "+r" (width) |
|
100 :[src_stride] "r" (src_stride), |
|
101 [dst_stride] "r" (dst_stride) |
|
102 : "t0", "t1", "t2", "t3", "t4", "t5", |
|
103 "t6", "t7", "t8", "t9", |
|
104 "s0", "s1" |
|
105 ); |
|
106 } |
|
107 |
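// Same transpose as above, but four columns per pass: eight words (four
// bytes from each of eight rows) are loaded and transposed as two 4x4
// byte tiles with precr.qb.ph/precrq.qb.ph, producing four 8-byte dst
// rows per pass. Presumably the caller selects this variant only when
// width is a multiple of 4 and src and src_stride are word aligned,
// since it loads with lw/lwx.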
|
void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
                                  uint8* dst, int dst_stride,
                                  int width) {
  __asm__ __volatile__ (
    ".set noat                                     \n"
    ".set push                                     \n"
    ".set noreorder                                \n"
    "beqz           %[width], 2f                   \n"
    " sll           $t2, %[src_stride], 0x1        \n"  // src_stride x 2
    "sll            $t4, %[src_stride], 0x2        \n"  // src_stride x 4
    "sll            $t9, %[src_stride], 0x3        \n"  // src_stride x 8
    "addu           $t3, $t2, %[src_stride]        \n"
    "addu           $t5, $t4, %[src_stride]        \n"
    "addu           $t6, $t2, $t4                  \n"

    "srl            $AT, %[width], 0x2             \n"
|
124 "andi $t0, %[dst], 0x3 \n" |
|
125 "andi $t1, %[dst_stride], 0x3 \n" |
|
126 "or $t0, $t0, $t1 \n" |
|
127 "bnez $t0, 11f \n" |
|
128 " subu $t7, $t9, %[src_stride] \n" |
|
129 //dst + dst_stride word aligned |
|
130 "1: \n" |
|
131 "lw $t0, 0(%[src]) \n" |
|
132 "lwx $t1, %[src_stride](%[src]) \n" |
|
133 "lwx $t8, $t2(%[src]) \n" |
|
134 "lwx $t9, $t3(%[src]) \n" |
|
135 |
|
136 // t0 = | 30 | 20 | 10 | 00 | |
|
137 // t1 = | 31 | 21 | 11 | 01 | |
|
138 // t8 = | 32 | 22 | 12 | 02 | |
|
139 // t9 = | 33 | 23 | 13 | 03 | |
|
140 |
|
141 "precr.qb.ph $s0, $t1, $t0 \n" |
|
142 "precr.qb.ph $s1, $t9, $t8 \n" |
|
143 "precrq.qb.ph $s2, $t1, $t0 \n" |
|
144 "precrq.qb.ph $s3, $t9, $t8 \n" |
|
145 |
|
146 // s0 = | 21 | 01 | 20 | 00 | |
|
147 // s1 = | 23 | 03 | 22 | 02 | |
|
148 // s2 = | 31 | 11 | 30 | 10 | |
|
149 // s3 = | 33 | 13 | 32 | 12 | |
|
150 |
|
151 "precr.qb.ph $s4, $s1, $s0 \n" |
|
152 "precrq.qb.ph $s5, $s1, $s0 \n" |
|
153 "precr.qb.ph $s6, $s3, $s2 \n" |
|
154 "precrq.qb.ph $s7, $s3, $s2 \n" |
|
155 |
|
156 // s4 = | 03 | 02 | 01 | 00 | |
|
157 // s5 = | 23 | 22 | 21 | 20 | |
|
158 // s6 = | 13 | 12 | 11 | 10 | |
|
159 // s7 = | 33 | 32 | 31 | 30 | |
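    // The two pack stages above transpose the 4x4 byte tile: s4..s7 hold
    // columns 0..3 of source rows 0..3 (the low word of four dst rows).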
|

    "lwx            $t0, $t4(%[src])               \n"
    "lwx            $t1, $t5(%[src])               \n"
    "lwx            $t8, $t6(%[src])               \n"
    "lwx            $t9, $t7(%[src])               \n"

    // t0 = | 34 | 24 | 14 | 04 |
    // t1 = | 35 | 25 | 15 | 05 |
    // t8 = | 36 | 26 | 16 | 06 |
    // t9 = | 37 | 27 | 17 | 07 |

    "precr.qb.ph    $s0, $t1, $t0                  \n"
    "precr.qb.ph    $s1, $t9, $t8                  \n"
    "precrq.qb.ph   $s2, $t1, $t0                  \n"
    "precrq.qb.ph   $s3, $t9, $t8                  \n"

    // s0 = | 25 | 05 | 24 | 04 |
    // s1 = | 27 | 07 | 26 | 06 |
    // s2 = | 35 | 15 | 34 | 14 |
    // s3 = | 37 | 17 | 36 | 16 |

    "precr.qb.ph    $t0, $s1, $s0                  \n"
    "precrq.qb.ph   $t1, $s1, $s0                  \n"
    "precr.qb.ph    $t8, $s3, $s2                  \n"
    "precrq.qb.ph   $t9, $s3, $s2                  \n"

    // t0 = | 07 | 06 | 05 | 04 |
    // t1 = | 27 | 26 | 25 | 24 |
    // t8 = | 17 | 16 | 15 | 14 |
    // t9 = | 37 | 36 | 35 | 34 |

    "addu           $s0, %[dst], %[dst_stride]     \n"
    "addu           $s1, $s0, %[dst_stride]        \n"
    "addu           $s2, $s1, %[dst_stride]        \n"

    "sw             $s4, 0(%[dst])                 \n"
    "sw             $t0, 4(%[dst])                 \n"
    "sw             $s6, 0($s0)                    \n"
    "sw             $t8, 4($s0)                    \n"
    "sw             $s5, 0($s1)                    \n"
    "sw             $t1, 4($s1)                    \n"
    "sw             $s7, 0($s2)                    \n"
    "sw             $t9, 4($s2)                    \n"

    "addiu          $AT, -1                        \n"
    "addiu          %[src], 4                      \n"

    "bnez           $AT, 1b                        \n"
    " addu          %[dst], $s2, %[dst_stride]     \n"
    "b              2f                             \n"
    // dst + dst_stride unaligned
    "11:                                           \n"
    "lw             $t0, 0(%[src])                 \n"
    "lwx            $t1, %[src_stride](%[src])     \n"
    "lwx            $t8, $t2(%[src])               \n"
    "lwx            $t9, $t3(%[src])               \n"

    // t0 = | 30 | 20 | 10 | 00 |
    // t1 = | 31 | 21 | 11 | 01 |
    // t8 = | 32 | 22 | 12 | 02 |
    // t9 = | 33 | 23 | 13 | 03 |

    "precr.qb.ph    $s0, $t1, $t0                  \n"
    "precr.qb.ph    $s1, $t9, $t8                  \n"
    "precrq.qb.ph   $s2, $t1, $t0                  \n"
    "precrq.qb.ph   $s3, $t9, $t8                  \n"

    // s0 = | 21 | 01 | 20 | 00 |
    // s1 = | 23 | 03 | 22 | 02 |
    // s2 = | 31 | 11 | 30 | 10 |
    // s3 = | 33 | 13 | 32 | 12 |

    "precr.qb.ph    $s4, $s1, $s0                  \n"
    "precrq.qb.ph   $s5, $s1, $s0                  \n"
    "precr.qb.ph    $s6, $s3, $s2                  \n"
    "precrq.qb.ph   $s7, $s3, $s2                  \n"

    // s4 = | 03 | 02 | 01 | 00 |
    // s5 = | 23 | 22 | 21 | 20 |
    // s6 = | 13 | 12 | 11 | 10 |
    // s7 = | 33 | 32 | 31 | 30 |

    "lwx            $t0, $t4(%[src])               \n"
    "lwx            $t1, $t5(%[src])               \n"
    "lwx            $t8, $t6(%[src])               \n"
    "lwx            $t9, $t7(%[src])               \n"

    // t0 = | 34 | 24 | 14 | 04 |
    // t1 = | 35 | 25 | 15 | 05 |
    // t8 = | 36 | 26 | 16 | 06 |
    // t9 = | 37 | 27 | 17 | 07 |

    "precr.qb.ph    $s0, $t1, $t0                  \n"
    "precr.qb.ph    $s1, $t9, $t8                  \n"
    "precrq.qb.ph   $s2, $t1, $t0                  \n"
    "precrq.qb.ph   $s3, $t9, $t8                  \n"

    // s0 = | 25 | 05 | 24 | 04 |
    // s1 = | 27 | 07 | 26 | 06 |
    // s2 = | 35 | 15 | 34 | 14 |
    // s3 = | 37 | 17 | 36 | 16 |

    "precr.qb.ph    $t0, $s1, $s0                  \n"
    "precrq.qb.ph   $t1, $s1, $s0                  \n"
    "precr.qb.ph    $t8, $s3, $s2                  \n"
    "precrq.qb.ph   $t9, $s3, $s2                  \n"

    // t0 = | 07 | 06 | 05 | 04 |
    // t1 = | 27 | 26 | 25 | 24 |
    // t8 = | 17 | 16 | 15 | 14 |
    // t9 = | 37 | 36 | 35 | 34 |

    "addu           $s0, %[dst], %[dst_stride]     \n"
    "addu           $s1, $s0, %[dst_stride]        \n"
    "addu           $s2, $s1, %[dst_stride]        \n"

    "swr            $s4, 0(%[dst])                 \n"
    "swl            $s4, 3(%[dst])                 \n"
    "swr            $t0, 4(%[dst])                 \n"
    "swl            $t0, 7(%[dst])                 \n"
    "swr            $s6, 0($s0)                    \n"
    "swl            $s6, 3($s0)                    \n"
    "swr            $t8, 4($s0)                    \n"
    "swl            $t8, 7($s0)                    \n"
    "swr            $s5, 0($s1)                    \n"
    "swl            $s5, 3($s1)                    \n"
    "swr            $t1, 4($s1)                    \n"
    "swl            $t1, 7($s1)                    \n"
    "swr            $s7, 0($s2)                    \n"
    "swl            $s7, 3($s2)                    \n"
    "swr            $t9, 4($s2)                    \n"
    "swl            $t9, 7($s2)                    \n"

    "addiu          $AT, -1                        \n"
    "addiu          %[src], 4                      \n"

    "bnez           $AT, 11b                       \n"
    " addu          %[dst], $s2, %[dst_stride]     \n"
    "2:                                            \n"
    ".set pop                                      \n"
    ".set at                                       \n"
    :[src] "+r" (src),
     [dst] "+r" (dst),
     [width] "+r" (width)
    :[src_stride] "r" (src_stride),
     [dst_stride] "r" (dst_stride)
    : "t0", "t1", "t2", "t3", "t4", "t5",
      "t6", "t7", "t8", "t9",
      "s0", "s1", "s2", "s3", "s4",
      "s5", "s6", "s7"
  );
}

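// Transposes an 8-row block of interleaved byte pairs, deinterleaving as
// it goes: pair i of src becomes row i of the outputs, with the first byte
// of each pair going to dst_a and the second to dst_b. Two pairs are
// handled per pass, with aligned (sw) and unaligned (swr/swl) store paths.
// Roughly the scalar equivalent (cf. the portable TransposeUVWx8_C):
//
//   for (int i = 0; i < width; ++i)
//     for (int j = 0; j < 8; ++j) {
//       dst_a[i * dst_stride_a + j] = src[j * src_stride + i * 2];
//       dst_b[i * dst_stride_b + j] = src[j * src_stride + i * 2 + 1];
//     }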
|
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
                               uint8* dst_a, int dst_stride_a,
                               uint8* dst_b, int dst_stride_b,
                               int width) {
  __asm__ __volatile__ (
    ".set push                                     \n"
    ".set noreorder                                \n"
    "beqz           %[width], 2f                   \n"
    " sll           $t2, %[src_stride], 0x1        \n"  // src_stride x 2
    "sll            $t4, %[src_stride], 0x2        \n"  // src_stride x 4
    "sll            $t9, %[src_stride], 0x3        \n"  // src_stride x 8
    "addu           $t3, $t2, %[src_stride]        \n"
    "addu           $t5, $t4, %[src_stride]        \n"
    "addu           $t6, $t2, $t4                  \n"
    "subu           $t7, $t9, %[src_stride]        \n"
    "srl            $t1, %[width], 1               \n"
|

    // Check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b.
|
331 "andi $t0, %[dst_a], 0x3 \n" |
|
332 "andi $t8, %[dst_b], 0x3 \n" |
|
333 "or $t0, $t0, $t8 \n" |
|
334 "andi $t8, %[dst_stride_a], 0x3 \n" |
|
335 "andi $s5, %[dst_stride_b], 0x3 \n" |
|
336 "or $t8, $t8, $s5 \n" |
|
337 "or $t0, $t0, $t8 \n" |
|
338 "bnez $t0, 11f \n" |
|
339 " nop \n" |
|
340 // dst + dst_stride word aligned (both, a & b dst addresses) |
|
341 "1: \n" |
|
342 "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| |
|
343 "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| |
|
344 "addu $s5, %[dst_a], %[dst_stride_a] \n" |
|
345 "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| |
|
346 "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| |
|
347 "addu $s6, %[dst_b], %[dst_stride_b] \n" |
|
348 |
|
349 "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| |
|
350 "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| |
|
351 "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| |
|
352 "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| |
|

    "sll            $t0, $t0, 16                   \n"
    "packrl.ph      $s1, $t8, $t0                  \n"  // |b1|a1|b0|a0|
    "sll            $t9, $t9, 16                   \n"
    "packrl.ph      $s2, $s0, $t9                  \n"  // |b3|a3|b2|a2|

    "sw             $s3, 0($s5)                    \n"
    "sw             $s4, 0($s6)                    \n"

    "precr.qb.ph    $s3, $s2, $s1                  \n"  // |a3|a2|a1|a0|
    "precrq.qb.ph   $s4, $s2, $s1                  \n"  // |b3|b2|b1|b0|

    "lwx            $t0, $t4(%[src])               \n"  // |B4|A4|b4|a4|
    "lwx            $t8, $t5(%[src])               \n"  // |B5|A5|b5|a5|
    "lwx            $t9, $t6(%[src])               \n"  // |B6|A6|b6|a6|
    "lwx            $s0, $t7(%[src])               \n"  // |B7|A7|b7|a7|
    "sw             $s3, 0(%[dst_a])               \n"
    "sw             $s4, 0(%[dst_b])               \n"

    "precrq.ph.w    $s1, $t8, $t0                  \n"  // |B5|A5|B4|A4|
|
373 "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| |
|
374 "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| |
|
375 "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| |
|
376 |
|
377 "sll $t0, $t0, 16 \n" |
|
378 "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| |
|
379 "sll $t9, $t9, 16 \n" |
|
380 "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| |
|
381 "sw $s3, 4($s5) \n" |
|
382 "sw $s4, 4($s6) \n" |
|
383 |
|
384 "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| |
|
385 "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| |
|
386 |
|
387 "addiu %[src], 4 \n" |
|
388 "addiu $t1, -1 \n" |
|
389 "sll $t0, %[dst_stride_a], 1 \n" |
|
390 "sll $t8, %[dst_stride_b], 1 \n" |
|
391 "sw $s3, 4(%[dst_a]) \n" |
|
392 "sw $s4, 4(%[dst_b]) \n" |
|
393 "addu %[dst_a], %[dst_a], $t0 \n" |
|
394 "bnez $t1, 1b \n" |
|
395 " addu %[dst_b], %[dst_b], $t8 \n" |
|
396 "b 2f \n" |
|
397 " nop \n" |
|
398 |
|
399 // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned |
|
400 "11: \n" |
|
401 "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| |
|
402 "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| |
|
403 "addu $s5, %[dst_a], %[dst_stride_a] \n" |
|
404 "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| |
|
405 "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| |
|
406 "addu $s6, %[dst_b], %[dst_stride_b] \n" |
|
407 |
|
408 "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| |
|
409 "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| |
|
410 "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| |
|
411 "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| |
|
412 |
|
413 "sll $t0, $t0, 16 \n" |
|
414 "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| |
|
415 "sll $t9, $t9, 16 \n" |
|
416 "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| |
|
417 |
|
418 "swr $s3, 0($s5) \n" |
|
419 "swl $s3, 3($s5) \n" |
|
420 "swr $s4, 0($s6) \n" |
|
421 "swl $s4, 3($s6) \n" |
|
422 |
|
423 "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| |
|
424 "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| |
|
425 |
|
426 "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| |
|
427 "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| |
|
428 "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| |
|
429 "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| |
|
430 "swr $s3, 0(%[dst_a]) \n" |
|
431 "swl $s3, 3(%[dst_a]) \n" |
|
432 "swr $s4, 0(%[dst_b]) \n" |
|
433 "swl $s4, 3(%[dst_b]) \n" |
|
434 |
|
435 "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| |
|
436 "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| |
|
437 "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| |
|
438 "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| |
|
439 |
|
440 "sll $t0, $t0, 16 \n" |
|
441 "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| |
|
442 "sll $t9, $t9, 16 \n" |
|
443 "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| |
|
444 |
|
445 "swr $s3, 4($s5) \n" |
|
446 "swl $s3, 7($s5) \n" |
|
447 "swr $s4, 4($s6) \n" |
|
448 "swl $s4, 7($s6) \n" |
|
449 |
|
450 "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| |
|
451 "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| |
|
452 |
|
453 "addiu %[src], 4 \n" |
|
454 "addiu $t1, -1 \n" |
|
455 "sll $t0, %[dst_stride_a], 1 \n" |
|
456 "sll $t8, %[dst_stride_b], 1 \n" |
|
457 "swr $s3, 4(%[dst_a]) \n" |
|
458 "swl $s3, 7(%[dst_a]) \n" |
|
459 "swr $s4, 4(%[dst_b]) \n" |
|
460 "swl $s4, 7(%[dst_b]) \n" |
|
461 "addu %[dst_a], %[dst_a], $t0 \n" |
|
462 "bnez $t1, 11b \n" |
|
463 " addu %[dst_b], %[dst_b], $t8 \n" |
|
464 |
|
465 "2: \n" |
|
466 ".set pop \n" |
|
467 : [src] "+r" (src), |
|
468 [dst_a] "+r" (dst_a), |
|
469 [dst_b] "+r" (dst_b), |
|
470 [width] "+r" (width), |
|
471 [src_stride] "+r" (src_stride) |
|
472 : [dst_stride_a] "r" (dst_stride_a), |
|
473 [dst_stride_b] "r" (dst_stride_b) |
|
474 : "t0", "t1", "t2", "t3", "t4", "t5", |
|
475 "t6", "t7", "t8", "t9", |
|
476 "s0", "s1", "s2", "s3", |
|
477 "s4", "s5", "s6" |
|
478 ); |
|
479 } |
|
480 |
|
481 #endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) |
|
482 |
|
483 #ifdef __cplusplus |
|
484 } // extern "C" |
|
485 } // namespace libyuv |
|
486 #endif |