--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libyuv/source/rotate_mips.cc	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,486 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride,
+                             int width) {
+  __asm__ __volatile__ (
+    ".set push                                     \n"
+    ".set noreorder                                \n"
+    "sll           $t2, %[src_stride], 0x1         \n"  // src_stride x 2
+    "sll           $t4, %[src_stride], 0x2         \n"  // src_stride x 4
+    "sll           $t9, %[src_stride], 0x3         \n"  // src_stride x 8
+    "addu          $t3, $t2, %[src_stride]         \n"
+    "addu          $t5, $t4, %[src_stride]         \n"
+    "addu          $t6, $t2, $t4                   \n"
+    "andi          $t0, %[dst], 0x3                \n"
+    "andi          $t1, %[dst_stride], 0x3         \n"
+    "or            $t0, $t0, $t1                   \n"
+    "bnez          $t0, 11f                        \n"
+    " subu         $t7, $t9, %[src_stride]         \n"
+// dst + dst_stride word aligned
+    "1:                                            \n"
+    "lbu           $t0, 0(%[src])                  \n"
+    "lbux          $t1, %[src_stride](%[src])      \n"
+    "lbux          $t8, $t2(%[src])                \n"
+    "lbux          $t9, $t3(%[src])                \n"
+    "sll           $t1, $t1, 16                    \n"
+    "sll           $t9, $t9, 16                    \n"
+    "or            $t0, $t0, $t1                   \n"
+    "or            $t8, $t8, $t9                   \n"
+    "precr.qb.ph   $s0, $t8, $t0                   \n"
+    "lbux          $t0, $t4(%[src])                \n"
+    "lbux          $t1, $t5(%[src])                \n"
+    "lbux          $t8, $t6(%[src])                \n"
+    "lbux          $t9, $t7(%[src])                \n"
+    "sll           $t1, $t1, 16                    \n"
+    "sll           $t9, $t9, 16                    \n"
+    "or            $t0, $t0, $t1                   \n"
+    "or            $t8, $t8, $t9                   \n"
+    "precr.qb.ph   $s1, $t8, $t0                   \n"
+    "sw            $s0, 0(%[dst])                  \n"
+    "addiu         %[width], -1                    \n"
+    "addiu         %[src], 1                       \n"
+    "sw            $s1, 4(%[dst])                  \n"
+    "bnez          %[width], 1b                    \n"
+    " addu         %[dst], %[dst], %[dst_stride]   \n"
+    "b             2f                              \n"
+// dst + dst_stride unaligned
+    "11:                                           \n"
+    "lbu           $t0, 0(%[src])                  \n"
+    "lbux          $t1, %[src_stride](%[src])      \n"
+    "lbux          $t8, $t2(%[src])                \n"
+    "lbux          $t9, $t3(%[src])                \n"
+    "sll           $t1, $t1, 16                    \n"
+    "sll           $t9, $t9, 16                    \n"
+    "or            $t0, $t0, $t1                   \n"
+    "or            $t8, $t8, $t9                   \n"
+    "precr.qb.ph   $s0, $t8, $t0                   \n"
+    "lbux          $t0, $t4(%[src])                \n"
+    "lbux          $t1, $t5(%[src])                \n"
+    "lbux          $t8, $t6(%[src])                \n"
+    "lbux          $t9, $t7(%[src])                \n"
+    "sll           $t1, $t1, 16                    \n"
+    "sll           $t9, $t9, 16                    \n"
+    "or            $t0, $t0, $t1                   \n"
+    "or            $t8, $t8, $t9                   \n"
+    "precr.qb.ph   $s1, $t8, $t0                   \n"
+    "swr           $s0, 0(%[dst])                  \n"
+    "swl           $s0, 3(%[dst])                  \n"
+    "addiu         %[width], -1                    \n"
+    "addiu         %[src], 1                       \n"
+    "swr           $s1, 4(%[dst])                  \n"
+    "swl           $s1, 7(%[dst])                  \n"
+    "bnez          %[width], 11b                   \n"
+    " addu         %[dst], %[dst], %[dst_stride]   \n"
+    "2:                                            \n"
+    ".set pop                                      \n"
+    :[src] "+r" (src),
+     [dst] "+r" (dst),
+     [width] "+r" (width)
+    :[src_stride] "r" (src_stride),
+     [dst_stride] "r" (dst_stride)
+    : "t0", "t1", "t2", "t3", "t4", "t5",
"t7", "t8", "t9", 1.107 + "s0", "s1" 1.108 + ); 1.109 +} 1.110 + 1.111 +void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, 1.112 + uint8* dst, int dst_stride, 1.113 + int width) { 1.114 + __asm__ __volatile__ ( 1.115 + ".set noat \n" 1.116 + ".set push \n" 1.117 + ".set noreorder \n" 1.118 + "beqz %[width], 2f \n" 1.119 + " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 1.120 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 1.121 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 1.122 + "addu $t3, $t2, %[src_stride] \n" 1.123 + "addu $t5, $t4, %[src_stride] \n" 1.124 + "addu $t6, $t2, $t4 \n" 1.125 + 1.126 + "srl $AT, %[width], 0x2 \n" 1.127 + "andi $t0, %[dst], 0x3 \n" 1.128 + "andi $t1, %[dst_stride], 0x3 \n" 1.129 + "or $t0, $t0, $t1 \n" 1.130 + "bnez $t0, 11f \n" 1.131 + " subu $t7, $t9, %[src_stride] \n" 1.132 +//dst + dst_stride word aligned 1.133 + "1: \n" 1.134 + "lw $t0, 0(%[src]) \n" 1.135 + "lwx $t1, %[src_stride](%[src]) \n" 1.136 + "lwx $t8, $t2(%[src]) \n" 1.137 + "lwx $t9, $t3(%[src]) \n" 1.138 + 1.139 +// t0 = | 30 | 20 | 10 | 00 | 1.140 +// t1 = | 31 | 21 | 11 | 01 | 1.141 +// t8 = | 32 | 22 | 12 | 02 | 1.142 +// t9 = | 33 | 23 | 13 | 03 | 1.143 + 1.144 + "precr.qb.ph $s0, $t1, $t0 \n" 1.145 + "precr.qb.ph $s1, $t9, $t8 \n" 1.146 + "precrq.qb.ph $s2, $t1, $t0 \n" 1.147 + "precrq.qb.ph $s3, $t9, $t8 \n" 1.148 + 1.149 + // s0 = | 21 | 01 | 20 | 00 | 1.150 + // s1 = | 23 | 03 | 22 | 02 | 1.151 + // s2 = | 31 | 11 | 30 | 10 | 1.152 + // s3 = | 33 | 13 | 32 | 12 | 1.153 + 1.154 + "precr.qb.ph $s4, $s1, $s0 \n" 1.155 + "precrq.qb.ph $s5, $s1, $s0 \n" 1.156 + "precr.qb.ph $s6, $s3, $s2 \n" 1.157 + "precrq.qb.ph $s7, $s3, $s2 \n" 1.158 + 1.159 + // s4 = | 03 | 02 | 01 | 00 | 1.160 + // s5 = | 23 | 22 | 21 | 20 | 1.161 + // s6 = | 13 | 12 | 11 | 10 | 1.162 + // s7 = | 33 | 32 | 31 | 30 | 1.163 + 1.164 + "lwx $t0, $t4(%[src]) \n" 1.165 + "lwx $t1, $t5(%[src]) \n" 1.166 + "lwx $t8, $t6(%[src]) \n" 1.167 + "lwx $t9, $t7(%[src]) \n" 1.168 + 1.169 +// t0 = | 34 | 24 | 14 | 04 | 1.170 +// t1 = | 35 | 25 | 15 | 05 | 1.171 +// t8 = | 36 | 26 | 16 | 06 | 1.172 +// t9 = | 37 | 27 | 17 | 07 | 1.173 + 1.174 + "precr.qb.ph $s0, $t1, $t0 \n" 1.175 + "precr.qb.ph $s1, $t9, $t8 \n" 1.176 + "precrq.qb.ph $s2, $t1, $t0 \n" 1.177 + "precrq.qb.ph $s3, $t9, $t8 \n" 1.178 + 1.179 + // s0 = | 25 | 05 | 24 | 04 | 1.180 + // s1 = | 27 | 07 | 26 | 06 | 1.181 + // s2 = | 35 | 15 | 34 | 14 | 1.182 + // s3 = | 37 | 17 | 36 | 16 | 1.183 + 1.184 + "precr.qb.ph $t0, $s1, $s0 \n" 1.185 + "precrq.qb.ph $t1, $s1, $s0 \n" 1.186 + "precr.qb.ph $t8, $s3, $s2 \n" 1.187 + "precrq.qb.ph $t9, $s3, $s2 \n" 1.188 + 1.189 + // t0 = | 07 | 06 | 05 | 04 | 1.190 + // t1 = | 27 | 26 | 25 | 24 | 1.191 + // t8 = | 17 | 16 | 15 | 14 | 1.192 + // t9 = | 37 | 36 | 35 | 34 | 1.193 + 1.194 + "addu $s0, %[dst], %[dst_stride] \n" 1.195 + "addu $s1, $s0, %[dst_stride] \n" 1.196 + "addu $s2, $s1, %[dst_stride] \n" 1.197 + 1.198 + "sw $s4, 0(%[dst]) \n" 1.199 + "sw $t0, 4(%[dst]) \n" 1.200 + "sw $s6, 0($s0) \n" 1.201 + "sw $t8, 4($s0) \n" 1.202 + "sw $s5, 0($s1) \n" 1.203 + "sw $t1, 4($s1) \n" 1.204 + "sw $s7, 0($s2) \n" 1.205 + "sw $t9, 4($s2) \n" 1.206 + 1.207 + "addiu $AT, -1 \n" 1.208 + "addiu %[src], 4 \n" 1.209 + 1.210 + "bnez $AT, 1b \n" 1.211 + " addu %[dst], $s2, %[dst_stride] \n" 1.212 + "b 2f \n" 1.213 +//dst + dst_stride unaligned 1.214 + "11: \n" 1.215 + "lw $t0, 0(%[src]) \n" 1.216 + "lwx $t1, %[src_stride](%[src]) \n" 1.217 + "lwx $t8, $t2(%[src]) \n" 1.218 + "lwx $t9, $t3(%[src]) \n" 1.219 + 1.220 
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+    "precr.qb.ph   $s0, $t1, $t0                   \n"
+    "precr.qb.ph   $s1, $t9, $t8                   \n"
+    "precrq.qb.ph  $s2, $t1, $t0                   \n"
+    "precrq.qb.ph  $s3, $t9, $t8                   \n"
+
+    // s0 = | 21 | 01 | 20 | 00 |
+    // s1 = | 23 | 03 | 22 | 02 |
+    // s2 = | 31 | 11 | 30 | 10 |
+    // s3 = | 33 | 13 | 32 | 12 |
+
+    "precr.qb.ph   $s4, $s1, $s0                   \n"
+    "precrq.qb.ph  $s5, $s1, $s0                   \n"
+    "precr.qb.ph   $s6, $s3, $s2                   \n"
+    "precrq.qb.ph  $s7, $s3, $s2                   \n"
+
+    // s4 = | 03 | 02 | 01 | 00 |
+    // s5 = | 23 | 22 | 21 | 20 |
+    // s6 = | 13 | 12 | 11 | 10 |
+    // s7 = | 33 | 32 | 31 | 30 |
+
+    "lwx           $t0, $t4(%[src])                \n"
+    "lwx           $t1, $t5(%[src])                \n"
+    "lwx           $t8, $t6(%[src])                \n"
+    "lwx           $t9, $t7(%[src])                \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+    "precr.qb.ph   $s0, $t1, $t0                   \n"
+    "precr.qb.ph   $s1, $t9, $t8                   \n"
+    "precrq.qb.ph  $s2, $t1, $t0                   \n"
+    "precrq.qb.ph  $s3, $t9, $t8                   \n"
+
+    // s0 = | 25 | 05 | 24 | 04 |
+    // s1 = | 27 | 07 | 26 | 06 |
+    // s2 = | 35 | 15 | 34 | 14 |
+    // s3 = | 37 | 17 | 36 | 16 |
+
+    "precr.qb.ph   $t0, $s1, $s0                   \n"
+    "precrq.qb.ph  $t1, $s1, $s0                   \n"
+    "precr.qb.ph   $t8, $s3, $s2                   \n"
+    "precrq.qb.ph  $t9, $s3, $s2                   \n"
+
+    // t0 = | 07 | 06 | 05 | 04 |
+    // t1 = | 27 | 26 | 25 | 24 |
+    // t8 = | 17 | 16 | 15 | 14 |
+    // t9 = | 37 | 36 | 35 | 34 |
+
+    "addu          $s0, %[dst], %[dst_stride]      \n"
+    "addu          $s1, $s0, %[dst_stride]         \n"
+    "addu          $s2, $s1, %[dst_stride]         \n"
+
+    "swr           $s4, 0(%[dst])                  \n"
+    "swl           $s4, 3(%[dst])                  \n"
+    "swr           $t0, 4(%[dst])                  \n"
+    "swl           $t0, 7(%[dst])                  \n"
+    "swr           $s6, 0($s0)                     \n"
+    "swl           $s6, 3($s0)                     \n"
+    "swr           $t8, 4($s0)                     \n"
+    "swl           $t8, 7($s0)                     \n"
+    "swr           $s5, 0($s1)                     \n"
+    "swl           $s5, 3($s1)                     \n"
+    "swr           $t1, 4($s1)                     \n"
+    "swl           $t1, 7($s1)                     \n"
+    "swr           $s7, 0($s2)                     \n"
+    "swl           $s7, 3($s2)                     \n"
+    "swr           $t9, 4($s2)                     \n"
+    "swl           $t9, 7($s2)                     \n"
+
+    "addiu         $AT, -1                         \n"
+    "addiu         %[src], 4                       \n"
+
+    "bnez          $AT, 11b                        \n"
+    " addu         %[dst], $s2, %[dst_stride]      \n"
+    "2:                                            \n"
+    ".set pop                                      \n"
+    ".set at                                       \n"
+    :[src] "+r" (src),
+     [dst] "+r" (dst),
+     [width] "+r" (width)
+    :[src_stride] "r" (src_stride),
+     [dst_stride] "r" (dst_stride)
+    : "t0", "t1", "t2", "t3", "t4", "t5",
+      "t6", "t7", "t8", "t9",
+      "s0", "s1", "s2", "s3", "s4",
+      "s5", "s6", "s7"
+  );
+}
+
+void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+                               uint8* dst_a, int dst_stride_a,
+                               uint8* dst_b, int dst_stride_b,
+                               int width) {
+  __asm__ __volatile__ (
+    ".set push                                     \n"
+    ".set noreorder                                \n"
+    "beqz          %[width], 2f                    \n"
+    " sll          $t2, %[src_stride], 0x1         \n"  // src_stride x 2
+    "sll           $t4, %[src_stride], 0x2         \n"  // src_stride x 4
+    "sll           $t9, %[src_stride], 0x3         \n"  // src_stride x 8
+    "addu          $t3, $t2, %[src_stride]         \n"
+    "addu          $t5, $t4, %[src_stride]         \n"
+    "addu          $t6, $t2, $t4                   \n"
+    "subu          $t7, $t9, %[src_stride]         \n"
+    "srl           $t1, %[width], 1                \n"
\n" 1.332 + 1.333 +// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b 1.334 + "andi $t0, %[dst_a], 0x3 \n" 1.335 + "andi $t8, %[dst_b], 0x3 \n" 1.336 + "or $t0, $t0, $t8 \n" 1.337 + "andi $t8, %[dst_stride_a], 0x3 \n" 1.338 + "andi $s5, %[dst_stride_b], 0x3 \n" 1.339 + "or $t8, $t8, $s5 \n" 1.340 + "or $t0, $t0, $t8 \n" 1.341 + "bnez $t0, 11f \n" 1.342 + " nop \n" 1.343 +// dst + dst_stride word aligned (both, a & b dst addresses) 1.344 + "1: \n" 1.345 + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| 1.346 + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| 1.347 + "addu $s5, %[dst_a], %[dst_stride_a] \n" 1.348 + "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| 1.349 + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| 1.350 + "addu $s6, %[dst_b], %[dst_stride_b] \n" 1.351 + 1.352 + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| 1.353 + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| 1.354 + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| 1.355 + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| 1.356 + 1.357 + "sll $t0, $t0, 16 \n" 1.358 + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| 1.359 + "sll $t9, $t9, 16 \n" 1.360 + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| 1.361 + 1.362 + "sw $s3, 0($s5) \n" 1.363 + "sw $s4, 0($s6) \n" 1.364 + 1.365 + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| 1.366 + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| 1.367 + 1.368 + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| 1.369 + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| 1.370 + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| 1.371 + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| 1.372 + "sw $s3, 0(%[dst_a]) \n" 1.373 + "sw $s4, 0(%[dst_b]) \n" 1.374 + 1.375 + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| 1.376 + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| 1.377 + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| 1.378 + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| 1.379 + 1.380 + "sll $t0, $t0, 16 \n" 1.381 + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| 1.382 + "sll $t9, $t9, 16 \n" 1.383 + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| 1.384 + "sw $s3, 4($s5) \n" 1.385 + "sw $s4, 4($s6) \n" 1.386 + 1.387 + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| 1.388 + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| 1.389 + 1.390 + "addiu %[src], 4 \n" 1.391 + "addiu $t1, -1 \n" 1.392 + "sll $t0, %[dst_stride_a], 1 \n" 1.393 + "sll $t8, %[dst_stride_b], 1 \n" 1.394 + "sw $s3, 4(%[dst_a]) \n" 1.395 + "sw $s4, 4(%[dst_b]) \n" 1.396 + "addu %[dst_a], %[dst_a], $t0 \n" 1.397 + "bnez $t1, 1b \n" 1.398 + " addu %[dst_b], %[dst_b], $t8 \n" 1.399 + "b 2f \n" 1.400 + " nop \n" 1.401 + 1.402 +// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned 1.403 + "11: \n" 1.404 + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| 1.405 + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| 1.406 + "addu $s5, %[dst_a], %[dst_stride_a] \n" 1.407 + "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| 1.408 + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| 1.409 + "addu $s6, %[dst_b], %[dst_stride_b] \n" 1.410 + 1.411 + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| 1.412 + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| 1.413 + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| 1.414 + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| 1.415 + 1.416 + "sll $t0, $t0, 16 \n" 1.417 + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| 1.418 + "sll $t9, $t9, 16 \n" 1.419 + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| 1.420 + 1.421 + "swr $s3, 0($s5) \n" 1.422 + "swl $s3, 3($s5) \n" 1.423 + "swr $s4, 0($s6) \n" 1.424 + "swl $s4, 3($s6) \n" 1.425 + 1.426 + 
"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| 1.427 + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| 1.428 + 1.429 + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| 1.430 + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| 1.431 + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| 1.432 + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| 1.433 + "swr $s3, 0(%[dst_a]) \n" 1.434 + "swl $s3, 3(%[dst_a]) \n" 1.435 + "swr $s4, 0(%[dst_b]) \n" 1.436 + "swl $s4, 3(%[dst_b]) \n" 1.437 + 1.438 + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| 1.439 + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| 1.440 + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| 1.441 + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| 1.442 + 1.443 + "sll $t0, $t0, 16 \n" 1.444 + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| 1.445 + "sll $t9, $t9, 16 \n" 1.446 + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| 1.447 + 1.448 + "swr $s3, 4($s5) \n" 1.449 + "swl $s3, 7($s5) \n" 1.450 + "swr $s4, 4($s6) \n" 1.451 + "swl $s4, 7($s6) \n" 1.452 + 1.453 + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| 1.454 + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| 1.455 + 1.456 + "addiu %[src], 4 \n" 1.457 + "addiu $t1, -1 \n" 1.458 + "sll $t0, %[dst_stride_a], 1 \n" 1.459 + "sll $t8, %[dst_stride_b], 1 \n" 1.460 + "swr $s3, 4(%[dst_a]) \n" 1.461 + "swl $s3, 7(%[dst_a]) \n" 1.462 + "swr $s4, 4(%[dst_b]) \n" 1.463 + "swl $s4, 7(%[dst_b]) \n" 1.464 + "addu %[dst_a], %[dst_a], $t0 \n" 1.465 + "bnez $t1, 11b \n" 1.466 + " addu %[dst_b], %[dst_b], $t8 \n" 1.467 + 1.468 + "2: \n" 1.469 + ".set pop \n" 1.470 + : [src] "+r" (src), 1.471 + [dst_a] "+r" (dst_a), 1.472 + [dst_b] "+r" (dst_b), 1.473 + [width] "+r" (width), 1.474 + [src_stride] "+r" (src_stride) 1.475 + : [dst_stride_a] "r" (dst_stride_a), 1.476 + [dst_stride_b] "r" (dst_stride_b) 1.477 + : "t0", "t1", "t2", "t3", "t4", "t5", 1.478 + "t6", "t7", "t8", "t9", 1.479 + "s0", "s1", "s2", "s3", 1.480 + "s4", "s5", "s6" 1.481 + ); 1.482 +} 1.483 + 1.484 +#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) 1.485 + 1.486 +#ifdef __cplusplus 1.487 +} // extern "C" 1.488 +} // namespace libyuv 1.489 +#endif