--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libyuv/source/scale_mips.cc	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,653 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC MIPS DSPR2
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width) {
+  __asm__ __volatile__(
+    ".set push \n"
+    ".set noreorder \n"
+
+    "srl $t9, %[dst_width], 4 \n"  // iterations -> by 16
+    "beqz $t9, 2f \n"
+    " nop \n"
+
+    ".p2align 2 \n"
+    "1: \n"
+    "lw $t0, 0(%[src_ptr]) \n"  // |3|2|1|0|
+    "lw $t1, 4(%[src_ptr]) \n"  // |7|6|5|4|
+    "lw $t2, 8(%[src_ptr]) \n"  // |11|10|9|8|
+    "lw $t3, 12(%[src_ptr]) \n"  // |15|14|13|12|
+    "lw $t4, 16(%[src_ptr]) \n"  // |19|18|17|16|
+    "lw $t5, 20(%[src_ptr]) \n"  // |23|22|21|20|
+    "lw $t6, 24(%[src_ptr]) \n"  // |27|26|25|24|
+    "lw $t7, 28(%[src_ptr]) \n"  // |31|30|29|28|
+    // TODO(fbarchard): Use odd pixels instead of even.
+    "precr.qb.ph $t8, $t1, $t0 \n"  // |6|4|2|0|
+    "precr.qb.ph $t0, $t3, $t2 \n"  // |14|12|10|8|
+    "precr.qb.ph $t1, $t5, $t4 \n"  // |22|20|18|16|
+    "precr.qb.ph $t2, $t7, $t6 \n"  // |30|28|26|24|
+    "addiu %[src_ptr], %[src_ptr], 32 \n"
+    "addiu $t9, $t9, -1 \n"
+    "sw $t8, 0(%[dst]) \n"
+    "sw $t0, 4(%[dst]) \n"
+    "sw $t1, 8(%[dst]) \n"
+    "sw $t2, 12(%[dst]) \n"
+    "bgtz $t9, 1b \n"
+    " addiu %[dst], %[dst], 16 \n"
+
+    "2: \n"
+    "andi $t9, %[dst_width], 0xf \n"  // residue
+    "beqz $t9, 3f \n"
+    " nop \n"
+
+    "21: \n"
+    "lbu $t0, 0(%[src_ptr]) \n"
+    "addiu %[src_ptr], %[src_ptr], 2 \n"
+    "addiu $t9, $t9, -1 \n"
+    "sb $t0, 0(%[dst]) \n"
+    "bgtz $t9, 21b \n"
+    " addiu %[dst], %[dst], 1 \n"
+
+    "3: \n"
+    ".set pop \n"
+    : [src_ptr] "+r" (src_ptr),
+      [dst] "+r" (dst)
+    : [dst_width] "r" (dst_width)
+    : "t0", "t1", "t2", "t3", "t4", "t5",
+      "t6", "t7", "t8", "t9"
+  );
+}
+
+void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                 uint8* dst, int dst_width) {
+  const uint8* t = src_ptr + src_stride;
+
+  __asm__ __volatile__ (
+    ".set push \n"
+    ".set noreorder \n"
+
+    "srl $t9, %[dst_width], 3 \n"  // iterations -> step 8
+    "bltz $t9, 2f \n"
+    " nop \n"
+
+    ".p2align 2 \n"
+    "1: \n"
+    "lw $t0, 0(%[src_ptr]) \n"  // |3|2|1|0|
+    "lw $t1, 4(%[src_ptr]) \n"  // |7|6|5|4|
+    "lw $t2, 8(%[src_ptr]) \n"  // |11|10|9|8|
+    "lw $t3, 12(%[src_ptr]) \n"  // |15|14|13|12|
+    "lw $t4, 0(%[t]) \n"  // |19|18|17|16|
+    "lw $t5, 4(%[t]) \n"  // |23|22|21|20|
+    "lw $t6, 8(%[t]) \n"  // |27|26|25|24|
+    "lw $t7, 12(%[t]) \n"  // |31|30|29|28|
+    "addiu $t9, $t9, -1 \n"
+    "srl $t8, $t0, 16 \n"  // |X|X|3|2|
+    "ins $t0, $t4, 16, 16 \n"  // |17|16|1|0|
+    "ins $t4, $t8, 0, 16 \n"  // |19|18|3|2|
+    "raddu.w.qb $t0, $t0 \n"  // |17+16+1+0|
+    "raddu.w.qb $t4, $t4 \n"  // |19+18+3+2|
+    "shra_r.w $t0, $t0, 2 \n"  // |t0+2|>>2
+    "shra_r.w $t4, $t4, 2 \n"  // |t4+2|>>2
+    "srl $t8, $t1, 16 \n"  // |X|X|7|6|
+    "ins $t1, $t5, 16, 16 \n"  // |21|20|5|4|
+    "ins $t5, $t8, 0, 16 \n"  // |23|22|7|6|
+    "raddu.w.qb $t1, $t1 \n"  // |21+20+5+4|
+    "raddu.w.qb $t5, $t5 \n"  // |23+22+7+6|
+    "shra_r.w $t1, $t1, 2 \n"  // |t1+2|>>2
+    "shra_r.w $t5, $t5, 2 \n"  // |t5+2|>>2
+    "srl $t8, $t2, 16 \n"  // |X|X|11|10|
+    "ins $t2, $t6, 16, 16 \n"  // |25|24|9|8|
+    "ins $t6, $t8, 0, 16 \n"  // |27|26|11|10|
+    "raddu.w.qb $t2, $t2 \n"  // |25+24+9+8|
+    "raddu.w.qb $t6, $t6 \n"  // |27+26+11+10|
+    "shra_r.w $t2, $t2, 2 \n"  // |t2+2|>>2
+    "shra_r.w $t6, $t6, 2 \n"  // |t6+2|>>2
+    "srl $t8, $t3, 16 \n"  // |X|X|15|14|
+    "ins $t3, $t7, 16, 16 \n"  // |29|28|13|12|
+    "ins $t7, $t8, 0, 16 \n"  // |31|30|15|14|
+    "raddu.w.qb $t3, $t3 \n"  // |29+28+13+12|
+    "raddu.w.qb $t7, $t7 \n"  // |31+30+15+14|
+    "shra_r.w $t3, $t3, 2 \n"  // |t3+2|>>2
+    "shra_r.w $t7, $t7, 2 \n"  // |t7+2|>>2
+    "addiu %[src_ptr], %[src_ptr], 16 \n"
+    "addiu %[t], %[t], 16 \n"
+    "sb $t0, 0(%[dst]) \n"
+    "sb $t4, 1(%[dst]) \n"
+    "sb $t1, 2(%[dst]) \n"
+    "sb $t5, 3(%[dst]) \n"
+    "sb $t2, 4(%[dst]) \n"
+    "sb $t6, 5(%[dst]) \n"
+    "sb $t3, 6(%[dst]) \n"
+    "sb $t7, 7(%[dst]) \n"
+    "bgtz $t9, 1b \n"
+    " addiu %[dst], %[dst], 8 \n"
+
+    "2: \n"
+    "andi $t9, %[dst_width], 0x7 \n"  // x = residue
+    "beqz $t9, 3f \n"
+    " nop \n"
+
+    "21: \n"
+    "lwr $t1, 0(%[src_ptr]) \n"
+    "lwl $t1, 3(%[src_ptr]) \n"
+    "lwr $t2, 0(%[t]) \n"
+    "lwl $t2, 3(%[t]) \n"
+    "srl $t8, $t1, 16 \n"
+    "ins $t1, $t2, 16, 16 \n"
+    "ins $t2, $t8, 0, 16 \n"
+    "raddu.w.qb $t1, $t1 \n"
+    "raddu.w.qb $t2, $t2 \n"
+    "shra_r.w $t1, $t1, 2 \n"
+    "shra_r.w $t2, $t2, 2 \n"
+    "sb $t1, 0(%[dst]) \n"
+    "sb $t2, 1(%[dst]) \n"
+    "addiu %[src_ptr], %[src_ptr], 4 \n"
+    "addiu $t9, $t9, -2 \n"
+    "addiu %[t], %[t], 4 \n"
+    "bgtz $t9, 21b \n"
+    " addiu %[dst], %[dst], 2 \n"
+
+    "3: \n"
+    ".set pop \n"
+
+    : [src_ptr] "+r" (src_ptr),
+      [dst] "+r" (dst), [t] "+r" (t)
+    : [dst_width] "r" (dst_width)
+    : "t0", "t1", "t2", "t3", "t4", "t5",
+      "t6", "t7", "t8", "t9"
+  );
+}
+
+void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width) {
+  __asm__ __volatile__ (
+    ".set push \n"
+    ".set noreorder \n"
+
+    "srl $t9, %[dst_width], 3 \n"
+    "beqz $t9, 2f \n"
+    " nop \n"
+
+    ".p2align 2 \n"
+    "1: \n"
+    "lw $t1, 0(%[src_ptr]) \n"  // |3|2|1|0|
+    "lw $t2, 4(%[src_ptr]) \n"  // |7|6|5|4|
+    "lw $t3, 8(%[src_ptr]) \n"  // |11|10|9|8|
+    "lw $t4, 12(%[src_ptr]) \n"  // |15|14|13|12|
+    "lw $t5, 16(%[src_ptr]) \n"  // |19|18|17|16|
+    "lw $t6, 20(%[src_ptr]) \n"  // |23|22|21|20|
+    "lw $t7, 24(%[src_ptr]) \n"  // |27|26|25|24|
+    "lw $t8, 28(%[src_ptr]) \n"  // |31|30|29|28|
"precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0| 1.204 + "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8| 1.205 + "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16| 1.206 + "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24| 1.207 + "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0| 1.208 + "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16| 1.209 + "addiu %[src_ptr], %[src_ptr], 32 \n" 1.210 + "addiu $t9, $t9, -1 \n" 1.211 + "sw $t1, 0(%[dst]) \n" 1.212 + "sw $t5, 4(%[dst]) \n" 1.213 + "bgtz $t9, 1b \n" 1.214 + " addiu %[dst], %[dst], 8 \n" 1.215 + 1.216 + "2: \n" 1.217 + "andi $t9, %[dst_width], 7 \n" // residue 1.218 + "beqz $t9, 3f \n" 1.219 + " nop \n" 1.220 + 1.221 + "21: \n" 1.222 + "lbu $t1, 0(%[src_ptr]) \n" 1.223 + "addiu %[src_ptr], %[src_ptr], 4 \n" 1.224 + "addiu $t9, $t9, -1 \n" 1.225 + "sb $t1, 0(%[dst]) \n" 1.226 + "bgtz $t9, 21b \n" 1.227 + " addiu %[dst], %[dst], 1 \n" 1.228 + 1.229 + "3: \n" 1.230 + ".set pop \n" 1.231 + : [src_ptr] "+r" (src_ptr), 1.232 + [dst] "+r" (dst) 1.233 + : [dst_width] "r" (dst_width) 1.234 + : "t1", "t2", "t3", "t4", "t5", 1.235 + "t6", "t7", "t8", "t9" 1.236 + ); 1.237 +} 1.238 + 1.239 +void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, 1.240 + uint8* dst, int dst_width) { 1.241 + intptr_t stride = src_stride; 1.242 + const uint8* s1 = src_ptr + stride; 1.243 + const uint8* s2 = s1 + stride; 1.244 + const uint8* s3 = s2 + stride; 1.245 + 1.246 + __asm__ __volatile__ ( 1.247 + ".set push \n" 1.248 + ".set noreorder \n" 1.249 + 1.250 + "srl $t9, %[dst_width], 1 \n" 1.251 + "andi $t8, %[dst_width], 1 \n" 1.252 + 1.253 + ".p2align 2 \n" 1.254 + "1: \n" 1.255 + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| 1.256 + "lw $t1, 0(%[s1]) \n" // |7|6|5|4| 1.257 + "lw $t2, 0(%[s2]) \n" // |11|10|9|8| 1.258 + "lw $t3, 0(%[s3]) \n" // |15|14|13|12| 1.259 + "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16| 1.260 + "lw $t5, 4(%[s1]) \n" // |23|22|21|20| 1.261 + "lw $t6, 4(%[s2]) \n" // |27|26|25|24| 1.262 + "lw $t7, 4(%[s3]) \n" // |31|30|29|28| 1.263 + "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| 1.264 + "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| 1.265 + "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| 1.266 + "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| 1.267 + "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16| 1.268 + "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20| 1.269 + "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24| 1.270 + "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28| 1.271 + "add $t0, $t0, $t1 \n" 1.272 + "add $t1, $t2, $t3 \n" 1.273 + "add $t0, $t0, $t1 \n" 1.274 + "add $t4, $t4, $t5 \n" 1.275 + "add $t6, $t6, $t7 \n" 1.276 + "add $t4, $t4, $t6 \n" 1.277 + "shra_r.w $t0, $t0, 4 \n" 1.278 + "shra_r.w $t4, $t4, 4 \n" 1.279 + "sb $t0, 0(%[dst]) \n" 1.280 + "sb $t4, 1(%[dst]) \n" 1.281 + "addiu %[src_ptr], %[src_ptr], 8 \n" 1.282 + "addiu %[s1], %[s1], 8 \n" 1.283 + "addiu %[s2], %[s2], 8 \n" 1.284 + "addiu %[s3], %[s3], 8 \n" 1.285 + "addiu $t9, $t9, -1 \n" 1.286 + "bgtz $t9, 1b \n" 1.287 + " addiu %[dst], %[dst], 2 \n" 1.288 + "beqz $t8, 2f \n" 1.289 + " nop \n" 1.290 + 1.291 + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| 1.292 + "lw $t1, 0(%[s1]) \n" // |7|6|5|4| 1.293 + "lw $t2, 0(%[s2]) \n" // |11|10|9|8| 1.294 + "lw $t3, 0(%[s3]) \n" // |15|14|13|12| 1.295 + "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| 1.296 + "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| 1.297 + "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| 1.298 + "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| 1.299 + "add $t0, $t0, $t1 \n" 1.300 + "add $t1, $t2, $t3 \n" 1.301 + "add $t0, $t0, $t1 \n" 1.302 + 
"shra_r.w $t0, $t0, 4 \n" 1.303 + "sb $t0, 0(%[dst]) \n" 1.304 + 1.305 + "2: \n" 1.306 + ".set pop \n" 1.307 + 1.308 + : [src_ptr] "+r" (src_ptr), 1.309 + [dst] "+r" (dst), 1.310 + [s1] "+r" (s1), 1.311 + [s2] "+r" (s2), 1.312 + [s3] "+r" (s3) 1.313 + : [dst_width] "r" (dst_width) 1.314 + : "t0", "t1", "t2", "t3", "t4", "t5", 1.315 + "t6","t7", "t8", "t9" 1.316 + ); 1.317 +} 1.318 + 1.319 +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, 1.320 + uint8* dst, int dst_width) { 1.321 + __asm__ __volatile__ ( 1.322 + ".set push \n" 1.323 + ".set noreorder \n" 1.324 + ".p2align 2 \n" 1.325 + "1: \n" 1.326 + "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| 1.327 + "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| 1.328 + "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| 1.329 + "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| 1.330 + "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| 1.331 + "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| 1.332 + "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| 1.333 + "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| 1.334 + "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13| 1.335 + "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30| 1.336 + "addiu %[dst_width], %[dst_width], -24 \n" 1.337 + "ins $t1, $t1, 8, 16 \n" // |3|1|0|X| 1.338 + "ins $t4, $t0, 8, 16 \n" // |X|15|13|12| 1.339 + "ins $t5, $t5, 8, 16 \n" // |19|17|16|X| 1.340 + "ins $t8, $t9, 8, 16 \n" // |X|31|29|28| 1.341 + "addiu %[src_ptr], %[src_ptr], 32 \n" 1.342 + "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5| 1.343 + "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21| 1.344 + "prepend $t1, $t2, 8 \n" // |4|3|1|0| 1.345 + "prepend $t3, $t4, 24 \n" // |15|13|12|11| 1.346 + "prepend $t5, $t6, 8 \n" // |20|19|17|16| 1.347 + "prepend $t7, $t8, 24 \n" // |31|29|28|27| 1.348 + "sw $t1, 0(%[dst]) \n" 1.349 + "sw $t0, 4(%[dst]) \n" 1.350 + "sw $t3, 8(%[dst]) \n" 1.351 + "sw $t5, 12(%[dst]) \n" 1.352 + "sw $t9, 16(%[dst]) \n" 1.353 + "sw $t7, 20(%[dst]) \n" 1.354 + "bnez %[dst_width], 1b \n" 1.355 + " addiu %[dst], %[dst], 24 \n" 1.356 + ".set pop \n" 1.357 + : [src_ptr] "+r" (src_ptr), 1.358 + [dst] "+r" (dst), 1.359 + [dst_width] "+r" (dst_width) 1.360 + : 1.361 + : "t0", "t1", "t2", "t3", "t4", "t5", 1.362 + "t6","t7", "t8", "t9" 1.363 + ); 1.364 +} 1.365 + 1.366 +void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, 1.367 + uint8* d, int dst_width) { 1.368 + __asm__ __volatile__ ( 1.369 + ".set push \n" 1.370 + ".set noreorder \n" 1.371 + "repl.ph $t3, 3 \n" // 0x00030003 1.372 + 1.373 + ".p2align 2 \n" 1.374 + "1: \n" 1.375 + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| 1.376 + "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| 1.377 + "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1| 1.378 + "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| 1.379 + "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3| 1.380 + "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3| 1.381 + "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1| 1.382 + "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| 1.383 + "raddu.w.qb $t0, $t0 \n" 1.384 + "raddu.w.qb $t1, $t1 \n" 1.385 + "shra_r.w $t0, $t0, 1 \n" 1.386 + "shra_r.w $t1, $t1, 1 \n" 1.387 + "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1| 1.388 + "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| 1.389 + "rotr $t2, $t2, 16 \n" // |0|S1|0|S2| 1.390 + "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| 1.391 + "addu.ph $t2, $t2, $t4 \n" 1.392 + "addu.ph $t6, $t6, $t5 \n" 1.393 + "sll $t5, $t0, 1 \n" 1.394 + "add $t0, $t5, $t0 \n" 1.395 + "shra_r.ph $t2, $t2, 2 \n" 1.396 + "shra_r.ph $t6, $t6, 2 \n" 1.397 + "shll.ph $t4, $t2, 1 \n" 1.398 + "addq.ph $t4, $t4, $t2 
+    "addu $t0, $t0, $t1 \n"
+    "addiu %[src_ptr], %[src_ptr], 4 \n"
+    "shra_r.w $t0, $t0, 2 \n"
+    "addu.ph $t6, $t6, $t4 \n"
+    "shra_r.ph $t6, $t6, 2 \n"
+    "srl $t1, $t6, 16 \n"
+    "addiu %[dst_width], %[dst_width], -3 \n"
+    "sb $t1, 0(%[d]) \n"
+    "sb $t0, 1(%[d]) \n"
+    "sb $t6, 2(%[d]) \n"
+    "bgtz %[dst_width], 1b \n"
+    " addiu %[d], %[d], 3 \n"
+    "3: \n"
+    ".set pop \n"
+    : [src_ptr] "+r" (src_ptr),
+      [src_stride] "+r" (src_stride),
+      [d] "+r" (d),
+      [dst_width] "+r" (dst_width)
+    :
+    : "t0", "t1", "t2", "t3",
+      "t4", "t5", "t6"
+  );
+}
+
+void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                     uint8* d, int dst_width) {
+  __asm__ __volatile__ (
+    ".set push \n"
+    ".set noreorder \n"
+    "repl.ph $t2, 3 \n"  // 0x00030003
+
+    ".p2align 2 \n"
+    "1: \n"
+    "lw $t0, 0(%[src_ptr]) \n"  // |S3|S2|S1|S0|
+    "lwx $t1, %[src_stride](%[src_ptr]) \n"  // |T3|T2|T1|T0|
+    "rotr $t4, $t0, 8 \n"  // |S0|S3|S2|S1|
+    "rotr $t6, $t1, 8 \n"  // |T0|T3|T2|T1|
+    "muleu_s.ph.qbl $t3, $t4, $t2 \n"  // |S0*3|S3*3|
+    "muleu_s.ph.qbl $t5, $t6, $t2 \n"  // |T0*3|T3*3|
+    "andi $t0, $t4, 0xFFFF \n"  // |0|0|S2|S1|
+    "andi $t1, $t6, 0xFFFF \n"  // |0|0|T2|T1|
+    "raddu.w.qb $t0, $t0 \n"
+    "raddu.w.qb $t1, $t1 \n"
+    "shra_r.w $t0, $t0, 1 \n"
+    "shra_r.w $t1, $t1, 1 \n"
+    "preceu.ph.qbr $t4, $t4 \n"  // |0|S2|0|S1|
+    "preceu.ph.qbr $t6, $t6 \n"  // |0|T2|0|T1|
+    "rotr $t4, $t4, 16 \n"  // |0|S1|0|S2|
+    "rotr $t6, $t6, 16 \n"  // |0|T1|0|T2|
+    "addu.ph $t4, $t4, $t3 \n"
+    "addu.ph $t6, $t6, $t5 \n"
+    "shra_r.ph $t6, $t6, 2 \n"
+    "shra_r.ph $t4, $t4, 2 \n"
+    "addu.ph $t6, $t6, $t4 \n"
+    "addiu %[src_ptr], %[src_ptr], 4 \n"
+    "shra_r.ph $t6, $t6, 1 \n"
+    "addu $t0, $t0, $t1 \n"
+    "addiu %[dst_width], %[dst_width], -3 \n"
+    "shra_r.w $t0, $t0, 1 \n"
+    "srl $t1, $t6, 16 \n"
+    "sb $t1, 0(%[d]) \n"
+    "sb $t0, 1(%[d]) \n"
+    "sb $t6, 2(%[d]) \n"
+    "bgtz %[dst_width], 1b \n"
+    " addiu %[d], %[d], 3 \n"
+    "3: \n"
+    ".set pop \n"
+    : [src_ptr] "+r" (src_ptr),
+      [src_stride] "+r" (src_stride),
+      [d] "+r" (d),
+      [dst_width] "+r" (dst_width)
+    :
+    : "t0", "t1", "t2", "t3",
+      "t4", "t5", "t6"
+  );
+}
+
+void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                               uint8* dst, int dst_width) {
+  __asm__ __volatile__ (
+    ".set push \n"
+    ".set noreorder \n"
+
+    ".p2align 2 \n"
+    "1: \n"
+    "lw $t0, 0(%[src_ptr]) \n"  // |3|2|1|0|
+    "lw $t1, 4(%[src_ptr]) \n"  // |7|6|5|4|
+    "lw $t2, 8(%[src_ptr]) \n"  // |11|10|9|8|
+    "lw $t3, 12(%[src_ptr]) \n"  // |15|14|13|12|
+    "lw $t4, 16(%[src_ptr]) \n"  // |19|18|17|16|
+    "lw $t5, 20(%[src_ptr]) \n"  // |23|22|21|20|
+    "lw $t6, 24(%[src_ptr]) \n"  // |27|26|25|24|
+    "lw $t7, 28(%[src_ptr]) \n"  // |31|30|29|28|
+    "wsbh $t0, $t0 \n"  // |2|3|0|1|
+    "wsbh $t6, $t6 \n"  // |26|27|24|25|
+    "srl $t0, $t0, 8 \n"  // |X|2|3|0|
+    "srl $t3, $t3, 16 \n"  // |X|X|15|14|
+    "srl $t5, $t5, 16 \n"  // |X|X|23|22|
+    "srl $t7, $t7, 16 \n"  // |X|X|31|30|
+    "ins $t1, $t2, 24, 8 \n"  // |8|6|5|4|
+    "ins $t6, $t5, 0, 8 \n"  // |26|27|24|22|
+    "ins $t1, $t0, 0, 16 \n"  // |8|6|3|0|
+    "ins $t6, $t7, 24, 8 \n"  // |30|27|24|22|
+    "prepend $t2, $t3, 24 \n"  // |X|15|14|11|
+    "ins $t4, $t4, 16, 8 \n"  // |19|16|17|X|
+    "ins $t4, $t2, 0, 16 \n"  // |19|16|14|11|
+    "addiu %[src_ptr], %[src_ptr], 32 \n"
+    "addiu %[dst_width], %[dst_width], -12 \n"
+    "addiu $t8, %[dst_width], -12 \n"
+    "sw $t1, 0(%[dst]) \n"
+    "sw $t4, 4(%[dst]) \n"
+    "sw $t6, 8(%[dst]) \n"
+    "bgez $t8, 1b \n"
+    " addiu %[dst], %[dst], 12 \n"
+    ".set pop \n"
+    : [src_ptr] "+r" (src_ptr),
+      [dst] "+r" (dst),
+      [dst_width] "+r" (dst_width)
+    :
+    : "t0", "t1", "t2", "t3", "t4",
+      "t5", "t6", "t7", "t8"
+  );
+}
+
+void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
+                                     uint8* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;
+  const uint8* t = src_ptr + stride;
+  const int c = 0x2AAA;
+
+  __asm__ __volatile__ (
+    ".set push \n"
+    ".set noreorder \n"
+
+    ".p2align 2 \n"
+    "1: \n"
+    "lw $t0, 0(%[src_ptr]) \n"  // |S3|S2|S1|S0|
+    "lw $t1, 4(%[src_ptr]) \n"  // |S7|S6|S5|S4|
+    "lw $t2, 0(%[t]) \n"  // |T3|T2|T1|T0|
+    "lw $t3, 4(%[t]) \n"  // |T7|T6|T5|T4|
+    "rotr $t1, $t1, 16 \n"  // |S5|S4|S7|S6|
+    "packrl.ph $t4, $t1, $t3 \n"  // |S7|S6|T7|T6|
+    "packrl.ph $t5, $t3, $t1 \n"  // |T5|T4|S5|S4|
+    "raddu.w.qb $t4, $t4 \n"  // S7+S6+T7+T6
+    "raddu.w.qb $t5, $t5 \n"  // T5+T4+S5+S4
+    "precrq.qb.ph $t6, $t0, $t2 \n"  // |S3|S1|T3|T1|
+    "precrq.qb.ph $t6, $t6, $t6 \n"  // |S3|T3|S3|T3|
+    "srl $t4, $t4, 2 \n"  // t4 / 4
+    "srl $t6, $t6, 16 \n"  // |0|0|S3|T3|
+    "raddu.w.qb $t6, $t6 \n"  // 0+0+S3+T3
+    "addu $t6, $t5, $t6 \n"
+    "mul $t6, $t6, %[c] \n"  // t6 * 0x2AAA
+    "sll $t0, $t0, 8 \n"  // |S2|S1|S0|0|
+    "sll $t2, $t2, 8 \n"  // |T2|T1|T0|0|
+    "raddu.w.qb $t0, $t0 \n"  // S2+S1+S0+0
+    "raddu.w.qb $t2, $t2 \n"  // T2+T1+T0+0
+    "addu $t0, $t0, $t2 \n"
+    "mul $t0, $t0, %[c] \n"  // t0 * 0x2AAA
+    "addiu %[src_ptr], %[src_ptr], 8 \n"
+    "addiu %[t], %[t], 8 \n"
+    "addiu %[dst_width], %[dst_width], -3 \n"
+    "addiu %[dst_ptr], %[dst_ptr], 3 \n"
+    "srl $t6, $t6, 16 \n"
+    "srl $t0, $t0, 16 \n"
+    "sb $t4, -1(%[dst_ptr]) \n"
+    "sb $t6, -2(%[dst_ptr]) \n"
+    "bgtz %[dst_width], 1b \n"
+    " sb $t0, -3(%[dst_ptr]) \n"
+    ".set pop \n"
+    : [src_ptr] "+r" (src_ptr),
+      [dst_ptr] "+r" (dst_ptr),
+      [t] "+r" (t),
+      [dst_width] "+r" (dst_width)
+    : [c] "r" (c)
+    : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
+  );
+}
+
+void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
+                                     ptrdiff_t src_stride,
+                                     uint8* dst_ptr, int dst_width) {
+  intptr_t stride = src_stride;
+  const uint8* s1 = src_ptr + stride;
+  stride += stride;
+  const uint8* s2 = src_ptr + stride;
+  const int c1 = 0x1C71;
+  const int c2 = 0x2AAA;
+
+  __asm__ __volatile__ (
+    ".set push \n"
+    ".set noreorder \n"
+
+    ".p2align 2 \n"
+    "1: \n"
+    "lw $t0, 0(%[src_ptr]) \n"  // |S3|S2|S1|S0|
+    "lw $t1, 4(%[src_ptr]) \n"  // |S7|S6|S5|S4|
+    "lw $t2, 0(%[s1]) \n"  // |T3|T2|T1|T0|
+    "lw $t3, 4(%[s1]) \n"  // |T7|T6|T5|T4|
+    "lw $t4, 0(%[s2]) \n"  // |R3|R2|R1|R0|
+    "lw $t5, 4(%[s2]) \n"  // |R7|R6|R5|R4|
+    "rotr $t1, $t1, 16 \n"  // |S5|S4|S7|S6|
+    "packrl.ph $t6, $t1, $t3 \n"  // |S7|S6|T7|T6|
+    "raddu.w.qb $t6, $t6 \n"  // S7+S6+T7+T6
+    "packrl.ph $t7, $t3, $t1 \n"  // |T5|T4|S5|S4|
+    "raddu.w.qb $t7, $t7 \n"  // T5+T4+S5+S4
+    "sll $t8, $t5, 16 \n"  // |R5|R4|0|0|
+    "raddu.w.qb $t8, $t8 \n"  // R5+R4
+    "addu $t7, $t7, $t8 \n"
+    "srl $t8, $t5, 16 \n"  // |0|0|R7|R6|
+    "raddu.w.qb $t8, $t8 \n"  // R7 + R6
+    "addu $t6, $t6, $t8 \n"
+    "mul $t6, $t6, %[c2] \n"  // t6 * 0x2AAA
+    "precrq.qb.ph $t8, $t0, $t2 \n"  // |S3|S1|T3|T1|
+    "precrq.qb.ph $t8, $t8, $t4 \n"  // |S3|T3|R3|R1|
+    "srl $t8, $t8, 8 \n"  // |0|S3|T3|R3|
+    "raddu.w.qb $t8, $t8 \n"  // S3 + T3 + R3
+    "addu $t7, $t7, $t8 \n"
+    "mul $t7, $t7, %[c1] \n"  // t7 * 0x1C71
+    "sll $t0, $t0, 8 \n"  // |S2|S1|S0|0|
+    "sll $t2, $t2, 8 \n"  // |T2|T1|T0|0|
+    "sll $t4, $t4, 8 \n"  // |R2|R1|R0|0|
+    "raddu.w.qb $t0, $t0 \n"
+    "raddu.w.qb $t2, $t2 \n"
+    "raddu.w.qb $t4, $t4 \n"
+    "addu $t0, $t0, $t2 \n"
+    "addu $t0, $t0, $t4 \n"
+    "mul $t0, $t0, %[c1] \n"  // t0 * 0x1C71
+    "addiu %[src_ptr], %[src_ptr], 8 \n"
+    "addiu %[s1], %[s1], 8 \n"
+    "addiu %[s2], %[s2], 8 \n"
+    "addiu %[dst_width], %[dst_width], -3 \n"
+    "addiu %[dst_ptr], %[dst_ptr], 3 \n"
+    "srl $t6, $t6, 16 \n"
+    "srl $t7, $t7, 16 \n"
+    "srl $t0, $t0, 16 \n"
+    "sb $t6, -1(%[dst_ptr]) \n"
+    "sb $t7, -2(%[dst_ptr]) \n"
+    "bgtz %[dst_width], 1b \n"
+    " sb $t0, -3(%[dst_ptr]) \n"
+    ".set pop \n"
+    : [src_ptr] "+r" (src_ptr),
+      [dst_ptr] "+r" (dst_ptr),
+      [s1] "+r" (s1),
+      [s2] "+r" (s2),
+      [dst_width] "+r" (dst_width)
+    : [c1] "r" (c1), [c2] "r" (c2)
+    : "t0", "t1", "t2", "t3", "t4",
+      "t5", "t6", "t7", "t8"
+  );
+}
+
+#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
+
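For orientation, a minimal scalar sketch of what the two 1/2-scale kernels in this patch compute (illustrative only, not part of the patch; the _C_Ref names are hypothetical): ScaleRowDown2_MIPS_DSPR2 keeps the even pixel of each source pair, and ScaleRowDown2Box_MIPS_DSPR2 averages each 2x2 block with rounding, which is what the raddu.w.qb/shra_r.w pairs do 16 (resp. 8) output pixels at a time. The 3/8-scale box filters replace the shift with a fixed-point reciprocal: multiplying by 0x2AAA (10922, about 65536/6) and taking the high 16 bits approximates division by 6, and 0x1C71 (7281, about 65536/9) approximates division by 9.

// Illustrative scalar equivalents; assumed behavior derived from the assembly above.
#include <stddef.h>
#include <stdint.h>

static void ScaleRowDown2_C_Ref(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                uint8_t* dst, int dst_width) {
  (void)src_stride;  // second row is not used in the point-sampling path
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[2 * x];  // keep the even pixel of each pair
  }
}

static void ScaleRowDown2Box_C_Ref(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                   uint8_t* dst, int dst_width) {
  const uint8_t* t = src_ptr + src_stride;  // second source row
  for (int x = 0; x < dst_width; ++x) {
    // Average a 2x2 block with rounding: (a + b + c + d + 2) >> 2.
    dst[x] = (uint8_t)((src_ptr[2 * x] + src_ptr[2 * x + 1] +
                        t[2 * x] + t[2 * x + 1] + 2) >> 2);
  }
}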