media/libyuv/source/rotate_mips.cc

changeset 0:6474c204b198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libyuv/source/rotate_mips.cc	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,486 @@
+/*
+ *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+                             uint8* dst, int dst_stride,
+                             int width) {
+  __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+      "sll              $t2, %[src_stride], 0x1          \n" // src_stride x 2
+      "sll              $t4, %[src_stride], 0x2          \n" // src_stride x 4
+      "sll              $t9, %[src_stride], 0x3          \n" // src_stride x 8
+      "addu             $t3, $t2, %[src_stride]          \n"
+      "addu             $t5, $t4, %[src_stride]          \n"
+      "addu             $t6, $t2, $t4                    \n"
+      "andi             $t0, %[dst], 0x3                 \n"
+      "andi             $t1, %[dst_stride], 0x3          \n"
+      "or               $t0, $t0, $t1                    \n"
+      "bnez             $t0, 11f                         \n"
+      " subu            $t7, $t9, %[src_stride]          \n" // src_stride x 7
+// dst + dst_stride word aligned
+    "1:                                                  \n"
+      "lbu              $t0, 0(%[src])                   \n"
+      "lbux             $t1, %[src_stride](%[src])       \n"
+      "lbux             $t8, $t2(%[src])                 \n"
+      "lbux             $t9, $t3(%[src])                 \n"
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s0, $t8, $t0                    \n"
+      "lbux             $t0, $t4(%[src])                 \n"
+      "lbux             $t1, $t5(%[src])                 \n"
+      "lbux             $t8, $t6(%[src])                 \n"
+      "lbux             $t9, $t7(%[src])                 \n"
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s1, $t8, $t0                    \n"
+      "sw               $s0, 0(%[dst])                   \n"
+      "addiu            %[width], -1                     \n"
+      "addiu            %[src], 1                        \n"
+      "sw               $s1, 4(%[dst])                   \n"
+      "bnez             %[width], 1b                     \n"
+      " addu            %[dst], %[dst], %[dst_stride]    \n"
+      "b                2f                               \n"
+      " nop                                              \n" // fill delay slot, as in the UV path below
+// dst + dst_stride unaligned
+   "11:                                                  \n"
+      "lbu              $t0, 0(%[src])                   \n"
+      "lbux             $t1, %[src_stride](%[src])       \n"
+      "lbux             $t8, $t2(%[src])                 \n"
+      "lbux             $t9, $t3(%[src])                 \n"
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s0, $t8, $t0                    \n"
+      "lbux             $t0, $t4(%[src])                 \n"
+      "lbux             $t1, $t5(%[src])                 \n"
+      "lbux             $t8, $t6(%[src])                 \n"
+      "lbux             $t9, $t7(%[src])                 \n"
+      "sll              $t1, $t1, 16                     \n"
+      "sll              $t9, $t9, 16                     \n"
+      "or               $t0, $t0, $t1                    \n"
+      "or               $t8, $t8, $t9                    \n"
+      "precr.qb.ph      $s1, $t8, $t0                    \n"
+      "swr              $s0, 0(%[dst])                   \n"
+      "swl              $s0, 3(%[dst])                   \n"
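+      // Each swr/swl pair performs an unaligned word store: on little-endian
+      // MIPS, swr writes the least-significant bytes starting at the base
+      // address and swl the most-significant bytes ending at base + 3.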
+      "addiu            %[width], -1                     \n"
+      "addiu            %[src], 1                        \n"
+      "swr              $s1, 4(%[dst])                   \n"
+      "swl              $s1, 7(%[dst])                   \n"
+      "bnez             %[width], 11b                    \n"
+      " addu            %[dst], %[dst], %[dst_stride]    \n"
+    "2:                                                  \n"
+      ".set pop                                          \n"
+      :[src] "+r" (src),
+       [dst] "+r" (dst),
+       [width] "+r" (width)
+      :[src_stride] "r" (src_stride),
+       [dst_stride] "r" (dst_stride)
+      : "t0", "t1", "t2", "t3", "t4", "t5",
+        "t6", "t7", "t8", "t9",
+        "s0", "s1"
+  );
+}
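+
+// For reference, a scalar sketch of the Wx8 transpose the routine above
+// implements (illustrative only; libyuv's actual portable fallback is the
+// TransposeWx8_C path elsewhere in the library):
+//
+//   for (int i = 0; i < width; ++i) {
+//     for (int j = 0; j < 8; ++j) {
+//       dst[i * dst_stride + j] = src[j * src_stride + i];
+//     }
+//   }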
+
+void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
+                                  uint8* dst, int dst_stride,
+                                  int width) {
+  __asm__ __volatile__ (
+      ".set noat                                         \n"
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+      "beqz             %[width], 2f                     \n"
+      " sll             $t2, %[src_stride], 0x1          \n"  // src_stride x 2
+      "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
+      "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
+      "addu             $t3, $t2, %[src_stride]          \n"
+      "addu             $t5, $t4, %[src_stride]          \n"
+      "addu             $t6, $t2, $t4                    \n"
+
+      "srl              $AT, %[width], 0x2               \n"
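+      // $AT counts groups of four columns: this FAST variant moves four
+      // source columns per iteration using word loads, so the caller is
+      // expected to guarantee word-aligned src/src_stride and a width that
+      // is a multiple of 4 (the srl above discards any remainder).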
+      "andi             $t0, %[dst], 0x3                 \n"
+      "andi             $t1, %[dst_stride], 0x3          \n"
+      "or               $t0, $t0, $t1                    \n"
+      "bnez             $t0, 11f                         \n"
+      " subu            $t7, $t9, %[src_stride]          \n" // src_stride x 7
+// dst + dst_stride word aligned
+      "1:                                                \n"
+      "lw               $t0, 0(%[src])                   \n"
+      "lwx              $t1, %[src_stride](%[src])       \n"
+      "lwx              $t8, $t2(%[src])                 \n"
+      "lwx              $t9, $t3(%[src])                 \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 21 | 01 | 20 | 00 |
+  // s1 = | 23 | 03 | 22 | 02 |
+  // s2 = | 31 | 11 | 30 | 10 |
+  // s3 = | 33 | 13 | 32 | 12 |
+
+      "precr.qb.ph     $s4, $s1, $s0                     \n"
+      "precrq.qb.ph    $s5, $s1, $s0                     \n"
+      "precr.qb.ph     $s6, $s3, $s2                     \n"
+      "precrq.qb.ph    $s7, $s3, $s2                     \n"
+
+  // s4 = | 03 | 02 | 01 | 00 |
+  // s5 = | 23 | 22 | 21 | 20 |
+  // s6 = | 13 | 12 | 11 | 10 |
+  // s7 = | 33 | 32 | 31 | 30 |
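+
+  // Each precr.qb.ph keeps the even (low) byte of every halfword of its two
+  // operands, and each precrq.qb.ph keeps the odd (high) byte, so two pack
+  // passes transpose a 4x4 block of bytes. A portable C sketch of one such
+  // even-byte pack step (illustrative, not part of libyuv):
+  //
+  //   static inline uint32 PackEvenBytes(uint32 hi, uint32 lo) {
+  //     return (lo & 0x000000ff) | ((lo >> 8) & 0x0000ff00) |
+  //            ((hi << 16) & 0x00ff0000) | ((hi << 8) & 0xff000000);
+  //   }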
+
+      "lwx              $t0, $t4(%[src])                 \n"
+      "lwx              $t1, $t5(%[src])                 \n"
+      "lwx              $t8, $t6(%[src])                 \n"
+      "lwx              $t9, $t7(%[src])                 \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 25 | 05 | 24 | 04 |
+  // s1 = | 27 | 07 | 26 | 06 |
+  // s2 = | 35 | 15 | 34 | 14 |
+  // s3 = | 37 | 17 | 36 | 16 |
+
+      "precr.qb.ph     $t0, $s1, $s0                     \n"
+      "precrq.qb.ph    $t1, $s1, $s0                     \n"
+      "precr.qb.ph     $t8, $s3, $s2                     \n"
+      "precrq.qb.ph    $t9, $s3, $s2                     \n"
+
+  // t0 = | 07 | 06 | 05 | 04 |
+  // t1 = | 27 | 26 | 25 | 24 |
+  // t8 = | 17 | 16 | 15 | 14 |
+  // t9 = | 37 | 36 | 35 | 34 |
+
+      "addu            $s0, %[dst], %[dst_stride]        \n"
+      "addu            $s1, $s0, %[dst_stride]           \n"
+      "addu            $s2, $s1, %[dst_stride]           \n"
+
+      "sw              $s4, 0(%[dst])                    \n"
+      "sw              $t0, 4(%[dst])                    \n"
+      "sw              $s6, 0($s0)                       \n"
+      "sw              $t8, 4($s0)                       \n"
+      "sw              $s5, 0($s1)                       \n"
+      "sw              $t1, 4($s1)                       \n"
+      "sw              $s7, 0($s2)                       \n"
+      "sw              $t9, 4($s2)                       \n"
+
+      "addiu            $AT, -1                          \n"
+      "addiu            %[src], 4                        \n"
+
+      "bnez             $AT, 1b                          \n"
+      " addu            %[dst], $s2, %[dst_stride]       \n"
+      "b                2f                               \n"
+      " nop                                              \n" // fill delay slot, as in the UV path below
+// dst + dst_stride unaligned
+      "11:                                               \n"
+      "lw               $t0, 0(%[src])                   \n"
+      "lwx              $t1, %[src_stride](%[src])       \n"
+      "lwx              $t8, $t2(%[src])                 \n"
+      "lwx              $t9, $t3(%[src])                 \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 21 | 01 | 20 | 00 |
+  // s1 = | 23 | 03 | 22 | 02 |
+  // s2 = | 31 | 11 | 30 | 10 |
+  // s3 = | 33 | 13 | 32 | 12 |
+
+      "precr.qb.ph     $s4, $s1, $s0                     \n"
+      "precrq.qb.ph    $s5, $s1, $s0                     \n"
+      "precr.qb.ph     $s6, $s3, $s2                     \n"
+      "precrq.qb.ph    $s7, $s3, $s2                     \n"
+
+  // s4 = | 03 | 02 | 01 | 00 |
+  // s5 = | 23 | 22 | 21 | 20 |
+  // s6 = | 13 | 12 | 11 | 10 |
+  // s7 = | 33 | 32 | 31 | 30 |
+
+      "lwx              $t0, $t4(%[src])                 \n"
+      "lwx              $t1, $t5(%[src])                 \n"
+      "lwx              $t8, $t6(%[src])                 \n"
+      "lwx              $t9, $t7(%[src])                 \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+      "precr.qb.ph     $s0, $t1, $t0                     \n"
+      "precr.qb.ph     $s1, $t9, $t8                     \n"
+      "precrq.qb.ph    $s2, $t1, $t0                     \n"
+      "precrq.qb.ph    $s3, $t9, $t8                     \n"
+
+  // s0 = | 25 | 05 | 24 | 04 |
+  // s1 = | 27 | 07 | 26 | 06 |
+  // s2 = | 35 | 15 | 34 | 14 |
+  // s3 = | 37 | 17 | 36 | 16 |
+
+      "precr.qb.ph     $t0, $s1, $s0                     \n"
+      "precrq.qb.ph    $t1, $s1, $s0                     \n"
+      "precr.qb.ph     $t8, $s3, $s2                     \n"
+      "precrq.qb.ph    $t9, $s3, $s2                     \n"
+
+  // t0 = | 07 | 06 | 05 | 04 |
+  // t1 = | 27 | 26 | 25 | 24 |
+  // t8 = | 17 | 16 | 15 | 14 |
+  // t9 = | 37 | 36 | 35 | 34 |
+
+      "addu            $s0, %[dst], %[dst_stride]        \n"
+      "addu            $s1, $s0, %[dst_stride]           \n"
+      "addu            $s2, $s1, %[dst_stride]           \n"
+
+      "swr              $s4, 0(%[dst])                   \n"
+      "swl              $s4, 3(%[dst])                   \n"
+      "swr              $t0, 4(%[dst])                   \n"
+      "swl              $t0, 7(%[dst])                   \n"
+      "swr              $s6, 0($s0)                      \n"
+      "swl              $s6, 3($s0)                      \n"
+      "swr              $t8, 4($s0)                      \n"
+      "swl              $t8, 7($s0)                      \n"
+      "swr              $s5, 0($s1)                      \n"
+      "swl              $s5, 3($s1)                      \n"
+      "swr              $t1, 4($s1)                      \n"
+      "swl              $t1, 7($s1)                      \n"
+      "swr              $s7, 0($s2)                      \n"
+      "swl              $s7, 3($s2)                      \n"
+      "swr              $t9, 4($s2)                      \n"
+      "swl              $t9, 7($s2)                      \n"
+
+      "addiu            $AT, -1                          \n"
+      "addiu            %[src], 4                        \n"
+
+      "bnez             $AT, 11b                         \n"
+      " addu            %[dst], $s2, %[dst_stride]       \n"
+      "2:                                                \n"
+      ".set pop                                          \n"
+      ".set at                                           \n"
+      :[src] "+r" (src),
+       [dst] "+r" (dst),
+       [width] "+r" (width)
+      :[src_stride] "r" (src_stride),
+       [dst_stride] "r" (dst_stride)
+      : "t0", "t1", "t2", "t3", "t4", "t5",
+        "t6", "t7", "t8", "t9",
+        "s0", "s1", "s2", "s3", "s4",
+        "s5", "s6", "s7"
+  );
+}
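+
+// The FAST variant presumes the preconditions noted above. A hedged sketch
+// of how a caller might choose between the two paths (the project's real
+// dispatch lives in rotate.cc, not in this file; IS_ALIGNED comes from
+// libyuv/basic_types.h):
+//
+//   void (*TransposeWx8)(const uint8*, int, uint8*, int, int) =
+//       TransposeWx8_MIPS_DSPR2;
+//   if (IS_ALIGNED(width, 4) && IS_ALIGNED(src, 4) &&
+//       IS_ALIGNED(src_stride, 4)) {
+//     TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
+//   }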
+
+void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+                               uint8* dst_a, int dst_stride_a,
+                               uint8* dst_b, int dst_stride_b,
+                               int width) {
+  __asm__ __volatile__ (
+      ".set push                                         \n"
+      ".set noreorder                                    \n"
+      "beqz            %[width], 2f                      \n"
+      " sll            $t2, %[src_stride], 0x1           \n" // src_stride x 2
+      "sll             $t4, %[src_stride], 0x2           \n" // src_stride x 4
+      "sll             $t9, %[src_stride], 0x3           \n" // src_stride x 8
+      "addu            $t3, $t2, %[src_stride]           \n"
+      "addu            $t5, $t4, %[src_stride]           \n"
+      "addu            $t6, $t2, $t4                     \n"
+      "subu            $t7, $t9, %[src_stride]           \n" // src_stride x 7
+      "srl             $t1, %[width], 1                  \n" // two UV pairs per iteration
+
+// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
+      "andi            $t0, %[dst_a], 0x3                \n"
+      "andi            $t8, %[dst_b], 0x3                \n"
+      "or              $t0, $t0, $t8                     \n"
+      "andi            $t8, %[dst_stride_a], 0x3         \n"
+      "andi            $s5, %[dst_stride_b], 0x3         \n"
+      "or              $t8, $t8, $s5                     \n"
+      "or              $t0, $t0, $t8                     \n"
+      "bnez            $t0, 11f                          \n"
+      " nop                                              \n"
+// dst + dst_stride word aligned (both, a & b dst addresses)
+    "1:                                                  \n"
+      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
+      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
+      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
+      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
+      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
+      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
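+      // The sll by 16 plus packrl.ph gathers the low |b|a| halfwords of two
+      // rows into one word, mirroring what precrq.ph.w did above for the
+      // high |B|A| halfwords.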
+
+      "sw              $s3, 0($s5)                       \n"
+      "sw              $s4, 0($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
+
+      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
+      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
+      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
+      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
+      "sw              $s3, 0(%[dst_a])                  \n"
+      "sw              $s4, 0(%[dst_b])                  \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B7|A7|B6|A6|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
+      "sw              $s3, 4($s5)                       \n"
+      "sw              $s4, 4($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
+
+      "addiu           %[src], 4                         \n"
+      "addiu           $t1, -1                           \n"
+      "sll             $t0, %[dst_stride_a], 1           \n"
+      "sll             $t8, %[dst_stride_b], 1           \n"
+      "sw              $s3, 4(%[dst_a])                  \n"
+      "sw              $s4, 4(%[dst_b])                  \n"
+      "addu            %[dst_a], %[dst_a], $t0           \n"
+      "bnez            $t1, 1b                           \n"
+      " addu           %[dst_b], %[dst_b], $t8           \n"
+      "b               2f                                \n"
+      " nop                                              \n"
+
+// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
+   "11:                                                  \n"
+      "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
+      "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
+      "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
+      "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
+      "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
+      "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
+
+      "swr             $s3, 0($s5)                       \n"
+      "swl             $s3, 3($s5)                       \n"
+      "swr             $s4, 0($s6)                       \n"
+      "swl             $s4, 3($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
+
+      "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
+      "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
+      "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
+      "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
+      "swr             $s3, 0(%[dst_a])                  \n"
+      "swl             $s3, 3(%[dst_a])                  \n"
+      "swr             $s4, 0(%[dst_b])                  \n"
+      "swl             $s4, 3(%[dst_b])                  \n"
+
+      "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
+      "precrq.ph.w     $s2, $s0, $t9                     \n" // |B7|A7|B6|A6|
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
+
+      "sll             $t0, $t0, 16                      \n"
+      "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
+      "sll             $t9, $t9, 16                      \n"
+      "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
+
+      "swr             $s3, 4($s5)                       \n"
+      "swl             $s3, 7($s5)                       \n"
+      "swr             $s4, 4($s6)                       \n"
+      "swl             $s4, 7($s6)                       \n"
+
+      "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
+      "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
+
+      "addiu           %[src], 4                         \n"
+      "addiu           $t1, -1                           \n"
+      "sll             $t0, %[dst_stride_a], 1           \n"
+      "sll             $t8, %[dst_stride_b], 1           \n"
+      "swr             $s3, 4(%[dst_a])                  \n"
+      "swl             $s3, 7(%[dst_a])                  \n"
+      "swr             $s4, 4(%[dst_b])                  \n"
+      "swl             $s4, 7(%[dst_b])                  \n"
+      "addu            %[dst_a], %[dst_a], $t0           \n"
+      "bnez            $t1, 11b                          \n"
+      " addu           %[dst_b], %[dst_b], $t8           \n"
+
+      "2:                                                \n"
+      ".set pop                                          \n"
+      : [src] "+r" (src),
+        [dst_a] "+r" (dst_a),
+        [dst_b] "+r" (dst_b),
+        [width] "+r" (width),
+        [src_stride] "+r" (src_stride)
+      : [dst_stride_a] "r" (dst_stride_a),
+        [dst_stride_b] "r" (dst_stride_b)
+      : "t0", "t1", "t2", "t3", "t4", "t5",
+        "t6", "t7", "t8", "t9",
+        "s0", "s1", "s2", "s3",
+        "s4", "s5", "s6"
+  );
+}
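+
+// Scalar sketch of the interleaved-UV transpose above (illustrative only;
+// the portable TransposeUVWx8_C path lives elsewhere in the library):
+//
+//   for (int i = 0; i < width; ++i) {   // width counts (u, v) byte pairs
+//     for (int j = 0; j < 8; ++j) {
+//       dst_a[i * dst_stride_a + j] = src[j * src_stride + 2 * i + 0];
+//       dst_b[i * dst_stride_b + j] = src[j * src_stride + 2 * i + 1];
+//     }
+//   }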
+
+#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+}  // extern "C"
+}  // namespace libyuv
+#endif
