media/libyuv/source/scale_mips.cc

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libyuv/source/scale_mips.cc	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,653 @@
     1.4 +/*
     1.5 + *  Copyright 2012 The LibYuv Project Authors. All rights reserved.
     1.6 + *
     1.7 + *  Use of this source code is governed by a BSD-style license
     1.8 + *  that can be found in the LICENSE file in the root of the source
     1.9 + *  tree. An additional intellectual property rights grant can be found
    1.10 + *  in the file PATENTS. All contributing project authors may
    1.11 + *  be found in the AUTHORS file in the root of the source tree.
    1.12 + */
    1.13 +
    1.14 +#include "libyuv/basic_types.h"
    1.15 +#include "libyuv/row.h"
    1.16 +
    1.17 +#ifdef __cplusplus
    1.18 +namespace libyuv {
    1.19 +extern "C" {
    1.20 +#endif
    1.21 +
    1.22 +// This module is for GCC MIPS DSPR2
    1.23 +#if !defined(LIBYUV_DISABLE_MIPS) && \
    1.24 +    defined(__mips_dsp) && (__mips_dsp_rev >= 2)
     1.25 +// Halves a row by point sampling: dst[i] = src_ptr[2 * i]; src_stride is unused.
     1.26 +void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
     1.27 +                              uint8* dst, int dst_width) {
     1.28 +  __asm__ __volatile__(
     1.29 +    ".set push                                     \n"
     1.30 +    ".set noreorder                                \n"  // delay slots are written by hand
     1.31 +    // $t9 counts 16-pixel groups; the unrolled loop is skipped when there are none.
     1.32 +    "srl            $t9, %[dst_width], 4           \n"  // iterations -> by 16
     1.33 +    "beqz           $t9, 2f                        \n"
     1.34 +    " nop                                          \n"
     1.35 +    // Unrolled loop: 32 source bytes in, 16 destination bytes out per pass.
     1.36 +    ".p2align       2                              \n"
     1.37 +  "1:                                              \n"
     1.38 +    "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
     1.39 +    "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
     1.40 +    "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
     1.41 +    "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
     1.42 +    "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
     1.43 +    "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
     1.44 +    "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
     1.45 +    "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
     1.46 +    // TODO(fbarchard): Use odd pixels instead of even.
     1.47 +    "precr.qb.ph    $t8, $t1, $t0                  \n"  // |6|4|2|0|
     1.48 +    "precr.qb.ph    $t0, $t3, $t2                  \n"  // |14|12|10|8|
     1.49 +    "precr.qb.ph    $t1, $t5, $t4                  \n"  // |22|20|18|16|
     1.50 +    "precr.qb.ph    $t2, $t7, $t6                  \n"  // |30|28|26|24|
     1.51 +    "addiu          %[src_ptr], %[src_ptr], 32     \n"
     1.52 +    "addiu          $t9, $t9, -1                   \n"
     1.53 +    "sw             $t8, 0(%[dst])                 \n"
     1.54 +    "sw             $t0, 4(%[dst])                 \n"
     1.55 +    "sw             $t1, 8(%[dst])                 \n"
     1.56 +    "sw             $t2, 12(%[dst])                \n"
     1.57 +    "bgtz           $t9, 1b                        \n"
     1.58 +    " addiu         %[dst], %[dst], 16             \n"  // delay slot: advance dst
     1.59 +    // Remaining dst_width % 16 pixels are copied one byte at a time.
     1.60 +  "2:                                              \n"
     1.61 +    "andi           $t9, %[dst_width], 0xf         \n"  // residue
     1.62 +    "beqz           $t9, 3f                        \n"
     1.63 +    " nop                                          \n"
     1.64 +
     1.65 +  "21:                                             \n"
     1.66 +    "lbu            $t0, 0(%[src_ptr])             \n"
     1.67 +    "addiu          %[src_ptr], %[src_ptr], 2      \n"  // skip the odd source byte
     1.68 +    "addiu          $t9, $t9, -1                   \n"
     1.69 +    "sb             $t0, 0(%[dst])                 \n"
     1.70 +    "bgtz           $t9, 21b                       \n"
     1.71 +    " addiu         %[dst], %[dst], 1              \n"  // delay slot: advance dst
     1.72 +
     1.73 +  "3:                                              \n"
     1.74 +    ".set pop                                      \n"
     1.75 +  : [src_ptr] "+r" (src_ptr),
     1.76 +    [dst] "+r" (dst)
     1.77 +  : [dst_width] "r" (dst_width)
     1.78 +  : "t0", "t1", "t2", "t3", "t4", "t5",
     1.79 +    "t6", "t7", "t8", "t9"
     1.80 +  );
     1.81 +}
    1.82 +
    1.83 +void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    1.84 +                                 uint8* dst, int dst_width) {
    1.85 +  const uint8* t = src_ptr + src_stride;
    1.86 +
    1.87 +  __asm__ __volatile__ (
    1.88 +    ".set push                                    \n"
    1.89 +    ".set noreorder                               \n"
    1.90 +
    1.91 +    "srl            $t9, %[dst_width], 3          \n"  // iterations -> step 8
    1.92 +    "bltz           $t9, 2f                       \n"
    1.93 +    " nop                                         \n"
    1.94 +
    1.95 +    ".p2align       2                             \n"
    1.96 +  "1:                                             \n"
    1.97 +    "lw             $t0, 0(%[src_ptr])            \n"  // |3|2|1|0|
    1.98 +    "lw             $t1, 4(%[src_ptr])            \n"  // |7|6|5|4|
    1.99 +    "lw             $t2, 8(%[src_ptr])            \n"  // |11|10|9|8|
   1.100 +    "lw             $t3, 12(%[src_ptr])           \n"  // |15|14|13|12|
   1.101 +    "lw             $t4, 0(%[t])                  \n"  // |19|18|17|16|
   1.102 +    "lw             $t5, 4(%[t])                  \n"  // |23|22|21|20|
   1.103 +    "lw             $t6, 8(%[t])                  \n"  // |27|26|25|24|
   1.104 +    "lw             $t7, 12(%[t])                 \n"  // |31|30|29|28|
   1.105 +    "addiu          $t9, $t9, -1                  \n"
   1.106 +    "srl            $t8, $t0, 16                  \n"  // |X|X|3|2|
   1.107 +    "ins            $t0, $t4, 16, 16              \n"  // |17|16|1|0|
   1.108 +    "ins            $t4, $t8, 0, 16               \n"  // |19|18|3|2|
   1.109 +    "raddu.w.qb     $t0, $t0                      \n"  // |17+16+1+0|
   1.110 +    "raddu.w.qb     $t4, $t4                      \n"  // |19+18+3+2|
   1.111 +    "shra_r.w       $t0, $t0, 2                   \n"  // |t0+2|>>2
   1.112 +    "shra_r.w       $t4, $t4, 2                   \n"  // |t4+2|>>2
   1.113 +    "srl            $t8, $t1, 16                  \n"  // |X|X|7|6|
   1.114 +    "ins            $t1, $t5, 16, 16              \n"  // |21|20|5|4|
   1.115 +    "ins            $t5, $t8, 0, 16               \n"  // |22|23|7|6|
   1.116 +    "raddu.w.qb     $t1, $t1                      \n"  // |21+20+5+4|
   1.117 +    "raddu.w.qb     $t5, $t5                      \n"  // |23+22+7+6|
   1.118 +    "shra_r.w       $t1, $t1, 2                   \n"  // |t1+2|>>2
   1.119 +    "shra_r.w       $t5, $t5, 2                   \n"  // |t5+2|>>2
   1.120 +    "srl            $t8, $t2, 16                  \n"  // |X|X|11|10|
   1.121 +    "ins            $t2, $t6, 16, 16              \n"  // |25|24|9|8|
   1.122 +    "ins            $t6, $t8, 0, 16               \n"  // |27|26|11|10|
   1.123 +    "raddu.w.qb     $t2, $t2                      \n"  // |25+24+9+8|
   1.124 +    "raddu.w.qb     $t6, $t6                      \n"  // |27+26+11+10|
   1.125 +    "shra_r.w       $t2, $t2, 2                   \n"  // |t2+2|>>2
   1.126 +    "shra_r.w       $t6, $t6, 2                   \n"  // |t5+2|>>2
   1.127 +    "srl            $t8, $t3, 16                  \n"  // |X|X|15|14|
   1.128 +    "ins            $t3, $t7, 16, 16              \n"  // |29|28|13|12|
   1.129 +    "ins            $t7, $t8, 0, 16               \n"  // |31|30|15|14|
   1.130 +    "raddu.w.qb     $t3, $t3                      \n"  // |29+28+13+12|
   1.131 +    "raddu.w.qb     $t7, $t7                      \n"  // |31+30+15+14|
   1.132 +    "shra_r.w       $t3, $t3, 2                   \n"  // |t3+2|>>2
   1.133 +    "shra_r.w       $t7, $t7, 2                   \n"  // |t7+2|>>2
   1.134 +    "addiu          %[src_ptr], %[src_ptr], 16    \n"
   1.135 +    "addiu          %[t], %[t], 16                \n"
   1.136 +    "sb             $t0, 0(%[dst])                \n"
   1.137 +    "sb             $t4, 1(%[dst])                \n"
   1.138 +    "sb             $t1, 2(%[dst])                \n"
   1.139 +    "sb             $t5, 3(%[dst])                \n"
   1.140 +    "sb             $t2, 4(%[dst])                \n"
   1.141 +    "sb             $t6, 5(%[dst])                \n"
   1.142 +    "sb             $t3, 6(%[dst])                \n"
   1.143 +    "sb             $t7, 7(%[dst])                \n"
   1.144 +    "bgtz           $t9, 1b                       \n"
   1.145 +    " addiu         %[dst], %[dst], 8             \n"
   1.146 +
   1.147 +  "2:                                             \n"
   1.148 +    "andi           $t9, %[dst_width], 0x7        \n"  // x = residue
   1.149 +    "beqz           $t9, 3f                       \n"
   1.150 +    " nop                                         \n"
   1.151 +
   1.152 +    "21:                                          \n"
   1.153 +    "lwr            $t1, 0(%[src_ptr])            \n"
   1.154 +    "lwl            $t1, 3(%[src_ptr])            \n"
   1.155 +    "lwr            $t2, 0(%[t])                  \n"
   1.156 +    "lwl            $t2, 3(%[t])                  \n"
   1.157 +    "srl            $t8, $t1, 16                  \n"
   1.158 +    "ins            $t1, $t2, 16, 16              \n"
   1.159 +    "ins            $t2, $t8, 0, 16               \n"
   1.160 +    "raddu.w.qb     $t1, $t1                      \n"
   1.161 +    "raddu.w.qb     $t2, $t2                      \n"
   1.162 +    "shra_r.w       $t1, $t1, 2                   \n"
   1.163 +    "shra_r.w       $t2, $t2, 2                   \n"
   1.164 +    "sb             $t1, 0(%[dst])                \n"
   1.165 +    "sb             $t2, 1(%[dst])                \n"
   1.166 +    "addiu          %[src_ptr], %[src_ptr], 4     \n"
   1.167 +    "addiu          $t9, $t9, -2                  \n"
   1.168 +    "addiu          %[t], %[t], 4                 \n"
   1.169 +    "bgtz           $t9, 21b                      \n"
   1.170 +    " addiu         %[dst], %[dst], 2             \n"
   1.171 +
   1.172 +  "3:                                             \n"
   1.173 +    ".set pop                                     \n"
   1.174 +
   1.175 +  : [src_ptr] "+r" (src_ptr),
   1.176 +    [dst] "+r" (dst), [t] "+r" (t)
   1.177 +  : [dst_width] "r" (dst_width)
   1.178 +  : "t0", "t1", "t2", "t3", "t4", "t5",
   1.179 +    "t6", "t7", "t8", "t9"
   1.180 +  );
   1.181 +}
    1.182 +// Quarter-width point sampling: dst[i] = src_ptr[4 * i]; src_stride is unused.
    1.183 +void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    1.184 +                              uint8* dst, int dst_width) {
    1.185 +  __asm__ __volatile__ (
    1.186 +      ".set push                                    \n"
    1.187 +      ".set noreorder                               \n"
    1.188 +      // $t9 counts 8-pixel groups; the unrolled loop is skipped when there are none.
    1.189 +      "srl            $t9, %[dst_width], 3          \n"
    1.190 +      "beqz           $t9, 2f                       \n"
    1.191 +      " nop                                         \n"
    1.192 +      // Unrolled loop: 32 source bytes in, 8 destination bytes out per pass.
    1.193 +      ".p2align       2                             \n"
    1.194 +     "1:                                            \n"
    1.195 +      "lw             $t1, 0(%[src_ptr])            \n"  // |3|2|1|0|
    1.196 +      "lw             $t2, 4(%[src_ptr])            \n"  // |7|6|5|4|
    1.197 +      "lw             $t3, 8(%[src_ptr])            \n"  // |11|10|9|8|
    1.198 +      "lw             $t4, 12(%[src_ptr])           \n"  // |15|14|13|12|
    1.199 +      "lw             $t5, 16(%[src_ptr])           \n"  // |19|18|17|16|
    1.200 +      "lw             $t6, 20(%[src_ptr])           \n"  // |23|22|21|20|
    1.201 +      "lw             $t7, 24(%[src_ptr])           \n"  // |27|26|25|24|
    1.202 +      "lw             $t8, 28(%[src_ptr])           \n"  // |31|30|29|28|
    1.203 +      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |6|4|2|0|
    1.204 +      "precr.qb.ph    $t2, $t4, $t3                 \n"  // |14|12|10|8|
    1.205 +      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |22|20|18|16|
    1.206 +      "precr.qb.ph    $t6, $t8, $t7                 \n"  // |30|28|26|24|
    1.207 +      "precr.qb.ph    $t1, $t2, $t1                 \n"  // |12|8|4|0|
    1.208 +      "precr.qb.ph    $t5, $t6, $t5                 \n"  // |28|24|20|16|
    1.209 +      "addiu          %[src_ptr], %[src_ptr], 32    \n"
    1.210 +      "addiu          $t9, $t9, -1                  \n"
    1.211 +      "sw             $t1, 0(%[dst])                \n"
    1.212 +      "sw             $t5, 4(%[dst])                \n"
    1.213 +      "bgtz           $t9, 1b                       \n"
    1.214 +      " addiu         %[dst], %[dst], 8             \n"  // delay slot: advance dst
    1.215 +      // Remaining dst_width % 8 pixels are copied one byte at a time.
    1.216 +    "2:                                             \n"
    1.217 +      "andi           $t9, %[dst_width], 7          \n"  // residue
    1.218 +      "beqz           $t9, 3f                       \n"
    1.219 +      " nop                                         \n"
    1.220 +
    1.221 +    "21:                                            \n"
    1.222 +      "lbu            $t1, 0(%[src_ptr])            \n"
    1.223 +      "addiu          %[src_ptr], %[src_ptr], 4     \n"  // skip the three unused source bytes
    1.224 +      "addiu          $t9, $t9, -1                  \n"
    1.225 +      "sb             $t1, 0(%[dst])                \n"
    1.226 +      "bgtz           $t9, 21b                      \n"
    1.227 +      " addiu         %[dst], %[dst], 1             \n"  // delay slot: advance dst
    1.228 +
    1.229 +    "3:                                             \n"
    1.230 +      ".set pop                                     \n"
    1.231 +      : [src_ptr] "+r" (src_ptr),
    1.232 +        [dst] "+r" (dst)
    1.233 +      : [dst_width] "r" (dst_width)
    1.234 +      : "t1", "t2", "t3", "t4", "t5",
    1.235 +        "t6", "t7", "t8", "t9"
    1.236 +  );
    1.237 +}
   1.238 +
   1.239 +void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
   1.240 +                                 uint8* dst, int dst_width) {
   1.241 +  intptr_t stride = src_stride;
   1.242 +  const uint8* s1 = src_ptr + stride;
   1.243 +  const uint8* s2 = s1 + stride;
   1.244 +  const uint8* s3 = s2 + stride;
   1.245 +
   1.246 +  __asm__ __volatile__ (
   1.247 +      ".set push                                  \n"
   1.248 +      ".set noreorder                             \n"
   1.249 +
   1.250 +      "srl           $t9, %[dst_width], 1         \n"
   1.251 +      "andi          $t8, %[dst_width], 1         \n"
   1.252 +
   1.253 +      ".p2align      2                            \n"
   1.254 +     "1:                                          \n"
   1.255 +      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
   1.256 +      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
   1.257 +      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
   1.258 +      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
   1.259 +      "lw            $t4, 4(%[src_ptr])           \n"  // |19|18|17|16|
   1.260 +      "lw            $t5, 4(%[s1])                \n"  // |23|22|21|20|
   1.261 +      "lw            $t6, 4(%[s2])                \n"  // |27|26|25|24|
   1.262 +      "lw            $t7, 4(%[s3])                \n"  // |31|30|29|28|
   1.263 +      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
   1.264 +      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
   1.265 +      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
   1.266 +      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
   1.267 +      "raddu.w.qb    $t4, $t4                     \n"  // |19 + 18 + 17 + 16|
   1.268 +      "raddu.w.qb    $t5, $t5                     \n"  // |23 + 22 + 21 + 20|
   1.269 +      "raddu.w.qb    $t6, $t6                     \n"  // |27 + 26 + 25 + 24|
   1.270 +      "raddu.w.qb    $t7, $t7                     \n"  // |31 + 30 + 29 + 28|
   1.271 +      "add           $t0, $t0, $t1                \n"
   1.272 +      "add           $t1, $t2, $t3                \n"
   1.273 +      "add           $t0, $t0, $t1                \n"
   1.274 +      "add           $t4, $t4, $t5                \n"
   1.275 +      "add           $t6, $t6, $t7                \n"
   1.276 +      "add           $t4, $t4, $t6                \n"
   1.277 +      "shra_r.w      $t0, $t0, 4                  \n"
   1.278 +      "shra_r.w      $t4, $t4, 4                  \n"
   1.279 +      "sb            $t0, 0(%[dst])               \n"
   1.280 +      "sb            $t4, 1(%[dst])               \n"
   1.281 +      "addiu         %[src_ptr], %[src_ptr], 8    \n"
   1.282 +      "addiu         %[s1], %[s1], 8              \n"
   1.283 +      "addiu         %[s2], %[s2], 8              \n"
   1.284 +      "addiu         %[s3], %[s3], 8              \n"
   1.285 +      "addiu         $t9, $t9, -1                 \n"
   1.286 +      "bgtz          $t9, 1b                      \n"
   1.287 +      " addiu        %[dst], %[dst], 2            \n"
   1.288 +      "beqz          $t8, 2f                      \n"
   1.289 +      " nop                                       \n"
   1.290 +
   1.291 +      "lw            $t0, 0(%[src_ptr])           \n"  // |3|2|1|0|
   1.292 +      "lw            $t1, 0(%[s1])                \n"  // |7|6|5|4|
   1.293 +      "lw            $t2, 0(%[s2])                \n"  // |11|10|9|8|
   1.294 +      "lw            $t3, 0(%[s3])                \n"  // |15|14|13|12|
   1.295 +      "raddu.w.qb    $t0, $t0                     \n"  // |3 + 2 + 1 + 0|
   1.296 +      "raddu.w.qb    $t1, $t1                     \n"  // |7 + 6 + 5 + 4|
   1.297 +      "raddu.w.qb    $t2, $t2                     \n"  // |11 + 10 + 9 + 8|
   1.298 +      "raddu.w.qb    $t3, $t3                     \n"  // |15 + 14 + 13 + 12|
   1.299 +      "add           $t0, $t0, $t1                \n"
   1.300 +      "add           $t1, $t2, $t3                \n"
   1.301 +      "add           $t0, $t0, $t1                \n"
   1.302 +      "shra_r.w      $t0, $t0, 4                  \n"
   1.303 +      "sb            $t0, 0(%[dst])               \n"
   1.304 +
   1.305 +      "2:                                         \n"
   1.306 +      ".set pop                                   \n"
   1.307 +
   1.308 +      : [src_ptr] "+r" (src_ptr),
   1.309 +        [dst] "+r" (dst),
   1.310 +        [s1] "+r" (s1),
   1.311 +        [s2] "+r" (s2),
   1.312 +        [s3] "+r" (s3)
   1.313 +      : [dst_width] "r" (dst_width)
   1.314 +      : "t0", "t1", "t2", "t3", "t4", "t5",
   1.315 +        "t6","t7", "t8", "t9"
   1.316 +  );
   1.317 +}
    1.318 +// 3/4-width point sampling: of every 4 source bytes, bytes 0, 1 and 3 are kept.
    1.319 +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    1.320 +                               uint8* dst, int dst_width) {
    1.321 +  __asm__ __volatile__ (
    1.322 +      ".set push                                          \n"
    1.323 +      ".set noreorder                                     \n"
    1.324 +      ".p2align        2                                  \n"
    1.325 +    "1:                                                   \n"
    1.326 +      "lw              $t1, 0(%[src_ptr])                 \n"  // |3|2|1|0|
    1.327 +      "lw              $t2, 4(%[src_ptr])                 \n"  // |7|6|5|4|
    1.328 +      "lw              $t3, 8(%[src_ptr])                 \n"  // |11|10|9|8|
    1.329 +      "lw              $t4, 12(%[src_ptr])                \n"  // |15|14|13|12|
    1.330 +      "lw              $t5, 16(%[src_ptr])                \n"  // |19|18|17|16|
    1.331 +      "lw              $t6, 20(%[src_ptr])                \n"  // |23|22|21|20|
    1.332 +      "lw              $t7, 24(%[src_ptr])                \n"  // |27|26|25|24|
    1.333 +      "lw              $t8, 28(%[src_ptr])                \n"  // |31|30|29|28|
    1.334 +      "precrq.qb.ph    $t0, $t2, $t4                      \n"  // |7|5|15|13|
    1.335 +      "precrq.qb.ph    $t9, $t6, $t8                      \n"  // |23|21|31|29|
    1.336 +      "addiu           %[dst_width], %[dst_width], -24    \n"  // 24 outputs per pass
    1.337 +      "ins             $t1, $t1, 8, 16                    \n"  // |3|1|0|X|
    1.338 +      "ins             $t4, $t0, 8, 16                    \n"  // |X|15|13|12|
    1.339 +      "ins             $t5, $t5, 8, 16                    \n"  // |19|17|16|X|
    1.340 +      "ins             $t8, $t9, 8, 16                    \n"  // |X|31|29|28|
    1.341 +      "addiu           %[src_ptr], %[src_ptr], 32         \n"
    1.342 +      "packrl.ph       $t0, $t3, $t0                      \n"  // |9|8|7|5|
    1.343 +      "packrl.ph       $t9, $t7, $t9                      \n"  // |25|24|23|21|
    1.344 +      "prepend         $t1, $t2, 8                        \n"  // |4|3|1|0|
    1.345 +      "prepend         $t3, $t4, 24                       \n"  // |15|13|12|11|
    1.346 +      "prepend         $t5, $t6, 8                        \n"  // |20|19|17|16|
    1.347 +      "prepend         $t7, $t8, 24                       \n"  // |31|29|28|27|
    1.348 +      "sw              $t1, 0(%[dst])                     \n"
    1.349 +      "sw              $t0, 4(%[dst])                     \n"
    1.350 +      "sw              $t3, 8(%[dst])                     \n"
    1.351 +      "sw              $t5, 12(%[dst])                    \n"
    1.352 +      "sw              $t9, 16(%[dst])                    \n"
    1.353 +      "sw              $t7, 20(%[dst])                    \n"
    1.354 +      "bnez            %[dst_width], 1b                   \n"  // exits only at exactly 0; NOTE(review): presumably dst_width is a positive multiple of 24 - confirm
    1.355 +      " addiu          %[dst], %[dst], 24                 \n"  // delay slot: advance dst
    1.356 +      ".set pop                                           \n"
    1.357 +      : [src_ptr] "+r" (src_ptr),
    1.358 +        [dst] "+r" (dst),
    1.359 +        [dst_width] "+r" (dst_width)
    1.360 +      :
    1.361 +      : "t0", "t1", "t2", "t3", "t4", "t5",
    1.362 +        "t6","t7", "t8", "t9"
    1.363 +  );
    1.364 +}
    1.365 +// 3/4-width filter: 4 source bytes -> 3 outputs, rows weighted 3:1 (src_ptr row : next row).
    1.366 +void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    1.367 +                                     uint8* d, int dst_width) {
    1.368 +  __asm__ __volatile__ (
    1.369 +      ".set push                                         \n"
    1.370 +      ".set noreorder                                    \n"
    1.371 +      "repl.ph           $t3, 3                          \n"  // 0x00030003
    1.372 +      // Each pass reads 4 bytes from both rows (S and T) and writes 3 filtered bytes.
    1.373 +     ".p2align           2                               \n"
    1.374 +    "1:                                                  \n"
    1.375 +      "lw                $t0, 0(%[src_ptr])              \n"  // |S3|S2|S1|S0|
    1.376 +      "lwx               $t1, %[src_stride](%[src_ptr])  \n"  // |T3|T2|T1|T0|
    1.377 +      "rotr              $t2, $t0, 8                     \n"  // |S0|S3|S2|S1|
    1.378 +      "rotr              $t6, $t1, 8                     \n"  // |T0|T3|T2|T1|
    1.379 +      "muleu_s.ph.qbl    $t4, $t2, $t3                   \n"  // |S0*3|S3*3|
    1.380 +      "muleu_s.ph.qbl    $t5, $t6, $t3                   \n"  // |T0*3|T3*3|
    1.381 +      "andi              $t0, $t2, 0xFFFF                \n"  // |0|0|S2|S1|
    1.382 +      "andi              $t1, $t6, 0xFFFF                \n"  // |0|0|T2|T1|
    1.383 +      "raddu.w.qb        $t0, $t0                        \n"  // S1+S2
    1.384 +      "raddu.w.qb        $t1, $t1                        \n"  // T1+T2
    1.385 +      "shra_r.w          $t0, $t0, 1                     \n"  // (S1+S2+1)>>1
    1.386 +      "shra_r.w          $t1, $t1, 1                     \n"  // (T1+T2+1)>>1
    1.387 +      "preceu.ph.qbr     $t2, $t2                        \n"  // |0|S2|0|S1|
    1.388 +      "preceu.ph.qbr     $t6, $t6                        \n"  // |0|T2|0|T1|
    1.389 +      "rotr              $t2, $t2, 16                    \n"  // |0|S1|0|S2|
    1.390 +      "rotr              $t6, $t6, 16                    \n"  // |0|T1|0|T2|
    1.391 +      "addu.ph           $t2, $t2, $t4                   \n"  // |S0*3+S1|S3*3+S2|
    1.392 +      "addu.ph           $t6, $t6, $t5                   \n"  // |T0*3+T1|T3*3+T2|
    1.393 +      "sll               $t5, $t0, 1                     \n"
    1.394 +      "add               $t0, $t5, $t0                   \n"  // 3 * S middle value
    1.395 +      "shra_r.ph         $t2, $t2, 2                     \n"  // rounded >>2 per halfword
    1.396 +      "shra_r.ph         $t6, $t6, 2                     \n"  // rounded >>2 per halfword
    1.397 +      "shll.ph           $t4, $t2, 1                     \n"
    1.398 +      "addq.ph           $t4, $t4, $t2                   \n"  // 3 * S edge values
    1.399 +      "addu              $t0, $t0, $t1                   \n"  // 3 * S middle + T middle
    1.400 +      "addiu             %[src_ptr], %[src_ptr], 4       \n"
    1.401 +      "shra_r.w          $t0, $t0, 2                     \n"  // rounded >>2 -> middle output
    1.402 +      "addu.ph           $t6, $t6, $t4                   \n"  // T edges + 3 * S edges
    1.403 +      "shra_r.ph         $t6, $t6, 2                     \n"  // rounded >>2 -> edge outputs
    1.404 +      "srl               $t1, $t6, 16                    \n"  // upper halfword = first output
    1.405 +      "addiu             %[dst_width], %[dst_width], -3  \n"  // 3 outputs per pass
    1.406 +      "sb                $t1, 0(%[d])                    \n"
    1.407 +      "sb                $t0, 1(%[d])                    \n"
    1.408 +      "sb                $t6, 2(%[d])                    \n"
    1.409 +      "bgtz              %[dst_width], 1b                \n"  // body always runs at least once
    1.410 +      " addiu            %[d], %[d], 3                   \n"  // delay slot: advance d
    1.411 +    "3:                                                  \n"  // label is not referenced in this block
    1.412 +      ".set pop                                          \n"
    1.413 +      : [src_ptr] "+r" (src_ptr),
    1.414 +        [src_stride] "+r" (src_stride),
    1.415 +        [d] "+r" (d),
    1.416 +        [dst_width] "+r" (dst_width)
    1.417 +      :
    1.418 +      : "t0", "t1", "t2", "t3",
    1.419 +        "t4", "t5", "t6"
    1.420 +  );
    1.421 +}
    1.422 +// 3/4-width filter: 4 source bytes -> 3 outputs, both rows (S and T) weighted equally.
    1.423 +void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    1.424 +                                     uint8* d, int dst_width) {
    1.425 +  __asm__ __volatile__ (
    1.426 +      ".set push                                           \n"
    1.427 +      ".set noreorder                                      \n"
    1.428 +      "repl.ph           $t2, 3                            \n"  // 0x00030003
    1.429 +      // Each pass reads 4 bytes from both rows (S and T) and writes 3 filtered bytes.
    1.430 +      ".p2align          2                                 \n"
    1.431 +    "1:                                                    \n"
    1.432 +      "lw                $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
    1.433 +      "lwx               $t1, %[src_stride](%[src_ptr])    \n"  // |T3|T2|T1|T0|
    1.434 +      "rotr              $t4, $t0, 8                       \n"  // |S0|S3|S2|S1|
    1.435 +      "rotr              $t6, $t1, 8                       \n"  // |T0|T3|T2|T1|
    1.436 +      "muleu_s.ph.qbl    $t3, $t4, $t2                     \n"  // |S0*3|S3*3|
    1.437 +      "muleu_s.ph.qbl    $t5, $t6, $t2                     \n"  // |T0*3|T3*3|
    1.438 +      "andi              $t0, $t4, 0xFFFF                  \n"  // |0|0|S2|S1|
    1.439 +      "andi              $t1, $t6, 0xFFFF                  \n"  // |0|0|T2|T1|
    1.440 +      "raddu.w.qb        $t0, $t0                          \n"  // S1+S2
    1.441 +      "raddu.w.qb        $t1, $t1                          \n"  // T1+T2
    1.442 +      "shra_r.w          $t0, $t0, 1                       \n"  // (S1+S2+1)>>1
    1.443 +      "shra_r.w          $t1, $t1, 1                       \n"  // (T1+T2+1)>>1
    1.444 +      "preceu.ph.qbr     $t4, $t4                          \n"  // |0|S2|0|S1|
    1.445 +      "preceu.ph.qbr     $t6, $t6                          \n"  // |0|T2|0|T1|
    1.446 +      "rotr              $t4, $t4, 16                      \n"  // |0|S1|0|S2|
    1.447 +      "rotr              $t6, $t6, 16                      \n"  // |0|T1|0|T2|
    1.448 +      "addu.ph           $t4, $t4, $t3                     \n"  // |S0*3+S1|S3*3+S2|
    1.449 +      "addu.ph           $t6, $t6, $t5                     \n"  // |T0*3+T1|T3*3+T2|
    1.450 +      "shra_r.ph         $t6, $t6, 2                       \n"  // rounded >>2 per halfword
    1.451 +      "shra_r.ph         $t4, $t4, 2                       \n"  // rounded >>2 per halfword
    1.452 +      "addu.ph           $t6, $t6, $t4                     \n"  // S edges + T edges
    1.453 +      "addiu             %[src_ptr], %[src_ptr], 4         \n"
    1.454 +      "shra_r.ph         $t6, $t6, 1                       \n"  // rounded >>1 -> edge outputs
    1.455 +      "addu              $t0, $t0, $t1                     \n"  // S middle + T middle
    1.456 +      "addiu             %[dst_width], %[dst_width], -3    \n"  // 3 outputs per pass
    1.457 +      "shra_r.w          $t0, $t0, 1                       \n"  // rounded >>1 -> middle output
    1.458 +      "srl               $t1, $t6, 16                      \n"  // upper halfword = first output
    1.459 +      "sb                $t1, 0(%[d])                      \n"
    1.460 +      "sb                $t0, 1(%[d])                      \n"
    1.461 +      "sb                $t6, 2(%[d])                      \n"
    1.462 +      "bgtz              %[dst_width], 1b                  \n"  // body always runs at least once
    1.463 +      " addiu            %[d], %[d], 3                     \n"  // delay slot: advance d
    1.464 +    "3:                                                    \n"  // label is not referenced in this block
    1.465 +      ".set pop                                            \n"
    1.466 +      : [src_ptr] "+r" (src_ptr),
    1.467 +        [src_stride] "+r" (src_stride),
    1.468 +        [d] "+r" (d),
    1.469 +        [dst_width] "+r" (dst_width)
    1.470 +      :
    1.471 +      : "t0", "t1", "t2", "t3",
    1.472 +        "t4", "t5", "t6"
    1.473 +  );
    1.474 +}
    1.475 +// 3/8-width point sampling: each pass maps 32 source bytes to 12 dst bytes; src_stride unused.
    1.476 +void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
    1.477 +                               uint8* dst, int dst_width) {
    1.478 +  __asm__ __volatile__ (
    1.479 +      ".set push                                     \n"
    1.480 +      ".set noreorder                                \n"
    1.481 +      // Kept source indices per pass: 0,3,6,8, 11,14,16,19, 22,24,27,30.
    1.482 +      ".p2align   2                                  \n"
    1.483 +    "1:                                              \n"
    1.484 +      "lw         $t0, 0(%[src_ptr])                 \n"  // |3|2|1|0|
    1.485 +      "lw         $t1, 4(%[src_ptr])                 \n"  // |7|6|5|4|
    1.486 +      "lw         $t2, 8(%[src_ptr])                 \n"  // |11|10|9|8|
    1.487 +      "lw         $t3, 12(%[src_ptr])                \n"  // |15|14|13|12|
    1.488 +      "lw         $t4, 16(%[src_ptr])                \n"  // |19|18|17|16|
    1.489 +      "lw         $t5, 20(%[src_ptr])                \n"  // |23|22|21|20|
    1.490 +      "lw         $t6, 24(%[src_ptr])                \n"  // |27|26|25|24|
    1.491 +      "lw         $t7, 28(%[src_ptr])                \n"  // |31|30|29|28|
    1.492 +      "wsbh       $t0, $t0                           \n"  // |2|3|0|1|
    1.493 +      "wsbh       $t6, $t6                           \n"  // |26|27|24|25|
    1.494 +      "srl        $t0, $t0, 8                        \n"  // |X|2|3|0|
    1.495 +      "srl        $t3, $t3, 16                       \n"  // |X|X|15|14|
    1.496 +      "srl        $t5, $t5, 16                       \n"  // |X|X|23|22|
    1.497 +      "srl        $t7, $t7, 16                       \n"  // |X|X|31|30|
    1.498 +      "ins        $t1, $t2, 24, 8                    \n"  // |8|6|5|4|
    1.499 +      "ins        $t6, $t5, 0, 8                     \n"  // |26|27|24|22|
    1.500 +      "ins        $t1, $t0, 0, 16                    \n"  // |8|6|3|0|
    1.501 +      "ins        $t6, $t7, 24, 8                    \n"  // |30|27|24|22|
    1.502 +      "prepend    $t2, $t3, 24                       \n"  // |X|15|14|11|
    1.503 +      "ins        $t4, $t4, 16, 8                    \n"  // |19|16|17|X|
    1.504 +      "ins        $t4, $t2, 0, 16                    \n"  // |19|16|14|11|
    1.505 +      "addiu      %[src_ptr], %[src_ptr], 32         \n"
    1.506 +      "addiu      %[dst_width], %[dst_width], -12    \n"  // 12 outputs per pass
    1.507 +      "addiu      $t8,%[dst_width], -12              \n"  // negative when fewer than 12 remain
    1.508 +      "sw         $t1, 0(%[dst])                     \n"
    1.509 +      "sw         $t4, 4(%[dst])                     \n"
    1.510 +      "sw         $t6, 8(%[dst])                     \n"
    1.511 +      "bgez       $t8, 1b                            \n"  // body always runs at least once
    1.512 +      " addiu     %[dst], %[dst], 12                 \n"  // delay slot: advance dst
    1.513 +      ".set pop                                      \n"
    1.514 +      : [src_ptr] "+r" (src_ptr),
    1.515 +        [dst] "+r" (dst),
    1.516 +        [dst_width] "+r" (dst_width)
    1.517 +      :
    1.518 +      : "t0", "t1", "t2", "t3", "t4",
    1.519 +        "t5", "t6", "t7", "t8"
    1.520 +  );
    1.521 +}
   1.522 +
   1.523 +void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
   1.524 +                                     uint8* dst_ptr, int dst_width) {
   1.525 +  intptr_t stride = src_stride;
   1.526 +  const uint8* t = src_ptr + stride;  // second source row
   1.527 +  const int c = 0x2AAA;  // about 65536 / 6, so (sum * c) >> 16 ~ sum / 6
   1.528 +  // 3/8 box filter over two rows: each pass reads 8 bytes per row, writes 3.
   1.529 +  __asm__ __volatile__ (
   1.530 +      ".set push                                         \n"
   1.531 +      ".set noreorder                                    \n"
   1.532 +      // mul leaves hi/lo undefined and lw/sb access memory; both are clobbered.
   1.533 +      ".p2align        2                                 \n"
   1.534 +    "1:                                                  \n"  // no width check before first pass
   1.535 +      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
   1.536 +      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
   1.537 +      "lw              $t2, 0(%[t])                      \n"  // |T3|T2|T1|T0|
   1.538 +      "lw              $t3, 4(%[t])                      \n"  // |T7|T6|T5|T4|
   1.539 +      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
   1.540 +      "packrl.ph       $t4, $t1, $t3                     \n"  // |S7|S6|T7|T6|
   1.541 +      "packrl.ph       $t5, $t3, $t1                     \n"  // |T5|T4|S5|S4|
   1.542 +      "raddu.w.qb      $t4, $t4                          \n"  // S7+S6+T7+T6
   1.543 +      "raddu.w.qb      $t5, $t5                          \n"  // T5+T4+S5+S4
   1.544 +      "precrq.qb.ph    $t6, $t0, $t2                     \n"  // |S3|S1|T3|T1|
   1.545 +      "precrq.qb.ph    $t6, $t6, $t6                     \n"  // |S3|T3|S3|T3|
   1.546 +      "srl             $t4, $t4, 2                       \n"  // (S7+S6+T7+T6) / 4
   1.547 +      "srl             $t6, $t6, 16                      \n"  // |0|0|S3|T3|
   1.548 +      "raddu.w.qb      $t6, $t6                          \n"  // 0+0+S3+T3
   1.549 +      "addu            $t6, $t5, $t6                     \n"  // S3+S4+S5+T3+T4+T5
   1.550 +      "mul             $t6, $t6, %[c]                    \n"  // t6 * 0x2AAA
   1.551 +      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
   1.552 +      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
   1.553 +      "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
   1.554 +      "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
   1.555 +      "addu            $t0, $t0, $t2                     \n"  // S0+S1+S2+T0+T1+T2
   1.556 +      "mul             $t0, $t0, %[c]                    \n"  // t0 * 0x2AAA
   1.557 +      "addiu           %[src_ptr], %[src_ptr], 8         \n"  // 8 bytes consumed, row S
   1.558 +      "addiu           %[t], %[t], 8                     \n"  // 8 bytes consumed, row T
   1.559 +      "addiu           %[dst_width], %[dst_width], -3    \n"  // 3 outputs per pass
   1.560 +      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"  // stores below use -3..-1
   1.561 +      "srl             $t6, $t6, 16                      \n"  // six-byte sum / 6
   1.562 +      "srl             $t0, $t0, 16                      \n"  // six-byte sum / 6
   1.563 +      "sb              $t4, -1(%[dst_ptr])               \n"  // third output
   1.564 +      "sb              $t6, -2(%[dst_ptr])               \n"  // second output
   1.565 +      "bgtz            %[dst_width], 1b                  \n"  // loop while width > 0
   1.566 +      " sb             $t0, -3(%[dst_ptr])               \n"  // first output (delay slot)
   1.567 +      ".set pop                                          \n"
   1.568 +      : [src_ptr] "+r" (src_ptr),
   1.569 +        [dst_ptr] "+r" (dst_ptr),
   1.570 +        [t] "+r" (t),
   1.571 +        [dst_width] "+r" (dst_width)
   1.572 +      : [c] "r" (c)
   1.573 +      : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "hi", "lo", "memory"
   1.574 +  );
   1.575 +}
   1.576 +
   1.577 +void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr,
   1.578 +                                     ptrdiff_t src_stride,
   1.579 +                                     uint8* dst_ptr, int dst_width) {
   1.580 +  intptr_t stride = src_stride;
   1.581 +  const uint8* s1 = src_ptr + stride;  // second source row
   1.582 +  stride += stride;  // now 2 * src_stride
   1.583 +  const uint8* s2 = src_ptr + stride;  // third source row
   1.584 +  const int c1 = 0x1C71;  // about 65536 / 9, so (sum * c1) >> 16 ~ sum / 9
   1.585 +  const int c2 = 0x2AAA;  // about 65536 / 6, so (sum * c2) >> 16 ~ sum / 6
   1.586 +  // 3/8 box filter over three rows: each pass reads 8 bytes per row, writes 3.
   1.587 +  __asm__ __volatile__ (
   1.588 +      ".set push                                         \n"
   1.589 +      ".set noreorder                                    \n"
   1.590 +      // mul leaves hi/lo undefined and lw/sb access memory; both are clobbered.
   1.591 +      ".p2align        2                                 \n"
   1.592 +    "1:                                                  \n"  // no width check before first pass
   1.593 +      "lw              $t0, 0(%[src_ptr])                \n"  // |S3|S2|S1|S0|
   1.594 +      "lw              $t1, 4(%[src_ptr])                \n"  // |S7|S6|S5|S4|
   1.595 +      "lw              $t2, 0(%[s1])                     \n"  // |T3|T2|T1|T0|
   1.596 +      "lw              $t3, 4(%[s1])                     \n"  // |T7|T6|T5|T4|
   1.597 +      "lw              $t4, 0(%[s2])                     \n"  // |R3|R2|R1|R0|
   1.598 +      "lw              $t5, 4(%[s2])                     \n"  // |R7|R6|R5|R4|
   1.599 +      "rotr            $t1, $t1, 16                      \n"  // |S5|S4|S7|S6|
   1.600 +      "packrl.ph       $t6, $t1, $t3                     \n"  // |S7|S6|T7|T6|
   1.601 +      "raddu.w.qb      $t6, $t6                          \n"  // S7+S6+T7+T6
   1.602 +      "packrl.ph       $t7, $t3, $t1                     \n"  // |T5|T4|S5|S4|
   1.603 +      "raddu.w.qb      $t7, $t7                          \n"  // T5+T4+S5+S4
   1.604 +      "sll             $t8, $t5, 16                      \n"  // |R5|R4|0|0|
   1.605 +      "raddu.w.qb      $t8, $t8                          \n"  // R5+R4
   1.606 +      "addu            $t7, $t7, $t8                     \n"  // S4+S5+T4+T5+R4+R5
   1.607 +      "srl             $t8, $t5, 16                      \n"  // |0|0|R7|R6|
   1.608 +      "raddu.w.qb      $t8, $t8                          \n"  // R7 + R6
   1.609 +      "addu            $t6, $t6, $t8                     \n"  // S6+S7+T6+T7+R6+R7
   1.610 +      "mul             $t6, $t6, %[c2]                   \n"  // t6 * 0x2AAA
   1.611 +      "precrq.qb.ph    $t8, $t0, $t2                     \n"  // |S3|S1|T3|T1|
   1.612 +      "precrq.qb.ph    $t8, $t8, $t4                     \n"  // |S3|T3|R3|R1|
   1.613 +      "srl             $t8, $t8, 8                       \n"  // |0|S3|T3|R3|
   1.614 +      "raddu.w.qb      $t8, $t8                          \n"  // S3 + T3 + R3
   1.615 +      "addu            $t7, $t7, $t8                     \n"  // bytes 3..5 of S, T and R
   1.616 +      "mul             $t7, $t7, %[c1]                   \n"  // t7 * 0x1C71
   1.617 +      "sll             $t0, $t0, 8                       \n"  // |S2|S1|S0|0|
   1.618 +      "sll             $t2, $t2, 8                       \n"  // |T2|T1|T0|0|
   1.619 +      "sll             $t4, $t4, 8                       \n"  // |R2|R1|R0|0|
   1.620 +      "raddu.w.qb      $t0, $t0                          \n"  // S2+S1+S0+0
   1.621 +      "raddu.w.qb      $t2, $t2                          \n"  // T2+T1+T0+0
   1.622 +      "raddu.w.qb      $t4, $t4                          \n"  // R2+R1+R0+0
   1.623 +      "addu            $t0, $t0, $t2                     \n"  // S0..S2 + T0..T2
   1.624 +      "addu            $t0, $t0, $t4                     \n"  // bytes 0..2 of S, T and R
   1.625 +      "mul             $t0, $t0, %[c1]                   \n"  // t0 * 0x1C71
   1.626 +      "addiu           %[src_ptr], %[src_ptr], 8         \n"  // 8 bytes consumed, row S
   1.627 +      "addiu           %[s1], %[s1], 8                   \n"  // 8 bytes consumed, row T
   1.628 +      "addiu           %[s2], %[s2], 8                   \n"  // 8 bytes consumed, row R
   1.629 +      "addiu           %[dst_width], %[dst_width], -3    \n"  // 3 outputs per pass
   1.630 +      "addiu           %[dst_ptr], %[dst_ptr], 3         \n"  // stores below use -3..-1
   1.631 +      "srl             $t6, $t6, 16                      \n"  // six-byte sum / 6
   1.632 +      "srl             $t7, $t7, 16                      \n"  // nine-byte sum / 9
   1.633 +      "srl             $t0, $t0, 16                      \n"  // nine-byte sum / 9
   1.634 +      "sb              $t6, -1(%[dst_ptr])               \n"  // third output
   1.635 +      "sb              $t7, -2(%[dst_ptr])               \n"  // second output
   1.636 +      "bgtz            %[dst_width], 1b                  \n"  // loop while width > 0
   1.637 +      " sb             $t0, -3(%[dst_ptr])               \n"  // first output (delay slot)
   1.638 +      ".set pop                                          \n"
   1.639 +      : [src_ptr] "+r" (src_ptr),
   1.640 +        [dst_ptr] "+r" (dst_ptr),
   1.641 +        [s1] "+r" (s1),
   1.642 +        [s2] "+r" (s2),
   1.643 +        [dst_width] "+r" (dst_width)
   1.644 +      : [c1] "r" (c1), [c2] "r" (c2)
   1.645 +      : "t0", "t1", "t2", "t3", "t4",
   1.646 +        "t5", "t6", "t7", "t8", "hi", "lo", "memory"
   1.647 +  );
   1.648 +}
   1.649 +
   1.650 +#endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
   1.651 +
   1.652 +#ifdef __cplusplus
   1.653 +}  // extern "C"
   1.654 +}  // namespace libyuv
   1.655 +#endif
   1.656 +

mercurial