gfx/cairo/libpixman/src/pixman-mips-dspr2-asm.S

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/cairo/libpixman/src/pixman-mips-dspr2-asm.S	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,3373 @@
     1.4 +/*
     1.5 + * Copyright (c) 2012
     1.6 + *      MIPS Technologies, Inc., California.
     1.7 + *
     1.8 + * Redistribution and use in source and binary forms, with or without
     1.9 + * modification, are permitted provided that the following conditions
    1.10 + * are met:
    1.11 + * 1. Redistributions of source code must retain the above copyright
    1.12 + *    notice, this list of conditions and the following disclaimer.
    1.13 + * 2. Redistributions in binary form must reproduce the above copyright
    1.14 + *    notice, this list of conditions and the following disclaimer in the
    1.15 + *    documentation and/or other materials provided with the distribution.
    1.16 + * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
    1.17 + *    contributors may be used to endorse or promote products derived from
    1.18 + *    this software without specific prior written permission.
    1.19 + *
    1.20 + * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
    1.21 + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    1.22 + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    1.23 + * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
    1.24 + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    1.25 + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
    1.26 + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    1.27 + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    1.28 + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    1.29 + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    1.30 + * SUCH DAMAGE.
    1.31 + *
    1.32 + * Author:  Nemanja Lukic (nlukic@mips.com)
    1.33 + */
    1.34 +
    1.35 +#include "pixman-private.h"
    1.36 +#include "pixman-mips-dspr2-asm.h"
    1.37 +
    1.38 +LEAF_MIPS_DSPR2(pixman_fill_buff16_mips)
    1.39 +/* Fill a buffer with a repeated 16-bit value.
    1.40 + * a0 - *dest (presumably 2-byte aligned; only bit 1 of the address is tested)
    1.41 + * a1 - count (bytes)
    1.42 + * a2 - value to fill buffer with (the low halfword is what gets stored)
    1.43 + */
    1.44 +
    1.45 +    beqz     a1, 3f          /* empty buffer: return immediately */
    1.46 +     andi    t1, a0, 0x0002  /* delay slot: t1 = bit 1 of the dest address */
    1.47 +    beqz     t1, 0f          /* check if address is 4-byte aligned */
    1.48 +     nop
    1.49 +    sh       a2, 0(a0)       /* one halfword store to reach 4-byte alignment */
    1.50 +    addiu    a0, a0, 2
    1.51 +    addiu    a1, a1, -2
    1.52 +0:
    1.53 +    srl      t1, a1, 5       /* t1 how many multiples of 32 bytes */
    1.54 +    replv.ph a2, a2          /* replicate fill value (16bit) in a2 */
    1.55 +    beqz     t1, 2f          /* fewer than 32 bytes left: halfword loop only */
    1.56 +     nop
    1.57 +1:
    1.58 +    addiu    t1, t1, -1
    1.59 +    beqz     t1, 11f         /* the final 32-byte chunk is stored at 11: without pref */
    1.60 +     addiu   a1, a1, -32     /* delay slot: runs for every chunk, including the last */
    1.61 +    pref     30, 32(a0)      /* hint 30 (PrepareForStore in the MIPS32 ISA) on the next chunk */
    1.62 +    sw       a2, 0(a0)
    1.63 +    sw       a2, 4(a0)
    1.64 +    sw       a2, 8(a0)
    1.65 +    sw       a2, 12(a0)
    1.66 +    sw       a2, 16(a0)
    1.67 +    sw       a2, 20(a0)
    1.68 +    sw       a2, 24(a0)
    1.69 +    sw       a2, 28(a0)
    1.70 +    b        1b
    1.71 +     addiu   a0, a0, 32
    1.72 +11:
    1.73 +    sw       a2, 0(a0)       /* last full chunk: same eight stores, no prefetch */
    1.74 +    sw       a2, 4(a0)
    1.75 +    sw       a2, 8(a0)
    1.76 +    sw       a2, 12(a0)
    1.77 +    sw       a2, 16(a0)
    1.78 +    sw       a2, 20(a0)
    1.79 +    sw       a2, 24(a0)
    1.80 +    sw       a2, 28(a0)
    1.81 +    addiu    a0, a0, 32
    1.82 +2:
    1.83 +    blez     a1, 3f          /* done once no bytes remain */
    1.84 +     addiu   a1, a1, -2      /* delay slot: executed even when the branch is taken */
    1.85 +    sh       a2, 0(a0)       /* remaining bytes are written one halfword at a time */
    1.86 +    b        2b
    1.87 +     addiu   a0, a0, 2
    1.88 +3:
    1.89 +    jr       ra
    1.90 +     nop
    1.91 +
    1.92 +END(pixman_fill_buff16_mips)
    1.93 +
    1.94 +LEAF_MIPS32R2(pixman_fill_buff32_mips)
    1.95 +/* Fill a buffer with a repeated 32-bit value.
    1.96 + * a0 - *dest (stored to with sw only; no alignment fix-up is done here)
    1.97 + * a1 - count (bytes)
    1.98 + * a2 - value to fill buffer with
    1.99 + */
   1.100 +
   1.101 +    beqz     a1, 3f    /* empty buffer: return immediately */
   1.102 +     nop
   1.103 +    srl      t1, a1, 5 /* t1 how many multiples of 32 bytes */
   1.104 +    beqz     t1, 2f    /* fewer than 32 bytes: word-at-a-time loop only */
   1.105 +     nop
   1.106 +1:
   1.107 +    addiu    t1, t1, -1
   1.108 +    beqz     t1, 11f         /* the final 32-byte chunk is stored at 11: without pref */
   1.109 +     addiu   a1, a1, -32     /* delay slot: runs for every chunk, including the last */
   1.110 +    pref     30, 32(a0)      /* hint 30 (PrepareForStore in the MIPS32 ISA) on the next chunk */
   1.111 +    sw       a2, 0(a0)
   1.112 +    sw       a2, 4(a0)
   1.113 +    sw       a2, 8(a0)
   1.114 +    sw       a2, 12(a0)
   1.115 +    sw       a2, 16(a0)
   1.116 +    sw       a2, 20(a0)
   1.117 +    sw       a2, 24(a0)
   1.118 +    sw       a2, 28(a0)
   1.119 +    b        1b
   1.120 +     addiu   a0, a0, 32
   1.121 +11:
   1.122 +    sw       a2, 0(a0)       /* last full chunk: same eight stores, no prefetch */
   1.123 +    sw       a2, 4(a0)
   1.124 +    sw       a2, 8(a0)
   1.125 +    sw       a2, 12(a0)
   1.126 +    sw       a2, 16(a0)
   1.127 +    sw       a2, 20(a0)
   1.128 +    sw       a2, 24(a0)
   1.129 +    sw       a2, 28(a0)
   1.130 +    addiu    a0, a0, 32
   1.131 +2:
   1.132 +    blez     a1, 3f          /* done once no bytes remain */
   1.133 +     addiu   a1, a1, -4      /* delay slot: executed even when the branch is taken */
   1.134 +    sw       a2, 0(a0)       /* remaining bytes are written one word at a time */
   1.135 +    b        2b
   1.136 +     addiu   a0, a0, 4
   1.137 +3:
   1.138 +    jr       ra
   1.139 +     nop
   1.140 +
   1.141 +END(pixman_fill_buff32_mips)
   1.142 +
   1.143 +LEAF_MIPS_DSPR2(pixman_composite_src_8888_0565_asm_mips)
   1.144 +/* Convert w pixels from 32-bit src to 16-bit dst, two per loop pass.
   1.145 + * a0 - dst (r5g6b5)
   1.146 + * a1 - src (a8r8g8b8)
   1.147 + * a2 - w
   1.148 + */
   1.149 +
   1.150 +    beqz     a2, 3f          /* w == 0: nothing to do */
   1.151 +     nop
   1.152 +    addiu    t1, a2, -1
   1.153 +    beqz     t1, 2f          /* w == 1: only the single-pixel tail runs */
   1.154 +     nop
   1.155 +    li       t4, 0xf800f800  /* bits 15:11 of each halfword */
   1.156 +    li       t5, 0x07e007e0  /* bits 10:5 of each halfword */
   1.157 +    li       t6, 0x001f001f  /* bits 4:0 of each halfword */
   1.158 +1:
   1.159 +    lw       t0, 0(a1)       /* t0, t1 = two source pixels */
   1.160 +    lw       t1, 4(a1)
   1.161 +    addiu    a1, a1, 8
   1.162 +    addiu    a2, a2, -2
   1.163 +    /* macro defined outside this file; t2, t3 are stored as its results below */
   1.164 +    CONVERT_2x8888_TO_2x0565 t0, t1, t2, t3, t4, t5, t6, t7, t8
   1.165 +
   1.166 +    sh       t2, 0(a0)
   1.167 +    sh       t3, 2(a0)
   1.168 +
   1.169 +    addiu    t2, a2, -1
   1.170 +    bgtz     t2, 1b          /* loop while at least two pixels remain */
   1.171 +     addiu   a0, a0, 4       /* delay slot: advance dst by two 16-bit pixels */
   1.172 +2:
   1.173 +    beqz     a2, 3f          /* even w: no leftover pixel */
   1.174 +     nop
   1.175 +    lw       t0, 0(a1)
   1.176 +    /* single-pixel variant; takes no mask constants, t1 is stored as its result */
   1.177 +    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3
   1.178 +
   1.179 +    sh       t1, 0(a0)
   1.180 +3:
   1.181 +    j        ra              /* return to caller */
   1.182 +     nop
   1.183 +
   1.184 +END(pixman_composite_src_8888_0565_asm_mips)
   1.185 +
   1.186 +LEAF_MIPS_DSPR2(pixman_composite_src_0565_8888_asm_mips)
   1.187 +/* Convert w pixels from 16-bit src to 32-bit dst, two per loop pass.
   1.188 + * a0 - dst (a8r8g8b8)
   1.189 + * a1 - src (r5g6b5)
   1.190 + * a2 - w
   1.191 + */
   1.192 +
   1.193 +    beqz     a2, 3f          /* w == 0: nothing to do */
   1.194 +     nop
   1.195 +    addiu    t1, a2, -1
   1.196 +    beqz     t1, 2f          /* w == 1: only the single-pixel tail runs */
   1.197 +     nop
   1.198 +    li       t4, 0x07e007e0  /* bits 10:5 of each halfword */
   1.199 +    li       t5, 0x001F001F  /* bits 4:0 of each halfword */
   1.200 +1:
   1.201 +    lhu      t0, 0(a1)       /* t0, t1 = two source pixels, zero-extended */
   1.202 +    lhu      t1, 2(a1)
   1.203 +    addiu    a1, a1, 4
   1.204 +    addiu    a2, a2, -2
   1.205 +    /* macro defined outside this file; t2, t3 are stored as its results below */
   1.206 +    CONVERT_2x0565_TO_2x8888 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9
   1.207 +
   1.208 +    sw       t2, 0(a0)
   1.209 +    sw       t3, 4(a0)
   1.210 +
   1.211 +    addiu    t2, a2, -1
   1.212 +    bgtz     t2, 1b          /* loop while at least two pixels remain */
   1.213 +     addiu   a0, a0, 8       /* delay slot: advance dst by two 32-bit pixels */
   1.214 +2:
   1.215 +    beqz     a2, 3f          /* even w: no leftover pixel */
   1.216 +     nop
   1.217 +    lhu      t0, 0(a1)
   1.218 +    /* single-pixel variant; takes no mask constants, t1 is stored as its result */
   1.219 +    CONVERT_1x0565_TO_1x8888 t0, t1, t2, t3
   1.220 +
   1.221 +    sw       t1, 0(a0)
   1.222 +3:
   1.223 +    j        ra              /* return to caller */
   1.224 +     nop
   1.225 +
   1.226 +END(pixman_composite_src_0565_8888_asm_mips)
   1.227 +
   1.228 +LEAF_MIPS_DSPR2(pixman_composite_src_x888_8888_asm_mips)
   1.229 +/* Copy w pixels from src to dst with the top byte of every pixel forced to 0xff.
   1.230 + * a0 - dst (a8r8g8b8)
   1.231 + * a1 - src (x8r8g8b8)
   1.232 + * a2 - w
   1.233 + */
   1.234 +
   1.235 +    beqz     a2, 4f       /* w == 0: nothing to do */
   1.236 +     nop
   1.237 +    li       t9, 0xff000000 /* OR-ed into every pixel to set the top byte */
   1.238 +    srl      t8, a2, 3    /* t8 = how many multiples of 8 src pixels */
   1.239 +    beqz     t8, 3f       /* branch if less than 8 src pixels */
   1.240 +     nop
   1.241 +1:
   1.242 +    addiu    t8, t8, -1
   1.243 +    beqz     t8, 2f       /* the final group of 8 is handled at 2: without pref */
   1.244 +     addiu   a2, a2, -8   /* delay slot: runs for every group, including the last */
   1.245 +    pref     0, 32(a1)    /* hint 0 (load) on the next 32 source bytes */
   1.246 +    lw       t0, 0(a1)
   1.247 +    lw       t1, 4(a1)
   1.248 +    lw       t2, 8(a1)
   1.249 +    lw       t3, 12(a1)
   1.250 +    lw       t4, 16(a1)
   1.251 +    lw       t5, 20(a1)
   1.252 +    lw       t6, 24(a1)
   1.253 +    lw       t7, 28(a1)
   1.254 +    addiu    a1, a1, 32
   1.255 +    or       t0, t0, t9
   1.256 +    or       t1, t1, t9
   1.257 +    or       t2, t2, t9
   1.258 +    or       t3, t3, t9
   1.259 +    or       t4, t4, t9
   1.260 +    or       t5, t5, t9
   1.261 +    or       t6, t6, t9
   1.262 +    or       t7, t7, t9
   1.263 +    pref     30, 32(a0)   /* hint 30 (PrepareForStore) on the next 32 dest bytes */
   1.264 +    sw       t0, 0(a0)
   1.265 +    sw       t1, 4(a0)
   1.266 +    sw       t2, 8(a0)
   1.267 +    sw       t3, 12(a0)
   1.268 +    sw       t4, 16(a0)
   1.269 +    sw       t5, 20(a0)
   1.270 +    sw       t6, 24(a0)
   1.271 +    sw       t7, 28(a0)
   1.272 +    b        1b
   1.273 +     addiu   a0, a0, 32
   1.274 +2:
   1.275 +    lw       t0, 0(a1)    /* last group of 8: same work as above, no prefetch */
   1.276 +    lw       t1, 4(a1)
   1.277 +    lw       t2, 8(a1)
   1.278 +    lw       t3, 12(a1)
   1.279 +    lw       t4, 16(a1)
   1.280 +    lw       t5, 20(a1)
   1.281 +    lw       t6, 24(a1)
   1.282 +    lw       t7, 28(a1)
   1.283 +    addiu    a1, a1, 32
   1.284 +    or       t0, t0, t9
   1.285 +    or       t1, t1, t9
   1.286 +    or       t2, t2, t9
   1.287 +    or       t3, t3, t9
   1.288 +    or       t4, t4, t9
   1.289 +    or       t5, t5, t9
   1.290 +    or       t6, t6, t9
   1.291 +    or       t7, t7, t9
   1.292 +    sw       t0, 0(a0)
   1.293 +    sw       t1, 4(a0)
   1.294 +    sw       t2, 8(a0)
   1.295 +    sw       t3, 12(a0)
   1.296 +    sw       t4, 16(a0)
   1.297 +    sw       t5, 20(a0)
   1.298 +    sw       t6, 24(a0)
   1.299 +    sw       t7, 28(a0)
   1.300 +    beqz     a2, 4f       /* w was a multiple of 8: no leftover pixels */
   1.301 +     addiu   a0, a0, 32
   1.302 +3:
   1.303 +    lw       t0, 0(a1)    /* leftover pixels (a2 > 0 here), one per pass */
   1.304 +    addiu    a1, a1, 4
   1.305 +    addiu    a2, a2, -1
   1.306 +    or       t1, t0, t9
   1.307 +    sw       t1, 0(a0)
   1.308 +    bnez     a2, 3b
   1.309 +     addiu   a0, a0, 4
   1.310 +4:
   1.311 +    jr       ra
   1.312 +     nop
   1.313 +
   1.314 +END(pixman_composite_src_x888_8888_asm_mips)
   1.315 +
   1.316 +LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8888_asm_mips)
   1.317 +/* Write (constant src * 8-bit mask) to each of w destination pixels.
   1.318 + * a0 - dst  (a8r8g8b8)
   1.319 + * a1 - src  (32bit constant)
   1.320 + * a2 - mask (a8)
   1.321 + * a3 - w
   1.322 + */
   1.323 +
   1.324 +    /* stack macros come from outside this file; v0 holds the 0x00ff00ff constant */
   1.325 +    SAVE_REGS_ON_STACK 0, v0
   1.326 +    li       v0, 0x00ff00ff
   1.327 +
   1.328 +    beqz     a3, 3f    /* w == 0: nothing to do */
   1.329 +     nop
   1.330 +    addiu    t1, a3, -1
   1.331 +    beqz     t1, 2f    /* w == 1: only the single-pixel tail runs */
   1.332 +     nop
   1.333 +
   1.334 +1:
   1.335 +                       /* a1 = source      (32bit constant) */
   1.336 +    lbu      t0, 0(a2) /* t0 = mask        (a8) */
   1.337 +    lbu      t1, 1(a2) /* t1 = mask        (a8) */
   1.338 +    addiu    a2, a2, 2
   1.339 +    /* t2, t3 are stored as the macro's results below */
   1.340 +    MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, t2, t3, v0, t4, t5, t6, t7, t8, t9
   1.341 +
   1.342 +    sw       t2, 0(a0)
   1.343 +    sw       t3, 4(a0)
   1.344 +    addiu    a3, a3, -2
   1.345 +    addiu    t2, a3, -1
   1.346 +    bgtz     t2, 1b    /* loop while at least two pixels remain */
   1.347 +     addiu   a0, a0, 8
   1.348 +
   1.349 +    beqz     a3, 3f    /* even w: no leftover pixel */
   1.350 +     nop
   1.351 +
   1.352 +2:
   1.353 +    lbu      t0, 0(a2) /* t0 = mask of the last pixel */
   1.354 +    addiu    a2, a2, 1
   1.355 +    /* t1 is stored as the macro's result below */
   1.356 +    MIPS_UN8x4_MUL_UN8 a1, t0, t1, v0, t3, t4, t5
   1.357 +
   1.358 +    sw       t1, 0(a0)
   1.359 +    addiu    a3, a3, -1
   1.360 +    addiu    a0, a0, 4
   1.361 +
   1.362 +3:
   1.363 +    RESTORE_REGS_FROM_STACK 0, v0
   1.364 +    j        ra        /* return to caller */
   1.365 +     nop
   1.366 +
   1.367 +END(pixman_composite_src_n_8_8888_asm_mips)
   1.368 +
   1.369 +LEAF_MIPS_DSPR2(pixman_composite_src_n_8_8_asm_mips)
   1.370 +/* Write (top byte of src * mask), divided by 255 with rounding, to each dst byte.
   1.371 + * a0 - dst  (a8)
   1.372 + * a1 - src  (32bit constant)
   1.373 + * a2 - mask (a8)
   1.374 + * a3 - w
   1.375 + */
   1.376 +
   1.377 +    li                t9, 0x00ff00ff
   1.378 +    beqz              a3, 3f      /* w == 0: nothing to do */
   1.379 +     nop
   1.380 +    srl               t7, a3, 2   /* t7 = how many multiples of 4 dst pixels */
   1.381 +    beqz              t7, 1f      /* branch if less than 4 src pixels */
   1.382 +     nop
   1.383 +
   1.384 +    srl               t8, a1, 24  /* t8 = top byte of src */
   1.385 +    replv.ph          t8, t8      /* ... replicated into both halfwords */
   1.386 +
   1.387 +0:
   1.388 +    beqz              t7, 1f      /* no more groups of four */
   1.389 +     addiu            t7, t7, -1  /* delay slot: executed even when the branch is taken */
   1.390 +    lbu               t0, 0(a2)   /* t0..t3 = four mask bytes m0..m3 */
   1.391 +    lbu               t1, 1(a2)
   1.392 +    lbu               t2, 2(a2)
   1.393 +    lbu               t3, 3(a2)
   1.394 +
   1.395 +    addiu             a2, a2, 4
   1.396 +
   1.397 +    precr_sra.ph.w    t1, t0, 0   /* t1 = m1 : m0 as halfwords */
   1.398 +    precr_sra.ph.w    t3, t2, 0   /* t3 = m3 : m2 as halfwords */
   1.399 +    precr.qb.ph       t0, t3, t1  /* t0 = m3 m2 m1 m0 packed as bytes */
   1.400 +
   1.401 +    muleu_s.ph.qbl    t2, t0, t8  /* t2 = 16-bit products of m3, m2 with t8 */
   1.402 +    muleu_s.ph.qbr    t3, t0, t8  /* t3 = 16-bit products of m1, m0 with t8 */
   1.403 +    shra_r.ph         t4, t2, 8   /* next 8 ops: x -> (x + ((x + 128) >> 8) + 128) >> 8 */
   1.404 +    shra_r.ph         t5, t3, 8
   1.405 +    and               t4, t4, t9  /* keep only the low byte of each halfword */
   1.406 +    and               t5, t5, t9
   1.407 +    addq.ph           t2, t2, t4
   1.408 +    addq.ph           t3, t3, t5
   1.409 +    shra_r.ph         t2, t2, 8
   1.410 +    shra_r.ph         t3, t3, 8
   1.411 +    precr.qb.ph       t2, t2, t3  /* repack the four 8-bit results into t2 */
   1.412 +
   1.413 +    sb                t2, 0(a0)   /* store the results a byte at a time, lowest first */
   1.414 +    srl               t2, t2, 8
   1.415 +    sb                t2, 1(a0)
   1.416 +    srl               t2, t2, 8
   1.417 +    sb                t2, 2(a0)
   1.418 +    srl               t2, t2, 8
   1.419 +    sb                t2, 3(a0)
   1.420 +    addiu             a3, a3, -4
   1.421 +    b                 0b
   1.422 +     addiu            a0, a0, 4
   1.423 +
   1.424 +1:
   1.425 +    beqz              a3, 3f      /* w was a multiple of 4: no leftover pixels */
   1.426 +     nop
   1.427 +    srl               t8, a1, 24  /* t8 = top byte of src (unreplicated) */
   1.428 +2:
   1.429 +    lbu               t0, 0(a2)   /* leftover pixels, one per pass */
   1.430 +    addiu             a2, a2, 1
   1.431 +
   1.432 +    mul               t2, t0, t8  /* t2 = mask * top byte of src */
   1.433 +    shra_r.ph         t3, t2, 8   /* same rounding sequence as in the 4-pixel loop */
   1.434 +    andi              t3, t3, 0x00ff
   1.435 +    addq.ph           t2, t2, t3
   1.436 +    shra_r.ph         t2, t2, 8
   1.437 +
   1.438 +    sb                t2, 0(a0)
   1.439 +    addiu             a3, a3, -1
   1.440 +    bnez              a3, 2b
   1.441 +     addiu            a0, a0, 1
   1.442 +
   1.443 +3:
   1.444 +    j                 ra          /* return to caller */
   1.445 +     nop
   1.446 +
   1.447 +END(pixman_composite_src_n_8_8_asm_mips)
   1.448 +
   1.449 +LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_8888_ca_asm_mips)
   1.450 +/* Component-alpha OVER of a constant source onto dst through a per-channel mask.
   1.451 + * a0 - dst  (a8r8g8b8)
   1.452 + * a1 - src  (32bit constant)
   1.453 + * a2 - mask (a8r8g8b8)
   1.454 + * a3 - w
   1.455 + * Per pixel: dst = src * mask + dst * ~(mask * srca); mask == 0 leaves dst alone.
   1.456 + */
   1.457 +    SAVE_REGS_ON_STACK 8, s0, s1, s2, s3, s4, s5
   1.458 +    beqz         a3, 4f
   1.459 +     nop
   1.460 +    li           t6, 0xff
   1.461 +    addiu        t7, zero, -1 /* t7 = 0xffffffff */
   1.462 +    srl          t8, a1, 24   /* t8 = srca */
   1.463 +    li           t9, 0x00ff00ff
   1.464 +    addiu        t1, a3, -1
   1.465 +    beqz         t1, 3f       /* last pixel */
   1.466 +     nop
   1.467 +    beq          t8, t6, 2f   /* if (srca == 0xff) */
   1.468 +     nop
   1.469 +1:
   1.470 +                              /* a1 = src, srca != 0xff: two pixels per pass */
   1.471 +    lw           t0, 0(a2)    /* t0 = mask */
   1.472 +    lw           t1, 4(a2)    /* t1 = mask */
   1.473 +    or           t2, t0, t1
   1.474 +    beqz         t2, 12f      /* if (t0 == 0) && (t1 == 0) */
   1.475 +     addiu       a2, a2, 8
   1.476 +    /*
   1.477 +     * No "mask == 0xffffffff" shortcut in this loop: srca != 0xff here, so
   1.478 +     * dst must still be scaled by ~(mask * srca).  Branching past the two
   1.479 +     * multiplies left ~mask == 0 and overwrote dst with plain src.
   1.480 +     */
   1.481 +    lw           t2, 0(a0)    /* t2 = dst */
   1.482 +    lw           t3, 4(a0)    /* t3 = dst */
   1.483 +    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
   1.484 +    MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, s0, s1, s2, s3, s4, s5
   1.485 +    not          t0, t0       /* t0, t1 = ~(mask * srca) */
   1.486 +    not          t1, t1
   1.487 +    MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
   1.488 +    addu_s.qb    t2, t4, t2   /* saturating per-byte add: src*mask + scaled dst */
   1.489 +    addu_s.qb    t3, t5, t3
   1.490 +    sw           t2, 0(a0)
   1.491 +    sw           t3, 4(a0)
   1.492 +12:
   1.493 +    addiu        a3, a3, -2
   1.494 +    addiu        t1, a3, -1
   1.495 +    bgtz         t1, 1b       /* loop while at least two pixels remain */
   1.496 +     addiu       a0, a0, 8
   1.497 +    b            3f
   1.498 +     nop
   1.499 +2:
   1.500 +                              /* a1 = src, srca == 0xff: two pixels per pass */
   1.501 +    lw           t0, 0(a2)    /* t0 = mask */
   1.502 +    lw           t1, 4(a2)    /* t1 = mask */
   1.503 +    or           t2, t0, t1
   1.504 +    beqz         t2, 22f      /* if (t0 == 0) & (t1 == 0) */
   1.505 +     addiu       a2, a2, 8
   1.506 +    and          t2, t0, t1
   1.507 +    move         t4, a1
   1.508 +    beq          t2, t7, 21f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
   1.509 +     move        t5, a1       /* opaque src + full mask: store src unchanged */
   1.510 +    lw           t2, 0(a0)    /* t2 = dst */
   1.511 +    lw           t3, 4(a0)    /* t3 = dst */
   1.512 +    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, t4, t5, t9, s0, s1, s2, s3, s4, s5
   1.513 +    not          t0, t0       /* srca == 0xff, so mask * srca is the mask itself */
   1.514 +    not          t1, t1
   1.515 +    MIPS_2xUN8x4_MUL_2xUN8x4 t2, t3, t0, t1, t2, t3, t9, s0, s1, s2, s3, s4, s5
   1.516 +    addu_s.qb    t4, t4, t2
   1.517 +    addu_s.qb    t5, t5, t3
   1.518 +21:
   1.519 +    sw           t4, 0(a0)
   1.520 +    sw           t5, 4(a0)
   1.521 +22:
   1.522 +    addiu        a3, a3, -2
   1.523 +    addiu        t1, a3, -1
   1.524 +    bgtz         t1, 2b       /* loop while at least two pixels remain */
   1.525 +     addiu       a0, a0, 8
   1.526 +3:
   1.527 +    blez         a3, 4f       /* even w: no leftover pixel */
   1.528 +     nop
   1.529 +                              /* a1 = src */
   1.530 +    lw           t1, 0(a2)    /* t1 = mask */
   1.531 +    beqz         t1, 4f       /* zero mask leaves dst untouched */
   1.532 +     nop
   1.533 +    lw           t0, 0(a0)    /* t0 = dst */
   1.534 +    /* Always take the full path: the old mask == 0xffffffff shortcut  */
   1.535 +    /* ignored srca and stored plain src even when srca != 0xff.       */
   1.536 +
   1.537 +    MIPS_UN8x4_MUL_UN8x4  a1, t1, t2, t9, t3, t4, t5, t6
   1.538 +    MIPS_UN8x4_MUL_UN8    t1, t8, t1, t9, t3, t4, t5
   1.539 +    /* t2 = src * mask, t1 = mask * srca */
   1.540 +    not          t1, t1
   1.541 +    MIPS_UN8x4_MUL_UN8x4  t0, t1, t0, t9, t3, t4, t5, t6
   1.542 +    addu_s.qb    t0, t2, t0
   1.543 +    sw           t0, 0(a0)
   1.544 +4:
   1.545 +    RESTORE_REGS_FROM_STACK 8, s0, s1, s2, s3, s4, s5
   1.546 +    j            ra           /* return to caller */
   1.547 +     nop
   1.548 +
   1.549 +END(pixman_composite_over_n_8888_8888_ca_asm_mips)
   1.550 +
   1.551 +LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips)
   1.552 +/* Component-alpha OVER of a constant source onto an r5g6b5 destination.
   1.553 + * a0 - dst  (r5g6b5)
   1.554 + * a1 - src  (32bit constant)
   1.555 + * a2 - mask (a8r8g8b8)
   1.556 + * a3 - w
   1.557 + * Per pixel: dst = src * mask + dst * ~(mask * srca), via the CONVERT_* macros.
   1.558 + */
   1.559 +    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7, s8
   1.560 +    beqz         a3, 4f
   1.561 +     nop
   1.562 +    li           t5, 0xf800f800
   1.563 +    li           t6, 0x07e007e0
   1.564 +    li           t7, 0x001F001F
   1.565 +    li           t9, 0x00ff00ff
   1.566 +
   1.567 +    srl          t8, a1, 24   /* t8 = srca */
   1.568 +    addiu        t1, a3, -1
   1.569 +    beqz         t1, 3f       /* last pixel */
   1.570 +     nop
   1.571 +    li           s0, 0xff     /* s0 = 0xff */
   1.572 +    addiu        s1, zero, -1 /* s1 = 0xffffffff */
   1.573 +
   1.574 +    beq          t8, s0, 2f   /* if (srca == 0xff) */
   1.575 +     nop
   1.576 +1:
   1.577 +                              /* a1 = src, srca != 0xff: two pixels per pass */
   1.578 +    lw           t0, 0(a2)    /* t0 = mask */
   1.579 +    lw           t1, 4(a2)    /* t1 = mask */
   1.580 +    or           t2, t0, t1
   1.581 +    beqz         t2, 12f      /* if (t0 == 0) && (t1 == 0) */
   1.582 +     addiu       a2, a2, 8
   1.583 +    /*
   1.584 +     * No "mask == 0xffffffff" shortcut in this loop: srca != 0xff here, so
   1.585 +     * dst must still be scaled by ~(mask * srca).  Branching past the two
   1.586 +     * multiplies left ~mask == 0 and overwrote dst with plain src.
   1.587 +     */
   1.588 +    lhu          t2, 0(a0)    /* t2 = dst */
   1.589 +    lhu          t3, 2(a0)    /* t3 = dst */
   1.590 +    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, s2, s3, t9, t4, s4, s5, s6, s7, s8
   1.591 +    MIPS_2xUN8x4_MUL_2xUN8   t0, t1, t8, t8, t0, t1, t9, t4, s4, s5, s6, s7, s8
   1.592 +    not          t0, t0       /* t0, t1 = ~(mask * srca) */
   1.593 +    not          t1, t1
   1.594 +    CONVERT_2x0565_TO_2x8888 t2, t3, s4, s5, t6, t7, t4, s6, s7, s8
   1.595 +    MIPS_2xUN8x4_MUL_2xUN8x4 s4, s5, t0, t1, s4, s5, t9, t4, s6, s7, s8, t0, t1
   1.596 +    addu_s.qb    s2, s2, s4   /* saturating per-byte add: src*mask + scaled dst */
   1.597 +    addu_s.qb    s3, s3, s5
   1.598 +    CONVERT_2x8888_TO_2x0565 s2, s3, t2, t3, t5, t6, t7, s4, s5
   1.599 +    sh           t2, 0(a0)
   1.600 +    sh           t3, 2(a0)
   1.601 +12:
   1.602 +    addiu        a3, a3, -2
   1.603 +    addiu        t1, a3, -1
   1.604 +    bgtz         t1, 1b       /* loop while at least two pixels remain */
   1.605 +     addiu       a0, a0, 4
   1.606 +    b            3f
   1.607 +     nop
   1.608 +2:
   1.609 +                              /* a1 = src, srca == 0xff: two pixels per pass */
   1.610 +    lw           t0, 0(a2)    /* t0 = mask */
   1.611 +    lw           t1, 4(a2)    /* t1 = mask */
   1.612 +    or           t2, t0, t1
   1.613 +    beqz         t2, 22f      /* if (t0 == 0) & (t1 == 0) */
   1.614 +     addiu       a2, a2, 8
   1.615 +    and          t3, t0, t1
   1.616 +    move         t2, a1
   1.617 +    beq          t3, s1, 21f  /* if (t0 == 0xffffffff) && (t1 == 0xffffffff) */
   1.618 +     move        t3, a1       /* opaque src + full mask: just convert and store src */
   1.619 +    lhu          t2, 0(a0)    /* t2 = dst */
   1.620 +    lhu          t3, 2(a0)    /* t3 = dst */
   1.621 +    MIPS_2xUN8x4_MUL_2xUN8x4 a1, a1, t0, t1, s2, s3, t9, t4, s4, s5, s6, s7, s8
   1.622 +    not          t0, t0       /* srca == 0xff, so mask * srca is the mask itself */
   1.623 +    not          t1, t1
   1.624 +    CONVERT_2x0565_TO_2x8888 t2, t3, s4, s5, t6, t7, t4, s6, s7, s8
   1.625 +    MIPS_2xUN8x4_MUL_2xUN8x4 s4, s5, t0, t1, s4, s5, t9, t4, s6, s7, s8, t2, t3
   1.626 +    addu_s.qb    t2, s2, s4
   1.627 +    addu_s.qb    t3, s3, s5
   1.628 +21:
   1.629 +    CONVERT_2x8888_TO_2x0565 t2, t3, t0, t1, t5, t6, t7, s2, s3
   1.630 +    sh           t0, 0(a0)
   1.631 +    sh           t1, 2(a0)
   1.632 +22:
   1.633 +    addiu        a3, a3, -2
   1.634 +    addiu        t1, a3, -1
   1.635 +    bgtz         t1, 2b       /* loop while at least two pixels remain */
   1.636 +     addiu       a0, a0, 4
   1.637 +3:
   1.638 +    blez         a3, 4f       /* even w: no leftover pixel */
   1.639 +     nop
   1.640 +                              /* a1 = src */
   1.641 +    lw           t1, 0(a2)    /* t1 = mask */
   1.642 +    beqz         t1, 4f       /* zero mask leaves dst untouched */
   1.643 +     nop
   1.644 +    lhu          t0, 0(a0)    /* t0 = dst */
   1.645 +    /* Always take the full path: the old shortcut compared the mask with */
   1.646 +    /* t7 (0x001F001F here, not 0xffffffff) and it also ignored srca.     */
   1.647 +
   1.648 +    MIPS_UN8x4_MUL_UN8x4     a1, t1, t2, t9, t3, t4, t5, t6
   1.649 +    MIPS_UN8x4_MUL_UN8       t1, t8, t1, t9, t3, t4, t5
   1.650 +    /* t2 = src * mask, t1 = mask * srca; t5..t7 are plain scratch from here on */
   1.651 +    not          t1, t1
   1.652 +    CONVERT_1x0565_TO_1x8888 t0, s1, s2, s3
   1.653 +    MIPS_UN8x4_MUL_UN8x4     s1, t1, t3, t9, t4, t5, t6, t7
   1.654 +    addu_s.qb    t0, t2, t3
   1.655 +    CONVERT_1x8888_TO_1x0565 t0, s1, s2, s3
   1.656 +    sh           s1, 0(a0)
   1.657 +4:
   1.658 +    RESTORE_REGS_FROM_STACK  20, s0, s1, s2, s3, s4, s5, s6, s7, s8
   1.659 +    j            ra           /* return to caller */
   1.660 +     nop
   1.661 +
   1.662 +END(pixman_composite_over_n_8888_0565_ca_asm_mips)
   1.663 +
   1.664 +LEAF_MIPS_DSPR2(pixman_composite_over_n_8_8888_asm_mips)
   1.665 +/* OVER of a constant source onto dst through an 8-bit mask, two pixels per pass.
   1.666 + * a0 - dst  (a8r8g8b8)
   1.667 + * a1 - src  (32bit constant)
   1.668 + * a2 - mask (a8)
   1.669 + * a3 - w
   1.670 + */
   1.671 +
   1.672 +    SAVE_REGS_ON_STACK 4, s0, s1, s2, s3, s4
   1.673 +    beqz      a3, 4f
   1.674 +     nop
   1.675 +    li        t4, 0x00ff00ff
   1.676 +    li        t5, 0xff
   1.677 +    addiu     t0, a3, -1
   1.678 +    beqz      t0, 3f         /* last pixel */
   1.679 +     srl      t6, a1, 24     /* t6 = srca */
   1.680 +    not       s4, a1
   1.681 +    beq       t5, t6, 2f     /* if (srca == 0xff) */
   1.682 +     srl      s4, s4, 24     /* delay slot: s4 = top byte of ~src (255 - srca) */
   1.683 +1:
   1.684 +                             /* a1 = src, srca != 0xff */
   1.685 +    lbu       t0, 0(a2)      /* t0 = mask */
   1.686 +    lbu       t1, 1(a2)      /* t1 = mask */
   1.687 +    or        t2, t0, t1
   1.688 +    beqz      t2, 111f       /* if (t0 == 0) && (t1 == 0) */
   1.689 +     addiu    a2, a2, 2
   1.690 +    and       t3, t0, t1
   1.691 +
   1.692 +    lw        t2, 0(a0)      /* t2 = dst */
   1.693 +    beq       t3, t5, 11f    /* if (t0 == 0xff) && (t1 == 0xff) */
   1.694 +     lw       t3, 4(a0)      /* t3 = dst */
   1.695 +    /* s0, s1 = src scaled by each mask byte */
   1.696 +    MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, s0, s1, t4, t6, t7, t8, t9, s2, s3
   1.697 +    not       s2, s0
   1.698 +    not       s3, s1
   1.699 +    srl       s2, s2, 24     /* s2, s3 = top byte of ~(scaled src) */
   1.700 +    srl       s3, s3, 24
   1.701 +    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, s2, s3, t2, t3, t4, t0, t1, t6, t7, t8, t9
   1.702 +    addu_s.qb s2, t2, s0     /* saturating per-byte add: scaled dst + scaled src */
   1.703 +    addu_s.qb s3, t3, s1
   1.704 +    sw        s2, 0(a0)
   1.705 +    b         111f
   1.706 +     sw       s3, 4(a0)
   1.707 +11:
   1.708 +    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, s4, s4, t2, t3, t4, t0, t1, t6, t7, t8, t9
   1.709 +    addu_s.qb s2, t2, a1     /* full mask: dst scaled by s4, plus unscaled src */
   1.710 +    addu_s.qb s3, t3, a1
   1.711 +    sw        s2, 0(a0)
   1.712 +    sw        s3, 4(a0)
   1.713 +
   1.714 +111:
   1.715 +    addiu     a3, a3, -2
   1.716 +    addiu     t0, a3, -1
   1.717 +    bgtz      t0, 1b         /* loop while at least two pixels remain */
   1.718 +     addiu    a0, a0, 8
   1.719 +    b         3f
   1.720 +     nop
   1.721 +2:
   1.722 +                             /* a1 = src, srca == 0xff */
   1.723 +    lbu       t0, 0(a2)      /* t0 = mask */
   1.724 +    lbu       t1, 1(a2)      /* t1 = mask */
   1.725 +    or        t2, t0, t1
   1.726 +    beqz      t2, 222f       /* if (t0 == 0) && (t1 == 0) */
   1.727 +     addiu    a2, a2, 2
   1.728 +    and       t3, t0, t1
   1.729 +    beq       t3, t5, 22f    /* if (t0 == 0xff) && (t1 == 0xff) */
   1.730 +     nop
   1.731 +    lw        t2, 0(a0)      /* t2 = dst */
   1.732 +    lw        t3, 4(a0)      /* t3 = dst */
   1.733 +    /* macro defined outside this file; t6, t7 are stored as its results below */
   1.734 +    OVER_2x8888_2x8_2x8888 a1, a1, t0, t1, t2, t3, \
   1.735 +                           t6, t7, t4, t8, t9, s0, s1, s2, s3
   1.736 +    sw        t6, 0(a0)
   1.737 +    b         222f
   1.738 +     sw        t7, 4(a0)
   1.739 +22:
   1.740 +    sw        a1, 0(a0)      /* opaque src + full mask: store src unchanged */
   1.741 +    sw        a1, 4(a0)
   1.742 +222:
   1.743 +    addiu     a3, a3, -2
   1.744 +    addiu     t0, a3, -1
   1.745 +    bgtz      t0, 2b         /* loop while at least two pixels remain */
   1.746 +     addiu    a0, a0, 8
   1.747 +3:
   1.748 +    blez      a3, 4f         /* even w: no leftover pixel */
   1.749 +     nop
   1.750 +                             /* a1 = src */
   1.751 +    lbu       t0, 0(a2)      /* t0 = mask */
   1.752 +    beqz      t0, 4f         /* if (t0 == 0) */
   1.753 +     addiu    a2, a2, 1
   1.754 +    move      t3, a1         /* t3 = src; stays unscaled when mask == 0xff */
   1.755 +    beq       t0, t5, 31f    /* if (t0 == 0xff) */
   1.756 +     lw       t1, 0(a0)      /* t1 = dst */
   1.757 +
   1.758 +    MIPS_UN8x4_MUL_UN8 a1, t0, t3, t4, t6, t7, t8
   1.759 +31:
   1.760 +    not       t2, t3
   1.761 +    srl       t2, t2, 24     /* t2 = top byte of ~t3 */
   1.762 +    MIPS_UN8x4_MUL_UN8 t1, t2, t1, t4, t6, t7, t8
   1.763 +    addu_s.qb t2, t1, t3
   1.764 +    sw        t2, 0(a0)
   1.765 +4:
   1.766 +    RESTORE_REGS_FROM_STACK 4, s0, s1, s2, s3, s4
   1.767 +    j         ra             /* return to caller */
   1.768 +     nop
   1.769 +
   1.770 +END(pixman_composite_over_n_8_8888_asm_mips)
   1.771 +
   1.772 +LEAF_MIPS_DSPR2(pixman_composite_over_n_8_0565_asm_mips)
   1.773 +/* OVER of a constant source onto an r5g6b5 dst through an 8-bit mask.
   1.774 + * a0 - dst  (r5g6b5)
   1.775 + * a1 - src  (32bit constant)
   1.776 + * a2 - mask (a8)
   1.777 + * a3 - w
   1.778 + */
   1.779 +    SAVE_REGS_ON_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8
   1.780 +    beqz     a3, 4f
   1.781 +     nop
   1.782 +    li       t4, 0x00ff00ff
   1.783 +    li       t5, 0xff
   1.784 +    li       t6, 0xf800f800
   1.785 +    li       t7, 0x07e007e0
   1.786 +    li       t8, 0x001F001F
   1.787 +    addiu    t1, a3, -1
   1.788 +    beqz     t1, 3f         /* last pixel */
   1.789 +     srl     t0, a1, 24     /* t0 = srca */
   1.790 +    not      v0, a1
   1.791 +    beq      t0, t5, 2f     /* if (srca == 0xff) */
   1.792 +     srl     v0, v0, 24     /* delay slot: v0 = top byte of ~src (255 - srca) */
   1.793 +1:
   1.794 +                            /* a1 = src, srca != 0xff */
   1.795 +    lbu      t0, 0(a2)      /* t0 = mask */
   1.796 +    lbu      t1, 1(a2)      /* t1 = mask */
   1.797 +    or       t2, t0, t1
   1.798 +    beqz     t2, 111f       /* if (t0 == 0) && (t1 == 0) */
   1.799 +     addiu   a2, a2, 2
   1.800 +    lhu      t2, 0(a0)      /* t2 = dst */
   1.801 +    lhu      t3, 2(a0)      /* t3 = dst */
   1.802 +    CONVERT_2x0565_TO_2x8888 t2, t3, s0, s1, t7, t8, t9, s2, s3, s4
   1.803 +    and      t9, t0, t1
   1.804 +    beq      t9, t5, 11f    /* if (t0 == 0xff) && (t1 == 0xff) */
   1.805 +     nop
   1.806 +
   1.807 +    MIPS_2xUN8x4_MUL_2xUN8   a1, a1, t0, t1, s2, s3, t4, t9, s4, s5, s6, s7, s8
   1.808 +    not      s4, s2
   1.809 +    not      s5, s3
   1.810 +    srl      s4, s4, 24     /* s4, s5 = top byte of ~(scaled src) */
   1.811 +    srl      s5, s5, 24
   1.812 +    MIPS_2xUN8x4_MUL_2xUN8   s0, s1, s4, s5, s0, s1, t4, t9, t0, t1, s6, s7, s8
   1.813 +    addu_s.qb                s4, s2, s0
   1.814 +    addu_s.qb                s5, s3, s1
   1.815 +    CONVERT_2x8888_TO_2x0565 s4, s5, t2, t3, t6, t7, t8, s0, s1
   1.816 +    sh       t2, 0(a0)
   1.817 +    b        111f
   1.818 +     sh      t3, 2(a0)
   1.819 +11:
   1.820 +    MIPS_2xUN8x4_MUL_2xUN8   s0, s1, v0, v0, s0, s1, t4, t9, t0, t1, s6, s7, s8
   1.821 +    addu_s.qb                s4, a1, s0
   1.822 +    addu_s.qb                s5, a1, s1
   1.823 +    CONVERT_2x8888_TO_2x0565 s4, s5, t2, t3, t6, t7, t8, s0, s1
   1.824 +    sh       t2, 0(a0)
   1.825 +    sh       t3, 2(a0)
   1.826 +111:
   1.827 +    addiu    a3, a3, -2
   1.828 +    addiu    t0, a3, -1
   1.829 +    bgtz     t0, 1b         /* loop while at least two pixels remain */
   1.830 +     addiu   a0, a0, 4
   1.831 +    b        3f
   1.832 +     nop
   1.833 +2:
   1.834 +    CONVERT_1x8888_TO_1x0565 a1, s0, s1, s2
   1.835 +21:
   1.836 +                            /* a1 = src, s0 = src converted once before the loop */
   1.837 +    lbu      t0, 0(a2)      /* t0 = mask */
   1.838 +    lbu      t1, 1(a2)      /* t1 = mask */
   1.839 +    or       t2, t0, t1
   1.840 +    beqz     t2, 222f       /* if (t0 == 0) && (t1 == 0) */
   1.841 +     addiu   a2, a2, 2
   1.842 +    and      t9, t0, t1
   1.843 +    move     s2, s0
   1.844 +    beq      t9, t5, 22f    /* if (t0 == 0xff) && (t1 == 0xff) */
   1.845 +     move    s3, s0         /* delay slot: s2 = s3 = converted src for the shortcut */
   1.846 +    lhu      t2, 0(a0)      /* t2 = dst */
   1.847 +    lhu      t3, 2(a0)      /* t3 = dst */
   1.848 +
   1.849 +    CONVERT_2x0565_TO_2x8888 t2, t3, s2, s3, t7, t8, s4, s5, s6, s7
   1.850 +    OVER_2x8888_2x8_2x8888   a1, a1, t0, t1, s2, s3, \
   1.851 +                             t2, t3, t4, t9, s4, s5, s6, s7, s8
   1.852 +    CONVERT_2x8888_TO_2x0565 t2, t3, s2, s3, t6, t7, t8, s4, s5
   1.853 +22:
   1.854 +    sh       s2, 0(a0)
   1.855 +    sh       s3, 2(a0)
   1.856 +222:
   1.857 +    addiu    a3, a3, -2
   1.858 +    addiu    t0, a3, -1
   1.859 +    bgtz     t0, 21b        /* loop while at least two pixels remain */
   1.860 +     addiu   a0, a0, 4
   1.861 +3:
   1.862 +    blez      a3, 4f        /* even w: no leftover pixel */
   1.863 +     nop
   1.864 +                            /* a1 = src */
   1.865 +    lbu      t0, 0(a2)      /* t0 = mask */
   1.866 +    beqz     t0, 4f         /* if (t0 == 0) */
   1.867 +     nop
   1.868 +    lhu      t1, 0(a0)      /* t1 = dst */
   1.869 +    CONVERT_1x0565_TO_1x8888 t1, t2, t3, t7
   1.870 +    beq      t0, t5, 31f    /* if (t0 == 0xff) */
   1.871 +     move    t3, a1         /* delay slot: t3 = src; stays unscaled when mask == 0xff */
   1.872 +
   1.873 +    MIPS_UN8x4_MUL_UN8       a1, t0, t3, t4, t7, t8, t9
   1.874 +31:
   1.875 +    not      t6, t3
   1.876 +    srl      t6, t6, 24     /* t6 = top byte of ~t3 */
   1.877 +    MIPS_UN8x4_MUL_UN8       t2, t6, t2, t4, t7, t8, t9
   1.878 +    addu_s.qb                t1, t2, t3
   1.879 +    CONVERT_1x8888_TO_1x0565 t1, t2, t3, t7
   1.880 +    sh       t2, 0(a0)
   1.881 +4:
   1.882 +    RESTORE_REGS_FROM_STACK  24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8
   1.883 +    j        ra             /* return to caller */
   1.884 +     nop
   1.885 +
   1.886 +END(pixman_composite_over_n_8_0565_asm_mips)
   1.887 +
   1.888 +LEAF_MIPS_DSPR2(pixman_composite_over_8888_n_8888_asm_mips)
   1.889 +/* One scanline, src combined with a constant mask onto dst by the OVER_* macros (defined outside this section).
   1.890 + * a0 - dst  (a8r8g8b8), loaded and stored in place
   1.891 + * a1 - src  (a8r8g8b8)
   1.892 + * a2 - mask (32bit constant), reduced below to its bits 31..24
   1.893 + * a3 - w    (remaining pixel count)
   1.894 + * Label 1 does two pixels per pass, label 2 the last odd pixel, label 3 is the exit. */
   1.895 +
   1.896 +    SAVE_REGS_ON_STACK 0, s0
   1.897 +    li       t4, 0x00ff00ff /* constant passed to the OVER_* macros */
   1.898 +    beqz     a3, 3f    /* w == 0, nothing to do */
   1.899 +     nop
   1.900 +    addiu    t1, a3, -1
   1.901 +    srl      a2, a2, 24 /* a2 = mask >> 24 */
   1.902 +    beqz     t1, 2f    /* w == 1, skip the two-pixel loop */
   1.903 +     nop
   1.904 +
   1.905 +1:
   1.906 +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
   1.907 +    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
   1.908 +                       /* a2 = mask >> 24  (same value for both pixels) */
   1.909 +    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
   1.910 +    lw       t3, 4(a0) /* t3 = destination (a8r8g8b8) */
   1.911 +    addiu    a1, a1, 8 /* advance src by two pixels */
   1.912 +
   1.913 +    OVER_2x8888_2x8_2x8888 t0, t1, a2, a2, t2, t3, \
   1.914 +                           t5, t6, t4, t7, t8, t9, t0, t1, s0
   1.915 +
   1.916 +    sw       t5, 0(a0) /* t5, t6 = the two pixels produced by the macro above */
   1.917 +    sw       t6, 4(a0)
   1.918 +    addiu    a3, a3, -2
   1.919 +    addiu    t1, a3, -1
   1.920 +    bgtz     t1, 1b    /* loop while at least two pixels remain */
   1.921 +     addiu   a0, a0, 8 /* advance dst by two pixels */
   1.922 +2:
   1.923 +    beqz     a3, 3f    /* no odd pixel left */
   1.924 +     nop
   1.925 +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
   1.926 +                       /* a2 = mask >> 24 */
   1.927 +    lw       t1, 0(a0) /* t1 = destination (a8r8g8b8) */
   1.928 +
   1.929 +    OVER_8888_8_8888 t0, a2, t1, t3, t4, t5, t6, t7, t8
   1.930 +
   1.931 +    sw       t3, 0(a0) /* t3 = the pixel produced by the macro above */
   1.932 +3:
   1.933 +    RESTORE_REGS_FROM_STACK 0, s0
   1.934 +    j        ra
   1.935 +     nop
   1.936 +
   1.937 +END(pixman_composite_over_8888_n_8888_asm_mips)
   1.938 +
   1.939 +LEAF_MIPS_DSPR2(pixman_composite_over_8888_n_0565_asm_mips)
   1.940 +/* One scanline, a8r8g8b8 src with a constant mask onto an r5g6b5 dst (widen dst, OVER_* macro, narrow result).
   1.941 + * a0 - dst  (r5g6b5), loaded and stored in place
   1.942 + * a1 - src  (a8r8g8b8)
   1.943 + * a2 - mask (32bit constant), reduced below to its bits 31..24
   1.944 + * a3 - w    (remaining pixel count)
   1.945 + * Label 1 does two pixels per pass, label 2 the last odd pixel, label 3 is the exit. */
   1.946 +
   1.947 +    SAVE_REGS_ON_STACK 0, s0, s1, s2, s3
   1.948 +    li       t6, 0x00ff00ff /* constant passed to the OVER_* macros */
   1.949 +    li       t7, 0xf800f800 /* t7, t8, t9 are passed to the CONVERT_2x* macros, presumably r5g6b5 field masks */
   1.950 +    li       t8, 0x07e007e0
   1.951 +    li       t9, 0x001F001F
   1.952 +    beqz     a3, 3f    /* w == 0, nothing to do */
   1.953 +     nop
   1.954 +    srl      a2, a2, 24 /* a2 = mask >> 24 */
   1.955 +    addiu    t1, a3, -1
   1.956 +    beqz     t1, 2f    /* w == 1, skip the two-pixel loop */
   1.957 +     nop
   1.958 +1:
   1.959 +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
   1.960 +    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
   1.961 +                       /* a2 = mask >> 24  (same value for both pixels) */
   1.962 +    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */
   1.963 +    lhu      t3, 2(a0) /* t3 = destination (r5g6b5) */
   1.964 +    addiu    a1, a1, 8 /* advance src by two pixels */
   1.965 +
   1.966 +    CONVERT_2x0565_TO_2x8888 t2, t3, t4, t5, t8, t9, s0, s1, t2, t3
   1.967 +    OVER_2x8888_2x8_2x8888   t0, t1, a2, a2, t4, t5, \
   1.968 +                             t2, t3, t6, t0, t1, s0, s1, s2, s3
   1.969 +    CONVERT_2x8888_TO_2x0565 t2, t3, t4, t5, t7, t8, t9, s0, s1
   1.970 +
   1.971 +    sh       t4, 0(a0) /* t4, t5 = the two r5g6b5 results */
   1.972 +    sh       t5, 2(a0)
   1.973 +    addiu    a3, a3, -2
   1.974 +    addiu    t1, a3, -1
   1.975 +    bgtz     t1, 1b    /* loop while at least two pixels remain */
   1.976 +     addiu   a0, a0, 4 /* advance dst by two pixels */
   1.977 +2:
   1.978 +    beqz     a3, 3f    /* no odd pixel left */
   1.979 +     nop
   1.980 +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
   1.981 +                       /* a2 = mask >> 24 */
   1.982 +    lhu      t1, 0(a0) /* t1 = destination (r5g6b5) */
   1.983 +
   1.984 +    CONVERT_1x0565_TO_1x8888 t1, t2, t4, t5
   1.985 +    OVER_8888_8_8888         t0, a2, t2, t1, t6, t3, t4, t5, t7
   1.986 +    CONVERT_1x8888_TO_1x0565 t1, t3, t4, t5
   1.987 +
   1.988 +    sh       t3, 0(a0) /* t3 = the r5g6b5 result */
   1.989 +3:
   1.990 +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3
   1.991 +    j                 ra
   1.992 +     nop
   1.993 +
   1.994 +END(pixman_composite_over_8888_n_0565_asm_mips)
   1.995 +
   1.996 +LEAF_MIPS_DSPR2(pixman_composite_over_0565_n_0565_asm_mips)
   1.997 +/* One scanline, r5g6b5 src with a constant mask onto an r5g6b5 dst (widen both, OVER_* macro, narrow result).
   1.998 + * a0 - dst  (r5g6b5), loaded and stored in place
   1.999 + * a1 - src  (r5g6b5)
  1.1000 + * a2 - mask (32bit constant), reduced below to its bits 31..24
  1.1001 + * a3 - w    (remaining pixel count)
  1.1002 + * Label 1 does two pixels per pass, label 2 the last odd pixel, label 3 is the exit. */
  1.1003 +
  1.1004 +    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5
  1.1005 +    li       t6, 0x00ff00ff /* constant passed to the OVER_* macros */
  1.1006 +    li       t7, 0xf800f800 /* t7, t8, t9 are passed to the CONVERT_2x* macros, presumably r5g6b5 field masks */
  1.1007 +    li       t8, 0x07e007e0
  1.1008 +    li       t9, 0x001F001F
  1.1009 +    beqz     a3, 3f    /* w == 0, nothing to do */
  1.1010 +     nop
  1.1011 +    srl      a2, a2, 24 /* a2 = mask >> 24 */
  1.1012 +    addiu    t1, a3, -1
  1.1013 +    beqz     t1, 2f    /* w == 1, skip the two-pixel loop */
  1.1014 +     nop
  1.1015 +1:
  1.1016 +    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
  1.1017 +    lhu      t1, 2(a1) /* t1 = source      (r5g6b5) */
  1.1018 +                       /* a2 = mask >> 24  (same value for both pixels) */
  1.1019 +    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */
  1.1020 +    lhu      t3, 2(a0) /* t3 = destination (r5g6b5) */
  1.1021 +    addiu    a1, a1, 4 /* advance src by two pixels */
  1.1022 +
  1.1023 +    CONVERT_2x0565_TO_2x8888 t0, t1, t4, t5, t8, t9, s0, s1, s2, s3
  1.1024 +    CONVERT_2x0565_TO_2x8888 t2, t3, s0, s1, t8, t9, s2, s3, s4, s5
  1.1025 +    OVER_2x8888_2x8_2x8888   t4, t5, a2, a2, s0, s1, \
  1.1026 +                             t0, t1, t6, s2, s3, s4, s5, t4, t5
  1.1027 +    CONVERT_2x8888_TO_2x0565 t0, t1, s0, s1, t7, t8, t9, s2, s3
  1.1028 +
  1.1029 +    sh       s0, 0(a0) /* s0, s1 = the two r5g6b5 results */
  1.1030 +    sh       s1, 2(a0)
  1.1031 +    addiu    a3, a3, -2
  1.1032 +    addiu    t1, a3, -1
  1.1033 +    bgtz     t1, 1b    /* loop while at least two pixels remain */
  1.1034 +     addiu   a0, a0, 4 /* advance dst by two pixels */
  1.1035 +2:
  1.1036 +    beqz     a3, 3f    /* no odd pixel left */
  1.1037 +     nop
  1.1038 +    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
  1.1039 +                       /* a2 = mask >> 24 */
  1.1040 +    lhu      t1, 0(a0) /* t1 = destination (r5g6b5) */
  1.1041 +
  1.1042 +    CONVERT_1x0565_TO_1x8888 t0, t2, t4, t5
  1.1043 +    CONVERT_1x0565_TO_1x8888 t1, t3, t4, t5
  1.1044 +    OVER_8888_8_8888         t2, a2, t3, t0, t6, t1, t4, t5, t7
  1.1045 +    CONVERT_1x8888_TO_1x0565 t0, t3, t4, t5
  1.1046 +
  1.1047 +    sh       t3, 0(a0) /* t3 = the r5g6b5 result */
  1.1048 +3:
  1.1049 +    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5
  1.1050 +    j        ra
  1.1051 +     nop
  1.1052 +
  1.1053 +END(pixman_composite_over_0565_n_0565_asm_mips)
  1.1054 +
  1.1055 +LEAF_MIPS_DSPR2(pixman_composite_over_8888_8_8888_asm_mips)
  1.1056 +/* One scanline, a8r8g8b8 src with a per-pixel a8 mask onto dst by the OVER_* macros (defined outside this section).
  1.1057 + * a0 - dst  (a8r8g8b8), loaded and stored in place
  1.1058 + * a1 - src  (a8r8g8b8)
  1.1059 + * a2 - mask (a8), one byte per pixel
  1.1060 + * a3 - w    (remaining pixel count)
  1.1061 + * Label 1 does two pixels per pass, label 2 the last odd pixel, label 3 is the exit. */
  1.1062 +
  1.1063 +    SAVE_REGS_ON_STACK 0, s0, s1
  1.1064 +    li       t4, 0x00ff00ff /* constant passed to the OVER_* macros */
  1.1065 +    beqz     a3, 3f    /* w == 0, nothing to do */
  1.1066 +     nop
  1.1067 +    addiu    t1, a3, -1
  1.1068 +    beqz     t1, 2f    /* w == 1, skip the two-pixel loop */
  1.1069 +     nop
  1.1070 +1:
  1.1071 +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
  1.1072 +    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
  1.1073 +    lbu      t2, 0(a2) /* t2 = mask        (a8) */
  1.1074 +    lbu      t3, 1(a2) /* t3 = mask        (a8) */
  1.1075 +    lw       t5, 0(a0) /* t5 = destination (a8r8g8b8) */
  1.1076 +    lw       t6, 4(a0) /* t6 = destination (a8r8g8b8) */
  1.1077 +    addiu    a1, a1, 8 /* advance src by two pixels */
  1.1078 +    addiu    a2, a2, 2 /* advance mask by two pixels */
  1.1079 +
  1.1080 +    OVER_2x8888_2x8_2x8888 t0, t1, t2, t3, t5, t6, \
  1.1081 +                           t7, t8, t4, t9, s0, s1, t0, t1, t2
  1.1082 +
  1.1083 +    sw       t7, 0(a0) /* t7, t8 = the two pixels produced by the macro above */
  1.1084 +    sw       t8, 4(a0)
  1.1085 +    addiu    a3, a3, -2
  1.1086 +    addiu    t1, a3, -1
  1.1087 +    bgtz     t1, 1b    /* loop while at least two pixels remain */
  1.1088 +     addiu   a0, a0, 8 /* advance dst by two pixels */
  1.1089 +2:
  1.1090 +    beqz     a3, 3f    /* no odd pixel left */
  1.1091 +     nop
  1.1092 +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
  1.1093 +    lbu      t1, 0(a2) /* t1 = mask        (a8) */
  1.1094 +    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
  1.1095 +
  1.1096 +    OVER_8888_8_8888 t0, t1, t2, t3, t4, t5, t6, t7, t8
  1.1097 +
  1.1098 +    sw       t3, 0(a0) /* t3 = the pixel produced by the macro above */
  1.1099 +3:
  1.1100 +    RESTORE_REGS_FROM_STACK 0, s0, s1
  1.1101 +    j        ra
  1.1102 +     nop
  1.1103 +
  1.1104 +END(pixman_composite_over_8888_8_8888_asm_mips)
  1.1105 +
  1.1106 +LEAF_MIPS_DSPR2(pixman_composite_over_8888_8_0565_asm_mips)
  1.1107 +/* One scanline, a8r8g8b8 src with a per-pixel a8 mask onto an r5g6b5 dst (widen dst, OVER_* macro, narrow result).
  1.1108 + * a0 - dst  (r5g6b5), loaded and stored in place
  1.1109 + * a1 - src  (a8r8g8b8)
  1.1110 + * a2 - mask (a8), one byte per pixel
  1.1111 + * a3 - w    (remaining pixel count)
  1.1112 + * Label 1 does two pixels per pass, label 2 the last odd pixel, label 3 is the exit. */
  1.1113 +
  1.1114 +    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5
  1.1115 +    li       t6, 0x00ff00ff /* constant passed to the OVER_* macros */
  1.1116 +    li       t7, 0xf800f800 /* t7, t8, t9 are passed to the CONVERT_2x* macros, presumably r5g6b5 field masks */
  1.1117 +    li       t8, 0x07e007e0
  1.1118 +    li       t9, 0x001F001F
  1.1119 +    beqz     a3, 3f    /* w == 0, nothing to do */
  1.1120 +     nop
  1.1121 +    addiu    t1, a3, -1
  1.1122 +    beqz     t1, 2f    /* w == 1, skip the two-pixel loop */
  1.1123 +     nop
  1.1124 +1:
  1.1125 +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
  1.1126 +    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
  1.1127 +    lbu      t2, 0(a2) /* t2 = mask        (a8) */
  1.1128 +    lbu      t3, 1(a2) /* t3 = mask        (a8) */
  1.1129 +    lhu      t4, 0(a0) /* t4 = destination (r5g6b5) */
  1.1130 +    lhu      t5, 2(a0) /* t5 = destination (r5g6b5) */
  1.1131 +    addiu    a1, a1, 8 /* advance src by two pixels */
  1.1132 +    addiu    a2, a2, 2 /* advance mask by two pixels */
  1.1133 +
  1.1134 +    CONVERT_2x0565_TO_2x8888 t4, t5, s0, s1, t8, t9, s2, s3, s4, s5
  1.1135 +    OVER_2x8888_2x8_2x8888   t0, t1, t2, t3, s0, s1, \
  1.1136 +                             t4, t5, t6, s2, s3, s4, s5, t0, t1
  1.1137 +    CONVERT_2x8888_TO_2x0565 t4, t5, s0, s1, t7, t8, t9, s2, s3
  1.1138 +
  1.1139 +    sh       s0, 0(a0) /* s0, s1 = the two r5g6b5 results */
  1.1140 +    sh       s1, 2(a0)
  1.1141 +    addiu    a3, a3, -2
  1.1142 +    addiu    t1, a3, -1
  1.1143 +    bgtz     t1, 1b    /* loop while at least two pixels remain */
  1.1144 +     addiu   a0, a0, 4 /* advance dst by two pixels */
  1.1145 +2:
  1.1146 +    beqz     a3, 3f    /* no odd pixel left */
  1.1147 +     nop
  1.1148 +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
  1.1149 +    lbu      t1, 0(a2) /* t1 = mask        (a8) */
  1.1150 +    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */
  1.1151 +
  1.1152 +    CONVERT_1x0565_TO_1x8888 t2, t3, t4, t5
  1.1153 +    OVER_8888_8_8888         t0, t1, t3, t2, t6, t4, t5, t7, t8
  1.1154 +    CONVERT_1x8888_TO_1x0565 t2, t3, t4, t5
  1.1155 +
  1.1156 +    sh       t3, 0(a0) /* t3 = the r5g6b5 result */
  1.1157 +3:
  1.1158 +    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5
  1.1159 +    j        ra
  1.1160 +     nop
  1.1161 +
  1.1162 +END(pixman_composite_over_8888_8_0565_asm_mips)
  1.1163 +
  1.1164 +LEAF_MIPS_DSPR2(pixman_composite_over_0565_8_0565_asm_mips)
  1.1165 +/* One scanline, r5g6b5 src with a per-pixel a8 mask onto an r5g6b5 dst (widen both, OVER_* macro, narrow result).
  1.1166 + * a0 - dst  (r5g6b5), loaded and stored in place
  1.1167 + * a1 - src  (r5g6b5)
  1.1168 + * a2 - mask (a8), one byte per pixel
  1.1169 + * a3 - w    (remaining pixel count)
  1.1170 + * Label 1 does two pixels per pass, label 2 the last odd pixel, label 3 is the exit. */
  1.1171 +
  1.1172 +    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5
  1.1173 +    li       t4, 0xf800f800 /* t4, t5, t6 are passed to the CONVERT_2x* macros, presumably r5g6b5 field masks */
  1.1174 +    li       t5, 0x07e007e0
  1.1175 +    li       t6, 0x001F001F
  1.1176 +    li       t7, 0x00ff00ff /* constant passed to the OVER_* macros */
  1.1177 +    beqz     a3, 3f    /* w == 0, nothing to do */
  1.1178 +     nop
  1.1179 +    addiu    t1, a3, -1
  1.1180 +    beqz     t1, 2f    /* w == 1, skip the two-pixel loop */
  1.1181 +     nop
  1.1182 +1:
  1.1183 +    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
  1.1184 +    lhu      t1, 2(a1) /* t1 = source      (r5g6b5) */
  1.1185 +    lbu      t2, 0(a2) /* t2 = mask        (a8) */
  1.1186 +    lbu      t3, 1(a2) /* t3 = mask        (a8) */
  1.1187 +    lhu      t8, 0(a0) /* t8 = destination (r5g6b5) */
  1.1188 +    lhu      t9, 2(a0) /* t9 = destination (r5g6b5) */
  1.1189 +    addiu    a1, a1, 4 /* advance src by two pixels */
  1.1190 +    addiu    a2, a2, 2 /* advance mask by two pixels */
  1.1191 +
  1.1192 +    CONVERT_2x0565_TO_2x8888 t0, t1, s0, s1, t5, t6, s2, s3, s4, s5
  1.1193 +    CONVERT_2x0565_TO_2x8888 t8, t9, s2, s3, t5, t6, s4, s5, t0, t1
  1.1194 +    OVER_2x8888_2x8_2x8888   s0, s1, t2, t3, s2, s3, \
  1.1195 +                             t0, t1, t7, s4, s5, t8, t9, s0, s1
  1.1196 +    CONVERT_2x8888_TO_2x0565 t0, t1, s0, s1, t4, t5, t6, s2, s3
  1.1197 +
  1.1198 +    sh       s0, 0(a0) /* s0, s1 = the two r5g6b5 results */
  1.1199 +    sh       s1, 2(a0)
  1.1200 +    addiu    a3, a3, -2
  1.1201 +    addiu    t1, a3, -1
  1.1202 +    bgtz     t1, 1b    /* loop while at least two pixels remain */
  1.1203 +     addiu   a0, a0, 4 /* advance dst by two pixels */
  1.1204 +2:
  1.1205 +    beqz     a3, 3f    /* no odd pixel left */
  1.1206 +     nop
  1.1207 +    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
  1.1208 +    lbu      t1, 0(a2) /* t1 = mask        (a8) */
  1.1209 +    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */
  1.1210 +    /* From here on t4, t5, t6 are handed to the macros in other argument slots, no two-pixel pass follows. */
  1.1211 +    CONVERT_1x0565_TO_1x8888 t0, t3, t4, t5
  1.1212 +    CONVERT_1x0565_TO_1x8888 t2, t4, t5, t6
  1.1213 +    OVER_8888_8_8888         t3, t1, t4, t0, t7, t2, t5, t6, t8
  1.1214 +    CONVERT_1x8888_TO_1x0565 t0, t3, t4, t5
  1.1215 +
  1.1216 +    sh       t3, 0(a0) /* t3 = the r5g6b5 result */
  1.1217 +3:
  1.1218 +    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5
  1.1219 +    j        ra
  1.1220 +     nop
  1.1221 +
  1.1222 +END(pixman_composite_over_0565_8_0565_asm_mips)
  1.1223 +
  1.1224 +LEAF_MIPS_DSPR2(pixman_composite_over_8888_8888_8888_asm_mips)
  1.1225 +/* One scanline, a8r8g8b8 src with a per-pixel a8r8g8b8 mask onto dst, only bits 31..24 of each mask word are used.
  1.1226 + * a0 - dst  (a8r8g8b8), loaded and stored in place
  1.1227 + * a1 - src  (a8r8g8b8)
  1.1228 + * a2 - mask (a8r8g8b8), one word per pixel
  1.1229 + * a3 - w    (remaining pixel count)
  1.1230 + * Label 1 does two pixels per pass, label 2 the last odd pixel, label 3 is the exit. */
  1.1231 +
  1.1232 +    SAVE_REGS_ON_STACK 0, s0, s1, s2
  1.1233 +    li       t4, 0x00ff00ff /* constant passed to the OVER_* macros */
  1.1234 +    beqz     a3, 3f    /* w == 0, nothing to do */
  1.1235 +     nop
  1.1236 +    addiu    t1, a3, -1
  1.1237 +    beqz     t1, 2f    /* w == 1, skip the two-pixel loop */
  1.1238 +     nop
  1.1239 +1:
  1.1240 +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
  1.1241 +    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
  1.1242 +    lw       t2, 0(a2) /* t2 = mask        (a8r8g8b8) */
  1.1243 +    lw       t3, 4(a2) /* t3 = mask        (a8r8g8b8) */
  1.1244 +    lw       t5, 0(a0) /* t5 = destination (a8r8g8b8) */
  1.1245 +    lw       t6, 4(a0) /* t6 = destination (a8r8g8b8) */
  1.1246 +    addiu    a1, a1, 8 /* advance src by two pixels */
  1.1247 +    addiu    a2, a2, 8 /* advance mask by two pixels */
  1.1248 +    srl      t2, t2, 24 /* t2 = mask word >> 24 */
  1.1249 +    srl      t3, t3, 24 /* t3 = mask word >> 24 */
  1.1250 +
  1.1251 +    OVER_2x8888_2x8_2x8888 t0, t1, t2, t3, t5, t6, t7, t8, t4, t9, s0, s1, s2, t0, t1
  1.1252 +
  1.1253 +    sw       t7, 0(a0) /* t7, t8 = the two pixels produced by the macro above */
  1.1254 +    sw       t8, 4(a0)
  1.1255 +    addiu    a3, a3, -2
  1.1256 +    addiu    t1, a3, -1
  1.1257 +    bgtz     t1, 1b    /* loop while at least two pixels remain */
  1.1258 +     addiu   a0, a0, 8 /* advance dst by two pixels */
  1.1259 +2:
  1.1260 +    beqz     a3, 3f    /* no odd pixel left */
  1.1261 +     nop
  1.1262 +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
  1.1263 +    lw       t1, 0(a2) /* t1 = mask        (a8r8g8b8) */
  1.1264 +    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
  1.1265 +    srl      t1, t1, 24 /* t1 = mask word >> 24 */
  1.1266 +
  1.1267 +    OVER_8888_8_8888 t0, t1, t2, t3, t4, t5, t6, t7, t8
  1.1268 +
  1.1269 +    sw       t3, 0(a0) /* t3 = the pixel produced by the macro above */
  1.1270 +3:
  1.1271 +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
  1.1272 +    j        ra
  1.1273 +     nop
  1.1274 +
  1.1275 +END(pixman_composite_over_8888_8888_8888_asm_mips)
  1.1276 +
  1.1277 +LEAF_MIPS_DSPR2(pixman_composite_over_8888_8888_asm_mips)
  1.1278 +/* One scanline, unmasked. Per pixel - dst = saturating_add(src, MUL(dst, (~src) >> 24)), MUL being the MIPS_*UN8x4_MUL_*UN8 macro.
  1.1279 + * a0 - dst  (a8r8g8b8), loaded and stored in place
  1.1280 + * a1 - src  (a8r8g8b8)
  1.1281 + * a2 - w    (remaining pixel count)
  1.1282 + * Label 1 does two pixels per pass, label 2 the last odd pixel, label 3 is the exit. */
  1.1283 +
  1.1284 +    SAVE_REGS_ON_STACK 0, s0, s1, s2
  1.1285 +    li           t4, 0x00ff00ff /* constant passed to the MUL macros */
  1.1286 +    beqz         a2, 3f    /* w == 0, nothing to do */
  1.1287 +     nop
  1.1288 +    addiu        t1, a2, -1
  1.1289 +    beqz         t1, 2f    /* w == 1, skip the two-pixel loop */
  1.1290 +     nop
  1.1291 +1:
  1.1292 +    lw           t0, 0(a1) /* t0 = source      (a8r8g8b8) */
  1.1293 +    lw           t1, 4(a1) /* t1 = source      (a8r8g8b8) */
  1.1294 +    lw           t2, 0(a0) /* t2 = destination (a8r8g8b8) */
  1.1295 +    lw           t3, 4(a0) /* t3 = destination (a8r8g8b8) */
  1.1296 +    addiu        a1, a1, 8 /* advance src by two pixels */
  1.1297 +
  1.1298 +    not          t5, t0
  1.1299 +    srl          t5, t5, 24 /* t5 = (~src0) >> 24 */
  1.1300 +    not          t6, t1
  1.1301 +    srl          t6, t6, 24 /* t6 = (~src1) >> 24 */
  1.1302 +
  1.1303 +    or           t7, t5, t6
  1.1304 +    beqz         t7, 11f   /* top byte of both sources is 0xff, store the sources unchanged */
  1.1305 +     or          t8, t0, t1
  1.1306 +    beqz         t8, 12f   /* both source words are zero, leave dst untouched */
  1.1307 +    /* NOTE(review): no explicit delay-slot filler after the beqz above - presumably the first macro instruction also runs on the taken path, confirm that is harmless. */
  1.1308 +    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t5, t6, t7, t8, t4, t9, s0, s1, s2, t2, t3
  1.1309 +
  1.1310 +    addu_s.qb    t0, t7, t0 /* per-byte saturating add of the macro result t7 and src0 */
  1.1311 +    addu_s.qb    t1, t8, t1 /* per-byte saturating add of the macro result t8 and src1 */
  1.1312 +11:
  1.1313 +    sw           t0, 0(a0)
  1.1314 +    sw           t1, 4(a0)
  1.1315 +12:
  1.1316 +    addiu        a2, a2, -2
  1.1317 +    addiu        t1, a2, -1
  1.1318 +    bgtz         t1, 1b    /* loop while at least two pixels remain */
  1.1319 +     addiu       a0, a0, 8 /* advance dst by two pixels */
  1.1320 +2:
  1.1321 +    beqz         a2, 3f    /* no odd pixel left */
  1.1322 +     nop
  1.1323 +
  1.1324 +    lw           t0, 0(a1) /* t0 = source      (a8r8g8b8) */
  1.1325 +    lw           t1, 0(a0) /* t1 = destination (a8r8g8b8) */
  1.1326 +    addiu        a1, a1, 4
  1.1327 +
  1.1328 +    not          t2, t0
  1.1329 +    srl          t2, t2, 24 /* t2 = (~src) >> 24 */
  1.1330 +
  1.1331 +    beqz         t2, 21f   /* top byte of src is 0xff, store src unchanged */
  1.1332 +     nop
  1.1333 +    beqz         t0, 3f    /* src word is zero, leave dst untouched */
  1.1334 +    /* NOTE(review): same missing delay-slot filler as in the two-pixel loop, confirm the first macro instruction is harmless on the taken path. */
  1.1335 +    MIPS_UN8x4_MUL_UN8 t1, t2, t3, t4, t5, t6, t7
  1.1336 +
  1.1337 +    addu_s.qb    t0, t3, t0 /* per-byte saturating add of the macro result t3 and src */
  1.1338 +21:
  1.1339 +    sw           t0, 0(a0)
  1.1340 +
  1.1341 +3:
  1.1342 +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
  1.1343 +    j            ra
  1.1344 +     nop
  1.1345 +
  1.1346 +END(pixman_composite_over_8888_8888_asm_mips)
  1.1347 +
  1.1348 +LEAF_MIPS_DSPR2(pixman_composite_over_n_0565_asm_mips)
  1.1349 +/* One scanline, constant src onto an r5g6b5 dst. When the top byte of src is 0xff, dst is simply filled with the converted src.
  1.1350 + * a0 - dst  (r5g6b5), loaded and stored in place
  1.1351 + * a1 - src  (32bit constant)
  1.1352 + * a2 - w    (remaining pixel count)
  1.1353 + * Label 0 is the fill loop, 2 the two-pixel blend loop, 3 the odd last pixel, 5 the exit without register restore. */
  1.1354 +
  1.1355 +    beqz         a2, 5f    /* w == 0, nothing to do */
  1.1356 +     nop
  1.1357 +
  1.1358 +    not          t0, a1
  1.1359 +    srl          t0, t0, 24 /* t0 = (~src) >> 24 */
  1.1360 +    bgtz         t0, 1f    /* nonzero, dst has to be blended */
  1.1361 +     nop
  1.1362 +    CONVERT_1x8888_TO_1x0565 a1, t1, t2, t3
  1.1363 +0:
  1.1364 +    sh           t1, 0(a0) /* t1 = src converted once before the loop */
  1.1365 +    addiu        a2, a2, -1
  1.1366 +    bgtz         a2, 0b
  1.1367 +     addiu       a0, a0, 2 /* advance dst by one pixel */
  1.1368 +    j            ra        /* no s registers were saved on this path */
  1.1369 +     nop
  1.1370 +
  1.1371 +1:
  1.1372 +    SAVE_REGS_ON_STACK 0, s0, s1, s2
  1.1373 +    li           t4, 0x00ff00ff /* constant passed to the MUL macros */
  1.1374 +    li           t5, 0xf800f800 /* t5, t6, t7 are passed to the CONVERT_2x* macros, presumably r5g6b5 field masks */
  1.1375 +    li           t6, 0x07e007e0
  1.1376 +    li           t7, 0x001F001F
  1.1377 +    addiu        t1, a2, -1
  1.1378 +    beqz         t1, 3f    /* w == 1, skip the two-pixel loop */
  1.1379 +     nop
  1.1380 +2:
  1.1381 +    lhu          t1, 0(a0) /* t1 = destination (r5g6b5) */
  1.1382 +    lhu          t2, 2(a0) /* t2 = destination (r5g6b5) */
  1.1383 +
  1.1384 +    CONVERT_2x0565_TO_2x8888 t1, t2, t3, t8, t6, t7, t9, s0, s1, s2
  1.1385 +    MIPS_2xUN8x4_MUL_2xUN8   t3, t8, t0, t0, t1, t2, t4, t9, s0, s1, s2, t3, t8
  1.1386 +    addu_s.qb                t1, t1, a1 /* per-byte saturating add of src */
  1.1387 +    addu_s.qb                t2, t2, a1
  1.1388 +    CONVERT_2x8888_TO_2x0565 t1, t2, t3, t8, t5, t6, t7, s0, s1
  1.1389 +
  1.1390 +    sh           t3, 0(a0) /* t3, t8 = the two r5g6b5 results */
  1.1391 +    sh           t8, 2(a0)
  1.1392 +
  1.1393 +    addiu        a2, a2, -2
  1.1394 +    addiu        t1, a2, -1
  1.1395 +    bgtz         t1, 2b    /* loop while at least two pixels remain */
  1.1396 +     addiu       a0, a0, 4 /* advance dst by two pixels */
  1.1397 +3:
  1.1398 +    beqz         a2, 4f    /* no odd pixel left */
  1.1399 +     nop
  1.1400 +
  1.1401 +    lhu          t1, 0(a0) /* t1 = destination (r5g6b5) */
  1.1402 +
  1.1403 +    CONVERT_1x0565_TO_1x8888 t1, t2, s0, s1
  1.1404 +    MIPS_UN8x4_MUL_UN8       t2, t0, t1, t4, s0, s1, s2
  1.1405 +    addu_s.qb                t1, t1, a1 /* per-byte saturating add of src */
  1.1406 +    CONVERT_1x8888_TO_1x0565 t1, t2, s0, s1
  1.1407 +
  1.1408 +    sh           t2, 0(a0) /* t2 = the r5g6b5 result */
  1.1409 +
  1.1410 +4:
  1.1411 +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
  1.1412 +5:
  1.1413 +    j            ra
  1.1414 +     nop
  1.1415 +
  1.1416 +END(pixman_composite_over_n_0565_asm_mips)
  1.1417 +
  1.1418 +LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_asm_mips)
  1.1419 +/* One scanline, constant src onto an a8r8g8b8 dst. When the top byte of src is 0xff, dst is simply filled with src.
  1.1420 + * a0 - dst  (a8r8g8b8), loaded and stored in place
  1.1421 + * a1 - src  (32bit constant)
  1.1422 + * a2 - w    (remaining pixel count)
  1.1423 + * Label 0 is the fill loop, 2 the two-pixel blend loop, 3 the odd last pixel, 5 the exit without register restore. */
  1.1424 +
  1.1425 +    beqz         a2, 5f    /* w == 0, nothing to do */
  1.1426 +     nop
  1.1427 +
  1.1428 +    not          t0, a1
  1.1429 +    srl          t0, t0, 24 /* t0 = (~src) >> 24 */
  1.1430 +    bgtz         t0, 1f    /* nonzero, dst has to be blended */
  1.1431 +     nop
  1.1432 +0:
  1.1433 +    sw           a1, 0(a0) /* plain fill with src */
  1.1434 +    addiu        a2, a2, -1
  1.1435 +    bgtz         a2, 0b
  1.1436 +     addiu       a0, a0, 4 /* advance dst by one pixel */
  1.1437 +    j            ra        /* no s registers were saved on this path */
  1.1438 +     nop
  1.1439 +
  1.1440 +1:
  1.1441 +    SAVE_REGS_ON_STACK 0, s0, s1, s2
  1.1442 +    li           t4, 0x00ff00ff /* constant passed to the MUL macros */
  1.1443 +    addiu        t1, a2, -1
  1.1444 +    beqz         t1, 3f    /* w == 1, skip the two-pixel loop */
  1.1445 +     nop
  1.1446 +2:
  1.1447 +    lw           t2, 0(a0) /* t2 = destination (a8r8g8b8) */
  1.1448 +    lw           t3, 4(a0) /* t3 = destination (a8r8g8b8) */
  1.1449 +
  1.1450 +    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t0, t0, t7, t8, t4, t9, s0, s1, s2, t2, t3
  1.1451 +
  1.1452 +    addu_s.qb    t7, t7, a1 /* per-byte saturating add of src to the macro result */
  1.1453 +    addu_s.qb    t8, t8, a1
  1.1454 +
  1.1455 +    sw           t7, 0(a0)
  1.1456 +    sw           t8, 4(a0)
  1.1457 +
  1.1458 +    addiu        a2, a2, -2
  1.1459 +    addiu        t1, a2, -1
  1.1460 +    bgtz         t1, 2b    /* loop while at least two pixels remain */
  1.1461 +     addiu       a0, a0, 8 /* advance dst by two pixels */
  1.1462 +3:
  1.1463 +    beqz         a2, 4f    /* no odd pixel left */
  1.1464 +     nop
  1.1465 +
  1.1466 +    lw           t1, 0(a0) /* t1 = destination (a8r8g8b8) */
  1.1467 +
  1.1468 +    MIPS_UN8x4_MUL_UN8 t1, t0, t3, t4, t5, t6, t7
  1.1469 +
  1.1470 +    addu_s.qb    t3, t3, a1 /* per-byte saturating add of src to the macro result */
  1.1471 +
  1.1472 +    sw           t3, 0(a0)
  1.1473 +
  1.1474 +4:
  1.1475 +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
  1.1476 +5:
  1.1477 +    j            ra
  1.1478 +     nop
  1.1479 +
  1.1480 +END(pixman_composite_over_n_8888_asm_mips)
  1.1481 +
  1.1482 +LEAF_MIPS_DSPR2(pixman_composite_add_8_8_8_asm_mips)
  1.1483 +/* One scanline of a8 data. Per pixel - dst = saturating_add(dst, scaled(src * mask)), see the shra_r.ph sequence below.
  1.1484 + * a0 - dst  (a8), loaded and stored in place
  1.1485 + * a1 - src  (a8)
  1.1486 + * a2 - mask (a8)
  1.1487 + * a3 - w    (remaining pixel count)
  1.1488 + * Label 0 handles four pixels per pass, label 2 one pixel per pass, label 3 is the exit. */
  1.1489 +
  1.1490 +    SAVE_REGS_ON_STACK 0, v0, v1
  1.1491 +    li                t9, 0x00ff00ff /* keeps the low byte of each halfword in the and steps below */
  1.1492 +    beqz              a3, 3f      /* w == 0, nothing to do */
  1.1493 +     nop
  1.1494 +
  1.1495 +    srl               v0, a3, 2   /* v0 = how many multiples of 4 dst pixels */
  1.1496 +    beqz              v0, 1f      /* branch if less than 4 src pixels */
  1.1497 +     nop
  1.1498 +
  1.1499 +0:
  1.1500 +    beqz              v0, 1f      /* all groups of four are done */
  1.1501 +     addiu            v0, v0, -1
  1.1502 +    lbu               t0, 0(a2)   /* t0..t3 = four mask bytes */
  1.1503 +    lbu               t1, 1(a2)
  1.1504 +    lbu               t2, 2(a2)
  1.1505 +    lbu               t3, 3(a2)
  1.1506 +    lbu               t4, 0(a0)   /* t4..t7 = four dst bytes */
  1.1507 +    lbu               t5, 1(a0)
  1.1508 +    lbu               t6, 2(a0)
  1.1509 +    lbu               t7, 3(a0)
  1.1510 +
  1.1511 +    addiu             a2, a2, 4
  1.1512 +
  1.1513 +    precr_sra.ph.w    t1, t0, 0   /* t1 = mask1 in the upper halfword, mask0 in the lower */
  1.1514 +    precr_sra.ph.w    t3, t2, 0   /* t3 = mask3, mask2 */
  1.1515 +    precr_sra.ph.w    t5, t4, 0   /* t5 = dst1, dst0 */
  1.1516 +    precr_sra.ph.w    t7, t6, 0   /* t7 = dst3, dst2 */
  1.1517 +
  1.1518 +    precr.qb.ph       t0, t3, t1  /* t0 = four mask bytes packed in one word */
  1.1519 +    precr.qb.ph       t1, t7, t5  /* t1 = four dst bytes packed in one word */
  1.1520 +
  1.1521 +    lbu               t4, 0(a1)   /* t4, v1, t7, t8 = four src bytes */
  1.1522 +    lbu               v1, 1(a1)
  1.1523 +    lbu               t7, 2(a1)
  1.1524 +    lbu               t8, 3(a1)
  1.1525 +
  1.1526 +    addiu             a1, a1, 4
  1.1527 +
  1.1528 +    precr_sra.ph.w    v1, t4, 0   /* v1 = src1, src0 as halfwords */
  1.1529 +    precr_sra.ph.w    t8, t7, 0   /* t8 = src3, src2 as halfwords */
  1.1530 +
  1.1531 +    muleu_s.ph.qbl    t2, t0, t8  /* upper two mask bytes times src3, src2 */
  1.1532 +    muleu_s.ph.qbr    t3, t0, v1  /* lower two mask bytes times src1, src0 */
  1.1533 +    shra_r.ph         t4, t2, 8   /* per halfword x = x + low byte of rounded (x >> 8), then rounded x >> 8 - presumably a rounded divide by 255 */
  1.1534 +    shra_r.ph         t5, t3, 8
  1.1535 +    and               t4, t4, t9
  1.1536 +    and               t5, t5, t9
  1.1537 +    addq.ph           t2, t2, t4
  1.1538 +    addq.ph           t3, t3, t5
  1.1539 +    shra_r.ph         t2, t2, 8
  1.1540 +    shra_r.ph         t3, t3, 8
  1.1541 +    precr.qb.ph       t0, t2, t3  /* repack the four 8-bit products into one word */
  1.1542 +
  1.1543 +    addu_s.qb         t2, t0, t1  /* per-byte saturating add with dst */
  1.1544 +
  1.1545 +    sb                t2, 0(a0)   /* store the four result bytes, lowest byte first */
  1.1546 +    srl               t2, t2, 8
  1.1547 +    sb                t2, 1(a0)
  1.1548 +    srl               t2, t2, 8
  1.1549 +    sb                t2, 2(a0)
  1.1550 +    srl               t2, t2, 8
  1.1551 +    sb                t2, 3(a0)
  1.1552 +    addiu             a3, a3, -4
  1.1553 +    b                 0b
  1.1554 +     addiu            a0, a0, 4
  1.1555 +
  1.1556 +1:
  1.1557 +    beqz              a3, 3f      /* w was a multiple of four, no leftover pixels */
  1.1558 +     nop
  1.1559 +2:
  1.1560 +    lbu               t8, 0(a1)   /* t8 = src */
  1.1561 +    lbu               t0, 0(a2)   /* t0 = mask */
  1.1562 +    lbu               t1, 0(a0)   /* t1 = dst */
  1.1563 +    addiu             a1, a1, 1
  1.1564 +    addiu             a2, a2, 1
  1.1565 +
  1.1566 +    mul               t2, t0, t8  /* t2 = mask * src */
  1.1567 +    shra_r.ph         t3, t2, 8   /* same two-step scaling as in the four-pixel loop */
  1.1568 +    andi              t3, t3, 0xff
  1.1569 +    addq.ph           t2, t2, t3
  1.1570 +    shra_r.ph         t2, t2, 8
  1.1571 +    andi              t2, t2, 0xff
  1.1572 +
  1.1573 +    addu_s.qb         t2, t2, t1  /* saturating add with dst */
  1.1574 +    sb                t2, 0(a0)
  1.1575 +    addiu             a3, a3, -1
  1.1576 +    bnez              a3, 2b
  1.1577 +     addiu            a0, a0, 1
  1.1578 +
  1.1579 +3:
  1.1580 +    RESTORE_REGS_FROM_STACK 0, v0, v1
  1.1581 +    j                 ra
  1.1582 +     nop
  1.1583 +
  1.1584 +END(pixman_composite_add_8_8_8_asm_mips)
  1.1585 +
  1.1586 +LEAF_MIPS_DSPR2(pixman_composite_add_n_8_8_asm_mips)
  1.1587 +/* One scanline of a8 data with a constant src. Per pixel - dst = saturating_add(dst, scaled((src >> 24) * mask)).
  1.1588 + * a0 - dst  (a8), loaded and stored in place
  1.1589 + * a1 - src  (32bit constant), only bits 31..24 are used
  1.1590 + * a2 - mask (a8)
  1.1591 + * a3 - w    (remaining pixel count)
  1.1592 + * Label 0 handles four pixels per pass, label 2 one pixel per pass, label 3 is the exit. */
  1.1593 +
  1.1594 +    SAVE_REGS_ON_STACK 0, v0
  1.1595 +    li                t9, 0x00ff00ff /* keeps the low byte of each halfword in the and steps below */
  1.1596 +    beqz              a3, 3f      /* w == 0, nothing to do */
  1.1597 +     nop
  1.1598 +
  1.1599 +    srl               v0, a3, 2   /* v0 = how many multiples of 4 dst pixels */
  1.1600 +    beqz              v0, 1f      /* branch if less than 4 src pixels */
  1.1601 +     nop
  1.1602 +
  1.1603 +    srl               t8, a1, 24  /* t8 = src >> 24 */
  1.1604 +    replv.ph          t8, t8      /* copy it into both halfwords of t8 */
  1.1605 +
  1.1606 +0:
  1.1607 +    beqz              v0, 1f      /* all groups of four are done */
  1.1608 +     addiu            v0, v0, -1
  1.1609 +    lbu               t0, 0(a2)   /* t0..t3 = four mask bytes */
  1.1610 +    lbu               t1, 1(a2)
  1.1611 +    lbu               t2, 2(a2)
  1.1612 +    lbu               t3, 3(a2)
  1.1613 +    lbu               t4, 0(a0)   /* t4..t7 = four dst bytes */
  1.1614 +    lbu               t5, 1(a0)
  1.1615 +    lbu               t6, 2(a0)
  1.1616 +    lbu               t7, 3(a0)
  1.1617 +
  1.1618 +    addiu             a2, a2, 4
  1.1619 +
  1.1620 +    precr_sra.ph.w    t1, t0, 0   /* t1 = mask1 in the upper halfword, mask0 in the lower */
  1.1621 +    precr_sra.ph.w    t3, t2, 0   /* t3 = mask3, mask2 */
  1.1622 +    precr_sra.ph.w    t5, t4, 0   /* t5 = dst1, dst0 */
  1.1623 +    precr_sra.ph.w    t7, t6, 0   /* t7 = dst3, dst2 */
  1.1624 +
  1.1625 +    precr.qb.ph       t0, t3, t1  /* t0 = four mask bytes packed in one word */
  1.1626 +    precr.qb.ph       t1, t7, t5  /* t1 = four dst bytes packed in one word */
  1.1627 +
  1.1628 +    muleu_s.ph.qbl    t2, t0, t8  /* upper two mask bytes times src >> 24 */
  1.1629 +    muleu_s.ph.qbr    t3, t0, t8  /* lower two mask bytes times src >> 24 */
  1.1630 +    shra_r.ph         t4, t2, 8   /* per halfword x = x + low byte of rounded (x >> 8), then rounded x >> 8 - presumably a rounded divide by 255 */
  1.1631 +    shra_r.ph         t5, t3, 8
  1.1632 +    and               t4, t4, t9
  1.1633 +    and               t5, t5, t9
  1.1634 +    addq.ph           t2, t2, t4
  1.1635 +    addq.ph           t3, t3, t5
  1.1636 +    shra_r.ph         t2, t2, 8
  1.1637 +    shra_r.ph         t3, t3, 8
  1.1638 +    precr.qb.ph       t0, t2, t3  /* repack the four 8-bit products into one word */
  1.1639 +
  1.1640 +    addu_s.qb         t2, t0, t1  /* per-byte saturating add with dst */
  1.1641 +
  1.1642 +    sb                t2, 0(a0)   /* store the four result bytes, lowest byte first */
  1.1643 +    srl               t2, t2, 8
  1.1644 +    sb                t2, 1(a0)
  1.1645 +    srl               t2, t2, 8
  1.1646 +    sb                t2, 2(a0)
  1.1647 +    srl               t2, t2, 8
  1.1648 +    sb                t2, 3(a0)
  1.1649 +    addiu             a3, a3, -4
  1.1650 +    b                 0b
  1.1651 +     addiu            a0, a0, 4
  1.1652 +
  1.1653 +1:
  1.1654 +    beqz              a3, 3f      /* w was a multiple of four, no leftover pixels */
  1.1655 +     nop
  1.1656 +    srl               t8, a1, 24  /* t8 = src >> 24 again, as a plain scalar for mul */
  1.1657 +2:
  1.1658 +    lbu               t0, 0(a2)   /* t0 = mask */
  1.1659 +    lbu               t1, 0(a0)   /* t1 = dst */
  1.1660 +    addiu             a2, a2, 1
  1.1661 +
  1.1662 +    mul               t2, t0, t8  /* t2 = mask * (src >> 24) */
  1.1663 +    shra_r.ph         t3, t2, 8   /* same two-step scaling as in the four-pixel loop */
  1.1664 +    andi              t3, t3, 0xff
  1.1665 +    addq.ph           t2, t2, t3
  1.1666 +    shra_r.ph         t2, t2, 8
  1.1667 +    andi              t2, t2, 0xff
  1.1668 +
  1.1669 +    addu_s.qb         t2, t2, t1  /* saturating add with dst */
  1.1670 +    sb                t2, 0(a0)
  1.1671 +    addiu             a3, a3, -1
  1.1672 +    bnez              a3, 2b
  1.1673 +     addiu            a0, a0, 1
  1.1674 +
  1.1675 +3:
  1.1676 +    RESTORE_REGS_FROM_STACK 0, v0
  1.1677 +    j                 ra
  1.1678 +     nop
  1.1679 +
  1.1680 +END(pixman_composite_add_n_8_8_asm_mips)
  1.1681 +
  1.1682 +LEAF_MIPS_DSPR2(pixman_composite_add_n_8_8888_asm_mips)
  1.1683 +/* One scanline, constant src with a per-pixel a8 mask added onto dst by the MIPS_*_MUL_*_ADD_* macros (defined outside this section).
  1.1684 + * a0 - dst  (a8r8g8b8), loaded and stored in place
  1.1685 + * a1 - src  (32bit constant)
  1.1686 + * a2 - mask (a8), one byte per pixel
  1.1687 + * a3 - w    (remaining pixel count)
  1.1688 + * Label 1 does two pixels per pass, label 2 the last odd pixel, label 3 is the exit. */
  1.1689 +
  1.1690 +    SAVE_REGS_ON_STACK 0, s0, s1, s2
  1.1691 +    li       t4, 0x00ff00ff /* constant passed to the MUL_ADD macros */
  1.1692 +    beqz     a3, 3f    /* w == 0, nothing to do */
  1.1693 +     nop
  1.1694 +    addiu    t1, a3, -1
  1.1695 +    beqz     t1, 2f    /* w == 1, skip the two-pixel loop */
  1.1696 +     nop
  1.1697 +1:
  1.1698 +                       /* a1 = source      (32bit constant) */
  1.1699 +    lbu      t0, 0(a2) /* t0 = mask        (a8) */
  1.1700 +    lbu      t1, 1(a2) /* t1 = mask        (a8) */
  1.1701 +    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
  1.1702 +    lw       t3, 4(a0) /* t3 = destination (a8r8g8b8) */
  1.1703 +    addiu    a2, a2, 2 /* advance mask by two pixels */
  1.1704 +
  1.1705 +    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 a1, a1, \
  1.1706 +                                       t0, t1, \
  1.1707 +                                       t2, t3, \
  1.1708 +                                       t5, t6, \
  1.1709 +                                       t4, t7, t8, t9, s0, s1, s2
  1.1710 +
  1.1711 +    sw       t5, 0(a0) /* t5, t6 = the two pixels produced by the macro above */
  1.1712 +    sw       t6, 4(a0)
  1.1713 +    addiu    a3, a3, -2
  1.1714 +    addiu    t1, a3, -1
  1.1715 +    bgtz     t1, 1b    /* loop while at least two pixels remain */
  1.1716 +     addiu   a0, a0, 8 /* advance dst by two pixels */
  1.1717 +2:
  1.1718 +    beqz     a3, 3f    /* no odd pixel left */
  1.1719 +     nop
  1.1720 +                       /* a1 = source      (32bit constant) */
  1.1721 +    lbu      t0, 0(a2) /* t0 = mask        (a8) */
  1.1722 +    lw       t1, 0(a0) /* t1 = destination (a8r8g8b8) */
  1.1723 +
  1.1724 +    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 a1, t0, t1, t2, t4, t3, t5, t6
  1.1725 +
  1.1726 +    sw       t2, 0(a0) /* t2 = the pixel produced by the macro above */
  1.1727 +3:
  1.1728 +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
  1.1729 +    j        ra
  1.1730 +     nop
  1.1731 +
  1.1732 +END(pixman_composite_add_n_8_8888_asm_mips)
  1.1733 +
  1.1734 +LEAF_MIPS_DSPR2(pixman_composite_add_0565_8_0565_asm_mips)
  1.1735 +/* One scanline, r5g6b5 src with a per-pixel a8 mask added onto an r5g6b5 dst (widen both, MUL_ADD macro, narrow result).
  1.1736 + * a0 - dst  (r5g6b5), loaded and stored in place
  1.1737 + * a1 - src  (r5g6b5)
  1.1738 + * a2 - mask (a8), one byte per pixel
  1.1739 + * a3 - w    (remaining pixel count)
  1.1740 + * Label 1 does two pixels per pass, label 2 the last odd pixel, label 3 is the exit. */
  1.1741 +
  1.1742 +    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
  1.1743 +    li       t4, 0xf800f800 /* t4, t5, t6 are passed to the CONVERT_2x* macros, presumably r5g6b5 field masks */
  1.1744 +    li       t5, 0x07e007e0
  1.1745 +    li       t6, 0x001F001F
  1.1746 +    li       t7, 0x00ff00ff /* constant passed to the MUL_ADD macros */
  1.1747 +    beqz     a3, 3f    /* w == 0, nothing to do */
  1.1748 +     nop
  1.1749 +    addiu    t1, a3, -1
  1.1750 +    beqz     t1, 2f    /* w == 1, skip the two-pixel loop */
  1.1751 +     nop
  1.1752 +1:
  1.1753 +    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
  1.1754 +    lhu      t1, 2(a1) /* t1 = source      (r5g6b5) */
  1.1755 +    lbu      t2, 0(a2) /* t2 = mask        (a8) */
  1.1756 +    lbu      t3, 1(a2) /* t3 = mask        (a8) */
  1.1757 +    lhu      t8, 0(a0) /* t8 = destination (r5g6b5) */
  1.1758 +    lhu      t9, 2(a0) /* t9 = destination (r5g6b5) */
  1.1759 +    addiu    a1, a1, 4 /* advance src by two pixels */
  1.1760 +    addiu    a2, a2, 2 /* advance mask by two pixels */
  1.1761 +
  1.1762 +    CONVERT_2x0565_TO_2x8888  t0, t1, s0, s1, t5, t6, s2, s3, s4, s5
  1.1763 +    CONVERT_2x0565_TO_2x8888  t8, t9, s2, s3, t5, t6, s4, s5, s6, s7
  1.1764 +    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4  s0, s1, \
  1.1765 +                                        t2, t3, \
  1.1766 +                                        s2, s3, \
  1.1767 +                                        t0, t1, \
  1.1768 +                                        t7, s4, s5, s6, s7, t8, t9
  1.1769 +    CONVERT_2x8888_TO_2x0565  t0, t1, s0, s1, t4, t5, t6, s2, s3
  1.1770 +
  1.1771 +    sh       s0, 0(a0) /* s0, s1 = the two r5g6b5 results */
  1.1772 +    sh       s1, 2(a0)
  1.1773 +    addiu    a3, a3, -2
  1.1774 +    addiu    t1, a3, -1
  1.1775 +    bgtz     t1, 1b    /* loop while at least two pixels remain */
  1.1776 +     addiu   a0, a0, 4 /* advance dst by two pixels */
  1.1777 +2:
  1.1778 +    beqz     a3, 3f    /* no odd pixel left */
  1.1779 +     nop
  1.1780 +    lhu      t0, 0(a1) /* t0 = source      (r5g6b5) */
  1.1781 +    lbu      t1, 0(a2) /* t1 = mask        (a8) */
  1.1782 +    lhu      t2, 0(a0) /* t2 = destination (r5g6b5) */
  1.1783 +    /* From here on t4, t5, t6 are handed to the macros in other argument slots, no two-pixel pass follows. */
  1.1784 +    CONVERT_1x0565_TO_1x8888  t0, t3, t4, t5
  1.1785 +    CONVERT_1x0565_TO_1x8888  t2, t4, t5, t6
  1.1786 +    MIPS_UN8x4_MUL_UN8_ADD_UN8x4  t3, t1, t4, t0, t7, t2, t5, t6
  1.1787 +    CONVERT_1x8888_TO_1x0565  t0, t3, t4, t5
  1.1788 +
  1.1789 +    sh       t3, 0(a0) /* t3 = the r5g6b5 result */
  1.1790 +3:
  1.1791 +    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
  1.1792 +    j        ra
  1.1793 +     nop
  1.1794 +
  1.1795 +END(pixman_composite_add_0565_8_0565_asm_mips)
  1.1796 +
  1.1797 +LEAF_MIPS_DSPR2(pixman_composite_add_8888_8_8888_asm_mips)
  1.1798 +/* Per pixel: reads src, mask and dst, combines them, stores a8r8g8b8 to dst.
  1.1799 + * a0 - dst  (a8r8g8b8)
  1.1800 + * a1 - src  (a8r8g8b8)
  1.1801 + * a2 - mask (a8)
  1.1802 + * a3 - w    (pixel count; pairs are handled at 1:, an odd last pixel at 2:)
  1.1803 + */
  1.1804 +    /* s0-s2 are passed to the pair macro as scratch, hence the save. */
  1.1805 +    SAVE_REGS_ON_STACK 0, s0, s1, s2
  1.1806 +    li       t4, 0x00ff00ff /* low byte of each halfword */
  1.1807 +    beqz     a3, 3f         /* w == 0: go straight to the epilogue */
  1.1808 +     nop
  1.1809 +    addiu    t1, a3, -1
  1.1810 +    beqz     t1, 2f         /* w == 1: skip the pair loop */
  1.1811 +     nop
  1.1812 +1:
  1.1813 +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
  1.1814 +    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
  1.1815 +    lbu      t2, 0(a2) /* t2 = mask        (a8) */
  1.1816 +    lbu      t3, 1(a2) /* t3 = mask        (a8) */
  1.1817 +    lw       t5, 0(a0) /* t5 = destination (a8r8g8b8) */
  1.1818 +    lw       t6, 4(a0) /* t6 = destination (a8r8g8b8) */
  1.1819 +    addiu    a1, a1, 8 /* src  += 2 pixels */
  1.1820 +    addiu    a2, a2, 2 /* mask += 2 pixels */
  1.1821 +    /* Macro is defined outside this chunk; by its name it computes src*mask+dst for two pixels. Results land in t7/t8 (stored below). */
  1.1822 +    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 t0, t1, \
  1.1823 +                                       t2, t3, \
  1.1824 +                                       t5, t6, \
  1.1825 +                                       t7, t8, \
  1.1826 +                                       t4, t9, s0, s1, s2, t0, t1
  1.1827 +
  1.1828 +    sw       t7, 0(a0)
  1.1829 +    sw       t8, 4(a0)
  1.1830 +    addiu    a3, a3, -2
  1.1831 +    addiu    t1, a3, -1
  1.1832 +    bgtz     t1, 1b    /* repeat while at least 2 pixels remain */
  1.1833 +     addiu   a0, a0, 8 /* delay slot: dst += 2 pixels */
  1.1834 +2:
  1.1835 +    beqz     a3, 3f    /* no odd pixel left */
  1.1836 +     nop
  1.1837 +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
  1.1838 +    lbu      t1, 0(a2) /* t1 = mask        (a8) */
  1.1839 +    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
  1.1840 +    /* Single-pixel variant of the macro above; the result lands in t3. */
  1.1841 +    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, t1, t2, t3, t4, t5, t6, t7
  1.1842 +
  1.1843 +    sw       t3, 0(a0)
  1.1844 +3:
  1.1845 +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
  1.1846 +    j        ra
  1.1847 +     nop
  1.1848 +
  1.1849 +END(pixman_composite_add_8888_8_8888_asm_mips)
  1.1850 +
  1.1851 +LEAF_MIPS_DSPR2(pixman_composite_add_8888_n_8888_asm_mips)
  1.1852 +/* Per pixel: combines src and dst under a constant mask, stores to dst.
  1.1853 + * a0 - dst  (a8r8g8b8)
  1.1854 + * a1 - src  (a8r8g8b8)
  1.1855 + * a2 - mask (32bit constant; only its top byte is used)
  1.1856 + * a3 - w    (pixel count; pairs are handled at 1:, an odd last pixel at 2:)
  1.1857 + */
  1.1858 +    /* s0-s2 are passed to the pair macro as scratch, hence the save. */
  1.1859 +    SAVE_REGS_ON_STACK 0, s0, s1, s2
  1.1860 +    li       t4, 0x00ff00ff /* low byte of each halfword */
  1.1861 +    beqz     a3, 3f         /* w == 0: go straight to the epilogue */
  1.1862 +     nop
  1.1863 +    srl      a2, a2, 24     /* a2 = bits 31..24 of the mask constant */
  1.1864 +    addiu    t1, a3, -1
  1.1865 +    beqz     t1, 2f         /* w == 1: skip the pair loop */
  1.1866 +     nop
  1.1867 +1:
  1.1868 +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
  1.1869 +    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
  1.1870 +                       /* a2 = mask        (32bit constant) */
  1.1871 +    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
  1.1872 +    lw       t3, 4(a0) /* t3 = destination (a8r8g8b8) */
  1.1873 +    addiu    a1, a1, 8 /* src += 2 pixels */
  1.1874 +    /* Macro is defined outside this chunk; by its name it computes src*mask+dst for two pixels. The same a2 is the mask for both; results land in t5/t6. */
  1.1875 +    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 t0, t1, \
  1.1876 +                                       a2, a2, \
  1.1877 +                                       t2, t3, \
  1.1878 +                                       t5, t6, \
  1.1879 +                                       t4, t7, t8, t9, s0, s1, s2
  1.1880 +
  1.1881 +    sw       t5, 0(a0)
  1.1882 +    sw       t6, 4(a0)
  1.1883 +    addiu    a3, a3, -2
  1.1884 +    addiu    t1, a3, -1
  1.1885 +    bgtz     t1, 1b    /* repeat while at least 2 pixels remain */
  1.1886 +     addiu   a0, a0, 8 /* delay slot: dst += 2 pixels */
  1.1887 +2:
  1.1888 +    beqz     a3, 3f    /* no odd pixel left */
  1.1889 +     nop
  1.1890 +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
  1.1891 +                       /* a2 = mask        (32bit constant) */
  1.1892 +    lw       t1, 0(a0) /* t1 = destination (a8r8g8b8) */
  1.1893 +    /* Single-pixel variant of the macro above; the result lands in t3. */
  1.1894 +    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, a2, t1, t3, t4, t5, t6, t7
  1.1895 +
  1.1896 +    sw       t3, 0(a0)
  1.1897 +3:
  1.1898 +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
  1.1899 +    j        ra
  1.1900 +     nop
  1.1901 +
  1.1902 +END(pixman_composite_add_8888_n_8888_asm_mips)
  1.1903 +
  1.1904 +LEAF_MIPS_DSPR2(pixman_composite_add_8888_8888_8888_asm_mips)
  1.1905 +/* Per pixel: combines src and dst under the top byte of mask, stores to dst.
  1.1906 + * a0 - dst  (a8r8g8b8)
  1.1907 + * a1 - src  (a8r8g8b8)
  1.1908 + * a2 - mask (a8r8g8b8; only bits 31..24 of each mask pixel are used)
  1.1909 + * a3 - w    (pixel count; pairs are handled at 1:, an odd last pixel at 2:)
  1.1910 + */
  1.1911 +    /* s0-s2 are passed to the pair macro as scratch, hence the save. */
  1.1912 +    SAVE_REGS_ON_STACK 0, s0, s1, s2
  1.1913 +    li       t4, 0x00ff00ff /* low byte of each halfword */
  1.1914 +    beqz     a3, 3f         /* w == 0: go straight to the epilogue */
  1.1915 +     nop
  1.1916 +    addiu    t1, a3, -1
  1.1917 +    beqz     t1, 2f         /* w == 1: skip the pair loop */
  1.1918 +     nop
  1.1919 +1:
  1.1920 +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
  1.1921 +    lw       t1, 4(a1) /* t1 = source      (a8r8g8b8) */
  1.1922 +    lw       t2, 0(a2) /* t2 = mask        (a8r8g8b8) */
  1.1923 +    lw       t3, 4(a2) /* t3 = mask        (a8r8g8b8) */
  1.1924 +    lw       t5, 0(a0) /* t5 = destination (a8r8g8b8) */
  1.1925 +    lw       t6, 4(a0) /* t6 = destination (a8r8g8b8) */
  1.1926 +    addiu    a1, a1, 8 /* src  += 2 pixels */
  1.1927 +    addiu    a2, a2, 8 /* mask += 2 pixels */
  1.1928 +    srl      t2, t2, 24 /* t2 = top byte of first mask pixel */
  1.1929 +    srl      t3, t3, 24 /* t3 = top byte of second mask pixel */
  1.1930 +    /* Macro is defined outside this chunk; by its name it computes src*mask+dst for two pixels. Results land in t7/t8 (stored below). */
  1.1931 +    MIPS_2xUN8x4_MUL_2xUN8_ADD_2xUN8x4 t0, t1, \
  1.1932 +                                       t2, t3, \
  1.1933 +                                       t5, t6, \
  1.1934 +                                       t7, t8, \
  1.1935 +                                       t4, t9, s0, s1, s2, t0, t1
  1.1936 +
  1.1937 +    sw       t7, 0(a0)
  1.1938 +    sw       t8, 4(a0)
  1.1939 +    addiu    a3, a3, -2
  1.1940 +    addiu    t1, a3, -1
  1.1941 +    bgtz     t1, 1b    /* repeat while at least 2 pixels remain */
  1.1942 +     addiu   a0, a0, 8 /* delay slot: dst += 2 pixels */
  1.1943 +2:
  1.1944 +    beqz     a3, 3f    /* no odd pixel left */
  1.1945 +     nop
  1.1946 +    lw       t0, 0(a1) /* t0 = source      (a8r8g8b8) */
  1.1947 +    lw       t1, 0(a2) /* t1 = mask        (a8r8g8b8) */
  1.1948 +    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
  1.1949 +    srl      t1, t1, 24 /* t1 = top byte of the mask pixel */
  1.1950 +    /* Single-pixel variant of the macro above; the result lands in t3. */
  1.1951 +    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, t1, t2, t3, t4, t5, t6, t7
  1.1952 +
  1.1953 +    sw       t3, 0(a0)
  1.1954 +3:
  1.1955 +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2
  1.1956 +    j        ra
  1.1957 +     nop
  1.1958 +
  1.1959 +END(pixman_composite_add_8888_8888_8888_asm_mips)
  1.1960 +
  1.1961 +LEAF_MIPS_DSPR2(pixman_composite_add_8_8_asm_mips)
  1.1962 +/* dst[i] = saturating 8-bit sum of dst[i] and src[i], for i in [0, w).
  1.1963 + * a0 - dst  (a8)
  1.1964 + * a1 - src  (a8)
  1.1965 + * a2 - w    (pixel count; groups of 4 at 0:, leftovers one by one at 2:)
  1.1966 + */
  1.1967 +    /* Only t0-t9 are touched, so nothing is saved on the stack. */
  1.1968 +    beqz              a2, 3f
  1.1969 +     nop
  1.1970 +    srl               t9, a2, 2   /* t9 = how many multiples of 4 dst pixels */
  1.1971 +    beqz              t9, 1f      /* branch if less than 4 src pixels */
  1.1972 +     nop
  1.1973 +
  1.1974 +0:
  1.1975 +    beqz              t9, 1f      /* all groups of 4 done */
  1.1976 +     addiu            t9, t9, -1  /* delay slot: runs on both paths */
  1.1977 +    lbu               t0, 0(a1)
  1.1978 +    lbu               t1, 1(a1)
  1.1979 +    lbu               t2, 2(a1)
  1.1980 +    lbu               t3, 3(a1)
  1.1981 +    lbu               t4, 0(a0)
  1.1982 +    lbu               t5, 1(a0)
  1.1983 +    lbu               t6, 2(a0)
  1.1984 +    lbu               t7, 3(a0)
  1.1985 +
  1.1986 +    addiu             a1, a1, 4
  1.1987 +    /* Pack byte pairs into halfwords: t1 = t1:t0, t3 = t3:t2, t5 = t5:t4, t7 = t7:t6 (low 16 bits of each, no shift). */
  1.1988 +    precr_sra.ph.w    t1, t0, 0
  1.1989 +    precr_sra.ph.w    t3, t2, 0
  1.1990 +    precr_sra.ph.w    t5, t4, 0
  1.1991 +    precr_sra.ph.w    t7, t6, 0
  1.1992 +    /* Keep the low byte of every halfword: t0 = 4 src bytes, t1 = 4 dst bytes, pixel 0 in the least significant byte. */
  1.1993 +    precr.qb.ph       t0, t3, t1
  1.1994 +    precr.qb.ph       t1, t7, t5
  1.1995 +
  1.1996 +    addu_s.qb         t2, t0, t1  /* four unsigned saturating byte adds */
  1.1997 +    /* Store byte by byte, least significant first, so dst needs no alignment and the result does not depend on endianness. */
  1.1998 +    sb                t2, 0(a0)
  1.1999 +    srl               t2, t2, 8
  1.2000 +    sb                t2, 1(a0)
  1.2001 +    srl               t2, t2, 8
  1.2002 +    sb                t2, 2(a0)
  1.2003 +    srl               t2, t2, 8
  1.2004 +    sb                t2, 3(a0)
  1.2005 +    addiu             a2, a2, -4
  1.2006 +    b                 0b
  1.2007 +     addiu            a0, a0, 4   /* delay slot: dst += 4 pixels */
  1.2008 +
  1.2009 +1:
  1.2010 +    beqz              a2, 3f      /* w was a multiple of 4 */
  1.2011 +     nop
  1.2012 +2:
  1.2013 +    lbu               t0, 0(a1)
  1.2014 +    lbu               t1, 0(a0)
  1.2015 +    addiu             a1, a1, 1
  1.2016 +    /* Upper three bytes of t0/t1 are zero after lbu, so only the low byte of the sum matters. */
  1.2017 +    addu_s.qb         t2, t0, t1
  1.2018 +    sb                t2, 0(a0)
  1.2019 +    addiu             a2, a2, -1
  1.2020 +    bnez              a2, 2b
  1.2021 +     addiu            a0, a0, 1   /* delay slot: dst += 1 pixel */
  1.2022 +
  1.2023 +3:
  1.2024 +    j                 ra
  1.2025 +     nop
  1.2026 +
  1.2027 +END(pixman_composite_add_8_8_asm_mips)
  1.2028 +
  1.2029 +LEAF_MIPS_DSPR2(pixman_composite_add_8888_8888_asm_mips)
  1.2030 +/* dst[i] = per-byte saturating sum of dst[i] and src[i], for i in [0, w).
  1.2031 + * a0 - dst (a8r8g8b8)
  1.2032 + * a1 - src (a8r8g8b8)
  1.2033 + * a2 - w   (pixel count; groups of 4 at 1:/2:, leftovers one by one at 3:)
  1.2034 + */
  1.2035 +    /* Only t0-t9 are touched, so nothing is saved on the stack. */
  1.2036 +    beqz         a2, 4f
  1.2037 +     nop
  1.2038 +
  1.2039 +    srl          t9, a2, 2      /* t9 = how many multiples of 4 src pixels */
  1.2040 +    beqz         t9, 3f         /* branch if less than 4 src pixels */
  1.2041 +     nop
  1.2042 +1:
  1.2043 +    addiu        t9, t9, -1
  1.2044 +    beqz         t9, 2f         /* last group of 4 is handled at 2: */
  1.2045 +     addiu       a2, a2, -4     /* delay slot: runs for every group, including the last */
  1.2046 +
  1.2047 +    lw           t0, 0(a1)
  1.2048 +    lw           t1, 4(a1)
  1.2049 +    lw           t2, 8(a1)
  1.2050 +    lw           t3, 12(a1)
  1.2051 +    lw           t4, 0(a0)
  1.2052 +    lw           t5, 4(a0)
  1.2053 +    lw           t6, 8(a0)
  1.2054 +    lw           t7, 12(a0)
  1.2055 +    addiu        a1, a1, 16
  1.2056 +    /* addu_s.qb: four independent unsigned saturating byte adds per word. */
  1.2057 +    addu_s.qb    t4, t4, t0
  1.2058 +    addu_s.qb    t5, t5, t1
  1.2059 +    addu_s.qb    t6, t6, t2
  1.2060 +    addu_s.qb    t7, t7, t3
  1.2061 +
  1.2062 +    sw           t4, 0(a0)
  1.2063 +    sw           t5, 4(a0)
  1.2064 +    sw           t6, 8(a0)
  1.2065 +    sw           t7, 12(a0)
  1.2066 +    b            1b
  1.2067 +     addiu       a0, a0, 16     /* delay slot: dst += 4 pixels */
  1.2068 +2:
  1.2069 +    lw           t0, 0(a1)
  1.2070 +    lw           t1, 4(a1)
  1.2071 +    lw           t2, 8(a1)
  1.2072 +    lw           t3, 12(a1)
  1.2073 +    lw           t4, 0(a0)
  1.2074 +    lw           t5, 4(a0)
  1.2075 +    lw           t6, 8(a0)
  1.2076 +    lw           t7, 12(a0)
  1.2077 +    addiu        a1, a1, 16
  1.2078 +    /* Same body as the loop at 1:, peeled for the final group of 4. */
  1.2079 +    addu_s.qb    t4, t4, t0
  1.2080 +    addu_s.qb    t5, t5, t1
  1.2081 +    addu_s.qb    t6, t6, t2
  1.2082 +    addu_s.qb    t7, t7, t3
  1.2083 +
  1.2084 +    sw           t4, 0(a0)
  1.2085 +    sw           t5, 4(a0)
  1.2086 +    sw           t6, 8(a0)
  1.2087 +    sw           t7, 12(a0)
  1.2088 +
  1.2089 +    beqz         a2, 4f         /* w was a multiple of 4 */
  1.2090 +     addiu       a0, a0, 16     /* delay slot: dst += 4 pixels */
  1.2091 +3:
  1.2092 +    lw           t0, 0(a1)
  1.2093 +    lw           t1, 0(a0)
  1.2094 +    addiu        a1, a1, 4
  1.2095 +    addiu        a2, a2, -1
  1.2096 +    addu_s.qb    t1, t1, t0
  1.2097 +    sw           t1, 0(a0)
  1.2098 +    bnez         a2, 3b
  1.2099 +     addiu       a0, a0, 4      /* delay slot: dst += 1 pixel */
  1.2100 +4:
  1.2101 +    jr           ra
  1.2102 +     nop
  1.2103 +
  1.2104 +END(pixman_composite_add_8888_8888_asm_mips)
  1.2105 +
  1.2106 +LEAF_MIPS_DSPR2(pixman_composite_out_reverse_8_0565_asm_mips)
  1.2107 +/* Per pixel: scales dst by the complemented src byte, stores r5g6b5 to dst.
  1.2108 + * a0 - dst  (r5g6b5)
  1.2109 + * a1 - src  (a8)
  1.2110 + * a2 - w    (pixel count; pairs are handled at 1:, an odd last pixel at 2:)
  1.2111 + */
  1.2112 +    /* w == 0 returns via 4:, before anything has been saved. */
  1.2113 +    beqz     a2, 4f
  1.2114 +     nop
  1.2115 +
  1.2116 +    SAVE_REGS_ON_STACK 0, s0, s1, s2, s3
  1.2117 +    li       t2, 0xf800f800 /* red   bits of r5g6b5, in both halfwords */
  1.2118 +    li       t3, 0x07e007e0 /* green bits of r5g6b5, in both halfwords */
  1.2119 +    li       t4, 0x001F001F /* blue  bits of r5g6b5, in both halfwords */
  1.2120 +    li       t5, 0x00ff00ff /* low byte of each halfword */
  1.2121 +
  1.2122 +    addiu    t1, a2, -1
  1.2123 +    beqz     t1, 2f         /* w == 1: skip the pair loop */
  1.2124 +     nop
  1.2125 +1:
  1.2126 +    lbu      t0, 0(a1) /* t0 = source      (a8) */
  1.2127 +    lbu      t1, 1(a1) /* t1 = source      (a8) */
  1.2128 +    lhu      t6, 0(a0) /* t6 = destination (r5g6b5) */
  1.2129 +    lhu      t7, 2(a0) /* t7 = destination (r5g6b5) */
  1.2130 +    addiu    a1, a1, 2 /* src += 2 pixels */
  1.2131 +    /* not + andi 0xff yields 255 - source in the low byte. */
  1.2132 +    not      t0, t0
  1.2133 +    not      t1, t1
  1.2134 +    andi     t0, 0xff  /* t0 = neg source1 */
  1.2135 +    andi     t1, 0xff  /* t1 = neg source2 */
  1.2136 +    CONVERT_2x0565_TO_2x8888 t6, t7, t8, t9, t3, t4, s0, s1, s2, s3
  1.2137 +    MIPS_2xUN8x4_MUL_2xUN8   t8, t9, t0, t1, t6, t7, t5, s0, s1, s2, s3, t8, t9
  1.2138 +    CONVERT_2x8888_TO_2x0565 t6, t7, t8, t9, t2, t3, t4, s0, s1
  1.2139 +    /* Macros above are defined outside this chunk; t8/t9 hold the r5g6b5 values that are stored to dst. */
  1.2140 +    sh       t8, 0(a0)
  1.2141 +    sh       t9, 2(a0)
  1.2142 +    addiu    a2, a2, -2
  1.2143 +    addiu    t1, a2, -1
  1.2144 +    bgtz     t1, 1b    /* repeat while at least 2 pixels remain */
  1.2145 +     addiu   a0, a0, 4 /* delay slot: dst += 2 pixels */
  1.2146 +2:
  1.2147 +    beqz     a2, 3f    /* no odd pixel left */
  1.2148 +     nop
  1.2149 +    lbu      t0, 0(a1) /* t0 = source      (a8) */
  1.2150 +    lhu      t1, 0(a0) /* t1 = destination (r5g6b5) */
  1.2151 +    /* Single-pixel path: t3/t4 are passed as extra operands below, so they presumably stop holding the 565 masks. */
  1.2152 +    not      t0, t0
  1.2153 +    andi     t0, 0xff  /* t0 = neg source */
  1.2154 +    CONVERT_1x0565_TO_1x8888 t1, t2, t3, t4
  1.2155 +    MIPS_UN8x4_MUL_UN8        t2, t0, t1, t5, t3, t4, t6
  1.2156 +    CONVERT_1x8888_TO_1x0565 t1, t2, t3, t4
  1.2157 +    /* t2 holds the r5g6b5 value that is stored to dst. */
  1.2158 +    sh       t2, 0(a0)
  1.2159 +3:
  1.2160 +    RESTORE_REGS_FROM_STACK 0, s0, s1, s2, s3
  1.2161 +4:
  1.2162 +    j        ra
  1.2163 +     nop
  1.2164 +
  1.2165 +END(pixman_composite_out_reverse_8_0565_asm_mips)
  1.2166 +
  1.2167 +LEAF_MIPS_DSPR2(pixman_composite_out_reverse_8_8888_asm_mips)
  1.2168 +/* Per pixel: scales dst by the complemented src byte, stores to dst.
  1.2169 + * a0 - dst  (a8r8g8b8)
  1.2170 + * a1 - src  (a8)
  1.2171 + * a2 - w    (pixel count; pairs are handled at 1:, an odd last pixel at 2:)
  1.2172 + */
  1.2173 +    /* Only t0-t9 are touched, so nothing is saved on the stack. */
  1.2174 +    beqz     a2, 3f
  1.2175 +     nop
  1.2176 +    li       t4, 0x00ff00ff /* low byte of each halfword */
  1.2177 +    addiu    t1, a2, -1
  1.2178 +    beqz     t1, 2f         /* w == 1: skip the pair loop */
  1.2179 +     nop
  1.2180 +1:
  1.2181 +    lbu      t0, 0(a1) /* t0 = source      (a8) */
  1.2182 +    lbu      t1, 1(a1) /* t1 = source      (a8) */
  1.2183 +    lw       t2, 0(a0) /* t2 = destination (a8r8g8b8) */
  1.2184 +    lw       t3, 4(a0) /* t3 = destination (a8r8g8b8) */
  1.2185 +    addiu    a1, a1, 2 /* src += 2 pixels */
  1.2186 +    not      t0, t0
  1.2187 +    not      t1, t1
  1.2188 +    andi     t0, 0xff  /* t0 = neg source */
  1.2189 +    andi     t1, 0xff  /* t1 = neg source */
  1.2190 +    /* Macro is defined outside this chunk; by its name it multiplies each dst pixel by its 8-bit factor. Results land in t5/t6. */
  1.2191 +    MIPS_2xUN8x4_MUL_2xUN8 t2, t3, t0, t1, t5, t6, t4, t7, t8, t9, t2, t3, t0
  1.2192 +
  1.2193 +    sw       t5, 0(a0)
  1.2194 +    sw       t6, 4(a0)
  1.2195 +    addiu    a2, a2, -2
  1.2196 +    addiu    t1, a2, -1
  1.2197 +    bgtz     t1, 1b    /* repeat while at least 2 pixels remain */
  1.2198 +     addiu   a0, a0, 8 /* delay slot: dst += 2 pixels */
  1.2199 +2:
  1.2200 +    beqz     a2, 3f    /* no odd pixel left */
  1.2201 +     nop
  1.2202 +    lbu      t0, 0(a1) /* t0 = source      (a8) */
  1.2203 +    lw       t1, 0(a0) /* t1 = destination (a8r8g8b8) */
  1.2204 +    not      t0, t0
  1.2205 +    andi     t0, 0xff  /* t0 = neg source */
  1.2206 +    /* Single-pixel variant of the macro above; the result lands in t2. */
  1.2207 +    MIPS_UN8x4_MUL_UN8 t1, t0, t2, t4, t3, t5, t6
  1.2208 +
  1.2209 +    sw       t2, 0(a0)
  1.2210 +3:
  1.2211 +    j        ra
  1.2212 +     nop
  1.2213 +
  1.2214 +END(pixman_composite_out_reverse_8_8888_asm_mips)
  1.2215 +
  1.2216 +LEAF_MIPS_DSPR2(pixman_composite_over_reverse_n_8888_asm_mips)
  1.2217 +/* dst = dst + src * (255 - dst alpha) / 255 per channel, saturating add.
  1.2218 + * a0 - dst  (a8r8g8b8)
  1.2219 + * a1 - src  (32bit constant)
  1.2220 + * a2 - w    (pixel count; groups of 4 at 1:, leftovers one by one at 3:)
  1.2221 + */
  1.2222 +    /* w == 0 returns via 5:, before anything has been saved. */
  1.2223 +    beqz              a2, 5f
  1.2224 +     nop
  1.2225 +
  1.2226 +    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
  1.2227 +    li                t0, 0x00ff00ff
  1.2228 +    srl               t9, a2, 2   /* t9 = how many multiples of 4 src pixels */
  1.2229 +    beqz              t9, 2f      /* branch if less than 4 src pixels */
  1.2230 +     nop
  1.2231 +1:
  1.2232 +    beqz              t9, 2f      /* all groups of 4 done */
  1.2233 +     addiu            t9, t9, -1  /* delay slot: runs on both paths */
  1.2234 +
  1.2235 +    lw                t1, 0(a0)
  1.2236 +    lw                t2, 4(a0)
  1.2237 +    lw                t3, 8(a0)
  1.2238 +    lw                t4, 12(a0)
  1.2239 +
  1.2240 +    addiu             a2, a2, -4
  1.2241 +    /* t5..t8 = 255 - top byte of each dst pixel, replicated into both halfwords. */
  1.2242 +    not               t5, t1
  1.2243 +    not               t6, t2
  1.2244 +    not               t7, t3
  1.2245 +    not               t8, t4
  1.2246 +    srl               t5, t5, 24
  1.2247 +    srl               t6, t6, 24
  1.2248 +    srl               t7, t7, 24
  1.2249 +    srl               t8, t8, 24
  1.2250 +    replv.ph          t5, t5
  1.2251 +    replv.ph          t6, t6
  1.2252 +    replv.ph          t7, t7
  1.2253 +    replv.ph          t8, t8
  1.2254 +    muleu_s.ph.qbl    s0, a1, t5  /* upper two src bytes * factor */
  1.2255 +    muleu_s.ph.qbr    s1, a1, t5  /* lower two src bytes * factor */
  1.2256 +    muleu_s.ph.qbl    s2, a1, t6
  1.2257 +    muleu_s.ph.qbr    s3, a1, t6
  1.2258 +    muleu_s.ph.qbl    s4, a1, t7
  1.2259 +    muleu_s.ph.qbr    s5, a1, t7
  1.2260 +    muleu_s.ph.qbl    s6, a1, t8
  1.2261 +    muleu_s.ph.qbr    s7, a1, t8
  1.2262 +    /* Divide each 16-bit product p by 255: round(p/256) masked to 8 bits is added back to p, then the sum is rounded and shifted right by 8. */
  1.2263 +    shra_r.ph         t5, s0, 8
  1.2264 +    shra_r.ph         t6, s1, 8
  1.2265 +    shra_r.ph         t7, s2, 8
  1.2266 +    shra_r.ph         t8, s3, 8
  1.2267 +    and               t5, t5, t0
  1.2268 +    and               t6, t6, t0
  1.2269 +    and               t7, t7, t0
  1.2270 +    and               t8, t8, t0
  1.2271 +    addq.ph           s0, s0, t5
  1.2272 +    addq.ph           s1, s1, t6
  1.2273 +    addq.ph           s2, s2, t7
  1.2274 +    addq.ph           s3, s3, t8
  1.2275 +    shra_r.ph         s0, s0, 8
  1.2276 +    shra_r.ph         s1, s1, 8
  1.2277 +    shra_r.ph         s2, s2, 8
  1.2278 +    shra_r.ph         s3, s3, 8
  1.2279 +    shra_r.ph         t5, s4, 8
  1.2280 +    shra_r.ph         t6, s5, 8
  1.2281 +    shra_r.ph         t7, s6, 8
  1.2282 +    shra_r.ph         t8, s7, 8
  1.2283 +    and               t5, t5, t0
  1.2284 +    and               t6, t6, t0
  1.2285 +    and               t7, t7, t0
  1.2286 +    and               t8, t8, t0
  1.2287 +    addq.ph           s4, s4, t5
  1.2288 +    addq.ph           s5, s5, t6
  1.2289 +    addq.ph           s6, s6, t7
  1.2290 +    addq.ph           s7, s7, t8
  1.2291 +    shra_r.ph         s4, s4, 8
  1.2292 +    shra_r.ph         s5, s5, 8
  1.2293 +    shra_r.ph         s6, s6, 8
  1.2294 +    shra_r.ph         s7, s7, 8
  1.2295 +    /* Repack the low byte of every halfword into one word per pixel, then add the original dst with per-byte saturation. */
  1.2296 +    precr.qb.ph       t5, s0, s1
  1.2297 +    precr.qb.ph       t6, s2, s3
  1.2298 +    precr.qb.ph       t7, s4, s5
  1.2299 +    precr.qb.ph       t8, s6, s7
  1.2300 +    addu_s.qb         t5, t1, t5
  1.2301 +    addu_s.qb         t6, t2, t6
  1.2302 +    addu_s.qb         t7, t3, t7
  1.2303 +    addu_s.qb         t8, t4, t8
  1.2304 +
  1.2305 +    sw                t5, 0(a0)
  1.2306 +    sw                t6, 4(a0)
  1.2307 +    sw                t7, 8(a0)
  1.2308 +    sw                t8, 12(a0)
  1.2309 +    b                 1b
  1.2310 +     addiu            a0, a0, 16  /* delay slot: dst += 4 pixels */
  1.2311 +
  1.2312 +2:
  1.2313 +    beqz              a2, 4f      /* w was a multiple of 4 */
  1.2314 +     nop
  1.2315 +3:
  1.2316 +    lw                t1, 0(a0)
  1.2317 +    /* Same computation as above for one pixel; t9 is free to use as scratch because the group counter is no longer needed. */
  1.2318 +    not               t2, t1
  1.2319 +    srl               t2, t2, 24
  1.2320 +    replv.ph          t2, t2
  1.2321 +
  1.2322 +    muleu_s.ph.qbl    t4, a1, t2
  1.2323 +    muleu_s.ph.qbr    t5, a1, t2
  1.2324 +    shra_r.ph         t6, t4, 8
  1.2325 +    shra_r.ph         t7, t5, 8
  1.2326 +
  1.2327 +    and               t6,t6,t0
  1.2328 +    and               t7,t7,t0
  1.2329 +
  1.2330 +    addq.ph           t8, t4, t6
  1.2331 +    addq.ph           t9, t5, t7
  1.2332 +
  1.2333 +    shra_r.ph         t8, t8, 8
  1.2334 +    shra_r.ph         t9, t9, 8
  1.2335 +
  1.2336 +    precr.qb.ph       t9, t8, t9
  1.2337 +
  1.2338 +    addu_s.qb         t9, t1, t9
  1.2339 +    sw                t9, 0(a0)
  1.2340 +
  1.2341 +    addiu             a2, a2, -1
  1.2342 +    bnez              a2, 3b
  1.2343 +     addiu            a0, a0, 4   /* delay slot: dst += 1 pixel */
  1.2344 +4:
  1.2345 +    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
  1.2346 +5:
  1.2347 +    j                 ra
  1.2348 +     nop
  1.2349 +
  1.2350 +END(pixman_composite_over_reverse_n_8888_asm_mips)
  1.2351 +
  1.2352 +LEAF_MIPS_DSPR2(pixman_composite_in_n_8_asm_mips)
  1.2353 +/* dst[i] = dst[i] * (top byte of src) / 255, for i in [0, w).
  1.2354 + * a0 - dst  (a8)
  1.2355 + * a1 - src  (a8r8g8b8 constant; only bits 31..24 are used)
  1.2356 + * a2 - w    (pixel count; groups of 4 at 1:, leftovers one by one at 3:)
  1.2357 + */
  1.2358 +    /* w == 0 returns via 5:, before anything has been saved. */
  1.2359 +    beqz              a2, 5f
  1.2360 +     nop
  1.2361 +
  1.2362 +    SAVE_REGS_ON_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
  1.2363 +    move              t7, a1
  1.2364 +    srl               t5, t7, 24  /* t5 = top byte of src */
  1.2365 +    replv.ph          t5, t5      /* replicate it into both halfwords */
  1.2366 +    srl               t9, a2, 2   /* t9 = how many multiples of 4 src pixels */
  1.2367 +    beqz              t9, 2f      /* branch if less than 4 src pixels */
  1.2368 +     nop
  1.2369 +
  1.2370 +1:
  1.2371 +    addiu             t9, t9, -1
  1.2372 +    addiu             a2, a2, -4
  1.2373 +    lbu               t0, 0(a0)
  1.2374 +    lbu               t1, 1(a0)
  1.2375 +    lbu               t2, 2(a0)
  1.2376 +    lbu               t3, 3(a0)
  1.2377 +    /* lbu leaves the pixel in the low byte, so only the .qbr products (s1, s3, s5, s7) carry data; the .qbl ones multiply zero bytes. */
  1.2378 +    muleu_s.ph.qbl    s0, t0, t5
  1.2379 +    muleu_s.ph.qbr    s1, t0, t5
  1.2380 +    muleu_s.ph.qbl    s2, t1, t5
  1.2381 +    muleu_s.ph.qbr    s3, t1, t5
  1.2382 +    muleu_s.ph.qbl    s4, t2, t5
  1.2383 +    muleu_s.ph.qbr    s5, t2, t5
  1.2384 +    muleu_s.ph.qbl    s6, t3, t5
  1.2385 +    muleu_s.ph.qbr    s7, t3, t5
  1.2386 +    /* Divide each product p by 255: (p + (p >> 8)), then a rounding shift right by 8. */
  1.2387 +    shrl.ph           t4, s0, 8
  1.2388 +    shrl.ph           t6, s1, 8
  1.2389 +    shrl.ph           t7, s2, 8
  1.2390 +    shrl.ph           t8, s3, 8
  1.2391 +    addq.ph           t0, s0, t4
  1.2392 +    addq.ph           t1, s1, t6
  1.2393 +    addq.ph           t2, s2, t7
  1.2394 +    addq.ph           t3, s3, t8
  1.2395 +    shra_r.ph         t0, t0, 8
  1.2396 +    shra_r.ph         t1, t1, 8
  1.2397 +    shra_r.ph         t2, t2, 8
  1.2398 +    shra_r.ph         t3, t3, 8
  1.2399 +    shrl.ph           t4, s4, 8
  1.2400 +    shrl.ph           t6, s5, 8
  1.2401 +    shrl.ph           t7, s6, 8
  1.2402 +    shrl.ph           t8, s7, 8
  1.2403 +    addq.ph           s0, s4, t4
  1.2404 +    addq.ph           s1, s5, t6
  1.2405 +    addq.ph           s2, s6, t7
  1.2406 +    addq.ph           s3, s7, t8
  1.2407 +    shra_r.ph         t4, s0, 8
  1.2408 +    shra_r.ph         t6, s1, 8
  1.2409 +    shra_r.ph         t7, s2, 8
  1.2410 +    shra_r.ph         t8, s3, 8
  1.2411 +    /* precr.qb.ph keeps the low byte of every halfword; sb then stores only the least significant byte of each packed word. */
  1.2412 +    precr.qb.ph       s0, t0, t1
  1.2413 +    precr.qb.ph       s1, t2, t3
  1.2414 +    precr.qb.ph       s2, t4, t6
  1.2415 +    precr.qb.ph       s3, t7, t8
  1.2416 +
  1.2417 +    sb                s0, 0(a0)
  1.2418 +    sb                s1, 1(a0)
  1.2419 +    sb                s2, 2(a0)
  1.2420 +    sb                s3, 3(a0)
  1.2421 +    bgtz              t9, 1b      /* more groups of 4 to go */
  1.2422 +     addiu            a0, a0, 4   /* delay slot: dst += 4 pixels */
  1.2423 +2:
  1.2424 +    beqz              a2, 4f      /* w was a multiple of 4 */
  1.2425 +     nop
  1.2426 +3:
  1.2427 +    lbu               t1, 0(a0)
  1.2428 +    /* Same computation for one pixel; t9 is reused as scratch here. */
  1.2429 +    muleu_s.ph.qbl    t4, t1, t5
  1.2430 +    muleu_s.ph.qbr    t7, t1, t5
  1.2431 +    shrl.ph           t6, t4, 8
  1.2432 +    shrl.ph           t0, t7, 8
  1.2433 +    addq.ph           t8, t4, t6
  1.2434 +    addq.ph           t9, t7, t0
  1.2435 +    shra_r.ph         t8, t8, 8
  1.2436 +    shra_r.ph         t9, t9, 8
  1.2437 +    precr.qb.ph       t2, t8, t9
  1.2438 +    sb                t2, 0(a0)
  1.2439 +    addiu             a2, a2, -1
  1.2440 +    bnez              a2, 3b
  1.2441 +     addiu            a0, a0, 1   /* delay slot: dst += 1 pixel */
  1.2442 +4:
  1.2443 +    RESTORE_REGS_FROM_STACK 20, s0, s1, s2, s3, s4, s5, s6, s7
  1.2444 +5:
  1.2445 +    j                 ra
  1.2446 +     nop
  1.2447 +
  1.2448 +END(pixman_composite_in_n_8_asm_mips)
  1.2449 +
  1.2450 +LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips)
  1.2451 +/* Per dst pixel: fetches src[vx >> 16], combines it with mask and dst, stores r5g6b5.
  1.2452 + * a0     - dst  (r5g6b5)
  1.2453 + * a1     - src  (a8r8g8b8)
  1.2454 + * a2     - mask (a8)
  1.2455 + * a3     - w    (pixel count; pairs at 1:, an odd last pixel at 2:)
  1.2456 + * 16(sp) - vx      (source x; the code uses bits 31..16 as the pixel index)
  1.2457 + * 20(sp) - unit_x  (added to vx after every source fetch in the pair loop)
  1.2458 + */
  1.2459 +    beqz     a3, 4f
  1.2460 +     nop
  1.2461 +    /* The stack arguments are read at +20 from their entry offsets below; presumably SAVE_REGS_ON_STACK 20 lowers sp by 20 - confirm in the macro. */
  1.2462 +    SAVE_REGS_ON_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5
  1.2463 +    lw       v0, 36(sp) /* v0 = vx */
  1.2464 +    lw       v1, 40(sp) /* v1 = unit_x */
  1.2465 +    li       t6, 0x00ff00ff
  1.2466 +    li       t7, 0xf800f800
  1.2467 +    li       t8, 0x07e007e0
  1.2468 +    li       t9, 0x001F001F
  1.2469 +
  1.2470 +    addiu    t1, a3, -1
  1.2471 +    beqz     t1, 2f     /* w == 1: skip the pair loop */
  1.2472 +     nop
  1.2473 +1:
  1.2474 +    sra      t0, v0, 16 /* t0 = vx >> 16 */
  1.2475 +    sll      t0, t0, 2  /* t0 = t0 * 4      (a8r8g8b8) */
  1.2476 +    addu     t0, a1, t0
  1.2477 +    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
  1.2478 +    addu     v0, v0, v1 /* v0 = vx + unit_x */
  1.2479 +    sra      t1, v0, 16 /* t1 = vx >> 16 */
  1.2480 +    sll      t1, t1, 2  /* t1 = t1 * 4      (a8r8g8b8) */
  1.2481 +    addu     t1, a1, t1
  1.2482 +    lw       t1, 0(t1)  /* t1 = source      (a8r8g8b8) */
  1.2483 +    addu     v0, v0, v1 /* v0 = vx + unit_x */
  1.2484 +    lbu      t2, 0(a2)  /* t2 = mask        (a8) */
  1.2485 +    lbu      t3, 1(a2)  /* t3 = mask        (a8) */
  1.2486 +    lhu      t4, 0(a0)  /* t4 = destination (r5g6b5) */
  1.2487 +    lhu      t5, 2(a0)  /* t5 = destination (r5g6b5) */
  1.2488 +    addiu    a2, a2, 2  /* mask += 2 pixels */
  1.2489 +    /* Macros below are defined outside this chunk; judging by their names: widen dst to 8888, masked OVER, narrow to 0565. */
  1.2490 +    CONVERT_2x0565_TO_2x8888 t4, t5, s0, s1, t8, t9, s2, s3, s4, s5
  1.2491 +    OVER_2x8888_2x8_2x8888   t0, t1, \
  1.2492 +                             t2, t3, \
  1.2493 +                             s0, s1, \
  1.2494 +                             t4, t5, \
  1.2495 +                             t6, s2, s3, s4, s5, t2, t3
  1.2496 +    CONVERT_2x8888_TO_2x0565 t4, t5, s0, s1, t7, t8, t9, s2, s3
  1.2497 +    /* s0/s1 hold the two r5g6b5 values that are stored to dst. */
  1.2498 +    sh       s0, 0(a0)
  1.2499 +    sh       s1, 2(a0)
  1.2500 +    addiu    a3, a3, -2
  1.2501 +    addiu    t1, a3, -1
  1.2502 +    bgtz     t1, 1b     /* repeat while at least 2 pixels remain */
  1.2503 +     addiu   a0, a0, 4  /* delay slot: dst += 2 pixels */
  1.2504 +2:
  1.2505 +    beqz     a3, 3f     /* no odd pixel left */
  1.2506 +     nop
  1.2507 +    sra      t0, v0, 16 /* t0 = vx >> 16 */
  1.2508 +    sll      t0, t0, 2  /* t0 = t0 * 4      (a8r8g8b8) */
  1.2509 +    addu     t0, a1, t0
  1.2510 +    lw       t0, 0(t0)  /* t0 = source      (a8r8g8b8) */
  1.2511 +    lbu      t1, 0(a2)  /* t1 = mask        (a8) */
  1.2512 +    lhu      t2, 0(a0)  /* t2 = destination (r5g6b5) */
  1.2513 +    /* Single-pixel path: t7/t8 are passed as extra operands to the OVER macro, so they presumably stop holding the 565 masks. */
  1.2514 +    CONVERT_1x0565_TO_1x8888 t2, t3, t4, t5
  1.2515 +    OVER_8888_8_8888         t0, t1, t3, t2, t6, t4, t5, t7, t8
  1.2516 +    CONVERT_1x8888_TO_1x0565 t2, t3, t4, t5
  1.2517 +    /* t3 holds the r5g6b5 value that is stored to dst. */
  1.2518 +    sh       t3, 0(a0)
  1.2519 +3:
  1.2520 +    RESTORE_REGS_FROM_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5
  1.2521 +4:
  1.2522 +    j        ra
  1.2523 +     nop
  1.2524 +
  1.2525 +END(pixman_scaled_nearest_scanline_8888_8_0565_OVER_asm_mips)
  1.2526 +
  1.2527 +LEAF_MIPS_DSPR2(pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_mips)
  1.2528 +/* Per dst pixel: fetches src[vx >> 16], combines it with mask and dst, stores r5g6b5.
  1.2529 + * a0     - dst  (r5g6b5)
  1.2530 + * a1     - src  (r5g6b5)
  1.2531 + * a2     - mask (a8)
  1.2532 + * a3     - w    (pixel count; pairs at 1:, an odd last pixel at 2:)
  1.2533 + * 16(sp) - vx      (source x; the code uses bits 31..16 as the pixel index)
  1.2534 + * 20(sp) - unit_x  (added to vx after every source fetch in the pair loop)
  1.2535 + */
  1.2536 +    /* w == 0 returns via 4:, before anything has been saved. */
  1.2537 +    beqz     a3, 4f
  1.2538 +     nop
  1.2539 +    SAVE_REGS_ON_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5
  1.2540 +    lw       v0, 36(sp) /* v0 = vx */
  1.2541 +    lw       v1, 40(sp) /* v1 = unit_x */
  1.2542 +    li       t4, 0xf800f800
  1.2543 +    li       t5, 0x07e007e0
  1.2544 +    li       t6, 0x001F001F
  1.2545 +    li       t7, 0x00ff00ff
  1.2546 +    /* Stack arguments are read at +20 from their entry offsets above; presumably SAVE_REGS_ON_STACK 20 lowers sp by 20 - confirm in the macro. */
  1.2547 +    addiu    t1, a3, -1
  1.2548 +    beqz     t1, 2f     /* w == 1: skip the pair loop */
  1.2549 +     nop
  1.2550 +1:
  1.2551 +    sra      t0, v0, 16 /* t0 = vx >> 16 */
  1.2552 +    sll      t0, t0, 1  /* t0 = t0 * 2      (r5g6b5) */
  1.2553 +    addu     t0, a1, t0
  1.2554 +    lhu      t0, 0(t0)  /* t0 = source      (r5g6b5) */
  1.2555 +    addu     v0, v0, v1 /* v0 = vx + unit_x */
  1.2556 +    sra      t1, v0, 16 /* t1 = vx >> 16 */
  1.2557 +    sll      t1, t1, 1  /* t1 = t1 * 2      (r5g6b5) */
  1.2558 +    addu     t1, a1, t1
  1.2559 +    lhu      t1, 0(t1)  /* t1 = source      (r5g6b5) */
  1.2560 +    addu     v0, v0, v1 /* v0 = vx + unit_x */
  1.2561 +    lbu      t2, 0(a2)  /* t2 = mask        (a8) */
  1.2562 +    lbu      t3, 1(a2)  /* t3 = mask        (a8) */
  1.2563 +    lhu      t8, 0(a0)  /* t8 = destination (r5g6b5) */
  1.2564 +    lhu      t9, 2(a0)  /* t9 = destination (r5g6b5) */
  1.2565 +    addiu    a2, a2, 2  /* mask += 2 pixels */
  1.2566 +    /* Macros below are defined outside this chunk; judging by their names: widen src and dst to 8888, masked OVER, narrow to 0565. */
  1.2567 +    CONVERT_2x0565_TO_2x8888 t0, t1, s0, s1, t5, t6, s2, s3, s4, s5
  1.2568 +    CONVERT_2x0565_TO_2x8888 t8, t9, s2, s3, t5, t6, s4, s5, t0, t1
  1.2569 +    OVER_2x8888_2x8_2x8888   s0, s1, \
  1.2570 +                             t2, t3, \
  1.2571 +                             s2, s3, \
  1.2572 +                             t0, t1, \
  1.2573 +                             t7, t8, t9, s4, s5, s0, s1
  1.2574 +    CONVERT_2x8888_TO_2x0565 t0, t1, s0, s1, t4, t5, t6, s2, s3
  1.2575 +    /* s0/s1 hold the two r5g6b5 values that are stored to dst. */
  1.2576 +    sh       s0, 0(a0)
  1.2577 +    sh       s1, 2(a0)
  1.2578 +    addiu    a3, a3, -2
  1.2579 +    addiu    t1, a3, -1
  1.2580 +    bgtz     t1, 1b     /* repeat while at least 2 pixels remain */
  1.2581 +     addiu   a0, a0, 4  /* delay slot: dst += 2 pixels */
  1.2582 +2:
  1.2583 +    beqz     a3, 3f     /* no odd pixel left */
  1.2584 +     nop
  1.2585 +    sra      t0, v0, 16 /* t0 = vx >> 16 */
  1.2586 +    sll      t0, t0, 1  /* t0 = t0 * 2      (r5g6b5) */
  1.2587 +    addu     t0, a1, t0
  1.2588 +
  1.2589 +    lhu      t0, 0(t0)  /* t0 = source      (r5g6b5) */
  1.2590 +    lbu      t1, 0(a2)  /* t1 = mask        (a8) */
  1.2591 +    lhu      t2, 0(a0)  /* t2 = destination (r5g6b5) */
  1.2592 +    /* Single-pixel path: t4-t6 are passed as output/scratch operands here, so they presumably no longer hold the 565 masks afterwards. */
  1.2593 +    CONVERT_1x0565_TO_1x8888 t0, t3, t4, t5
  1.2594 +    CONVERT_1x0565_TO_1x8888 t2, t4, t5, t6
  1.2595 +    OVER_8888_8_8888         t3, t1, t4, t0, t7, t2, t5, t6, t8
  1.2596 +    CONVERT_1x8888_TO_1x0565 t0, t3, t4, t5
  1.2597 +    /* t3 holds the r5g6b5 value that is stored to dst. */
  1.2598 +    sh       t3, 0(a0)
  1.2599 +3:
  1.2600 +    RESTORE_REGS_FROM_STACK 20, v0, v1, s0, s1, s2, s3, s4, s5
  1.2601 +4:
  1.2602 +    j        ra
  1.2603 +     nop
  1.2604 +
  1.2605 +END(pixman_scaled_nearest_scanline_0565_8_0565_OVER_asm_mips)
  1.2606 +
  1.2607 +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips)
  1.2608 +/* Per dst pixel: blends the 2x2 source neighbourhood at vx and stores one word.
  1.2609 + * a0     - *dst         (one 32-bit word written per pixel)
  1.2610 + * a1     - *src_top     (32-bit pixels; words vx>>16 and (vx>>16)+1 are read)
  1.2611 + * a2     - *src_bottom  (32-bit pixels; same two indices are read)
  1.2612 + * a3     - w            (pixel count)
  1.2613 + * 16(sp) - wt           (weight applied to the top row)
  1.2614 + * 20(sp) - wb           (weight applied to the bottom row)
  1.2615 + * 24(sp) - vx           (source x; bits 31..16 index, low 16 bits fraction)
  1.2616 + * 28(sp) - unit_x       (added to vx after every pixel)
  1.2617 + */
  1.2618 +    /* w == 0 returns via 1:, before anything has been saved. */
  1.2619 +    beqz     a3, 1f
  1.2620 +     nop
  1.2621 +    /* Stack arguments are read at +20 from their entry offsets below; presumably SAVE_REGS_ON_STACK 20 lowers sp by 20 - confirm in the macro. */
  1.2622 +    SAVE_REGS_ON_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7
  1.2623 +
  1.2624 +    lw       s0, 36(sp)     /* s0 = wt */
  1.2625 +    lw       s1, 40(sp)     /* s1 = wb */
  1.2626 +    lw       s2, 44(sp)     /* s2 = vx */
  1.2627 +    lw       s3, 48(sp)     /* s3 = unit_x */
  1.2628 +    li       v0, BILINEAR_INTERPOLATION_RANGE
  1.2629 +    /* Pre-scale both vertical weights by 2 * (8 - BILINEAR_INTERPOLATION_BITS) bits (no shift when BITS is 8). */
  1.2630 +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
  1.2631 +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS))
  1.2632 +0:
  1.2633 +    andi     t4, s2, 0xffff /* t4 = (short)vx */
  1.2634 +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = top BILINEAR_INTERPOLATION_BITS bits of the fraction */
  1.2635 +    subu     t5, v0, t4     /* t5 = BILINEAR_INTERPOLATION_RANGE - t4 */
  1.2636 +
  1.2637 +    mul      s4, s0, t5     /* s4 = wt * t5  (paired with tl) */
  1.2638 +    mul      s5, s0, t4     /* s5 = wt * t4  (paired with tr) */
  1.2639 +    mul      s6, s1, t5     /* s6 = wb * t5  (paired with bl) */
  1.2640 +    mul      s7, s1, t4     /* s7 = wb * t4  (paired with br) */
  1.2641 +
  1.2642 +    sra      t9, s2, 16     /* t9 = vx >> 16 (source pixel index) */
  1.2643 +    sll      t9, t9, 2      /* t9 = byte offset of the left pixel */
  1.2644 +    addiu    t8, t9, 4      /* t8 = byte offset of the right pixel */
  1.2645 +    lwx      t0, t9(a1)     /* t0 = tl */
  1.2646 +    lwx      t1, t8(a1)     /* t1 = tr */
  1.2647 +    addiu    a3, a3, -1
  1.2648 +    lwx      t2, t9(a2)     /* t2 = bl */
  1.2649 +    lwx      t3, t8(a2)     /* t3 = br */
  1.2650 +    /* Macro is defined outside this chunk; the blended pixel comes back in t0 (stored below). t4-t9 are passed as further operands. */
  1.2651 +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
  1.2652 +
  1.2653 +    addu     s2, s2, s3     /* vx += unit_x; */
  1.2654 +    sw       t0, 0(a0)
  1.2655 +    bnez     a3, 0b
  1.2656 +     addiu   a0, a0, 4      /* delay slot: dst += 1 pixel */
  1.2657 +
  1.2658 +    RESTORE_REGS_FROM_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7
  1.2659 +1:
  1.2660 +    j        ra
  1.2661 +     nop
  1.2662 +
  1.2663 +END(pixman_scaled_bilinear_scanline_8888_8888_SRC_asm_mips)
  1.2664 +
  1.2665 +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_mips)
  1.2666 +/* Bilinear scanline scaler: two 32-bit source rows -> 16-bit dst (via CONVERT_1x8888_TO_1x0565).
  1.2667 + * a0     - *dst        (16-bit pixels, written with sh)
  1.2668 + * a1     - *src_top    (32-bit pixels, read with lwx)
  1.2669 + * a2     - *src_bottom (32-bit pixels, read with lwx)
  1.2670 + * a3     - w           (number of pixels; nothing is done when it is 0)
  1.2671 + * 16(sp) - wt          (multiplied with both horizontal weights -> s4, s5)
  1.2672 + * 20(sp) - wb          (multiplied with both horizontal weights -> s6, s7)
  1.2673 + * 24(sp) - vx          (bits 31..16 select the source pixel, bits 15..0 give the horizontal weight)
  1.2674 + * 28(sp) - unit_x      (added to vx after every output pixel)
  1.2675 + */
  1.2676 +
  1.2677 +    beqz     a3, 1f         /* w == 0: return before anything is saved */
  1.2678 +     nop
  1.2679 +
  1.2680 +    SAVE_REGS_ON_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7
  1.2681 +
  1.2682 +    lw       s0, 36(sp)     /* s0 = wt     (entry offset 16 + 20) */
  1.2683 +    lw       s1, 40(sp)     /* s1 = wb     (entry offset 20 + 20) */
  1.2684 +    lw       s2, 44(sp)     /* s2 = vx     (entry offset 24 + 20) */
  1.2685 +    lw       s3, 48(sp)     /* s3 = unit_x (entry offset 28 + 20) */
  1.2686 +    li       v0, BILINEAR_INTERPOLATION_RANGE /* v0 = RANGE, minuend of the subu below */
  1.2687 +
  1.2688 +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wt (shift is 0 when BITS == 8) */
  1.2689 +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wb the same way */
  1.2690 +0:
  1.2691 +    andi     t4, s2, 0xffff /* t4 = low 16 bits of vx (fraction) */
  1.2692 +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = distx: fraction reduced to BILINEAR_INTERPOLATION_BITS bits */
  1.2693 +    subu     t5, v0, t4     /* t5 = RANGE - distx */
  1.2694 +
  1.2695 +    mul      s4, s0, t5     /* s4 = scaled wt * (RANGE - distx) */
  1.2696 +    mul      s5, s0, t4     /* s5 = scaled wt * distx */
  1.2697 +    mul      s6, s1, t5     /* s6 = scaled wb * (RANGE - distx) */
  1.2698 +    mul      s7, s1, t4     /* s7 = scaled wb * distx */
  1.2699 +
  1.2700 +    sra      t9, s2, 16     /* t9 = integer part of vx (source pixel index) */
  1.2701 +    sll      t9, t9, 2      /* t9 = byte offset of the left pixel (4 bytes/pixel) */
  1.2702 +    addiu    t8, t9, 4      /* t8 = byte offset of the right pixel */
  1.2703 +    lwx      t0, t9(a1)     /* t0 = top-left pixel */
  1.2704 +    lwx      t1, t8(a1)     /* t1 = top-right pixel */
  1.2705 +    addiu    a3, a3, -1     /* --w */
  1.2706 +    lwx      t2, t9(a2)     /* t2 = bottom-left pixel */
  1.2707 +    lwx      t3, t8(a2)     /* t3 = bottom-right pixel */
  1.2708 +
  1.2709 +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
  1.2710 +    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3
  1.2711 +
  1.2712 +    addu     s2, s2, s3     /* vx += unit_x; */
  1.2713 +    sh       t1, 0(a0)      /* *dst = t1, the 16-bit value left by the convert macro */
  1.2714 +    bnez     a3, 0b         /* loop while pixels remain */
  1.2715 +     addiu   a0, a0, 2      /* delay slot: dst += 2 bytes */
  1.2716 +
  1.2717 +    RESTORE_REGS_FROM_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7
  1.2718 +1:
  1.2719 +    j        ra             /* return */
  1.2720 +     nop
  1.2721 +
  1.2722 +END(pixman_scaled_bilinear_scanline_8888_0565_SRC_asm_mips)
  1.2723 +
  1.2724 +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_8888_SRC_asm_mips)
  1.2725 +/* Bilinear scanline scaler: two 16-bit source rows (widened by CONVERT_2x0565_TO_2x8888) -> 32-bit dst.
  1.2726 + * a0     - *dst        (32-bit pixels, written with sw)
  1.2727 + * a1     - *src_top    (16-bit pixels, read with lhx)
  1.2728 + * a2     - *src_bottom (16-bit pixels, read with lhx)
  1.2729 + * a3     - w           (number of pixels; nothing is done when it is 0)
  1.2730 + * 16(sp) - wt          (multiplied with both horizontal weights -> s4, s5)
  1.2731 + * 20(sp) - wb          (multiplied with both horizontal weights -> s6, s7)
  1.2732 + * 24(sp) - vx          (bits 31..16 select the source pixel, bits 15..0 give the horizontal weight)
  1.2733 + * 28(sp) - unit_x      (added to vx after every output pixel)
  1.2734 + */
  1.2735 +
  1.2736 +    beqz     a3, 1f         /* w == 0: return before anything is saved */
  1.2737 +     nop
  1.2738 +
  1.2739 +    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
  1.2740 +
  1.2741 +    lw       s0, 44(sp)     /* s0 = wt     (entry offset 16 + 28) */
  1.2742 +    lw       s1, 48(sp)     /* s1 = wb     (entry offset 20 + 28) */
  1.2743 +    lw       s2, 52(sp)     /* s2 = vx     (entry offset 24 + 28) */
  1.2744 +    lw       s3, 56(sp)     /* s3 = unit_x (entry offset 28 + 28) */
  1.2745 +    li       v0, BILINEAR_INTERPOLATION_RANGE /* v0 = RANGE, minuend of the subu below */
  1.2746 +    li       v1, 0x07e007e0 /* v1 = 0x07e0 (bits 10..5) in both halfwords, passed to the 0565 convert macro */
  1.2747 +    li       s8, 0x001f001f /* s8 = 0x001f (bits 4..0) in both halfwords, passed to the 0565 convert macro */
  1.2748 +
  1.2749 +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wt (shift is 0 when BITS == 8) */
  1.2750 +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wb the same way */
  1.2751 +0:
  1.2752 +    andi     t4, s2, 0xffff /* t4 = low 16 bits of vx (fraction) */
  1.2753 +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = distx: fraction reduced to BILINEAR_INTERPOLATION_BITS bits */
  1.2754 +    subu     t5, v0, t4     /* t5 = RANGE - distx */
  1.2755 +
  1.2756 +    mul      s4, s0, t5     /* s4 = scaled wt * (RANGE - distx) */
  1.2757 +    mul      s5, s0, t4     /* s5 = scaled wt * distx */
  1.2758 +    mul      s6, s1, t5     /* s6 = scaled wb * (RANGE - distx) */
  1.2759 +    mul      s7, s1, t4     /* s7 = scaled wb * distx */
  1.2760 +
  1.2761 +    sra      t9, s2, 16     /* t9 = integer part of vx (source pixel index) */
  1.2762 +    sll      t9, t9, 1      /* t9 = byte offset of the left pixel (2 bytes/pixel) */
  1.2763 +    addiu    t8, t9, 2      /* t8 = byte offset of the right pixel */
  1.2764 +    lhx      t0, t9(a1)     /* t0 = top-left pixel; NOTE(review): not masked, presumably the convert macro drops its upper half - confirm */
  1.2765 +    lhx      t1, t8(a1)     /* t1 = top-right pixel */
  1.2766 +    andi     t1, t1, 0xffff /* keep only the low 16 bits of the top-right pixel */
  1.2767 +    addiu    a3, a3, -1     /* --w */
  1.2768 +    lhx      t2, t9(a2)     /* t2 = bottom-left pixel (not masked, same note as t0) */
  1.2769 +    lhx      t3, t8(a2)     /* t3 = bottom-right pixel */
  1.2770 +    andi     t3, t3, 0xffff /* keep only the low 16 bits of the bottom-right pixel */
  1.2771 +
  1.2772 +    CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7
  1.2773 +    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7
  1.2774 +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
  1.2775 +
  1.2776 +    addu     s2, s2, s3     /* vx += unit_x; */
  1.2777 +    sw       t0, 0(a0)      /* *dst = t0, the value left by the interpolation macro */
  1.2778 +    bnez     a3, 0b         /* loop while pixels remain */
  1.2779 +     addiu   a0, a0, 4      /* delay slot: dst += 4 bytes */
  1.2780 +
  1.2781 +    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
  1.2782 +1:
  1.2783 +    j        ra             /* return */
  1.2784 +     nop
  1.2785 +
  1.2786 +END(pixman_scaled_bilinear_scanline_0565_8888_SRC_asm_mips)
  1.2787 +
  1.2788 +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_mips)
  1.2789 +/* Bilinear scanline scaler: two 16-bit source rows -> 16-bit dst; interpolation runs on widened 32-bit pixels.
  1.2790 + * a0     - *dst        (16-bit pixels, written with sh)
  1.2791 + * a1     - *src_top    (16-bit pixels, read with lhx)
  1.2792 + * a2     - *src_bottom (16-bit pixels, read with lhx)
  1.2793 + * a3     - w           (number of pixels; nothing is done when it is 0)
  1.2794 + * 16(sp) - wt          (multiplied with both horizontal weights -> s4, s5)
  1.2795 + * 20(sp) - wb          (multiplied with both horizontal weights -> s6, s7)
  1.2796 + * 24(sp) - vx          (bits 31..16 select the source pixel, bits 15..0 give the horizontal weight)
  1.2797 + * 28(sp) - unit_x      (added to vx after every output pixel)
  1.2798 + */
  1.2799 +
  1.2800 +    beqz     a3, 1f         /* w == 0: return before anything is saved */
  1.2801 +     nop
  1.2802 +
  1.2803 +    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
  1.2804 +
  1.2805 +    lw       s0, 44(sp)     /* s0 = wt     (entry offset 16 + 28) */
  1.2806 +    lw       s1, 48(sp)     /* s1 = wb     (entry offset 20 + 28) */
  1.2807 +    lw       s2, 52(sp)     /* s2 = vx     (entry offset 24 + 28) */
  1.2808 +    lw       s3, 56(sp)     /* s3 = unit_x (entry offset 28 + 28) */
  1.2809 +    li       v0, BILINEAR_INTERPOLATION_RANGE /* v0 = RANGE, minuend of the subu below */
  1.2810 +    li       v1, 0x07e007e0 /* v1 = 0x07e0 (bits 10..5) in both halfwords, passed to the 0565 convert macro */
  1.2811 +    li       s8, 0x001f001f /* s8 = 0x001f (bits 4..0) in both halfwords, passed to the 0565 convert macro */
  1.2812 +
  1.2813 +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wt (shift is 0 when BITS == 8) */
  1.2814 +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wb the same way */
  1.2815 +0:
  1.2816 +    andi     t4, s2, 0xffff /* t4 = low 16 bits of vx (fraction) */
  1.2817 +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = distx: fraction reduced to BILINEAR_INTERPOLATION_BITS bits */
  1.2818 +    subu     t5, v0, t4     /* t5 = RANGE - distx */
  1.2819 +
  1.2820 +    mul      s4, s0, t5     /* s4 = scaled wt * (RANGE - distx) */
  1.2821 +    mul      s5, s0, t4     /* s5 = scaled wt * distx */
  1.2822 +    mul      s6, s1, t5     /* s6 = scaled wb * (RANGE - distx) */
  1.2823 +    mul      s7, s1, t4     /* s7 = scaled wb * distx */
  1.2824 +
  1.2825 +    sra      t9, s2, 16     /* t9 = integer part of vx (source pixel index) */
  1.2826 +    sll      t9, t9, 1      /* t9 = byte offset of the left pixel (2 bytes/pixel) */
  1.2827 +    addiu    t8, t9, 2      /* t8 = byte offset of the right pixel */
  1.2828 +    lhx      t0, t9(a1)     /* t0 = top-left pixel; NOTE(review): not masked, presumably the convert macro drops its upper half - confirm */
  1.2829 +    lhx      t1, t8(a1)     /* t1 = top-right pixel */
  1.2830 +    andi     t1, t1, 0xffff /* keep only the low 16 bits of the top-right pixel */
  1.2831 +    addiu    a3, a3, -1     /* --w */
  1.2832 +    lhx      t2, t9(a2)     /* t2 = bottom-left pixel (not masked, same note as t0) */
  1.2833 +    lhx      t3, t8(a2)     /* t3 = bottom-right pixel */
  1.2834 +    andi     t3, t3, 0xffff /* keep only the low 16 bits of the bottom-right pixel */
  1.2835 +
  1.2836 +    CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7
  1.2837 +    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7
  1.2838 +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
  1.2839 +    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3
  1.2840 +
  1.2841 +    addu     s2, s2, s3     /* vx += unit_x; */
  1.2842 +    sh       t1, 0(a0)      /* *dst = t1, the 16-bit value left by the convert macro */
  1.2843 +    bnez     a3, 0b         /* loop while pixels remain */
  1.2844 +     addiu   a0, a0, 2      /* delay slot: dst += 2 bytes */
  1.2845 +
  1.2846 +    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
  1.2847 +1:
  1.2848 +    j        ra             /* return */
  1.2849 +     nop
  1.2850 +
  1.2851 +END(pixman_scaled_bilinear_scanline_0565_0565_SRC_asm_mips)
  1.2852 +
  1.2853 +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_mips)
  1.2854 +/* Bilinear scanline scaler: interpolated 32-bit source is combined with dst through OVER_8888_8888.
  1.2855 + * a0     - *dst        (32-bit pixels, read with lw and rewritten with sw)
  1.2856 + * a1     - *src_top    (32-bit pixels, read with lwx)
  1.2857 + * a2     - *src_bottom (32-bit pixels, read with lwx)
  1.2858 + * a3     - w           (number of pixels; nothing is done when it is 0)
  1.2859 + * 16(sp) - wt          (multiplied with both horizontal weights -> s4, s5)
  1.2860 + * 20(sp) - wb          (multiplied with both horizontal weights -> s6, s7)
  1.2861 + * 24(sp) - vx          (bits 31..16 select the source pixel, bits 15..0 give the horizontal weight)
  1.2862 + * 28(sp) - unit_x      (added to vx after every output pixel)
  1.2863 + */
  1.2864 +
  1.2865 +    beqz     a3, 1f         /* w == 0: return before anything is saved */
  1.2866 +     nop
  1.2867 +
  1.2868 +    SAVE_REGS_ON_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8
  1.2869 +
  1.2870 +    lw       s0, 40(sp)     /* s0 = wt     (entry offset 16 + 24) */
  1.2871 +    lw       s1, 44(sp)     /* s1 = wb     (entry offset 20 + 24) */
  1.2872 +    lw       s2, 48(sp)     /* s2 = vx     (entry offset 24 + 24) */
  1.2873 +    lw       s3, 52(sp)     /* s3 = unit_x (entry offset 28 + 24) */
  1.2874 +    li       v0, BILINEAR_INTERPOLATION_RANGE /* v0 = RANGE, minuend of the subu below */
  1.2875 +    li       s8, 0x00ff00ff /* s8 = constant handed to OVER_8888_8888 */
  1.2876 +
  1.2877 +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wt (shift is 0 when BITS == 8) */
  1.2878 +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wb the same way */
  1.2879 +0:
  1.2880 +    andi     t4, s2, 0xffff /* t4 = low 16 bits of vx (fraction) */
  1.2881 +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = distx: fraction reduced to BILINEAR_INTERPOLATION_BITS bits */
  1.2882 +    subu     t5, v0, t4     /* t5 = RANGE - distx */
  1.2883 +
  1.2884 +    mul      s4, s0, t5     /* s4 = scaled wt * (RANGE - distx) */
  1.2885 +    mul      s5, s0, t4     /* s5 = scaled wt * distx */
  1.2886 +    mul      s6, s1, t5     /* s6 = scaled wb * (RANGE - distx) */
  1.2887 +    mul      s7, s1, t4     /* s7 = scaled wb * distx */
  1.2888 +
  1.2889 +    sra      t9, s2, 16     /* t9 = integer part of vx (source pixel index) */
  1.2890 +    sll      t9, t9, 2      /* t9 = byte offset of the left pixel (4 bytes/pixel) */
  1.2891 +    addiu    t8, t9, 4      /* t8 = byte offset of the right pixel */
  1.2892 +    lwx      t0, t9(a1)     /* t0 = top-left pixel */
  1.2893 +    lwx      t1, t8(a1)     /* t1 = top-right pixel */
  1.2894 +    addiu    a3, a3, -1     /* --w */
  1.2895 +    lwx      t2, t9(a2)     /* t2 = bottom-left pixel */
  1.2896 +    lwx      t3, t8(a2)     /* t3 = bottom-right pixel */
  1.2897 +
  1.2898 +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
  1.2899 +    lw       t1, 0(a0)      /* t1 = current destination pixel */
  1.2900 +    OVER_8888_8888 t0, t1, t2, s8, t3, t4, t5, t6
  1.2901 +
  1.2902 +    addu     s2, s2, s3     /* vx += unit_x; */
  1.2903 +    sw       t2, 0(a0)      /* *dst = t2, the value left by OVER_8888_8888 */
  1.2904 +    bnez     a3, 0b         /* loop while pixels remain */
  1.2905 +     addiu   a0, a0, 4      /* delay slot: dst += 4 bytes */
  1.2906 +
  1.2907 +    RESTORE_REGS_FROM_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8
  1.2908 +1:
  1.2909 +    j        ra             /* return */
  1.2910 +     nop
  1.2911 +
  1.2912 +END(pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_mips)
  1.2913 +
  1.2914 +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_mips)
  1.2915 +/* Bilinear scanline scaler: interpolated 32-bit source is added to dst with per-byte saturation.
  1.2916 + * a0     - *dst        (32-bit pixels, read with lw and rewritten with sw)
  1.2917 + * a1     - *src_top    (32-bit pixels, read with lwx)
  1.2918 + * a2     - *src_bottom (32-bit pixels, read with lwx)
  1.2919 + * a3     - w           (number of pixels; nothing is done when it is 0)
  1.2920 + * 16(sp) - wt          (multiplied with both horizontal weights -> s4, s5)
  1.2921 + * 20(sp) - wb          (multiplied with both horizontal weights -> s6, s7)
  1.2922 + * 24(sp) - vx          (bits 31..16 select the source pixel, bits 15..0 give the horizontal weight)
  1.2923 + * 28(sp) - unit_x      (added to vx after every output pixel)
  1.2924 + */
  1.2925 +
  1.2926 +    beqz         a3, 1f         /* w == 0: return before anything is saved */
  1.2927 +     nop
  1.2928 +
  1.2929 +    SAVE_REGS_ON_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7
  1.2930 +
  1.2931 +    lw           s0, 36(sp)     /* s0 = wt     (entry offset 16 + 20) */
  1.2932 +    lw           s1, 40(sp)     /* s1 = wb     (entry offset 20 + 20) */
  1.2933 +    lw           s2, 44(sp)     /* s2 = vx     (entry offset 24 + 20) */
  1.2934 +    lw           s3, 48(sp)     /* s3 = unit_x (entry offset 28 + 20) */
  1.2935 +    li           v0, BILINEAR_INTERPOLATION_RANGE /* v0 = RANGE, minuend of the subu below */
  1.2936 +
  1.2937 +    sll          s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wt (shift is 0 when BITS == 8) */
  1.2938 +    sll          s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wb the same way */
  1.2939 +0:
  1.2940 +    andi         t4, s2, 0xffff /* t4 = low 16 bits of vx (fraction) */
  1.2941 +    srl          t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = distx: fraction reduced to BILINEAR_INTERPOLATION_BITS bits */
  1.2942 +    subu         t5, v0, t4     /* t5 = RANGE - distx */
  1.2943 +
  1.2944 +    mul          s4, s0, t5     /* s4 = scaled wt * (RANGE - distx) */
  1.2945 +    mul          s5, s0, t4     /* s5 = scaled wt * distx */
  1.2946 +    mul          s6, s1, t5     /* s6 = scaled wb * (RANGE - distx) */
  1.2947 +    mul          s7, s1, t4     /* s7 = scaled wb * distx */
  1.2948 +
  1.2949 +    sra          t9, s2, 16     /* t9 = integer part of vx (source pixel index) */
  1.2950 +    sll          t9, t9, 2      /* t9 = byte offset of the left pixel (4 bytes/pixel) */
  1.2951 +    addiu        t8, t9, 4      /* t8 = byte offset of the right pixel */
  1.2952 +    lwx          t0, t9(a1)     /* t0 = top-left pixel */
  1.2953 +    lwx          t1, t8(a1)     /* t1 = top-right pixel */
  1.2954 +    addiu        a3, a3, -1     /* --w */
  1.2955 +    lwx          t2, t9(a2)     /* t2 = bottom-left pixel */
  1.2956 +    lwx          t3, t8(a2)     /* t3 = bottom-right pixel */
  1.2957 +
  1.2958 +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
  1.2959 +    lw           t1, 0(a0)      /* t1 = current destination pixel */
  1.2960 +    addu_s.qb    t2, t0, t1     /* t2 = t0 + t1, four unsigned bytes, each saturated */
  1.2961 +
  1.2962 +    addu         s2, s2, s3     /* vx += unit_x; */
  1.2963 +    sw           t2, 0(a0)      /* *dst = saturated sum */
  1.2964 +    bnez         a3, 0b         /* loop while pixels remain */
  1.2965 +     addiu       a0, a0, 4      /* delay slot: dst += 4 bytes */
  1.2966 +
  1.2967 +    RESTORE_REGS_FROM_STACK 20, v0, s0, s1, s2, s3, s4, s5, s6, s7
  1.2968 +1:
  1.2969 +    j            ra             /* return */
  1.2970 +     nop
  1.2971 +
  1.2972 +END(pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_mips)
  1.2973 +
  1.2974 +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_mips)
  1.2975 +/* Bilinear scanline scaler with a byte mask: interpolated 32-bit source goes through MIPS_UN8x4_MUL_UN8, then to 32-bit dst.
  1.2976 + * a0     - *dst        (32-bit pixels, written with sw)
  1.2977 + * a1     - *mask       (one byte per pixel, read with lbu)
  1.2978 + * a2     - *src_top    (32-bit pixels, read with lwx)
  1.2979 + * a3     - *src_bottom (32-bit pixels, read with lwx)
  1.2980 + * 16(sp) - wt          (multiplied with both horizontal weights -> s4, s5)
  1.2981 + * 20(sp) - wb          (multiplied with both horizontal weights -> s6, s7)
  1.2982 + * 24(sp) - vx          (bits 31..16 select the source pixel, bits 15..0 give the horizontal weight)
  1.2983 + * 28(sp) - unit_x      (added to vx after every output pixel)
  1.2984 + * 32(sp) - w           (number of pixels; nothing is done when it is 0)
  1.2985 + */
  1.2986 +
  1.2987 +    lw       v1, 32(sp)        /* v1 = w, kept as the loop counter */
  1.2988 +    beqz     v1, 1f            /* w == 0: return before anything is saved */
  1.2989 +     nop
  1.2990 +
  1.2991 +    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
  1.2992 +
  1.2993 +    lw       s0, 44(sp)        /* s0 = wt     (entry offset 16 + 28) */
  1.2994 +    lw       s1, 48(sp)        /* s1 = wb     (entry offset 20 + 28) */
  1.2995 +    lw       s2, 52(sp)        /* s2 = vx     (entry offset 24 + 28) */
  1.2996 +    lw       s3, 56(sp)        /* s3 = unit_x (entry offset 28 + 28) */
  1.2997 +    li       v0, BILINEAR_INTERPOLATION_RANGE /* v0 = RANGE, minuend of the subu below */
  1.2998 +    li       s8, 0x00ff00ff    /* s8 = constant handed to MIPS_UN8x4_MUL_UN8 */
  1.2999 +
  1.3000 +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wt (shift is 0 when BITS == 8) */
  1.3001 +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wb the same way */
  1.3002 +0:
  1.3003 +    andi     t4, s2, 0xffff    /* t4 = low 16 bits of vx (fraction) */
  1.3004 +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = distx: fraction reduced to BILINEAR_INTERPOLATION_BITS bits */
  1.3005 +    subu     t5, v0, t4        /* t5 = RANGE - distx */
  1.3006 +
  1.3007 +    mul      s4, s0, t5        /* s4 = scaled wt * (RANGE - distx) */
  1.3008 +    mul      s5, s0, t4        /* s5 = scaled wt * distx */
  1.3009 +    mul      s6, s1, t5        /* s6 = scaled wb * (RANGE - distx) */
  1.3010 +    mul      s7, s1, t4        /* s7 = scaled wb * distx */
  1.3011 +
  1.3012 +    sra      t9, s2, 16        /* t9 = integer part of vx (source pixel index) */
  1.3013 +    sll      t9, t9, 2         /* t9 = byte offset of the left pixel (4 bytes/pixel) */
  1.3014 +    addiu    t8, t9, 4         /* t8 = byte offset of the right pixel */
  1.3015 +    lwx      t0, t9(a2)        /* t0 = top-left pixel */
  1.3016 +    lwx      t1, t8(a2)        /* t1 = top-right pixel */
  1.3017 +    addiu    v1, v1, -1        /* --w */
  1.3018 +    lwx      t2, t9(a3)        /* t2 = bottom-left pixel */
  1.3019 +    lwx      t3, t8(a3)        /* t3 = bottom-right pixel */
  1.3020 +
  1.3021 +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
  1.3022 +    lbu      t1, 0(a1)         /* t1 = mask byte for this pixel */
  1.3023 +    addiu    a1, a1, 1         /* mask++ */
  1.3024 +    MIPS_UN8x4_MUL_UN8 t0, t1, t0, s8, t2, t3, t4
  1.3025 +
  1.3026 +    addu     s2, s2, s3        /* vx += unit_x; */
  1.3027 +    sw       t0, 0(a0)         /* *dst = t0, the value left by MIPS_UN8x4_MUL_UN8 */
  1.3028 +    bnez     v1, 0b            /* loop while pixels remain */
  1.3029 +     addiu   a0, a0, 4         /* delay slot: dst += 4 bytes */
  1.3030 +
  1.3031 +    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
  1.3032 +1:
  1.3033 +    j        ra                /* return */
  1.3034 +     nop
  1.3035 +
  1.3036 +END(pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_mips)
  1.3037 +
  1.3038 +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_mips)
  1.3039 +/* Bilinear scanline scaler with a byte mask: interpolated 32-bit source goes through MIPS_UN8x4_MUL_UN8, then to 16-bit dst.
  1.3040 + * a0     - *dst        (16-bit pixels, written with sh)
  1.3041 + * a1     - *mask       (one byte per pixel, read with lbu)
  1.3042 + * a2     - *src_top    (32-bit pixels, read with lwx)
  1.3043 + * a3     - *src_bottom (32-bit pixels, read with lwx)
  1.3044 + * 16(sp) - wt          (multiplied with both horizontal weights -> s4, s5)
  1.3045 + * 20(sp) - wb          (multiplied with both horizontal weights -> s6, s7)
  1.3046 + * 24(sp) - vx          (bits 31..16 select the source pixel, bits 15..0 give the horizontal weight)
  1.3047 + * 28(sp) - unit_x      (added to vx after every output pixel)
  1.3048 + * 32(sp) - w           (number of pixels; nothing is done when it is 0)
  1.3049 + */
  1.3050 +
  1.3051 +    lw       v1, 32(sp)        /* v1 = w, kept as the loop counter */
  1.3052 +    beqz     v1, 1f            /* w == 0: return before anything is saved */
  1.3053 +     nop
  1.3054 +
  1.3055 +    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
  1.3056 +
  1.3057 +    lw       s0, 44(sp)        /* s0 = wt     (entry offset 16 + 28) */
  1.3058 +    lw       s1, 48(sp)        /* s1 = wb     (entry offset 20 + 28) */
  1.3059 +    lw       s2, 52(sp)        /* s2 = vx     (entry offset 24 + 28) */
  1.3060 +    lw       s3, 56(sp)        /* s3 = unit_x (entry offset 28 + 28) */
  1.3061 +    li       v0, BILINEAR_INTERPOLATION_RANGE /* v0 = RANGE, minuend of the subu below */
  1.3062 +    li       s8, 0x00ff00ff    /* s8 = constant handed to MIPS_UN8x4_MUL_UN8 */
  1.3063 +
  1.3064 +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wt (shift is 0 when BITS == 8) */
  1.3065 +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wb the same way */
  1.3066 +0:
  1.3067 +    andi     t4, s2, 0xffff    /* t4 = low 16 bits of vx (fraction) */
  1.3068 +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = distx: fraction reduced to BILINEAR_INTERPOLATION_BITS bits */
  1.3069 +    subu     t5, v0, t4        /* t5 = RANGE - distx */
  1.3070 +
  1.3071 +    mul      s4, s0, t5        /* s4 = scaled wt * (RANGE - distx) */
  1.3072 +    mul      s5, s0, t4        /* s5 = scaled wt * distx */
  1.3073 +    mul      s6, s1, t5        /* s6 = scaled wb * (RANGE - distx) */
  1.3074 +    mul      s7, s1, t4        /* s7 = scaled wb * distx */
  1.3075 +
  1.3076 +    sra      t9, s2, 16        /* t9 = integer part of vx (source pixel index) */
  1.3077 +    sll      t9, t9, 2         /* t9 = byte offset of the left pixel (4 bytes/pixel) */
  1.3078 +    addiu    t8, t9, 4         /* t8 = byte offset of the right pixel */
  1.3079 +    lwx      t0, t9(a2)        /* t0 = top-left pixel */
  1.3080 +    lwx      t1, t8(a2)        /* t1 = top-right pixel */
  1.3081 +    addiu    v1, v1, -1        /* --w */
  1.3082 +    lwx      t2, t9(a3)        /* t2 = bottom-left pixel */
  1.3083 +    lwx      t3, t8(a3)        /* t3 = bottom-right pixel */
  1.3084 +
  1.3085 +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
  1.3086 +    lbu      t1, 0(a1)         /* t1 = mask byte for this pixel */
  1.3087 +    addiu    a1, a1, 1         /* mask++ */
  1.3088 +    MIPS_UN8x4_MUL_UN8 t0, t1, t0, s8, t2, t3, t4
  1.3089 +    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3
  1.3090 +
  1.3091 +    addu     s2, s2, s3        /* vx += unit_x; */
  1.3092 +    sh       t1, 0(a0)         /* *dst = t1, the 16-bit value left by the convert macro */
  1.3093 +    bnez     v1, 0b            /* loop while pixels remain */
  1.3094 +     addiu   a0, a0, 2         /* delay slot: dst += 2 bytes */
  1.3095 +
  1.3096 +    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
  1.3097 +1:
  1.3098 +    j        ra                /* return */
  1.3099 +     nop
  1.3100 +
  1.3101 +END(pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_mips)
  1.3102 +
  1.3103 +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_mips)
  1.3104 +/* Bilinear scanline scaler with a byte mask: 16-bit source rows are widened, interpolated, masked, stored as 32-bit.
  1.3105 + * a0     - *dst        (32-bit pixels, written with sw)
  1.3106 + * a1     - *mask       (one byte per pixel, read with lbu)
  1.3107 + * a2     - *src_top    (16-bit pixels, read with lhx)
  1.3108 + * a3     - *src_bottom (16-bit pixels, read with lhx)
  1.3109 + * 16(sp) - wt          (multiplied with both horizontal weights -> s4, s5)
  1.3110 + * 20(sp) - wb          (multiplied with both horizontal weights -> s6, s7)
  1.3111 + * 24(sp) - vx          (bits 31..16 select the source pixel, bits 15..0 give the horizontal weight)
  1.3112 + * 28(sp) - unit_x      (added to vx after every output pixel)
  1.3113 + * 32(sp) - w           (number of pixels; nothing is done when it is 0)
  1.3114 + */
  1.3115 +
  1.3116 +    lw       t0, 32(sp)        /* t0 = w, used only for the zero test (w is reloaded into ra below) */
  1.3117 +    beqz     t0, 1f            /* w == 0: return before anything is saved */
  1.3118 +     nop
  1.3119 +
  1.3120 +    SAVE_REGS_ON_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra
  1.3121 +
  1.3122 +    lw       s0, 48(sp)        /* s0 = wt     (entry offset 16 + 32) */
  1.3123 +    lw       s1, 52(sp)        /* s1 = wb     (entry offset 20 + 32) */
  1.3124 +    lw       s2, 56(sp)        /* s2 = vx     (entry offset 24 + 32) */
  1.3125 +    lw       s3, 60(sp)        /* s3 = unit_x (entry offset 28 + 32) */
  1.3126 +    lw       ra, 64(sp)        /* ra = w: ra serves as the loop counter and is restored before the return */
  1.3127 +    li       v0, 0x00ff00ff    /* v0 = constant handed to MIPS_UN8x4_MUL_UN8 */
  1.3128 +    li       v1, 0x07e007e0    /* v1 = 0x07e0 (bits 10..5) in both halfwords, passed to the 0565 convert macro */
  1.3129 +    li       s8, 0x001f001f    /* s8 = 0x001f (bits 4..0) in both halfwords, passed to the 0565 convert macro */
  1.3130 +
  1.3131 +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wt (shift is 0 when BITS == 8) */
  1.3132 +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wb the same way */
  1.3133 +0:
  1.3134 +    andi     t4, s2, 0xffff    /* t4 = low 16 bits of vx (fraction) */
  1.3135 +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = distx: fraction reduced to BILINEAR_INTERPOLATION_BITS bits */
  1.3136 +    li       t5, BILINEAR_INTERPOLATION_RANGE /* RANGE is reloaded every iteration: v0 holds 0x00ff00ff here and t5 is reused as scratch below */
  1.3137 +    subu     t5, t5, t4        /* t5 = RANGE - distx */
  1.3138 +
  1.3139 +    mul      s4, s0, t5        /* s4 = scaled wt * (RANGE - distx) */
  1.3140 +    mul      s5, s0, t4        /* s5 = scaled wt * distx */
  1.3141 +    mul      s6, s1, t5        /* s6 = scaled wb * (RANGE - distx) */
  1.3142 +    mul      s7, s1, t4        /* s7 = scaled wb * distx */
  1.3143 +
  1.3144 +    sra      t9, s2, 16        /* t9 = integer part of vx (source pixel index) */
  1.3145 +    sll      t9, t9, 1         /* t9 = byte offset of the left pixel (2 bytes/pixel) */
  1.3146 +    addiu    t8, t9, 2         /* t8 = byte offset of the right pixel */
  1.3147 +    lhx      t0, t9(a2)        /* t0 = top-left pixel; NOTE(review): not masked, presumably the convert macro drops its upper half - confirm */
  1.3148 +    lhx      t1, t8(a2)        /* t1 = top-right pixel */
  1.3149 +    andi     t1, t1, 0xffff    /* keep only the low 16 bits of the top-right pixel */
  1.3150 +    addiu    ra, ra, -1        /* --w */
  1.3151 +    lhx      t2, t9(a3)        /* t2 = bottom-left pixel (not masked, same note as t0) */
  1.3152 +    lhx      t3, t8(a3)        /* t3 = bottom-right pixel */
  1.3153 +    andi     t3, t3, 0xffff    /* keep only the low 16 bits of the bottom-right pixel */
  1.3154 +
  1.3155 +    CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7
  1.3156 +    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7
  1.3157 +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
  1.3158 +    lbu      t1, 0(a1)         /* t1 = mask byte for this pixel */
  1.3159 +    addiu    a1, a1, 1         /* mask++ */
  1.3160 +    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t2, t3, t4
  1.3161 +
  1.3162 +    addu     s2, s2, s3        /* vx += unit_x; */
  1.3163 +    sw       t0, 0(a0)         /* *dst = t0, the value left by MIPS_UN8x4_MUL_UN8 */
  1.3164 +    bnez     ra, 0b            /* loop while pixels remain */
  1.3165 +     addiu   a0, a0, 4         /* delay slot: dst += 4 bytes */
  1.3166 +
  1.3167 +    RESTORE_REGS_FROM_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra
  1.3168 +1:
  1.3169 +    j        ra                /* return */
  1.3170 +     nop
  1.3171 +
  1.3172 +END(pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_mips)
  1.3173 +
  1.3174 +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_mips)
  1.3175 +/* Bilinear scanline scaler with a byte mask: 16-bit source rows are widened, interpolated, masked, stored as 16-bit.
  1.3176 + * a0     - *dst        (16-bit pixels, written with sh)
  1.3177 + * a1     - *mask       (one byte per pixel, read with lbu)
  1.3178 + * a2     - *src_top    (16-bit pixels, read with lhx)
  1.3179 + * a3     - *src_bottom (16-bit pixels, read with lhx)
  1.3180 + * 16(sp) - wt          (multiplied with both horizontal weights -> s4, s5)
  1.3181 + * 20(sp) - wb          (multiplied with both horizontal weights -> s6, s7)
  1.3182 + * 24(sp) - vx          (bits 31..16 select the source pixel, bits 15..0 give the horizontal weight)
  1.3183 + * 28(sp) - unit_x      (added to vx after every output pixel)
  1.3184 + * 32(sp) - w           (number of pixels; nothing is done when it is 0)
  1.3185 + */
  1.3186 +
  1.3187 +    lw       t0, 32(sp)        /* t0 = w, used only for the zero test (w is reloaded into ra below) */
  1.3188 +    beqz     t0, 1f            /* w == 0: return before anything is saved */
  1.3189 +     nop
  1.3190 +
  1.3191 +    SAVE_REGS_ON_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra
  1.3192 +
  1.3193 +    lw       s0, 48(sp)        /* s0 = wt     (entry offset 16 + 32) */
  1.3194 +    lw       s1, 52(sp)        /* s1 = wb     (entry offset 20 + 32) */
  1.3195 +    lw       s2, 56(sp)        /* s2 = vx     (entry offset 24 + 32) */
  1.3196 +    lw       s3, 60(sp)        /* s3 = unit_x (entry offset 28 + 32) */
  1.3197 +    lw       ra, 64(sp)        /* ra = w: ra serves as the loop counter and is restored before the return */
  1.3198 +    li       v0, 0x00ff00ff    /* v0 = constant handed to MIPS_UN8x4_MUL_UN8 */
  1.3199 +    li       v1, 0x07e007e0    /* v1 = 0x07e0 (bits 10..5) in both halfwords, passed to the 0565 convert macro */
  1.3200 +    li       s8, 0x001f001f    /* s8 = 0x001f (bits 4..0) in both halfwords, passed to the 0565 convert macro */
  1.3201 +
  1.3202 +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wt (shift is 0 when BITS == 8) */
  1.3203 +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wb the same way */
  1.3204 +0:
  1.3205 +    andi     t4, s2, 0xffff    /* t4 = low 16 bits of vx (fraction) */
  1.3206 +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = distx: fraction reduced to BILINEAR_INTERPOLATION_BITS bits */
  1.3207 +    li       t5, BILINEAR_INTERPOLATION_RANGE /* RANGE is reloaded every iteration: v0 holds 0x00ff00ff here and t5 is reused as scratch below */
  1.3208 +    subu     t5, t5, t4        /* t5 = RANGE - distx */
  1.3209 +
  1.3210 +    mul      s4, s0, t5        /* s4 = scaled wt * (RANGE - distx) */
  1.3211 +    mul      s5, s0, t4        /* s5 = scaled wt * distx */
  1.3212 +    mul      s6, s1, t5        /* s6 = scaled wb * (RANGE - distx) */
  1.3213 +    mul      s7, s1, t4        /* s7 = scaled wb * distx */
  1.3214 +
  1.3215 +    sra      t9, s2, 16        /* t9 = integer part of vx (source pixel index) */
  1.3216 +    sll      t9, t9, 1         /* t9 = byte offset of the left pixel (2 bytes/pixel) */
  1.3217 +    addiu    t8, t9, 2         /* t8 = byte offset of the right pixel */
  1.3218 +    lhx      t0, t9(a2)        /* t0 = top-left pixel; NOTE(review): not masked, presumably the convert macro drops its upper half - confirm */
  1.3219 +    lhx      t1, t8(a2)        /* t1 = top-right pixel */
  1.3220 +    andi     t1, t1, 0xffff    /* keep only the low 16 bits of the top-right pixel */
  1.3221 +    addiu    ra, ra, -1        /* --w */
  1.3222 +    lhx      t2, t9(a3)        /* t2 = bottom-left pixel (not masked, same note as t0) */
  1.3223 +    lhx      t3, t8(a3)        /* t3 = bottom-right pixel */
  1.3224 +    andi     t3, t3, 0xffff    /* keep only the low 16 bits of the bottom-right pixel */
  1.3225 +
  1.3226 +    CONVERT_2x0565_TO_2x8888 t0, t1, t0, t1, v1, s8, t4, t5, t6, t7
  1.3227 +    CONVERT_2x0565_TO_2x8888 t2, t3, t2, t3, v1, s8, t4, t5, t6, t7
  1.3228 +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
  1.3229 +    lbu      t1, 0(a1)         /* t1 = mask byte for this pixel */
  1.3230 +    addiu    a1, a1, 1         /* mask++ */
  1.3231 +    MIPS_UN8x4_MUL_UN8 t0, t1, t0, v0, t2, t3, t4
  1.3232 +    CONVERT_1x8888_TO_1x0565 t0, t1, t2, t3
  1.3233 +
  1.3234 +    addu     s2, s2, s3        /* vx += unit_x; */
  1.3235 +    sh       t1, 0(a0)         /* *dst = t1, the 16-bit value left by the convert macro */
  1.3236 +    bnez     ra, 0b            /* loop while pixels remain */
  1.3237 +     addiu   a0, a0, 2         /* delay slot: dst += 2 bytes */
  1.3238 +
  1.3239 +    RESTORE_REGS_FROM_STACK 32, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8, ra
  1.3240 +1:
  1.3241 +    j        ra                /* return */
  1.3242 +     nop
  1.3243 +
  1.3244 +END(pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_mips)
  1.3245 +
  1.3246 +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips)
  1.3247 +/* Bilinear scanline scaler with a byte mask: interpolated source, mask and dst are combined through OVER_8888_8_8888.
  1.3248 + * a0     - dst        (a8r8g8b8)
  1.3249 + * a1     - mask       (a8)
  1.3250 + * a2     - src_top    (a8r8g8b8)
  1.3251 + * a3     - src_bottom (a8r8g8b8)
  1.3252 + * 16(sp) - wt         (multiplied with both horizontal weights -> s4, s5)
  1.3253 + * 20(sp) - wb         (multiplied with both horizontal weights -> s6, s7)
  1.3254 + * 24(sp) - vx         (bits 31..16 select the source pixel, bits 15..0 give the horizontal weight)
  1.3255 + * 28(sp) - unit_x     (added to vx after every output pixel)
  1.3256 + * 32(sp) - w          (number of pixels; the loop is skipped when it is 0)
  1.3257 + */
  1.3258 +
  1.3259 +    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
  1.3260 +
  1.3261 +    lw       v1, 60(sp)        /* v1 = w (entry offset 32 + the 28-byte save-area offset) */
  1.3262 +    beqz     v1, 1f            /* w == 0: skip to the restore at label 1 (registers were already saved) */
  1.3263 +     nop
  1.3264 +
  1.3265 +    lw       s0, 44(sp)        /* s0 = wt     (entry offset 16 + 28) */
  1.3266 +    lw       s1, 48(sp)        /* s1 = wb     (entry offset 20 + 28) */
  1.3267 +    lw       s2, 52(sp)        /* s2 = vx     (entry offset 24 + 28) */
  1.3268 +    lw       s3, 56(sp)        /* s3 = unit_x (entry offset 28 + 28) */
  1.3269 +    li       v0, BILINEAR_INTERPOLATION_RANGE /* v0 = RANGE, minuend of the subu below */
  1.3270 +    li       s8, 0x00ff00ff    /* s8 = constant handed to OVER_8888_8_8888 */
  1.3271 +
  1.3272 +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wt (shift is 0 when BITS == 8) */
  1.3273 +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wb the same way */
  1.3274 +
  1.3275 +0:
  1.3276 +    andi     t4, s2, 0xffff    /* t4 = low 16 bits of vx (fraction) */
  1.3277 +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = distx: fraction reduced to BILINEAR_INTERPOLATION_BITS bits */
  1.3278 +    subu     t5, v0, t4        /* t5 = RANGE - distx */
  1.3279 +
  1.3280 +    mul      s4, s0, t5        /* s4 = scaled wt * (RANGE - distx) */
  1.3281 +    mul      s5, s0, t4        /* s5 = scaled wt * distx */
  1.3282 +    mul      s6, s1, t5        /* s6 = scaled wb * (RANGE - distx) */
  1.3283 +    mul      s7, s1, t4        /* s7 = scaled wb * distx */
  1.3284 +
  1.3285 +    sra      t9, s2, 16        /* t9 = integer part of vx (source pixel index) */
  1.3286 +    sll      t9, t9, 2         /* t9 = byte offset of the left pixel (4 bytes/pixel) */
  1.3287 +    addiu    t8, t9, 4         /* t8 = byte offset of the right pixel */
  1.3288 +    lwx      t0, t9(a2)        /* t0 = top-left pixel */
  1.3289 +    lwx      t1, t8(a2)        /* t1 = top-right pixel */
  1.3290 +    addiu    v1, v1, -1        /* --w */
  1.3291 +    lwx      t2, t9(a3)        /* t2 = bottom-left pixel */
  1.3292 +    lwx      t3, t8(a3)        /* t3 = bottom-right pixel */
  1.3293 +
  1.3294 +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, \
  1.3295 +                                      t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
  1.3296 +    lbu      t1, 0(a1)         /* t1 = mask byte for this pixel */
  1.3297 +    lw       t2, 0(a0)         /* t2 = current destination pixel */
  1.3298 +    addiu    a1, a1, 1         /* mask++ */
  1.3299 +    OVER_8888_8_8888 t0, t1, t2, t0, s8, t3, t4, t5, t6
  1.3300 +
  1.3301 +    addu     s2, s2, s3        /* vx += unit_x; */
  1.3302 +    sw       t0, 0(a0)         /* *dst = t0, the value left by OVER_8888_8_8888 */
  1.3303 +    bnez     v1, 0b            /* loop while pixels remain */
  1.3304 +     addiu   a0, a0, 4         /* delay slot: dst += 4 bytes */
  1.3305 +
  1.3306 +1:
  1.3307 +    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
  1.3308 +    j        ra                /* return */
  1.3309 +     nop
  1.3310 +
  1.3311 +END(pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_mips)
  1.3312 +
  1.3313 +LEAF_MIPS_DSPR2(pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_mips)
  1.3314 +/* Bilinear scanline scaler with a byte mask: interpolated source, mask and dst go through MIPS_UN8x4_MUL_UN8_ADD_UN8x4.
  1.3315 + * a0     - *dst        (32-bit pixels, read with lw and rewritten with sw)
  1.3316 + * a1     - *mask       (one byte per pixel, read with lbu)
  1.3317 + * a2     - *src_top    (32-bit pixels, read with lwx)
  1.3318 + * a3     - *src_bottom (32-bit pixels, read with lwx)
  1.3319 + * 16(sp) - wt          (multiplied with both horizontal weights -> s4, s5)
  1.3320 + * 20(sp) - wb          (multiplied with both horizontal weights -> s6, s7)
  1.3321 + * 24(sp) - vx          (bits 31..16 select the source pixel, bits 15..0 give the horizontal weight)
  1.3322 + * 28(sp) - unit_x      (added to vx after every output pixel)
  1.3323 + * 32(sp) - w           (number of pixels; nothing is done when it is 0)
  1.3324 + */
  1.3325 +
  1.3326 +    lw       v1, 32(sp)        /* v1 = w, kept as the loop counter */
  1.3327 +    beqz     v1, 1f            /* w == 0: return before anything is saved */
  1.3328 +     nop
  1.3329 +
  1.3330 +    SAVE_REGS_ON_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
  1.3331 +
  1.3332 +    lw       s0, 44(sp)        /* s0 = wt     (entry offset 16 + 28) */
  1.3333 +    lw       s1, 48(sp)        /* s1 = wb     (entry offset 20 + 28) */
  1.3334 +    lw       s2, 52(sp)        /* s2 = vx     (entry offset 24 + 28) */
  1.3335 +    lw       s3, 56(sp)        /* s3 = unit_x (entry offset 28 + 28) */
  1.3336 +    li       v0, BILINEAR_INTERPOLATION_RANGE /* v0 = RANGE, minuend of the subu below */
  1.3337 +    li       s8, 0x00ff00ff    /* s8 = constant handed to MIPS_UN8x4_MUL_UN8_ADD_UN8x4 */
  1.3338 +
  1.3339 +    sll      s0, s0, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wt (shift is 0 when BITS == 8) */
  1.3340 +    sll      s1, s1, (2 * (8 - BILINEAR_INTERPOLATION_BITS)) /* pre-scale wb the same way */
  1.3341 +0:
  1.3342 +    andi     t4, s2, 0xffff    /* t4 = low 16 bits of vx (fraction) */
  1.3343 +    srl      t4, t4, (16 - BILINEAR_INTERPOLATION_BITS) /* t4 = distx: fraction reduced to BILINEAR_INTERPOLATION_BITS bits */
  1.3344 +    subu     t5, v0, t4        /* t5 = RANGE - distx */
  1.3345 +
  1.3346 +    mul      s4, s0, t5        /* s4 = scaled wt * (RANGE - distx) */
  1.3347 +    mul      s5, s0, t4        /* s5 = scaled wt * distx */
  1.3348 +    mul      s6, s1, t5        /* s6 = scaled wb * (RANGE - distx) */
  1.3349 +    mul      s7, s1, t4        /* s7 = scaled wb * distx */
  1.3350 +
  1.3351 +    sra      t9, s2, 16        /* t9 = integer part of vx (source pixel index) */
  1.3352 +    sll      t9, t9, 2         /* t9 = byte offset of the left pixel (4 bytes/pixel) */
  1.3353 +    addiu    t8, t9, 4         /* t8 = byte offset of the right pixel */
  1.3354 +    lwx      t0, t9(a2)        /* t0 = top-left pixel */
  1.3355 +    lwx      t1, t8(a2)        /* t1 = top-right pixel */
  1.3356 +    addiu    v1, v1, -1        /* --w */
  1.3357 +    lwx      t2, t9(a3)        /* t2 = bottom-left pixel */
  1.3358 +    lwx      t3, t8(a3)        /* t3 = bottom-right pixel */
  1.3359 +
  1.3360 +    BILINEAR_INTERPOLATE_SINGLE_PIXEL t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, s4, s5, s6, s7
  1.3361 +    lbu      t1, 0(a1)         /* t1 = mask byte for this pixel */
  1.3362 +    lw       t2, 0(a0)         /* t2 = current destination pixel */
  1.3363 +    addiu    a1, a1, 1         /* mask++ */
  1.3364 +    MIPS_UN8x4_MUL_UN8_ADD_UN8x4 t0, t1, t2, t0, s8, t3, t4, t5
  1.3365 +
  1.3366 +    addu     s2, s2, s3        /* vx += unit_x; */
  1.3367 +    sw       t0, 0(a0)         /* *dst = t0, the value left by MIPS_UN8x4_MUL_UN8_ADD_UN8x4 */
  1.3368 +    bnez     v1, 0b            /* loop while pixels remain */
  1.3369 +     addiu   a0, a0, 4         /* delay slot: dst += 4 bytes */
  1.3370 +
  1.3371 +    RESTORE_REGS_FROM_STACK 28, v0, v1, s0, s1, s2, s3, s4, s5, s6, s7, s8
  1.3372 +1:
  1.3373 +    j        ra                /* return */
  1.3374 +     nop
  1.3375 +
  1.3376 +END(pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_mips)

mercurial