gfx/skia/trunk/src/opts/SkBitmapProcState_matrix_neon.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/skia/trunk/src/opts/SkBitmapProcState_matrix_neon.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,506 @@
     1.4 +
     1.5 +#include <arm_neon.h>
     1.6 +
     1.7 +
     1.8 +#define SCALE_NOFILTER_NAME     MAKENAME(_nofilter_scale)
     1.9 +#define SCALE_FILTER_NAME       MAKENAME(_filter_scale)
    1.10 +#define AFFINE_NOFILTER_NAME    MAKENAME(_nofilter_affine)
    1.11 +#define AFFINE_FILTER_NAME      MAKENAME(_filter_affine)
    1.12 +#define PERSP_NOFILTER_NAME     MAKENAME(_nofilter_persp)
    1.13 +#define PERSP_FILTER_NAME       MAKENAME(_filter_persp)
    1.14 +
    1.15 +#define PACK_FILTER_X_NAME  MAKENAME(_pack_filter_x)
    1.16 +#define PACK_FILTER_Y_NAME  MAKENAME(_pack_filter_y)
    1.17 +#define PACK_FILTER_X4_NAME MAKENAME(_pack_filter_x4)
    1.18 +#define PACK_FILTER_Y4_NAME MAKENAME(_pack_filter_y4)
    1.19 +
    1.20 +#ifndef PREAMBLE
    1.21 +    #define PREAMBLE(state)
    1.22 +    #define PREAMBLE_PARAM_X
    1.23 +    #define PREAMBLE_PARAM_Y
    1.24 +    #define PREAMBLE_ARG_X
    1.25 +    #define PREAMBLE_ARG_Y
    1.26 +#endif
    1.27 +
    1.28 +static void SCALE_NOFILTER_NAME(const SkBitmapProcState& s,
    1.29 +                                uint32_t xy[], int count, int x, int y) {
    1.30 +    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
    1.31 +                             SkMatrix::kScale_Mask)) == 0);
    1.32 +
    1.33 +    PREAMBLE(s);
    1.34 +
    1.35 +    // we store y, x, x, x, x, x
    1.36 +    const unsigned maxX = s.fBitmap->width() - 1;
    1.37 +    SkFractionalInt fx;
    1.38 +    {
    1.39 +        SkPoint pt;
    1.40 +        s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
    1.41 +                                 SkIntToScalar(y) + SK_ScalarHalf, &pt);
    1.42 +        fx = SkScalarToFractionalInt(pt.fY);
    1.43 +        const unsigned maxY = s.fBitmap->height() - 1;
    1.44 +        *xy++ = TILEY_PROCF(SkFractionalIntToFixed(fx), maxY);
    1.45 +        fx = SkScalarToFractionalInt(pt.fX);
    1.46 +    }
    1.47 +
    1.48 +    if (0 == maxX) {
    1.49 +        // all of the following X values must be 0
    1.50 +        memset(xy, 0, count * sizeof(uint16_t));
    1.51 +        return;
    1.52 +    }
    1.53 +
    1.54 +    const SkFractionalInt dx = s.fInvSxFractionalInt;
    1.55 +
    1.56 +#ifdef CHECK_FOR_DECAL
    1.57 +    // test if we don't need to apply the tile proc
    1.58 +    if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
    1.59 +        decal_nofilter_scale_neon(xy, SkFractionalIntToFixed(fx),
    1.60 +                             SkFractionalIntToFixed(dx), count);
    1.61 +        return;
    1.62 +    }
    1.63 +#endif
    1.64 +
    1.65 +    if (count >= 8) {
    1.66 +        SkFractionalInt dx2 = dx+dx;
    1.67 +        SkFractionalInt dx4 = dx2+dx2;
    1.68 +        SkFractionalInt dx8 = dx4+dx4;
    1.69 +
    1.70 +        // now build fx/fx+dx/fx+2dx/fx+3dx
    1.71 +        SkFractionalInt fx1, fx2, fx3;
    1.72 +        int32x4_t lbase, hbase;
    1.73 +        int16_t *dst16 = (int16_t *)xy;
    1.74 +
    1.75 +        fx1 = fx+dx;
    1.76 +        fx2 = fx1+dx;
    1.77 +        fx3 = fx2+dx;
    1.78 +
    1.79 +        lbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
    1.80 +        lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx1), lbase, 1);
    1.81 +        lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx2), lbase, 2);
    1.82 +        lbase = vsetq_lane_s32(SkFractionalIntToFixed(fx3), lbase, 3);
    1.83 +        hbase = vaddq_s32(lbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
    1.84 +
    1.85 +        // store & bump
    1.86 +        while (count >= 8) {
    1.87 +
    1.88 +            int16x8_t fx8;
    1.89 +
    1.90 +            fx8 = TILEX_PROCF_NEON8(lbase, hbase, maxX);
    1.91 +
    1.92 +            vst1q_s16(dst16, fx8);
    1.93 +
    1.94 +            // but preserving base & on to the next
    1.95 +            lbase = vaddq_s32 (lbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
    1.96 +            hbase = vaddq_s32 (hbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
    1.97 +            dst16 += 8;
    1.98 +            count -= 8;
    1.99 +            fx += dx8;
   1.100 +        };
   1.101 +        xy = (uint32_t *) dst16;
   1.102 +    }
   1.103 +
   1.104 +    uint16_t* xx = (uint16_t*)xy;
   1.105 +    for (int i = count; i > 0; --i) {
   1.106 +        *xx++ = TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
   1.107 +        fx += dx;
   1.108 +    }
   1.109 +}
   1.110 +
   1.111 +static void AFFINE_NOFILTER_NAME(const SkBitmapProcState& s,
   1.112 +                                 uint32_t xy[], int count, int x, int y) {
   1.113 +    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
   1.114 +    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
   1.115 +                             SkMatrix::kScale_Mask |
   1.116 +                             SkMatrix::kAffine_Mask)) == 0);
   1.117 +
   1.118 +    PREAMBLE(s);
   1.119 +    SkPoint srcPt;
   1.120 +    s.fInvProc(s.fInvMatrix,
   1.121 +               SkIntToScalar(x) + SK_ScalarHalf,
   1.122 +               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
   1.123 +
   1.124 +    SkFractionalInt fx = SkScalarToFractionalInt(srcPt.fX);
   1.125 +    SkFractionalInt fy = SkScalarToFractionalInt(srcPt.fY);
   1.126 +    SkFractionalInt dx = s.fInvSxFractionalInt;
   1.127 +    SkFractionalInt dy = s.fInvKyFractionalInt;
   1.128 +    int maxX = s.fBitmap->width() - 1;
   1.129 +    int maxY = s.fBitmap->height() - 1;
   1.130 +
   1.131 +    if (count >= 8) {
   1.132 +        SkFractionalInt dx4 = dx * 4;
   1.133 +        SkFractionalInt dy4 = dy * 4;
   1.134 +        SkFractionalInt dx8 = dx * 8;
   1.135 +        SkFractionalInt dy8 = dy * 8;
   1.136 +
   1.137 +        int32x4_t xbase, ybase;
   1.138 +        int32x4_t x2base, y2base;
   1.139 +        int16_t *dst16 = (int16_t *) xy;
   1.140 +
   1.141 +        // now build fx, fx+dx, fx+2dx, fx+3dx
   1.142 +        xbase = vdupq_n_s32(SkFractionalIntToFixed(fx));
   1.143 +        xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), xbase, 1);
   1.144 +        xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), xbase, 2);
   1.145 +        xbase = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), xbase, 3);
   1.146 +
   1.147 +        // same for fy
   1.148 +        ybase = vdupq_n_s32(SkFractionalIntToFixed(fy));
   1.149 +        ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy), ybase, 1);
   1.150 +        ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy), ybase, 2);
   1.151 +        ybase = vsetq_lane_s32(SkFractionalIntToFixed(fy+dy+dy+dy), ybase, 3);
   1.152 +
   1.153 +        x2base = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx4)));
   1.154 +        y2base = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy4)));
   1.155 +
   1.156 +        // store & bump
   1.157 +        do {
   1.158 +            int16x8x2_t hi16;
   1.159 +
   1.160 +            hi16.val[0] = TILEX_PROCF_NEON8(xbase, x2base, maxX);
   1.161 +            hi16.val[1] = TILEY_PROCF_NEON8(ybase, y2base, maxY);
   1.162 +
   1.163 +            vst2q_s16(dst16, hi16);
   1.164 +
   1.165 +            // moving base and on to the next
   1.166 +            xbase = vaddq_s32(xbase, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
   1.167 +            ybase = vaddq_s32(ybase, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
   1.168 +            x2base = vaddq_s32(x2base, vdupq_n_s32(SkFractionalIntToFixed(dx8)));
   1.169 +            y2base = vaddq_s32(y2base, vdupq_n_s32(SkFractionalIntToFixed(dy8)));
   1.170 +
   1.171 +            dst16 += 16; // 8x32 aka 16x16
   1.172 +            count -= 8;
   1.173 +            fx += dx8;
   1.174 +            fy += dy8;
   1.175 +        } while (count >= 8);
   1.176 +        xy = (uint32_t *) dst16;
   1.177 +    }
   1.178 +
   1.179 +    for (int i = count; i > 0; --i) {
   1.180 +        *xy++ = (TILEY_PROCF(SkFractionalIntToFixed(fy), maxY) << 16) |
   1.181 +                 TILEX_PROCF(SkFractionalIntToFixed(fx), maxX);
   1.182 +        fx += dx; fy += dy;
   1.183 +    }
   1.184 +}
   1.185 +
   1.186 +static void PERSP_NOFILTER_NAME(const SkBitmapProcState& s,
   1.187 +                                uint32_t* SK_RESTRICT xy,
   1.188 +                                int count, int x, int y) {
   1.189 +    SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
   1.190 +
   1.191 +    PREAMBLE(s);
   1.192 +    // max{X,Y} are int here, but later shown/assumed to fit in 16 bits
   1.193 +    int maxX = s.fBitmap->width() - 1;
   1.194 +    int maxY = s.fBitmap->height() - 1;
   1.195 +
   1.196 +    SkPerspIter iter(s.fInvMatrix,
   1.197 +                     SkIntToScalar(x) + SK_ScalarHalf,
   1.198 +                     SkIntToScalar(y) + SK_ScalarHalf, count);
   1.199 +
   1.200 +    while ((count = iter.next()) != 0) {
   1.201 +        const SkFixed* SK_RESTRICT srcXY = iter.getXY();
   1.202 +
   1.203 +        if (count >= 8) {
   1.204 +            int32_t *mysrc = (int32_t *) srcXY;
   1.205 +            int16_t *mydst = (int16_t *) xy;
   1.206 +            do {
   1.207 +                int16x8x2_t hi16;
   1.208 +                int32x4x2_t xy1, xy2;
   1.209 +
   1.210 +                xy1 = vld2q_s32(mysrc);
   1.211 +                xy2 = vld2q_s32(mysrc+8);
   1.212 +
   1.213 +                hi16.val[0] = TILEX_PROCF_NEON8(xy1.val[0], xy2.val[0], maxX);
   1.214 +                hi16.val[1] = TILEY_PROCF_NEON8(xy1.val[1], xy2.val[1], maxY);
   1.215 +
   1.216 +                vst2q_s16(mydst, hi16);
   1.217 +
   1.218 +                count -= 8;  // 8 iterations
   1.219 +                mysrc += 16; // 16 longs
   1.220 +                mydst += 16; // 16 shorts, aka 8 longs
   1.221 +            } while (count >= 8);
   1.222 +            // get xy and srcXY fixed up
   1.223 +            srcXY = (const SkFixed *) mysrc;
   1.224 +            xy = (uint32_t *) mydst;
   1.225 +        }
   1.226 +
   1.227 +        while (--count >= 0) {
   1.228 +            *xy++ = (TILEY_PROCF(srcXY[1], maxY) << 16) |
   1.229 +                     TILEX_PROCF(srcXY[0], maxX);
   1.230 +            srcXY += 2;
   1.231 +        }
   1.232 +    }
   1.233 +}
   1.234 +
   1.235 +static inline uint32_t PACK_FILTER_Y_NAME(SkFixed f, unsigned max,
   1.236 +                                          SkFixed one PREAMBLE_PARAM_Y) {
   1.237 +    unsigned i = TILEY_PROCF(f, max);
   1.238 +    i = (i << 4) | TILEY_LOW_BITS(f, max);
   1.239 +    return (i << 14) | (TILEY_PROCF((f + one), max));
   1.240 +}
   1.241 +
   1.242 +static inline uint32_t PACK_FILTER_X_NAME(SkFixed f, unsigned max,
   1.243 +                                          SkFixed one PREAMBLE_PARAM_X) {
   1.244 +    unsigned i = TILEX_PROCF(f, max);
   1.245 +    i = (i << 4) | TILEX_LOW_BITS(f, max);
   1.246 +    return (i << 14) | (TILEX_PROCF((f + one), max));
   1.247 +}
   1.248 +
   1.249 +static inline int32x4_t PACK_FILTER_X4_NAME(int32x4_t f, unsigned max,
   1.250 +                                          SkFixed one PREAMBLE_PARAM_X) {
   1.251 +    int32x4_t ret, res, wide_one;
   1.252 +
   1.253 +    // Prepare constants
   1.254 +    wide_one = vdupq_n_s32(one);
   1.255 +
   1.256 +    // Step 1
   1.257 +    res = TILEX_PROCF_NEON4(f, max);
   1.258 +
   1.259 +    // Step 2
   1.260 +    ret = TILEX_LOW_BITS_NEON4(f, max);
   1.261 +    ret = vsliq_n_s32(ret, res, 4);
   1.262 +
   1.263 +    // Step 3
   1.264 +    res = TILEX_PROCF_NEON4(f + wide_one, max);
   1.265 +    ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
   1.266 +
   1.267 +    return ret;
   1.268 +}
   1.269 +
   1.270 +static inline int32x4_t PACK_FILTER_Y4_NAME(int32x4_t f, unsigned max,
   1.271 +                                          SkFixed one PREAMBLE_PARAM_X) {
   1.272 +    int32x4_t ret, res, wide_one;
   1.273 +
   1.274 +    // Prepare constants
   1.275 +    wide_one = vdupq_n_s32(one);
   1.276 +
   1.277 +    // Step 1
   1.278 +    res = TILEY_PROCF_NEON4(f, max);
   1.279 +
   1.280 +    // Step 2
   1.281 +    ret = TILEY_LOW_BITS_NEON4(f, max);
   1.282 +    ret = vsliq_n_s32(ret, res, 4);
   1.283 +
   1.284 +    // Step 3
   1.285 +    res = TILEY_PROCF_NEON4(f + wide_one, max);
   1.286 +    ret = vorrq_s32(vshlq_n_s32(ret, 14), res);
   1.287 +
   1.288 +    return ret;
   1.289 +}
   1.290 +
   1.291 +static void SCALE_FILTER_NAME(const SkBitmapProcState& s,
   1.292 +                              uint32_t xy[], int count, int x, int y) {
   1.293 +    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
   1.294 +                             SkMatrix::kScale_Mask)) == 0);
   1.295 +    SkASSERT(s.fInvKy == 0);
   1.296 +
   1.297 +    PREAMBLE(s);
   1.298 +
   1.299 +    const unsigned maxX = s.fBitmap->width() - 1;
   1.300 +    const SkFixed one = s.fFilterOneX;
   1.301 +    const SkFractionalInt dx = s.fInvSxFractionalInt;
   1.302 +    SkFractionalInt fx;
   1.303 +
   1.304 +    {
   1.305 +        SkPoint pt;
   1.306 +        s.fInvProc(s.fInvMatrix, SkIntToScalar(x) + SK_ScalarHalf,
   1.307 +                                 SkIntToScalar(y) + SK_ScalarHalf, &pt);
   1.308 +        const SkFixed fy = SkScalarToFixed(pt.fY) - (s.fFilterOneY >> 1);
   1.309 +        const unsigned maxY = s.fBitmap->height() - 1;
   1.310 +        // compute our two Y values up front
   1.311 +        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, s.fFilterOneY PREAMBLE_ARG_Y);
   1.312 +        // now initialize fx
   1.313 +        fx = SkScalarToFractionalInt(pt.fX) - (SkFixedToFractionalInt(one) >> 1);
   1.314 +    }
   1.315 +
   1.316 +#ifdef CHECK_FOR_DECAL
   1.317 +    // test if we don't need to apply the tile proc
   1.318 +    if (can_truncate_to_fixed_for_decal(fx, dx, count, maxX)) {
   1.319 +        decal_filter_scale_neon(xy, SkFractionalIntToFixed(fx),
   1.320 +                             SkFractionalIntToFixed(dx), count);
   1.321 +        return;
   1.322 +    }
   1.323 +#endif
   1.324 +    {
   1.325 +
   1.326 +    if (count >= 4) {
   1.327 +        int32x4_t wide_fx;
   1.328 +
   1.329 +        wide_fx = vdupq_n_s32(SkFractionalIntToFixed(fx));
   1.330 +        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx), wide_fx, 1);
   1.331 +        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx), wide_fx, 2);
   1.332 +        wide_fx = vsetq_lane_s32(SkFractionalIntToFixed(fx+dx+dx+dx), wide_fx, 3);
   1.333 +
   1.334 +        while (count >= 4) {
   1.335 +            int32x4_t res;
   1.336 +
   1.337 +            res = PACK_FILTER_X4_NAME(wide_fx, maxX, one PREAMBLE_ARG_X);
   1.338 +
   1.339 +            vst1q_u32(xy, vreinterpretq_u32_s32(res));
   1.340 +
   1.341 +            wide_fx += vdupq_n_s32(SkFractionalIntToFixed(dx+dx+dx+dx));
   1.342 +            fx += dx+dx+dx+dx;
   1.343 +            xy += 4;
   1.344 +            count -= 4;
   1.345 +        }
   1.346 +    }
   1.347 +
   1.348 +    while (--count >= 0) {
   1.349 +        *xy++ = PACK_FILTER_X_NAME(SkFractionalIntToFixed(fx), maxX, one PREAMBLE_ARG_X);
   1.350 +        fx += dx;
   1.351 +    }
   1.352 +
   1.353 +    }
   1.354 +}
   1.355 +
   1.356 +static void AFFINE_FILTER_NAME(const SkBitmapProcState& s,
   1.357 +                               uint32_t xy[], int count, int x, int y) {
   1.358 +    SkASSERT(s.fInvType & SkMatrix::kAffine_Mask);
   1.359 +    SkASSERT((s.fInvType & ~(SkMatrix::kTranslate_Mask |
   1.360 +                             SkMatrix::kScale_Mask |
   1.361 +                             SkMatrix::kAffine_Mask)) == 0);
   1.362 +
   1.363 +    PREAMBLE(s);
   1.364 +    SkPoint srcPt;
   1.365 +    s.fInvProc(s.fInvMatrix,
   1.366 +               SkIntToScalar(x) + SK_ScalarHalf,
   1.367 +               SkIntToScalar(y) + SK_ScalarHalf, &srcPt);
   1.368 +
   1.369 +    SkFixed oneX = s.fFilterOneX;
   1.370 +    SkFixed oneY = s.fFilterOneY;
   1.371 +    SkFixed fx = SkScalarToFixed(srcPt.fX) - (oneX >> 1);
   1.372 +    SkFixed fy = SkScalarToFixed(srcPt.fY) - (oneY >> 1);
   1.373 +    SkFixed dx = s.fInvSx;
   1.374 +    SkFixed dy = s.fInvKy;
   1.375 +    unsigned maxX = s.fBitmap->width() - 1;
   1.376 +    unsigned maxY = s.fBitmap->height() - 1;
   1.377 +
   1.378 +    if (count >= 4) {
   1.379 +        int32x4_t wide_fy, wide_fx;
   1.380 +
   1.381 +        wide_fx = vdupq_n_s32(fx);
   1.382 +        wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1);
   1.383 +        wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2);
   1.384 +        wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3);
   1.385 +
   1.386 +        wide_fy = vdupq_n_s32(fy);
   1.387 +        wide_fy = vsetq_lane_s32(fy+dy, wide_fy, 1);
   1.388 +        wide_fy = vsetq_lane_s32(fy+dy+dy, wide_fy, 2);
   1.389 +        wide_fy = vsetq_lane_s32(fy+dy+dy+dy, wide_fy, 3);
   1.390 +
   1.391 +        while (count >= 4) {
   1.392 +            int32x4x2_t vxy;
   1.393 +
   1.394 +            // do the X side, then the Y side, then interleave them
   1.395 +            vxy.val[0] = PACK_FILTER_Y4_NAME(wide_fy, maxY, oneY PREAMBLE_ARG_Y);
   1.396 +            vxy.val[1] = PACK_FILTER_X4_NAME(wide_fx, maxX, oneX PREAMBLE_ARG_X);
   1.397 +
   1.398 +            // interleave as YXYXYXYX as part of the storing
   1.399 +            vst2q_s32((int32_t*)xy, vxy);
   1.400 +
   1.401 +            // prepare next iteration
   1.402 +            wide_fx += vdupq_n_s32(dx+dx+dx+dx);
   1.403 +            fx += dx + dx + dx + dx;
   1.404 +            wide_fy += vdupq_n_s32(dy+dy+dy+dy);
   1.405 +            fy += dy+dy+dy+dy;
   1.406 +            xy += 8; // 4 x's, 4 y's
   1.407 +            count -= 4;
   1.408 +        }
   1.409 +    }
   1.410 +
   1.411 +    while (--count >= 0) {
   1.412 +        // NB: writing Y/X
   1.413 +        *xy++ = PACK_FILTER_Y_NAME(fy, maxY, oneY PREAMBLE_ARG_Y);
   1.414 +        fy += dy;
   1.415 +        *xy++ = PACK_FILTER_X_NAME(fx, maxX, oneX PREAMBLE_ARG_X);
   1.416 +        fx += dx;
   1.417 +    }
   1.418 +}
   1.419 +
   1.420 +static void PERSP_FILTER_NAME(const SkBitmapProcState& s,
   1.421 +                              uint32_t* SK_RESTRICT xy, int count,
   1.422 +                              int x, int y) {
   1.423 +    SkASSERT(s.fInvType & SkMatrix::kPerspective_Mask);
   1.424 +
   1.425 +    PREAMBLE(s);
   1.426 +    unsigned maxX = s.fBitmap->width() - 1;
   1.427 +    unsigned maxY = s.fBitmap->height() - 1;
   1.428 +    SkFixed oneX = s.fFilterOneX;
   1.429 +    SkFixed oneY = s.fFilterOneY;
   1.430 +
   1.431 +    SkPerspIter iter(s.fInvMatrix,
   1.432 +                     SkIntToScalar(x) + SK_ScalarHalf,
   1.433 +                     SkIntToScalar(y) + SK_ScalarHalf, count);
   1.434 +
   1.435 +    while ((count = iter.next()) != 0) {
   1.436 +        const SkFixed* SK_RESTRICT srcXY = iter.getXY();
   1.437 +
   1.438 +        while (count >= 4) {
   1.439 +            int32x4_t wide_x, wide_y;
   1.440 +            int32x4x2_t vxy, vresyx;
   1.441 +
   1.442 +            // load src:  x-y-x-y-x-y-x-y
   1.443 +            vxy = vld2q_s32(srcXY);
   1.444 +
   1.445 +            // do the X side, then the Y side, then interleave them
   1.446 +            wide_x = vsubq_s32(vxy.val[0], vdupq_n_s32(oneX>>1));
   1.447 +            wide_y = vsubq_s32(vxy.val[1], vdupq_n_s32(oneY>>1));
   1.448 +
   1.449 +            vresyx.val[0] = PACK_FILTER_Y4_NAME(wide_y, maxY, oneY PREAMBLE_ARG_Y);
   1.450 +            vresyx.val[1] = PACK_FILTER_X4_NAME(wide_x, maxX, oneX PREAMBLE_ARG_X);
   1.451 +
   1.452 +            // store interleaved as y-x-y-x-y-x-y-x (NB != read order)
   1.453 +            vst2q_s32((int32_t*)xy, vresyx);
   1.454 +
   1.455 +            // on to the next iteration
   1.456 +            srcXY += 2*4;
   1.457 +            count -= 4;
   1.458 +            xy += 2*4;
   1.459 +        }
   1.460 +
   1.461 +        while (--count >= 0) {
   1.462 +            // NB: we read x/y, we write y/x
   1.463 +            *xy++ = PACK_FILTER_Y_NAME(srcXY[1] - (oneY >> 1), maxY,
   1.464 +                                       oneY PREAMBLE_ARG_Y);
   1.465 +            *xy++ = PACK_FILTER_X_NAME(srcXY[0] - (oneX >> 1), maxX,
   1.466 +                                       oneX PREAMBLE_ARG_X);
   1.467 +            srcXY += 2;
   1.468 +        }
   1.469 +    }
   1.470 +}
   1.471 +
   1.472 +const SkBitmapProcState::MatrixProc MAKENAME(_Procs)[] = {
   1.473 +    SCALE_NOFILTER_NAME,
   1.474 +    SCALE_FILTER_NAME,
   1.475 +    AFFINE_NOFILTER_NAME,
   1.476 +    AFFINE_FILTER_NAME,
   1.477 +    PERSP_NOFILTER_NAME,
   1.478 +    PERSP_FILTER_NAME
   1.479 +};
   1.480 +
   1.481 +#undef TILEX_PROCF_NEON8
   1.482 +#undef TILEY_PROCF_NEON8
   1.483 +#undef TILEX_PROCF_NEON4
   1.484 +#undef TILEY_PROCF_NEON4
   1.485 +#undef TILEX_LOW_BITS_NEON4
   1.486 +#undef TILEY_LOW_BITS_NEON4
   1.487 +
   1.488 +#undef MAKENAME
   1.489 +#undef TILEX_PROCF
   1.490 +#undef TILEY_PROCF
   1.491 +#ifdef CHECK_FOR_DECAL
   1.492 +    #undef CHECK_FOR_DECAL
   1.493 +#endif
   1.494 +
   1.495 +#undef SCALE_NOFILTER_NAME
   1.496 +#undef SCALE_FILTER_NAME
   1.497 +#undef AFFINE_NOFILTER_NAME
   1.498 +#undef AFFINE_FILTER_NAME
   1.499 +#undef PERSP_NOFILTER_NAME
   1.500 +#undef PERSP_FILTER_NAME
   1.501 +
   1.502 +#undef PREAMBLE
   1.503 +#undef PREAMBLE_PARAM_X
   1.504 +#undef PREAMBLE_PARAM_Y
   1.505 +#undef PREAMBLE_ARG_X
   1.506 +#undef PREAMBLE_ARG_Y
   1.507 +
   1.508 +#undef TILEX_LOW_BITS
   1.509 +#undef TILEY_LOW_BITS

mercurial