1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libyuv/source/scale.cc Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,914 @@ 1.4 +/* 1.5 + * Copyright 2011 The LibYuv Project Authors. All rights reserved. 1.6 + * 1.7 + * Use of this source code is governed by a BSD-style license 1.8 + * that can be found in the LICENSE file in the root of the source 1.9 + * tree. An additional intellectual property rights grant can be found 1.10 + * in the file PATENTS. All contributing project authors may 1.11 + * be found in the AUTHORS file in the root of the source tree. 1.12 + */ 1.13 + 1.14 +#include "libyuv/scale.h" 1.15 + 1.16 +#include <assert.h> 1.17 +#include <string.h> 1.18 + 1.19 +#include "libyuv/cpu_id.h" 1.20 +#include "libyuv/planar_functions.h" // For CopyPlane 1.21 +#include "libyuv/row.h" 1.22 +#include "libyuv/scale_row.h" 1.23 + 1.24 +#ifdef __cplusplus 1.25 +namespace libyuv { 1.26 +extern "C" { 1.27 +#endif 1.28 + 1.29 +// Remove this macro if OVERREAD is safe. 1.30 +#define AVOID_OVERREAD 1 1.31 + 1.32 +static __inline int Abs(int v) { 1.33 + return v >= 0 ? v : -v; 1.34 +} 1.35 + 1.36 +#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s) 1.37 + 1.38 +// Scale plane, 1/2 1.39 +// This is an optimized version for scaling down a plane to 1/2 of 1.40 +// its original size. 1.41 + 1.42 +static void ScalePlaneDown2(int src_width, int src_height, 1.43 + int dst_width, int dst_height, 1.44 + int src_stride, int dst_stride, 1.45 + const uint8* src_ptr, uint8* dst_ptr, 1.46 + enum FilterMode filtering) { 1.47 + void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, 1.48 + uint8* dst_ptr, int dst_width) = 1.49 + filtering == kFilterNone ? ScaleRowDown2_C : 1.50 + (filtering == kFilterLinear ? ScaleRowDown2Linear_C : 1.51 + ScaleRowDown2Box_C); 1.52 + int row_stride = src_stride << 1; 1.53 + if (!filtering) { 1.54 + src_ptr += src_stride; // Point to odd rows. 1.55 + src_stride = 0; 1.56 + } 1.57 + 1.58 +#if defined(HAS_SCALEROWDOWN2_NEON) 1.59 + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { 1.60 + ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON; 1.61 + } 1.62 +#elif defined(HAS_SCALEROWDOWN2_SSE2) 1.63 + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { 1.64 + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Unaligned_SSE2 : 1.65 + (filtering == kFilterLinear ? ScaleRowDown2Linear_Unaligned_SSE2 : 1.66 + ScaleRowDown2Box_Unaligned_SSE2); 1.67 + if (IS_ALIGNED(src_ptr, 16) && 1.68 + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) && 1.69 + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { 1.70 + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : 1.71 + (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : 1.72 + ScaleRowDown2Box_SSE2); 1.73 + } 1.74 + } 1.75 +#elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) 1.76 + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && 1.77 + IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && 1.78 + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { 1.79 + ScaleRowDown2 = filtering ? 1.80 + ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2; 1.81 + } 1.82 +#endif 1.83 + 1.84 + if (filtering == kFilterLinear) { 1.85 + src_stride = 0; 1.86 + } 1.87 + // TODO(fbarchard): Loop through source height to allow odd height. 1.88 + int y; 1.89 + for (y = 0; y < dst_height; ++y) { 1.90 + ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); 1.91 + src_ptr += row_stride; 1.92 + dst_ptr += dst_stride; 1.93 + } 1.94 +} 1.95 + 1.96 +// Scale plane, 1/4 1.97 +// This is an optimized version for scaling down a plane to 1/4 of 1.98 +// its original size. 1.99 + 1.100 +static void ScalePlaneDown4(int src_width, int src_height, 1.101 + int dst_width, int dst_height, 1.102 + int src_stride, int dst_stride, 1.103 + const uint8* src_ptr, uint8* dst_ptr, 1.104 + enum FilterMode filtering) { 1.105 + void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride, 1.106 + uint8* dst_ptr, int dst_width) = 1.107 + filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; 1.108 + int row_stride = src_stride << 2; 1.109 + if (!filtering) { 1.110 + src_ptr += src_stride * 2; // Point to row 2. 1.111 + src_stride = 0; 1.112 + } 1.113 +#if defined(HAS_SCALEROWDOWN4_NEON) 1.114 + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { 1.115 + ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; 1.116 + } 1.117 +#elif defined(HAS_SCALEROWDOWN4_SSE2) 1.118 + if (TestCpuFlag(kCpuHasSSE2) && 1.119 + IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) && 1.120 + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { 1.121 + ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2; 1.122 + } 1.123 +#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) 1.124 + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && 1.125 + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && 1.126 + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { 1.127 + ScaleRowDown4 = filtering ? 1.128 + ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2; 1.129 + } 1.130 +#endif 1.131 + 1.132 + if (filtering == kFilterLinear) { 1.133 + src_stride = 0; 1.134 + } 1.135 + int y; 1.136 + for (y = 0; y < dst_height; ++y) { 1.137 + ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); 1.138 + src_ptr += row_stride; 1.139 + dst_ptr += dst_stride; 1.140 + } 1.141 +} 1.142 + 1.143 +// Scale plane down, 3/4 1.144 + 1.145 +static void ScalePlaneDown34(int src_width, int src_height, 1.146 + int dst_width, int dst_height, 1.147 + int src_stride, int dst_stride, 1.148 + const uint8* src_ptr, uint8* dst_ptr, 1.149 + enum FilterMode filtering) { 1.150 + assert(dst_width % 3 == 0); 1.151 + void (*ScaleRowDown34_0)(const uint8* src_ptr, ptrdiff_t src_stride, 1.152 + uint8* dst_ptr, int dst_width); 1.153 + void (*ScaleRowDown34_1)(const uint8* src_ptr, ptrdiff_t src_stride, 1.154 + uint8* dst_ptr, int dst_width); 1.155 + if (!filtering) { 1.156 + ScaleRowDown34_0 = ScaleRowDown34_C; 1.157 + ScaleRowDown34_1 = ScaleRowDown34_C; 1.158 + } else { 1.159 + ScaleRowDown34_0 = ScaleRowDown34_0_Box_C; 1.160 + ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; 1.161 + } 1.162 +#if defined(HAS_SCALEROWDOWN34_NEON) 1.163 + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { 1.164 + if (!filtering) { 1.165 + ScaleRowDown34_0 = ScaleRowDown34_NEON; 1.166 + ScaleRowDown34_1 = ScaleRowDown34_NEON; 1.167 + } else { 1.168 + ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; 1.169 + ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; 1.170 + } 1.171 + } 1.172 +#endif 1.173 +#if defined(HAS_SCALEROWDOWN34_SSSE3) 1.174 + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && 1.175 + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { 1.176 + if (!filtering) { 1.177 + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; 1.178 + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; 1.179 + } else { 1.180 + ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; 1.181 + ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; 1.182 + } 1.183 + } 1.184 +#endif 1.185 +#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2) 1.186 + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) && 1.187 + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && 1.188 + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { 1.189 + if (!filtering) { 1.190 + ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2; 1.191 + ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2; 1.192 + } else { 1.193 + ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2; 1.194 + ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2; 1.195 + } 1.196 + } 1.197 +#endif 1.198 + 1.199 + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; 1.200 + int y; 1.201 + for (y = 0; y < dst_height - 2; y += 3) { 1.202 + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); 1.203 + src_ptr += src_stride; 1.204 + dst_ptr += dst_stride; 1.205 + ScaleRowDown34_1(src_ptr, filter_stride, dst_ptr, dst_width); 1.206 + src_ptr += src_stride; 1.207 + dst_ptr += dst_stride; 1.208 + ScaleRowDown34_0(src_ptr + src_stride, -filter_stride, 1.209 + dst_ptr, dst_width); 1.210 + src_ptr += src_stride * 2; 1.211 + dst_ptr += dst_stride; 1.212 + } 1.213 + 1.214 + // Remainder 1 or 2 rows with last row vertically unfiltered 1.215 + if ((dst_height % 3) == 2) { 1.216 + ScaleRowDown34_0(src_ptr, filter_stride, dst_ptr, dst_width); 1.217 + src_ptr += src_stride; 1.218 + dst_ptr += dst_stride; 1.219 + ScaleRowDown34_1(src_ptr, 0, dst_ptr, dst_width); 1.220 + } else if ((dst_height % 3) == 1) { 1.221 + ScaleRowDown34_0(src_ptr, 0, dst_ptr, dst_width); 1.222 + } 1.223 +} 1.224 + 1.225 + 1.226 +// Scale plane, 3/8 1.227 +// This is an optimized version for scaling down a plane to 3/8 1.228 +// of its original size. 1.229 +// 1.230 +// Uses box filter arranges like this 1.231 +// aaabbbcc -> abc 1.232 +// aaabbbcc def 1.233 +// aaabbbcc ghi 1.234 +// dddeeeff 1.235 +// dddeeeff 1.236 +// dddeeeff 1.237 +// ggghhhii 1.238 +// ggghhhii 1.239 +// Boxes are 3x3, 2x3, 3x2 and 2x2 1.240 + 1.241 +static void ScalePlaneDown38(int src_width, int src_height, 1.242 + int dst_width, int dst_height, 1.243 + int src_stride, int dst_stride, 1.244 + const uint8* src_ptr, uint8* dst_ptr, 1.245 + enum FilterMode filtering) { 1.246 + assert(dst_width % 3 == 0); 1.247 + void (*ScaleRowDown38_3)(const uint8* src_ptr, ptrdiff_t src_stride, 1.248 + uint8* dst_ptr, int dst_width); 1.249 + void (*ScaleRowDown38_2)(const uint8* src_ptr, ptrdiff_t src_stride, 1.250 + uint8* dst_ptr, int dst_width); 1.251 + if (!filtering) { 1.252 + ScaleRowDown38_3 = ScaleRowDown38_C; 1.253 + ScaleRowDown38_2 = ScaleRowDown38_C; 1.254 + } else { 1.255 + ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; 1.256 + ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; 1.257 + } 1.258 +#if defined(HAS_SCALEROWDOWN38_NEON) 1.259 + if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { 1.260 + if (!filtering) { 1.261 + ScaleRowDown38_3 = ScaleRowDown38_NEON; 1.262 + ScaleRowDown38_2 = ScaleRowDown38_NEON; 1.263 + } else { 1.264 + ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; 1.265 + ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; 1.266 + } 1.267 + } 1.268 +#elif defined(HAS_SCALEROWDOWN38_SSSE3) 1.269 + if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0) && 1.270 + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { 1.271 + if (!filtering) { 1.272 + ScaleRowDown38_3 = ScaleRowDown38_SSSE3; 1.273 + ScaleRowDown38_2 = ScaleRowDown38_SSSE3; 1.274 + } else { 1.275 + ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; 1.276 + ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; 1.277 + } 1.278 + } 1.279 +#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2) 1.280 + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && 1.281 + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && 1.282 + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { 1.283 + if (!filtering) { 1.284 + ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2; 1.285 + ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2; 1.286 + } else { 1.287 + ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2; 1.288 + ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2; 1.289 + } 1.290 + } 1.291 +#endif 1.292 + 1.293 + const int filter_stride = (filtering == kFilterLinear) ? 0 : src_stride; 1.294 + int y; 1.295 + for (y = 0; y < dst_height - 2; y += 3) { 1.296 + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); 1.297 + src_ptr += src_stride * 3; 1.298 + dst_ptr += dst_stride; 1.299 + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); 1.300 + src_ptr += src_stride * 3; 1.301 + dst_ptr += dst_stride; 1.302 + ScaleRowDown38_2(src_ptr, filter_stride, dst_ptr, dst_width); 1.303 + src_ptr += src_stride * 2; 1.304 + dst_ptr += dst_stride; 1.305 + } 1.306 + 1.307 + // Remainder 1 or 2 rows with last row vertically unfiltered 1.308 + if ((dst_height % 3) == 2) { 1.309 + ScaleRowDown38_3(src_ptr, filter_stride, dst_ptr, dst_width); 1.310 + src_ptr += src_stride * 3; 1.311 + dst_ptr += dst_stride; 1.312 + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); 1.313 + } else if ((dst_height % 3) == 1) { 1.314 + ScaleRowDown38_3(src_ptr, 0, dst_ptr, dst_width); 1.315 + } 1.316 +} 1.317 + 1.318 +static __inline uint32 SumBox(int iboxwidth, int iboxheight, 1.319 + ptrdiff_t src_stride, const uint8* src_ptr) { 1.320 + assert(iboxwidth > 0); 1.321 + assert(iboxheight > 0); 1.322 + uint32 sum = 0u; 1.323 + int y; 1.324 + for (y = 0; y < iboxheight; ++y) { 1.325 + int x; 1.326 + for (x = 0; x < iboxwidth; ++x) { 1.327 + sum += src_ptr[x]; 1.328 + } 1.329 + src_ptr += src_stride; 1.330 + } 1.331 + return sum; 1.332 +} 1.333 + 1.334 +static void ScalePlaneBoxRow_C(int dst_width, int boxheight, 1.335 + int x, int dx, ptrdiff_t src_stride, 1.336 + const uint8* src_ptr, uint8* dst_ptr) { 1.337 + int i; 1.338 + for (i = 0; i < dst_width; ++i) { 1.339 + int ix = x >> 16; 1.340 + x += dx; 1.341 + int boxwidth = (x >> 16) - ix; 1.342 + *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) / 1.343 + (boxwidth * boxheight); 1.344 + } 1.345 +} 1.346 + 1.347 +static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { 1.348 + assert(iboxwidth > 0); 1.349 + uint32 sum = 0u; 1.350 + int x; 1.351 + for (x = 0; x < iboxwidth; ++x) { 1.352 + sum += src_ptr[x]; 1.353 + } 1.354 + return sum; 1.355 +} 1.356 + 1.357 +static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, 1.358 + const uint16* src_ptr, uint8* dst_ptr) { 1.359 + int scaletbl[2]; 1.360 + int minboxwidth = (dx >> 16); 1.361 + scaletbl[0] = 65536 / (minboxwidth * boxheight); 1.362 + scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); 1.363 + int* scaleptr = scaletbl - minboxwidth; 1.364 + int i; 1.365 + for (i = 0; i < dst_width; ++i) { 1.366 + int ix = x >> 16; 1.367 + x += dx; 1.368 + int boxwidth = (x >> 16) - ix; 1.369 + *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; 1.370 + } 1.371 +} 1.372 + 1.373 +static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, 1.374 + const uint16* src_ptr, uint8* dst_ptr) { 1.375 + int boxwidth = (dx >> 16); 1.376 + int scaleval = 65536 / (boxwidth * boxheight); 1.377 + int i; 1.378 + for (i = 0; i < dst_width; ++i) { 1.379 + *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; 1.380 + x += boxwidth; 1.381 + } 1.382 +} 1.383 + 1.384 +// Scale plane down to any dimensions, with interpolation. 1.385 +// (boxfilter). 1.386 +// 1.387 +// Same method as SimpleScale, which is fixed point, outputting 1.388 +// one pixel of destination using fixed point (16.16) to step 1.389 +// through source, sampling a box of pixel with simple 1.390 +// averaging. 1.391 +static void ScalePlaneBox(int src_width, int src_height, 1.392 + int dst_width, int dst_height, 1.393 + int src_stride, int dst_stride, 1.394 + const uint8* src_ptr, uint8* dst_ptr) { 1.395 + // Initial source x/y coordinate and step values as 16.16 fixed point. 1.396 + int x = 0; 1.397 + int y = 0; 1.398 + int dx = 0; 1.399 + int dy = 0; 1.400 + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, 1.401 + &x, &y, &dx, &dy); 1.402 + src_width = Abs(src_width); 1.403 + const int max_y = (src_height << 16); 1.404 + // TODO(fbarchard): Remove this and make AddRows handle boxheight 1. 1.405 + if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) { 1.406 + uint8* dst = dst_ptr; 1.407 + int j; 1.408 + for (j = 0; j < dst_height; ++j) { 1.409 + int iy = y >> 16; 1.410 + const uint8* src = src_ptr + iy * src_stride; 1.411 + y += dy; 1.412 + if (y > max_y) { 1.413 + y = max_y; 1.414 + } 1.415 + int boxheight = (y >> 16) - iy; 1.416 + ScalePlaneBoxRow_C(dst_width, boxheight, 1.417 + x, dx, src_stride, 1.418 + src, dst); 1.419 + dst += dst_stride; 1.420 + } 1.421 + return; 1.422 + } 1.423 + // Allocate a row buffer of uint16. 1.424 + align_buffer_64(row16, src_width * 2); 1.425 + 1.426 + void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, 1.427 + const uint16* src_ptr, uint8* dst_ptr) = 1.428 + (dx & 0xffff) ? ScaleAddCols2_C: ScaleAddCols1_C; 1.429 + void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride, 1.430 + uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C; 1.431 +#if defined(HAS_SCALEADDROWS_SSE2) 1.432 + if (TestCpuFlag(kCpuHasSSE2) && 1.433 +#ifdef AVOID_OVERREAD 1.434 + IS_ALIGNED(src_width, 16) && 1.435 +#endif 1.436 + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { 1.437 + ScaleAddRows = ScaleAddRows_SSE2; 1.438 + } 1.439 +#endif 1.440 + 1.441 + int j; 1.442 + for (j = 0; j < dst_height; ++j) { 1.443 + int iy = y >> 16; 1.444 + const uint8* src = src_ptr + iy * src_stride; 1.445 + y += dy; 1.446 + if (y > (src_height << 16)) { 1.447 + y = (src_height << 16); 1.448 + } 1.449 + int boxheight = (y >> 16) - iy; 1.450 + ScaleAddRows(src, src_stride, (uint16*)(row16), 1.451 + src_width, boxheight); 1.452 + ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), 1.453 + dst_ptr); 1.454 + dst_ptr += dst_stride; 1.455 + } 1.456 + free_aligned_buffer_64(row16); 1.457 +} 1.458 + 1.459 +// Scale plane down with bilinear interpolation. 1.460 +void ScalePlaneBilinearDown(int src_width, int src_height, 1.461 + int dst_width, int dst_height, 1.462 + int src_stride, int dst_stride, 1.463 + const uint8* src_ptr, uint8* dst_ptr, 1.464 + enum FilterMode filtering) { 1.465 + // Initial source x/y coordinate and step values as 16.16 fixed point. 1.466 + int x = 0; 1.467 + int y = 0; 1.468 + int dx = 0; 1.469 + int dy = 0; 1.470 + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, 1.471 + &x, &y, &dx, &dy); 1.472 + src_width = Abs(src_width); 1.473 + 1.474 + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, 1.475 + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = 1.476 + InterpolateRow_C; 1.477 +#if defined(HAS_INTERPOLATEROW_SSE2) 1.478 + if (TestCpuFlag(kCpuHasSSE2) && src_width >= 16) { 1.479 + InterpolateRow = InterpolateRow_Any_SSE2; 1.480 + if (IS_ALIGNED(src_width, 16)) { 1.481 + InterpolateRow = InterpolateRow_Unaligned_SSE2; 1.482 + if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { 1.483 + InterpolateRow = InterpolateRow_SSE2; 1.484 + } 1.485 + } 1.486 + } 1.487 +#endif 1.488 +#if defined(HAS_INTERPOLATEROW_SSSE3) 1.489 + if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 16) { 1.490 + InterpolateRow = InterpolateRow_Any_SSSE3; 1.491 + if (IS_ALIGNED(src_width, 16)) { 1.492 + InterpolateRow = InterpolateRow_Unaligned_SSSE3; 1.493 + if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { 1.494 + InterpolateRow = InterpolateRow_SSSE3; 1.495 + } 1.496 + } 1.497 + } 1.498 +#endif 1.499 +#if defined(HAS_INTERPOLATEROW_AVX2) 1.500 + if (TestCpuFlag(kCpuHasAVX2) && src_width >= 32) { 1.501 + InterpolateRow = InterpolateRow_Any_AVX2; 1.502 + if (IS_ALIGNED(src_width, 32)) { 1.503 + InterpolateRow = InterpolateRow_AVX2; 1.504 + } 1.505 + } 1.506 +#endif 1.507 +#if defined(HAS_INTERPOLATEROW_NEON) 1.508 + if (TestCpuFlag(kCpuHasNEON) && src_width >= 16) { 1.509 + InterpolateRow = InterpolateRow_Any_NEON; 1.510 + if (IS_ALIGNED(src_width, 16)) { 1.511 + InterpolateRow = InterpolateRow_NEON; 1.512 + } 1.513 + } 1.514 +#endif 1.515 +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) 1.516 + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && src_width >= 4) { 1.517 + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; 1.518 + if (IS_ALIGNED(src_width, 4)) { 1.519 + InterpolateRow = InterpolateRow_MIPS_DSPR2; 1.520 + } 1.521 + } 1.522 +#endif 1.523 + 1.524 + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, 1.525 + int dst_width, int x, int dx) = 1.526 + (src_width >= 32768) ? ScaleFilterCols64_C : ScaleFilterCols_C; 1.527 + 1.528 +#if defined(HAS_SCALEFILTERCOLS_SSSE3) 1.529 + if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { 1.530 + ScaleFilterCols = ScaleFilterCols_SSSE3; 1.531 + } 1.532 +#endif 1.533 + 1.534 + // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. 1.535 + // Allocate a row buffer. 1.536 + align_buffer_64(row, src_width); 1.537 + 1.538 + const int max_y = (src_height - 1) << 16; 1.539 + int j; 1.540 + for (j = 0; j < dst_height; ++j) { 1.541 + if (y > max_y) { 1.542 + y = max_y; 1.543 + } 1.544 + int yi = y >> 16; 1.545 + const uint8* src = src_ptr + yi * src_stride; 1.546 + if (filtering == kFilterLinear) { 1.547 + ScaleFilterCols(dst_ptr, src, dst_width, x, dx); 1.548 + } else { 1.549 + int yf = (y >> 8) & 255; 1.550 + InterpolateRow(row, src, src_stride, src_width, yf); 1.551 + ScaleFilterCols(dst_ptr, row, dst_width, x, dx); 1.552 + } 1.553 + dst_ptr += dst_stride; 1.554 + y += dy; 1.555 + } 1.556 + free_aligned_buffer_64(row); 1.557 +} 1.558 + 1.559 +// Scale up down with bilinear interpolation. 1.560 +void ScalePlaneBilinearUp(int src_width, int src_height, 1.561 + int dst_width, int dst_height, 1.562 + int src_stride, int dst_stride, 1.563 + const uint8* src_ptr, uint8* dst_ptr, 1.564 + enum FilterMode filtering) { 1.565 + // Initial source x/y coordinate and step values as 16.16 fixed point. 1.566 + int x = 0; 1.567 + int y = 0; 1.568 + int dx = 0; 1.569 + int dy = 0; 1.570 + ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, 1.571 + &x, &y, &dx, &dy); 1.572 + src_width = Abs(src_width); 1.573 + 1.574 + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, 1.575 + ptrdiff_t src_stride, int dst_width, int source_y_fraction) = 1.576 + InterpolateRow_C; 1.577 +#if defined(HAS_INTERPOLATEROW_SSE2) 1.578 + if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 16) { 1.579 + InterpolateRow = InterpolateRow_Any_SSE2; 1.580 + if (IS_ALIGNED(dst_width, 16)) { 1.581 + InterpolateRow = InterpolateRow_Unaligned_SSE2; 1.582 + if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { 1.583 + InterpolateRow = InterpolateRow_SSE2; 1.584 + } 1.585 + } 1.586 + } 1.587 +#endif 1.588 +#if defined(HAS_INTERPOLATEROW_SSSE3) 1.589 + if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 16) { 1.590 + InterpolateRow = InterpolateRow_Any_SSSE3; 1.591 + if (IS_ALIGNED(dst_width, 16)) { 1.592 + InterpolateRow = InterpolateRow_Unaligned_SSSE3; 1.593 + if (IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { 1.594 + InterpolateRow = InterpolateRow_SSSE3; 1.595 + } 1.596 + } 1.597 + } 1.598 +#endif 1.599 +#if defined(HAS_INTERPOLATEROW_AVX2) 1.600 + if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 32) { 1.601 + InterpolateRow = InterpolateRow_Any_AVX2; 1.602 + if (IS_ALIGNED(dst_width, 32)) { 1.603 + InterpolateRow = InterpolateRow_AVX2; 1.604 + } 1.605 + } 1.606 +#endif 1.607 +#if defined(HAS_INTERPOLATEROW_NEON) 1.608 + if (TestCpuFlag(kCpuHasNEON) && dst_width >= 16) { 1.609 + InterpolateRow = InterpolateRow_Any_NEON; 1.610 + if (IS_ALIGNED(dst_width, 16)) { 1.611 + InterpolateRow = InterpolateRow_NEON; 1.612 + } 1.613 + } 1.614 +#endif 1.615 +#if defined(HAS_INTERPOLATEROW_MIPS_DSPR2) 1.616 + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 4) { 1.617 + InterpolateRow = InterpolateRow_Any_MIPS_DSPR2; 1.618 + if (IS_ALIGNED(dst_width, 4)) { 1.619 + InterpolateRow = InterpolateRow_MIPS_DSPR2; 1.620 + } 1.621 + } 1.622 +#endif 1.623 + 1.624 + void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, 1.625 + int dst_width, int x, int dx) = 1.626 + filtering ? ScaleFilterCols_C : ScaleCols_C; 1.627 + if (filtering && src_width >= 32768) { 1.628 + ScaleFilterCols = ScaleFilterCols64_C; 1.629 + } 1.630 +#if defined(HAS_SCALEFILTERCOLS_SSSE3) 1.631 + if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) { 1.632 + ScaleFilterCols = ScaleFilterCols_SSSE3; 1.633 + } 1.634 +#endif 1.635 + if (!filtering && src_width * 2 == dst_width && x < 0x8000) { 1.636 + ScaleFilterCols = ScaleColsUp2_C; 1.637 +#if defined(HAS_SCALECOLS_SSE2) 1.638 + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && 1.639 + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && 1.640 + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { 1.641 + ScaleFilterCols = ScaleColsUp2_SSE2; 1.642 + } 1.643 +#endif 1.644 + } 1.645 + 1.646 + const int max_y = (src_height - 1) << 16; 1.647 + if (y > max_y) { 1.648 + y = max_y; 1.649 + } 1.650 + int yi = y >> 16; 1.651 + const uint8* src = src_ptr + yi * src_stride; 1.652 + 1.653 + // Allocate 2 row buffers. 1.654 + const int kRowSize = (dst_width + 15) & ~15; 1.655 + align_buffer_64(row, kRowSize * 2); 1.656 + 1.657 + uint8* rowptr = row; 1.658 + int rowstride = kRowSize; 1.659 + int lasty = yi; 1.660 + 1.661 + ScaleFilterCols(rowptr, src, dst_width, x, dx); 1.662 + if (src_height > 1) { 1.663 + src += src_stride; 1.664 + } 1.665 + ScaleFilterCols(rowptr + rowstride, src, dst_width, x, dx); 1.666 + src += src_stride; 1.667 + 1.668 + int j; 1.669 + for (j = 0; j < dst_height; ++j) { 1.670 + yi = y >> 16; 1.671 + if (yi != lasty) { 1.672 + if (y > max_y) { 1.673 + y = max_y; 1.674 + yi = y >> 16; 1.675 + src = src_ptr + yi * src_stride; 1.676 + } 1.677 + if (yi != lasty) { 1.678 + ScaleFilterCols(rowptr, src, dst_width, x, dx); 1.679 + rowptr += rowstride; 1.680 + rowstride = -rowstride; 1.681 + lasty = yi; 1.682 + src += src_stride; 1.683 + } 1.684 + } 1.685 + if (filtering == kFilterLinear) { 1.686 + InterpolateRow(dst_ptr, rowptr, 0, dst_width, 0); 1.687 + } else { 1.688 + int yf = (y >> 8) & 255; 1.689 + InterpolateRow(dst_ptr, rowptr, rowstride, dst_width, yf); 1.690 + } 1.691 + dst_ptr += dst_stride; 1.692 + y += dy; 1.693 + } 1.694 + free_aligned_buffer_64(row); 1.695 +} 1.696 + 1.697 +// Scale Plane to/from any dimensions, without interpolation. 1.698 +// Fixed point math is used for performance: The upper 16 bits 1.699 +// of x and dx is the integer part of the source position and 1.700 +// the lower 16 bits are the fixed decimal part. 1.701 + 1.702 +static void ScalePlaneSimple(int src_width, int src_height, 1.703 + int dst_width, int dst_height, 1.704 + int src_stride, int dst_stride, 1.705 + const uint8* src_ptr, uint8* dst_ptr) { 1.706 + // Initial source x/y coordinate and step values as 16.16 fixed point. 1.707 + int x = 0; 1.708 + int y = 0; 1.709 + int dx = 0; 1.710 + int dy = 0; 1.711 + ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterNone, 1.712 + &x, &y, &dx, &dy); 1.713 + src_width = Abs(src_width); 1.714 + 1.715 + void (*ScaleCols)(uint8* dst_ptr, const uint8* src_ptr, 1.716 + int dst_width, int x, int dx) = ScaleCols_C; 1.717 + if (src_width * 2 == dst_width && x < 0x8000) { 1.718 + ScaleCols = ScaleColsUp2_C; 1.719 +#if defined(HAS_SCALECOLS_SSE2) 1.720 + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) && 1.721 + IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && 1.722 + IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { 1.723 + ScaleCols = ScaleColsUp2_SSE2; 1.724 + } 1.725 +#endif 1.726 + } 1.727 + 1.728 + int i; 1.729 + for (i = 0; i < dst_height; ++i) { 1.730 + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, 1.731 + dst_width, x, dx); 1.732 + dst_ptr += dst_stride; 1.733 + y += dy; 1.734 + } 1.735 +} 1.736 + 1.737 +// Scale a plane. 1.738 +// This function dispatches to a specialized scaler based on scale factor. 1.739 + 1.740 +LIBYUV_API 1.741 +void ScalePlane(const uint8* src, int src_stride, 1.742 + int src_width, int src_height, 1.743 + uint8* dst, int dst_stride, 1.744 + int dst_width, int dst_height, 1.745 + enum FilterMode filtering) { 1.746 + // Simplify filtering when possible. 1.747 + filtering = ScaleFilterReduce(src_width, src_height, 1.748 + dst_width, dst_height, 1.749 + filtering); 1.750 + 1.751 + // Negative height means invert the image. 1.752 + if (src_height < 0) { 1.753 + src_height = -src_height; 1.754 + src = src + (src_height - 1) * src_stride; 1.755 + src_stride = -src_stride; 1.756 + } 1.757 + 1.758 + // Use specialized scales to improve performance for common resolutions. 1.759 + // For example, all the 1/2 scalings will use ScalePlaneDown2() 1.760 + if (dst_width == src_width && dst_height == src_height) { 1.761 + // Straight copy. 1.762 + CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); 1.763 + return; 1.764 + } 1.765 + if (dst_width == src_width) { 1.766 + int dy = FixedDiv(src_height, dst_height); 1.767 + // Arbitrary scale vertically, but unscaled vertically. 1.768 + ScalePlaneVertical(src_height, 1.769 + dst_width, dst_height, 1.770 + src_stride, dst_stride, src, dst, 1.771 + 0, 0, dy, 1, filtering); 1.772 + return; 1.773 + } 1.774 + if (dst_width <= Abs(src_width) && dst_height <= src_height) { 1.775 + // Scale down. 1.776 + if (4 * dst_width == 3 * src_width && 1.777 + 4 * dst_height == 3 * src_height) { 1.778 + // optimized, 3/4 1.779 + ScalePlaneDown34(src_width, src_height, dst_width, dst_height, 1.780 + src_stride, dst_stride, src, dst, filtering); 1.781 + return; 1.782 + } 1.783 + if (2 * dst_width == src_width && 2 * dst_height == src_height) { 1.784 + // optimized, 1/2 1.785 + ScalePlaneDown2(src_width, src_height, dst_width, dst_height, 1.786 + src_stride, dst_stride, src, dst, filtering); 1.787 + return; 1.788 + } 1.789 + // 3/8 rounded up for odd sized chroma height. 1.790 + if (8 * dst_width == 3 * src_width && 1.791 + dst_height == ((src_height * 3 + 7) / 8)) { 1.792 + // optimized, 3/8 1.793 + ScalePlaneDown38(src_width, src_height, dst_width, dst_height, 1.794 + src_stride, dst_stride, src, dst, filtering); 1.795 + return; 1.796 + } 1.797 + if (4 * dst_width == src_width && 4 * dst_height == src_height && 1.798 + filtering != kFilterBilinear) { 1.799 + // optimized, 1/4 1.800 + ScalePlaneDown4(src_width, src_height, dst_width, dst_height, 1.801 + src_stride, dst_stride, src, dst, filtering); 1.802 + return; 1.803 + } 1.804 + } 1.805 + if (filtering == kFilterBox && dst_height * 2 < src_height) { 1.806 + ScalePlaneBox(src_width, src_height, dst_width, dst_height, 1.807 + src_stride, dst_stride, src, dst); 1.808 + return; 1.809 + } 1.810 + if (filtering && dst_height > src_height) { 1.811 + ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height, 1.812 + src_stride, dst_stride, src, dst, filtering); 1.813 + return; 1.814 + } 1.815 + if (filtering) { 1.816 + ScalePlaneBilinearDown(src_width, src_height, dst_width, dst_height, 1.817 + src_stride, dst_stride, src, dst, filtering); 1.818 + return; 1.819 + } 1.820 + ScalePlaneSimple(src_width, src_height, dst_width, dst_height, 1.821 + src_stride, dst_stride, src, dst); 1.822 +} 1.823 + 1.824 +// Scale an I420 image. 1.825 +// This function in turn calls a scaling function for each plane. 1.826 + 1.827 +LIBYUV_API 1.828 +int I420Scale(const uint8* src_y, int src_stride_y, 1.829 + const uint8* src_u, int src_stride_u, 1.830 + const uint8* src_v, int src_stride_v, 1.831 + int src_width, int src_height, 1.832 + uint8* dst_y, int dst_stride_y, 1.833 + uint8* dst_u, int dst_stride_u, 1.834 + uint8* dst_v, int dst_stride_v, 1.835 + int dst_width, int dst_height, 1.836 + enum FilterMode filtering) { 1.837 + if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || 1.838 + !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { 1.839 + return -1; 1.840 + } 1.841 + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); 1.842 + int src_halfheight = SUBSAMPLE(src_height, 1, 1); 1.843 + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); 1.844 + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); 1.845 + 1.846 + ScalePlane(src_y, src_stride_y, src_width, src_height, 1.847 + dst_y, dst_stride_y, dst_width, dst_height, 1.848 + filtering); 1.849 + ScalePlane(src_u, src_stride_u, src_halfwidth, src_halfheight, 1.850 + dst_u, dst_stride_u, dst_halfwidth, dst_halfheight, 1.851 + filtering); 1.852 + ScalePlane(src_v, src_stride_v, src_halfwidth, src_halfheight, 1.853 + dst_v, dst_stride_v, dst_halfwidth, dst_halfheight, 1.854 + filtering); 1.855 + return 0; 1.856 +} 1.857 + 1.858 +// Deprecated api 1.859 +LIBYUV_API 1.860 +int Scale(const uint8* src_y, const uint8* src_u, const uint8* src_v, 1.861 + int src_stride_y, int src_stride_u, int src_stride_v, 1.862 + int src_width, int src_height, 1.863 + uint8* dst_y, uint8* dst_u, uint8* dst_v, 1.864 + int dst_stride_y, int dst_stride_u, int dst_stride_v, 1.865 + int dst_width, int dst_height, 1.866 + LIBYUV_BOOL interpolate) { 1.867 + return I420Scale(src_y, src_stride_y, 1.868 + src_u, src_stride_u, 1.869 + src_v, src_stride_v, 1.870 + src_width, src_height, 1.871 + dst_y, dst_stride_y, 1.872 + dst_u, dst_stride_u, 1.873 + dst_v, dst_stride_v, 1.874 + dst_width, dst_height, 1.875 + interpolate ? kFilterBox : kFilterNone); 1.876 +} 1.877 + 1.878 +// Deprecated api 1.879 +LIBYUV_API 1.880 +int ScaleOffset(const uint8* src, int src_width, int src_height, 1.881 + uint8* dst, int dst_width, int dst_height, int dst_yoffset, 1.882 + LIBYUV_BOOL interpolate) { 1.883 + if (!src || src_width <= 0 || src_height <= 0 || 1.884 + !dst || dst_width <= 0 || dst_height <= 0 || dst_yoffset < 0 || 1.885 + dst_yoffset >= dst_height) { 1.886 + return -1; 1.887 + } 1.888 + dst_yoffset = dst_yoffset & ~1; // chroma requires offset to multiple of 2. 1.889 + int src_halfwidth = SUBSAMPLE(src_width, 1, 1); 1.890 + int src_halfheight = SUBSAMPLE(src_height, 1, 1); 1.891 + int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); 1.892 + int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); 1.893 + int aheight = dst_height - dst_yoffset * 2; // actual output height 1.894 + const uint8* src_y = src; 1.895 + const uint8* src_u = src + src_width * src_height; 1.896 + const uint8* src_v = src + src_width * src_height + 1.897 + src_halfwidth * src_halfheight; 1.898 + uint8* dst_y = dst + dst_yoffset * dst_width; 1.899 + uint8* dst_u = dst + dst_width * dst_height + 1.900 + (dst_yoffset >> 1) * dst_halfwidth; 1.901 + uint8* dst_v = dst + dst_width * dst_height + dst_halfwidth * dst_halfheight + 1.902 + (dst_yoffset >> 1) * dst_halfwidth; 1.903 + return I420Scale(src_y, src_width, 1.904 + src_u, src_halfwidth, 1.905 + src_v, src_halfwidth, 1.906 + src_width, src_height, 1.907 + dst_y, dst_width, 1.908 + dst_u, dst_halfwidth, 1.909 + dst_v, dst_halfwidth, 1.910 + dst_width, aheight, 1.911 + interpolate ? kFilterBox : kFilterNone); 1.912 +} 1.913 + 1.914 +#ifdef __cplusplus 1.915 +} // extern "C" 1.916 +} // namespace libyuv 1.917 +#endif