michael@0: // Copyright (c) 2011 The Chromium Authors. All rights reserved. michael@0: // Use of this source code is governed by a BSD-style license that can be michael@0: // found in the LICENSE file. michael@0: michael@0: #include "SkConvolver.h" michael@0: #include "SkSize.h" michael@0: #include "SkTypes.h" michael@0: michael@0: namespace { michael@0: michael@0: // Converts the argument to an 8-bit unsigned value by clamping to the range michael@0: // 0-255. michael@0: inline unsigned char ClampTo8(int a) { michael@0: if (static_cast(a) < 256) { michael@0: return a; // Avoid the extra check in the common case. michael@0: } michael@0: if (a < 0) { michael@0: return 0; michael@0: } michael@0: return 255; michael@0: } michael@0: michael@0: // Stores a list of rows in a circular buffer. The usage is you write into it michael@0: // by calling AdvanceRow. It will keep track of which row in the buffer it michael@0: // should use next, and the total number of rows added. michael@0: class CircularRowBuffer { michael@0: public: michael@0: // The number of pixels in each row is given in |sourceRowPixelWidth|. michael@0: // The maximum number of rows needed in the buffer is |maxYFilterSize| michael@0: // (we only need to store enough rows for the biggest filter). michael@0: // michael@0: // We use the |firstInputRow| to compute the coordinates of all of the michael@0: // following rows returned by Advance(). michael@0: CircularRowBuffer(int destRowPixelWidth, int maxYFilterSize, michael@0: int firstInputRow) michael@0: : fRowByteWidth(destRowPixelWidth * 4), michael@0: fNumRows(maxYFilterSize), michael@0: fNextRow(0), michael@0: fNextRowCoordinate(firstInputRow) { michael@0: fBuffer.reset(fRowByteWidth * maxYFilterSize); michael@0: fRowAddresses.reset(fNumRows); michael@0: } michael@0: michael@0: // Moves to the next row in the buffer, returning a pointer to the beginning michael@0: // of it. michael@0: unsigned char* advanceRow() { michael@0: unsigned char* row = &fBuffer[fNextRow * fRowByteWidth]; michael@0: fNextRowCoordinate++; michael@0: michael@0: // Set the pointer to the next row to use, wrapping around if necessary. michael@0: fNextRow++; michael@0: if (fNextRow == fNumRows) { michael@0: fNextRow = 0; michael@0: } michael@0: return row; michael@0: } michael@0: michael@0: // Returns a pointer to an "unrolled" array of rows. These rows will start michael@0: // at the y coordinate placed into |*firstRowIndex| and will continue in michael@0: // order for the maximum number of rows in this circular buffer. michael@0: // michael@0: // The |firstRowIndex_| may be negative. This means the circular buffer michael@0: // starts before the top of the image (it hasn't been filled yet). michael@0: unsigned char* const* GetRowAddresses(int* firstRowIndex) { michael@0: // Example for a 4-element circular buffer holding coords 6-9. michael@0: // Row 0 Coord 8 michael@0: // Row 1 Coord 9 michael@0: // Row 2 Coord 6 <- fNextRow = 2, fNextRowCoordinate = 10. michael@0: // Row 3 Coord 7 michael@0: // michael@0: // The "next" row is also the first (lowest) coordinate. This computation michael@0: // may yield a negative value, but that's OK, the math will work out michael@0: // since the user of this buffer will compute the offset relative michael@0: // to the firstRowIndex and the negative rows will never be used. michael@0: *firstRowIndex = fNextRowCoordinate - fNumRows; michael@0: michael@0: int curRow = fNextRow; michael@0: for (int i = 0; i < fNumRows; i++) { michael@0: fRowAddresses[i] = &fBuffer[curRow * fRowByteWidth]; michael@0: michael@0: // Advance to the next row, wrapping if necessary. michael@0: curRow++; michael@0: if (curRow == fNumRows) { michael@0: curRow = 0; michael@0: } michael@0: } michael@0: return &fRowAddresses[0]; michael@0: } michael@0: michael@0: private: michael@0: // The buffer storing the rows. They are packed, each one fRowByteWidth. michael@0: SkTArray fBuffer; michael@0: michael@0: // Number of bytes per row in the |buffer|. michael@0: int fRowByteWidth; michael@0: michael@0: // The number of rows available in the buffer. michael@0: int fNumRows; michael@0: michael@0: // The next row index we should write into. This wraps around as the michael@0: // circular buffer is used. michael@0: int fNextRow; michael@0: michael@0: // The y coordinate of the |fNextRow|. This is incremented each time a michael@0: // new row is appended and does not wrap. michael@0: int fNextRowCoordinate; michael@0: michael@0: // Buffer used by GetRowAddresses(). michael@0: SkTArray fRowAddresses; michael@0: }; michael@0: michael@0: // Convolves horizontally along a single row. The row data is given in michael@0: // |srcData| and continues for the numValues() of the filter. michael@0: template michael@0: void ConvolveHorizontally(const unsigned char* srcData, michael@0: const SkConvolutionFilter1D& filter, michael@0: unsigned char* outRow) { michael@0: // Loop over each pixel on this row in the output image. michael@0: int numValues = filter.numValues(); michael@0: for (int outX = 0; outX < numValues; outX++) { michael@0: // Get the filter that determines the current output pixel. michael@0: int filterOffset, filterLength; michael@0: const SkConvolutionFilter1D::ConvolutionFixed* filterValues = michael@0: filter.FilterForValue(outX, &filterOffset, &filterLength); michael@0: michael@0: // Compute the first pixel in this row that the filter affects. It will michael@0: // touch |filterLength| pixels (4 bytes each) after this. michael@0: const unsigned char* rowToFilter = &srcData[filterOffset * 4]; michael@0: michael@0: // Apply the filter to the row to get the destination pixel in |accum|. michael@0: int accum[4] = {0}; michael@0: for (int filterX = 0; filterX < filterLength; filterX++) { michael@0: SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterX]; michael@0: accum[0] += curFilter * rowToFilter[filterX * 4 + 0]; michael@0: accum[1] += curFilter * rowToFilter[filterX * 4 + 1]; michael@0: accum[2] += curFilter * rowToFilter[filterX * 4 + 2]; michael@0: if (hasAlpha) { michael@0: accum[3] += curFilter * rowToFilter[filterX * 4 + 3]; michael@0: } michael@0: } michael@0: michael@0: // Bring this value back in range. All of the filter scaling factors michael@0: // are in fixed point with kShiftBits bits of fractional part. michael@0: accum[0] >>= SkConvolutionFilter1D::kShiftBits; michael@0: accum[1] >>= SkConvolutionFilter1D::kShiftBits; michael@0: accum[2] >>= SkConvolutionFilter1D::kShiftBits; michael@0: if (hasAlpha) { michael@0: accum[3] >>= SkConvolutionFilter1D::kShiftBits; michael@0: } michael@0: michael@0: // Store the new pixel. michael@0: outRow[outX * 4 + 0] = ClampTo8(accum[0]); michael@0: outRow[outX * 4 + 1] = ClampTo8(accum[1]); michael@0: outRow[outX * 4 + 2] = ClampTo8(accum[2]); michael@0: if (hasAlpha) { michael@0: outRow[outX * 4 + 3] = ClampTo8(accum[3]); michael@0: } michael@0: } michael@0: } michael@0: michael@0: // Does vertical convolution to produce one output row. The filter values and michael@0: // length are given in the first two parameters. These are applied to each michael@0: // of the rows pointed to in the |sourceDataRows| array, with each row michael@0: // being |pixelWidth| wide. michael@0: // michael@0: // The output must have room for |pixelWidth * 4| bytes. michael@0: template michael@0: void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, michael@0: int filterLength, michael@0: unsigned char* const* sourceDataRows, michael@0: int pixelWidth, michael@0: unsigned char* outRow) { michael@0: // We go through each column in the output and do a vertical convolution, michael@0: // generating one output pixel each time. michael@0: for (int outX = 0; outX < pixelWidth; outX++) { michael@0: // Compute the number of bytes over in each row that the current column michael@0: // we're convolving starts at. The pixel will cover the next 4 bytes. michael@0: int byteOffset = outX * 4; michael@0: michael@0: // Apply the filter to one column of pixels. michael@0: int accum[4] = {0}; michael@0: for (int filterY = 0; filterY < filterLength; filterY++) { michael@0: SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterY]; michael@0: accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0]; michael@0: accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1]; michael@0: accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2]; michael@0: if (hasAlpha) { michael@0: accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3]; michael@0: } michael@0: } michael@0: michael@0: // Bring this value back in range. All of the filter scaling factors michael@0: // are in fixed point with kShiftBits bits of precision. michael@0: accum[0] >>= SkConvolutionFilter1D::kShiftBits; michael@0: accum[1] >>= SkConvolutionFilter1D::kShiftBits; michael@0: accum[2] >>= SkConvolutionFilter1D::kShiftBits; michael@0: if (hasAlpha) { michael@0: accum[3] >>= SkConvolutionFilter1D::kShiftBits; michael@0: } michael@0: michael@0: // Store the new pixel. michael@0: outRow[byteOffset + 0] = ClampTo8(accum[0]); michael@0: outRow[byteOffset + 1] = ClampTo8(accum[1]); michael@0: outRow[byteOffset + 2] = ClampTo8(accum[2]); michael@0: if (hasAlpha) { michael@0: unsigned char alpha = ClampTo8(accum[3]); michael@0: michael@0: // Make sure the alpha channel doesn't come out smaller than any of the michael@0: // color channels. We use premultipled alpha channels, so this should michael@0: // never happen, but rounding errors will cause this from time to time. michael@0: // These "impossible" colors will cause overflows (and hence random pixel michael@0: // values) when the resulting bitmap is drawn to the screen. michael@0: // michael@0: // We only need to do this when generating the final output row (here). michael@0: int maxColorChannel = SkTMax(outRow[byteOffset + 0], michael@0: SkTMax(outRow[byteOffset + 1], michael@0: outRow[byteOffset + 2])); michael@0: if (alpha < maxColorChannel) { michael@0: outRow[byteOffset + 3] = maxColorChannel; michael@0: } else { michael@0: outRow[byteOffset + 3] = alpha; michael@0: } michael@0: } else { michael@0: // No alpha channel, the image is opaque. michael@0: outRow[byteOffset + 3] = 0xff; michael@0: } michael@0: } michael@0: } michael@0: michael@0: void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues, michael@0: int filterLength, michael@0: unsigned char* const* sourceDataRows, michael@0: int pixelWidth, michael@0: unsigned char* outRow, michael@0: bool sourceHasAlpha) { michael@0: if (sourceHasAlpha) { michael@0: ConvolveVertically(filterValues, filterLength, michael@0: sourceDataRows, pixelWidth, michael@0: outRow); michael@0: } else { michael@0: ConvolveVertically(filterValues, filterLength, michael@0: sourceDataRows, pixelWidth, michael@0: outRow); michael@0: } michael@0: } michael@0: michael@0: } // namespace michael@0: michael@0: // SkConvolutionFilter1D --------------------------------------------------------- michael@0: michael@0: SkConvolutionFilter1D::SkConvolutionFilter1D() michael@0: : fMaxFilter(0) { michael@0: } michael@0: michael@0: SkConvolutionFilter1D::~SkConvolutionFilter1D() { michael@0: } michael@0: michael@0: void SkConvolutionFilter1D::AddFilter(int filterOffset, michael@0: const float* filterValues, michael@0: int filterLength) { michael@0: SkASSERT(filterLength > 0); michael@0: michael@0: SkTArray fixedValues; michael@0: fixedValues.reset(filterLength); michael@0: michael@0: for (int i = 0; i < filterLength; ++i) { michael@0: fixedValues.push_back(FloatToFixed(filterValues[i])); michael@0: } michael@0: michael@0: AddFilter(filterOffset, &fixedValues[0], filterLength); michael@0: } michael@0: michael@0: void SkConvolutionFilter1D::AddFilter(int filterOffset, michael@0: const ConvolutionFixed* filterValues, michael@0: int filterLength) { michael@0: // It is common for leading/trailing filter values to be zeros. In such michael@0: // cases it is beneficial to only store the central factors. michael@0: // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on michael@0: // a 1080p image this optimization gives a ~10% speed improvement. michael@0: int filterSize = filterLength; michael@0: int firstNonZero = 0; michael@0: while (firstNonZero < filterLength && filterValues[firstNonZero] == 0) { michael@0: firstNonZero++; michael@0: } michael@0: michael@0: if (firstNonZero < filterLength) { michael@0: // Here we have at least one non-zero factor. michael@0: int lastNonZero = filterLength - 1; michael@0: while (lastNonZero >= 0 && filterValues[lastNonZero] == 0) { michael@0: lastNonZero--; michael@0: } michael@0: michael@0: filterOffset += firstNonZero; michael@0: filterLength = lastNonZero + 1 - firstNonZero; michael@0: SkASSERT(filterLength > 0); michael@0: michael@0: for (int i = firstNonZero; i <= lastNonZero; i++) { michael@0: fFilterValues.push_back(filterValues[i]); michael@0: } michael@0: } else { michael@0: // Here all the factors were zeroes. michael@0: filterLength = 0; michael@0: } michael@0: michael@0: FilterInstance instance; michael@0: michael@0: // We pushed filterLength elements onto fFilterValues michael@0: instance.fDataLocation = (static_cast(fFilterValues.count()) - michael@0: filterLength); michael@0: instance.fOffset = filterOffset; michael@0: instance.fTrimmedLength = filterLength; michael@0: instance.fLength = filterSize; michael@0: fFilters.push_back(instance); michael@0: michael@0: fMaxFilter = SkTMax(fMaxFilter, filterLength); michael@0: } michael@0: michael@0: const SkConvolutionFilter1D::ConvolutionFixed* SkConvolutionFilter1D::GetSingleFilter( michael@0: int* specifiedFilterlength, michael@0: int* filterOffset, michael@0: int* filterLength) const { michael@0: const FilterInstance& filter = fFilters[0]; michael@0: *filterOffset = filter.fOffset; michael@0: *filterLength = filter.fTrimmedLength; michael@0: *specifiedFilterlength = filter.fLength; michael@0: if (filter.fTrimmedLength == 0) { michael@0: return NULL; michael@0: } michael@0: michael@0: return &fFilterValues[filter.fDataLocation]; michael@0: } michael@0: michael@0: void BGRAConvolve2D(const unsigned char* sourceData, michael@0: int sourceByteRowStride, michael@0: bool sourceHasAlpha, michael@0: const SkConvolutionFilter1D& filterX, michael@0: const SkConvolutionFilter1D& filterY, michael@0: int outputByteRowStride, michael@0: unsigned char* output, michael@0: const SkConvolutionProcs& convolveProcs, michael@0: bool useSimdIfPossible) { michael@0: michael@0: int maxYFilterSize = filterY.maxFilter(); michael@0: michael@0: // The next row in the input that we will generate a horizontally michael@0: // convolved row for. If the filter doesn't start at the beginning of the michael@0: // image (this is the case when we are only resizing a subset), then we michael@0: // don't want to generate any output rows before that. Compute the starting michael@0: // row for convolution as the first pixel for the first vertical filter. michael@0: int filterOffset, filterLength; michael@0: const SkConvolutionFilter1D::ConvolutionFixed* filterValues = michael@0: filterY.FilterForValue(0, &filterOffset, &filterLength); michael@0: int nextXRow = filterOffset; michael@0: michael@0: // We loop over each row in the input doing a horizontal convolution. This michael@0: // will result in a horizontally convolved image. We write the results into michael@0: // a circular buffer of convolved rows and do vertical convolution as rows michael@0: // are available. This prevents us from having to store the entire michael@0: // intermediate image and helps cache coherency. michael@0: // We will need four extra rows to allow horizontal convolution could be done michael@0: // simultaneously. We also pad each row in row buffer to be aligned-up to michael@0: // 16 bytes. michael@0: // TODO(jiesun): We do not use aligned load from row buffer in vertical michael@0: // convolution pass yet. Somehow Windows does not like it. michael@0: int rowBufferWidth = (filterX.numValues() + 15) & ~0xF; michael@0: int rowBufferHeight = maxYFilterSize + michael@0: (convolveProcs.fConvolve4RowsHorizontally ? 4 : 0); michael@0: CircularRowBuffer rowBuffer(rowBufferWidth, michael@0: rowBufferHeight, michael@0: filterOffset); michael@0: michael@0: // Loop over every possible output row, processing just enough horizontal michael@0: // convolutions to run each subsequent vertical convolution. michael@0: SkASSERT(outputByteRowStride >= filterX.numValues() * 4); michael@0: int numOutputRows = filterY.numValues(); michael@0: michael@0: // We need to check which is the last line to convolve before we advance 4 michael@0: // lines in one iteration. michael@0: int lastFilterOffset, lastFilterLength; michael@0: michael@0: // SSE2 can access up to 3 extra pixels past the end of the michael@0: // buffer. At the bottom of the image, we have to be careful michael@0: // not to access data past the end of the buffer. Normally michael@0: // we fall back to the C++ implementation for the last row. michael@0: // If the last row is less than 3 pixels wide, we may have to fall michael@0: // back to the C++ version for more rows. Compute how many michael@0: // rows we need to avoid the SSE implementation for here. michael@0: filterX.FilterForValue(filterX.numValues() - 1, &lastFilterOffset, michael@0: &lastFilterLength); michael@0: int avoidSimdRows = 1 + convolveProcs.fExtraHorizontalReads / michael@0: (lastFilterOffset + lastFilterLength); michael@0: michael@0: filterY.FilterForValue(numOutputRows - 1, &lastFilterOffset, michael@0: &lastFilterLength); michael@0: michael@0: for (int outY = 0; outY < numOutputRows; outY++) { michael@0: filterValues = filterY.FilterForValue(outY, michael@0: &filterOffset, &filterLength); michael@0: michael@0: // Generate output rows until we have enough to run the current filter. michael@0: while (nextXRow < filterOffset + filterLength) { michael@0: if (convolveProcs.fConvolve4RowsHorizontally && michael@0: nextXRow + 3 < lastFilterOffset + lastFilterLength - michael@0: avoidSimdRows) { michael@0: const unsigned char* src[4]; michael@0: unsigned char* outRow[4]; michael@0: for (int i = 0; i < 4; ++i) { michael@0: src[i] = &sourceData[(nextXRow + i) * sourceByteRowStride]; michael@0: outRow[i] = rowBuffer.advanceRow(); michael@0: } michael@0: convolveProcs.fConvolve4RowsHorizontally(src, filterX, outRow); michael@0: nextXRow += 4; michael@0: } else { michael@0: // Check if we need to avoid SSE2 for this row. michael@0: if (convolveProcs.fConvolveHorizontally && michael@0: nextXRow < lastFilterOffset + lastFilterLength - michael@0: avoidSimdRows) { michael@0: convolveProcs.fConvolveHorizontally( michael@0: &sourceData[nextXRow * sourceByteRowStride], michael@0: filterX, rowBuffer.advanceRow(), sourceHasAlpha); michael@0: } else { michael@0: if (sourceHasAlpha) { michael@0: ConvolveHorizontally( michael@0: &sourceData[nextXRow * sourceByteRowStride], michael@0: filterX, rowBuffer.advanceRow()); michael@0: } else { michael@0: ConvolveHorizontally( michael@0: &sourceData[nextXRow * sourceByteRowStride], michael@0: filterX, rowBuffer.advanceRow()); michael@0: } michael@0: } michael@0: nextXRow++; michael@0: } michael@0: } michael@0: michael@0: // Compute where in the output image this row of final data will go. michael@0: unsigned char* curOutputRow = &output[outY * outputByteRowStride]; michael@0: michael@0: // Get the list of rows that the circular buffer has, in order. michael@0: int firstRowInCircularBuffer; michael@0: unsigned char* const* rowsToConvolve = michael@0: rowBuffer.GetRowAddresses(&firstRowInCircularBuffer); michael@0: michael@0: // Now compute the start of the subset of those rows that the filter michael@0: // needs. michael@0: unsigned char* const* firstRowForFilter = michael@0: &rowsToConvolve[filterOffset - firstRowInCircularBuffer]; michael@0: michael@0: if (convolveProcs.fConvolveVertically) { michael@0: convolveProcs.fConvolveVertically(filterValues, filterLength, michael@0: firstRowForFilter, michael@0: filterX.numValues(), curOutputRow, michael@0: sourceHasAlpha); michael@0: } else { michael@0: ConvolveVertically(filterValues, filterLength, michael@0: firstRowForFilter, michael@0: filterX.numValues(), curOutputRow, michael@0: sourceHasAlpha); michael@0: } michael@0: } michael@0: }