gfx/ycbcr/win64.patch

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Thu, 22 Jan 2015 13:21:57 +0100
branch       TOR_BUG_9701
changeset    15:b8a032363ba2
permissions  -rw-r--r--

Incorporate the changes requested in Mozilla's review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

diff --git a/gfx/ycbcr/yuv_row_win64.cpp b/gfx/ycbcr/yuv_row_win64.cpp
new file mode 100644
--- /dev/null
+++ b/gfx/ycbcr/yuv_row_win64.cpp
@@ -0,0 +1,205 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "yuv_row.h"
+
+extern "C" {
+
+// The x64 compiler doesn't support MMX or inline assembly; use SSE2 intrinsics.
+
+#define kCoefficientsRgbU (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 2048)
+#define kCoefficientsRgbV (reinterpret_cast<uint8*>(kCoefficientsRgbY) + 4096)
+
+#include <emmintrin.h>
+
+static void FastConvertYUVToRGB32Row_SSE2(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* rgb_buf,
+                                          int width) {
+  __m128i xmm0, xmmY1, xmmY2;
+  __m128  xmmY;
+
+  while (width >= 2) {
+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));
+
+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+
+    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf++));
+    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
+
+    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
+                          0x44);
+    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
+    rgb_buf += 8;
+    width -= 2;
+  }
+
+  if (width) {
+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * *y_buf));
+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+    xmmY1 = _mm_srai_epi16(xmmY1, 6);
+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
+  }
+}
+
+static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
+                                    const uint8* u_buf,
+                                    const uint8* v_buf,
+                                    uint8* rgb_buf,
+                                    int width,
+                                    int source_dx) {
+  __m128i xmm0, xmmY1, xmmY2;
+  __m128  xmmY;
+  uint8 u, v, y;
+  int x = 0;
+
+  while (width >= 2) {
+    u = u_buf[x >> 17];
+    v = v_buf[x >> 17];
+    y = y_buf[x >> 16];
+    x += source_dx;
+
+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+
+    y = y_buf[x >> 16];
+    x += source_dx;
+
+    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
+    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
+
+    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
+                          0x44);
+    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
+    rgb_buf += 8;
+    width -= 2;
+  }
+
+  if (width) {
+    u = u_buf[x >> 17];
+    v = v_buf[x >> 17];
+    y = y_buf[x >> 16];
+
+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+    xmmY1 = _mm_srai_epi16(xmmY1, 6);
+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
+  }
+}
+
+static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
+                                          const uint8* u_buf,
+                                          const uint8* v_buf,
+                                          uint8* rgb_buf,
+                                          int width,
+                                          int source_dx) {
+  __m128i xmm0, xmmY1, xmmY2;
+  __m128  xmmY;
+  uint8 u0, u1, v0, v1, y0, y1;
+  uint32 uv_frac, y_frac, u, v, y;
+  int x = 0;
+
+  if (source_dx >= 0x20000) {
+    x = 32768;
+  }
+
+  while(width >= 2) {
+    u0 = u_buf[x >> 17];
+    u1 = u_buf[(x >> 17) + 1];
+    v0 = v_buf[x >> 17];
+    v1 = v_buf[(x >> 17) + 1];
+    y0 = y_buf[x >> 16];
+    y1 = y_buf[(x >> 16) + 1];
+    uv_frac = (x & 0x1fffe);
+    y_frac = (x & 0xffff);
+    u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;
+    v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
+    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
+    x += source_dx;
+
+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+
+    y0 = y_buf[x >> 16];
+    y1 = y_buf[(x >> 16) + 1];
+    y_frac = (x & 0xffff);
+    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
+    x += source_dx;
+
+    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
+    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
+
+    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
+                          0x44);
+    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+
+    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
+    rgb_buf += 8;
+    width -= 2;
+  }
+
+  if (width) {
+    u = u_buf[x >> 17];
+    v = v_buf[x >> 17];
+    y = y_buf[x >> 16];
+
+    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
+                          _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
+    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
+
+    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
+    xmmY1 = _mm_srai_epi16(xmmY1, 6);
+    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
+    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
+  }
+}
+
+void FastConvertYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width) {
+  FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);
+}
+
+void ScaleYUVToRGB32Row(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* rgb_buf,
+                        int width,
+                        int source_dx) {
+  ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
+}
+
+void LinearScaleYUVToRGB32Row(const uint8* y_buf,
+                              const uint8* u_buf,
+                              const uint8* v_buf,
+                              uint8* rgb_buf,
+                              int width,
+                              int source_dx) {
+  LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,
+                                source_dx);
+}
+
+} // extern "C"
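For orientation, a minimal caller sketch (not part of the patch) of how the exported FastConvertYUVToRGB32Row function above could be driven over a full I420 (YUV 4:2:0) frame. The ConvertI420ToARGB helper name and the assumption of tightly packed planes are illustrative only.

#include "yuv_row.h"  // declares FastConvertYUVToRGB32Row and the uint8 typedef

// Hypothetical helper: convert a planar I420 frame to 32-bit RGB output one
// row at a time using the SSE2 row kernel added above. Assumes tightly packed
// planes, i.e. the Y stride equals width and the U/V strides equal width / 2.
static void ConvertI420ToARGB(const uint8* y_plane,
                              const uint8* u_plane,
                              const uint8* v_plane,
                              uint8* rgb_out,
                              int width,
                              int height) {
  for (int row = 0; row < height; ++row) {
    // U and V are subsampled by two both horizontally and vertically.
    const uint8* y_row = y_plane + row * width;
    const uint8* u_row = u_plane + (row / 2) * (width / 2);
    const uint8* v_row = v_plane + (row / 2) * (width / 2);
    uint8* out_row = rgb_out + row * width * 4;  // 4 bytes per output pixel
    FastConvertYUVToRGB32Row(y_row, u_row, v_row, out_row, width);
  }
}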
