michael@0: // Copyright (c) 2010 The Chromium Authors. All rights reserved.
michael@0: // Use of this source code is governed by a BSD-style license that can be
michael@0: // found in the LICENSE file.
michael@0: 
michael@0: #include "yuv_row.h"
michael@0: #include "mozilla/SSE.h"
michael@0: 
michael@0: #define kCoefficientsRgbU kCoefficientsRgbY + 2048  // U entries: in the asm address expressions below this is kCoefficientsRgbY plus 2048 (256 entries * 8 bytes, if the constant acts as a byte displacement there — confirm). Unparenthesised, so only safe in additive contexts.
michael@0: #define kCoefficientsRgbV kCoefficientsRgbY + 4096  // V entries: a further 2048 past the U entries, indexed the same way (8 * sample value).
michael@0: 
michael@0: extern "C" {
michael@0: 
michael@0: #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
michael@0: __declspec(naked)  // Converts one row of Y/U/V bytes to 4-byte pixels via table lookups and MMX word adds. Naked: no compiler prologue, so arguments are read directly off esp.
michael@0: void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,  // Y samples; one byte read per output pixel
michael@0:                                   const uint8* u_buf,  // U samples; one byte read per pair of output pixels
michael@0:                                   const uint8* v_buf,  // V samples; one byte read per pair of output pixels
michael@0:                                   uint8* rgb_buf,  // destination; 8 bytes stored per pixel pair, 4 for a trailing odd pixel
michael@0:                                   int width) {  // number of output pixels
michael@0:   __asm {
michael@0:     pushad  // saves the 8 general registers (32 bytes); hence the "+ 32" in every argument offset below
michael@0:     mov       edx, [esp + 32 + 4]   // edx = y_buf
michael@0:     mov       edi, [esp + 32 + 8]   // edi = u_buf
michael@0:     mov       esi, [esp + 32 + 12]  // esi = v_buf
michael@0:     mov       ebp, [esp + 32 + 16]  // ebp = rgb_buf
michael@0:     mov       ecx, [esp + 32 + 20]  // ecx = width (pixels still to produce)
michael@0:     jmp       convertend  // enter at the loop test so a width below 2 skips the pair loop
michael@0: 
michael@0:  convertloop :  // each pass produces two pixels that share one U and one V sample
michael@0:     movzx     eax, byte ptr [edi]  // eax = U
michael@0:     add       edi, 1
michael@0:     movzx     ebx, byte ptr [esi]  // ebx = V
michael@0:     add       esi, 1
michael@0:     movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = 8-byte U table entry (four 16-bit words)
michael@0:     movzx     eax, byte ptr [edx]  // eax = Y0
michael@0:     paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 += V entry (signed saturating word add): chroma term for both pixels
michael@0:     movzx     ebx, byte ptr [edx + 1]  // ebx = Y1
michael@0:     movq      mm1, [kCoefficientsRgbY + 8 * eax]  // mm1 = Y table entry for Y0
michael@0:     add       edx, 2
michael@0:     movq      mm2, [kCoefficientsRgbY + 8 * ebx]  // mm2 = Y table entry for Y1
michael@0:     paddsw    mm1, mm0  // pixel 0 = luma + shared chroma
michael@0:     paddsw    mm2, mm0  // pixel 1 = luma + shared chroma
michael@0:     psraw     mm1, 6  // arithmetic shift drops the low 6 bits of each word (presumably the tables' fixed-point fraction — confirm)
michael@0:     psraw     mm2, 6
michael@0:     packuswb  mm1, mm2  // saturate words to unsigned bytes: low dword = pixel 0, high dword = pixel 1
michael@0:     movntq    [ebp], mm1  // non-temporal 8-byte store of both pixels
michael@0:     add       ebp, 8
michael@0:  convertend :
michael@0:     sub       ecx, 2  // two pixels per pass
michael@0:     jns       convertloop  // loop while the count did not go negative
michael@0: 
michael@0:     and       ecx, 1  // odd number of pixels? (for a non-negative width ecx is -1 if one pixel is left, -2 if none)
michael@0:     jz        convertdone
michael@0: 
michael@0:     movzx     eax, byte ptr [edi]  // trailing pixel: same lookup/add/shift sequence for a single Y sample
michael@0:     movq      mm0, [kCoefficientsRgbU + 8 * eax]
michael@0:     movzx     eax, byte ptr [esi]
michael@0:     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
michael@0:     movzx     eax, byte ptr [edx]
michael@0:     movq      mm1, [kCoefficientsRgbY + 8 * eax]
michael@0:     paddsw    mm1, mm0
michael@0:     psraw     mm1, 6
michael@0:     packuswb  mm1, mm1
michael@0:     movd      [ebp], mm1  // ordinary 4-byte store of the single pixel
michael@0:  convertdone :
michael@0: 
michael@0:     popad  // restore the registers saved by pushad
michael@0:     ret  // NOTE(review): no emms is executed in this routine, so MMX state is still live on return — presumably the caller clears it; confirm
michael@0:   }
michael@0: }
michael@0: 
michael@0: __declspec(naked)  // Converts one row of Y/U/V bytes to 4-byte pixels, advancing every source pointer by a caller-supplied step. Naked: arguments are read directly off esp.
michael@0: void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,  // Y samples; advanced by step after each pixel
michael@0:                               const uint8* u_buf,  // U samples; advanced by step after each pixel pair
michael@0:                               const uint8* v_buf,  // V samples; advanced by step after each pixel pair
michael@0:                               uint8* rgb_buf,  // destination; 8 bytes stored per pixel pair, 4 for a trailing odd pixel
michael@0:                               int width,  // number of output pixels
michael@0:                               int step) {  // byte increment applied to the Y, U and V pointers
michael@0:   __asm {
michael@0:     pushad  // saves 32 bytes of registers; hence the "+ 32" in the argument offsets
michael@0:     mov       edx, [esp + 32 + 4]   // edx = y_buf
michael@0:     mov       edi, [esp + 32 + 8]   // edi = u_buf
michael@0:     mov       esi, [esp + 32 + 12]  // esi = v_buf
michael@0:     mov       ebp, [esp + 32 + 16]  // ebp = rgb_buf
michael@0:     mov       ecx, [esp + 32 + 20]  // ecx = width (pixels still to produce)
michael@0:     mov       ebx, [esp + 32 + 24]  // ebx = step, kept in a register for the whole loop
michael@0:     jmp       wend  // enter at the loop test so a width below 2 skips the pair loop
michael@0: 
michael@0:  wloop :  // each pass produces two pixels that share one U and one V sample
michael@0:     movzx     eax, byte ptr [edi]  // eax = U
michael@0:     add       edi, ebx
michael@0:     movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = 8-byte U table entry (four 16-bit words)
michael@0:     movzx     eax, byte ptr [esi]  // eax = V
michael@0:     add       esi, ebx
michael@0:     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 += V entry (signed saturating): chroma term for both pixels
michael@0:     movzx     eax, byte ptr [edx]  // eax = first Y sample
michael@0:     add       edx, ebx
michael@0:     movq      mm1, [kCoefficientsRgbY + 8 * eax]
michael@0:     movzx     eax, byte ptr [edx]  // eax = second Y sample, one step further on
michael@0:     add       edx, ebx
michael@0:     movq      mm2, [kCoefficientsRgbY + 8 * eax]
michael@0:     paddsw    mm1, mm0  // pixel 0 = luma + shared chroma
michael@0:     paddsw    mm2, mm0  // pixel 1 = luma + shared chroma
michael@0:     psraw     mm1, 6  // arithmetic shift drops the low 6 bits of each word (presumably the tables' fixed-point fraction — confirm)
michael@0:     psraw     mm2, 6
michael@0:     packuswb  mm1, mm2  // saturate words to unsigned bytes: low dword = pixel 0, high dword = pixel 1
michael@0:     movntq    [ebp], mm1  // non-temporal 8-byte store of both pixels
michael@0:     add       ebp, 8
michael@0:  wend :
michael@0:     sub       ecx, 2  // two pixels per pass
michael@0:     jns       wloop  // loop while the count did not go negative
michael@0: 
michael@0:     and       ecx, 1  // odd number of pixels? (for a non-negative width ecx is -1 if one pixel is left, -2 if none)
michael@0:     jz        wdone
michael@0: 
michael@0:     movzx     eax, byte ptr [edi]  // trailing pixel: same lookup/add/shift sequence for a single Y sample
michael@0:     movq      mm0, [kCoefficientsRgbU + 8 * eax]
michael@0:     movzx     eax, byte ptr [esi]
michael@0:     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
michael@0:     movzx     eax, byte ptr [edx]
michael@0:     movq      mm1, [kCoefficientsRgbY + 8 * eax]
michael@0:     paddsw    mm1, mm0
michael@0:     psraw     mm1, 6
michael@0:     packuswb  mm1, mm1
michael@0:     movd      [ebp], mm1  // ordinary 4-byte store of the single pixel
michael@0:  wdone :
michael@0: 
michael@0:     popad  // restore the registers saved by pushad
michael@0:     ret  // NOTE(review): no emms is executed in this routine, so MMX state is still live on return — presumably the caller clears it; confirm
michael@0:   }
michael@0: }
michael@0: 
michael@0: __declspec(naked)  // Converts one row of Y/U/V bytes to 4-byte pixels with separate pointer steps for luma and chroma. Naked: arguments are read directly off esp.
michael@0: void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,  // Y samples; advanced by ystep after each pixel
michael@0:                                     const uint8* u_buf,  // U samples; advanced by uvstep after each pixel pair
michael@0:                                     const uint8* v_buf,  // V samples; advanced by uvstep after each pixel pair
michael@0:                                     uint8* rgb_buf,  // destination; 8 bytes stored per pixel pair, 4 for a trailing odd pixel
michael@0:                                     int width,  // number of output pixels
michael@0:                                     int ystep,  // byte increment for the Y pointer
michael@0:                                     int uvstep) {  // byte increment for the U and V pointers
michael@0:   __asm {
michael@0:     pushad  // saves 32 bytes of registers; hence the "+ 32" in the argument offsets
michael@0:     mov       edx, [esp + 32 + 4]   // edx = y_buf
michael@0:     mov       edi, [esp + 32 + 8]   // edi = u_buf
michael@0:     mov       esi, [esp + 32 + 12]  // esi = v_buf
michael@0:     mov       ebp, [esp + 32 + 16]  // ebp = rgb_buf
michael@0:     mov       ecx, [esp + 32 + 20]  // ecx = width (pixels still to produce)
michael@0:     jmp       wend  // enter at the loop test so a width below 2 skips the pair loop
michael@0: 
michael@0:  wloop :  // each pass produces two pixels that share one U and one V sample
michael@0:     movzx     eax, byte ptr [edi]  // eax = U
michael@0:     mov       ebx, [esp + 32 + 28]  // ebx = uvstep (reloaded every pass: ebx is shared with ystep)
michael@0:     add       edi, ebx
michael@0:     movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = 8-byte U table entry (four 16-bit words)
michael@0:     movzx     eax, byte ptr [esi]  // eax = V
michael@0:     add       esi, ebx
michael@0:     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 += V entry (signed saturating): chroma term for both pixels
michael@0:     movzx     eax, byte ptr [edx]  // eax = first Y sample
michael@0:     mov       ebx, [esp + 32 + 24]  // ebx = ystep
michael@0:     add       edx, ebx
michael@0:     movq      mm1, [kCoefficientsRgbY + 8 * eax]
michael@0:     movzx     eax, byte ptr [edx]  // eax = second Y sample, one ystep further on
michael@0:     add       edx, ebx
michael@0:     movq      mm2, [kCoefficientsRgbY + 8 * eax]
michael@0:     paddsw    mm1, mm0  // pixel 0 = luma + shared chroma
michael@0:     paddsw    mm2, mm0  // pixel 1 = luma + shared chroma
michael@0:     psraw     mm1, 6  // arithmetic shift drops the low 6 bits of each word (presumably the tables' fixed-point fraction — confirm)
michael@0:     psraw     mm2, 6
michael@0:     packuswb  mm1, mm2  // saturate words to unsigned bytes: low dword = pixel 0, high dword = pixel 1
michael@0:     movntq    [ebp], mm1  // non-temporal 8-byte store of both pixels
michael@0:     add       ebp, 8
michael@0:  wend :
michael@0:     sub       ecx, 2  // two pixels per pass
michael@0:     jns       wloop  // loop while the count did not go negative
michael@0: 
michael@0:     and       ecx, 1  // odd number of pixels? (for a non-negative width ecx is -1 if one pixel is left, -2 if none)
michael@0:     jz        wdone
michael@0: 
michael@0:     movzx     eax, byte ptr [edi]  // trailing pixel: same lookup/add/shift sequence for a single Y sample
michael@0:     movq      mm0, [kCoefficientsRgbU + 8 * eax]
michael@0:     movzx     eax, byte ptr [esi]
michael@0:     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
michael@0:     movzx     eax, byte ptr [edx]
michael@0:     movq      mm1, [kCoefficientsRgbY + 8 * eax]
michael@0:     paddsw    mm1, mm0
michael@0:     psraw     mm1, 6
michael@0:     packuswb  mm1, mm1
michael@0:     movd      [ebp], mm1  // ordinary 4-byte store of the single pixel
michael@0:  wdone :
michael@0: 
michael@0:     popad  // restore the registers saved by pushad
michael@0:     ret  // NOTE(review): no emms is executed in this routine, so MMX state is still live on return — presumably the caller clears it; confirm
michael@0:   }
michael@0: }
michael@0: 
michael@0: __declspec(naked)  // Converts Y/U/V bytes to 4-byte pixels, writing every converted pixel twice (horizontal doubling). Naked: arguments are read directly off esp.
michael@0: void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,  // Y samples; two bytes read per loop pass
michael@0:                              const uint8* u_buf,  // U samples; one byte read per loop pass
michael@0:                              const uint8* v_buf,  // V samples; one byte read per loop pass
michael@0:                              uint8* rgb_buf,  // destination; 16 bytes (four pixels) stored per loop pass
michael@0:                              int width) {  // number of OUTPUT pixels (the loop counts it down by 4 per pass)
michael@0:   __asm {
michael@0:     pushad  // saves 32 bytes of registers; hence the "+ 32" in the argument offsets
michael@0:     mov       edx, [esp + 32 + 4]   // edx = y_buf
michael@0:     mov       edi, [esp + 32 + 8]   // edi = u_buf
michael@0:     mov       esi, [esp + 32 + 12]  // esi = v_buf
michael@0:     mov       ebp, [esp + 32 + 16]  // ebp = rgb_buf
michael@0:     mov       ecx, [esp + 32 + 20]  // ecx = width (output pixels still to produce)
michael@0:     jmp       wend  // enter at the loop test so a width below 4 skips the main loop
michael@0: 
michael@0:  wloop :  // each pass: two source Y samples sharing one U/V sample -> four output pixels
michael@0:     movzx     eax, byte ptr [edi]  // eax = U
michael@0:     add       edi, 1
michael@0:     movzx     ebx, byte ptr [esi]  // ebx = V
michael@0:     add       esi, 1
michael@0:     movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = 8-byte U table entry (four 16-bit words)
michael@0:     movzx     eax, byte ptr [edx]  // eax = Y0
michael@0:     paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 += V entry (signed saturating): shared chroma term
michael@0:     movq      mm1, [kCoefficientsRgbY + 8 * eax]
michael@0:     paddsw    mm1, mm0  // first source pixel = luma + chroma
michael@0:     psraw     mm1, 6  // arithmetic shift drops the low 6 bits of each word (presumably the tables' fixed-point fraction — confirm)
michael@0:     packuswb  mm1, mm1  // saturate words to unsigned bytes; pixel now in the low dword
michael@0:     punpckldq mm1, mm1  // copy the low dword into the high dword: two identical pixels
michael@0:     movntq    [ebp], mm1  // non-temporal 8-byte store of the doubled first pixel
michael@0: 
michael@0:     movzx     ebx, byte ptr [edx + 1]  // ebx = Y1
michael@0:     add       edx, 2
michael@0:     paddsw    mm0, [kCoefficientsRgbY + 8 * ebx]  // accumulate straight into mm0; the bare chroma sum is not needed after this
michael@0:     psraw     mm0, 6
michael@0:     packuswb  mm0, mm0
michael@0:     punpckldq mm0, mm0  // duplicate the second pixel as well
michael@0:     movntq    [ebp+8], mm0  // non-temporal 8-byte store of the doubled second pixel
michael@0:     add       ebp, 16
michael@0:  wend :
michael@0:     sub       ecx, 4  // four output pixels per pass
michael@0:     jns       wloop  // loop while the count did not go negative
michael@0: 
michael@0:     add       ecx, 4  // undo the final subtraction: ecx = leftover output pixels (0..3 for a non-negative width)
michael@0:     jz        wdone
michael@0: 
michael@0:     movzx     eax, byte ptr [edi]  // tail: convert ONE pixel from the current U, V and Y samples...
michael@0:     movq      mm0, [kCoefficientsRgbU + 8 * eax]
michael@0:     movzx     eax, byte ptr [esi]
michael@0:     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
michael@0:     movzx     eax, byte ptr [edx]
michael@0:     movq      mm1, [kCoefficientsRgbY + 8 * eax]
michael@0:     paddsw    mm1, mm0
michael@0:     psraw     mm1, 6
michael@0:     packuswb  mm1, mm1
michael@0:     jmp       wend1  // ...then store it once per leftover pixel
michael@0: 
michael@0:  wloop1 :
michael@0:     movd      [ebp], mm1  // 4-byte store; all leftover pixels are copies of this one (the next Y sample is not read in the tail)
michael@0:     add       ebp, 4
michael@0:  wend1 :
michael@0:     sub       ecx, 1
michael@0:     jns       wloop1
michael@0:  wdone :
michael@0:     popad  // restore the registers saved by pushad
michael@0:     ret  // NOTE(review): no emms is executed in this routine, so MMX state is still live on return — presumably the caller clears it; confirm
michael@0:   }
michael@0: }
michael@0: 
michael@0: // This version does general purpose scaling by any amount, up or down.
michael@0: // The only thing it cannot do is rotation by 90 or 270.
michael@0: // For performance the chroma is under-sampled, reducing cost of a 3x
michael@0: // 1080p scale from 8.4 ms to 5.4 ms.
michael@0: __declspec(naked)  // Point-samples (no blending) Y/U/V at a fixed-point position x that advances by source_dx per output pixel. Naked: arguments are read directly off esp.
michael@0: void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,  // Y samples; indexed with x >> 16
michael@0:                             const uint8* u_buf,  // U samples; indexed with x >> 17
michael@0:                             const uint8* v_buf,  // V samples; indexed with x >> 17
michael@0:                             uint8* rgb_buf,  // destination; 8 bytes stored per pixel pair, 4 for a trailing odd pixel
michael@0:                             int width,  // number of output pixels
michael@0:                             int source_dx) {  // amount added to x per output pixel (x >> 16 is the Y index, so 0x10000 steps one Y sample)
michael@0:   __asm {
michael@0:     pushad  // saves 32 bytes of registers; hence the "+ 32" in the argument offsets
michael@0:     mov       edx, [esp + 32 + 4]   // edx = y_buf
michael@0:     mov       edi, [esp + 32 + 8]   // edi = u_buf
michael@0:     mov       esi, [esp + 32 + 12]  // esi = v_buf
michael@0:     mov       ebp, [esp + 32 + 16]  // ebp = rgb_buf
michael@0:     mov       ecx, [esp + 32 + 20]  // ecx = width (pixels still to produce)
michael@0:     xor       ebx, ebx              // ebx = x = 0, the fixed-point source position
michael@0:     jmp       scaleend  // enter at the loop test so a width below 2 skips the pair loop
michael@0: 
michael@0:  scaleloop :  // each pass produces two pixels; chroma is sampled once, at the first pixel's x
michael@0:     mov       eax, ebx
michael@0:     sar       eax, 17  // chroma index = x >> 17
michael@0:     movzx     eax, byte ptr [edi + eax]  // eax = U
michael@0:     movq      mm0, [kCoefficientsRgbU + 8 * eax]  // mm0 = 8-byte U table entry (four 16-bit words)
michael@0:     mov       eax, ebx
michael@0:     sar       eax, 17
michael@0:     movzx     eax, byte ptr [esi + eax]  // eax = V
michael@0:     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 += V entry (signed saturating): chroma term for both pixels
michael@0:     mov       eax, ebx  // keep the current x for the first luma sample before advancing
michael@0:     add       ebx, [esp + 32 + 24]  // x += source_dx
michael@0:     sar       eax, 16  // luma index = x >> 16
michael@0:     movzx     eax, byte ptr [edx + eax]
michael@0:     movq      mm1, [kCoefficientsRgbY + 8 * eax]
michael@0:     mov       eax, ebx  // x for the second luma sample
michael@0:     add       ebx, [esp + 32 + 24]  // x += source_dx
michael@0:     sar       eax, 16
michael@0:     movzx     eax, byte ptr [edx + eax]
michael@0:     movq      mm2, [kCoefficientsRgbY + 8 * eax]
michael@0:     paddsw    mm1, mm0  // pixel 0 = luma + shared chroma
michael@0:     paddsw    mm2, mm0  // pixel 1 = luma + shared chroma
michael@0:     psraw     mm1, 6  // arithmetic shift drops the low 6 bits of each word (presumably the tables' fixed-point fraction — confirm)
michael@0:     psraw     mm2, 6
michael@0:     packuswb  mm1, mm2  // saturate words to unsigned bytes: low dword = pixel 0, high dword = pixel 1
michael@0:     movntq    [ebp], mm1  // non-temporal 8-byte store of both pixels
michael@0:     add       ebp, 8
michael@0:  scaleend :
michael@0:     sub       ecx, 2  // two pixels per pass
michael@0:     jns       scaleloop  // loop while the count did not go negative
michael@0: 
michael@0:     and       ecx, 1  // odd number of pixels? (for a non-negative width ecx is -1 if one pixel is left, -2 if none)
michael@0:     jz        scaledone
michael@0: 
michael@0:     mov       eax, ebx  // trailing pixel: same sampling at the current x, single luma sample
michael@0:     sar       eax, 17
michael@0:     movzx     eax, byte ptr [edi + eax]
michael@0:     movq      mm0, [kCoefficientsRgbU + 8 * eax]
michael@0:     mov       eax, ebx
michael@0:     sar       eax, 17
michael@0:     movzx     eax, byte ptr [esi + eax]
michael@0:     paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
michael@0:     mov       eax, ebx
michael@0:     sar       eax, 16
michael@0:     movzx     eax, byte ptr [edx + eax]
michael@0:     movq      mm1, [kCoefficientsRgbY + 8 * eax]
michael@0:     paddsw    mm1, mm0
michael@0:     psraw     mm1, 6
michael@0:     packuswb  mm1, mm1
michael@0:     movd      [ebp], mm1  // ordinary 4-byte store of the single pixel
michael@0: 
michael@0:  scaledone :
michael@0:     popad  // restore the registers saved by pushad
michael@0:     ret  // NOTE(review): no emms is executed in this routine, so MMX state is still live on return — presumably the caller clears it; confirm
michael@0:   }
michael@0: }
michael@0: 
michael@0: __declspec(naked)  // Scales a row while blending each sample with its right-hand neighbour, weighted by the fractional part of the fixed-point position x. Naked: arguments are read directly off esp.
michael@0: void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,  // Y samples; reads indices x >> 16 and (x >> 16) + 1
michael@0:                                   const uint8* u_buf,  // U samples; reads indices x >> 17 and (x >> 17) + 1
michael@0:                                   const uint8* v_buf,  // V samples; reads indices x >> 17 and (x >> 17) + 1
michael@0:                                   uint8* rgb_buf,  // destination; 8 bytes stored per pixel pair, 4 for a final single pixel
michael@0:                                   int width,  // number of output pixels; its stack slot is overwritten below
michael@0:                                   int source_dx) {  // amount added to x per output pixel (x >> 16 is the Y index)
michael@0:   __asm {
michael@0:     pushad  // saves 32 bytes of registers; hence the "+ 32" in the argument offsets
michael@0:     mov       edx, [esp + 32 + 4]  // edx = y_buf
michael@0:     mov       edi, [esp + 32 + 8]  // edi = u_buf
michael@0:                 // [esp + 32 + 12] // V (not held in a register; reloaded into esi inside the loop)
michael@0:     mov       ebp, [esp + 32 + 16] // ebp = rgb_buf
michael@0:     mov       ecx, [esp + 32 + 20] // width
michael@0:     imul      ecx, [esp + 32 + 24] // source_dx (32-bit multiply; overflow is not checked)
michael@0:     mov       [esp + 32 + 20], ecx // source_width = width * source_dx; replaces the width argument in its own stack slot
michael@0:     mov       ecx, [esp + 32 + 24] // source_dx
michael@0:     xor       ebx, ebx             // x = 0
michael@0:     cmp       ecx, 0x20000  // signed compare of source_dx against 0x20000
michael@0:     jl        lscaleend  // below that, start sampling at x = 0
michael@0:     mov       ebx, 0x8000          // x = 0.5 for 1/2 or less
michael@0:     jmp       lscaleend
michael@0: lscaleloop:
michael@0:     mov       eax, ebx
michael@0:     sar       eax, 0x11  // chroma index = x >> 17
michael@0: 
michael@0:     movzx     ecx, byte ptr [edi + eax]  // ecx = U[i]
michael@0:     movzx     esi, byte ptr [edi + eax + 1]  // esi = U[i + 1] — NOTE(review): reads one sample past index i; presumably the caller guarantees it is readable
michael@0:     mov       eax, ebx
michael@0:     and       eax, 0x1fffe  // eax = fraction of x within the chroma sample (bit 0 dropped): weight of U[i + 1]
michael@0:     imul      esi, eax
michael@0:     xor       eax, 0x1fffe  // flips the masked bits, i.e. 0x1fffe - fraction: weight of U[i]
michael@0:     imul      ecx, eax
michael@0:     add       ecx, esi
michael@0:     shr       ecx, 17  // blended U, used as the table index
michael@0:     movq      mm0, [kCoefficientsRgbU + 8 * ecx]  // mm0 = 8-byte U table entry (four 16-bit words)
michael@0: 
michael@0:     mov       esi, [esp + 32 + 12]  // esi = v_buf (esi doubles as scratch, so the pointer is reloaded each pass)
michael@0:     mov       eax, ebx
michael@0:     sar       eax, 0x11
michael@0: 
michael@0:     movzx     ecx, byte ptr [esi + eax]  // ecx = V[i]
michael@0:     movzx     esi, byte ptr [esi + eax + 1]  // esi = V[i + 1] (overwrites the pointer)
michael@0:     mov       eax, ebx
michael@0:     and       eax, 0x1fffe
michael@0:     imul      esi, eax
michael@0:     xor       eax, 0x1fffe
michael@0:     imul      ecx, eax
michael@0:     add       ecx, esi
michael@0:     shr       ecx, 17  // blended V
michael@0:     paddsw    mm0, [kCoefficientsRgbV + 8 * ecx]  // mm0 += V entry (signed saturating): chroma term for this pass
michael@0: 
michael@0:     mov       eax, ebx
michael@0:     sar       eax, 0x10  // luma index = x >> 16
michael@0:     movzx     ecx, byte ptr [edx + eax]  // ecx = Y[j]
michael@0:     movzx     esi, byte ptr [1 + edx + eax]  // esi = Y[j + 1]
michael@0:     mov       eax, ebx  // keep the old x for the fraction...
michael@0:     add       ebx, [esp + 32 + 24]  // ...then x += source_dx
michael@0:     and       eax, 0xffff  // low 16 bits of x: weight of Y[j + 1]
michael@0:     imul      esi, eax
michael@0:     xor       eax, 0xffff  // 0xffff - fraction: weight of Y[j]
michael@0:     imul      ecx, eax
michael@0:     add       ecx, esi
michael@0:     shr       ecx, 16  // blended Y for the first pixel
michael@0:     movq      mm1, [kCoefficientsRgbY + 8 * ecx]
michael@0: 
michael@0:     cmp       ebx, [esp + 32 + 20]  // has x reached source_width?
michael@0:     jge       lscalelastpixel  // yes: only the first pixel of this pair is emitted
michael@0: 
michael@0:     mov       eax, ebx
michael@0:     sar       eax, 0x10
michael@0:     movzx     ecx, byte ptr [edx + eax]
michael@0:     movzx     esi, byte ptr [edx + eax + 1]
michael@0:     mov       eax, ebx
michael@0:     add       ebx, [esp + 32 + 24]  // x += source_dx
michael@0:     and       eax, 0xffff
michael@0:     imul      esi, eax
michael@0:     xor       eax, 0xffff
michael@0:     imul      ecx, eax
michael@0:     add       ecx, esi
michael@0:     shr       ecx, 16  // blended Y for the second pixel
michael@0:     movq      mm2, [kCoefficientsRgbY + 8 * ecx]
michael@0: 
michael@0:     paddsw    mm1, mm0  // pixel 0 = luma + shared chroma
michael@0:     paddsw    mm2, mm0  // pixel 1 = luma + shared chroma
michael@0:     psraw     mm1, 0x6  // arithmetic shift drops the low 6 bits of each word (presumably the tables' fixed-point fraction — confirm)
michael@0:     psraw     mm2, 0x6
michael@0:     packuswb  mm1, mm2  // saturate words to unsigned bytes: low dword = pixel 0, high dword = pixel 1
michael@0:     movntq    [ebp], mm1  // non-temporal 8-byte store of both pixels
michael@0:     add       ebp, 0x8
michael@0: 
michael@0: lscaleend:
michael@0:     cmp       ebx, [esp + 32 + 20]  // loop while x < source_width (signed compare)
michael@0:     jl        lscaleloop
michael@0:     popad  // restore the registers saved by pushad
michael@0:     ret  // NOTE(review): no emms is executed in this routine, so MMX state is still live on return — presumably the caller clears it; confirm
michael@0: 
michael@0: lscalelastpixel:
michael@0:     paddsw    mm1, mm0  // finish the single remaining pixel
michael@0:     psraw     mm1, 6
michael@0:     packuswb  mm1, mm1
michael@0:     movd      [ebp], mm1  // ordinary 4-byte store of the single pixel
michael@0:     popad
michael@0:     ret
michael@0:   };
michael@0: }
michael@0: #endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
michael@0: 
michael@0: // Row conversion entry point. Uses the hand-written SSE routine when it is
michael@0: // compiled in and the running CPU reports SSE support; in every other case
michael@0: // the portable C routine is called with 1 as its trailing argument.
michael@0: void FastConvertYUVToRGB32Row(const uint8* y_buf,
michael@0:                               const uint8* u_buf,
michael@0:                               const uint8* v_buf,
michael@0:                               uint8* rgb_buf,
michael@0:                               int width) {
michael@0:   bool converted = false;
michael@0: 
michael@0: #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
michael@0:   // The CPU check happens at run time; the build flag only says SSE may exist.
michael@0:   if (mozilla::supports_sse()) {
michael@0:     FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
michael@0:     converted = true;
michael@0:   }
michael@0: #endif
michael@0: 
michael@0:   if (!converted) {
michael@0:     FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
michael@0:   }
michael@0: }
michael@0: 
michael@0: // Scaling row conversion entry point. Uses the hand-written SSE routine when
michael@0: // it is compiled in and the running CPU reports SSE support; in every other
michael@0: // case the portable C routine handles the row.
michael@0: void ScaleYUVToRGB32Row(const uint8* y_buf,
michael@0:                         const uint8* u_buf,
michael@0:                         const uint8* v_buf,
michael@0:                         uint8* rgb_buf,
michael@0:                         int width,
michael@0:                         int source_dx) {
michael@0:   bool converted = false;
michael@0: 
michael@0: #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
michael@0:   // The CPU check happens at run time; the build flag only says SSE may exist.
michael@0:   if (mozilla::supports_sse()) {
michael@0:     ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
michael@0:     converted = true;
michael@0:   }
michael@0: #endif
michael@0: 
michael@0:   if (!converted) {
michael@0:     ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
michael@0:   }
michael@0: }
michael@0: 
michael@0: // Filtered-scaling row conversion entry point. Uses the hand-written SSE
michael@0: // routine when it is compiled in and the running CPU reports SSE support; in
michael@0: // every other case the portable C routine handles the row.
michael@0: void LinearScaleYUVToRGB32Row(const uint8* y_buf,
michael@0:                               const uint8* u_buf,
michael@0:                               const uint8* v_buf,
michael@0:                               uint8* rgb_buf,
michael@0:                               int width,
michael@0:                               int source_dx) {
michael@0:   bool converted = false;
michael@0: 
michael@0: #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
michael@0:   // The CPU check happens at run time; the build flag only says SSE may exist.
michael@0:   if (mozilla::supports_sse()) {
michael@0:     LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
michael@0:                                  source_dx);
michael@0:     converted = true;
michael@0:   }
michael@0: #endif
michael@0: 
michael@0:   if (!converted) {
michael@0:     LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
michael@0:   }
michael@0: }
michael@0: 
michael@0: } // extern "C"