media/libtheora/lib/x86/sse2trans.h

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /********************************************************************
     2  *                                                                  *
     3  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
     4  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
     5  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
     6  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
     7  *                                                                  *
     8  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
     9  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
    10  *                                                                  *
    11  ********************************************************************
    13   function:
    14     last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
    16  ********************************************************************/
    18 #if !defined(_x86_sse2trans_H)
    19 # define _x86_sse2trans_H (1)
    20 # include "x86int.h"
    22 # if defined(OC_X86_64_ASM)
    23 /*On x86-64 we can transpose in-place without spilling registers.
    24   By clever choices of the order to apply the butterflies and the order of
    25    their outputs, we can take the rows in order and output the columns in order
    26    without any extra operations and using just one temporary register.
       Input: xmm0...xmm7 hold rows a...h of an 8x8 block of 16-bit values, with
        element 0 of each row in the low word.
       Output: xmm0...xmm7 hold columns 0...7 (e.g., xmm0 = h0 g0 ... b0 a0).
       Clobbers: xmm8 (the one temporary); no memory is accessed.
       The three stages interleave words, then dwords, then qwords.
       The "%%" register prefix suggests this string is pasted into a GCC-style
        extended asm template; presumably the enclosing asm statement is
        responsible for declaring the xmm clobbers (TODO: confirm at call sites).
       NOTE: The last line of the macro body ends with a line continuation, so it
        must be followed by a blank line.*/
    27 #  define OC_TRANSPOSE_8x8 \
    28  "#OC_TRANSPOSE_8x8\n\t" \
    29  "movdqa %%xmm4,%%xmm8\n\t" \
    30  /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
    31  "punpcklwd %%xmm5,%%xmm4\n\t" \
    32  /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
    33  "punpckhwd %%xmm5,%%xmm8\n\t" \
    34  /*xmm5 is free.*/ \
    35  "movdqa %%xmm0,%%xmm5\n\t" \
    36  /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
    37  "punpcklwd %%xmm1,%%xmm0\n\t" \
    38  /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
    39  "punpckhwd %%xmm1,%%xmm5\n\t" \
    40  /*xmm1 is free.*/ \
    41  "movdqa %%xmm6,%%xmm1\n\t" \
    42  /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
    43  "punpcklwd %%xmm7,%%xmm6\n\t" \
    44  /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
    45  "punpckhwd %%xmm7,%%xmm1\n\t" \
    46  /*xmm7 is free.*/ \
    47  "movdqa %%xmm2,%%xmm7\n\t" \
    48  /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
    49  "punpckhwd %%xmm3,%%xmm2\n\t" \
    50  /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
    51  "punpcklwd %%xmm3,%%xmm7\n\t" \
    52  /*xmm3 is free; the dword interleave stage starts here.*/ \
    53  "movdqa %%xmm0,%%xmm3\n\t" \
    54  /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
    55  "punpckldq %%xmm7,%%xmm0\n\t" \
    56  /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
    57  "punpckhdq %%xmm7,%%xmm3\n\t" \
    58  /*xmm7 is free.*/ \
    59  "movdqa %%xmm5,%%xmm7\n\t" \
    60  /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
    61  "punpckldq %%xmm2,%%xmm5\n\t" \
    62  /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
    63  "punpckhdq %%xmm2,%%xmm7\n\t" \
    64  /*xmm2 is free.*/ \
    65  "movdqa %%xmm4,%%xmm2\n\t" \
    66  /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
    67  "punpckhdq %%xmm6,%%xmm4\n\t" \
    68  /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
    69  "punpckldq %%xmm6,%%xmm2\n\t" \
    70  /*xmm6 is free.*/ \
    71  "movdqa %%xmm8,%%xmm6\n\t" \
    72  /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
    73  "punpckldq %%xmm1,%%xmm6\n\t" \
    74  /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
    75  "punpckhdq %%xmm1,%%xmm8\n\t" \
    76  /*xmm1 is free; the qword interleave stage starts here.*/ \
    77  "movdqa %%xmm0,%%xmm1\n\t" \
    78  /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
    79  "punpcklqdq %%xmm2,%%xmm0\n\t" \
    80  /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
    81  "punpckhqdq %%xmm2,%%xmm1\n\t" \
    82  /*xmm2 is free.*/ \
    83  "movdqa %%xmm3,%%xmm2\n\t" \
    84  /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
    85  "punpckhqdq %%xmm4,%%xmm3\n\t" \
    86  /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
    87  "punpcklqdq %%xmm4,%%xmm2\n\t" \
    88  /*xmm4 is free.*/ \
    89  "movdqa %%xmm5,%%xmm4\n\t" \
    90  /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
    91  "punpckhqdq %%xmm6,%%xmm5\n\t" \
    92  /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
    93  "punpcklqdq %%xmm6,%%xmm4\n\t" \
    94  /*xmm6 is free.*/ \
    95  "movdqa %%xmm7,%%xmm6\n\t" \
    96  /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
    97  "punpckhqdq %%xmm8,%%xmm7\n\t" \
    98  /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
    99  "punpcklqdq %%xmm8,%%xmm6\n\t" \
   100  /*xmm8 is free.*/ \
   102 # else
   103 /*Otherwise, we need to spill some values to %[buf] temporarily.
   104   Again, the butterflies are carefully arranged to get the columns to come out
   105    in order, minimizing register spills and maximizing the delay between a load
   106    and when the value loaded is actually used.
       Input: xmm0...xmm7 hold rows a...h of an 8x8 block of 16-bit values, with
        element 0 of each row in the low word.
       Output: xmm0...xmm7 hold columns 0...7 (e.g., xmm0 = h0 g0 ... b0 a0); no
        other xmm register is used.
       Scratch memory: two 16-byte slots, accessed through OC_MEM_OFFS(0x00,buf)
        and OC_MEM_OFFS(0x10,buf); both are overwritten.
       OC_MEM_OFFS is not defined in this file (presumably it comes from x86int.h
        and expands to a memory operand at the given byte offset from the named
        asm operand; TODO: confirm).
       Every spill and reload uses movdqa, which requires a 16-byte aligned
        memory operand, so the addressed slots must be 16-byte aligned.
       The three stages interleave words, then dwords, then qwords.
       NOTE: The last line of the macro body ends with a line continuation, so it
        must be followed by a blank line.*/
   107 #  define OC_TRANSPOSE_8x8 \
   108  "#OC_TRANSPOSE_8x8\n\t" \
   109  /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \
   110  "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
   111  /*xmm0 is free.*/ \
   112  "movdqa %%xmm2,%%xmm0\n\t" \
   113  /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
   114  "punpckhwd %%xmm3,%%xmm2\n\t" \
   115  /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
   116  "punpcklwd %%xmm3,%%xmm0\n\t" \
   117  /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \
   118  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \
   119  /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \
   120  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
   121  /*xmm2 is free.*/ \
   122  "movdqa %%xmm6,%%xmm2\n\t" \
   123  /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
   124  "punpcklwd %%xmm7,%%xmm6\n\t" \
   125  /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
   126  "punpckhwd %%xmm7,%%xmm2\n\t" \
   127  /*xmm7 is free.*/ \
   128  "movdqa %%xmm4,%%xmm7\n\t" \
   129  /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
   130  "punpcklwd %%xmm5,%%xmm4\n\t" \
   131  /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
   132  "punpckhwd %%xmm5,%%xmm7\n\t" \
   133  /*xmm5 is free.*/ \
   134  "movdqa %%xmm3,%%xmm5\n\t" \
   135  /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
   136  "punpcklwd %%xmm1,%%xmm3\n\t" \
   137  /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
   138  "punpckhwd %%xmm1,%%xmm5\n\t" \
   139  /*xmm1 is free; the dword interleave stage starts here.*/ \
   140  "movdqa %%xmm7,%%xmm1\n\t" \
   141  /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
   142  "punpckldq %%xmm2,%%xmm7\n\t" \
   143  /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
   144  "punpckhdq %%xmm2,%%xmm1\n\t" \
   145  /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
   146  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \
   147  /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \
   148  "movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \
   149  /*xmm1 is free.*/ \
   150  "movdqa %%xmm3,%%xmm1\n\t" \
   151  /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
   152  "punpckhdq %%xmm0,%%xmm3\n\t" \
   153  /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
   154  "punpckldq %%xmm0,%%xmm1\n\t" \
   155  /*xmm0 is free.*/ \
   156  "movdqa %%xmm4,%%xmm0\n\t" \
   157  /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
   158  "punpckhdq %%xmm6,%%xmm4\n\t" \
   159  /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
   160  "punpckldq %%xmm6,%%xmm0\n\t" \
   161  /*xmm6 is free.*/ \
   162  "movdqa %%xmm5,%%xmm6\n\t" \
   163  /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
   164  "punpckldq %%xmm2,%%xmm5\n\t" \
   165  /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
   166  "punpckhdq %%xmm2,%%xmm6\n\t" \
   167  /*xmm2 is free; the qword interleave stage starts here.*/ \
   168  "movdqa %%xmm1,%%xmm2\n\t" \
   169  /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
   170  "punpckhqdq %%xmm0,%%xmm1\n\t" \
   171  /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
   172  "punpcklqdq %%xmm0,%%xmm2\n\t" \
   173  /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
   174  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
   175  /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \
   176  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
   177  /*xmm2 is free.*/ \
   178  "movdqa %%xmm3,%%xmm2\n\t" \
   179  /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
   180  "punpckhqdq %%xmm4,%%xmm3\n\t" \
   181  /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
   182  "punpcklqdq %%xmm4,%%xmm2\n\t" \
   183  /*xmm4 is free.*/ \
   184  "movdqa %%xmm5,%%xmm4\n\t" \
   185  /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
   186  "punpckhqdq %%xmm7,%%xmm5\n\t" \
   187  /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
   188  "punpcklqdq %%xmm7,%%xmm4\n\t" \
   189  /*xmm7 is free.*/ \
   190  "movdqa %%xmm6,%%xmm7\n\t" \
   191  /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
   192  "punpcklqdq %%xmm0,%%xmm6\n\t" \
   193  /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
   194  "punpckhqdq %%xmm0,%%xmm7\n\t" \
   195  /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
   196  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \
   198 # endif
   200 /*Transpose 4 values in each of 8 MMX registers into 8 values in the first
   201    four SSE registers.
   202   No need to be clever here; we have plenty of room.
       Input: mm0...mm7 hold rows a...h, four 16-bit values each, with element 0
        in the low word; the MMX registers are only read.
       Output: xmm0...xmm3 hold columns 0...3 (e.g., xmm0 = h0 g0 ... b0 a0).
       Clobbers: xmm4...xmm7, which are left holding intermediate values.
       Rows c/d and g/h are loaded into swapped register numbers (mm2 to xmm3,
        mm3 to xmm2, mm6 to xmm7, mm7 to xmm6), so their interleaved results land
        in xmm3 and xmm7.
       NOTE: The last line of the macro body ends with a line continuation, so it
        must be followed by a blank line.*/
   203 #  define OC_TRANSPOSE_8x4_MMX2SSE \
   204  "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \
   205  "movq2dq %%mm0,%%xmm0\n\t" \
   206  "movq2dq %%mm1,%%xmm1\n\t" \
   207  /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
   208  "punpcklwd %%xmm1,%%xmm0\n\t" \
   209  "movq2dq %%mm2,%%xmm3\n\t" \
   210  "movq2dq %%mm3,%%xmm2\n\t" \
   211  /*xmm3 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
   212  "punpcklwd %%xmm2,%%xmm3\n\t" \
   213  "movq2dq %%mm4,%%xmm4\n\t" \
   214  "movq2dq %%mm5,%%xmm5\n\t" \
   215  /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
   216  "punpcklwd %%xmm5,%%xmm4\n\t" \
   217  "movq2dq %%mm6,%%xmm7\n\t" \
   218  "movq2dq %%mm7,%%xmm6\n\t" \
   219  /*xmm7 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
   220  "punpcklwd %%xmm6,%%xmm7\n\t" \
   221  "movdqa %%xmm0,%%xmm2\n\t" \
   222  /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
   223  "punpckldq %%xmm3,%%xmm0\n\t" \
   224  /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
   225  "punpckhdq %%xmm3,%%xmm2\n\t" \
   226  "movdqa %%xmm4,%%xmm5\n\t" \
   227  /*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
   228  "punpckldq %%xmm7,%%xmm4\n\t" \
   229  /*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
   230  "punpckhdq %%xmm7,%%xmm5\n\t" \
   231  "movdqa %%xmm0,%%xmm1\n\t" \
   232  /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
   233  "punpcklqdq %%xmm4,%%xmm0\n\t" \
   234  /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
   235  "punpckhqdq %%xmm4,%%xmm1\n\t" \
   236  "movdqa %%xmm2,%%xmm3\n\t" \
   237  /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
   238  "punpcklqdq %%xmm5,%%xmm2\n\t" \
   239  /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
   240  "punpckhqdq %%xmm5,%%xmm3\n\t" \
   242 #endif

mercurial