media/libtheora/lib/x86/sse2trans.h

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /********************************************************************
michael@0 2 * *
michael@0 3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
michael@0 4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
michael@0 5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
michael@0 6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
michael@0 7 * *
michael@0 8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
michael@0 9 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
michael@0 10 * *
michael@0 11 ********************************************************************
michael@0 12
michael@0 13 function:
michael@0 14 last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
michael@0 15
michael@0 16 ********************************************************************/
michael@0 17
michael@0 18 #if !defined(_x86_sse2trans_H)
michael@0 19 # define _x86_sse2trans_H (1)
michael@0 20 # include "x86int.h"
michael@0 21
michael@0 22 # if defined(OC_X86_64_ASM)
michael@0 23 /*On x86-64 we can transpose in-place without spilling registers.
michael@0 24 By clever choices of the order to apply the butterflies and the order of
michael@0 25 their outputs, we can take the rows in order and output the columns in order
michael@0 26 without any extra operations and using just one temporary register.
Input: xmm0...xmm7 hold the eight rows a...h, eight 16-bit values each
 (e.g., xmm0 = a7 a6 a5 a4 a3 a2 a1 a0).
Output: xmm0...xmm7 hold the eight columns 0...7
 (e.g., xmm0 = h0 g0 f0 e0 d0 c0 b0 a0).
xmm8 is the one temporary register: it is overwritten and holds nothing
 meaningful once the macro finishes.
The "xmmN is free" notes below mark the point where a register's old contents
 are no longer needed, so it can be reused as the next copy destination.*/
michael@0 27 # define OC_TRANSPOSE_8x8 \
michael@0 28 "#OC_TRANSPOSE_8x8\n\t" \
michael@0 29 "movdqa %%xmm4,%%xmm8\n\t" \
michael@0 30 /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
michael@0 31 "punpcklwd %%xmm5,%%xmm4\n\t" \
michael@0 32 /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
michael@0 33 "punpckhwd %%xmm5,%%xmm8\n\t" \
michael@0 34 /*xmm5 is free.*/ \
michael@0 35 "movdqa %%xmm0,%%xmm5\n\t" \
michael@0 36 /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
michael@0 37 "punpcklwd %%xmm1,%%xmm0\n\t" \
michael@0 38 /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
michael@0 39 "punpckhwd %%xmm1,%%xmm5\n\t" \
michael@0 40 /*xmm1 is free.*/ \
michael@0 41 "movdqa %%xmm6,%%xmm1\n\t" \
michael@0 42 /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
michael@0 43 "punpcklwd %%xmm7,%%xmm6\n\t" \
michael@0 44 /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
michael@0 45 "punpckhwd %%xmm7,%%xmm1\n\t" \
michael@0 46 /*xmm7 is free.*/ \
michael@0 47 "movdqa %%xmm2,%%xmm7\n\t" \
michael@0 48 /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
michael@0 49 "punpckhwd %%xmm3,%%xmm2\n\t" \
michael@0 50 /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
michael@0 51 "punpcklwd %%xmm3,%%xmm7\n\t" \
michael@0 52 /*xmm3 is free.*/ \
michael@0 53 "movdqa %%xmm0,%%xmm3\n\t" \
michael@0 54 /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
michael@0 55 "punpckldq %%xmm7,%%xmm0\n\t" \
michael@0 56 /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
michael@0 57 "punpckhdq %%xmm7,%%xmm3\n\t" \
michael@0 58 /*xmm7 is free.*/ \
michael@0 59 "movdqa %%xmm5,%%xmm7\n\t" \
michael@0 60 /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
michael@0 61 "punpckldq %%xmm2,%%xmm5\n\t" \
michael@0 62 /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
michael@0 63 "punpckhdq %%xmm2,%%xmm7\n\t" \
michael@0 64 /*xmm2 is free.*/ \
michael@0 65 "movdqa %%xmm4,%%xmm2\n\t" \
michael@0 66 /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
michael@0 67 "punpckhdq %%xmm6,%%xmm4\n\t" \
michael@0 68 /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
michael@0 69 "punpckldq %%xmm6,%%xmm2\n\t" \
michael@0 70 /*xmm6 is free.*/ \
michael@0 71 "movdqa %%xmm8,%%xmm6\n\t" \
michael@0 72 /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
michael@0 73 "punpckldq %%xmm1,%%xmm6\n\t" \
michael@0 74 /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
michael@0 75 "punpckhdq %%xmm1,%%xmm8\n\t" \
michael@0 76 /*xmm1 is free.*/ \
michael@0 77 "movdqa %%xmm0,%%xmm1\n\t" \
michael@0 78 /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
michael@0 79 "punpcklqdq %%xmm2,%%xmm0\n\t" \
michael@0 80 /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
michael@0 81 "punpckhqdq %%xmm2,%%xmm1\n\t" \
michael@0 82 /*xmm2 is free.*/ \
michael@0 83 "movdqa %%xmm3,%%xmm2\n\t" \
michael@0 84 /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
michael@0 85 "punpckhqdq %%xmm4,%%xmm3\n\t" \
michael@0 86 /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
michael@0 87 "punpcklqdq %%xmm4,%%xmm2\n\t" \
michael@0 88 /*xmm4 is free.*/ \
michael@0 89 "movdqa %%xmm5,%%xmm4\n\t" \
michael@0 90 /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
michael@0 91 "punpckhqdq %%xmm6,%%xmm5\n\t" \
michael@0 92 /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
michael@0 93 "punpcklqdq %%xmm6,%%xmm4\n\t" \
michael@0 94 /*xmm6 is free.*/ \
michael@0 95 "movdqa %%xmm7,%%xmm6\n\t" \
michael@0 96 /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
michael@0 97 "punpckhqdq %%xmm8,%%xmm7\n\t" \
michael@0 98 /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
michael@0 99 "punpcklqdq %%xmm8,%%xmm6\n\t" \
michael@0 100 /*xmm8 is free.*/ \
michael@0 101
michael@0 102 # else
michael@0 103 /*Otherwise, we need to spill some values to %[buf] temporarily.
michael@0 104 Again, the butterflies are carefully arranged to get the columns to come out
michael@0 105 in order, minimizing register spills and maximizing the delay between a load
michael@0 106 and when the value loaded is actually used.
Input: xmm0...xmm7 hold the eight rows a...h, eight 16-bit values each
 (e.g., xmm0 = a7 a6 a5 a4 a3 a2 a1 a0).
Output: xmm0...xmm7 hold the eight columns 0...7
 (e.g., xmm0 = h0 g0 f0 e0 d0 c0 b0 a0).
Scratch: two 16-byte slots of %[buf], written as buf[0] (offset 0x00) and
 buf[1] (offset 0x10) in the comments below; both are overwritten.
The spills and reloads use movdqa, so the memory addressed through
 OC_MEM_OFFS(...,buf) must be 16-byte aligned.
NOTE(review): OC_MEM_OFFS is not defined in this file (presumably it comes
 from x86int.h); confirm how it maps the offset onto the buf operand.*/
michael@0 107 # define OC_TRANSPOSE_8x8 \
michael@0 108 "#OC_TRANSPOSE_8x8\n\t" \
michael@0 109 /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \
michael@0 110 "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
michael@0 111 /*xmm0 is free.*/ \
michael@0 112 "movdqa %%xmm2,%%xmm0\n\t" \
michael@0 113 /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
michael@0 114 "punpckhwd %%xmm3,%%xmm2\n\t" \
michael@0 115 /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
michael@0 116 "punpcklwd %%xmm3,%%xmm0\n\t" \
michael@0 117 /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \
michael@0 118 "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \
michael@0 119 /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \
michael@0 120 "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
michael@0 121 /*xmm2 is free.*/ \
michael@0 122 "movdqa %%xmm6,%%xmm2\n\t" \
michael@0 123 /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
michael@0 124 "punpcklwd %%xmm7,%%xmm6\n\t" \
michael@0 125 /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
michael@0 126 "punpckhwd %%xmm7,%%xmm2\n\t" \
michael@0 127 /*xmm7 is free.*/ \
michael@0 128 "movdqa %%xmm4,%%xmm7\n\t" \
michael@0 129 /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
michael@0 130 "punpcklwd %%xmm5,%%xmm4\n\t" \
michael@0 131 /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
michael@0 132 "punpckhwd %%xmm5,%%xmm7\n\t" \
michael@0 133 /*xmm5 is free.*/ \
michael@0 134 "movdqa %%xmm3,%%xmm5\n\t" \
michael@0 135 /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
michael@0 136 "punpcklwd %%xmm1,%%xmm3\n\t" \
michael@0 137 /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
michael@0 138 "punpckhwd %%xmm1,%%xmm5\n\t" \
michael@0 139 /*xmm1 is free.*/ \
michael@0 140 "movdqa %%xmm7,%%xmm1\n\t" \
michael@0 141 /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
michael@0 142 "punpckldq %%xmm2,%%xmm7\n\t" \
michael@0 143 /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
michael@0 144 "punpckhdq %%xmm2,%%xmm1\n\t" \
michael@0 145 /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
michael@0 146 "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \
michael@0 147 /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \
michael@0 148 "movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \
michael@0 149 /*xmm1 is free.*/ \
michael@0 150 "movdqa %%xmm3,%%xmm1\n\t" \
michael@0 151 /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
michael@0 152 "punpckhdq %%xmm0,%%xmm3\n\t" \
michael@0 153 /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
michael@0 154 "punpckldq %%xmm0,%%xmm1\n\t" \
michael@0 155 /*xmm0 is free.*/ \
michael@0 156 "movdqa %%xmm4,%%xmm0\n\t" \
michael@0 157 /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
michael@0 158 "punpckhdq %%xmm6,%%xmm4\n\t" \
michael@0 159 /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
michael@0 160 "punpckldq %%xmm6,%%xmm0\n\t" \
michael@0 161 /*xmm6 is free.*/ \
michael@0 162 "movdqa %%xmm5,%%xmm6\n\t" \
michael@0 163 /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
michael@0 164 "punpckldq %%xmm2,%%xmm5\n\t" \
michael@0 165 /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
michael@0 166 "punpckhdq %%xmm2,%%xmm6\n\t" \
michael@0 167 /*xmm2 is free.*/ \
michael@0 168 "movdqa %%xmm1,%%xmm2\n\t" \
michael@0 169 /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
michael@0 170 "punpckhqdq %%xmm0,%%xmm1\n\t" \
michael@0 171 /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
michael@0 172 "punpcklqdq %%xmm0,%%xmm2\n\t" \
michael@0 173 /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
michael@0 174 "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
michael@0 175 /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \
michael@0 176 "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
michael@0 177 /*xmm2 is free.*/ \
michael@0 178 "movdqa %%xmm3,%%xmm2\n\t" \
michael@0 179 /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
michael@0 180 "punpckhqdq %%xmm4,%%xmm3\n\t" \
michael@0 181 /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
michael@0 182 "punpcklqdq %%xmm4,%%xmm2\n\t" \
michael@0 183 /*xmm4 is free.*/ \
michael@0 184 "movdqa %%xmm5,%%xmm4\n\t" \
michael@0 185 /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
michael@0 186 "punpckhqdq %%xmm7,%%xmm5\n\t" \
michael@0 187 /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
michael@0 188 "punpcklqdq %%xmm7,%%xmm4\n\t" \
michael@0 189 /*xmm7 is free.*/ \
michael@0 190 "movdqa %%xmm6,%%xmm7\n\t" \
michael@0 191 /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
michael@0 192 "punpcklqdq %%xmm0,%%xmm6\n\t" \
michael@0 193 /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
michael@0 194 "punpckhqdq %%xmm0,%%xmm7\n\t" \
michael@0 195 /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
michael@0 196 "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \
michael@0 197
michael@0 198 # endif
michael@0 199
michael@0 200 /*Transpose 4 values in each of 8 MMX registers into 8 values in the first
michael@0 201 four SSE registers.
michael@0 202 No need to be clever here; we have plenty of room.
Input: mm0...mm7 hold the eight rows a...h, four 16-bit values each
 (e.g., mm0 = a3 a2 a1 a0); the MMX registers are only read.
Output: xmm0...xmm3 hold the four columns 0...3
 (e.g., xmm0 = h0 g0 f0 e0 d0 c0 b0 a0).
Scratch: xmm4...xmm7 are overwritten with intermediate values.
Rows c/d and g/h are deliberately loaded into swapped register numbers
 (mm2->xmm3, mm3->xmm2, mm6->xmm7, mm7->xmm6), so that the interleaved pairs
 land in xmm3 and xmm7 and leave xmm2 free to receive a copy of xmm0.*/
michael@0 203 # define OC_TRANSPOSE_8x4_MMX2SSE \
michael@0 204 "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \
michael@0 205 "movq2dq %%mm0,%%xmm0\n\t" \
michael@0 206 "movq2dq %%mm1,%%xmm1\n\t" \
michael@0 207 /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
michael@0 208 "punpcklwd %%xmm1,%%xmm0\n\t" \
michael@0 209 "movq2dq %%mm2,%%xmm3\n\t" \
michael@0 210 "movq2dq %%mm3,%%xmm2\n\t" \
michael@0 211 /*xmm3 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
michael@0 212 "punpcklwd %%xmm2,%%xmm3\n\t" \
michael@0 213 "movq2dq %%mm4,%%xmm4\n\t" \
michael@0 214 "movq2dq %%mm5,%%xmm5\n\t" \
michael@0 215 /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
michael@0 216 "punpcklwd %%xmm5,%%xmm4\n\t" \
michael@0 217 "movq2dq %%mm6,%%xmm7\n\t" \
michael@0 218 "movq2dq %%mm7,%%xmm6\n\t" \
michael@0 219 /*xmm7 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
michael@0 220 "punpcklwd %%xmm6,%%xmm7\n\t" \
michael@0 221 "movdqa %%xmm0,%%xmm2\n\t" \
michael@0 222 /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
michael@0 223 "punpckldq %%xmm3,%%xmm0\n\t" \
michael@0 224 /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
michael@0 225 "punpckhdq %%xmm3,%%xmm2\n\t" \
michael@0 226 "movdqa %%xmm4,%%xmm5\n\t" \
michael@0 227 /*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
michael@0 228 "punpckldq %%xmm7,%%xmm4\n\t" \
michael@0 229 /*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
michael@0 230 "punpckhdq %%xmm7,%%xmm5\n\t" \
michael@0 231 "movdqa %%xmm0,%%xmm1\n\t" \
michael@0 232 /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
michael@0 233 "punpcklqdq %%xmm4,%%xmm0\n\t" \
michael@0 234 /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
michael@0 235 "punpckhqdq %%xmm4,%%xmm1\n\t" \
michael@0 236 "movdqa %%xmm2,%%xmm3\n\t" \
michael@0 237 /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
michael@0 238 "punpcklqdq %%xmm5,%%xmm2\n\t" \
michael@0 239 /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
michael@0 240 "punpckhqdq %%xmm5,%%xmm3\n\t" \
michael@0 241
michael@0 242 #endif

mercurial