Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
michael@0 | 1 | /******************************************************************** |
michael@0 | 2 | * * |
michael@0 | 3 | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
michael@0 | 4 | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
michael@0 | 5 | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
michael@0 | 6 | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
michael@0 | 7 | * * |
michael@0 | 8 | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
michael@0 | 9 | * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
michael@0 | 10 | * * |
michael@0 | 11 | ******************************************************************** |
michael@0 | 12 | |
michael@0 | 13 | function: |
michael@0 | 14 | last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $ |
michael@0 | 15 | |
michael@0 | 16 | ********************************************************************/ |
michael@0 | 17 | |
michael@0 | 18 | #if !defined(_x86_sse2trans_H) |
michael@0 | 19 | # define _x86_sse2trans_H (1) |
michael@0 | 20 | # include "x86int.h" |
michael@0 | 21 | |
michael@0 | 22 | # if defined(OC_X86_64_ASM) |
michael@0 | 23 | /*On x86-64 we can transpose in-place without spilling registers. |
michael@0 | 24 | By clever choices of the order to apply the butterflies and the order of |
michael@0 | 25 | their outputs, we can take the rows in order and output the columns in order |
michael@0 | 26 | without any extra operations and using just one temporary register (xmm8). On entry xmm0...xmm7 hold rows a...h of eight 16-bit values each; on exit xmmN holds column N, with a in the low word and h in the high word. The per-instruction comments below list the words of a register from most significant to least significant.*/ |
michael@0 | 27 | # define OC_TRANSPOSE_8x8 \ |
michael@0 | 28 | "#OC_TRANSPOSE_8x8\n\t" \ |
michael@0 | 29 | "movdqa %%xmm4,%%xmm8\n\t" \ |
michael@0 | 30 | /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ |
michael@0 | 31 | "punpcklwd %%xmm5,%%xmm4\n\t" \ |
michael@0 | 32 | /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \ |
michael@0 | 33 | "punpckhwd %%xmm5,%%xmm8\n\t" \ |
michael@0 | 34 | /*xmm5 is free.*/ \ |
michael@0 | 35 | "movdqa %%xmm0,%%xmm5\n\t" \ |
michael@0 | 36 | /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ |
michael@0 | 37 | "punpcklwd %%xmm1,%%xmm0\n\t" \ |
michael@0 | 38 | /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \ |
michael@0 | 39 | "punpckhwd %%xmm1,%%xmm5\n\t" \ |
michael@0 | 40 | /*xmm1 is free.*/ \ |
michael@0 | 41 | "movdqa %%xmm6,%%xmm1\n\t" \ |
michael@0 | 42 | /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ |
michael@0 | 43 | "punpcklwd %%xmm7,%%xmm6\n\t" \ |
michael@0 | 44 | /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \ |
michael@0 | 45 | "punpckhwd %%xmm7,%%xmm1\n\t" \ |
michael@0 | 46 | /*xmm7 is free.*/ \ |
michael@0 | 47 | "movdqa %%xmm2,%%xmm7\n\t" \ |
michael@0 | 48 | /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ |
michael@0 | 49 | "punpckhwd %%xmm3,%%xmm2\n\t" \ |
michael@0 | 50 | /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ |
michael@0 | 51 | "punpcklwd %%xmm3,%%xmm7\n\t" \ |
michael@0 | 52 | /*xmm3 is free.*/ \ |
michael@0 | 53 | "movdqa %%xmm0,%%xmm3\n\t" \ |
michael@0 | 54 | /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ |
michael@0 | 55 | "punpckldq %%xmm7,%%xmm0\n\t" \ |
michael@0 | 56 | /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ |
michael@0 | 57 | "punpckhdq %%xmm7,%%xmm3\n\t" \ |
michael@0 | 58 | /*xmm7 is free.*/ \ |
michael@0 | 59 | "movdqa %%xmm5,%%xmm7\n\t" \ |
michael@0 | 60 | /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \ |
michael@0 | 61 | "punpckldq %%xmm2,%%xmm5\n\t" \ |
michael@0 | 62 | /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \ |
michael@0 | 63 | "punpckhdq %%xmm2,%%xmm7\n\t" \ |
michael@0 | 64 | /*xmm2 is free.*/ \ |
michael@0 | 65 | "movdqa %%xmm4,%%xmm2\n\t" \ |
michael@0 | 66 | /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ |
michael@0 | 67 | "punpckhdq %%xmm6,%%xmm4\n\t" \ |
michael@0 | 68 | /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ |
michael@0 | 69 | "punpckldq %%xmm6,%%xmm2\n\t" \ |
michael@0 | 70 | /*xmm6 is free.*/ \ |
michael@0 | 71 | "movdqa %%xmm8,%%xmm6\n\t" \ |
michael@0 | 72 | /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \ |
michael@0 | 73 | "punpckldq %%xmm1,%%xmm6\n\t" \ |
michael@0 | 74 | /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ |
michael@0 | 75 | "punpckhdq %%xmm1,%%xmm8\n\t" \ |
michael@0 | 76 | /*xmm1 is free.*/ \ |
michael@0 | 77 | "movdqa %%xmm0,%%xmm1\n\t" \ |
michael@0 | 78 | /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ |
michael@0 | 79 | "punpcklqdq %%xmm2,%%xmm0\n\t" \ |
michael@0 | 80 | /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ |
michael@0 | 81 | "punpckhqdq %%xmm2,%%xmm1\n\t" \ |
michael@0 | 82 | /*xmm2 is free.*/ \ |
michael@0 | 83 | "movdqa %%xmm3,%%xmm2\n\t" \ |
michael@0 | 84 | /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ |
michael@0 | 85 | "punpckhqdq %%xmm4,%%xmm3\n\t" \ |
michael@0 | 86 | /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ |
michael@0 | 87 | "punpcklqdq %%xmm4,%%xmm2\n\t" \ |
michael@0 | 88 | /*xmm4 is free.*/ \ |
michael@0 | 89 | "movdqa %%xmm5,%%xmm4\n\t" \ |
michael@0 | 90 | /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \ |
michael@0 | 91 | "punpckhqdq %%xmm6,%%xmm5\n\t" \ |
michael@0 | 92 | /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \ |
michael@0 | 93 | "punpcklqdq %%xmm6,%%xmm4\n\t" \ |
michael@0 | 94 | /*xmm6 is free.*/ \ |
michael@0 | 95 | "movdqa %%xmm7,%%xmm6\n\t" \ |
michael@0 | 96 | /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \ |
michael@0 | 97 | "punpckhqdq %%xmm8,%%xmm7\n\t" \ |
michael@0 | 98 | /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \ |
michael@0 | 99 | "punpcklqdq %%xmm8,%%xmm6\n\t" \ |
michael@0 | 100 | /*xmm8 is free.*/ \ |
michael@0 | 101 | |
michael@0 | 102 | # else |
michael@0 | 103 | /*Otherwise, we need to spill some values to %[buf] temporarily. |
michael@0 | 104 | Again, the butterflies are carefully arranged to get the columns to come out |
michael@0 | 105 | in order, minimizing register spills and maximizing the delay between a load |
michael@0 | 106 | and when the value loaded is actually used. On entry xmm0...xmm7 hold rows a...h of eight 16-bit values each; on exit xmmN holds column N, with a in the low word and h in the high word. The per-instruction comments below list the words of a register from most significant to least significant. Two 16-byte scratch slots are accessed through OC_MEM_OFFS at offsets 0x00 and 0x10 of buf (called buf[0] and buf[1] below); because they are accessed with movdqa, both addresses must be 16-byte aligned.*/ |
michael@0 | 107 | # define OC_TRANSPOSE_8x8 \ |
michael@0 | 108 | "#OC_TRANSPOSE_8x8\n\t" \ |
michael@0 | 109 | /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \ |
michael@0 | 110 | "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
michael@0 | 111 | /*xmm0 is free.*/ \ |
michael@0 | 112 | "movdqa %%xmm2,%%xmm0\n\t" \ |
michael@0 | 113 | /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ |
michael@0 | 114 | "punpckhwd %%xmm3,%%xmm2\n\t" \ |
michael@0 | 115 | /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ |
michael@0 | 116 | "punpcklwd %%xmm3,%%xmm0\n\t" \ |
michael@0 | 117 | /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \ |
michael@0 | 118 | "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \ |
michael@0 | 119 | /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \ |
michael@0 | 120 | "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ |
michael@0 | 121 | /*xmm2 is free.*/ \ |
michael@0 | 122 | "movdqa %%xmm6,%%xmm2\n\t" \ |
michael@0 | 123 | /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ |
michael@0 | 124 | "punpcklwd %%xmm7,%%xmm6\n\t" \ |
michael@0 | 125 | /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \ |
michael@0 | 126 | "punpckhwd %%xmm7,%%xmm2\n\t" \ |
michael@0 | 127 | /*xmm7 is free.*/ \ |
michael@0 | 128 | "movdqa %%xmm4,%%xmm7\n\t" \ |
michael@0 | 129 | /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ |
michael@0 | 130 | "punpcklwd %%xmm5,%%xmm4\n\t" \ |
michael@0 | 131 | /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \ |
michael@0 | 132 | "punpckhwd %%xmm5,%%xmm7\n\t" \ |
michael@0 | 133 | /*xmm5 is free.*/ \ |
michael@0 | 134 | "movdqa %%xmm3,%%xmm5\n\t" \ |
michael@0 | 135 | /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ |
michael@0 | 136 | "punpcklwd %%xmm1,%%xmm3\n\t" \ |
michael@0 | 137 | /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \ |
michael@0 | 138 | "punpckhwd %%xmm1,%%xmm5\n\t" \ |
michael@0 | 139 | /*xmm1 is free.*/ \ |
michael@0 | 140 | "movdqa %%xmm7,%%xmm1\n\t" \ |
michael@0 | 141 | /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \ |
michael@0 | 142 | "punpckldq %%xmm2,%%xmm7\n\t" \ |
michael@0 | 143 | /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ |
michael@0 | 144 | "punpckhdq %%xmm2,%%xmm1\n\t" \ |
michael@0 | 145 | /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \ |
michael@0 | 146 | "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \ |
michael@0 | 147 | /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \ |
michael@0 | 148 | "movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
michael@0 | 149 | /*xmm1 is free.*/ \ |
michael@0 | 150 | "movdqa %%xmm3,%%xmm1\n\t" \ |
michael@0 | 151 | /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ |
michael@0 | 152 | "punpckhdq %%xmm0,%%xmm3\n\t" \ |
michael@0 | 153 | /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ |
michael@0 | 154 | "punpckldq %%xmm0,%%xmm1\n\t" \ |
michael@0 | 155 | /*xmm0 is free.*/ \ |
michael@0 | 156 | "movdqa %%xmm4,%%xmm0\n\t" \ |
michael@0 | 157 | /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ |
michael@0 | 158 | "punpckhdq %%xmm6,%%xmm4\n\t" \ |
michael@0 | 159 | /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ |
michael@0 | 160 | "punpckldq %%xmm6,%%xmm0\n\t" \ |
michael@0 | 161 | /*xmm6 is free.*/ \ |
michael@0 | 162 | "movdqa %%xmm5,%%xmm6\n\t" \ |
michael@0 | 163 | /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \ |
michael@0 | 164 | "punpckldq %%xmm2,%%xmm5\n\t" \ |
michael@0 | 165 | /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \ |
michael@0 | 166 | "punpckhdq %%xmm2,%%xmm6\n\t" \ |
michael@0 | 167 | /*xmm2 is free.*/ \ |
michael@0 | 168 | "movdqa %%xmm1,%%xmm2\n\t" \ |
michael@0 | 169 | /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ |
michael@0 | 170 | "punpckhqdq %%xmm0,%%xmm1\n\t" \ |
michael@0 | 171 | /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ |
michael@0 | 172 | "punpcklqdq %%xmm0,%%xmm2\n\t" \ |
michael@0 | 173 | /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \ |
michael@0 | 174 | "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \ |
michael@0 | 175 | /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \ |
michael@0 | 176 | "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ |
michael@0 | 177 | /*xmm2 is free.*/ \ |
michael@0 | 178 | "movdqa %%xmm3,%%xmm2\n\t" \ |
michael@0 | 179 | /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ |
michael@0 | 180 | "punpckhqdq %%xmm4,%%xmm3\n\t" \ |
michael@0 | 181 | /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ |
michael@0 | 182 | "punpcklqdq %%xmm4,%%xmm2\n\t" \ |
michael@0 | 183 | /*xmm4 is free.*/ \ |
michael@0 | 184 | "movdqa %%xmm5,%%xmm4\n\t" \ |
michael@0 | 185 | /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \ |
michael@0 | 186 | "punpckhqdq %%xmm7,%%xmm5\n\t" \ |
michael@0 | 187 | /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \ |
michael@0 | 188 | "punpcklqdq %%xmm7,%%xmm4\n\t" \ |
michael@0 | 189 | /*xmm7 is free.*/ \ |
michael@0 | 190 | "movdqa %%xmm6,%%xmm7\n\t" \ |
michael@0 | 191 | /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \ |
michael@0 | 192 | "punpcklqdq %%xmm0,%%xmm6\n\t" \ |
michael@0 | 193 | /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \ |
michael@0 | 194 | "punpckhqdq %%xmm0,%%xmm7\n\t" \ |
michael@0 | 195 | /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ |
michael@0 | 196 | "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \ |
michael@0 | 197 | |
michael@0 | 198 | # endif |
michael@0 | 199 | |
michael@0 | 200 | /*Transpose 4 values in each of 8 MMX registers into 8 values in the first |
michael@0 | 201 | four SSE registers. |
michael@0 | 202 | No need to be clever here; we have plenty of room. On entry mm0...mm7 hold rows a...h of four 16-bit values each; on exit xmm0...xmm3 hold columns 0...3, with a in the low word and h in the high word. xmm4...xmm7 are overwritten with intermediate values. The per-instruction comments below list the words of a register from most significant to least significant.*/ |
michael@0 | 203 | # define OC_TRANSPOSE_8x4_MMX2SSE \ |
michael@0 | 204 | "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \ |
michael@0 | 205 | "movq2dq %%mm0,%%xmm0\n\t" \ |
michael@0 | 206 | "movq2dq %%mm1,%%xmm1\n\t" \ |
michael@0 | 207 | /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \ |
michael@0 | 208 | "punpcklwd %%xmm1,%%xmm0\n\t" \ |
michael@0 | 209 | "movq2dq %%mm2,%%xmm3\n\t" \ |
michael@0 | 210 | "movq2dq %%mm3,%%xmm2\n\t" \ |
michael@0 | 211 | /*xmm3 = d3 c3 d2 c2 d1 c1 d0 c0*/ \ |
michael@0 | 212 | "punpcklwd %%xmm2,%%xmm3\n\t" \ |
michael@0 | 213 | "movq2dq %%mm4,%%xmm4\n\t" \ |
michael@0 | 214 | "movq2dq %%mm5,%%xmm5\n\t" \ |
michael@0 | 215 | /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \ |
michael@0 | 216 | "punpcklwd %%xmm5,%%xmm4\n\t" \ |
michael@0 | 217 | "movq2dq %%mm6,%%xmm7\n\t" \ |
michael@0 | 218 | "movq2dq %%mm7,%%xmm6\n\t" \ |
michael@0 | 219 | /*xmm7 = h3 g3 h2 g2 h1 g1 h0 g0*/ \ |
michael@0 | 220 | "punpcklwd %%xmm6,%%xmm7\n\t" \ |
michael@0 | 221 | "movdqa %%xmm0,%%xmm2\n\t" \ |
michael@0 | 222 | /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \ |
michael@0 | 223 | "punpckldq %%xmm3,%%xmm0\n\t" \ |
michael@0 | 224 | /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \ |
michael@0 | 225 | "punpckhdq %%xmm3,%%xmm2\n\t" \ |
michael@0 | 226 | "movdqa %%xmm4,%%xmm5\n\t" \ |
michael@0 | 227 | /*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \ |
michael@0 | 228 | "punpckldq %%xmm7,%%xmm4\n\t" \ |
michael@0 | 229 | /*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \ |
michael@0 | 230 | "punpckhdq %%xmm7,%%xmm5\n\t" \ |
michael@0 | 231 | "movdqa %%xmm0,%%xmm1\n\t" \ |
michael@0 | 232 | /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \ |
michael@0 | 233 | "punpcklqdq %%xmm4,%%xmm0\n\t" \ |
michael@0 | 234 | /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \ |
michael@0 | 235 | "punpckhqdq %%xmm4,%%xmm1\n\t" \ |
michael@0 | 236 | "movdqa %%xmm2,%%xmm3\n\t" \ |
michael@0 | 237 | /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \ |
michael@0 | 238 | "punpcklqdq %%xmm5,%%xmm2\n\t" \ |
michael@0 | 239 | /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \ |
michael@0 | 240 | "punpckhqdq %%xmm5,%%xmm3\n\t" \ |
michael@0 | 241 | |
michael@0 | 242 | #endif |