media/libvpx/vp8/common/x86/idctllm_mmx.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
michael@0 13
michael@0 14 ; /****************************************************************************
michael@0 15 ; * Notes:
michael@0 16 ; *
michael@0 17 ; * This implementation makes use of 16 bit fixed point version of two multiply
michael@0 18 ; * constants:
michael@0 19 ; * 1. sqrt(2) * cos (pi/8)
michael@0 20 ; * 2. sqrt(2) * sin (pi/8)
michael@0 21 ; * Because the first constant is bigger than 1, to maintain the same 16 bit
michael@0 22 ; * fixed point precision as the second one, we use a trick of
michael@0 23 ; * x * a = x + x*(a-1)
michael@0 24 ; * so
michael@0 25 ; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
michael@0 26 ; *
michael@0 27 ; * For the second constant, the 16-bit fixed-point value is 35468, which
michael@0 28 ; * is bigger than 32767, so in a signed 16-bit multiply it is treated as a
michael@0 29 ; * negative number (35468 - 65536). Adding x back corrects the result:
michael@0 30 ; * (x * (unsigned)35468) >> 16 = ((x * (signed)35468) >> 16) + x
michael@0 31 ; *
michael@0 32 ; **************************************************************************/
michael@0 33
michael@0 34 ; 4x4 inverse DCT: two 1-D passes (each followed by a 4x4 word transpose), then (x+4)>>3, add to pred and store to dest with unsigned-byte saturation.
michael@0 35 ;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
michael@0 36 ;int pitch, unsigned char *dest,int stride)
michael@0 37 global sym(vp8_short_idct4x4llm_mmx) PRIVATE
michael@0 38 sym(vp8_short_idct4x4llm_mmx):
michael@0 39 push rbp
michael@0 40 mov rbp, rsp
michael@0 41 SHADOW_ARGS_TO_STACK 5 ; macro defined outside this file; presumably makes the 5 arguments reachable through arg(n)
michael@0 42 GET_GOT rbx ; macro defined outside this file; presumably sets up addressing for the GLOBAL() references below
michael@0 43 push rsi ; rsi is used below as the pred pointer; restored in the epilog
michael@0 44 push rdi ; rdi is used below as the dest stride; restored in the epilog
michael@0 45 ; end prolog
michael@0 46 ; Load the 4x4 block of 16-bit coefficients, one row (4 words = 8 bytes) per MMX register.
michael@0 47 mov rax, arg(0) ;input
michael@0 48 mov rsi, arg(1) ;pred
michael@0 49
michael@0 50 movq mm0, [rax ] ; row 0 (ip0)
michael@0 51 movq mm1, [rax+ 8] ; row 1 (ip1)
michael@0 52 movq mm2, [rax+16] ; row 2 (ip2)
michael@0 53 movq mm3, [rax+24] ; row 3 (ip3)
michael@0 54 ; The %if 0 block below is disabled; if enabled it would zero the 32 bytes of input after loading them.
michael@0 55 %if 0
michael@0 56 pxor mm7, mm7
michael@0 57 movq [rax], mm7
michael@0 58 movq [rax+8], mm7
michael@0 59 movq [rax+16],mm7
michael@0 60 movq [rax+24],mm7
michael@0 61 %endif
michael@0 62 movsxd rax, dword ptr arg(2) ;pitch (sign-extended to 64 bits; rax no longer holds input)
michael@0 63 mov rdx, arg(3) ;dest
michael@0 64 movsxd rdi, dword ptr arg(4) ;stride (sign-extended to 64 bits)
michael@0 65
michael@0 66 ; ---- first 1-D pass: combines the four rows in mm0..mm3, four columns processed in parallel ----
michael@0 67 psubw mm0, mm2 ; b1 = ip0 - ip2
michael@0 68 paddw mm2, mm2 ; mm2 = 2 * ip2
michael@0 69
michael@0 70 movq mm5, mm1
michael@0 71 paddw mm2, mm0 ; a1 = ip0 + ip2 (computed as 2*ip2 + (ip0 - ip2))
michael@0 72
michael@0 73 pmulhw mm5, [GLOBAL(x_s1sqr2)]; high 16 bits of ip1 * (signed)0x8A8C
michael@0 74 paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2) (adding ip1 back corrects for the constant being negative as a signed word)
michael@0 75
michael@0 76 movq mm7, mm3 ;
michael@0 77 pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; ip3 * (sqrt(2)*cos(pi/8) - 1)
michael@0 78
michael@0 79 paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
michael@0 80 psubw mm7, mm5 ; mm7 = ip3*cos - ip1*sin. NOTE(review): this is the negation of a c1 defined as ip1*sin - ip3*cos; confirm against the C reference
michael@0 81
michael@0 82 movq mm5, mm1
michael@0 83 movq mm4, mm3 ; keep a copy of ip3 for the correction add below
michael@0 84
michael@0 85 pmulhw mm5, [GLOBAL(x_c1sqr2less1)] ; ip1 * (sqrt(2)*cos(pi/8) - 1)
michael@0 86 paddw mm5, mm1 ; ip1 * cos(pi/8) * sqrt(2)
michael@0 87
michael@0 88 pmulhw mm3, [GLOBAL(x_s1sqr2)] ; high 16 bits of ip3 * (signed)0x8A8C
michael@0 89 paddw mm3, mm4 ; ip3 * sin(pi/8) * sqrt(2)
michael@0 90
michael@0 91 paddw mm3, mm5 ; d1 = ip1*cos + ip3*sin
michael@0 92 movq mm6, mm2 ; a1
michael@0 93
michael@0 94 movq mm4, mm0 ; b1
michael@0 95 paddw mm2, mm3 ; row 0 = a1 + d1
michael@0 96
michael@0 97 paddw mm4, mm7 ; b1 + mm7; the transpose below uses this as row 2 (2x)
michael@0 98 psubw mm0, mm7 ; b1 - mm7; the transpose below uses this as row 1 (1x)
michael@0 99
michael@0 100 psubw mm6, mm3 ; row 3 = a1 - d1
michael@0 101 ; ---- transpose the 4x4 word matrix; comments list the words high..low as <row><col> ----
michael@0 102 movq mm1, mm2 ; 03 02 01 00
michael@0 103 movq mm3, mm4 ; 23 22 21 20
michael@0 104
michael@0 105 punpcklwd mm1, mm0 ; 11 01 10 00
michael@0 106 punpckhwd mm2, mm0 ; 13 03 12 02
michael@0 107
michael@0 108 punpcklwd mm3, mm6 ; 31 21 30 20
michael@0 109 punpckhwd mm4, mm6 ; 33 23 32 22
michael@0 110
michael@0 111 movq mm0, mm1 ; 11 01 10 00
michael@0 112 movq mm5, mm2 ; 13 03 12 02
michael@0 113
michael@0 114 punpckldq mm0, mm3 ; 30 20 10 00
michael@0 115 punpckhdq mm1, mm3 ; 31 21 11 01
michael@0 116
michael@0 117 punpckldq mm2, mm4 ; 32 22 12 02
michael@0 118 punpckhdq mm5, mm4 ; 33 23 13 03
michael@0 119
michael@0 120 movq mm3, mm5 ; 33 23 13 03
michael@0 121 ; ---- second 1-D pass: same butterfly, now with mm0..mm3 holding columns 0..3 of the first-pass result ----
michael@0 122 psubw mm0, mm2 ; b1 = ip0 - ip2
michael@0 123 paddw mm2, mm2 ; mm2 = 2 * ip2
michael@0 124
michael@0 125 movq mm5, mm1
michael@0 126 paddw mm2, mm0 ; a1 = ip0 + ip2 (computed as 2*ip2 + (ip0 - ip2))
michael@0 127
michael@0 128 pmulhw mm5, [GLOBAL(x_s1sqr2)]; high 16 bits of ip1 * (signed)0x8A8C
michael@0 129 paddw mm5, mm1 ; ip1 * sin(pi/8) * sqrt(2)
michael@0 130
michael@0 131 movq mm7, mm3 ;
michael@0 132 pmulhw mm7, [GLOBAL(x_c1sqr2less1)]; ip3 * (sqrt(2)*cos(pi/8) - 1)
michael@0 133
michael@0 134 paddw mm7, mm3 ; ip3 * cos(pi/8) * sqrt(2)
michael@0 135 psubw mm7, mm5 ; mm7 = ip3*cos - ip1*sin (same sign convention as in the first pass)
michael@0 136
michael@0 137 movq mm5, mm1
michael@0 138 movq mm4, mm3 ; keep a copy of ip3 for the correction add below
michael@0 139
michael@0 140 pmulhw mm5, [GLOBAL(x_c1sqr2less1)] ; ip1 * (sqrt(2)*cos(pi/8) - 1)
michael@0 141 paddw mm5, mm1 ; ip1 * cos(pi/8) * sqrt(2)
michael@0 142
michael@0 143 pmulhw mm3, [GLOBAL(x_s1sqr2)] ; high 16 bits of ip3 * (signed)0x8A8C
michael@0 144 paddw mm3, mm4 ; ip3 * sin(pi/8) * sqrt(2)
michael@0 145
michael@0 146 paddw mm3, mm5 ; d1 = ip1*cos + ip3*sin
michael@0 147 paddw mm0, [GLOBAL(fours)] ; b1 += 4: rounding bias for the >>3 below (reaches both b1-derived outputs)
michael@0 148
michael@0 149 paddw mm2, [GLOBAL(fours)] ; a1 += 4: rounding bias for the >>3 below (reaches both a1-derived outputs)
michael@0 150 movq mm6, mm2 ; a1
michael@0 151
michael@0 152 movq mm4, mm0 ; b1
michael@0 153 paddw mm2, mm3 ; output 0 = a1 + d1
michael@0 154
michael@0 155 paddw mm4, mm7 ; b1 + mm7; the transpose below uses this as 2x
michael@0 156 psubw mm0, mm7 ; b1 - mm7; the transpose below uses this as 1x
michael@0 157
michael@0 158 psubw mm6, mm3 ; output 3 = a1 - d1
michael@0 159 psraw mm2, 3 ; arithmetic >>3 of the biased value, i.e. (x + 4) >> 3
michael@0 160
michael@0 161 psraw mm0, 3 ; (x + 4) >> 3
michael@0 162 psraw mm4, 3 ; (x + 4) >> 3
michael@0 163
michael@0 164 psraw mm6, 3 ; (x + 4) >> 3
michael@0 165 ; ---- transpose again, same shuffle sequence as after the first pass ----
michael@0 166 movq mm1, mm2 ; 03 02 01 00
michael@0 167 movq mm3, mm4 ; 23 22 21 20
michael@0 168
michael@0 169 punpcklwd mm1, mm0 ; 11 01 10 00
michael@0 170 punpckhwd mm2, mm0 ; 13 03 12 02
michael@0 171
michael@0 172 punpcklwd mm3, mm6 ; 31 21 30 20
michael@0 173 punpckhwd mm4, mm6 ; 33 23 32 22
michael@0 174
michael@0 175 movq mm0, mm1 ; 11 01 10 00
michael@0 176 movq mm5, mm2 ; 13 03 12 02
michael@0 177
michael@0 178 punpckldq mm0, mm3 ; 30 20 10 00
michael@0 179 punpckhdq mm1, mm3 ; 31 21 11 01
michael@0 180
michael@0 181 punpckldq mm2, mm4 ; 32 22 12 02
michael@0 182 punpckhdq mm5, mm4 ; 33 23 13 03
michael@0 183 ; ---- add the residual in mm0, mm1, mm2, mm5 to four 4-byte rows of pred and store to dest ----
michael@0 184 pxor mm7, mm7 ; mm7 = 0, used for the byte<->word unpack/pack below
michael@0 185
michael@0 186 movd mm4, [rsi] ; 4 bytes at pred
michael@0 187 punpcklbw mm4, mm7 ; zero-extend the 4 bytes to 4 words
michael@0 188 paddsw mm0, mm4 ; residual + prediction (signed saturating word add)
michael@0 189 packuswb mm0, mm7 ; pack words to bytes with unsigned saturation (0..255)
michael@0 190 movd [rdx], mm0 ; store 4 bytes at dest
michael@0 191
michael@0 192 movd mm4, [rsi+rax] ; 4 bytes at pred + pitch
michael@0 193 punpcklbw mm4, mm7
michael@0 194 paddsw mm1, mm4
michael@0 195 packuswb mm1, mm7
michael@0 196 movd [rdx+rdi], mm1 ; store 4 bytes at dest + stride
michael@0 197
michael@0 198 movd mm4, [rsi+2*rax] ; 4 bytes at pred + 2*pitch
michael@0 199 punpcklbw mm4, mm7
michael@0 200 paddsw mm2, mm4
michael@0 201 packuswb mm2, mm7
michael@0 202 movd [rdx+rdi*2], mm2 ; store 4 bytes at dest + 2*stride
michael@0 203
michael@0 204 add rdx, rdi ; advance dest by one stride so [rdx+rdi*2] below is original dest + 3*stride
michael@0 205 add rsi, rax ; advance pred by one pitch so [rsi+2*rax] below is original pred + 3*pitch
michael@0 206
michael@0 207 movd mm4, [rsi+2*rax] ; 4 bytes at original pred + 3*pitch
michael@0 208 punpcklbw mm4, mm7
michael@0 209 paddsw mm5, mm4
michael@0 210 packuswb mm5, mm7
michael@0 211 movd [rdx+rdi*2], mm5 ; store 4 bytes at original dest + 3*stride
michael@0 212
michael@0 213 ; begin epilog
michael@0 214 pop rdi
michael@0 215 pop rsi
michael@0 216 RESTORE_GOT
michael@0 217 UNSHADOW_ARGS
michael@0 218 pop rbp
michael@0 219 ret
michael@0 220 ; DC-only case: adds the single value (input_dc + 4) >> 3 to a 4x4 block of pred bytes and stores the saturated result to dst_ptr.
michael@0 221 ;void vp8_dc_only_idct_add_mmx(
michael@0 222 ;short input_dc,
michael@0 223 ;unsigned char *pred_ptr,
michael@0 224 ;int pred_stride,
michael@0 225 ;unsigned char *dst_ptr,
michael@0 226 ;int stride)
michael@0 227 global sym(vp8_dc_only_idct_add_mmx) PRIVATE
michael@0 228 sym(vp8_dc_only_idct_add_mmx):
michael@0 229 push rbp
michael@0 230 mov rbp, rsp
michael@0 231 SHADOW_ARGS_TO_STACK 5 ; macro defined outside this file; presumably makes the 5 arguments reachable through arg(n)
michael@0 232 GET_GOT rbx ; macro defined outside this file; presumably sets up addressing for the GLOBAL() reference below
michael@0 233 ; end prolog
michael@0 234
michael@0 235 movd mm5, arg(0) ;input_dc (32-bit load; only the low word survives the broadcast below)
michael@0 236 mov rax, arg(1) ;pred_ptr
michael@0 237 movsxd rdx, dword ptr arg(2) ;pred_stride (sign-extended to 64 bits)
michael@0 238
michael@0 239 pxor mm0, mm0 ; mm0 = 0, used for the byte<->word unpack/pack below
michael@0 240
michael@0 241 paddw mm5, [GLOBAL(fours)] ; + 4: rounding bias for the >>3 below
michael@0 242 lea rcx, [rdx + rdx*2] ; rcx = 3 * pred_stride
michael@0 243
michael@0 244 psraw mm5, 3 ; low word = (input_dc + 4) >> 3 (arithmetic shift)
michael@0 245
michael@0 246 punpcklwd mm5, mm5 ; duplicate the low word into the low two words
michael@0 247
michael@0 248 punpckldq mm5, mm5 ; duplicate the low dword: all four words now hold the DC value
michael@0 249
michael@0 250 movd mm1, [rax] ; 4 bytes at pred_ptr
michael@0 251 movd mm2, [rax+rdx] ; 4 bytes at pred_ptr + pred_stride
michael@0 252 movd mm3, [rax+2*rdx] ; 4 bytes at pred_ptr + 2*pred_stride
michael@0 253 movd mm4, [rax+rcx] ; 4 bytes at pred_ptr + 3*pred_stride
michael@0 254
michael@0 255 mov rax, arg(3) ;dst_ptr (rax is reused; all pred rows are already loaded)
michael@0 256 movsxd rdx, dword ptr arg(4) ;stride (row stride for dst_ptr, sign-extended to 64 bits)
michael@0 257
michael@0 258 punpcklbw mm1, mm0 ; zero-extend the 4 bytes to 4 words
michael@0 259 paddsw mm1, mm5 ; prediction + DC (signed saturating word add)
michael@0 260 packuswb mm1, mm0 ; pack words back to bytes with unsigned saturation (0..255)
michael@0 261 lea rcx, [rdx + rdx*2] ; rcx = 3 * stride
michael@0 262
michael@0 263 punpcklbw mm2, mm0
michael@0 264 paddsw mm2, mm5
michael@0 265 packuswb mm2, mm0 ; pack words back to bytes with unsigned saturation (0..255)
michael@0 266
michael@0 267 punpcklbw mm3, mm0
michael@0 268 paddsw mm3, mm5
michael@0 269 packuswb mm3, mm0 ; pack words back to bytes with unsigned saturation (0..255)
michael@0 270
michael@0 271 punpcklbw mm4, mm0
michael@0 272 paddsw mm4, mm5
michael@0 273 packuswb mm4, mm0 ; pack words back to bytes with unsigned saturation (0..255)
michael@0 274
michael@0 275 movd [rax], mm1 ; store 4 bytes at dst_ptr
michael@0 276 movd [rax+rdx], mm2 ; store 4 bytes at dst_ptr + stride
michael@0 277 movd [rax+2*rdx], mm3 ; store 4 bytes at dst_ptr + 2*stride
michael@0 278 movd [rax+rcx], mm4 ; store 4 bytes at dst_ptr + 3*stride
michael@0 279
michael@0 280 ; begin epilog
michael@0 281 RESTORE_GOT
michael@0 282 UNSHADOW_ARGS
michael@0 283 pop rbp
michael@0 284 ret
michael@0 285
michael@0 286 SECTION_RODATA
michael@0 287 align 16
michael@0 288 x_s1sqr2:
michael@0 289 times 4 dw 0x8A8C ; 35468 in each of 4 words: 16-bit fixed-point sqrt(2)*sin(pi/8); negative when read as a signed word, so users add x back after pmulhw
michael@0 290 align 16
michael@0 291 x_c1sqr2less1:
michael@0 292 times 4 dw 0x4E7B ; 20091 in each of 4 words: 16-bit fixed-point sqrt(2)*cos(pi/8) - 1; users add x back after pmulhw
michael@0 293 align 16
michael@0 294 fours:
michael@0 295 times 4 dw 0x0004 ; 4 in each of 4 words: rounding bias added before the arithmetic >>3

mercurial