media/libvpx/vp8/encoder/x86/dct_mmx.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11
michael@0 12 %include "vpx_ports/x86_abi_support.asm"
michael@0 13
michael@0 14 ;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
michael@0 15 global sym(vp8_short_fdct4x4_mmx) PRIVATE
michael@0 16 sym(vp8_short_fdct4x4_mmx): ; two-stage 4x4 transform on 16-bit words: reads 4 rows of 4 words from input (rows are pitch bytes apart), writes 16 words to output
michael@0 17 push rbp
michael@0 18 mov rbp, rsp
michael@0 19 SHADOW_ARGS_TO_STACK 3
michael@0 20 GET_GOT rbx
michael@0 21 push rsi
michael@0 22 push rdi
michael@0 23 ; end prolog
michael@0 24
michael@0 25 mov rsi, arg(0) ; input
michael@0 26 mov rdi, arg(1) ; output
michael@0 27
michael@0 28 movsxd rax, dword ptr arg(2) ;pitch, sign-extended; added to rsi as a byte offset between rows
michael@0 29
michael@0 30 lea rcx, [rsi + rax*2] ; rcx = address of input row 2
michael@0 31 ; read the input data
michael@0 32 movq mm0, [rsi] ; row 0
michael@0 33 movq mm1, [rsi + rax] ; row 1
michael@0 34
michael@0 35 movq mm2, [rcx] ; row 2
michael@0 36 movq mm4, [rcx + rax] ; row 3
michael@0 37
michael@0 38 ; transpose for the first stage
michael@0 39 movq mm3, mm0 ; 00 01 02 03
michael@0 40 movq mm5, mm2 ; 20 21 22 23
michael@0 41
michael@0 42 punpcklwd mm0, mm1 ; 00 10 01 11
michael@0 43 punpckhwd mm3, mm1 ; 02 12 03 13
michael@0 44
michael@0 45 punpcklwd mm2, mm4 ; 20 30 21 31
michael@0 46 punpckhwd mm5, mm4 ; 22 32 23 33
michael@0 47
michael@0 48 movq mm1, mm0 ; 00 10 01 11
michael@0 49 punpckldq mm0, mm2 ; 00 10 20 30
michael@0 50
michael@0 51 punpckhdq mm1, mm2 ; 01 11 21 31
michael@0 52
michael@0 53 movq mm2, mm3 ; 02 12 03 13
michael@0 54 punpckldq mm2, mm5 ; 02 12 22 32
michael@0 55
michael@0 56 punpckhdq mm3, mm5 ; 03 13 23 33
michael@0 57
michael@0 58 ; mm0 0
michael@0 59 ; mm1 1
michael@0 60 ; mm2 2
michael@0 61 ; mm3 3
michael@0 62
michael@0 63 ; first stage
michael@0 64 movq mm5, mm0 ; copy of 0, becomes d1
michael@0 65 movq mm4, mm1 ; copy of 1, becomes c1
michael@0 66
michael@0 67 paddw mm0, mm3 ; a1 = 0 + 3
michael@0 68 paddw mm1, mm2 ; b1 = 1 + 2
michael@0 69
michael@0 70 psubw mm4, mm2 ; c1 = 1 - 2
michael@0 71 psubw mm5, mm3 ; d1 = 0 - 3
michael@0 72
michael@0 73 psllw mm5, 3 ; d1 <<= 3
michael@0 74 psllw mm4, 3 ; c1 <<= 3
michael@0 75
michael@0 76 psllw mm0, 3 ; a1 <<= 3
michael@0 77 psllw mm1, 3 ; b1 <<= 3
michael@0 78
michael@0 79 ; output 0 and 2
michael@0 80 movq mm2, mm0 ; a1
michael@0 81
michael@0 82 paddw mm0, mm1 ; op[0] = a1 + b1
michael@0 83 psubw mm2, mm1 ; op[2] = a1 - b1
michael@0 84
michael@0 85 ; output 1 and 3
michael@0 86 ; interleave d1, c1 into word pairs so each pmaddwd lane sees one (d1, c1) pair
michael@0 87 movq mm1, mm5 ; d1
michael@0 88 punpcklwd mm1, mm4 ; d1 c1 pairs for the low two words
michael@0 89 punpckhwd mm5, mm4 ; d1 c1 pairs for the high two words
michael@0 90
michael@0 91 movq mm3, mm1 ; second copy of the low pairs for the op[3] multiply
michael@0 92 movq mm4, mm5 ; second copy of the high pairs for the op[1] multiply
michael@0 93
michael@0 94 pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
michael@0 95 pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
michael@0 96
michael@0 97 pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
michael@0 98 pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
michael@0 99
michael@0 100 paddd mm1, MMWORD PTR[GLOBAL(_14500)] ; + 14500 before the shift
michael@0 101 paddd mm4, MMWORD PTR[GLOBAL(_14500)] ; + 14500 before the shift
michael@0 102 paddd mm3, MMWORD PTR[GLOBAL(_7500)] ; + 7500 before the shift
michael@0 103 paddd mm5, MMWORD PTR[GLOBAL(_7500)] ; + 7500 before the shift
michael@0 104
michael@0 105 psrad mm1, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
michael@0 106 psrad mm4, 12 ; (c1 * 2217 + d1 * 5352 + 14500)>>12
michael@0 107 psrad mm3, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
michael@0 108 psrad mm5, 12 ; (d1 * 2217 - c1 * 5352 + 7500)>>12
michael@0 109
michael@0 110 packssdw mm1, mm4 ; op[1]
michael@0 111 packssdw mm3, mm5 ; op[3]
michael@0 112
michael@0 113 ; done with vertical
michael@0 114 ; transpose for the second stage
michael@0 115 movq mm4, mm0 ; 00 10 20 30
michael@0 116 movq mm5, mm2 ; 02 12 22 32
michael@0 117
michael@0 118 punpcklwd mm0, mm1 ; 00 01 10 11
michael@0 119 punpckhwd mm4, mm1 ; 20 21 30 31
michael@0 120
michael@0 121 punpcklwd mm2, mm3 ; 02 03 12 13
michael@0 122 punpckhwd mm5, mm3 ; 22 23 32 33
michael@0 123
michael@0 124 movq mm1, mm0 ; 00 01 10 11
michael@0 125 punpckldq mm0, mm2 ; 00 01 02 03
michael@0 126
michael@0 127 punpckhdq mm1, mm2 ; 10 11 12 13
michael@0 128
michael@0 129 movq mm2, mm4 ; 20 21 30 31
michael@0 130 punpckldq mm2, mm5 ; 20 21 22 23
michael@0 131
michael@0 132 punpckhdq mm4, mm5 ; 30 31 32 33
michael@0 133
michael@0 134 ; mm0 0
michael@0 135 ; mm1 1
michael@0 136 ; mm2 2
michael@0 137 ; mm4 3
michael@0 138
michael@0 139 movq mm5, mm0 ; copy of 0, becomes d1
michael@0 140 movq mm3, mm1 ; copy of 1, becomes c1
michael@0 141
michael@0 142 paddw mm0, mm4 ; a1 = 0 + 3
michael@0 143 paddw mm1, mm2 ; b1 = 1 + 2
michael@0 144
michael@0 145 psubw mm3, mm2 ; c1 = 1 - 2
michael@0 146 psubw mm5, mm4 ; d1 = 0 - 3
michael@0 147
michael@0 148 pxor mm6, mm6 ; zero out for compare
michael@0 149
michael@0 150 pcmpeqw mm6, mm5 ; mm6 word = all ones where d1 == 0, else 0
michael@0 151
michael@0 152 pandn mm6, MMWORD PTR[GLOBAL(_cmp_mask)] ; (NOT mm6) AND 1 per word:
michael@0 153 ; mm6 word = 1 where d1 != 0, else 0
michael@0 154
michael@0 155 ; output 0 and 2
michael@0 156 movq mm2, mm0 ; a1
michael@0 157
michael@0 158 paddw mm0, mm1 ; a1 + b1
michael@0 159 psubw mm2, mm1 ; a1 - b1
michael@0 160
michael@0 161 paddw mm0, MMWORD PTR[GLOBAL(_7w)] ; + 7 before the shift
michael@0 162 paddw mm2, MMWORD PTR[GLOBAL(_7w)] ; + 7 before the shift
michael@0 163
michael@0 164 psraw mm0, 4 ; op[0] = (a1 + b1 + 7)>>4
michael@0 165 psraw mm2, 4 ; op[8] = (a1 - b1 + 7)>>4
michael@0 166
michael@0 167 movq MMWORD PTR[rdi + 0 ], mm0 ; output words 0..3
michael@0 168 movq MMWORD PTR[rdi + 16], mm2 ; output words 8..11
michael@0 169
michael@0 170 ; output 1 and 3
michael@0 171 ; interleave d1, c1 into word pairs so each pmaddwd lane sees one (d1, c1) pair
michael@0 172 movq mm1, mm5 ; d1
michael@0 173 punpcklwd mm1, mm3 ; d1 c1 pairs for the low two words
michael@0 174 punpckhwd mm5, mm3 ; d1 c1 pairs for the high two words
michael@0 175
michael@0 176 movq mm3, mm1 ; second copy of the low pairs for the op[12] multiply
michael@0 177 movq mm4, mm5 ; second copy of the high pairs for the op[4] multiply
michael@0 178
michael@0 179 pmaddwd mm1, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
michael@0 180 pmaddwd mm4, MMWORD PTR[GLOBAL (_5352_2217)] ; c1*2217 + d1*5352
michael@0 181
michael@0 182 pmaddwd mm3, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
michael@0 183 pmaddwd mm5, MMWORD PTR[GLOBAL(_2217_neg5352)] ; d1*2217 - c1*5352
michael@0 184
michael@0 185 paddd mm1, MMWORD PTR[GLOBAL(_12000)] ; + 12000 before the shift
michael@0 186 paddd mm4, MMWORD PTR[GLOBAL(_12000)] ; + 12000 before the shift
michael@0 187 paddd mm3, MMWORD PTR[GLOBAL(_51000)] ; + 51000 before the shift
michael@0 188 paddd mm5, MMWORD PTR[GLOBAL(_51000)] ; + 51000 before the shift
michael@0 189
michael@0 190 psrad mm1, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
michael@0 191 psrad mm4, 16 ; (c1 * 2217 + d1 * 5352 + 12000)>>16
michael@0 192 psrad mm3, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
michael@0 193 psrad mm5, 16 ; (d1 * 2217 - c1 * 5352 + 51000)>>16
michael@0 194
michael@0 195 packssdw mm1, mm4 ; op[4]
michael@0 196 packssdw mm3, mm5 ; op[12]
michael@0 197
michael@0 198 paddw mm1, mm6 ; op[4] += (d1!=0)
michael@0 199
michael@0 200 movq MMWORD PTR[rdi + 8 ], mm1 ; output words 4..7
michael@0 201 movq MMWORD PTR[rdi + 24], mm3 ; output words 12..15
michael@0 202
michael@0 203 ; begin epilog
michael@0 204 pop rdi
michael@0 205 pop rsi
michael@0 206 RESTORE_GOT
michael@0 207 UNSHADOW_ARGS
michael@0 208 pop rbp
michael@0 209 ret
michael@0 210
michael@0 211 SECTION_RODATA
michael@0 212 align 8
michael@0 213 _5352_2217: ; pmaddwd multiplier, word pairs (5352, 2217)
michael@0 214 dw 5352
michael@0 215 dw 2217
michael@0 216 dw 5352
michael@0 217 dw 2217
michael@0 218 align 8
michael@0 219 _2217_neg5352: ; pmaddwd multiplier, word pairs (2217, -5352)
michael@0 220 dw 2217
michael@0 221 dw -5352
michael@0 222 dw 2217
michael@0 223 dw -5352
michael@0 224 align 8
michael@0 225 _cmp_mask: ; four words of 1; and-not'ed with the d1 == 0 compare mask
michael@0 226 times 4 dw 1
michael@0 227 align 8
michael@0 228 _7w: ; four words of 7; added before the second-stage >>4
michael@0 229 times 4 dw 7
michael@0 230 align 8
michael@0 231 _14500: ; two dwords; added before the first-stage >>12 (c1*2217 + d1*5352 term)
michael@0 232 times 2 dd 14500
michael@0 233 align 8
michael@0 234 _7500: ; two dwords; added before the first-stage >>12 (d1*2217 - c1*5352 term)
michael@0 235 times 2 dd 7500
michael@0 236 align 8
michael@0 237 _12000: ; two dwords; added before the second-stage >>16 (c1*2217 + d1*5352 term)
michael@0 238 times 2 dd 12000
michael@0 239 align 8
michael@0 240 _51000: ; two dwords; added before the second-stage >>16 (d1*2217 - c1*5352 term)
michael@0 241 times 2 dd 51000

mercurial