media/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11 %include "vpx_ports/x86_abi_support.asm"
michael@0 12
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;
; Entry sequence for the vp9_sad*x3_sse3 routines. Binds the symbolic
; operand names (src_ptr, src_stride, ref_ptr, ref_stride, result_ptr,
; ...) to the registers / stack slots of the target ABI and emits the
; prologue that ABI needs. Pair with STACK_FRAME_DESTROY_X3.
;
;   32-bit   : builds an rbp frame, saves rsi/rdi/rbx and loads the
;              first four arguments from their arg(n) slots.
;   Win64    : the first four arguments are used in place in
;              rcx/rdx/r8/r9; the fifth is read from the stack.
;   other 64 : all five arguments are used in place in
;              rdi/rsi/rdx/rcx/r8; no instructions are emitted.
;
; result_ptr, max_err and height all alias the fifth argument.
; arg(), SAVE_XMM and xmm_stack_space are defined outside this file.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_err       arg(4)
  %define     height        dword ptr arg(4)

    push        rbp
    mov         rbp, rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; ref_ptr

    movsxd      rax, dword ptr arg(1)       ; src_stride (sign-extended int)
    movsxd      rdx, dword ptr arg(3)       ; ref_stride (sign-extended int)
%elif LIBVPX_YASM_WIN64
    ; NOTE(review): presumably spills xmm6-xmm7, which are callee-saved on
    ; Win64 and used as accumulators by PROCESS_16X2X3 -- confirm against
    ; x86_abi_support.asm.
    SAVE_XMM 7, u
  %define     src_ptr       rcx
  %define     src_stride    rdx
  %define     ref_ptr       r8
  %define     ref_stride    r9
  %define     end_ptr       r10
  %define     ret_var       r11
  ; Fifth argument: +8 skips the return address, +4*8 skips the 32-byte
  ; shadow space; xmm_stack_space is presumably what SAVE_XMM reserved.
  %define     result_ptr    [rsp+xmm_stack_space+8+4*8]
  %define     max_err       [rsp+xmm_stack_space+8+4*8]
  %define     height        dword ptr [rsp+xmm_stack_space+8+4*8]
%else
  %define     src_ptr       rdi
  %define     src_stride    rsi
  %define     ref_ptr       rdx
  %define     ref_stride    rcx
  %define     end_ptr       r9
  %define     ret_var       r10
  %define     result_ptr    r8
  %define     max_err       r8
  %define     height        r8
%endif

%endmacro
michael@0 61
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;
; Exit sequence matching STACK_FRAME_CREATE_X3: blanks the symbolic
; operand names, undoes the ABI-specific prologue and returns.
;
; The names are redefined as empty text (not %undef'd), so they expand
; to nothing until the next STACK_FRAME_CREATE_X3 rebinds them.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     max_err
  %define     height

%if ABI_IS_32BIT
    ; Reverse order of the pushes in STACK_FRAME_CREATE_X3.
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%elif LIBVPX_YASM_WIN64
    ; Counterpart of SAVE_XMM (defined outside this file).
    RESTORE_XMM
%endif
    ret
%endmacro
michael@0 85
;-----------------------------------------------------------------------
; PROCESS_16X2X3 phase, src, ref, src_stride, ref_stride
;
; Handles two 16-byte rows: accumulates the sum of absolute differences
; of src against ref, ref+1 and ref+2.
;
;   %1  phase: 0 = first step  (initialises the accumulators, advances)
;              1 = middle step (accumulates, advances)
;              2 = last step   (accumulates, pointers left untouched)
;   %2  register holding the source pointer
;   %3  register holding the reference pointer
;   %4  register holding the source stride
;   %5  register holding the reference stride
;
; Accumulators: xmm5 (ref+0), xmm6 (ref+1), xmm7 (ref+2); each holds one
;               partial sum per 64-bit half.
; Scratch:      xmm0-xmm3.
; Source rows are loaded with movdqa, so they must be 16-byte aligned;
; reference rows use lddqu and may be unaligned.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 5
    ; ---- first row of the pair ----
    movdqa      xmm0,       XMMWORD PTR [%2]
%if %1==0
    ; First step: load straight into the accumulators.
    lddqu       xmm5,       XMMWORD PTR [%3]
    lddqu       xmm6,       XMMWORD PTR [%3+1]
    lddqu       xmm7,       XMMWORD PTR [%3+2]

    psadbw      xmm5,       xmm0
    psadbw      xmm6,       xmm0
    psadbw      xmm7,       xmm0
%else
    lddqu       xmm1,       XMMWORD PTR [%3]
    lddqu       xmm2,       XMMWORD PTR [%3+1]
    lddqu       xmm3,       XMMWORD PTR [%3+2]

    psadbw      xmm1,       xmm0
    psadbw      xmm2,       xmm0
    psadbw      xmm3,       xmm0

    paddw       xmm5,       xmm1
    paddw       xmm6,       xmm2
    paddw       xmm7,       xmm3
%endif

    ; ---- second row of the pair ----
    movdqa      xmm0,       XMMWORD PTR [%2+%4]
    lddqu       xmm1,       XMMWORD PTR [%3+%5]
    lddqu       xmm2,       XMMWORD PTR [%3+%5+1]
    lddqu       xmm3,       XMMWORD PTR [%3+%5+2]

%if %1==0 || %1==1
    ; Step both pointers down two rows (skipped on the last step).
    lea         %2,         [%2+%4*2]
    lea         %3,         [%3+%5*2]
%endif

    psadbw      xmm1,       xmm0
    psadbw      xmm2,       xmm0
    psadbw      xmm3,       xmm0

    paddw       xmm5,       xmm1
    paddw       xmm6,       xmm2
    paddw       xmm7,       xmm3
%endmacro
michael@0 128
;-----------------------------------------------------------------------
; PROCESS_8X2X3 phase, src, ref, src_stride, ref_stride
;
; Handles two 8-byte rows with MMX registers: accumulates the sum of
; absolute differences of src against ref, ref+1 and ref+2.
;
;   %1  phase: 0 = first step  (initialises the accumulators, advances)
;              1 = middle step (accumulates, advances)
;              2 = last step   (accumulates, pointers left untouched)
;   %2  register holding the source pointer
;   %3  register holding the reference pointer
;   %4  register holding the source stride
;   %5  register holding the reference stride
;
; Accumulators: mm5 (ref+0), mm6 (ref+1), mm7 (ref+2).
; Scratch:      mm0-mm3.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X3 5
    ; ---- first row of the pair ----
    movq        mm0,        QWORD PTR [%2]
%if %1==0
    ; First step: load straight into the accumulators.
    movq        mm5,        QWORD PTR [%3]
    movq        mm6,        QWORD PTR [%3+1]
    movq        mm7,        QWORD PTR [%3+2]

    psadbw      mm5,        mm0
    psadbw      mm6,        mm0
    psadbw      mm7,        mm0
%else
    movq        mm1,        QWORD PTR [%3]
    movq        mm2,        QWORD PTR [%3+1]
    movq        mm3,        QWORD PTR [%3+2]

    psadbw      mm1,        mm0
    psadbw      mm2,        mm0
    psadbw      mm3,        mm0

    paddw       mm5,        mm1
    paddw       mm6,        mm2
    paddw       mm7,        mm3
%endif

    ; ---- second row of the pair ----
    movq        mm0,        QWORD PTR [%2+%4]
    movq        mm1,        QWORD PTR [%3+%5]
    movq        mm2,        QWORD PTR [%3+%5+1]
    movq        mm3,        QWORD PTR [%3+%5+2]

%if %1==0 || %1==1
    ; Step both pointers down two rows (skipped on the last step).
    lea         %2,         [%2+%4*2]
    lea         %3,         [%3+%5*2]
%endif

    psadbw      mm1,        mm0
    psadbw      mm2,        mm0
    psadbw      mm3,        mm0

    paddw       mm5,        mm1
    paddw       mm6,        mm2
    paddw       mm7,        mm3
%endmacro
michael@0 171
;void vp9_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of a 16x16 source block against the
; reference block at ref_ptr, ref_ptr+1 and ref_ptr+2. The three sums
; are written to results[0], results[1] and results[2].
; Source rows are read with movdqa (see PROCESS_16X2X3) and therefore
; have to be 16-byte aligned.
global sym(vp9_sad16x16x3_sse3) PRIVATE
sym(vp9_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

    ; 16 rows = 8 two-row steps: one first, six middle, one last.
    PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
    PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

    ; Row processing is finished, so rcx can be reused whatever it
    ; was bound to by STACK_FRAME_CREATE_X3.
    mov         rcx,        result_ptr

    ; Each accumulator holds one partial sum per 64-bit half:
    ; add the halves together and store the low 32 bits.

    ; results[0] <- SAD at ref_ptr
    movq        xmm0,       xmm5
    psrldq      xmm5,       8
    paddw       xmm0,       xmm5
    movd        [rcx],      xmm0

    ; results[1] <- SAD at ref_ptr+1
    movq        xmm0,       xmm6
    psrldq      xmm6,       8
    paddw       xmm0,       xmm6
    movd        [rcx+4],    xmm0

    ; results[2] <- SAD at ref_ptr+2
    movq        xmm0,       xmm7
    psrldq      xmm7,       8
    paddw       xmm0,       xmm7
    movd        [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
michael@0 213
;void vp9_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of a 16-wide, 8-row source block against
; the reference block at ref_ptr, ref_ptr+1 and ref_ptr+2. The three
; sums are written to results[0], results[1] and results[2].
; Source rows are read with movdqa (see PROCESS_16X2X3) and therefore
; have to be 16-byte aligned.
global sym(vp9_sad16x8x3_sse3) PRIVATE
sym(vp9_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

    ; 8 rows = 4 two-row steps: one first, two middle, one last.
    PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
    PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

    ; Row processing is finished, so rcx can be reused whatever it
    ; was bound to by STACK_FRAME_CREATE_X3.
    mov         rcx,        result_ptr

    ; Each accumulator holds one partial sum per 64-bit half:
    ; add the halves together and store the low 32 bits.

    ; results[0] <- SAD at ref_ptr
    movq        xmm0,       xmm5
    psrldq      xmm5,       8
    paddw       xmm0,       xmm5
    movd        [rcx],      xmm0

    ; results[1] <- SAD at ref_ptr+1
    movq        xmm0,       xmm6
    psrldq      xmm6,       8
    paddw       xmm0,       xmm6
    movd        [rcx+4],    xmm0

    ; results[2] <- SAD at ref_ptr+2
    movq        xmm0,       xmm7
    psrldq      xmm7,       8
    paddw       xmm0,       xmm7
    movd        [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
michael@0 251
;void vp9_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of an 8-wide, 16-row source block against
; the reference block at ref_ptr, ref_ptr+1 and ref_ptr+2. The three
; sums are written to results[0], results[1] and results[2].
;
; NOTE(review): MMX registers are used and no emms is executed before
; the return; presumably callers reset the MMX/x87 state -- confirm.
global sym(vp9_sad8x16x3_sse3) PRIVATE
sym(vp9_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

    ; 16 rows = 8 two-row steps: one first, six middle, one last.
    PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
    PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

    ; Row processing is finished, so rcx can be reused whatever it
    ; was bound to by STACK_FRAME_CREATE_X3.
    mov         rcx,        result_ptr

    ; Pack the low dwords of mm5 and mm6 side by side so that
    ; results[0] and results[1] go out in a single 8-byte store.
    punpckldq   mm5,        mm6

    movq        [rcx],      mm5             ; results[0], results[1]
    movd        [rcx+8],    mm7             ; results[2]

    STACK_FRAME_DESTROY_X3
michael@0 280
;void vp9_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of an 8x8 source block against the
; reference block at ref_ptr, ref_ptr+1 and ref_ptr+2. The three sums
; are written to results[0], results[1] and results[2].
;
; NOTE(review): MMX registers are used and no emms is executed before
; the return; presumably callers reset the MMX/x87 state -- confirm.
global sym(vp9_sad8x8x3_sse3) PRIVATE
sym(vp9_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

    ; 8 rows = 4 two-row steps: one first, two middle, one last.
    PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
    PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

    ; Row processing is finished, so rcx can be reused whatever it
    ; was bound to by STACK_FRAME_CREATE_X3.
    mov         rcx,        result_ptr

    ; Pack the low dwords of mm5 and mm6 side by side so that
    ; results[0] and results[1] go out in a single 8-byte store.
    punpckldq   mm5,        mm6

    movq        [rcx],      mm5             ; results[0], results[1]
    movd        [rcx+8],    mm7             ; results[2]

    STACK_FRAME_DESTROY_X3
michael@0 305
;void vp9_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of a 4x4 source block against the
; reference block at ref_ptr, ref_ptr+1 and ref_ptr+2. The three sums
; are written to results[0], results[1] and results[2].
;
; Two 4-byte rows are byte-interleaved into one MMX register with
; punpcklbw so that a single psadbw covers both rows. Source and
; reference are interleaved the same way, so each source byte is still
; compared with its matching reference byte.
;
; NOTE(review): MMX registers are used and no emms is executed before
; the return; presumably callers reset the MMX/x87 state -- confirm.
global sym(vp9_sad4x4x3_sse3) PRIVATE
sym(vp9_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

    ; ---- rows 0 and 1 ----
    movd        mm0,        DWORD PTR [src_ptr]
    movd        mm1,        DWORD PTR [ref_ptr]

    movd        mm2,        DWORD PTR [src_ptr+src_stride]
    movd        mm3,        DWORD PTR [ref_ptr+ref_stride]

    punpcklbw   mm0,        mm2             ; mm0 = src rows 0/1
    punpcklbw   mm1,        mm3             ; mm1 = ref rows 0/1

    movd        mm4,        DWORD PTR [ref_ptr+1]
    movd        mm5,        DWORD PTR [ref_ptr+2]

    movd        mm2,        DWORD PTR [ref_ptr+ref_stride+1]
    movd        mm3,        DWORD PTR [ref_ptr+ref_stride+2]

    psadbw      mm1,        mm0             ; mm1 = SAD(ref+0), rows 0/1

    punpcklbw   mm4,        mm2             ; mm4 = ref+1 rows 0/1
    punpcklbw   mm5,        mm3             ; mm5 = ref+2 rows 0/1

    psadbw      mm4,        mm0             ; mm4 = SAD(ref+1), rows 0/1
    psadbw      mm5,        mm0             ; mm5 = SAD(ref+2), rows 0/1

    ; ---- rows 2 and 3 ----
    lea         src_ptr,    [src_ptr+src_stride*2]
    lea         ref_ptr,    [ref_ptr+ref_stride*2]

    movd        mm0,        DWORD PTR [src_ptr]
    movd        mm2,        DWORD PTR [ref_ptr]

    movd        mm3,        DWORD PTR [src_ptr+src_stride]
    movd        mm6,        DWORD PTR [ref_ptr+ref_stride]

    punpcklbw   mm0,        mm3             ; mm0 = src rows 2/3
    punpcklbw   mm2,        mm6             ; mm2 = ref rows 2/3

    movd        mm3,        DWORD PTR [ref_ptr+1]
    movd        mm7,        DWORD PTR [ref_ptr+2]

    psadbw      mm2,        mm0

    paddw       mm1,        mm2             ; mm1 = total SAD(ref+0)

    movd        mm2,        DWORD PTR [ref_ptr+ref_stride+1]
    movd        mm6,        DWORD PTR [ref_ptr+ref_stride+2]

    punpcklbw   mm3,        mm2             ; mm3 = ref+1 rows 2/3
    punpcklbw   mm7,        mm6             ; mm7 = ref+2 rows 2/3

    psadbw      mm3,        mm0
    psadbw      mm7,        mm0

    paddw       mm3,        mm4             ; mm3 = total SAD(ref+1)
    paddw       mm7,        mm5             ; mm7 = total SAD(ref+2)

    ; ---- store ----
    ; All loads are done, so rcx can be reused whatever it was bound to
    ; by STACK_FRAME_CREATE_X3.
    mov         rcx,        result_ptr

    ; Pack the low dwords of mm1 and mm3 side by side so that
    ; results[0] and results[1] go out in a single 8-byte store.
    punpckldq   mm1,        mm3

    movq        [rcx],      mm1             ; results[0], results[1]
    movd        [rcx+8],    mm7             ; results[2]

    STACK_FRAME_DESTROY_X3

mercurial