media/libvpx/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.

michael@0 1 ;
michael@0 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0 3 ;
michael@0 4 ; Use of this source code is governed by a BSD-style license
michael@0 5 ; that can be found in the LICENSE file in the root of the source
michael@0 6 ; tree. An additional intellectual property rights grant can be found
michael@0 7 ; in the file PATENTS. All contributing project authors may
michael@0 8 ; be found in the AUTHORS file in the root of the source tree.
michael@0 9 ;
michael@0 10
michael@0 11 %include "vpx_ports/x86_abi_support.asm"
michael@0 12 ; For a 16-pixel-wide block: predict each pixel by averaging ref horizontally, then vertically (pavgb twice), and output the sum and the sum of squares of (prediction - src) over Height rows.
michael@0 13 ;void vp9_half_horiz_vert_variance16x_h_sse2
michael@0 14 ;(
michael@0 15 ; unsigned char *ref_ptr, ; in: reference rows; bytes 0..16 of Height+1 consecutive rows are read
michael@0 16 ; int ref_pixels_per_line, ; in: byte stride between reference rows
michael@0 17 ; unsigned char *src_ptr, ; in: source rows; bytes 0..15 of Height rows are read
michael@0 18 ; int src_pixels_per_line, ; in: byte stride between source rows
michael@0 19 ; unsigned int Height, ; in: row count; must be >= 1 because the loop tests its counter only after a full pass
michael@0 20 ; int *sum, ; out: 32-bit sum of (prediction - src)
michael@0 21 ; unsigned int *sumsquared ; out: 32-bit sum of (prediction - src)^2
michael@0 22 ;)
michael@0 23 global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
michael@0 24 sym(vp9_half_horiz_vert_variance16x_h_sse2):
michael@0 25 push rbp
michael@0 26 mov rbp, rsp ; frame pointer; arg(n) is presumably rbp-relative (macro defined in x86_abi_support.asm - confirm)
michael@0 27 SHADOW_ARGS_TO_STACK 7 ; macro from x86_abi_support.asm (not shown); presumably makes the 7 arguments reachable through arg(n)
michael@0 28 SAVE_XMM 7 ; macro (not shown); presumably preserves xmm6-xmm7 where the ABI requires it - confirm
michael@0 29 GET_GOT rbx ; macro (not shown); NOTE(review): this function references no global data
michael@0 30 push rsi ; rsi/rdi serve as row pointers below and are restored in the epilog
michael@0 31 push rdi
michael@0 32 ; end prolog
michael@0 33
michael@0 34 pxor xmm6, xmm6 ; xmm6 = 8 word lanes accumulating the differences (sum)
michael@0 35 pxor xmm7, xmm7 ; xmm7 = 4 dword lanes accumulating the squared differences (sse)
michael@0 36 mov rsi, arg(0) ; rsi = ref_ptr
michael@0 37
michael@0 38 mov rdi, arg(2) ; rdi = src_ptr
michael@0 39 movsxd rcx, dword ptr arg(4) ; rcx = Height, the row counter
michael@0 40 movsxd rax, dword ptr arg(1) ; rax = ref_pixels_per_line
michael@0 41 movsxd rdx, dword ptr arg(3) ; rdx = src_pixels_per_line
michael@0 42
michael@0 43 pxor xmm0, xmm0 ; xmm0 = 0 for the whole loop; used to zero-extend bytes to words
michael@0 44
michael@0 45 movdqu xmm5, XMMWORD PTR [rsi] ; ref row 0, bytes 0..15
michael@0 46 movdqu xmm3, XMMWORD PTR [rsi+1] ; ref row 0, bytes 1..16
michael@0 47 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3): horizontal average of row 0 (pavgb rounds up)
michael@0 48
michael@0 49 lea rsi, [rsi + rax] ; rsi -> ref row 1
michael@0 50
michael@0 51 .half_horiz_vert_variance16x_h_1:
michael@0 52 movdqu xmm1, XMMWORD PTR [rsi] ; next ref row, bytes 0..15
michael@0 53 movdqu xmm2, XMMWORD PTR [rsi+1] ; next ref row, bytes 1..16
michael@0 54 pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2): horizontal average of the next row
michael@0 55
michael@0 56 pavgb xmm5, xmm1 ; xmm5 = avg(xmm5,xmm1): vertical average of the two horizontally averaged rows
michael@0 57
michael@0 58 movdqa xmm4, xmm5 ; copy so the low and high halves can be widened separately
michael@0 59 punpcklbw xmm5, xmm0 ; xmm5 = prediction bytes 0..7 zero-extended to words
michael@0 60 punpckhbw xmm4, xmm0 ; xmm4 = prediction bytes 8..15 zero-extended to words
michael@0 61
michael@0 62 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
michael@0 63 punpcklbw xmm3, xmm0 ; xmm3 = src bytes 0..7 as words
michael@0 64 psubw xmm5, xmm3 ; xmm5 = prediction - src for columns 0..7
michael@0 65
michael@0 66 movq xmm3, QWORD PTR [rdi+8] ; src bytes 8..15
michael@0 67 punpcklbw xmm3, xmm0
michael@0 68 psubw xmm4, xmm3 ; xmm4 = prediction - src for columns 8..15
michael@0 69
michael@0 70 paddw xmm6, xmm5 ; each word lane gains 2 differences in [-255,255] per row; paddw wraps, so lanes stay exact only while 510*Height <= 32767 (Height <= 64)
michael@0 71 paddw xmm6, xmm4
michael@0 72 pmaddwd xmm5, xmm5 ; square each word and add adjacent pairs -> 4 dwords
michael@0 73 pmaddwd xmm4, xmm4
michael@0 74 paddd xmm7, xmm5 ; xmm7 += squared differences
michael@0 75 paddd xmm7, xmm4
michael@0 76
michael@0 77 movdqa xmm5, xmm1 ; keep this row's horizontal average as the upper row of the next pass
michael@0 78
michael@0 79 lea rsi, [rsi + rax] ; advance ref by one row
michael@0 80 lea rdi, [rdi + rdx] ; advance src by one row
michael@0 81
michael@0 82 sub rcx, 1 ; one row consumed
michael@0 83 jnz .half_horiz_vert_variance16x_h_1 ; repeat until Height rows are done
michael@0 84
michael@0 85 pxor xmm1, xmm1 ; zero, so the unpack below yields (word << 16) dwords
michael@0 86 pxor xmm5, xmm5 ; zero, used to spread dwords during the horizontal add
michael@0 87
michael@0 88 punpcklwd xmm0, xmm6 ; xmm0 is still 0: each dword = low sum word << 16
michael@0 89 punpckhwd xmm1, xmm6 ; same for the 4 high sum words
michael@0 90 psrad xmm0, 16 ; arithmetic shift back = sign-extend the words to dwords
michael@0 91 psrad xmm1, 16
michael@0 92 paddd xmm0, xmm1 ; xmm0 = 4 dword partial sums
michael@0 93 movdqa xmm1, xmm0
michael@0 94
michael@0 95 movdqa xmm6, xmm7 ; xmm6 is reused from here on for the sse reduction
michael@0 96 punpckldq xmm6, xmm5 ; [q0, 0, q1, 0]
michael@0 97 punpckhdq xmm7, xmm5 ; [q2, 0, q3, 0]
michael@0 98 paddd xmm6, xmm7 ; [q0+q2, 0, q1+q3, 0]
michael@0 99
michael@0 100 punpckldq xmm0, xmm5 ; [s0, 0, s1, 0]
michael@0 101 punpckhdq xmm1, xmm5 ; [s2, 0, s3, 0]
michael@0 102 paddd xmm0, xmm1 ; [s0+s2, 0, s1+s3, 0]
michael@0 103
michael@0 104 movdqa xmm7, xmm6
michael@0 105 movdqa xmm1, xmm0
michael@0 106
michael@0 107 psrldq xmm7, 8 ; move the upper qword down to lane 0
michael@0 108 psrldq xmm1, 8
michael@0 109
michael@0 110 paddd xmm6, xmm7 ; low dword of xmm6 = total sum of squares
michael@0 111 paddd xmm0, xmm1 ; low dword of xmm0 = total sum of differences
michael@0 112
michael@0 113 mov rsi, arg(5) ;[Sum]
michael@0 114 mov rdi, arg(6) ;[SSE]
michael@0 115
michael@0 116 movd [rsi], xmm0 ; *sum = low dword
michael@0 117 movd [rdi], xmm6 ; *sumsquared = low dword
michael@0 118
michael@0 119 ; begin epilog
michael@0 120 pop rdi ; undo the prolog in reverse order
michael@0 121 pop rsi
michael@0 122 RESTORE_GOT
michael@0 123 RESTORE_XMM
michael@0 124 UNSHADOW_ARGS
michael@0 125 pop rbp
michael@0 126 ret
michael@0 127 ; For a 16-pixel-wide block: predict each pixel as the pavgb average of vertically adjacent ref pixels, and output the sum and the sum of squares of (prediction - src) over Height rows.
michael@0 128 ;void vp9_half_vert_variance16x_h_sse2
michael@0 129 ;(
michael@0 130 ; unsigned char *ref_ptr, ; in: reference rows; bytes 0..15 of Height+1 consecutive rows are read
michael@0 131 ; int ref_pixels_per_line, ; in: byte stride between reference rows
michael@0 132 ; unsigned char *src_ptr, ; in: source rows; bytes 0..15 of Height rows are read
michael@0 133 ; int src_pixels_per_line, ; in: byte stride between source rows
michael@0 134 ; unsigned int Height, ; in: row count; must be >= 1 because the loop tests its counter only after a full pass
michael@0 135 ; int *sum, ; out: 32-bit sum of (prediction - src)
michael@0 136 ; unsigned int *sumsquared ; out: 32-bit sum of (prediction - src)^2
michael@0 137 ;)
michael@0 138 global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
michael@0 139 sym(vp9_half_vert_variance16x_h_sse2):
michael@0 140 push rbp
michael@0 141 mov rbp, rsp ; frame pointer; arg(n) is presumably rbp-relative (macro defined in x86_abi_support.asm - confirm)
michael@0 142 SHADOW_ARGS_TO_STACK 7 ; macro from x86_abi_support.asm (not shown); presumably makes the 7 arguments reachable through arg(n)
michael@0 143 SAVE_XMM 7 ; macro (not shown); presumably preserves xmm6-xmm7 where the ABI requires it - confirm
michael@0 144 GET_GOT rbx ; macro (not shown); NOTE(review): this function references no global data
michael@0 145 push rsi ; rsi/rdi serve as row pointers below and are restored in the epilog
michael@0 146 push rdi
michael@0 147 ; end prolog
michael@0 148
michael@0 149 pxor xmm6, xmm6 ; xmm6 = 8 word lanes accumulating the differences (sum)
michael@0 150 pxor xmm7, xmm7 ; xmm7 = 4 dword lanes accumulating the squared differences (sse)
michael@0 151 mov rsi, arg(0) ; rsi = ref_ptr
michael@0 152
michael@0 153 mov rdi, arg(2) ; rdi = src_ptr
michael@0 154 movsxd rcx, dword ptr arg(4) ; rcx = Height, the row counter
michael@0 155 movsxd rax, dword ptr arg(1) ; rax = ref_pixels_per_line
michael@0 156 movsxd rdx, dword ptr arg(3) ; rdx = src_pixels_per_line
michael@0 157
michael@0 158 movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = ref row 0
michael@0 159 lea rsi, [rsi + rax ] ; rsi -> ref row 1
michael@0 160 pxor xmm0, xmm0 ; xmm0 = 0 for the whole loop; used to zero-extend bytes to words
michael@0 161
michael@0 162 .half_vert_variance16x_h_1:
michael@0 163 movdqu xmm3, XMMWORD PTR [rsi] ; xmm3 = next ref row
michael@0 164
michael@0 165 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3): vertical average of two adjacent rows (pavgb rounds up)
michael@0 166 movdqa xmm4, xmm5 ; copy so the low and high halves can be widened separately
michael@0 167 punpcklbw xmm5, xmm0 ; xmm5 = prediction bytes 0..7 zero-extended to words
michael@0 168 punpckhbw xmm4, xmm0 ; xmm4 = prediction bytes 8..15 zero-extended to words
michael@0 169
michael@0 170 movq xmm2, QWORD PTR [rdi] ; src bytes 0..7
michael@0 171 punpcklbw xmm2, xmm0
michael@0 172 psubw xmm5, xmm2 ; xmm5 = prediction - src for columns 0..7
michael@0 173 movq xmm2, QWORD PTR [rdi+8] ; src bytes 8..15
michael@0 174 punpcklbw xmm2, xmm0
michael@0 175 psubw xmm4, xmm2 ; xmm4 = prediction - src for columns 8..15
michael@0 176
michael@0 177 paddw xmm6, xmm5 ; each word lane gains 2 differences in [-255,255] per row; paddw wraps, so lanes stay exact only while 510*Height <= 32767 (Height <= 64)
michael@0 178 paddw xmm6, xmm4
michael@0 179 pmaddwd xmm5, xmm5 ; square each word and add adjacent pairs -> 4 dwords
michael@0 180 pmaddwd xmm4, xmm4
michael@0 181 paddd xmm7, xmm5 ; xmm7 += squared differences
michael@0 182 paddd xmm7, xmm4
michael@0 183
michael@0 184 movdqa xmm5, xmm3 ; this ref row becomes the upper row of the next pass
michael@0 185
michael@0 186 lea rsi, [rsi + rax] ; advance ref by one row
michael@0 187 lea rdi, [rdi + rdx] ; advance src by one row
michael@0 188
michael@0 189 sub rcx, 1 ; one row consumed
michael@0 190 jnz .half_vert_variance16x_h_1 ; repeat until Height rows are done
michael@0 191
michael@0 192 pxor xmm1, xmm1 ; zero, so the unpack below yields (word << 16) dwords
michael@0 193 pxor xmm5, xmm5 ; zero, used to spread dwords during the horizontal add
michael@0 194
michael@0 195 punpcklwd xmm0, xmm6 ; xmm0 is still 0: each dword = low sum word << 16
michael@0 196 punpckhwd xmm1, xmm6 ; same for the 4 high sum words
michael@0 197 psrad xmm0, 16 ; arithmetic shift back = sign-extend the words to dwords
michael@0 198 psrad xmm1, 16
michael@0 199 paddd xmm0, xmm1 ; xmm0 = 4 dword partial sums
michael@0 200 movdqa xmm1, xmm0
michael@0 201
michael@0 202 movdqa xmm6, xmm7 ; xmm6 is reused from here on for the sse reduction
michael@0 203 punpckldq xmm6, xmm5 ; [q0, 0, q1, 0]
michael@0 204 punpckhdq xmm7, xmm5 ; [q2, 0, q3, 0]
michael@0 205 paddd xmm6, xmm7 ; [q0+q2, 0, q1+q3, 0]
michael@0 206
michael@0 207 punpckldq xmm0, xmm5 ; [s0, 0, s1, 0]
michael@0 208 punpckhdq xmm1, xmm5 ; [s2, 0, s3, 0]
michael@0 209 paddd xmm0, xmm1 ; [s0+s2, 0, s1+s3, 0]
michael@0 210
michael@0 211 movdqa xmm7, xmm6
michael@0 212 movdqa xmm1, xmm0
michael@0 213
michael@0 214 psrldq xmm7, 8 ; move the upper qword down to lane 0
michael@0 215 psrldq xmm1, 8
michael@0 216
michael@0 217 paddd xmm6, xmm7 ; low dword of xmm6 = total sum of squares
michael@0 218 paddd xmm0, xmm1 ; low dword of xmm0 = total sum of differences
michael@0 219
michael@0 220 mov rsi, arg(5) ;[Sum]
michael@0 221 mov rdi, arg(6) ;[SSE]
michael@0 222
michael@0 223 movd [rsi], xmm0 ; *sum = low dword
michael@0 224 movd [rdi], xmm6 ; *sumsquared = low dword
michael@0 225
michael@0 226 ; begin epilog
michael@0 227 pop rdi ; undo the prolog in reverse order
michael@0 228 pop rsi
michael@0 229 RESTORE_GOT
michael@0 230 RESTORE_XMM
michael@0 231 UNSHADOW_ARGS
michael@0 232 pop rbp
michael@0 233 ret
michael@0 234 ; For a 16-pixel-wide block: predict each pixel as the pavgb average of horizontally adjacent ref pixels, and output the sum and the sum of squares of (prediction - src) over Height rows.
michael@0 235 ;void vp9_half_horiz_variance16x_h_sse2
michael@0 236 ;(
michael@0 237 ; unsigned char *ref_ptr, ; in: reference rows; bytes 0..16 of Height rows are read
michael@0 238 ; int ref_pixels_per_line, ; in: byte stride between reference rows
michael@0 239 ; unsigned char *src_ptr, ; in: source rows; bytes 0..15 of Height rows are read
michael@0 240 ; int src_pixels_per_line, ; in: byte stride between source rows
michael@0 241 ; unsigned int Height, ; in: row count; must be >= 1 because the loop tests its counter only after a full pass
michael@0 242 ; int *sum, ; out: 32-bit sum of (prediction - src)
michael@0 243 ; unsigned int *sumsquared ; out: 32-bit sum of (prediction - src)^2
michael@0 244 ;)
michael@0 245 global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
michael@0 246 sym(vp9_half_horiz_variance16x_h_sse2):
michael@0 247 push rbp
michael@0 248 mov rbp, rsp ; frame pointer; arg(n) is presumably rbp-relative (macro defined in x86_abi_support.asm - confirm)
michael@0 249 SHADOW_ARGS_TO_STACK 7 ; macro from x86_abi_support.asm (not shown); presumably makes the 7 arguments reachable through arg(n)
michael@0 250 SAVE_XMM 7 ; macro (not shown); presumably preserves xmm6-xmm7 where the ABI requires it - confirm
michael@0 251 GET_GOT rbx ; macro (not shown); NOTE(review): this function references no global data
michael@0 252 push rsi ; rsi/rdi serve as row pointers below and are restored in the epilog
michael@0 253 push rdi
michael@0 254 ; end prolog
michael@0 255
michael@0 256 pxor xmm6, xmm6 ; xmm6 = 8 word lanes accumulating the differences (sum)
michael@0 257 pxor xmm7, xmm7 ; xmm7 = 4 dword lanes accumulating the squared differences (sse)
michael@0 258 mov rsi, arg(0) ; rsi = ref_ptr
michael@0 259
michael@0 260 mov rdi, arg(2) ; rdi = src_ptr
michael@0 261 movsxd rcx, dword ptr arg(4) ; rcx = Height, the row counter
michael@0 262 movsxd rax, dword ptr arg(1) ; rax = ref_pixels_per_line
michael@0 263 movsxd rdx, dword ptr arg(3) ; rdx = src_pixels_per_line
michael@0 264
michael@0 265 pxor xmm0, xmm0 ; xmm0 = 0 for the whole loop; used to zero-extend bytes to words
michael@0 266
michael@0 267 .half_horiz_variance16x_h_1:
michael@0 268 movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
michael@0 269 movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
michael@0 270
michael@0 271 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3): horizontal average of adjacent pixels (pavgb rounds up)
michael@0 272 movdqa xmm1, xmm5 ; copy so the low and high halves can be widened separately
michael@0 273 punpcklbw xmm5, xmm0 ; xmm5 = prediction bytes 0..7 zero-extended to words
michael@0 274 punpckhbw xmm1, xmm0 ; xmm1 = prediction bytes 8..15 zero-extended to words
michael@0 275
michael@0 276 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
michael@0 277 punpcklbw xmm3, xmm0 ; xmm3 = src bytes 0..7 as words
michael@0 278 movq xmm2, QWORD PTR [rdi+8] ; xmm2 = d8,d9..d15
michael@0 279 punpcklbw xmm2, xmm0 ; xmm2 = src bytes 8..15 as words
michael@0 280
michael@0 281 psubw xmm5, xmm3 ; xmm5 = prediction - src for columns 0..7
michael@0 282 psubw xmm1, xmm2 ; xmm1 = prediction - src for columns 8..15
michael@0 283 paddw xmm6, xmm5 ; each word lane gains 2 differences in [-255,255] per row; paddw wraps, so lanes stay exact only while 510*Height <= 32767 (Height <= 64)
michael@0 284 paddw xmm6, xmm1
michael@0 285 pmaddwd xmm5, xmm5 ; square each word and add adjacent pairs -> 4 dwords
michael@0 286 pmaddwd xmm1, xmm1
michael@0 287 paddd xmm7, xmm5 ; xmm7 += squared differences
michael@0 288 paddd xmm7, xmm1
michael@0 289
michael@0 290 lea rsi, [rsi + rax] ; advance ref by one row
michael@0 291 lea rdi, [rdi + rdx] ; advance src by one row
michael@0 292
michael@0 293 sub rcx, 1 ; one row consumed
michael@0 294 jnz .half_horiz_variance16x_h_1 ; repeat until Height rows are done
michael@0 295
michael@0 296 pxor xmm1, xmm1 ; zero, so the unpack below yields (word << 16) dwords
michael@0 297 pxor xmm5, xmm5 ; zero, used to spread dwords during the horizontal add
michael@0 298
michael@0 299 punpcklwd xmm0, xmm6 ; xmm0 is still 0: each dword = low sum word << 16
michael@0 300 punpckhwd xmm1, xmm6 ; same for the 4 high sum words
michael@0 301 psrad xmm0, 16 ; arithmetic shift back = sign-extend the words to dwords
michael@0 302 psrad xmm1, 16
michael@0 303 paddd xmm0, xmm1 ; xmm0 = 4 dword partial sums
michael@0 304 movdqa xmm1, xmm0
michael@0 305
michael@0 306 movdqa xmm6, xmm7 ; xmm6 is reused from here on for the sse reduction
michael@0 307 punpckldq xmm6, xmm5 ; [q0, 0, q1, 0]
michael@0 308 punpckhdq xmm7, xmm5 ; [q2, 0, q3, 0]
michael@0 309 paddd xmm6, xmm7 ; [q0+q2, 0, q1+q3, 0]
michael@0 310
michael@0 311 punpckldq xmm0, xmm5 ; [s0, 0, s1, 0]
michael@0 312 punpckhdq xmm1, xmm5 ; [s2, 0, s3, 0]
michael@0 313 paddd xmm0, xmm1 ; [s0+s2, 0, s1+s3, 0]
michael@0 314
michael@0 315 movdqa xmm7, xmm6
michael@0 316 movdqa xmm1, xmm0
michael@0 317
michael@0 318 psrldq xmm7, 8 ; move the upper qword down to lane 0
michael@0 319 psrldq xmm1, 8
michael@0 320
michael@0 321 paddd xmm6, xmm7 ; low dword of xmm6 = total sum of squares
michael@0 322 paddd xmm0, xmm1 ; low dword of xmm0 = total sum of differences
michael@0 323
michael@0 324 mov rsi, arg(5) ;[Sum]
michael@0 325 mov rdi, arg(6) ;[SSE]
michael@0 326
michael@0 327 movd [rsi], xmm0 ; *sum = low dword
michael@0 328 movd [rdi], xmm6 ; *sumsquared = low dword
michael@0 329
michael@0 330 ; begin epilog
michael@0 331 pop rdi ; undo the prolog in reverse order
michael@0 332 pop rsi
michael@0 333 RESTORE_GOT
michael@0 334 RESTORE_XMM
michael@0 335 UNSHADOW_ARGS
michael@0 336 pop rbp
michael@0 337 ret

mercurial