;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Provenance: recovered from a rendered changeset (dated Wed Dec 31 06:09:35
; 2014 +0100, hunk "@@ -0,0 +1,357 @@") that adds
; media/libvpx/vp9/common/x86/vp9_intrapred_sse2.asm.
;
; Syntax: NASM/yasm, Intel operand order, on top of the x86inc macro layer
; (cglobal / DEFINE_ARGS / INIT_MMX / INIT_XMM / RET / REP_RET).
;
; Every routine takes its arguments in the order named on its cglobal line:
;   dst    - first output row
;   stride - byte distance between consecutive output rows
;   above  - row of pixels above the block (the tm_* routines also read
;            the byte at above[-1])
;   left   - column of pixels to the left of the block
;
; NOTE(review): the XMM routines use mova on above/left/dst; presumably that
; is x86inc's aligned move, which would require 16-byte alignment of those
; pointers and of stride -- confirm against x86inc.asm and the callers.

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA

; Rounding terms for the DC average: each is half of the number of pixels
; summed (8, 16, 32 and 64 respectively), replicated into eight words.
pw_4:   times 8 dw 4
pw_8:   times 8 dw 8
pw_16:  times 8 dw 16
pw_32:  times 8 dw 32

SECTION .text

;-----------------------------------------------------------------------
; dc_predictor_4x4(dst, stride, above, left)
; Fills a 4x4 block with (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3.
;-----------------------------------------------------------------------
INIT_MMX sse
cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    pxor        m1, m1                      ; m1 = 0, reference for psadbw
    movd        m0, [aboveq]                ; low dword  = above[0..3]
    punpckldq   m0, [leftq]                 ; high dword = left[0..3]
    psadbw      m0, m1                      ; low word = sum of the 8 bytes
    paddw       m0, [GLOBAL(pw_4)]          ; + rounding
    psraw       m0, 3                       ; / 8
    pshufw      m0, m0, 0x0                 ; broadcast word 0 to all words
    packuswb    m0, m0                      ; words -> bytes: DC in every byte

    movd        [dstq], m0                  ; row 0
    movd        [dstq+strideq], m0          ; row 1
    lea         dstq, [dstq+strideq*2]
    movd        [dstq], m0                  ; row 2
    movd        [dstq+strideq], m0          ; row 3

    RESTORE_GOT
    RET

;-----------------------------------------------------------------------
; dc_predictor_8x8(dst, stride, above, left)
; Fills an 8x8 block with (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4.
;-----------------------------------------------------------------------
INIT_MMX sse
cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    pxor        m1, m1                      ; m1 = 0
    movq        m0, [aboveq]                ; above[0..7]
    movq        m2, [leftq]                 ; left[0..7]
    ; above/left are consumed; their argument slots are renamed from here on.
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    psadbw      m0, m1                      ; sum of above
    psadbw      m2, m1                      ; sum of left
    paddw       m0, m2
    paddw       m0, [GLOBAL(pw_8)]          ; + rounding
    psraw       m0, 4                       ; / 16
    pshufw      m0, m0, 0x0                 ; broadcast word 0
    packuswb    m0, m0                      ; DC in every byte

    movq        [dstq], m0                  ; rows 0-3
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    movq        [dstq], m0                  ; rows 4-7
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0

    RESTORE_GOT
    RET

;-----------------------------------------------------------------------
; dc_predictor_16x16(dst, stride, above, left)
; Fills a 16x16 block with (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    pxor        m1, m1                      ; m1 = 0
    mova        m0, [aboveq]                ; above[0..15]
    mova        m2, [leftq]                 ; left[0..15]
    ; above/left are consumed; reuse their slots for stride*3 and a counter.
    DEFINE_ARGS dst, stride, stride3, rows4
    lea         stride3q, [strideq*3]
    mov         rows4d, 4                   ; 4 iterations x 4 rows
    psadbw      m0, m1                      ; one partial sum per 8-byte half
    psadbw      m2, m1
    paddw       m0, m2
    movhlps     m2, m0                      ; bring the high-half sum down
    paddw       m0, m2                      ; low word = total of 32 pixels
    paddw       m0, [GLOBAL(pw_16)]         ; + rounding
    psraw       m0, 5                       ; / 32
    pshuflw     m0, m0, 0x0                 ; broadcast word 0 in low qword
    punpcklqdq  m0, m0                      ; copy low qword to high qword
    packuswb    m0, m0                      ; DC in all 16 bytes

.store_rows:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    dec         rows4d
    jnz         .store_rows

    RESTORE_GOT
    REP_RET

;-----------------------------------------------------------------------
; dc_predictor_32x32(dst, stride, above, left)
; Fills a 32x32 block with (sum(above[0..31]) + sum(left[0..31]) + 32) >> 6.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
    GET_GOT     goffsetq

    pxor        m1, m1                      ; m1 = 0
    mova        m0, [aboveq]                ; above[0..15]
    mova        m2, [aboveq+16]             ; above[16..31]
    mova        m3, [leftq]                 ; left[0..15]
    mova        m4, [leftq+16]              ; left[16..31]
    ; above/left are consumed; reuse their slots for stride*3 and a counter.
    DEFINE_ARGS dst, stride, stride3, rows4
    lea         stride3q, [strideq*3]
    mov         rows4d, 8                   ; 8 iterations x 4 rows
    psadbw      m0, m1                      ; partial sums per 8-byte half
    psadbw      m2, m1
    psadbw      m3, m1
    psadbw      m4, m1
    paddw       m0, m2
    paddw       m0, m3
    paddw       m0, m4
    movhlps     m2, m0                      ; bring the high-half sum down
    paddw       m0, m2                      ; low word = total of 64 pixels
    paddw       m0, [GLOBAL(pw_32)]         ; + rounding
    psraw       m0, 6                       ; / 64
    pshuflw     m0, m0, 0x0                 ; broadcast word 0 in low qword
    punpcklqdq  m0, m0                      ; copy low qword to high qword
    packuswb    m0, m0                      ; DC in all 16 bytes

.store_rows:
    mova        [dstq], m0
    mova        [dstq+16], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m0
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m0
    lea         dstq, [dstq+strideq*4]
    dec         rows4d
    jnz         .store_rows

    RESTORE_GOT
    REP_RET

;-----------------------------------------------------------------------
; v_predictor_4x4(dst, stride, above)
; Copies above[0..3] into each of the 4 output rows.
;-----------------------------------------------------------------------
INIT_MMX sse
cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
    movd        m0, [aboveq]
    movd        [dstq], m0                  ; row 0
    movd        [dstq+strideq], m0          ; row 1
    lea         dstq, [dstq+strideq*2]
    movd        [dstq], m0                  ; row 2
    movd        [dstq+strideq], m0          ; row 3
    RET

;-----------------------------------------------------------------------
; v_predictor_8x8(dst, stride, above)
; Copies above[0..7] into each of the 8 output rows.
;-----------------------------------------------------------------------
INIT_MMX sse
cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
    movq        m0, [aboveq]
    ; above is consumed; its slot now holds stride*3.
    DEFINE_ARGS dst, stride, stride3
    lea         stride3q, [strideq*3]
    movq        [dstq], m0                  ; rows 0-3
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    movq        [dstq], m0                  ; rows 4-7
    movq        [dstq+strideq], m0
    movq        [dstq+strideq*2], m0
    movq        [dstq+stride3q], m0
    RET

;-----------------------------------------------------------------------
; v_predictor_16x16(dst, stride, above)
; Copies above[0..15] into each of the 16 output rows.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
    mova        m0, [aboveq]
    ; above is consumed; its slot holds stride*3, the 4th register counts.
    DEFINE_ARGS dst, stride, stride3, rows4
    lea         stride3q, [strideq*3]
    mov         rows4d, 4                   ; 4 iterations x 4 rows

.store_rows:
    mova        [dstq], m0
    mova        [dstq+strideq], m0
    mova        [dstq+strideq*2], m0
    mova        [dstq+stride3q], m0
    lea         dstq, [dstq+strideq*4]
    dec         rows4d
    jnz         .store_rows
    REP_RET

;-----------------------------------------------------------------------
; v_predictor_32x32(dst, stride, above)
; Copies above[0..31] into each of the 32 output rows.
;-----------------------------------------------------------------------
INIT_XMM sse2
cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
    mova        m0, [aboveq]                ; left 16 pixels of the row
    mova        m1, [aboveq+16]             ; right 16 pixels of the row
    ; above is consumed; its slot holds stride*3, the 4th register counts.
    DEFINE_ARGS dst, stride, stride3, rows4
    lea         stride3q, [strideq*3]
    mov         rows4d, 8                   ; 8 iterations x 4 rows

.store_rows:
    mova        [dstq], m0
    mova        [dstq+16], m1
    mova        [dstq+strideq], m0
    mova        [dstq+strideq+16], m1
    mova        [dstq+strideq*2], m0
    mova        [dstq+strideq*2+16], m1
    mova        [dstq+stride3q], m0
    mova        [dstq+stride3q+16], m1
    lea         dstq, [dstq+strideq*4]
    dec         rows4d
    jnz         .store_rows
    REP_RET

;-----------------------------------------------------------------------
; tm_predictor_NxN(dst, stride, above, left)
;
; Shared scheme of the four routines below:
;   dst[y][x] = saturate_u8(left[y] + above[x] - above[-1])
; The per-column term (above[x] - above[-1]) is computed once as 16-bit
; words; each loop iteration then handles two rows by broadcasting
; left[y] and left[y+1], adding, and packing with unsigned saturation.
;
; The loop index runs from -N/2 up to 0 while left has been advanced by
; N, so [left + index*2] walks left[0], left[2], ... and the loop ends on
; the zero flag set by inc.
;
; NOTE(review): the left pixels are fetched with 4-byte movd loads of
; which only the lowest byte is used. In the last iteration the second
; load starts at left[N-1] and therefore touches 3 bytes beyond the
; N-entry column; presumably callers pass a padded buffer -- confirm.
;-----------------------------------------------------------------------

INIT_MMX sse
cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left
    pxor        m1, m1                      ; m1 = 0, for byte->word unpack
    movd        m2, [aboveq-1]              ; low byte = above[-1]
    movd        m0, [aboveq]                ; above[0..3]
    punpcklbw   m2, m1                      ; -> words
    punpcklbw   m0, m1                      ; -> words
    pshufw      m2, m2, 0x0                 ; broadcast above[-1]
    ; above is consumed; its slot becomes the negative row-pair index.
    DEFINE_ARGS dst, stride, pair, left
    mov         pairq, -2                   ; 2 iterations x 2 rows
    add         leftq, 4                    ; point just past left[3]
    psubw       m0, m2                      ; m0 = above[x] - above[-1]

.row_pair:
    movd        m2, [leftq+pairq*2]         ; low byte = left[y]
    movd        m3, [leftq+pairq*2+1]       ; low byte = left[y+1]
    punpcklbw   m2, m1
    punpcklbw   m3, m1
    pshufw      m2, m2, 0x0                 ; broadcast left[y]
    pshufw      m3, m3, 0x0                 ; broadcast left[y+1]
    paddw       m2, m0
    paddw       m3, m0
    packuswb    m2, m2                      ; saturate to bytes
    packuswb    m3, m3
    movd        [dstq], m2                  ; row y
    movd        [dstq+strideq], m3          ; row y+1
    lea         dstq, [dstq+strideq*2]
    inc         pairq
    jnz         .row_pair
    REP_RET

INIT_XMM sse2
cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left
    pxor        m1, m1                      ; m1 = 0, for byte->word unpack
    movd        m2, [aboveq-1]              ; low byte = above[-1]
    movq        m0, [aboveq]                ; above[0..7]
    punpcklbw   m2, m1                      ; -> words
    punpcklbw   m0, m1                      ; -> 8 words
    pshuflw     m2, m2, 0x0                 ; broadcast in low qword
    ; above is consumed; its slot becomes the negative row-pair index.
    DEFINE_ARGS dst, stride, pair, left
    mov         pairq, -4                   ; 4 iterations x 2 rows
    punpcklqdq  m2, m2                      ; above[-1] in all 8 words
    add         leftq, 8                    ; point just past left[7]
    psubw       m0, m2                      ; m0 = above[x] - above[-1]

.row_pair:
    movd        m2, [leftq+pairq*2]         ; low byte = left[y]
    movd        m3, [leftq+pairq*2+1]       ; low byte = left[y+1]
    punpcklbw   m2, m1
    punpcklbw   m3, m1
    pshuflw     m2, m2, 0x0
    pshuflw     m3, m3, 0x0
    punpcklqdq  m2, m2                      ; left[y] in all 8 words
    punpcklqdq  m3, m3                      ; left[y+1] in all 8 words
    paddw       m2, m0
    paddw       m3, m0
    packuswb    m2, m3                      ; low 8 bytes: row y, high: row y+1
    movq        [dstq], m2                  ; row y
    movhps      [dstq+strideq], m2          ; row y+1
    lea         dstq, [dstq+strideq*2]
    inc         pairq
    jnz         .row_pair
    REP_RET

INIT_XMM sse2
cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left
    pxor        m1, m1                      ; m1 = 0, for byte->word unpack
    movd        m2, [aboveq-1]              ; low byte = above[-1]
    mova        m0, [aboveq]                ; above[0..15]
    punpcklbw   m2, m1                      ; -> words
    punpckhbw   m4, m0, m1                  ; m4 = above[8..15] as words
    punpcklbw   m0, m1                      ; m0 = above[0..7] as words
    pshuflw     m2, m2, 0x0                 ; broadcast in low qword
    ; above is consumed; its slot becomes the negative row-pair index.
    DEFINE_ARGS dst, stride, pair, left
    mov         pairq, -8                   ; 8 iterations x 2 rows
    punpcklqdq  m2, m2                      ; above[-1] in all 8 words
    add         leftq, 16                   ; point just past left[15]
    psubw       m0, m2                      ; columns 0-7  minus above[-1]
    psubw       m4, m2                      ; columns 8-15 minus above[-1]

.row_pair:
    movd        m2, [leftq+pairq*2]         ; low byte = left[y]
    movd        m3, [leftq+pairq*2+1]       ; low byte = left[y+1]
    punpcklbw   m2, m1
    punpcklbw   m3, m1
    pshuflw     m2, m2, 0x0
    pshuflw     m3, m3, 0x0
    punpcklqdq  m2, m2                      ; left[y] in all 8 words
    punpcklqdq  m3, m3                      ; left[y+1] in all 8 words
    paddw       m5, m2, m0                  ; row y,   columns 0-7
    paddw       m6, m3, m0                  ; row y+1, columns 0-7
    paddw       m2, m4                      ; row y,   columns 8-15
    paddw       m3, m4                      ; row y+1, columns 8-15
    packuswb    m5, m2                      ; row y as 16 saturated bytes
    packuswb    m6, m3                      ; row y+1 as 16 saturated bytes
    mova        [dstq], m5
    mova        [dstq+strideq], m6
    lea         dstq, [dstq+strideq*2]
    inc         pairq
    jnz         .row_pair
    REP_RET

; The 32x32 variant uses m8 and m9, so it is only assembled when
; ARCH_X86_64 is set.
%if ARCH_X86_64
INIT_XMM sse2
cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left
    pxor        m1, m1                      ; m1 = 0, for byte->word unpack
    movd        m2, [aboveq-1]              ; low byte = above[-1]
    mova        m0, [aboveq]                ; above[0..15]
    mova        m4, [aboveq+16]             ; above[16..31]
    punpcklbw   m2, m1                      ; -> words
    punpckhbw   m3, m0, m1                  ; m3 = above[8..15]  as words
    punpckhbw   m5, m4, m1                  ; m5 = above[24..31] as words
    punpcklbw   m0, m1                      ; m0 = above[0..7]   as words
    punpcklbw   m4, m1                      ; m4 = above[16..23] as words
    pshuflw     m2, m2, 0x0                 ; broadcast in low qword
    ; above is consumed; its slot becomes the negative row-pair index.
    DEFINE_ARGS dst, stride, pair, left
    mov         pairq, -16                  ; 16 iterations x 2 rows
    punpcklqdq  m2, m2                      ; above[-1] in all 8 words
    add         leftq, 32                   ; point just past left[31]
    psubw       m0, m2                      ; columns 0-7   minus above[-1]
    psubw       m3, m2                      ; columns 8-15  minus above[-1]
    psubw       m4, m2                      ; columns 16-23 minus above[-1]
    psubw       m5, m2                      ; columns 24-31 minus above[-1]

.row_pair:
    movd        m2, [leftq+pairq*2]         ; low byte = left[y]
    movd        m6, [leftq+pairq*2+1]       ; low byte = left[y+1]
    punpcklbw   m2, m1
    punpcklbw   m6, m1
    pshuflw     m2, m2, 0x0
    pshuflw     m6, m6, 0x0
    punpcklqdq  m2, m2                      ; left[y] in all 8 words
    punpcklqdq  m6, m6                      ; left[y+1] in all 8 words
    paddw       m7, m2, m0                  ; row y, columns 0-7
    paddw       m8, m2, m3                  ; row y, columns 8-15
    paddw       m9, m2, m4                  ; row y, columns 16-23
    paddw       m2, m5                      ; row y, columns 24-31
    packuswb    m7, m8                      ; row y, bytes 0-15
    packuswb    m9, m2                      ; row y, bytes 16-31
    ; Row y+1 arithmetic is interleaved with the row y stores; m2, m7 and
    ; m8 are recycled once their row y contents have been packed/stored.
    paddw       m2, m6, m0                  ; row y+1, columns 0-7
    paddw       m8, m6, m3                  ; row y+1, columns 8-15
    mova        [dstq], m7
    paddw       m7, m6, m4                  ; row y+1, columns 16-23
    paddw       m6, m5                      ; row y+1, columns 24-31
    mova        [dstq+16], m9
    packuswb    m2, m8                      ; row y+1, bytes 0-15
    packuswb    m7, m6                      ; row y+1, bytes 16-31
    mova        [dstq+strideq], m2
    mova        [dstq+strideq+16], m7
    lea         dstq, [dstq+strideq*2]
    inc         pairq
    jnz         .row_pair
    REP_RET
%endif